2136 files changed, 153517 insertions, 24042 deletions
diff --git a/test/ARCMT/with space/test.h b/test/ARCMT/Inputs/with space/test.h
index 756295f27e698..756295f27e698 100644
--- a/test/ARCMT/with space/test.h
+++ b/test/ARCMT/Inputs/with space/test.h
diff --git a/test/ARCMT/with space/test.h.result b/test/ARCMT/Inputs/with space/test.h.result
index 0638a3378c1c0..0638a3378c1c0 100644
--- a/test/ARCMT/with space/test.h.result
+++ b/test/ARCMT/Inputs/with space/test.h.result
diff --git a/test/ARCMT/with space/test1.m.in b/test/ARCMT/Inputs/with space/test1.m.in
index 8416a88965696..8416a88965696 100644
--- a/test/ARCMT/with space/test1.m.in
+++ b/test/ARCMT/Inputs/with space/test1.m.in
diff --git a/test/ARCMT/with space/test1.m.in.result b/test/ARCMT/Inputs/with space/test1.m.in.result
index f351fe6c83552..f351fe6c83552 100644
--- a/test/ARCMT/with space/test1.m.in.result
+++ b/test/ARCMT/Inputs/with space/test1.m.in.result
diff --git a/test/ARCMT/with space/test2.m.in b/test/ARCMT/Inputs/with space/test2.m.in
index 99f87b0721716..99f87b0721716 100644
--- a/test/ARCMT/with space/test2.m.in
+++ b/test/ARCMT/Inputs/with space/test2.m.in
diff --git a/test/ARCMT/with space/test2.m.in.result b/test/ARCMT/Inputs/with space/test2.m.in.result
index f8e918ce2598e..f8e918ce2598e 100644
--- a/test/ARCMT/with space/test2.m.in.result
+++ b/test/ARCMT/Inputs/with space/test2.m.in.result
diff --git a/test/ARCMT/migrate-space-in-path.m b/test/ARCMT/migrate-space-in-path.m
index a797e6d1f482f..d060485ee20e2 100644
--- a/test/ARCMT/migrate-space-in-path.m
+++ b/test/ARCMT/migrate-space-in-path.m
@@ -1,5 +1,5 @@
 // RUN: rm -rf %t.migrate
-// RUN: %clang_cc1 -arcmt-migrate -mt-migrate-directory %t.migrate %S/"with space"/test1.m.in -x objective-c 
-// RUN: %clang_cc1 -arcmt-migrate -mt-migrate-directory %t.migrate %S/"with space"/test2.m.in -x objective-c 
-// RUN: c-arcmt-test -mt-migrate-directory %t.migrate | arcmt-test -verify-transformed-files %S/"with space"/test1.m.in.result %S/"with space"/test2.m.in.result %S/"with space"/test.h.result
+// RUN: %clang_cc1 -arcmt-migrate -mt-migrate-directory %t.migrate %S/Inputs/"with space"/test1.m.in -x objective-c
+// RUN: %clang_cc1 -arcmt-migrate -mt-migrate-directory %t.migrate %S/Inputs/"with space"/test2.m.in -x objective-c
+// RUN: c-arcmt-test -mt-migrate-directory %t.migrate | arcmt-test -verify-transformed-files %S/Inputs/"with space"/test1.m.in.result %S/Inputs/"with space"/test2.m.in.result %S/Inputs/"with space"/test.h.result
 // RUN: rm -rf %t.migrate
diff --git a/test/ARCMT/objcmt-invalid-code.mm b/test/ARCMT/objcmt-invalid-code.mm
new file mode 100644
index 0000000000000..f780d3de57ed8
--- /dev/null
+++ b/test/ARCMT/objcmt-invalid-code.mm
@@ -0,0 +1,19 @@
+// REQUIRES: x86-registered-target
+// RUN: rm -rf %t
+// RUN: %clang_cc1 -triple x86_64-apple-darwin10 -fsyntax-only %s -verify
+// RUN: not %clang_cc1 -triple x86_64-apple-darwin10 -objcmt-migrate-literals -objcmt-migrate-subscripting -mt-migrate-directory %t %s -x objective-c
+// RUN: c-arcmt-test -mt-migrate-directory %t | arcmt-test -verify-transformed-files %s.result
+
+// Make sure there is no crash.
+
+@interface NSObject @end
+@interface NSNumber : NSObject
+@end
+
+@interface NSNumber (NSNumberCreation)
++ (NSNumber *)numberWithInt:(int)value;
+@end
+
+void foo(int x = undeclared) { // expected-error {{undeclared}}
+  NSNumber *n = [NSNumber numberWithInt:1];
+}
diff --git a/test/ARCMT/objcmt-invalid-code.mm.result b/test/ARCMT/objcmt-invalid-code.mm.result
new file mode 100644
index 0000000000000..52b2db00f6109
--- /dev/null
+++ b/test/ARCMT/objcmt-invalid-code.mm.result
@@ -0,0 +1,19 @@
+// REQUIRES: x86-registered-target
+// RUN: rm -rf %t
+// RUN: %clang_cc1 -triple x86_64-apple-darwin10 -fsyntax-only %s -verify
+// RUN: not %clang_cc1 -triple x86_64-apple-darwin10 -objcmt-migrate-literals -objcmt-migrate-subscripting -mt-migrate-directory %t %s -x objective-c
+// RUN: c-arcmt-test -mt-migrate-directory %t | arcmt-test -verify-transformed-files %s.result
+
+// Make sure there is no crash.
+
+@interface NSObject @end
+@interface NSNumber : NSObject
+@end
+
+@interface NSNumber (NSNumberCreation)
++ (NSNumber *)numberWithInt:(int)value;
+@end
+
+void foo(int x = undeclared) { // expected-error {{undeclared}}
+  NSNumber *n = @1;
+}
diff --git a/test/ARCMT/objcmt-ns-enum-crash.m b/test/ARCMT/objcmt-ns-enum-crash.m
new file mode 100644
index 0000000000000..7c9c708efaefb
--- /dev/null
+++ b/test/ARCMT/objcmt-ns-enum-crash.m
@@ -0,0 +1,14 @@
+// RUN: rm -rf %t
+// RUN: %clang_cc1 -objcmt-migrate-ns-macros -mt-migrate-directory %t %s -x objective-c -fobjc-runtime-has-weak -fobjc-arc -triple x86_64-apple-darwin11
+// RUN: c-arcmt-test -mt-migrate-directory %t | arcmt-test -verify-transformed-files %s.result
+
+#define NS_ENUM(_type, _name) enum _name : _type _name; enum _name : _type
+#define NS_OPTIONS(_type, _name) enum _name : _type _name; enum _name : _type
+typedef long NSInteger;
+
+typedef enum : NSInteger {five} ApplicableEnum;
+
+typedef unsigned long mytd;
+
+#define MY_ENUM(name, type, ...) typedef enum : type { __VA_ARGS__ } name##_t
+MY_ENUM(MyEnum, unsigned int, One);
diff --git a/test/ARCMT/objcmt-ns-enum-crash.m.result b/test/ARCMT/objcmt-ns-enum-crash.m.result
new file mode 100644
index 0000000000000..0a76e66ea2111
--- /dev/null
+++ b/test/ARCMT/objcmt-ns-enum-crash.m.result
@@ -0,0 +1,14 @@
+// RUN: rm -rf %t
+// RUN: %clang_cc1 -objcmt-migrate-ns-macros -mt-migrate-directory %t %s -x objective-c -fobjc-runtime-has-weak -fobjc-arc -triple x86_64-apple-darwin11
+// RUN: c-arcmt-test -mt-migrate-directory %t | arcmt-test -verify-transformed-files %s.result
+
+#define NS_ENUM(_type, _name) enum _name : _type _name; enum _name : _type
+#define NS_OPTIONS(_type, _name) enum _name : _type _name; enum _name : _type
+typedef long NSInteger;
+
+typedef NS_ENUM(NSInteger, ApplicableEnum) {five};
+
+typedef unsigned long mytd;
+
+#define MY_ENUM(name, type, ...) typedef enum : type { __VA_ARGS__ } name##_t
+MY_ENUM(MyEnum, unsigned int, One);
diff --git a/test/ARCMT/objcmt-ns-macros.m.result b/test/ARCMT/objcmt-ns-macros.m.result
index bcc865ce0f667..0107827767361 100644
--- a/test/ARCMT/objcmt-ns-macros.m.result
+++ b/test/ARCMT/objcmt-ns-macros.m.result
@@ -19,9 +19,6 @@ typedef unsigned long long uint64_t;
 #define NS_OPTIONS(_type, _name) enum _name : _type _name; enum _name : _type
 #define DEPRECATED  __attribute__((deprecated))
 
-#ifndef NS_ENUM
-#import <Foundation/Foundation.h>
-#endif
 typedef NS_ENUM(NSInteger, wibble) {
   blah,
   blarg
@@ -80,7 +77,7 @@ typedef NS_ENUM(NSInteger, UIK) {
   UIKTwo = 2,
 };
 
-typedef NS_ENUM(unsigned int, NSTickMarkPosition)  {
+typedef NS_ENUM(unsigned int, NSTickMarkPosition) {
     NSTickMarkBelow = 0,
     NSTickMarkAbove = 1,
     NSTickMarkLeft = NSTickMarkAbove,
diff --git a/test/ARCMT/whitelisted/header1.h b/test/ARCMT/whitelisted/header1.h
index d94b9f7d9ebe6..33f77aa5dd152 100644
--- a/test/ARCMT/whitelisted/header1.h
+++ b/test/ARCMT/whitelisted/header1.h
@@ -4,3 +4,5 @@
 -(void)setProp:(int)p;
 +(id)i1;
 @end
+
+typedef long NSInteger;
diff --git a/test/ARCMT/whitelisted/header1.h.result b/test/ARCMT/whitelisted/header1.h.result
index 65cbd2621f6b6..c7cf109a27e92 100644
--- a/test/ARCMT/whitelisted/header1.h.result
+++ b/test/ARCMT/whitelisted/header1.h.result
@@ -3,3 +3,5 @@
 @property (nonatomic) int prop;
 +(instancetype)i1;
 @end
+
+typedef long NSInteger;
diff --git a/test/ARCMT/whitelisted/header2.h b/test/ARCMT/whitelisted/header2.h
index c7577ede4a490..ac3888ccdf8c1 100644
--- a/test/ARCMT/whitelisted/header2.h
+++ b/test/ARCMT/whitelisted/header2.h
@@ -1,4 +1,7 @@
 
+#define NS_ENUM(_type, _name) enum _name : _type _name; enum _name : _type
+typedef enum : NSInteger {five} ApplicableEnum;
+
 @interface I2 : NSObject
 -(int)prop;
 -(void)setProp:(int)p;
diff --git a/test/ARCMT/whitelisted/header2.h.result b/test/ARCMT/whitelisted/header2.h.result
index b1b52707118e2..3226e711b98ac 100644
--- a/test/ARCMT/whitelisted/header2.h.result
+++ b/test/ARCMT/whitelisted/header2.h.result
@@ -1,4 +1,7 @@
 
+#define NS_ENUM(_type, _name) enum _name : _type _name; enum _name : _type
+typedef NS_ENUM(NSInteger, ApplicableEnum) {five};
+
 @interface I2 : NSObject
 @property (nonatomic) int prop;
 @end
diff --git a/test/ARCMT/whitelisted/objcmt-with-whitelist.m b/test/ARCMT/whitelisted/objcmt-with-whitelist.m
index bef82c8667d9c..0ea714fe59374 100644
--- a/test/ARCMT/whitelisted/objcmt-with-whitelist.m
+++ b/test/ARCMT/whitelisted/objcmt-with-whitelist.m
@@ -1,7 +1,7 @@
 // RUN: rm -rf %t
-// RUN: %clang_cc1 -objcmt-migrate-readwrite-property -objcmt-migrate-instancetype %s -triple x86_64-apple-darwin11 -migrate -o %t.remap
+// RUN: %clang_cc1 -objcmt-migrate-readwrite-property -objcmt-migrate-instancetype -objcmt-migrate-ns-macros %s -triple x86_64-apple-darwin11 -migrate -o %t.remap
 // RUN: c-arcmt-test %t.remap | arcmt-test -verify-transformed-files %S/header1.h.result %S/header2.h.result
-// RUN: %clang_cc1 -objcmt-migrate-readwrite-property -objcmt-migrate-instancetype -objcmt-white-list-dir-path=%S/Inputs %s -triple x86_64-apple-darwin11 -migrate -o %t.remap
+// RUN: %clang_cc1 -objcmt-migrate-readwrite-property -objcmt-migrate-instancetype -objcmt-migrate-ns-macros -objcmt-white-list-dir-path=%S/Inputs %s -triple x86_64-apple-darwin11 -migrate -o %t.remap
 // RUN: c-arcmt-test %t.remap | arcmt-test -verify-transformed-files %S/header1.h.result
 
 @interface NSObject
diff --git a/test/ASTMerge/Inputs/anonymous-fields1.cpp b/test/ASTMerge/Inputs/anonymous-fields1.cpp
new file mode 100644
index 0000000000000..829bc0edd30a6
--- /dev/null
+++ b/test/ASTMerge/Inputs/anonymous-fields1.cpp
@@ -0,0 +1,5 @@
+class A {
+public:
+  struct { int foo; } f;
+  struct { int foo; } g;
+};
diff --git a/test/ASTMerge/Inputs/anonymous-fields2.cpp b/test/ASTMerge/Inputs/anonymous-fields2.cpp
new file mode 100644
index 0000000000000..28ea46d98711b
--- /dev/null
+++ b/test/ASTMerge/Inputs/anonymous-fields2.cpp
@@ -0,0 +1,9 @@
+class A {
+public:
+  struct { int foo; } f;
+  struct { int foo; } g;
+};
+
+inline int useA(A &a) {
+  return (a.f.foo + a.g.foo);
+}
diff --git a/test/ASTMerge/Inputs/class1.cpp b/test/ASTMerge/Inputs/class1.cpp
index 0cd6565f1a928..b0a7645cfe630 100644
--- a/test/ASTMerge/Inputs/class1.cpp
+++ b/test/ASTMerge/Inputs/class1.cpp
@@ -1,5 +1,6 @@
 struct A {
-  int x;
+  public:
+    int x;
 };
 
 struct B : A {
diff --git a/test/ASTMerge/Inputs/class2.cpp b/test/ASTMerge/Inputs/class2.cpp
index 5d5d9ca2333cb..2bed6d775bc46 100644
--- a/test/ASTMerge/Inputs/class2.cpp
+++ b/test/ASTMerge/Inputs/class2.cpp
@@ -1,5 +1,6 @@
 struct A {
-  int x;
+  public:
+    int x;
 };
 
 struct B : A {
diff --git a/test/ASTMerge/Inputs/inheritance-base.cpp b/test/ASTMerge/Inputs/inheritance-base.cpp
new file mode 100644
index 0000000000000..26fe42eb64da3
--- /dev/null
+++ b/test/ASTMerge/Inputs/inheritance-base.cpp
@@ -0,0 +1,7 @@
+class A
+{
+public:
+  int x;
+  A(int _x) : x(_x) {
+  }
+};
diff --git a/test/ASTMerge/Inputs/init-ctors-classes.cpp b/test/ASTMerge/Inputs/init-ctors-classes.cpp
new file mode 100644
index 0000000000000..fd51f860634b5
--- /dev/null
+++ b/test/ASTMerge/Inputs/init-ctors-classes.cpp
@@ -0,0 +1,19 @@
+class A_base
+{
+public:
+  int x;
+  A_base() : x(0) {
+  }
+  A_base(int _x) : x(static_cast<int>(_x)) {
+  }
+};
+
+class A : public A_base
+{
+public:
+  int y;
+  struct { int z; };
+  int array[2];
+  A(int _x) : A_base(_x), y(0), z(1), array{{2},{3}} {
+  }
+};
diff --git a/test/ASTMerge/anonymous-fields.cpp b/test/ASTMerge/anonymous-fields.cpp
new file mode 100644
index 0000000000000..67afc29d07e5b
--- /dev/null
+++ b/test/ASTMerge/anonymous-fields.cpp
@@ -0,0 +1,4 @@
+// RUN: %clang_cc1 -emit-pch -o %t.1.ast %S/Inputs/anonymous-fields1.cpp
+// RUN: %clang_cc1 -emit-pch -o %t.2.ast %S/Inputs/anonymous-fields2.cpp
+// RUN: %clang_cc1 -emit-obj -o /dev/null -ast-merge %t.1.ast -ast-merge %t.2.ast %s
+// expected-no-diagnostics
diff --git a/test/ASTMerge/class.cpp b/test/ASTMerge/class.cpp
index 7b31187c469ea..a68a2d1d7690f 100644
--- a/test/ASTMerge/class.cpp
+++ b/test/ASTMerge/class.cpp
@@ -3,12 +3,12 @@
 // RUN: %clang_cc1 -ast-merge %t.1.ast -ast-merge %t.2.ast -fsyntax-only %s 2>&1 | FileCheck %s
 // RUN: %clang_cc1 -ast-merge %t.1.ast -ast-merge %t.2.ast -fsyntax-only %s 2>&1 -Wno-odr -Werror
 
-// CHECK: class1.cpp:5:8: warning: type 'B' has incompatible definitions in different translation units
-// CHECK: class1.cpp:6:9: note: field 'y' has type 'float' here
-// CHECK: class2.cpp:6:7: note: field 'y' has type 'int' here
+// CHECK: class1.cpp:6:8: warning: type 'B' has incompatible definitions in different translation units
+// CHECK: class1.cpp:7:9: note: field 'y' has type 'float' here
+// CHECK: class2.cpp:7:7: note: field 'y' has type 'int' here
 
 // FIXME: we should also complain about mismatched types on the method
 
-// CHECK: class1.cpp:17:6: warning: type 'E' has incompatible definitions in different translation units
-// CHECK: class1.cpp:18:3: note: enumerator 'b' with value 1 here
-// CHECK: class2.cpp:11:3: note: enumerator 'a' with value 0 here
+// CHECK: class1.cpp:18:6: warning: type 'E' has incompatible definitions in different translation units
+// CHECK: class1.cpp:19:3: note: enumerator 'b' with value 1 here
+// CHECK: class2.cpp:12:3: note: enumerator 'a' with value 0 here
diff --git a/test/ASTMerge/inheritance.cpp b/test/ASTMerge/inheritance.cpp
new file mode 100644
index 0000000000000..7fce82a736ab7
--- /dev/null
+++ b/test/ASTMerge/inheritance.cpp
@@ -0,0 +1,8 @@
+// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++1z -emit-pch -o %t.1.ast %S/Inputs/inheritance-base.cpp
+// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++1z -ast-merge %t.1.ast -fsyntax-only -verify %s
+// expected-no-diagnostics
+
+class B : public A {
+  B(int _a) : A(_a) {
+  }
+};
diff --git a/test/ASTMerge/init-ctors.cpp b/test/ASTMerge/init-ctors.cpp
new file mode 100644
index 0000000000000..5f0ba4decd9ff
--- /dev/null
+++ b/test/ASTMerge/init-ctors.cpp
@@ -0,0 +1,10 @@
+// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++1z -emit-pch -o %t.1.ast %S/Inputs/init-ctors-classes.cpp
+// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++1z -ast-merge %t.1.ast -fsyntax-only -verify %s
+// expected-no-diagnostics
+
+class B {
+  int method_1() {
+    A a(0);
+    return a.x;
+  }
+};
diff --git a/test/Analysis/DeallocMissingRelease.m b/test/Analysis/DeallocMissingRelease.m
new file mode 100644
index 0000000000000..009e80151814e
--- /dev/null
+++ b/test/Analysis/DeallocMissingRelease.m
@@ -0,0 +1,868 @@
+// RUN: %clang_cc1 -analyze -analyzer-checker=osx.cocoa.Dealloc -fblocks -verify %s
+// RUN: %clang_cc1 -analyze -analyzer-checker=osx.cocoa.Dealloc -fblocks -triple x86_64-apple-darwin10 -fobjc-arc -fobjc-runtime-has-weak -verify %s
+
+#include "Inputs/system-header-simulator-for-objc-dealloc.h"
+
+#define NON_ARC !__has_feature(objc_arc)
+
+#if NON_ARC
+#define WEAK_ON_ARC
+#else
+#define WEAK_ON_ARC __weak
+#endif
+
+// No diagnostics expected under ARC.
+#if !NON_ARC
+  // expected-no-diagnostics
+#endif
+
+// Do not warn about missing release in -dealloc for ivars.
+
+@interface MyIvarClass1 : NSObject {
+  NSObject *_ivar;
+}
+@end
+
+@implementation MyIvarClass1
+- (instancetype)initWithIvar:(NSObject *)ivar
+{
+  self = [super init];
+  if (!self)
+    return nil;
+#if NON_ARC
+  _ivar = [ivar retain];
+#endif
+  return self;
+}
+- (void)dealloc
+{
+#if NON_ARC
+  [super dealloc];
+#endif
+}
+@end
+
+@interface MyIvarClass2 : NSObject {
+  NSObject *_ivar;
+}
+- (NSObject *)ivar;
+- (void)setIvar:(NSObject *)ivar;
+@end
+
+@implementation MyIvarClass2
+- (instancetype)init
+{
+  self = [super init];
+  return self;
+}
+- (void)dealloc
+{
+#if NON_ARC
+  [super dealloc];
+#endif
+}
+- (NSObject *)ivar
+{
+  return _ivar;
+}
+- (void)setIvar:(NSObject *)ivar
+{
+#if NON_ARC
+  [_ivar release];
+  _ivar = [ivar retain];
+#else
+ _ivar = ivar;
+#endif
+}
+@end
+
+// Warn about missing release in -dealloc for properties.
+
+@interface MyPropertyClass1 : NSObject
+@property (copy) NSObject *ivar;
+@end
+
+@implementation MyPropertyClass1
+- (void)dealloc
+{
+#if NON_ARC
+  [super dealloc]; // expected-warning {{The '_ivar' ivar in 'MyPropertyClass1' was copied by a synthesized property but not released before '[super dealloc]'}}
+#endif
+}
+@end
+
+@interface MyPropertyClass2 : NSObject
+@property (retain) NSObject *ivar;
+@end
+
+@implementation MyPropertyClass2
+- (void)dealloc
+{
+#if NON_ARC
+  [super dealloc]; // expected-warning {{The '_ivar' ivar in 'MyPropertyClass2' was retained by a synthesized property but not released before '[super dealloc]'}}
+#endif
+}
+@end
+
+@interface MyPropertyClass3 : NSObject {
+  NSObject *_ivar;
+}
+@property (retain) NSObject *ivar;
+@end
+
+@implementation MyPropertyClass3
+@synthesize ivar = _ivar;
+- (void)dealloc
+{
+#if NON_ARC
+  [super dealloc]; // expected-warning {{The '_ivar' ivar in 'MyPropertyClass3' was retained by a synthesized property but not released before '[super dealloc]'}}
+#endif
+}
+
+@end
+
+@interface MyPropertyClass4 : NSObject {
+  void (^_blockPropertyIvar)(void);
+}
+@property (copy) void (^blockProperty)(void);
+@property (copy) void (^blockProperty2)(void);
+@property (copy) void (^blockProperty3)(void);
+
+@end
+
+@implementation MyPropertyClass4
+@synthesize blockProperty = _blockPropertyIvar;
+- (void)dealloc
+{
+#if NON_ARC
+  [_blockProperty2 release];
+  Block_release(_blockProperty3);
+
+  [super dealloc]; // expected-warning {{The '_blockPropertyIvar' ivar in 'MyPropertyClass4' was copied by a synthesized property but not released before '[super dealloc]'}}
+#endif
+}
+@end
+
+@interface MyPropertyClass5 : NSObject {
+  WEAK_ON_ARC NSObject *_ivar;
+}
+@property (weak) NSObject *ivar;
+@end
+
+@implementation MyPropertyClass5
+@synthesize ivar = _ivar;
+- (void)dealloc
+{
+#if NON_ARC
+  [super dealloc]; // no-warning because it is a weak property
+#endif
+}
+@end
+
+@interface MyPropertyClassWithReturnInDealloc : NSObject {
+  NSObject *_ivar;
+}
+@property (retain) NSObject *ivar;
+@end
+
+@implementation MyPropertyClassWithReturnInDealloc
+@synthesize ivar = _ivar;
+- (void)dealloc
+{
+  return;
+#if NON_ARC
+  // expected-warning@-2{{The '_ivar' ivar in 'MyPropertyClassWithReturnInDealloc' was retained by a synthesized property but not released before '[super dealloc]'}}
+  [super dealloc];
+#endif
+}
+@end
+
+@interface MyPropertyClassWithReleaseInOtherInstance : NSObject {
+  NSObject *_ivar;
+  MyPropertyClassWithReleaseInOtherInstance *_other;
+}
+@property (retain) NSObject *ivar;
+
+-(void)releaseIvars;
+@end
+
+@implementation MyPropertyClassWithReleaseInOtherInstance
+@synthesize ivar = _ivar;
+
+-(void)releaseIvars; {
+#if NON_ARC
+  [_ivar release];
+#endif
+}
+
+- (void)dealloc
+{
+  [_other releaseIvars];
+#if NON_ARC
+  [super dealloc]; // expected-warning {{The '_ivar' ivar in 'MyPropertyClassWithReleaseInOtherInstance' was retained by a synthesized property but not released before '[super dealloc]'}}
+#endif
+}
+@end
+
+@interface MyPropertyClassWithNeitherReturnNorSuperDealloc : NSObject {
+  NSObject *_ivar;
+}
+@property (retain) NSObject *ivar;
+@end
+
+@implementation MyPropertyClassWithNeitherReturnNorSuperDealloc
+@synthesize ivar = _ivar;
+- (void)dealloc
+{
+}
+#if NON_ARC
+  // expected-warning@-2 {{method possibly missing a [super dealloc] call}} (From Sema)
+  // expected-warning@-3{{The '_ivar' ivar in 'MyPropertyClassWithNeitherReturnNorSuperDealloc' was retained by a synthesized property but not released before '[super dealloc]'}}
+#endif
+@end
+
+// <rdar://problem/6380411>: 'myproperty' has kind 'assign' and thus the
+//  assignment through the setter does not perform a release.
+
+@interface MyObject : NSObject {
+  id __unsafe_unretained _myproperty;
+}
+@property(assign) id myproperty;
+@end
+
+@implementation MyObject
+@synthesize myproperty=_myproperty; // no-warning
+- (void)dealloc {
+  // Don't claim that myproperty is released since it the property
+  // has the 'assign' attribute.
+  self.myproperty = 0; // no-warning
+#if NON_ARC
+  [super dealloc];
+#endif
+}
+@end
+
+@interface ClassWithControlFlowInRelease : NSObject {
+  BOOL _ivar1;
+}
+@property (retain) NSObject *ivar2;
+@end
+
+@implementation ClassWithControlFlowInRelease
+- (void)dealloc; {
+  if (_ivar1) {
+    // We really should warn because there is a path through -dealloc on which
+    // _ivar2 is not released.
+#if NON_ARC
+    [_ivar2 release];
+#endif
+  }
+
+#if NON_ARC
+  [super dealloc]; // expected-warning {{The '_ivar2' ivar in 'ClassWithControlFlowInRelease' was retained by a synthesized property but not released before '[super dealloc]'}}
+#endif
+}
+@end
+
+// Don't warn when the property is nil'd out in -dealloc
+
+@interface ClassWithNildOutProperty : NSObject
+@property (retain) NSObject *ivar;
+@property (assign) int *intPtrProp;
+@end
+
+@implementation ClassWithNildOutProperty
+- (void)dealloc; {
+  self.ivar = nil;
+
+  // Make sure to handle setting a non-retainable property to 0.
+  self.intPtrProp = 0;
+#if NON_ARC
+  [super dealloc];  // no-warning
+#endif
+}
+@end
+
+// Do warn when the ivar but not the property is nil'd out in -dealloc
+
+@interface ClassWithNildOutIvar : NSObject
+@property (retain) NSObject *ivar;
+@end
+
+@implementation ClassWithNildOutIvar
+- (void)dealloc; {
+  // Oops. Meant self.ivar = nil
+  _ivar = nil;
+
+#if NON_ARC
+  [super dealloc]; // expected-warning {{The '_ivar' ivar in 'ClassWithNildOutIvar' was retained by a synthesized property but not released before '[super dealloc]'}}
+#endif
+}
+@end
+
+// Do warn when the ivar is updated to a different value that is then
+// released.
+
+@interface ClassWithUpdatedIvar : NSObject
+@property (retain) NSObject *ivar;
+@end
+
+@implementation ClassWithUpdatedIvar
+- (void)dealloc; {
+  _ivar = [[NSObject alloc] init];
+
+#if NON_ARC
+  [_ivar release];
+#endif
+
+#if NON_ARC
+  [super dealloc]; // expected-warning {{The '_ivar' ivar in 'ClassWithUpdatedIvar' was retained by a synthesized property but not released before '[super dealloc]'}}
+#endif
+}
+@end
+
+
+// Don't warn when the property is nil'd out with a setter in -dealloc
+
+@interface ClassWithNildOutPropertyViaSetter : NSObject
+@property (retain) NSObject *ivar;
+@end
+
+@implementation ClassWithNildOutPropertyViaSetter
+- (void)dealloc; {
+  [self setIvar:nil];
+
+#if NON_ARC
+  [super dealloc];  // no-warning
+#endif
+}
+@end
+
+
+// Don't warn about missing releases when -dealloc helpers are called.
+
+@interface ClassWithDeallocHelpers : NSObject
+@property (retain) NSObject *ivarReleasedInMethod;
+@property (retain) NSObject *propNilledOutInMethod;
+
+@property (retain) NSObject *ivarReleasedInFunction;
+@property (retain) NSObject *propNilledOutInFunction;
+
+@property (retain) NSObject *ivarNeverReleased;
+- (void)invalidateInMethod;
+@end
+
+void ReleaseValueHelper(NSObject *iv) {
+#if NON_ARC
+  [iv release];
+#endif
+}
+
+void NilOutPropertyHelper(ClassWithDeallocHelpers *o) {
+  o.propNilledOutInFunction = nil;
+}
+
+@implementation ClassWithDeallocHelpers
+- (void)invalidateInMethod {
+#if NON_ARC
+  [_ivarReleasedInMethod release];
+#endif
+  self.propNilledOutInMethod = nil;
+}
+
+- (void)dealloc; {
+  ReleaseValueHelper(_ivarReleasedInFunction);
+  NilOutPropertyHelper(self);
+
+  [self invalidateInMethod];
+#if NON_ARC
+  [super dealloc]; // expected-warning {{The '_ivarNeverReleased' ivar in 'ClassWithDeallocHelpers' was retained by a synthesized property but not released before '[super dealloc]'}}
+#endif
+}
+@end
+
+
+// Don't warn when self in -dealloc escapes.
+
+@interface ClassWhereSelfEscapesViaMethodCall : NSObject
+@property (retain) NSObject *ivar;  // no-warning
+@end
+
+@interface ClassWhereSelfEscapesViaMethodCall (Other)
+- (void)invalidate; // In other translation unit.
+@end
+
+@implementation ClassWhereSelfEscapesViaMethodCall
+- (void)dealloc; {
+  [self invalidate];
+#if NON_ARC
+  [super dealloc];
+#endif
+} // no-warning
+@end
+
+@interface ClassWhereSelfEscapesViaPropertyAccess : NSObject
+@property (retain) NSObject *ivar;
+@end
+
+@interface ClassWhereSelfEscapesViaPropertyAccess (Other)
+// The implementation of this property is unknown and therefore could
+// release ivar.
+@property (retain) NSObject *otherIvar;
+@end
+
+@implementation ClassWhereSelfEscapesViaPropertyAccess
+- (void)dealloc; {
+  self.otherIvar = nil;
+#if NON_ARC
+  [super dealloc];
+#endif
+} // no-warning
+@end
+
+// Don't treat self as escaping when setter called on *synthesized*
+// property.
+
+@interface ClassWhereSelfEscapesViaSynthesizedPropertyAccess : NSObject
+@property (retain) NSObject *ivar;
+@property (retain) NSObject *otherIvar;
+@end
+
+@implementation ClassWhereSelfEscapesViaSynthesizedPropertyAccess
+- (void)dealloc; {
+  self.otherIvar = nil;
+#if NON_ARC
+  [super dealloc];  // expected-warning {{The '_ivar' ivar in 'ClassWhereSelfEscapesViaSynthesizedPropertyAccess' was retained by a synthesized property but not released before '[super dealloc]'}}
+#endif
+}
+@end
+
+
+// Don't treat calls to system headers as escapes
+
+@interface ClassWhereSelfEscapesViaCallToSystem : NSObject
+@property (retain) NSObject *ivar1;
+@property (retain) NSObject *ivar2;
+@property (retain) NSObject *ivar3;
+@property (retain) NSObject *ivar4;
+@property (retain) NSObject *ivar5;
+@property (retain) NSObject *ivar6;
+@end
+
+@implementation ClassWhereSelfEscapesViaCallToSystem
+- (void)dealloc; {
+#if NON_ARC
+  [_ivar2 release];
+  if (_ivar3) {
+    [_ivar3 release];
+  }
+#endif
+
+  [[NSRunLoop currentRunLoop] cancelPerformSelectorsWithTarget:self];
+  [[NSNotificationCenter defaultCenter] removeObserver:self];
+
+#if NON_ARC
+  [_ivar4 release];
+
+  if (_ivar5) {
+    [_ivar5 release];
+  }
+#endif
+
+  [[NSNotificationCenter defaultCenter] removeObserver:self];
+
+#if NON_ARC
+  if (_ivar6) {
+    [_ivar6 release];
+  }
+
+  [super dealloc];  // expected-warning {{The '_ivar1' ivar in 'ClassWhereSelfEscapesViaCallToSystem' was retained by a synthesized property but not released before '[super dealloc]'}}
+#endif
+}
+@end
+
+// Don't warn when value escapes.
+
+@interface ClassWhereIvarValueEscapes : NSObject
+@property (retain) NSObject *ivar;
+@end
+
+void ReleaseMe(id arg);
+
+@implementation ClassWhereIvarValueEscapes
+- (void)dealloc; {
+
+  ReleaseMe(_ivar);
+
+#if NON_ARC
+  [super dealloc];
+#endif
+} // no-warning
+@end
+
+// Don't warn when value is known to be nil.
+
+@interface ClassWhereIvarIsNil : NSObject
+@property (retain) NSObject *ivarIsNil;
+@end
+
+@implementation ClassWhereIvarIsNil
+- (void)dealloc; {
+
+#if NON_ARC
+  if (_ivarIsNil)
+    [_ivarIsNil release];
+
+  [super dealloc];
+#endif
+} // no-warning
+@end
+
+
+// Don't warn for non-retainable properties.
+
+@interface ClassWithNonRetainableProperty : NSObject
+@property (assign) int *ivar;  // no-warning
+@end
+
+@implementation ClassWithNonRetainableProperty
+- (void)dealloc; {
+#if NON_ARC
+  [super dealloc];
+#endif
+} // no-warning
+@end
+
+
+@interface SuperClassOfClassWithInlinedSuperDealloc : NSObject
+@property (retain) NSObject *propInSuper;
+@end
+
+@implementation SuperClassOfClassWithInlinedSuperDealloc
+- (void)dealloc {
+#if NON_ARC
+  [super dealloc]; // expected-warning {{The '_propInSuper' ivar in 'SuperClassOfClassWithInlinedSuperDealloc' was retained by a synthesized property but not released before '[super dealloc]'}}
+#endif
+}
+@end
+
+@interface ClassWithInlinedSuperDealloc : SuperClassOfClassWithInlinedSuperDealloc
+@property (retain) NSObject *propInSub;
+@end
+
+@implementation ClassWithInlinedSuperDealloc
+- (void)dealloc {
+#if NON_ARC
+  [super dealloc]; // expected-warning {{The '_propInSub' ivar in 'ClassWithInlinedSuperDealloc' was retained by a synthesized property but not released before '[super dealloc]'}}
+#endif
+}
+@end
+
+
+@interface SuperClassOfClassWithInlinedSuperDeallocAndInvalidation : NSObject
+@property (retain) NSObject *propInSuper;
+
+- (void)invalidate;
+@end
+
+@implementation SuperClassOfClassWithInlinedSuperDeallocAndInvalidation
+
+- (void)invalidate {
+#if NON_ARC
+  [_propInSuper release];
+#endif
+  _propInSuper = nil;
+}
+
+- (void)dealloc {
+  [self invalidate];
+#if NON_ARC
+  [super dealloc]; // no-warning
+#endif
+}
+@end
+
+@interface ClassWithInlinedSuperDeallocAndInvalidation : SuperClassOfClassWithInlinedSuperDeallocAndInvalidation
+@property (retain) NSObject *propInSub;
+@end
+
+@implementation ClassWithInlinedSuperDeallocAndInvalidation
+
+- (void)invalidate {
+#if NON_ARC
+  [_propInSub release];
+#endif
+  [super invalidate];
+}
+
+- (void)dealloc {
+#if NON_ARC
+  [super dealloc]; // no-warning
+#endif
+}
+@end
+
+
+@interface SuperClassOfClassThatEscapesBeforeInliningSuper : NSObject
+@property (retain) NSObject *propInSuper;
+@end
+
+@implementation SuperClassOfClassThatEscapesBeforeInliningSuper
+
+- (void)dealloc {
+
+#if NON_ARC
+  [super dealloc]; // expected-warning {{The '_propInSuper' ivar in 'SuperClassOfClassThatEscapesBeforeInliningSuper' was retained by a synthesized property but not released before '[super dealloc]'}}
+#endif
+}
+@end
+
+@interface ClassThatEscapesBeforeInliningSuper : SuperClassOfClassThatEscapesBeforeInliningSuper
+@property (retain) NSObject *propInSub;
+@end
+
+@interface ClassThatEscapesBeforeInliningSuper (Other)
+- (void)invalidate; // No implementation in translation unit.
+@end
+
+@implementation ClassThatEscapesBeforeInliningSuper
+- (void)dealloc {
+  [self invalidate];
+
+#if NON_ARC
+  [super dealloc]; // no-warning
+#endif
+}
+@end
+
+
+#if NON_ARC
+@interface ReleaseIvarInField : NSObject {
+  int _tag;
+  union {
+    NSObject *field1;
+    NSObject *field2;
+  } _someUnion;
+
+  struct {
+    NSObject *field1;
+  } _someStruct;
+}
+@end
+
+@implementation ReleaseIvarInField
+- (void)dealloc {
+  if (_tag) {
+    [_someUnion.field1 release];
+  } else {
+    [_someUnion.field2 release];
+  }
+
+  [_someStruct.field1 release];
+  [super dealloc];
+}
+@end
+#endif
+
+struct SomeStruct {
+  int f;
+};
+@interface ZeroOutStructWithSetter : NSObject
+  @property(assign) struct SomeStruct s;
+@end
+
+@implementation ZeroOutStructWithSetter
+- (void)dealloc {
+  struct SomeStruct zeroedS;
+  zeroedS.f = 0;
+
+  self.s = zeroedS;
+#if NON_ARC
+  [super dealloc];
+#endif
+}
+@end
+
+#if NON_ARC
+@interface ReleaseIvarInArray : NSObject {
+  NSObject *_array[3];
+}
+@end
+
+@implementation ReleaseIvarInArray
+- (void)dealloc {
+  for (int i = 0; i < 3; i++) {
+    [_array[i] release];
+  }
+  [super dealloc];
+}
+@end
+#endif
+
+// Don't warn about missing releases for subclasses of SenTestCase or
+// for classes that are not subclasses of NSObject.
+
+@interface SenTestCase : NSObject {}
+@end
+
+@interface MyClassTest : SenTestCase
+@property (retain) NSObject *ivar;
+@end
+
+@implementation MyClassTest
+-(void)tearDown {
+#if NON_ARC
+  [_ivar release];
+#endif
+}
+
+-(void)dealloc; {
+#if NON_ARC
+  [super dealloc]; // no-warning
+#endif
+}
+@end
+
+@interface XCTestCase : NSObject {}
+@end
+
+@interface MyClassXCTest : XCTestCase
+@property (retain) NSObject *ivar;
+@end
+
+@implementation MyClassXCTest
+-(void)tearDown {
+#if NON_ARC
+  [_ivar release];
+#endif
+}
+
+-(void)dealloc; {
+#if NON_ARC
+  [super dealloc]; // no-warning
+#endif
+}
+@end
+
+
+__attribute__((objc_root_class))
+@interface NonNSObjectMissingDealloc
+@property (retain) NSObject *ivar;
+@end
+@implementation NonNSObjectMissingDealloc
+-(void)dealloc; {
+
+}
+@end
+
+// Warn about calling -dealloc rather than release by mistake.
+
+@interface CallDeallocOnRetainPropIvar : NSObject {
+  NSObject *okToDeallocDirectly;
+}
+
+@property (retain) NSObject *ivar;
+@end
+
+@implementation CallDeallocOnRetainPropIvar
+- (void)dealloc
+{
+#if NON_ARC
+  // Only warn for synthesized ivars.
+  [okToDeallocDirectly dealloc]; // no-warning
+  [_ivar dealloc];  // expected-warning {{'_ivar' should be released rather than deallocated}}
+
+  [super dealloc];
+#endif
+}
+@end
+
+// CIFilter special cases.
+// By design, -[CIFilter dealloc] releases (by calling -setValue: forKey: with
+// 'nil') all ivars (even in its *subclasses*) with names starting with
+// 'input' or that are backed by properties with names starting with 'input'.
+// The Dealloc checker needs to take particular care to not warn about missing
+// releases in this case -- if the user adds a release quiet the
+// warning it may result in an over release.
+
+@interface ImmediateSubCIFilter : CIFilter {
+  NSObject *inputIvar;
+  NSObject *nonInputIvar;
+  NSObject *notPrefixedButBackingPrefixedProperty;
+  NSObject *inputPrefixedButBackingNonPrefixedProperty;
+}
+
+@property(retain) NSObject *inputIvar;
+@property(retain) NSObject *nonInputIvar;
+@property(retain) NSObject *inputAutoSynthesizedIvar;
+@property(retain) NSObject *inputExplicitlySynthesizedToNonPrefixedIvar;
+@property(retain) NSObject *nonPrefixedPropertyBackedByExplicitlySynthesizedPrefixedIvar;
+
+@end
+
+@implementation ImmediateSubCIFilter
+@synthesize inputIvar = inputIvar;
+@synthesize nonInputIvar = nonInputIvar;
+@synthesize inputExplicitlySynthesizedToNonPrefixedIvar = notPrefixedButBackingPrefixedProperty;
+@synthesize nonPrefixedPropertyBackedByExplicitlySynthesizedPrefixedIvar = inputPrefixedButBackingNonPrefixedProperty;
+
+- (void)dealloc {
+#if NON_ARC
+  // We don't want warnings here for:
+  // inputIvar
+  // inputAutoSynthesizedIvar
+  // inputExplicitlySynthesizedToNonPrefixedIvar
+  // inputPrefixedButBackingNonPrefixedProperty
+  [super dealloc];
+  // expected-warning@-1 {{The 'nonInputIvar' ivar in 'ImmediateSubCIFilter' was retained by a synthesized property but not released before '[super dealloc]'}}
+#endif
+}
+@end
+
+@interface SubSubCIFilter : CIFilter {
+  NSObject *inputIvarInSub;
+}
+
+@property(retain) NSObject *inputIvarInSub;
+@end
+
+@implementation SubSubCIFilter
+@synthesize inputIvarInSub = inputIvarInSub;
+
+- (void)dealloc {
+// Don't warn about inputIvarInSub.
+#if NON_ARC
+  [super dealloc];
+#endif
+}
+@end
+@interface OverreleasingCIFilter : CIFilter {
+  NSObject *inputIvar;
+}
+
+@property(retain) NSObject *inputIvar;
+@end
+
+@implementation OverreleasingCIFilter
+@synthesize inputIvar = inputIvar;
+
+- (void)dealloc {
+#if NON_ARC
+  // This is an over release because CIFilter's dealloc will also release it.
+  [inputIvar release]; // expected-warning {{The 'inputIvar' ivar in 'OverreleasingCIFilter' will be released by '-[CIFilter dealloc]' but also released here}}
+  [super dealloc]; // no-warning
+  #endif
+}
+@end
+
+
+@interface NotMissingDeallocCIFilter : CIFilter {
+  NSObject *inputIvar;
+}
+
+@property(retain) NSObject *inputIvar;
+@end
+
+@implementation NotMissingDeallocCIFilter // no-warning
+@synthesize inputIvar = inputIvar;
+@end
diff --git a/test/Analysis/DeallocUseAfterFreeErrors.m b/test/Analysis/DeallocUseAfterFreeErrors.m
new file mode 100644
index 0000000000000..3feeb6d7482c7
--- /dev/null
+++ b/test/Analysis/DeallocUseAfterFreeErrors.m
@@ -0,0 +1,373 @@
+// RUN: %clang_cc1 -analyze -analyzer-checker=core,osx.cocoa.SuperDealloc,debug.ExprInspection -analyzer-output=text -verify %s
+
+void clang_analyzer_warnIfReached();
+
+#define nil ((id)0)
+
+typedef unsigned long NSUInteger;
+@protocol NSObject
+- (instancetype)retain;
+- (oneway void)release;
+@end
+
+@interface NSObject <NSObject> { }
+- (void)dealloc;
+- (instancetype)init;
+@end
+
+typedef struct objc_selector *SEL;
+
+//===------------------------------------------------------------------------===
+//  <rdar://problem/6953275>
+//  Check that 'self' is not referenced after calling '[super dealloc]'.
+
+@interface SuperDeallocThenReleaseIvarClass : NSObject {
+  NSObject *_ivar;
+}
+@end
+
+@implementation SuperDeallocThenReleaseIvarClass
+- (instancetype)initWithIvar:(NSObject *)ivar {
+  self = [super init];
+  if (!self)
+    return nil;
+  _ivar = [ivar retain];
+  return self;
+}
+- (void)dealloc {
+  [super dealloc]; // expected-note {{[super dealloc] called here}}
+  [_ivar release]; // expected-warning {{Use of instance variable '_ivar' after 'self' has been deallocated}}
+  // expected-note@-1 {{Use of instance variable '_ivar' after 'self' has been deallocated}}
+}
+@end
+
+@interface SuperDeallocThenAssignNilToIvarClass : NSObject {
+  NSObject *_delegate;
+}
+@end
+
+@implementation SuperDeallocThenAssignNilToIvarClass
+- (instancetype)initWithDelegate:(NSObject *)delegate {
+  self = [super init];
+  if (!self)
+    return nil;
+  _delegate = delegate;
+  return self;
+}
+- (void)dealloc {
+  [super dealloc]; // expected-note {{[super dealloc] called here}}
+  _delegate = nil; // expected-warning {{Use of instance variable '_delegate' after 'self' has been deallocated}}
+      // expected-note@-1 {{Use of instance variable '_delegate' after 'self' has been deallocated}}
+}
+@end
+
+
+struct SomeStruct {
+  int f;
+};
+
+@interface SuperDeallocThenAssignIvarField : NSObject {
+  struct SomeStruct _s;
+}
+@end
+
+@implementation SuperDeallocThenAssignIvarField
+- (void)dealloc {
+  [super dealloc]; // expected-note {{[super dealloc] called here}}
+  _s.f = 7; // expected-warning {{Use of instance variable '_s' after 'self' has been deallocated}}
+      // expected-note@-1 {{Use of instance variable '_s' after 'self' has been deallocated}}
+}
+@end
+
+@interface OtherClassWithIvar {
+@public
+  int _otherIvar;
+}
+@end;
+
+@interface SuperDeallocThenAssignIvarIvar : NSObject {
+  OtherClassWithIvar *_ivar;
+}
+@end
+
+@implementation SuperDeallocThenAssignIvarIvar
+- (void)dealloc {
+  [super dealloc]; // expected-note {{[super dealloc] called here}}
+  _ivar->_otherIvar = 7; // expected-warning {{Use of instance variable '_ivar' after 'self' has been deallocated}}
+      // expected-note@-1 {{Use of instance variable '_ivar' after 'self' has been deallocated}}
+}
+@end
+
+@interface SuperDeallocThenAssignSelfIvar : NSObject {
+  NSObject *_ivar;
+}
+@end
+
+@implementation SuperDeallocThenAssignSelfIvar
+- (void)dealloc {
+  [super dealloc]; // expected-note {{[super dealloc] called here}}
+  self->_ivar = nil; // expected-warning {{Use of instance variable '_ivar' after 'self' has been deallocated}}
+      // expected-note@-1 {{Use of instance variable '_ivar' after 'self' has been deallocated}}
+}
+@end
+
+@interface SuperDeallocThenReleasePropertyClass : NSObject { }
+@property (retain) NSObject *ivar;
+@end
+
+@implementation SuperDeallocThenReleasePropertyClass
+- (instancetype)initWithProperty:(NSObject *)ivar {
+  self = [super init];
+  if (!self)
+    return nil;
+  self.ivar = ivar;
+  return self;
+}
+- (void)dealloc {
+  [super dealloc]; // expected-note {{[super dealloc] called here}}
+  self.ivar = nil; // expected-warning {{use of 'self' after it has been deallocated}}
+      // expected-note@-1 {{use of 'self' after it has been deallocated}}
+}
+@end
+
+@interface SuperDeallocThenAssignNilToPropertyClass : NSObject { }
+@property (assign) NSObject *delegate;
+@end
+
+@implementation SuperDeallocThenAssignNilToPropertyClass
+- (instancetype)initWithDelegate:(NSObject *)delegate {
+  self = [super init];
+  if (!self)
+    return nil;
+  self.delegate = delegate;
+  return self;
+}
+- (void)dealloc {
+  [super dealloc]; // expected-note {{[super dealloc] called here}}
+  self.delegate = nil; // expected-warning {{use of 'self' after it has been deallocated}}
+      // expected-note@-1 {{use of 'self' after it has been deallocated}}
+}
+@end
+
+@interface SuperDeallocThenCallInstanceMethodClass : NSObject { }
+- (void)_invalidate;
+@end
+
+@implementation SuperDeallocThenCallInstanceMethodClass
+- (void)_invalidate {
+}
+- (void)dealloc {
+  [super dealloc]; // expected-note {{[super dealloc] called here}}
+  [self _invalidate]; // expected-warning {{use of 'self' after it has been deallocated}}
+      // expected-note@-1 {{use of 'self' after it has been deallocated}}
+}
+@end
+
+@interface SuperDeallocThenCallNonObjectiveCMethodClass : NSObject { }
+@end
+
+static void _invalidate(NSObject *object) {
+  (void)object;
+}
+
+@implementation SuperDeallocThenCallNonObjectiveCMethodClass
+- (void)dealloc {
+  [super dealloc]; // expected-note {{[super dealloc] called here}}
+  _invalidate(self); // expected-warning {{use of 'self' after it has been deallocated}}
+      // expected-note@-1 {{use of 'self' after it has been deallocated}}
+}
+@end
+
+@interface SuperDeallocThenCallObjectiveClassMethodClass : NSObject { }
+@end
+
+@implementation SuperDeallocThenCallObjectiveClassMethodClass
++ (void) invalidate:(id)arg; {
+}
+
+- (void)dealloc {
+  [super dealloc]; // expected-note {{[super dealloc] called here}}
+  [SuperDeallocThenCallObjectiveClassMethodClass invalidate:self]; // expected-warning {{use of 'self' after it has been deallocated}}
+      // expected-note@-1 {{use of 'self' after it has been deallocated}}
+}
+@end
+
+@interface TwoSuperDeallocCallsClass : NSObject {
+  NSObject *_ivar;
+}
+- (void)_invalidate;
+@end
+
+@implementation TwoSuperDeallocCallsClass
+- (void)_invalidate {
+}
+- (void)dealloc {
+  if (_ivar) { // expected-note {{Taking false branch}}
+    [_ivar release];
+    [super dealloc];
+    return;
+  }
+  [super dealloc];    // expected-note {{[super dealloc] called here}}
+  [self _invalidate]; // expected-warning {{use of 'self' after it has been deallocated}}
+      // expected-note@-1 {{use of 'self' after it has been deallocated}}
+}
+@end
+
+//===------------------------------------------------------------------------===
+// Warn about calling [super dealloc] twice due to missing return statement.
+
+@interface MissingReturnCausesDoubleSuperDeallocClass : NSObject {
+  NSObject *_ivar;
+}
+@end
+
+@implementation MissingReturnCausesDoubleSuperDeallocClass
+- (void)dealloc {
+  if (_ivar) { // expected-note {{Taking true branch}}
+    [_ivar release];
+    [super dealloc]; // expected-note {{[super dealloc] called here}}
+    // return;
+  }
+  [super dealloc]; // expected-warning{{[super dealloc] should not be called multiple times}}
+  // expected-note@-1{{[super dealloc] should not be called multiple times}}
+}
+@end
+
+//===------------------------------------------------------------------------===
+// Warn about calling [super dealloc] twice in two different methods.
+
+@interface SuperDeallocInOtherMethodClass : NSObject {
+  NSObject *_ivar;
+}
+- (void)_cleanup;
+@end
+
+@implementation SuperDeallocInOtherMethodClass
+- (void)_cleanup {
+  [_ivar release];
+  [super dealloc]; // expected-note {{[super dealloc] called here}}
+}
+- (void)dealloc {
+  [self _cleanup]; // expected-note {{Calling '_cleanup'}}
+  //expected-note@-1 {{Returning from '_cleanup'}}
+  [super dealloc]; // expected-warning {{[super dealloc] should not be called multiple times}}
+  // expected-note@-1 {{[super dealloc] should not be called multiple times}}
+}
+@end
+
+//===------------------------------------------------------------------------===
+// Do not warn about calling [super dealloc] recursively for different objects
+// of the same type with custom retain counting.
+//
+// A class that contains an ivar of itself with custom retain counting (such
+// as provided by _OBJC_SUPPORTED_INLINE_REFCNT_WITH_DEALLOC2MAIN) can generate
+// a false positive that [super dealloc] is called twice if each object instance
+// is not tracked separately by the checker. This test case is just a simple
+// approximation to trigger the false positive.
+
+@class ClassWithOwnIvarInstanceClass;
+@interface ClassWithOwnIvarInstanceClass : NSObject {
+  ClassWithOwnIvarInstanceClass *_ivar;
+  NSUInteger _retainCount;
+}
+@end
+
+@implementation ClassWithOwnIvarInstanceClass
+- (instancetype)retain {
+  ++_retainCount;
+  return self;
+}
+- (oneway void)release {
+  --_retainCount;
+  if (!_retainCount)
+    [self dealloc];
+}
+- (void)dealloc {
+  [_ivar release];
+  [super dealloc]; // no warning: different instances of same class
+}
+@end
+
+//===------------------------------------------------------------------------===
+// Do not warn about calling [super dealloc] twice if +dealloc is a class
+// method.
+
+@interface SuperDeallocClassMethodIgnoredClass : NSObject { }
++ (void)dealloc;
+@end
+
+@implementation SuperDeallocClassMethodIgnoredClass
++ (void)dealloc { }
+@end
+
+@interface SuperDeallocClassMethodIgnoredSubClass : NSObject { }
++ (void)dealloc;
+@end
+
+@implementation SuperDeallocClassMethodIgnoredSubClass
++ (void)dealloc {
+  [super dealloc];
+  [super dealloc]; // no warning: class method
+}
+@end
+
+//===------------------------------------------------------------------------===
+// Do not warn about calling [super dealloc] twice if when the analyzer has
+// inlined the call to its super deallocator.
+
+@interface SuperClassCallingSuperDealloc : NSObject {
+  NSObject *_ivar;
+}
+@end
+
+@implementation SuperClassCallingSuperDealloc
+- (void)dealloc; {
+  [_ivar release]; // no-warning
+
+  [super dealloc];
+}
+@end
+
+@interface SubclassCallingSuperDealloc : SuperClassCallingSuperDealloc
+@end
+
+@implementation SubclassCallingSuperDealloc
+- (void)dealloc; {
+  [super dealloc];
+}
+@end
+
+//===------------------------------------------------------------------------===
+// Treat calling [super dealloc] twice as as a sink.
+
+@interface CallingSuperDeallocTwiceIsSink : NSObject
+@end
+
+@implementation CallingSuperDeallocTwiceIsSink
+- (void)dealloc; {
+  [super dealloc]; // expected-note {{[super dealloc] called here}}
+  [super dealloc]; // expected-warning {{[super dealloc] should not be called multiple times}}
+  // expected-note@-1 {{[super dealloc] should not be called multiple times}}
+
+  clang_analyzer_warnIfReached(); // no-warning
+}
+@end
+
+
+//===------------------------------------------------------------------------===
+// Test path notes with intervening method call on self.
+
+@interface InterveningMethodCallOnSelf : NSObject
+@end
+
+@implementation InterveningMethodCallOnSelf
+- (void)anotherMethod {
+}
+
+- (void)dealloc; {
+  [super dealloc]; // expected-note {{[super dealloc] called here}}
+  [self anotherMethod]; // expected-warning {{use of 'self' after it has been deallocated}}
+      // expected-note@-1 {{use of 'self' after it has been deallocated}}
+  [super dealloc];
+}
+@end
diff --git a/test/Analysis/Inputs/system-header-simulator-cxx-std-suppression.h b/test/Analysis/Inputs/system-header-simulator-cxx-std-suppression.h
new file mode 100644
index 0000000000000..dc53af269c9c2
--- /dev/null
+++ b/test/Analysis/Inputs/system-header-simulator-cxx-std-suppression.h
@@ -0,0 +1,146 @@
+// This is a fake system header with divide-by-zero bugs introduced in
+// c++ std library functions. We use these bugs to test hard-coded
+// suppression of diagnostics within standard library functions that are known
+// to produce false positives.
+
+#pragma clang system_header
+
+typedef unsigned char uint8_t;
+
+typedef __typeof__(sizeof(int)) size_t;
+void *memmove(void *s1, const void *s2, size_t n);
+
+namespace std {
+
+  template <class _Tp>
+  class allocator {
+  public:
+    void deallocate(void *p) {
+      ::delete p;
+    }
+  };
+
+  template <class _Alloc>
+  class allocator_traits {
+  public:
+    static void deallocate(void *p) {
+      _Alloc().deallocate(p);
+    }
+  };
+
+  template <class _Tp, class _Alloc>
+  class __list_imp
+  {};
+
+  template <class _Tp, class _Alloc = allocator<_Tp> >
+  class list
+  : private __list_imp<_Tp, _Alloc>
+  {
+  public:
+    void pop_front() {
+      // Fake use-after-free.
+      // No warning is expected as we are suppressing warning coming
+      // out of std::list.
+      int z = 0;
+      z = 5/z;
+    }
+    bool empty() const;
+  };
+
+  // basic_string
+  template<class _CharT, class _Alloc = allocator<_CharT> >
+  class __attribute__ ((__type_visibility__("default"))) basic_string {
+    bool isLong;
+    union {
+      _CharT localStorage[4];
+      _CharT *externalStorage;
+
+      void assignExternal(_CharT *newExternal) {
+        externalStorage = newExternal;
+      }
+    } storage;
+
+    typedef allocator_traits<_Alloc> __alloc_traits;
+
+  public:
+    basic_string();
+
+    void push_back(int c) {
+      // Fake error trigger.
+      // No warning is expected as we are suppressing warning coming
+      // out of std::basic_string.
+      int z = 0;
+      z = 5/z;
+    }
+
+    _CharT *getBuffer() {
+      return isLong ? storage.externalStorage : storage.localStorage;
+    }
+
+    basic_string &operator +=(int c) {
+      // Fake deallocate stack-based storage.
+      // No warning is expected as we are suppressing warnings within
+      // std::basic_string.
+      __alloc_traits::deallocate(getBuffer());
+    }
+
+    basic_string &operator =(const basic_string &other) {
+      // Fake deallocate stack-based storage, then use the variable in the
+      // same union.
+      // No warning is expected as we are suppressing warnings within
+      // std::basic_string.
+      __alloc_traits::deallocate(getBuffer());
+      storage.assignExternal(new _CharT[4]);
+    }
+  };
+
+template<class _Engine, class _UIntType>
+class __independent_bits_engine {
+public:
+  // constructors and seeding functions
+  __independent_bits_engine(_Engine& __e, size_t __w);
+};
+
+template<class _Engine, class _UIntType>
+__independent_bits_engine<_Engine, _UIntType>
+    ::__independent_bits_engine(_Engine& __e, size_t __w)
+{
+  // Fake error trigger.
+  // No warning is expected as we are suppressing warning coming
+  // out of std::__independent_bits_engine.
+  int z = 0;
+  z = 5/z;
+}
+
+#if __has_feature(cxx_decltype)
+typedef decltype(nullptr) nullptr_t;
+
+template<class _Tp>
+class shared_ptr
+{
+public:
+  constexpr shared_ptr(nullptr_t);
+  explicit shared_ptr(_Tp* __p);
+
+  shared_ptr(shared_ptr&& __r) { }
+
+  ~shared_ptr();
+
+  shared_ptr& operator=(shared_ptr&& __r) {
+    // Fake error trigger.
+    // No warning is expected as we are suppressing warning coming
+    // out of std::shared_ptr.
+    int z = 0;
+    z = 5/z;
+  }
+};
+
+template<class _Tp>
+inline
+constexpr
+shared_ptr<_Tp>::shared_ptr(nullptr_t) {
+}
+
+#endif // __has_feature(cxx_decltype)
+}
+
diff --git a/test/Analysis/Inputs/system-header-simulator-cxx.h b/test/Analysis/Inputs/system-header-simulator-cxx.h
index f9049c3ae9e72..b32d200364b18 100644
--- a/test/Analysis/Inputs/system-header-simulator-cxx.h
+++ b/test/Analysis/Inputs/system-header-simulator-cxx.h
@@ -7,6 +7,9 @@
 
 typedef unsigned char uint8_t;
 
+typedef __typeof__(sizeof(int)) size_t;
+void *memmove(void *s1, const void *s2, size_t n);
+
 namespace std {
   template <class T1, class T2>
   struct pair {
@@ -104,118 +107,127 @@ namespace std {
     const _E* end()   const {return __begin_ + __size_;}
   };
 
-  template<class InputIter, class OutputIter>
-  OutputIter copy(InputIter II, InputIter IE, OutputIter OI) {
-    while (II != IE)
-      *OI++ = *II++;
-    return OI;
-  }
+  template <bool, class _Tp = void> struct enable_if {};
+  template <class _Tp> struct enable_if<true, _Tp> {typedef _Tp type;};
 
-  struct input_iterator_tag { };
-  struct output_iterator_tag { };
-  struct forward_iterator_tag : public input_iterator_tag { };
-  struct bidirectional_iterator_tag : public forward_iterator_tag { };
-  struct random_access_iterator_tag : public bidirectional_iterator_tag { };
+  template <class _Tp, _Tp __v>
+  struct integral_constant
+  {
+      static const _Tp      value = __v;
+      typedef _Tp               value_type;
+      typedef integral_constant type;
 
-  template <class _Tp>
-  class allocator {
-  public:
-    void deallocate(void *p) {
-      ::delete p;
-    }
-  };
+     operator value_type() const {return value;}
 
-  template <class _Alloc>
-  class allocator_traits {
-  public:
-    static void deallocate(void *p) {
-      _Alloc().deallocate(p);
-    }
+     value_type operator ()() const {return value;}
   };
 
-  template <class _Tp, class _Alloc>
-  class __list_imp
-  {};
+  template <class _Tp, _Tp __v>
+  const _Tp integral_constant<_Tp, __v>::value;
 
-  template <class _Tp, class _Alloc = allocator<_Tp> >
-  class list
-  : private __list_imp<_Tp, _Alloc>
-  {
-  public:
-    void pop_front() {
-      // Fake use-after-free.
-      // No warning is expected as we are suppressing warning coming
-      // out of std::list.
-      int z = 0;
-      z = 5/z;
-    }
-    bool empty() const;
-  };
+    template <class _Tp, class _Arg>
+    struct is_trivially_assignable
+      : integral_constant<bool, __is_trivially_assignable(_Tp, _Arg)>
+    {
+    };
 
-  // basic_string
-  template<class _CharT, class _Alloc = allocator<_CharT> >
-  class __attribute__ ((__type_visibility__("default"))) basic_string {
-    bool isLong;
-    union {
-      _CharT localStorage[4];
-      _CharT *externalStorage;
+  typedef integral_constant<bool,true>  true_type;
+  typedef integral_constant<bool,false> false_type;
 
-      void assignExternal(_CharT *newExternal) {
-        externalStorage = newExternal;
-      }
-    } storage;
+  template <class _Tp> struct is_const            : public false_type {};
+  template <class _Tp> struct is_const<_Tp const> : public true_type {};
 
-    typedef allocator_traits<_Alloc> __alloc_traits;
+  template <class _Tp> struct  is_reference        : public false_type {};
+  template <class _Tp> struct  is_reference<_Tp&>  : public true_type {};
 
-  public:
-    basic_string();
-
-    void push_back(int c) {
-      // Fake error trigger.
-      // No warning is expected as we are suppressing warning coming
-      // out of std::basic_string.
-      int z = 0;
-      z = 5/z;
-    }
+  template <class _Tp, class _Up> struct  is_same           : public false_type {};
+  template <class _Tp>            struct  is_same<_Tp, _Tp> : public true_type {};
+
+  template <class _Tp, bool = is_const<_Tp>::value || is_reference<_Tp>::value    >
+  struct __add_const             {typedef _Tp type;};
+
+  template <class _Tp>
+  struct __add_const<_Tp, false> {typedef const _Tp type;};
 
-    _CharT *getBuffer() {
-      return isLong ? storage.externalStorage : storage.localStorage;
+  template <class _Tp> struct add_const {typedef typename __add_const<_Tp>::type type;};
+
+  template <class _Tp> struct  remove_const            {typedef _Tp type;};
+  template <class _Tp> struct  remove_const<const _Tp> {typedef _Tp type;};
+
+  template <class _Tp> struct  add_lvalue_reference    {typedef _Tp& type;};
+
+  template <class _Tp> struct is_trivially_copy_assignable
+      : public is_trivially_assignable<typename add_lvalue_reference<_Tp>::type,
+            typename add_lvalue_reference<typename add_const<_Tp>::type>::type> {};
+
+    template<class InputIter, class OutputIter>
+    OutputIter __copy(InputIter II, InputIter IE, OutputIter OI) {
+      while (II != IE)
+        *OI++ = *II++;
+
+      return OI;
     }
 
-    basic_string &operator +=(int c) {
-      // Fake deallocate stack-based storage.
-      // No warning is expected as we are suppressing warnings within
-      // std::basic_string.
-      __alloc_traits::deallocate(getBuffer());
+  template <class _Tp, class _Up>
+  inline
+  typename enable_if
+  <
+      is_same<typename remove_const<_Tp>::type, _Up>::value &&
+      is_trivially_copy_assignable<_Up>::value,
+      _Up*
+  >::type __copy(_Tp* __first, _Tp* __last, _Up* __result) {
+      size_t __n = __last - __first;
+
+      if (__n > 0)
+        memmove(__result, __first, __n * sizeof(_Up));
+
+      return __result + __n;
     }
 
-    basic_string &operator =(const basic_string &other) {
-      // Fake deallocate stack-based storage, then use the variable in the
-      // same union.
-      // No warning is expected as we are suppressing warnings within
-      // std::basic_string.
-      __alloc_traits::deallocate(getBuffer());
-      storage.assignExternal(new _CharT[4]);
+  template<class InputIter, class OutputIter>
+  OutputIter copy(InputIter II, InputIter IE, OutputIter OI) {
+    return __copy(II, IE, OI);
+  }
+
+  template <class _BidirectionalIterator, class _OutputIterator>
+  inline
+  _OutputIterator
+  __copy_backward(_BidirectionalIterator __first, _BidirectionalIterator __last,
+                  _OutputIterator __result)
+  {
+      while (__first != __last)
+          *--__result = *--__last;
+      return __result;
+  }
+
+  template <class _Tp, class _Up>
+  inline
+  typename enable_if
+  <
+      is_same<typename remove_const<_Tp>::type, _Up>::value &&
+      is_trivially_copy_assignable<_Up>::value,
+      _Up*
+  >::type __copy_backward(_Tp* __first, _Tp* __last, _Up* __result) {
+      size_t __n = __last - __first;
+
+    if (__n > 0)
+    {
+        __result -= __n;
+        memmove(__result, __first, __n * sizeof(_Up));
     }
-  };
+    return __result;
+  }
 
-template<class _Engine, class _UIntType>
-class __independent_bits_engine {
-public:
-  // constructors and seeding functions
-  __independent_bits_engine(_Engine& __e, size_t __w);
-};
-
-template<class _Engine, class _UIntType>
-__independent_bits_engine<_Engine, _UIntType>
-    ::__independent_bits_engine(_Engine& __e, size_t __w)
-{
-  // Fake error trigger.
-  // No warning is expected as we are suppressing warning coming
-  // out of std::basic_string.
-  int z = 0;
-  z = 5/z;
-}
+  template<class InputIter, class OutputIter>
+  OutputIter copy_backward(InputIter II, InputIter IE, OutputIter OI) {
+    return __copy_backward(II, IE, OI);
+  }
+
+  struct input_iterator_tag { };
+  struct output_iterator_tag { };
+  struct forward_iterator_tag : public input_iterator_tag { };
+  struct bidirectional_iterator_tag : public forward_iterator_tag { };
+  struct random_access_iterator_tag : public bidirectional_iterator_tag { };
 
 }
 
diff --git a/test/Analysis/Inputs/system-header-simulator-for-nullability.h b/test/Analysis/Inputs/system-header-simulator-for-nullability.h
new file mode 100644
index 0000000000000..8d49f323bc1d6
--- /dev/null
+++ b/test/Analysis/Inputs/system-header-simulator-for-nullability.h
@@ -0,0 +1,43 @@
+#pragma clang system_header
+
+#define nil 0
+#define BOOL int
+
+#define NS_ASSUME_NONNULL_BEGIN _Pragma("clang assume_nonnull begin")
+#define NS_ASSUME_NONNULL_END   _Pragma("clang assume_nonnull end")
+
+NS_ASSUME_NONNULL_BEGIN
+
+typedef struct _NSZone NSZone;
+
+@protocol NSObject
++ (instancetype)alloc;
+- (instancetype)init;
+- (instancetype)autorelease;
+@end
+
+@protocol NSCopying
+- (id)copyWithZone:(nullable NSZone *)zone;
+@end
+
+@protocol NSMutableCopying
+- (id)mutableCopyWithZone:(nullable NSZone *)zone;
+@end
+
+__attribute__((objc_root_class))
+@interface
+NSObject<NSObject>
+@end
+
+@interface NSString : NSObject<NSCopying>
+- (BOOL)isEqualToString : (NSString *)aString;
+- (NSString *)stringByAppendingString:(NSString *)aString;
+@end
+
+void NSSystemFunctionTakingNonnull(NSString *s);
+
+@interface NSSystemClass : NSObject
+- (void) takesNonnull:(NSString *)s;
+@end
+
+NS_ASSUME_NONNULL_END
diff --git a/test/Analysis/Inputs/system-header-simulator-for-objc-dealloc.h b/test/Analysis/Inputs/system-header-simulator-for-objc-dealloc.h
new file mode 100644
index 0000000000000..231c0bf5640ec
--- /dev/null
+++ b/test/Analysis/Inputs/system-header-simulator-for-objc-dealloc.h
@@ -0,0 +1,35 @@
+#pragma clang system_header
+
+#define nil ((id)0)
+
+typedef signed char BOOL;
+@protocol NSObject
+- (BOOL)isEqual:(id)object;
+- (Class)class;
+@end
+
+@interface NSObject <NSObject> {}
++ (instancetype)alloc;
+- (void)dealloc;
+- (id)init;
+- (id)retain;
+- (oneway void)release;
+@end
+
+@interface NSRunLoop : NSObject
++ (NSRunLoop *)currentRunLoop;
+- (void)cancelPerformSelectorsWithTarget:(id)target;
+@end
+
+@interface NSNotificationCenter : NSObject
++ (NSNotificationCenter *)defaultCenter;
+- (void)removeObserver:(id)observer;
+@end
+
+typedef struct objc_selector *SEL;
+
+void _Block_release(const void *aBlock);
+#define Block_release(...) _Block_release((const void *)(__VA_ARGS__))
+
+@interface CIFilter : NSObject
+@end
diff --git a/test/Analysis/MPIMock.h b/test/Analysis/MPIMock.h
new file mode 100644
index 0000000000000..01d2d42fc58a3
--- /dev/null
+++ b/test/Analysis/MPIMock.h
@@ -0,0 +1,55 @@
+// Message Passing Interface mock header. Mocks MPI constants and functions, in
+// order to make them available in distinct integration test files.
+
+#define NULL 0
+
+// mock types
+typedef int MPI_Datatype;
+typedef int MPI_Comm;
+typedef int MPI_Request;
+typedef int MPI_Status;
+typedef int MPI_Op;
+typedef int int8_t;
+typedef int uint8_t;
+typedef int uint16_t;
+typedef int int64_t;
+namespace std { template<class T> struct complex { T real; T imag; }; }
+
+// mock constants
+#define MPI_DATATYPE_NULL 0
+#define MPI_CHAR 0
+#define MPI_BYTE 0
+#define MPI_INT 0
+#define MPI_LONG 0
+#define MPI_LONG_DOUBLE 0
+#define MPI_UNSIGNED 0
+#define MPI_INT8_T 0
+#define MPI_UINT8_T 0
+#define MPI_UINT16_T 0
+#define MPI_C_LONG_DOUBLE_COMPLEX 0
+#define MPI_FLOAT 0
+#define MPI_DOUBLE 0
+#define MPI_CXX_BOOL 0
+#define MPI_CXX_FLOAT_COMPLEX 0
+#define MPI_CXX_DOUBLE_COMPLEX 0
+#define MPI_CXX_LONG_DOUBLE_COMPLEX 0
+#define MPI_IN_PLACE 0
+#define MPI_COMM_WORLD 0
+#define MPI_STATUS_IGNORE 0
+#define MPI_STATUSES_IGNORE 0
+#define MPI_SUM 0
+
+// mock functions
+int MPI_Comm_size(MPI_Comm, int *);
+int MPI_Comm_rank(MPI_Comm, int *);
+int MPI_Send(const void *, int, MPI_Datatype, int, int, MPI_Comm);
+int MPI_Recv(void *, int, MPI_Datatype, int, int, MPI_Comm, MPI_Status *);
+int MPI_Isend(const void *, int, MPI_Datatype, int, int, MPI_Comm,
+    MPI_Request *);
+int MPI_Irecv(void *, int, MPI_Datatype, int, int, MPI_Comm, MPI_Request *);
+int MPI_Wait(MPI_Request *, MPI_Status *);
+int MPI_Waitall(int, MPI_Request[], MPI_Status[]);
+int MPI_Reduce(const void *, void *, int, MPI_Datatype, MPI_Op, int, MPI_Comm);
+int MPI_Ireduce(const void *, void *, int, MPI_Datatype, MPI_Op, int, MPI_Comm,
+    MPI_Request *);
+int MPI_Bcast(void *, int count, MPI_Datatype, int, MPI_Comm);
diff --git a/test/Analysis/MemRegion.cpp b/test/Analysis/MemRegion.cpp
new file mode 100644
index 0000000000000..992b7f1f97673
--- /dev/null
+++ b/test/Analysis/MemRegion.cpp
@@ -0,0 +1,47 @@
+// RUN: %clang_cc1 -analyze -analyzer-checker=optin.mpi.MPI-Checker -verify %s
+
+#include "MPIMock.h"
+
+// Use MPI-Checker to test 'getDescriptiveName', as the checker uses the
+// function for diagnostics.
+void testGetDescriptiveName() {
+  int rank = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  MPI_Request sendReq1;
+  MPI_Wait(&sendReq1, MPI_STATUS_IGNORE); // expected-warning{{Request 'sendReq1' has no matching nonblocking call.}}
+}
+
+void testGetDescriptiveName2() {
+  int rank = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  MPI_Request sendReq1[10][10][10];
+  MPI_Wait(&sendReq1[1][7][9], MPI_STATUS_IGNORE); // expected-warning{{Request 'sendReq1[1][7][9]' has no matching nonblocking call.}}
+}
+
+void testGetDescriptiveName3() {
+  int rank = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  typedef struct { MPI_Request req; } ReqStruct;
+  ReqStruct rs;
+  MPI_Request *r = &rs.req;
+  MPI_Wait(r, MPI_STATUS_IGNORE); // expected-warning{{Request 'rs.req' has no matching nonblocking call.}}
+}
+
+void testGetDescriptiveName4() {
+  int rank = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  typedef struct { MPI_Request req[2][2]; } ReqStruct;
+  ReqStruct rs;
+  MPI_Request *r = &rs.req[0][1];
+  MPI_Wait(r, MPI_STATUS_IGNORE); // expected-warning{{Request 'rs.req[0][1]' has no matching nonblocking call.}}
+}
+
+void testGetDescriptiveName5() {
+  int rank = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  typedef struct { MPI_Request req; } ReqStructInner;
+  typedef struct  { ReqStructInner req; } ReqStruct;
+  ReqStruct rs;
+  MPI_Request *r = &rs.req.req;
+  MPI_Wait(r, MPI_STATUS_IGNORE); // expected-warning{{Request 'rs.req.req' has no matching nonblocking call.}}
+}
diff --git a/test/Analysis/MissingDealloc.m b/test/Analysis/MissingDealloc.m
index b465959791b07..248dc514c8763 100644
--- a/test/Analysis/MissingDealloc.m
+++ b/test/Analysis/MissingDealloc.m
@@ -1,5 +1,13 @@
-// RUN: %clang_cc1 -analyze -analyzer-checker=alpha.osx.cocoa.Dealloc %s -verify
-// expected-no-diagnostics
+// RUN: %clang_cc1 -analyze -analyzer-checker=osx.cocoa.Dealloc -fblocks -verify %s
+// RUN: %clang_cc1 -analyze -analyzer-checker=osx.cocoa.Dealloc -fblocks -verify -triple x86_64-apple-darwin10 -fobjc-arc %s
+
+#define NON_ARC !__has_feature(objc_arc)
+
+// No diagnostics expected under ARC.
+#if !NON_ARC
+  // expected-no-diagnostics
+#endif
+
 typedef signed char BOOL;
 @protocol NSObject
 - (BOOL)isEqual:(id)object;
@@ -13,23 +21,93 @@ typedef signed char BOOL;
 
 typedef struct objc_selector *SEL;
 
-// <rdar://problem/6380411>: 'myproperty' has kind 'assign' and thus the
-//  assignment through the setter does not perform a release.
+//===------------------------------------------------------------------------===
+// Do not warn about missing -dealloc method.  Not enough context to know
+// whether the ivar is retained or not.
 
-@interface MyObject : NSObject {
-  id _myproperty;  
+@interface MissingDeallocWithIvar : NSObject {
+  NSObject *_ivar;
 }
-@property(assign) id myproperty;
 @end
 
-@implementation MyObject
-@synthesize myproperty=_myproperty; // no-warning
-- (void)dealloc {
-  self.myproperty = 0;
-  [super dealloc]; 
+@implementation MissingDeallocWithIvar
+@end
+
+//===------------------------------------------------------------------------===
+// Do not warn about missing -dealloc method.  These properties are not
+// retained or synthesized.
+
+@interface MissingDeallocWithIntProperty : NSObject
+@property (assign) int ivar;
+@end
+
+@implementation MissingDeallocWithIntProperty
+@end
+
+@interface MissingDeallocWithSELProperty : NSObject
+@property (assign) SEL ivar;
+@end
+
+@implementation MissingDeallocWithSELProperty
+@end
+
+//===------------------------------------------------------------------------===
+// Warn about missing -dealloc method.
+
+@interface MissingDeallocWithCopyProperty : NSObject
+@property (copy) NSObject *ivar;
+@end
+
+#if NON_ARC
+// expected-warning@+2{{'MissingDeallocWithCopyProperty' lacks a 'dealloc' instance method but must release '_ivar'}}
+#endif
+@implementation MissingDeallocWithCopyProperty
+@end
+
+@interface MissingDeallocWithRetainProperty : NSObject
+@property (retain) NSObject *ivar;
+@end
+
+#if NON_ARC
+// expected-warning@+2{{'MissingDeallocWithRetainProperty' lacks a 'dealloc' instance method but must release '_ivar'}}
+#endif
+@implementation MissingDeallocWithRetainProperty
+@end
+
+@interface MissingDeallocWithMultipleProperties : NSObject
+@property (retain) NSObject *ivar1;
+@property (retain) NSObject *ivar2;
+@end
+
+#if NON_ARC
+// expected-warning@+2{{'MissingDeallocWithMultipleProperties' lacks a 'dealloc' instance method but must release '_ivar1' and others}}
+#endif
+@implementation MissingDeallocWithMultipleProperties
+@end
+
+@interface MissingDeallocWithIVarAndRetainProperty : NSObject {
+  NSObject *_ivar2;
 }
+@property (retain) NSObject *ivar1;
+@end
+
+#if NON_ARC
+// expected-warning@+2{{'MissingDeallocWithIVarAndRetainProperty' lacks a 'dealloc' instance method but must release '_ivar1'}}
+#endif
+@implementation MissingDeallocWithIVarAndRetainProperty
+@end
+
+@interface MissingDeallocWithReadOnlyRetainedProperty : NSObject
+@property (readonly,retain) NSObject *ivar;
 @end
 
+#if NON_ARC
+// expected-warning@+2{{'MissingDeallocWithReadOnlyRetainedProperty' lacks a 'dealloc' instance method but must release '_ivar'}}
+#endif
+@implementation MissingDeallocWithReadOnlyRetainedProperty
+@end
+
+
 //===------------------------------------------------------------------------===
 //  Don't warn about iVars that are selectors.
 
@@ -65,27 +143,6 @@ IBOutlet NSWindow *window;
 @end
 
 //===------------------------------------------------------------------------===
-// <rdar://problem/6380411>
-// Was bogus warning: "The '_myproperty' instance variable was not retained by a
-//  synthesized property but was released in 'dealloc'"
-
-@interface MyObject_rdar6380411 : NSObject {
-    id _myproperty;
-}
-@property(assign) id myproperty;
-@end
-
-@implementation MyObject_rdar6380411
-@synthesize myproperty=_myproperty;
-- (void)dealloc {
-    // Don't claim that myproperty is released since it the property
-    // has the 'assign' attribute.
-    self.myproperty = 0; // no-warning
-    [super dealloc];
-}
-@end
-
-//===------------------------------------------------------------------------===
 // PR 3187: http://llvm.org/bugs/show_bug.cgi?id=3187
 // - Disable the missing -dealloc check for classes that subclass SenTestCase
 
@@ -97,6 +154,9 @@ IBOutlet NSWindow *window;
 @interface MyClassTest : SenTestCase {
   NSString *resourcePath;
 }
+
+@property (retain) NSObject *ivar;
+
 @end
 
 @interface NSBundle : NSObject {}
@@ -112,3 +172,15 @@ IBOutlet NSWindow *window;
   // do something which uses resourcepath
 }
 @end
+
+//===------------------------------------------------------------------------===
+// Don't warn for clases that aren't subclasses of NSObject
+
+__attribute__((objc_root_class))
+@interface NonNSObjectMissingDealloc
+@property (retain) NSObject *ivar;
+@end
+@implementation NonNSObjectMissingDealloc
+@end
+
+// CHECK: 4 warnings generated.
diff --git a/test/Analysis/NSString.m b/test/Analysis/NSString.m
index e3900334831f5..799f813022ebd 100644
--- a/test/Analysis/NSString.m
+++ b/test/Analysis/NSString.m
@@ -1,7 +1,7 @@
 // RUN: %clang_cc1 -triple i386-apple-darwin10 -analyze -analyzer-checker=core,osx.cocoa.NilArg,osx.cocoa.RetainCount,alpha.core -analyzer-store=region -analyzer-constraints=range -verify -Wno-objc-root-class %s
 // RUN: %clang_cc1 -triple i386-apple-darwin10 -analyze -analyzer-checker=core,osx.cocoa.NilArg,osx.cocoa.RetainCount,alpha.core -analyzer-store=region -analyzer-constraints=range -analyzer-config mode=shallow -verify -Wno-objc-root-class %s
 // RUN: %clang_cc1 -DTEST_64 -triple x86_64-apple-darwin10 -analyze -analyzer-checker=core,osx.cocoa.NilArg,osx.cocoa.RetainCount,alpha.core -analyzer-store=region -analyzer-constraints=range -verify -Wno-objc-root-class %s
-
+// RUN: %clang_cc1 -DOSATOMIC_USE_INLINED -triple i386-apple-darwin10 -analyze -analyzer-checker=core,osx.cocoa.NilArg,osx.cocoa.RetainCount,alpha.core -analyzer-store=region -analyzer-constraints=range -verify -Wno-objc-root-class %s
 
 //===----------------------------------------------------------------------===//
 // The following code is reduced using delta-debugging from
@@ -279,9 +279,22 @@ id testSharedClassFromFunction() {
   return [[SharedClass alloc] _init]; // no-warning
 }
 
+#if !(defined(OSATOMIC_USE_INLINED) && OSATOMIC_USE_INLINED)
 // Test OSCompareAndSwap
 _Bool OSAtomicCompareAndSwapPtr( void *__oldValue, void *__newValue, void * volatile *__theValue );
 extern BOOL objc_atomicCompareAndSwapPtr(id predicate, id replacement, volatile id *objectLocation);
+#else
+// Test that the body farm models are still used even when a body is available.
+_Bool opaque_OSAtomicCompareAndSwapPtr( void *__oldValue, void *__newValue, void * volatile *__theValue );
+_Bool OSAtomicCompareAndSwapPtr( void *__oldValue, void *__newValue, void * volatile *__theValue ) {
+  return opaque_OSAtomicCompareAndSwapPtr(__oldValue, __newValue, __theValue);
+}
+
+extern BOOL opaque_objc_atomicCompareAndSwapPtr(id predicate, id replacement, volatile id *objectLocation);
+extern BOOL objc_atomicCompareAndSwapPtr(id predicate, id replacement, volatile id *objectLocation) {
+  return opaque_objc_atomicCompareAndSwapPtr(predicate, replacement, objectLocation);
+}
+#endif
 
 void testOSCompareAndSwap() {
   NSString *old = 0;
diff --git a/test/Analysis/PR24184.cpp b/test/Analysis/PR24184.cpp
index db0df6f9c39fd..54eae563e7dad 100644
--- a/test/Analysis/PR24184.cpp
+++ b/test/Analysis/PR24184.cpp
@@ -12,7 +12,7 @@ int a;
 typedef int *vcreate_t(int *, DATA_TYPE, int, int);
 void fn1(unsigned, unsigned) {
   char b = 0;
-  for (; 1; a++, &b + a * 0) // expected-warning{{Pointer arithmetic done on non-array variables means reliance on memory layout, which is dangerous}}
+  for (; 1; a++, &b + a * 0)
     ;
 }
 
@@ -55,7 +55,7 @@ void fn1_1(void *p1, const void *p2) { p1 != p2; }
 void fn2_1(uint32_t *p1, unsigned char *p2, uint32_t p3) {
   unsigned i = 0;
   for (0; i < p3; i++)
-    fn1_1(p1 + i, p2 + i * 0);    // expected-warning{{Pointer arithmetic done on non-array variables means reliance on memory layout, which is dangerous}}
+    fn1_1(p1 + i, p2 + i * 0);
 }
 
 struct A_1 {
diff --git a/test/Analysis/PR2978.m b/test/Analysis/PR2978.m
index 8f76120ccbc62..b609da5aac24b 100644
--- a/test/Analysis/PR2978.m
+++ b/test/Analysis/PR2978.m
@@ -1,11 +1,11 @@
-// RUN: %clang_cc1 -analyze -analyzer-checker=core,alpha.core -analyzer-checker=alpha.osx.cocoa.Dealloc %s -verify
+// RUN: %clang_cc1 -analyze -analyzer-checker=core,alpha.core,osx.cocoa.Dealloc %s -verify
 
 // Tests for the checker which checks missing/extra ivar 'release' calls 
 // in dealloc.
 
 @interface NSObject
 - (void)release;
-- dealloc;
+- (void)dealloc;
 @end
 
 @interface MyClass : NSObject {
@@ -14,48 +14,102 @@
   id _Y;
   id _Z;
   id _K;
+  id _L;
   id _N;
   id _M;
+  id _P;
+  id _Q;
+  id _R;
+  id _S;
   id _V;
   id _W;
+
+  MyClass *_other;
+
+  id _nonPropertyIvar;
 }
 @property(retain) id X;
 @property(retain) id Y;
 @property(assign) id Z;
 @property(assign) id K;
+@property(weak) id L;
 @property(readonly) id N;
 @property(retain) id M;
+@property(weak) id P;
+@property(weak) id Q;
+@property(retain) id R;
+@property(weak, readonly) id S;
+
+@property(assign, readonly) id T; // Shadowed in class extension
+@property(assign) id U;
+
 @property(retain) id V;
 @property(retain) id W;
 -(id) O;
 -(void) setO: (id) arg;
 @end
 
+@interface MyClass ()
+// Shadows T to make it readwrite internally but readonly externally.
+@property(assign, readwrite) id T;
+@end
+
 @implementation MyClass
 @synthesize X = _X;
-@synthesize Y = _Y; // expected-warning{{The '_Y' instance variable was retained by a synthesized property but wasn't released in 'dealloc'}}
-@synthesize Z = _Z; // expected-warning{{The '_Z' instance variable was not retained by a synthesized property but was released in 'dealloc'}}
+@synthesize Y = _Y;
+@synthesize Z = _Z;
 @synthesize K = _K;
+@synthesize L = _L;
 @synthesize N = _N;
 @synthesize M = _M;
+@synthesize Q = _Q;
+@synthesize R = _R;
 @synthesize V = _V;
-@synthesize W = _W; // expected-warning{{The '_W' instance variable was retained by a synthesized property but wasn't released in 'dealloc'}}
+@synthesize W = _W;
 
 -(id) O{ return 0; }
 -(void) setO:(id)arg { }
 
-- (id)dealloc
+
+-(void) releaseInHelper {
+  [_R release]; // no-warning
+  _R = @"Hi";
+}
+
+- (void)dealloc
 {
+
   [_X release];
-  [_Z release];
+  [_Z release]; // expected-warning{{The '_Z' ivar in 'MyClass' was synthesized for an assign, readwrite property but was released in 'dealloc'}}
+  [_T release]; // no-warning
+
+  [_other->_Z release]; // no-warning
   [_N release];
-  
+
   self.M = 0; // This will release '_M'
   [self setV:0]; // This will release '_V'
   [self setW:@"newW"]; // This will release '_W', but retain the new value
-  self.O = 0; // no-warning  
+
+  [_S release]; // expected-warning {{The '_S' ivar in 'MyClass' was synthesized for a weak property but was released in 'dealloc'}}
+
+  self.O = 0; // no-warning
+
+  [_Q release]; // expected-warning {{The '_Q' ivar in 'MyClass' was synthesized for a weak property but was released in 'dealloc'}}
+
+  self.P = 0;
+
+  [self releaseInHelper];
+
+  [_nonPropertyIvar release]; // no-warning
+
+  // Silly, but not an error.
+  if (!_U)
+    [_U release];
+
   [super dealloc];
-  return 0;
+  // expected-warning@-1{{The '_Y' ivar in 'MyClass' was retained by a synthesized property but not released before '[super dealloc]'}}
+  // expected-warning@-2{{The '_W' ivar in 'MyClass' was retained by a synthesized property but not released before '[super dealloc]'}}
+
 }
 
 @end
diff --git a/test/Analysis/analyze_display_progress.c b/test/Analysis/analyze_display_progress.c
deleted file mode 100644
index 958ed009ff10c..0000000000000
--- a/test/Analysis/analyze_display_progress.c
+++ /dev/null
@@ -1,9 +0,0 @@
-// RUN: %clang_cc1 -analyze -analyzer-display-progress %s 2>&1 | FileCheck %s
-
-void f() {};
-void g() {};
-void h() {}
-
-// CHECK: analyze_display_progress.c f
-// CHECK: analyze_display_progress.c g
-// CHECK: analyze_display_progress.c h
-\ No newline at end of file
diff --git a/test/Analysis/analyze_display_progress.cpp b/test/Analysis/analyze_display_progress.cpp
new file mode 100644
index 0000000000000..c84ab63de4820
--- /dev/null
+++ b/test/Analysis/analyze_display_progress.cpp
@@ -0,0 +1,26 @@
+// RUN: %clang_cc1 -analyze -analyzer-display-progress %s 2>&1 | FileCheck %s
+
+void f() {};
+void g() {};
+void h() {}
+
+struct SomeStruct {
+  void f() {}
+};
+
+struct SomeOtherStruct {
+  void f() {}
+};
+
+namespace ns {
+  struct SomeStruct {
+    void f() {}
+  };
+}
+
+// CHECK: analyze_display_progress.cpp f
+// CHECK: analyze_display_progress.cpp g
+// CHECK: analyze_display_progress.cpp h
+// CHECK: analyze_display_progress.cpp SomeStruct::f
+// CHECK: analyze_display_progress.cpp SomeOtherStruct::f
+// CHECK: analyze_display_progress.cpp ns::SomeStruct::f
diff --git a/test/Analysis/atomics.c b/test/Analysis/atomics.c
new file mode 100644
index 0000000000000..f0f5ff0768421
--- /dev/null
+++ b/test/Analysis/atomics.c
@@ -0,0 +1,95 @@
+// RUN: %clang_cc1 -analyze -analyzer-checker=core,debug.ExprInspection -verify %s
+
+// Tests for c11 atomics. Many of these tests currently yield unknown
+// because we don't fully model the atomics and instead imprecisely
+// treat their arguments as escaping.
+
+typedef unsigned int uint32_t;
+typedef enum memory_order {
+  memory_order_relaxed = __ATOMIC_RELAXED,
+  memory_order_consume = __ATOMIC_CONSUME,
+  memory_order_acquire = __ATOMIC_ACQUIRE,
+  memory_order_release = __ATOMIC_RELEASE,
+  memory_order_acq_rel = __ATOMIC_ACQ_REL,
+  memory_order_seq_cst = __ATOMIC_SEQ_CST
+} memory_order;
+
+void clang_analyzer_eval(int);
+
+struct RefCountedStruct {
+  uint32_t refCount;
+  void *ptr;
+};
+
+void test_atomic_fetch_add(struct RefCountedStruct *s) {
+  s->refCount = 1;
+
+  uint32_t result = __c11_atomic_fetch_add((volatile _Atomic(uint32_t) *)&s->refCount,- 1, memory_order_relaxed);
+
+  // When we model atomics fully this should (probably) be FALSE. It should never
+  // be TRUE (because the operation mutates the passed in storage).
+  clang_analyzer_eval(s->refCount == 1); // expected-warning {{UNKNOWN}}
+
+  // When fully modeled this should be TRUE
+  clang_analyzer_eval(result == 1); // expected-warning {{UNKNOWN}}
+}
+
+void test_atomic_load(struct RefCountedStruct *s) {
+  s->refCount = 1;
+
+  uint32_t result = __c11_atomic_load((volatile _Atomic(uint32_t) *)&s->refCount, memory_order_relaxed);
+
+  // When we model atomics fully this should (probably) be TRUE.
+  clang_analyzer_eval(s->refCount == 1); // expected-warning {{UNKNOWN}}
+
+  // When fully modeled this should be TRUE
+  clang_analyzer_eval(result == 1); // expected-warning {{UNKNOWN}}
+}
+
+void test_atomic_store(struct RefCountedStruct *s) {
+  s->refCount = 1;
+
+  __c11_atomic_store((volatile _Atomic(uint32_t) *)&s->refCount, 2, memory_order_relaxed);
+
+  // When we model atomics fully this should (probably) be FALSE. It should never
+  // be TRUE (because the operation mutates the passed in storage).
+  clang_analyzer_eval(s->refCount == 1); // expected-warning {{UNKNOWN}}
+}
+
+void test_atomic_exchange(struct RefCountedStruct *s) {
+  s->refCount = 1;
+
+  uint32_t result = __c11_atomic_exchange((volatile _Atomic(uint32_t) *)&s->refCount, 2, memory_order_relaxed);
+
+  // When we model atomics fully this should (probably) be FALSE. It should never
+  // be TRUE (because the operation mutates the passed in storage).
+  clang_analyzer_eval(s->refCount == 1); // expected-warning {{UNKNOWN}}
+
+  // When fully modeled this should be TRUE
+  clang_analyzer_eval(result == 1); // expected-warning {{UNKNOWN}}
+}
+
+
+void test_atomic_compare_exchange_strong(struct RefCountedStruct *s) {
+  s->refCount = 1;
+  uint32_t expected = 2;
+  uint32_t desired = 3;
+  _Bool result = __c11_atomic_compare_exchange_strong((volatile _Atomic(uint32_t) *)&s->refCount, &expected, desired, memory_order_relaxed, memory_order_relaxed);
+
+  // For now we expect both expected and refCount to be invalidated by the
+  // call. In the future we should model more precisely.
+  clang_analyzer_eval(s->refCount == 3); // expected-warning {{UNKNOWN}}
+  clang_analyzer_eval(expected == 2); // expected-warning {{UNKNOWN}}
+}
+
+void test_atomic_compare_exchange_weak(struct RefCountedStruct *s) {
+  s->refCount = 1;
+  uint32_t expected = 2;
+  uint32_t desired = 3;
+  _Bool result = __c11_atomic_compare_exchange_weak((volatile _Atomic(uint32_t) *)&s->refCount, &expected, desired, memory_order_relaxed, memory_order_relaxed);
+
+  // For now we expect both expected and refCount to be invalidated by the
+  // call. In the future we should model more precisely.
+  clang_analyzer_eval(s->refCount == 3); // expected-warning {{UNKNOWN}}
+  clang_analyzer_eval(expected == 2); // expected-warning {{UNKNOWN}}
+}
diff --git a/test/Analysis/blocks.m b/test/Analysis/blocks.m
index 4dbe951720738..0b1c15abb3b3e 100644
--- a/test/Analysis/blocks.m
+++ b/test/Analysis/blocks.m
@@ -210,3 +210,25 @@ void testCallContainingWithSignature5()
   });
 }
 
+__attribute__((objc_root_class))
+@interface SuperClass
+- (void)someMethod;
+@end
+
+@interface SomeClass : SuperClass
+@end
+
+// Make sure to properly handle super-calls when a block captures
+// a local variable named 'self'.
+@implementation SomeClass
+-(void)foo; {
+  /*__weak*/ SomeClass *weakSelf = self;
+  (void)(^(void) {
+    SomeClass *self = weakSelf;
+    (void)(^(void) {
+      (void)self;
+      [super someMethod]; // no-warning
+    });
+  });
+}
+@end
diff --git a/test/Analysis/bool-assignment.c b/test/Analysis/bool-assignment.c
index 0f782fbfd9a96..285569ee11bb4 100644
--- a/test/Analysis/bool-assignment.c
+++ b/test/Analysis/bool-assignment.c
@@ -42,6 +42,15 @@ void test_BOOL_initialization(int y) {
     BOOL x = y; // expected-warning {{Assignment of a non-Boolean value}}
     return;
   }
+  if (y > 200 && y < 250) {
+    // FIXME: Currently we are loosing this warning due to a SymbolCast in RHS.
+    BOOL x = y; // no-warning
+    return;
+  }
+  if (y >= 127 && y < 150) {
+    BOOL x = y; // expected-warning{{Assignment of a non-Boolean value}}
+    return;
+  }
   if (y > 1) {
     BOOL x = y; // expected-warning {{Assignment of a non-Boolean value}}
     return;
diff --git a/test/Analysis/bstring.cpp b/test/Analysis/bstring.cpp
new file mode 100644
index 0000000000000..0b4e7e94f00b5
--- /dev/null
+++ b/test/Analysis/bstring.cpp
@@ -0,0 +1,38 @@
+// RUN: %clang_cc1 -analyze -analyzer-checker=core,unix.cstring,alpha.unix.cstring,debug.ExprInspection -analyzer-store=region -verify %s
+
+#include "Inputs/system-header-simulator-cxx.h"
+#include "Inputs/system-header-simulator-for-malloc.h"
+
+void clang_analyzer_eval(int);
+
+int *testStdCopyInvalidatesBuffer(std::vector<int> v) {
+  int n = v.size();
+  int *buf = (int *)malloc(n * sizeof(int));
+
+  buf[0] = 66;
+
+  // Call to copy should invalidate buf.
+  std::copy(v.begin(), v.end(), buf);
+
+  int i = buf[0];
+
+  clang_analyzer_eval(i == 66); // expected-warning {{UNKNOWN}}
+
+  return buf;
+}
+
+int *testStdCopyBackwardInvalidatesBuffer(std::vector<int> v) {
+  int n = v.size();
+  int *buf = (int *)malloc(n * sizeof(int));
+  
+  buf[0] = 66;
+
+  // Call to copy_backward should invalidate buf.
+  std::copy_backward(v.begin(), v.end(), buf + n);
+
+  int i = buf[0];
+
+  clang_analyzer_eval(i == 66); // expected-warning {{UNKNOWN}}
+
+  return buf;
+}
diff --git a/test/Analysis/call-invalidation.cpp b/test/Analysis/call-invalidation.cpp
index 7297d1ebec228..80323ffcf18e3 100644
--- a/test/Analysis/call-invalidation.cpp
+++ b/test/Analysis/call-invalidation.cpp
@@ -118,3 +118,50 @@ void testPureConst() {
 }
 
 
+struct PlainStruct {
+  int x, y;
+  mutable int z;
+};
+
+PlainStruct glob;
+
+void useAnything(void *);
+void useAnythingConst(const void *);
+
+void testInvalidationThroughBaseRegionPointer() {
+  PlainStruct s1;
+  s1.x = 1;
+  s1.z = 1;
+  clang_analyzer_eval(s1.x == 1); // expected-warning{{TRUE}}
+  clang_analyzer_eval(s1.z == 1); // expected-warning{{TRUE}}
+  // Not only passing a structure pointer through const pointer parameter,
+  // but also passing a field pointer through const pointer parameter
+  // should preserve the contents of the structure.
+  useAnythingConst(&(s1.y));
+  clang_analyzer_eval(s1.x == 1); // expected-warning{{TRUE}}
+  // FIXME: Should say "UNKNOWN", because it is not uncommon to
+  // modify a mutable member variable through const pointer.
+  clang_analyzer_eval(s1.z == 1); // expected-warning{{TRUE}}
+  useAnything(&(s1.y));
+  clang_analyzer_eval(s1.x == 1); // expected-warning{{UNKNOWN}}
+}
+
+
+void useFirstConstSecondNonConst(const void *x, void *y);
+void useFirstNonConstSecondConst(void *x, const void *y);
+
+void testMixedConstNonConstCalls() {
+  PlainStruct s2;
+  s2.x = 1;
+  useFirstConstSecondNonConst(&(s2.x), &(s2.y));
+  clang_analyzer_eval(s2.x == 1); // expected-warning{{UNKNOWN}}
+  s2.x = 1;
+  useFirstNonConstSecondConst(&(s2.x), &(s2.y));
+  clang_analyzer_eval(s2.x == 1); // expected-warning{{UNKNOWN}}
+  s2.y = 1;
+  useFirstConstSecondNonConst(&(s2.x), &(s2.y));
+  clang_analyzer_eval(s2.y == 1); // expected-warning{{UNKNOWN}}
+  s2.y = 1;
+  useFirstNonConstSecondConst(&(s2.x), &(s2.y));
+  clang_analyzer_eval(s2.y == 1); // expected-warning{{UNKNOWN}}
+}
diff --git a/test/Analysis/cxx11-crashes.cpp b/test/Analysis/cxx11-crashes.cpp
index 3c33de33558f9..c6034e6480ee7 100644
--- a/test/Analysis/cxx11-crashes.cpp
+++ b/test/Analysis/cxx11-crashes.cpp
@@ -1,5 +1,4 @@
 // RUN: %clang_cc1 -analyze -analyzer-checker=core -std=c++11 -verify %s
-// expected-no-diagnostics
 
 // radar://11485149, PR12871
 class PlotPoint {
@@ -91,6 +90,6 @@ void test() {
 void fallthrough() {
   switch (1) {
     case 1:
-      [[clang::fallthrough]];
+      [[clang::fallthrough]]; // expected-error {{does not directly precede}}
   }
 }
diff --git a/test/Analysis/dead-stores.c b/test/Analysis/dead-stores.c
index da8e8bdb70344..cddb6c666ad5b 100644
--- a/test/Analysis/dead-stores.c
+++ b/test/Analysis/dead-stores.c
@@ -569,3 +569,7 @@ void testBOComma() {
 
 }
 
+void testVolatile() {
+    volatile int v;
+    v = 0; // no warning
+}
diff --git a/test/Analysis/diagnostics/explicit-suppression.cpp b/test/Analysis/diagnostics/explicit-suppression.cpp
index 78067573e3220..67a47d02b6e95 100644
--- a/test/Analysis/diagnostics/explicit-suppression.cpp
+++ b/test/Analysis/diagnostics/explicit-suppression.cpp
@@ -9,9 +9,15 @@
 
 void clang_analyzer_eval(bool);
 
-void testCopyNull(int *I, int *E) {
-  std::copy(I, E, (int *)0);
+class C {
+  // The virtual function is to make C not trivially copy assignable so that we call the
+  // variant of std::copy() that does not defer to memmove().
+  virtual int f();
+};
+
+void testCopyNull(C *I, C *E) {
+  std::copy(I, E, (C *)0);
 #ifndef SUPPRESSED
-  // expected-warning@../Inputs/system-header-simulator-cxx.h:110 {{Dereference of null pointer}}
+  // expected-warning@../Inputs/system-header-simulator-cxx.h:166 {{Called C++ object pointer is null}}
 #endif
 }
diff --git a/test/Analysis/diagnostics/implicit-cxx-std-suppression.cpp b/test/Analysis/diagnostics/implicit-cxx-std-suppression.cpp
new file mode 100644
index 0000000000000..143cbbeeac204
--- /dev/null
+++ b/test/Analysis/diagnostics/implicit-cxx-std-suppression.cpp
@@ -0,0 +1,37 @@
+// RUN: %clang_cc1 -analyze -analyzer-checker=core,unix.Malloc,cplusplus.NewDelete,debug.ExprInspection -analyzer-config c++-container-inlining=true -analyzer-config c++-stdlib-inlining=false -std=c++11 -verify %s
+// RUN: %clang_cc1 -analyze -analyzer-checker=core,unix.Malloc,cplusplus.NewDelete,debug.ExprInspection -analyzer-config c++-container-inlining=true -analyzer-config c++-stdlib-inlining=true -std=c++11 -verify %s
+
+// expected-no-diagnostics
+
+#include "../Inputs/system-header-simulator-cxx-std-suppression.h"
+
+void testList_pop_front(std::list<int> list) {
+  while(!list.empty())
+    list.pop_front();  // no-warning
+}
+
+void testBasicStringSuppression() {
+  std::basic_string<uint8_t> v;
+  v.push_back(1); // no-warning
+}
+
+void testBasicStringSuppression_append() {
+  std::basic_string<char32_t> v;
+  v += 'c'; // no-warning
+}
+
+void testBasicStringSuppression_assign(std::basic_string<char32_t> &v,
+                                       const std::basic_string<char32_t> &v2) {
+  v = v2; // no-warning
+}
+
+class MyEngine;
+void testSuppression_independent_bits_engine(MyEngine& e) {
+  std::__independent_bits_engine<MyEngine, unsigned int> x(e, 64); // no-warning
+}
+
+void testSuppression_std_shared_pointer() {
+  std::shared_ptr<int> p(new int(1));
+
+  p = nullptr; // no-warning
+}
diff --git a/test/Analysis/explain-svals.cpp b/test/Analysis/explain-svals.cpp
new file mode 100644
index 0000000000000..c0ed74914ed75
--- /dev/null
+++ b/test/Analysis/explain-svals.cpp
@@ -0,0 +1,98 @@
+// RUN: %clang_cc1 -triple i386-apple-darwin10 -analyze -analyzer-checker=core.builtin,debug.ExprInspection,unix.cstring -verify %s
+
+typedef unsigned long size_t;
+
+struct S {
+  struct S3 {
+    int y[10];
+  };
+  struct S2 : S3 {
+    int *x;
+  } s2[10];
+  int z;
+};
+
+
+void clang_analyzer_explain(int);
+void clang_analyzer_explain(void *);
+void clang_analyzer_explain(S);
+
+size_t clang_analyzer_getExtent(void *);
+
+size_t strlen(const char *);
+
+int conjure();
+S conjure_S();
+
+int glob;
+static int stat_glob;
+void *glob_ptr;
+
+// Test strings are regex'ed because we need to match exact string
+// rather than a substring.
+
+void test_1(int param, void *ptr) {
+  clang_analyzer_explain(&glob); // expected-warning-re{{{{^pointer to global variable 'glob'$}}}}
+  clang_analyzer_explain(param); // expected-warning-re{{{{^argument 'param'$}}}}
+  clang_analyzer_explain(ptr); // expected-warning-re{{{{^argument 'ptr'$}}}}
+  if (param == 42)
+    clang_analyzer_explain(param); // expected-warning-re{{{{^signed 32-bit integer '42'$}}}}
+}
+
+void test_2(char *ptr, int ext) {
+  clang_analyzer_explain((void *) "asdf"); // expected-warning-re{{{{^pointer to element of type 'char' with index 0 of string literal "asdf"$}}}}
+  clang_analyzer_explain(strlen(ptr)); // expected-warning-re{{{{^metadata of type 'unsigned long' tied to pointee of argument 'ptr'$}}}}
+  clang_analyzer_explain(conjure()); // expected-warning-re{{{{^symbol of type 'int' conjured at statement 'conjure\(\)'$}}}}
+  clang_analyzer_explain(glob); // expected-warning-re{{{{^value derived from \(symbol of type 'int' conjured at statement 'conjure\(\)'\) for global variable 'glob'$}}}}
+  clang_analyzer_explain(glob_ptr); // expected-warning-re{{{{^value derived from \(symbol of type 'int' conjured at statement 'conjure\(\)'\) for global variable 'glob_ptr'$}}}}
+  clang_analyzer_explain(clang_analyzer_getExtent(ptr)); // expected-warning-re{{{{^extent of pointee of argument 'ptr'$}}}}
+  int *x = new int[ext];
+  clang_analyzer_explain(x); // expected-warning-re{{{{^pointer to element of type 'int' with index 0 of pointee of symbol of type 'int \*' conjured at statement 'new int \[ext\]'$}}}}
+  // Sic! What gets computed is the extent of the element-region.
+  clang_analyzer_explain(clang_analyzer_getExtent(x)); // expected-warning-re{{{{^signed 32-bit integer '4'$}}}}
+  delete[] x;
+}
+
+void test_3(S s) {
+  clang_analyzer_explain(&s); // expected-warning-re{{{{^pointer to parameter 's'$}}}}
+  clang_analyzer_explain(s.z); // expected-warning-re{{{{^initial value of field 'z' of parameter 's'$}}}}
+  clang_analyzer_explain(&s.s2[5].y[3]); // expected-warning-re{{{{^pointer to element of type 'int' with index 3 of field 'y' of base object 'S::S3' inside element of type 'struct S::S2' with index 5 of field 's2' of parameter 's'$}}}}
+  if (!s.s2[7].x) {
+    clang_analyzer_explain(s.s2[7].x); // expected-warning-re{{{{^concrete memory address '0'$}}}}
+    // FIXME: we need to be explaining '1' rather than '0' here; not explainer bug.
+    clang_analyzer_explain(s.s2[7].x + 1); // expected-warning-re{{{{^concrete memory address '0'$}}}}
+  }
+}
+
+void test_4(int x, int y) {
+  int z;
+  static int stat;
+  clang_analyzer_explain(x + 1); // expected-warning-re{{{{^\(argument 'x'\) \+ 1$}}}}
+  clang_analyzer_explain(1 + y); // expected-warning-re{{{{^\(argument 'y'\) \+ 1$}}}}
+  clang_analyzer_explain(x + y); // expected-warning-re{{{{^unknown value$}}}}
+  clang_analyzer_explain(z); // expected-warning-re{{{{^undefined value$}}}}
+  clang_analyzer_explain(&z); // expected-warning-re{{{{^pointer to local variable 'z'$}}}}
+  clang_analyzer_explain(stat); // expected-warning-re{{{{^signed 32-bit integer '0'$}}}}
+  clang_analyzer_explain(&stat); // expected-warning-re{{{{^pointer to static local variable 'stat'$}}}}
+  clang_analyzer_explain(stat_glob); // expected-warning-re{{{{^initial value of global variable 'stat_glob'$}}}}
+  clang_analyzer_explain(&stat_glob); // expected-warning-re{{{{^pointer to global variable 'stat_glob'$}}}}
+  clang_analyzer_explain((int[]){1, 2, 3}); // expected-warning-re{{{{^pointer to element of type 'int' with index 0 of compound literal \(int \[3\]\)\{1, 2, 3\}$}}}}
+}
+
+namespace {
+class C {
+  int x[10];
+
+public:
+  void test_5(int i) {
+    clang_analyzer_explain(this); // expected-warning-re{{{{^pointer to 'this' object$}}}}
+    clang_analyzer_explain(&x[i]); // expected-warning-re{{{{^pointer to element of type 'int' with index 'argument 'i'' of field 'x' of 'this' object$}}}}
+    clang_analyzer_explain(__builtin_alloca(i)); // expected-warning-re{{{{^pointer to region allocated by '__builtin_alloca\(i\)'$}}}}
+  }
+};
+} // end of anonymous namespace
+
+void test_6() {
+  clang_analyzer_explain(conjure_S()); // expected-warning-re{{{{^lazily frozen compound value of temporary object constructed at statement 'conjure_S\(\)'$}}}}
+  clang_analyzer_explain(conjure_S().z); // expected-warning-re{{{{^value derived from \(symbol of type 'struct S' conjured at statement 'conjure_S\(\)'\) for field 'z' of temporary object constructed at statement 'conjure_S\(\)'$}}}}
+}
diff --git a/test/Analysis/fields.c b/test/Analysis/fields.c
index 863a21aaf61ea..a670c50434e31 100644
--- a/test/Analysis/fields.c
+++ b/test/Analysis/fields.c
@@ -16,7 +16,7 @@ struct s {
 
 void f() {
   struct s a;
-  int *p = &(a.n) + 1;
+  int *p = &(a.n) + 1; // expected-warning{{Pointer arithmetic on}}
 }
 
 typedef struct {
diff --git a/test/Analysis/generics.m b/test/Analysis/generics.m
index b64d06935d9a7..490ac5dc8ad07 100644
--- a/test/Analysis/generics.m
+++ b/test/Analysis/generics.m
@@ -328,6 +328,21 @@ void returnToIdVariable(NSArray<NSString *> *arr) {
   NSNumber *res = a; // expected-warning {{Object has a dynamic type 'NSString *' which is incompatible with static type 'NSNumber *'}}
 }
 
+@interface UnrelatedTypeGeneric<T> : NSObject<NSCopying>
+- (void)takesType:(T)v;
+@end
+
+void testGetMostInformativeDerivedForId(NSArray<NSString *> *a,
+                                  UnrelatedTypeGeneric<NSString *> *b) {
+  id idB = b;
+  a = idB; // expected-warning {{Conversion from value of type 'UnrelatedTypeGeneric<NSString *> *' to incompatible type 'NSArray<NSString *> *'}}
+
+  // rdar://problem/26086914 crash here caused by symbolic type being unrelated
+  // to compile-time source type of cast.
+  id x = a; // Compile-time type is NSArray<>, Symbolic type is UnrelatedTypeGeneric<>.
+  [x takesType:[[NSNumber alloc] init]]; // expected-warning {{Conversion from value of type 'NSNumber *' to incompatible type 'NSString *'}}
+}
+
 // CHECK: <key>diagnostics</key>
 // CHECK-NEXT: <array>
 // CHECK-NEXT:  <dict>
@@ -6626,4 +6641,262 @@ void returnToIdVariable(NSArray<NSString *> *arr) {
 // CHECK-NEXT:   <key>file</key><integer>0</integer>
 // CHECK-NEXT:  </dict>
 // CHECK-NEXT:  </dict>
+// CHECK-NEXT:  <dict>
+// CHECK-NEXT:   <key>path</key>
+// CHECK-NEXT:   <array>
+// CHECK-NEXT:    <dict>
+// CHECK-NEXT:     <key>kind</key><string>event</string>
+// CHECK-NEXT:     <key>location</key>
+// CHECK-NEXT:     <dict>
+// CHECK-NEXT:      <key>line</key><integer>337</integer>
+// CHECK-NEXT:      <key>col</key><integer>12</integer>
+// CHECK-NEXT:      <key>file</key><integer>0</integer>
+// CHECK-NEXT:     </dict>
+// CHECK-NEXT:     <key>ranges</key>
+// CHECK-NEXT:     <array>
+// CHECK-NEXT:       <array>
+// CHECK-NEXT:        <dict>
+// CHECK-NEXT:         <key>line</key><integer>337</integer>
+// CHECK-NEXT:         <key>col</key><integer>12</integer>
+// CHECK-NEXT:         <key>file</key><integer>0</integer>
+// CHECK-NEXT:        </dict>
+// CHECK-NEXT:        <dict>
+// CHECK-NEXT:         <key>line</key><integer>337</integer>
+// CHECK-NEXT:         <key>col</key><integer>12</integer>
+// CHECK-NEXT:         <key>file</key><integer>0</integer>
+// CHECK-NEXT:        </dict>
+// CHECK-NEXT:       </array>
+// CHECK-NEXT:     </array>
+// CHECK-NEXT:     <key>depth</key><integer>0</integer>
+// CHECK-NEXT:     <key>extended_message</key>
+// CHECK-NEXT:     <string>Type &apos;UnrelatedTypeGeneric&lt;NSString *&gt; *&apos; is inferred from implicit cast (from &apos;UnrelatedTypeGeneric&lt;NSString *&gt; *&apos; to &apos;id&apos;)</string>
+// CHECK-NEXT:     <key>message</key>
+// CHECK-NEXT:     <string>Type &apos;UnrelatedTypeGeneric&lt;NSString *&gt; *&apos; is inferred from implicit cast (from &apos;UnrelatedTypeGeneric&lt;NSString *&gt; *&apos; to &apos;id&apos;)</string>
+// CHECK-NEXT:    </dict>
+// CHECK-NEXT:    <dict>
+// CHECK-NEXT:     <key>kind</key><string>control</string>
+// CHECK-NEXT:     <key>edges</key>
+// CHECK-NEXT:      <array>
+// CHECK-NEXT:       <dict>
+// CHECK-NEXT:        <key>start</key>
+// CHECK-NEXT:         <array>
+// CHECK-NEXT:          <dict>
+// CHECK-NEXT:           <key>line</key><integer>337</integer>
+// CHECK-NEXT:           <key>col</key><integer>3</integer>
+// CHECK-NEXT:           <key>file</key><integer>0</integer>
+// CHECK-NEXT:          </dict>
+// CHECK-NEXT:          <dict>
+// CHECK-NEXT:           <key>line</key><integer>337</integer>
+// CHECK-NEXT:           <key>col</key><integer>4</integer>
+// CHECK-NEXT:           <key>file</key><integer>0</integer>
+// CHECK-NEXT:          </dict>
+// CHECK-NEXT:         </array>
+// CHECK-NEXT:        <key>end</key>
+// CHECK-NEXT:         <array>
+// CHECK-NEXT:          <dict>
+// CHECK-NEXT:           <key>line</key><integer>338</integer>
+// CHECK-NEXT:           <key>col</key><integer>3</integer>
+// CHECK-NEXT:           <key>file</key><integer>0</integer>
+// CHECK-NEXT:          </dict>
+// CHECK-NEXT:          <dict>
+// CHECK-NEXT:           <key>line</key><integer>338</integer>
+// CHECK-NEXT:           <key>col</key><integer>3</integer>
+// CHECK-NEXT:           <key>file</key><integer>0</integer>
+// CHECK-NEXT:          </dict>
+// CHECK-NEXT:         </array>
+// CHECK-NEXT:       </dict>
+// CHECK-NEXT:      </array>
+// CHECK-NEXT:    </dict>
+// CHECK-NEXT:    <dict>
+// CHECK-NEXT:     <key>kind</key><string>control</string>
+// CHECK-NEXT:     <key>edges</key>
+// CHECK-NEXT:      <array>
+// CHECK-NEXT:       <dict>
+// CHECK-NEXT:        <key>start</key>
+// CHECK-NEXT:         <array>
+// CHECK-NEXT:          <dict>
+// CHECK-NEXT:           <key>line</key><integer>338</integer>
+// CHECK-NEXT:           <key>col</key><integer>3</integer>
+// CHECK-NEXT:           <key>file</key><integer>0</integer>
+// CHECK-NEXT:          </dict>
+// CHECK-NEXT:          <dict>
+// CHECK-NEXT:           <key>line</key><integer>338</integer>
+// CHECK-NEXT:           <key>col</key><integer>3</integer>
+// CHECK-NEXT:           <key>file</key><integer>0</integer>
+// CHECK-NEXT:          </dict>
+// CHECK-NEXT:         </array>
+// CHECK-NEXT:        <key>end</key>
+// CHECK-NEXT:         <array>
+// CHECK-NEXT:          <dict>
+// CHECK-NEXT:           <key>line</key><integer>338</integer>
+// CHECK-NEXT:           <key>col</key><integer>7</integer>
+// CHECK-NEXT:           <key>file</key><integer>0</integer>
+// CHECK-NEXT:          </dict>
+// CHECK-NEXT:          <dict>
+// CHECK-NEXT:           <key>line</key><integer>338</integer>
+// CHECK-NEXT:           <key>col</key><integer>9</integer>
+// CHECK-NEXT:           <key>file</key><integer>0</integer>
+// CHECK-NEXT:          </dict>
+// CHECK-NEXT:         </array>
+// CHECK-NEXT:       </dict>
+// CHECK-NEXT:      </array>
+// CHECK-NEXT:    </dict>
+// CHECK-NEXT:    <dict>
+// CHECK-NEXT:     <key>kind</key><string>event</string>
+// CHECK-NEXT:     <key>location</key>
+// CHECK-NEXT:     <dict>
+// CHECK-NEXT:      <key>line</key><integer>338</integer>
+// CHECK-NEXT:      <key>col</key><integer>7</integer>
+// CHECK-NEXT:      <key>file</key><integer>0</integer>
+// CHECK-NEXT:     </dict>
+// CHECK-NEXT:     <key>ranges</key>
+// CHECK-NEXT:     <array>
+// CHECK-NEXT:       <array>
+// CHECK-NEXT:        <dict>
+// CHECK-NEXT:         <key>line</key><integer>338</integer>
+// CHECK-NEXT:         <key>col</key><integer>7</integer>
+// CHECK-NEXT:         <key>file</key><integer>0</integer>
+// CHECK-NEXT:        </dict>
+// CHECK-NEXT:        <dict>
+// CHECK-NEXT:         <key>line</key><integer>338</integer>
+// CHECK-NEXT:         <key>col</key><integer>9</integer>
+// CHECK-NEXT:         <key>file</key><integer>0</integer>
+// CHECK-NEXT:        </dict>
+// CHECK-NEXT:       </array>
+// CHECK-NEXT:     </array>
+// CHECK-NEXT:     <key>depth</key><integer>0</integer>
+// CHECK-NEXT:     <key>extended_message</key>
+// CHECK-NEXT:     <string>Conversion from value of type &apos;UnrelatedTypeGeneric&lt;NSString *&gt; *&apos; to incompatible type &apos;NSArray&lt;NSString *&gt; *&apos;</string>
+// CHECK-NEXT:     <key>message</key>
+// CHECK-NEXT:     <string>Conversion from value of type &apos;UnrelatedTypeGeneric&lt;NSString *&gt; *&apos; to incompatible type &apos;NSArray&lt;NSString *&gt; *&apos;</string>
+// CHECK-NEXT:    </dict>
+// CHECK-NEXT:   </array>
+// CHECK-NEXT:   <key>description</key><string>Conversion from value of type &apos;UnrelatedTypeGeneric&lt;NSString *&gt; *&apos; to incompatible type &apos;NSArray&lt;NSString *&gt; *&apos;</string>
+// CHECK-NEXT:   <key>category</key><string>Core Foundation/Objective-C</string>
+// CHECK-NEXT:   <key>type</key><string>Generics</string>
+// CHECK-NEXT:   <key>check_name</key><string>core.DynamicTypePropagation</string>
+// CHECK-NEXT:   <!-- This hash is experimental and going to change! -->
+// CHECK-NEXT:   <key>issue_hash_content_of_line_in_context</key><string>8347f65fb51a85ccd462d75ffd761078</string>
+// CHECK-NEXT:  <key>issue_context_kind</key><string>function</string>
+// CHECK-NEXT:  <key>issue_context</key><string>testGetMostInformativeDerivedForId</string>
+// CHECK-NEXT:  <key>issue_hash_function_offset</key><string>2</string>
+// CHECK-NEXT:  <key>location</key>
+// CHECK-NEXT:  <dict>
+// CHECK-NEXT:   <key>line</key><integer>338</integer>
+// CHECK-NEXT:   <key>col</key><integer>7</integer>
+// CHECK-NEXT:   <key>file</key><integer>0</integer>
+// CHECK-NEXT:  </dict>
+// CHECK-NEXT:  </dict>
+// CHECK-NEXT:  <dict>
+// CHECK-NEXT:   <key>path</key>
+// CHECK-NEXT:   <array>
+// CHECK-NEXT:    <dict>
+// CHECK-NEXT:     <key>kind</key><string>event</string>
+// CHECK-NEXT:     <key>location</key>
+// CHECK-NEXT:     <dict>
+// CHECK-NEXT:      <key>line</key><integer>337</integer>
+// CHECK-NEXT:      <key>col</key><integer>12</integer>
+// CHECK-NEXT:      <key>file</key><integer>0</integer>
+// CHECK-NEXT:     </dict>
+// CHECK-NEXT:     <key>ranges</key>
+// CHECK-NEXT:     <array>
+// CHECK-NEXT:       <array>
+// CHECK-NEXT:        <dict>
+// CHECK-NEXT:         <key>line</key><integer>337</integer>
+// CHECK-NEXT:         <key>col</key><integer>12</integer>
+// CHECK-NEXT:         <key>file</key><integer>0</integer>
+// CHECK-NEXT:        </dict>
+// CHECK-NEXT:        <dict>
+// CHECK-NEXT:         <key>line</key><integer>337</integer>
+// CHECK-NEXT:         <key>col</key><integer>12</integer>
+// CHECK-NEXT:         <key>file</key><integer>0</integer>
+// CHECK-NEXT:        </dict>
+// CHECK-NEXT:       </array>
+// CHECK-NEXT:     </array>
+// CHECK-NEXT:     <key>depth</key><integer>0</integer>
+// CHECK-NEXT:     <key>extended_message</key>
+// CHECK-NEXT:     <string>Type &apos;UnrelatedTypeGeneric&lt;NSString *&gt; *&apos; is inferred from implicit cast (from &apos;UnrelatedTypeGeneric&lt;NSString *&gt; *&apos; to &apos;id&apos;)</string>
+// CHECK-NEXT:     <key>message</key>
+// CHECK-NEXT:     <string>Type &apos;UnrelatedTypeGeneric&lt;NSString *&gt; *&apos; is inferred from implicit cast (from &apos;UnrelatedTypeGeneric&lt;NSString *&gt; *&apos; to &apos;id&apos;)</string>
+// CHECK-NEXT:    </dict>
+// CHECK-NEXT:    <dict>
+// CHECK-NEXT:     <key>kind</key><string>control</string>
+// CHECK-NEXT:     <key>edges</key>
+// CHECK-NEXT:      <array>
+// CHECK-NEXT:       <dict>
+// CHECK-NEXT:        <key>start</key>
+// CHECK-NEXT:         <array>
+// CHECK-NEXT:          <dict>
+// CHECK-NEXT:           <key>line</key><integer>337</integer>
+// CHECK-NEXT:           <key>col</key><integer>3</integer>
+// CHECK-NEXT:           <key>file</key><integer>0</integer>
+// CHECK-NEXT:          </dict>
+// CHECK-NEXT:          <dict>
+// CHECK-NEXT:           <key>line</key><integer>337</integer>
+// CHECK-NEXT:           <key>col</key><integer>4</integer>
+// CHECK-NEXT:           <key>file</key><integer>0</integer>
+// CHECK-NEXT:          </dict>
+// CHECK-NEXT:         </array>
+// CHECK-NEXT:        <key>end</key>
+// CHECK-NEXT:         <array>
+// CHECK-NEXT:          <dict>
+// CHECK-NEXT:           <key>line</key><integer>343</integer>
+// CHECK-NEXT:           <key>col</key><integer>3</integer>
+// CHECK-NEXT:           <key>file</key><integer>0</integer>
+// CHECK-NEXT:          </dict>
+// CHECK-NEXT:          <dict>
+// CHECK-NEXT:           <key>line</key><integer>343</integer>
+// CHECK-NEXT:           <key>col</key><integer>3</integer>
+// CHECK-NEXT:           <key>file</key><integer>0</integer>
+// CHECK-NEXT:          </dict>
+// CHECK-NEXT:         </array>
+// CHECK-NEXT:       </dict>
+// CHECK-NEXT:      </array>
+// CHECK-NEXT:    </dict>
+// CHECK-NEXT:    <dict>
+// CHECK-NEXT:     <key>kind</key><string>event</string>
+// CHECK-NEXT:     <key>location</key>
+// CHECK-NEXT:     <dict>
+// CHECK-NEXT:      <key>line</key><integer>343</integer>
+// CHECK-NEXT:      <key>col</key><integer>3</integer>
+// CHECK-NEXT:      <key>file</key><integer>0</integer>
+// CHECK-NEXT:     </dict>
+// CHECK-NEXT:     <key>ranges</key>
+// CHECK-NEXT:     <array>
+// CHECK-NEXT:       <array>
+// CHECK-NEXT:        <dict>
+// CHECK-NEXT:         <key>line</key><integer>343</integer>
+// CHECK-NEXT:         <key>col</key><integer>16</integer>
+// CHECK-NEXT:         <key>file</key><integer>0</integer>
+// CHECK-NEXT:        </dict>
+// CHECK-NEXT:        <dict>
+// CHECK-NEXT:         <key>line</key><integer>343</integer>
+// CHECK-NEXT:         <key>col</key><integer>38</integer>
+// CHECK-NEXT:         <key>file</key><integer>0</integer>
+// CHECK-NEXT:        </dict>
+// CHECK-NEXT:       </array>
+// CHECK-NEXT:     </array>
+// CHECK-NEXT:     <key>depth</key><integer>0</integer>
+// CHECK-NEXT:     <key>extended_message</key>
+// CHECK-NEXT:     <string>Conversion from value of type &apos;NSNumber *&apos; to incompatible type &apos;NSString *&apos;</string>
+// CHECK-NEXT:     <key>message</key>
+// CHECK-NEXT:     <string>Conversion from value of type &apos;NSNumber *&apos; to incompatible type &apos;NSString *&apos;</string>
+// CHECK-NEXT:    </dict>
+// CHECK-NEXT:   </array>
+// CHECK-NEXT:   <key>description</key><string>Conversion from value of type &apos;NSNumber *&apos; to incompatible type &apos;NSString *&apos;</string>
+// CHECK-NEXT:   <key>category</key><string>Core Foundation/Objective-C</string>
+// CHECK-NEXT:   <key>type</key><string>Generics</string>
+// CHECK-NEXT:   <key>check_name</key><string>core.DynamicTypePropagation</string>
+// CHECK-NEXT:   <!-- This hash is experimental and going to change! -->
+// CHECK-NEXT:   <key>issue_hash_content_of_line_in_context</key><string>6528db66f562ac0c2a94933f3ca5f6a8</string>
+// CHECK-NEXT:  <key>issue_context_kind</key><string>function</string>
+// CHECK-NEXT:  <key>issue_context</key><string>testGetMostInformativeDerivedForId</string>
+// CHECK-NEXT:  <key>issue_hash_function_offset</key><string>7</string>
+// CHECK-NEXT:  <key>location</key>
+// CHECK-NEXT:  <dict>
+// CHECK-NEXT:   <key>line</key><integer>343</integer>
+// CHECK-NEXT:   <key>col</key><integer>3</integer>
+// CHECK-NEXT:   <key>file</key><integer>0</integer>
+// CHECK-NEXT:  </dict>
+// CHECK-NEXT:  </dict>
 // CHECK-NEXT: </array>
diff --git a/test/Analysis/index-type.c b/test/Analysis/index-type.c
new file mode 100644
index 0000000000000..fc638dfe74174
--- /dev/null
+++ b/test/Analysis/index-type.c
@@ -0,0 +1,39 @@
+// RUN: %clang_cc1 -triple x86_64-apple-darwin10 -analyze -analyzer-checker=core,alpha.security.ArrayBoundV2 -verify %s
+// RUN: %clang_cc1 -triple i386-apple-darwin10 -analyze -analyzer-checker=core,alpha.security.ArrayBoundV2 -DM32 -verify %s
+// expected-no-diagnostics
+
+#define UINT_MAX (~0u)
+
+#ifdef M32
+
+#define X86_ARRAY_SIZE (UINT_MAX/2 + 4)
+
+void testIndexTooBig() {
+  char arr[X86_ARRAY_SIZE];
+  char *ptr = arr + UINT_MAX/2;
+  ptr += 2;  // index shouldn't overflow
+  *ptr = 42; // no-warning
+}
+
+#else // 64-bit tests
+
+#define ARRAY_SIZE 0x100000000
+
+void testIndexOverflow64() {
+  char arr[ARRAY_SIZE];
+  char *ptr = arr + UINT_MAX/2;
+  ptr += 2;  // don't overflow 64-bit index
+  *ptr = 42; // no-warning
+}
+
+#define ULONG_MAX (~0ul)
+#define BIG_INDEX (ULONG_MAX/16)
+
+void testIndexTooBig64() {
+  char arr[ULONG_MAX/8-1];
+  char *ptr = arr + BIG_INDEX;
+  ptr += 2;  // don't overflow 64-bit index
+  *ptr = 42; // no-warning
+}
+
+#endif
diff --git a/test/Analysis/initializers-cfg-output.cpp b/test/Analysis/initializers-cfg-output.cpp
index db3c0fb99000f..deefbef07738a 100644
--- a/test/Analysis/initializers-cfg-output.cpp
+++ b/test/Analysis/initializers-cfg-output.cpp
@@ -61,7 +61,7 @@ class TestDelegating {
 // CHECK:    6: B([B1.5]) (Base initializer)
 // CHECK:    7:  (CXXConstructExpr, class A)
 // CHECK:    8: A([B1.7]) (Base initializer)
-// CHECK:    9: /*implicit*/int()
+// CHECK:    9: /*implicit*/(int)0
 // CHECK:   10: i([B1.9]) (Member initializer)
 // CHECK:   11: this
 // CHECK:   12: [B1.11]->i
diff --git a/test/Analysis/inlining/false-positive-suppression.c b/test/Analysis/inlining/false-positive-suppression.c
index e1c8f67614e11..a0bc3611fe97c 100644
--- a/test/Analysis/inlining/false-positive-suppression.c
+++ b/test/Analysis/inlining/false-positive-suppression.c
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -analyze -analyzer-checker=core -analyzer-config suppress-null-return-paths=false -verify %s
-// RUN: %clang_cc1 -analyze -analyzer-checker=core -verify -DSUPPRESSED=1 %s
-// RUN: %clang_cc1 -analyze -analyzer-checker=core -analyzer-config avoid-suppressing-null-argument-paths=true -DSUPPRESSED=1 -DNULL_ARGS=1 -verify %s
+// RUN: %clang_cc1 -analyze -analyzer-eagerly-assume -analyzer-checker=core -analyzer-config suppress-null-return-paths=false -verify %s
+// RUN: %clang_cc1 -analyze -analyzer-eagerly-assume -analyzer-checker=core -verify -DSUPPRESSED=1 %s
+// RUN: %clang_cc1 -analyze -analyzer-eagerly-assume -analyzer-checker=core -analyzer-config avoid-suppressing-null-argument-paths=true -DSUPPRESSED=1 -DNULL_ARGS=1 -verify %s
 
 int opaquePropertyCheck(void *object);
 int coin();
@@ -93,6 +93,84 @@ int triggerDivZero () {
   return 5/y; // expected-warning {{Division by zero}}
 }
 
+// Treat a function-like macro similarly to an inlined function, so suppress
+// warnings along paths resulting from inlined checks.
+#define MACRO_WITH_CHECK(a) ( ((a) != 0) ? *a : 17)
+void testInlineCheckInMacro(int *p) {
+  int i = MACRO_WITH_CHECK(p);
+  (void)i;
+
+  *p = 1; // no-warning
+}
+
+#define MACRO_WITH_NESTED_CHECK(a) ( { int j = MACRO_WITH_CHECK(a); j; } )
+void testInlineCheckInNestedMacro(int *p) {
+  int i = MACRO_WITH_NESTED_CHECK(p);
+  (void)i;
+
+  *p = 1; // no-warning
+}
+
+// If there is a check in a macro that is not function-like, don't treat
+// it like a function so don't suppress.
+#define NON_FUNCTION_MACRO_WITH_CHECK ( ((p) != 0) ? *p : 17)
+void testNonFunctionMacro(int *p) {
+  int i = NON_FUNCTION_MACRO_WITH_CHECK ;
+  (void)i;
+
+  *p = 1; // expected-warning {{Dereference of null pointer (loaded from variable 'p')}}
+}
+
+
+// This macro will dereference its argument if the argument is NULL.
+#define MACRO_WITH_ERROR(a) ( ((a) != 0) ? 0 : *a)
+void testErrorInMacro(int *p) {
+  int i = MACRO_WITH_ERROR(p); // expected-warning {{Dereference of null pointer (loaded from variable 'p')}}
+  (void)i;
+}
+
+// Here the check (the "if") is not in a macro, so we should still warn.
+#define MACRO_IN_GUARD(a) (!(a))
+void testMacroUsedAsGuard(int *p) {
+  if (MACRO_IN_GUARD(p))
+    *p = 1; // expected-warning {{Dereference of null pointer (loaded from variable 'p')}}
+}
+
+// When a nil case split is introduced in a macro and the macro is in a guard,
+// we still shouldn't warn.
+int isNull(int *p);
+int isEqual(int *p, int *q);
+#define ISNULL(ptr)    ((ptr) == 0 || isNull(ptr))
+#define ISEQUAL(a, b)    ((int *)(a) == (int *)(b) || (ISNULL(a) && ISNULL(b)) || isEqual(a,b))
+#define ISNOTEQUAL(a, b)   (!ISEQUAL(a, b))
+void testNestedDisjunctiveMacro(int *p, int *q) {
+  if (ISNOTEQUAL(p,q)) {
+    *p = 1; // no-warning
+    *q = 1; // no-warning
+  }
+
+  *p = 1; // no-warning
+  *q = 1; // no-warning
+}
+
+void testNestedDisjunctiveMacro2(int *p, int *q) {
+  if (ISEQUAL(p,q)) {
+    return;
+  }
+
+  *p = 1; // no-warning
+  *q = 1; // no-warning
+}
+
+
+// Here the check is entirely in non-macro code even though the code itself
+// is a macro argument.
+#define MACRO_DO_IT(a) (a)
+void testErrorInArgument(int *p) {
+  int i = MACRO_DO_IT((p ? 0 : *p)); // expected-warning {{Dereference of null pointer (loaded from variable 'p')}}c
+  (void)i;
+}
+
 // --------------------------
 // "Suppression suppression"
 // --------------------------
diff --git a/test/Analysis/inlining/false-positive-suppression.m b/test/Analysis/inlining/false-positive-suppression.m
index 53ec138367ebc..1a5ff662c18ea 100644
--- a/test/Analysis/inlining/false-positive-suppression.m
+++ b/test/Analysis/inlining/false-positive-suppression.m
@@ -36,3 +36,80 @@ void testNilReceiver(int coin) {
   else
     testNilReceiverHelperB([[x getObject] getPtr]);
 }
+
+// FALSE NEGATIVES (over-suppression)
+
+__attribute__((objc_root_class))
+@interface SomeClass
+-(int *)methodReturningNull;
+
+@property(readonly) int *propertyReturningNull;
+
+@property(readonly) int *synthesizedProperty;
+
+@end
+
+@interface SubOfSomeClass : SomeClass
+@end
+
+@implementation SubOfSomeClass
+@end
+
+@implementation SomeClass
+-(int *)methodReturningNull {
+  return 0;
+}
+
+-(int *)propertyReturningNull {
+  return 0;
+}
+
++(int *)classPropertyReturningNull {
+  return 0;
+}
+@end
+
+void testMethodReturningNull(SomeClass *sc) {
+  int *result = [sc methodReturningNull];
+  *result = 1;
+#ifndef SUPPRESSED
+  // expected-warning@-2 {{Dereference of null pointer}}
+#endif
+}
+
+void testPropertyReturningNull(SomeClass *sc) {
+  int *result = sc.propertyReturningNull;
+  *result = 1;
+#ifndef SUPPRESSED
+  // expected-warning@-2 {{Dereference of null pointer}}
+#endif
+}
+
+@implementation SubOfSomeClass (ForTestOfSuperProperty)
+-(void)testSuperPropertyReturningNull {
+  int *result = super.propertyReturningNull;
+  *result = 1;
+#ifndef SUPPRESSED
+  // expected-warning@-2 {{Dereference of null pointer}}
+#endif
+}
+@end
+
+void testClassPropertyReturningNull() {
+  int *result = SomeClass.classPropertyReturningNull;
+  *result = 1;
+#ifndef SUPPRESSED
+  // expected-warning@-2 {{Dereference of null pointer}}
+#endif
+}
+
+void testSynthesizedPropertyReturningNull(SomeClass *sc) {
+  if (sc.synthesizedProperty)
+    return;
+
+  int *result = sc.synthesizedProperty;
+  *result = 1;
+#ifndef SUPPRESSED
+  // expected-warning@-2 {{Dereference of null pointer}}
+#endif
+}
diff --git a/test/Analysis/inlining/stl.cpp b/test/Analysis/inlining/stl.cpp
index 2a8520f767185..95ac3f8172dbd 100644
--- a/test/Analysis/inlining/stl.cpp
+++ b/test/Analysis/inlining/stl.cpp
@@ -27,28 +27,3 @@ void testException(std::exception e) {
   // expected-warning@-4 {{UNKNOWN}}
 #endif
 }
-
-void testList_pop_front(std::list<int> list) {
-  while(!list.empty())
-    list.pop_front();  // no-warning
-}
-
-void testBasicStringSuppression() {
-  std::basic_string<uint8_t> v;
-  v.push_back(1); // no-warning
-}
-
-void testBasicStringSuppression_append() {
-  std::basic_string<char32_t> v;
-  v += 'c'; // no-warning
-}
-
-void testBasicStringSuppression_assign(std::basic_string<char32_t> &v,
-                                       const std::basic_string<char32_t> &v2) {
-  v = v2;
-}
-
-class MyEngine;
-void testSupprerssion_independent_bits_engine(MyEngine& e) {
-  std::__independent_bits_engine<MyEngine, unsigned int> x(e, 64); // no-warning
-}
diff --git a/test/Analysis/lambdas.mm b/test/Analysis/lambdas.mm
index 6247f28870fb8..dc1a13e8b69dc 100644
--- a/test/Analysis/lambdas.mm
+++ b/test/Analysis/lambdas.mm
@@ -12,7 +12,6 @@ int clang_analyzer_eval(int);
 }
 @end
 
-
 @implementation Sub
 - (void)callMethodOnSuperInCXXLambda; {
   // Explicit capture.
@@ -26,6 +25,20 @@ int clang_analyzer_eval(int);
   }();
 }
 
+// Make sure to properly handle super-calls when a block captures
+// a local variable named 'self'.
+- (void)callMethodOnSuperInCXXLambdaWithRedefinedSelf; {
+  /*__weak*/ Sub *weakSelf = self;
+  // Implicit capture. (Sema outlaws explicit capture of a redefined self
+  // and a call to super [which uses the original self]).
+  [=]() {
+    Sub *self = weakSelf;
+    [=]() {
+      [super superMethod];
+    }();
+  }();
+}
+
 - (void)swapIvars {
   int tmp = _ivar1;
   _ivar1 = _ivar2;
diff --git a/test/Analysis/localization.m b/test/Analysis/localization.m
index ce55609b1bf39..dc80705428cfd 100644
--- a/test/Analysis/localization.m
+++ b/test/Analysis/localization.m
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -analyze -fblocks -analyzer-store=region -analyzer-checker=optin.osx.cocoa.localizability.NonLocalizedStringChecker -analyzer-checker=alpha.osx.cocoa.localizability.PluralMisuseChecker -verify  %s
+// RUN: %clang_cc1 -analyze -fblocks -analyzer-store=region -analyzer-output=text -analyzer-checker=optin.osx.cocoa.localizability.NonLocalizedStringChecker -analyzer-checker=alpha.osx.cocoa.localizability.PluralMisuseChecker -verify  %s
 
 // The larger set of tests in located in localization.m. These are tests
 // specific for non-aggressive reporting.
@@ -61,11 +61,26 @@ NSString *KHLocalizedString(NSString* key, NSString* comment) {
   UILabel *testLabel = [[UILabel alloc] init];
   NSString *bar = NSLocalizedString(@"Hello", @"Comment");
 
-  if (random()) {
-    bar = @"Unlocalized string";
+  if (random()) { // expected-note {{Taking true branch}}
+    bar = @"Unlocalized string"; // expected-note {{Non-localized string literal here}}
   }
 
-  [testLabel setText:bar]; // expected-warning {{User-facing text should use localized string macro}}
+  [testLabel setText:bar]; // expected-warning {{User-facing text should use localized string macro}} expected-note {{User-facing}}
+}
+
+- (void)testMultipleUnlocalizedStringsInSamePath {
+  UILabel *testLabel = [[UILabel alloc] init];
+  NSString *bar = @"Unlocalized string"; // no-note
+
+  bar = @"Unlocalized string"; // expected-note {{Non-localized string literal here}}
+
+  NSString *other = @"Other unlocalized string."; // no-note
+  (void)other;
+
+  NSString *same = @"Unlocalized string"; // no-note
+  (void)same;
+
+  [testLabel setText:bar]; // expected-warning {{User-facing text should use localized string macro}} expected-note {{User-facing}}
 }
 
 - (void)testOneCharacterStringsDoNotGiveAWarning {
@@ -90,20 +105,27 @@ NSString *KHLocalizedString(NSString* key, NSString* comment) {
   [testLabel setText:bar]; // no-warning
 }
 
+
+// Suppress diagnostic about user-facing string constants when the method name
+// contains the term "Debug".
+- (void)debugScreen:(UILabel *)label {
+  label.text = @"Unlocalized";
+}
+
 // Plural Misuse Checker Tests
 // These tests are modeled off incorrect uses of the many-one pattern
 // from real projects. 
 
 - (NSString *)test1:(int)plural {
     if (plural) {
-        return MCLocalizedString(@"TYPE_PLURAL"); // expected-warning {{Plural cases are not supported accross all languages. Use a .stringsdict file}}
+        return MCLocalizedString(@"TYPE_PLURAL"); // expected-warning {{Plural cases are not supported accross all languages. Use a .stringsdict file}} expected-note {{Plural}}
     }
     return MCLocalizedString(@"TYPE");
 }
 
 - (NSString *)test2:(int)numOfReminders {
     if (numOfReminders > 0) {
-        return [NSString stringWithFormat:@"%@, %@", @"Test", (numOfReminders != 1) ? [NSString stringWithFormat:NSLocalizedString(@"%@ Reminders", @"Plural count of reminders"), numOfReminders] : [NSString stringWithFormat:NSLocalizedString(@"1 reminder", @"One reminder")]]; // expected-warning {{Plural cases are not supported accross all languages. Use a .stringsdict file}} expected-warning {{Plural cases are not supported accross all languages. Use a .stringsdict file}}
+        return [NSString stringWithFormat:@"%@, %@", @"Test", (numOfReminders != 1) ? [NSString stringWithFormat:NSLocalizedString(@"%@ Reminders", @"Plural count of reminders"), numOfReminders] : [NSString stringWithFormat:NSLocalizedString(@"1 reminder", @"One reminder")]]; // expected-warning 2 {{Plural cases are not supported accross all languages. Use a .stringsdict file}} expected-note 2 {{Plural}}
     } 
     return nil;
 }
@@ -112,18 +134,18 @@ NSString *KHLocalizedString(NSString* key, NSString* comment) {
     NSString *count;
     if (self.unreadArticlesCount > 1)
     {
-        count = [count stringByAppendingFormat:@"%@", KHLocalizedString(@"New Stories", @"Plural count for new stories")]; // expected-warning {{Plural cases are not supported accross all languages. Use a .stringsdict file}}
+        count = [count stringByAppendingFormat:@"%@", KHLocalizedString(@"New Stories", @"Plural count for new stories")]; // expected-warning {{Plural cases are not supported accross all languages. Use a .stringsdict file}} expected-note {{Plural}}
     } else {
-        count = [count stringByAppendingFormat:@"%@",  KHLocalizedString(@"New Story", @"One new story")]; // expected-warning {{Plural cases are not supported accross all languages. Use a .stringsdict file}}
+        count = [count stringByAppendingFormat:@"%@",  KHLocalizedString(@"New Story", @"One new story")]; // expected-warning {{Plural cases are not supported accross all languages. Use a .stringsdict file}} expected-note {{Plural}}
     }
 }
 
 - (NSString *)test4:(int)count {
     if ( count == 1 )
     {
-        return [NSString stringWithFormat:KHLocalizedString(@"value.singular",nil), count]; // expected-warning {{Plural cases are not supported accross all languages. Use a .stringsdict file}}
+        return [NSString stringWithFormat:KHLocalizedString(@"value.singular",nil), count]; // expected-warning {{Plural cases are not supported accross all languages. Use a .stringsdict file}} expected-note {{Plural}}
     } else {
-        return [NSString stringWithFormat:KHLocalizedString(@"value.plural",nil), count]; // expected-warning {{Plural cases are not supported accross all languages. Use a .stringsdict file}}
+        return [NSString stringWithFormat:KHLocalizedString(@"value.plural",nil), count]; // expected-warning {{Plural cases are not supported accross all languages. Use a .stringsdict file}} expected-note {{Plural}}
     }
 }
 
@@ -131,9 +153,9 @@ NSString *KHLocalizedString(NSString* key, NSString* comment) {
 	int test = count == 1;
     if (test)
     {
-        return [NSString stringWithFormat:KHLocalizedString(@"value.singular",nil), count]; // expected-warning {{Plural cases are not supported accross all languages. Use a .stringsdict file}}
+        return [NSString stringWithFormat:KHLocalizedString(@"value.singular",nil), count]; // expected-warning {{Plural cases are not supported accross all languages. Use a .stringsdict file}} expected-note {{Plural}}
     } else {
-        return [NSString stringWithFormat:KHLocalizedString(@"value.plural",nil), count]; // expected-warning {{Plural cases are not supported accross all languages. Use a .stringsdict file}}
+        return [NSString stringWithFormat:KHLocalizedString(@"value.plural",nil), count]; // expected-warning {{Plural cases are not supported accross all languages. Use a .stringsdict file}} expected-note {{Plural}}
     }
 }
 
@@ -147,7 +169,7 @@ NSString *KHLocalizedString(NSString* key, NSString* comment) {
 		if (someOtherVariable)
         	return KHLocalizedString(@"OK",nil); // no-warning
     } else {
-        return KHLocalizedString(@"value.plural",nil); // expected-warning {{Plural cases are not supported accross all languages. Use a .stringsdict file}}
+        return KHLocalizedString(@"value.plural",nil); // expected-warning {{Plural cases are not supported accross all languages. Use a .stringsdict file}} expected-note {{Plural}}
     }
 	return nil;
 }
@@ -205,3 +227,15 @@ NSString *KHLocalizedString(NSString* key, NSString* comment) {
 // }
 
 @end
+
+
+// Suppress diagnostic about user-facing string constants when the class name
+// contains "Debug"
+@interface MyDebugView : NSObject
+@end
+
+@implementation MyDebugView
+- (void)setupScreen:(UILabel *)label {
+  label.text = @"Unlocalized"; // no-warning
+}
+@end
diff --git a/test/Analysis/malloc.c b/test/Analysis/malloc.c
index 881eb38ad8401..51e2cd604327f 100644
--- a/test/Analysis/malloc.c
+++ b/test/Analysis/malloc.c
@@ -4,6 +4,21 @@
 
 void clang_analyzer_eval(int);
 
+// Without -fms-compatibility, wchar_t isn't a builtin type. MSVC defines
+// _WCHAR_T_DEFINED if wchar_t is available. Microsoft recommends that you use
+// the builtin type: "Using the typedef version can cause portability
+// problems", but we're ok here because we're not actually running anything.
+// Also of note is this cryptic warning: "The wchar_t type is not supported
+// when you compile C code".
+//
+// See the docs for more:
+// https://msdn.microsoft.com/en-us/library/dh8che7s.aspx
+#if !defined(_WCHAR_T_DEFINED)
+// "Microsoft implements wchar_t as a two-byte unsigned value"
+typedef unsigned short wchar_t;
+#define _WCHAR_T_DEFINED
+#endif // !defined(_WCHAR_T_DEFINED)
+
 typedef __typeof(sizeof(int)) size_t;
 void *malloc(size_t);
 void *alloca(size_t);
@@ -13,9 +28,15 @@ void *realloc(void *ptr, size_t size);
 void *reallocf(void *ptr, size_t size);
 void *calloc(size_t nmemb, size_t size);
 char *strdup(const char *s);
+wchar_t *wcsdup(const wchar_t *s);
 char *strndup(const char *s, size_t n);
 int memcmp(const void *s1, const void *s2, size_t n);
 
+// Windows variants
+char *_strdup(const char *strSource);
+wchar_t *_wcsdup(const wchar_t *strSource);
+void *_alloca(size_t size);
+
 void myfoo(int *p);
 void myfooint(int p);
 char *fooRetPtr();
@@ -55,6 +76,10 @@ void allocaTest() {
   int *p = alloca(sizeof(int));
 } // no warn
 
+void winAllocaTest() {
+  int *p = _alloca(sizeof(int));
+} // no warn
+
 void allocaBuiltinTest() {
   int *p = __builtin_alloca(sizeof(int));
 } // no warn
@@ -210,6 +235,11 @@ void CheckUseZeroAllocatedNoWarn2() {
   int *p = alloca(0); // no warning
 }
 
+void CheckUseZeroWinAllocatedNoWarn2() {
+  int *p = _alloca(0); // no warning
+}
+
+
 void CheckUseZeroAllocatedNoWarn3() {
   int *p = malloc(0);
   int *q = realloc(p, 8); // no warning
@@ -233,6 +263,11 @@ char CheckUseZeroAllocated2() {
   return *p; // expected-warning {{Use of zero-allocated memory}}
 }
 
+char CheckUseZeroWinAllocated2() {
+  char *p = _alloca(0);
+  return *p; // expected-warning {{Use of zero-allocated memory}}
+}
+
 void UseZeroAllocated(int *p) {
   if (p)
     *p = 7; // expected-warning {{Use of zero-allocated memory}}
@@ -1076,6 +1111,21 @@ void testStrdup(const char *s, unsigned validIndex) {
   s2[validIndex + 1] = 'b';
 } // expected-warning {{Potential leak of memory pointed to by}}
 
+void testWinStrdup(const char *s, unsigned validIndex) {
+  char *s2 = _strdup(s);
+  s2[validIndex + 1] = 'b';
+} // expected-warning {{Potential leak of memory pointed to by}}
+
+void testWcsdup(const wchar_t *s, unsigned validIndex) {
+  wchar_t *s2 = wcsdup(s);
+  s2[validIndex + 1] = 'b';
+} // expected-warning {{Potential leak of memory pointed to by}}
+
+void testWinWcsdup(const wchar_t *s, unsigned validIndex) {
+  wchar_t *s2 = _wcsdup(s);
+  s2[validIndex + 1] = 'b';
+} // expected-warning {{Potential leak of memory pointed to by}}
+
 int testStrndup(const char *s, unsigned validIndex, unsigned size) {
   char *s2 = strndup(s, size);
   s2 [validIndex + 1] = 'b';
@@ -1091,6 +1141,24 @@ void testStrdupContentIsDefined(const char *s, unsigned validIndex) {
   free(s2);
 }
 
+void testWinStrdupContentIsDefined(const char *s, unsigned validIndex) {
+  char *s2 = _strdup(s);
+  char result = s2[1];// no warning
+  free(s2);
+}
+
+void testWcsdupContentIsDefined(const wchar_t *s, unsigned validIndex) {
+  wchar_t *s2 = wcsdup(s);
+  wchar_t result = s2[1];// no warning
+  free(s2);
+}
+
+void testWinWcsdupContentIsDefined(const wchar_t *s, unsigned validIndex) {
+  wchar_t *s2 = _wcsdup(s);
+  wchar_t result = s2[1];// no warning
+  free(s2);
+}
+
 // ----------------------------------------------------------------------------
 // Test the system library functions to which the pointer can escape.
 // This tests false positive suppression.
@@ -1444,6 +1512,14 @@ char *testLeakWithinReturn(char *str) {
   return strdup(strdup(str)); // expected-warning{{leak}}
 }
 
+char *testWinLeakWithinReturn(char *str) {
+  return _strdup(_strdup(str)); // expected-warning{{leak}}
+}
+
+wchar_t *testWinWideLeakWithinReturn(wchar_t *str) {
+  return _wcsdup(_wcsdup(str)); // expected-warning{{leak}}
+}
+
 void passConstPtr(const char * ptr);
 
 void testPassConstPointer() {
@@ -1674,6 +1750,19 @@ void testEscapeThroughSystemCallTakingVoidPointer3(fake_rb_tree_t *rbt) {
   fake_rb_tree_insert_node(rbt, data); // no warning
 }
 
+struct IntAndPtr {
+  int x;
+  int *p;
+};
+
+void constEscape(const void *ptr);
+
+void testConstEscapeThroughAnotherField() {
+  struct IntAndPtr s;
+  s.p = malloc(sizeof(int));
+  constEscape(&(s.x)); // could free s->p!
+} // no-warning
+
 // ----------------------------------------------------------------------------
 // False negatives.
 
@@ -1693,3 +1782,9 @@ void testPassToSystemHeaderFunctionIndirectly() {
   // FIXME: This is a leak: if we think a system function won't free p, it
   // won't free (p-1) either.
 }
+
+void testMallocIntoMalloc() {
+  StructWithPtr *s = malloc(sizeof(StructWithPtr));
+  s->memP = malloc(sizeof(int));
+  free(s);
+} // FIXME: should warn here
diff --git a/test/Analysis/mpichecker.cpp b/test/Analysis/mpichecker.cpp
new file mode 100644
index 0000000000000..b7a1e00e00b45
--- /dev/null
+++ b/test/Analysis/mpichecker.cpp
@@ -0,0 +1,342 @@
+// RUN: %clang_cc1 -analyze -analyzer-checker=optin.mpi.MPI-Checker -verify %s
+
+#include "MPIMock.h"
+
+void matchedWait1() {
+  int rank = 0;
+  double buf = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  if (rank >= 0) {
+    MPI_Request sendReq1, recvReq1;
+    MPI_Isend(&buf, 1, MPI_DOUBLE, rank + 1, 0, MPI_COMM_WORLD, &sendReq1);
+    MPI_Irecv(&buf, 1, MPI_DOUBLE, rank - 1, 0, MPI_COMM_WORLD, &recvReq1);
+
+    MPI_Wait(&sendReq1, MPI_STATUS_IGNORE);
+    MPI_Wait(&recvReq1, MPI_STATUS_IGNORE);
+  }
+} // no error
+
+void matchedWait2() {
+  int rank = 0;
+  double buf = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  if (rank >= 0) {
+    MPI_Request sendReq1, recvReq1;
+    MPI_Isend(&buf, 1, MPI_DOUBLE, rank + 1, 0, MPI_COMM_WORLD, &sendReq1);
+    MPI_Irecv(&buf, 1, MPI_DOUBLE, rank - 1, 0, MPI_COMM_WORLD, &recvReq1);
+    MPI_Wait(&sendReq1, MPI_STATUS_IGNORE);
+    MPI_Wait(&recvReq1, MPI_STATUS_IGNORE);
+  }
+} // no error
+
+void matchedWait3() {
+  int rank = 0;
+  double buf = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  if (rank >= 0) {
+    MPI_Request sendReq1, recvReq1;
+    MPI_Isend(&buf, 1, MPI_DOUBLE, rank + 1, 0, MPI_COMM_WORLD, &sendReq1);
+    MPI_Irecv(&buf, 1, MPI_DOUBLE, rank - 1, 0, MPI_COMM_WORLD, &recvReq1);
+
+    if (rank > 1000) {
+      MPI_Wait(&sendReq1, MPI_STATUS_IGNORE);
+      MPI_Wait(&recvReq1, MPI_STATUS_IGNORE);
+    } else {
+      MPI_Wait(&sendReq1, MPI_STATUS_IGNORE);
+      MPI_Wait(&recvReq1, MPI_STATUS_IGNORE);
+    }
+  }
+} // no error
+
+void missingWait1() { // Check missing wait for dead region.
+  double buf = 0;
+  MPI_Request sendReq1;
+  MPI_Ireduce(MPI_IN_PLACE, &buf, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD, &sendReq1);
+} // expected-warning{{Request 'sendReq1' has no matching wait.}}
+
+void missingWait2() {
+  int rank = 0;
+  double buf = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  if (rank == 0) {
+  } else {
+    MPI_Request sendReq1, recvReq1;
+
+    MPI_Isend(&buf, 1, MPI_DOUBLE, rank + 1, 0, MPI_COMM_WORLD, &sendReq1);
+    MPI_Irecv(&buf, 1, MPI_DOUBLE, rank - 1, 0, MPI_COMM_WORLD, &recvReq1); // expected-warning{{Request 'sendReq1' has no matching wait.}}
+    MPI_Wait(&recvReq1, MPI_STATUS_IGNORE);
+  }
+}
+
+void doubleNonblocking() {
+  int rank = 0;
+  double buf = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  if (rank == 1) {
+  } else {
+    MPI_Request sendReq1;
+
+    MPI_Isend(&buf, 1, MPI_DOUBLE, rank + 1, 0, MPI_COMM_WORLD, &sendReq1);
+    MPI_Irecv(&buf, 1, MPI_DOUBLE, rank - 1, 0, MPI_COMM_WORLD, &sendReq1); // expected-warning{{Double nonblocking on request 'sendReq1'.}}
+    MPI_Wait(&sendReq1, MPI_STATUS_IGNORE);
+  }
+}
+
+void doubleNonblocking2() {
+  int rank = 0;
+  double buf = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+  MPI_Request req;
+  MPI_Ireduce(MPI_IN_PLACE, &buf, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD, &req);
+  MPI_Ireduce(MPI_IN_PLACE, &buf, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD, &req); // expected-warning{{Double nonblocking on request 'req'.}}
+  MPI_Wait(&req, MPI_STATUS_IGNORE);
+}
+
+void doubleNonblocking3() {
+  typedef struct { MPI_Request req; } ReqStruct;
+
+  ReqStruct rs;
+  int rank = 0;
+  double buf = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+  MPI_Ireduce(MPI_IN_PLACE, &buf, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD, &rs.req);
+  MPI_Ireduce(MPI_IN_PLACE, &buf, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD, &rs.req); // expected-warning{{Double nonblocking on request 'rs.req'.}}
+  MPI_Wait(&rs.req, MPI_STATUS_IGNORE);
+}
+
+void doubleNonblocking4() {
+  int rank = 0;
+  double buf = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+  MPI_Request req;
+  for (int i = 0; i < 2; ++i) {
+    MPI_Ireduce(MPI_IN_PLACE, &buf, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD, &req); // expected-warning{{Double nonblocking on request 'req'.}}
+  }
+  MPI_Wait(&req, MPI_STATUS_IGNORE);
+}
+
+void tripleNonblocking() {
+  double buf = 0;
+  MPI_Request sendReq;
+  MPI_Isend(&buf, 1, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, &sendReq);
+  MPI_Irecv(&buf, 1, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, &sendReq); // expected-warning{{Double nonblocking on request 'sendReq'.}}
+  MPI_Isend(&buf, 1, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, &sendReq); // expected-warning{{Double nonblocking on request 'sendReq'.}}
+  MPI_Wait(&sendReq, MPI_STATUS_IGNORE);
+}
+
+void missingNonBlocking() {
+  int rank = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  MPI_Request sendReq1[10][10][10];
+  MPI_Wait(&sendReq1[1][7][9], MPI_STATUS_IGNORE); // expected-warning{{Request 'sendReq1[1][7][9]' has no matching nonblocking call.}}
+}
+
+void missingNonBlocking2() {
+  int rank = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  typedef struct { MPI_Request req[2][2]; } ReqStruct;
+  ReqStruct rs;
+  MPI_Request *r = &rs.req[0][1];
+  MPI_Wait(r, MPI_STATUS_IGNORE); // expected-warning{{Request 'rs.req[0][1]' has no matching nonblocking call.}}
+}
+
+void missingNonBlocking3() {
+  int rank = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  MPI_Request sendReq;
+  MPI_Wait(&sendReq, MPI_STATUS_IGNORE); // expected-warning{{Request 'sendReq' has no matching nonblocking call.}}
+}
+
+void missingNonBlockingMultiple() {
+  int rank = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  MPI_Request sendReq[4];
+  for (int i = 0; i < 4; ++i) {
+    MPI_Wait(&sendReq[i], MPI_STATUS_IGNORE); // expected-warning-re 1+{{Request {{.*}} has no matching nonblocking call.}}
+  }
+}
+
+void missingNonBlockingWaitall() {
+  int rank = 0;
+  double buf = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  MPI_Request req[4];
+
+  MPI_Ireduce(MPI_IN_PLACE, &buf, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD,
+      &req[0]);
+  MPI_Ireduce(MPI_IN_PLACE, &buf, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD,
+      &req[1]);
+  MPI_Ireduce(MPI_IN_PLACE, &buf, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD,
+      &req[3]);
+
+  MPI_Waitall(4, req, MPI_STATUSES_IGNORE); // expected-warning{{Request 'req[2]' has no matching nonblocking call.}}
+}
+
+void missingNonBlockingWaitall2() {
+  int rank = 0;
+  double buf = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  MPI_Request req[4];
+
+  MPI_Ireduce(MPI_IN_PLACE, &buf, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD,
+      &req[0]);
+  MPI_Ireduce(MPI_IN_PLACE, &buf, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD,
+      &req[3]);
+
+  MPI_Waitall(4, req, MPI_STATUSES_IGNORE); // expected-warning-re 2{{Request '{{(.*)[[1-2]](.*)}}' has no matching nonblocking call.}}
+}
+
+void missingNonBlockingWaitall3() {
+  int rank = 0;
+  double buf = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  MPI_Request req[4];
+
+  MPI_Ireduce(MPI_IN_PLACE, &buf, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD,
+      &req[0]);
+  MPI_Ireduce(MPI_IN_PLACE, &buf, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD,
+      &req[2]);
+
+  MPI_Waitall(4, req, MPI_STATUSES_IGNORE); // expected-warning-re 2{{Request '{{(.*)[[1,3]](.*)}}' has no matching nonblocking call.}}
+}
+
+void missingNonBlockingWaitall4() {
+  int rank = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  MPI_Request req[4];
+  MPI_Waitall(4, req, MPI_STATUSES_IGNORE); // expected-warning-re 4{{Request '{{(.*)[[0-3]](.*)}}' has no matching nonblocking call.}}
+}
+
+void noDoubleRequestUsage() {
+  typedef struct {
+    MPI_Request req;
+    MPI_Request req2;
+  } ReqStruct;
+
+  ReqStruct rs;
+  int rank = 0;
+  double buf = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+  MPI_Ireduce(MPI_IN_PLACE, &buf, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD,
+              &rs.req);
+  MPI_Ireduce(MPI_IN_PLACE, &buf, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD,
+              &rs.req2);
+  MPI_Wait(&rs.req, MPI_STATUS_IGNORE);
+  MPI_Wait(&rs.req2, MPI_STATUS_IGNORE);
+} // no error
+
+void noDoubleRequestUsage2() {
+  typedef struct {
+    MPI_Request req[2];
+    MPI_Request req2;
+  } ReqStruct;
+
+  ReqStruct rs;
+  int rank = 0;
+  double buf = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+  MPI_Ireduce(MPI_IN_PLACE, &buf, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD,
+              &rs.req[0]);
+  MPI_Ireduce(MPI_IN_PLACE, &buf, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD,
+              &rs.req[1]);
+  MPI_Ireduce(MPI_IN_PLACE, &buf, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD,
+              &rs.req2);
+  MPI_Wait(&rs.req[0], MPI_STATUS_IGNORE);
+  MPI_Wait(&rs.req[1], MPI_STATUS_IGNORE);
+  MPI_Wait(&rs.req2, MPI_STATUS_IGNORE);
+} // no error
+
+void nestedRequest() {
+  typedef struct {
+    MPI_Request req[2];
+    MPI_Request req2;
+  } ReqStruct;
+
+  ReqStruct rs;
+  int rank = 0;
+  double buf = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+  MPI_Ireduce(MPI_IN_PLACE, &buf, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD,
+              &rs.req[0]);
+  MPI_Ireduce(MPI_IN_PLACE, &buf, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD,
+              &rs.req[1]);
+  MPI_Ireduce(MPI_IN_PLACE, &buf, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD,
+              &rs.req2);
+  MPI_Waitall(2, rs.req, MPI_STATUSES_IGNORE);
+  MPI_Wait(&rs.req2, MPI_STATUS_IGNORE);
+} // no error
+
+void singleRequestInWaitall() {
+  MPI_Request r;
+  int rank = 0;
+  double buf = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+  MPI_Ireduce(MPI_IN_PLACE, &buf, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD,
+              &r);
+  MPI_Waitall(1, &r, MPI_STATUSES_IGNORE);
+} // no error
+
+void multiRequestUsage() {
+  double buf = 0;
+  MPI_Request req;
+
+  MPI_Isend(&buf, 1, MPI_DOUBLE, 1, 0, MPI_COMM_WORLD, &req);
+  MPI_Wait(&req, MPI_STATUS_IGNORE);
+
+  MPI_Irecv(&buf, 1, MPI_DOUBLE, 1, 0, MPI_COMM_WORLD, &req);
+  MPI_Wait(&req, MPI_STATUS_IGNORE);
+} // no error
+
+void multiRequestUsage2() {
+  double buf = 0;
+  MPI_Request req;
+
+  MPI_Ireduce(MPI_IN_PLACE, &buf, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD,
+              &req);
+  MPI_Wait(&req, MPI_STATUS_IGNORE);
+
+  MPI_Ireduce(MPI_IN_PLACE, &buf, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD,
+              &req);
+  MPI_Wait(&req, MPI_STATUS_IGNORE);
+} // no error
+
+// wrapper function
+void callNonblocking(MPI_Request *req) {
+  double buf = 0;
+  MPI_Ireduce(MPI_IN_PLACE, &buf, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD,
+             req);
+}
+
+// wrapper function
+void callWait(MPI_Request *req) {
+  MPI_Wait(req, MPI_STATUS_IGNORE);
+}
+
+// Call nonblocking, wait wrapper functions.
+void callWrapperFunctions() {
+  MPI_Request req;
+  callNonblocking(&req);
+  callWait(&req);
+} // no error
+
+void externFunctions1() {
+  double buf = 0;
+  MPI_Request req;
+  MPI_Ireduce(MPI_IN_PLACE, &buf, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD,
+              &req);
+  void callWaitExtern(MPI_Request *req);
+  callWaitExtern(&req);
+} // expected-warning{{Request 'req' has no matching wait.}}
+
+void externFunctions2() {
+  MPI_Request req;
+  void callNonblockingExtern(MPI_Request *req);
+  callNonblockingExtern(&req);
+}
diff --git a/test/Analysis/mpicheckernotes.cpp b/test/Analysis/mpicheckernotes.cpp
new file mode 100644
index 0000000000000..be312fdf5fda1
--- /dev/null
+++ b/test/Analysis/mpicheckernotes.cpp
@@ -0,0 +1,34 @@
+// RUN: %clang_cc1 -analyze -analyzer-checker=optin.mpi.MPI-Checker -analyzer-output=text -verify %s
+
+// MPI-Checker test file to test note diagnostics.
+
+#include "MPIMock.h"
+
+void doubleNonblocking() {
+  double buf = 0;
+  MPI_Request sendReq;
+  MPI_Isend(&buf, 1, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, &sendReq); // expected-note{{Request is previously used by nonblocking call here.}}
+  MPI_Irecv(&buf, 1, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, &sendReq); // expected-warning{{Double nonblocking on request 'sendReq'.}} expected-note{{Double nonblocking on request 'sendReq'.}}
+  MPI_Wait(&sendReq, MPI_STATUS_IGNORE);
+}
+
+void missingWait() {
+  double buf = 0;
+  MPI_Request sendReq;
+  MPI_Ireduce(MPI_IN_PLACE, &buf, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD, &sendReq); // expected-note{{Request is previously used by nonblocking call here.}}
+} // expected-warning{{Request 'sendReq' has no matching wait.}} expected-note{{Request 'sendReq' has no matching wait.}}
+
+// If more than 2 nonblocking calls are using a request in a sequence, they all
+// point to the first call as the 'previous' call. This is because the
+// BugReporterVisitor only checks for differences in state or existence of an
+// entity.
+void tripleNonblocking() {
+  double buf = 0;
+  MPI_Request sendReq;
+  MPI_Isend(&buf, 1, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, &sendReq); // expected-note 2{{Request is previously used by nonblocking call here.}}
+  MPI_Irecv(&buf, 1, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, &sendReq); // expected-warning{{Double nonblocking on request 'sendReq'.}} expected-note{{Double nonblocking on request 'sendReq'.}}
+
+  MPI_Isend(&buf, 1, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, &sendReq); // expected-warning{{Double nonblocking on request 'sendReq'.}} expected-note{{Double nonblocking on request 'sendReq'.}}
+
+  MPI_Wait(&sendReq, MPI_STATUS_IGNORE);
+}
diff --git a/test/Analysis/nullability-no-arc.mm b/test/Analysis/nullability-no-arc.mm
new file mode 100644
index 0000000000000..37d29b7457afa
--- /dev/null
+++ b/test/Analysis/nullability-no-arc.mm
@@ -0,0 +1,100 @@
+// RUN: %clang_cc1 -analyze -analyzer-checker=core,nullability -verify %s
+
+#define nil 0
+
+@protocol NSObject
++ (id)alloc;
+- (id)init;
+- (instancetype)autorelease;
+- (void)release;
+@end
+
+__attribute__((objc_root_class))
+@interface
+NSObject<NSObject>
+@end
+
+@interface TestObject : NSObject
+@end
+
+TestObject * _Nonnull returnsNilObjCInstanceIndirectly() {
+  TestObject *local = 0;
+  return local; // expected-warning {{Null is returned from a function that is expected to return a non-null value}}
+}
+
+TestObject * _Nonnull returnsNilObjCInstanceIndirectlyWithSupressingCast() {
+  TestObject *local = 0;
+  return (TestObject * _Nonnull)local; // no-warning
+}
+
+TestObject * _Nonnull returnsNilObjCInstanceDirectly() {
+  // The first warning is from Sema. The second is from the static analyzer.
+  return nil; // expected-warning {{null returned from function that requires a non-null return value}}
+              // expected-warning@-1 {{Null is returned from a function that is expected to return a non-null value}}
+}
+
+TestObject * _Nonnull returnsNilObjCInstanceDirectlyWithSuppressingCast() {
+  return (TestObject * _Nonnull)nil; // no-warning
+}
+
+void testObjCNonARCNoInitialization(TestObject * _Nonnull p) {
+  TestObject * _Nonnull implicitlyZeroInitialized; // no-warning
+  implicitlyZeroInitialized = p;
+}
+
+void testObjCNonARCExplicitZeroInitialization() {
+  TestObject * _Nonnull explicitlyZeroInitialized = nil; // expected-warning {{Null is assigned to a pointer which is expected to have non-null value}}
+}
+
+@interface ClassWithInitializers : NSObject
+@end
+
+@implementation ClassWithInitializers
+- (instancetype _Nonnull)initWithNonnullReturnAndSelfCheckingIdiom {
+  // This defensive check is a common-enough idiom that we don't want
+  // to issue a diagnostic for it.
+  if (self = [super init]) {
+  }
+
+  return self; // no-warning
+}
+
+- (instancetype _Nonnull)initWithNonnullReturnAndNilReturnViaLocal {
+  self = [super init];
+  // This leaks, but we're not checking for that here.
+
+  ClassWithInitializers *other = nil;
+  // False negative. Once we have more subtle suppression of defensive checks in
+  // initializers we should warn here.
+  return other;
+}
+
+- (instancetype _Nonnull)initWithPreconditionViolation:(int)p {
+  self = [super init];
+  if (p < 0) {
+    [self release];
+    return (ClassWithInitializers * _Nonnull)nil;
+  }
+  return self;
+}
+
++ (instancetype _Nonnull)factoryCallingInitWithNonnullReturnAndSelfCheckingIdiom {
+  return [[[self alloc] initWithNonnullReturnAndSelfCheckingIdiom] autorelease]; // no-warning
+}
+
++ (instancetype _Nonnull)factoryCallingInitWithNonnullReturnAndNilReturnViaLocal {
+  return [[[self alloc] initWithNonnullReturnAndNilReturnViaLocal] autorelease]; // no-warning
+}
+
++ (instancetype _Nonnull)initWithPreconditionViolation:(int) p {
+  return [[[self alloc] initWithPreconditionViolation:p] autorelease]; // no-warning
+}
+
+- (TestObject * _Nonnull) returnsNil {
+  return (TestObject * _Nonnull)nil;
+}
+- (TestObject * _Nonnull) inlineOfReturnsNilObjCInstanceDirectlyWithSuppressingCast {
+  TestObject *o = [self returnsNil];
+  return o;
+}
+@end
diff --git a/test/Analysis/nullability.mm b/test/Analysis/nullability.mm
index 2c29d0088e18f..c6d6519d90e74 100644
--- a/test/Analysis/nullability.mm
+++ b/test/Analysis/nullability.mm
@@ -1,25 +1,7 @@
-// RUN: %clang_cc1 -fobjc-arc -analyze -analyzer-checker=core,nullability -verify %s
+// RUN: %clang_cc1 -fblocks -analyze -analyzer-checker=core,nullability -DNOSYSTEMHEADERS=0 -verify %s
+// RUN: %clang_cc1 -fblocks -analyze -analyzer-checker=core,nullability -analyzer-config nullability:NoDiagnoseCallsToSystemHeaders=true -DNOSYSTEMHEADERS=1 -verify %s
 
-#define nil 0
-#define BOOL int
-
-@protocol NSObject
-+ (id)alloc;
-- (id)init;
-@end
-
-@protocol NSCopying
-@end
-
-__attribute__((objc_root_class))
-@interface
-NSObject<NSObject>
-@end
-
-@interface NSString : NSObject<NSCopying>
-- (BOOL)isEqualToString : (NSString *_Nonnull)aString;
-- (NSString *)stringByAppendingString:(NSString *_Nonnull)aString;
-@end
+#include "Inputs/system-header-simulator-for-nullability.h"
 
 @interface TestObject : NSObject
 - (int *_Nonnull)returnsNonnull;
@@ -58,16 +40,16 @@ void testBasicRules() {
   // Make every dereference a different path to avoid sinks after errors.
   switch (getRandom()) {
   case 0: {
-    Dummy &r = *p; // expected-warning {{}}
+    Dummy &r = *p; // expected-warning {{Nullable pointer is dereferenced}}
   } break;
   case 1: {
-    int b = p->val; // expected-warning {{}}
+    int b = p->val; // expected-warning {{Nullable pointer is dereferenced}}
   } break;
   case 2: {
-    int stuff = *ptr; // expected-warning {{}}
+    int stuff = *ptr; // expected-warning {{Nullable pointer is dereferenced}}
   } break;
   case 3:
-    takesNonnull(p); // expected-warning {{}}
+    takesNonnull(p); // expected-warning {{Nullable pointer is passed to a callee that requires a non-null 1st parameter}}
     break;
   case 4: {
     Dummy d;
@@ -89,11 +71,11 @@ void testBasicRules() {
   Dummy *q = 0;
   if (getRandom()) {
     takesNullable(q);
-    takesNonnull(q); // expected-warning {{}}
+    takesNonnull(q); // expected-warning {{Null passed to a callee that requires a non-null 1st parameter}}
   }
   Dummy a;
   Dummy *_Nonnull nonnull = &a;
-  nonnull = q; // expected-warning {{}}
+  nonnull = q; // expected-warning {{Null is assigned to a pointer which is expected to have non-null value}}
   q = &a;
   takesNullable(q);
   takesNonnull(q);
@@ -106,26 +88,26 @@ void testArgumentTracking(Dummy *_Nonnull nonnull, Dummy *_Nullable nullable) {
   Dummy *p = nullable;
   Dummy *q = nonnull;
   switch(getRandom()) {
-  case 1: nonnull = p; break; // expected-warning {{}}
+  case 1: nonnull = p; break; // expected-warning {{Nullable pointer is assigned to a pointer which is expected to have non-null value}}
   case 2: p = 0; break;
   case 3: q = p; break;
   case 4: testMultiParamChecking(nonnull, nullable, nonnull); break;
   case 5: testMultiParamChecking(nonnull, nonnull, nonnull); break;
-  case 6: testMultiParamChecking(nonnull, nullable, nullable); break; // expected-warning {{}}
-  case 7: testMultiParamChecking(nullable, nullable, nonnull); // expected-warning {{}}
-  case 8: testMultiParamChecking(nullable, nullable, nullable); // expected-warning {{}}
+  case 6: testMultiParamChecking(nonnull, nullable, nullable); break; // expected-warning {{Nullable pointer is passed to a callee that requires a non-null 3rd parameter}}
+  case 7: testMultiParamChecking(nullable, nullable, nonnull); // expected-warning {{Nullable pointer is passed to a callee that requires a non-null 1st parameter}}
+  case 8: testMultiParamChecking(nullable, nullable, nullable); // expected-warning {{Nullable pointer is passed to a callee that requires a non-null 1st parameter}}
   case 9: testMultiParamChecking((Dummy *_Nonnull)0, nullable, nonnull); break;
   }
 }
 
 Dummy *_Nonnull testNullableReturn(Dummy *_Nullable a) {
   Dummy *p = a;
-  return p; // expected-warning {{}}
+  return p; // expected-warning {{Nullable pointer is returned from a function that is expected to return a non-null value}}
 }
 
 Dummy *_Nonnull testNullReturn() {
   Dummy *p = 0;
-  return p; // expected-warning {{}}
+  return p; // expected-warning {{Null is returned from a function that is expected to return a non-null value}}
 }
 
 void testObjCMessageResultNullability() {
@@ -147,20 +129,20 @@ void testObjCMessageResultNullability() {
     break;
   case 3:
     shouldBeNullable = [eraseNullab(getNullableTestObject()) returnsNullable];
-    [o takesNonnull:shouldBeNullable]; // expected-warning {{}}
+    [o takesNonnull:shouldBeNullable]; // expected-warning {{Nullable pointer is passed to a callee that requires a non-null 1st parameter}}
     break;
   case 4:
     shouldBeNullable = [eraseNullab(getNonnullTestObject()) returnsNullable];
-    [o takesNonnull:shouldBeNullable]; // expected-warning {{}}
+    [o takesNonnull:shouldBeNullable]; // expected-warning {{Nullable pointer is passed to a callee that requires a non-null 1st parameter}}
     break;
   case 5:
     shouldBeNullable =
         [eraseNullab(getUnspecifiedTestObject()) returnsNullable];
-    [o takesNonnull:shouldBeNullable]; // expected-warning {{}}
+    [o takesNonnull:shouldBeNullable]; // expected-warning {{Nullable pointer is passed to a callee that requires a non-null 1st parameter}}
     break;
   case 6:
     shouldBeNullable = [eraseNullab(getNullableTestObject()) returnsNullable];
-    [o takesNonnull:shouldBeNullable]; // expected-warning {{}}
+    [o takesNonnull:shouldBeNullable]; // expected-warning {{Nullable pointer is passed to a callee that requires a non-null 1st parameter}}
     break;
   case 7: {
     int *shouldBeNonnull = [eraseNullab(getNonnullTestObject()) returnsNonnull];
@@ -189,7 +171,59 @@ Dummy * _Nonnull testDirectCastNilToNonnull() {
 void testIndirectCastNilToNonnullAndPass() {
   Dummy *p = (Dummy * _Nonnull)0;
   // FIXME: Ideally the cast above would suppress this warning.
-  takesNonnull(p);  // expected-warning {{Null passed to a callee that requires a non-null argument}}
+  takesNonnull(p);  // expected-warning {{Null passed to a callee that requires a non-null 1st parameter}}
+}
+
+void testDirectCastNilToNonnullAndAssignToLocalInInitializer() {
+  Dummy * _Nonnull nonnullLocalWithAssignmentInInitializer = (Dummy * _Nonnull)0; // no-warning
+  (void)nonnullLocalWithAssignmentInInitializer;
+
+  // Since we've already had an invariant violation along this path,
+  // we shouldn't warn here.
+  nonnullLocalWithAssignmentInInitializer = 0;
+  (void)nonnullLocalWithAssignmentInInitializer;
+
+}
+
+void testDirectCastNilToNonnullAndAssignToLocal(Dummy * _Nonnull p) {
+  Dummy * _Nonnull nonnullLocalWithAssignment = p;
+  nonnullLocalWithAssignment = (Dummy * _Nonnull)0; // no-warning
+  (void)nonnullLocalWithAssignment;
+
+  // Since we've already had an invariant violation along this path,
+  // we shouldn't warn here.
+  nonnullLocalWithAssignment = 0;
+  (void)nonnullLocalWithAssignment;
+}
+
+void testDirectCastNilToNonnullAndAssignToParam(Dummy * _Nonnull p) {
+  p = (Dummy * _Nonnull)0; // no-warning
+}
+
+@interface ClassWithNonnullIvar : NSObject {
+  Dummy *_nonnullIvar;
+}
+@end
+
+@implementation ClassWithNonnullIvar
+-(void)testDirectCastNilToNonnullAndAssignToIvar {
+  _nonnullIvar = (Dummy * _Nonnull)0; // no-warning;
+
+  // Since we've already had an invariant violation along this path,
+  // we shouldn't warn here.
+  _nonnullIvar = 0;
+}
+@end
+
+void testIndirectNilPassToNonnull() {
+  Dummy *p = 0;
+  takesNonnull(p);  // expected-warning {{Null passed to a callee that requires a non-null 1st parameter}}
+}
+
+void testConditionalNilPassToNonnull(Dummy *p) {
+  if (!p) {
+    takesNonnull(p);  // expected-warning {{Null passed to a callee that requires a non-null 1st parameter}}
+  }
 }
 
 Dummy * _Nonnull testIndirectCastNilToNonnullAndReturn() {
@@ -206,7 +240,7 @@ void testInvalidPropagation() {
 
 void onlyReportFirstPreconditionViolationOnPath() {
   Dummy *p = returnsNullable();
-  takesNonnull(p); // expected-warning {{}}
+  takesNonnull(p); // expected-warning {{Nullable pointer is passed to a callee that requires a non-null 1st parameter}}
   takesNonnull(p); // No warning.
   // The first warning was not a sink. The analysis expected to continue.
   int i = 0;
@@ -245,6 +279,41 @@ void testPreconditionViolationInInlinedFunction(Dummy *p) {
   doNotWarnWhenPreconditionIsViolated(p);
 }
 
+@interface TestInlinedPreconditionViolationClass : NSObject
+@end
+
+@implementation TestInlinedPreconditionViolationClass
+-(Dummy * _Nonnull) calleeWithParam:(Dummy * _Nonnull) p2 {
+  Dummy *x = 0;
+  if (!p2) // p2 binding becomes dead at this point.
+    return x; // no-warning
+  else
+   return p2;
+}
+
+-(Dummy *)callerWithParam:(Dummy * _Nonnull) p1 {
+  return [self calleeWithParam:p1];
+}
+
+@end
+
+int * _Nonnull InlinedPreconditionViolationInFunctionCallee(int * _Nonnull p2) {
+  int *x = 0;
+  if (!p2) // p2 binding becomes dead at this point.
+    return x; // no-warning
+  else
+   return p2;
+}
+
+int * _Nonnull InlinedReturnNullOverSuppressionCallee(int * _Nonnull p2) {
+  int *result = 0;
+  return result; // no-warning; but this is an over suppression
+}
+
+int *InlinedReturnNullOverSuppressionCaller(int * _Nonnull p1) {
+  return InlinedReturnNullOverSuppressionCallee(p1);
+}
+
 void inlinedNullable(Dummy *_Nullable p) {
   if (p) return;
 }
@@ -255,6 +324,14 @@ void inlinedUnspecified(Dummy *p) {
   if (p) return;
 }
 
+void testNilReturnWithBlock(Dummy *p) {
+  p = 0;
+  Dummy *_Nonnull (^myblock)(void) = ^Dummy *_Nonnull(void) {
+    return p; // TODO: We should warn in blocks.
+  };
+  myblock();
+}
+
 Dummy *_Nonnull testDefensiveInlineChecks(Dummy * p) {
   switch (getRandom()) {
   case 1: inlinedNullable(p); break;
@@ -279,11 +356,172 @@ Dummy *_Nonnull testDefensiveInlineChecks(Dummy * p) {
   return p;
 }
 
-void testObjCARCImplicitZeroInitialization() {
-  TestObject * _Nonnull implicitlyZeroInitialized; // no-warning
-  implicitlyZeroInitialized = getNonnullTestObject();
+
+@interface SomeClass : NSObject {
+  int instanceVar;
+}
+@end
+
+@implementation SomeClass (MethodReturn)
+- (id)initWithSomething:(int)i {
+  if (self = [super init]) {
+    instanceVar = i;
+  }
+
+  return self;
+}
+
+- (TestObject * _Nonnull)testReturnsNullableInNonnullIndirectly {
+  TestObject *local = getNullableTestObject();
+  return local; // expected-warning {{Nullable pointer is returned from a method that is expected to return a non-null value}}
+}
+
+- (TestObject * _Nonnull)testReturnsCastSuppressedNullableInNonnullIndirectly {
+  TestObject *local = getNullableTestObject();
+  return (TestObject * _Nonnull)local; // no-warning
+}
+
+- (TestObject * _Nonnull)testReturnsNullableInNonnullWhenPreconditionViolated:(TestObject * _Nonnull) p {
+  TestObject *local = getNullableTestObject();
+  if (!p) // Pre-condition violated here.
+    return local; // no-warning
+  else
+    return p; // no-warning
+}
+@end
+
+@interface ClassWithInitializers : NSObject
+@end
+
+@implementation ClassWithInitializers
+- (instancetype _Nonnull)initWithNonnullReturnAndSelfCheckingIdiom {
+  // This defensive check is a common-enough idiom that we filter don't want
+  // to issue a diagnostic for it,
+  if (self = [super init]) {
+  }
+
+  return self; // no-warning
+}
+
+- (instancetype _Nonnull)initWithNonnullReturnAndNilReturnViaLocal {
+  self = [super init];
+  // This leaks, but we're not checking for that here.
+
+  ClassWithInitializers *other = nil;
+  // False negative. Once we have more subtle suppression of defensive checks in
+  // initializers we should warn here.
+  return other;
+}
+@end
+
+@interface SubClassWithInitializers : ClassWithInitializers
+@end
+
+@implementation SubClassWithInitializers
+// Note: Because this is overridding
+// -[ClassWithInitializers initWithNonnullReturnAndSelfCheckingIdiom],
+// the return type of this method becomes implicitly id _Nonnull.
+- (id)initWithNonnullReturnAndSelfCheckingIdiom {
+  if (self = [super initWithNonnullReturnAndSelfCheckingIdiom]) {
+  }
+
+  return self; // no-warning
+}
+
+- (id _Nonnull)initWithNonnullReturnAndSelfCheckingIdiomV2; {
+  // Another common return-checking idiom
+  self = [super initWithNonnullReturnAndSelfCheckingIdiom];
+  if (!self) {
+    return nil; // no-warning
+  }
+
+  return self;
 }
+@end
 
-void testObjCARCExplicitZeroInitialization() {
-  TestObject * _Nonnull explicitlyZeroInitialized = nil; // expected-warning {{Null is assigned to a pointer which is expected to have non-null value}}
+@interface ClassWithCopyWithZone : NSObject<NSCopying,NSMutableCopying> {
+  id i;
 }
+
+@end
+
+@implementation ClassWithCopyWithZone
+-(id)copyWithZone:(NSZone *)zone {
+  ClassWithCopyWithZone *newInstance = [[ClassWithCopyWithZone alloc] init];
+  if (!newInstance)
+    return nil;
+
+  newInstance->i = i;
+  return newInstance;
+}
+
+-(id)mutableCopyWithZone:(NSZone *)zone {
+  ClassWithCopyWithZone *newInstance = [[ClassWithCopyWithZone alloc] init];
+  if (newInstance) {
+    newInstance->i = i;
+  }
+
+  return newInstance;
+}
+@end
+
+NSString * _Nullable returnsNullableString();
+
+void callFunctionInSystemHeader() {
+  NSString *s = returnsNullableString();
+
+  NSSystemFunctionTakingNonnull(s);
+  #if !NOSYSTEMHEADERS
+  // expected-warning@-2{{Nullable pointer is passed to a callee that requires a non-null 1st parameter}}
+  #endif
+}
+
+void callMethodInSystemHeader() {
+  NSString *s = returnsNullableString();
+
+  NSSystemClass *sc = [[NSSystemClass alloc] init];
+  [sc takesNonnull:s];
+  #if !NOSYSTEMHEADERS
+  // expected-warning@-2{{Nullable pointer is passed to a callee that requires a non-null 1st parameter}}
+  #endif
+}
+
+// Test to make sure the analyzer doesn't warn when an a nullability invariant
+// has already been found to be violated on an instance variable.
+
+@class MyInternalClass;
+@interface MyClass : NSObject {
+  MyInternalClass * _Nonnull _internal;
+}
+@end
+
+@interface MyInternalClass : NSObject {
+  @public
+  id _someIvar;
+}
+-(id _Nonnull)methodWithInternalImplementation;
+@end
+
+@interface MyClass () {
+  MyInternalClass * _Nonnull _nilledOutInternal;
+}
+@end
+
+@implementation MyClass
+-(id _Nonnull)methodWithInternalImplementation {
+  if (!_internal)
+    return nil; // no-warning
+
+  return [_internal methodWithInternalImplementation];
+}
+
+- (id _Nonnull)methodReturningIvarInImplementation; {
+  return _internal == 0 ? nil : _internal->_someIvar; // no-warning
+}
+
+-(id _Nonnull)methodWithNilledOutInternal {
+  _nilledOutInternal = (id _Nonnull)nil;
+
+  return nil; // no-warning
+}
+@end
diff --git a/test/Analysis/nullability_nullonly.mm b/test/Analysis/nullability_nullonly.mm
index 56b3f9e144902..9671877719f36 100644
--- a/test/Analysis/nullability_nullonly.mm
+++ b/test/Analysis/nullability_nullonly.mm
@@ -1,4 +1,7 @@
-// RUN: %clang_cc1 -analyze -analyzer-checker=core,nullability.NullPassedToNonnull,nullability.NullReturnedFromNonnull -verify %s
+// RUN: %clang_cc1 -analyze -fobjc-arc -analyzer-checker=core,nullability.NullPassedToNonnull,nullability.NullReturnedFromNonnull -DNOSYSTEMHEADERS=0 -verify %s
+// RUN: %clang_cc1 -analyze -fobjc-arc -analyzer-checker=core,nullability.NullPassedToNonnull,nullability.NullReturnedFromNonnull -analyzer-config nullability:NoDiagnoseCallsToSystemHeaders=true -DNOSYSTEMHEADERS=1 -verify %s
+
+#include "Inputs/system-header-simulator-for-nullability.h"
 
 int getRandom();
 
@@ -15,18 +18,18 @@ void testBasicRules() {
   Dummy *q = 0;
   if (getRandom()) {
     takesNullable(q);
-    takesNonnull(q); // expected-warning {{}}
+    takesNonnull(q); // expected-warning {{Null passed to a callee that requires a non-null 1st parameter}}
   }
 }
 
 Dummy *_Nonnull testNullReturn() {
   Dummy *p = 0;
-  return p; // expected-warning {{}}
+  return p; // expected-warning {{Null is returned from a function that is expected to return a non-null value}}
 }
 
 void onlyReportFirstPreconditionViolationOnPath() {
   Dummy *p = 0;
-  takesNonnull(p); // expected-warning {{}}
+  takesNonnull(p); // expected-warning {{Null passed to a callee that requires a non-null 1st parameter}}
   takesNonnull(p); // No warning.
   // Passing null to nonnull is a sink. Stop the analysis.
   int i = 0;
@@ -85,3 +88,83 @@ Dummy *_Nonnull testDefensiveInlineChecks(Dummy * p) {
     takesNonnull(p);
   return p;
 }
+
+@interface TestObject : NSObject
+@end
+
+TestObject *_Nonnull getNonnullTestObject();
+
+void testObjCARCImplicitZeroInitialization() {
+  TestObject * _Nonnull implicitlyZeroInitialized; // no-warning
+  implicitlyZeroInitialized = getNonnullTestObject();
+}
+
+void testObjCARCExplicitZeroInitialization() {
+  TestObject * _Nonnull explicitlyZeroInitialized = nil; // expected-warning {{Null is assigned to a pointer which is expected to have non-null value}}
+}
+
+// Under ARC, returned expressions of ObjC objects types are are implicitly
+// cast to _Nonnull when the functions return type is _Nonnull, so make
+// sure this doesn't implicit cast doesn't suppress a legitimate warning.
+TestObject * _Nonnull returnsNilObjCInstanceIndirectly() {
+  TestObject *local = 0;
+  return local; // expected-warning {{Null is returned from a function that is expected to return a non-null value}}
+}
+
+TestObject * _Nonnull returnsNilObjCInstanceIndirectlyWithSupressingCast() {
+  TestObject *local = 0;
+  return (TestObject * _Nonnull)local; // no-warning
+}
+
+TestObject * _Nonnull returnsNilObjCInstanceDirectly() {
+  return nil; // expected-warning {{Null is returned from a function that is expected to return a non-null value}}
+}
+
+TestObject * _Nonnull returnsNilObjCInstanceDirectlyWithSuppressingCast() {
+  return (TestObject * _Nonnull)nil; // no-warning
+}
+
+@interface SomeClass : NSObject
+@end
+
+@implementation SomeClass (MethodReturn)
+- (SomeClass * _Nonnull)testReturnsNilInNonnull {
+  SomeClass *local = nil;
+  return local; // expected-warning {{Null is returned from a method that is expected to return a non-null value}}
+}
+
+- (SomeClass * _Nonnull)testReturnsCastSuppressedNilInNonnull {
+  SomeClass *local = nil;
+  return (SomeClass * _Nonnull)local; // no-warning
+}
+
+- (SomeClass * _Nonnull)testReturnsNilInNonnullWhenPreconditionViolated:(SomeClass * _Nonnull) p {
+  SomeClass *local = nil;
+  if (!p) // Pre-condition violated here.
+    return local; // no-warning
+  else
+    return p; // no-warning
+}
+@end
+
+
+void callFunctionInSystemHeader() {
+  NSString *s;
+  s = nil;
+
+  NSSystemFunctionTakingNonnull(s);
+  #if !NOSYSTEMHEADERS
+  // expected-warning@-2{{Null passed to a callee that requires a non-null 1st parameter}}
+  #endif
+}
+
+void callMethodInSystemHeader() {
+  NSString *s;
+  s = nil;
+
+  NSSystemClass *sc = [[NSSystemClass alloc] init];
+  [sc takesNonnull:s];
+  #if !NOSYSTEMHEADERS
+  // expected-warning@-2{{Null passed to a callee that requires a non-null 1st parameter}}
+  #endif
+}
diff --git a/test/Analysis/properties.m b/test/Analysis/properties.m
index bf9424c8c2068..b1305341e5d4b 100644
--- a/test/Analysis/properties.m
+++ b/test/Analysis/properties.m
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -analyze -analyzer-checker=core,osx.cocoa.RetainCount,debug.ExprInspection -analyzer-store=region -verify -Wno-objc-root-class %s
-// RUN: %clang_cc1 -analyze -analyzer-checker=core,osx.cocoa.RetainCount,debug.ExprInspection -analyzer-store=region -verify -Wno-objc-root-class -fobjc-arc %s
+// RUN: %clang_cc1 -analyze -analyzer-checker=core,osx.cocoa.RetainCount,osx.cocoa.Dealloc,debug.ExprInspection -analyzer-store=region -verify -Wno-objc-root-class %s
+// RUN: %clang_cc1 -analyze -analyzer-checker=core,osx.cocoa.RetainCount,osx.cocoa.Dealloc,debug.ExprInspection -analyzer-store=region -verify -Wno-objc-root-class -fobjc-arc %s
 
 void clang_analyzer_eval(int);
 
@@ -22,6 +22,7 @@ typedef struct _NSZone NSZone;
 -(id)copy;
 -(id)retain;
 -(oneway void)release;
+-(void)dealloc;
 @end
 @interface NSString : NSObject <NSCopying, NSMutableCopying, NSCoding>
 - (NSUInteger)length;
@@ -138,6 +139,14 @@ NSNumber* numberFromMyNumberProperty(MyNumber* aMyNumber)
 
 @implementation Person
 @synthesize name = _name;
+
+-(void)dealloc {
+#if !__has_feature(objc_arc)
+  self.name = [[NSString alloc] init]; // expected-warning {{leak}}
+
+  [super dealloc]; // expected-warning {{The '_name' ivar in 'Person' was retained by a synthesized property but not released before '[super dealloc]}}
+#endif
+}
 @end
 
 #if !__has_feature(objc_arc)
@@ -211,6 +220,278 @@ void testConsistencyAssign(Person *p) {
   clang_analyzer_eval(p.friend == origFriend); // expected-warning{{UNKNOWN}}
 }
 
+@interface ClassWithShadowedReadWriteProperty {
+  int _f;
+}
+@property (readonly) int someProp;
+@end
+
+@interface ClassWithShadowedReadWriteProperty ()
+@property (readwrite) int someProp;
+@end
+
+@implementation ClassWithShadowedReadWriteProperty
+- (void)testSynthesisForShadowedReadWriteProperties; {
+  clang_analyzer_eval(self.someProp == self.someProp); // expected-warning{{TRUE}}
+
+  _f = 1;
+
+  // Read of shadowed property should not invalidate receiver.
+  (void)self.someProp;
+  clang_analyzer_eval(_f == 1); // expected-warning{{TRUE}}
+
+  _f = 2;
+  // Call to getter of shadowed property should not invalidate receiver.
+  (void)[self someProp];
+  clang_analyzer_eval(_f == 2); // expected-warning{{TRUE}}
+}
+@end
+
+// Tests for the analyzer fix that works around a Sema bug
+// where multiple methods are created for properties in class extensions that
+// are redeclared in a category method.
+// The Sema bug is tracked as <rdar://problem/25481164>.
+@interface ClassWithRedeclaredPropertyInExtensionFollowedByCategory
+@end
+
+@interface ClassWithRedeclaredPropertyInExtensionFollowedByCategory ()
+@end
+
+@interface ClassWithRedeclaredPropertyInExtensionFollowedByCategory ()
+@property (readwrite) int someProp;
+@property (readonly) int otherProp;
+@end
+
+@interface ClassWithRedeclaredPropertyInExtensionFollowedByCategory (MyCat)
+@property (readonly) int someProp;
+@property (readonly) int otherProp;
+@end
+
+@implementation ClassWithRedeclaredPropertyInExtensionFollowedByCategory
+- (void)testSynthesisForRedeclaredProperties; {
+  clang_analyzer_eval(self.someProp == self.someProp); // expected-warning{{TRUE}}
+  clang_analyzer_eval([self someProp] == self.someProp); // expected-warning{{TRUE}}
+
+  clang_analyzer_eval(self.otherProp == self.otherProp); // expected-warning{{TRUE}}
+  clang_analyzer_eval([self otherProp] == self.otherProp); // expected-warning{{TRUE}}
+}
+@end
+
+// The relative order of the extension and the category matter, so test both.
+@interface ClassWithRedeclaredPropertyInCategoryFollowedByExtension
+@end
+
+@interface ClassWithRedeclaredPropertyInCategoryFollowedByExtension ()
+@property (readwrite) int someProp;
+@end
+
+@interface ClassWithRedeclaredPropertyInCategoryFollowedByExtension (MyCat)
+@property (readonly) int someProp;
+@end
+
+@implementation ClassWithRedeclaredPropertyInCategoryFollowedByExtension
+- (void)testSynthesisForRedeclaredProperties; {
+  clang_analyzer_eval(self.someProp == self.someProp); // expected-warning{{TRUE}}
+  clang_analyzer_eval([self someProp] == self.someProp); // expected-warning{{TRUE}}
+}
+@end
+
+@interface ClassWithSynthesizedPropertyAndGetter
+@property (readonly) int someProp;
+@end
+
+@implementation ClassWithSynthesizedPropertyAndGetter
+@synthesize someProp;
+
+// Make sure that the actual getter is inlined and not a getter created
+// by BodyFarm
+- (void)testBodyFarmGetterNotUsed {
+  int i = self.someProp;
+  clang_analyzer_eval(i == 22); // expected-warning {{TRUE}}
+}
+
+-(int)someProp {
+  return 22;
+}
+@end
+
+//------
+// Setter ivar invalidation.
+//------
+
+@interface ClassWithSetters
+// Note: These properties have implicit @synthesize implementations to be
+// backed with ivars.
+@property (assign) int propWithIvar1;
+@property (assign) int propWithIvar2;
+
+@property (retain) NSNumber *retainedProperty;
+
+@end
+
+@interface ClassWithSetters (InOtherTranslationUnit)
+// The implementation of this property is in another translation unit.
+// We don't know whether it is backed by an ivar or not.
+@property (assign) int propInOther;
+@end
+
+@implementation ClassWithSetters
+- (void) testSettingPropWithIvarInvalidatesExactlyThatIvar; {
+  _propWithIvar1 = 1;
+  _propWithIvar2 = 2;
+  self.propWithIvar1 = 66;
+
+  // Calling the setter of a property backed by the instance variable
+  // should invalidate the storage for the instance variable but not
+  // the rest of the receiver. Ideally we would model the setter completely
+  // but doing so would cause the new value to escape when it is bound
+  // to the ivar. This would cause bad false negatives in the retain count
+  // checker. (There is a test for this scenario in
+  // testWriteRetainedValueToRetainedProperty below).
+  clang_analyzer_eval(_propWithIvar1 == 66); // expected-warning{{UNKNOWN}}
+  clang_analyzer_eval(_propWithIvar2 == 2);  // expected-warning{{TRUE}}
+
+  _propWithIvar1 = 1;
+  [self setPropWithIvar1:66];
+
+  clang_analyzer_eval(_propWithIvar1 == 66); // expected-warning{{UNKNOWN}}
+  clang_analyzer_eval(_propWithIvar2 == 2);  // expected-warning{{TRUE}}
+}
+
+- (void) testSettingPropWithoutIvarInvalidatesEntireInstance; {
+  _propWithIvar1 = 1;
+  _propWithIvar2 = 2;
+  self.propInOther = 66;
+
+  clang_analyzer_eval(_propWithIvar1 == 66); // expected-warning{{UNKNOWN}}
+  clang_analyzer_eval(_propWithIvar2 == 2);  // expected-warning{{UNKNOWN}}
+
+  _propWithIvar1 = 1;
+  _propWithIvar2 = 2;
+  [self setPropInOther:66];
+
+  clang_analyzer_eval(_propWithIvar1 == 66); // expected-warning{{UNKNOWN}}
+  clang_analyzer_eval(_propWithIvar2 == 2);  // expected-warning{{UNKNOWN}}
+}
+
+#if !__has_feature(objc_arc)
+- (void) testWriteRetainedValueToRetainedProperty; {
+  NSNumber *number = [[NSNumber alloc] initWithInteger:5]; // expected-warning {{Potential leak of an object stored into 'number'}}
+
+  // Make sure we catch this leak.
+  self.retainedProperty = number;
+}
+#endif
+@end
+
+//------
+// class properties
+//------
+
+int gBackingForReadWriteClassProp = 0;
+
+@interface ClassWithClassProperties
+@property(class, readonly) int readOnlyClassProp;
+
+@property(class) int readWriteClassProp;
+
+// Make sure we handle when a class and instance property have the same
+// name. Test both when instance comes first and when class comes first.
+@property(readonly) int classAndInstancePropWithSameNameOrderInstanceFirst;
+@property(class, readonly) int classAndInstancePropWithSameNameOrderInstanceFirst;
+
+@property(class, readonly) int classAndInstancePropWithSameNameOrderClassFirst;
+@property(readonly) int classAndInstancePropWithSameNameOrderClassFirst;
+
+
+@property(class, readonly) int dynamicClassProp;
+
+@end
+
+@interface ClassWithClassProperties (OtherTranslationUnit)
+@property(class, assign) id propInOtherTranslationUnit;
+@end
+
+@implementation ClassWithClassProperties
+
+@dynamic dynamicClassProp;
+
++ (int)readOnlyClassProp {
+  return 1;
+}
+
++ (int)readWriteClassProp {
+  return gBackingForReadWriteClassProp;
+}
+
++ (void)setReadWriteClassProp:(int)val {
+  gBackingForReadWriteClassProp = val;
+}
+
+- (int)classAndInstancePropWithSameNameOrderInstanceFirst {
+  return 12;
+}
+
++ (int)classAndInstancePropWithSameNameOrderInstanceFirst {
+  return 13;
+}
+
++ (int)classAndInstancePropWithSameNameOrderClassFirst {
+  return 14;
+}
+
+- (int)classAndInstancePropWithSameNameOrderClassFirst {
+  return 15;
+}
+
+- (void)testInlineClassProp {
+  clang_analyzer_eval(ClassWithClassProperties.readOnlyClassProp == 1); // expected-warning{{TRUE}}
+
+  ClassWithClassProperties.readWriteClassProp = 7;
+  clang_analyzer_eval(ClassWithClassProperties.readWriteClassProp == 7); // expected-warning{{TRUE}}
+  ClassWithClassProperties.readWriteClassProp = 8;
+  clang_analyzer_eval(ClassWithClassProperties.readWriteClassProp == 8); // expected-warning{{TRUE}}
+}
+
+- (void)testUnknownClassProp {
+  clang_analyzer_eval(ClassWithClassProperties.propInOtherTranslationUnit == ClassWithClassProperties.propInOtherTranslationUnit); // expected-warning{{UNKNOWN}}
+}
+
+- (void)testEscapeGlobalOnUnknownProp {
+  gBackingForReadWriteClassProp = 33;
+  ClassWithClassProperties.propInOtherTranslationUnit = 0;
+  clang_analyzer_eval(gBackingForReadWriteClassProp == 33); // expected-warning{{UNKNOWN}}
+}
+
+- (void)testClassAndInstancePropertyWithSameName {
+  clang_analyzer_eval(self.classAndInstancePropWithSameNameOrderInstanceFirst == 12); // expected-warning{{TRUE}}
+  clang_analyzer_eval(ClassWithClassProperties.classAndInstancePropWithSameNameOrderInstanceFirst == 13); // expected-warning{{TRUE}}
+
+  clang_analyzer_eval(ClassWithClassProperties.classAndInstancePropWithSameNameOrderClassFirst == 14); // expected-warning{{TRUE}}
+  clang_analyzer_eval(self.classAndInstancePropWithSameNameOrderClassFirst == 15); // expected-warning{{TRUE}}
+}
+
+- (void)testDynamicClassProp {
+  clang_analyzer_eval(ClassWithClassProperties.dynamicClassProp == 16); // expected-warning{{UNKNOWN}}
+}
+
+@end
+
+@interface SubclassOfClassWithClassProperties : ClassWithClassProperties
+@end
+
+@implementation SubclassOfClassWithClassProperties
++ (int)dynamicClassProp; {
+ return 16;
+}
+
+- (void)testDynamicClassProp {
+  clang_analyzer_eval(SubclassOfClassWithClassProperties.dynamicClassProp == 16); // expected-warning{{TRUE}}
+}
+
+@end
+
+
 #if !__has_feature(objc_arc)
 void testOverrelease(Person *p, int coin) {
   switch (coin) {
diff --git a/test/Analysis/ptr-arith.c b/test/Analysis/ptr-arith.c
index 57463cc7c871d..2b15badf42740 100644
--- a/test/Analysis/ptr-arith.c
+++ b/test/Analysis/ptr-arith.c
@@ -52,7 +52,7 @@ void f4() {
 void f5() {
   int x, y;
   int *p;
-  p = &x + 1;  // expected-warning{{Pointer arithmetic done on non-array variables means reliance on memory layout, which is dangerous}}
+  p = &x + 1;  // expected-warning{{Pointer arithmetic on non-array variables relies on memory layout, which is dangerous}}
 
   int a[10];
   p = a + 1; // no-warning
@@ -75,7 +75,7 @@ start:
   clang_analyzer_eval(&a != 0); // expected-warning{{TRUE}}
   clang_analyzer_eval(&a >= 0); // expected-warning{{TRUE}}
   clang_analyzer_eval(&a > 0); // expected-warning{{TRUE}}
-  clang_analyzer_eval((&a - 0) != 0); // expected-warning{{TRUE}} expected-warning{{Pointer arithmetic done on non-array variables}}
+  clang_analyzer_eval((&a - 0) != 0); // expected-warning{{TRUE}}
 
   // LHS is NULL, RHS is non-symbolic
   // The same code is used for labels and non-symbolic values.
diff --git a/test/Analysis/ptr-arith.cpp b/test/Analysis/ptr-arith.cpp
index 5f0951857b13f..07ddec30af568 100644
--- a/test/Analysis/ptr-arith.cpp
+++ b/test/Analysis/ptr-arith.cpp
@@ -1,5 +1,4 @@
-// RUN: %clang_cc1 -analyze -analyzer-checker=core,debug.ExprInspection -verify %s
-// expected-no-diagnostics
+// RUN: %clang_cc1 -Wno-unused-value -std=c++14 -analyze -analyzer-checker=core,debug.ExprInspection,alpha.core.PointerArithm -verify %s
 struct X {
   int *p;
   int zero;
@@ -20,3 +19,82 @@ int test (int *in) {
   return 5/littleX.zero; // no-warning
 }
 
+
+class Base {};
+class Derived : public Base {};
+
+void checkPolymorphicUse() {
+  Derived d[10];
+
+  Base *p = d;
+  ++p; // expected-warning{{Pointer arithmetic on a pointer to base class is dangerous}}
+}
+
+void checkBitCasts() {
+  long l;
+  char *p = (char*)&l;
+  p = p+2;
+}
+
+void checkBasicarithmetic(int i) {
+  int t[10];
+  int *p = t;
+  ++p;
+  int a = 5;
+  p = &a;
+  ++p; // expected-warning{{Pointer arithmetic on non-array variables relies on memory layout, which is dangerous}}
+  p = p + 2; // expected-warning{{}}
+  p = 2 + p; // expected-warning{{}}
+  p += 2; // expected-warning{{}}
+  a += p[2]; // expected-warning{{}}
+  p = i*0 + p;
+  p = p + i*0;
+  p += i*0;
+}
+
+void checkArithOnSymbolic(int*p) {
+  ++p;
+  p = p + 2;
+  p = 2 + p;
+  p += 2;
+  (void)p[2];
+}
+
+struct S {
+  int t[10];
+};
+
+void arrayInStruct() {
+  S s;
+  int * p = s.t;
+  ++p;
+  S *sp = new S;
+  p = sp->t;
+  ++p;
+  delete sp;
+}
+
+void checkNew() {
+  int *p = new int;
+  p[1] = 1; // expected-warning{{}}
+}
+
+void InitState(int* state) {
+    state[1] = 1; // expected-warning{{}}
+}
+
+int* getArray(int size) {
+    if (size == 0)
+      return new int;
+    return new int[5];
+}
+
+void checkConditionalArray() {
+    int* maybeArray = getArray(0);
+    InitState(maybeArray);
+}
+
+void checkMultiDimansionalArray() {
+  int a[5][5];
+   *(*(a+1)+2) = 2;
+}
diff --git a/test/Analysis/rdar-6442306-1.m b/test/Analysis/rdar-6442306-1.m
index 0fb49c2a9b2fd..31a300c1f66fa 100644
--- a/test/Analysis/rdar-6442306-1.m
+++ b/test/Analysis/rdar-6442306-1.m
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -analyze -analyzer-checker=core,alpha.core %s -analyzer-store=region -verify
+// RUN: %clang_cc1 -analyze -analyzer-checker=core,alpha.core -analyzer-disable-checker=alpha.core.PointerArithm %s -analyzer-store=region -verify
 // expected-no-diagnostics
 
 typedef int bar_return_t;
diff --git a/test/Analysis/stackaddrleak.c b/test/Analysis/stackaddrleak.c
index 21a15d75eca0a..717f30964acd5 100644
--- a/test/Analysis/stackaddrleak.c
+++ b/test/Analysis/stackaddrleak.c
@@ -19,7 +19,7 @@ void f2() {
   p = (const char *) __builtin_alloca(12);
 } // expected-warning{{Address of stack memory allocated by call to alloca() on line 19 is still referred to by the global variable 'p' upon returning to the caller.  This will be a dangling reference}}
 
-// PR 7383 - previosly the stack address checker would crash on this example
+// PR 7383 - previously the stack address checker would crash on this example
 //  because it would attempt to do a direct load from 'pr7383_list'. 
 static int pr7383(__const char *__)
 {
@@ -33,7 +33,7 @@ void test_multi_return() {
   int x;
   a = &x;
   b = &x;
-} // expected-warning{{Address of stack memory associated with local variable 'x' is still referred to by the global variable 'a' upon returning}} expected-warning{{Address of stack memory associated with local variable 'x' is still referred to by the global variable 'b' upon returning}}
+} // expected-warning{{Address of stack memory associated with local variable 'x' is still referred to by the static variable 'a' upon returning}} expected-warning{{Address of stack memory associated with local variable 'x' is still referred to by the static variable 'b' upon returning}}
 
 intptr_t returnAsNonLoc() {
   int x;
diff --git a/test/Analysis/string.c b/test/Analysis/string.c
index 9fd3efb5c2d77..2803362ba43e9 100644
--- a/test/Analysis/string.c
+++ b/test/Analysis/string.c
@@ -680,6 +680,18 @@ void strncat_empty() {
 #define strcmp BUILTIN(strcmp)
 int strcmp(const char * s1, const char * s2);
 
+void strcmp_check_modelling() {
+  char *x = "aa";
+  char *y = "a";
+  clang_analyzer_eval(strcmp(x, y) > 0); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strcmp(x, y) <= 0); // expected-warning{{FALSE}}
+  clang_analyzer_eval(strcmp(x, y) > 1); // expected-warning{{UNKNOWN}}
+
+  clang_analyzer_eval(strcmp(y, x) < 0); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strcmp(y, x) >= 0); // expected-warning{{FALSE}}
+  clang_analyzer_eval(strcmp(y, x) < -1); // expected-warning{{UNKNOWN}}
+}
+
 void strcmp_constant0() {
   clang_analyzer_eval(strcmp("123", "123") == 0); // expected-warning{{TRUE}}
 }
@@ -703,13 +715,13 @@ void strcmp_0() {
 void strcmp_1() {
   char *x = "234";
   char *y = "123";
-  clang_analyzer_eval(strcmp(x, y) == 1); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strcmp(x, y) > 0); // expected-warning{{TRUE}}
 }
 
 void strcmp_2() {
   char *x = "123";
   char *y = "234";
-  clang_analyzer_eval(strcmp(x, y) == -1); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strcmp(x, y) < 0); // expected-warning{{TRUE}}
 }
 
 void strcmp_null_0() {
@@ -727,25 +739,25 @@ void strcmp_null_1() {
 void strcmp_diff_length_0() {
   char *x = "12345";
   char *y = "234";
-  clang_analyzer_eval(strcmp(x, y) == -1); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strcmp(x, y) < 0); // expected-warning{{TRUE}}
 }
 
 void strcmp_diff_length_1() {
   char *x = "123";
   char *y = "23456";
-  clang_analyzer_eval(strcmp(x, y) == -1); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strcmp(x, y) < 0); // expected-warning{{TRUE}}
 }
 
 void strcmp_diff_length_2() {
   char *x = "12345";
   char *y = "123";
-  clang_analyzer_eval(strcmp(x, y) == 1); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strcmp(x, y) > 0); // expected-warning{{TRUE}}
 }
 
 void strcmp_diff_length_3() {
   char *x = "123";
   char *y = "12345";
-  clang_analyzer_eval(strcmp(x, y) == -1); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strcmp(x, y) < 0); // expected-warning{{TRUE}}
 }
 
 void strcmp_embedded_null () {
@@ -756,6 +768,20 @@ void strcmp_unknown_arg (char *unknown) {
 	clang_analyzer_eval(strcmp(unknown, unknown) == 0); // expected-warning{{TRUE}}
 }
 
+union argument {
+   char *f;
+};
+
+void function_pointer_cast_helper(char **a) {
+  strcmp("Hi", *a); // PR24951 crash
+}
+
+void strcmp_union_function_pointer_cast(union argument a) {
+  void (*fPtr)(union argument *) = (void (*)(union argument *))function_pointer_cast_helper;
+
+  fPtr(&a);
+}
+
 //===----------------------------------------------------------------------===
 // strncmp()
 //===----------------------------------------------------------------------===
@@ -763,6 +789,18 @@ void strcmp_unknown_arg (char *unknown) {
 #define strncmp BUILTIN(strncmp)
 int strncmp(const char *s1, const char *s2, size_t n);
 
+void strncmp_check_modelling() {
+  char *x = "aa";
+  char *y = "a";
+  clang_analyzer_eval(strncmp(x, y, 2) > 0); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strncmp(x, y, 2) <= 0); // expected-warning{{FALSE}}
+  clang_analyzer_eval(strncmp(x, y, 2) > 1); // expected-warning{{UNKNOWN}}
+
+  clang_analyzer_eval(strncmp(y, x, 2) < 0); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strncmp(y, x, 2) >= 0); // expected-warning{{FALSE}}
+  clang_analyzer_eval(strncmp(y, x, 2) < -1); // expected-warning{{UNKNOWN}}
+}
+
 void strncmp_constant0() {
   clang_analyzer_eval(strncmp("123", "123", 3) == 0); // expected-warning{{TRUE}}
 }
@@ -786,13 +824,13 @@ void strncmp_0() {
 void strncmp_1() {
   char *x = "234";
   char *y = "123";
-  clang_analyzer_eval(strncmp(x, y, 3) == 1); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strncmp(x, y, 3) > 0); // expected-warning{{TRUE}}
 }
 
 void strncmp_2() {
   char *x = "123";
   char *y = "234";
-  clang_analyzer_eval(strncmp(x, y, 3) == -1); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strncmp(x, y, 3) < 0); // expected-warning{{TRUE}}
 }
 
 void strncmp_null_0() {
@@ -810,25 +848,25 @@ void strncmp_null_1() {
 void strncmp_diff_length_0() {
   char *x = "12345";
   char *y = "234";
-  clang_analyzer_eval(strncmp(x, y, 5) == -1); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strncmp(x, y, 5) < 0); // expected-warning{{TRUE}}
 }
 
 void strncmp_diff_length_1() {
   char *x = "123";
   char *y = "23456";
-  clang_analyzer_eval(strncmp(x, y, 5) == -1); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strncmp(x, y, 5) < 0); // expected-warning{{TRUE}}
 }
 
 void strncmp_diff_length_2() {
   char *x = "12345";
   char *y = "123";
-  clang_analyzer_eval(strncmp(x, y, 5) == 1); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strncmp(x, y, 5) > 0); // expected-warning{{TRUE}}
 }
 
 void strncmp_diff_length_3() {
   char *x = "123";
   char *y = "12345";
-  clang_analyzer_eval(strncmp(x, y, 5) == -1); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strncmp(x, y, 5) < 0); // expected-warning{{TRUE}}
 }
 
 void strncmp_diff_length_4() {
@@ -840,13 +878,13 @@ void strncmp_diff_length_4() {
 void strncmp_diff_length_5() {
   char *x = "012";
   char *y = "12345";
-  clang_analyzer_eval(strncmp(x, y, 3) == -1); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strncmp(x, y, 3) < 0); // expected-warning{{TRUE}}
 }
 
 void strncmp_diff_length_6() {
   char *x = "234";
   char *y = "12345";
-  clang_analyzer_eval(strncmp(x, y, 3) == 1); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strncmp(x, y, 3) > 0); // expected-warning{{TRUE}}
 }
 
 void strncmp_embedded_null () {
@@ -860,6 +898,18 @@ void strncmp_embedded_null () {
 #define strcasecmp BUILTIN(strcasecmp)
 int strcasecmp(const char *s1, const char *s2);
 
+void strcasecmp_check_modelling() {
+  char *x = "aa";
+  char *y = "a";
+  clang_analyzer_eval(strcasecmp(x, y) > 0); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strcasecmp(x, y) <= 0); // expected-warning{{FALSE}}
+  clang_analyzer_eval(strcasecmp(x, y) > 1); // expected-warning{{UNKNOWN}}
+
+  clang_analyzer_eval(strcasecmp(y, x) < 0); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strcasecmp(y, x) >= 0); // expected-warning{{FALSE}}
+  clang_analyzer_eval(strcasecmp(y, x) < -1); // expected-warning{{UNKNOWN}}
+}
+
 void strcasecmp_constant0() {
   clang_analyzer_eval(strcasecmp("abc", "Abc") == 0); // expected-warning{{TRUE}}
 }
@@ -883,13 +933,13 @@ void strcasecmp_0() {
 void strcasecmp_1() {
   char *x = "Bcd";
   char *y = "abc";
-  clang_analyzer_eval(strcasecmp(x, y) == 1); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strcasecmp(x, y) > 0); // expected-warning{{TRUE}}
 }
 
 void strcasecmp_2() {
   char *x = "abc";
   char *y = "Bcd";
-  clang_analyzer_eval(strcasecmp(x, y) == -1); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strcasecmp(x, y) < 0); // expected-warning{{TRUE}}
 }
 
 void strcasecmp_null_0() {
@@ -907,25 +957,25 @@ void strcasecmp_null_1() {
 void strcasecmp_diff_length_0() {
   char *x = "abcde";
   char *y = "aBd";
-  clang_analyzer_eval(strcasecmp(x, y) == -1); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strcasecmp(x, y) < 0); // expected-warning{{TRUE}}
 }
 
 void strcasecmp_diff_length_1() {
   char *x = "abc";
   char *y = "aBdef";
-  clang_analyzer_eval(strcasecmp(x, y) == -1); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strcasecmp(x, y) < 0); // expected-warning{{TRUE}}
 }
 
 void strcasecmp_diff_length_2() {
   char *x = "aBcDe";
   char *y = "abc";
-  clang_analyzer_eval(strcasecmp(x, y) == 1); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strcasecmp(x, y) > 0); // expected-warning{{TRUE}}
 }
 
 void strcasecmp_diff_length_3() {
   char *x = "aBc";
   char *y = "abcde";
-  clang_analyzer_eval(strcasecmp(x, y) == -1); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strcasecmp(x, y) < 0); // expected-warning{{TRUE}}
 }
 
 void strcasecmp_embedded_null () {
@@ -939,6 +989,18 @@ void strcasecmp_embedded_null () {
 #define strncasecmp BUILTIN(strncasecmp)
 int strncasecmp(const char *s1, const char *s2, size_t n);
 
+void strncasecmp_check_modelling() {
+  char *x = "aa";
+  char *y = "a";
+  clang_analyzer_eval(strncasecmp(x, y, 2) > 0); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strncasecmp(x, y, 2) <= 0); // expected-warning{{FALSE}}
+  clang_analyzer_eval(strncasecmp(x, y, 2) > 1); // expected-warning{{UNKNOWN}}
+
+  clang_analyzer_eval(strncasecmp(y, x, 2) < 0); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strncasecmp(y, x, 2) >= 0); // expected-warning{{FALSE}}
+  clang_analyzer_eval(strncasecmp(y, x, 2) < -1); // expected-warning{{UNKNOWN}}
+}
+
 void strncasecmp_constant0() {
   clang_analyzer_eval(strncasecmp("abc", "Abc", 3) == 0); // expected-warning{{TRUE}}
 }
@@ -962,13 +1024,13 @@ void strncasecmp_0() {
 void strncasecmp_1() {
   char *x = "Bcd";
   char *y = "abc";
-  clang_analyzer_eval(strncasecmp(x, y, 3) == 1); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strncasecmp(x, y, 3) > 0); // expected-warning{{TRUE}}
 }
 
 void strncasecmp_2() {
   char *x = "abc";
   char *y = "Bcd";
-  clang_analyzer_eval(strncasecmp(x, y, 3) == -1); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strncasecmp(x, y, 3) < 0); // expected-warning{{TRUE}}
 }
 
 void strncasecmp_null_0() {
@@ -986,25 +1048,25 @@ void strncasecmp_null_1() {
 void strncasecmp_diff_length_0() {
   char *x = "abcde";
   char *y = "aBd";
-  clang_analyzer_eval(strncasecmp(x, y, 5) == -1); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strncasecmp(x, y, 5) < 0); // expected-warning{{TRUE}}
 }
 
 void strncasecmp_diff_length_1() {
   char *x = "abc";
   char *y = "aBdef";
-  clang_analyzer_eval(strncasecmp(x, y, 5) == -1); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strncasecmp(x, y, 5) < 0); // expected-warning{{TRUE}}
 }
 
 void strncasecmp_diff_length_2() {
   char *x = "aBcDe";
   char *y = "abc";
-  clang_analyzer_eval(strncasecmp(x, y, 5) == 1); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strncasecmp(x, y, 5) > 0); // expected-warning{{TRUE}}
 }
 
 void strncasecmp_diff_length_3() {
   char *x = "aBc";
   char *y = "abcde";
-  clang_analyzer_eval(strncasecmp(x, y, 5) == -1); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strncasecmp(x, y, 5) < 0); // expected-warning{{TRUE}}
 }
 
 void strncasecmp_diff_length_4() {
@@ -1016,13 +1078,13 @@ void strncasecmp_diff_length_4() {
 void strncasecmp_diff_length_5() {
   char *x = "abcde";
   char *y = "aBd";
-  clang_analyzer_eval(strncasecmp(x, y, 3) == -1); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strncasecmp(x, y, 3) < 0); // expected-warning{{TRUE}}
 }
 
 void strncasecmp_diff_length_6() {
   char *x = "aBDe";
   char *y = "abc";
-  clang_analyzer_eval(strncasecmp(x, y, 3) == 1); // expected-warning{{TRUE}}
+  clang_analyzer_eval(strncasecmp(x, y, 3) > 0); // expected-warning{{TRUE}}
 }
 
 void strncasecmp_embedded_null () {
diff --git a/test/Analysis/temp-obj-dtors-cfg-output.cpp b/test/Analysis/temp-obj-dtors-cfg-output.cpp
index dc10e875d3617..b425d9161101f 100644
--- a/test/Analysis/temp-obj-dtors-cfg-output.cpp
+++ b/test/Analysis/temp-obj-dtors-cfg-output.cpp
@@ -1077,7 +1077,7 @@ int testConsistencyNestedNormalReturn(bool value) {
 // CHECK:    14: a([B1.13]) (Member initializer)
 // CHECK:    15: ~B() (Temporary object destructor)
 // CHECK:    16: ~A() (Temporary object destructor)
-// CHECK:    17: /*implicit*/int()
+// CHECK:    17: /*implicit*/(int)0
 // CHECK:    18: b([B1.17]) (Member initializer)
 // CHECK:     Preds (1): B2
 // CHECK:     Succs (1): B0
diff --git a/test/Analysis/traversal-begin-end-function.c b/test/Analysis/traversal-begin-end-function.c
new file mode 100644
index 0000000000000..810ce1d2a529e
--- /dev/null
+++ b/test/Analysis/traversal-begin-end-function.c
@@ -0,0 +1,22 @@
+// RUN: %clang_cc1 -analyze -analyzer-checker=core,debug.DumpTraversal %s | FileCheck %s
+
+void inline_callee(int i);
+
+// CHECK: --BEGIN FUNCTION--
+void inline_caller() {
+  // CHECK: --BEGIN FUNCTION--
+  // CHECK: --BEGIN FUNCTION--
+  // CHECK: --BEGIN FUNCTION--
+  inline_callee(3);
+  // CHECK: --END FUNCTION--
+  // CHECK: --END FUNCTION--
+  // CHECK: --END FUNCTION--
+}
+// CHECK: --END FUNCTION--
+
+void inline_callee(int i) {
+  if (i <= 1)
+    return;
+
+  inline_callee(i - 1);
+}
diff --git a/test/Analysis/traversal-path-unification.c b/test/Analysis/traversal-path-unification.c
index 83e3b87c2bb34..3bf6df731c227 100644
--- a/test/Analysis/traversal-path-unification.c
+++ b/test/Analysis/traversal-path-unification.c
@@ -11,6 +11,7 @@ int c();
 #define CHECK(x) (x)
 #endif
 
+// CHECK: --BEGIN FUNCTION--
 void testRemoveDeadBindings() {
   int i = a();
   if (CHECK(i))
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 8dd64d1614ac2..f4be0adf14f85 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -26,10 +26,16 @@ endif ()
 
 list(APPEND CLANG_TEST_DEPS
   clang clang-headers
-  clang-check clang-format
+  clang-format
   c-index-test diagtool
   clang-tblgen
   )
+  
+if(CLANG_ENABLE_STATIC_ANALYZER)
+  list(APPEND CLANG_TEST_DEPS
+    clang-check
+    )
+endif()
 
 if (CLANG_ENABLE_ARCMT)
   list(APPEND CLANG_TEST_DEPS
@@ -40,6 +46,7 @@ endif ()
 
 if (ENABLE_CLANG_EXAMPLES)
   list(APPEND CLANG_TEST_DEPS
+    AnnotateFunctions
     clang-interpreter
     PrintFunctionNames
     )
@@ -61,16 +68,22 @@ if( NOT CLANG_BUILT_STANDALONE )
     FileCheck count not
     llc
     llvm-bcanalyzer
-    llvm-lto
+    llvm-nm
     llvm-objdump
     llvm-profdata
     llvm-readobj
     llvm-symbolizer
-    LTO
     opt
     )
+
+  if(TARGET llvm-lto)
+    list(APPEND CLANG_TEST_DEPS llvm-lto)
+  endif()
 endif()
 
+add_custom_target(clang-test-depends DEPENDS ${CLANG_TEST_DEPS})
+set_target_properties(clang-test-depends PROPERTIES FOLDER "Clang tests")
+
 add_lit_testsuite(check-clang "Running the Clang regression tests"
   ${CMAKE_CURRENT_BINARY_DIR}
   #LIT ${LLVM_LIT}
@@ -80,6 +93,11 @@ add_lit_testsuite(check-clang "Running the Clang regression tests"
   )
 set_target_properties(check-clang PROPERTIES FOLDER "Clang tests")
 
+add_lit_testsuites(CLANG ${CMAKE_CURRENT_SOURCE_DIR}
+  PARAMS ${CLANG_TEST_PARAMS}
+  DEPENDS ${CLANG_TEST_DEPS}
+)
+
 # Add a legacy target spelling: clang-test
 add_custom_target(clang-test)
 add_dependencies(clang-test check-clang)
diff --git a/test/CXX/basic/basic.def/p2.cpp b/test/CXX/basic/basic.def/p2.cpp
new file mode 100644
index 0000000000000..598a79a8a3d7d
--- /dev/null
+++ b/test/CXX/basic/basic.def/p2.cpp
@@ -0,0 +1,8 @@
+// RUN: %clang_cc1 -std=c++1z -verify %s -Wdeprecated
+
+namespace {
+  struct A {
+    static constexpr int n = 0;
+  };
+  const int A::n; // expected-warning {{deprecated}}
+}
diff --git a/test/CXX/basic/basic.def/p4.cpp b/test/CXX/basic/basic.def/p4.cpp
new file mode 100644
index 0000000000000..c3919156bbbbf
--- /dev/null
+++ b/test/CXX/basic/basic.def/p4.cpp
@@ -0,0 +1,6 @@
+// RUN: %clang_cc1 -std=c++1z -verify %s
+
+inline int f(); // expected-warning {{inline function 'f' is not defined}}
+extern inline int n; // expected-error {{inline variable 'n' is not defined}}
+
+int use = f() + n; // expected-note 2{{used here}}
diff --git a/test/CXX/basic/basic.lookup/basic.lookup.classref/p1.cpp b/test/CXX/basic/basic.lookup/basic.lookup.classref/p1.cpp
index c20728332704a..bb6bb73ec7029 100644
--- a/test/CXX/basic/basic.lookup/basic.lookup.classref/p1.cpp
+++ b/test/CXX/basic/basic.lookup/basic.lookup.classref/p1.cpp
@@ -1,4 +1,6 @@
 // RUN: %clang_cc1 -fsyntax-only -fdiagnostics-show-option -verify %s
+// RUN: %clang_cc1 -fsyntax-only -fdiagnostics-show-option -verify -std=c++98 %s
+// RUN: %clang_cc1 -fsyntax-only -fdiagnostics-show-option -verify -std=c++11 %s
 
 // C++98 [basic.lookup.classref]p1:
 //   In a class member access expression (5.2.5), if the . or -> token is
@@ -21,10 +23,16 @@
 
 // From PR 7247
 template<typename T>
-struct set{};  // expected-note{{lookup from the current scope refers here}}
+struct set{};
+#if __cplusplus <= 199711L
+// expected-note@-2 {{lookup from the current scope refers here}}
+#endif
 struct Value {
   template<typename T>
-  void set(T value) {}  // expected-note{{lookup in the object type 'Value' refers here}}
+  void set(T value) {}
+#if __cplusplus <= 199711L
+  // expected-note@-2 {{lookup in the object type 'Value' refers here}}
+#endif
 
   void resolves_to_same() {
     Value v;
@@ -36,7 +44,10 @@ void resolves_to_different() {
     Value v;
     // The fact that the next line is a warning rather than an error is an
     // extension.
-    v.set<double>(3.2);  // expected-warning{{lookup of 'set' in member access expression is ambiguous; using member of 'Value'}}
+    v.set<double>(3.2);
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{lookup of 'set' in member access expression is ambiguous; using member of 'Value'}}
+#endif
   }
   {
     int set;  // Non-template.
diff --git a/test/CXX/basic/basic.lookup/basic.lookup.qual/class.qual/p2.cpp b/test/CXX/basic/basic.lookup/basic.lookup.qual/class.qual/p2.cpp
index d1562d4cd18b3..f32b239765475 100644
--- a/test/CXX/basic/basic.lookup/basic.lookup.qual/class.qual/p2.cpp
+++ b/test/CXX/basic/basic.lookup/basic.lookup.qual/class.qual/p2.cpp
@@ -53,16 +53,17 @@ namespace InhCtor {
   int n = b.T(); // expected-error {{'T' is a protected member of 'InhCtor::A'}}
                  // expected-note@-15 {{declared protected here}}
 
+  // FIXME: EDG and GCC reject this too, but it's not clear why it would be
+  // ill-formed.
   template<typename T>
   struct S : T {
-    struct U : S {
+    struct U : S { // expected-note 6{{candidate}}
       using S::S;
     };
     using T::T;
   };
-
-  S<A>::U ua(0);
-  S<B>::U ub(0);
+  S<A>::U ua(0); // expected-error {{no match}}
+  S<B>::U ub(0); // expected-error {{no match}}
 
   template<typename T>
   struct X : T {
diff --git a/test/CXX/basic/basic.types/p10.cpp b/test/CXX/basic/basic.types/p10.cpp
index 19258f80f517d..31ef6b62ceadd 100644
--- a/test/CXX/basic/basic.types/p10.cpp
+++ b/test/CXX/basic/basic.types/p10.cpp
@@ -141,3 +141,45 @@ constexpr int arb(int n) {
 }
 constexpr long Overflow[ // expected-error {{constexpr variable cannot have non-literal type 'long const[(1 << 30) << 2]'}}
     (1 << 30) << 2]{};   // expected-warning {{requires 34 bits to represent}}
+
+namespace inherited_ctor {
+  struct A { constexpr A(int); };
+  struct B : A {
+    B();
+    using A::A;
+  };
+  constexpr int f(B) { return 0; } // ok
+
+  struct C { constexpr C(int); };
+  struct D : C { // expected-note {{because}}
+    D(int);
+    using C::C;
+  };
+  constexpr int f(D) { return 0; } // expected-error {{not a literal type}}
+
+  // This one is a bit odd: F inherits E's default constructor, which is
+  // constexpr. Because F has a constructor of its own, it doesn't declare a
+  // default constructor hiding E's one.
+  struct E {};
+  struct F : E {
+    F(int);
+    using E::E;
+  };
+  constexpr int f(F) { return 0; }
+
+  // FIXME: Is this really the right behavior? We presumably should be checking
+  // whether the inherited constructor would be a copy or move constructor for
+  // the derived class, not for the base class.
+  struct G { constexpr G(const G&); };
+  struct H : G { // expected-note {{because}}
+    using G::G;
+  };
+  constexpr int f(H) { return 0; } // expected-error {{not a literal type}}
+
+  struct J;
+  struct I { constexpr I(const J&); };
+  struct J : I {
+    using I::I;
+  };
+  constexpr int f(J) { return 0; }
+}
diff --git a/test/CXX/class.access/class.access.dcl/p1.cpp b/test/CXX/class.access/class.access.dcl/p1.cpp
index aab5fff5ea3f7..118ab9e52d0a1 100644
--- a/test/CXX/class.access/class.access.dcl/p1.cpp
+++ b/test/CXX/class.access/class.access.dcl/p1.cpp
@@ -1,4 +1,6 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++98 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
 
 // This is just the test for [namespace.udecl]p4 with 'using'
 // uniformly stripped out.
@@ -24,10 +26,33 @@ namespace test0 {
   }
 
   class Test0 {
-    NonClass::type; // expected-error {{not a class}} expected-warning {{access declarations are deprecated}}
-    NonClass::hiding; // expected-error {{not a class}} expected-warning {{access declarations are deprecated}}
-    NonClass::union_member; // expected-error {{not a class}} expected-warning {{access declarations are deprecated}}
-    NonClass::enumerator; // expected-error {{not a class}} expected-warning {{access declarations are deprecated}}
+    NonClass::type; // expected-error {{not a class}}
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{access declarations are deprecated; use using declarations instead}}
+#else
+    // expected-error@-4 {{ISO C++11 does not allow access declarations; use using declarations instead}}
+#endif
+
+    NonClass::hiding; // expected-error {{not a class}}
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{access declarations are deprecated; use using declarations instead}}
+#else
+    // expected-error@-4 {{ISO C++11 does not allow access declarations; use using declarations instead}}
+#endif
+
+    NonClass::union_member; // expected-error {{not a class}}
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{access declarations are deprecated; use using declarations instead}}
+#else
+    // expected-error@-4 {{ISO C++11 does not allow access declarations; use using declarations instead}}
+#endif
+
+    NonClass::enumerator; // expected-error {{not a class}}
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{access declarations are deprecated; use using declarations instead}}
+#else
+    // expected-error@-4 {{ISO C++11 does not allow access declarations; use using declarations instead}}
+#endif
   };
 }
 
@@ -43,11 +68,39 @@ namespace test1 {
   };
 
   struct B : A {
-    A::type; // expected-warning {{access declarations are deprecated}}
-    A::hiding; // expected-warning {{access declarations are deprecated}}
-    A::union_member; // expected-warning {{access declarations are deprecated}}
-    A::enumerator; // expected-warning {{access declarations are deprecated}}
-    A::tagname; // expected-warning {{access declarations are deprecated}}
+    A::type;
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{access declarations are deprecated; use using declarations instead}}
+#else
+    // expected-error@-4 {{ISO C++11 does not allow access declarations; use using declarations instead}}
+#endif
+    A::hiding;
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{access declarations are deprecated; use using declarations instead}}
+#else
+    // expected-error@-4 {{ISO C++11 does not allow access declarations; use using declarations instead}}
+#endif
+
+    A::union_member;
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{access declarations are deprecated; use using declarations instead}}
+#else
+    // expected-error@-4 {{ISO C++11 does not allow access declarations; use using declarations instead}}
+#endif
+
+    A::enumerator;
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{access declarations are deprecated; use using declarations instead}}
+#else
+    // expected-error@-4 {{ISO C++11 does not allow access declarations; use using declarations instead}}
+#endif
+
+    A::tagname;
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{access declarations are deprecated; use using declarations instead}}
+#else
+    // expected-error@-4 {{ISO C++11 does not allow access declarations; use using declarations instead}}
+#endif
 
     void test0() {
       type t = 0;
@@ -86,11 +139,40 @@ namespace test2 {
   };
 
   template <class T> struct B : A {
-    A::type; // expected-warning {{access declarations are deprecated}}
-    A::hiding; // expected-warning {{access declarations are deprecated}}
-    A::union_member; // expected-warning {{access declarations are deprecated}}
-    A::enumerator; // expected-warning {{access declarations are deprecated}}
-    A::tagname; // expected-warning {{access declarations are deprecated}}
+    A::type;
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{access declarations are deprecated; use using declarations instead}}
+#else
+    // expected-error@-4 {{ISO C++11 does not allow access declarations; use using declarations instead}}
+#endif
+
+    A::hiding;
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{access declarations are deprecated; use using declarations instead}}
+#else
+    // expected-error@-4 {{ISO C++11 does not allow access declarations; use using declarations instead}}
+#endif
+
+    A::union_member;
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{access declarations are deprecated; use using declarations instead}}
+#else
+    // expected-error@-4 {{ISO C++11 does not allow access declarations; use using declarations instead}}
+#endif
+
+    A::enumerator;
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{access declarations are deprecated; use using declarations instead}}
+#else
+    // expected-error@-4 {{ISO C++11 does not allow access declarations; use using declarations instead}}
+#endif
+
+    A::tagname;
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{access declarations are deprecated; use using declarations instead}}
+#else
+    // expected-error@-4 {{ISO C++11 does not allow access declarations; use using declarations instead}}
+#endif
 
     void test0() {
       type t = 0;
@@ -131,11 +213,40 @@ namespace test3 {
   };
 
   template <class T> struct B : A<T> {
-    A<T>::type; // expected-error {{dependent using declaration resolved to type without 'typename'}} // expected-warning {{access declarations are deprecated}}
-    A<T>::hiding; // expected-warning {{access declarations are deprecated}}
-    A<T>::union_member; // expected-warning {{access declarations are deprecated}}
-    A<T>::enumerator; // expected-warning {{access declarations are deprecated}}
-    A<T>::tagname; // expected-error {{dependent using declaration resolved to type without 'typename'}} // expected-warning {{access declarations are deprecated}}
+    A<T>::type; // expected-error {{dependent using declaration resolved to type without 'typename'}}
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{access declarations are deprecated; use using declarations instead}}
+#else
+    // expected-error@-4 {{ISO C++11 does not allow access declarations; use using declarations instead}}
+#endif
+
+    A<T>::hiding;
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{access declarations are deprecated; use using declarations instead}}
+#else
+    // expected-error@-4 {{ISO C++11 does not allow access declarations; use using declarations instead}}
+#endif
+
+    A<T>::union_member;
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{access declarations are deprecated; use using declarations instead}}
+#else
+    // expected-error@-4 {{ISO C++11 does not allow access declarations; use using declarations instead}}
+#endif
+
+    A<T>::enumerator;
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{access declarations are deprecated; use using declarations instead}}
+#else
+    // expected-error@-4 {{ISO C++11 does not allow access declarations; use using declarations instead}}
+#endif
+
+    A<T>::tagname; // expected-error {{dependent using declaration resolved to type without 'typename'}}
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{access declarations are deprecated; use using declarations instead}}
+#else
+    // expected-error@-4 {{ISO C++11 does not allow access declarations; use using declarations instead}}
+#endif
 
     // FIXME: re-enable these when the various bugs involving tags are fixed
 #if 0
@@ -186,14 +297,54 @@ namespace test4 {
 
   // We should be able to diagnose these without instantiation.
   template <class T> struct C : Base {
-    InnerNS::foo; // expected-error {{not a class}} expected-warning {{access declarations are deprecated}}
-    Base::bar; // expected-error {{no member named 'bar'}} expected-warning {{access declarations are deprecated}}
-    Unrelated::foo; // expected-error {{not a base class}} expected-warning {{access declarations are deprecated}}
-    C::foo; // legal in C++03 // expected-warning {{access declarations are deprecated}}
-    Subclass::foo; // legal in C++03 // expected-warning {{access declarations are deprecated}}
-
-    int bar(); //expected-note {{target of using declaration}}
-    C::bar; // expected-error {{refers to its own class}} expected-warning {{access declarations are deprecated}}
+    InnerNS::foo; // expected-error {{not a class}}
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{access declarations are deprecated; use using declarations instead}}
+#else
+    // expected-error@-4 {{ISO C++11 does not allow access declarations; use using declarations instead}}
+#endif
+
+    Base::bar; // expected-error {{no member named 'bar'}}
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{access declarations are deprecated; use using declarations instead}}
+#else
+    // expected-error@-4 {{ISO C++11 does not allow access declarations; use using declarations instead}}
+#endif
+
+    Unrelated::foo; // expected-error {{not a base class}}
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{access declarations are deprecated; use using declarations instead}}
+#else
+    // expected-error@-4 {{ISO C++11 does not allow access declarations; use using declarations instead}}
+#endif
+
+    C::foo; // legal in C++03
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{access declarations are deprecated; use using declarations instead}}
+#else
+    // expected-error@-4 {{ISO C++11 does not allow access declarations; use using declarations instead}}
+    // expected-error@-5 {{using declaration refers to its own class}}
+#endif
+
+    Subclass::foo; // legal in C++03
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{access declarations are deprecated; use using declarations instead}}
+#else
+    // expected-error@-4 {{ISO C++11 does not allow access declarations; use using declarations instead}}
+    // expected-error@-5 {{using declaration refers into 'Subclass::', which is not a base class of 'C'}}
+#endif
+
+    int bar();
+#if __cplusplus <= 199711L
+    //expected-note@-2 {{target of using declaration}}
+#endif
+    C::bar;
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{access declarations are deprecated; use using declarations instead}}
+#else
+    // expected-error@-4 {{ISO C++11 does not allow access declarations; use using declarations instead}}
+#endif
+    // expected-error@-6 {{using declaration refers to its own class}}
   };
 }
 
diff --git a/test/CXX/class/class.friend/p1.cpp b/test/CXX/class/class.friend/p1.cpp
index b83dfa36cd351..037fc3dadb964 100644
--- a/test/CXX/class/class.friend/p1.cpp
+++ b/test/CXX/class/class.friend/p1.cpp
@@ -1,4 +1,6 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++98 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
 
 struct Outer {
   struct Inner {
@@ -41,7 +43,10 @@ class A {
   UndeclaredSoFar x; // expected-error {{unknown type name 'UndeclaredSoFar'}}
 
   void a_member();
-  friend void A::a_member(); // expected-error {{friends cannot be members of the declaring class}}
+  friend void A::a_member();
+#if __cplusplus <= 199711L
+  // expected-error@-2 {{friends cannot be members of the declaring class}}
+#endif
   friend void a_member(); // okay (because we ignore class scopes when looking up friends)
   friend class A::AInner; // this is okay as an extension
   friend class AInner; // okay, refers to ::AInner
diff --git a/test/CXX/class/class.friend/p2.cpp b/test/CXX/class/class.friend/p2.cpp
index fb3cd19b2b182..e4a46b30e7888 100644
--- a/test/CXX/class/class.friend/p2.cpp
+++ b/test/CXX/class/class.friend/p2.cpp
@@ -1,10 +1,18 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++98 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
 
 struct B0;
 
 class A {
   friend class B {}; // expected-error {{cannot define a type in a friend declaration}}
-  friend int; // expected-warning {{non-class friend type 'int' is a C++11 extension}}
-  friend B0; // expected-warning {{specify 'struct' to befriend 'B0'}}
+  friend int;
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{non-class friend type 'int' is a C++11 extension}}
+#endif
+  friend B0;
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{unelaborated friend declaration is a C++11 extension; specify 'struct' to befriend 'B0'}}
+#endif
   friend class C; // okay
 };
diff --git a/test/CXX/class/class.static/class.static.data/p2.cpp b/test/CXX/class/class.static/class.static.data/p2.cpp
new file mode 100644
index 0000000000000..8c38276641d41
--- /dev/null
+++ b/test/CXX/class/class.static/class.static.data/p2.cpp
@@ -0,0 +1,7 @@
+// RUN: %clang_cc1 -std=c++1z -verify %s
+
+struct X {
+  static struct A a;
+  static inline struct B b; // expected-error {{incomplete type}} expected-note {{forward decl}}
+  static inline struct C c = {}; // expected-error {{incomplete type}} expected-note {{forward decl}}
+};
diff --git a/test/CXX/class/class.static/class.static.data/p3.cpp b/test/CXX/class/class.static/class.static.data/p3.cpp
index 1607bac80293f..413017d133518 100644
--- a/test/CXX/class/class.static/class.static.data/p3.cpp
+++ b/test/CXX/class/class.static/class.static.data/p3.cpp
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++1z %s
 
 struct NonLit { // expected-note 3{{no constexpr constructors}}
   NonLit();
@@ -6,7 +7,7 @@ struct NonLit { // expected-note 3{{no constexpr constructors}}
 
 struct S {
   static constexpr int a = 0;
-  static constexpr int b; // expected-error {{declaration of constexpr static data member 'b' requires an initializer}}
+  static constexpr int b; // expected-error {{initializ}} expected-note 0-1{{previous}}
 
   static constexpr int c = 0;
   static const int d;
@@ -16,19 +17,27 @@ struct S {
   static const double f = 0.0; // expected-error {{requires 'constexpr' specifier}} expected-note {{add 'constexpr'}}
   static char *const g = 0; // expected-error {{requires 'constexpr' specifier}}
   static const NonLit h = NonLit(); // expected-error {{must be initialized out of line}}
+
+  static inline int i; // expected-note {{previous}} expected-warning 0-1{{extension}}
+  static inline int j; // expected-note {{previous}} expected-warning 0-1{{extension}}
+  static constexpr int k = 0;
 };
 
 constexpr int S::a;
-constexpr int S::b = 0;
+constexpr int S::b = 0; // expected-error 0-1{{redefinition}}
 
 const int S::c;
 constexpr int S::d = 0;
 constexpr int S::d2;
 
+int S::i; // expected-error {{redefinition}}
+int S::j; // expected-error {{redefinition}}
+const int S::k; // ok (deprecated)
+
 template<typename T>
 struct U {
   static constexpr int a = 0;
-  static constexpr int b; // expected-error {{declaration of constexpr static data member 'b' requires an initializer}}
+  static constexpr int b; // expected-error {{initializ}}
   static constexpr NonLit h = NonLit(); // expected-error {{cannot have non-literal type 'const NonLit'}}
   static constexpr T c = T(); // expected-error {{cannot have non-literal type}}
   static const T d;
diff --git a/test/CXX/concepts-ts/dcl.dcl/dcl.spec/dcl.spec.concept/p1.cpp b/test/CXX/concepts-ts/dcl.dcl/dcl.spec/dcl.spec.concept/p1.cpp
index ded6ed013c053..3baf238241a6f 100644
--- a/test/CXX/concepts-ts/dcl.dcl/dcl.spec/dcl.spec.concept/p1.cpp
+++ b/test/CXX/concepts-ts/dcl.dcl/dcl.spec/dcl.spec.concept/p1.cpp
@@ -41,3 +41,20 @@ typedef concept int CI; // expected-error {{'concept' can only appear on the def
 void fpc(concept int i) {} // expected-error {{'concept' can only appear on the definition of a function template or variable template}}
 
 concept bool; // expected-error {{'concept' can only appear on the definition of a function template or variable template}}
+
+template <typename T> concept bool VCEI{ true };
+template concept bool VCEI<int>; // expected-error {{'concept' cannot be applied on an explicit instantiation}}
+extern template concept bool VCEI<int>; // expected-error {{'concept' cannot be applied on an explicit instantiation}}
+
+template <typename T> concept bool VCPS{ true };
+template <typename T> concept bool VCPS<T *>{ true }; // expected-error {{'concept' cannot be applied on an partial specialization}}
+
+template <typename T> concept bool VCES{ true };
+template <> concept bool VCES<int>{ true }; // expected-error {{'concept' cannot be applied on an explicit specialization}}
+
+template <typename T> concept bool FCEI() { return true; }
+template concept bool FCEI<int>(); // expected-error {{'concept' cannot be applied on an explicit instantiation}}
+extern template concept bool FCEI<int>(); // expected-error {{'concept' cannot be applied on an explicit instantiation}}
+
+template <typename T> concept bool FCES() { return true; }
+template <> concept bool FCES<bool>() { return true; } // expected-error {{'concept' cannot be applied on an explicit specialization}}
diff --git a/test/CXX/concepts-ts/dcl.dcl/dcl.spec/dcl.spec.concept/p5.cpp b/test/CXX/concepts-ts/dcl.dcl/dcl.spec/dcl.spec.concept/p5.cpp
index 38593bccaf82f..69672ca830631 100644
--- a/test/CXX/concepts-ts/dcl.dcl/dcl.spec/dcl.spec.concept/p5.cpp
+++ b/test/CXX/concepts-ts/dcl.dcl/dcl.spec/dcl.spec.concept/p5.cpp
@@ -11,3 +11,15 @@ concept bool fcpp(Ts... ts) { return true; } // expected-error {{function concep
 
 template<typename T>
 concept bool fcpva(...) { return true; } // expected-error {{function concept cannot have any parameters}}
+
+template<typename T>
+concept const bool fcrtc() { return true; } // expected-error {{declared return type of function concept must be 'bool'}}
+
+template<typename T>
+concept int fcrti() { return 5; } // expected-error {{declared return type of function concept must be 'bool'}}
+
+template<typename T>
+concept float fcrtf() { return 5.5; } // expected-error {{declared return type of function concept must be 'bool'}}
+
+template<typename T>
+concept decltype(auto) fcrtd(void) { return true; } // expected-error {{declared return type of function concept must be 'bool'}}
diff --git a/test/CXX/concepts-ts/dcl.dcl/dcl.spec/dcl.spec.concept/p6.cpp b/test/CXX/concepts-ts/dcl.dcl/dcl.spec/dcl.spec.concept/p6.cpp
new file mode 100644
index 0000000000000..f8a1bb72e39a6
--- /dev/null
+++ b/test/CXX/concepts-ts/dcl.dcl/dcl.spec/dcl.spec.concept/p6.cpp
@@ -0,0 +1,25 @@
+// RUN:  %clang_cc1 -std=c++14 -fconcepts-ts -x c++ -verify %s
+
+template<typename T>
+concept bool vc { true };
+
+template<typename T>
+struct B { typedef bool Boolean; };
+
+template<int N>
+B<void>::Boolean concept vctb(!0);
+
+template<typename T>
+concept const bool vctc { true }; // expected-error {{declared type of variable concept must be 'bool'}}
+
+template<typename T>
+concept int vcti { 5 }; // expected-error {{declared type of variable concept must be 'bool'}}
+
+template<typename T>
+concept float vctf { 5.5 }; // expected-error {{declared type of variable concept must be 'bool'}}
+
+template<typename T>
+concept auto vcta { true }; // expected-error {{declared type of variable concept must be 'bool'}}
+
+template<typename T>
+concept decltype(auto) vctd { true }; // expected-error {{declared type of variable concept must be 'bool'}}
diff --git a/test/CXX/concepts-ts/dcl.dcl/dcl.spec/dcl.spec.concept/p7.cpp b/test/CXX/concepts-ts/dcl.dcl/dcl.spec/dcl.spec.concept/p7.cpp
new file mode 100644
index 0000000000000..1bad6bb93294f
--- /dev/null
+++ b/test/CXX/concepts-ts/dcl.dcl/dcl.spec/dcl.spec.concept/p7.cpp
@@ -0,0 +1,18 @@
+// RUN:  %clang_cc1 -std=c++14 -fconcepts-ts -x c++ -verify %s
+
+template <typename T> concept bool FCEI() { return true; } // expected-note {{previous declaration is here}} expected-note {{previous declaration is here}}
+template bool FCEI<int>(); // expected-error {{function concept cannot be explicitly instantiated}}
+extern template bool FCEI<double>(); // expected-error {{function concept cannot be explicitly instantiated}}
+
+template <typename T> concept bool FCES() { return true; } // expected-note {{previous declaration is here}}
+template <> bool FCES<int>() { return true; } // expected-error {{function concept cannot be explicitly specialized}}
+
+template <typename T> concept bool VC { true }; // expected-note {{previous declaration is here}} expected-note {{previous declaration is here}}
+template bool VC<int>; // expected-error {{variable concept cannot be explicitly instantiated}}
+extern template bool VC<double>; // expected-error {{variable concept cannot be explicitly instantiated}}
+
+template <typename T> concept bool VCES { true }; // expected-note {{previous declaration is here}}
+template <> bool VCES<int> { true }; // expected-error {{variable concept cannot be explicitly specialized}}
+
+template <typename T> concept bool VCPS { true }; // expected-note {{previous declaration is here}}
+template <typename T> bool VCPS<T *> { true }; // expected-error {{variable concept cannot be partially specialized}}
diff --git a/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p12.cpp b/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p12.cpp
index cc28bf6c28c4c..ce43720cb2d3e 100644
--- a/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p12.cpp
+++ b/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p12.cpp
@@ -1,3 +1,5 @@
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++98 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
 // RUN: %clang_cc1 -fsyntax-only -verify %s
 
 // C++03 [namespace.udecl]p12:
@@ -161,3 +163,33 @@ namespace test4 {
     d.bar<int>(3); // expected-error {{'bar' is a protected member}}
   }
 }
+
+namespace test5 {
+  struct Derived;
+  struct Base {
+    void operator=(const Derived&);
+  };
+  struct Derived : Base {
+    // Hidden by implicit derived class operator.
+    using Base::operator=;
+  };
+  void f(Derived d) {
+    d = d;
+  }
+}
+
+#if __cplusplus >= 201103L
+namespace test6 {
+  struct Derived;
+  struct Base {
+    void operator=(Derived&&);
+  };
+  struct Derived : Base {
+    // Hidden by implicit derived class operator.
+    using Base::operator=;
+  };
+  void f(Derived d) {
+    d = Derived();
+  }
+}
+#endif
diff --git a/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p15.cpp b/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p15.cpp
new file mode 100644
index 0000000000000..3e04d5094ac19
--- /dev/null
+++ b/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p15.cpp
@@ -0,0 +1,81 @@
+// RUN: %clang_cc1 -std=c++11 -verify %s
+
+struct B1 { // expected-note 2{{candidate}}
+  B1(int); // expected-note {{candidate}}
+};
+
+struct B2 { // expected-note 2{{candidate}}
+  B2(int); // expected-note {{candidate}}
+};
+
+struct D1 : B1, B2 { // expected-note 2{{candidate}}
+  using B1::B1; // expected-note 3{{inherited here}}
+  using B2::B2; // expected-note 3{{inherited here}}
+};
+D1 d1(0); // expected-error {{ambiguous}}
+
+struct D2 : B1, B2 {
+  using B1::B1;
+  using B2::B2;
+  D2(int);
+};
+D2 d2(0); // ok
+
+
+// The emergent behavior of implicit special members is a bit odd when
+// inheriting from multiple base classes.
+namespace default_ctor {
+  struct C;
+  struct D;
+
+  struct A { // expected-note 4{{candidate}}
+    A(); // expected-note {{candidate}}
+
+    A(C &&); // expected-note {{candidate}}
+    C &operator=(C&&); // expected-note {{candidate}}
+
+    A(D &&); // expected-note {{candidate}}
+    D &operator=(D&&); // expected-note {{candidate}}
+  };
+
+  struct B { // expected-note 4{{candidate}}
+    B(); // expected-note {{candidate}}
+
+    B(C &&); // expected-note {{candidate}}
+    C &operator=(C&&); // expected-note {{candidate}}
+
+    B(D &&); // expected-note {{candidate}}
+    D &operator=(D&&); // expected-note {{candidate}}
+  };
+
+  struct C : A, B {
+    using A::A;
+    using A::operator=;
+    using B::B;
+    using B::operator=;
+  };
+  struct D : A, B {
+    using A::A; // expected-note 5{{inherited here}}
+    using A::operator=;
+    using B::B; // expected-note 5{{inherited here}}
+    using B::operator=;
+
+    D(int);
+    D(const D&); // expected-note {{candidate}}
+    D &operator=(const D&); // expected-note {{candidate}}
+  };
+
+  C c;
+  void f(C c) {
+    C c2(static_cast<C&&>(c));
+    c = static_cast<C&&>(c);
+  }
+
+  // D does not declare D(), D(D&&), nor operator=(D&&), so the base class
+  // versions are inherited.
+  D d; // expected-error {{ambiguous}}
+  void f(D d) {
+    D d2(static_cast<D&&>(d)); // expected-error {{ambiguous}}
+    d = static_cast<D&&>(d); // expected-error {{ambiguous}}
+  }
+}
diff --git a/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p18.cpp b/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p18.cpp
new file mode 100644
index 0000000000000..b9fca4bd5b616
--- /dev/null
+++ b/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p18.cpp
@@ -0,0 +1,77 @@
+// RUN: %clang_cc1 -std=c++11 -verify %s
+
+struct Public {} public_;
+struct Protected {} protected_;
+struct Private {} private_;
+
+class A {
+public:
+  A(Public);
+  void f(Public);
+
+protected:
+  A(Protected); // expected-note {{protected here}}
+  void f(Protected);
+
+private:
+  A(Private); // expected-note 4{{private here}}
+  void f(Private); // expected-note {{private here}}
+
+  friend void Friend();
+};
+
+class B : private A {
+  using A::A; // ok
+  using A::f; // expected-error {{private member}}
+
+  void f() {
+    B a(public_);
+    B b(protected_);
+    B c(private_); // expected-error {{private}}
+  }
+
+  B(Public p, int) : B(p) {}
+  B(Protected p, int) : B(p) {}
+  B(Private p, int) : B(p) {} // expected-error {{private}}
+};
+
+class C : public B {
+  C(Public p) : B(p) {}
+  // There is no access check on the conversion from derived to base here;
+  // protected constructors of A act like protected constructors of B.
+  C(Protected p) : B(p) {}
+  C(Private p) : B(p) {} // expected-error {{private}}
+};
+
+void Friend() {
+  // There is no access check on the conversion from derived to base here.
+  B a(public_);
+  B b(protected_);
+  B c(private_);
+}
+
+void NonFriend() {
+  B a(public_);
+  B b(protected_); // expected-error {{protected}}
+  B c(private_); // expected-error {{private}}
+}
+
+namespace ProtectedAccessFromMember {
+namespace a {
+  struct ES {
+  private:
+    ES(const ES &) = delete;
+  protected:
+    ES(const char *);
+  };
+}
+namespace b {
+  struct DES : a::ES {
+    DES *f();
+  private:
+    using a::ES::ES;
+  };
+}
+b::DES *b::DES::f() { return new b::DES("foo"); }
+
+}
diff --git a/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p3-cxx0x.cpp b/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p3-cxx0x.cpp
deleted file mode 100644
index f61437ead6e49..0000000000000
--- a/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p3-cxx0x.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-// RUN: %clang_cc1 -std=c++11 -fsyntax-only -verify %s
-// C++0x N2914.
-
-struct B {
-  void f(char);
-  void g(char);
-  enum E { e };
-  union { int x; };
-};
-
-class C {
-  int g();
-};
-
-class D2 : public B {
-  using B::f;
-  using B::e;
-  using B::x;
-  using C::g; // expected-error{{using declaration refers into 'C::', which is not a base class of 'D2'}}
-};
-
-namespace test1 {
-  struct Base {
-    int foo();
-  };
-
-  struct Unrelated {
-    int foo();
-  };
-
-  struct Subclass : Base {
-  };
-
-  namespace InnerNS {
-    int foo();
-  }
-
-  // We should be able to diagnose these without instantiation.
-  template <class T> struct C : Base {
-    using InnerNS::foo; // expected-error {{not a class}}
-    using Base::bar; // expected-error {{no member named 'bar'}}
-    using Unrelated::foo; // expected-error {{not a base class}}
-    using C::foo; // expected-error {{refers to its own class}}
-    using Subclass::foo; // expected-error {{not a base class}}
-  };
-}
diff --git a/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p3.cpp b/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p3.cpp
new file mode 100644
index 0000000000000..6c505a55c2afc
--- /dev/null
+++ b/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p3.cpp
@@ -0,0 +1,82 @@
+// RUN: %clang_cc1 -std=c++98 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -std=c++11 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fsyntax-only -verify %s
+
+struct B {
+  void f(char);
+  void g(char);
+  enum E { e };
+  union { int x; };
+
+  enum class EC { ec }; // expected-warning 0-1 {{C++11}}
+
+  void f2(char);
+  void g2(char);
+  enum E2 { e2 };
+  union { int x2; };
+};
+
+class C {
+  int g();
+};
+
+struct D : B {};
+
+class D2 : public B {
+  using B::f;
+  using B::E;
+  using B::e;
+  using B::x;
+  using C::g; // expected-error{{using declaration refers into 'C::', which is not a base class of 'D2'}}
+
+  // These are valid in C++98 but not in C++11.
+  using D::f2;
+  using D::E2;
+  using D::e2;
+  using D::x2;
+#if __cplusplus >= 201103L
+  // expected-error@-5 {{using declaration refers into 'D::', which is not a base class of 'D2'}}
+  // expected-error@-5 {{using declaration refers into 'D::', which is not a base class of 'D2'}}
+  // expected-error@-5 {{using declaration refers into 'D::', which is not a base class of 'D2'}}
+  // expected-error@-5 {{using declaration refers into 'D::', which is not a base class of 'D2'}}
+#endif
+
+  using B::EC;
+  using B::EC::ec; // expected-error {{not a class}} expected-warning 0-1 {{C++11}}
+};
+
+namespace test1 {
+  struct Base {
+    int foo();
+  };
+
+  struct Unrelated {
+    int foo();
+  };
+
+  struct Subclass : Base {
+  };
+
+  namespace InnerNS {
+    int foo();
+  }
+
+  struct B : Base {
+  };
+
+  // We should be able to diagnose these without instantiation.
+  template <class T> struct C : Base {
+    using InnerNS::foo; // expected-error {{not a class}}
+    using Base::bar; // expected-error {{no member named 'bar'}}
+    using Unrelated::foo; // expected-error {{not a base class}}
+
+    // In C++98, it's hard to see that these are invalid, because indirect
+    // references to base class members are permitted.
+    using C::foo;
+    using Subclass::foo;
+#if __cplusplus >= 201103L
+    // expected-error@-3 {{refers to its own class}}
+    // expected-error@-3 {{not a base class}}
+#endif
+  };
+}
diff --git a/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p4.cpp b/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p4.cpp
index a43d9e019ed33..781a1a1824e93 100644
--- a/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p4.cpp
+++ b/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p4.cpp
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
 
 // C++03 [namespace.udecl]p4:
 //   A using-declaration used as a member-declaration shall refer to a
@@ -206,8 +207,33 @@ namespace test4 {
     using Unrelated::foo; // expected-error {{not a base class}}
     using C::foo; // legal in C++03
     using Subclass::foo; // legal in C++03
+#if __cplusplus >= 201103L
+    // expected-error@-3 {{refers to its own class}}
+    // expected-error@-3 {{refers into 'Subclass::', which is not a base class}}
+#endif
 
-    int bar(); //expected-note {{target of using declaration}}
+    int bar();
+#if __cplusplus < 201103L
+    // expected-note@-2 {{target of using declaration}}
+#endif
     using C::bar; // expected-error {{refers to its own class}}
   };
 }
+
+namespace test5 {
+  struct B;
+  struct A {
+    A(const B&);
+    B &operator=(const B&);
+  };
+  struct B : A {
+#if __cplusplus >= 201103L
+    using A::A;
+#endif
+    using A::operator=;
+  };
+  void test(B b) {
+    B b2(b);
+    b2 = b;
+  }
+}
diff --git a/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p6-cxx0x.cpp b/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p6-cxx11.cpp
index c2fb95902454d..97b2953b90312 100644
--- a/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p6-cxx0x.cpp
+++ b/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p6-cxx11.cpp
@@ -1,8 +1,7 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s
-// C++0x N2914.
 
 namespace A {
   namespace B { }
 }
 
-using A::B; // expected-error{{using declaration cannot refer to namespace}}
+using A::B; // expected-error{{using declaration cannot refer to a namespace}}
diff --git a/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p7.cpp b/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p7.cpp
new file mode 100644
index 0000000000000..6c9379fac27fd
--- /dev/null
+++ b/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p7.cpp
@@ -0,0 +1,4 @@
+// RUN: %clang_cc1 -std=c++11 -verify %s
+
+enum class EC { ec };
+using EC::ec; // expected-error {{using declaration cannot refer to a scoped enumerator}}
diff --git a/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p8-cxx0x.cpp b/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p8-cxx0x.cpp
index ebe5388d65acc..6c63f061ab0f1 100644
--- a/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p8-cxx0x.cpp
+++ b/test/CXX/dcl.dcl/basic.namespace/namespace.udecl/p8-cxx0x.cpp
@@ -7,14 +7,41 @@
 struct X {
   int i;
   static int a;
+  enum E { e };
 };
 
 using X::i; // expected-error{{using declaration cannot refer to class member}}
 using X::s; // expected-error{{using declaration cannot refer to class member}}
+using X::e; // expected-error{{using declaration cannot refer to class member}}
+using X::E::e; // expected-error{{using declaration cannot refer to class member}} expected-warning 0-1{{C++11}}
+#if __cplusplus < 201103L
+// expected-note@-3 {{use a const variable}}
+// expected-note@-3 {{use a const variable}}
+// CXX98-NOT: fix-it:"{{.*}}":{[[@LINE-5]]:
+// CXX98-NOT: fix-it:"{{.*}}":{[[@LINE-5]]:
+#else
+// expected-note@-8 {{use a constexpr variable}}
+// expected-note@-8 {{use a constexpr variable}}
+// CXX11: fix-it:"{{.*}}":{[[@LINE-10]]:1-[[@LINE-10]]:6}:"constexpr auto e = "
+// CXX11: fix-it:"{{.*}}":{[[@LINE-10]]:1-[[@LINE-10]]:6}:"constexpr auto e = "
+#endif
 
 void f() {
   using X::i; // expected-error{{using declaration cannot refer to class member}}
   using X::s; // expected-error{{using declaration cannot refer to class member}}
+  using X::e; // expected-error{{using declaration cannot refer to class member}}
+  using X::E::e; // expected-error{{using declaration cannot refer to class member}} expected-warning 0-1{{C++11}}
+#if __cplusplus < 201103L
+  // expected-note@-3 {{use a const variable}}
+  // expected-note@-3 {{use a const variable}}
+  // CXX98-NOT: fix-it:"{{.*}}":{[[@LINE-5]]:
+  // CXX98-NOT: fix-it:"{{.*}}":{[[@LINE-5]]:
+#else
+  // expected-note@-8 {{use a constexpr variable}}
+  // expected-note@-8 {{use a constexpr variable}}
+  // CXX11: fix-it:"{{.*}}":{[[@LINE-10]]:3-[[@LINE-10]]:8}:"constexpr auto e = "
+  // CXX11: fix-it:"{{.*}}":{[[@LINE-10]]:3-[[@LINE-10]]:8}:"constexpr auto e = "
+#endif
 }
 
 template <typename T>
diff --git a/test/CXX/dcl.dcl/dcl.attr/dcl.attr.fallthrough/p1.cpp b/test/CXX/dcl.dcl/dcl.attr/dcl.attr.fallthrough/p1.cpp
new file mode 100644
index 0000000000000..e7c90339a214a
--- /dev/null
+++ b/test/CXX/dcl.dcl/dcl.attr/dcl.attr.fallthrough/p1.cpp
@@ -0,0 +1,70 @@
+// RUN: %clang_cc1 -std=c++1z -verify %s
+
+void f(int n) {
+  switch (n) {
+  case 0:
+    n += 1;
+    [[fallthrough]]; // ok
+  case 1:
+    if (n) {
+      [[fallthrough]]; // ok
+    } else {
+      return;
+    }
+  case 2:
+    for (int n = 0; n != 10; ++n)
+      [[fallthrough]]; // expected-error {{does not directly precede switch label}}
+  case 3:
+    while (true)
+      [[fallthrough]]; // expected-error {{does not directly precede switch label}}
+  case 4:
+    while (false)
+      [[fallthrough]]; // expected-error {{does not directly precede switch label}}
+  case 5:
+    do [[fallthrough]]; while (true); // expected-error {{does not directly precede switch label}}
+  case 6:
+    do [[fallthrough]]; while (false); // expected-error {{does not directly precede switch label}}
+  case 7:
+    switch (n) {
+    case 0:
+      // FIXME: This should be an error, even though the next thing we do is to
+      // fall through in an outer switch statement.
+      [[fallthrough]];
+    }
+  case 8:
+    [[fallthrough]]; // expected-error {{does not directly precede switch label}}
+    goto label;
+  label:
+  case 9:
+    n += 1;
+  case 10: // no warning, -Wimplicit-fallthrough is not enabled in this test, and does not need to
+           // be enabled for these diagnostics to be produced.
+    break;
+  }
+}
+
+[[fallthrough]] typedef int n; // expected-error {{'fallthrough' attribute cannot be applied to a declaration}}
+typedef int [[fallthrough]] n; // expected-error {{'fallthrough' attribute cannot be applied to types}}
+typedef int n [[fallthrough]]; // expected-error {{'fallthrough' attribute cannot be applied to a declaration}}
+
+enum [[fallthrough]] E {}; // expected-error {{'fallthrough' attribute cannot be applied to a declaration}}
+class [[fallthrough]] C {}; // expected-error {{'fallthrough' attribute cannot be applied to a declaration}}
+
+[[fallthrough]] // expected-error {{'fallthrough' attribute cannot be applied to a declaration}}
+void g() {
+  [[fallthrough]] int n; // expected-error {{'fallthrough' attribute cannot be applied to a declaration}}
+  [[fallthrough]] ++n; // expected-error-re {{{{^}}fallthrough attribute is only allowed on empty statements}}
+
+  switch (n) {
+    // FIXME: This should be an error.
+    [[fallthrough]];
+    return;
+
+  case 0:
+    [[fallthrough, fallthrough]]; // expected-error {{multiple times}}
+  case 1:
+    [[fallthrough(0)]]; // expected-error {{argument list}}
+  case 2:
+    break;
+  }
+}
diff --git a/test/CXX/dcl.dcl/dcl.attr/dcl.attr.grammar/p2-1z.cpp b/test/CXX/dcl.dcl/dcl.attr/dcl.attr.grammar/p2-1z.cpp
new file mode 100644
index 0000000000000..192fa12610987
--- /dev/null
+++ b/test/CXX/dcl.dcl/dcl.attr/dcl.attr.grammar/p2-1z.cpp
@@ -0,0 +1,16 @@
+// RUN: %clang_cc1 -std=c++1z -verify %s
+
+[[disable_tail_calls, noduplicate]] void f() {} // expected-warning {{unknown attribute 'disable_tail_calls'}} expected-warning {{unknown attribute 'noduplicate'}}
+
+[[using clang: disable_tail_calls, noduplicate]] void g() {} // ok
+
+[[using]] extern int n; // expected-error {{expected identifier}}
+[[using foo
+] // expected-error {{expected ':'}}
+] extern int n;
+[[using 42:]] extern int n; // expected-error {{expected identifier}}
+[[using clang:]] extern int n; // ok
+[[using blah: clang::optnone]] extern int n; // expected-error {{attribute with scope specifier cannot follow}} expected-warning {{only applies to functions}}
+
+[[using clang: unknown_attr]] extern int n; // expected-warning {{unknown attribute}}
+[[using unknown_ns: something]] extern int n; // expected-warning {{unknown attribute}}
diff --git a/test/CXX/dcl.dcl/dcl.attr/dcl.attr.nodiscard/p1.cpp b/test/CXX/dcl.dcl/dcl.attr/dcl.attr.nodiscard/p1.cpp
new file mode 100644
index 0000000000000..e7a2382412952
--- /dev/null
+++ b/test/CXX/dcl.dcl/dcl.attr/dcl.attr.nodiscard/p1.cpp
@@ -0,0 +1,10 @@
+// RUN: %clang_cc1 -fsyntax-only -std=c++1z -verify %s
+
+struct [[nodiscard]] S1 {}; // ok
+struct [[nodiscard nodiscard]] S2 {}; // expected-error {{attribute 'nodiscard' cannot appear multiple times in an attribute specifier}}
+struct [[nodiscard("Wrong")]] S3 {}; // expected-error {{'nodiscard' cannot have an argument list}}
+
+[[nodiscard]] int f();
+enum [[nodiscard]] E {};
+
+namespace [[nodiscard]] N {} // expected-warning {{'nodiscard' attribute only applies to functions, methods, enums, and classes}}
diff --git a/test/CXX/dcl.dcl/dcl.attr/dcl.attr.nodiscard/p2.cpp b/test/CXX/dcl.dcl/dcl.attr/dcl.attr.nodiscard/p2.cpp
new file mode 100644
index 0000000000000..3d4b92518810b
--- /dev/null
+++ b/test/CXX/dcl.dcl/dcl.attr/dcl.attr.nodiscard/p2.cpp
@@ -0,0 +1,29 @@
+// RUN: %clang_cc1 -fsyntax-only -std=c++1z -verify -Wc++1z-extensions %s
+// RUN: %clang_cc1 -fsyntax-only -std=c++11 -verify -DEXT -Wc++1z-extensions %s
+
+struct [[nodiscard]] S {};
+S get_s();
+S& get_s_ref();
+
+enum [[nodiscard]] E {};
+E get_e();
+
+[[nodiscard]] int get_i();
+
+void f() {
+  get_s(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  get_i(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  get_e(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+
+  // Okay, warnings are not encouraged
+  get_s_ref();
+  (void)get_s();
+  (void)get_i();
+  (void)get_e();
+}
+
+#ifdef EXT
+// expected-warning@4 {{use of the 'nodiscard' attribute is a C++1z extension}}
+// expected-warning@8 {{use of the 'nodiscard' attribute is a C++1z extension}}
+// expected-warning@11 {{use of the 'nodiscard' attribute is a C++1z extension}}
+#endif
diff --git a/test/CXX/dcl.dcl/dcl.attr/dcl.attr.nodiscard/p3.cpp b/test/CXX/dcl.dcl/dcl.attr/dcl.attr.nodiscard/p3.cpp
new file mode 100644
index 0000000000000..a3543cff7d2c9
--- /dev/null
+++ b/test/CXX/dcl.dcl/dcl.attr/dcl.attr.nodiscard/p3.cpp
@@ -0,0 +1,17 @@
+// RUN: %clang_cc1 -std=c++1z -verify %s
+
+namespace std_example {
+  struct [[nodiscard]] error_info{
+    // ...
+  };
+
+  error_info enable_missile_safety_mode();
+  void launch_missiles();
+  void test_missiles() {
+    enable_missile_safety_mode(); // expected-warning {{ignoring return value of function declared with 'nodiscard'}}
+    launch_missiles();
+  }
+
+  error_info &foo();
+  void f() { foo(); } // no warning
+}
diff --git a/test/CXX/dcl.dcl/dcl.attr/dcl.attr.unused/p1.cpp b/test/CXX/dcl.dcl/dcl.attr/dcl.attr.unused/p1.cpp
new file mode 100644
index 0000000000000..8da2ca7d6d86b
--- /dev/null
+++ b/test/CXX/dcl.dcl/dcl.attr/dcl.attr.unused/p1.cpp
@@ -0,0 +1,5 @@
+// RUN: %clang_cc1 -fsyntax-only -Wunused -std=c++1z -verify %s
+
+struct [[maybe_unused]] S1 {}; // ok
+struct [[maybe_unused maybe_unused]] S2 {}; // expected-error {{attribute 'maybe_unused' cannot appear multiple times in an attribute specifier}}
+struct [[maybe_unused("Wrong")]] S3 {}; // expected-error {{'maybe_unused' cannot have an argument list}}
diff --git a/test/CXX/dcl.dcl/dcl.attr/dcl.attr.unused/p2.cpp b/test/CXX/dcl.dcl/dcl.attr/dcl.attr.unused/p2.cpp
new file mode 100644
index 0000000000000..b539ca48ae34d
--- /dev/null
+++ b/test/CXX/dcl.dcl/dcl.attr/dcl.attr.unused/p2.cpp
@@ -0,0 +1,17 @@
+// RUN: %clang_cc1 -fsyntax-only -Wunused -std=c++1z -verify %s
+
+struct [[maybe_unused]] S {
+  int I [[maybe_unused]];
+  static int SI [[maybe_unused]]; // expected-warning {{'maybe_unused' attribute only applies to variables, functions, methods, types, enumerations, enumerators, labels, and non-static data members}}
+};
+
+enum [[maybe_unused]] E1 {
+  EnumVal [[maybe_unused]]
+};
+
+[[maybe_unused]] void unused_func([[maybe_unused]] int parm) {
+  typedef int maybe_unused_int [[maybe_unused]];
+  [[maybe_unused]] int I;
+}
+
+namespace [[maybe_unused]] N {} // expected-warning {{'maybe_unused' attribute only applies to}}
diff --git a/test/CXX/dcl.dcl/dcl.attr/dcl.attr.unused/p3.cpp b/test/CXX/dcl.dcl/dcl.attr/dcl.attr.unused/p3.cpp
new file mode 100644
index 0000000000000..a627d8331a746
--- /dev/null
+++ b/test/CXX/dcl.dcl/dcl.attr/dcl.attr.unused/p3.cpp
@@ -0,0 +1,26 @@
+// RUN: %clang_cc1 -fsyntax-only -Wunused -Wused-but-marked-unused -std=c++1z -Wc++1z-extensions -verify %s
+// RUN: %clang_cc1 -fsyntax-only -Wunused -Wused-but-marked-unused -std=c++11 -Wc++1z-extensions -verify -DEXT %s
+
+static_assert(__has_cpp_attribute(maybe_unused) == 201603, "");
+
+struct [[maybe_unused]] S {};
+
+void f() {
+  int x; // expected-warning {{unused variable}}
+  typedef int I; // expected-warning {{unused typedef 'I'}}
+
+  // Should not warn about these due to not being used.
+  [[maybe_unused]] int y;
+  typedef int maybe_unused_int [[maybe_unused]];
+
+  // Should not warn about these uses.
+  S s;
+  maybe_unused_int test;
+  y = 12;
+}
+
+#ifdef EXT
+// expected-warning@6 {{use of the 'maybe_unused' attribute is a C++1z extension}}
+// expected-warning@13 {{use of the 'maybe_unused' attribute is a C++1z extension}}
+// expected-warning@14 {{use of the 'maybe_unused' attribute is a C++1z extension}}
+#endif
diff --git a/test/CXX/dcl.dcl/dcl.attr/dcl.attr.unused/p4.cpp b/test/CXX/dcl.dcl/dcl.attr/dcl.attr.unused/p4.cpp
new file mode 100644
index 0000000000000..d4a275930d64e
--- /dev/null
+++ b/test/CXX/dcl.dcl/dcl.attr/dcl.attr.unused/p4.cpp
@@ -0,0 +1,8 @@
+// RUN: %clang_cc1 -fsyntax-only -Wunused -std=c++1z -verify %s
+// expected-no-diagnostics
+
+void f();
+[[maybe_unused]] void f();
+
+void f() {
+}
diff --git a/test/CXX/dcl.dcl/dcl.spec/dcl.constexpr/p1.cpp b/test/CXX/dcl.dcl/dcl.spec/dcl.constexpr/p1.cpp
index 35dbec93e5bb3..5a4c5c9a9d630 100644
--- a/test/CXX/dcl.dcl/dcl.spec/dcl.constexpr/p1.cpp
+++ b/test/CXX/dcl.dcl/dcl.spec/dcl.constexpr/p1.cpp
@@ -1,4 +1,6 @@
 // RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++14 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++1z %s
 
 struct notlit { // expected-note {{not literal because}}
   notlit() {}
@@ -26,7 +28,12 @@ void f2(constexpr int i) {} // expected-error {{function parameter cannot be con
 // non-static member
 struct s2 {
   constexpr int mi1; // expected-error {{non-static data member cannot be constexpr; did you intend to make it const?}}
-  static constexpr int mi2; // expected-error {{requires an initializer}}
+  static constexpr int mi2;
+#if __cplusplus <= 201402L
+  // expected-error@-2 {{requires an initializer}}
+#else
+  // expected-error@-4 {{default initialization of an object of const}}
+#endif
   mutable constexpr int mi3 = 3; // expected-error-re {{non-static data member cannot be constexpr{{$}}}} expected-error {{'mutable' and 'const' cannot be mixed}}
 };
 // typedef
@@ -71,7 +78,7 @@ struct ConstexprDtor {
 template <typename T> constexpr T ft(T t) { return t; }
 template <typename T> T gt(T t) { return t; }
 struct S {
-  template<typename T> constexpr T f(); // expected-warning {{C++14}}
+  template<typename T> constexpr T f(); // expected-warning 0-1{{C++14}} expected-note 0-1{{candidate}}
   template <typename T>
   T g() const; // expected-note-re {{candidate template ignored: could not match 'T (){{( __attribute__\(\(thiscall\)\))?}} const' against 'char (){{( __attribute__\(\(thiscall\)\))?}}'}}
 };
@@ -82,7 +89,15 @@ template <> char ft(char c) { return c; } // expected-note {{previous}}
 template <> constexpr char ft(char nl); // expected-error {{constexpr declaration of 'ft<char>' follows non-constexpr declaration}}
 template <> constexpr int gt(int nl) { return nl; }
 template <> notlit S::f() const { return notlit(); }
-template <> constexpr int S::g() { return 0; } // expected-note {{previous}} expected-warning {{C++14}}
+#if __cplusplus >= 201402L
+// expected-error@-2 {{no function template matches}}
+#endif
+template <> constexpr int S::g() { return 0; } // expected-note {{previous}}
+#if __cplusplus < 201402L
+// expected-warning@-2 {{C++14}}
+#else
+// expected-error@-4 {{does not match any declaration in 'S'}}
+#endif
 template <> int S::g() const; // expected-error {{non-constexpr declaration of 'g<int>' follows constexpr declaration}}
 // specializations can drop the 'constexpr' but not the implied 'const'.
 template <> char S::g() { return 0; } // expected-error {{no function template matches}}
@@ -123,3 +138,11 @@ int next(constexpr int x) { // expected-error {{function parameter cannot be con
 }
 
 extern constexpr int memsz; // expected-error {{constexpr variable declaration must be a definition}}
+
+namespace {
+  struct A {
+    static constexpr int n = 0;
+  };
+  // FIXME: We should diagnose this prior to C++17.
+  const int &r = A::n;
+}
diff --git a/test/CXX/dcl.dcl/dcl.spec/dcl.inline/p1.cpp b/test/CXX/dcl.dcl/dcl.spec/dcl.inline/p1.cpp
new file mode 100644
index 0000000000000..6db0b04a7496e
--- /dev/null
+++ b/test/CXX/dcl.dcl/dcl.spec/dcl.inline/p1.cpp
@@ -0,0 +1,8 @@
+// RUN: %clang_cc1 -std=c++1z -verify %s
+
+inline int f(); // ok
+inline int n; // ok
+
+inline typedef int t; // expected-error {{'inline' can only appear on functions and non-local variables}}
+inline struct S {}; // expected-error {{'inline' can only appear on functions and non-local variables}}
+inline struct T {} s; // ok
diff --git a/test/CXX/dcl.dcl/dcl.spec/dcl.inline/p5.cpp b/test/CXX/dcl.dcl/dcl.spec/dcl.inline/p5.cpp
new file mode 100644
index 0000000000000..0ca7bbc5fad37
--- /dev/null
+++ b/test/CXX/dcl.dcl/dcl.spec/dcl.inline/p5.cpp
@@ -0,0 +1,15 @@
+// RUN: %clang_cc1 -std=c++1z -verify %s
+
+void x() {
+  inline int f(int); // expected-error {{inline declaration of 'f' not allowed in block scope}}
+  inline int n; // expected-error {{inline declaration of 'n' not allowed in block scope}}
+  static inline int m; // expected-error {{inline declaration of 'm' not allowed in block scope}}
+}
+
+inline void g();
+struct X {
+  inline void f();
+  // FIXME: This is ill-formed per [dcl.inline]p5.
+  inline void g();
+  inline void h() {}
+};
diff --git a/test/CXX/dcl.dcl/dcl.spec/dcl.type/dcl.spec.auto/p2-1z.cpp b/test/CXX/dcl.dcl/dcl.spec/dcl.type/dcl.spec.auto/p2-1z.cpp
new file mode 100644
index 0000000000000..e41270ee3cd29
--- /dev/null
+++ b/test/CXX/dcl.dcl/dcl.spec/dcl.type/dcl.spec.auto/p2-1z.cpp
@@ -0,0 +1,47 @@
+// RUN: %clang_cc1 -std=c++1z -verify %s
+
+template<typename T, typename U> constexpr bool same = false;
+template<typename T> constexpr bool same<T, T> = true;
+
+auto a() {
+  if constexpr (false)
+    return 0;
+}
+static_assert(same<decltype(a()), void>);
+
+auto b() {
+  if constexpr (false)
+    return 0;
+  else
+    return 0.0;
+}
+static_assert(same<decltype(b()), double>);
+
+auto c() {
+  if constexpr (true)
+    return "foo";
+  else
+    return 'x';
+  if constexpr (false)
+    return 7.6;
+  else
+    return 5; // expected-error {{deduced as 'int' here but deduced as 'const char *' in earlier}}
+}
+
+template<int k> auto d() {
+  if constexpr(k == 0)
+    return 0;
+  if constexpr(k == 1)
+    return "foo";
+  else if constexpr (k == 2)
+    return 1.0;
+}
+static_assert(same<decltype(d<0>()), int>);
+static_assert(same<decltype(d<1>()), const char *>);
+static_assert(same<decltype(d<2>()), double>);
+static_assert(same<decltype(d<3>()), void>);
+
+auto e = []{ if constexpr (false) return 0; }(); // expected-error {{variable has incomplete type 'void'}}
+
+auto f = []{ if constexpr (true) return 0; }();
+static_assert(same<decltype(e), int>);
diff --git a/test/CXX/dcl.dcl/dcl.spec/dcl.type/p3-0x.cpp b/test/CXX/dcl.dcl/dcl.spec/dcl.type/p3-0x.cpp
index 39d6e706b6c1c..447f7c5d6cf38 100644
--- a/test/CXX/dcl.dcl/dcl.spec/dcl.type/p3-0x.cpp
+++ b/test/CXX/dcl.dcl/dcl.spec/dcl.type/p3-0x.cpp
@@ -18,6 +18,9 @@ void f() {
   for (struct S { S(int) {} } s : arr) { // expected-error {{types may not be defined in a for range declaration}}
   }
 
+  for (struct S { S(int) {} } s : Undeclared); // expected-error{{types may not be defined in a for range declaration}}
+                                               // expected-error@-1{{use of undeclared identifier 'Undeclared'}}
+
   new struct T {}; // expected-error {{'T' cannot be defined in a type specifier}}
   new struct A {}; // expected-error {{'A' cannot be defined in a type specifier}}
 
diff --git a/test/CXX/dcl.dcl/dcl.spec/dcl.typedef/p2-0x.cpp b/test/CXX/dcl.dcl/dcl.spec/dcl.typedef/p2-0x.cpp
index 20b5104f83be7..8c6f6e5ddc796 100644
--- a/test/CXX/dcl.dcl/dcl.spec/dcl.typedef/p2-0x.cpp
+++ b/test/CXX/dcl.dcl/dcl.spec/dcl.typedef/p2-0x.cpp
@@ -38,8 +38,8 @@ namespace VariableLengthArrays {
   using T = int[n]; // expected-error {{variable length array declaration not allowed at file scope}}
 
   const int m = 42;
-  using U = int[m]; // expected-note {{previous definition}}
-  using U = int[42]; // ok
+  using U = int[m];
+  using U = int[42]; // expected-note {{previous definition}}
   using U = int; // expected-error {{type alias redefinition with different types ('int' vs 'int [42]')}}
 
   void f() {
diff --git a/test/CXX/dcl.decl/dcl.fct.def/dcl.fct.def.default/p2.cpp b/test/CXX/dcl.decl/dcl.fct.def/dcl.fct.def.default/p2.cpp
index 5cf281c185378..c2f3b5a04574b 100644
--- a/test/CXX/dcl.decl/dcl.fct.def/dcl.fct.def.default/p2.cpp
+++ b/test/CXX/dcl.decl/dcl.fct.def/dcl.fct.def.default/p2.cpp
@@ -116,6 +116,7 @@ static_assert(!noexcept(e5 = e5), "");
 namespace PR13492 {
   struct B {
     B() = default;
+    int field;
   };
 
   void f() {
diff --git a/test/CXX/dcl.decl/dcl.init/dcl.init.aggr/p1-0x.cpp b/test/CXX/dcl.decl/dcl.init/dcl.init.aggr/p1-0x.cpp
deleted file mode 100644
index 8767678362a3c..0000000000000
--- a/test/CXX/dcl.decl/dcl.init/dcl.init.aggr/p1-0x.cpp
+++ /dev/null
@@ -1,81 +0,0 @@
-// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
-// RUN: %clang_cc1 -fsyntax-only -verify -std=c++1y %s -DCXX1Y
-
-// An aggregate is an array or a class...
-struct Aggr {
-private:
-  static const int n;
-  void f();
-protected:
-  struct Inner { int m; };
-public:
-  bool &br; // expected-note {{default constructor of 'Aggr' is implicitly deleted because field 'br' of reference type 'bool &' would not be initialized}}
-};
-bool b;
-Aggr ag = { b };
-
-// with no user-provided constructors, ...
-struct NonAggr1a { // expected-note 2 {{candidate constructor}}
-  NonAggr1a(int, int); // expected-note {{candidate constructor}}
-  int k;
-};
-NonAggr1a na1a = { 42 }; // expected-error {{no matching constructor for initialization of 'NonAggr1a'}}
-
-struct NonAggr1b {
-  NonAggr1b(const NonAggr1b &); // expected-note {{candidate constructor}}
-  int k;
-};
-NonAggr1b na1b = { 42 }; // expected-error {{no matching constructor for initialization of 'NonAggr1b'}}
-
-// no brace-or-equal-initializers for non-static data members, ...
-// Note, this bullet was removed in C++1y.
-struct NonAggr2 {
-  int m = { 123 };
-};
-NonAggr2 na2 = { 42 };
-#ifndef CXX1Y
-// expected-error@-2 {{no matching constructor for initialization of 'NonAggr2'}}
-// expected-note@-6 3 {{candidate constructor}}
-#endif
-
-// no private...
-struct NonAggr3 { // expected-note 3 {{candidate constructor}}
-private:
-  int n;
-};
-NonAggr3 na3 = { 42 }; // expected-error {{no matching constructor for initialization of 'NonAggr3'}}
-
-// or protected non-static data members, ...
-struct NonAggr4 { // expected-note 3 {{candidate constructor}}
-protected:
-  int n;
-};
-NonAggr4 na4 = { 42 }; // expected-error {{no matching constructor for initialization of 'NonAggr4'}}
-
-// no base classes, ...
-struct NonAggr5 : Aggr { // expected-note 3 {{candidate constructor}}
-};
-NonAggr5 na5 = { b }; // expected-error {{no matching constructor for initialization of 'NonAggr5'}}
-template<typename...BaseList>
-struct MaybeAggr5a : BaseList... {}; // expected-note {{default constructor of 'MaybeAggr5a<Aggr>' is implicitly deleted because base class 'Aggr' has a deleted default constructor}}
-MaybeAggr5a<> ma5a0 = {}; // ok
-MaybeAggr5a<Aggr> ma5a1 = {}; // expected-error {{call to implicitly-deleted default constructor of 'MaybeAggr5a<Aggr>'}}
-
-// and no virtual functions.
-struct NonAggr6 { // expected-note 3 {{candidate constructor}}
-  virtual void f();
-  int n;
-};
-NonAggr6 na6 = { 42 }; // expected-error {{no matching constructor for initialization of 'NonAggr6'}}
-
-struct DefaultedAggr {
-  int n;
-
-  DefaultedAggr() = default;
-  DefaultedAggr(const DefaultedAggr &) = default;
-  DefaultedAggr(DefaultedAggr &&) = default;
-  DefaultedAggr &operator=(const DefaultedAggr &) = default;
-  DefaultedAggr &operator=(DefaultedAggr &&) = default;
-  ~DefaultedAggr() = default;
-};
-DefaultedAggr da = { 42 } ;
diff --git a/test/CXX/dcl.decl/dcl.init/dcl.init.aggr/p1.cpp b/test/CXX/dcl.decl/dcl.init/dcl.init.aggr/p1.cpp
new file mode 100644
index 0000000000000..40f6431e3ddb8
--- /dev/null
+++ b/test/CXX/dcl.decl/dcl.init/dcl.init.aggr/p1.cpp
@@ -0,0 +1,124 @@
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++14 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++1z %s
+
+// An aggregate is an array or a class...
+struct Aggr {
+private:
+  static const int n;
+  void f();
+protected:
+  struct Inner { int m; };
+public:
+  bool &br;
+};
+bool b;
+Aggr ag = { b };
+
+// with no user-provided constructors, ...
+struct NonAggr1a { // expected-note 2 {{candidate constructor}}
+  NonAggr1a(int, int); // expected-note {{candidate constructor}}
+  int k;
+};
+NonAggr1a na1a = { 42 }; // expected-error {{no matching constructor for initialization of 'NonAggr1a'}}
+
+struct NonAggr1b {
+  NonAggr1b(const NonAggr1b &); // expected-note {{candidate constructor}}
+  int k;
+};
+NonAggr1b na1b = { 42 }; // expected-error {{no matching constructor for initialization of 'NonAggr1b'}}
+
+// no brace-or-equal-initializers for non-static data members, ...
+// Note, this bullet was removed in C++1y.
+struct NonAggr2 {
+  int m = { 123 };
+};
+NonAggr2 na2 = { 42 };
+#if __cplusplus < 201402L
+// expected-error@-2 {{no matching constructor for initialization of 'NonAggr2'}}
+// expected-note@-6 3 {{candidate constructor}}
+#endif
+
+// no private...
+struct NonAggr3 { // expected-note 3 {{candidate constructor}}
+private:
+  int n;
+};
+NonAggr3 na3 = { 42 }; // expected-error {{no matching constructor for initialization of 'NonAggr3'}}
+
+// or protected non-static data members, ...
+struct NonAggr4 { // expected-note 3 {{candidate constructor}}
+protected:
+  int n;
+};
+NonAggr4 na4 = { 42 }; // expected-error {{no matching constructor for initialization of 'NonAggr4'}}
+
+// [pre-C++1z] no base classes, ...
+struct NonAggr5 : Aggr {
+};
+NonAggr5 na5 = { b };
+#if __cplusplus <= 201402L
+// expected-error@-2 {{no matching constructor for initialization of 'NonAggr5'}}
+// expected-note@-5 3 {{candidate constructor}}
+#endif
+template<typename...BaseList>
+struct MaybeAggr5a : BaseList... {};
+MaybeAggr5a<> ma5a0 = {}; // ok
+MaybeAggr5a<Aggr> ma5a1 = {}; // ok in C++17
+MaybeAggr5a<NonAggr2> m5a2 = {}; // ok, aggregate init in C++17, default ctor in C++11 and C++14
+MaybeAggr5a<NonAggr2> m5a3 = {0}; // ok in C++17, overrides default member initializer in base class
+#if __cplusplus <= 201402L
+// expected-error@-4 {{call to implicitly-deleted default constructor of 'MaybeAggr5a<Aggr>'}}
+// expected-note@-7 {{default constructor of 'MaybeAggr5a<Aggr>' is implicitly deleted because base class 'Aggr' has a deleted default constructor}}
+// expected-note@13 {{default constructor of 'Aggr' is implicitly deleted because field 'br' of reference type 'bool &' would not be initialized}}
+// expected-error@-5 {{no matching constructor}} expected-note@-9 3{{candidate}}
+#else
+// expected-error@-9 {{reference member of type 'bool &' uninitialized}}
+// expected-note@13 {{uninitialized reference member is here}}
+#endif
+
+// [C++1z] no virtual, protected, or private base classes, ...
+struct NonAggr5b : virtual Aggr {}; // expected-note 3{{candidate}}
+NonAggr5b na5b = { b }; // expected-error {{no matching constructor}}
+struct NonAggr5c : NonAggr5b {}; // expected-note 3{{candidate}}
+NonAggr5c na5c = { b }; // expected-error {{no matching constructor}}
+struct NonAggr5d : protected Aggr {}; // expected-note 3{{candidate}}
+NonAggr5d na5d = { b }; // expected-error {{no matching constructor}}
+struct NonAggr5e : private Aggr {}; // expected-note 3{{candidate}}
+NonAggr5e na5e = { b }; // expected-error {{no matching constructor}}
+class NonAggr5f : Aggr {}; // expected-note 3{{candidate}}
+NonAggr5f na5f = { b }; // expected-error {{no matching constructor}}
+
+// [C++1z] (the base class need not itself be an aggregate)
+struct MaybeAggr5g : NonAggr1a {};
+MaybeAggr5g ma5g1 = { 1 };
+MaybeAggr5g ma5g2 = { {1, 2} };
+MaybeAggr5g ma5g3 = {};
+#if __cplusplus <= 201402L
+// expected-error@-4 {{no matching constructor}} // expected-note@-5 3{{candidate}}
+// expected-error@-4 {{no matching constructor}} // expected-note@-6 3{{candidate}}
+// expected-error@-4 {{implicitly-deleted default constructor}} expected-note@-7 {{no default constructor}}
+#else
+// expected-error@-8 {{no viable conversion from 'int' to 'NonAggr1a'}} expected-note@19 2{{candidate}}
+// (ok)
+// expected-error@-8 {{no matching constructor}} expected-note@19 2{{candidate}} expected-note@20 {{candidate}}
+#endif
+
+// and no virtual functions.
+struct NonAggr6 { // expected-note 3 {{candidate constructor}}
+  virtual void f();
+  int n;
+};
+NonAggr6 na6 = { 42 }; // expected-error {{no matching constructor for initialization of 'NonAggr6'}}
+
+struct DefaultedAggr {
+  int n;
+
+  DefaultedAggr() = default;
+  DefaultedAggr(const DefaultedAggr &) = default;
+  DefaultedAggr(DefaultedAggr &&) = default;
+  DefaultedAggr &operator=(const DefaultedAggr &) = default;
+  DefaultedAggr &operator=(DefaultedAggr &&) = default;
+  ~DefaultedAggr() = default;
+};
+DefaultedAggr da = { 42 } ;
diff --git a/test/CXX/dcl.decl/dcl.init/dcl.init.list/p3-0x.cpp b/test/CXX/dcl.decl/dcl.init/dcl.init.list/p3.cpp
index d7ffd0758a68a..f381ed708f702 100644
--- a/test/CXX/dcl.decl/dcl.init/dcl.init.list/p3-0x.cpp
+++ b/test/CXX/dcl.decl/dcl.init/dcl.init.list/p3.cpp
@@ -1,4 +1,7 @@
 // RUN: %clang_cc1 -std=c++11 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -std=c++14 -fsyntax-only -verify %s
+// FIXME: Remove the triple when PR27098 is fixed.
+// RUN: %clang_cc1 -std=c++1z -fsyntax-only -verify %s -triple %itanium_abi_triple
 
 namespace std {
   typedef decltype(sizeof(int)) size_t;
@@ -125,3 +128,128 @@ namespace rdar13395022 {
     // expected-note@-2 {{in initialization of temporary of type 'rdar13395022::MoveOnly [1]' created to list-initialize this reference}}
   }
 }
+
+namespace cxx1z_direct_enum_init {
+  enum A {};
+  enum B : char {};
+  enum class C {};
+  enum class D : char {};
+  enum class E : char { k = 5 };
+
+  template<typename T> void good() {
+    (void)T{0};
+    T t1{0};
+    T t2 = T{0};
+
+    struct S { T t; };
+    S s{T{0}};
+
+    struct U { T t{0}; } u; // expected-note 0+{{instantiation of}}
+
+    struct V { T t; V() : t{0} {} }; // expected-note 0+{{instantiation of}}
+
+    void f(T);
+    f(T{0});
+  }
+#if __cplusplus <= 201402L
+  // expected-error@-15 5{{cannot initialize}}
+  // expected-error@-15 5{{cannot initialize}}
+  // expected-error@-15 5{{cannot initialize}}
+  //
+  //
+  // expected-error@-15 5{{cannot initialize}}
+  //
+  // expected-error@-15 5{{cannot initialize}}
+  //
+  // expected-error@-15 5{{cannot initialize}}
+  //
+  //
+  // expected-error@-15 5{{cannot initialize}}
+#else
+  // expected-error@-29 {{cannot initialize}}
+  // expected-error@-29 {{cannot initialize}}
+  // expected-error@-29 {{cannot initialize}}
+  //
+  //
+  // expected-error@-29 {{cannot initialize}}
+  //
+  // expected-error@-29 {{cannot initialize}}
+  //
+  // expected-error@-29 {{cannot initialize}}
+  //
+  //
+  // expected-error@-29 {{cannot initialize}}
+#endif
+
+  template<typename T> void bad() {
+    T t = {0};
+
+    struct S { T t; };
+    S s1{0};
+    S s2{{0}};
+
+    struct U { T t = {0}; } u; // expected-note 0+{{instantiation of}}
+
+    struct V { T t; V() : t({0}) {} }; // expected-note 0+{{instantiation of}}
+
+    void f(T); // expected-note 0+{{passing argument}}
+    f({0});
+  }
+  // expected-error@-13 5{{cannot initialize}}
+  //
+  //
+  // expected-error@-13 5{{cannot initialize}}
+  // expected-error@-13 5{{cannot initialize}}
+  //
+  // expected-error@-13 5{{cannot initialize}}
+  //
+  // expected-error@-13 5{{cannot initialize}}
+  //
+  //
+  // expected-error@-13 5{{cannot initialize}}
+
+  template<typename T> void ugly() {
+    extern char c;
+    T t1{char('0' + c)};
+    T t2{'0' + c};
+    T t3{1234};
+  }
+#if __cplusplus <= 201402L
+  // expected-error@-5 4{{cannot initialize}}
+  // expected-error@-5 4{{cannot initialize}}
+  // expected-error@-5 4{{cannot initialize}}
+#else
+  // expected-error@-8 3{{non-constant-expression cannot be narrowed}}
+  // expected-error@-8 3{{constant expression evaluates to 1234 which cannot be narrowed}} expected-warning@-8 {{changes value}}
+#endif
+
+  void test() {
+    good<A>(); // expected-note 4{{instantiation of}}
+    good<B>();
+    good<C>();
+    good<D>();
+    good<E>();
+#if __cplusplus <= 201402L
+    // expected-note@-5 4{{instantiation of}}
+    // expected-note@-5 4{{instantiation of}}
+    // expected-note@-5 4{{instantiation of}}
+    // expected-note@-5 4{{instantiation of}}
+#endif
+
+    bad<A>(); // expected-note 4{{instantiation of}}
+    bad<B>(); // expected-note 4{{instantiation of}}
+    bad<C>(); // expected-note 4{{instantiation of}}
+    bad<D>(); // expected-note 4{{instantiation of}}
+    bad<E>(); // expected-note 4{{instantiation of}}
+
+    ugly<B>(); // expected-note {{instantiation of}}
+    ugly<C>(); // ok
+    ugly<D>(); // expected-note {{instantiation of}}
+    ugly<E>(); // expected-note {{instantiation of}}
+#if __cplusplus <= 201402L
+    // expected-note@-4 {{instantiation of}}
+#else
+    (void)B{0.0}; // expected-error {{type 'double' cannot be narrowed}}
+#endif
+  }
+}
diff --git a/test/CXX/dcl.decl/dcl.init/p6.cpp b/test/CXX/dcl.decl/dcl.init/p6.cpp
index e404a1ebc19e9..b646ba776a9cf 100644
--- a/test/CXX/dcl.decl/dcl.init/p6.cpp
+++ b/test/CXX/dcl.decl/dcl.init/p6.cpp
@@ -4,9 +4,9 @@
 
 // If a program calls for the default initialization of an object of a
 // const-qualified type T, T shall be a class type with a
-// user-provided default constructor.
+// user-provided default constructor, except if T has no uninitialized fields.
 struct MakeNonPOD { MakeNonPOD(); };
-struct NoUserDefault : public MakeNonPOD { };
+struct NoUserDefault : public MakeNonPOD { int field; };
 struct HasUserDefault { HasUserDefault(); };
 
 void test_const_default_init() {
@@ -16,7 +16,7 @@ void test_const_default_init() {
 }
 
 // rdar://8501008
-struct s0 {};
+struct s0 { int field; };
 struct s1 { static const s0 foo; };
 const struct s0 s1::foo; // expected-error{{default initialization of an object of const type 'const struct s0' without a user-provided default constructor}}
 
diff --git a/test/CXX/dcl.decl/dcl.meaning/dcl.array/p3.cpp b/test/CXX/dcl.decl/dcl.meaning/dcl.array/p3.cpp
index 4686b1c961ec3..188a0a2c7eed0 100644
--- a/test/CXX/dcl.decl/dcl.meaning/dcl.array/p3.cpp
+++ b/test/CXX/dcl.decl/dcl.meaning/dcl.array/p3.cpp
@@ -207,3 +207,7 @@ namespace use_outside_ns {
     int j() { return sizeof(d); }
   }
 }
+
+extern int arr[];
+void f1() { extern int arr[2]; } // expected-note {{previous}}
+void f2() { extern int arr[3]; } // expected-error {{different type: 'int [3]' vs 'int [2]'}}
diff --git a/test/CXX/drs/dr12xx.cpp b/test/CXX/drs/dr12xx.cpp
new file mode 100644
index 0000000000000..048c21acde568
--- /dev/null
+++ b/test/CXX/drs/dr12xx.cpp
@@ -0,0 +1,18 @@
+// RUN: %clang_cc1 -std=c++98 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++11 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++14 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors
+// RUN: %clang_cc1 -std=c++1z %s -verify -fexceptions -fcxx-exceptions -pedantic-errors
+
+// expected-no-diagnostics
+
+namespace dr1250 {  // dr1250: 3.9
+struct Incomplete;
+
+struct Base {
+  virtual const Incomplete *meow() = 0;
+};
+
+struct Derived : Base {
+  virtual Incomplete *meow();
+};
+} // dr1250
diff --git a/test/CXX/drs/dr13xx.cpp b/test/CXX/drs/dr13xx.cpp
index 37c144eb0e015..8c3e7f2a04ca1 100644
--- a/test/CXX/drs/dr13xx.cpp
+++ b/test/CXX/drs/dr13xx.cpp
@@ -28,3 +28,19 @@ namespace dr1346 { // dr1346: 3.5
   }
 #endif
 }
+
+namespace dr1359 { // dr1359: 3.5
+#if __cplusplus >= 201103L
+  union A { constexpr A() = default; };
+  union B { constexpr B() = default; int a; }; // expected-error {{not constexpr}} expected-note 2{{candidate}}
+  union C { constexpr C() = default; int a, b; }; // expected-error {{not constexpr}} expected-note 2{{candidate}}
+  struct X { constexpr X() = default; union {}; };
+  struct Y { constexpr Y() = default; union { int a; }; }; // expected-error {{not constexpr}} expected-note 2{{candidate}}
+
+  constexpr A a = A();
+  constexpr B b = B(); // expected-error {{no matching}}
+  constexpr C c = C(); // expected-error {{no matching}}
+  constexpr X x = X();
+  constexpr Y y = Y(); // expected-error {{no matching}}
+#endif
+}
diff --git a/test/CXX/drs/dr15xx.cpp b/test/CXX/drs/dr15xx.cpp
index 7472be72c2f13..5f85a196fd6af 100644
--- a/test/CXX/drs/dr15xx.cpp
+++ b/test/CXX/drs/dr15xx.cpp
@@ -22,6 +22,31 @@ namespace dr1560 { // dr1560: 3.5
   const X &x = true ? get() : throw 0;
 }
 
+namespace dr1573 { // dr1573: 3.9
+#if __cplusplus >= 201103L
+  // ellipsis is inherited (p0136r1 supersedes this part).
+  struct A { A(); A(int, char, ...); };
+  struct B : A { using A::A; };
+  B b(1, 'x', 4.0, "hello"); // ok
+
+  // inherited constructor is effectively constexpr if the user-written constructor would be
+  struct C { C(); constexpr C(int) {} };
+  struct D : C { using C::C; };
+  constexpr D d = D(0); // ok
+  struct E : C { using C::C; A a; }; // expected-note {{non-literal type}}
+  constexpr E e = E(0); // expected-error {{non-literal type}}
+  // FIXME: This diagnostic is pretty bad; we should explain that the problem
+  // is that F::c would be initialized by a non-constexpr constructor.
+  struct F : C { using C::C; C c; }; // expected-note {{here}}
+  constexpr F f = F(0); // expected-error {{constant expression}} expected-note {{constructor inherited from base class 'C'}}
+
+  // inherited constructor is effectively deleted if the user-written constructor would be
+  struct G { G(int); };
+  struct H : G { using G::G; G g; }; // expected-note {{constructor inherited by 'H' is implicitly deleted because field 'g' has no default constructor}}
+  H h(0); // expected-error {{constructor inherited by 'H' from base class 'G' is implicitly deleted}}
+#endif
+}
+
 #if __cplusplus >= 201103L
 namespace std {
   typedef decltype(sizeof(int)) size_t;
@@ -62,6 +87,62 @@ namespace std {
 
 } // std
 
+namespace dr1579 { // dr1579: 3.9
+template<class T>
+struct GenericMoveOnly {
+  GenericMoveOnly();
+  template<class U> GenericMoveOnly(const GenericMoveOnly<U> &) = delete; // expected-note 5 {{marked deleted here}}
+  GenericMoveOnly(const int &) = delete; // expected-note 2 {{marked deleted here}}
+  template<class U> GenericMoveOnly(GenericMoveOnly<U> &&);
+  GenericMoveOnly(int &&);
+};
+
+GenericMoveOnly<float> DR1579_Eligible(GenericMoveOnly<char> CharMO) {
+  int i;
+  GenericMoveOnly<char> GMO;
+
+  if (0)
+    return i;
+  else if (0)
+    return GMO;
+  else if (0)
+    return ((GMO));
+  else
+    return CharMO;
+}
+
+GenericMoveOnly<char> GlobalMO;
+
+GenericMoveOnly<float> DR1579_Ineligible(int &AnInt,
+                                          GenericMoveOnly<char> &CharMO) {
+  static GenericMoveOnly<char> StaticMove;
+  extern GenericMoveOnly<char> ExternMove;
+
+  if (0)
+    return AnInt; // expected-error{{invokes a deleted function}}
+  else if (0)
+    return GlobalMO; // expected-error{{invokes a deleted function}}
+  else if (0)
+    return StaticMove; // expected-error{{invokes a deleted function}}
+  else if (0)
+    return ExternMove; // expected-error{{invokes a deleted function}}
+  else if (0)
+    return AnInt; // expected-error{{invokes a deleted function}}
+  else
+    return CharMO; // expected-error{{invokes a deleted function}}
+}
+
+auto DR1579_lambda_valid = [](GenericMoveOnly<float> mo) ->
+  GenericMoveOnly<char> {
+  return mo;
+};
+
+auto DR1579_lambda_invalid = []() -> GenericMoveOnly<char> {
+  static GenericMoveOnly<float> mo;
+  return mo; // expected-error{{invokes a deleted function}}
+};
+} // end namespace dr1579
+
 namespace dr1589 {   // dr1589: 3.7 c++11
   // Ambiguous ranking of list-initialization sequences
 
diff --git a/test/CXX/drs/dr16xx.cpp b/test/CXX/drs/dr16xx.cpp
index ddb7d16869ba8..65467e35e4d68 100644
--- a/test/CXX/drs/dr16xx.cpp
+++ b/test/CXX/drs/dr16xx.cpp
@@ -18,8 +18,8 @@ namespace dr1684 { // dr1684: 3.6
 #endif
 }
 
+namespace dr1631 {  // dr1631: 3.7
 #if __cplusplus >= 201103L
-namespace dr1631 {  // dr1631: 3.7 c++11
   // Incorrect overload resolution for single-element initializer-list
 
   struct A { int a[1]; };
@@ -41,5 +41,22 @@ namespace dr1631 {  // dr1631: 3.7 c++11
       f({0}, {{1}});        // expected-error{{call to 'f' is ambiguous}}
     }
   }
-} // dr1631
 #endif
+}
+
+namespace dr1645 { // dr1645: 3.9
+#if __cplusplus >= 201103L
+  struct A { // expected-note 2{{candidate}}
+    constexpr A(int, float = 0); // expected-note 2{{candidate}}
+    explicit A(int, int = 0); // expected-note 2{{candidate}}
+    A(int, int, int = 0) = delete; // expected-note {{candidate}}
+  };
+
+  struct B : A { // expected-note 2{{candidate}}
+    using A::A; // expected-note 7{{inherited here}}
+  };
+
+  constexpr B a(0); // expected-error {{ambiguous}}
+  constexpr B b(0, 0); // expected-error {{ambiguous}}
+#endif
+}
diff --git a/test/CXX/drs/dr17xx.cpp b/test/CXX/drs/dr17xx.cpp
index 1ab8c40d3972f..a917412adcbde 100644
--- a/test/CXX/drs/dr17xx.cpp
+++ b/test/CXX/drs/dr17xx.cpp
@@ -3,19 +3,63 @@
 // RUN: %clang_cc1 -std=c++14 %s -verify -fexceptions -fcxx-exceptions -pedantic-errors
 // RUN: %clang_cc1 -std=c++1z %s -verify -fexceptions -fcxx-exceptions -pedantic-errors
 
+#if __cplusplus < 201103L
 // expected-no-diagnostics
+#endif
 
+namespace dr1715 { // dr1715: 3.9
+#if __cplusplus >= 201103L
+  struct B {
+    template<class T> B(T, typename T::Q);
+  };
+
+  class S {
+    using Q = int;
+    template<class T> friend B::B(T, typename T::Q);
+  };
+
+  struct D : B {
+    using B::B;
+  };
+  struct E : B { // expected-note 2{{candidate}}
+    template<class T> E(T t, typename T::Q q) : B(t, q) {} // expected-note {{'Q' is a private member}}
+  };
+
+  B b(S(), 1);
+  D d(S(), 2);
+  E e(S(), 3); // expected-error {{no match}}
+#endif
+}
+
+namespace dr1736 { // dr1736: 3.9
+#if __cplusplus >= 201103L
+struct S {
+  template <class T> S(T t) {
+    struct L : S {
+      using S::S;
+    };
+    typename T::type value; // expected-error {{no member}}
+    L l(value); // expected-note {{instantiation of}}
+  }
+};
+struct Q { typedef int type; } q;
+S s(q); // expected-note {{instantiation of}}
+#endif
+}
+
+namespace dr1756 { // dr1756: 3.7
 #if __cplusplus >= 201103L
-namespace dr1756 {  // dr1756: 3.7 c++11
   // Direct-list-initialization of a non-class object
   
   int a{0};
   
   struct X { operator int(); } x;
   int b{x};
-} // dr1756
+#endif
+}
 
-namespace dr1758 {  // dr1758: 3.7 c++11
+namespace dr1758 { // dr1758: 3.7
+#if __cplusplus >= 201103L
   // Explicit conversion in copy/move list initialization
 
   struct X { X(); };
@@ -30,5 +74,5 @@ namespace dr1758 {  // dr1758: 3.7 c++11
     operator A() { return A(); }
   } b;
   A a{b};
-} // dr1758
 #endif
+}
diff --git a/test/CXX/drs/dr19xx.cpp b/test/CXX/drs/dr19xx.cpp
index 368e7b341652d..5b626dd80892b 100644
--- a/test/CXX/drs/dr19xx.cpp
+++ b/test/CXX/drs/dr19xx.cpp
@@ -39,6 +39,31 @@ namespace dr1902 { // dr1902: 3.7
 #endif
 }
 
+namespace dr1903 {
+  namespace A {
+    struct a {};
+    int a;
+    namespace B {
+      int b;
+    }
+    using namespace B;
+    namespace {
+      int c;
+    }
+    namespace D {
+      int d;
+    }
+    using D::d;
+  }
+  namespace X {
+    using A::a;
+    using A::b;
+    using A::c;
+    using A::d;
+    struct a *p;
+  }
+}
+
 namespace dr1909 { // dr1909: yes
   struct A {
     template<typename T> struct A {}; // expected-error {{member 'A' has the same name as its class}}
@@ -54,22 +79,52 @@ namespace dr1909 { // dr1909: yes
   };
 }
 
-#if __cplusplus >= 201103L
 namespace dr1940 { // dr1940: yes
+#if __cplusplus >= 201103L
 static union {
   static_assert(true, "");  // ok
   static_assert(false, ""); // expected-error {{static_assert failed}}
 };
-}
 #endif
+}
 
+namespace dr1941 { // dr1941: 3.9
 #if __cplusplus >= 201402L
+template<typename X>
+struct base {
+  template<typename T>
+  base(T a, T b, decltype(void(*T()), 0) = 0) {
+    while (a != b) (void)*a++;
+  }
+
+  template<typename T>
+  base(T a, X x, decltype(void(T(0) * 1), 0) = 0) {
+    for (T n = 0; n != a; ++n) (void)X(x);
+  }
+};
+
+struct derived : base<int> {
+  using base::base;
+};
+
+struct iter {
+  iter operator++(int);
+  int operator*();
+  friend bool operator!=(iter, iter);
+} it, end;
+
+derived d1(it, end);
+derived d2(42, 9);
+#endif
+}
+
 namespace dr1947 { // dr1947: yes
+#if __cplusplus >= 201402L
 unsigned o = 0'01;  // ok
 unsigned b = 0b'01; // expected-error {{invalid digit 'b' in octal constant}}
 unsigned x = 0x'01; // expected-error {{invalid suffix 'x'01' on integer constant}}
-}
 #endif
+}
 
 #if __cplusplus >= 201103L
 // dr1948: yes
@@ -77,10 +132,58 @@ unsigned x = 0x'01; // expected-error {{invalid suffix 'x'01' on integer constan
 void *operator new(__SIZE_TYPE__) noexcept { return nullptr; } // expected-error{{exception specification in declaration does not match previous declaration}}
 #endif
 
+namespace dr1959 { // dr1959: 3.9
 #if __cplusplus >= 201103L
+  struct b;
+  struct c;
+  struct a {
+    a() = default;
+    a(const a &) = delete; // expected-note 2{{deleted}}
+    a(const b &) = delete; // not inherited
+    a(c &&) = delete; // expected-note {{deleted}}
+    template<typename T> a(T) = delete;
+  };
+
+  struct b : a { // expected-note {{copy constructor of 'b' is implicitly deleted because base class 'dr1959::a' has a deleted copy constructor}}
+    using a::a;
+  };
+
+  a x;
+  b y = x; // expected-error {{deleted}}
+  b z = z; // expected-error {{deleted}}
+
+  // FIXME: It's not really clear that this matches the intent, but it's
+  // consistent with the behavior for assignment operators.
+  struct c : a {
+    using a::a;
+    c(const c &);
+  };
+  c q(static_cast<c&&>(q)); // expected-error {{call to deleted}}
+#endif
+}
+
 namespace dr1968 { // dr1968: yes
-static_assert(&typeid(int) == &typeid(int), ""); // expected-error{{not an integral constant expression}}
+#if __cplusplus >= 201103L
+  static_assert(&typeid(int) == &typeid(int), ""); // expected-error{{not an integral constant expression}}
+#endif
 }
+
+namespace dr1991 { // dr1991: 3.9
+#if __cplusplus >= 201103L
+  struct A {
+    A(int, int) = delete;
+  };
+
+  struct B : A {
+    using A::A;
+    B(int, int, int = 0);
+  };
+
+  // FIXME: As a resolution to an open DR against P0136R1, we treat derived
+  // class constructors as better than base class constructors in the presence
+  // of ambiguity.
+  B b(0, 0); // ok, calls B constructor
 #endif
+}
 
 // dr1994: dup 529
diff --git a/test/CXX/drs/dr1xx.cpp b/test/CXX/drs/dr1xx.cpp
index 47d1494ec7a3c..8d368a5a54e85 100644
--- a/test/CXX/drs/dr1xx.cpp
+++ b/test/CXX/drs/dr1xx.cpp
@@ -902,7 +902,11 @@ namespace dr183 { // dr183: sup 382
     typedef int X;
   };
   template<> struct A<int> {
+#if __cplusplus <= 199711
+    typename B<int>::X x; // expected-error {{'typename' occurs outside of a template}}
+#else
     typename B<int>::X x;
+#endif
   };
 }
 
diff --git a/test/CXX/drs/dr4xx.cpp b/test/CXX/drs/dr4xx.cpp
index bceea793faf8c..b1c21f8631dd1 100644
--- a/test/CXX/drs/dr4xx.cpp
+++ b/test/CXX/drs/dr4xx.cpp
@@ -702,8 +702,8 @@ namespace dr460 { // dr460: yes
   namespace X { namespace Q { int n; } }
   namespace Y {
     using X; // expected-error {{requires a qualified name}}
-    using dr460::X; // expected-error {{cannot refer to namespace}}
-    using X::Q; // expected-error {{cannot refer to namespace}}
+    using dr460::X; // expected-error {{cannot refer to a namespace}}
+    using X::Q; // expected-error {{cannot refer to a namespace}}
   }
 }
 
@@ -1197,12 +1197,12 @@ namespace dr496 { // dr496: no
   int check6[ __is_trivially_assignable(B, const B&) ? 1 : -1];
 }
 
-namespace dr497 { // dr497: yes
+namespace dr497 { // dr497: sup 253
   void before() {
     struct S {
       mutable int i;
     };
-    const S cs; // expected-error {{default initialization}}
+    const S cs;
     int S::*pm = &S::i;
     cs.*pm = 88; // expected-error {{not assignable}}
   }
diff --git a/test/CXX/drs/dr5xx.cpp b/test/CXX/drs/dr5xx.cpp
index 96d3494bf060f..e0bab57e5254b 100644
--- a/test/CXX/drs/dr5xx.cpp
+++ b/test/CXX/drs/dr5xx.cpp
@@ -814,7 +814,7 @@ namespace dr577 { // dr577: yes
   }
 }
 
-namespace dr580 { // dr580: no
+namespace dr580 { // dr580: partial
   class C;
   struct A { static C c; };
   struct B { static C c; };
@@ -822,7 +822,7 @@ namespace dr580 { // dr580: no
     C(); // expected-note {{here}}
     ~C(); // expected-note {{here}}
 
-    typedef int I; // expected-note {{here}}
+    typedef int I; // expected-note 2{{here}}
     template<int> struct X;
     template<int> friend struct Y;
     template<int> void f();
@@ -832,7 +832,20 @@ namespace dr580 { // dr580: no
 
   template<C::I> struct C::X {};
   template<C::I> struct Y {};
-  template<C::I> struct Z {}; // FIXME: should reject, accepted because C befriends A!
+  template<C::I> struct Z {}; // expected-error {{private}}
+
+  struct C2 {
+    class X {
+      struct A;
+      typedef int I;
+      friend struct A;
+    };
+    class Y {
+      template<X::I> struct A {}; // FIXME: We incorrectly accept this
+                                  // because we think C2::Y::A<...> might
+                                  // instantiate to C2::X::A
+    };
+  };
 
   template<C::I> void C::f() {}
   template<C::I> void g() {}
diff --git a/test/CXX/drs/dr6xx.cpp b/test/CXX/drs/dr6xx.cpp
index 988c8f43011d0..1d37a6d3e80c8 100644
--- a/test/CXX/drs/dr6xx.cpp
+++ b/test/CXX/drs/dr6xx.cpp
@@ -146,9 +146,9 @@ namespace dr616 { // dr616: no
 #if __cplusplus >= 201103L
   struct S { int n; } s;
   // FIXME: These should all be 'int &&'
-  using T = decltype(S().n); // expected-note 2{{previous}}
+  using T = decltype(S().n);
   using T = decltype(static_cast<S&&>(s).n);
-  using T = decltype(S().*&S::n);
+  using T = decltype(S().*&S::n); // expected-note 2{{previous}}
   using T = decltype(static_cast<S&&>(s).*&S::n); // expected-error {{different type}}
   using T = int&&; // expected-error {{different type}}
 #endif
diff --git a/test/CXX/except/except.spec/p14.cpp b/test/CXX/except/except.spec/p14.cpp
index 945a767584fec..c717d9779791a 100644
--- a/test/CXX/except/except.spec/p14.cpp
+++ b/test/CXX/except/except.spec/p14.cpp
@@ -124,14 +124,20 @@ namespace InhCtor {
   template<typename T> struct Throw {
     Throw() throw(T);
   };
-  struct Derived : Base, Throw<X<3>> {
+  struct Derived1 : Base, X<5> {
+    using Base::Base;
+    int n;
+  };
+  struct Derived2 : Base, Throw<X<3>> {
     using Base::Base;
-    Throw<X<4>> x;
   };
-  struct Test {
-    friend Derived::Derived(X<0>) throw(X<3>, X<4>);
-    friend Derived::Derived(X<1>) noexcept(false);
-    friend Derived::Derived(X<2>) throw(X<2>, X<3>, X<4>);
+  struct Derived3 : Base {
+    using Base::Base;
+    Throw<X<4>> x;
   };
-  static_assert(!noexcept(Derived{X<5>{}}), "");
+  static_assert(noexcept(Derived1(X<0>())), "");
+  static_assert(!noexcept(Derived1(X<1>())), "");
+  static_assert(!noexcept(Derived1(X<2>())), "");
+  static_assert(!noexcept(Derived2(X<0>())), "");
+  static_assert(!noexcept(Derived3(X<0>())), "");
 }
diff --git a/test/CXX/expr/expr.const/p2-0x.cpp b/test/CXX/expr/expr.const/p2-0x.cpp
index c519ecbda2280..fd15960647ca0 100644
--- a/test/CXX/expr/expr.const/p2-0x.cpp
+++ b/test/CXX/expr/expr.const/p2-0x.cpp
@@ -242,8 +242,8 @@ namespace UndefinedBehavior {
     constexpr int n13 = n5 + n5; // expected-error {{constant expression}} expected-note {{value -4294967296 is outside the range of }}
     constexpr int n14 = n3 - n5; // expected-error {{constant expression}} expected-note {{value 4294967295 is outside the range of }}
     constexpr int n15 = n5 * n5; // expected-error {{constant expression}} expected-note {{value 4611686018427387904 is outside the range of }}
-    constexpr signed char c1 = 100 * 2; // ok
-    constexpr signed char c2 = '\x64' * '\2'; // also ok
+    constexpr signed char c1 = 100 * 2; // ok expected-warning{{changes value}}
+    constexpr signed char c2 = '\x64' * '\2'; // also ok  expected-warning{{changes value}}
     constexpr long long ll1 = 0x7fffffffffffffff; // ok
     constexpr long long ll2 = ll1 + 1; // expected-error {{constant}} expected-note {{ 9223372036854775808 }}
     constexpr long long ll3 = -ll1 - 1; // ok
diff --git a/test/CXX/expr/expr.prim/expr.prim.lambda/p15-star-this-capture.cpp b/test/CXX/expr/expr.prim/expr.prim.lambda/p15-star-this-capture.cpp
new file mode 100644
index 0000000000000..eac6c40c6085e
--- /dev/null
+++ b/test/CXX/expr/expr.prim/expr.prim.lambda/p15-star-this-capture.cpp
@@ -0,0 +1,22 @@
+// RUN: %clang_cc1 -fsyntax-only -std=c++1z %s -verify
+
+class NonCopyable {
+  NonCopyable(const NonCopyable&) = delete; //expected-note3{{explicitly marked deleted here}}
+  int x = 10;
+  void foo() {
+    auto L = [this] { return x; };
+    const auto &M = [*this] { return x; };//expected-error{{call to deleted}}
+    const auto &M2 = [this] () -> auto&& {
+      ++x;
+      return [*this] {  //expected-error{{call to deleted}} expected-warning{{reference to local}}
+         return ++x; //expected-error{{read-only}}
+      }; 
+    };
+    const auto &M3 = [*this] () mutable -> auto&& { //expected-error{{call to deleted}} 
+      ++x;
+      return [this] {  // expected-warning{{reference to local}}
+         return x;
+      }; 
+    };
+  }  
+};
diff --git a/test/CXX/expr/expr.prim/expr.prim.lambda/p5.cpp b/test/CXX/expr/expr.prim/expr.prim.lambda/p5.cpp
index dc2c209af268a..b8504d4906500 100644
--- a/test/CXX/expr/expr.prim/expr.prim.lambda/p5.cpp
+++ b/test/CXX/expr/expr.prim/expr.prim.lambda/p5.cpp
@@ -11,7 +11,7 @@ void test_attributes() {
 
 template<typename T>
 struct bogus_override_if_virtual : public T {
-  bogus_override_if_virtual() : T(*(T*)0) { }
+  bogus_override_if_virtual() : T(*(T*)0) { } // expected-warning {{binding dereferenced null pointer to reference has undefined behavior}}
   int operator()() const;
 };
 
@@ -36,7 +36,7 @@ void test_quals() {
   lv(); // expected-error{{no matching function for call to object of type}}
   mlv(); // expected-error{{no matching function for call to object of type}}
 
-  bogus_override_if_virtual<decltype(l)> bogus;
+  bogus_override_if_virtual<decltype(l)> bogus; // expected-note{{in instantiation of member function 'bogus_override_if_virtual<(lambda}}
 }
 
 // Core issue 974: default arguments (8.3.6) may be specified in the
diff --git a/test/CXX/over/over.oper/over.literal/p5.cpp b/test/CXX/over/over.oper/over.literal/p5.cpp
index 66f3f97eaac37..bfad5f00cf6c7 100644
--- a/test/CXX/over/over.oper/over.literal/p5.cpp
+++ b/test/CXX/over/over.oper/over.literal/p5.cpp
@@ -12,11 +12,11 @@ template<typename T> struct U {
   friend U operator "" _a(const T *, size_t); // expected-error {{parameter}}
 };
 template<char...> struct V {
-  friend void operator "" _b(); // expected-error {{parameter}}
+  friend void operator "" _b(); // expected-error {{parameters}}
 };
 
-template<char... C, int N = 0> void operator "" _b(); // expected-error {{parameter}}
-template<char... C> void operator "" _b(int N = 0); // expected-error {{parameter}}
-template<char, char...> void operator "" _b(); // expected-error {{parameter}}
-template<typename T> T operator "" _b(const char *); // expected-error {{parameter}}
-template<typename T> int operator "" _b(const T *, size_t); // expected-error {{parameter}}
+template<char... C, int N = 0> void operator "" _b(); // expected-error {{template}}
+template<char... C> void operator "" _b(int N = 0); // expected-error {{template}}
+template<char, char...> void operator "" _b(); // expected-error {{template}}
+template<typename T> T operator "" _b(const char *); // expected-error {{template}}
+template<typename T> int operator "" _b(const T *, size_t); // expected-error {{template}}
diff --git a/test/CXX/over/over.oper/over.literal/p8.cpp b/test/CXX/over/over.oper/over.literal/p8.cpp
index 70a184372cf59..6644bae7e610d 100644
--- a/test/CXX/over/over.oper/over.literal/p8.cpp
+++ b/test/CXX/over/over.oper/over.literal/p8.cpp
@@ -12,6 +12,6 @@ float operator ""E(const char *); // expected-error {{invalid suffix on literal}
 float operator " " B(const char *); // expected-error {{must be '""'}} expected-warning {{reserved}}
 string operator "" 5X(const char *, std::size_t); // expected-error {{expected identifier}}
 double operator "" _miles(double); // expected-error {{parameter}}
-template<char...> int operator "" j(const char*); // expected-error {{parameter}}
+template<char...> int operator "" j(const char*); // expected-error {{template}}
 
 float operator ""_E(const char *);
diff --git a/test/CXX/special/class.copy/implicit-move-def.cpp b/test/CXX/special/class.copy/implicit-move-def.cpp
index 880268d55ac49..f344b0cc6a0f3 100644
--- a/test/CXX/special/class.copy/implicit-move-def.cpp
+++ b/test/CXX/special/class.copy/implicit-move-def.cpp
@@ -1,4 +1,4 @@
-// FIXME: %clang_cc1 -emit-llvm -triple %itanium_abi_triple -o - -std=c++11 %s | FileCheck -check-prefix=CHECK %s
+// FIXME: %clang_cc1 -emit-llvm -triple %itanium_abi_triple -o - -std=c++11 %s | FileCheck %s
 // RUN: %clang_cc1 -emit-llvm -triple %itanium_abi_triple -o - -std=c++11 %s | FileCheck -check-prefix=CHECK-ASSIGN %s
 // RUN: %clang_cc1 -emit-llvm -triple %itanium_abi_triple -o - -std=c++11 %s | FileCheck -check-prefix=CHECK-CTOR %s
 
diff --git a/test/CXX/special/class.inhctor/p1.cpp b/test/CXX/special/class.inhctor/p1.cpp
index fa0416e11765c..c006abe3506fc 100644
--- a/test/CXX/special/class.inhctor/p1.cpp
+++ b/test/CXX/special/class.inhctor/p1.cpp
@@ -1,53 +1,55 @@
 // RUN: %clang_cc1 -std=c++11 -verify %s
-// Per a core issue (no number yet), an ellipsis is always dropped.
-struct A {
-  A(...); // expected-note {{here}}
-  A(int = 0, int = 0, int = 0, int = 0, ...); // expected-note 9{{here}} expected-note 2{{constructor cannot be inherited}}
-  A(int = 0, int = 0, ...); // expected-note {{here}}
+//
+// Note: [class.inhctor] was removed by P0136R1. This tests the new behavior
+// for the wording that used to be there.
 
-  template<typename T> A(T, int = 0, ...); // expected-note 5{{here}}
+struct A { // expected-note 8{{candidate is the implicit}}
+  A(...); // expected-note 4{{candidate constructor}} expected-note 4{{candidate inherited constructor}}
+  A(int = 0, int = 0, int = 0, int = 0, ...); // expected-note 3{{candidate constructor}} expected-note 3{{candidate inherited constructor}}
+  A(int = 0, int = 0, ...); // expected-note 3{{candidate constructor}} expected-note 3{{candidate inherited constructor}}
 
-  template<typename T, int N> A(const T (&)[N]); // expected-note 2{{here}} expected-note {{constructor cannot be inherited}}
-  template<typename T, int N> A(const T (&)[N], int = 0); // expected-note 2{{here}}
+  template<typename T> A(T, int = 0, ...); // expected-note 3{{candidate constructor}} expected-note 3{{candidate inherited constructor}}
+
+  template<typename T, int N> A(const T (&)[N]); // expected-note {{candidate constructor}} expected-note {{candidate inherited constructor}}
+  template<typename T, int N> A(const T (&)[N], int = 0); // expected-note {{candidate constructor}} expected-note {{candidate inherited constructor}}
 };
 
-struct B : A { // expected-note 6{{candidate}}
-  using A::A; // expected-warning 4{{inheriting constructor does not inherit ellipsis}} expected-note 16{{candidate}} expected-note 3{{deleted constructor was inherited here}}
+struct B : A { // expected-note 4{{candidate is the implicit}}
+  using A::A; // expected-note 19{{inherited here}}
+  B(void*);
 };
 
 struct C {} c;
 
-B b0{};
-// expected-error@-1 {{call to implicitly-deleted default constructor of 'B'}}
-// expected-note@-8 {{default constructor of 'B' is implicitly deleted because base class 'A' has multiple default constructors}}
+A a0{}; // expected-error {{ambiguous}}
+B b0{}; // expected-error {{ambiguous}}
 
-B b1{1};
-// expected-error@-1 {{call to deleted constructor of 'B'}}
+A a1{1}; // expected-error {{ambiguous}}
+B b1{1}; // expected-error {{ambiguous}}
 
-B b2{1,2};
-// expected-error@-1 {{call to deleted constructor of 'B'}}
+A a2{1,2}; // expected-error {{ambiguous}}
+B b2{1,2}; // expected-error {{ambiguous}}
 
-B b3{1,2,3};
-// ok
+A a3{1,2,3}; // ok
+B b3{1,2,3}; // ok
 
-B b4{1,2,3,4};
-// ok
+A a4{1,2,3,4}; // ok
+B b4{1,2,3,4}; // ok
 
-B b5{1,2,3,4,5};
-// expected-error@-1 {{no matching constructor for initialization of 'B'}}
+A a5{1,2,3,4,5}; // ok
+B b5{1,2,3,4,5}; // ok
 
-B b6{c};
-// ok
+A a6{c}; // ok
+B b6{c}; // ok
 
-B b7{c,0};
-// ok
+A a7{c,0}; // ok
+B b7{c,0}; // ok
 
-B b8{c,0,1};
-// expected-error@-1 {{no matching constructor}}
+A a8{c,0,1}; // ok
+B b8{c,0,1}; // ok
 
-B b9{"foo"};
-// FIXME: explain why the inheriting constructor was deleted
-// expected-error@-2 {{call to deleted constructor of 'B'}}
+A a9{"foo"}; // expected-error {{ambiguous}}
+B b9{"foo"}; // expected-error {{ambiguous}}
 
 namespace PR15755 {
   struct X {
diff --git a/test/CXX/special/class.inhctor/p2.cpp b/test/CXX/special/class.inhctor/p2.cpp
index d1c16ff886c38..f84dc6487573f 100644
--- a/test/CXX/special/class.inhctor/p2.cpp
+++ b/test/CXX/special/class.inhctor/p2.cpp
@@ -1,4 +1,7 @@
 // RUN: %clang_cc1 -std=c++11 -verify %s
+//
+// Note: [class.inhctor] was removed by P0136R1. This tests the new behavior
+// for the wording that used to be there.
 
 template<int> struct X {};
 
@@ -8,10 +11,10 @@ template<int> struct X {};
 //   - absence or presence of explicit
 //   - absence or presence of constexpr
 struct A {
-  A(X<0>) {} // expected-note 2{{here}}
+  A(X<0>) {} // expected-note 4{{here}}
   constexpr A(X<1>) {}
-  explicit A(X<2>) {} // expected-note 3{{here}}
-  explicit constexpr A(X<3>) {} // expected-note 2{{here}}
+  explicit A(X<2>) {} // expected-note 6{{here}}
+  explicit constexpr A(X<3>) {} // expected-note 4{{here}}
 };
 
 A a0 { X<0>{} };
@@ -36,7 +39,7 @@ constexpr A a3ic = { X<3>{} }; // expected-error {{constructor is explicit}}
 
 
 struct B : A {
-  using A::A; // expected-note 7{{here}}
+  using A::A;
 };
 
 B b0 { X<0>{} };
@@ -62,14 +65,19 @@ constexpr B b3ic = { X<3>{} }; // expected-error {{constructor is explicit}}
 
 // 'constexpr' is OK even if the constructor doesn't obey the constraints.
 struct NonLiteral { NonLiteral(); };
-struct NonConstexpr { NonConstexpr(); constexpr NonConstexpr(int); }; // expected-note {{here}}
+struct NonConstexpr { NonConstexpr(); constexpr NonConstexpr(int); };
 struct Constexpr { constexpr Constexpr(int) {} };
 
 struct BothNonLiteral : NonLiteral, Constexpr { using Constexpr::Constexpr; }; // expected-note {{base class 'NonLiteral' of non-literal type}}
 constexpr BothNonLiteral bothNL{42}; // expected-error {{constexpr variable cannot have non-literal type 'const BothNonLiteral'}}
 
-struct BothNonConstexpr : NonConstexpr, Constexpr { using Constexpr::Constexpr; }; // expected-note {{non-constexpr constructor 'NonConstexpr}}
-constexpr BothNonConstexpr bothNC{42}; // expected-error {{must be initialized by a constant expression}} expected-note {{in call to 'BothNonConstexpr(42)'}}
+// FIXME: This diagnostic is not very good. We should explain that the problem is that base class NonConstexpr cannot be initialized.
+struct BothNonConstexpr
+    : NonConstexpr,
+      Constexpr {
+  using Constexpr::Constexpr; // expected-note {{here}}
+};
+constexpr BothNonConstexpr bothNC{42}; // expected-error {{must be initialized by a constant expression}} expected-note {{inherited from base class 'Constexpr'}}
 
 
 struct ConstexprEval {
@@ -87,25 +95,25 @@ static_assert(ce.k == 'a', "");
 static_assert(ce.k2 == 'x', "");
 
 
-struct TemplateCtors {
-  constexpr TemplateCtors() {}
-  template<template<int> class T> TemplateCtors(X<0>, T<0>);
-  template<int N> TemplateCtors(X<1>, X<N>);
-  template<typename T> TemplateCtors(X<2>, T);
+struct TemplateCtors { // expected-note 2{{candidate constructor (the implicit}}
+  constexpr TemplateCtors() {} // expected-note {{candidate inherited constructor}}
+  template<template<int> class T> TemplateCtors(X<0>, T<0>); // expected-note {{here}} expected-note {{candidate inherited constructor}}
+  template<int N> TemplateCtors(X<1>, X<N>); // expected-note {{here}} expected-note {{candidate inherited constructor}}
+  template<typename T> TemplateCtors(X<2>, T); // expected-note {{here}} expected-note {{candidate inherited constructor}}
 
-  template<typename T = int> TemplateCtors(int, int = 0, int = 0); // expected-note {{inherited from here}}
+  template<typename T = int> TemplateCtors(int, int = 0, int = 0);
 };
 
-struct UsingTemplateCtors : TemplateCtors {  // expected-note 2{{candidate is the implicit}}
-  using TemplateCtors::TemplateCtors; // expected-note 4{{here}} expected-note {{candidate}}
+struct UsingTemplateCtors : TemplateCtors { // expected-note 2{{candidate constructor (the implicit}}
+  using TemplateCtors::TemplateCtors; // expected-note 6{{inherited here}}
 
-  constexpr UsingTemplateCtors(X<0>, X<0>) {}
-  constexpr UsingTemplateCtors(X<1>, X<1>) {}
-  constexpr UsingTemplateCtors(X<2>, X<2>) {}
+  constexpr UsingTemplateCtors(X<0>, X<0>) {} // expected-note {{not viable}}
+  constexpr UsingTemplateCtors(X<1>, X<1>) {} // expected-note {{not viable}}
+  constexpr UsingTemplateCtors(X<2>, X<2>) {} // expected-note {{not viable}}
 
-  template<int = 0> constexpr UsingTemplateCtors(int) {} // expected-note {{candidate}}
-  template<typename T = void> constexpr UsingTemplateCtors(int, int) {}
-  template<typename T, typename U> constexpr UsingTemplateCtors(int, int, int) {}
+  template<int = 0> constexpr UsingTemplateCtors(int) {} // expected-note {{not viable}}
+  template<typename T = void> constexpr UsingTemplateCtors(int, int) {} // expected-note {{not viable}}
+  template<typename T, typename U> constexpr UsingTemplateCtors(int, int, int) {} // expected-note {{couldn't infer}}
 };
 
 template<int> struct Y {};
@@ -116,6 +124,10 @@ constexpr UsingTemplateCtors uct4{ X<1>{}, X<1>{} };
 constexpr UsingTemplateCtors uct5{ X<2>{}, 0 }; // expected-error {{must be initialized by a constant expression}} expected-note {{non-constexpr}}
 constexpr UsingTemplateCtors uct6{ X<2>{}, X<2>{} };
 
-constexpr UsingTemplateCtors utc7{ 0 }; // expected-error {{ambiguous}}
+constexpr UsingTemplateCtors utc7{ 0 }; // ok
 constexpr UsingTemplateCtors utc8{ 0, 0 }; // ok
-constexpr UsingTemplateCtors utc9{ 0, 0, 0 }; // expected-error {{must be initialized by a constant expression}} expected-note {{non-constexpr}}
+// FIXME: The standard says that UsingTemplateCtors' (int, int, int) constructor
+// hides the one from TemplateCtors, even though the template parameter lists
+// don't match. It's not clear that that's *really* the intent, and it's not
+// what other compilers do.
+constexpr UsingTemplateCtors utc9{ 0, 0, 0 }; // expected-error {{no matching constructor}}
diff --git a/test/CXX/special/class.inhctor/p3.cpp b/test/CXX/special/class.inhctor/p3.cpp
index 7aaaa7a6f0b8b..7f054874e075a 100644
--- a/test/CXX/special/class.inhctor/p3.cpp
+++ b/test/CXX/special/class.inhctor/p3.cpp
@@ -1,8 +1,11 @@
 // RUN: %clang_cc1 -std=c++11 -fsyntax-only -verify %s
+//
+// Note: [class.inhctor] was removed by P0136R1. This tests the new behavior
+// for the wording that used to be there.
 
 struct B1 {
-  B1(int);
-  B1(int, int);
+  B1(int); // expected-note 3{{target of using}}
+  B1(int, int); // expected-note 3{{target of using}}
 };
 struct D1 : B1 {
   using B1::B1;
@@ -11,48 +14,56 @@ D1 d1a(1), d1b(1, 1);
 
 D1 fd1() { return 1; }
 
-struct B2 {
+struct B2 { // expected-note 2{{candidate}}
   explicit B2(int, int = 0, int = 0);
 };
-struct D2 : B2 { // expected-note 2 {{candidate constructor}}
-  using B2::B2;
+struct D2 : B2 { // expected-note 2{{candidate constructor}}
+  using B2::B2; // expected-note 2{{inherited here}}
 };
 D2 d2a(1), d2b(1, 1), d2c(1, 1, 1);
 
 D2 fd2() { return 1; } // expected-error {{no viable conversion}}
 
-struct B3 {
-  B3(void*); // expected-note {{inherited from here}}
+struct B3 { // expected-note 2{{candidate}}
+  B3(void*); // expected-note {{candidate}}
 };
-struct D3 : B3 { // expected-note 2 {{candidate constructor}}
-  using B3::B3; // expected-note {{candidate constructor (inherited)}}
+struct D3 : B3 { // expected-note 2{{candidate constructor}}
+  using B3::B3; // expected-note 3{{inherited here}}
 };
 D3 fd3() { return 1; } // expected-error {{no viable conversion}}
 
 template<typename T> struct T1 : B1 {
-  using B1::B1;
+  using B1::B1; // expected-note 2{{using declaration}}
 };
 template<typename T> struct T2 : T1<T> {
-  using T1<int>::T1;
+  using T1<int>::T1; // expected-note 2{{using declaration}}
 };
 template<typename T> struct T3 : T1<int> {
-  using T1<T>::T1;
+  using T1<T>::T1; // expected-note 2{{using declaration}}
 };
 struct U {
-  friend T1<int>::T1(int);
-  friend T1<int>::T1(int, int);
-  friend T2<int>::T2(int);
-  friend T2<int>::T2(int, int);
-  friend T3<int>::T3(int);
-  friend T3<int>::T3(int, int);
+  // [dcl.meaning]p1: "the member shall not merely hav ebeen introduced by a
+  // using-declaration in the scope of the class [...] nominated by the
+  // nested-name-specifier of the declarator-id"
+  friend T1<int>::T1(int); // expected-error {{cannot befriend target of using declaration}}
+  friend T1<int>::T1(int, int); // expected-error {{cannot befriend target of using declaration}}
+  friend T2<int>::T2(int); // expected-error {{cannot befriend target of using declaration}}
+  friend T2<int>::T2(int, int); // expected-error {{cannot befriend target of using declaration}}
+  friend T3<int>::T3(int); // expected-error {{cannot befriend target of using declaration}}
+  friend T3<int>::T3(int, int); // expected-error {{cannot befriend target of using declaration}}
 };
 
 struct B4 {
-  template<typename T> explicit B4(T, int = 0);
+  template<typename T> explicit B4(T, int = 0); // expected-note 2{{here}}
 };
 template<typename T> struct T4 : B4 {
-  using B4::B4; // expected-note {{here}}
+  using B4::B4;
   template<typename U> T4(U);
 };
+template<typename T> struct U4 : T4<T> {
+  using T4<T>::T4;
+};
 T4<void> t4a = {0};
 T4<void> t4b = {0, 0}; // expected-error {{chosen constructor is explicit}}
+U4<void> u4a = {0};
+U4<void> u4b = {0, 0}; // expected-error {{chosen constructor is explicit}}
diff --git a/test/CXX/special/class.inhctor/p4.cpp b/test/CXX/special/class.inhctor/p4.cpp
index ae1f7a57031f1..69fbea3e0eb06 100644
--- a/test/CXX/special/class.inhctor/p4.cpp
+++ b/test/CXX/special/class.inhctor/p4.cpp
@@ -1,4 +1,7 @@
 // RUN: %clang_cc1 -std=c++11 -verify %s
+//
+// Note: [class.inhctor] was removed by P0136R1. This tests the new behavior
+// for the wording that used to be there.
 
 template<int> struct X {};
 
@@ -8,20 +11,20 @@ struct A {
 public:
   A(X<0>) {}
 protected:
-  A(X<1>) {}
+  A(X<1>) {} // expected-note 2{{declared protected here}}
 private:
-  A(X<2>) {} // expected-note {{declared private here}}
+  A(X<2>) {} // expected-note 2{{declared private here}}
   friend class FA;
 };
 
 struct B : A {
-  using A::A; // expected-error {{private constructor}} expected-note {{implicitly declared protected here}}
+  using A::A;
   friend class FB;
 };
 
 B b0{X<0>{}};
 B b1{X<1>{}}; // expected-error {{calling a protected constructor}}
-B b2{X<2>{}}; // expected-note {{first required here}}
+B b2{X<2>{}}; // expected-error {{calling a private constructor}}
 
 struct C : B {
   C(X<0> x) : B(x) {}
@@ -34,7 +37,7 @@ struct FB {
 };
 
 struct FA : A {
-  using A::A; // expected-note 2{{here}}
+  using A::A;
 };
 FA fa0{X<0>{}};
 FA fa1{X<1>{}}; // expected-error {{calling a protected constructor}}
@@ -47,7 +50,7 @@ struct G {
   template<typename T> G(T*) = delete; // expected-note {{'G<const char>' has been explicitly marked deleted here}}
 };
 struct H : G {
-  using G::G; // expected-note 2{{deleted constructor was inherited here}}
+  using G::G;
 };
 H h1(5); // expected-error {{call to deleted constructor of 'H'}}
 H h2("foo"); // expected-error {{call to deleted constructor of 'H'}}
@@ -57,15 +60,15 @@ H h2("foo"); // expected-error {{call to deleted constructor of 'H'}}
 // same signature.
 namespace DRnnnn {
   struct A {
-    constexpr A(int, float = 0) {}
-    explicit A(int, int = 0) {} // expected-note {{constructor cannot be inherited}}
+    constexpr A(int, float = 0) {} // expected-note {{candidate}}
+    explicit A(int, int = 0) {} // expected-note {{candidate}}
 
-    A(int, int, int = 0) = delete;
+    A(int, int, int = 0) = delete; // expected-note {{deleted}}
   };
   struct B : A {
-    using A::A; // expected-note {{here}}
+    using A::A; // expected-note 3{{inherited here}}
   };
 
   constexpr B b0(0, 0.0f); // ok, constexpr
-  B b1(0, 1); // expected-error {{call to deleted constructor of 'DRnnnn::B'}}
+  B b1(0, 1); // expected-error {{call to constructor of 'DRnnnn::B' is ambiguous}}
 }
diff --git a/test/CXX/special/class.inhctor/p7.cpp b/test/CXX/special/class.inhctor/p7.cpp
index a57e8558f5cbb..c22a43a618987 100644
--- a/test/CXX/special/class.inhctor/p7.cpp
+++ b/test/CXX/special/class.inhctor/p7.cpp
@@ -1,47 +1,48 @@
 // RUN: %clang_cc1 -std=c++11 -fsyntax-only -verify %s
+//
+// Note: [class.inhctor] was removed by P0136R1. This tests the new behavior
+// for the wording that used to be there.
 
-// Straight from the standard
-struct B1 {
-  B1(int); // expected-note {{previous constructor}} expected-note {{conflicting constructor}}
+struct B1 { // expected-note 2{{candidate}}
+  B1(int); // expected-note {{candidate}}
 };
-struct B2 {
-  B2(int); // expected-note {{conflicting constructor}}
+struct B2 { // expected-note 2{{candidate}}
+  B2(int); // expected-note {{candidate}}
 };
-struct D1 : B1, B2 {
-  using B1::B1; // expected-note {{inherited here}}
-  using B2::B2; // expected-error {{already inherited constructor with the same signature}}
+struct D1 : B1, B2 { // expected-note 2{{candidate}}
+  using B1::B1; // expected-note 3{{inherited here}}
+  using B2::B2; // expected-note 3{{inherited here}}
 };
 struct D2 : B1, B2 {
   using B1::B1;
   using B2::B2;
   D2(int);
 };
+D1 d1(0); // expected-error {{ambiguous}}
+D2 d2(0);
 
 template<typename T> struct B3 {
-  B3(T); // expected-note {{previous constructor}}
+  B3(T);
 };
 template<typename T> struct B4 : B3<T>, B1 {
   B4();
-  using B3<T>::B3; // expected-note {{inherited here}}
-  using B1::B1; // expected-error {{already inherited}}
+  using B3<T>::B3;
+  using B1::B1;
 };
 B4<char> b4c;
-B4<int> b4i; // expected-note {{here}}
+B4<int> b4i;
 
 struct B5 {
-  template<typename T> B5(T); // expected-note {{previous constructor}}
-};
-struct B6 {
-  template<typename T> B6(T); // expected-note {{conflicting constructor}}
+  template<typename T> B5(T);
 };
-struct B7 {
-  template<typename T, int> B7(T);
-};
-struct D56 : B5, B6, B7 {
-  using B5::B5; // expected-note {{inherited here}}
-  using B6::B6; // expected-error {{already inherited}}
+struct D6 : B5 {
+  using B5::B5;
+  template<typename T> D6(T);
 };
-struct D57 : B5, B6, B7 {
+D6 d6(0);
+struct D7 : B5 {
   using B5::B5;
-  using B7::B7; // ok, not the same signature
+  template<typename T> D7(T, ...);
 };
+// DRxxx (no number yet): derived class ctor beats base class ctor.
+D7 d7(0);
diff --git a/test/CXX/special/class.inhctor/p8.cpp b/test/CXX/special/class.inhctor/p8.cpp
index effc2c3bca24a..58c01d2b912d4 100644
--- a/test/CXX/special/class.inhctor/p8.cpp
+++ b/test/CXX/special/class.inhctor/p8.cpp
@@ -1,4 +1,7 @@
 // RUN: %clang_cc1 -std=c++11 -verify %s
+//
+// Note: [class.inhctor] was removed by P0136R1. This tests the new behavior
+// for the wording that used to be there.
 
 struct A {
   constexpr A(const int&) : rval(false) {}
@@ -13,8 +16,6 @@ constexpr int k = 0;
 constexpr A a0{0};
 constexpr A a1{k};
 constexpr B b0{0};
-// This performs static_cast<(const int&)&&>(k), so calls the A(const int&)
-// constructor.
 constexpr B b1{k};
 
 static_assert(a0.rval && !a1.rval && b0.rval && !b1.rval, "");
@@ -28,5 +29,4 @@ struct D : C {
 };
 static_assert(D(123).v == 123, "");
 
-// FIXME: This diagnostic sucks.
-template<typename T> constexpr D::D(T t) : C(t) {} // expected-error {{definition of implicitly declared function}}
+template<typename T> constexpr D::D(T t) : C(t) {} // expected-error {{does not match any declaration in 'D'}}
diff --git a/test/CXX/special/class.init/class.inhctor.init/p1.cpp b/test/CXX/special/class.init/class.inhctor.init/p1.cpp
new file mode 100644
index 0000000000000..e07d879df8f1c
--- /dev/null
+++ b/test/CXX/special/class.init/class.inhctor.init/p1.cpp
@@ -0,0 +1,124 @@
+// RUN: %clang_cc1 -std=c++11 -verify %s
+
+namespace std_example {
+  struct B1 {
+    B1(int, ...) {}
+  };
+
+  struct B2 {
+    B2(double) {}
+  };
+
+  int get();
+
+  struct D1 : B1 { // expected-note {{no default constructor}}
+    using B1::B1; // inherits B1(int, ...)
+    int x;
+    int y = get();
+  };
+
+  void test() {
+    D1 d(2, 3, 4); // OK: B1 is initialized by calling B1(2, 3, 4),
+    // then d.x is default-initialized (no initialization is performed),
+    // then d.y is initialized by calling get()
+    D1 e; // expected-error {{implicitly-deleted}}
+  }
+
+  struct D2 : B2 {
+    using B2::B2;
+    B1 b; // expected-note {{constructor inherited by 'D2' is implicitly deleted because field 'b' has no default constructor}}
+  };
+
+  D2 f(1.0); // expected-error {{constructor inherited by 'D2' from base class 'B2' is implicitly deleted}}
+
+  struct W {
+    W(int);
+  };
+  struct X : virtual W {
+    using W::W;
+    X() = delete;
+  };
+  struct Y : X {
+    using X::X;
+  };
+  struct Z : Y, virtual W {
+    using Y::Y;
+  };
+  Z z(0); // OK: initialization of Y does not invoke default constructor of X
+
+  template <class T> struct Log : T {
+    using T::T; // inherits all constructors from class T
+    ~Log() { /* ... */ }
+  };
+}
+
+namespace vbase {
+  struct V {
+    V(int);
+  };
+
+  struct A : virtual V {
+    A() = delete; // expected-note 2{{deleted here}} expected-note {{deleted}}
+    using V::V;
+  };
+  struct B : virtual V { // expected-note {{no default constructor}}
+    B() = delete; // expected-note 2{{deleted here}}
+    B(int, int);
+    using V::V;
+  };
+  struct C : B { // expected-note {{deleted default constructor}}
+    using B::B;
+  };
+  struct D : A, C { // expected-note {{deleted default constructor}} expected-note {{deleted corresponding constructor}}
+    using A::A;
+    using C::C;
+  };
+
+  A a0; // expected-error {{deleted}}
+  A a1(0);
+  B b0; // expected-error {{deleted}}
+  B b1(0);
+  B b2(0, 0);
+  C c0; // expected-error {{deleted}}
+  C c1(0);
+  C c2(0, 0); // expected-error {{deleted}}
+  D d0; // expected-error {{deleted}}
+  D d1(0);
+  D d2(0, 0); // expected-error {{deleted}}
+}
+
+namespace constexpr_init_order {
+  struct Param;
+  struct A {
+    constexpr A(Param);
+    int a;
+  };
+
+  struct B : A { B(); using A::A; int b = 2; };
+  extern const B b;
+
+  struct Param {
+    constexpr Param(int c) : n(4 * b.a + b.b + c) {}
+    int n;
+  };
+
+  constexpr A::A(Param p) : a(p.n) {}
+
+  constexpr B b(1);
+  constexpr B c(1);
+  static_assert(b.a == 1, "p should be initialized before B() is executed");
+  static_assert(c.a == 7, "b not initialzed properly");
+}
+
+namespace default_args {
+  // We work around a defect in P0136R1 where it would reject reasonable
+  // code like the following:
+  struct Base {
+    Base(int = 0);
+  };
+  struct Derived : Base {
+    using Base::Base;
+  };
+  Derived d;
+  // FIXME: Once a fix is standardized, implement it.
+}
diff --git a/test/CXX/special/class.init/class.inhctor.init/p2.cpp b/test/CXX/special/class.init/class.inhctor.init/p2.cpp
new file mode 100644
index 0000000000000..7ea2ccc3605f4
--- /dev/null
+++ b/test/CXX/special/class.init/class.inhctor.init/p2.cpp
@@ -0,0 +1,33 @@
+// RUN: %clang_cc1 -std=c++11 -verify %s
+
+namespace std_example {
+  struct A { A(int); };
+  struct B : A { using A::A; };
+
+  struct C1 : B { using B::B; };
+  struct C2 : B { using B::B; };
+
+  struct D1 : C1, C2 {
+    using C1::C1; // expected-note {{inherited from base class 'C1' here}}
+    using C2::C2; // expected-note {{inherited from base class 'C2' here}}
+  };
+
+  struct V1 : virtual B { using B::B; };
+  struct V2 : virtual B { using B::B; };
+
+  struct D2 : V1, V2 {
+    using V1::V1;
+    using V2::V2;
+  };
+
+  D1 d1(0); // expected-error {{constructor of 'A' inherited from multiple base class subobjects}}
+  D2 d2(0); // OK: initializes virtual B base class, which initializes the A base class
+            // then initializes the V1 and V2 base classes as if by a defaulted default constructor
+
+  struct M { M(); M(int); };
+  struct N : M { using M::M; };
+  struct O : M {};
+  struct P : N, O { using N::N; using O::O; };
+  P p(0); // OK: use M(0) to initialize N's base class,
+          // use M() to initialize O's base class
+}
diff --git a/test/CXX/stmt.stmt/stmt.dcl/p3.cpp b/test/CXX/stmt.stmt/stmt.dcl/p3.cpp
index 4bcc648e84aa5..03c835b21a623 100644
--- a/test/CXX/stmt.stmt/stmt.dcl/p3.cpp
+++ b/test/CXX/stmt.stmt/stmt.dcl/p3.cpp
@@ -1,4 +1,6 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++98 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
 
 // PR10034
 struct X {};
@@ -40,8 +42,16 @@ struct Z {
 };
 
 void test_Z() {
-  goto end; // expected-error{{cannot jump from this goto statement to its label}}
-  Z z; // expected-note{{jump bypasses initialization of non-POD variable}}
+  goto end;
+#if __cplusplus <= 199711L
+  // expected-error@-2 {{cannot jump from this goto statement to its label}}
+#endif
+
+  Z z;
+#if __cplusplus <= 199711L
+  // expected-note@-2 {{jump bypasses initialization of non-POD variable}}
+#endif
+
  end:
   return;
 }
diff --git a/test/CXX/stmt.stmt/stmt.iter/stmt.ranged/p1.cpp b/test/CXX/stmt.stmt/stmt.iter/stmt.ranged/p1.cpp
index 7d689ae0b1c19..8c4f36c0ff740 100644
--- a/test/CXX/stmt.stmt/stmt.iter/stmt.ranged/p1.cpp
+++ b/test/CXX/stmt.stmt/stmt.iter/stmt.ranged/p1.cpp
@@ -1,4 +1,6 @@
 // RUN: %clang_cc1 -std=c++11 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -std=c++14 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -std=c++1z -fsyntax-only -verify %s
 
 struct pr12960 {
   int begin;
@@ -118,10 +120,15 @@ void g() {
     ;
 
   struct Differ {
-    int *begin(); // expected-note {{selected 'begin' function with iterator type 'int *'}}
-    null_t end(); // expected-note {{selected 'end' function with iterator type 'null_t'}}
+    int *begin();
+    null_t end();
   };
-  for (auto a : Differ()) // expected-error {{'begin' and 'end' must return the same type (got 'int *' and 'null_t')}}
+  for (auto a : Differ())
+#if __cplusplus <= 201402L
+    // expected-warning@-2 {{'begin' and 'end' returning different types ('int *' and 'null_t') is a C++1z extension}}
+    // expected-note@-6 {{selected 'begin' function with iterator type 'int *'}}
+    // expected-note@-6 {{selected 'end' function with iterator type 'null_t'}}
+#endif
     ;
 
   for (void f() : "error") // expected-error {{for range declaration must declare a variable}}
@@ -129,7 +136,7 @@ void g() {
 
   for (extern int a : A()) {} // expected-error {{loop variable 'a' may not be declared 'extern'}}
   for (static int a : A()) {} // expected-error {{loop variable 'a' may not be declared 'static'}}
-  for (register int a : A()) {} // expected-error {{loop variable 'a' may not be declared 'register'}} expected-warning {{deprecated}}
+  for (register int a : A()) {} // expected-error {{loop variable 'a' may not be declared 'register'}} expected-warning 0-1{{register}} expected-error 0-1{{register}}
   for (constexpr int a : X::C()) {} // OK per CWG issue #1204.
 
   for (auto u : X::NoBeginADL()) { // expected-error {{invalid range expression of type 'X::NoBeginADL'; no viable 'begin' function available}}
diff --git a/test/CXX/stmt.stmt/stmt.select/stmt.if/p2.cpp b/test/CXX/stmt.stmt/stmt.select/stmt.if/p2.cpp
new file mode 100644
index 0000000000000..d6a2169a5baf9
--- /dev/null
+++ b/test/CXX/stmt.stmt/stmt.select/stmt.if/p2.cpp
@@ -0,0 +1,137 @@
+// RUN: %clang_cc1 -std=c++1z -verify %s
+// RUN: %clang_cc1 -std=c++1z -verify %s -DUNDEFINED
+
+#ifdef UNDEFINED
+// "used but not defined" errors don't get produced if we have more interesting
+// errors.
+namespace std_example {
+  template <typename T, typename... Rest> void g(T &&p, Rest &&... rs) {
+    // use p
+    if constexpr(sizeof...(rs) > 0)
+      g(rs...);
+  }
+  void use_g() {
+    g(1, 2, 3);
+  }
+
+  static int x(); // no definition of x required
+  int f() {
+    if constexpr (true)
+      return 0;
+    else if (x())
+      return x();
+    else
+      return -x();
+  }
+}
+
+namespace odr_use_in_selected_arm {
+  static int x(); // expected-warning {{is not defined}}
+  int f() {
+    if constexpr (false)
+      return 0;
+    else if (x()) // expected-note {{here}}
+      return x();
+    else
+      return -x();
+  }
+}
+#else
+namespace ccce {
+  void f() {
+    if (5) {}
+    if constexpr (5) {} // expected-error {{cannot be narrowed}}
+  }
+  template<int N> void g() {
+    if constexpr (N) {} // expected-error {{cannot be narrowed}}
+  }
+  template void g<5>(); // expected-note {{instantiation of}}
+}
+
+namespace generic_lambda {
+  // Substituting for T produces a hard error here, even if substituting for
+  // the type of x would remove the error.
+  template<typename T> void f() {
+    [](auto x) {
+      if constexpr (sizeof(T) == 1 && sizeof(x) == 1)
+        T::error(); // expected-error 2{{'::'}}
+    } (0);
+  }
+
+  template<typename T> void g() {
+    [](auto x) {
+      if constexpr (sizeof(T) == 1)
+        if constexpr (sizeof(x) == 1)
+          T::error(); // expected-error {{'::'}}
+    } (0);
+  }
+
+  void use() {
+    f<int>(); // expected-note {{instantiation of}}
+    f<char>(); // expected-note {{instantiation of}}
+    g<int>(); // ok
+    g<char>(); // expected-note {{instantiation of}}
+  }
+}
+
+namespace potentially_discarded_branch_target {
+  void in_switch(int n) {
+    switch (n)
+      case 4: if constexpr(sizeof(n) == 4) return;
+    if constexpr(sizeof(n) == 4)
+      switch (n) case 4: return;
+    switch (n) {
+      if constexpr (sizeof(n) == 4) // expected-note 2{{constexpr if}}
+        case 4: return; // expected-error {{cannot jump}}
+      else
+        default: break; // expected-error {{cannot jump}}
+    }
+  }
+
+  template<typename T>
+  void in_switch_tmpl(int n) {
+    switch (n) {
+      if constexpr (sizeof(T) == 4) // expected-note 2{{constexpr if}}
+        case 4: return; // expected-error {{cannot jump}}
+      else
+        default: break; // expected-error {{cannot jump}}
+    }
+  }
+
+  void goto_scope(int n) {
+    goto foo; // expected-error {{cannot jump}}
+    if constexpr(sizeof(n) == 4) // expected-note {{constexpr if}}
+      foo: return;
+bar:
+    if constexpr(sizeof(n) == 4)
+      goto bar; // ok
+  }
+
+  template<typename T>
+  void goto_scope(int n) {
+    goto foo; // expected-error {{cannot jump}}
+    if constexpr(sizeof(n) == 4) // expected-note {{constexpr if}}
+      foo: return;
+bar:
+    if constexpr(sizeof(n) == 4)
+      goto bar; // ok
+  }
+
+  void goto_redef(int n) {
+a:  if constexpr(sizeof(n) == 4) // expected-error {{redefinition}} expected-note {{constexpr if}}
+      a: goto a; // expected-note 2{{previous}}
+    else
+      a: goto a; // expected-error {{redefinition}} expected-error {{cannot jump}}
+  }
+
+  void evil_things() {
+    goto evil_label; // expected-error {{cannot jump}}
+    if constexpr (true || ({evil_label: false;})) {} // expected-note {{constexpr if}}
+
+    if constexpr (true) // expected-note {{constexpr if}}
+      goto surprise; // expected-error {{cannot jump}}
+    else
+      surprise: {}
+  }
+}
+#endif
diff --git a/test/CXX/temp/temp.arg/temp.arg.nontype/p1.cpp b/test/CXX/temp/temp.arg/temp.arg.nontype/p1.cpp
index 22ea018842d47..58290ac40fdb0 100644
--- a/test/CXX/temp/temp.arg/temp.arg.nontype/p1.cpp
+++ b/test/CXX/temp/temp.arg/temp.arg.nontype/p1.cpp
@@ -1,4 +1,6 @@
 // RUN: %clang_cc1 -fsyntax-only -verify -triple=x86_64-linux-gnu %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple=x86_64-linux-gnu -std=c++98 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple=x86_64-linux-gnu -std=c++11 %s
 // RUN: %clang_cc1 -std=c++11 -fsyntax-only -verify -triple=x86_64-linux-gnu %s -DCPP11ONLY
 
 // C++11 [temp.arg.nontype]p1:
@@ -31,43 +33,103 @@ namespace non_type_tmpl_param {
 //      if the corresopnding template-parameter is a reference; or
 namespace addr_of_obj_or_func {
   template <int* p> struct X0 { }; // expected-note 5{{here}}
+#if __cplusplus >= 201103L
+  // expected-note@-2 2{{template parameter is declared here}}
+#endif
+
   template <int (*fp)(int)> struct X1 { };
   template <int &p> struct X2 { }; // expected-note 4{{here}}
   template <const int &p> struct X2k { }; // expected-note {{here}}
   template <int (&fp)(int)> struct X3 { }; // expected-note 4{{here}}
 
   int i = 42;
+#if __cplusplus >= 201103L
+  // expected-note@-2 {{declared here}}
+#endif
+
   int iarr[10];
   int f(int i);
-  const int ki = 9; // expected-note 5{{here}}
-  __thread int ti = 100; // expected-note 2{{here}}
-  static int f_internal(int); // expected-note 4{{here}}
+  const int ki = 9;
+#if __cplusplus <= 199711L
+  // expected-note@-2 5{{non-type template argument refers to object here}}
+#endif
+
+  __thread int ti = 100; // expected-note {{here}}
+#if __cplusplus <= 199711L
+  // expected-note@-2 {{here}}
+#endif
+
+  static int f_internal(int);
+#if __cplusplus <= 199711L
+  // expected-note@-2 4{{non-type template argument refers to function here}}
+#endif
+
   template <typename T> T f_tmpl(T t);
   struct S { union { int NonStaticMember; }; };
 
   void test() {
-    X0<i> x0a; // expected-error {{must have its address taken}}
+    X0<i> x0a;
+#if __cplusplus <= 199711L
+    // expected-error@-2 {{non-type template argument for template parameter of pointer type 'int *' must have its address taken}}
+#else
+    // expected-error@-4 {{non-type template argument of type 'int' is not a constant expression}}
+    // expected-note@-5 {{read of non-const variable 'i' is not allowed in a constant expression}}
+#endif
     X0<&i> x0a_addr;
     X0<iarr> x0b;
     X0<&iarr> x0b_addr; // expected-error {{cannot be converted to a value of type 'int *'}}
-    X0<ki> x0c; // expected-error {{must have its address taken}} expected-warning {{internal linkage is a C++11 extension}}
-    X0<&ki> x0c_addr; // expected-error {{cannot be converted to a value of type 'int *'}} expected-warning {{internal linkage is a C++11 extension}}
-    X0<&ti> x0d_addr; // expected-error {{refers to thread-local object}}
+    X0<ki> x0c; // expected-error {{must have its address taken}}
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{internal linkage is a C++11 extension}}
+#endif
+
+    X0<&ki> x0c_addr; // expected-error {{cannot be converted to a value of type 'int *'}}
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{internal linkage is a C++11 extension}}
+#endif
+
+    X0<&ti> x0d_addr;
+#if __cplusplus <= 199711L
+    // expected-error@-2 {{non-type template argument refers to thread-local object}}
+#else
+    // expected-error@-4 {{non-type template argument of type 'int *' is not a constant expression}}
+#endif
+
     X1<f> x1a;
     X1<&f> x1a_addr;
     X1<f_tmpl> x1b;
     X1<&f_tmpl> x1b_addr;
     X1<f_tmpl<int> > x1c;
     X1<&f_tmpl<int> > x1c_addr;
-    X1<f_internal> x1d; // expected-warning {{internal linkage is a C++11 extension}}
-    X1<&f_internal> x1d_addr; // expected-warning {{internal linkage is a C++11 extension}}
+    X1<f_internal> x1d;
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{internal linkage is a C++11 extension}}
+#endif
+
+    X1<&f_internal> x1d_addr;
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{internal linkage is a C++11 extension}}
+#endif
+
     X2<i> x2a;
     X2<&i> x2a_addr; // expected-error {{address taken}}
     X2<iarr> x2b; // expected-error {{cannot bind to template argument of type 'int [10]'}}
     X2<&iarr> x2b_addr; // expected-error {{address taken}}
-    X2<ki> x2c; // expected-error {{ignores qualifiers}} expected-warning {{internal linkage is a C++11 extension}}
-    X2k<ki> x2kc; // expected-warning {{internal linkage is a C++11 extension}}
-    X2k<&ki> x2kc_addr; // expected-error {{address taken}} expected-warning {{internal linkage is a C++11 extension}}
+    X2<ki> x2c; // expected-error {{ignores qualifiers}}
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{internal linkage is a C++11 extension}}
+#endif
+
+    X2k<ki> x2kc;
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{internal linkage is a C++11 extension}}
+#endif
+
+    X2k<&ki> x2kc_addr; // expected-error {{address taken}}
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{internal linkage is a C++11 extension}}
+#endif
+
     X2<ti> x2d_addr; // expected-error {{refers to thread-local object}}
     X3<f> x3a;
     X3<&f> x3a_addr; // expected-error {{address taken}}
@@ -75,11 +137,31 @@ namespace addr_of_obj_or_func {
     X3<&f_tmpl> x3b_addr; // expected-error {{address taken}}
     X3<f_tmpl<int> > x3c;
     X3<&f_tmpl<int> > x3c_addr; // expected-error {{address taken}}
-    X3<f_internal> x3d; // expected-warning {{internal linkage is a C++11 extension}}
-    X3<&f_internal> x3d_addr; // expected-error {{address taken}} expected-warning {{internal linkage is a C++11 extension}}
+    X3<f_internal> x3d;
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{internal linkage is a C++11 extension}}
+#endif
+
+    X3<&f_internal> x3d_addr; // expected-error {{address taken}}
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{internal linkage is a C++11 extension}}
+#endif
+
+    int n;
+#if __cplusplus <= 199711L
+    // expected-note@-2 {{non-type template argument refers to object here}}
+#else
+    // expected-note@-4 {{declared here}}
+#endif
+
+    X0<&n> x0_no_linkage;
+#if __cplusplus <= 199711L
+    // expected-error@-2 {{non-type template argument refers to object 'n' that does not have linkage}}
+#else
+    // expected-error@-4 {{non-type template argument of type 'int *' is not a constant expression}}
+    // expected-note@-5 {{pointer to 'n' is not a constant expression}}
+#endif
 
-    int n; // expected-note {{here}}
-    X0<&n> x0_no_linkage; // expected-error {{non-type template argument refers to object 'n' that does not have linkage}}
     struct Local { static int f() {} }; // expected-note {{here}}
     X1<&Local::f> x1_no_linkage; // expected-error {{non-type template argument refers to function 'f' that does not have linkage}}
     X0<&S::NonStaticMember> x0_non_static; // expected-error {{non-static data member}}
@@ -96,16 +178,26 @@ namespace bad_args {
   int i = 42;
   X0<&i + 2> x0a; // expected-error{{non-type template argument does not refer to any declaration}}
   int* iptr = &i;
-  X0<iptr> x0b; // expected-error{{non-type template argument for template parameter of pointer type 'int *' must have its address taken}}
+#if __cplusplus >= 201103L
+  // expected-note@-2 {{declared here}}
+#endif
+
+  X0<iptr> x0b;
+#if __cplusplus <= 199711L
+  // expected-error@-2 {{non-type template argument for template parameter of pointer type 'int *' must have its address taken}}
+#else
+  // expected-error@-4 {{non-type template argument of type 'int *' is not a constant expression}}
+  // expected-note@-5 {{read of non-constexpr variable 'iptr' is not allowed in a constant expression}}
+#endif
 }
 #endif // CPP11ONLY
 
 namespace default_args {
 #ifdef CPP11ONLY
 namespace lambdas {
-template<int I = ([] { return 5; }())> //expected-error 2{{constant expression}} expected-note{{constant expression}}
+template<int I = ([] { return 5; }())> //expected-error {{constant expression}}
 int f();
 }
 #endif // CPP11ONLY
 
-}
-\ No newline at end of file
+}
diff --git a/test/CXX/temp/temp.arg/temp.arg.type/p2.cpp b/test/CXX/temp/temp.arg/temp.arg.type/p2.cpp
index 0fd9a7e884b98..539baecdb6a5b 100644
--- a/test/CXX/temp/temp.arg/temp.arg.type/p2.cpp
+++ b/test/CXX/temp/temp.arg/temp.arg.type/p2.cpp
@@ -1,4 +1,7 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++98 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
+
 template<class T> struct A {
   static T t; // expected-error{{static data member instantiated with function type 'int ()'}}
 };
@@ -11,10 +14,17 @@ template<typename T> struct B {
 B<function> b; // expected-note{{instantiation of}}
 
 template <typename T> int f0(void *, const T&); // expected-note{{candidate template ignored: substitution failure}}
-enum {e}; // expected-note{{unnamed type used in template argument was declared here}}
+enum {e};
+#if __cplusplus <= 199711L
+// expected-note@-2 {{unnamed type used in template argument was declared here}}
+#endif
 
 void test_f0(int n) {
-  int i = f0(0, e); // expected-warning{{template argument uses unnamed type}}
+  int i = f0(0, e);
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{template argument uses unnamed type}}
+#endif
+
   int vla[n];
   f0(0, vla); // expected-error{{no matching function for call to 'f0'}}
 }
@@ -23,20 +33,50 @@ namespace N0 {
   template <typename R, typename A1> void f0(R (*)(A1));
   template <typename T> int f1(T);
   template <typename T, typename U> int f1(T, U);
-  enum {e1}; // expected-note 2{{unnamed type used in template argument was declared here}}
-  enum {e2}; // expected-note 2{{unnamed type used in template argument was declared here}}
-  enum {e3}; // expected-note{{unnamed type used in template argument was declared here}}
+  enum {e1};
+#if __cplusplus <= 199711L
+  // expected-note@-2 2{{unnamed type used in template argument was declared here}}
+#endif
+
+  enum {e2};
+#if __cplusplus <= 199711L
+  // expected-note@-2 2{{unnamed type used in template argument was declared here}}
+#endif
+
+  enum {e3};
+#if __cplusplus <= 199711L
+ // expected-note@-2 {{unnamed type used in template argument was declared here}}
+#endif
 
   template<typename T> struct X;
   template<typename T> struct X<T*> { };
 
   void f() {
-    f0( // expected-warning{{template argument uses unnamed type}}
-       &f1<__typeof__(e1)>); // expected-warning{{template argument uses unnamed type}}
-    int (*fp1)(int, __typeof__(e2)) = f1; // expected-warning{{template argument uses unnamed type}}
-    f1(e2); // expected-warning{{template argument uses unnamed type}}
+    f0(
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{template argument uses unnamed type}}
+#endif
+
+       &f1<__typeof__(e1)>);
+#if __cplusplus <= 199711L
+ // expected-warning@-2 {{template argument uses unnamed type}}
+#endif
+
+    int (*fp1)(int, __typeof__(e2)) = f1;
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{template argument uses unnamed type}}
+#endif
+
+    f1(e2);
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{template argument uses unnamed type}}
+#endif
+
     f1(e2);
 
-    X<__typeof__(e3)*> x; // expected-warning{{template argument uses unnamed type}}
+    X<__typeof__(e3)*> x;
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{template argument uses unnamed type}}
+#endif
   }
 }
diff --git a/test/CXX/temp/temp.decls/temp.mem/p1.cpp b/test/CXX/temp/temp.decls/temp.mem/p1.cpp
index 01eab24757f07..b48e145e1468d 100644
--- a/test/CXX/temp/temp.decls/temp.mem/p1.cpp
+++ b/test/CXX/temp/temp.decls/temp.mem/p1.cpp
@@ -10,6 +10,7 @@ template <class T> struct A {
     }
   };
 };
+extern template bool A<bool>::cond;
 
 int foo() {
   A<bool>::cond = true;
diff --git a/test/CXX/temp/temp.decls/temp.mem/p2.cpp b/test/CXX/temp/temp.decls/temp.mem/p2.cpp
index c24d5a9b50de2..feeb362e34b40 100644
--- a/test/CXX/temp/temp.decls/temp.mem/p2.cpp
+++ b/test/CXX/temp/temp.decls/temp.mem/p2.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -std=c++14 -fsyntax-only -verify %s
 
 template <typename>
 void quux();
@@ -8,5 +8,7 @@ void fun() {
     template <typename> struct bar {};  // expected-error{{templates cannot be declared inside of a local class}}
     template <typename> void baz() {}   // expected-error{{templates cannot be declared inside of a local class}}
     template <typename> void qux();     // expected-error{{templates cannot be declared inside of a local class}}
+    template <typename> using corge = int; // expected-error{{templates cannot be declared inside of a local class}}
+    template <typename T> static T grault; // expected-error{{static data member}} expected-error{{templates cannot be declared inside of a local class}}
   };
 }
diff --git a/test/CXX/temp/temp.decls/temp.variadic/p5.cpp b/test/CXX/temp/temp.decls/temp.variadic/p5.cpp
index 4f9368f6b6050..206e9f73e9f05 100644
--- a/test/CXX/temp/temp.decls/temp.variadic/p5.cpp
+++ b/test/CXX/temp/temp.decls/temp.variadic/p5.cpp
@@ -437,3 +437,35 @@ namespace PR21289 {
   template void g<>();
   template void g<1, 2, 3>();
 }
+
+template <class... Ts>
+int var_expr(Ts... ts);
+
+template <class... Ts>
+auto a_function(Ts... ts) -> decltype(var_expr(ts...));
+
+template <class T>
+using partial = decltype(a_function<int, T>);
+
+int use_partial() { partial<char> n; }
+
+namespace PR26017 {
+template <class T>
+struct Foo {};
+template <class... Ts>
+using FooAlias = Foo<void(Ts...)>;
+
+template <class... Ts>
+using FooAliasAlias = FooAlias<Ts..., Ts...>;
+
+template <class... Ts>
+void bar(const FooAlias<Ts...> &) {}
+
+int fn() {
+  FooAlias<> a;
+  bar(a);
+
+  FooAlias<int> b;
+  bar(b);
+}
+}
diff --git a/test/CXX/temp/temp.fct.spec/temp.deduct/p7.cpp b/test/CXX/temp/temp.fct.spec/temp.deduct/p7.cpp
new file mode 100644
index 0000000000000..bc074ba25e8ef
--- /dev/null
+++ b/test/CXX/temp/temp.fct.spec/temp.deduct/p7.cpp
@@ -0,0 +1,22 @@
+// RUN: %clang_cc1 -std=c++11 -verify %s
+
+struct Q { typedef int type; };
+
+// "The substitution occurs in all types and expressions that are used in [...]
+// template parameter declarations." In particular, we must substitute into the
+// type of a parameter pack that is not a pack expansion, even if we know the
+// corresponding argument pack is empty.
+template<typename T, typename T::type...> void a(T);
+int &a(...);
+int &a_disabled = a(0);
+int &a_enabled = a(Q()); // expected-error {{cannot bind to a temporary of type 'void'}}
+
+template<typename T, template<typename T::type> class ...X> void b(T);
+int &b(...);
+int &b_disabled = b(0);
+int &b_enabled = b(Q()); // expected-error {{cannot bind to a temporary of type 'void'}}
+
+template<typename T, template<typename T::type...> class ...X> void c(T);
+int &c(...);
+int &c_disabled = c(0);
+int &c_enabled = c(Q()); // expected-error {{cannot bind to a temporary of type 'void'}}
diff --git a/test/CXX/temp/temp.fct.spec/temp.deduct/p9.cpp b/test/CXX/temp/temp.fct.spec/temp.deduct/p9.cpp
index c27261c96b68c..9fd3df59d103e 100644
--- a/test/CXX/temp/temp.fct.spec/temp.deduct/p9.cpp
+++ b/test/CXX/temp/temp.fct.spec/temp.deduct/p9.cpp
@@ -1,9 +1,22 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++98 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
 
-template <int> int f(int);  // expected-note 2{{candidate}}
-template <signed char> int f(int); // expected-note 2{{candidate}}
-int i1 = f<1>(0); // expected-error{{ambiguous}}
-int i2 = f<1000>(0); // expected-error{{ambiguous}}
+template <int> int f(int);  // expected-note {{candidate function}}
+#if __cplusplus <= 199711L
+// expected-note@-2 {{candidate function}}
+#endif
+
+template <signed char> int f(int); // expected-note {{candidate function}}
+#if __cplusplus <= 199711L
+// expected-note@-2 {{candidate function}}
+#endif
+
+int i1 = f<1>(0); // expected-error{{call to 'f' is ambiguous}}
+int i2 = f<1000>(0);
+#if __cplusplus <= 199711L
+// expected-error@-2{{call to 'f' is ambiguous}}
+#endif
 
 namespace PR6707 {
   template<typename T, T Value>
diff --git a/test/CXX/temp/temp.fct.spec/temp.deduct/temp.deduct.call/p3.cpp b/test/CXX/temp/temp.fct.spec/temp.deduct/temp.deduct.call/p3.cpp
index f46ea2e585dc1..ff8178f947691 100644
--- a/test/CXX/temp/temp.fct.spec/temp.deduct/temp.deduct.call/p3.cpp
+++ b/test/CXX/temp/temp.fct.spec/temp.deduct/temp.deduct.call/p3.cpp
@@ -146,3 +146,30 @@ namespace PR9233 {
   }
 
 }
+
+namespace PR27155 {
+
+struct B {};
+
+template<class T, int i> struct D : T {};
+template<class T> void Foo(D<T, 1>);
+
+int fn() {
+  D<D<B, 1>, 0> f;
+  Foo(f);
+}
+
+}
+
+namespace PR28195 {
+
+template<int N> struct B {};
+struct D : B<0>, B<1> {};
+
+template<int N> int callee(B<N>); // expected-note{{failed template argument deduction}}
+
+int caller() {
+  callee(D()); // expected-error{{no matching function}}
+}
+
+}
diff --git a/test/CXX/temp/temp.param/p15-cxx0x.cpp b/test/CXX/temp/temp.param/p15-cxx0x.cpp
index ade192b3efa94..667152da1cbcd 100644
--- a/test/CXX/temp/temp.param/p15-cxx0x.cpp
+++ b/test/CXX/temp/temp.param/p15-cxx0x.cpp
@@ -102,10 +102,10 @@ using D1 = drop<3, int, char, double, long>::type;
 using D1 = types<long>;
 
 using T2 = take<4, int, char, double, long>::type; // expected-note {{previous}}
-using T2 = types<int, char, double, long>;
 // FIXME: Desguar the types on the RHS in this diagnostic.
 // desired-error {{'types<void, void, void, void>' vs 'types<int, char, double, long>'}}
 using T2 = types<void, void, void, void>; // expected-error {{'types<void, void, void, void>' vs 'types<typename inner<_>::type, typename inner<_>::type, typename inner<_>::type, typename inner<_>::type>'}}
+using T2 = types<int, char, double, long>;
 using D2 = drop<4, int, char, double, long>::type;
 using D2 = types<>;
 
diff --git a/test/CXX/temp/temp.res/temp.local/p6.cpp b/test/CXX/temp/temp.res/temp.local/p6.cpp
index 843b45543fcf8..e2aa0ff344291 100644
--- a/test/CXX/temp/temp.res/temp.local/p6.cpp
+++ b/test/CXX/temp/temp.res/temp.local/p6.cpp
@@ -5,11 +5,11 @@ namespace N {}
 template<typename T, // expected-note {{declared here}}
          typename T> struct X {}; // expected-error {{declaration of 'T' shadows template parameter}}
 
-template<typename T> struct Y { // expected-note 17{{declared here}}
+template<typename T> struct Y { // expected-note 18{{declared here}}
   template<typename T> struct A {}; // expected-error {{declaration of 'T' shadows template parameter}}
 
   struct B {
-    template<typename> struct T {}; // FIXME: desired-error {{declaration of 'T' shadows template parameter}}
+    template<typename> struct T {}; // expected-error {{declaration of 'T' shadows template parameter}}
   };
   struct C {
     template<typename> void T(); // expected-error {{declaration of 'T' shadows template parameter}}
@@ -65,11 +65,11 @@ template<typename T> struct Y { // expected-note 17{{declared here}}
   friend struct T; // expected-error {{declaration of 'T' shadows template parameter}}
 };
 
-template<int T> struct Z { // expected-note 15{{declared here}}
+template<int T> struct Z { // expected-note 16{{declared here}}
   template<typename T> struct A {}; // expected-error {{declaration of 'T' shadows template parameter}}
 
   struct B {
-    template<typename> struct T {}; // FIXME: desired-error {{declaration of 'T' shadows template parameter}}
+    template<typename> struct T {}; // expected-error {{declaration of 'T' shadows template parameter}}
   };
   struct C {
     template<typename> void T(); // expected-error {{declaration of 'T' shadows template parameter}}
@@ -129,7 +129,8 @@ void f(int T) {} // expected-error {{declaration of 'T' shadows template paramet
 
 // FIXME: These are ill-formed: a template-parameter shall not have the same name as the template name.
 namespace A {
-  template<typename T> struct T {};
+  template<typename T> struct T {};  // expected-error{{declaration of 'T' shadows template parameter}}
+                                     // expected-note@-1{{template parameter is declared here}}
 }
 namespace B {
   template<typename T> void T() {}
@@ -137,3 +138,13 @@ namespace B {
 namespace C {
   template<typename T> int T;
 }
+
+namespace PR28023 {
+template<int V>  // expected-note{{template parameter is declared here}}
+struct A {
+  struct B {
+    template <int> friend struct V;  // expected-error{{declaration of 'V' shadows template parameter}}
+  };
+};
+A<0>::B a;
+}
diff --git a/test/CXX/temp/temp.spec/no-body.cpp b/test/CXX/temp/temp.spec/no-body.cpp
index 61d285b27ed28..4ec18fdf820f4 100644
--- a/test/CXX/temp/temp.spec/no-body.cpp
+++ b/test/CXX/temp/temp.spec/no-body.cpp
@@ -1,17 +1,44 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++98 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
 // RUN: cp %s %t
 // RUN: not %clang_cc1 -x c++ -fixit %t -DFIXING
 // RUN: %clang_cc1 -x c++ %t -DFIXING
 
 template<typename T> void f(T) { }
+#if __cplusplus >= 201103L
+  // expected-note@-2 {{explicit instantiation refers here}}
+#endif
+
 template<typename T> void g(T) { }
+#if __cplusplus >= 201103L
+  // expected-note@-2 {{explicit instantiation refers here}}
+#endif
+
 template<typename T> struct x { };
+#if __cplusplus >= 201103L
+  // expected-note@-2 {{explicit instantiation refers here}}
+#endif
+
 template<typename T> struct y { };  // expected-note {{declared here}}
 
-namespace good {
+namespace good { // Only good in C++98/03
+#ifndef FIXING
   template void f<int>(int);
+#if __cplusplus >= 201103L
+  // expected-error@-2 {{explicit instantiation of 'f' must occur at global scope}}
+#endif
+
   template void g(int);
+#if __cplusplus >= 201103L
+  // expected-error@-2 {{explicit instantiation of 'g' must occur at global scope}}
+#endif
+
   template struct x<int>;
+#if __cplusplus >= 201103L
+  // expected-error@-2 {{explicit instantiation of 'x' must occur at global scope}}
+#endif
+#endif
 }
 
 namespace unsupported {
diff --git a/test/CXX/temp/temp.spec/temp.expl.spec/p2.cpp b/test/CXX/temp/temp.spec/temp.expl.spec/p2.cpp
index 4fbc45a7d7dd5..21399b62046d7 100644
--- a/test/CXX/temp/temp.spec/temp.expl.spec/p2.cpp
+++ b/test/CXX/temp/temp.spec/temp.expl.spec/p2.cpp
@@ -1,4 +1,7 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++98 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
+
 
 // This test creates cases where implicit instantiations of various entities
 // would cause a diagnostic, but provides expliict specializations for those
@@ -16,7 +19,10 @@ struct NonDefaultConstructible {
 
 //     -- function template
 namespace N0 {
-  template<typename T> void f0(T) { // expected-note{{here}}
+  template<typename T> void f0(T) {
+#if __cplusplus <= 199711L
+  // expected-note@-2 {{explicitly specialized declaration is here}}
+#endif
     T t;
   }
 
@@ -36,7 +42,11 @@ namespace N1 {
   template<> void N0::f0(long) { } // expected-error{{does not enclose namespace}}
 }
 
-template<> void N0::f0(double); // expected-warning{{C++11 extension}}
+template<> void N0::f0(double);
+#if __cplusplus <= 199711L
+// expected-warning@-2 {{first declaration of function template specialization of 'f0' outside namespace 'N0' is a C++11 extension}}
+#endif
+
 template<> void N0::f0(double) { }
 
 struct X1 {
@@ -49,21 +59,39 @@ struct X1 {
 namespace N0 {
   
 template<typename T>
-struct X0 { // expected-note 2{{here}}
-  static T member; // expected-note{{here}}
+struct X0 { // expected-note {{explicitly specialized declaration is here}}
+#if __cplusplus <= 199711L
+// expected-note@-2 {{explicitly specialized declaration is here}}
+#endif
+  static T member;
+#if __cplusplus <= 199711L
+  // expected-note@-2 {{explicitly specialized declaration is here}}
+#endif
   
-  void f1(T t) { // expected-note{{explicitly specialized declaration is here}}
+  void f1(T t) {
+#if __cplusplus <= 199711L
+  // expected-note@-2 {{explicitly specialized declaration is here}}
+#endif
     t = 17;
   }
   
-  struct Inner : public T { }; // expected-note 3{{here}}
+  struct Inner : public T { }; // expected-note 2{{explicitly specialized declaration is here}}
+#if __cplusplus <= 199711L
+  // expected-note@-2 {{explicitly specialized declaration is here}}
+#endif
   
   template<typename U>
-  struct InnerTemplate : public T { }; // expected-note 2{{explicitly specialized}} \
-   // expected-error{{base specifier}}
+  struct InnerTemplate : public T { }; // expected-note {{explicitly specialized declaration is here}}
+#if __cplusplus <= 199711L
+  // expected-note@-2 {{explicitly specialized declaration is here}}
+#endif
+  // expected-error@-4 {{base specifier must name a class}}
   
   template<typename U>
-  void ft1(T t, U u); // expected-note{{explicitly specialized}}
+  void ft1(T t, U u);
+#if __cplusplus <= 199711L
+  // expected-note@-2 {{explicitly specialized declaration is here}}
+#endif
 };
 
 }
@@ -76,7 +104,10 @@ void N0::X0<T>::ft1(T t, U u) {
 
 template<typename T> T N0::X0<T>::member;
 
-template<> struct N0::X0<void> { }; // expected-warning{{C++11 extension}}
+template<> struct N0::X0<void> { };
+#if __cplusplus <= 199711L
+// expected-warning@-2 {{first declaration of class template specialization of 'X0' outside namespace 'N0' is a C++11 extension}}
+#endif
 N0::X0<void> test_X0;
 
 namespace N1 {
@@ -92,7 +123,10 @@ template<> struct N0::X0<volatile void> {
 };
 
 //     -- member function of a class template
-template<> void N0::X0<void*>::f1(void *) { } // expected-warning{{member function specialization}}
+template<> void N0::X0<void*>::f1(void *) { }
+#if __cplusplus <= 199711L
+// expected-warning@-2 {{first declaration of member function specialization of 'f1' outside namespace 'N0' is a C++11 extension}}
+#endif
 
 void test_spec(N0::X0<void*> xvp, void *vp) {
   xvp.f1(vp);
@@ -125,7 +159,10 @@ NonDefaultConstructible &get_static_member() {
   return N0::X0<NonDefaultConstructible>::member;
 }
 
-template<> int N0::X0<int>::member;  // expected-warning{{C++11 extension}}
+template<> int N0::X0<int>::member;
+#if __cplusplus <= 199711L
+// expected-warning@-2 {{first declaration of static data member specialization of 'member' outside namespace 'N0' is a C++11 extension}}
+#endif
 
 template<> float N0::X0<float>::member = 3.14f;
 
@@ -153,7 +190,10 @@ namespace N0 {
 }
 
 template<>
-struct N0::X0<long>::Inner { }; // expected-warning{{C++11 extension}}
+struct N0::X0<long>::Inner { };
+#if __cplusplus <= 199711L
+// expected-warning@-2 {{first declaration of member class specialization of 'Inner' outside namespace 'N0' is a C++11 extension}}
+#endif
 
 template<>
 struct N0::X0<float>::Inner { };
@@ -192,7 +232,10 @@ template<> template<>
 struct N0::X0<int>::InnerTemplate<long> { }; // okay
 
 template<> template<>
-struct N0::X0<int>::InnerTemplate<float> { }; // expected-warning{{class template specialization}}
+struct N0::X0<int>::InnerTemplate<float> { };
+#if __cplusplus <= 199711L
+// expected-warning@-2 {{first declaration of class template specialization of 'InnerTemplate' outside namespace 'N0' is a C++11 extension}}
+#endif
 
 namespace N1 {
   template<> template<>
@@ -224,7 +267,10 @@ template<> template<>
 void N0::X0<void*>::ft1(void *, unsigned) { } // okay
 
 template<> template<>
-void N0::X0<void*>::ft1(void *, float) { } // expected-warning{{function template specialization}}
+void N0::X0<void*>::ft1(void *, float) { }
+#if __cplusplus <= 199711L
+// expected-warning@-2 {{first declaration of function template specialization of 'ft1' outside namespace 'N0' is a C++11 extension}}
+#endif
 
 namespace N1 {
   template<> template<>
diff --git a/test/CXX/temp/temp.spec/temp.expl.spec/p3.cpp b/test/CXX/temp/temp.spec/temp.expl.spec/p3.cpp
index c8b7def5a4485..d82691c7c8689 100644
--- a/test/CXX/temp/temp.spec/temp.expl.spec/p3.cpp
+++ b/test/CXX/temp/temp.spec/temp.expl.spec/p3.cpp
@@ -1,14 +1,20 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++98 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
 
 namespace N {
-  template<class T> class X; // expected-note {{'N::X' declared here}} \
-                             // expected-note {{explicitly specialized declaration is here}}
+  template<class T> class X; // expected-note {{'N::X' declared here}}
+#if __cplusplus <= 199711L
+  // expected-note@-2 {{explicitly specialized declaration is here}}
+#endif
 }
 
 // TODO: Don't add a namespace qualifier to the template if it would trigger
 // the warning about the specialization being outside of the namespace.
-template<> class X<int> { /* ... */ };	// expected-error {{no template named 'X'; did you mean 'N::X'?}} \
-                                        // expected-warning {{first declaration of class template specialization of 'X' outside namespace 'N' is a C++11 extension}}
+template<> class X<int> { /* ... */ };	// expected-error {{no template named 'X'; did you mean 'N::X'?}}
+#if __cplusplus <= 199711L
+// expected-warning@-2 {{first declaration of class template specialization of 'X' outside namespace 'N' is a C++11 extension}}
+#endif
 
 namespace N {
   
diff --git a/test/CXX/temp/temp.spec/temp.explicit/p2.cpp b/test/CXX/temp/temp.spec/temp.explicit/p2.cpp
index 1dfcf0ce2d2f5..027022176f81c 100644
--- a/test/CXX/temp/temp.spec/temp.explicit/p2.cpp
+++ b/test/CXX/temp/temp.spec/temp.explicit/p2.cpp
@@ -1,4 +1,6 @@
 // RUN: %clang_cc1 -fsyntax-only -verify -pedantic -Wc++11-compat %s
+// RUN: %clang_cc1 -fsyntax-only -verify -pedantic -std=c++98 -Wc++11-compat %s
+// RUN: %clang_cc1 -fsyntax-only -verify -pedantic -std=c++11 %s
 
 // Example from the standard
 template<class T> class Array { void mf() { } }; 
@@ -39,5 +41,16 @@ namespace N {
 }
 using namespace N;
 
-template struct X1<int>; // expected-warning{{must occur in}}
-template void f1(int); // expected-warning{{must occur in}}
+template struct X1<int>;
+#if __cplusplus <= 199711L
+// expected-warning@-2 {{explicit instantiation of 'N::X1' must occur in namespace 'N'}}
+#else
+// expected-error@-4 {{explicit instantiation of 'N::X1' must occur in namespace 'N'}}
+#endif
+
+template void f1(int);
+#if __cplusplus <= 199711L
+// expected-warning@-2 {{explicit instantiation of 'N::f1' must occur in namespace 'N'}}
+#else
+// expected-error@-4 {{explicit instantiation of 'N::f1' must occur in namespace 'N'}}
+#endif
diff --git a/test/CXX/temp/temp.spec/temp.explicit/p5.cpp b/test/CXX/temp/temp.spec/temp.explicit/p5.cpp
index 8422c519a760c..ca1f9a391edd0 100644
--- a/test/CXX/temp/temp.spec/temp.explicit/p5.cpp
+++ b/test/CXX/temp/temp.spec/temp.explicit/p5.cpp
@@ -1,4 +1,6 @@
 // RUN: %clang_cc1 -fsyntax-only -verify -Wc++11-compat %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++98 -Wc++11-compat %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
 
 namespace N {
   template<class T> class Y { // expected-note{{explicit instantiation refers here}}
@@ -11,7 +13,12 @@ template class Z<int>; // expected-error{{explicit instantiation of non-template
 // FIXME: This example from the standard is wrong; note posted to CWG reflector
 // on 10/27/2009
 using N::Y; 
-template class Y<int>; // expected-warning{{must occur in}}
+template class Y<int>;
+#if __cplusplus <= 199711L
+// expected-warning@-2 {{explicit instantiation of 'N::Y' must occur in namespace 'N'}}
+#else
+// expected-error@-4 {{explicit instantiation of 'N::Y' must occur in namespace 'N'}}
+#endif
 
 template class N::Y<char*>; 
 template void N::Y<double>::mf();
diff --git a/test/CXX/temp/temp.spec/temp.inst/p1.cpp b/test/CXX/temp/temp.spec/temp.inst/p1.cpp
index adf812b714d1a..3d2d6d7c3adf5 100644
--- a/test/CXX/temp/temp.spec/temp.inst/p1.cpp
+++ b/test/CXX/temp/temp.spec/temp.inst/p1.cpp
@@ -54,6 +54,16 @@ namespace ScopedEnum {
   int test2 = g<int>(); // expected-note {{here}}
 }
 
+// - static data members
+namespace StaticDataMembers {
+  template<typename T>
+  struct A {
+    static const int n = T::error; // expected-error {{has no members}}
+    static inline int m = T::error; // expected-warning {{extension}}
+  };
+  A<int> ai; // expected-note {{here}}
+}
+
 // And it cases the implicit instantiations of the definitions of:
 
 // - unscoped member enumerations
diff --git a/test/CodeCompletion/Inputs/ModuleA/module.modulemap b/test/CodeCompletion/Inputs/ModuleA/module.modulemap
new file mode 100644
index 0000000000000..b0fe1faaee6a3
--- /dev/null
+++ b/test/CodeCompletion/Inputs/ModuleA/module.modulemap
@@ -0,0 +1,4 @@
+module ModuleA {
+  header "moduleA.h"
+  export *
+}
diff --git a/test/CodeCompletion/Inputs/ModuleA/moduleA.h b/test/CodeCompletion/Inputs/ModuleA/moduleA.h
new file mode 100644
index 0000000000000..f90f56dad8259
--- /dev/null
+++ b/test/CodeCompletion/Inputs/ModuleA/moduleA.h
@@ -0,0 +1 @@
+static int const FROM_MODULE_A = 0;
diff --git a/test/CodeCompletion/Inputs/import_moduleA.h b/test/CodeCompletion/Inputs/import_moduleA.h
new file mode 100644
index 0000000000000..e2663f4e61b53
--- /dev/null
+++ b/test/CodeCompletion/Inputs/import_moduleA.h
@@ -0,0 +1,2 @@
+#include "ModuleA/moduleA.h"
+static int const FROM_HEADER = 1;
diff --git a/test/CodeCompletion/Inputs/reserved.h b/test/CodeCompletion/Inputs/reserved.h
index fafe4ac440e00..7b353a58fca4e 100644
--- a/test/CodeCompletion/Inputs/reserved.h
+++ b/test/CodeCompletion/Inputs/reserved.h
@@ -1,2 +1,4 @@
-typedef int _INTEGER_TYPE;
+typedef int __INTEGER_TYPE;
 typedef float FLOATING_TYPE;
+
+typedef int _MyPrivateType;
diff --git a/test/CodeCompletion/bracket-decl.c b/test/CodeCompletion/bracket-decl.c
new file mode 100644
index 0000000000000..cf80b424d13b2
--- /dev/null
+++ b/test/CodeCompletion/bracket-decl.c
@@ -0,0 +1,9 @@
+#define PATHSIZE 256
+
+static const int len = 1234;
+
+void foo() {
+  char arr[
+// RUN: %clang_cc1 -fsyntax-only -code-completion-macros -code-completion-at=%s:6:12 %s -o - | FileCheck %s
+// CHECK: COMPLETION: len
+// CHECK: COMPLETION: PATHSIZE
diff --git a/test/CodeCompletion/ctor-initializer.cpp b/test/CodeCompletion/ctor-initializer.cpp
new file mode 100644
index 0000000000000..00af64dd4fa46
--- /dev/null
+++ b/test/CodeCompletion/ctor-initializer.cpp
@@ -0,0 +1,41 @@
+struct Base1 {
+  Base1() : {}
+  // RUN: %clang_cc1 -fsyntax-only -code-completion-at=%s:2:12 %s -o - | FileCheck -check-prefix=CHECK-CC1 %s
+  // CHECK-CC1: COMPLETION: Pattern : member1(<#args#>)
+  // CHECK-CC1: COMPLETION: Pattern : member2(<#args#>
+
+  Base1(int) : member1(123), {}
+  // RUN: %clang_cc1 -fsyntax-only -code-completion-at=%s:7:30 %s -o - | FileCheck -check-prefix=CHECK-CC2 %s
+  // CHECK-CC2-NOT: COMPLETION: Pattern : member1(<#args#>)
+  // CHECK-CC2: COMPLETION: Pattern : member2(<#args#>
+
+  int member1;
+  float member2;
+};
+
+struct Derived : public Base1 {
+  Derived();
+  Derived(int);
+  Derived(float);
+  int deriv1;
+};
+
+Derived::Derived() : {}
+// RUN: %clang_cc1 -fsyntax-only -code-completion-at=%s:23:22 %s -o - | FileCheck -check-prefix=CHECK-CC3 %s
+// CHECK-CC3: COMPLETION: Pattern : Base1(<#args#>)
+// CHECK-CC3: COMPLETION: Pattern : deriv1(<#args#>)
+
+Derived::Derived(int) try : {
+} catch (...) {
+}
+// RUN: %clang_cc1 -fsyntax-only -code-completion-at=%s:28:29 %s -o - | FileCheck -check-prefix=CHECK-CC4 %s
+// CHECK-CC4: COMPLETION: Pattern : Base1(<#args#>)
+// CHECK-CC4: COMPLETION: Pattern : deriv1(<#args#>)
+
+Derived::Derived(float) try : Base1(),
+{
+} catch (...) {
+}
+// RUN: %clang_cc1 -fsyntax-only -code-completion-at=%s:35:39 %s -o - | FileCheck -check-prefix=CHECK-CC5 %s
+// CHECK-CC5-NOT: COMPLETION: Pattern : Base1(<#args#>)
+// CHECK-CC5: COMPLETION: Pattern : deriv1(<#args#>)
diff --git a/test/CodeCompletion/documentation.m b/test/CodeCompletion/documentation.m
new file mode 100644
index 0000000000000..47add5b6ca0d5
--- /dev/null
+++ b/test/CodeCompletion/documentation.m
@@ -0,0 +1,25 @@
+// Note: the run lines follow their respective tests, since line/column
+// matter in this test.
+
+@interface Base
+@end
+
+@interface Test : Base
+/// Instance!
+@property id instanceProp;
+/// Class!
+@property (class) id classProp;
+@end
+
+void test(Test *obj) {
+  [obj instanceProp];
+  [Test classProp];
+}
+
+// RUN: %clang_cc1 -fsyntax-only -code-completion-brief-comments -code-completion-at=%s:15:8 %s -o - | FileCheck -check-prefix=CHECK-CC1 %s
+// CHECK-CC1: instanceProp : [#id#]instanceProp : Instance!
+// CHECK-CC1: setInstanceProp: : [#void#]setInstanceProp:<#(id)#> : Instance!
+
+// RUN: %clang_cc1 -fsyntax-only -code-completion-brief-comments -code-completion-at=%s:16:9 %s -o - | FileCheck -check-prefix=CHECK-CC2 %s
+// CHECK-CC2: classProp : [#id#]classProp : Class!
+// CHECK-CC2: setClassProp: : [#void#]setClassProp:<#(id)#> : Class!
diff --git a/test/CodeCompletion/ordinary-name.c b/test/CodeCompletion/ordinary-name.c
index dda7bb018a0f9..1352b70730d80 100644
--- a/test/CodeCompletion/ordinary-name.c
+++ b/test/CodeCompletion/ordinary-name.c
@@ -5,8 +5,9 @@ typedef struct t _TYPEDEF;
 void foo() {
   int y;
   // RUN: %clang_cc1 -isystem %S/Inputs -fsyntax-only -code-completion-at=%s:6:9 %s -o - | FileCheck -check-prefix=CHECK-CC1 %s
+  // CHECK-CC1-NOT: __INTEGER_TYPE
   // CHECK-CC1: _Imaginary
-  // CHECK-CC1-NOT: _INTEGER_TYPE;
+  // CHECK-CC1: _MyPrivateType
   // CHECK-CC1: _TYPEDEF
   // CHECK-CC1: FLOATING_TYPE
   // CHECK-CC1: foo
diff --git a/test/CodeCompletion/pch-and-module.m b/test/CodeCompletion/pch-and-module.m
new file mode 100644
index 0000000000000..8361448c3a7eb
--- /dev/null
+++ b/test/CodeCompletion/pch-and-module.m
@@ -0,0 +1,37 @@
+#import "import_moduleA.h"
+static const int FROM_IMPL = 2;
+
+void test0(void) {
+  int x = 
+}
+// The lines above this point are sensitive to line/column changes.
+
+// ===--- None
+// RUN: c-index-test -code-completion-at=%s:5:11 %s -I %S/Inputs | FileCheck %s
+
+// ===--- Modules
+// RUN: rm -rf %t && mkdir %t
+// RUN: c-index-test -code-completion-at=%s:5:11 %s -I %S/Inputs -fmodules -fmodules-cache-path=%t/mcp | FileCheck %s
+
+// ===--- PCH
+// RUN: rm -rf %t && mkdir %t
+// RUN: c-index-test -write-pch %t/import_moduleA.pch -x objective-c-header %S/Inputs/import_moduleA.h -I %S/Inputs
+// RUN: c-index-test -code-completion-at=%s:5:11 %s -include-pch %t/import_moduleA.pch -I %S/Inputs | FileCheck %s
+
+// ===--- PCH + Modules
+// RUN: rm -rf %t && mkdir %t
+// RUN: c-index-test -write-pch %t/import_moduleA.pch -x objective-c-header %S/Inputs/import_moduleA.h -fmodules -fmodules-cache-path=%t/mcp -I %S/Inputs
+// RUN: c-index-test -code-completion-at=%s:5:11 %s -include-pch %t/import_moduleA.pch -I %S/Inputs -fmodules -fmodules-cache-path=%t/mcp | FileCheck %s
+
+// ===--- Preamble
+// RUN: rm -rf %t && mkdir %t
+// RUN: env CINDEXTEST_EDITING=1 c-index-test -code-completion-at=%s:5:11 %s -I %S/Inputs | FileCheck %s
+
+// ===--- Preamble + Modules
+// RUN: rm -rf %t
+// RUN: env CINDEXTEST_EDITING=1 c-index-test -code-completion-at=%s:5:11 %s -I %S/Inputs -fmodules -fmodules-cache-path=%t/mcp | FileCheck %s
+
+
+// CHECK: FROM_HEADER
+// CHECK: FROM_IMPL
+// CHECK: FROM_MODULE_A
diff --git a/test/CodeGen/3dnow-builtins.c b/test/CodeGen/3dnow-builtins.c
index d534349a74757..fdee2a7619a05 100644
--- a/test/CodeGen/3dnow-builtins.c
+++ b/test/CodeGen/3dnow-builtins.c
@@ -1,4 +1,5 @@
-// RUN: %clang_cc1 %s -triple=x86_64-unknown-unknown -target-feature +3dnowa -emit-llvm -o - -Werror | FileCheck %s
+// RUN: %clang_cc1 %s -triple=x86_64-unknown-unknown -target-feature +3dnowa -emit-llvm -o - -Werror | FileCheck %s -check-prefix=GCC -check-prefix=CHECK
+// RUN: %clang_cc1 %s -triple=x86_64-scei-ps4 -target-feature +3dnowa -emit-llvm -o - -Werror | FileCheck %s -check-prefix=PS4 -check-prefix=CHECK
 
 // Don't include mm_malloc.h, it's system specific.
 #define __MM_MALLOC_H
@@ -6,151 +7,176 @@
 #include <x86intrin.h>
 
 __m64 test_m_pavgusb(__m64 m1, __m64 m2) {
-  // CHECK-LABEL: define i64 @test_m_pavgusb
+  // PS4-LABEL: define i64 @test_m_pavgusb
+  // GCC-LABEL: define double @test_m_pavgusb
   // CHECK: @llvm.x86.3dnow.pavgusb
   return _m_pavgusb(m1, m2);
 }
 
 __m64 test_m_pf2id(__m64 m) {
-  // CHECK-LABEL: define i64 @test_m_pf2id
+  // PS4-LABEL: define i64 @test_m_pf2id
+  // GCC-LABEL: define double @test_m_pf2id
   // CHECK: @llvm.x86.3dnow.pf2id
   return _m_pf2id(m);
 }
 
 __m64 test_m_pfacc(__m64 m1, __m64 m2) {
-  // CHECK-LABEL: define i64 @test_m_pfacc
+  // PS4-LABEL: define i64 @test_m_pfacc
+  // GCC-LABEL: define double @test_m_pfacc
   // CHECK: @llvm.x86.3dnow.pfacc
   return _m_pfacc(m1, m2);
 }
 
 __m64 test_m_pfadd(__m64 m1, __m64 m2) {
-  // CHECK-LABEL: define i64 @test_m_pfadd
+  // PS4-LABEL: define i64 @test_m_pfadd
+  // GCC-LABEL: define double @test_m_pfadd
   // CHECK: @llvm.x86.3dnow.pfadd
   return _m_pfadd(m1, m2);
 }
 
 __m64 test_m_pfcmpeq(__m64 m1, __m64 m2) {
-  // CHECK-LABEL: define i64 @test_m_pfcmpeq
+  // PS4-LABEL: define i64 @test_m_pfcmpeq
+  // GCC-LABEL: define double @test_m_pfcmpeq
   // CHECK: @llvm.x86.3dnow.pfcmpeq
   return _m_pfcmpeq(m1, m2);
 }
 
 __m64 test_m_pfcmpge(__m64 m1, __m64 m2) {
-  // CHECK-LABEL: define i64 @test_m_pfcmpge
+  // PS4-LABEL: define i64 @test_m_pfcmpge
+  // GCC-LABEL: define double @test_m_pfcmpge
   // CHECK: @llvm.x86.3dnow.pfcmpge
   return _m_pfcmpge(m1, m2);
 }
 
 __m64 test_m_pfcmpgt(__m64 m1, __m64 m2) {
-  // CHECK-LABEL: define i64 @test_m_pfcmpgt
+  // PS4-LABEL: define i64 @test_m_pfcmpgt
+  // GCC-LABEL: define double @test_m_pfcmpgt
   // CHECK: @llvm.x86.3dnow.pfcmpgt
   return _m_pfcmpgt(m1, m2);
 }
 
 __m64 test_m_pfmax(__m64 m1, __m64 m2) {
-  // CHECK-LABEL: define i64 @test_m_pfmax
+  // PS4-LABEL: define i64 @test_m_pfmax
+  // GCC-LABEL: define double @test_m_pfmax
   // CHECK: @llvm.x86.3dnow.pfmax
   return _m_pfmax(m1, m2);
 }
 
 __m64 test_m_pfmin(__m64 m1, __m64 m2) {
-  // CHECK-LABEL: define i64 @test_m_pfmin
+  // PS4-LABEL: define i64 @test_m_pfmin
+  // GCC-LABEL: define double @test_m_pfmin
   // CHECK: @llvm.x86.3dnow.pfmin
   return _m_pfmin(m1, m2);
 }
 
 __m64 test_m_pfmul(__m64 m1, __m64 m2) {
-  // CHECK-LABEL: define i64 @test_m_pfmul
+  // PS4-LABEL: define i64 @test_m_pfmul
+  // GCC-LABEL: define double @test_m_pfmul
   // CHECK: @llvm.x86.3dnow.pfmul
   return _m_pfmul(m1, m2);
 }
 
 __m64 test_m_pfrcp(__m64 m) {
-  // CHECK-LABEL: define i64 @test_m_pfrcp
+  // PS4-LABEL: define i64 @test_m_pfrcp
+  // GCC-LABEL: define double @test_m_pfrcp
   // CHECK: @llvm.x86.3dnow.pfrcp
   return _m_pfrcp(m);
 }
 
 __m64 test_m_pfrcpit1(__m64 m1, __m64 m2) {
-  // CHECK-LABEL: define i64 @test_m_pfrcpit1
+  // PS4-LABEL: define i64 @test_m_pfrcpit1
+  // GCC-LABEL: define double @test_m_pfrcpit1
   // CHECK: @llvm.x86.3dnow.pfrcpit1
   return _m_pfrcpit1(m1, m2);
 }
 
 __m64 test_m_pfrcpit2(__m64 m1, __m64 m2) {
-  // CHECK-LABEL: define i64 @test_m_pfrcpit2
+  // PS4-LABEL: define i64 @test_m_pfrcpit2
+  // GCC-LABEL: define double @test_m_pfrcpit2
   // CHECK: @llvm.x86.3dnow.pfrcpit2
   return _m_pfrcpit2(m1, m2);
 }
 
 __m64 test_m_pfrsqrt(__m64 m) {
-  // CHECK-LABEL: define i64 @test_m_pfrsqrt
+  // PS4-LABEL: define i64 @test_m_pfrsqrt
+  // GCC-LABEL: define double @test_m_pfrsqrt
   // CHECK: @llvm.x86.3dnow.pfrsqrt
   return _m_pfrsqrt(m);
 }
 
 __m64 test_m_pfrsqrtit1(__m64 m1, __m64 m2) {
-  // CHECK-LABEL: define i64 @test_m_pfrsqrtit1
+  // PS4-LABEL: define i64 @test_m_pfrsqrtit1
+  // GCC-LABEL: define double @test_m_pfrsqrtit1
   // CHECK: @llvm.x86.3dnow.pfrsqit1
   return _m_pfrsqrtit1(m1, m2);
 }
 
 __m64 test_m_pfsub(__m64 m1, __m64 m2) {
-  // CHECK-LABEL: define i64 @test_m_pfsub
+  // PS4-LABEL: define i64 @test_m_pfsub
+  // GCC-LABEL: define double @test_m_pfsub
   // CHECK: @llvm.x86.3dnow.pfsub
   return _m_pfsub(m1, m2);
 }
 
 __m64 test_m_pfsubr(__m64 m1, __m64 m2) {
-  // CHECK-LABEL: define i64 @test_m_pfsubr
+  // PS4-LABEL: define i64 @test_m_pfsubr
+  // GCC-LABEL: define double @test_m_pfsubr
   // CHECK: @llvm.x86.3dnow.pfsubr
   return _m_pfsubr(m1, m2);
 }
 
 __m64 test_m_pi2fd(__m64 m) {
-  // CHECK-LABEL: define i64 @test_m_pi2fd
+  // PS4-LABEL: define i64 @test_m_pi2fd
+  // GCC-LABEL: define double @test_m_pi2fd
   // CHECK: @llvm.x86.3dnow.pi2fd
   return _m_pi2fd(m);
 }
 
 __m64 test_m_pmulhrw(__m64 m1, __m64 m2) {
-  // CHECK-LABEL: define i64 @test_m_pmulhrw
+  // PS4-LABEL: define i64 @test_m_pmulhrw
+  // GCC-LABEL: define double @test_m_pmulhrw
   // CHECK: @llvm.x86.3dnow.pmulhrw
   return _m_pmulhrw(m1, m2);
 }
 
 __m64 test_m_pf2iw(__m64 m) {
-  // CHECK-LABEL: define i64 @test_m_pf2iw
+  // PS4-LABEL: define i64 @test_m_pf2iw
+  // GCC-LABEL: define double @test_m_pf2iw
   // CHECK: @llvm.x86.3dnowa.pf2iw
   return _m_pf2iw(m);
 }
 
 __m64 test_m_pfnacc(__m64 m1, __m64 m2) {
-  // CHECK-LABEL: define i64 @test_m_pfnacc
+  // PS4-LABEL: define i64 @test_m_pfnacc
+  // GCC-LABEL: define double @test_m_pfnacc
   // CHECK: @llvm.x86.3dnowa.pfnacc
   return _m_pfnacc(m1, m2);
 }
 
 __m64 test_m_pfpnacc(__m64 m1, __m64 m2) {
-  // CHECK-LABEL: define i64 @test_m_pfpnacc
+  // PS4-LABEL: define i64 @test_m_pfpnacc
+  // GCC-LABEL: define double @test_m_pfpnacc
   // CHECK: @llvm.x86.3dnowa.pfpnacc
   return _m_pfpnacc(m1, m2);
 }
 
 __m64 test_m_pi2fw(__m64 m) {
-  // CHECK-LABEL: define i64 @test_m_pi2fw
+  // PS4-LABEL: define i64 @test_m_pi2fw
+  // GCC-LABEL: define double @test_m_pi2fw
   // CHECK: @llvm.x86.3dnowa.pi2fw
   return _m_pi2fw(m);
 }
 
 __m64 test_m_pswapdsf(__m64 m) {
-  // CHECK-LABEL: define i64 @test_m_pswapdsf
+  // PS4-LABEL: define i64 @test_m_pswapdsf
+  // GCC-LABEL: define double @test_m_pswapdsf
   // CHECK: @llvm.x86.3dnowa.pswapd
   return _m_pswapdsf(m);
 }
 
 __m64 test_m_pswapdsi(__m64 m) {
-  // CHECK-LABEL: define i64 @test_m_pswapdsi
+  // PS4-LABEL: define i64 @test_m_pswapdsi
+  // GCC-LABEL: define double @test_m_pswapdsi
   // CHECK: @llvm.x86.3dnowa.pswapd
   return _m_pswapdsi(m);
 }
diff --git a/test/CodeGen/CFStrings.c b/test/CodeGen/CFStrings.c
new file mode 100644
index 0000000000000..17de39c6aed8d
--- /dev/null
+++ b/test/CodeGen/CFStrings.c
@@ -0,0 +1,51 @@
+// REQUIRES: arm-registered-target,x86-registered-target
+
+// RUN: %clang_cc1 -triple thumbv7-windows -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-COFF
+// RUN: %clang_cc1 -triple i686-windows -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-COFF
+// RUN: %clang_cc1 -triple x86_64-windows -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-COFF
+
+// RUN: %clang_cc1 -triple armv7-elf -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-ELF -check-prefix CHECK-ELF32
+// RUN: %clang_cc1 -triple i686-elf -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-ELF -check-prefix CHECK-ELF32
+// RUN: %clang_cc1 -triple x86_64-elf -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-ELF -check-prefix CHECK-ELF64
+// RUN: %clang_cc1 -triple armv7-elf -S %s -o - | FileCheck %s -check-prefix CHECK-ELF-DATA-SECTION
+
+// RUN: %clang_cc1 -triple armv7-macho -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-MACHO -check-prefix CHECK-MACHO32
+// RUN: %clang_cc1 -triple i386-apple-macosx -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-MACHO -check-prefix CHECK-MACHO32
+// RUN: %clang_cc1 -triple x86_64-macho -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-MACHO -check-prefix CHECK-MACHO64
+
+typedef struct __CFString *CFStringRef;
+const CFStringRef one = (CFStringRef)__builtin___CFStringMakeConstantString("one");
+const CFStringRef two = (CFStringRef)__builtin___CFStringMakeConstantString("\xef\xbf\xbd\x74\xef\xbf\xbd\x77\xef\xbf\xbd\x6f");
+
+// CHECK-COFF: @.str = private unnamed_addr constant [4 x i8] c"one\00", align 1
+// CHECK-ELF: @.str = private unnamed_addr constant [4 x i8] c"one\00", align 1
+// CHECK-MACHO: @.str = private unnamed_addr constant [4 x i8] c"one\00", section "__TEXT,__cstring,cstring_literals", align 1
+
+// CHECK-COFF: @_unnamed_cfstring_ = private constant %struct.__NSConstantString_tag { i32* getelementptr inbounds ([0 x i32], [0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 1992, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 3 }, section "cfstring", align {{[48]}}
+// CHECK-ELF32: @_unnamed_cfstring_ = private constant %struct.__NSConstantString_tag { i32* getelementptr inbounds ([0 x i32], [0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 1992, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 3 }, section "cfstring", align 4
+// CHECK-ELF64: @_unnamed_cfstring_ = private constant %struct.__NSConstantString_tag { i32* getelementptr inbounds ([0 x i32], [0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 1992, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i64 3 }, section "cfstring", align 8
+// CHECK-MACHO32: @_unnamed_cfstring_ = private constant %struct.__NSConstantString_tag { i32* getelementptr inbounds ([0 x i32], [0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 1992, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i32 3 }, section "__DATA,__cfstring", align 4
+// CHECK-MACHO64: @_unnamed_cfstring_ = private constant %struct.__NSConstantString_tag { i32* getelementptr inbounds ([0 x i32], [0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 1992, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i32 0, i32 0), i64 3 }, section "__DATA,__cfstring", align 8
+
+// CHECK-COFF: @.str.1 = private unnamed_addr constant [7 x i16] [i16 -3, i16 116, i16 -3, i16 119, i16 -3, i16 111, i16 0], align 2
+// CHECK-ELF: @.str.1 = private unnamed_addr constant [7 x i16] [i16 -3, i16 116, i16 -3, i16 119, i16 -3, i16 111, i16 0], align 2
+// CHECK-MACHO: @.str.1 = private unnamed_addr constant [7 x i16] [i16 -3, i16 116, i16 -3, i16 119, i16 -3, i16 111, i16 0], section "__TEXT,__ustring", align 2
+
+// CHECK-COFF: @_unnamed_cfstring_.2 = private constant %struct.__NSConstantString_tag { i32* getelementptr inbounds ([0 x i32], [0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 2000, i8* bitcast ([7 x i16]* @.str.1 to i8*), i32 6 }, section "cfstring", align {{[48]}}
+// CHECK-ELF32: @_unnamed_cfstring_.2 = private constant %struct.__NSConstantString_tag { i32* getelementptr inbounds ([0 x i32], [0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 2000, i8* bitcast ([7 x i16]* @.str.1 to i8*), i32 6 }, section "cfstring", align 4
+// CHECK-ELF64: @_unnamed_cfstring_.2 = private constant %struct.__NSConstantString_tag { i32* getelementptr inbounds ([0 x i32], [0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 2000, i8* bitcast ([7 x i16]* @.str.1 to i8*), i64 6 }, section "cfstring", align 8
+// CHECK-MACHO32: @_unnamed_cfstring_.2 = private constant %struct.__NSConstantString_tag { i32* getelementptr inbounds ([0 x i32], [0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 2000, i8* bitcast ([7 x i16]* @.str.1 to i8*), i32 6 }, section "__DATA,__cfstring", align 4
+// CHECK-MACHO64: @_unnamed_cfstring_.2 = private constant %struct.__NSConstantString_tag { i32* getelementptr inbounds ([0 x i32], [0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 2000, i8* bitcast ([7 x i16]* @.str.1 to i8*), i64 6 }, section "__DATA,__cfstring", align 8
+
+// CHECK-ELF-DATA-SECTION: .section .rodata.str1.1
+// CHECK-ELF-DATA-SECTION: .asciz "one"
+
+// CHECK-ELF-DATA-SECTION: .section .rodata.str2.2
+// CHECK-ELF-DATA-SECTION: .short 65533
+// CHECK-ELF-DATA-SECTION: .short 116
+// CHECK-ELF-DATA-SECTION: .short 65533
+// CHECK-ELF-DATA-SECTION: .short 119
+// CHECK-ELF-DATA-SECTION: .short 65533
+// CHECK-ELF-DATA-SECTION: .short 111
+// CHECK-ELF-DATA-SECTION: .short 0
+
diff --git a/test/CodeGen/Inputs/pgo-sample.prof b/test/CodeGen/Inputs/pgo-sample.prof
new file mode 100644
index 0000000000000..c5b8d9ef1a893
--- /dev/null
+++ b/test/CodeGen/Inputs/pgo-sample.prof
@@ -0,0 +1,2 @@
+bar:100:100
+ 1: 2000
diff --git a/test/CodeGen/Inputs/pgotestclang.profraw b/test/CodeGen/Inputs/pgotestclang.profraw
new file mode 100644
index 0000000000000..401ba073493c5
--- /dev/null
+++ b/test/CodeGen/Inputs/pgotestclang.profraw
@@ -0,0 +1 @@
+:fe
diff --git a/test/CodeGen/Inputs/pgotestir.profraw b/test/CodeGen/Inputs/pgotestir.profraw
new file mode 100644
index 0000000000000..04a7c1c1a35aa
--- /dev/null
+++ b/test/CodeGen/Inputs/pgotestir.profraw
@@ -0,0 +1 @@
+:ir
diff --git a/test/CodeGen/Inputs/thinlto_backend.ll b/test/CodeGen/Inputs/thinlto_backend.ll
new file mode 100644
index 0000000000000..78678c0c5add2
--- /dev/null
+++ b/test/CodeGen/Inputs/thinlto_backend.ll
@@ -0,0 +1,6 @@
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @f2() {
+  ret void
+}
diff --git a/test/CodeGen/aarch64-arguments-hfa-v3.c b/test/CodeGen/aarch64-arguments-hfa-v3.c
new file mode 100644
index 0000000000000..59fa5e959ed2f
--- /dev/null
+++ b/test/CodeGen/aarch64-arguments-hfa-v3.c
@@ -0,0 +1,20 @@
+// RUN: %clang_cc1 -triple arm64-apple-ios -target-feature +neon -target-abi darwinpcs -fallow-half-arguments-and-returns -emit-llvm -o - %s | FileCheck %s
+
+typedef __attribute__((__ext_vector_type__(16))) signed char int8x16_t;
+typedef __attribute__((__ext_vector_type__(3))) float float32x3_t;
+
+// CHECK: %struct.HFAv3 = type { [4 x <3 x float>] }
+typedef struct { float32x3_t arr[4]; } HFAv3;
+
+// CHECK: %struct.MixedHFAv3 = type { [3 x <3 x float>], <16 x i8> }
+typedef struct { float32x3_t arr[3]; int8x16_t b; } MixedHFAv3;
+
+// CHECK: define %struct.HFAv3 @test([4 x <4 x float>] %{{.*}}, [4 x <4 x float>] %{{.*}}, [4 x <4 x float>] %{{.*}})
+HFAv3 test(HFAv3 a0, HFAv3 a1, HFAv3 a2) {
+  return a2;
+}
+
+// CHECK: define %struct.MixedHFAv3 @test_mixed([4 x <4 x float>] %{{.*}}, [4 x <4 x float>] %{{.*}}, [4 x <4 x float>] %{{.*}})
+MixedHFAv3 test_mixed(MixedHFAv3 a0, MixedHFAv3 a1, MixedHFAv3 a2) {
+  return a2;
+}
diff --git a/test/CodeGen/aarch64-fix-cortex-a53-835769.c b/test/CodeGen/aarch64-fix-cortex-a53-835769.c
index 7ad124012e6de..c6a38b20074ac 100644
--- a/test/CodeGen/aarch64-fix-cortex-a53-835769.c
+++ b/test/CodeGen/aarch64-fix-cortex-a53-835769.c
@@ -23,5 +23,5 @@ int64_t f_load_madd_64(int64_t a, int64_t b, int64_t *c) {
 
 // CHECK: ldr
 // CHECK-YES-NEXT: nop
-// CHECK-NO-NEXT-NOT: nop
+// CHECK-NO-NOT: nop
 // CHECK-NEXT: madd
diff --git a/test/CodeGen/aarch64-neon-2velem.c b/test/CodeGen/aarch64-neon-2velem.c
index fa910ff78251d..36500f62a5d9b 100644
--- a/test/CodeGen/aarch64-neon-2velem.c
+++ b/test/CodeGen/aarch64-neon-2velem.c
@@ -1,2452 +1,5011 @@
-// REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -S -O3 -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
 
 // Test new aarch64 intrinsics and types
 
 #include <arm_neon.h>
 
+// CHECK-LABEL: define <4 x i16> @test_vmla_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[ADD]]
 int16x4_t test_vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmla_lane_s16
   return vmla_lane_s16(a, b, v, 3);
-  // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[ADD]]
 int16x8_t test_vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlaq_lane_s16
   return vmlaq_lane_s16(a, b, v, 3);
-  // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmla_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[ADD]]
 int32x2_t test_vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmla_lane_s32
   return vmla_lane_s32(a, b, v, 1);
-  // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlaq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlaq_lane_s32
   return vmlaq_lane_s32(a, b, v, 1);
-  // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmla_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[ADD]]
 int16x4_t test_vmla_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmla_laneq_s16
   return vmla_laneq_s16(a, b, v, 7);
-  // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmlaq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[ADD]]
 int16x8_t test_vmlaq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlaq_laneq_s16
   return vmlaq_laneq_s16(a, b, v, 7);
-  // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmla_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[ADD]]
 int32x2_t test_vmla_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmla_laneq_s32
   return vmla_laneq_s32(a, b, v, 3);
-  // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlaq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlaq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlaq_laneq_s32
   return vmlaq_laneq_s32(a, b, v, 3);
-  // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmls_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[SUB]]
 int16x4_t test_vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmls_lane_s16
   return vmls_lane_s16(a, b, v, 3);
-  // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[SUB]]
 int16x8_t test_vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlsq_lane_s16
   return vmlsq_lane_s16(a, b, v, 3);
-  // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmls_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[SUB]]
 int32x2_t test_vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmls_lane_s32
   return vmls_lane_s32(a, b, v, 1);
-  // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlsq_lane_s32
   return vmlsq_lane_s32(a, b, v, 1);
-  // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmls_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[SUB]]
 int16x4_t test_vmls_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmls_laneq_s16
   return vmls_laneq_s16(a, b, v, 7);
-  // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmlsq_laneq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[SUB]]
 int16x8_t test_vmlsq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlsq_laneq_s16
   return vmlsq_laneq_s16(a, b, v, 7);
-  // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmls_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[SUB]]
 int32x2_t test_vmls_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmls_laneq_s32
   return vmls_laneq_s32(a, b, v, 3);
-  // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsq_laneq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlsq_laneq_s32
   return vmlsq_laneq_s32(a, b, v, 3);
-  // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmul_lane_s16(<4 x i16> %a, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i16> [[MUL]]
 int16x4_t test_vmul_lane_s16(int16x4_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vmul_lane_s16
   return vmul_lane_s16(a, v, 3);
-  // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmulq_lane_s16(<8 x i16> %a, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <8 x i16> [[MUL]]
 int16x8_t test_vmulq_lane_s16(int16x8_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vmulq_lane_s16
   return vmulq_lane_s16(a, v, 3);
-  // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmul_lane_s32(<2 x i32> %a, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <2 x i32> [[MUL]]
 int32x2_t test_vmul_lane_s32(int32x2_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vmul_lane_s32
   return vmul_lane_s32(a, v, 1);
-  // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmulq_lane_s32(<4 x i32> %a, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i32> [[MUL]]
 int32x4_t test_vmulq_lane_s32(int32x4_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vmulq_lane_s32
   return vmulq_lane_s32(a, v, 1);
-  // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmul_lane_u16(<4 x i16> %a, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i16> [[MUL]]
 uint16x4_t test_vmul_lane_u16(uint16x4_t a, uint16x4_t v) {
-  // CHECK-LABEL: test_vmul_lane_u16
   return vmul_lane_u16(a, v, 3);
-  // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmulq_lane_u16(<8 x i16> %a, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <8 x i16> [[MUL]]
 uint16x8_t test_vmulq_lane_u16(uint16x8_t a, uint16x4_t v) {
-  // CHECK-LABEL: test_vmulq_lane_u16
   return vmulq_lane_u16(a, v, 3);
-  // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmul_lane_u32(<2 x i32> %a, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <2 x i32> [[MUL]]
 uint32x2_t test_vmul_lane_u32(uint32x2_t a, uint32x2_t v) {
-  // CHECK-LABEL: test_vmul_lane_u32
   return vmul_lane_u32(a, v, 1);
-  // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmulq_lane_u32(<4 x i32> %a, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i32> [[MUL]]
 uint32x4_t test_vmulq_lane_u32(uint32x4_t a, uint32x2_t v) {
-  // CHECK-LABEL: test_vmulq_lane_u32
   return vmulq_lane_u32(a, v, 1);
-  // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmul_laneq_s16(<4 x i16> %a, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i16> [[MUL]]
 int16x4_t test_vmul_laneq_s16(int16x4_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vmul_laneq_s16
   return vmul_laneq_s16(a, v, 7);
-  // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmulq_laneq_s16(<8 x i16> %a, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <8 x i16> [[MUL]]
 int16x8_t test_vmulq_laneq_s16(int16x8_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vmulq_laneq_s16
   return vmulq_laneq_s16(a, v, 7);
-  // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmul_laneq_s32(<2 x i32> %a, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <2 x i32> [[MUL]]
 int32x2_t test_vmul_laneq_s32(int32x2_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vmul_laneq_s32
   return vmul_laneq_s32(a, v, 3);
-  // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmulq_laneq_s32(<4 x i32> %a, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i32> [[MUL]]
 int32x4_t test_vmulq_laneq_s32(int32x4_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vmulq_laneq_s32
   return vmulq_laneq_s32(a, v, 3);
-  // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmul_laneq_u16(<4 x i16> %a, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i16> [[MUL]]
 uint16x4_t test_vmul_laneq_u16(uint16x4_t a, uint16x8_t v) {
-  // CHECK-LABEL: test_vmul_laneq_u16
   return vmul_laneq_u16(a, v, 7);
-  // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmulq_laneq_u16(<8 x i16> %a, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <8 x i16> [[MUL]]
 uint16x8_t test_vmulq_laneq_u16(uint16x8_t a, uint16x8_t v) {
-  // CHECK-LABEL: test_vmulq_laneq_u16
   return vmulq_laneq_u16(a, v, 7);
-  // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmul_laneq_u32(<2 x i32> %a, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <2 x i32> [[MUL]]
 uint32x2_t test_vmul_laneq_u32(uint32x2_t a, uint32x4_t v) {
-  // CHECK-LABEL: test_vmul_laneq_u32
   return vmul_laneq_u32(a, v, 3);
-  // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmulq_laneq_u32(<4 x i32> %a, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i32> [[MUL]]
 uint32x4_t test_vmulq_laneq_u32(uint32x4_t a, uint32x4_t v) {
-  // CHECK-LABEL: test_vmulq_laneq_u32
   return vmulq_laneq_u32(a, v, 3);
-  // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vfma_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> <i32 1, i32 1>
+// CHECK:   [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
+// CHECK:   ret <2 x float> [[FMLA2]]
 float32x2_t test_vfma_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
-  // CHECK-LABEL: test_vfma_lane_f32
   return vfma_lane_f32(a, b, v, 1);
-  // CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vfmaq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
+// CHECK:   ret <4 x float> [[FMLA2]]
 float32x4_t test_vfmaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
-  // CHECK-LABEL: test_vfmaq_lane_f32
   return vfmaq_lane_f32(a, b, v, 1);
-  // CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vfma_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
+// CHECK:   ret <2 x float> [[TMP6]]
 float32x2_t test_vfma_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vfma_laneq_f32
   return vfma_laneq_f32(a, b, v, 3);
-  // CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vfmaq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
+// CHECK:   ret <4 x float> [[TMP6]]
 float32x4_t test_vfmaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vfmaq_laneq_f32
   return vfmaq_laneq_f32(a, b, v, 3);
-  // CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vfms_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 {
+// CHECK:   [[SUB:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SUB]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> <i32 1, i32 1>
+// CHECK:   [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
+// CHECK:   ret <2 x float> [[FMLA2]]
 float32x2_t test_vfms_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
-  // CHECK-LABEL: test_vfms_lane_f32
   return vfms_lane_f32(a, b, v, 1);
-  // CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vfmsq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 {
+// CHECK:   [[SUB:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SUB]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
+// CHECK:   ret <4 x float> [[FMLA2]]
 float32x4_t test_vfmsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
-  // CHECK-LABEL: test_vfmsq_lane_f32
   return vfmsq_lane_f32(a, b, v, 1);
-  // CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vfms_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) #0 {
+// CHECK:   [[SUB:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SUB]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
+// CHECK:   ret <2 x float> [[TMP6]]
 float32x2_t test_vfms_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vfms_laneq_f32
   return vfms_laneq_f32(a, b, v, 3);
-  // CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vfmsq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 {
+// CHECK:   [[SUB:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SUB]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
+// CHECK:   ret <4 x float> [[TMP6]]
 float32x4_t test_vfmsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vfmsq_laneq_f32
   return vfmsq_laneq_f32(a, b, v, 3);
-  // CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <2 x double> @test_vfmaq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <1 x double> %v to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
+// CHECK:   [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK:   [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[FMLA2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FMLA]], <2 x double> [[LANE]], <2 x double> [[FMLA1]])
+// CHECK:   ret <2 x double> [[FMLA2]]
 float64x2_t test_vfmaq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
-  // CHECK-LABEL: test_vfmaq_lane_f64
   return vfmaq_lane_f64(a, b, v, 0);
-  // CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 }
 
+// CHECK-LABEL: define <2 x double> @test_vfmaq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
+// CHECK:   ret <2 x double> [[TMP6]]
 float64x2_t test_vfmaq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {
-  // CHECK-LABEL: test_vfmaq_laneq_f64
   return vfmaq_laneq_f64(a, b, v, 1);
-  // CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
 }
 
+// CHECK-LABEL: define <2 x double> @test_vfmsq_lane_f64(<2 x double> %a, <2 x double> %b, <1 x double> %v) #0 {
+// CHECK:   [[SUB:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %b
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> [[SUB]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <1 x double> %v to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
+// CHECK:   [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK:   [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[FMLA2:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[FMLA]], <2 x double> [[LANE]], <2 x double> [[FMLA1]])
+// CHECK:   ret <2 x double> [[FMLA2]]
 float64x2_t test_vfmsq_lane_f64(float64x2_t a, float64x2_t b, float64x1_t v) {
-  // CHECK-LABEL: test_vfmsq_lane_f64
   return vfmsq_lane_f64(a, b, v, 0);
-  // CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 }
 
+// CHECK-LABEL: define <2 x double> @test_vfmsq_laneq_f64(<2 x double> %a, <2 x double> %b, <2 x double> %v) #0 {
+// CHECK:   [[SUB:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %b
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> [[SUB]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
+// CHECK:   ret <2 x double> [[TMP6]]
 float64x2_t test_vfmsq_laneq_f64(float64x2_t a, float64x2_t b, float64x2_t v) {
-  // CHECK-LABEL: test_vfmsq_laneq_f64
   return vfmsq_laneq_f64(a, b, v, 1);
-  // CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
 }
 
+// CHECK-LABEL: define float @test_vfmas_laneq_f32(float %a, float %b, <4 x float> %v) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %v to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[EXTRACT:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+// CHECK:   [[TMP2:%.*]] = call float @llvm.fma.f32(float %b, float [[EXTRACT]], float %a)
+// CHECK:   ret float [[TMP2]]
 float32_t test_vfmas_laneq_f32(float32_t a, float32_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vfmas_laneq_f32
   return vfmas_laneq_f32(a, b, v, 3);
-  // CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define double @test_vfmsd_lane_f64(double %a, double %b, <1 x double> %v) #0 {
+// CHECK:   [[SUB:%.*]] = fsub double -0.000000e+00, %b
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %v to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[EXTRACT:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
+// CHECK:   [[TMP2:%.*]] = call double @llvm.fma.f64(double [[SUB]], double [[EXTRACT]], double %a)
+// CHECK:   ret double [[TMP2]]
 float64_t test_vfmsd_lane_f64(float64_t a, float64_t b, float64x1_t v) {
-  // CHECK-LABEL: test_vfmsd_lane_f64
   return vfmsd_lane_f64(a, b, v, 0);
-  // CHECK: {{fmls d[0-9]+, d[0-9]+, v[0-9]+\.d\[0\]|fmsub d[0-9]+, d[0-9]+, d[0-9]+}}
 }
 
+// CHECK-LABEL: define float @test_vfmss_laneq_f32(float %a, float %b, <4 x float> %v) #0 {
+// CHECK:   [[SUB:%.*]] = fsub float -0.000000e+00, %b
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %v to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[EXTRACT:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+// CHECK:   [[TMP2:%.*]] = call float @llvm.fma.f32(float [[SUB]], float [[EXTRACT]], float %a)
+// CHECK:   ret float [[TMP2]]
 float32_t test_vfmss_laneq_f32(float32_t a, float32_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vfmss_laneq_f32
   return vfmss_laneq_f32(a, b, v, 3);
-  // CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define double @test_vfmsd_laneq_f64(double %a, double %b, <2 x double> %v) #0 {
+// CHECK:   [[SUB:%.*]] = fsub double -0.000000e+00, %b
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %v to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+// CHECK:   [[TMP2:%.*]] = call double @llvm.fma.f64(double [[SUB]], double [[EXTRACT]], double %a)
+// CHECK:   ret double [[TMP2]]
 float64_t test_vfmsd_laneq_f64(float64_t a, float64_t b, float64x2_t v) {
-  // CHECK-LABEL: test_vfmsd_laneq_f64
   return vfmsd_laneq_f64(a, b, v, 1);
-  // CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlal_lane_s16
   return vmlal_lane_s16(a, b, v, 3);
-  // CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[ADD]]
 int64x2_t test_vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlal_lane_s32
   return vmlal_lane_s32(a, b, v, 1);
-  // CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlal_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlal_laneq_s16
   return vmlal_laneq_s16(a, b, v, 7);
-  // CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlal_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[ADD]]
 int64x2_t test_vmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlal_laneq_s32
   return vmlal_laneq_s32(a, b, v, 3);
-  // CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlal_high_lane_s16
   return vmlal_high_lane_s16(a, b, v, 3);
-  // CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[ADD]]
 int64x2_t test_vmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlal_high_lane_s32
   return vmlal_high_lane_s32(a, b, v, 1);
-  // CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlal_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlal_high_laneq_s16
   return vmlal_high_laneq_s16(a, b, v, 7);
-  // CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlal_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[ADD]]
 int64x2_t test_vmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlal_high_laneq_s32
   return vmlal_high_laneq_s32(a, b, v, 3);
-  // CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlsl_lane_s16
   return vmlsl_lane_s16(a, b, v, 3);
-  // CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[SUB]]
 int64x2_t test_vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlsl_lane_s32
   return vmlsl_lane_s32(a, b, v, 1);
-  // CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlsl_laneq_s16
   return vmlsl_laneq_s16(a, b, v, 7);
-  // CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[SUB]]
 int64x2_t test_vmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlsl_laneq_s32
   return vmlsl_laneq_s32(a, b, v, 3);
-  // CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlsl_high_lane_s16
   return vmlsl_high_lane_s16(a, b, v, 3);
-  // CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[SUB]]
 int64x2_t test_vmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlsl_high_lane_s32
   return vmlsl_high_lane_s32(a, b, v, 1);
-  // CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlsl_high_laneq_s16
   return vmlsl_high_laneq_s16(a, b, v, 7);
-  // CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[SUB]]
 int64x2_t test_vmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlsl_high_laneq_s32
   return vmlsl_high_laneq_s32(a, b, v, 3);
-  // CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlal_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlal_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlal_lane_u16
   return vmlal_lane_u16(a, b, v, 3);
-  // CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlal_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[ADD]]
 int64x2_t test_vmlal_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlal_lane_u32
   return vmlal_lane_u32(a, b, v, 1);
-  // CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlal_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlal_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlal_laneq_u16
   return vmlal_laneq_u16(a, b, v, 7);
-  // CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlal_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[ADD]]
 int64x2_t test_vmlal_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlal_laneq_u32
   return vmlal_laneq_u32(a, b, v, 3);
-  // CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlal_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlal_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlal_high_lane_u16
   return vmlal_high_lane_u16(a, b, v, 3);
-  // CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlal_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[ADD]]
 int64x2_t test_vmlal_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlal_high_lane_u32
   return vmlal_high_lane_u32(a, b, v, 1);
-  // CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlal_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlal_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlal_high_laneq_u16
   return vmlal_high_laneq_u16(a, b, v, 7);
-  // CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlal_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[ADD]]
 int64x2_t test_vmlal_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlal_high_laneq_u32
   return vmlal_high_laneq_u32(a, b, v, 3);
-  // CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsl_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlsl_lane_u16
   return vmlsl_lane_u16(a, b, v, 3);
-  // CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[SUB]]
 int64x2_t test_vmlsl_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlsl_lane_u32
   return vmlsl_lane_u32(a, b, v, 1);
-  // CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsl_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlsl_laneq_u16
   return vmlsl_laneq_u16(a, b, v, 7);
-  // CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[SUB]]
 int64x2_t test_vmlsl_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlsl_laneq_u32
   return vmlsl_laneq_u32(a, b, v, 3);
-  // CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_lane_u16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsl_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlsl_high_lane_u16
   return vmlsl_high_lane_u16(a, b, v, 3);
-  // CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_lane_u32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[SUB]]
 int64x2_t test_vmlsl_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlsl_high_lane_u32
   return vmlsl_high_lane_u32(a, b, v, 1);
-  // CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsl_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlsl_high_laneq_u16
   return vmlsl_high_laneq_u16(a, b, v, 7);
-  // CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[SUB]]
 int64x2_t test_vmlsl_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlsl_high_laneq_u32
   return vmlsl_high_laneq_u32(a, b, v, 3);
-  // CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmull_lane_s16(<4 x i16> %a, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vmull_lane_s16
   return vmull_lane_s16(a, v, 3);
-  // CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmull_lane_s32(<2 x i32> %a, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vmull_lane_s32
   return vmull_lane_s32(a, v, 1);
-  // CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmull_lane_u16(<4 x i16> %a, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t v) {
-  // CHECK-LABEL: test_vmull_lane_u16
   return vmull_lane_u16(a, v, 3);
-  // CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmull_lane_u32(<2 x i32> %a, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t v) {
-  // CHECK-LABEL: test_vmull_lane_u32
   return vmull_lane_u32(a, v, 1);
-  // CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 int32x4_t test_vmull_high_lane_s16(int16x8_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vmull_high_lane_s16
   return vmull_high_lane_s16(a, v, 3);
-  // CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 int64x2_t test_vmull_high_lane_s32(int32x4_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vmull_high_lane_s32
   return vmull_high_lane_s32(a, v, 1);
-  // CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmull_high_lane_u16(<8 x i16> %a, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 uint32x4_t test_vmull_high_lane_u16(uint16x8_t a, uint16x4_t v) {
-  // CHECK-LABEL: test_vmull_high_lane_u16
   return vmull_high_lane_u16(a, v, 3);
-  // CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmull_high_lane_u32(<4 x i32> %a, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 uint64x2_t test_vmull_high_lane_u32(uint32x4_t a, uint32x2_t v) {
-  // CHECK-LABEL: test_vmull_high_lane_u32
   return vmull_high_lane_u32(a, v, 1);
-  // CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 int32x4_t test_vmull_laneq_s16(int16x4_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vmull_laneq_s16
   return vmull_laneq_s16(a, v, 7);
-  // CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 int64x2_t test_vmull_laneq_s32(int32x2_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vmull_laneq_s32
   return vmull_laneq_s32(a, v, 3);
-  // CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmull_laneq_u16(<4 x i16> %a, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 uint32x4_t test_vmull_laneq_u16(uint16x4_t a, uint16x8_t v) {
-  // CHECK-LABEL: test_vmull_laneq_u16
   return vmull_laneq_u16(a, v, 7);
-  // CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmull_laneq_u32(<2 x i32> %a, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 uint64x2_t test_vmull_laneq_u32(uint32x2_t a, uint32x4_t v) {
-  // CHECK-LABEL: test_vmull_laneq_u32
   return vmull_laneq_u32(a, v, 3);
-  // CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 int32x4_t test_vmull_high_laneq_s16(int16x8_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vmull_high_laneq_s16
   return vmull_high_laneq_s16(a, v, 7);
-  // CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 int64x2_t test_vmull_high_laneq_s32(int32x4_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vmull_high_laneq_s32
   return vmull_high_laneq_s32(a, v, 3);
-  // CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmull_high_laneq_u16(<8 x i16> %a, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 uint32x4_t test_vmull_high_laneq_u16(uint16x8_t a, uint16x8_t v) {
-  // CHECK-LABEL: test_vmull_high_laneq_u16
   return vmull_high_laneq_u16(a, v, 7);
-  // CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmull_high_laneq_u32(<4 x i32> %a, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 uint64x2_t test_vmull_high_laneq_u32(uint32x4_t a, uint32x4_t v) {
-  // CHECK-LABEL: test_vmull_high_laneq_u32
   return vmull_high_laneq_u32(a, v, 3);
-  // CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
 int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vqdmlal_lane_s16
   return vqdmlal_lane_s16(a, b, v, 3);
-  // CHECK: sqdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
 int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vqdmlal_lane_s32
   return vqdmlal_lane_s32(a, b, v, 1);
-  // CHECK: sqdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmlal_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
 int32x4_t test_vqdmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vqdmlal_high_lane_s16
   return vqdmlal_high_lane_s16(a, b, v, 3);
-  // CHECK: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmlal_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
 int64x2_t test_vqdmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vqdmlal_high_lane_s32
   return vqdmlal_high_lane_s32(a, b, v, 1);
-  // CHECK: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
 int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vqdmlsl_lane_s16
   return vqdmlsl_lane_s16(a, b, v, 3);
-  // CHECK: sqdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
 int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vqdmlsl_lane_s32
   return vqdmlsl_lane_s32(a, b, v, 1);
-  // CHECK: sqdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_high_lane_s16(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
 int32x4_t test_vqdmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vqdmlsl_high_lane_s16
   return vqdmlsl_high_lane_s16(a, b, v, 3);
-  // CHECK: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_high_lane_s32(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
 int64x2_t test_vqdmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vqdmlsl_high_lane_s32
   return vqdmlsl_high_lane_s32(a, b, v, 1);
-  // CHECK: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmull_lane_s16(<4 x i16> %a, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqdmull_lane_s16(int16x4_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vqdmull_lane_s16
   return vqdmull_lane_s16(a, v, 3);
-  // CHECK: sqdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmull_lane_s32(<2 x i32> %a, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 int64x2_t test_vqdmull_lane_s32(int32x2_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vqdmull_lane_s32
   return vqdmull_lane_s32(a, v, 1);
-  // CHECK: sqdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqdmull_laneq_s16(int16x4_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vqdmull_laneq_s16
   return vqdmull_laneq_s16(a, v, 3);
-  // CHECK: sqdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 int64x2_t test_vqdmull_laneq_s32(int32x2_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vqdmull_laneq_s32
   return vqdmull_laneq_s32(a, v, 3);
-  // CHECK: sqdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmull_high_lane_s16(<8 x i16> %a, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqdmull_high_lane_s16(int16x8_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vqdmull_high_lane_s16
   return vqdmull_high_lane_s16(a, v, 3);
-  // CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmull_high_lane_s32(<4 x i32> %a, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 int64x2_t test_vqdmull_high_lane_s32(int32x4_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vqdmull_high_lane_s32
   return vqdmull_high_lane_s32(a, v, 1);
-  // CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqdmull_high_laneq_s16(int16x8_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vqdmull_high_laneq_s16
   return vqdmull_high_laneq_s16(a, v, 7);
-  // CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 int64x2_t test_vqdmull_high_laneq_s32(int32x4_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vqdmull_high_laneq_s32
   return vqdmull_high_laneq_s32(a, v, 3);
-  // CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vqdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]]) #2
+// CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vqdmulh_lane_s16
   return vqdmulh_lane_s16(a, v, 3);
-  // CHECK: sqdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]]) #2
+// CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vqdmulhq_lane_s16
   return vqdmulhq_lane_s16(a, v, 3);
-  // CHECK: sqdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vqdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]]) #2
+// CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vqdmulh_lane_s32
   return vqdmulh_lane_s32(a, v, 1);
-  // CHECK: sqdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]]) #2
+// CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqdmulhq_lane_s32(int32x4_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vqdmulhq_lane_s32
   return vqdmulhq_lane_s32(a, v, 1);
-  // CHECK: sqdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_lane_s16(<4 x i16> %a, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]]) #2
+// CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vqrdmulh_lane_s16
   return vqrdmulh_lane_s16(a, v, 3);
-  // CHECK: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V1_I]]) #2
+// CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vqrdmulhq_lane_s16
   return vqrdmulhq_lane_s16(a, v, 3);
-  // CHECK: sqrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_lane_s32(<2 x i32> %a, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]]) #2
+// CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vqrdmulh_lane_s32
   return vqrdmulh_lane_s32(a, v, 1);
-  // CHECK: sqrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]]) #2
+// CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vqrdmulhq_lane_s32
   return vqrdmulhq_lane_s32(a, v, 1);
-  // CHECK: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vmul_lane_f32(<2 x float> %a, <2 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]]
+// CHECK:   ret <2 x float> [[MUL]]
 float32x2_t test_vmul_lane_f32(float32x2_t a, float32x2_t v) {
-  // CHECK-LABEL: test_vmul_lane_f32
   return vmul_lane_f32(a, v, 1);
-  // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
 
+// CHECK-LABEL: define <1 x double> @test_vmul_lane_f64(<1 x double> %a, <1 x double> %v) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %v to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK:   [[EXTRACT:%.*]] = extractelement <1 x double> [[TMP3]], i32 0
+// CHECK:   [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]]
+// CHECK:   [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double>
+// CHECK:   ret <1 x double> [[TMP5]]
 float64x1_t test_vmul_lane_f64(float64x1_t a, float64x1_t v) {
-  // CHECK-LABEL: test_vmul_lane_f64
   return vmul_lane_f64(a, v, 0);
-  // CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+\.d\[0\]|d[0-9]+}}
 }
 
 
+// CHECK-LABEL: define <4 x float> @test_vmulq_lane_f32(<4 x float> %a, <2 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x float> [[MUL]]
 float32x4_t test_vmulq_lane_f32(float32x4_t a, float32x2_t v) {
-  // CHECK-LABEL: test_vmulq_lane_f32
   return vmulq_lane_f32(a, v, 1);
-  // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <2 x double> @test_vmulq_lane_f64(<2 x double> %a, <1 x double> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x double> %v, <1 x double> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = fmul <2 x double> %a, [[SHUFFLE]]
+// CHECK:   ret <2 x double> [[MUL]]
 float64x2_t test_vmulq_lane_f64(float64x2_t a, float64x1_t v) {
-  // CHECK-LABEL: test_vmulq_lane_f64
   return vmulq_lane_f64(a, v, 0);
-  // CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vmul_laneq_f32(<2 x float> %a, <4 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]]
+// CHECK:   ret <2 x float> [[MUL]]
 float32x2_t test_vmul_laneq_f32(float32x2_t a, float32x4_t v) {
-  // CHECK-LABEL: test_vmul_laneq_f32
   return vmul_laneq_f32(a, v, 3);
-  // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <1 x double> @test_vmul_laneq_f64(<1 x double> %a, <2 x double> %v) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %v to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
+// CHECK:   [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]]
+// CHECK:   [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double>
+// CHECK:   ret <1 x double> [[TMP5]]
 float64x1_t test_vmul_laneq_f64(float64x1_t a, float64x2_t v) {
-  // CHECK-LABEL: test_vmul_laneq_f64
   return vmul_laneq_f64(a, v, 1);
-  // CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
 }
 
 
+// CHECK-LABEL: define <4 x float> @test_vmulq_laneq_f32(<4 x float> %a, <4 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x float> [[MUL]]
 float32x4_t test_vmulq_laneq_f32(float32x4_t a, float32x4_t v) {
-  // CHECK-LABEL: test_vmulq_laneq_f32
   return vmulq_laneq_f32(a, v, 3);
-  // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <2 x double> @test_vmulq_laneq_f64(<2 x double> %a, <2 x double> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x double> %v, <2 x double> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = fmul <2 x double> %a, [[SHUFFLE]]
+// CHECK:   ret <2 x double> [[MUL]]
 float64x2_t test_vmulq_laneq_f64(float64x2_t a, float64x2_t v) {
-  // CHECK-LABEL: test_vmulq_laneq_f64
   return vmulq_laneq_f64(a, v, 1);
-  // CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vmulx_lane_f32(<2 x float> %a, <2 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VMULX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[VMULX_I]], <2 x float> [[VMULX1_I]]) #2
+// CHECK:   ret <2 x float> [[VMULX2_I]]
 float32x2_t test_vmulx_lane_f32(float32x2_t a, float32x2_t v) {
-  // CHECK-LABEL: test_vmulx_lane_f32
   return vmulx_lane_f32(a, v, 1);
-  // CHECK: fmulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vmulxq_lane_f32(<4 x float> %a, <2 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[VMULX_I]], <4 x float> [[VMULX1_I]]) #2
+// CHECK:   ret <4 x float> [[VMULX2_I]]
 float32x4_t test_vmulxq_lane_f32(float32x4_t a, float32x2_t v) {
-  // CHECK-LABEL: test_vmulxq_lane_f32
   return vmulxq_lane_f32(a, v, 1);
-  // CHECK: fmulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <2 x double> @test_vmulxq_lane_f64(<2 x double> %a, <1 x double> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x double> %v, <1 x double> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[VMULX_I]], <2 x double> [[VMULX1_I]]) #2
+// CHECK:   ret <2 x double> [[VMULX2_I]]
 float64x2_t test_vmulxq_lane_f64(float64x2_t a, float64x1_t v) {
-  // CHECK-LABEL: test_vmulxq_lane_f64
   return vmulxq_lane_f64(a, v, 0);
-  // CHECK: fmulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vmulx_laneq_f32(<2 x float> %a, <4 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VMULX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[VMULX_I]], <2 x float> [[VMULX1_I]]) #2
+// CHECK:   ret <2 x float> [[VMULX2_I]]
 float32x2_t test_vmulx_laneq_f32(float32x2_t a, float32x4_t v) {
-  // CHECK-LABEL: test_vmulx_laneq_f32
   return vmulx_laneq_f32(a, v, 3);
-  // CHECK: fmulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vmulxq_laneq_f32(<4 x float> %a, <4 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[VMULX_I]], <4 x float> [[VMULX1_I]]) #2
+// CHECK:   ret <4 x float> [[VMULX2_I]]
 float32x4_t test_vmulxq_laneq_f32(float32x4_t a, float32x4_t v) {
-  // CHECK-LABEL: test_vmulxq_laneq_f32
   return vmulxq_laneq_f32(a, v, 3);
-  // CHECK: fmulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <2 x double> @test_vmulxq_laneq_f64(<2 x double> %a, <2 x double> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x double> %v, <2 x double> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[VMULX_I]], <2 x double> [[VMULX1_I]]) #2
+// CHECK:   ret <2 x double> [[VMULX2_I]]
 float64x2_t test_vmulxq_laneq_f64(float64x2_t a, float64x2_t v) {
-  // CHECK-LABEL: test_vmulxq_laneq_f64
   return vmulxq_laneq_f64(a, v, 1);
-  // CHECK: fmulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmla_lane_s16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[ADD]]
 int16x4_t test_vmla_lane_s16_0(int16x4_t a, int16x4_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmla_lane_s16_0
   return vmla_lane_s16(a, b, v, 0);
-  // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_s16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[ADD]]
 int16x8_t test_vmlaq_lane_s16_0(int16x8_t a, int16x8_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlaq_lane_s16_0
   return vmlaq_lane_s16(a, b, v, 0);
-  // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmla_lane_s32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[ADD]]
 int32x2_t test_vmla_lane_s32_0(int32x2_t a, int32x2_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmla_lane_s32_0
   return vmla_lane_s32(a, b, v, 0);
-  // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlaq_lane_s32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlaq_lane_s32_0(int32x4_t a, int32x4_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlaq_lane_s32_0
   return vmlaq_lane_s32(a, b, v, 0);
-  // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmla_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[ADD]]
 int16x4_t test_vmla_laneq_s16_0(int16x4_t a, int16x4_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmla_laneq_s16_0
   return vmla_laneq_s16(a, b, v, 0);
-  // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmlaq_laneq_s16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[ADD]]
 int16x8_t test_vmlaq_laneq_s16_0(int16x8_t a, int16x8_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlaq_laneq_s16_0
   return vmlaq_laneq_s16(a, b, v, 0);
-  // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmla_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[ADD]]
 int32x2_t test_vmla_laneq_s32_0(int32x2_t a, int32x2_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmla_laneq_s32_0
   return vmla_laneq_s32(a, b, v, 0);
-  // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlaq_laneq_s32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlaq_laneq_s32_0(int32x4_t a, int32x4_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlaq_laneq_s32_0
   return vmlaq_laneq_s32(a, b, v, 0);
-  // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmls_lane_s16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[SUB]]
 int16x4_t test_vmls_lane_s16_0(int16x4_t a, int16x4_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmls_lane_s16_0
   return vmls_lane_s16(a, b, v, 0);
-  // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_s16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[SUB]]
 int16x8_t test_vmlsq_lane_s16_0(int16x8_t a, int16x8_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlsq_lane_s16_0
   return vmlsq_lane_s16(a, b, v, 0);
-  // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmls_lane_s32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[SUB]]
 int32x2_t test_vmls_lane_s32_0(int32x2_t a, int32x2_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmls_lane_s32_0
   return vmls_lane_s32(a, b, v, 0);
-  // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_s32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsq_lane_s32_0(int32x4_t a, int32x4_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlsq_lane_s32_0
   return vmlsq_lane_s32(a, b, v, 0);
-  // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmls_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[SUB]]
 int16x4_t test_vmls_laneq_s16_0(int16x4_t a, int16x4_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmls_laneq_s16_0
   return vmls_laneq_s16(a, b, v, 0);
-  // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmlsq_laneq_s16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[SUB]]
 int16x8_t test_vmlsq_laneq_s16_0(int16x8_t a, int16x8_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlsq_laneq_s16_0
   return vmlsq_laneq_s16(a, b, v, 0);
-  // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmls_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[SUB]]
 int32x2_t test_vmls_laneq_s32_0(int32x2_t a, int32x2_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmls_laneq_s32_0
   return vmls_laneq_s32(a, b, v, 0);
-  // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsq_laneq_s32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsq_laneq_s32_0(int32x4_t a, int32x4_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlsq_laneq_s32_0
   return vmlsq_laneq_s32(a, b, v, 0);
-  // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmul_lane_s16_0(<4 x i16> %a, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i16> [[MUL]]
 int16x4_t test_vmul_lane_s16_0(int16x4_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vmul_lane_s16_0
   return vmul_lane_s16(a, v, 0);
-  // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmulq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <8 x i16> [[MUL]]
 int16x8_t test_vmulq_lane_s16_0(int16x8_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vmulq_lane_s16_0
   return vmulq_lane_s16(a, v, 0);
-  // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmul_lane_s32_0(<2 x i32> %a, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <2 x i32> [[MUL]]
 int32x2_t test_vmul_lane_s32_0(int32x2_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vmul_lane_s32_0
   return vmul_lane_s32(a, v, 0);
-  // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmulq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i32> [[MUL]]
 int32x4_t test_vmulq_lane_s32_0(int32x4_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vmulq_lane_s32_0
   return vmulq_lane_s32(a, v, 0);
-  // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmul_lane_u16_0(<4 x i16> %a, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i16> [[MUL]]
 uint16x4_t test_vmul_lane_u16_0(uint16x4_t a, uint16x4_t v) {
-  // CHECK-LABEL: test_vmul_lane_u16_0
   return vmul_lane_u16(a, v, 0);
-  // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmulq_lane_u16_0(<8 x i16> %a, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <8 x i16> [[MUL]]
 uint16x8_t test_vmulq_lane_u16_0(uint16x8_t a, uint16x4_t v) {
-  // CHECK-LABEL: test_vmulq_lane_u16_0
   return vmulq_lane_u16(a, v, 0);
-  // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmul_lane_u32_0(<2 x i32> %a, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <2 x i32> [[MUL]]
 uint32x2_t test_vmul_lane_u32_0(uint32x2_t a, uint32x2_t v) {
-  // CHECK-LABEL: test_vmul_lane_u32_0
   return vmul_lane_u32(a, v, 0);
-  // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmulq_lane_u32_0(<4 x i32> %a, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i32> [[MUL]]
 uint32x4_t test_vmulq_lane_u32_0(uint32x4_t a, uint32x2_t v) {
-  // CHECK-LABEL: test_vmulq_lane_u32_0
   return vmulq_lane_u32(a, v, 0);
-  // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmul_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i16> [[MUL]]
 int16x4_t test_vmul_laneq_s16_0(int16x4_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vmul_laneq_s16_0
   return vmul_laneq_s16(a, v, 0);
-  // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmulq_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <8 x i16> [[MUL]]
 int16x8_t test_vmulq_laneq_s16_0(int16x8_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vmulq_laneq_s16_0
   return vmulq_laneq_s16(a, v, 0);
-  // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmul_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <2 x i32> [[MUL]]
 int32x2_t test_vmul_laneq_s32_0(int32x2_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vmul_laneq_s32_0
   return vmul_laneq_s32(a, v, 0);
-  // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmulq_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i32> [[MUL]]
 int32x4_t test_vmulq_laneq_s32_0(int32x4_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vmulq_laneq_s32_0
   return vmulq_laneq_s32(a, v, 0);
-  // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmul_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i16> [[MUL]]
 uint16x4_t test_vmul_laneq_u16_0(uint16x4_t a, uint16x8_t v) {
-  // CHECK-LABEL: test_vmul_laneq_u16_0
   return vmul_laneq_u16(a, v, 0);
-  // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmulq_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <8 x i16> [[MUL]]
 uint16x8_t test_vmulq_laneq_u16_0(uint16x8_t a, uint16x8_t v) {
-  // CHECK-LABEL: test_vmulq_laneq_u16_0
   return vmulq_laneq_u16(a, v, 0);
-  // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmul_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <2 x i32> [[MUL]]
 uint32x2_t test_vmul_laneq_u32_0(uint32x2_t a, uint32x4_t v) {
-  // CHECK-LABEL: test_vmul_laneq_u32_0
   return vmul_laneq_u32(a, v, 0);
-  // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmulq_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i32> [[MUL]]
 uint32x4_t test_vmulq_laneq_u32_0(uint32x4_t a, uint32x4_t v) {
-  // CHECK-LABEL: test_vmulq_laneq_u32_0
   return vmulq_laneq_u32(a, v, 0);
-  // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vfma_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK:   [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
+// CHECK:   ret <2 x float> [[FMLA2]]
 float32x2_t test_vfma_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
-  // CHECK-LABEL: test_vfma_lane_f32_0
   return vfma_lane_f32(a, b, v, 0);
-  // CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vfmaq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK:   [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
+// CHECK:   ret <4 x float> [[FMLA2]]
 float32x4_t test_vfmaq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {
-  // CHECK-LABEL: test_vfmaq_lane_f32_0
   return vfmaq_lane_f32(a, b, v, 0);
-  // CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vfma_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> zeroinitializer
+// CHECK:   [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
+// CHECK:   ret <2 x float> [[TMP6]]
 float32x2_t test_vfma_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vfma_laneq_f32_0
   return vfma_laneq_f32(a, b, v, 0);
-  // CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vfmaq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> zeroinitializer
+// CHECK:   [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
+// CHECK:   ret <4 x float> [[TMP6]]
 float32x4_t test_vfmaq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vfmaq_laneq_f32_0
   return vfmaq_laneq_f32(a, b, v, 0);
-  // CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vfms_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 {
+// CHECK:   [[SUB:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SUB]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK:   [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[FMLA2:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[FMLA]], <2 x float> [[LANE]], <2 x float> [[FMLA1]])
+// CHECK:   ret <2 x float> [[FMLA2]]
 float32x2_t test_vfms_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
-  // CHECK-LABEL: test_vfms_lane_f32_0
   return vfms_lane_f32(a, b, v, 0);
-  // CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vfmsq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 {
+// CHECK:   [[SUB:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SUB]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK:   [[FMLA:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[FMLA1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[FMLA2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[FMLA]], <4 x float> [[LANE]], <4 x float> [[FMLA1]])
+// CHECK:   ret <4 x float> [[FMLA2]]
 float32x4_t test_vfmsq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {
-  // CHECK-LABEL: test_vfmsq_lane_f32_0
   return vfmsq_lane_f32(a, b, v, 0);
-  // CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vfms_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) #0 {
+// CHECK:   [[SUB:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SUB]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <2 x i32> zeroinitializer
+// CHECK:   [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[LANE]], <2 x float> [[TMP4]], <2 x float> [[TMP3]])
+// CHECK:   ret <2 x float> [[TMP6]]
 float32x2_t test_vfms_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vfms_laneq_f32_0
   return vfms_laneq_f32(a, b, v, 0);
-  // CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vfmsq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 {
+// CHECK:   [[SUB:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SUB]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP5]], <4 x i32> zeroinitializer
+// CHECK:   [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[LANE]], <4 x float> [[TMP4]], <4 x float> [[TMP3]])
+// CHECK:   ret <4 x float> [[TMP6]]
 float32x4_t test_vfmsq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vfmsq_laneq_f32_0
   return vfmsq_laneq_f32(a, b, v, 0);
-  // CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <2 x double> @test_vfmaq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> zeroinitializer
+// CHECK:   [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
+// CHECK:   ret <2 x double> [[TMP6]]
 float64x2_t test_vfmaq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) {
-  // CHECK-LABEL: test_vfmaq_laneq_f64_0
   return vfmaq_laneq_f64(a, b, v, 0);
-  // CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 }
 
+// CHECK-LABEL: define <2 x double> @test_vfmsq_laneq_f64_0(<2 x double> %a, <2 x double> %b, <2 x double> %v) #0 {
+// CHECK:   [[SUB:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %b
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> [[SUB]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP5]], <2 x i32> zeroinitializer
+// CHECK:   [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[LANE]], <2 x double> [[TMP4]], <2 x double> [[TMP3]])
+// CHECK:   ret <2 x double> [[TMP6]]
 float64x2_t test_vfmsq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) {
-  // CHECK-LABEL: test_vfmsq_laneq_f64_0
   return vfmsq_laneq_f64(a, b, v, 0);
-  // CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlal_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlal_lane_s16_0
   return vmlal_lane_s16(a, b, v, 0);
-  // CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[ADD]]
 int64x2_t test_vmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlal_lane_s32_0
   return vmlal_lane_s32(a, b, v, 0);
-  // CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlal_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlal_laneq_s16_0
   return vmlal_laneq_s16(a, b, v, 0);
-  // CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlal_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[ADD]]
 int64x2_t test_vmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlal_laneq_s32_0
   return vmlal_laneq_s32(a, b, v, 0);
-  // CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlal_high_lane_s16_0
   return vmlal_high_lane_s16(a, b, v, 0);
-  // CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[ADD]]
 int64x2_t test_vmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlal_high_lane_s32_0
   return vmlal_high_lane_s32(a, b, v, 0);
-  // CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlal_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlal_high_laneq_s16_0
   return vmlal_high_laneq_s16(a, b, v, 0);
-  // CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlal_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[ADD]]
 int64x2_t test_vmlal_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlal_high_laneq_s32_0
   return vmlal_high_laneq_s32(a, b, v, 0);
-  // CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlsl_lane_s16_0
   return vmlsl_lane_s16(a, b, v, 0);
-  // CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[SUB]]
 int64x2_t test_vmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlsl_lane_s32_0
   return vmlsl_lane_s32(a, b, v, 0);
-  // CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlsl_laneq_s16_0
   return vmlsl_laneq_s16(a, b, v, 0);
-  // CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[SUB]]
 int64x2_t test_vmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlsl_laneq_s32_0
   return vmlsl_laneq_s32(a, b, v, 0);
-  // CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlsl_high_lane_s16_0
   return vmlsl_high_lane_s16(a, b, v, 0);
-  // CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[SUB]]
 int64x2_t test_vmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlsl_high_lane_s32_0
   return vmlsl_high_lane_s32(a, b, v, 0);
-  // CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlsl_high_laneq_s16_0
   return vmlsl_high_laneq_s16(a, b, v, 0);
-  // CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[SUB]]
 int64x2_t test_vmlsl_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlsl_high_laneq_s32_0
   return vmlsl_high_laneq_s32(a, b, v, 0);
-  // CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlal_lane_u16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlal_lane_u16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlal_lane_u16_0
   return vmlal_lane_u16(a, b, v, 0);
-  // CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlal_lane_u32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[ADD]]
 int64x2_t test_vmlal_lane_u32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlal_lane_u32_0
   return vmlal_lane_u32(a, b, v, 0);
-  // CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlal_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlal_laneq_u16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlal_laneq_u16_0
   return vmlal_laneq_u16(a, b, v, 0);
-  // CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlal_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[ADD]]
 int64x2_t test_vmlal_laneq_u32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlal_laneq_u32_0
   return vmlal_laneq_u32(a, b, v, 0);
-  // CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlal_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlal_high_lane_u16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlal_high_lane_u16_0
   return vmlal_high_lane_u16(a, b, v, 0);
-  // CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlal_high_lane_u32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[ADD]]
 int64x2_t test_vmlal_high_lane_u32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlal_high_lane_u32_0
   return vmlal_high_lane_u32(a, b, v, 0);
-  // CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlal_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlal_high_laneq_u16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlal_high_laneq_u16_0
   return vmlal_high_laneq_u16(a, b, v, 0);
-  // CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlal_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[ADD]]
 int64x2_t test_vmlal_high_laneq_u32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlal_high_laneq_u32_0
   return vmlal_high_laneq_u32(a, b, v, 0);
-  // CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_u16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsl_lane_u16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlsl_lane_u16_0
   return vmlsl_lane_u16(a, b, v, 0);
-  // CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_u32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[SUB]]
 int64x2_t test_vmlsl_lane_u32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlsl_lane_u32_0
   return vmlsl_lane_u32(a, b, v, 0);
-  // CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsl_laneq_u16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlsl_laneq_u16_0
   return vmlsl_laneq_u16(a, b, v, 0);
-  // CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[SUB]]
 int64x2_t test_vmlsl_laneq_u32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlsl_laneq_u32_0
   return vmlsl_laneq_u32(a, b, v, 0);
-  // CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_lane_u16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsl_high_lane_u16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vmlsl_high_lane_u16_0
   return vmlsl_high_lane_u16(a, b, v, 0);
-  // CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_lane_u32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[SUB]]
 int64x2_t test_vmlsl_high_lane_u32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vmlsl_high_lane_u32_0
   return vmlsl_high_lane_u32(a, b, v, 0);
-  // CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsl_high_laneq_u16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vmlsl_high_laneq_u16_0
   return vmlsl_high_laneq_u16(a, b, v, 0);
-  // CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[SUB]]
 int64x2_t test_vmlsl_high_laneq_u32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vmlsl_high_laneq_u32_0
   return vmlsl_high_laneq_u32(a, b, v, 0);
-  // CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmull_lane_s16_0(<4 x i16> %a, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 int32x4_t test_vmull_lane_s16_0(int16x4_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vmull_lane_s16_0
   return vmull_lane_s16(a, v, 0);
-  // CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmull_lane_s32_0(<2 x i32> %a, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 int64x2_t test_vmull_lane_s32_0(int32x2_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vmull_lane_s32_0
   return vmull_lane_s32(a, v, 0);
-  // CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmull_lane_u16_0(<4 x i16> %a, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 uint32x4_t test_vmull_lane_u16_0(uint16x4_t a, uint16x4_t v) {
-  // CHECK-LABEL: test_vmull_lane_u16_0
   return vmull_lane_u16(a, v, 0);
-  // CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmull_lane_u32_0(<2 x i32> %a, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 uint64x2_t test_vmull_lane_u32_0(uint32x2_t a, uint32x2_t v) {
-  // CHECK-LABEL: test_vmull_lane_u32_0
   return vmull_lane_u32(a, v, 0);
-  // CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 int32x4_t test_vmull_high_lane_s16_0(int16x8_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vmull_high_lane_s16_0
   return vmull_high_lane_s16(a, v, 0);
-  // CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 int64x2_t test_vmull_high_lane_s32_0(int32x4_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vmull_high_lane_s32_0
   return vmull_high_lane_s32(a, v, 0);
-  // CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmull_high_lane_u16_0(<8 x i16> %a, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 uint32x4_t test_vmull_high_lane_u16_0(uint16x8_t a, uint16x4_t v) {
-  // CHECK-LABEL: test_vmull_high_lane_u16_0
   return vmull_high_lane_u16(a, v, 0);
-  // CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmull_high_lane_u32_0(<4 x i32> %a, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 uint64x2_t test_vmull_high_lane_u32_0(uint32x4_t a, uint32x2_t v) {
-  // CHECK-LABEL: test_vmull_high_lane_u32_0
   return vmull_high_lane_u32(a, v, 0);
-  // CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 int32x4_t test_vmull_laneq_s16_0(int16x4_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vmull_laneq_s16_0
   return vmull_laneq_s16(a, v, 0);
-  // CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 int64x2_t test_vmull_laneq_s32_0(int32x2_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vmull_laneq_s32_0
   return vmull_laneq_s32(a, v, 0);
-  // CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmull_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 uint32x4_t test_vmull_laneq_u16_0(uint16x4_t a, uint16x8_t v) {
-  // CHECK-LABEL: test_vmull_laneq_u16_0
   return vmull_laneq_u16(a, v, 0);
-  // CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmull_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 uint64x2_t test_vmull_laneq_u32_0(uint32x2_t a, uint32x4_t v) {
-  // CHECK-LABEL: test_vmull_laneq_u32_0
   return vmull_laneq_u32(a, v, 0);
-  // CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 int32x4_t test_vmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vmull_high_laneq_s16_0
   return vmull_high_laneq_s16(a, v, 0);
-  // CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 int64x2_t test_vmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vmull_high_laneq_s32_0
   return vmull_high_laneq_s32(a, v, 0);
-  // CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmull_high_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #2
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 uint32x4_t test_vmull_high_laneq_u16_0(uint16x8_t a, uint16x8_t v) {
-  // CHECK-LABEL: test_vmull_high_laneq_u16_0
   return vmull_high_laneq_u16(a, v, 0);
-  // CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmull_high_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #2
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 uint64x2_t test_vmull_high_laneq_u32_0(uint32x4_t a, uint32x4_t v) {
-  // CHECK-LABEL: test_vmull_high_laneq_u32_0
   return vmull_high_laneq_u32(a, v, 0);
-  // CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmlal_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
 int32x4_t test_vqdmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vqdmlal_lane_s16_0
   return vqdmlal_lane_s16(a, b, v, 0);
-  // CHECK: sqdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmlal_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
 int64x2_t test_vqdmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vqdmlal_lane_s32_0
   return vqdmlal_lane_s32(a, b, v, 0);
-  // CHECK: sqdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmlal_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
 int32x4_t test_vqdmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vqdmlal_high_lane_s16_0
   return vqdmlal_high_lane_s16(a, b, v, 0);
-  // CHECK: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmlal_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
 int64x2_t test_vqdmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vqdmlal_high_lane_s32_0
   return vqdmlal_high_lane_s32(a, b, v, 0);
-  // CHECK: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_lane_s16_0(<4 x i32> %a, <4 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
 int32x4_t test_vqdmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vqdmlsl_lane_s16_0
   return vqdmlsl_lane_s16(a, b, v, 0);
-  // CHECK: sqdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_lane_s32_0(<2 x i64> %a, <2 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
 int64x2_t test_vqdmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vqdmlsl_lane_s32_0
   return vqdmlsl_lane_s32(a, b, v, 0);
-  // CHECK: sqdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_high_lane_s16_0(<4 x i32> %a, <8 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
 int32x4_t test_vqdmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) {
-  // CHECK-LABEL: test_vqdmlsl_high_lane_s16_0
   return vqdmlsl_high_lane_s16(a, b, v, 0);
-  // CHECK: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_high_lane_s32_0(<2 x i64> %a, <4 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
 int64x2_t test_vqdmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) {
-  // CHECK-LABEL: test_vqdmlsl_high_lane_s32_0
   return vqdmlsl_high_lane_s32(a, b, v, 0);
-  // CHECK: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmull_lane_s16_0(<4 x i16> %a, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqdmull_lane_s16_0(int16x4_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vqdmull_lane_s16_0
   return vqdmull_lane_s16(a, v, 0);
-  // CHECK: sqdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmull_lane_s32_0(<2 x i32> %a, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 int64x2_t test_vqdmull_lane_s32_0(int32x2_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vqdmull_lane_s32_0
   return vqdmull_lane_s32(a, v, 0);
-  // CHECK: sqdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqdmull_laneq_s16_0(int16x4_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vqdmull_laneq_s16_0
   return vqdmull_laneq_s16(a, v, 0);
-  // CHECK: sqdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 int64x2_t test_vqdmull_laneq_s32_0(int32x2_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vqdmull_laneq_s32_0
   return vqdmull_laneq_s32(a, v, 0);
-  // CHECK: sqdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmull_high_lane_s16_0(<8 x i16> %a, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqdmull_high_lane_s16_0(int16x8_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vqdmull_high_lane_s16_0
   return vqdmull_high_lane_s16(a, v, 0);
-  // CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmull_high_lane_s32_0(<4 x i32> %a, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 int64x2_t test_vqdmull_high_lane_s32_0(int32x4_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vqdmull_high_lane_s32_0
   return vqdmull_high_lane_s32(a, v, 0);
-  // CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #2
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqdmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vqdmull_high_laneq_s16_0
   return vqdmull_high_laneq_s16(a, v, 0);
-  // CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #2
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 int64x2_t test_vqdmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vqdmull_high_laneq_s32_0
   return vqdmull_high_laneq_s32(a, v, 0);
-  // CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vqdmulh_lane_s16_0(<4 x i16> %a, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]]) #2
+// CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vqdmulh_lane_s16_0(int16x4_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vqdmulh_lane_s16_0
   return vqdmulh_lane_s16(a, v, 0);
-  // CHECK: sqdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]]) #2
+// CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vqdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vqdmulhq_lane_s16_0
   return vqdmulhq_lane_s16(a, v, 0);
-  // CHECK: sqdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vqdmulh_lane_s32_0(<2 x i32> %a, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]]) #2
+// CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vqdmulh_lane_s32_0(int32x2_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vqdmulh_lane_s32_0
   return vqdmulh_lane_s32(a, v, 0);
-  // CHECK: sqdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]]) #2
+// CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vqdmulhq_lane_s32_0
   return vqdmulhq_lane_s32(a, v, 0);
-  // CHECK: sqdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_lane_s16_0(<4 x i16> %a, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]]) #2
+// CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vqrdmulh_lane_s16_0(int16x4_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vqrdmulh_lane_s16_0
   return vqrdmulh_lane_s16(a, v, 0);
-  // CHECK: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_lane_s16_0(<8 x i16> %a, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V1_I]]) #2
+// CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vqrdmulhq_lane_s16_0(int16x8_t a, int16x4_t v) {
-  // CHECK-LABEL: test_vqrdmulhq_lane_s16_0
   return vqrdmulhq_lane_s16(a, v, 0);
-  // CHECK: sqrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_lane_s32_0(<2 x i32> %a, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]]) #2
+// CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vqrdmulh_lane_s32_0(int32x2_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vqrdmulh_lane_s32_0
   return vqrdmulh_lane_s32(a, v, 0);
-  // CHECK: sqrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_lane_s32_0(<4 x i32> %a, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]]) #2
+// CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqrdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) {
-  // CHECK-LABEL: test_vqrdmulhq_lane_s32_0
   return vqrdmulhq_lane_s32(a, v, 0);
-  // CHECK: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vmul_lane_f32_0(<2 x float> %a, <2 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]]
+// CHECK:   ret <2 x float> [[MUL]]
 float32x2_t test_vmul_lane_f32_0(float32x2_t a, float32x2_t v) {
-  // CHECK-LABEL: test_vmul_lane_f32_0
   return vmul_lane_f32(a, v, 0);
-  // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vmulq_lane_f32_0(<4 x float> %a, <2 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x float> [[MUL]]
 float32x4_t test_vmulq_lane_f32_0(float32x4_t a, float32x2_t v) {
-  // CHECK-LABEL: test_vmulq_lane_f32_0
   return vmulq_lane_f32(a, v, 0);
-  // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vmul_laneq_f32_0(<2 x float> %a, <4 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]]
+// CHECK:   ret <2 x float> [[MUL]]
 float32x2_t test_vmul_laneq_f32_0(float32x2_t a, float32x4_t v) {
-  // CHECK-LABEL: test_vmul_laneq_f32_0
   return vmul_laneq_f32(a, v, 0);
-  // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <1 x double> @test_vmul_laneq_f64_0(<1 x double> %a, <2 x double> %v) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %v to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to double
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
+// CHECK:   [[TMP4:%.*]] = fmul double [[TMP2]], [[EXTRACT]]
+// CHECK:   [[TMP5:%.*]] = bitcast double [[TMP4]] to <1 x double>
+// CHECK:   ret <1 x double> [[TMP5]]
 float64x1_t test_vmul_laneq_f64_0(float64x1_t a, float64x2_t v) {
-  // CHECK-LABEL: test_vmul_laneq_f64_0
   return vmul_laneq_f64(a, v, 0);
-  // CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vmulq_laneq_f32_0(<4 x float> %a, <4 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x float> [[MUL]]
 float32x4_t test_vmulq_laneq_f32_0(float32x4_t a, float32x4_t v) {
-  // CHECK-LABEL: test_vmulq_laneq_f32_0
   return vmulq_laneq_f32(a, v, 0);
-  // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <2 x double> @test_vmulq_laneq_f64_0(<2 x double> %a, <2 x double> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x double> %v, <2 x double> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = fmul <2 x double> %a, [[SHUFFLE]]
+// CHECK:   ret <2 x double> [[MUL]]
 float64x2_t test_vmulq_laneq_f64_0(float64x2_t a, float64x2_t v) {
-  // CHECK-LABEL: test_vmulq_laneq_f64_0
   return vmulq_laneq_f64(a, v, 0);
-  // CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vmulx_lane_f32_0(<2 x float> %a, <2 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VMULX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[VMULX_I]], <2 x float> [[VMULX1_I]]) #2
+// CHECK:   ret <2 x float> [[VMULX2_I]]
 float32x2_t test_vmulx_lane_f32_0(float32x2_t a, float32x2_t v) {
-  // CHECK-LABEL: test_vmulx_lane_f32_0
   return vmulx_lane_f32(a, v, 0);
-  // CHECK: fmulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vmulxq_lane_f32_0(<4 x float> %a, <2 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[VMULX_I]], <4 x float> [[VMULX1_I]]) #2
+// CHECK:   ret <4 x float> [[VMULX2_I]]
 float32x4_t test_vmulxq_lane_f32_0(float32x4_t a, float32x2_t v) {
-  // CHECK-LABEL: test_vmulxq_lane_f32_0
   return vmulxq_lane_f32(a, v, 0);
-  // CHECK: fmulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <2 x double> @test_vmulxq_lane_f64_0(<2 x double> %a, <1 x double> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x double> %v, <1 x double> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[VMULX_I]], <2 x double> [[VMULX1_I]]) #2
+// CHECK:   ret <2 x double> [[VMULX2_I]]
 float64x2_t test_vmulxq_lane_f64_0(float64x2_t a, float64x1_t v) {
-  // CHECK-LABEL: test_vmulxq_lane_f64_0
   return vmulxq_lane_f64(a, v, 0);
-  // CHECK: fmulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vmulx_laneq_f32_0(<2 x float> %a, <4 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VMULX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[VMULX_I]], <2 x float> [[VMULX1_I]]) #2
+// CHECK:   ret <2 x float> [[VMULX2_I]]
 float32x2_t test_vmulx_laneq_f32_0(float32x2_t a, float32x4_t v) {
-  // CHECK-LABEL: test_vmulx_laneq_f32_0
   return vmulx_laneq_f32(a, v, 0);
-  // CHECK: fmulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vmulxq_laneq_f32_0(<4 x float> %a, <4 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[VMULX_I]], <4 x float> [[VMULX1_I]]) #2
+// CHECK:   ret <4 x float> [[VMULX2_I]]
 float32x4_t test_vmulxq_laneq_f32_0(float32x4_t a, float32x4_t v) {
-  // CHECK-LABEL: test_vmulxq_laneq_f32_0
   return vmulxq_laneq_f32(a, v, 0);
-  // CHECK: fmulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <2 x double> @test_vmulxq_laneq_f64_0(<2 x double> %a, <2 x double> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x double> %v, <2 x double> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[VMULX_I]], <2 x double> [[VMULX1_I]]) #2
+// CHECK:   ret <2 x double> [[VMULX2_I]]
 float64x2_t test_vmulxq_laneq_f64_0(float64x2_t a, float64x2_t v) {
-  // CHECK-LABEL: test_vmulxq_laneq_f64_0
   return vmulxq_laneq_f64(a, v, 0);
-  // CHECK: fmulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmull_high_n_s16(<8 x i16> %a, i16 %b) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %b, i32 3
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL4_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL4_I_I]]) #2
+// CHECK:   ret <4 x i32> [[VMULL5_I_I]]
 int32x4_t test_vmull_high_n_s16(int16x8_t a, int16_t b) {
-  // CHECK-LABEL: test_vmull_high_n_s16
   return vmull_high_n_s16(a, b);
-  // CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+\.h\[0\]|v[0-9]+\.8h}}
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmull_high_n_s32(<4 x i32> %a, i32 %b) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %b, i32 1
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL2_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL2_I_I]]) #2
+// CHECK:   ret <2 x i64> [[VMULL3_I_I]]
 int64x2_t test_vmull_high_n_s32(int32x4_t a, int32_t b) {
-  // CHECK-LABEL: test_vmull_high_n_s32
   return vmull_high_n_s32(a, b);
-  // CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+\.s\[0\]|v[0-9]+\.4s}}
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmull_high_n_u16(<8 x i16> %a, i16 %b) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %b, i32 3
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL4_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL4_I_I]]) #2
+// CHECK:   ret <4 x i32> [[VMULL5_I_I]]
 uint32x4_t test_vmull_high_n_u16(uint16x8_t a, uint16_t b) {
-  // CHECK-LABEL: test_vmull_high_n_u16
   return vmull_high_n_u16(a, b);
-  // CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+\.h\[0\]|v[0-9]+\.8h}}
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmull_high_n_u32(<4 x i32> %a, i32 %b) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %b, i32 1
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL2_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL2_I_I]]) #2
+// CHECK:   ret <2 x i64> [[VMULL3_I_I]]
 uint64x2_t test_vmull_high_n_u32(uint32x4_t a, uint32_t b) {
-  // CHECK-LABEL: test_vmull_high_n_u32
   return vmull_high_n_u32(a, b);
-  // CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+\.s\[0\]|v[0-9]+\.4s}}
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmull_high_n_s16(<8 x i16> %a, i16 %b) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %b, i32 3
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
+// CHECK:   [[VQDMULL_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQDMULL_V4_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMULL_V5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I_I]], <4 x i16> [[VQDMULL_V4_I_I]]) #2
+// CHECK:   [[VQDMULL_V6_I_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V5_I_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V6_I_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqdmull_high_n_s16(int16x8_t a, int16_t b) {
-  // CHECK-LABEL: test_vqdmull_high_n_s16
   return vqdmull_high_n_s16(a, b);
-  // CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+\.h\[0\]|v[0-9]+\.8h}}
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmull_high_n_s32(<4 x i32> %a, i32 %b) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %b, i32 1
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
+// CHECK:   [[VQDMULL_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQDMULL_V2_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMULL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I_I]], <2 x i32> [[VQDMULL_V2_I_I]]) #2
+// CHECK:   [[VQDMULL_V4_I_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V3_I_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V4_I_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 int64x2_t test_vqdmull_high_n_s32(int32x4_t a, int32_t b) {
-  // CHECK-LABEL: test_vqdmull_high_n_s32
   return vqdmull_high_n_s32(a, b);
-  // CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+\.s\[0\]|v[0-9]+\.4s}}
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlal_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) #2
+// CHECK:   [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I_I]]
 int32x4_t test_vmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
-  // CHECK-LABEL: test_vmlal_high_n_s16
   return vmlal_high_n_s16(a, b, c);
-  // CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+\.h\[0\]|v[0-9]+\.8h}}
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlal_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) #2
+// CHECK:   [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I_I]]
+// CHECK:   ret <2 x i64> [[ADD_I_I]]
 int64x2_t test_vmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
-  // CHECK-LABEL: test_vmlal_high_n_s32
   return vmlal_high_n_s32(a, b, c);
-  // CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+\.s\[0\]|v[0-9]+\.4s}}
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlal_high_n_u16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) #2
+// CHECK:   [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I_I]]
 uint32x4_t test_vmlal_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) {
-  // CHECK-LABEL: test_vmlal_high_n_u16
   return vmlal_high_n_u16(a, b, c);
-  // CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+\.h\[0\]|v[0-9]+\.8h}}
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlal_high_n_u32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) #2
+// CHECK:   [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I_I]]
+// CHECK:   ret <2 x i64> [[ADD_I_I]]
 uint64x2_t test_vmlal_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) {
-  // CHECK-LABEL: test_vmlal_high_n_u32
   return vmlal_high_n_u32(a, b, c);
-  // CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+\.s\[0\]|v[0-9]+\.4s}}
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmlal_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMLAL4_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VQDMLAL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I_I]], <4 x i16> [[VQDMLAL4_I_I]]) #2
+// CHECK:   [[VQDMLAL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMLAL_V6_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I_I]], <4 x i32> [[VQDMLAL5_I_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLAL_V6_I_I]]
 int32x4_t test_vqdmlal_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
-  // CHECK-LABEL: test_vqdmlal_high_n_s16
   return vqdmlal_high_n_s16(a, b, c);
-  // CHECK: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+\.h\[0\]|v[0-9]+\.8h}}
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmlal_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMLAL2_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VQDMLAL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I_I]], <2 x i32> [[VQDMLAL2_I_I]]) #2
+// CHECK:   [[VQDMLAL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQDMLAL_V4_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I_I]], <2 x i64> [[VQDMLAL3_I_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLAL_V4_I_I]]
 int64x2_t test_vqdmlal_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
-  // CHECK-LABEL: test_vqdmlal_high_n_s32
   return vqdmlal_high_n_s32(a, b, c);
-  // CHECK: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+\.s\[0\]|v[0-9]+\.4s}}
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) #2
+// CHECK:   [[SUB_I_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I_I]]
+// CHECK:   ret <4 x i32> [[SUB_I_I]]
 int32x4_t test_vmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
-  // CHECK-LABEL: test_vmlsl_high_n_s16
   return vmlsl_high_n_s16(a, b, c);
-  // CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+\.h\[0\]|v[0-9]+\.8h}}
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) #2
+// CHECK:   [[SUB_I_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I_I]]
+// CHECK:   ret <2 x i64> [[SUB_I_I]]
 int64x2_t test_vmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
-  // CHECK-LABEL: test_vmlsl_high_n_s32
   return vmlsl_high_n_s32(a, b, c);
-  // CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+\.s\[0\]|v[0-9]+\.4s}}
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_n_u16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) #2
+// CHECK:   [[SUB_I_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I_I]]
+// CHECK:   ret <4 x i32> [[SUB_I_I]]
 uint32x4_t test_vmlsl_high_n_u16(uint32x4_t a, uint16x8_t b, uint16_t c) {
-  // CHECK-LABEL: test_vmlsl_high_n_u16
   return vmlsl_high_n_u16(a, b, c);
-  // CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+\.h\[0\]|v[0-9]+\.8h}}
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_n_u32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) #2
+// CHECK:   [[SUB_I_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I_I]]
+// CHECK:   ret <2 x i64> [[SUB_I_I]]
 uint64x2_t test_vmlsl_high_n_u32(uint64x2_t a, uint32x4_t b, uint32_t c) {
-  // CHECK-LABEL: test_vmlsl_high_n_u32
   return vmlsl_high_n_u32(a, b, c);
-  // CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+\.s\[0\]|v[0-9]+\.4s}}
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <4 x i16> [[VECINIT_I_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I_I]], i16 %c, i32 3
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I_I]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMLAL4_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VQDMLAL5_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I_I]], <4 x i16> [[VQDMLAL4_I_I]]) #2
+// CHECK:   [[VQDMLSL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMLSL_V6_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I_I]], <4 x i32> [[VQDMLAL5_I_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLSL_V6_I_I]]
 int32x4_t test_vqdmlsl_high_n_s16(int32x4_t a, int16x8_t b, int16_t c) {
-  // CHECK-LABEL: test_vqdmlsl_high_n_s16
   return vqdmlsl_high_n_s16(a, b, c);
-  // CHECK: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+\.h\[0\]|v[0-9]+\.8h}}
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[VECINIT_I_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I_I:%.*]] = insertelement <2 x i32> [[VECINIT_I_I]], i32 %c, i32 1
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I_I]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMLAL2_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VQDMLAL3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I_I]], <2 x i32> [[VQDMLAL2_I_I]]) #2
+// CHECK:   [[VQDMLSL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQDMLSL_V4_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I_I]], <2 x i64> [[VQDMLAL3_I_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLSL_V4_I_I]]
 int64x2_t test_vqdmlsl_high_n_s32(int64x2_t a, int32x4_t b, int32_t c) {
-  // CHECK-LABEL: test_vqdmlsl_high_n_s32
   return vqdmlsl_high_n_s32(a, b, c);
-  // CHECK: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+\.s\[0\]|v[0-9]+\.4s}}
 }
 
+// CHECK-LABEL: define <2 x float> @test_vmul_n_f32(<2 x float> %a, float %b) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %b, i32 1
+// CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %a, [[VECINIT1_I]]
+// CHECK:   ret <2 x float> [[MUL_I]]
 float32x2_t test_vmul_n_f32(float32x2_t a, float32_t b) {
-  // CHECK-LABEL: test_vmul_n_f32
   return vmul_n_f32(a, b);
-  // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vmulq_n_f32(<4 x float> %a, float %b) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %b, i32 3
+// CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %a, [[VECINIT3_I]]
+// CHECK:   ret <4 x float> [[MUL_I]]
 float32x4_t test_vmulq_n_f32(float32x4_t a, float32_t b) {
-  // CHECK-LABEL: test_vmulq_n_f32
   return vmulq_n_f32(a, b);
-  // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <2 x double> @test_vmulq_n_f64(<2 x double> %a, double %b) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x double> undef, double %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double %b, i32 1
+// CHECK:   [[MUL_I:%.*]] = fmul <2 x double> %a, [[VECINIT1_I]]
+// CHECK:   ret <2 x double> [[MUL_I]]
 float64x2_t test_vmulq_n_f64(float64x2_t a, float64_t b) {
-  // CHECK-LABEL: test_vmulq_n_f64
   return vmulq_n_f64(a, b);
-  // CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vfma_n_f32(<2 x float> %a, <2 x float> %b, float %n) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %n, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %n, i32 1
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK:   [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x float> [[TMP3]]) #2
+// CHECK:   ret <2 x float> [[TMP6]]
 float32x2_t test_vfma_n_f32(float32x2_t a, float32x2_t b, float32_t n) {
-  // CHECK-LABEL: test_vfma_n_f32
   return vfma_n_f32(a, b, n);
-  // CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vfmaq_n_f32(<4 x float> %a, <4 x float> %b, float %n) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %n, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %n, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %n, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %n, i32 3
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK:   [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x float> [[TMP3]]) #2
+// CHECK:   ret <4 x float> [[TMP6]]
 float32x4_t test_vfmaq_n_f32(float32x4_t a, float32x4_t b, float32_t n) {
-  // CHECK-LABEL: test_vfmaq_n_f32
   return vfmaq_n_f32(a, b, n);
-  // CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vfms_n_f32(<2 x float> %a, <2 x float> %b, float %n) #0 {
+// CHECK:   [[SUB_I:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %n, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %n, i32 1
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SUB_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK:   [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x float> [[TMP3]]) #2
+// CHECK:   ret <2 x float> [[TMP6]]
 float32x2_t test_vfms_n_f32(float32x2_t a, float32x2_t b, float32_t n) {
-  // CHECK-LABEL: test_vfms_n_f32
   return vfms_n_f32(a, b, n);
-  // CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vfmsq_n_f32(<4 x float> %a, <4 x float> %b, float %n) #0 {
+// CHECK:   [[SUB_I:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %n, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %n, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %n, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %n, i32 3
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SUB_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> [[VECINIT3_I]] to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK:   [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x float> [[TMP3]]) #2
+// CHECK:   ret <4 x float> [[TMP6]]
 float32x4_t test_vfmsq_n_f32(float32x4_t a, float32x4_t b, float32_t n) {
-  // CHECK-LABEL: test_vfmsq_n_f32
   return vfmsq_n_f32(a, b, n);
-  // CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmul_n_s16(<4 x i16> %a, i16 %b) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %a, [[VECINIT3_I]]
+// CHECK:   ret <4 x i16> [[MUL_I]]
 int16x4_t test_vmul_n_s16(int16x4_t a, int16_t b) {
-  // CHECK-LABEL: test_vmul_n_s16
   return vmul_n_s16(a, b);
-  // CHECK: dup {{v[0-9]+}}.4h, w0
-  // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmulq_n_s16(<8 x i16> %a, i16 %b) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %a, [[VECINIT7_I]]
+// CHECK:   ret <8 x i16> [[MUL_I]]
 int16x8_t test_vmulq_n_s16(int16x8_t a, int16_t b) {
-  // CHECK-LABEL: test_vmulq_n_s16
   return vmulq_n_s16(a, b);
-  // CHECK: dup {{v[0-9]+}}.8h, w0
-  // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmul_n_s32(<2 x i32> %a, i32 %b) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %a, [[VECINIT1_I]]
+// CHECK:   ret <2 x i32> [[MUL_I]]
 int32x2_t test_vmul_n_s32(int32x2_t a, int32_t b) {
-  // CHECK-LABEL: test_vmul_n_s32
   return vmul_n_s32(a, b);
-  // CHECK: dup {{v[0-9]+}}.2s, w0
-  // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmulq_n_s32(<4 x i32> %a, i32 %b) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %a, [[VECINIT3_I]]
+// CHECK:   ret <4 x i32> [[MUL_I]]
 int32x4_t test_vmulq_n_s32(int32x4_t a, int32_t b) {
-  // CHECK-LABEL: test_vmulq_n_s32
   return vmulq_n_s32(a, b);
-  // CHECK: dup {{v[0-9]+}}.4s, w0
-  // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmul_n_u16(<4 x i16> %a, i16 %b) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %a, [[VECINIT3_I]]
+// CHECK:   ret <4 x i16> [[MUL_I]]
 uint16x4_t test_vmul_n_u16(uint16x4_t a, uint16_t b) {
-  // CHECK-LABEL: test_vmul_n_u16
   return vmul_n_u16(a, b);
-  // CHECK: dup {{v[0-9]+}}.4h, w0
-  // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmulq_n_u16(<8 x i16> %a, i16 %b) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %a, [[VECINIT7_I]]
+// CHECK:   ret <8 x i16> [[MUL_I]]
 uint16x8_t test_vmulq_n_u16(uint16x8_t a, uint16_t b) {
-  // CHECK-LABEL: test_vmulq_n_u16
   return vmulq_n_u16(a, b);
-  // CHECK: dup {{v[0-9]+}}.8h, w0
-  // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmul_n_u32(<2 x i32> %a, i32 %b) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %a, [[VECINIT1_I]]
+// CHECK:   ret <2 x i32> [[MUL_I]]
 uint32x2_t test_vmul_n_u32(uint32x2_t a, uint32_t b) {
-  // CHECK-LABEL: test_vmul_n_u32
   return vmul_n_u32(a, b);
-  // CHECK: dup {{v[0-9]+}}.2s, w0
-  // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmulq_n_u32(<4 x i32> %a, i32 %b) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %a, [[VECINIT3_I]]
+// CHECK:   ret <4 x i32> [[MUL_I]]
 uint32x4_t test_vmulq_n_u32(uint32x4_t a, uint32_t b) {
-  // CHECK-LABEL: test_vmulq_n_u32
   return vmulq_n_u32(a, b);
-  // CHECK: dup {{v[0-9]+}}.4s, w0
-  // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmull_n_s16(<4 x i16> %a, i16 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL4_I]]) #2
+// CHECK:   ret <4 x i32> [[VMULL5_I]]
 int32x4_t test_vmull_n_s16(int16x4_t a, int16_t b) {
-  // CHECK-LABEL: test_vmull_n_s16
   return vmull_n_s16(a, b);
-  // CHECK: dup {{v[0-9]+}}.4h, w0
-  // CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmull_n_s32(<2 x i32> %a, i32 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL2_I]]) #2
+// CHECK:   ret <2 x i64> [[VMULL3_I]]
 int64x2_t test_vmull_n_s32(int32x2_t a, int32_t b) {
-  // CHECK-LABEL: test_vmull_n_s32
   return vmull_n_s32(a, b);
-  // CHECK: dup {{v[0-9]+}}.2s, w0
-  // CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmull_n_u16(<4 x i16> %a, i16 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL4_I]]) #2
+// CHECK:   ret <4 x i32> [[VMULL5_I]]
 uint32x4_t test_vmull_n_u16(uint16x4_t a, uint16_t b) {
-  // CHECK-LABEL: test_vmull_n_u16
   return vmull_n_u16(a, b);
-  // CHECK: dup {{v[0-9]+}}.4h, w0
-  // CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmull_n_u32(<2 x i32> %a, i32 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL2_I]]) #2
+// CHECK:   ret <2 x i64> [[VMULL3_I]]
 uint64x2_t test_vmull_n_u32(uint32x2_t a, uint32_t b) {
-  // CHECK-LABEL: test_vmull_n_u32
   return vmull_n_u32(a, b);
-  // CHECK: dup {{v[0-9]+}}.2s, w0
-  // CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmull_n_s16(<4 x i16> %a, i16 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQDMULL_V4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMULL_V5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V4_I]]) #2
+// CHECK:   [[VQDMULL_V6_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V5_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V6_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqdmull_n_s16(int16x4_t a, int16_t b) {
-  // CHECK-LABEL: test_vqdmull_n_s16
   return vqdmull_n_s16(a, b);
-  // CHECK: dup {{v[0-9]+}}.4h, w0
-  // CHECK: sqdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmull_n_s32(<2 x i32> %a, i32 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMULL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V2_I]]) #2
+// CHECK:   [[VQDMULL_V4_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V3_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V4_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 int64x2_t test_vqdmull_n_s32(int32x2_t a, int32_t b) {
-  // CHECK-LABEL: test_vqdmull_n_s32
   return vqdmull_n_s32(a, b);
-  // CHECK: dup {{v[0-9]+}}.2s, w0
-  // CHECK: sqdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vqdmulh_n_s16(<4 x i16> %a, i16 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQDMULH_V4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMULH_V5_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V4_I]]) #2
+// CHECK:   [[VQDMULH_V6_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V5_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V6_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vqdmulh_n_s16(int16x4_t a, int16_t b) {
-  // CHECK-LABEL: test_vqdmulh_n_s16
   return vqdmulh_n_s16(a, b);
-  // CHECK: dup {{v[0-9]+}}.4h, w0
-  // CHECK: sqdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_n_s16(<8 x i16> %a, i16 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
+// CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQDMULHQ_V8_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VQDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V8_I]]) #2
+// CHECK:   [[VQDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V9_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V10_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vqdmulhq_n_s16(int16x8_t a, int16_t b) {
-  // CHECK-LABEL: test_vqdmulhq_n_s16
   return vqdmulhq_n_s16(a, b);
-  // CHECK: dup {{v[0-9]+}}.8h, w0
-  // CHECK: sqdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vqdmulh_n_s32(<2 x i32> %a, i32 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQDMULH_V2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMULH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V2_I]]) #2
+// CHECK:   [[VQDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V3_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V4_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vqdmulh_n_s32(int32x2_t a, int32_t b) {
-  // CHECK-LABEL: test_vqdmulh_n_s32
   return vqdmulh_n_s32(a, b);
-  // CHECK: dup {{v[0-9]+}}.2s, w0
-  // CHECK: sqdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_n_s32(<4 x i32> %a, i32 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
+// CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMULHQ_V4_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VQDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V4_I]]) #2
+// CHECK:   [[VQDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V5_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V6_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqdmulhq_n_s32(int32x4_t a, int32_t b) {
-  // CHECK-LABEL: test_vqdmulhq_n_s32
   return vqdmulhq_n_s32(a, b);
-  // CHECK: dup {{v[0-9]+}}.4s, w0
-  // CHECK: sqdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_n_s16(<4 x i16> %a, i16 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQRDMULH_V4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQRDMULH_V5_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V4_I]]) #2
+// CHECK:   [[VQRDMULH_V6_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V5_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V6_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vqrdmulh_n_s16(int16x4_t a, int16_t b) {
-  // CHECK-LABEL: test_vqrdmulh_n_s16
   return vqrdmulh_n_s16(a, b);
-  // CHECK: dup {{v[0-9]+}}.4h, w0
-  // CHECK: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_n_s16(<8 x i16> %a, i16 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
+// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQRDMULHQ_V8_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VQRDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V8_I]]) #2
+// CHECK:   [[VQRDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V9_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V10_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vqrdmulhq_n_s16(int16x8_t a, int16_t b) {
-  // CHECK-LABEL: test_vqrdmulhq_n_s16
   return vqrdmulhq_n_s16(a, b);
-  // CHECK: dup {{v[0-9]+}}.8h, w0
-  // CHECK: sqrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_n_s32(<2 x i32> %a, i32 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQRDMULH_V2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQRDMULH_V3_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V2_I]]) #2
+// CHECK:   [[VQRDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V3_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V4_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vqrdmulh_n_s32(int32x2_t a, int32_t b) {
-  // CHECK-LABEL: test_vqrdmulh_n_s32
   return vqrdmulh_n_s32(a, b);
-  // CHECK: dup {{v[0-9]+}}.2s, w0
-  // CHECK: sqrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_n_s32(<4 x i32> %a, i32 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
+// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQRDMULHQ_V4_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VQRDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V4_I]]) #2
+// CHECK:   [[VQRDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V5_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V6_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqrdmulhq_n_s32(int32x4_t a, int32_t b) {
-  // CHECK-LABEL: test_vqrdmulhq_n_s32
   return vqrdmulhq_n_s32(a, b);
-  // CHECK: dup {{v[0-9]+}}.4s, w0
-  // CHECK: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmla_n_s16(<4 x i16> %a, <4 x i16> %b, i16 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
+// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]]
+// CHECK:   ret <4 x i16> [[ADD_I]]
 int16x4_t test_vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c) {
-  // CHECK-LABEL: test_vmla_n_s16
   return vmla_n_s16(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.4h, w0
-  // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmlaq_n_s16(<8 x i16> %a, <8 x i16> %b, i16 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 int16x8_t test_vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
-  // CHECK-LABEL: test_vmlaq_n_s16
   return vmlaq_n_s16(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.8h, w0
-  // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmla_n_s32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
+// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]]
+// CHECK:   ret <2 x i32> [[ADD_I]]
 int32x2_t test_vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c) {
-  // CHECK-LABEL: test_vmla_n_s32
   return vmla_n_s32(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.2s, w0
-  // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlaq_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 int32x4_t test_vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
-  // CHECK-LABEL: test_vmlaq_n_s32
   return vmlaq_n_s32(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.4s, w0
-  // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmla_n_u16(<4 x i16> %a, <4 x i16> %b, i16 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
+// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]]
+// CHECK:   ret <4 x i16> [[ADD_I]]
 uint16x4_t test_vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) {
-  // CHECK-LABEL: test_vmla_n_u16
   return vmla_n_u16(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.4h, w0
-  // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmlaq_n_u16(<8 x i16> %a, <8 x i16> %b, i16 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 uint16x8_t test_vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) {
-  // CHECK-LABEL: test_vmlaq_n_u16
   return vmlaq_n_u16(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.8h, w0
-  // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmla_n_u32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
+// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]]
+// CHECK:   ret <2 x i32> [[ADD_I]]
 uint32x2_t test_vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) {
-  // CHECK-LABEL: test_vmla_n_u32
   return vmla_n_u32(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.2s, w0
-  // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlaq_n_u32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 uint32x4_t test_vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
-  // CHECK-LABEL: test_vmlaq_n_u32
   return vmlaq_n_u32(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.4s, w0
-  // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlal_n_s16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #2
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 int32x4_t test_vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
-  // CHECK-LABEL: test_vmlal_n_s16
   return vmlal_n_s16(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.4h, w0
-  // CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlal_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #2
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 int64x2_t test_vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
-  // CHECK-LABEL: test_vmlal_n_s32
   return vmlal_n_s32(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.2s, w0
-  // CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlal_n_u16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #2
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 uint32x4_t test_vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
-  // CHECK-LABEL: test_vmlal_n_u16
   return vmlal_n_u16(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.4h, w0
-  // CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlal_n_u32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #2
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 uint64x2_t test_vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
-  // CHECK-LABEL: test_vmlal_n_u32
   return vmlal_n_u32(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.2s, w0
-  // CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmlal_n_s16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMLAL4_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL4_I]]) #2
+// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMLAL_V6_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL5_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLAL_V6_I]]
 int32x4_t test_vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
-  // CHECK-LABEL: test_vqdmlal_n_s16
   return vqdmlal_n_s16(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.4h, w0
-  // CHECK: sqdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmlal_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMLAL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL2_I]]) #2
+// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQDMLAL_V4_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL3_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLAL_V4_I]]
 int64x2_t test_vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
-  // CHECK-LABEL: test_vqdmlal_n_s32
   return vqdmlal_n_s32(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.2s, w0
-  // CHECK: sqdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmls_n_s16(<4 x i16> %a, <4 x i16> %b, i16 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
+// CHECK:   ret <4 x i16> [[SUB_I]]
 int16x4_t test_vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c) {
-  // CHECK-LABEL: test_vmls_n_s16
   return vmls_n_s16(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.4h, w0
-  // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmlsq_n_s16(<8 x i16> %a, <8 x i16> %b, i16 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
+// CHECK:   ret <8 x i16> [[SUB_I]]
 int16x8_t test_vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
-  // CHECK-LABEL: test_vmlsq_n_s16
   return vmlsq_n_s16(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.8h, w0
-  // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmls_n_s32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
+// CHECK:   ret <2 x i32> [[SUB_I]]
 int32x2_t test_vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c) {
-  // CHECK-LABEL: test_vmls_n_s32
   return vmls_n_s32(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.2s, w0
-  // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsq_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 int32x4_t test_vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
-  // CHECK-LABEL: test_vmlsq_n_s32
   return vmlsq_n_s32(a, b, c);
-  // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmls_n_u16(<4 x i16> %a, <4 x i16> %b, i16 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
+// CHECK:   ret <4 x i16> [[SUB_I]]
 uint16x4_t test_vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) {
-  // CHECK-LABEL: test_vmls_n_u16
   return vmls_n_u16(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.4h, w0
-  // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmlsq_n_u16(<8 x i16> %a, <8 x i16> %b, i16 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
+// CHECK:   ret <8 x i16> [[SUB_I]]
 uint16x8_t test_vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) {
-  // CHECK-LABEL: test_vmlsq_n_u16
   return vmlsq_n_u16(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.8h, w0
-  // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmls_n_u32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
+// CHECK:   ret <2 x i32> [[SUB_I]]
 uint32x2_t test_vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) {
-  // CHECK-LABEL: test_vmls_n_u32
   return vmls_n_u32(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.2s, w0
-  // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsq_n_u32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 uint32x4_t test_vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
-  // CHECK-LABEL: test_vmlsq_n_u32
   return vmlsq_n_u32(a, b, c);
-  // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_n_s16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #2
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 int32x4_t test_vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
-  // CHECK-LABEL: test_vmlsl_n_s16
   return vmlsl_n_s16(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.4h, w0
-  // CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #2
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
+// CHECK:   ret <2 x i64> [[SUB_I]]
 int64x2_t test_vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
-  // CHECK-LABEL: test_vmlsl_n_s32
   return vmlsl_n_s32(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.2s, w0
-  // CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_n_u16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #2
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 uint32x4_t test_vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
-  // CHECK-LABEL: test_vmlsl_n_u16
   return vmlsl_n_u16(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.4h, w0
-  // CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_n_u32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #2
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
+// CHECK:   ret <2 x i64> [[SUB_I]]
 uint64x2_t test_vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
-  // CHECK-LABEL: test_vmlsl_n_u32
   return vmlsl_n_u32(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.2s, w0
-  // CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_n_s16(<4 x i32> %a, <4 x i16> %b, i16 %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMLAL4_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL4_I]]) #2
+// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMLSL_V6_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL5_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLSL_V6_I]]
 int32x4_t test_vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
-  // CHECK-LABEL: test_vqdmlsl_n_s16
   return vqdmlsl_n_s16(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.4h, w0
-  // CHECK: sqdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMLAL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL2_I]]) #2
+// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQDMLSL_V4_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL3_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLSL_V4_I]]
 int64x2_t test_vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
-  // CHECK-LABEL: test_vqdmlsl_n_s32
   return vqdmlsl_n_s32(a, b, c);
-  // CHECK: dup {{v[0-9]+}}.2s, w0
-  // CHECK: sqdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmla_lane_u16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[ADD]]
 uint16x4_t test_vmla_lane_u16_0(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
-  // CHECK-LABEL: test_vmla_lane_u16_0
   return vmla_lane_u16(a, b, v, 0);
-  // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_u16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[ADD]]
 uint16x8_t test_vmlaq_lane_u16_0(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
-  // CHECK-LABEL: test_vmlaq_lane_u16_0
   return vmlaq_lane_u16(a, b, v, 0);
-  // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmla_lane_u32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[ADD]]
 uint32x2_t test_vmla_lane_u32_0(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
-  // CHECK-LABEL: test_vmla_lane_u32_0
   return vmla_lane_u32(a, b, v, 0);
-  // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlaq_lane_u32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[ADD]]
 uint32x4_t test_vmlaq_lane_u32_0(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
-  // CHECK-LABEL: test_vmlaq_lane_u32_0
   return vmlaq_lane_u32(a, b, v, 0);
-  // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmla_laneq_u16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[ADD]]
 uint16x4_t test_vmla_laneq_u16_0(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
-  // CHECK-LABEL: test_vmla_laneq_u16_0
   return vmla_laneq_u16(a, b, v, 0);
-  // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmlaq_laneq_u16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[ADD]]
 uint16x8_t test_vmlaq_laneq_u16_0(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
-  // CHECK-LABEL: test_vmlaq_laneq_u16_0
   return vmlaq_laneq_u16(a, b, v, 0);
-  // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmla_laneq_u32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[ADD]]
 uint32x2_t test_vmla_laneq_u32_0(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
-  // CHECK-LABEL: test_vmla_laneq_u32_0
   return vmla_laneq_u32(a, b, v, 0);
-  // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlaq_laneq_u32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[ADD]]
 uint32x4_t test_vmlaq_laneq_u32_0(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
-  // CHECK-LABEL: test_vmlaq_laneq_u32_0
   return vmlaq_laneq_u32(a, b, v, 0);
-  // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmlal_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
 int32x4_t test_vqdmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vqdmlal_laneq_s16_0
   return vqdmlal_laneq_s16(a, b, v, 0);
-  // CHECK: sqdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmlal_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
 int64x2_t test_vqdmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vqdmlal_laneq_s32_0
   return vqdmlal_laneq_s32(a, b, v, 0);
-  // CHECK: sqdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmlal_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
 int32x4_t test_vqdmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vqdmlal_high_laneq_s16_0
   return vqdmlal_high_laneq_s16(a, b, v, 0);
-  // CHECK: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmlal_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
 int64x2_t test_vqdmlal_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vqdmlal_high_laneq_s32_0
   return vqdmlal_high_laneq_s32(a, b, v, 0);
-  // CHECK: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmls_lane_u16_0(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[SUB]]
 uint16x4_t test_vmls_lane_u16_0(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
-  // CHECK-LABEL: test_vmls_lane_u16_0
   return vmls_lane_u16(a, b, v, 0);
-  // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_u16_0(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[SUB]]
 uint16x8_t test_vmlsq_lane_u16_0(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
-  // CHECK-LABEL: test_vmlsq_lane_u16_0
   return vmlsq_lane_u16(a, b, v, 0);
-  // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmls_lane_u32_0(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[SUB]]
 uint32x2_t test_vmls_lane_u32_0(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
-  // CHECK-LABEL: test_vmls_lane_u32_0
   return vmls_lane_u32(a, b, v, 0);
-  // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_u32_0(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[SUB]]
 uint32x4_t test_vmlsq_lane_u32_0(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
-  // CHECK-LABEL: test_vmlsq_lane_u32_0
   return vmlsq_lane_u32(a, b, v, 0);
-  // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmls_laneq_u16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[SUB]]
 uint16x4_t test_vmls_laneq_u16_0(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
-  // CHECK-LABEL: test_vmls_laneq_u16_0
   return vmls_laneq_u16(a, b, v, 0);
-  // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmlsq_laneq_u16_0(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[SUB]]
 uint16x8_t test_vmlsq_laneq_u16_0(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
-  // CHECK-LABEL: test_vmlsq_laneq_u16_0
   return vmlsq_laneq_u16(a, b, v, 0);
-  // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmls_laneq_u32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[SUB]]
 uint32x2_t test_vmls_laneq_u32_0(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
-  // CHECK-LABEL: test_vmls_laneq_u32_0
   return vmls_laneq_u32(a, b, v, 0);
-  // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsq_laneq_u32_0(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[SUB]]
 uint32x4_t test_vmlsq_laneq_u32_0(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
-  // CHECK-LABEL: test_vmlsq_laneq_u32_0
   return vmlsq_laneq_u32(a, b, v, 0);
-  // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
 int32x4_t test_vqdmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vqdmlsl_laneq_s16_0
   return vqdmlsl_laneq_s16(a, b, v, 0);
-  // CHECK: sqdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
 int64x2_t test_vqdmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vqdmlsl_laneq_s32_0
   return vqdmlsl_laneq_s32(a, b, v, 0);
-  // CHECK: sqdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
 int32x4_t test_vqdmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vqdmlsl_high_laneq_s16_0
   return vqdmlsl_high_laneq_s16(a, b, v, 0);
-  // CHECK: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
 int64x2_t test_vqdmlsl_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vqdmlsl_high_laneq_s32_0
   return vqdmlsl_high_laneq_s32(a, b, v, 0);
-  // CHECK: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vqdmulh_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]]) #2
+// CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vqdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vqdmulh_laneq_s16_0
   return vqdmulh_laneq_s16(a, v, 0);
-  // CHECK: sqdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]]) #2
+// CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vqdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vqdmulhq_laneq_s16_0
   return vqdmulhq_laneq_s16(a, v, 0);
-  // CHECK: sqdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vqdmulh_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]]) #2
+// CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vqdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vqdmulh_laneq_s32_0
   return vqdmulh_laneq_s32(a, v, 0);
-  // CHECK: sqdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]]) #2
+// CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vqdmulhq_laneq_s32_0
   return vqdmulhq_laneq_s32(a, v, 0);
-  // CHECK: sqdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]]) #2
+// CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vqrdmulh_laneq_s16_0(int16x4_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vqrdmulh_laneq_s16_0
   return vqrdmulh_laneq_s16(a, v, 0);
-  // CHECK: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V1_I]]) #2
+// CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vqrdmulhq_laneq_s16_0(int16x8_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vqrdmulhq_laneq_s16_0
   return vqrdmulhq_laneq_s16(a, v, 0);
-  // CHECK: sqrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]]) #2
+// CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vqrdmulh_laneq_s32_0(int32x2_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vqrdmulh_laneq_s32_0
   return vqrdmulh_laneq_s32(a, v, 0);
-  // CHECK: sqrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> zeroinitializer
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]]) #2
+// CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqrdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vqrdmulhq_laneq_s32_0
   return vqrdmulhq_laneq_s32(a, v, 0);
-  // CHECK: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmla_lane_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[ADD]]
 uint16x4_t test_vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
-  // CHECK-LABEL: test_vmla_lane_u16
   return vmla_lane_u16(a, b, v, 3);
-  // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_u16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[ADD]]
 uint16x8_t test_vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
-  // CHECK-LABEL: test_vmlaq_lane_u16
   return vmlaq_lane_u16(a, b, v, 3);
-  // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmla_lane_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[ADD]]
 uint32x2_t test_vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
-  // CHECK-LABEL: test_vmla_lane_u32
   return vmla_lane_u32(a, b, v, 1);
-  // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlaq_lane_u32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[ADD]]
 uint32x4_t test_vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
-  // CHECK-LABEL: test_vmlaq_lane_u32
   return vmlaq_lane_u32(a, b, v, 1);
-  // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmla_laneq_u16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[ADD]]
 uint16x4_t test_vmla_laneq_u16(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
-  // CHECK-LABEL: test_vmla_laneq_u16
   return vmla_laneq_u16(a, b, v, 7);
-  // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmlaq_laneq_u16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[ADD]]
 uint16x8_t test_vmlaq_laneq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
-  // CHECK-LABEL: test_vmlaq_laneq_u16
   return vmlaq_laneq_u16(a, b, v, 7);
-  // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmla_laneq_u32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[ADD]]
 uint32x2_t test_vmla_laneq_u32(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
-  // CHECK-LABEL: test_vmla_laneq_u32
   return vmla_laneq_u32(a, b, v, 3);
-  // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlaq_laneq_u32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[ADD]]
 uint32x4_t test_vmlaq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
-  // CHECK-LABEL: test_vmlaq_laneq_u32
   return vmlaq_laneq_u32(a, b, v, 3);
-  // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmlal_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
 int32x4_t test_vqdmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vqdmlal_laneq_s16
   return vqdmlal_laneq_s16(a, b, v, 7);
-  // CHECK: sqdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmlal_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
 int64x2_t test_vqdmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vqdmlal_laneq_s32
   return vqdmlal_laneq_s32(a, b, v, 3);
-  // CHECK: sqdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmlal_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
 int32x4_t test_vqdmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vqdmlal_high_laneq_s16
   return vqdmlal_high_laneq_s16(a, b, v, 7);
-  // CHECK: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmlal_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
 int64x2_t test_vqdmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vqdmlal_high_laneq_s32
   return vqdmlal_high_laneq_s32(a, b, v, 3);
-  // CHECK: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmls_lane_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[SUB]]
 uint16x4_t test_vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v) {
-  // CHECK-LABEL: test_vmls_lane_u16
   return vmls_lane_u16(a, b, v, 3);
-  // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_u16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[SUB]]
 uint16x8_t test_vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v) {
-  // CHECK-LABEL: test_vmlsq_lane_u16
   return vmlsq_lane_u16(a, b, v, 3);
-  // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[3]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmls_lane_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[SUB]]
 uint32x2_t test_vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v) {
-  // CHECK-LABEL: test_vmls_lane_u32
   return vmls_lane_u32(a, b, v, 1);
-  // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_u32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %v, <2 x i32> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[SUB]]
 uint32x4_t test_vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v) {
-  // CHECK-LABEL: test_vmlsq_lane_u32
   return vmlsq_lane_u32(a, b, v, 1);
-  // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmls_laneq_u16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[SUB]]
 uint16x4_t test_vmls_laneq_u16(uint16x4_t a, uint16x4_t b, uint16x8_t v) {
-  // CHECK-LABEL: test_vmls_laneq_u16
   return vmls_laneq_u16(a, b, v, 7);
-  // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmlsq_laneq_u16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[SUB]]
 uint16x8_t test_vmlsq_laneq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t v) {
-  // CHECK-LABEL: test_vmlsq_laneq_u16
   return vmlsq_laneq_u16(a, b, v, 7);
-  // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmls_laneq_u32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[SUB]]
 uint32x2_t test_vmls_laneq_u32(uint32x2_t a, uint32x2_t b, uint32x4_t v) {
-  // CHECK-LABEL: test_vmls_laneq_u32
   return vmls_laneq_u32(a, b, v, 3);
-  // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsq_laneq_u32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[SUB]]
 uint32x4_t test_vmlsq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v) {
-  // CHECK-LABEL: test_vmlsq_laneq_u32
   return vmlsq_laneq_u32(a, b, v, 3);
-  // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
 int32x4_t test_vqdmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vqdmlsl_laneq_s16
   return vqdmlsl_laneq_s16(a, b, v, 7);
-  // CHECK: sqdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
 int64x2_t test_vqdmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vqdmlsl_laneq_s32
   return vqdmlsl_laneq_s32(a, b, v, 3);
-  // CHECK: sqdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
 int32x4_t test_vqdmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) {
-  // CHECK-LABEL: test_vqdmlsl_high_laneq_s16
   return vqdmlsl_high_laneq_s16(a, b, v, 7);
-  // CHECK: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #2
+// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #2
+// CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
 int64x2_t test_vqdmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) {
-  // CHECK-LABEL: test_vqdmlsl_high_laneq_s32
   return vqdmlsl_high_laneq_s32(a, b, v, 3);
-  // CHECK: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vqdmulh_laneq_s16(<4 x i16> %a, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]]) #2
+// CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vqdmulh_laneq_s16(int16x4_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vqdmulh_laneq_s16
   return vqdmulh_laneq_s16(a, v, 7);
-  // CHECK: sqdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_laneq_s16(<8 x i16> %a, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]]) #2
+// CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vqdmulhq_laneq_s16(int16x8_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vqdmulhq_laneq_s16
   return vqdmulhq_laneq_s16(a, v, 7);
-  // CHECK: sqdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vqdmulh_laneq_s32(<2 x i32> %a, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]]) #2
+// CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vqdmulh_laneq_s32(int32x2_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vqdmulh_laneq_s32
   return vqdmulh_laneq_s32(a, v, 3);
-  // CHECK: sqdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_laneq_s32(<4 x i32> %a, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]]) #2
+// CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqdmulhq_laneq_s32(int32x4_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vqdmulhq_laneq_s32
   return vqdmulhq_laneq_s32(a, v, 3);
-  // CHECK: sqdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_laneq_s16(<4 x i16> %a, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]]) #2
+// CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vqrdmulh_laneq_s16(int16x4_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vqrdmulh_laneq_s16
   return vqrdmulh_laneq_s16(a, v, 7);
-  // CHECK: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_laneq_s16(<8 x i16> %a, <8 x i16> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i16> %v, <8 x i16> %v, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V1_I]]) #2
+// CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vqrdmulhq_laneq_s16(int16x8_t a, int16x8_t v) {
-  // CHECK-LABEL: test_vqrdmulhq_laneq_s16
   return vqrdmulhq_laneq_s16(a, v, 7);
-  // CHECK: sqrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[7]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_laneq_s32(<2 x i32> %a, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]]) #2
+// CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vqrdmulh_laneq_s32(int32x2_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vqrdmulh_laneq_s32
   return vqrdmulh_laneq_s32(a, v, 3);
-  // CHECK: sqrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_laneq_s32(<4 x i32> %a, <4 x i32> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i32> %v, <4 x i32> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]]) #2
+// CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqrdmulhq_laneq_s32(int32x4_t a, int32x4_t v) {
-  // CHECK-LABEL: test_vqrdmulhq_laneq_s32
   return vqrdmulhq_laneq_s32(a, v, 3);
-  // CHECK: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
diff --git a/test/CodeGen/aarch64-neon-3v.c b/test/CodeGen/aarch64-neon-3v.c
index ca3265214f93b..3581f780ffc68 100644
--- a/test/CodeGen/aarch64-neon-3v.c
+++ b/test/CodeGen/aarch64-neon-3v.c
@@ -1,486 +1,597 @@
-// REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -S -O3 -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
 
 // Test new aarch64 intrinsics and types
 
 #include <arm_neon.h>
 
+// CHECK-LABEL: define <8 x i8> @test_vand_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <8 x i8> %a, %b
+// CHECK:   ret <8 x i8> [[AND_I]]
 int8x8_t test_vand_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vand_s8
   return vand_s8(a, b);
-  // CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vandq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <16 x i8> %a, %b
+// CHECK:   ret <16 x i8> [[AND_I]]
 int8x16_t test_vandq_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vandq_s8
   return vandq_s8(a, b);
-  // CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vand_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <4 x i16> %a, %b
+// CHECK:   ret <4 x i16> [[AND_I]]
 int16x4_t test_vand_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vand_s16
   return vand_s16(a, b);
-  // CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vandq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <8 x i16> %a, %b
+// CHECK:   ret <8 x i16> [[AND_I]]
 int16x8_t test_vandq_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vandq_s16
   return vandq_s16(a, b);
-  // CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vand_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <2 x i32> %a, %b
+// CHECK:   ret <2 x i32> [[AND_I]]
 int32x2_t test_vand_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vand_s32
   return vand_s32(a, b);
-  // CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vandq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <4 x i32> %a, %b
+// CHECK:   ret <4 x i32> [[AND_I]]
 int32x4_t test_vandq_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vandq_s32
   return vandq_s32(a, b);
-  // CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vand_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <1 x i64> %a, %b
+// CHECK:   ret <1 x i64> [[AND_I]]
 int64x1_t test_vand_s64(int64x1_t a, int64x1_t b) {
-  // CHECK-LABEL: test_vand_s64
   return vand_s64(a, b);
-  // CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vandq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <2 x i64> %a, %b
+// CHECK:   ret <2 x i64> [[AND_I]]
 int64x2_t test_vandq_s64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vandq_s64
   return vandq_s64(a, b);
-  // CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vand_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <8 x i8> %a, %b
+// CHECK:   ret <8 x i8> [[AND_I]]
 uint8x8_t test_vand_u8(uint8x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vand_u8
   return vand_u8(a, b);
-  // CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vandq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <16 x i8> %a, %b
+// CHECK:   ret <16 x i8> [[AND_I]]
 uint8x16_t test_vandq_u8(uint8x16_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vandq_u8
   return vandq_u8(a, b);
-  // CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vand_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <4 x i16> %a, %b
+// CHECK:   ret <4 x i16> [[AND_I]]
 uint16x4_t test_vand_u16(uint16x4_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vand_u16
   return vand_u16(a, b);
-  // CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vandq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <8 x i16> %a, %b
+// CHECK:   ret <8 x i16> [[AND_I]]
 uint16x8_t test_vandq_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vandq_u16
   return vandq_u16(a, b);
-  // CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vand_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <2 x i32> %a, %b
+// CHECK:   ret <2 x i32> [[AND_I]]
 uint32x2_t test_vand_u32(uint32x2_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vand_u32
   return vand_u32(a, b);
-  // CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vandq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <4 x i32> %a, %b
+// CHECK:   ret <4 x i32> [[AND_I]]
 uint32x4_t test_vandq_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vandq_u32
   return vandq_u32(a, b);
-  // CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vand_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <1 x i64> %a, %b
+// CHECK:   ret <1 x i64> [[AND_I]]
 uint64x1_t test_vand_u64(uint64x1_t a, uint64x1_t b) {
-  // CHECK-LABEL: test_vand_u64
   return vand_u64(a, b);
-  // CHECK: and {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vandq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <2 x i64> %a, %b
+// CHECK:   ret <2 x i64> [[AND_I]]
 uint64x2_t test_vandq_u64(uint64x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vandq_u64
   return vandq_u64(a, b);
-  // CHECK: and {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vorr_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <8 x i8> %a, %b
+// CHECK:   ret <8 x i8> [[OR_I]]
 int8x8_t test_vorr_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vorr_s8
   return vorr_s8(a, b);
-  // CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vorrq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <16 x i8> %a, %b
+// CHECK:   ret <16 x i8> [[OR_I]]
 int8x16_t test_vorrq_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vorrq_s8
   return vorrq_s8(a, b);
-  // CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vorr_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <4 x i16> %a, %b
+// CHECK:   ret <4 x i16> [[OR_I]]
 int16x4_t test_vorr_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vorr_s16
   return vorr_s16(a, b);
-  // CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vorrq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <8 x i16> %a, %b
+// CHECK:   ret <8 x i16> [[OR_I]]
 int16x8_t test_vorrq_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vorrq_s16
   return vorrq_s16(a, b);
-  // CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vorr_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <2 x i32> %a, %b
+// CHECK:   ret <2 x i32> [[OR_I]]
 int32x2_t test_vorr_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vorr_s32
   return vorr_s32(a, b);
-  // CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vorrq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <4 x i32> %a, %b
+// CHECK:   ret <4 x i32> [[OR_I]]
 int32x4_t test_vorrq_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vorrq_s32
   return vorrq_s32(a, b);
-  // CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vorr_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <1 x i64> %a, %b
+// CHECK:   ret <1 x i64> [[OR_I]]
 int64x1_t test_vorr_s64(int64x1_t a, int64x1_t b) {
-  // CHECK-LABEL: test_vorr_s64
   return vorr_s64(a, b);
-  // CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vorrq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <2 x i64> %a, %b
+// CHECK:   ret <2 x i64> [[OR_I]]
 int64x2_t test_vorrq_s64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vorrq_s64
   return vorrq_s64(a, b);
-  // CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vorr_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <8 x i8> %a, %b
+// CHECK:   ret <8 x i8> [[OR_I]]
 uint8x8_t test_vorr_u8(uint8x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vorr_u8
   return vorr_u8(a, b);
-  // CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vorrq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <16 x i8> %a, %b
+// CHECK:   ret <16 x i8> [[OR_I]]
 uint8x16_t test_vorrq_u8(uint8x16_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vorrq_u8
   return vorrq_u8(a, b);
-  // CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vorr_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <4 x i16> %a, %b
+// CHECK:   ret <4 x i16> [[OR_I]]
 uint16x4_t test_vorr_u16(uint16x4_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vorr_u16
   return vorr_u16(a, b);
-  // CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vorrq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <8 x i16> %a, %b
+// CHECK:   ret <8 x i16> [[OR_I]]
 uint16x8_t test_vorrq_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vorrq_u16
   return vorrq_u16(a, b);
-  // CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vorr_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <2 x i32> %a, %b
+// CHECK:   ret <2 x i32> [[OR_I]]
 uint32x2_t test_vorr_u32(uint32x2_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vorr_u32
   return vorr_u32(a, b);
-  // CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vorrq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <4 x i32> %a, %b
+// CHECK:   ret <4 x i32> [[OR_I]]
 uint32x4_t test_vorrq_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vorrq_u32
   return vorrq_u32(a, b);
-  // CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vorr_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <1 x i64> %a, %b
+// CHECK:   ret <1 x i64> [[OR_I]]
 uint64x1_t test_vorr_u64(uint64x1_t a, uint64x1_t b) {
-  // CHECK-LABEL: test_vorr_u64
   return vorr_u64(a, b);
-  // CHECK: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vorrq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <2 x i64> %a, %b
+// CHECK:   ret <2 x i64> [[OR_I]]
 uint64x2_t test_vorrq_u64(uint64x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vorrq_u64
   return vorrq_u64(a, b);
-  // CHECK: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_veor_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <8 x i8> %a, %b
+// CHECK:   ret <8 x i8> [[XOR_I]]
 int8x8_t test_veor_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_veor_s8
   return veor_s8(a, b);
-  // CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_veorq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <16 x i8> %a, %b
+// CHECK:   ret <16 x i8> [[XOR_I]]
 int8x16_t test_veorq_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_veorq_s8
   return veorq_s8(a, b);
-  // CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_veor_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <4 x i16> %a, %b
+// CHECK:   ret <4 x i16> [[XOR_I]]
 int16x4_t test_veor_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_veor_s16
   return veor_s16(a, b);
-  // CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_veorq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <8 x i16> %a, %b
+// CHECK:   ret <8 x i16> [[XOR_I]]
 int16x8_t test_veorq_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_veorq_s16
   return veorq_s16(a, b);
-  // CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <2 x i32> @test_veor_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <2 x i32> %a, %b
+// CHECK:   ret <2 x i32> [[XOR_I]]
 int32x2_t test_veor_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_veor_s32
   return veor_s32(a, b);
-  // CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_veorq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <4 x i32> %a, %b
+// CHECK:   ret <4 x i32> [[XOR_I]]
 int32x4_t test_veorq_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_veorq_s32
   return veorq_s32(a, b);
-  // CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <1 x i64> @test_veor_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <1 x i64> %a, %b
+// CHECK:   ret <1 x i64> [[XOR_I]]
 int64x1_t test_veor_s64(int64x1_t a, int64x1_t b) {
-  // CHECK-LABEL: test_veor_s64
   return veor_s64(a, b);
-  // CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <2 x i64> @test_veorq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <2 x i64> %a, %b
+// CHECK:   ret <2 x i64> [[XOR_I]]
 int64x2_t test_veorq_s64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_veorq_s64
   return veorq_s64(a, b);
-  // CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_veor_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <8 x i8> %a, %b
+// CHECK:   ret <8 x i8> [[XOR_I]]
 uint8x8_t test_veor_u8(uint8x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_veor_u8
   return veor_u8(a, b);
-  // CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_veorq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <16 x i8> %a, %b
+// CHECK:   ret <16 x i8> [[XOR_I]]
 uint8x16_t test_veorq_u8(uint8x16_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_veorq_u8
   return veorq_u8(a, b);
-  // CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_veor_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <4 x i16> %a, %b
+// CHECK:   ret <4 x i16> [[XOR_I]]
 uint16x4_t test_veor_u16(uint16x4_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_veor_u16
   return veor_u16(a, b);
-  // CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_veorq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <8 x i16> %a, %b
+// CHECK:   ret <8 x i16> [[XOR_I]]
 uint16x8_t test_veorq_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_veorq_u16
   return veorq_u16(a, b);
-  // CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <2 x i32> @test_veor_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <2 x i32> %a, %b
+// CHECK:   ret <2 x i32> [[XOR_I]]
 uint32x2_t test_veor_u32(uint32x2_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_veor_u32
   return veor_u32(a, b);
-  // CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_veorq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <4 x i32> %a, %b
+// CHECK:   ret <4 x i32> [[XOR_I]]
 uint32x4_t test_veorq_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_veorq_u32
   return veorq_u32(a, b);
-  // CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <1 x i64> @test_veor_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <1 x i64> %a, %b
+// CHECK:   ret <1 x i64> [[XOR_I]]
 uint64x1_t test_veor_u64(uint64x1_t a, uint64x1_t b) {
-  // CHECK-LABEL: test_veor_u64
   return veor_u64(a, b);
-  // CHECK: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <2 x i64> @test_veorq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <2 x i64> %a, %b
+// CHECK:   ret <2 x i64> [[XOR_I]]
 uint64x2_t test_veorq_u64(uint64x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_veorq_u64
   return veorq_u64(a, b);
-  // CHECK: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vbic_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[AND_I:%.*]] = and <8 x i8> %a, [[NEG_I]]
+// CHECK:   ret <8 x i8> [[AND_I]]
 int8x8_t test_vbic_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vbic_s8
   return vbic_s8(a, b);
-  // CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vbicq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[AND_I:%.*]] = and <16 x i8> %a, [[NEG_I]]
+// CHECK:   ret <16 x i8> [[AND_I]]
 int8x16_t test_vbicq_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vbicq_s8
   return vbicq_s8(a, b);
-  // CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vbic_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[AND_I:%.*]] = and <4 x i16> %a, [[NEG_I]]
+// CHECK:   ret <4 x i16> [[AND_I]]
 int16x4_t test_vbic_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vbic_s16
   return vbic_s16(a, b);
-  // CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vbicq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[AND_I:%.*]] = and <8 x i16> %a, [[NEG_I]]
+// CHECK:   ret <8 x i16> [[AND_I]]
 int16x8_t test_vbicq_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vbicq_s16
   return vbicq_s16(a, b);
-  // CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vbic_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i32> %b, <i32 -1, i32 -1>
+// CHECK:   [[AND_I:%.*]] = and <2 x i32> %a, [[NEG_I]]
+// CHECK:   ret <2 x i32> [[AND_I]]
 int32x2_t test_vbic_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vbic_s32
   return vbic_s32(a, b);
-  // CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vbicq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK:   [[AND_I:%.*]] = and <4 x i32> %a, [[NEG_I]]
+// CHECK:   ret <4 x i32> [[AND_I]]
 int32x4_t test_vbicq_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vbicq_s32
   return vbicq_s32(a, b);
-  // CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vbic_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <1 x i64> %b, <i64 -1>
+// CHECK:   [[AND_I:%.*]] = and <1 x i64> %a, [[NEG_I]]
+// CHECK:   ret <1 x i64> [[AND_I]]
 int64x1_t test_vbic_s64(int64x1_t a, int64x1_t b) {
-  // CHECK-LABEL: test_vbic_s64
   return vbic_s64(a, b);
-  // CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vbicq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i64> %b, <i64 -1, i64 -1>
+// CHECK:   [[AND_I:%.*]] = and <2 x i64> %a, [[NEG_I]]
+// CHECK:   ret <2 x i64> [[AND_I]]
 int64x2_t test_vbicq_s64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vbicq_s64
   return vbicq_s64(a, b);
-  // CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vbic_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[AND_I:%.*]] = and <8 x i8> %a, [[NEG_I]]
+// CHECK:   ret <8 x i8> [[AND_I]]
 uint8x8_t test_vbic_u8(uint8x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vbic_u8
   return vbic_u8(a, b);
-  // CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vbicq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[AND_I:%.*]] = and <16 x i8> %a, [[NEG_I]]
+// CHECK:   ret <16 x i8> [[AND_I]]
 uint8x16_t test_vbicq_u8(uint8x16_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vbicq_u8
   return vbicq_u8(a, b);
-  // CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vbic_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[AND_I:%.*]] = and <4 x i16> %a, [[NEG_I]]
+// CHECK:   ret <4 x i16> [[AND_I]]
 uint16x4_t test_vbic_u16(uint16x4_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vbic_u16
   return vbic_u16(a, b);
-  // CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vbicq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[AND_I:%.*]] = and <8 x i16> %a, [[NEG_I]]
+// CHECK:   ret <8 x i16> [[AND_I]]
 uint16x8_t test_vbicq_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vbicq_u16
   return vbicq_u16(a, b);
-  // CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vbic_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i32> %b, <i32 -1, i32 -1>
+// CHECK:   [[AND_I:%.*]] = and <2 x i32> %a, [[NEG_I]]
+// CHECK:   ret <2 x i32> [[AND_I]]
 uint32x2_t test_vbic_u32(uint32x2_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vbic_u32
   return vbic_u32(a, b);
-  // CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vbicq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK:   [[AND_I:%.*]] = and <4 x i32> %a, [[NEG_I]]
+// CHECK:   ret <4 x i32> [[AND_I]]
 uint32x4_t test_vbicq_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vbicq_u32
   return vbicq_u32(a, b);
-  // CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vbic_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <1 x i64> %b, <i64 -1>
+// CHECK:   [[AND_I:%.*]] = and <1 x i64> %a, [[NEG_I]]
+// CHECK:   ret <1 x i64> [[AND_I]]
 uint64x1_t test_vbic_u64(uint64x1_t a, uint64x1_t b) {
-  // CHECK-LABEL: test_vbic_u64
   return vbic_u64(a, b);
-  // CHECK: bic {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vbicq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i64> %b, <i64 -1, i64 -1>
+// CHECK:   [[AND_I:%.*]] = and <2 x i64> %a, [[NEG_I]]
+// CHECK:   ret <2 x i64> [[AND_I]]
 uint64x2_t test_vbicq_u64(uint64x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vbicq_u64
   return vbicq_u64(a, b);
-  // CHECK: bic {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vorn_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[OR_I:%.*]] = or <8 x i8> %a, [[NEG_I]]
+// CHECK:   ret <8 x i8> [[OR_I]]
 int8x8_t test_vorn_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vorn_s8
   return vorn_s8(a, b);
-  // CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vornq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[OR_I:%.*]] = or <16 x i8> %a, [[NEG_I]]
+// CHECK:   ret <16 x i8> [[OR_I]]
 int8x16_t test_vornq_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vornq_s8
   return vornq_s8(a, b);
-  // CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vorn_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[OR_I:%.*]] = or <4 x i16> %a, [[NEG_I]]
+// CHECK:   ret <4 x i16> [[OR_I]]
 int16x4_t test_vorn_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vorn_s16
   return vorn_s16(a, b);
-  // CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vornq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[OR_I:%.*]] = or <8 x i16> %a, [[NEG_I]]
+// CHECK:   ret <8 x i16> [[OR_I]]
 int16x8_t test_vornq_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vornq_s16
   return vornq_s16(a, b);
-  // CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vorn_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i32> %b, <i32 -1, i32 -1>
+// CHECK:   [[OR_I:%.*]] = or <2 x i32> %a, [[NEG_I]]
+// CHECK:   ret <2 x i32> [[OR_I]]
 int32x2_t test_vorn_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vorn_s32
   return vorn_s32(a, b);
-  // CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vornq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK:   [[OR_I:%.*]] = or <4 x i32> %a, [[NEG_I]]
+// CHECK:   ret <4 x i32> [[OR_I]]
 int32x4_t test_vornq_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vornq_s32
   return vornq_s32(a, b);
-  // CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vorn_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <1 x i64> %b, <i64 -1>
+// CHECK:   [[OR_I:%.*]] = or <1 x i64> %a, [[NEG_I]]
+// CHECK:   ret <1 x i64> [[OR_I]]
 int64x1_t test_vorn_s64(int64x1_t a, int64x1_t b) {
-  // CHECK-LABEL: test_vorn_s64
   return vorn_s64(a, b);
-  // CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vornq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i64> %b, <i64 -1, i64 -1>
+// CHECK:   [[OR_I:%.*]] = or <2 x i64> %a, [[NEG_I]]
+// CHECK:   ret <2 x i64> [[OR_I]]
 int64x2_t test_vornq_s64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vornq_s64
   return vornq_s64(a, b);
-  // CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vorn_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[OR_I:%.*]] = or <8 x i8> %a, [[NEG_I]]
+// CHECK:   ret <8 x i8> [[OR_I]]
 uint8x8_t test_vorn_u8(uint8x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vorn_u8
   return vorn_u8(a, b);
-  // CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vornq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[OR_I:%.*]] = or <16 x i8> %a, [[NEG_I]]
+// CHECK:   ret <16 x i8> [[OR_I]]
 uint8x16_t test_vornq_u8(uint8x16_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vornq_u8
   return vornq_u8(a, b);
-  // CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vorn_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[OR_I:%.*]] = or <4 x i16> %a, [[NEG_I]]
+// CHECK:   ret <4 x i16> [[OR_I]]
 uint16x4_t test_vorn_u16(uint16x4_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vorn_u16
   return vorn_u16(a, b);
-  // CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vornq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[OR_I:%.*]] = or <8 x i16> %a, [[NEG_I]]
+// CHECK:   ret <8 x i16> [[OR_I]]
 uint16x8_t test_vornq_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vornq_u16
   return vornq_u16(a, b);
-  // CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vorn_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i32> %b, <i32 -1, i32 -1>
+// CHECK:   [[OR_I:%.*]] = or <2 x i32> %a, [[NEG_I]]
+// CHECK:   ret <2 x i32> [[OR_I]]
 uint32x2_t test_vorn_u32(uint32x2_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vorn_u32
   return vorn_u32(a, b);
-  // CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vornq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK:   [[OR_I:%.*]] = or <4 x i32> %a, [[NEG_I]]
+// CHECK:   ret <4 x i32> [[OR_I]]
 uint32x4_t test_vornq_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vornq_u32
   return vornq_u32(a, b);
-  // CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vorn_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <1 x i64> %b, <i64 -1>
+// CHECK:   [[OR_I:%.*]] = or <1 x i64> %a, [[NEG_I]]
+// CHECK:   ret <1 x i64> [[OR_I]]
 uint64x1_t test_vorn_u64(uint64x1_t a, uint64x1_t b) {
-  // CHECK-LABEL: test_vorn_u64
   return vorn_u64(a, b);
-  // CHECK: orn {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vornq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i64> %b, <i64 -1, i64 -1>
+// CHECK:   [[OR_I:%.*]] = or <2 x i64> %a, [[NEG_I]]
+// CHECK:   ret <2 x i64> [[OR_I]]
 uint64x2_t test_vornq_u64(uint64x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vornq_u64
   return vornq_u64(a, b);
-  // CHECK: orn {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
diff --git a/test/CodeGen/aarch64-neon-across.c b/test/CodeGen/aarch64-neon-across.c
index 00eb2e45b0890..04a7b26e8a277 100644
--- a/test/CodeGen/aarch64-neon-across.c
+++ b/test/CodeGen/aarch64-neon-across.c
@@ -1,271 +1,398 @@
-// REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN:   -ffp-contract=fast -S -O3 -o - %s | FileCheck %s
+// RUN:   -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
 
 // Test new aarch64 intrinsics and types
 
 #include <arm_neon.h>
 
+// CHECK-LABEL: define i16 @test_vaddlv_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.saddlv.i32.v8i8(<8 x i8> %a) #2
+// CHECK:   [[TMP0:%.*]] = trunc i32 [[VADDLV_I]] to i16
+// CHECK:   ret i16 [[TMP0]]
 int16_t test_vaddlv_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vaddlv_s8
   return vaddlv_s8(a);
-  // CHECK: saddlv {{h[0-9]+}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define i32 @test_vaddlv_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.saddlv.i32.v4i16(<4 x i16> [[TMP1]]) #2
+// CHECK:   ret i32 [[VADDLV_I]]
 int32_t test_vaddlv_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vaddlv_s16
   return vaddlv_s16(a);
-  // CHECK: saddlv {{s[0-9]+}}, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define i16 @test_vaddlv_u8(<8 x i8> %a) #0 {
+// CHECK:   [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddlv.i32.v8i8(<8 x i8> %a) #2
+// CHECK:   [[TMP0:%.*]] = trunc i32 [[VADDLV_I]] to i16
+// CHECK:   ret i16 [[TMP0]]
 uint16_t test_vaddlv_u8(uint8x8_t a) {
-  // CHECK-LABEL: test_vaddlv_u8
   return vaddlv_u8(a);
-  // CHECK: uaddlv {{h[0-9]+}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define i32 @test_vaddlv_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddlv.i32.v4i16(<4 x i16> [[TMP1]]) #2
+// CHECK:   ret i32 [[VADDLV_I]]
 uint32_t test_vaddlv_u16(uint16x4_t a) {
-  // CHECK-LABEL: test_vaddlv_u16
   return vaddlv_u16(a);
-  // CHECK: uaddlv {{s[0-9]+}}, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define i16 @test_vaddlvq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.saddlv.i32.v16i8(<16 x i8> %a) #2
+// CHECK:   [[TMP0:%.*]] = trunc i32 [[VADDLV_I]] to i16
+// CHECK:   ret i16 [[TMP0]]
 int16_t test_vaddlvq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vaddlvq_s8
   return vaddlvq_s8(a);
-  // CHECK: saddlv {{h[0-9]+}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define i32 @test_vaddlvq_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.saddlv.i32.v8i16(<8 x i16> [[TMP1]]) #2
+// CHECK:   ret i32 [[VADDLV_I]]
 int32_t test_vaddlvq_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vaddlvq_s16
   return vaddlvq_s16(a);
-  // CHECK: saddlv {{s[0-9]+}}, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define i64 @test_vaddlvq_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VADDLVQ_S32_I:%.*]] = call i64 @llvm.aarch64.neon.saddlv.i64.v4i32(<4 x i32> [[TMP1]]) #2
+// CHECK:   ret i64 [[VADDLVQ_S32_I]]
 int64_t test_vaddlvq_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vaddlvq_s32
   return vaddlvq_s32(a);
-  // CHECK: saddlv {{d[0-9]+}}, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define i16 @test_vaddlvq_u8(<16 x i8> %a) #0 {
+// CHECK:   [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddlv.i32.v16i8(<16 x i8> %a) #2
+// CHECK:   [[TMP0:%.*]] = trunc i32 [[VADDLV_I]] to i16
+// CHECK:   ret i16 [[TMP0]]
 uint16_t test_vaddlvq_u8(uint8x16_t a) {
-  // CHECK-LABEL: test_vaddlvq_u8
   return vaddlvq_u8(a);
-  // CHECK: uaddlv {{h[0-9]+}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define i32 @test_vaddlvq_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VADDLV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddlv.i32.v8i16(<8 x i16> [[TMP1]]) #2
+// CHECK:   ret i32 [[VADDLV_I]]
 uint32_t test_vaddlvq_u16(uint16x8_t a) {
-  // CHECK-LABEL: test_vaddlvq_u16
   return vaddlvq_u16(a);
-  // CHECK: uaddlv {{s[0-9]+}}, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define i64 @test_vaddlvq_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VADDLVQ_U32_I:%.*]] = call i64 @llvm.aarch64.neon.uaddlv.i64.v4i32(<4 x i32> [[TMP1]]) #2
+// CHECK:   ret i64 [[VADDLVQ_U32_I]]
 uint64_t test_vaddlvq_u32(uint32x4_t a) {
-  // CHECK-LABEL: test_vaddlvq_u32
   return vaddlvq_u32(a);
-  // CHECK: uaddlv {{d[0-9]+}}, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define i8 @test_vmaxv_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v8i8(<8 x i8> %a) #2
+// CHECK:   [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i8
+// CHECK:   ret i8 [[TMP0]]
 int8_t test_vmaxv_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vmaxv_s8
   return vmaxv_s8(a);
-  // CHECK: smaxv {{b[0-9]+}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define i16 @test_vmaxv_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v4i16(<4 x i16> [[TMP1]]) #2
+// CHECK:   [[TMP2:%.*]] = trunc i32 [[VMAXV_I]] to i16
+// CHECK:   ret i16 [[TMP2]]
 int16_t test_vmaxv_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vmaxv_s16
   return vmaxv_s16(a);
-  // CHECK: smaxv {{h[0-9]+}}, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define i8 @test_vmaxv_u8(<8 x i8> %a) #0 {
+// CHECK:   [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v8i8(<8 x i8> %a) #2
+// CHECK:   [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i8
+// CHECK:   ret i8 [[TMP0]]
 uint8_t test_vmaxv_u8(uint8x8_t a) {
-  // CHECK-LABEL: test_vmaxv_u8
   return vmaxv_u8(a);
-  // CHECK: umaxv {{b[0-9]+}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define i16 @test_vmaxv_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v4i16(<4 x i16> [[TMP1]]) #2
+// CHECK:   [[TMP2:%.*]] = trunc i32 [[VMAXV_I]] to i16
+// CHECK:   ret i16 [[TMP2]]
 uint16_t test_vmaxv_u16(uint16x4_t a) {
-  // CHECK-LABEL: test_vmaxv_u16
   return vmaxv_u16(a);
-  // CHECK: umaxv {{h[0-9]+}}, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define i8 @test_vmaxvq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v16i8(<16 x i8> %a) #2
+// CHECK:   [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i8
+// CHECK:   ret i8 [[TMP0]]
 int8_t test_vmaxvq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vmaxvq_s8
   return vmaxvq_s8(a);
-  // CHECK: smaxv {{b[0-9]+}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define i16 @test_vmaxvq_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v8i16(<8 x i16> [[TMP1]]) #2
+// CHECK:   [[TMP2:%.*]] = trunc i32 [[VMAXV_I]] to i16
+// CHECK:   ret i16 [[TMP2]]
 int16_t test_vmaxvq_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vmaxvq_s16
   return vmaxvq_s16(a);
-  // CHECK: smaxv {{h[0-9]+}}, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define i32 @test_vmaxvq_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VMAXVQ_S32_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v4i32(<4 x i32> [[TMP1]]) #2
+// CHECK:   ret i32 [[VMAXVQ_S32_I]]
 int32_t test_vmaxvq_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vmaxvq_s32
   return vmaxvq_s32(a);
-  // CHECK: smaxv {{s[0-9]+}}, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define i8 @test_vmaxvq_u8(<16 x i8> %a) #0 {
+// CHECK:   [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v16i8(<16 x i8> %a) #2
+// CHECK:   [[TMP0:%.*]] = trunc i32 [[VMAXV_I]] to i8
+// CHECK:   ret i8 [[TMP0]]
 uint8_t test_vmaxvq_u8(uint8x16_t a) {
-  // CHECK-LABEL: test_vmaxvq_u8
   return vmaxvq_u8(a);
-  // CHECK: umaxv {{b[0-9]+}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define i16 @test_vmaxvq_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VMAXV_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v8i16(<8 x i16> [[TMP1]]) #2
+// CHECK:   [[TMP2:%.*]] = trunc i32 [[VMAXV_I]] to i16
+// CHECK:   ret i16 [[TMP2]]
 uint16_t test_vmaxvq_u16(uint16x8_t a) {
-  // CHECK-LABEL: test_vmaxvq_u16
   return vmaxvq_u16(a);
-  // CHECK: umaxv {{h[0-9]+}}, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define i32 @test_vmaxvq_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VMAXVQ_U32_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v4i32(<4 x i32> [[TMP1]]) #2
+// CHECK:   ret i32 [[VMAXVQ_U32_I]]
 uint32_t test_vmaxvq_u32(uint32x4_t a) {
-  // CHECK-LABEL: test_vmaxvq_u32
   return vmaxvq_u32(a);
-  // CHECK: umaxv {{s[0-9]+}}, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define i8 @test_vminv_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v8i8(<8 x i8> %a) #2
+// CHECK:   [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i8
+// CHECK:   ret i8 [[TMP0]]
 int8_t test_vminv_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vminv_s8
   return vminv_s8(a);
-  // CHECK: sminv {{b[0-9]+}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define i16 @test_vminv_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v4i16(<4 x i16> [[TMP1]]) #2
+// CHECK:   [[TMP2:%.*]] = trunc i32 [[VMINV_I]] to i16
+// CHECK:   ret i16 [[TMP2]]
 int16_t test_vminv_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vminv_s16
   return vminv_s16(a);
-  // CHECK: sminv {{h[0-9]+}}, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define i8 @test_vminv_u8(<8 x i8> %a) #0 {
+// CHECK:   [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v8i8(<8 x i8> %a) #2
+// CHECK:   [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i8
+// CHECK:   ret i8 [[TMP0]]
 uint8_t test_vminv_u8(uint8x8_t a) {
-  // CHECK-LABEL: test_vminv_u8
   return vminv_u8(a);
-  // CHECK: uminv {{b[0-9]+}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define i16 @test_vminv_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v4i16(<4 x i16> [[TMP1]]) #2
+// CHECK:   [[TMP2:%.*]] = trunc i32 [[VMINV_I]] to i16
+// CHECK:   ret i16 [[TMP2]]
 uint16_t test_vminv_u16(uint16x4_t a) {
-  // CHECK-LABEL: test_vminv_u16
   return vminv_u16(a);
-  // CHECK: uminv {{h[0-9]+}}, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define i8 @test_vminvq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v16i8(<16 x i8> %a) #2
+// CHECK:   [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i8
+// CHECK:   ret i8 [[TMP0]]
 int8_t test_vminvq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vminvq_s8
   return vminvq_s8(a);
-  // CHECK: sminv {{b[0-9]+}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define i16 @test_vminvq_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v8i16(<8 x i16> [[TMP1]]) #2
+// CHECK:   [[TMP2:%.*]] = trunc i32 [[VMINV_I]] to i16
+// CHECK:   ret i16 [[TMP2]]
 int16_t test_vminvq_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vminvq_s16
   return vminvq_s16(a);
-  // CHECK: sminv {{h[0-9]+}}, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define i32 @test_vminvq_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VMINVQ_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v4i32(<4 x i32> [[TMP1]]) #2
+// CHECK:   ret i32 [[VMINVQ_S32_I]]
 int32_t test_vminvq_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vminvq_s32
   return vminvq_s32(a);
-  // CHECK: sminv {{s[0-9]+}}, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define i8 @test_vminvq_u8(<16 x i8> %a) #0 {
+// CHECK:   [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v16i8(<16 x i8> %a) #2
+// CHECK:   [[TMP0:%.*]] = trunc i32 [[VMINV_I]] to i8
+// CHECK:   ret i8 [[TMP0]]
 uint8_t test_vminvq_u8(uint8x16_t a) {
-  // CHECK-LABEL: test_vminvq_u8
   return vminvq_u8(a);
-  // CHECK: uminv {{b[0-9]+}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define i16 @test_vminvq_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VMINV_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v8i16(<8 x i16> [[TMP1]]) #2
+// CHECK:   [[TMP2:%.*]] = trunc i32 [[VMINV_I]] to i16
+// CHECK:   ret i16 [[TMP2]]
 uint16_t test_vminvq_u16(uint16x8_t a) {
-  // CHECK-LABEL: test_vminvq_u16
   return vminvq_u16(a);
-  // CHECK: uminv {{h[0-9]+}}, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define i32 @test_vminvq_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VMINVQ_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v4i32(<4 x i32> [[TMP1]]) #2
+// CHECK:   ret i32 [[VMINVQ_U32_I]]
 uint32_t test_vminvq_u32(uint32x4_t a) {
-  // CHECK-LABEL: test_vminvq_u32
   return vminvq_u32(a);
-  // CHECK: uminv {{s[0-9]+}}, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define i8 @test_vaddv_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v8i8(<8 x i8> %a) #2
+// CHECK:   [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i8
+// CHECK:   ret i8 [[TMP0]]
 int8_t test_vaddv_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vaddv_s8
   return vaddv_s8(a);
-  // CHECK: addv {{b[0-9]+}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define i16 @test_vaddv_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v4i16(<4 x i16> [[TMP1]]) #2
+// CHECK:   [[TMP2:%.*]] = trunc i32 [[VADDV_I]] to i16
+// CHECK:   ret i16 [[TMP2]]
 int16_t test_vaddv_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vaddv_s16
   return vaddv_s16(a);
-  // CHECK: addv {{h[0-9]+}}, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define i8 @test_vaddv_u8(<8 x i8> %a) #0 {
+// CHECK:   [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v8i8(<8 x i8> %a) #2
+// CHECK:   [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i8
+// CHECK:   ret i8 [[TMP0]]
 uint8_t test_vaddv_u8(uint8x8_t a) {
-  // CHECK-LABEL: test_vaddv_u8
   return vaddv_u8(a);
-  // CHECK: addv {{b[0-9]+}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define i16 @test_vaddv_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v4i16(<4 x i16> [[TMP1]]) #2
+// CHECK:   [[TMP2:%.*]] = trunc i32 [[VADDV_I]] to i16
+// CHECK:   ret i16 [[TMP2]]
 uint16_t test_vaddv_u16(uint16x4_t a) {
-  // CHECK-LABEL: test_vaddv_u16
   return vaddv_u16(a);
-  // CHECK: addv {{h[0-9]+}}, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define i8 @test_vaddvq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v16i8(<16 x i8> %a) #2
+// CHECK:   [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i8
+// CHECK:   ret i8 [[TMP0]]
 int8_t test_vaddvq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vaddvq_s8
   return vaddvq_s8(a);
-  // CHECK: addv {{b[0-9]+}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define i16 @test_vaddvq_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v8i16(<8 x i16> [[TMP1]]) #2
+// CHECK:   [[TMP2:%.*]] = trunc i32 [[VADDV_I]] to i16
+// CHECK:   ret i16 [[TMP2]]
 int16_t test_vaddvq_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vaddvq_s16
   return vaddvq_s16(a);
-  // CHECK: addv {{h[0-9]+}}, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define i32 @test_vaddvq_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VADDVQ_S32_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v4i32(<4 x i32> [[TMP1]]) #2
+// CHECK:   ret i32 [[VADDVQ_S32_I]]
 int32_t test_vaddvq_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vaddvq_s32
   return vaddvq_s32(a);
-  // CHECK: addv {{s[0-9]+}}, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define i8 @test_vaddvq_u8(<16 x i8> %a) #0 {
+// CHECK:   [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v16i8(<16 x i8> %a) #2
+// CHECK:   [[TMP0:%.*]] = trunc i32 [[VADDV_I]] to i8
+// CHECK:   ret i8 [[TMP0]]
 uint8_t test_vaddvq_u8(uint8x16_t a) {
-  // CHECK-LABEL: test_vaddvq_u8
   return vaddvq_u8(a);
-  // CHECK: addv {{b[0-9]+}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define i16 @test_vaddvq_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VADDV_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v8i16(<8 x i16> [[TMP1]]) #2
+// CHECK:   [[TMP2:%.*]] = trunc i32 [[VADDV_I]] to i16
+// CHECK:   ret i16 [[TMP2]]
 uint16_t test_vaddvq_u16(uint16x8_t a) {
-  // CHECK-LABEL: test_vaddvq_u16
   return vaddvq_u16(a);
-  // CHECK: addv {{h[0-9]+}}, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define i32 @test_vaddvq_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VADDVQ_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v4i32(<4 x i32> [[TMP1]]) #2
+// CHECK:   ret i32 [[VADDVQ_U32_I]]
 uint32_t test_vaddvq_u32(uint32x4_t a) {
-  // CHECK-LABEL: test_vaddvq_u32
   return vaddvq_u32(a);
-  // CHECK: addv {{s[0-9]+}}, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define float @test_vmaxvq_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VMAXVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxv.f32.v4f32(<4 x float> [[TMP1]]) #2
+// CHECK:   ret float [[VMAXVQ_F32_I]]
 float32_t test_vmaxvq_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vmaxvq_f32
   return vmaxvq_f32(a);
-  // CHECK: fmaxv {{s[0-9]+}}, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define float @test_vminvq_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VMINVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fminv.f32.v4f32(<4 x float> [[TMP1]]) #2
+// CHECK:   ret float [[VMINVQ_F32_I]]
 float32_t test_vminvq_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vminvq_f32
   return vminvq_f32(a);
-  // CHECK: fminv {{s[0-9]+}}, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define float @test_vmaxnmvq_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VMAXNMVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxnmv.f32.v4f32(<4 x float> [[TMP1]]) #2
+// CHECK:   ret float [[VMAXNMVQ_F32_I]]
 float32_t test_vmaxnmvq_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vmaxnmvq_f32
   return vmaxnmvq_f32(a);
-  // CHECK: fmaxnmv {{s[0-9]+}}, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define float @test_vminnmvq_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VMINNMVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.fminnmv.f32.v4f32(<4 x float> [[TMP1]]) #2
+// CHECK:   ret float [[VMINNMVQ_F32_I]]
 float32_t test_vminnmvq_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vminnmvq_f32
   return vminnmvq_f32(a);
-  // CHECK: fminnmv {{s[0-9]+}}, {{v[0-9]+}}.4s
 }
diff --git a/test/CodeGen/aarch64-neon-extract.c b/test/CodeGen/aarch64-neon-extract.c
index cc654cc9bc3e4..c84c861b8742e 100644
--- a/test/CodeGen/aarch64-neon-extract.c
+++ b/test/CodeGen/aarch64-neon-extract.c
@@ -1,148 +1,247 @@
 // REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN:   -ffp-contract=fast -S -O3 -o - %s | FileCheck %s
+// RUN:   -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
 
 // Test new aarch64 intrinsics and types
 
 #include <arm_neon.h>
 
+// CHECK-LABEL: define <8 x i8> @test_vext_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+// CHECK:   ret <8 x i8> [[VEXT]]
 int8x8_t test_vext_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vext_s8
   return vext_s8(a, b, 2);
-  // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?2}}
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vext_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+// CHECK:   ret <4 x i16> [[VEXT]]
 int16x4_t test_vext_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vext_s16
   return vext_s16(a, b, 3);
-  // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?6}}
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vext_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VEXT:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 1, i32 2>
+// CHECK:   ret <2 x i32> [[VEXT]]
 int32x2_t test_vext_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vext_s32
   return vext_s32(a, b, 1);
-  // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?4}}
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vext_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK:   ret <1 x i64> [[VEXT]]
 int64x1_t test_vext_s64(int64x1_t a, int64x1_t b) {
-  // CHECK-LABEL: test_vext_s64
   return vext_s64(a, b, 0);
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vextq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17>
+// CHECK:   ret <16 x i8> [[VEXT]]
 int8x16_t test_vextq_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vextq_s8
   return vextq_s8(a, b, 2);
-  // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?2}}
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vextq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
+// CHECK:   ret <8 x i16> [[VEXT]]
 int16x8_t test_vextq_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vextq_s16
   return vextq_s16(a, b, 3);
-  // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?6}}
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vextq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VEXT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+// CHECK:   ret <4 x i32> [[VEXT]]
 int32x4_t test_vextq_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vextq_s32
   return vextq_s32(a, b, 1);
-  // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?4}}
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vextq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32> <i32 1, i32 2>
+// CHECK:   ret <2 x i64> [[VEXT]]
 int64x2_t test_vextq_s64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vextq_s64
   return vextq_s64(a, b, 1);
-  // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?8}}
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vext_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+// CHECK:   ret <8 x i8> [[VEXT]]
 uint8x8_t test_vext_u8(uint8x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vext_u8
   return vext_u8(a, b, 2);
-  // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?2}}
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vext_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+// CHECK:   ret <4 x i16> [[VEXT]]
 uint16x4_t test_vext_u16(uint16x4_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vext_u16
   return vext_u16(a, b, 3);
-  // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?6}}
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vext_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VEXT:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 1, i32 2>
+// CHECK:   ret <2 x i32> [[VEXT]]
 uint32x2_t test_vext_u32(uint32x2_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vext_u32
   return vext_u32(a, b, 1);
-  // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?4}}
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vext_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK:   ret <1 x i64> [[VEXT]]
 uint64x1_t test_vext_u64(uint64x1_t a, uint64x1_t b) {
-  // CHECK-LABEL: test_vext_u64
   return vext_u64(a, b, 0);
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vextq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17>
+// CHECK:   ret <16 x i8> [[VEXT]]
 uint8x16_t test_vextq_u8(uint8x16_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vextq_u8
   return vextq_u8(a, b, 2);
-  // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?2}}
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vextq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
+// CHECK:   ret <8 x i16> [[VEXT]]
 uint16x8_t test_vextq_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vextq_u16
   return vextq_u16(a, b, 3);
-  // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?6}}
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vextq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VEXT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+// CHECK:   ret <4 x i32> [[VEXT]]
 uint32x4_t test_vextq_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vextq_u32
   return vextq_u32(a, b, 1);
-  // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?4}}
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vextq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32> <i32 1, i32 2>
+// CHECK:   ret <2 x i64> [[VEXT]]
 uint64x2_t test_vextq_u64(uint64x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vextq_u64
   return vextq_u64(a, b, 1);
-  // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?8}}
 }
 
+// CHECK-LABEL: define <2 x float> @test_vext_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[VEXT:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP3]], <2 x i32> <i32 1, i32 2>
+// CHECK:   ret <2 x float> [[VEXT]]
 float32x2_t test_vext_f32(float32x2_t a, float32x2_t b) {
-  // CHECK-LABEL: test_vext_f32
   return vext_f32(a, b, 1);
-  // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?4}}
 }
 
+// CHECK-LABEL: define <1 x double> @test_vext_f64(<1 x double> %a, <1 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK:   [[VEXT:%.*]] = shufflevector <1 x double> [[TMP2]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK:   ret <1 x double> [[VEXT]]
 float64x1_t test_vext_f64(float64x1_t a, float64x1_t b) {
-  // CHECK-LABEL: test_vext_f64
   return vext_f64(a, b, 0);
 }
 
+// CHECK-LABEL: define <4 x float> @test_vextq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[VEXT:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+// CHECK:   ret <4 x float> [[VEXT]]
 float32x4_t test_vextq_f32(float32x4_t a, float32x4_t b) {
-  // CHECK-LABEL: test_vextq_f32
   return vextq_f32(a, b, 1);
-  // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?4}}
 }
 
+// CHECK-LABEL: define <2 x double> @test_vextq_f64(<2 x double> %a, <2 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[VEXT:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP3]], <2 x i32> <i32 1, i32 2>
+// CHECK:   ret <2 x double> [[VEXT]]
 float64x2_t test_vextq_f64(float64x2_t a, float64x2_t b) {
-  // CHECK-LABEL: test_vextq_f64
   return vextq_f64(a, b, 1);
-  // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?8}}
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vext_p8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+// CHECK:   ret <8 x i8> [[VEXT]]
 poly8x8_t test_vext_p8(poly8x8_t a, poly8x8_t b) {
-  // CHECK-LABEL: test_vext_p8
   return vext_p8(a, b, 2);
-  // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?2}}
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vext_p16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+// CHECK:   ret <4 x i16> [[VEXT]]
 poly16x4_t test_vext_p16(poly16x4_t a, poly16x4_t b) {
-  // CHECK-LABEL: test_vext_p16
   return vext_p16(a, b, 3);
-  // CHECK: ext {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{(0x)?6}}
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vextq_p8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17>
+// CHECK:   ret <16 x i8> [[VEXT]]
 poly8x16_t test_vextq_p8(poly8x16_t a, poly8x16_t b) {
-  // CHECK-LABEL: test_vextq_p8
   return vextq_p8(a, b, 2);
-  // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?2}}
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vextq_p16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
+// CHECK:   ret <8 x i16> [[VEXT]]
 poly16x8_t test_vextq_p16(poly16x8_t a, poly16x8_t b) {
-  // CHECK-LABEL: test_vextq_p16
   return vextq_p16(a, b, 3);
-  // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{(0x)?6}}
 }
diff --git a/test/CodeGen/aarch64-neon-fcvt-intrinsics.c b/test/CodeGen/aarch64-neon-fcvt-intrinsics.c
index d1b9996b0b685..f2c238ebeb351 100644
--- a/test/CodeGen/aarch64-neon-fcvt-intrinsics.c
+++ b/test/CodeGen/aarch64-neon-fcvt-intrinsics.c
@@ -1,133 +1,153 @@
-// REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN:   -ffp-contract=fast -S -O3 -o - %s | FileCheck %s
+// RUN:   -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
 
 // Test new aarch64 intrinsics and types
 
 #include <arm_neon.h>
 
+// CHECK-LABEL: define float @test_vcvtxd_f32_f64(double %a) #0 {
+// CHECK:   [[VCVTXD_F32_F64_I:%.*]] = call float @llvm.aarch64.sisd.fcvtxn(double %a) #2
+// CHECK:   ret float [[VCVTXD_F32_F64_I]]
 float32_t test_vcvtxd_f32_f64(float64_t a) {
-// CHECK-LABEL: test_vcvtxd_f32_f64
-// CHECK: fcvtxn {{s[0-9]+}}, {{d[0-9]+}}
   return (float32_t)vcvtxd_f32_f64(a);
 }
 
+// CHECK-LABEL: define i32 @test_vcvtas_s32_f32(float %a) #0 {
+// CHECK:   [[VCVTAS_S32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtas.i32.f32(float %a) #2
+// CHECK:   ret i32 [[VCVTAS_S32_F32_I]]
 int32_t test_vcvtas_s32_f32(float32_t a) {
-// CHECK-LABEL: test_vcvtas_s32_f32
-// CHECK: fcvtas {{[ws][0-9]+}}, {{s[0-9]+}}
   return (int32_t)vcvtas_s32_f32(a);
 }
 
+// CHECK-LABEL: define i64 @test_test_vcvtad_s64_f64(double %a) #0 {
+// CHECK:   [[VCVTAD_S64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtas.i64.f64(double %a) #2
+// CHECK:   ret i64 [[VCVTAD_S64_F64_I]]
 int64_t test_test_vcvtad_s64_f64(float64_t a) {
-// CHECK-LABEL: test_test_vcvtad_s64_f64
-// CHECK: fcvtas {{[dx][0-9]+}}, {{d[0-9]+}}
   return (int64_t)vcvtad_s64_f64(a);
 }
 
+// CHECK-LABEL: define i32 @test_vcvtas_u32_f32(float %a) #0 {
+// CHECK:   [[VCVTAS_U32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtau.i32.f32(float %a) #2
+// CHECK:   ret i32 [[VCVTAS_U32_F32_I]]
 uint32_t test_vcvtas_u32_f32(float32_t a) {
-// CHECK-LABEL: test_vcvtas_u32_f32
-// CHECK: fcvtau {{[ws][0-9]+}}, {{s[0-9]+}}
   return (uint32_t)vcvtas_u32_f32(a);
 }
 
+// CHECK-LABEL: define i64 @test_vcvtad_u64_f64(double %a) #0 {
+// CHECK:   [[VCVTAD_U64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtau.i64.f64(double %a) #2
+// CHECK:   ret i64 [[VCVTAD_U64_F64_I]]
 uint64_t test_vcvtad_u64_f64(float64_t a) {
-// CHECK-LABEL: test_vcvtad_u64_f64
-// CHECK: fcvtau {{[xd][0-9]+}}, {{d[0-9]+}}
   return (uint64_t)vcvtad_u64_f64(a);
 }
 
+// CHECK-LABEL: define i32 @test_vcvtms_s32_f32(float %a) #0 {
+// CHECK:   [[VCVTMS_S32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtms.i32.f32(float %a) #2
+// CHECK:   ret i32 [[VCVTMS_S32_F32_I]]
 int32_t test_vcvtms_s32_f32(float32_t a) {
-// CHECK-LABEL: test_vcvtms_s32_f32
-// CHECK: fcvtms {{[sw][0-9]+}}, {{s[0-9]+}}
   return (int32_t)vcvtms_s32_f32(a);
 }
 
+// CHECK-LABEL: define i64 @test_vcvtmd_s64_f64(double %a) #0 {
+// CHECK:   [[VCVTMD_S64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtms.i64.f64(double %a) #2
+// CHECK:   ret i64 [[VCVTMD_S64_F64_I]]
 int64_t test_vcvtmd_s64_f64(float64_t a) {
-// CHECK-LABEL: test_vcvtmd_s64_f64
-// CHECK: fcvtms {{[dx][0-9]+}}, {{d[0-9]+}}
   return (int64_t)vcvtmd_s64_f64(a);
 }
 
+// CHECK-LABEL: define i32 @test_vcvtms_u32_f32(float %a) #0 {
+// CHECK:   [[VCVTMS_U32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtmu.i32.f32(float %a) #2
+// CHECK:   ret i32 [[VCVTMS_U32_F32_I]]
 uint32_t test_vcvtms_u32_f32(float32_t a) {
-// CHECK-LABEL: test_vcvtms_u32_f32
-// CHECK: fcvtmu {{[ws][0-9]+}}, {{s[0-9]+}}
   return (uint32_t)vcvtms_u32_f32(a);
 }
 
+// CHECK-LABEL: define i64 @test_vcvtmd_u64_f64(double %a) #0 {
+// CHECK:   [[VCVTMD_U64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtmu.i64.f64(double %a) #2
+// CHECK:   ret i64 [[VCVTMD_U64_F64_I]]
 uint64_t test_vcvtmd_u64_f64(float64_t a) {
-// CHECK-LABEL: test_vcvtmd_u64_f64
-// CHECK: fcvtmu {{[xd][0-9]+}}, {{d[0-9]+}}
   return (uint64_t)vcvtmd_u64_f64(a);
 }
 
+// CHECK-LABEL: define i32 @test_vcvtns_s32_f32(float %a) #0 {
+// CHECK:   [[VCVTNS_S32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtns.i32.f32(float %a) #2
+// CHECK:   ret i32 [[VCVTNS_S32_F32_I]]
 int32_t test_vcvtns_s32_f32(float32_t a) {
-// CHECK-LABEL: test_vcvtns_s32_f32
-// CHECK: fcvtns {{[sw][0-9]+}}, {{s[0-9]+}}
   return (int32_t)vcvtns_s32_f32(a);
 }
 
+// CHECK-LABEL: define i64 @test_vcvtnd_s64_f64(double %a) #0 {
+// CHECK:   [[VCVTND_S64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtns.i64.f64(double %a) #2
+// CHECK:   ret i64 [[VCVTND_S64_F64_I]]
 int64_t test_vcvtnd_s64_f64(float64_t a) {
-// CHECK-LABEL: test_vcvtnd_s64_f64
-// CHECK: fcvtns {{[dx][0-9]+}}, {{d[0-9]+}}
   return (int64_t)vcvtnd_s64_f64(a);
 }
 
+// CHECK-LABEL: define i32 @test_vcvtns_u32_f32(float %a) #0 {
+// CHECK:   [[VCVTNS_U32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtnu.i32.f32(float %a) #2
+// CHECK:   ret i32 [[VCVTNS_U32_F32_I]]
 uint32_t test_vcvtns_u32_f32(float32_t a) {
-// CHECK-LABEL: test_vcvtns_u32_f32
-// CHECK: fcvtnu {{[sw][0-9]+}}, {{s[0-9]+}}
   return (uint32_t)vcvtns_u32_f32(a);
 }
 
+// CHECK-LABEL: define i64 @test_vcvtnd_u64_f64(double %a) #0 {
+// CHECK:   [[VCVTND_U64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtnu.i64.f64(double %a) #2
+// CHECK:   ret i64 [[VCVTND_U64_F64_I]]
 uint64_t test_vcvtnd_u64_f64(float64_t a) {
-// CHECK-LABEL: test_vcvtnd_u64_f64
-// CHECK: fcvtnu {{[dx][0-9]+}}, {{d[0-9]+}}
   return (uint64_t)vcvtnd_u64_f64(a);
 }
 
+// CHECK-LABEL: define i32 @test_vcvtps_s32_f32(float %a) #0 {
+// CHECK:   [[VCVTPS_S32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtps.i32.f32(float %a) #2
+// CHECK:   ret i32 [[VCVTPS_S32_F32_I]]
 int32_t test_vcvtps_s32_f32(float32_t a) {
-// CHECK-LABEL: test_vcvtps_s32_f32
-// CHECK: fcvtps {{[sw][0-9]+}}, {{s[0-9]+}}
   return (int32_t)vcvtps_s32_f32(a);
 }
 
+// CHECK-LABEL: define i64 @test_vcvtpd_s64_f64(double %a) #0 {
+// CHECK:   [[VCVTPD_S64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtps.i64.f64(double %a) #2
+// CHECK:   ret i64 [[VCVTPD_S64_F64_I]]
 int64_t test_vcvtpd_s64_f64(float64_t a) {
-// CHECK-LABEL: test_vcvtpd_s64_f64
-// CHECK: fcvtps {{[dx][0-9]+}}, {{d[0-9]+}}
   return (int64_t)vcvtpd_s64_f64(a);
 }
 
+// CHECK-LABEL: define i32 @test_vcvtps_u32_f32(float %a) #0 {
+// CHECK:   [[VCVTPS_U32_F32_I:%.*]] = call i32 @llvm.aarch64.neon.fcvtpu.i32.f32(float %a) #2
+// CHECK:   ret i32 [[VCVTPS_U32_F32_I]]
 uint32_t test_vcvtps_u32_f32(float32_t a) {
-// CHECK-LABEL: test_vcvtps_u32_f32
-// CHECK: fcvtpu {{[sw][0-9]+}}, {{s[0-9]+}}
   return (uint32_t)vcvtps_u32_f32(a);
 }
 
+// CHECK-LABEL: define i64 @test_vcvtpd_u64_f64(double %a) #0 {
+// CHECK:   [[VCVTPD_U64_F64_I:%.*]] = call i64 @llvm.aarch64.neon.fcvtpu.i64.f64(double %a) #2
+// CHECK:   ret i64 [[VCVTPD_U64_F64_I]]
 uint64_t test_vcvtpd_u64_f64(float64_t a) {
-// CHECK-LABEL: test_vcvtpd_u64_f64
-// CHECK: fcvtpu {{[dx][0-9]+}}, {{d[0-9]+}}
   return (uint64_t)vcvtpd_u64_f64(a);
 }
 
+// CHECK-LABEL: define i32 @test_vcvts_s32_f32(float %a) #0 {
+// CHECK:   [[TMP0:%.*]] = fptosi float %a to i32
+// CHECK:   ret i32 [[TMP0]]
 int32_t test_vcvts_s32_f32(float32_t a) {
-// CHECK-LABEL: test_vcvts_s32_f32
-// CHECK: fcvtzs {{[sw][0-9]+}}, {{s[0-9]+}}
   return (int32_t)vcvts_s32_f32(a);
 }
 
+// CHECK-LABEL: define i64 @test_vcvtd_s64_f64(double %a) #0 {
+// CHECK:   [[TMP0:%.*]] = fptosi double %a to i64
+// CHECK:   ret i64 [[TMP0]]
 int64_t test_vcvtd_s64_f64(float64_t a) {
-// CHECK-LABEL: test_vcvtd_s64_f64
-// CHECK: fcvtzs {{[dx][0-9]+}}, {{d[0-9]+}}
   return (int64_t)vcvtd_s64_f64(a);
 }
 
+// CHECK-LABEL: define i32 @test_vcvts_u32_f32(float %a) #0 {
+// CHECK:   [[TMP0:%.*]] = fptoui float %a to i32
+// CHECK:   ret i32 [[TMP0]]
 uint32_t test_vcvts_u32_f32(float32_t a) {
-// CHECK-LABEL: test_vcvts_u32_f32
-// CHECK: fcvtzu {{[sw][0-9]+}}, {{s[0-9]+}}
   return (uint32_t)vcvts_u32_f32(a);
 }
 
+// CHECK-LABEL: define i64 @test_vcvtd_u64_f64(double %a) #0 {
+// CHECK:   [[TMP0:%.*]] = fptoui double %a to i64
+// CHECK:   ret i64 [[TMP0]]
 uint64_t test_vcvtd_u64_f64(float64_t a) {
-// CHECK-LABEL: test_vcvtd_u64_f64
-// CHECK: fcvtzu {{[dx][0-9]+}}, {{d[0-9]+}}
   return (uint64_t)vcvtd_u64_f64(a);
 }
diff --git a/test/CodeGen/aarch64-neon-fma.c b/test/CodeGen/aarch64-neon-fma.c
index ac808333365e7..836321af06093 100644
--- a/test/CodeGen/aarch64-neon-fma.c
+++ b/test/CodeGen/aarch64-neon-fma.c
@@ -1,199 +1,243 @@
-// REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -S -O3 -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
 
 // Test new aarch64 intrinsics and types
 
 #include <arm_neon.h>
 
+// CHECK-LABEL: define <2 x float> @test_vmla_n_f32(<2 x float> %a, <2 x float> %b, float %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %c, i32 1
+// CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %b, [[VECINIT1_I]]
+// CHECK:   [[ADD_I:%.*]] = fadd <2 x float> %a, [[MUL_I]]
+// CHECK:   ret <2 x float> [[ADD_I]]
 float32x2_t test_vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c) {
-  // CHECK-LABEL: test_vmla_n_f32
   return vmla_n_f32(a, b, c);
-  // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-  // CHECK: fadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
-  // CHECK-FMA: dup {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-  // CHECK-FMA: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x float> @test_vmlaq_n_f32(<4 x float> %a, <4 x float> %b, float %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %b, [[VECINIT3_I]]
+// CHECK:   [[ADD_I:%.*]] = fadd <4 x float> %a, [[MUL_I]]
+// CHECK:   ret <4 x float> [[ADD_I]]
 float32x4_t test_vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c) {
-  // CHECK-LABEL: test_vmlaq_n_f32
   return vmlaq_n_f32(a, b, c);
-  // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-  // CHECK: fadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK-FMA: dup {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-  // CHECK-FMA: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vmlaq_n_f64(<2 x double> %a, <2 x double> %b, double %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x double> undef, double %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double %c, i32 1
+// CHECK:   [[MUL_I:%.*]] = fmul <2 x double> %b, [[VECINIT1_I]]
+// CHECK:   [[ADD_I:%.*]] = fadd <2 x double> %a, [[MUL_I]]
+// CHECK:   ret <2 x double> [[ADD_I]]
 float64x2_t test_vmlaq_n_f64(float64x2_t a, float64x2_t b, float64_t c) {
-  // CHECK-LABEL: test_vmlaq_n_f64
   return vmlaq_n_f64(a, b, c);
-  // CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
-  // CHECK: fadd {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
-  // CHECK-FMA: dup {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
-  // CHECK-FMA: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <4 x float> @test_vmlsq_n_f32(<4 x float> %a, <4 x float> %b, float %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %b, [[VECINIT3_I]]
+// CHECK:   [[SUB_I:%.*]] = fsub <4 x float> %a, [[MUL_I]]
+// CHECK:   ret <4 x float> [[SUB_I]]
 float32x4_t test_vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c) {
-  // CHECK-LABEL: test_vmlsq_n_f32
   return vmlsq_n_f32(a, b, c);
-  // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-  // CHECK: fsub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK-FMA: dup {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-  // CHECK-FMA: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x float> @test_vmls_n_f32(<2 x float> %a, <2 x float> %b, float %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %c, i32 1
+// CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %b, [[VECINIT1_I]]
+// CHECK:   [[SUB_I:%.*]] = fsub <2 x float> %a, [[MUL_I]]
+// CHECK:   ret <2 x float> [[SUB_I]]
 float32x2_t test_vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c) {
-  // CHECK-LABEL: test_vmls_n_f32
   return vmls_n_f32(a, b, c);
-  // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-  // CHECK: fsub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
-  // CHECK-FMA: dup {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-  // CHECK-FMA: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vmlsq_n_f64(<2 x double> %a, <2 x double> %b, double %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x double> undef, double %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double %c, i32 1
+// CHECK:   [[MUL_I:%.*]] = fmul <2 x double> %b, [[VECINIT1_I]]
+// CHECK:   [[SUB_I:%.*]] = fsub <2 x double> %a, [[MUL_I]]
+// CHECK:   ret <2 x double> [[SUB_I]]
 float64x2_t test_vmlsq_n_f64(float64x2_t a, float64x2_t b, float64_t c) {
-  // CHECK-LABEL: test_vmlsq_n_f64
   return vmlsq_n_f64(a, b, c);
-  // CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
-  // CHECK: fsub {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
-  // CHECK-FMA: dup {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
-  // CHECK-FMA: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x float> @test_vmla_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = fadd <2 x float> %a, [[MUL]]
+// CHECK:   ret <2 x float> [[ADD]]
 float32x2_t test_vmla_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
-  // CHECK-LABEL: test_vmla_lane_f32_0
   return vmla_lane_f32(a, b, v, 0);
-  // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-  // CHECK: fadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
-  // CHECK-FMA: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vmlaq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = fadd <4 x float> %a, [[MUL]]
+// CHECK:   ret <4 x float> [[ADD]]
 float32x4_t test_vmlaq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {
-  // CHECK-LABEL: test_vmlaq_lane_f32_0
   return vmlaq_lane_f32(a, b, v, 0);
-  // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-  // CHECK: fadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK-FMA: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vmla_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = fadd <2 x float> %a, [[MUL]]
+// CHECK:   ret <2 x float> [[ADD]]
 float32x2_t test_vmla_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vmla_laneq_f32_0
   return vmla_laneq_f32(a, b, v, 0);
-  // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-  // CHECK: fadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
-  // CHECK-FMA: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vmlaq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = fadd <4 x float> %a, [[MUL]]
+// CHECK:   ret <4 x float> [[ADD]]
 float32x4_t test_vmlaq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vmlaq_laneq_f32_0
   return vmlaq_laneq_f32(a, b, v, 0);
-  // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-  // CHECK: fadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK-FMA: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vmls_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = fsub <2 x float> %a, [[MUL]]
+// CHECK:   ret <2 x float> [[SUB]]
 float32x2_t test_vmls_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) {
-  // CHECK-LABEL: test_vmls_lane_f32_0
   return vmls_lane_f32(a, b, v, 0);
-  // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-  // CHECK: fsub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
-  // CHECK-FMA: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vmlsq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = fsub <4 x float> %a, [[MUL]]
+// CHECK:   ret <4 x float> [[SUB]]
 float32x4_t test_vmlsq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) {
-  // CHECK-LABEL: test_vmlsq_lane_f32_0
   return vmlsq_lane_f32(a, b, v, 0);
-  // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-  // CHECK: fsub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK-FMA: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vmls_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = fsub <2 x float> %a, [[MUL]]
+// CHECK:   ret <2 x float> [[SUB]]
 float32x2_t test_vmls_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vmls_laneq_f32_0
   return vmls_laneq_f32(a, b, v, 0);
-  // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
-  // CHECK: fsub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
-  // CHECK-FMA: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vmlsq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> zeroinitializer
+// CHECK:   [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = fsub <4 x float> %a, [[MUL]]
+// CHECK:   ret <4 x float> [[SUB]]
 float32x4_t test_vmlsq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vmlsq_laneq_f32_0
   return vmlsq_laneq_f32(a, b, v, 0);
-  // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
-  // CHECK: fsub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK-FMA: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vmla_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = fadd <2 x float> %a, [[MUL]]
+// CHECK:   ret <2 x float> [[ADD]]
 float32x2_t test_vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
-  // CHECK-LABEL: test_vmla_lane_f32
   return vmla_lane_f32(a, b, v, 1);
-  // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-  // CHECK: fadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
-  // CHECK-FMA: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vmlaq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = fadd <4 x float> %a, [[MUL]]
+// CHECK:   ret <4 x float> [[ADD]]
 float32x4_t test_vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
-  // CHECK-LABEL: test_vmlaq_lane_f32
   return vmlaq_lane_f32(a, b, v, 1);
-  // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-  // CHECK: fadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK-FMA: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vmla_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = fadd <2 x float> %a, [[MUL]]
+// CHECK:   ret <2 x float> [[ADD]]
 float32x2_t test_vmla_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vmla_laneq_f32
   return vmla_laneq_f32(a, b, v, 3);
-  // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
-  // CHECK: fadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
-  // CHECK-FMA: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vmlaq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = fadd <4 x float> %a, [[MUL]]
+// CHECK:   ret <4 x float> [[ADD]]
 float32x4_t test_vmlaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vmlaq_laneq_f32
   return vmlaq_laneq_f32(a, b, v, 3);
-  // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-  // CHECK: fadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK-FMA: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vmls_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = fsub <2 x float> %a, [[MUL]]
+// CHECK:   ret <2 x float> [[SUB]]
 float32x2_t test_vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) {
-  // CHECK-LABEL: test_vmls_lane_f32
   return vmls_lane_f32(a, b, v, 1);
-  // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
-  // CHECK: fsub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
-  // CHECK-FMA: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vmlsq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = fsub <4 x float> %a, [[MUL]]
+// CHECK:   ret <4 x float> [[SUB]]
 float32x4_t test_vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) {
-  // CHECK-LABEL: test_vmlsq_lane_f32
   return vmlsq_lane_f32(a, b, v, 1);
-  // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
-  // CHECK: fsub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK-FMA: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[1]
 }
+// CHECK-LABEL: define <2 x float> @test_vmls_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> <i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = fsub <2 x float> %a, [[MUL]]
+// CHECK:   ret <2 x float> [[SUB]]
 float32x2_t test_vmls_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vmls_laneq_f32
   return vmls_laneq_f32(a, b, v, 3);
-  // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
-  // CHECK: fsub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
-  // CHECK-FMA: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vmlsq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = fsub <4 x float> %a, [[MUL]]
+// CHECK:   ret <4 x float> [[SUB]]
 float32x4_t test_vmlsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) {
-  // CHECK-LABEL: test_vmlsq_laneq_f32
   return vmlsq_laneq_f32(a, b, v, 3);
-  // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
-  // CHECK: fsub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK-FMA: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define <2 x double> @test_vfmaq_n_f64(<2 x double> %a, <2 x double> %b, double %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x double> undef, double %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double %c, i32 1
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x double> [[VECINIT1_I]] to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
+// CHECK:   [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[TMP4]], <2 x double> [[TMP5]], <2 x double> [[TMP3]]) #2
+// CHECK:   ret <2 x double> [[TMP6]]
 float64x2_t test_vfmaq_n_f64(float64x2_t a, float64x2_t b, float64_t c) {
-  // CHECK-LABEL: test_vfmaq_n_f64:
   return vfmaq_n_f64(a, b, c);
-  // CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+\.2d|v[0-9]+\.d\[0\]}}
 }
 
+// CHECK-LABEL: define <2 x double> @test_vfmsq_n_f64(<2 x double> %a, <2 x double> %b, double %c) #0 {
+// CHECK:   [[SUB_I:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %b
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x double> undef, double %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double %c, i32 1
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> [[SUB_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x double> [[VECINIT1_I]] to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
+// CHECK:   [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[TMP4]], <2 x double> [[TMP5]], <2 x double> [[TMP3]]) #2
+// CHECK:   ret <2 x double> [[TMP6]]
 float64x2_t test_vfmsq_n_f64(float64x2_t a, float64x2_t b, float64_t c) {
-  // CHECK-LABEL: test_vfmsq_n_f64:
   return vfmsq_n_f64(a, b, c);
-  // CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+\.2d|v[0-9]+\.d\[0\]}}
 }
diff --git a/test/CodeGen/aarch64-neon-intrinsics.c b/test/CodeGen/aarch64-neon-intrinsics.c
index b1207795900e2..b087ce91e568a 100644
--- a/test/CodeGen/aarch64-neon-intrinsics.c
+++ b/test/CodeGen/aarch64-neon-intrinsics.c
@@ -1,334 +1,394 @@
-// REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN:   -ffp-contract=fast -S -O3 -o - %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-ARM64
+// RUN:     -fallow-half-arguments-and-returns -ffp-contract=fast -S -emit-llvm -o - %s \
+// RUN: | opt -S -mem2reg \
+// RUN: | FileCheck %s
 
 // Test new aarch64 intrinsics and types
 
 #include <arm_neon.h>
 
+// CHECK-LABEL: define <8 x i8> @test_vadd_s8(<8 x i8> %v1, <8 x i8> %v2) #0 {
+// CHECK:   [[ADD_I:%.*]] = add <8 x i8> %v1, %v2
+// CHECK:   ret <8 x i8> [[ADD_I]]
 int8x8_t test_vadd_s8(int8x8_t v1, int8x8_t v2) {
-   // CHECK-LABEL: test_vadd_s8
   return vadd_s8(v1, v2);
-  // CHECK: add {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vadd_s16(<4 x i16> %v1, <4 x i16> %v2) #0 {
+// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %v1, %v2
+// CHECK:   ret <4 x i16> [[ADD_I]]
 int16x4_t test_vadd_s16(int16x4_t v1, int16x4_t v2) {
-   // CHECK-LABEL: test_vadd_s16
   return vadd_s16(v1, v2);
-  // CHECK: add {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vadd_s32(<2 x i32> %v1, <2 x i32> %v2) #0 {
+// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %v1, %v2
+// CHECK:   ret <2 x i32> [[ADD_I]]
 int32x2_t test_vadd_s32(int32x2_t v1, int32x2_t v2) {
-   // CHECK-LABEL: test_vadd_s32
   return vadd_s32(v1, v2);
-  // CHECK: add {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vadd_s64(<1 x i64> %v1, <1 x i64> %v2) #0 {
+// CHECK:   [[ADD_I:%.*]] = add <1 x i64> %v1, %v2
+// CHECK:   ret <1 x i64> [[ADD_I]]
 int64x1_t test_vadd_s64(int64x1_t v1, int64x1_t v2) {
-  // CHECK-LABEL: test_vadd_s64
   return vadd_s64(v1, v2);
-  // CHECK: add {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: define <2 x float> @test_vadd_f32(<2 x float> %v1, <2 x float> %v2) #0 {
+// CHECK:   [[ADD_I:%.*]] = fadd <2 x float> %v1, %v2
+// CHECK:   ret <2 x float> [[ADD_I]]
 float32x2_t test_vadd_f32(float32x2_t v1, float32x2_t v2) {
-   // CHECK-LABEL: test_vadd_f32
   return vadd_f32(v1, v2);
-  // CHECK: fadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vadd_u8(<8 x i8> %v1, <8 x i8> %v2) #0 {
+// CHECK:   [[ADD_I:%.*]] = add <8 x i8> %v1, %v2
+// CHECK:   ret <8 x i8> [[ADD_I]]
 uint8x8_t test_vadd_u8(uint8x8_t v1, uint8x8_t v2) {
-   // CHECK-LABEL: test_vadd_u8
   return vadd_u8(v1, v2);
-  // CHECK: add {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vadd_u16(<4 x i16> %v1, <4 x i16> %v2) #0 {
+// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %v1, %v2
+// CHECK:   ret <4 x i16> [[ADD_I]]
 uint16x4_t test_vadd_u16(uint16x4_t v1, uint16x4_t v2) {
-   // CHECK-LABEL: test_vadd_u16
   return vadd_u16(v1, v2);
-  // CHECK: add {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vadd_u32(<2 x i32> %v1, <2 x i32> %v2) #0 {
+// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %v1, %v2
+// CHECK:   ret <2 x i32> [[ADD_I]]
 uint32x2_t test_vadd_u32(uint32x2_t v1, uint32x2_t v2) {
-   // CHECK-LABEL: test_vadd_u32
   return vadd_u32(v1, v2);
-  // CHECK: add {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vadd_u64(<1 x i64> %v1, <1 x i64> %v2) #0 {
+// CHECK:   [[ADD_I:%.*]] = add <1 x i64> %v1, %v2
+// CHECK:   ret <1 x i64> [[ADD_I]]
 uint64x1_t test_vadd_u64(uint64x1_t v1, uint64x1_t v2) {
-   // CHECK-LABEL: test_vadd_u64
   return vadd_u64(v1, v2);
-  // CHECK: add {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vaddq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 {
+// CHECK:   [[ADD_I:%.*]] = add <16 x i8> %v1, %v2
+// CHECK:   ret <16 x i8> [[ADD_I]]
 int8x16_t test_vaddq_s8(int8x16_t v1, int8x16_t v2) {
-   // CHECK-LABEL: test_vaddq_s8
   return vaddq_s8(v1, v2);
-  // CHECK: add {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vaddq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 {
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %v1, %v2
+// CHECK:   ret <8 x i16> [[ADD_I]]
 int16x8_t test_vaddq_s16(int16x8_t v1, int16x8_t v2) {
-   // CHECK-LABEL: test_vaddq_s16
   return vaddq_s16(v1, v2);
-  // CHECK: add {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vaddq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 {
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %v1, %v2
+// CHECK:   ret <4 x i32> [[ADD_I]]
 int32x4_t test_vaddq_s32(int32x4_t v1,int32x4_t  v2) {
-   // CHECK-LABEL: test_vaddq_s32
   return vaddq_s32(v1, v2);
-  // CHECK: add {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vaddq_s64(<2 x i64> %v1, <2 x i64> %v2) #0 {
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %v1, %v2
+// CHECK:   ret <2 x i64> [[ADD_I]]
 int64x2_t test_vaddq_s64(int64x2_t v1, int64x2_t v2) {
-   // CHECK-LABEL: test_vaddq_s64
   return vaddq_s64(v1, v2);
-  // CHECK: add {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <4 x float> @test_vaddq_f32(<4 x float> %v1, <4 x float> %v2) #0 {
+// CHECK:   [[ADD_I:%.*]] = fadd <4 x float> %v1, %v2
+// CHECK:   ret <4 x float> [[ADD_I]]
 float32x4_t test_vaddq_f32(float32x4_t v1, float32x4_t v2) {
-   // CHECK-LABEL: test_vaddq_f32
   return vaddq_f32(v1, v2);
-  // CHECK: fadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vaddq_f64(<2 x double> %v1, <2 x double> %v2) #0 {
+// CHECK:   [[ADD_I:%.*]] = fadd <2 x double> %v1, %v2
+// CHECK:   ret <2 x double> [[ADD_I]]
 float64x2_t test_vaddq_f64(float64x2_t v1, float64x2_t v2) {
-  // CHECK-LABEL: test_vaddq_f64
   return vaddq_f64(v1, v2);
-  // CHECK: fadd {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vaddq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 {
+// CHECK:   [[ADD_I:%.*]] = add <16 x i8> %v1, %v2
+// CHECK:   ret <16 x i8> [[ADD_I]]
 uint8x16_t test_vaddq_u8(uint8x16_t v1, uint8x16_t v2) {
-   // CHECK-LABEL: test_vaddq_u8
   return vaddq_u8(v1, v2);
-  // CHECK: add {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vaddq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 {
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %v1, %v2
+// CHECK:   ret <8 x i16> [[ADD_I]]
 uint16x8_t test_vaddq_u16(uint16x8_t v1, uint16x8_t v2) {
-   // CHECK-LABEL: test_vaddq_u16
   return vaddq_u16(v1, v2);
-  // CHECK: add {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vaddq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 {
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %v1, %v2
+// CHECK:   ret <4 x i32> [[ADD_I]]
 uint32x4_t test_vaddq_u32(uint32x4_t v1, uint32x4_t v2) {
-   // CHECK: vaddq_u32
   return vaddq_u32(v1, v2);
-  // CHECK: add {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vaddq_u64(<2 x i64> %v1, <2 x i64> %v2) #0 {
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %v1, %v2
+// CHECK:   ret <2 x i64> [[ADD_I]]
 uint64x2_t test_vaddq_u64(uint64x2_t v1, uint64x2_t v2) {
-   // CHECK-LABEL: test_vaddq_u64
   return vaddq_u64(v1, v2);
-  // CHECK: add {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vsub_s8(<8 x i8> %v1, <8 x i8> %v2) #0 {
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i8> %v1, %v2
+// CHECK:   ret <8 x i8> [[SUB_I]]
 int8x8_t test_vsub_s8(int8x8_t v1, int8x8_t v2) {
-   // CHECK-LABEL: test_vsub_s8
   return vsub_s8(v1, v2);
-  // CHECK: sub {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
+// CHECK-LABEL: define <4 x i16> @test_vsub_s16(<4 x i16> %v1, <4 x i16> %v2) #0 {
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %v1, %v2
+// CHECK:   ret <4 x i16> [[SUB_I]]
 int16x4_t test_vsub_s16(int16x4_t v1, int16x4_t v2) {
-   // CHECK-LABEL: test_vsub_s16
   return vsub_s16(v1, v2);
-  // CHECK: sub {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
+// CHECK-LABEL: define <2 x i32> @test_vsub_s32(<2 x i32> %v1, <2 x i32> %v2) #0 {
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %v1, %v2
+// CHECK:   ret <2 x i32> [[SUB_I]]
 int32x2_t test_vsub_s32(int32x2_t v1, int32x2_t v2) {
-   // CHECK-LABEL: test_vsub_s32
   return vsub_s32(v1, v2);
-  // CHECK: sub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vsub_s64(<1 x i64> %v1, <1 x i64> %v2) #0 {
+// CHECK:   [[SUB_I:%.*]] = sub <1 x i64> %v1, %v2
+// CHECK:   ret <1 x i64> [[SUB_I]]
 int64x1_t test_vsub_s64(int64x1_t v1, int64x1_t v2) {
-   // CHECK-LABEL: test_vsub_s64
   return vsub_s64(v1, v2);
-  // CHECK: sub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: define <2 x float> @test_vsub_f32(<2 x float> %v1, <2 x float> %v2) #0 {
+// CHECK:   [[SUB_I:%.*]] = fsub <2 x float> %v1, %v2
+// CHECK:   ret <2 x float> [[SUB_I]]
 float32x2_t test_vsub_f32(float32x2_t v1, float32x2_t v2) {
-   // CHECK-LABEL: test_vsub_f32
   return vsub_f32(v1, v2);
-  // CHECK: fsub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vsub_u8(<8 x i8> %v1, <8 x i8> %v2) #0 {
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i8> %v1, %v2
+// CHECK:   ret <8 x i8> [[SUB_I]]
 uint8x8_t test_vsub_u8(uint8x8_t v1, uint8x8_t v2) {
-   // CHECK-LABEL: test_vsub_u8
   return vsub_u8(v1, v2);
-  // CHECK: sub {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vsub_u16(<4 x i16> %v1, <4 x i16> %v2) #0 {
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %v1, %v2
+// CHECK:   ret <4 x i16> [[SUB_I]]
 uint16x4_t test_vsub_u16(uint16x4_t v1, uint16x4_t v2) {
-   // CHECK-LABEL: test_vsub_u16
   return vsub_u16(v1, v2);
-  // CHECK: sub {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vsub_u32(<2 x i32> %v1, <2 x i32> %v2) #0 {
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %v1, %v2
+// CHECK:   ret <2 x i32> [[SUB_I]]
 uint32x2_t test_vsub_u32(uint32x2_t v1, uint32x2_t v2) {
-   // CHECK-LABEL: test_vsub_u32
   return vsub_u32(v1, v2);
-  // CHECK: sub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vsub_u64(<1 x i64> %v1, <1 x i64> %v2) #0 {
+// CHECK:   [[SUB_I:%.*]] = sub <1 x i64> %v1, %v2
+// CHECK:   ret <1 x i64> [[SUB_I]]
 uint64x1_t test_vsub_u64(uint64x1_t v1, uint64x1_t v2) {
-   // CHECK-LABEL: test_vsub_u64
   return vsub_u64(v1, v2);
-  // CHECK: sub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vsubq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 {
+// CHECK:   [[SUB_I:%.*]] = sub <16 x i8> %v1, %v2
+// CHECK:   ret <16 x i8> [[SUB_I]]
 int8x16_t test_vsubq_s8(int8x16_t v1, int8x16_t v2) {
-   // CHECK-LABEL: test_vsubq_s8
   return vsubq_s8(v1, v2);
-  // CHECK: sub {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vsubq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 {
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %v1, %v2
+// CHECK:   ret <8 x i16> [[SUB_I]]
 int16x8_t test_vsubq_s16(int16x8_t v1, int16x8_t v2) {
-   // CHECK-LABEL: test_vsubq_s16
   return vsubq_s16(v1, v2);
-  // CHECK: sub {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vsubq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 {
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %v1, %v2
+// CHECK:   ret <4 x i32> [[SUB_I]]
 int32x4_t test_vsubq_s32(int32x4_t v1,int32x4_t  v2) {
-   // CHECK-LABEL: test_vsubq_s32
   return vsubq_s32(v1, v2);
-  // CHECK: sub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vsubq_s64(<2 x i64> %v1, <2 x i64> %v2) #0 {
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %v1, %v2
+// CHECK:   ret <2 x i64> [[SUB_I]]
 int64x2_t test_vsubq_s64(int64x2_t v1, int64x2_t v2) {
-   // CHECK-LABEL: test_vsubq_s64
   return vsubq_s64(v1, v2);
-  // CHECK: sub {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <4 x float> @test_vsubq_f32(<4 x float> %v1, <4 x float> %v2) #0 {
+// CHECK:   [[SUB_I:%.*]] = fsub <4 x float> %v1, %v2
+// CHECK:   ret <4 x float> [[SUB_I]]
 float32x4_t test_vsubq_f32(float32x4_t v1, float32x4_t v2) {
-   // CHECK-LABEL: test_vsubq_f32
   return vsubq_f32(v1, v2);
-  // CHECK: fsub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vsubq_f64(<2 x double> %v1, <2 x double> %v2) #0 {
+// CHECK:   [[SUB_I:%.*]] = fsub <2 x double> %v1, %v2
+// CHECK:   ret <2 x double> [[SUB_I]]
 float64x2_t test_vsubq_f64(float64x2_t v1, float64x2_t v2) {
-  // CHECK-LABEL: test_vsubq_f64
   return vsubq_f64(v1, v2);
-  // CHECK: fsub {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vsubq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 {
+// CHECK:   [[SUB_I:%.*]] = sub <16 x i8> %v1, %v2
+// CHECK:   ret <16 x i8> [[SUB_I]]
 uint8x16_t test_vsubq_u8(uint8x16_t v1, uint8x16_t v2) {
-   // CHECK-LABEL: test_vsubq_u8
   return vsubq_u8(v1, v2);
-  // CHECK: sub {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vsubq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 {
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %v1, %v2
+// CHECK:   ret <8 x i16> [[SUB_I]]
 uint16x8_t test_vsubq_u16(uint16x8_t v1, uint16x8_t v2) {
-   // CHECK-LABEL: test_vsubq_u16
   return vsubq_u16(v1, v2);
-  // CHECK: sub {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vsubq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 {
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %v1, %v2
+// CHECK:   ret <4 x i32> [[SUB_I]]
 uint32x4_t test_vsubq_u32(uint32x4_t v1, uint32x4_t v2) {
-   // CHECK: vsubq_u32
   return vsubq_u32(v1, v2);
-  // CHECK: sub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vsubq_u64(<2 x i64> %v1, <2 x i64> %v2) #0 {
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %v1, %v2
+// CHECK:   ret <2 x i64> [[SUB_I]]
 uint64x2_t test_vsubq_u64(uint64x2_t v1, uint64x2_t v2) {
-   // CHECK-LABEL: test_vsubq_u64
   return vsubq_u64(v1, v2);
-  // CHECK: sub {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vmul_s8(<8 x i8> %v1, <8 x i8> %v2) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i8> %v1, %v2
+// CHECK:   ret <8 x i8> [[MUL_I]]
 int8x8_t test_vmul_s8(int8x8_t v1, int8x8_t v2) {
-  // CHECK-LABEL: test_vmul_s8
   return vmul_s8(v1, v2);
-  // CHECK: mul {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmul_s16(<4 x i16> %v1, <4 x i16> %v2) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %v1, %v2
+// CHECK:   ret <4 x i16> [[MUL_I]]
 int16x4_t test_vmul_s16(int16x4_t v1, int16x4_t v2) {
-  // CHECK-LABEL: test_vmul_s16
   return vmul_s16(v1, v2);
-  // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmul_s32(<2 x i32> %v1, <2 x i32> %v2) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %v1, %v2
+// CHECK:   ret <2 x i32> [[MUL_I]]
 int32x2_t test_vmul_s32(int32x2_t v1, int32x2_t v2) {
-  // CHECK-LABEL: test_vmul_s32
   return vmul_s32(v1, v2);
-  // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <2 x float> @test_vmul_f32(<2 x float> %v1, <2 x float> %v2) #0 {
+// CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %v1, %v2
+// CHECK:   ret <2 x float> [[MUL_I]]
 float32x2_t test_vmul_f32(float32x2_t v1, float32x2_t v2) {
-  // CHECK-LABEL: test_vmul_f32
   return vmul_f32(v1, v2);
-  // CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
 
+// CHECK-LABEL: define <8 x i8> @test_vmul_u8(<8 x i8> %v1, <8 x i8> %v2) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i8> %v1, %v2
+// CHECK:   ret <8 x i8> [[MUL_I]]
 uint8x8_t test_vmul_u8(uint8x8_t v1, uint8x8_t v2) {
-  // CHECK-LABEL: test_vmul_u8
   return vmul_u8(v1, v2);
-  // CHECK: mul {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmul_u16(<4 x i16> %v1, <4 x i16> %v2) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %v1, %v2
+// CHECK:   ret <4 x i16> [[MUL_I]]
 uint16x4_t test_vmul_u16(uint16x4_t v1, uint16x4_t v2) {
-  // CHECK-LABEL: test_vmul_u16
   return vmul_u16(v1, v2);
-  // CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmul_u32(<2 x i32> %v1, <2 x i32> %v2) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %v1, %v2
+// CHECK:   ret <2 x i32> [[MUL_I]]
 uint32x2_t test_vmul_u32(uint32x2_t v1, uint32x2_t v2) {
-  // CHECK-LABEL: test_vmul_u32
   return vmul_u32(v1, v2);
-  // CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vmulq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <16 x i8> %v1, %v2
+// CHECK:   ret <16 x i8> [[MUL_I]]
 int8x16_t test_vmulq_s8(int8x16_t v1, int8x16_t v2) {
-  // CHECK-LABEL: test_vmulq_s8
   return vmulq_s8(v1, v2);
-  // CHECK: mul {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmulq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %v1, %v2
+// CHECK:   ret <8 x i16> [[MUL_I]]
 int16x8_t test_vmulq_s16(int16x8_t v1, int16x8_t v2) {
-  // CHECK-LABEL: test_vmulq_s16
   return vmulq_s16(v1, v2);
-  // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmulq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %v1, %v2
+// CHECK:   ret <4 x i32> [[MUL_I]]
 int32x4_t test_vmulq_s32(int32x4_t v1, int32x4_t v2) {
-  // CHECK-LABEL: test_vmulq_s32
   return vmulq_s32(v1, v2);
-  // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
     
+// CHECK-LABEL: define <16 x i8> @test_vmulq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <16 x i8> %v1, %v2
+// CHECK:   ret <16 x i8> [[MUL_I]]
 uint8x16_t test_vmulq_u8(uint8x16_t v1, uint8x16_t v2) {
-  // CHECK-LABEL: test_vmulq_u8
   return vmulq_u8(v1, v2);
-  // CHECK: mul {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmulq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %v1, %v2
+// CHECK:   ret <8 x i16> [[MUL_I]]
 uint16x8_t test_vmulq_u16(uint16x8_t v1, uint16x8_t v2) {
-  // CHECK-LABEL: test_vmulq_u16
   return vmulq_u16(v1, v2);
-  // CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmulq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %v1, %v2
+// CHECK:   ret <4 x i32> [[MUL_I]]
 uint32x4_t test_vmulq_u32(uint32x4_t v1, uint32x4_t v2) {
-  // CHECK-LABEL: test_vmulq_u32
   return vmulq_u32(v1, v2);
-  // CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <4 x float> @test_vmulq_f32(<4 x float> %v1, <4 x float> %v2) #0 {
+// CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %v1, %v2
+// CHECK:   ret <4 x float> [[MUL_I]]
 float32x4_t test_vmulq_f32(float32x4_t v1, float32x4_t v2) {
-  // CHECK-LABEL: test_vmulq_f32
   return vmulq_f32(v1, v2);
-  // CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vmulq_f64(<2 x double> %v1, <2 x double> %v2) #0 {
+// CHECK:   [[MUL_I:%.*]] = fmul <2 x double> %v1, %v2
+// CHECK:   ret <2 x double> [[MUL_I]]
 float64x2_t test_vmulq_f64(float64x2_t v1, float64x2_t v2) {
-  // CHECK-LABEL: test_vmulq_f64
   return vmulq_f64(v1, v2);
-  // CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vmul_p8(<8 x i8> %v1, <8 x i8> %v2) #0 {
+// CHECK:   [[VMUL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.pmul.v8i8(<8 x i8> %v1, <8 x i8> %v2) #4
+// CHECK:   ret <8 x i8> [[VMUL_V_I]]
 poly8x8_t test_vmul_p8(poly8x8_t v1, poly8x8_t v2) {
   //  test_vmul_p8
   return vmul_p8(v1, v2);
   //  pmul {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vmulq_p8(<16 x i8> %v1, <16 x i8> %v2) #0 {
+// CHECK:   [[VMULQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.pmul.v16i8(<16 x i8> %v1, <16 x i8> %v2) #4
+// CHECK:   ret <16 x i8> [[VMULQ_V_I]]
 poly8x16_t test_vmulq_p8(poly8x16_t v1, poly8x16_t v2) {
   // test_vmulq_p8
   return vmulq_p8(v1, v2);
@@ -336,1295 +396,2132 @@ poly8x16_t test_vmulq_p8(poly8x16_t v1, poly8x16_t v2) {
 }
 
 
+// CHECK-LABEL: define <8 x i8> @test_vmla_s8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i8> %v2, %v3
+// CHECK:   [[ADD_I:%.*]] = add <8 x i8> %v1, [[MUL_I]]
+// CHECK:   ret <8 x i8> [[ADD_I]]
 int8x8_t test_vmla_s8(int8x8_t v1, int8x8_t v2, int8x8_t v3) {
-  // CHECK-LABEL: test_vmla_s8
   return vmla_s8(v1, v2, v3);
-  // CHECK: mla {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vmla_s16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %v2, %v3
+// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %v1, [[MUL_I]]
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[ADD_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vmla_s16(int16x4_t v1, int16x4_t v2, int16x4_t v3) {
-  // CHECK-LABEL: test_vmla_s16
   return vmla_s16(v1, v2, v3);
-  // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmla_s32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %v2, %v3
+// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %v1, [[MUL_I]]
+// CHECK:   ret <2 x i32> [[ADD_I]]
 int32x2_t test_vmla_s32(int32x2_t v1, int32x2_t v2, int32x2_t v3) {
-  // CHECK-LABEL: test_vmla_s32
   return vmla_s32(v1, v2, v3);
-  // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <2 x float> @test_vmla_f32(<2 x float> %v1, <2 x float> %v2, <2 x float> %v3) #0 {
+// CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %v2, %v3
+// CHECK:   [[ADD_I:%.*]] = fadd <2 x float> %v1, [[MUL_I]]
+// CHECK:   ret <2 x float> [[ADD_I]]
 float32x2_t test_vmla_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) {
-  // CHECK-LABEL: test_vmla_f32
   return vmla_f32(v1, v2, v3);
-  // CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vmla_u8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i8> %v2, %v3
+// CHECK:   [[ADD_I:%.*]] = add <8 x i8> %v1, [[MUL_I]]
+// CHECK:   ret <8 x i8> [[ADD_I]]
 uint8x8_t test_vmla_u8(uint8x8_t v1, uint8x8_t v2, uint8x8_t v3) {
-  // CHECK-LABEL: test_vmla_u8
   return vmla_u8(v1, v2, v3);
-  // CHECK: mla {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmla_u16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %v2, %v3
+// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %v1, [[MUL_I]]
+// CHECK:   ret <4 x i16> [[ADD_I]]
 uint16x4_t test_vmla_u16(uint16x4_t v1, uint16x4_t v2, uint16x4_t v3) {
-  // CHECK-LABEL: test_vmla_u16
   return vmla_u16(v1, v2, v3);
-  // CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmla_u32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %v2, %v3
+// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %v1, [[MUL_I]]
+// CHECK:   ret <2 x i32> [[ADD_I]]
 uint32x2_t test_vmla_u32(uint32x2_t v1, uint32x2_t v2, uint32x2_t v3) {
-  // CHECK-LABEL: test_vmla_u32
   return vmla_u32(v1, v2, v3);
-  // CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vmlaq_s8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <16 x i8> %v2, %v3
+// CHECK:   [[ADD_I:%.*]] = add <16 x i8> %v1, [[MUL_I]]
+// CHECK:   ret <16 x i8> [[ADD_I]]
 int8x16_t test_vmlaq_s8(int8x16_t v1, int8x16_t v2, int8x16_t v3) {
-  // CHECK-LABEL: test_vmlaq_s8
   return vmlaq_s8(v1, v2, v3);
-  // CHECK: mla {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmlaq_s16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %v2, %v3
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %v1, [[MUL_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 int16x8_t test_vmlaq_s16(int16x8_t v1, int16x8_t v2, int16x8_t v3) {
-  // CHECK-LABEL: test_vmlaq_s16
   return vmlaq_s16(v1, v2, v3);
-  // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlaq_s32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %v2, %v3
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %v1, [[MUL_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 int32x4_t test_vmlaq_s32(int32x4_t v1, int32x4_t v2, int32x4_t v3) {
-  // CHECK-LABEL: test_vmlaq_s32
   return vmlaq_s32(v1, v2, v3);
-  // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 } 
 
+// CHECK-LABEL: define <4 x float> @test_vmlaq_f32(<4 x float> %v1, <4 x float> %v2, <4 x float> %v3) #0 {
+// CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %v2, %v3
+// CHECK:   [[ADD_I:%.*]] = fadd <4 x float> %v1, [[MUL_I]]
+// CHECK:   ret <4 x float> [[ADD_I]]
 float32x4_t test_vmlaq_f32(float32x4_t v1, float32x4_t v2, float32x4_t v3) {
-  // CHECK-LABEL: test_vmlaq_f32
   return vmlaq_f32(v1, v2, v3);
-  // CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vmlaq_u8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <16 x i8> %v2, %v3
+// CHECK:   [[ADD_I:%.*]] = add <16 x i8> %v1, [[MUL_I]]
+// CHECK:   ret <16 x i8> [[ADD_I]]
 uint8x16_t test_vmlaq_u8(uint8x16_t v1, uint8x16_t v2, uint8x16_t v3) {
-   // CHECK-LABEL: test_vmlaq_u8
   return vmlaq_u8(v1, v2, v3);
-  // CHECK: mla {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmlaq_u16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %v2, %v3
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %v1, [[MUL_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 uint16x8_t test_vmlaq_u16(uint16x8_t v1, uint16x8_t v2, uint16x8_t v3) {
-  // CHECK-LABEL: test_vmlaq_u16
   return vmlaq_u16(v1, v2, v3);
-  // CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlaq_u32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %v2, %v3
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %v1, [[MUL_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 uint32x4_t test_vmlaq_u32(uint32x4_t v1, uint32x4_t v2, uint32x4_t v3) {
-  // CHECK-LABEL: test_vmlaq_u32
   return vmlaq_u32(v1, v2, v3);
-  // CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vmlaq_f64(<2 x double> %v1, <2 x double> %v2, <2 x double> %v3) #0 {
+// CHECK:   [[MUL_I:%.*]] = fmul <2 x double> %v2, %v3
+// CHECK:   [[ADD_I:%.*]] = fadd <2 x double> %v1, [[MUL_I]]
+// CHECK:   ret <2 x double> [[ADD_I]]
 float64x2_t test_vmlaq_f64(float64x2_t v1, float64x2_t v2, float64x2_t v3) {
-  // CHECK-LABEL: test_vmlaq_f64
   return vmlaq_f64(v1, v2, v3);
-  // CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vmls_s8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i8> %v2, %v3
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i8> %v1, [[MUL_I]]
+// CHECK:   ret <8 x i8> [[SUB_I]]
 int8x8_t test_vmls_s8(int8x8_t v1, int8x8_t v2, int8x8_t v3) {
-  // CHECK-LABEL: test_vmls_s8
   return vmls_s8(v1, v2, v3);
-  // CHECK: mls {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vmls_s16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %v2, %v3
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %v1, [[MUL_I]]
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SUB_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vmls_s16(int16x4_t v1, int16x4_t v2, int16x4_t v3) {
-  // CHECK-LABEL: test_vmls_s16
   return vmls_s16(v1, v2, v3);
-  // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmls_s32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %v2, %v3
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %v1, [[MUL_I]]
+// CHECK:   ret <2 x i32> [[SUB_I]]
 int32x2_t test_vmls_s32(int32x2_t v1, int32x2_t v2, int32x2_t v3) {
-  // CHECK-LABEL: test_vmls_s32
   return vmls_s32(v1, v2, v3);
-  // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <2 x float> @test_vmls_f32(<2 x float> %v1, <2 x float> %v2, <2 x float> %v3) #0 {
+// CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %v2, %v3
+// CHECK:   [[SUB_I:%.*]] = fsub <2 x float> %v1, [[MUL_I]]
+// CHECK:   ret <2 x float> [[SUB_I]]
 float32x2_t test_vmls_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) {
-  // CHECK-LABEL: test_vmls_f32
   return vmls_f32(v1, v2, v3);
-  // CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vmls_u8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i8> %v2, %v3
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i8> %v1, [[MUL_I]]
+// CHECK:   ret <8 x i8> [[SUB_I]]
 uint8x8_t test_vmls_u8(uint8x8_t v1, uint8x8_t v2, uint8x8_t v3) {
-  // CHECK-LABEL: test_vmls_u8
   return vmls_u8(v1, v2, v3);
-  // CHECK: mls {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmls_u16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %v2, %v3
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %v1, [[MUL_I]]
+// CHECK:   ret <4 x i16> [[SUB_I]]
 uint16x4_t test_vmls_u16(uint16x4_t v1, uint16x4_t v2, uint16x4_t v3) {
-  // CHECK-LABEL: test_vmls_u16
   return vmls_u16(v1, v2, v3);
-  // CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmls_u32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %v2, %v3
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %v1, [[MUL_I]]
+// CHECK:   ret <2 x i32> [[SUB_I]]
 uint32x2_t test_vmls_u32(uint32x2_t v1, uint32x2_t v2, uint32x2_t v3) {
-  // CHECK-LABEL: test_vmls_u32
   return vmls_u32(v1, v2, v3);
-  // CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
+// CHECK-LABEL: define <16 x i8> @test_vmlsq_s8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <16 x i8> %v2, %v3
+// CHECK:   [[SUB_I:%.*]] = sub <16 x i8> %v1, [[MUL_I]]
+// CHECK:   ret <16 x i8> [[SUB_I]]
 int8x16_t test_vmlsq_s8(int8x16_t v1, int8x16_t v2, int8x16_t v3) {
-  // CHECK-LABEL: test_vmlsq_s8
   return vmlsq_s8(v1, v2, v3);
-  // CHECK: mls {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmlsq_s16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %v2, %v3
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %v1, [[MUL_I]]
+// CHECK:   ret <8 x i16> [[SUB_I]]
 int16x8_t test_vmlsq_s16(int16x8_t v1, int16x8_t v2, int16x8_t v3) {
-  // CHECK-LABEL: test_vmlsq_s16
   return vmlsq_s16(v1, v2, v3);
-  // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsq_s32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %v2, %v3
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %v1, [[MUL_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 int32x4_t test_vmlsq_s32(int32x4_t v1, int32x4_t v2, int32x4_t v3) {
-  // CHECK-LABEL: test_vmlsq_s32
   return vmlsq_s32(v1, v2, v3);
-  // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <4 x float> @test_vmlsq_f32(<4 x float> %v1, <4 x float> %v2, <4 x float> %v3) #0 {
+// CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %v2, %v3
+// CHECK:   [[SUB_I:%.*]] = fsub <4 x float> %v1, [[MUL_I]]
+// CHECK:   ret <4 x float> [[SUB_I]]
 float32x4_t test_vmlsq_f32(float32x4_t v1, float32x4_t v2, float32x4_t v3) {
-  // CHECK-LABEL: test_vmlsq_f32
   return vmlsq_f32(v1, v2, v3);
-  // CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
+// CHECK-LABEL: define <16 x i8> @test_vmlsq_u8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <16 x i8> %v2, %v3
+// CHECK:   [[SUB_I:%.*]] = sub <16 x i8> %v1, [[MUL_I]]
+// CHECK:   ret <16 x i8> [[SUB_I]]
 uint8x16_t test_vmlsq_u8(uint8x16_t v1, uint8x16_t v2, uint8x16_t v3) {
-  // CHECK-LABEL: test_vmlsq_u8
   return vmlsq_u8(v1, v2, v3);
-  // CHECK: mls {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmlsq_u16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %v2, %v3
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %v1, [[MUL_I]]
+// CHECK:   ret <8 x i16> [[SUB_I]]
 uint16x8_t test_vmlsq_u16(uint16x8_t v1, uint16x8_t v2, uint16x8_t v3) {
-  // CHECK-LABEL: test_vmlsq_u16
   return vmlsq_u16(v1, v2, v3);
-  // CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmlsq_u32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %v2, %v3
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %v1, [[MUL_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 uint32x4_t test_vmlsq_u32(uint32x4_t v1, uint32x4_t v2, uint32x4_t v3) {
-  // CHECK-LABEL: test_vmlsq_u32
   return vmlsq_u32(v1, v2, v3);
-  // CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vmlsq_f64(<2 x double> %v1, <2 x double> %v2, <2 x double> %v3) #0 {
+// CHECK:   [[MUL_I:%.*]] = fmul <2 x double> %v2, %v3
+// CHECK:   [[SUB_I:%.*]] = fsub <2 x double> %v1, [[MUL_I]]
+// CHECK:   ret <2 x double> [[SUB_I]]
 float64x2_t test_vmlsq_f64(float64x2_t v1, float64x2_t v2, float64x2_t v3) {
-  // CHECK-LABEL: test_vmlsq_f64
   return vmlsq_f64(v1, v2, v3);
-  // CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
+// CHECK-LABEL: define <2 x float> @test_vfma_f32(<2 x float> %v1, <2 x float> %v2, <2 x float> %v3) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v3 to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK:   [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x float> [[TMP3]]) #4
+// CHECK:   ret <2 x float> [[TMP6]]
 float32x2_t test_vfma_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) {
-  // CHECK-LABEL: test_vfma_f32
   return vfma_f32(v1, v2, v3);
-  // CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x float> @test_vfmaq_f32(<4 x float> %v1, <4 x float> %v2, <4 x float> %v3) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v3 to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK:   [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x float> [[TMP3]]) #4
+// CHECK:   ret <4 x float> [[TMP6]]
 float32x4_t test_vfmaq_f32(float32x4_t v1, float32x4_t v2, float32x4_t v3) {
-  // CHECK-LABEL: test_vfmaq_f32
   return vfmaq_f32(v1, v2, v3);
-  // CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vfmaq_f64(<2 x double> %v1, <2 x double> %v2, <2 x double> %v3) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %v3 to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
+// CHECK:   [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[TMP4]], <2 x double> [[TMP5]], <2 x double> [[TMP3]]) #4
+// CHECK:   ret <2 x double> [[TMP6]]
 float64x2_t test_vfmaq_f64(float64x2_t v1, float64x2_t v2, float64x2_t v3) {
-  // CHECK-LABEL: test_vfmaq_f64
   return vfmaq_f64(v1, v2, v3);
-  // CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
+// CHECK-LABEL: define <2 x float> @test_vfms_f32(<2 x float> %v1, <2 x float> %v2, <2 x float> %v3) #0 {
+// CHECK:   [[SUB_I:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %v2
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SUB_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v3 to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK:   [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x float> [[TMP3]]) #4
+// CHECK:   ret <2 x float> [[TMP6]]
 float32x2_t test_vfms_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) {
-  // CHECK-LABEL: test_vfms_f32
   return vfms_f32(v1, v2, v3);
-  // CHECK: fmls v0.2s, {{v1.2s, v2.2s|v2.2s, v1.2s}}
 }
 
+// CHECK-LABEL: define <4 x float> @test_vfmsq_f32(<4 x float> %v1, <4 x float> %v2, <4 x float> %v3) #0 {
+// CHECK:   [[SUB_I:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %v2
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SUB_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v3 to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK:   [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x float> [[TMP3]]) #4
+// CHECK:   ret <4 x float> [[TMP6]]
 float32x4_t test_vfmsq_f32(float32x4_t v1, float32x4_t v2, float32x4_t v3) {
-  // CHECK-LABEL: test_vfmsq_f32
   return vfmsq_f32(v1, v2, v3);
-  // CHECK: fmls v0.4s, {{v1.4s, v2.4s|v2.4s, v1.4s}}
 }
 
+// CHECK-LABEL: define <2 x double> @test_vfmsq_f64(<2 x double> %v1, <2 x double> %v2, <2 x double> %v3) #0 {
+// CHECK:   [[SUB_I:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %v2
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> [[SUB_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %v3 to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
+// CHECK:   [[TMP6:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[TMP4]], <2 x double> [[TMP5]], <2 x double> [[TMP3]]) #4
+// CHECK:   ret <2 x double> [[TMP6]]
 float64x2_t test_vfmsq_f64(float64x2_t v1, float64x2_t v2, float64x2_t v3) {
-  // CHECK: vfmsq_f64
   return vfmsq_f64(v1, v2, v3);
-  // CHECK: fmls v0.2d, {{v1.2d, v2.2d|v2.2d, v1.2d}}
 }
 
+// CHECK-LABEL: define <2 x double> @test_vdivq_f64(<2 x double> %v1, <2 x double> %v2) #0 {
+// CHECK:   [[DIV_I:%.*]] = fdiv <2 x double> %v1, %v2
+// CHECK:   ret <2 x double> [[DIV_I]]
 float64x2_t test_vdivq_f64(float64x2_t v1, float64x2_t v2) {
-  // CHECK-LABEL: test_vdivq_f64
   return vdivq_f64(v1, v2);
-  // CHECK: fdiv {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <4 x float> @test_vdivq_f32(<4 x float> %v1, <4 x float> %v2) #0 {
+// CHECK:   [[DIV_I:%.*]] = fdiv <4 x float> %v1, %v2
+// CHECK:   ret <4 x float> [[DIV_I]]
 float32x4_t test_vdivq_f32(float32x4_t v1, float32x4_t v2) {
-  // CHECK-LABEL: test_vdivq_f32
   return vdivq_f32(v1, v2);
-  // CHECK: fdiv {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x float> @test_vdiv_f32(<2 x float> %v1, <2 x float> %v2) #0 {
+// CHECK:   [[DIV_I:%.*]] = fdiv <2 x float> %v1, %v2
+// CHECK:   ret <2 x float> [[DIV_I]]
 float32x2_t test_vdiv_f32(float32x2_t v1, float32x2_t v2) {
-  // CHECK-LABEL: test_vdiv_f32
   return vdiv_f32(v1, v2);
-  // CHECK: fdiv {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vaba_s8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) #0 {
+// CHECK:   [[VABD_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %v2, <8 x i8> %v3) #4
+// CHECK:   [[ADD_I:%.*]] = add <8 x i8> %v1, [[VABD_I_I]]
+// CHECK:   ret <8 x i8> [[ADD_I]]
 int8x8_t test_vaba_s8(int8x8_t v1, int8x8_t v2, int8x8_t v3) {
-  // CHECK-LABEL: test_vaba_s8
   return vaba_s8(v1, v2, v3);
-  // CHECK: saba {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vaba_s16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %v3 to <8 x i8>
+// CHECK:   [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I_I]], <4 x i16> [[VABD1_I_I]]) #4
+// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %v1, [[VABD2_I_I]]
+// CHECK:   ret <4 x i16> [[ADD_I]]
 int16x4_t test_vaba_s16(int16x4_t v1, int16x4_t v2, int16x4_t v3) {
-  // CHECK-LABEL: test_vaba_s16
   return vaba_s16(v1, v2, v3);
-  // CHECK: saba {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vaba_s32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %v3 to <8 x i8>
+// CHECK:   [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I_I]], <2 x i32> [[VABD1_I_I]]) #4
+// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %v1, [[VABD2_I_I]]
+// CHECK:   ret <2 x i32> [[ADD_I]]
 int32x2_t test_vaba_s32(int32x2_t v1, int32x2_t v2, int32x2_t v3) {
-  // CHECK-LABEL: test_vaba_s32
   return vaba_s32(v1, v2, v3);
-  // CHECK: saba {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vaba_u8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) #0 {
+// CHECK:   [[VABD_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %v2, <8 x i8> %v3) #4
+// CHECK:   [[ADD_I:%.*]] = add <8 x i8> %v1, [[VABD_I_I]]
+// CHECK:   ret <8 x i8> [[ADD_I]]
 uint8x8_t test_vaba_u8(uint8x8_t v1, uint8x8_t v2, uint8x8_t v3) {
-  // CHECK-LABEL: test_vaba_u8
   return vaba_u8(v1, v2, v3);
-  // CHECK: uaba {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vaba_u16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %v3 to <8 x i8>
+// CHECK:   [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I_I]], <4 x i16> [[VABD1_I_I]]) #4
+// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %v1, [[VABD2_I_I]]
+// CHECK:   ret <4 x i16> [[ADD_I]]
 uint16x4_t test_vaba_u16(uint16x4_t v1, uint16x4_t v2, uint16x4_t v3) {
-  // CHECK-LABEL: test_vaba_u16
   return vaba_u16(v1, v2, v3);
-  // CHECK: uaba {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vaba_u32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %v3 to <8 x i8>
+// CHECK:   [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I_I]], <2 x i32> [[VABD1_I_I]]) #4
+// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %v1, [[VABD2_I_I]]
+// CHECK:   ret <2 x i32> [[ADD_I]]
 uint32x2_t test_vaba_u32(uint32x2_t v1, uint32x2_t v2, uint32x2_t v3) {
-  // CHECK-LABEL: test_vaba_u32
   return vaba_u32(v1, v2, v3);
-  // CHECK: uaba {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vabaq_s8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) #0 {
+// CHECK:   [[VABD_I_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %v2, <16 x i8> %v3) #4
+// CHECK:   [[ADD_I:%.*]] = add <16 x i8> %v1, [[VABD_I_I]]
+// CHECK:   ret <16 x i8> [[ADD_I]]
 int8x16_t test_vabaq_s8(int8x16_t v1, int8x16_t v2, int8x16_t v3) {
-  // CHECK-LABEL: test_vabaq_s8
   return vabaq_s8(v1, v2, v3);
-  // CHECK: saba {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vabaq_s16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %v3 to <16 x i8>
+// CHECK:   [[VABD_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VABD1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VABD2_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> [[VABD_I_I]], <8 x i16> [[VABD1_I_I]]) #4
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %v1, [[VABD2_I_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 int16x8_t test_vabaq_s16(int16x8_t v1, int16x8_t v2, int16x8_t v3) {
-  // CHECK-LABEL: test_vabaq_s16
   return vabaq_s16(v1, v2, v3);
-  // CHECK: saba {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vabaq_s32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %v3 to <16 x i8>
+// CHECK:   [[VABD_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VABD1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VABD2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> [[VABD_I_I]], <4 x i32> [[VABD1_I_I]]) #4
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %v1, [[VABD2_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 int32x4_t test_vabaq_s32(int32x4_t v1, int32x4_t v2, int32x4_t v3) {
-  // CHECK-LABEL: test_vabaq_s32
   return vabaq_s32(v1, v2, v3);
-  // CHECK: saba {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vabaq_u8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) #0 {
+// CHECK:   [[VABD_I_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %v2, <16 x i8> %v3) #4
+// CHECK:   [[ADD_I:%.*]] = add <16 x i8> %v1, [[VABD_I_I]]
+// CHECK:   ret <16 x i8> [[ADD_I]]
 uint8x16_t test_vabaq_u8(uint8x16_t v1, uint8x16_t v2, uint8x16_t v3) {
-  // CHECK-LABEL: test_vabaq_u8
   return vabaq_u8(v1, v2, v3);
-  // CHECK: uaba {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vabaq_u16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %v3 to <16 x i8>
+// CHECK:   [[VABD_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VABD1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VABD2_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> [[VABD_I_I]], <8 x i16> [[VABD1_I_I]]) #4
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %v1, [[VABD2_I_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 uint16x8_t test_vabaq_u16(uint16x8_t v1, uint16x8_t v2, uint16x8_t v3) {
-  // CHECK-LABEL: test_vabaq_u16
   return vabaq_u16(v1, v2, v3);
-  // CHECK: uaba {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vabaq_u32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %v3 to <16 x i8>
+// CHECK:   [[VABD_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VABD1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VABD2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> [[VABD_I_I]], <4 x i32> [[VABD1_I_I]]) #4
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %v1, [[VABD2_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 uint32x4_t test_vabaq_u32(uint32x4_t v1, uint32x4_t v2, uint32x4_t v3) {
-  // CHECK-LABEL: test_vabaq_u32
   return vabaq_u32(v1, v2, v3);
-  // CHECK: uaba {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vabd_s8(<8 x i8> %v1, <8 x i8> %v2) #0 {
+// CHECK:   [[VABD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %v1, <8 x i8> %v2) #4
+// CHECK:   ret <8 x i8> [[VABD_I]]
 int8x8_t test_vabd_s8(int8x8_t v1, int8x8_t v2) {
-  // CHECK-LABEL: test_vabd_s8
   return vabd_s8(v1, v2);
-  // CHECK: sabd {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vabd_s16(<4 x i16> %v1, <4 x i16> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
+// CHECK:   [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VABD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I]], <4 x i16> [[VABD1_I]]) #4
+// CHECK:   ret <4 x i16> [[VABD2_I]]
 int16x4_t test_vabd_s16(int16x4_t v1, int16x4_t v2) {
-  // CHECK-LABEL: test_vabd_s16
   return vabd_s16(v1, v2);
-  // CHECK: sabd {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vabd_s32(<2 x i32> %v1, <2 x i32> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
+// CHECK:   [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VABD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I]], <2 x i32> [[VABD1_I]]) #4
+// CHECK:   ret <2 x i32> [[VABD2_I]]
 int32x2_t test_vabd_s32(int32x2_t v1, int32x2_t v2) {
-  // CHECK-LABEL: test_vabd_s32
   return vabd_s32(v1, v2);
-  // CHECK: sabd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vabd_u8(<8 x i8> %v1, <8 x i8> %v2) #0 {
+// CHECK:   [[VABD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %v1, <8 x i8> %v2) #4
+// CHECK:   ret <8 x i8> [[VABD_I]]
 uint8x8_t test_vabd_u8(uint8x8_t v1, uint8x8_t v2) {
-  // CHECK-LABEL: test_vabd_u8
   return vabd_u8(v1, v2);
-  // CHECK: uabd {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vabd_u16(<4 x i16> %v1, <4 x i16> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
+// CHECK:   [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VABD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I]], <4 x i16> [[VABD1_I]]) #4
+// CHECK:   ret <4 x i16> [[VABD2_I]]
 uint16x4_t test_vabd_u16(uint16x4_t v1, uint16x4_t v2) {
-  // CHECK-LABEL: test_vabd_u16
   return vabd_u16(v1, v2);
-  // CHECK: uabd {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vabd_u32(<2 x i32> %v1, <2 x i32> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
+// CHECK:   [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VABD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I]], <2 x i32> [[VABD1_I]]) #4
+// CHECK:   ret <2 x i32> [[VABD2_I]]
 uint32x2_t test_vabd_u32(uint32x2_t v1, uint32x2_t v2) {
-  // CHECK-LABEL: test_vabd_u32
   return vabd_u32(v1, v2);
-  // CHECK: uabd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <2 x float> @test_vabd_f32(<2 x float> %v1, <2 x float> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8>
+// CHECK:   [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[VABD2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fabd.v2f32(<2 x float> [[VABD_I]], <2 x float> [[VABD1_I]]) #4
+// CHECK:   ret <2 x float> [[VABD2_I]]
 float32x2_t test_vabd_f32(float32x2_t v1, float32x2_t v2) {
-  // CHECK-LABEL: test_vabd_f32
   return vabd_f32(v1, v2);
-  // CHECK: fabd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vabdq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 {
+// CHECK:   [[VABD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %v1, <16 x i8> %v2) #4
+// CHECK:   ret <16 x i8> [[VABD_I]]
 int8x16_t test_vabdq_s8(int8x16_t v1, int8x16_t v2) {
-  // CHECK-LABEL: test_vabdq_s8
   return vabdq_s8(v1, v2);
-  // CHECK: sabd {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vabdq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
+// CHECK:   [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VABD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> [[VABD_I]], <8 x i16> [[VABD1_I]]) #4
+// CHECK:   ret <8 x i16> [[VABD2_I]]
 int16x8_t test_vabdq_s16(int16x8_t v1, int16x8_t v2) {
-  // CHECK-LABEL: test_vabdq_s16
   return vabdq_s16(v1, v2);
-  // CHECK: sabd {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vabdq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
+// CHECK:   [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VABD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> [[VABD_I]], <4 x i32> [[VABD1_I]]) #4
+// CHECK:   ret <4 x i32> [[VABD2_I]]
 int32x4_t test_vabdq_s32(int32x4_t v1, int32x4_t v2) {
-  // CHECK-LABEL: test_vabdq_s32
   return vabdq_s32(v1, v2);
-  // CHECK: sabd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vabdq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 {
+// CHECK:   [[VABD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uabd.v16i8(<16 x i8> %v1, <16 x i8> %v2) #4
+// CHECK:   ret <16 x i8> [[VABD_I]]
 uint8x16_t test_vabdq_u8(uint8x16_t v1, uint8x16_t v2) {
-  // CHECK-LABEL: test_vabdq_u8
   return vabdq_u8(v1, v2);
-  // CHECK: uabd {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vabdq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
+// CHECK:   [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VABD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uabd.v8i16(<8 x i16> [[VABD_I]], <8 x i16> [[VABD1_I]]) #4
+// CHECK:   ret <8 x i16> [[VABD2_I]]
 uint16x8_t test_vabdq_u16(uint16x8_t v1, uint16x8_t v2) {
-  // CHECK-LABEL: test_vabdq_u16
   return vabdq_u16(v1, v2);
-  // CHECK: uabd {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vabdq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
+// CHECK:   [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VABD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uabd.v4i32(<4 x i32> [[VABD_I]], <4 x i32> [[VABD1_I]]) #4
+// CHECK:   ret <4 x i32> [[VABD2_I]]
 uint32x4_t test_vabdq_u32(uint32x4_t v1, uint32x4_t v2) {
-  // CHECK-LABEL: test_vabdq_u32
   return vabdq_u32(v1, v2);
-  // CHECK: uabd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <4 x float> @test_vabdq_f32(<4 x float> %v1, <4 x float> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8>
+// CHECK:   [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[VABD2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fabd.v4f32(<4 x float> [[VABD_I]], <4 x float> [[VABD1_I]]) #4
+// CHECK:   ret <4 x float> [[VABD2_I]]
 float32x4_t test_vabdq_f32(float32x4_t v1, float32x4_t v2) {
-  // CHECK-LABEL: test_vabdq_f32
   return vabdq_f32(v1, v2);
-  // CHECK: fabd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vabdq_f64(<2 x double> %v1, <2 x double> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8>
+// CHECK:   [[VABD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VABD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[VABD2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fabd.v2f64(<2 x double> [[VABD_I]], <2 x double> [[VABD1_I]]) #4
+// CHECK:   ret <2 x double> [[VABD2_I]]
 float64x2_t test_vabdq_f64(float64x2_t v1, float64x2_t v2) {
-  // CHECK-LABEL: test_vabdq_f64
   return vabdq_f64(v1, v2);
-  // CHECK: fabd {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
 
+// CHECK-LABEL: define <8 x i8> @test_vbsl_s8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) #0 {
+// CHECK:   [[VBSL_I:%.*]] = and <8 x i8> %v1, %v2
+// CHECK:   [[TMP0:%.*]] = xor <8 x i8> %v1, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[VBSL1_I:%.*]] = and <8 x i8> [[TMP0]], %v3
+// CHECK:   [[VBSL2_I:%.*]] = or <8 x i8> [[VBSL_I]], [[VBSL1_I]]
+// CHECK:   ret <8 x i8> [[VBSL2_I]]
 int8x8_t test_vbsl_s8(uint8x8_t v1, int8x8_t v2, int8x8_t v3) {
-  // CHECK-LABEL: test_vbsl_s8
   return vbsl_s8(v1, v2, v3);
-  // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vbsl_s16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %v3 to <8 x i8>
+// CHECK:   [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VBSL3_I:%.*]] = and <4 x i16> [[VBSL_I]], [[VBSL1_I]]
+// CHECK:   [[TMP3:%.*]] = xor <4 x i16> [[VBSL_I]], <i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[VBSL4_I:%.*]] = and <4 x i16> [[TMP3]], [[VBSL2_I]]
+// CHECK:   [[VBSL5_I:%.*]] = or <4 x i16> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[VBSL5_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP4]]
 int8x8_t test_vbsl_s16(uint16x4_t v1, int16x4_t v2, int16x4_t v3) {
-  // CHECK-LABEL: test_vbsl_s16
   return vbsl_s16(v1, v2, v3);
-  // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vbsl_s32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %v3 to <8 x i8>
+// CHECK:   [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VBSL3_I:%.*]] = and <2 x i32> [[VBSL_I]], [[VBSL1_I]]
+// CHECK:   [[TMP3:%.*]] = xor <2 x i32> [[VBSL_I]], <i32 -1, i32 -1>
+// CHECK:   [[VBSL4_I:%.*]] = and <2 x i32> [[TMP3]], [[VBSL2_I]]
+// CHECK:   [[VBSL5_I:%.*]] = or <2 x i32> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK:   ret <2 x i32> [[VBSL5_I]]
 int32x2_t test_vbsl_s32(uint32x2_t v1, int32x2_t v2, int32x2_t v3) {
-  // CHECK-LABEL: test_vbsl_s32
   return vbsl_s32(v1, v2, v3);
-  // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vbsl_s64(<1 x i64> %v1, <1 x i64> %v2, <1 x i64> %v3) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %v2 to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <1 x i64> %v3 to <8 x i8>
+// CHECK:   [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64>
+// CHECK:   [[VBSL3_I:%.*]] = and <1 x i64> [[VBSL_I]], [[VBSL1_I]]
+// CHECK:   [[TMP3:%.*]] = xor <1 x i64> [[VBSL_I]], <i64 -1>
+// CHECK:   [[VBSL4_I:%.*]] = and <1 x i64> [[TMP3]], [[VBSL2_I]]
+// CHECK:   [[VBSL5_I:%.*]] = or <1 x i64> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK:   ret <1 x i64> [[VBSL5_I]]
 uint64x1_t test_vbsl_s64(uint64x1_t v1, uint64x1_t v2, uint64x1_t v3) {
-  // CHECK-LABEL: test_vbsl_s64
   return vbsl_s64(v1, v2, v3);
-  // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vbsl_u8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) #0 {
+// CHECK:   [[VBSL_I:%.*]] = and <8 x i8> %v1, %v2
+// CHECK:   [[TMP0:%.*]] = xor <8 x i8> %v1, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[VBSL1_I:%.*]] = and <8 x i8> [[TMP0]], %v3
+// CHECK:   [[VBSL2_I:%.*]] = or <8 x i8> [[VBSL_I]], [[VBSL1_I]]
+// CHECK:   ret <8 x i8> [[VBSL2_I]]
 uint8x8_t test_vbsl_u8(uint8x8_t v1, uint8x8_t v2, uint8x8_t v3) {
-  // CHECK-LABEL: test_vbsl_u8
   return vbsl_u8(v1, v2, v3);
-  // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vbsl_u16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %v3 to <8 x i8>
+// CHECK:   [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VBSL3_I:%.*]] = and <4 x i16> [[VBSL_I]], [[VBSL1_I]]
+// CHECK:   [[TMP3:%.*]] = xor <4 x i16> [[VBSL_I]], <i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[VBSL4_I:%.*]] = and <4 x i16> [[TMP3]], [[VBSL2_I]]
+// CHECK:   [[VBSL5_I:%.*]] = or <4 x i16> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK:   ret <4 x i16> [[VBSL5_I]]
 uint16x4_t test_vbsl_u16(uint16x4_t v1, uint16x4_t v2, uint16x4_t v3) {
-  // CHECK-LABEL: test_vbsl_u16
   return vbsl_u16(v1, v2, v3);
-  // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vbsl_u32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %v3 to <8 x i8>
+// CHECK:   [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VBSL3_I:%.*]] = and <2 x i32> [[VBSL_I]], [[VBSL1_I]]
+// CHECK:   [[TMP3:%.*]] = xor <2 x i32> [[VBSL_I]], <i32 -1, i32 -1>
+// CHECK:   [[VBSL4_I:%.*]] = and <2 x i32> [[TMP3]], [[VBSL2_I]]
+// CHECK:   [[VBSL5_I:%.*]] = or <2 x i32> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK:   ret <2 x i32> [[VBSL5_I]]
 uint32x2_t test_vbsl_u32(uint32x2_t v1, uint32x2_t v2, uint32x2_t v3) {
-  // CHECK-LABEL: test_vbsl_u32
   return vbsl_u32(v1, v2, v3);
-  // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vbsl_u64(<1 x i64> %v1, <1 x i64> %v2, <1 x i64> %v3) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %v2 to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <1 x i64> %v3 to <8 x i8>
+// CHECK:   [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64>
+// CHECK:   [[VBSL3_I:%.*]] = and <1 x i64> [[VBSL_I]], [[VBSL1_I]]
+// CHECK:   [[TMP3:%.*]] = xor <1 x i64> [[VBSL_I]], <i64 -1>
+// CHECK:   [[VBSL4_I:%.*]] = and <1 x i64> [[TMP3]], [[VBSL2_I]]
+// CHECK:   [[VBSL5_I:%.*]] = or <1 x i64> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK:   ret <1 x i64> [[VBSL5_I]]
 uint64x1_t test_vbsl_u64(uint64x1_t v1, uint64x1_t v2, uint64x1_t v3) {
-  // CHECK-LABEL: test_vbsl_u64
   return vbsl_u64(v1, v2, v3);
-  // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <2 x float> @test_vbsl_f32(<2 x float> %v1, <2 x float> %v2, <2 x float> %v3) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %v1 to <2 x i32>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %v2 to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <2 x float> %v3 to <8 x i8>
+// CHECK:   [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK:   [[VBSL3_I:%.*]] = and <2 x i32> [[VBSL_I]], [[VBSL1_I]]
+// CHECK:   [[TMP4:%.*]] = xor <2 x i32> [[VBSL_I]], <i32 -1, i32 -1>
+// CHECK:   [[VBSL4_I:%.*]] = and <2 x i32> [[TMP4]], [[VBSL2_I]]
+// CHECK:   [[VBSL5_I:%.*]] = or <2 x i32> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[VBSL5_I]] to <2 x float>
+// CHECK:   ret <2 x float> [[TMP5]]
 float32x2_t test_vbsl_f32(float32x2_t v1, float32x2_t v2, float32x2_t v3) {
-  // CHECK-LABEL: test_vbsl_f32
   return vbsl_f32(v1, v2, v3);
-  // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <1 x double> @test_vbsl_f64(<1 x i64> %v1, <1 x double> %v2, <1 x double> %v3) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %v2 to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <1 x double> %v3 to <8 x i8>
+// CHECK:   [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64>
+// CHECK:   [[VBSL3_I:%.*]] = and <1 x i64> [[VBSL_I]], [[VBSL1_I]]
+// CHECK:   [[TMP3:%.*]] = xor <1 x i64> [[VBSL_I]], <i64 -1>
+// CHECK:   [[VBSL4_I:%.*]] = and <1 x i64> [[TMP3]], [[VBSL2_I]]
+// CHECK:   [[VBSL5_I:%.*]] = or <1 x i64> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[VBSL5_I]] to <1 x double>
+// CHECK:   ret <1 x double> [[TMP4]]
 float64x1_t test_vbsl_f64(uint64x1_t v1, float64x1_t v2, float64x1_t v3) {
-  // CHECK-LABEL: test_vbsl_f64
   return vbsl_f64(v1, v2, v3);
-  // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vbsl_p8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) #0 {
+// CHECK:   [[VBSL_I:%.*]] = and <8 x i8> %v1, %v2
+// CHECK:   [[TMP0:%.*]] = xor <8 x i8> %v1, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[VBSL1_I:%.*]] = and <8 x i8> [[TMP0]], %v3
+// CHECK:   [[VBSL2_I:%.*]] = or <8 x i8> [[VBSL_I]], [[VBSL1_I]]
+// CHECK:   ret <8 x i8> [[VBSL2_I]]
 poly8x8_t test_vbsl_p8(uint8x8_t v1, poly8x8_t v2, poly8x8_t v3) {
-  // CHECK-LABEL: test_vbsl_p8
   return vbsl_p8(v1, v2, v3);
-  // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vbsl_p16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %v3 to <8 x i8>
+// CHECK:   [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VBSL3_I:%.*]] = and <4 x i16> [[VBSL_I]], [[VBSL1_I]]
+// CHECK:   [[TMP3:%.*]] = xor <4 x i16> [[VBSL_I]], <i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[VBSL4_I:%.*]] = and <4 x i16> [[TMP3]], [[VBSL2_I]]
+// CHECK:   [[VBSL5_I:%.*]] = or <4 x i16> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK:   ret <4 x i16> [[VBSL5_I]]
 poly16x4_t test_vbsl_p16(uint16x4_t v1, poly16x4_t v2, poly16x4_t v3) {
-  // CHECK-LABEL: test_vbsl_p16
   return vbsl_p16(v1, v2, v3);
-  // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vbslq_s8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) #0 {
+// CHECK:   [[VBSL_I:%.*]] = and <16 x i8> %v1, %v2
+// CHECK:   [[TMP0:%.*]] = xor <16 x i8> %v1, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[VBSL1_I:%.*]] = and <16 x i8> [[TMP0]], %v3
+// CHECK:   [[VBSL2_I:%.*]] = or <16 x i8> [[VBSL_I]], [[VBSL1_I]]
+// CHECK:   ret <16 x i8> [[VBSL2_I]]
 int8x16_t test_vbslq_s8(uint8x16_t v1, int8x16_t v2, int8x16_t v3) {
-  // CHECK-LABEL: test_vbslq_s8
   return vbslq_s8(v1, v2, v3);
-  // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vbslq_s16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %v3 to <16 x i8>
+// CHECK:   [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK:   [[VBSL3_I:%.*]] = and <8 x i16> [[VBSL_I]], [[VBSL1_I]]
+// CHECK:   [[TMP3:%.*]] = xor <8 x i16> [[VBSL_I]], <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[VBSL4_I:%.*]] = and <8 x i16> [[TMP3]], [[VBSL2_I]]
+// CHECK:   [[VBSL5_I:%.*]] = or <8 x i16> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK:   ret <8 x i16> [[VBSL5_I]]
 int16x8_t test_vbslq_s16(uint16x8_t v1, int16x8_t v2, int16x8_t v3) {
-  // CHECK-LABEL: test_vbslq_s16
   return vbslq_s16(v1, v2, v3);
-  // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vbslq_s32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %v3 to <16 x i8>
+// CHECK:   [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK:   [[VBSL3_I:%.*]] = and <4 x i32> [[VBSL_I]], [[VBSL1_I]]
+// CHECK:   [[TMP3:%.*]] = xor <4 x i32> [[VBSL_I]], <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK:   [[VBSL4_I:%.*]] = and <4 x i32> [[TMP3]], [[VBSL2_I]]
+// CHECK:   [[VBSL5_I:%.*]] = or <4 x i32> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK:   ret <4 x i32> [[VBSL5_I]]
 int32x4_t test_vbslq_s32(uint32x4_t v1, int32x4_t v2, int32x4_t v3) {
-  // CHECK-LABEL: test_vbslq_s32
   return vbslq_s32(v1, v2, v3);
-  // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vbslq_s64(<2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %v2 to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i64> %v3 to <16 x i8>
+// CHECK:   [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+// CHECK:   [[VBSL3_I:%.*]] = and <2 x i64> [[VBSL_I]], [[VBSL1_I]]
+// CHECK:   [[TMP3:%.*]] = xor <2 x i64> [[VBSL_I]], <i64 -1, i64 -1>
+// CHECK:   [[VBSL4_I:%.*]] = and <2 x i64> [[TMP3]], [[VBSL2_I]]
+// CHECK:   [[VBSL5_I:%.*]] = or <2 x i64> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK:   ret <2 x i64> [[VBSL5_I]]
 int64x2_t test_vbslq_s64(uint64x2_t v1, int64x2_t v2, int64x2_t v3) {
-  // CHECK-LABEL: test_vbslq_s64
   return vbslq_s64(v1, v2, v3);
-  // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vbslq_u8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) #0 {
+// CHECK:   [[VBSL_I:%.*]] = and <16 x i8> %v1, %v2
+// CHECK:   [[TMP0:%.*]] = xor <16 x i8> %v1, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[VBSL1_I:%.*]] = and <16 x i8> [[TMP0]], %v3
+// CHECK:   [[VBSL2_I:%.*]] = or <16 x i8> [[VBSL_I]], [[VBSL1_I]]
+// CHECK:   ret <16 x i8> [[VBSL2_I]]
 uint8x16_t test_vbslq_u8(uint8x16_t v1, uint8x16_t v2, uint8x16_t v3) {
-  // CHECK-LABEL: test_vbslq_u8
   return vbslq_u8(v1, v2, v3);
-  // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vbslq_u16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %v3 to <16 x i8>
+// CHECK:   [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK:   [[VBSL3_I:%.*]] = and <8 x i16> [[VBSL_I]], [[VBSL1_I]]
+// CHECK:   [[TMP3:%.*]] = xor <8 x i16> [[VBSL_I]], <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[VBSL4_I:%.*]] = and <8 x i16> [[TMP3]], [[VBSL2_I]]
+// CHECK:   [[VBSL5_I:%.*]] = or <8 x i16> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK:   ret <8 x i16> [[VBSL5_I]]
 uint16x8_t test_vbslq_u16(uint16x8_t v1, uint16x8_t v2, uint16x8_t v3) {
-  // CHECK-LABEL: test_vbslq_u16
   return vbslq_u16(v1, v2, v3);
-  // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vbslq_u32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %v3 to <16 x i8>
+// CHECK:   [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK:   [[VBSL3_I:%.*]] = and <4 x i32> [[VBSL_I]], [[VBSL1_I]]
+// CHECK:   [[TMP3:%.*]] = xor <4 x i32> [[VBSL_I]], <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK:   [[VBSL4_I:%.*]] = and <4 x i32> [[TMP3]], [[VBSL2_I]]
+// CHECK:   [[VBSL5_I:%.*]] = or <4 x i32> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK:   ret <4 x i32> [[VBSL5_I]]
 int32x4_t test_vbslq_u32(uint32x4_t v1, int32x4_t v2, int32x4_t v3) {
-  // CHECK-LABEL: test_vbslq_u32
   return vbslq_s32(v1, v2, v3);
-  // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vbslq_u64(<2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %v2 to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i64> %v3 to <16 x i8>
+// CHECK:   [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+// CHECK:   [[VBSL3_I:%.*]] = and <2 x i64> [[VBSL_I]], [[VBSL1_I]]
+// CHECK:   [[TMP3:%.*]] = xor <2 x i64> [[VBSL_I]], <i64 -1, i64 -1>
+// CHECK:   [[VBSL4_I:%.*]] = and <2 x i64> [[TMP3]], [[VBSL2_I]]
+// CHECK:   [[VBSL5_I:%.*]] = or <2 x i64> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK:   ret <2 x i64> [[VBSL5_I]]
 uint64x2_t test_vbslq_u64(uint64x2_t v1, uint64x2_t v2, uint64x2_t v3) {
-  // CHECK-LABEL: test_vbslq_u64
   return vbslq_u64(v1, v2, v3);
-  // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x float> @test_vbslq_f32(<4 x i32> %v1, <4 x float> %v2, <4 x float> %v3) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %v3 to <16 x i8>
+// CHECK:   [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK:   [[VBSL3_I:%.*]] = and <4 x i32> [[VBSL_I]], [[VBSL1_I]]
+// CHECK:   [[TMP3:%.*]] = xor <4 x i32> [[VBSL_I]], <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK:   [[VBSL4_I:%.*]] = and <4 x i32> [[TMP3]], [[VBSL2_I]]
+// CHECK:   [[VBSL5_I:%.*]] = or <4 x i32> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[VBSL5_I]] to <4 x float>
+// CHECK:   ret <4 x float> [[TMP4]]
 float32x4_t test_vbslq_f32(uint32x4_t v1, float32x4_t v2, float32x4_t v3) {
-  // CHECK-LABEL: test_vbslq_f32
   return vbslq_f32(v1, v2, v3);
-  // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vbslq_p8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) #0 {
+// CHECK:   [[VBSL_I:%.*]] = and <16 x i8> %v1, %v2
+// CHECK:   [[TMP0:%.*]] = xor <16 x i8> %v1, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[VBSL1_I:%.*]] = and <16 x i8> [[TMP0]], %v3
+// CHECK:   [[VBSL2_I:%.*]] = or <16 x i8> [[VBSL_I]], [[VBSL1_I]]
+// CHECK:   ret <16 x i8> [[VBSL2_I]]
 poly8x16_t test_vbslq_p8(uint8x16_t v1, poly8x16_t v2, poly8x16_t v3) {
-  // CHECK-LABEL: test_vbslq_p8
   return vbslq_p8(v1, v2, v3);
-  // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vbslq_p16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %v3 to <16 x i8>
+// CHECK:   [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK:   [[VBSL3_I:%.*]] = and <8 x i16> [[VBSL_I]], [[VBSL1_I]]
+// CHECK:   [[TMP3:%.*]] = xor <8 x i16> [[VBSL_I]], <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[VBSL4_I:%.*]] = and <8 x i16> [[TMP3]], [[VBSL2_I]]
+// CHECK:   [[VBSL5_I:%.*]] = or <8 x i16> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK:   ret <8 x i16> [[VBSL5_I]]
 poly16x8_t test_vbslq_p16(uint16x8_t v1, poly16x8_t v2, poly16x8_t v3) {
-  // CHECK-LABEL: test_vbslq_p16
   return vbslq_p16(v1, v2, v3);
-  // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <2 x double> @test_vbslq_f64(<2 x i64> %v1, <2 x double> %v2, <2 x double> %v3) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %v3 to <16 x i8>
+// CHECK:   [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+// CHECK:   [[VBSL3_I:%.*]] = and <2 x i64> [[VBSL_I]], [[VBSL1_I]]
+// CHECK:   [[TMP3:%.*]] = xor <2 x i64> [[VBSL_I]], <i64 -1, i64 -1>
+// CHECK:   [[VBSL4_I:%.*]] = and <2 x i64> [[TMP3]], [[VBSL2_I]]
+// CHECK:   [[VBSL5_I:%.*]] = or <2 x i64> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[VBSL5_I]] to <2 x double>
+// CHECK:   ret <2 x double> [[TMP4]]
 float64x2_t test_vbslq_f64(uint64x2_t v1, float64x2_t v2, float64x2_t v3) {
-  // CHECK-LABEL: test_vbslq_f64
   return vbslq_f64(v1, v2, v3);
-  // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <2 x float> @test_vrecps_f32(<2 x float> %v1, <2 x float> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8>
+// CHECK:   [[VRECPS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VRECPS_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[VRECPS_V2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frecps.v2f32(<2 x float> [[VRECPS_V_I]], <2 x float> [[VRECPS_V1_I]]) #4
+// CHECK:   [[VRECPS_V3_I:%.*]] = bitcast <2 x float> [[VRECPS_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRECPS_V3_I]] to <2 x float>
+// CHECK:   ret <2 x float> [[TMP2]]
 float32x2_t test_vrecps_f32(float32x2_t v1, float32x2_t v2) {
-   // CHECK-LABEL: test_vrecps_f32
    return vrecps_f32(v1, v2);
-   // CHECK: frecps {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x float> @test_vrecpsq_f32(<4 x float> %v1, <4 x float> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8>
+// CHECK:   [[VRECPSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VRECPSQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[VRECPSQ_V2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frecps.v4f32(<4 x float> [[VRECPSQ_V_I]], <4 x float> [[VRECPSQ_V1_I]]) #4
+// CHECK:   [[VRECPSQ_V3_I:%.*]] = bitcast <4 x float> [[VRECPSQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRECPSQ_V3_I]] to <4 x float>
+// CHECK:   ret <4 x float> [[TMP2]]
 float32x4_t test_vrecpsq_f32(float32x4_t v1, float32x4_t v2) {
-   // CHECK-LABEL: test_vrecpsq_f32
    return vrecpsq_f32(v1, v2);
-   // CHECK: frecps {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vrecpsq_f64(<2 x double> %v1, <2 x double> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8>
+// CHECK:   [[VRECPSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VRECPSQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[VRECPSQ_V2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frecps.v2f64(<2 x double> [[VRECPSQ_V_I]], <2 x double> [[VRECPSQ_V1_I]]) #4
+// CHECK:   [[VRECPSQ_V3_I:%.*]] = bitcast <2 x double> [[VRECPSQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRECPSQ_V3_I]] to <2 x double>
+// CHECK:   ret <2 x double> [[TMP2]]
 float64x2_t test_vrecpsq_f64(float64x2_t v1, float64x2_t v2) {
-   // CHECK-LABEL: test_vrecpsq_f64
   return vrecpsq_f64(v1, v2);
-  // CHECK: frecps {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x float> @test_vrsqrts_f32(<2 x float> %v1, <2 x float> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8>
+// CHECK:   [[VRSQRTS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VRSQRTS_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[VRSQRTS_V2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frsqrts.v2f32(<2 x float> [[VRSQRTS_V_I]], <2 x float> [[VRSQRTS_V1_I]]) #4
+// CHECK:   [[VRSQRTS_V3_I:%.*]] = bitcast <2 x float> [[VRSQRTS_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSQRTS_V3_I]] to <2 x float>
+// CHECK:   ret <2 x float> [[TMP2]]
 float32x2_t test_vrsqrts_f32(float32x2_t v1, float32x2_t v2) {
-   // CHECK-LABEL: test_vrsqrts_f32
   return vrsqrts_f32(v1, v2);
-  // CHECK: frsqrts {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x float> @test_vrsqrtsq_f32(<4 x float> %v1, <4 x float> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8>
+// CHECK:   [[VRSQRTSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VRSQRTSQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[VRSQRTSQ_V2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frsqrts.v4f32(<4 x float> [[VRSQRTSQ_V_I]], <4 x float> [[VRSQRTSQ_V1_I]]) #4
+// CHECK:   [[VRSQRTSQ_V3_I:%.*]] = bitcast <4 x float> [[VRSQRTSQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRSQRTSQ_V3_I]] to <4 x float>
+// CHECK:   ret <4 x float> [[TMP2]]
 float32x4_t test_vrsqrtsq_f32(float32x4_t v1, float32x4_t v2) {
-   // CHECK-LABEL: test_vrsqrtsq_f32
   return vrsqrtsq_f32(v1, v2);
-  // CHECK: frsqrts {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vrsqrtsq_f64(<2 x double> %v1, <2 x double> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8>
+// CHECK:   [[VRSQRTSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VRSQRTSQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[VRSQRTSQ_V2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frsqrts.v2f64(<2 x double> [[VRSQRTSQ_V_I]], <2 x double> [[VRSQRTSQ_V1_I]]) #4
+// CHECK:   [[VRSQRTSQ_V3_I:%.*]] = bitcast <2 x double> [[VRSQRTSQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRSQRTSQ_V3_I]] to <2 x double>
+// CHECK:   ret <2 x double> [[TMP2]]
 float64x2_t test_vrsqrtsq_f64(float64x2_t v1, float64x2_t v2) {
-   // CHECK-LABEL: test_vrsqrtsq_f64
   return vrsqrtsq_f64(v1, v2);
-  // CHECK: frsqrts {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vcage_f32(<2 x float> %v1, <2 x float> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8>
+// CHECK:   [[VCAGE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VCAGE_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[VCAGE_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facge.v2i32.v2f32(<2 x float> [[VCAGE_V_I]], <2 x float> [[VCAGE_V1_I]]) #4
+// CHECK:   ret <2 x i32> [[VCAGE_V2_I]]
 uint32x2_t test_vcage_f32(float32x2_t v1, float32x2_t v2) {
-  // CHECK-LABEL: test_vcage_f32
   return vcage_f32(v1, v2);
-  // CHECK: facge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vcage_f64(<1 x double> %a, <1 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
+// CHECK:   [[VCAGE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[VCAGE_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK:   [[VCAGE_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.facge.v1i64.v1f64(<1 x double> [[VCAGE_V_I]], <1 x double> [[VCAGE_V1_I]]) #4
+// CHECK:   ret <1 x i64> [[VCAGE_V2_I]]
 uint64x1_t test_vcage_f64(float64x1_t a, float64x1_t b) {
-  // CHECK-LABEL: test_vcage_f64
   return vcage_f64(a, b);
-  // CHECK: facge {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcageq_f32(<4 x float> %v1, <4 x float> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8>
+// CHECK:   [[VCAGEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VCAGEQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[VCAGEQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facge.v4i32.v4f32(<4 x float> [[VCAGEQ_V_I]], <4 x float> [[VCAGEQ_V1_I]]) #4
+// CHECK:   ret <4 x i32> [[VCAGEQ_V2_I]]
 uint32x4_t test_vcageq_f32(float32x4_t v1, float32x4_t v2) {
-  // CHECK-LABEL: test_vcageq_f32
   return vcageq_f32(v1, v2);
-  // CHECK: facge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vcageq_f64(<2 x double> %v1, <2 x double> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8>
+// CHECK:   [[VCAGEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VCAGEQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[VCAGEQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.facge.v2i64.v2f64(<2 x double> [[VCAGEQ_V_I]], <2 x double> [[VCAGEQ_V1_I]]) #4
+// CHECK:   ret <2 x i64> [[VCAGEQ_V2_I]]
 uint64x2_t test_vcageq_f64(float64x2_t v1, float64x2_t v2) {
-  // CHECK-LABEL: test_vcageq_f64
   return vcageq_f64(v1, v2);
-  // CHECK: facge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vcagt_f32(<2 x float> %v1, <2 x float> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8>
+// CHECK:   [[VCAGT_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VCAGT_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[VCAGT_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facgt.v2i32.v2f32(<2 x float> [[VCAGT_V_I]], <2 x float> [[VCAGT_V1_I]]) #4
+// CHECK:   ret <2 x i32> [[VCAGT_V2_I]]
 uint32x2_t test_vcagt_f32(float32x2_t v1, float32x2_t v2) {
-  // CHECK-LABEL: test_vcagt_f32
   return vcagt_f32(v1, v2);
-  // CHECK: facgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vcagt_f64(<1 x double> %a, <1 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
+// CHECK:   [[VCAGT_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[VCAGT_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK:   [[VCAGT_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.facgt.v1i64.v1f64(<1 x double> [[VCAGT_V_I]], <1 x double> [[VCAGT_V1_I]]) #4
+// CHECK:   ret <1 x i64> [[VCAGT_V2_I]]
 uint64x1_t test_vcagt_f64(float64x1_t a, float64x1_t b) {
-  // CHECK-LABEL: test_vcagt_f64
   return vcagt_f64(a, b);
-  // CHECK: facgt {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcagtq_f32(<4 x float> %v1, <4 x float> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8>
+// CHECK:   [[VCAGTQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VCAGTQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[VCAGTQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facgt.v4i32.v4f32(<4 x float> [[VCAGTQ_V_I]], <4 x float> [[VCAGTQ_V1_I]]) #4
+// CHECK:   ret <4 x i32> [[VCAGTQ_V2_I]]
 uint32x4_t test_vcagtq_f32(float32x4_t v1, float32x4_t v2) {
-  // CHECK-LABEL: test_vcagtq_f32
   return vcagtq_f32(v1, v2);
-  // CHECK: facgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vcagtq_f64(<2 x double> %v1, <2 x double> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8>
+// CHECK:   [[VCAGTQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VCAGTQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[VCAGTQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.facgt.v2i64.v2f64(<2 x double> [[VCAGTQ_V_I]], <2 x double> [[VCAGTQ_V1_I]]) #4
+// CHECK:   ret <2 x i64> [[VCAGTQ_V2_I]]
 uint64x2_t test_vcagtq_f64(float64x2_t v1, float64x2_t v2) {
-  // CHECK-LABEL: test_vcagtq_f64
   return vcagtq_f64(v1, v2);
-  // CHECK: facgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vcale_f32(<2 x float> %v1, <2 x float> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8>
+// CHECK:   [[VCALE_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[VCALE_V1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VCALE_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facge.v2i32.v2f32(<2 x float> [[VCALE_V_I]], <2 x float> [[VCALE_V1_I]]) #4
+// CHECK:   ret <2 x i32> [[VCALE_V2_I]]
 uint32x2_t test_vcale_f32(float32x2_t v1, float32x2_t v2) {
-  // CHECK-LABEL: test_vcale_f32
   return vcale_f32(v1, v2);
   // Using registers other than v0, v1 are possible, but would be odd.
-  // CHECK: facge {{v[0-9]+}}.2s, v1.2s, v0.2s
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vcale_f64(<1 x double> %a, <1 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
+// CHECK:   [[VCALE_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK:   [[VCALE_V1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[VCALE_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.facge.v1i64.v1f64(<1 x double> [[VCALE_V_I]], <1 x double> [[VCALE_V1_I]]) #4
+// CHECK:   ret <1 x i64> [[VCALE_V2_I]]
 uint64x1_t test_vcale_f64(float64x1_t a, float64x1_t b) {
-  // CHECK-LABEL: test_vcale_f64
   return vcale_f64(a, b);
-  // CHECK: facge {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcaleq_f32(<4 x float> %v1, <4 x float> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8>
+// CHECK:   [[VCALEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[VCALEQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VCALEQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facge.v4i32.v4f32(<4 x float> [[VCALEQ_V_I]], <4 x float> [[VCALEQ_V1_I]]) #4
+// CHECK:   ret <4 x i32> [[VCALEQ_V2_I]]
 uint32x4_t test_vcaleq_f32(float32x4_t v1, float32x4_t v2) {
-  // CHECK-LABEL: test_vcaleq_f32
   return vcaleq_f32(v1, v2);
   // Using registers other than v0, v1 are possible, but would be odd.
-  // CHECK: facge {{v[0-9]+}}.4s, v1.4s, v0.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vcaleq_f64(<2 x double> %v1, <2 x double> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8>
+// CHECK:   [[VCALEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[VCALEQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VCALEQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.facge.v2i64.v2f64(<2 x double> [[VCALEQ_V_I]], <2 x double> [[VCALEQ_V1_I]]) #4
+// CHECK:   ret <2 x i64> [[VCALEQ_V2_I]]
 uint64x2_t test_vcaleq_f64(float64x2_t v1, float64x2_t v2) {
-  // CHECK-LABEL: test_vcaleq_f64
   return vcaleq_f64(v1, v2);
   // Using registers other than v0, v1 are possible, but would be odd.
-  // CHECK: facge {{v[0-9]+}}.2d, v1.2d, v0.2d
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vcalt_f32(<2 x float> %v1, <2 x float> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %v2 to <8 x i8>
+// CHECK:   [[VCALT_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[VCALT_V1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VCALT_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.facgt.v2i32.v2f32(<2 x float> [[VCALT_V_I]], <2 x float> [[VCALT_V1_I]]) #4
+// CHECK:   ret <2 x i32> [[VCALT_V2_I]]
 uint32x2_t test_vcalt_f32(float32x2_t v1, float32x2_t v2) {
-  // CHECK-LABEL: test_vcalt_f32
   return vcalt_f32(v1, v2);
   // Using registers other than v0, v1 are possible, but would be odd.
-  // CHECK: facgt {{v[0-9]+}}.2s, v1.2s, v0.2s
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vcalt_f64(<1 x double> %a, <1 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
+// CHECK:   [[VCALT_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK:   [[VCALT_V1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[VCALT_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.facgt.v1i64.v1f64(<1 x double> [[VCALT_V_I]], <1 x double> [[VCALT_V1_I]]) #4
+// CHECK:   ret <1 x i64> [[VCALT_V2_I]]
 uint64x1_t test_vcalt_f64(float64x1_t a, float64x1_t b) {
-  // CHECK-LABEL: test_vcalt_f64
   return vcalt_f64(a, b);
-  // CHECK: facgt {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcaltq_f32(<4 x float> %v1, <4 x float> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %v2 to <16 x i8>
+// CHECK:   [[VCALTQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[VCALTQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VCALTQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.facgt.v4i32.v4f32(<4 x float> [[VCALTQ_V_I]], <4 x float> [[VCALTQ_V1_I]]) #4
+// CHECK:   ret <4 x i32> [[VCALTQ_V2_I]]
 uint32x4_t test_vcaltq_f32(float32x4_t v1, float32x4_t v2) {
-  // CHECK-LABEL: test_vcaltq_f32
   return vcaltq_f32(v1, v2);
   // Using registers other than v0, v1 are possible, but would be odd.
-  // CHECK: facgt {{v[0-9]+}}.4s, v1.4s, v0.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vcaltq_f64(<2 x double> %v1, <2 x double> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %v2 to <16 x i8>
+// CHECK:   [[VCALTQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[VCALTQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VCALTQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.facgt.v2i64.v2f64(<2 x double> [[VCALTQ_V_I]], <2 x double> [[VCALTQ_V1_I]]) #4
+// CHECK:   ret <2 x i64> [[VCALTQ_V2_I]]
 uint64x2_t test_vcaltq_f64(float64x2_t v1, float64x2_t v2) {
-  // CHECK-LABEL: test_vcaltq_f64
   return vcaltq_f64(v1, v2);
   // Using registers other than v0, v1 are possible, but would be odd.
-  // CHECK: facgt {{v[0-9]+}}.2d, v1.2d, v0.2d
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vtst_s8(<8 x i8> %v1, <8 x i8> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = and <8 x i8> %v1, %v2
+// CHECK:   [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VTST_I]]
 uint8x8_t test_vtst_s8(int8x8_t v1, int8x8_t v2) {
-   // CHECK-LABEL: test_vtst_s8
   return vtst_s8(v1, v2);
-  // CHECK: cmtst {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vtst_s16(<4 x i16> %v1, <4 x i16> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP4:%.*]] = and <4 x i16> [[TMP2]], [[TMP3]]
+// CHECK:   [[TMP5:%.*]] = icmp ne <4 x i16> [[TMP4]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VTST_I]]
 uint16x4_t test_vtst_s16(int16x4_t v1, int16x4_t v2) {
-   // CHECK-LABEL: test_vtst_s16
   return vtst_s16(v1, v2);
-  // CHECK: cmtst {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vtst_s32(<2 x i32> %v1, <2 x i32> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[TMP4:%.*]] = and <2 x i32> [[TMP2]], [[TMP3]]
+// CHECK:   [[TMP5:%.*]] = icmp ne <2 x i32> [[TMP4]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <2 x i1> [[TMP5]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VTST_I]]
 uint32x2_t test_vtst_s32(int32x2_t v1, int32x2_t v2) {
-   // CHECK-LABEL: test_vtst_s32
   return vtst_s32(v1, v2);
-  // CHECK: cmtst {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vtst_u8(<8 x i8> %v1, <8 x i8> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = and <8 x i8> %v1, %v2
+// CHECK:   [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VTST_I]]
 uint8x8_t test_vtst_u8(uint8x8_t v1, uint8x8_t v2) {
-   // CHECK-LABEL: test_vtst_u8
   return vtst_u8(v1, v2);
-  // CHECK: cmtst {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vtst_u16(<4 x i16> %v1, <4 x i16> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP4:%.*]] = and <4 x i16> [[TMP2]], [[TMP3]]
+// CHECK:   [[TMP5:%.*]] = icmp ne <4 x i16> [[TMP4]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VTST_I]]
 uint16x4_t test_vtst_u16(uint16x4_t v1, uint16x4_t v2) {
-   // CHECK-LABEL: test_vtst_u16
   return vtst_u16(v1, v2);
-  // CHECK: cmtst {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vtst_u32(<2 x i32> %v1, <2 x i32> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[TMP4:%.*]] = and <2 x i32> [[TMP2]], [[TMP3]]
+// CHECK:   [[TMP5:%.*]] = icmp ne <2 x i32> [[TMP4]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <2 x i1> [[TMP5]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VTST_I]]
 uint32x2_t test_vtst_u32(uint32x2_t v1, uint32x2_t v2) {
-   // CHECK-LABEL: test_vtst_u32
   return vtst_u32(v1, v2);
-  // CHECK: cmtst {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vtstq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = and <16 x i8> %v1, %v2
+// CHECK:   [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[VTST_I]]
 uint8x16_t test_vtstq_s8(int8x16_t v1, int8x16_t v2) {
-   // CHECK-LABEL: test_vtstq_s8
   return vtstq_s8(v1, v2);
-  // CHECK: cmtst {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vtstq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP4:%.*]] = and <8 x i16> [[TMP2]], [[TMP3]]
+// CHECK:   [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[VTST_I]]
 uint16x8_t test_vtstq_s16(int16x8_t v1, int16x8_t v2) {
-   // CHECK-LABEL: test_vtstq_s16
   return vtstq_s16(v1, v2);
-  // CHECK: cmtst {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vtstq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[TMP4:%.*]] = and <4 x i32> [[TMP2]], [[TMP3]]
+// CHECK:   [[TMP5:%.*]] = icmp ne <4 x i32> [[TMP4]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VTST_I]]
 uint32x4_t test_vtstq_s32(int32x4_t v1, int32x4_t v2) {
-   // CHECK-LABEL: test_vtstq_s32
   return vtstq_s32(v1, v2);
-  // CHECK: cmtst {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vtstq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = and <16 x i8> %v1, %v2
+// CHECK:   [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[VTST_I]]
 uint8x16_t test_vtstq_u8(uint8x16_t v1, uint8x16_t v2) {
-   // CHECK-LABEL: test_vtstq_u8
   return vtstq_u8(v1, v2);
-  // CHECK: cmtst {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vtstq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP4:%.*]] = and <8 x i16> [[TMP2]], [[TMP3]]
+// CHECK:   [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[VTST_I]]
 uint16x8_t test_vtstq_u16(uint16x8_t v1, uint16x8_t v2) {
-   // CHECK-LABEL: test_vtstq_u16
   return vtstq_u16(v1, v2);
-  // CHECK: cmtst {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vtstq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[TMP4:%.*]] = and <4 x i32> [[TMP2]], [[TMP3]]
+// CHECK:   [[TMP5:%.*]] = icmp ne <4 x i32> [[TMP4]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VTST_I]]
 uint32x4_t test_vtstq_u32(uint32x4_t v1, uint32x4_t v2) {
-   // CHECK-LABEL: test_vtstq_u32
   return vtstq_u32(v1, v2);
-  // CHECK: cmtst {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vtstq_s64(<2 x i64> %v1, <2 x i64> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %v2 to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[TMP4:%.*]] = and <2 x i64> [[TMP2]], [[TMP3]]
+// CHECK:   [[TMP5:%.*]] = icmp ne <2 x i64> [[TMP4]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <2 x i1> [[TMP5]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VTST_I]]
 uint64x2_t test_vtstq_s64(int64x2_t v1, int64x2_t v2) {
-   // CHECK-LABEL: test_vtstq_s64
   return vtstq_s64(v1, v2);
-  // CHECK: cmtst {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vtstq_u64(<2 x i64> %v1, <2 x i64> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %v2 to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[TMP4:%.*]] = and <2 x i64> [[TMP2]], [[TMP3]]
+// CHECK:   [[TMP5:%.*]] = icmp ne <2 x i64> [[TMP4]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <2 x i1> [[TMP5]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VTST_I]]
 uint64x2_t test_vtstq_u64(uint64x2_t v1, uint64x2_t v2) {
-   // CHECK-LABEL: test_vtstq_u64
   return vtstq_u64(v1, v2);
-  // CHECK: cmtst {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vtst_p8(<8 x i8> %v1, <8 x i8> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = and <8 x i8> %v1, %v2
+// CHECK:   [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VTST_I]]
 uint8x8_t test_vtst_p8(poly8x8_t v1, poly8x8_t v2) {
-   // CHECK-LABEL: test_vtst_p8
   return vtst_p8(v1, v2);
-  // CHECK: cmtst {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vtst_p16(<4 x i16> %v1, <4 x i16> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP4:%.*]] = and <4 x i16> [[TMP2]], [[TMP3]]
+// CHECK:   [[TMP5:%.*]] = icmp ne <4 x i16> [[TMP4]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VTST_I]]
 uint16x4_t test_vtst_p16(poly16x4_t v1, poly16x4_t v2) {
-   // CHECK-LABEL: test_vtst_p16
   return vtst_p16(v1, v2);
-  // CHECK: cmtst {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vtstq_p8(<16 x i8> %v1, <16 x i8> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = and <16 x i8> %v1, %v2
+// CHECK:   [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[VTST_I]]
 uint8x16_t test_vtstq_p8(poly8x16_t v1, poly8x16_t v2) {
-   // CHECK-LABEL: test_vtstq_p8
   return vtstq_p8(v1, v2);
-  // CHECK: cmtst {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vtstq_p16(<8 x i16> %v1, <8 x i16> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP4:%.*]] = and <8 x i16> [[TMP2]], [[TMP3]]
+// CHECK:   [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[VTST_I]]
 uint16x8_t test_vtstq_p16(poly16x8_t v1, poly16x8_t v2) {
-   // CHECK-LABEL: test_vtstq_p16
   return vtstq_p16(v1, v2);
-  // CHECK: cmtst {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vtst_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[TMP4:%.*]] = and <1 x i64> [[TMP2]], [[TMP3]]
+// CHECK:   [[TMP5:%.*]] = icmp ne <1 x i64> [[TMP4]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <1 x i1> [[TMP5]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[VTST_I]]
 uint64x1_t test_vtst_s64(int64x1_t a, int64x1_t b) {
-  // CHECK-LABEL: test_vtst_s64
   return vtst_s64(a, b);
-  // CHECK: cmtst {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vtst_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[TMP4:%.*]] = and <1 x i64> [[TMP2]], [[TMP3]]
+// CHECK:   [[TMP5:%.*]] = icmp ne <1 x i64> [[TMP4]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <1 x i1> [[TMP5]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[VTST_I]]
 uint64x1_t test_vtst_u64(uint64x1_t a, uint64x1_t b) {
-  // CHECK-LABEL: test_vtst_u64
   return vtst_u64(a, b);
-  // CHECK: cmtst {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vceq_s8(<8 x i8> %v1, <8 x i8> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp eq <8 x i8> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[SEXT_I]]
 uint8x8_t test_vceq_s8(int8x8_t v1, int8x8_t v2) {
-  // CHECK-LABEL: test_vceq_s8
   return vceq_s8(v1, v2);
-  // CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vceq_s16(<4 x i16> %v1, <4 x i16> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp eq <4 x i16> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[SEXT_I]]
 uint16x4_t test_vceq_s16(int16x4_t v1, int16x4_t v2) {
-  // CHECK-LABEL: test_vceq_s16
   return vceq_s16(v1, v2);
-  // CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vceq_s32(<2 x i32> %v1, <2 x i32> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp eq <2 x i32> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vceq_s32(int32x2_t v1, int32x2_t v2) {
-  // CHECK-LABEL: test_vceq_s32
   return vceq_s32(v1, v2);
-  // CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vceq_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp eq <1 x i64> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[SEXT_I]]
 uint64x1_t test_vceq_s64(int64x1_t a, int64x1_t b) {
-  // CHECK-LABEL: test_vceq_s64
   return vceq_s64(a, b);
-  // CHECK: cmeq {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vceq_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp eq <1 x i64> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[SEXT_I]]
 uint64x1_t test_vceq_u64(uint64x1_t a, uint64x1_t b) {
-  // CHECK-LABEL: test_vceq_u64
   return vceq_u64(a, b);
-  // CHECK: cmeq {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vceq_f32(<2 x float> %v1, <2 x float> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = fcmp oeq <2 x float> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vceq_f32(float32x2_t v1, float32x2_t v2) {
-  // CHECK-LABEL: test_vceq_f32
   return vceq_f32(v1, v2);
-  // CHECK: fcmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vceq_f64(<1 x double> %a, <1 x double> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = fcmp oeq <1 x double> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[SEXT_I]]
 uint64x1_t test_vceq_f64(float64x1_t a, float64x1_t b) {
-  // CHECK-LABEL: test_vceq_f64
   return vceq_f64(a, b);
-  // CHECK: fcmeq {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vceq_u8(<8 x i8> %v1, <8 x i8> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp eq <8 x i8> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[SEXT_I]]
 uint8x8_t test_vceq_u8(uint8x8_t v1, uint8x8_t v2) {
-  // CHECK-LABEL: test_vceq_u8
   return vceq_u8(v1, v2);
-  // CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vceq_u16(<4 x i16> %v1, <4 x i16> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp eq <4 x i16> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[SEXT_I]]
 uint16x4_t test_vceq_u16(uint16x4_t v1, uint16x4_t v2) {
-  // CHECK-LABEL: test_vceq_u16
   return vceq_u16(v1, v2);
-  // CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vceq_u32(<2 x i32> %v1, <2 x i32> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp eq <2 x i32> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vceq_u32(uint32x2_t v1, uint32x2_t v2) {
-  // CHECK-LABEL: test_vceq_u32
   return vceq_u32(v1, v2);
-  // CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vceq_p8(<8 x i8> %v1, <8 x i8> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp eq <8 x i8> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[SEXT_I]]
 uint8x8_t test_vceq_p8(poly8x8_t v1, poly8x8_t v2) {
-  // CHECK-LABEL: test_vceq_p8
   return vceq_p8(v1, v2);
-  // CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vceqq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp eq <16 x i8> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[SEXT_I]]
 uint8x16_t test_vceqq_s8(int8x16_t v1, int8x16_t v2) {
-  // CHECK-LABEL: test_vceqq_s8
   return vceqq_s8(v1, v2);
-  // CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vceqq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp eq <8 x i16> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[SEXT_I]]
 uint16x8_t test_vceqq_s16(int16x8_t v1, int16x8_t v2) {
-  // CHECK-LABEL: test_vceqq_s16
   return vceqq_s16(v1, v2);
-  // CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vceqq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp eq <4 x i32> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vceqq_s32(int32x4_t v1, int32x4_t v2) {
-  // CHECK-LABEL: test_vceqq_s32
   return vceqq_s32(v1, v2);
-  // CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vceqq_f32(<4 x float> %v1, <4 x float> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = fcmp oeq <4 x float> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vceqq_f32(float32x4_t v1, float32x4_t v2) {
-  // CHECK-LABEL: test_vceqq_f32
   return vceqq_f32(v1, v2);
-  // CHECK: fcmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vceqq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp eq <16 x i8> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[SEXT_I]]
 uint8x16_t test_vceqq_u8(uint8x16_t v1, uint8x16_t v2) {
-  // CHECK-LABEL: test_vceqq_u8
   return vceqq_u8(v1, v2);
-  // CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vceqq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp eq <8 x i16> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[SEXT_I]]
 uint16x8_t test_vceqq_u16(uint16x8_t v1, uint16x8_t v2) {
-  // CHECK-LABEL: test_vceqq_u16
   return vceqq_u16(v1, v2);
-  // CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vceqq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp eq <4 x i32> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vceqq_u32(uint32x4_t v1, uint32x4_t v2) {
-  // CHECK-LABEL: test_vceqq_u32
   return vceqq_u32(v1, v2);
-  // CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vceqq_p8(<16 x i8> %v1, <16 x i8> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp eq <16 x i8> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[SEXT_I]]
 uint8x16_t test_vceqq_p8(poly8x16_t v1, poly8x16_t v2) {
-  // CHECK-LABEL: test_vceqq_p8
   return vceqq_p8(v1, v2);
-  // CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
 
+// CHECK-LABEL: define <2 x i64> @test_vceqq_s64(<2 x i64> %v1, <2 x i64> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp eq <2 x i64> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[SEXT_I]]
 uint64x2_t test_vceqq_s64(int64x2_t v1, int64x2_t v2) {
-  // CHECK-LABEL: test_vceqq_s64
   return vceqq_s64(v1, v2);
-  // CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vceqq_u64(<2 x i64> %v1, <2 x i64> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp eq <2 x i64> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[SEXT_I]]
 uint64x2_t test_vceqq_u64(uint64x2_t v1, uint64x2_t v2) {
-  // CHECK-LABEL: test_vceqq_u64
   return vceqq_u64(v1, v2);
-  // CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vceqq_f64(<2 x double> %v1, <2 x double> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = fcmp oeq <2 x double> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[SEXT_I]]
 uint64x2_t test_vceqq_f64(float64x2_t v1, float64x2_t v2) {
-  // CHECK-LABEL: test_vceqq_f64
   return vceqq_f64(v1, v2);
-  // CHECK: fcmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
+// CHECK-LABEL: define <8 x i8> @test_vcge_s8(<8 x i8> %v1, <8 x i8> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sge <8 x i8> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[SEXT_I]]
 uint8x8_t test_vcge_s8(int8x8_t v1, int8x8_t v2) {
-// CHECK-LABEL: test_vcge_s8
   return vcge_s8(v1, v2);
-// CHECK: cmge {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vcge_s16(<4 x i16> %v1, <4 x i16> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sge <4 x i16> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[SEXT_I]]
 uint16x4_t test_vcge_s16(int16x4_t v1, int16x4_t v2) {
-// CHECK-LABEL: test_vcge_s16
   return vcge_s16(v1, v2);
-// CHECK: cmge {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vcge_s32(<2 x i32> %v1, <2 x i32> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sge <2 x i32> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vcge_s32(int32x2_t v1, int32x2_t v2) {
-// CHECK-LABEL: test_vcge_s32
   return vcge_s32(v1, v2);
-// CHECK: cmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vcge_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sge <1 x i64> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[SEXT_I]]
 uint64x1_t test_vcge_s64(int64x1_t a, int64x1_t b) {
-  // CHECK-LABEL: test_vcge_s64
   return vcge_s64(a, b);
-  // CHECK: cmge {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vcge_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp uge <1 x i64> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[SEXT_I]]
 uint64x1_t test_vcge_u64(uint64x1_t a, uint64x1_t b) {
-  // CHECK-LABEL: test_vcge_u64
   return vcge_u64(a, b);
-  // CHECK: cmhs {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vcge_f32(<2 x float> %v1, <2 x float> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = fcmp oge <2 x float> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vcge_f32(float32x2_t v1, float32x2_t v2) {
-// CHECK-LABEL: test_vcge_f32
   return vcge_f32(v1, v2);
-// CHECK: fcmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vcge_f64(<1 x double> %a, <1 x double> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = fcmp oge <1 x double> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[SEXT_I]]
 uint64x1_t test_vcge_f64(float64x1_t a, float64x1_t b) {
-  // CHECK-LABEL: test_vcge_f64
   return vcge_f64(a, b);
-  // CHECK: fcmge {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vcge_u8(<8 x i8> %v1, <8 x i8> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp uge <8 x i8> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[SEXT_I]]
 uint8x8_t test_vcge_u8(uint8x8_t v1, uint8x8_t v2) {
-// CHECK-LABEL: test_vcge_u8
   return vcge_u8(v1, v2);
-// CHECK: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vcge_u16(<4 x i16> %v1, <4 x i16> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp uge <4 x i16> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[SEXT_I]]
 uint16x4_t test_vcge_u16(uint16x4_t v1, uint16x4_t v2) {
-// CHECK-LABEL: test_vcge_u16
   return vcge_u16(v1, v2);
-// CHECK: cmhs {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vcge_u32(<2 x i32> %v1, <2 x i32> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp uge <2 x i32> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vcge_u32(uint32x2_t v1, uint32x2_t v2) {
-// CHECK-LABEL: test_vcge_u32
   return vcge_u32(v1, v2);
-// CHECK: cmhs {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vcgeq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sge <16 x i8> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[SEXT_I]]
 uint8x16_t test_vcgeq_s8(int8x16_t v1, int8x16_t v2) {
-// CHECK-LABEL: test_vcgeq_s8
   return vcgeq_s8(v1, v2);
-// CHECK: cmge {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vcgeq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sge <8 x i16> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[SEXT_I]]
 uint16x8_t test_vcgeq_s16(int16x8_t v1, int16x8_t v2) {
-// CHECK-LABEL: test_vcgeq_s16
   return vcgeq_s16(v1, v2);
-// CHECK: cmge {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcgeq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sge <4 x i32> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcgeq_s32(int32x4_t v1, int32x4_t v2) {
-// CHECK-LABEL: test_vcgeq_s32
   return vcgeq_s32(v1, v2);
-// CHECK: cmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcgeq_f32(<4 x float> %v1, <4 x float> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = fcmp oge <4 x float> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcgeq_f32(float32x4_t v1, float32x4_t v2) {
-// CHECK-LABEL: test_vcgeq_f32
   return vcgeq_f32(v1, v2);
-// CHECK: fcmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vcgeq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp uge <16 x i8> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[SEXT_I]]
 uint8x16_t test_vcgeq_u8(uint8x16_t v1, uint8x16_t v2) {
-// CHECK-LABEL: test_vcgeq_u8
   return vcgeq_u8(v1, v2);
-// CHECK: cmhs {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vcgeq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp uge <8 x i16> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[SEXT_I]]
 uint16x8_t test_vcgeq_u16(uint16x8_t v1, uint16x8_t v2) {
-// CHECK-LABEL: test_vcgeq_u16
   return vcgeq_u16(v1, v2);
-// CHECK: cmhs {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcgeq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp uge <4 x i32> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcgeq_u32(uint32x4_t v1, uint32x4_t v2) {
-// CHECK-LABEL: test_vcgeq_u32
   return vcgeq_u32(v1, v2);
-// CHECK: cmhs {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vcgeq_s64(<2 x i64> %v1, <2 x i64> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sge <2 x i64> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[SEXT_I]]
 uint64x2_t test_vcgeq_s64(int64x2_t v1, int64x2_t v2) {
-// CHECK-LABEL: test_vcgeq_s64
   return vcgeq_s64(v1, v2);
-// CHECK: cmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vcgeq_u64(<2 x i64> %v1, <2 x i64> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp uge <2 x i64> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[SEXT_I]]
 uint64x2_t test_vcgeq_u64(uint64x2_t v1, uint64x2_t v2) {
-// CHECK-LABEL: test_vcgeq_u64
   return vcgeq_u64(v1, v2);
-// CHECK: cmhs {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vcgeq_f64(<2 x double> %v1, <2 x double> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = fcmp oge <2 x double> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[SEXT_I]]
 uint64x2_t test_vcgeq_f64(float64x2_t v1, float64x2_t v2) {
-// CHECK-LABEL: test_vcgeq_f64
   return vcgeq_f64(v1, v2);
-// CHECK: fcmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
 // Notes about vcle:
 // LE condition predicate implemented as GE, so check reversed operands.
 // Using registers other than v0, v1 are possible, but would be odd.
+// CHECK-LABEL: define <8 x i8> @test_vcle_s8(<8 x i8> %v1, <8 x i8> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sle <8 x i8> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[SEXT_I]]
 uint8x8_t test_vcle_s8(int8x8_t v1, int8x8_t v2) {
-  // CHECK-LABEL: test_vcle_s8
   return vcle_s8(v1, v2);
-  // CHECK: cmge {{v[0-9]+}}.8b, v1.8b, v0.8b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vcle_s16(<4 x i16> %v1, <4 x i16> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sle <4 x i16> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[SEXT_I]]
 uint16x4_t test_vcle_s16(int16x4_t v1, int16x4_t v2) {
-  // CHECK-LABEL: test_vcle_s16
   return vcle_s16(v1, v2);
-  // CHECK: cmge {{v[0-9]+}}.4h, v1.4h, v0.4h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vcle_s32(<2 x i32> %v1, <2 x i32> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sle <2 x i32> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vcle_s32(int32x2_t v1, int32x2_t v2) {
-  // CHECK-LABEL: test_vcle_s32
   return vcle_s32(v1, v2);
-  // CHECK: cmge {{v[0-9]+}}.2s, v1.2s, v0.2s
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vcle_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sle <1 x i64> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[SEXT_I]]
 uint64x1_t test_vcle_s64(int64x1_t a, int64x1_t b) {
-  // CHECK-LABEL: test_vcle_s64
   return vcle_s64(a, b);
-  // CHECK: cmge {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vcle_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ule <1 x i64> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[SEXT_I]]
 uint64x1_t test_vcle_u64(uint64x1_t a, uint64x1_t b) {
-  // CHECK-LABEL: test_vcle_u64
   return vcle_u64(a, b);
-  // CHECK: cmhs {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vcle_f32(<2 x float> %v1, <2 x float> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = fcmp ole <2 x float> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vcle_f32(float32x2_t v1, float32x2_t v2) {
-  // CHECK-LABEL: test_vcle_f32
   return vcle_f32(v1, v2);
-  // CHECK: fcmge {{v[0-9]+}}.2s, v1.2s, v0.2s
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vcle_f64(<1 x double> %a, <1 x double> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = fcmp ole <1 x double> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[SEXT_I]]
 uint64x1_t test_vcle_f64(float64x1_t a, float64x1_t b) {
-  // CHECK-LABEL: test_vcle_f64
   return vcle_f64(a, b);
-  // CHECK: fcmge {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vcle_u8(<8 x i8> %v1, <8 x i8> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ule <8 x i8> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[SEXT_I]]
 uint8x8_t test_vcle_u8(uint8x8_t v1, uint8x8_t v2) {
-  // CHECK-LABEL: test_vcle_u8
   return vcle_u8(v1, v2);
-  // CHECK: cmhs {{v[0-9]+}}.8b, v1.8b, v0.8b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vcle_u16(<4 x i16> %v1, <4 x i16> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ule <4 x i16> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[SEXT_I]]
 uint16x4_t test_vcle_u16(uint16x4_t v1, uint16x4_t v2) {
-  // CHECK-LABEL: test_vcle_u16
   return vcle_u16(v1, v2);
-  // CHECK: cmhs {{v[0-9]+}}.4h, v1.4h, v0.4h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vcle_u32(<2 x i32> %v1, <2 x i32> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ule <2 x i32> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vcle_u32(uint32x2_t v1, uint32x2_t v2) {
-  // CHECK-LABEL: test_vcle_u32
   return vcle_u32(v1, v2);
-  // CHECK: cmhs {{v[0-9]+}}.2s, v1.2s, v0.2s
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vcleq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sle <16 x i8> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[SEXT_I]]
 uint8x16_t test_vcleq_s8(int8x16_t v1, int8x16_t v2) {
-  // CHECK-LABEL: test_vcleq_s8
   return vcleq_s8(v1, v2);
-  // CHECK: cmge {{v[0-9]+}}.16b, v1.16b, v0.16b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vcleq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sle <8 x i16> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[SEXT_I]]
 uint16x8_t test_vcleq_s16(int16x8_t v1, int16x8_t v2) {
-  // CHECK-LABEL: test_vcleq_s16
   return vcleq_s16(v1, v2);
-  // CHECK: cmge {{v[0-9]+}}.8h, v1.8h, v0.8h
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcleq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sle <4 x i32> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcleq_s32(int32x4_t v1, int32x4_t v2) {
-  // CHECK-LABEL: test_vcleq_s32
   return vcleq_s32(v1, v2);
-  // CHECK: cmge {{v[0-9]+}}.4s, v1.4s, v0.4s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcleq_f32(<4 x float> %v1, <4 x float> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = fcmp ole <4 x float> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcleq_f32(float32x4_t v1, float32x4_t v2) {
-  // CHECK-LABEL: test_vcleq_f32
   return vcleq_f32(v1, v2);
-  // CHECK: fcmge {{v[0-9]+}}.4s, v1.4s, v0.4s
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vcleq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ule <16 x i8> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[SEXT_I]]
 uint8x16_t test_vcleq_u8(uint8x16_t v1, uint8x16_t v2) {
-  // CHECK-LABEL: test_vcleq_u8
   return vcleq_u8(v1, v2);
-  // CHECK: cmhs {{v[0-9]+}}.16b, v1.16b, v0.16b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vcleq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ule <8 x i16> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[SEXT_I]]
 uint16x8_t test_vcleq_u16(uint16x8_t v1, uint16x8_t v2) {
-  // CHECK-LABEL: test_vcleq_u16
   return vcleq_u16(v1, v2);
-  // CHECK: cmhs {{v[0-9]+}}.8h, v1.8h, v0.8h
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcleq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ule <4 x i32> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcleq_u32(uint32x4_t v1, uint32x4_t v2) {
-  // CHECK-LABEL: test_vcleq_u32
   return vcleq_u32(v1, v2);
-  // CHECK: cmhs {{v[0-9]+}}.4s, v1.4s, v0.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vcleq_s64(<2 x i64> %v1, <2 x i64> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sle <2 x i64> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[SEXT_I]]
 uint64x2_t test_vcleq_s64(int64x2_t v1, int64x2_t v2) {
-  // CHECK-LABEL: test_vcleq_s64
   return vcleq_s64(v1, v2);
-  // CHECK: cmge {{v[0-9]+}}.2d, v1.2d, v0.2d
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vcleq_u64(<2 x i64> %v1, <2 x i64> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ule <2 x i64> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[SEXT_I]]
 uint64x2_t test_vcleq_u64(uint64x2_t v1, uint64x2_t v2) {
-  // CHECK-LABEL: test_vcleq_u64
   return vcleq_u64(v1, v2);
-  // CHECK: cmhs {{v[0-9]+}}.2d, v1.2d, v0.2d
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vcleq_f64(<2 x double> %v1, <2 x double> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = fcmp ole <2 x double> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[SEXT_I]]
 uint64x2_t test_vcleq_f64(float64x2_t v1, float64x2_t v2) {
-  // CHECK-LABEL: test_vcleq_f64
   return vcleq_f64(v1, v2);
-  // CHECK: fcmge {{v[0-9]+}}.2d, v1.2d, v0.2d
 }
 
 
+// CHECK-LABEL: define <8 x i8> @test_vcgt_s8(<8 x i8> %v1, <8 x i8> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sgt <8 x i8> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[SEXT_I]]
 uint8x8_t test_vcgt_s8(int8x8_t v1, int8x8_t v2) {
-  // CHECK-LABEL: test_vcgt_s8
   return vcgt_s8(v1, v2);
-  // CHECK: cmgt {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vcgt_s16(<4 x i16> %v1, <4 x i16> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sgt <4 x i16> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[SEXT_I]]
 uint16x4_t test_vcgt_s16(int16x4_t v1, int16x4_t v2) {
-  // CHECK-LABEL: test_vcgt_s16
   return vcgt_s16(v1, v2);
-  // CHECK: cmgt {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vcgt_s32(<2 x i32> %v1, <2 x i32> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sgt <2 x i32> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vcgt_s32(int32x2_t v1, int32x2_t v2) {
-  // CHECK-LABEL: test_vcgt_s32
   return vcgt_s32(v1, v2);
-  // CHECK: cmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vcgt_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sgt <1 x i64> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[SEXT_I]]
 uint64x1_t test_vcgt_s64(int64x1_t a, int64x1_t b) {
-  // CHECK-LABEL: test_vcgt_s64
   return vcgt_s64(a, b);
-  // CHECK: cmgt {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vcgt_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ugt <1 x i64> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[SEXT_I]]
 uint64x1_t test_vcgt_u64(uint64x1_t a, uint64x1_t b) {
-  // CHECK-LABEL: test_vcgt_u64
   return vcgt_u64(a, b);
-  // CHECK: cmhi {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vcgt_f32(<2 x float> %v1, <2 x float> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = fcmp ogt <2 x float> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vcgt_f32(float32x2_t v1, float32x2_t v2) {
-  // CHECK-LABEL: test_vcgt_f32
   return vcgt_f32(v1, v2);
-  // CHECK: fcmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vcgt_f64(<1 x double> %a, <1 x double> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = fcmp ogt <1 x double> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[SEXT_I]]
 uint64x1_t test_vcgt_f64(float64x1_t a, float64x1_t b) {
-  // CHECK-LABEL: test_vcgt_f64
   return vcgt_f64(a, b);
-  // CHECK: fcmgt {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vcgt_u8(<8 x i8> %v1, <8 x i8> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ugt <8 x i8> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[SEXT_I]]
 uint8x8_t test_vcgt_u8(uint8x8_t v1, uint8x8_t v2) {
-  // CHECK-LABEL: test_vcgt_u8
   return vcgt_u8(v1, v2);
-  // CHECK: cmhi {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vcgt_u16(<4 x i16> %v1, <4 x i16> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ugt <4 x i16> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[SEXT_I]]
 uint16x4_t test_vcgt_u16(uint16x4_t v1, uint16x4_t v2) {
-  // CHECK-LABEL: test_vcgt_u16
   return vcgt_u16(v1, v2);
-  // CHECK: cmhi {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vcgt_u32(<2 x i32> %v1, <2 x i32> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ugt <2 x i32> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vcgt_u32(uint32x2_t v1, uint32x2_t v2) {
-  // CHECK-LABEL: test_vcgt_u32
   return vcgt_u32(v1, v2);
-  // CHECK: cmhi {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vcgtq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sgt <16 x i8> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[SEXT_I]]
 uint8x16_t test_vcgtq_s8(int8x16_t v1, int8x16_t v2) {
-  // CHECK-LABEL: test_vcgtq_s8
   return vcgtq_s8(v1, v2);
-  // CHECK: cmgt {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vcgtq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sgt <8 x i16> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[SEXT_I]]
 uint16x8_t test_vcgtq_s16(int16x8_t v1, int16x8_t v2) {
-  // CHECK-LABEL: test_vcgtq_s16
   return vcgtq_s16(v1, v2);
-  // CHECK: cmgt {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcgtq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sgt <4 x i32> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcgtq_s32(int32x4_t v1, int32x4_t v2) {
-  // CHECK-LABEL: test_vcgtq_s32
   return vcgtq_s32(v1, v2);
-  // CHECK: cmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcgtq_f32(<4 x float> %v1, <4 x float> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = fcmp ogt <4 x float> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcgtq_f32(float32x4_t v1, float32x4_t v2) {
-  // CHECK-LABEL: test_vcgtq_f32
   return vcgtq_f32(v1, v2);
-  // CHECK: fcmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vcgtq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ugt <16 x i8> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[SEXT_I]]
 uint8x16_t test_vcgtq_u8(uint8x16_t v1, uint8x16_t v2) {
-  // CHECK-LABEL: test_vcgtq_u8
   return vcgtq_u8(v1, v2);
-  // CHECK: cmhi {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vcgtq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ugt <8 x i16> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[SEXT_I]]
 uint16x8_t test_vcgtq_u16(uint16x8_t v1, uint16x8_t v2) {
-  // CHECK-LABEL: test_vcgtq_u16
   return vcgtq_u16(v1, v2);
-  // CHECK: cmhi {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcgtq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ugt <4 x i32> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcgtq_u32(uint32x4_t v1, uint32x4_t v2) {
-  // CHECK-LABEL: test_vcgtq_u32
   return vcgtq_u32(v1, v2);
-  // CHECK: cmhi {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vcgtq_s64(<2 x i64> %v1, <2 x i64> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sgt <2 x i64> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[SEXT_I]]
 uint64x2_t test_vcgtq_s64(int64x2_t v1, int64x2_t v2) {
-  // CHECK-LABEL: test_vcgtq_s64
   return vcgtq_s64(v1, v2);
-  // CHECK: cmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vcgtq_u64(<2 x i64> %v1, <2 x i64> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ugt <2 x i64> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[SEXT_I]]
 uint64x2_t test_vcgtq_u64(uint64x2_t v1, uint64x2_t v2) {
-  // CHECK-LABEL: test_vcgtq_u64
   return vcgtq_u64(v1, v2);
-  // CHECK: cmhi {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vcgtq_f64(<2 x double> %v1, <2 x double> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = fcmp ogt <2 x double> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[SEXT_I]]
 uint64x2_t test_vcgtq_f64(float64x2_t v1, float64x2_t v2) {
-  // CHECK-LABEL: test_vcgtq_f64
   return vcgtq_f64(v1, v2);
-  // CHECK: fcmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
 
@@ -1632,10112 +2529,20029 @@ uint64x2_t test_vcgtq_f64(float64x2_t v1, float64x2_t v2) {
 // LT condition predicate implemented as GT, so check reversed operands.
 // Using registers other than v0, v1 are possible, but would be odd.
 
+// CHECK-LABEL: define <8 x i8> @test_vclt_s8(<8 x i8> %v1, <8 x i8> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp slt <8 x i8> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[SEXT_I]]
 uint8x8_t test_vclt_s8(int8x8_t v1, int8x8_t v2) {
-  // CHECK-LABEL: test_vclt_s8
   return vclt_s8(v1, v2);
-  // CHECK: cmgt {{v[0-9]+}}.8b, v1.8b, v0.8b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vclt_s16(<4 x i16> %v1, <4 x i16> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp slt <4 x i16> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[SEXT_I]]
 uint16x4_t test_vclt_s16(int16x4_t v1, int16x4_t v2) {
-  // CHECK-LABEL: test_vclt_s16
   return vclt_s16(v1, v2);
-  // CHECK: cmgt {{v[0-9]+}}.4h, v1.4h, v0.4h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vclt_s32(<2 x i32> %v1, <2 x i32> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp slt <2 x i32> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vclt_s32(int32x2_t v1, int32x2_t v2) {
-  // CHECK-LABEL: test_vclt_s32
   return vclt_s32(v1, v2);
-  // CHECK: cmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vclt_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp slt <1 x i64> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[SEXT_I]]
 uint64x1_t test_vclt_s64(int64x1_t a, int64x1_t b) {
-  // CHECK-LABEL: test_vclt_s64
   return vclt_s64(a, b);
-  // CHECK: cmgt {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vclt_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ult <1 x i64> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[SEXT_I]]
 uint64x1_t test_vclt_u64(uint64x1_t a, uint64x1_t b) {
-  // CHECK-LABEL: test_vclt_u64
   return vclt_u64(a, b);
-  // CHECK: cmhi {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vclt_f32(<2 x float> %v1, <2 x float> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = fcmp olt <2 x float> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vclt_f32(float32x2_t v1, float32x2_t v2) {
-  // CHECK-LABEL: test_vclt_f32
   return vclt_f32(v1, v2);
-  // CHECK: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vclt_f64(<1 x double> %a, <1 x double> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = fcmp olt <1 x double> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[SEXT_I]]
 uint64x1_t test_vclt_f64(float64x1_t a, float64x1_t b) {
-  // CHECK-LABEL: test_vclt_f64
   return vclt_f64(a, b);
-  // CHECK: fcmgt {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vclt_u8(<8 x i8> %v1, <8 x i8> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ult <8 x i8> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[SEXT_I]]
 uint8x8_t test_vclt_u8(uint8x8_t v1, uint8x8_t v2) {
-  // CHECK-LABEL: test_vclt_u8
   return vclt_u8(v1, v2);
-  // CHECK: cmhi {{v[0-9]+}}.8b, v1.8b, v0.8b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vclt_u16(<4 x i16> %v1, <4 x i16> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ult <4 x i16> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[SEXT_I]]
 uint16x4_t test_vclt_u16(uint16x4_t v1, uint16x4_t v2) {
-  // CHECK-LABEL: test_vclt_u16
   return vclt_u16(v1, v2);
-  // CHECK: cmhi {{v[0-9]+}}.4h, v1.4h, v0.4h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vclt_u32(<2 x i32> %v1, <2 x i32> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ult <2 x i32> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vclt_u32(uint32x2_t v1, uint32x2_t v2) {
-  // CHECK-LABEL: test_vclt_u32
   return vclt_u32(v1, v2);
-  // CHECK: cmhi {{v[0-9]+}}.2s, v1.2s, v0.2s
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vcltq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp slt <16 x i8> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[SEXT_I]]
 uint8x16_t test_vcltq_s8(int8x16_t v1, int8x16_t v2) {
-  // CHECK-LABEL: test_vcltq_s8
   return vcltq_s8(v1, v2);
-  // CHECK: cmgt {{v[0-9]+}}.16b, v1.16b, v0.16b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vcltq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp slt <8 x i16> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[SEXT_I]]
 uint16x8_t test_vcltq_s16(int16x8_t v1, int16x8_t v2) {
-  // CHECK-LABEL: test_vcltq_s16
   return vcltq_s16(v1, v2);
-  // CHECK: cmgt {{v[0-9]+}}.8h, v1.8h, v0.8h
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcltq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp slt <4 x i32> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcltq_s32(int32x4_t v1, int32x4_t v2) {
-  // CHECK-LABEL: test_vcltq_s32
   return vcltq_s32(v1, v2);
-  // CHECK: cmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcltq_f32(<4 x float> %v1, <4 x float> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = fcmp olt <4 x float> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcltq_f32(float32x4_t v1, float32x4_t v2) {
-  // CHECK-LABEL: test_vcltq_f32
   return vcltq_f32(v1, v2);
-  // CHECK: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vcltq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ult <16 x i8> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[SEXT_I]]
 uint8x16_t test_vcltq_u8(uint8x16_t v1, uint8x16_t v2) {
-  // CHECK-LABEL: test_vcltq_u8
   return vcltq_u8(v1, v2);
-  // CHECK: cmhi {{v[0-9]+}}.16b, v1.16b, v0.16b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vcltq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ult <8 x i16> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[SEXT_I]]
 uint16x8_t test_vcltq_u16(uint16x8_t v1, uint16x8_t v2) {
-  // CHECK-LABEL: test_vcltq_u16
   return vcltq_u16(v1, v2);
-  // CHECK: cmhi {{v[0-9]+}}.8h, v1.8h, v0.8h
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcltq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ult <4 x i32> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcltq_u32(uint32x4_t v1, uint32x4_t v2) {
-  // CHECK-LABEL: test_vcltq_u32
   return vcltq_u32(v1, v2);
-  // CHECK: cmhi {{v[0-9]+}}.4s, v1.4s, v0.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vcltq_s64(<2 x i64> %v1, <2 x i64> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp slt <2 x i64> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[SEXT_I]]
 uint64x2_t test_vcltq_s64(int64x2_t v1, int64x2_t v2) {
-  // CHECK-LABEL: test_vcltq_s64
   return vcltq_s64(v1, v2);
-  // CHECK: cmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vcltq_u64(<2 x i64> %v1, <2 x i64> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ult <2 x i64> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[SEXT_I]]
 uint64x2_t test_vcltq_u64(uint64x2_t v1, uint64x2_t v2) {
-  // CHECK-LABEL: test_vcltq_u64
   return vcltq_u64(v1, v2);
-  // CHECK: cmhi {{v[0-9]+}}.2d, v1.2d, v0.2d
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vcltq_f64(<2 x double> %v1, <2 x double> %v2) #0 {
+// CHECK:   [[CMP_I:%.*]] = fcmp olt <2 x double> %v1, %v2
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[SEXT_I]]
 uint64x2_t test_vcltq_f64(float64x2_t v1, float64x2_t v2) {
-  // CHECK-LABEL: test_vcltq_f64
   return vcltq_f64(v1, v2);
-  // CHECK: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
 }
 
 
+// CHECK-LABEL: define <8 x i8> @test_vhadd_s8(<8 x i8> %v1, <8 x i8> %v2) #0 {
+// CHECK:   [[VHADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.shadd.v8i8(<8 x i8> %v1, <8 x i8> %v2) #4
+// CHECK:   ret <8 x i8> [[VHADD_V_I]]
 int8x8_t test_vhadd_s8(int8x8_t v1, int8x8_t v2) {
-// CHECK-LABEL: test_vhadd_s8
   return vhadd_s8(v1, v2);
-  // CHECK: shadd {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vhadd_s16(<4 x i16> %v1, <4 x i16> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
+// CHECK:   [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.shadd.v4i16(<4 x i16> [[VHADD_V_I]], <4 x i16> [[VHADD_V1_I]]) #4
+// CHECK:   [[VHADD_V3_I:%.*]] = bitcast <4 x i16> [[VHADD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vhadd_s16(int16x4_t v1, int16x4_t v2) {
-// CHECK-LABEL: test_vhadd_s16
   return vhadd_s16(v1, v2);
-  // CHECK: shadd {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vhadd_s32(<2 x i32> %v1, <2 x i32> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
+// CHECK:   [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.shadd.v2i32(<2 x i32> [[VHADD_V_I]], <2 x i32> [[VHADD_V1_I]]) #4
+// CHECK:   [[VHADD_V3_I:%.*]] = bitcast <2 x i32> [[VHADD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vhadd_s32(int32x2_t v1, int32x2_t v2) {
-// CHECK-LABEL: test_vhadd_s32
   return vhadd_s32(v1, v2);
-  // CHECK: shadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vhadd_u8(<8 x i8> %v1, <8 x i8> %v2) #0 {
+// CHECK:   [[VHADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uhadd.v8i8(<8 x i8> %v1, <8 x i8> %v2) #4
+// CHECK:   ret <8 x i8> [[VHADD_V_I]]
 uint8x8_t test_vhadd_u8(uint8x8_t v1, uint8x8_t v2) {
-// CHECK-LABEL: test_vhadd_u8
   return vhadd_u8(v1, v2);
-  // CHECK: uhadd {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vhadd_u16(<4 x i16> %v1, <4 x i16> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
+// CHECK:   [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uhadd.v4i16(<4 x i16> [[VHADD_V_I]], <4 x i16> [[VHADD_V1_I]]) #4
+// CHECK:   [[VHADD_V3_I:%.*]] = bitcast <4 x i16> [[VHADD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 uint16x4_t test_vhadd_u16(uint16x4_t v1, uint16x4_t v2) {
-// CHECK-LABEL: test_vhadd_u16
   return vhadd_u16(v1, v2);
-  // CHECK: uhadd {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vhadd_u32(<2 x i32> %v1, <2 x i32> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
+// CHECK:   [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uhadd.v2i32(<2 x i32> [[VHADD_V_I]], <2 x i32> [[VHADD_V1_I]]) #4
+// CHECK:   [[VHADD_V3_I:%.*]] = bitcast <2 x i32> [[VHADD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 uint32x2_t test_vhadd_u32(uint32x2_t v1, uint32x2_t v2) {
-// CHECK-LABEL: test_vhadd_u32
   return vhadd_u32(v1, v2);
-  // CHECK: uhadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vhaddq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 {
+// CHECK:   [[VHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.shadd.v16i8(<16 x i8> %v1, <16 x i8> %v2) #4
+// CHECK:   ret <16 x i8> [[VHADDQ_V_I]]
 int8x16_t test_vhaddq_s8(int8x16_t v1, int8x16_t v2) {
-// CHECK-LABEL: test_vhaddq_s8
   return vhaddq_s8(v1, v2);
-  // CHECK: shadd {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vhaddq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
+// CHECK:   [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> [[VHADDQ_V_I]], <8 x i16> [[VHADDQ_V1_I]]) #4
+// CHECK:   [[VHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VHADDQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vhaddq_s16(int16x8_t v1, int16x8_t v2) {
-// CHECK-LABEL: test_vhaddq_s16
   return vhaddq_s16(v1, v2);
-  // CHECK: shadd {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vhaddq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
+// CHECK:   [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.shadd.v4i32(<4 x i32> [[VHADDQ_V_I]], <4 x i32> [[VHADDQ_V1_I]]) #4
+// CHECK:   [[VHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VHADDQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vhaddq_s32(int32x4_t v1, int32x4_t v2) {
-// CHECK-LABEL: test_vhaddq_s32
   return vhaddq_s32(v1, v2);
-  // CHECK: shadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vhaddq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 {
+// CHECK:   [[VHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uhadd.v16i8(<16 x i8> %v1, <16 x i8> %v2) #4
+// CHECK:   ret <16 x i8> [[VHADDQ_V_I]]
 uint8x16_t test_vhaddq_u8(uint8x16_t v1, uint8x16_t v2) {
-// CHECK-LABEL: test_vhaddq_u8
   return vhaddq_u8(v1, v2);
-  // CHECK: uhadd {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vhaddq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
+// CHECK:   [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> [[VHADDQ_V_I]], <8 x i16> [[VHADDQ_V1_I]]) #4
+// CHECK:   [[VHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VHADDQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 uint16x8_t test_vhaddq_u16(uint16x8_t v1, uint16x8_t v2) {
-// CHECK-LABEL: test_vhaddq_u16
   return vhaddq_u16(v1, v2);
-  // CHECK: uhadd {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vhaddq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
+// CHECK:   [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uhadd.v4i32(<4 x i32> [[VHADDQ_V_I]], <4 x i32> [[VHADDQ_V1_I]]) #4
+// CHECK:   [[VHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VHADDQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 uint32x4_t test_vhaddq_u32(uint32x4_t v1, uint32x4_t v2) {
-// CHECK-LABEL: test_vhaddq_u32
   return vhaddq_u32(v1, v2);
-  // CHECK: uhadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
 
+// CHECK-LABEL: define <8 x i8> @test_vhsub_s8(<8 x i8> %v1, <8 x i8> %v2) #0 {
+// CHECK:   [[VHSUB_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.shsub.v8i8(<8 x i8> %v1, <8 x i8> %v2) #4
+// CHECK:   ret <8 x i8> [[VHSUB_V_I]]
 int8x8_t test_vhsub_s8(int8x8_t v1, int8x8_t v2) {
-// CHECK-LABEL: test_vhsub_s8
   return vhsub_s8(v1, v2);
-  // CHECK: shsub {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vhsub_s16(<4 x i16> %v1, <4 x i16> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
+// CHECK:   [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.shsub.v4i16(<4 x i16> [[VHSUB_V_I]], <4 x i16> [[VHSUB_V1_I]]) #4
+// CHECK:   [[VHSUB_V3_I:%.*]] = bitcast <4 x i16> [[VHSUB_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vhsub_s16(int16x4_t v1, int16x4_t v2) {
-// CHECK-LABEL: test_vhsub_s16
   return vhsub_s16(v1, v2);
-  // CHECK: shsub {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vhsub_s32(<2 x i32> %v1, <2 x i32> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
+// CHECK:   [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.shsub.v2i32(<2 x i32> [[VHSUB_V_I]], <2 x i32> [[VHSUB_V1_I]]) #4
+// CHECK:   [[VHSUB_V3_I:%.*]] = bitcast <2 x i32> [[VHSUB_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vhsub_s32(int32x2_t v1, int32x2_t v2) {
-// CHECK-LABEL: test_vhsub_s32
   return vhsub_s32(v1, v2);
-  // CHECK: shsub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vhsub_u8(<8 x i8> %v1, <8 x i8> %v2) #0 {
+// CHECK:   [[VHSUB_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uhsub.v8i8(<8 x i8> %v1, <8 x i8> %v2) #4
+// CHECK:   ret <8 x i8> [[VHSUB_V_I]]
 uint8x8_t test_vhsub_u8(uint8x8_t v1, uint8x8_t v2) {
-// CHECK-LABEL: test_vhsub_u8
   return vhsub_u8(v1, v2);
-  // CHECK: uhsub {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vhsub_u16(<4 x i16> %v1, <4 x i16> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
+// CHECK:   [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uhsub.v4i16(<4 x i16> [[VHSUB_V_I]], <4 x i16> [[VHSUB_V1_I]]) #4
+// CHECK:   [[VHSUB_V3_I:%.*]] = bitcast <4 x i16> [[VHSUB_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 uint16x4_t test_vhsub_u16(uint16x4_t v1, uint16x4_t v2) {
-// CHECK-LABEL: test_vhsub_u16
   return vhsub_u16(v1, v2);
-  // CHECK: uhsub {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vhsub_u32(<2 x i32> %v1, <2 x i32> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
+// CHECK:   [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uhsub.v2i32(<2 x i32> [[VHSUB_V_I]], <2 x i32> [[VHSUB_V1_I]]) #4
+// CHECK:   [[VHSUB_V3_I:%.*]] = bitcast <2 x i32> [[VHSUB_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 uint32x2_t test_vhsub_u32(uint32x2_t v1, uint32x2_t v2) {
-// CHECK-LABEL: test_vhsub_u32
   return vhsub_u32(v1, v2);
-  // CHECK: uhsub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vhsubq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 {
+// CHECK:   [[VHSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.shsub.v16i8(<16 x i8> %v1, <16 x i8> %v2) #4
+// CHECK:   ret <16 x i8> [[VHSUBQ_V_I]]
 int8x16_t test_vhsubq_s8(int8x16_t v1, int8x16_t v2) {
-// CHECK-LABEL: test_vhsubq_s8
   return vhsubq_s8(v1, v2);
-  // CHECK: shsub {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vhsubq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
+// CHECK:   [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.shsub.v8i16(<8 x i16> [[VHSUBQ_V_I]], <8 x i16> [[VHSUBQ_V1_I]]) #4
+// CHECK:   [[VHSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VHSUBQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vhsubq_s16(int16x8_t v1, int16x8_t v2) {
-// CHECK-LABEL: test_vhsubq_s16
   return vhsubq_s16(v1, v2);
-  // CHECK: shsub {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vhsubq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
+// CHECK:   [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.shsub.v4i32(<4 x i32> [[VHSUBQ_V_I]], <4 x i32> [[VHSUBQ_V1_I]]) #4
+// CHECK:   [[VHSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VHSUBQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vhsubq_s32(int32x4_t v1, int32x4_t v2) {
-// CHECK-LABEL: test_vhsubq_s32
   return vhsubq_s32(v1, v2);
-  // CHECK: shsub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vhsubq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 {
+// CHECK:   [[VHSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uhsub.v16i8(<16 x i8> %v1, <16 x i8> %v2) #4
+// CHECK:   ret <16 x i8> [[VHSUBQ_V_I]]
 uint8x16_t test_vhsubq_u8(uint8x16_t v1, uint8x16_t v2) {
-// CHECK-LABEL: test_vhsubq_u8
   return vhsubq_u8(v1, v2);
-  // CHECK: uhsub {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vhsubq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
+// CHECK:   [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uhsub.v8i16(<8 x i16> [[VHSUBQ_V_I]], <8 x i16> [[VHSUBQ_V1_I]]) #4
+// CHECK:   [[VHSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VHSUBQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 uint16x8_t test_vhsubq_u16(uint16x8_t v1, uint16x8_t v2) {
-// CHECK-LABEL: test_vhsubq_u16
   return vhsubq_u16(v1, v2);
-  // CHECK: uhsub {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vhsubq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
+// CHECK:   [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uhsub.v4i32(<4 x i32> [[VHSUBQ_V_I]], <4 x i32> [[VHSUBQ_V1_I]]) #4
+// CHECK:   [[VHSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VHSUBQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 uint32x4_t test_vhsubq_u32(uint32x4_t v1, uint32x4_t v2) {
-// CHECK-LABEL: test_vhsubq_u32
   return vhsubq_u32(v1, v2);
-  // CHECK: uhsub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
 
+// CHECK-LABEL: define <8 x i8> @test_vrhadd_s8(<8 x i8> %v1, <8 x i8> %v2) #0 {
+// CHECK:   [[VRHADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.srhadd.v8i8(<8 x i8> %v1, <8 x i8> %v2) #4
+// CHECK:   ret <8 x i8> [[VRHADD_V_I]]
 int8x8_t test_vrhadd_s8(int8x8_t v1, int8x8_t v2) {
-// CHECK-LABEL: test_vrhadd_s8
   return vrhadd_s8(v1, v2);
-// CHECK: srhadd {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vrhadd_s16(<4 x i16> %v1, <4 x i16> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
+// CHECK:   [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.srhadd.v4i16(<4 x i16> [[VRHADD_V_I]], <4 x i16> [[VRHADD_V1_I]]) #4
+// CHECK:   [[VRHADD_V3_I:%.*]] = bitcast <4 x i16> [[VRHADD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vrhadd_s16(int16x4_t v1, int16x4_t v2) {
-// CHECK-LABEL: test_vrhadd_s16
   return vrhadd_s16(v1, v2);
-// CHECK: srhadd {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vrhadd_s32(<2 x i32> %v1, <2 x i32> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
+// CHECK:   [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.srhadd.v2i32(<2 x i32> [[VRHADD_V_I]], <2 x i32> [[VRHADD_V1_I]]) #4
+// CHECK:   [[VRHADD_V3_I:%.*]] = bitcast <2 x i32> [[VRHADD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vrhadd_s32(int32x2_t v1, int32x2_t v2) {
-// CHECK-LABEL: test_vrhadd_s32
   return vrhadd_s32(v1, v2);
-// CHECK: srhadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vrhadd_u8(<8 x i8> %v1, <8 x i8> %v2) #0 {
+// CHECK:   [[VRHADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8> %v1, <8 x i8> %v2) #4
+// CHECK:   ret <8 x i8> [[VRHADD_V_I]]
 uint8x8_t test_vrhadd_u8(uint8x8_t v1, uint8x8_t v2) {
-// CHECK-LABEL: test_vrhadd_u8
   return vrhadd_u8(v1, v2);
-// CHECK: urhadd {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vrhadd_u16(<4 x i16> %v1, <4 x i16> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %v2 to <8 x i8>
+// CHECK:   [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.urhadd.v4i16(<4 x i16> [[VRHADD_V_I]], <4 x i16> [[VRHADD_V1_I]]) #4
+// CHECK:   [[VRHADD_V3_I:%.*]] = bitcast <4 x i16> [[VRHADD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 uint16x4_t test_vrhadd_u16(uint16x4_t v1, uint16x4_t v2) {
-// CHECK-LABEL: test_vrhadd_u16
   return vrhadd_u16(v1, v2);
-// CHECK: urhadd {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vrhadd_u32(<2 x i32> %v1, <2 x i32> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %v1 to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %v2 to <8 x i8>
+// CHECK:   [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.urhadd.v2i32(<2 x i32> [[VRHADD_V_I]], <2 x i32> [[VRHADD_V1_I]]) #4
+// CHECK:   [[VRHADD_V3_I:%.*]] = bitcast <2 x i32> [[VRHADD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 uint32x2_t test_vrhadd_u32(uint32x2_t v1, uint32x2_t v2) {
-// CHECK-LABEL: test_vrhadd_u32
   return vrhadd_u32(v1, v2);
-// CHECK: urhadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vrhaddq_s8(<16 x i8> %v1, <16 x i8> %v2) #0 {
+// CHECK:   [[VRHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.srhadd.v16i8(<16 x i8> %v1, <16 x i8> %v2) #4
+// CHECK:   ret <16 x i8> [[VRHADDQ_V_I]]
 int8x16_t test_vrhaddq_s8(int8x16_t v1, int8x16_t v2) {
-// CHECK-LABEL: test_vrhaddq_s8
   return vrhaddq_s8(v1, v2);
-// CHECK: srhadd {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vrhaddq_s16(<8 x i16> %v1, <8 x i16> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
+// CHECK:   [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> [[VRHADDQ_V_I]], <8 x i16> [[VRHADDQ_V1_I]]) #4
+// CHECK:   [[VRHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VRHADDQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vrhaddq_s16(int16x8_t v1, int16x8_t v2) {
-// CHECK-LABEL: test_vrhaddq_s16
   return vrhaddq_s16(v1, v2);
-// CHECK: srhadd {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vrhaddq_s32(<4 x i32> %v1, <4 x i32> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
+// CHECK:   [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.srhadd.v4i32(<4 x i32> [[VRHADDQ_V_I]], <4 x i32> [[VRHADDQ_V1_I]]) #4
+// CHECK:   [[VRHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VRHADDQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vrhaddq_s32(int32x4_t v1, int32x4_t v2) {
-// CHECK-LABEL: test_vrhaddq_s32
   return vrhaddq_s32(v1, v2);
-// CHECK: srhadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vrhaddq_u8(<16 x i8> %v1, <16 x i8> %v2) #0 {
+// CHECK:   [[VRHADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.urhadd.v16i8(<16 x i8> %v1, <16 x i8> %v2) #4
+// CHECK:   ret <16 x i8> [[VRHADDQ_V_I]]
 uint8x16_t test_vrhaddq_u8(uint8x16_t v1, uint8x16_t v2) {
-// CHECK-LABEL: test_vrhaddq_u8
   return vrhaddq_u8(v1, v2);
-// CHECK: urhadd {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vrhaddq_u16(<8 x i16> %v1, <8 x i16> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %v2 to <16 x i8>
+// CHECK:   [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> [[VRHADDQ_V_I]], <8 x i16> [[VRHADDQ_V1_I]]) #4
+// CHECK:   [[VRHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VRHADDQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 uint16x8_t test_vrhaddq_u16(uint16x8_t v1, uint16x8_t v2) {
-// CHECK-LABEL: test_vrhaddq_u16
   return vrhaddq_u16(v1, v2);
-// CHECK: urhadd {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vrhaddq_u32(<4 x i32> %v1, <4 x i32> %v2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %v1 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %v2 to <16 x i8>
+// CHECK:   [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.urhadd.v4i32(<4 x i32> [[VRHADDQ_V_I]], <4 x i32> [[VRHADDQ_V1_I]]) #4
+// CHECK:   [[VRHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VRHADDQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 uint32x4_t test_vrhaddq_u32(uint32x4_t v1, uint32x4_t v2) {
-// CHECK-LABEL: test_vrhaddq_u32
   return vrhaddq_u32(v1, v2);
-// CHECK: urhadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
+// CHECK-LABEL: define <8 x i8> @test_vqadd_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqadd.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VQADD_V_I]]
 int8x8_t test_vqadd_s8(int8x8_t a, int8x8_t b) {
-// CHECK-LABEL: test_vqadd_s8
   return vqadd_s8(a, b);
-  // CHECK: sqadd {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vqadd_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> [[VQADD_V_I]], <4 x i16> [[VQADD_V1_I]]) #4
+// CHECK:   [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vqadd_s16(int16x4_t a, int16x4_t b) {
-// CHECK-LABEL: test_vqadd_s16
   return vqadd_s16(a, b);
-  // CHECK: sqadd {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vqadd_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> [[VQADD_V_I]], <2 x i32> [[VQADD_V1_I]]) #4
+// CHECK:   [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vqadd_s32(int32x2_t a, int32x2_t b) {
-// CHECK-LABEL: test_vqadd_s32
   return vqadd_s32(a, b);
-  // CHECK: sqadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vqadd_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqadd.v1i64(<1 x i64> [[VQADD_V_I]], <1 x i64> [[VQADD_V1_I]]) #4
+// CHECK:   [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP2]]
 int64x1_t test_vqadd_s64(int64x1_t a, int64x1_t b) {
-// CHECK-LABEL: test_vqadd_s64
   return vqadd_s64(a, b);
-// CHECK:  sqadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqadd_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqadd.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VQADD_V_I]]
 uint8x8_t test_vqadd_u8(uint8x8_t a, uint8x8_t b) {
-// CHECK-LABEL: test_vqadd_u8
   return vqadd_u8(a, b);
-  // CHECK: uqadd {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vqadd_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqadd.v4i16(<4 x i16> [[VQADD_V_I]], <4 x i16> [[VQADD_V1_I]]) #4
+// CHECK:   [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 uint16x4_t test_vqadd_u16(uint16x4_t a, uint16x4_t b) {
-// CHECK-LABEL: test_vqadd_u16
   return vqadd_u16(a, b);
-  // CHECK: uqadd {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vqadd_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqadd.v2i32(<2 x i32> [[VQADD_V_I]], <2 x i32> [[VQADD_V1_I]]) #4
+// CHECK:   [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 uint32x2_t test_vqadd_u32(uint32x2_t a, uint32x2_t b) {
-// CHECK-LABEL: test_vqadd_u32
   return vqadd_u32(a, b);
-  // CHECK: uqadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vqadd_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqadd.v1i64(<1 x i64> [[VQADD_V_I]], <1 x i64> [[VQADD_V1_I]]) #4
+// CHECK:   [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP2]]
 uint64x1_t test_vqadd_u64(uint64x1_t a, uint64x1_t b) {
-// CHECK:  test_vqadd_u64
   return vqadd_u64(a, b);
-// CHECK:  uqadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqaddq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqadd.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VQADDQ_V_I]]
 int8x16_t test_vqaddq_s8(int8x16_t a, int8x16_t b) {
-// CHECK-LABEL: test_vqaddq_s8
   return vqaddq_s8(a, b);
-  // CHECK: sqadd {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vqaddq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> [[VQADDQ_V_I]], <8 x i16> [[VQADDQ_V1_I]]) #4
+// CHECK:   [[VQADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vqaddq_s16(int16x8_t a, int16x8_t b) {
-// CHECK-LABEL: test_vqaddq_s16
   return vqaddq_s16(a, b);
-  // CHECK: sqadd {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqaddq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQADDQ_V_I]], <4 x i32> [[VQADDQ_V1_I]]) #4
+// CHECK:   [[VQADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqaddq_s32(int32x4_t a, int32x4_t b) {
-// CHECK-LABEL: test_vqaddq_s32
   return vqaddq_s32(a, b);
-  // CHECK: sqadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqaddq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQADDQ_V_I]], <2 x i64> [[VQADDQ_V1_I]]) #4
+// CHECK:   [[VQADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 int64x2_t test_vqaddq_s64(int64x2_t a, int64x2_t b) {
-// CHECK-LABEL: test_vqaddq_s64
   return vqaddq_s64(a, b);
-// CHECK: sqadd {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqaddq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqadd.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VQADDQ_V_I]]
 uint8x16_t test_vqaddq_u8(uint8x16_t a, uint8x16_t b) {
-// CHECK-LABEL: test_vqaddq_u8
   return vqaddq_u8(a, b);
-  // CHECK: uqadd {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vqaddq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqadd.v8i16(<8 x i16> [[VQADDQ_V_I]], <8 x i16> [[VQADDQ_V1_I]]) #4
+// CHECK:   [[VQADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 uint16x8_t test_vqaddq_u16(uint16x8_t a, uint16x8_t b) {
-// CHECK-LABEL: test_vqaddq_u16
   return vqaddq_u16(a, b);
-  // CHECK: uqadd {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqaddq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqadd.v4i32(<4 x i32> [[VQADDQ_V_I]], <4 x i32> [[VQADDQ_V1_I]]) #4
+// CHECK:   [[VQADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 uint32x4_t test_vqaddq_u32(uint32x4_t a, uint32x4_t b) {
-// CHECK-LABEL: test_vqaddq_u32
   return vqaddq_u32(a, b);
-  // CHECK: uqadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqaddq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqadd.v2i64(<2 x i64> [[VQADDQ_V_I]], <2 x i64> [[VQADDQ_V1_I]]) #4
+// CHECK:   [[VQADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 uint64x2_t test_vqaddq_u64(uint64x2_t a, uint64x2_t b) {
-// CHECK-LABEL: test_vqaddq_u64
   return vqaddq_u64(a, b);
-// CHECK: uqadd {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
 
+// CHECK-LABEL: define <8 x i8> @test_vqsub_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqsub.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VQSUB_V_I]]
 int8x8_t test_vqsub_s8(int8x8_t a, int8x8_t b) {
-// CHECK-LABEL: test_vqsub_s8
   return vqsub_s8(a, b);
-  // CHECK: sqsub {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vqsub_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> [[VQSUB_V_I]], <4 x i16> [[VQSUB_V1_I]]) #4
+// CHECK:   [[VQSUB_V3_I:%.*]] = bitcast <4 x i16> [[VQSUB_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vqsub_s16(int16x4_t a, int16x4_t b) {
-// CHECK-LABEL: test_vqsub_s16
   return vqsub_s16(a, b);
-  // CHECK: sqsub {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vqsub_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> [[VQSUB_V_I]], <2 x i32> [[VQSUB_V1_I]]) #4
+// CHECK:   [[VQSUB_V3_I:%.*]] = bitcast <2 x i32> [[VQSUB_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vqsub_s32(int32x2_t a, int32x2_t b) {
-// CHECK-LABEL: test_vqsub_s32
   return vqsub_s32(a, b);
-  // CHECK: sqsub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vqsub_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqsub.v1i64(<1 x i64> [[VQSUB_V_I]], <1 x i64> [[VQSUB_V1_I]]) #4
+// CHECK:   [[VQSUB_V3_I:%.*]] = bitcast <1 x i64> [[VQSUB_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP2]]
 int64x1_t test_vqsub_s64(int64x1_t a, int64x1_t b) {
-// CHECK-LABEL: test_vqsub_s64
   return vqsub_s64(a, b);
-// CHECK: sqsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqsub_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqsub.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VQSUB_V_I]]
 uint8x8_t test_vqsub_u8(uint8x8_t a, uint8x8_t b) {
-// CHECK-LABEL: test_vqsub_u8
   return vqsub_u8(a, b);
-  // CHECK: uqsub {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vqsub_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqsub.v4i16(<4 x i16> [[VQSUB_V_I]], <4 x i16> [[VQSUB_V1_I]]) #4
+// CHECK:   [[VQSUB_V3_I:%.*]] = bitcast <4 x i16> [[VQSUB_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 uint16x4_t test_vqsub_u16(uint16x4_t a, uint16x4_t b) {
-// CHECK-LABEL: test_vqsub_u16
   return vqsub_u16(a, b);
-  // CHECK: uqsub {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vqsub_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqsub.v2i32(<2 x i32> [[VQSUB_V_I]], <2 x i32> [[VQSUB_V1_I]]) #4
+// CHECK:   [[VQSUB_V3_I:%.*]] = bitcast <2 x i32> [[VQSUB_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 uint32x2_t test_vqsub_u32(uint32x2_t a, uint32x2_t b) {
-// CHECK-LABEL: test_vqsub_u32
   return vqsub_u32(a, b);
-  // CHECK: uqsub {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vqsub_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqsub.v1i64(<1 x i64> [[VQSUB_V_I]], <1 x i64> [[VQSUB_V1_I]]) #4
+// CHECK:   [[VQSUB_V3_I:%.*]] = bitcast <1 x i64> [[VQSUB_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP2]]
 uint64x1_t test_vqsub_u64(uint64x1_t a, uint64x1_t b) {
-// CHECK-LABEL: test_vqsub_u64
   return vqsub_u64(a, b);
-// CHECK:  uqsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqsubq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqsub.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VQSUBQ_V_I]]
 int8x16_t test_vqsubq_s8(int8x16_t a, int8x16_t b) {
-// CHECK-LABEL: test_vqsubq_s8
   return vqsubq_s8(a, b);
-  // CHECK: sqsub {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vqsubq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> [[VQSUBQ_V_I]], <8 x i16> [[VQSUBQ_V1_I]]) #4
+// CHECK:   [[VQSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSUBQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vqsubq_s16(int16x8_t a, int16x8_t b) {
-// CHECK-LABEL: test_vqsubq_s16
   return vqsubq_s16(a, b);
-  // CHECK: sqsub {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqsubq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQSUBQ_V_I]], <4 x i32> [[VQSUBQ_V1_I]]) #4
+// CHECK:   [[VQSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSUBQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqsubq_s32(int32x4_t a, int32x4_t b) {
-// CHECK-LABEL: test_vqsubq_s32
   return vqsubq_s32(a, b);
-  // CHECK: sqsub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqsubq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQSUBQ_V_I]], <2 x i64> [[VQSUBQ_V1_I]]) #4
+// CHECK:   [[VQSUBQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSUBQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 int64x2_t test_vqsubq_s64(int64x2_t a, int64x2_t b) {
-// CHECK-LABEL: test_vqsubq_s64
   return vqsubq_s64(a, b);
-// CHECK: sqsub {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqsubq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqsub.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VQSUBQ_V_I]]
 uint8x16_t test_vqsubq_u8(uint8x16_t a, uint8x16_t b) {
-// CHECK-LABEL: test_vqsubq_u8
   return vqsubq_u8(a, b);
-  // CHECK: uqsub {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vqsubq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqsub.v8i16(<8 x i16> [[VQSUBQ_V_I]], <8 x i16> [[VQSUBQ_V1_I]]) #4
+// CHECK:   [[VQSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSUBQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 uint16x8_t test_vqsubq_u16(uint16x8_t a, uint16x8_t b) {
-// CHECK-LABEL: test_vqsubq_u16
   return vqsubq_u16(a, b);
-  // CHECK: uqsub {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqsubq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqsub.v4i32(<4 x i32> [[VQSUBQ_V_I]], <4 x i32> [[VQSUBQ_V1_I]]) #4
+// CHECK:   [[VQSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSUBQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 uint32x4_t test_vqsubq_u32(uint32x4_t a, uint32x4_t b) {
-// CHECK-LABEL: test_vqsubq_u32
   return vqsubq_u32(a, b);
-  // CHECK: uqsub {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqsubq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqsub.v2i64(<2 x i64> [[VQSUBQ_V_I]], <2 x i64> [[VQSUBQ_V1_I]]) #4
+// CHECK:   [[VQSUBQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSUBQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 uint64x2_t test_vqsubq_u64(uint64x2_t a, uint64x2_t b) {
-// CHECK-LABEL: test_vqsubq_u64
   return vqsubq_u64(a, b);
-  // CHECK: uqsub {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
 
+// CHECK-LABEL: define <8 x i8> @test_vshl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VSHL_V_I]]
 int8x8_t test_vshl_s8(int8x8_t a, int8x8_t b) {
-// CHECK-LABEL: test_vshl_s8
   return vshl_s8(a, b);
-// CHECK: sshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vshl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sshl.v4i16(<4 x i16> [[VSHL_V_I]], <4 x i16> [[VSHL_V1_I]]) #4
+// CHECK:   [[VSHL_V3_I:%.*]] = bitcast <4 x i16> [[VSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vshl_s16(int16x4_t a, int16x4_t b) {
-// CHECK-LABEL: test_vshl_s16
   return vshl_s16(a, b);
-// CHECK: sshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vshl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sshl.v2i32(<2 x i32> [[VSHL_V_I]], <2 x i32> [[VSHL_V1_I]]) #4
+// CHECK:   [[VSHL_V3_I:%.*]] = bitcast <2 x i32> [[VSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vshl_s32(int32x2_t a, int32x2_t b) {
-// CHECK-LABEL: test_vshl_s32
   return vshl_s32(a, b);
-// CHECK: sshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vshl_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sshl.v1i64(<1 x i64> [[VSHL_V_I]], <1 x i64> [[VSHL_V1_I]]) #4
+// CHECK:   [[VSHL_V3_I:%.*]] = bitcast <1 x i64> [[VSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP2]]
 int64x1_t test_vshl_s64(int64x1_t a, int64x1_t b) {
-// CHECK-LABEL: test_vshl_s64
   return vshl_s64(a, b);
-// CHECK: sshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vshl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.ushl.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VSHL_V_I]]
 uint8x8_t test_vshl_u8(uint8x8_t a, int8x8_t b) {
-// CHECK-LABEL: test_vshl_u8
   return vshl_u8(a, b);
-// CHECK: ushl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vshl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.ushl.v4i16(<4 x i16> [[VSHL_V_I]], <4 x i16> [[VSHL_V1_I]]) #4
+// CHECK:   [[VSHL_V3_I:%.*]] = bitcast <4 x i16> [[VSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 uint16x4_t test_vshl_u16(uint16x4_t a, int16x4_t b) {
-// CHECK-LABEL: test_vshl_u16
   return vshl_u16(a, b);
-// CHECK: ushl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vshl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.ushl.v2i32(<2 x i32> [[VSHL_V_I]], <2 x i32> [[VSHL_V1_I]]) #4
+// CHECK:   [[VSHL_V3_I:%.*]] = bitcast <2 x i32> [[VSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 uint32x2_t test_vshl_u32(uint32x2_t a, int32x2_t b) {
-// CHECK-LABEL: test_vshl_u32
   return vshl_u32(a, b);
-// CHECK: ushl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vshl_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.ushl.v1i64(<1 x i64> [[VSHL_V_I]], <1 x i64> [[VSHL_V1_I]]) #4
+// CHECK:   [[VSHL_V3_I:%.*]] = bitcast <1 x i64> [[VSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP2]]
 uint64x1_t test_vshl_u64(uint64x1_t a, int64x1_t b) {
-// CHECK-LABEL: test_vshl_u64
   return vshl_u64(a, b);
-// CHECK: ushl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vshlq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VSHLQ_V_I]]
 int8x16_t test_vshlq_s8(int8x16_t a, int8x16_t b) {
-// CHECK-LABEL: test_vshlq_s8
   return vshlq_s8(a, b);
-// CHECK: sshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vshlq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sshl.v8i16(<8 x i16> [[VSHLQ_V_I]], <8 x i16> [[VSHLQ_V1_I]]) #4
+// CHECK:   [[VSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vshlq_s16(int16x8_t a, int16x8_t b) {
-// CHECK-LABEL: test_vshlq_s16
   return vshlq_s16(a, b);
-// CHECK: sshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vshlq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sshl.v4i32(<4 x i32> [[VSHLQ_V_I]], <4 x i32> [[VSHLQ_V1_I]]) #4
+// CHECK:   [[VSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vshlq_s32(int32x4_t a, int32x4_t b) {
-// CHECK-LABEL: test_vshlq_s32
   return vshlq_s32(a, b);
-// CHECK: sshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vshlq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sshl.v2i64(<2 x i64> [[VSHLQ_V_I]], <2 x i64> [[VSHLQ_V1_I]]) #4
+// CHECK:   [[VSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 int64x2_t test_vshlq_s64(int64x2_t a, int64x2_t b) {
-// CHECK-LABEL: test_vshlq_s64
   return vshlq_s64(a, b);
-// CHECK: sshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vshlq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.ushl.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VSHLQ_V_I]]
 uint8x16_t test_vshlq_u8(uint8x16_t a, int8x16_t b) {
-// CHECK-LABEL: test_vshlq_u8
   return vshlq_u8(a, b);
-// CHECK: ushl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vshlq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.ushl.v8i16(<8 x i16> [[VSHLQ_V_I]], <8 x i16> [[VSHLQ_V1_I]]) #4
+// CHECK:   [[VSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 uint16x8_t test_vshlq_u16(uint16x8_t a, int16x8_t b) {
-// CHECK-LABEL: test_vshlq_u16
   return vshlq_u16(a, b);
-// CHECK: ushl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vshlq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.ushl.v4i32(<4 x i32> [[VSHLQ_V_I]], <4 x i32> [[VSHLQ_V1_I]]) #4
+// CHECK:   [[VSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 uint32x4_t test_vshlq_u32(uint32x4_t a, int32x4_t b) {
-// CHECK-LABEL: test_vshlq_u32
   return vshlq_u32(a, b);
-// CHECK: ushl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vshlq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.ushl.v2i64(<2 x i64> [[VSHLQ_V_I]], <2 x i64> [[VSHLQ_V1_I]]) #4
+// CHECK:   [[VSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 uint64x2_t test_vshlq_u64(uint64x2_t a, int64x2_t b) {
-// CHECK-LABEL: test_vshlq_u64
   return vshlq_u64(a, b);
-// CHECK: ushl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
 
+// CHECK-LABEL: define <8 x i8> @test_vqshl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VQSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VQSHL_V_I]]
 int8x8_t test_vqshl_s8(int8x8_t a, int8x8_t b) {
-// CHECK-LABEL: test_vqshl_s8
   return vqshl_s8(a, b);
-// CHECK: sqshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vqshl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> [[VQSHL_V_I]], <4 x i16> [[VQSHL_V1_I]]) #4
+// CHECK:   [[VQSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vqshl_s16(int16x4_t a, int16x4_t b) {
-// CHECK-LABEL: test_vqshl_s16
   return vqshl_s16(a, b);
-// CHECK: sqshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vqshl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i32> [[VQSHL_V_I]], <2 x i32> [[VQSHL_V1_I]]) #4
+// CHECK:   [[VQSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vqshl_s32(int32x2_t a, int32x2_t b) {
-// CHECK-LABEL: test_vqshl_s32
   return vqshl_s32(a, b);
-// CHECK: sqshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vqshl_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqshl.v1i64(<1 x i64> [[VQSHL_V_I]], <1 x i64> [[VQSHL_V1_I]]) #4
+// CHECK:   [[VQSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP2]]
 int64x1_t test_vqshl_s64(int64x1_t a, int64x1_t b) {
-// CHECK-LABEL: test_vqshl_s64
   return vqshl_s64(a, b);
-// CHECK: sqshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqshl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VQSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VQSHL_V_I]]
 uint8x8_t test_vqshl_u8(uint8x8_t a, int8x8_t b) {
-// CHECK-LABEL: test_vqshl_u8
   return vqshl_u8(a, b);
-// CHECK: uqshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vqshl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> [[VQSHL_V_I]], <4 x i16> [[VQSHL_V1_I]]) #4
+// CHECK:   [[VQSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 uint16x4_t test_vqshl_u16(uint16x4_t a, int16x4_t b) {
-// CHECK-LABEL: test_vqshl_u16
   return vqshl_u16(a, b);
-// CHECK: uqshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vqshl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshl.v2i32(<2 x i32> [[VQSHL_V_I]], <2 x i32> [[VQSHL_V1_I]]) #4
+// CHECK:   [[VQSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 uint32x2_t test_vqshl_u32(uint32x2_t a, int32x2_t b) {
-// CHECK-LABEL: test_vqshl_u32
   return vqshl_u32(a, b);
-// CHECK: uqshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vqshl_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqshl.v1i64(<1 x i64> [[VQSHL_V_I]], <1 x i64> [[VQSHL_V1_I]]) #4
+// CHECK:   [[VQSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP2]]
 uint64x1_t test_vqshl_u64(uint64x1_t a, int64x1_t b) {
-// CHECK-LABEL: test_vqshl_u64
   return vqshl_u64(a, b);
-// CHECK: uqshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqshlq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VQSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VQSHLQ_V_I]]
 int8x16_t test_vqshlq_s8(int8x16_t a, int8x16_t b) {
-// CHECK-LABEL: test_vqshlq_s8
   return vqshlq_s8(a, b);
-// CHECK: sqshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vqshlq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqshl.v8i16(<8 x i16> [[VQSHLQ_V_I]], <8 x i16> [[VQSHLQ_V1_I]]) #4
+// CHECK:   [[VQSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vqshlq_s16(int16x8_t a, int16x8_t b) {
-// CHECK-LABEL: test_vqshlq_s16
   return vqshlq_s16(a, b);
-// CHECK: sqshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqshlq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqshl.v4i32(<4 x i32> [[VQSHLQ_V_I]], <4 x i32> [[VQSHLQ_V1_I]]) #4
+// CHECK:   [[VQSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqshlq_s32(int32x4_t a, int32x4_t b) {
-// CHECK-LABEL: test_vqshlq_s32
   return vqshlq_s32(a, b);
-// CHECK: sqshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqshlq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> [[VQSHLQ_V_I]], <2 x i64> [[VQSHLQ_V1_I]]) #4
+// CHECK:   [[VQSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 int64x2_t test_vqshlq_s64(int64x2_t a, int64x2_t b) {
-// CHECK-LABEL: test_vqshlq_s64
   return vqshlq_s64(a, b);
-// CHECK: sqshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqshlq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VQSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VQSHLQ_V_I]]
 uint8x16_t test_vqshlq_u8(uint8x16_t a, int8x16_t b) {
-// CHECK-LABEL: test_vqshlq_u8
   return vqshlq_u8(a, b);
-// CHECK: uqshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vqshlq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqshl.v8i16(<8 x i16> [[VQSHLQ_V_I]], <8 x i16> [[VQSHLQ_V1_I]]) #4
+// CHECK:   [[VQSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 uint16x8_t test_vqshlq_u16(uint16x8_t a, int16x8_t b) {
-// CHECK-LABEL: test_vqshlq_u16
   return vqshlq_u16(a, b);
-// CHECK: uqshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqshlq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqshl.v4i32(<4 x i32> [[VQSHLQ_V_I]], <4 x i32> [[VQSHLQ_V1_I]]) #4
+// CHECK:   [[VQSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 uint32x4_t test_vqshlq_u32(uint32x4_t a, int32x4_t b) {
-// CHECK-LABEL: test_vqshlq_u32
   return vqshlq_u32(a, b);
-// CHECK: uqshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqshlq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> [[VQSHLQ_V_I]], <2 x i64> [[VQSHLQ_V1_I]]) #4
+// CHECK:   [[VQSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 uint64x2_t test_vqshlq_u64(uint64x2_t a, int64x2_t b) {
-// CHECK-LABEL: test_vqshlq_u64
   return vqshlq_u64(a, b);
-// CHECK: uqshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vrshl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VRSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VRSHL_V_I]]
 int8x8_t test_vrshl_s8(int8x8_t a, int8x8_t b) {
-// CHECK-LABEL: test_vrshl_s8
   return vrshl_s8(a, b);
-// CHECK: srshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vrshl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> [[VRSHL_V_I]], <4 x i16> [[VRSHL_V1_I]]) #4
+// CHECK:   [[VRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VRSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vrshl_s16(int16x4_t a, int16x4_t b) {
-// CHECK-LABEL: test_vrshl_s16
   return vrshl_s16(a, b);
-// CHECK: srshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vrshl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> [[VRSHL_V_I]], <2 x i32> [[VRSHL_V1_I]]) #4
+// CHECK:   [[VRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VRSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vrshl_s32(int32x2_t a, int32x2_t b) {
-// CHECK-LABEL: test_vrshl_s32
   return vrshl_s32(a, b);
-// CHECK: srshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vrshl_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> [[VRSHL_V_I]], <1 x i64> [[VRSHL_V1_I]]) #4
+// CHECK:   [[VRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VRSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP2]]
 int64x1_t test_vrshl_s64(int64x1_t a, int64x1_t b) {
-// CHECK-LABEL: test_vrshl_s64
   return vrshl_s64(a, b);
-// CHECK: srshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vrshl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VRSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VRSHL_V_I]]
 uint8x8_t test_vrshl_u8(uint8x8_t a, int8x8_t b) {
-// CHECK-LABEL: test_vrshl_u8
   return vrshl_u8(a, b);
-// CHECK: urshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vrshl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> [[VRSHL_V_I]], <4 x i16> [[VRSHL_V1_I]]) #4
+// CHECK:   [[VRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VRSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 uint16x4_t test_vrshl_u16(uint16x4_t a, int16x4_t b) {
-// CHECK-LABEL: test_vrshl_u16
   return vrshl_u16(a, b);
-// CHECK: urshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vrshl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> [[VRSHL_V_I]], <2 x i32> [[VRSHL_V1_I]]) #4
+// CHECK:   [[VRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VRSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 uint32x2_t test_vrshl_u32(uint32x2_t a, int32x2_t b) {
-// CHECK-LABEL: test_vrshl_u32
   return vrshl_u32(a, b);
-// CHECK: urshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vrshl_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> [[VRSHL_V_I]], <1 x i64> [[VRSHL_V1_I]]) #4
+// CHECK:   [[VRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VRSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP2]]
 uint64x1_t test_vrshl_u64(uint64x1_t a, int64x1_t b) {
-// CHECK-LABEL: test_vrshl_u64
   return vrshl_u64(a, b);
-// CHECK: urshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vrshlq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VRSHLQ_V_I]]
 int8x16_t test_vrshlq_s8(int8x16_t a, int8x16_t b) {
-// CHECK-LABEL: test_vrshlq_s8
   return vrshlq_s8(a, b);
-// CHECK: srshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vrshlq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> [[VRSHLQ_V_I]], <8 x i16> [[VRSHLQ_V1_I]]) #4
+// CHECK:   [[VRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VRSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vrshlq_s16(int16x8_t a, int16x8_t b) {
-// CHECK-LABEL: test_vrshlq_s16
   return vrshlq_s16(a, b);
-// CHECK: srshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vrshlq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> [[VRSHLQ_V_I]], <4 x i32> [[VRSHLQ_V1_I]]) #4
+// CHECK:   [[VRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VRSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vrshlq_s32(int32x4_t a, int32x4_t b) {
-// CHECK-LABEL: test_vrshlq_s32
   return vrshlq_s32(a, b);
-// CHECK: srshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vrshlq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> [[VRSHLQ_V_I]], <2 x i64> [[VRSHLQ_V1_I]]) #4
+// CHECK:   [[VRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VRSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 int64x2_t test_vrshlq_s64(int64x2_t a, int64x2_t b) {
-// CHECK-LABEL: test_vrshlq_s64
   return vrshlq_s64(a, b);
-// CHECK: srshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vrshlq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VRSHLQ_V_I]]
 uint8x16_t test_vrshlq_u8(uint8x16_t a, int8x16_t b) {
-// CHECK-LABEL: test_vrshlq_u8
   return vrshlq_u8(a, b);
-// CHECK: urshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vrshlq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> [[VRSHLQ_V_I]], <8 x i16> [[VRSHLQ_V1_I]]) #4
+// CHECK:   [[VRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VRSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 uint16x8_t test_vrshlq_u16(uint16x8_t a, int16x8_t b) {
-// CHECK-LABEL: test_vrshlq_u16
   return vrshlq_u16(a, b);
-// CHECK: urshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vrshlq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> [[VRSHLQ_V_I]], <4 x i32> [[VRSHLQ_V1_I]]) #4
+// CHECK:   [[VRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VRSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 uint32x4_t test_vrshlq_u32(uint32x4_t a, int32x4_t b) {
-// CHECK-LABEL: test_vrshlq_u32
   return vrshlq_u32(a, b);
-// CHECK: urshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vrshlq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> [[VRSHLQ_V_I]], <2 x i64> [[VRSHLQ_V1_I]]) #4
+// CHECK:   [[VRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VRSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 uint64x2_t test_vrshlq_u64(uint64x2_t a, int64x2_t b) {
-// CHECK-LABEL: test_vrshlq_u64
   return vrshlq_u64(a, b);
-// CHECK: urshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
 
+// CHECK-LABEL: define <8 x i8> @test_vqrshl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VQRSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VQRSHL_V_I]]
 int8x8_t test_vqrshl_s8(int8x8_t a, int8x8_t b) {
-// CHECK-LABEL: test_vqrshl_s8
   return vqrshl_s8(a, b);
-// CHECK: sqrshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vqrshl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshl.v4i16(<4 x i16> [[VQRSHL_V_I]], <4 x i16> [[VQRSHL_V1_I]]) #4
+// CHECK:   [[VQRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQRSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vqrshl_s16(int16x4_t a, int16x4_t b) {
-// CHECK-LABEL: test_vqrshl_s16
   return vqrshl_s16(a, b);
-// CHECK: sqrshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vqrshl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshl.v2i32(<2 x i32> [[VQRSHL_V_I]], <2 x i32> [[VQRSHL_V1_I]]) #4
+// CHECK:   [[VQRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQRSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vqrshl_s32(int32x2_t a, int32x2_t b) {
-// CHECK-LABEL: test_vqrshl_s32
   return vqrshl_s32(a, b);
-// CHECK: sqrshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vqrshl_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqrshl.v1i64(<1 x i64> [[VQRSHL_V_I]], <1 x i64> [[VQRSHL_V1_I]]) #4
+// CHECK:   [[VQRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQRSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP2]]
 int64x1_t test_vqrshl_s64(int64x1_t a, int64x1_t b) {
-// CHECK-LABEL: test_vqrshl_s64
   return vqrshl_s64(a, b);
-// CHECK: sqrshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqrshl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VQRSHL_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshl.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VQRSHL_V_I]]
 uint8x8_t test_vqrshl_u8(uint8x8_t a, int8x8_t b) {
-// CHECK-LABEL: test_vqrshl_u8
   return vqrshl_u8(a, b);
-// CHECK: uqrshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vqrshl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshl.v4i16(<4 x i16> [[VQRSHL_V_I]], <4 x i16> [[VQRSHL_V1_I]]) #4
+// CHECK:   [[VQRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQRSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 uint16x4_t test_vqrshl_u16(uint16x4_t a, int16x4_t b) {
-// CHECK-LABEL: test_vqrshl_u16
   return vqrshl_u16(a, b);
-// CHECK: uqrshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vqrshl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqrshl.v2i32(<2 x i32> [[VQRSHL_V_I]], <2 x i32> [[VQRSHL_V1_I]]) #4
+// CHECK:   [[VQRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQRSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 uint32x2_t test_vqrshl_u32(uint32x2_t a, int32x2_t b) {
-// CHECK-LABEL: test_vqrshl_u32
   return vqrshl_u32(a, b);
-// CHECK: uqrshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vqrshl_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqrshl.v1i64(<1 x i64> [[VQRSHL_V_I]], <1 x i64> [[VQRSHL_V1_I]]) #4
+// CHECK:   [[VQRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQRSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP2]]
 uint64x1_t test_vqrshl_u64(uint64x1_t a, int64x1_t b) {
-// CHECK-LABEL: test_vqrshl_u64
   return vqrshl_u64(a, b);
-// CHECK: uqrshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqrshlq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VQRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqrshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VQRSHLQ_V_I]]
 int8x16_t test_vqrshlq_s8(int8x16_t a, int8x16_t b) {
-// CHECK-LABEL: test_vqrshlq_s8
   return vqrshlq_s8(a, b);
-// CHECK: sqrshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vqrshlq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrshl.v8i16(<8 x i16> [[VQRSHLQ_V_I]], <8 x i16> [[VQRSHLQ_V1_I]]) #4
+// CHECK:   [[VQRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vqrshlq_s16(int16x8_t a, int16x8_t b) {
-// CHECK-LABEL: test_vqrshlq_s16
   return vqrshlq_s16(a, b);
-// CHECK: sqrshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqrshlq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrshl.v4i32(<4 x i32> [[VQRSHLQ_V_I]], <4 x i32> [[VQRSHLQ_V1_I]]) #4
+// CHECK:   [[VQRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqrshlq_s32(int32x4_t a, int32x4_t b) {
-// CHECK-LABEL: test_vqrshlq_s32
   return vqrshlq_s32(a, b);
-// CHECK: sqrshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqrshlq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqrshl.v2i64(<2 x i64> [[VQRSHLQ_V_I]], <2 x i64> [[VQRSHLQ_V1_I]]) #4
+// CHECK:   [[VQRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQRSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 int64x2_t test_vqrshlq_s64(int64x2_t a, int64x2_t b) {
-// CHECK-LABEL: test_vqrshlq_s64
   return vqrshlq_s64(a, b);
-// CHECK: sqrshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
-// CHECK-LABEL: test_vqrshlq_u8
+// CHECK-LABEL: define <16 x i8> @test_vqrshlq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VQRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqrshl.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VQRSHLQ_V_I]]
 uint8x16_t test_vqrshlq_u8(uint8x16_t a, int8x16_t b) {
   return vqrshlq_u8(a, b);
-// CHECK: uqrshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vqrshlq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqrshl.v8i16(<8 x i16> [[VQRSHLQ_V_I]], <8 x i16> [[VQRSHLQ_V1_I]]) #4
+// CHECK:   [[VQRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 uint16x8_t test_vqrshlq_u16(uint16x8_t a, int16x8_t b) {
-// CHECK-LABEL: test_vqrshlq_u16
   return vqrshlq_u16(a, b);
-// CHECK: uqrshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqrshlq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqrshl.v4i32(<4 x i32> [[VQRSHLQ_V_I]], <4 x i32> [[VQRSHLQ_V1_I]]) #4
+// CHECK:   [[VQRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 uint32x4_t test_vqrshlq_u32(uint32x4_t a, int32x4_t b) {
-// CHECK-LABEL: test_vqrshlq_u32
   return vqrshlq_u32(a, b);
-// CHECK: uqrshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqrshlq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqrshl.v2i64(<2 x i64> [[VQRSHLQ_V_I]], <2 x i64> [[VQRSHLQ_V1_I]]) #4
+// CHECK:   [[VQRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQRSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 uint64x2_t test_vqrshlq_u64(uint64x2_t a, int64x2_t b) {
-// CHECK-LABEL: test_vqrshlq_u64
   return vqrshlq_u64(a, b);
-// CHECK: uqrshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vsli_n_p64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VSLI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], i32 0)
+// CHECK:   ret <1 x i64> [[VSLI_N2]]
 poly64x1_t test_vsli_n_p64(poly64x1_t a, poly64x1_t b) {
-// CHECK-LABEL: test_vsli_n_p64
   return vsli_n_p64(a, b, 0); 
-// CHECK: sli {{d[0-9]+}}, {{d[0-9]+}}, #0
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vsliq_n_p64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSLI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], i32 0)
+// CHECK:   ret <2 x i64> [[VSLI_N2]]
 poly64x2_t test_vsliq_n_p64(poly64x2_t a, poly64x2_t b) {
-// CHECK-LABEL: test_vsliq_n_p64
   return vsliq_n_p64(a, b, 0); 
-// CHECK: sli {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vmax_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VMAX_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.smax.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VMAX_I]]
 int8x8_t test_vmax_s8(int8x8_t a, int8x8_t b) {
-// CHECK-LABEL: test_vmax_s8
   return vmax_s8(a, b);
-// CHECK: smax {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmax_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.smax.v4i16(<4 x i16> [[VMAX_I]], <4 x i16> [[VMAX1_I]]) #4
+// CHECK:   ret <4 x i16> [[VMAX2_I]]
 int16x4_t test_vmax_s16(int16x4_t a, int16x4_t b) {
-// CHECK-LABEL: test_vmax_s16
   return vmax_s16(a, b);
-// CHECK: smax {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmax_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.smax.v2i32(<2 x i32> [[VMAX_I]], <2 x i32> [[VMAX1_I]]) #4
+// CHECK:   ret <2 x i32> [[VMAX2_I]]
 int32x2_t test_vmax_s32(int32x2_t a, int32x2_t b) {
-// CHECK-LABEL: test_vmax_s32
   return vmax_s32(a, b);
-// CHECK: smax {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vmax_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VMAX_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.umax.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VMAX_I]]
 uint8x8_t test_vmax_u8(uint8x8_t a, uint8x8_t b) {
-// CHECK-LABEL: test_vmax_u8
   return vmax_u8(a, b);
-// CHECK: umax {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmax_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.umax.v4i16(<4 x i16> [[VMAX_I]], <4 x i16> [[VMAX1_I]]) #4
+// CHECK:   ret <4 x i16> [[VMAX2_I]]
 uint16x4_t test_vmax_u16(uint16x4_t a, uint16x4_t b) {
-// CHECK-LABEL: test_vmax_u16
   return vmax_u16(a, b);
-// CHECK: umax {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmax_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.umax.v2i32(<2 x i32> [[VMAX_I]], <2 x i32> [[VMAX1_I]]) #4
+// CHECK:   ret <2 x i32> [[VMAX2_I]]
 uint32x2_t test_vmax_u32(uint32x2_t a, uint32x2_t b) {
-// CHECK-LABEL: test_vmax_u32
   return vmax_u32(a, b);
-// CHECK: umax {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <2 x float> @test_vmax_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[VMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[VMAX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmax.v2f32(<2 x float> [[VMAX_I]], <2 x float> [[VMAX1_I]]) #4
+// CHECK:   ret <2 x float> [[VMAX2_I]]
 float32x2_t test_vmax_f32(float32x2_t a, float32x2_t b) {
-// CHECK-LABEL: test_vmax_f32
   return vmax_f32(a, b);
-// CHECK: fmax {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vmaxq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VMAX_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.smax.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VMAX_I]]
 int8x16_t test_vmaxq_s8(int8x16_t a, int8x16_t b) {
-// CHECK-LABEL: test_vmaxq_s8
   return vmaxq_s8(a, b);
-// CHECK: smax {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmaxq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VMAX2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smax.v8i16(<8 x i16> [[VMAX_I]], <8 x i16> [[VMAX1_I]]) #4
+// CHECK:   ret <8 x i16> [[VMAX2_I]]
 int16x8_t test_vmaxq_s16(int16x8_t a, int16x8_t b) {
-// CHECK-LABEL: test_vmaxq_s16
   return vmaxq_s16(a, b);
-// CHECK: smax {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmaxq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VMAX2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smax.v4i32(<4 x i32> [[VMAX_I]], <4 x i32> [[VMAX1_I]]) #4
+// CHECK:   ret <4 x i32> [[VMAX2_I]]
 int32x4_t test_vmaxq_s32(int32x4_t a, int32x4_t b) {
-// CHECK-LABEL: test_vmaxq_s32
   return vmaxq_s32(a, b);
-// CHECK: smax {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vmaxq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VMAX_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.umax.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VMAX_I]]
 uint8x16_t test_vmaxq_u8(uint8x16_t a, uint8x16_t b) {
-// CHECK-LABEL: test_vmaxq_u8
   return vmaxq_u8(a, b);
-// CHECK: umax {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmaxq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VMAX2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umax.v8i16(<8 x i16> [[VMAX_I]], <8 x i16> [[VMAX1_I]]) #4
+// CHECK:   ret <8 x i16> [[VMAX2_I]]
 uint16x8_t test_vmaxq_u16(uint16x8_t a, uint16x8_t b) {
-// CHECK-LABEL: test_vmaxq_u16
   return vmaxq_u16(a, b);
-// CHECK: umax {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmaxq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VMAX2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umax.v4i32(<4 x i32> [[VMAX_I]], <4 x i32> [[VMAX1_I]]) #4
+// CHECK:   ret <4 x i32> [[VMAX2_I]]
 uint32x4_t test_vmaxq_u32(uint32x4_t a, uint32x4_t b) {
-// CHECK-LABEL: test_vmaxq_u32
   return vmaxq_u32(a, b);
-// CHECK: umax {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <4 x float> @test_vmaxq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[VMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[VMAX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmax.v4f32(<4 x float> [[VMAX_I]], <4 x float> [[VMAX1_I]]) #4
+// CHECK:   ret <4 x float> [[VMAX2_I]]
 float32x4_t test_vmaxq_f32(float32x4_t a, float32x4_t b) {
-// CHECK-LABEL: test_vmaxq_f32
   return vmaxq_f32(a, b);
-// CHECK: fmax {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vmaxq_f64(<2 x double> %a, <2 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[VMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[VMAX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmax.v2f64(<2 x double> [[VMAX_I]], <2 x double> [[VMAX1_I]]) #4
+// CHECK:   ret <2 x double> [[VMAX2_I]]
 float64x2_t test_vmaxq_f64(float64x2_t a, float64x2_t b) {
-// CHECK-LABEL: test_vmaxq_f64
   return vmaxq_f64(a, b);
-// CHECK: fmax {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
 
+// CHECK-LABEL: define <8 x i8> @test_vmin_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VMIN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.smin.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VMIN_I]]
 int8x8_t test_vmin_s8(int8x8_t a, int8x8_t b) {
-// CHECK-LABEL: test_vmin_s8
   return vmin_s8(a, b);
-// CHECK: smin {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmin_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.smin.v4i16(<4 x i16> [[VMIN_I]], <4 x i16> [[VMIN1_I]]) #4
+// CHECK:   ret <4 x i16> [[VMIN2_I]]
 int16x4_t test_vmin_s16(int16x4_t a, int16x4_t b) {
-// CHECK-LABEL: test_vmin_s16
   return vmin_s16(a, b);
-// CHECK: smin {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmin_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.smin.v2i32(<2 x i32> [[VMIN_I]], <2 x i32> [[VMIN1_I]]) #4
+// CHECK:   ret <2 x i32> [[VMIN2_I]]
 int32x2_t test_vmin_s32(int32x2_t a, int32x2_t b) {
-// CHECK-LABEL: test_vmin_s32
   return vmin_s32(a, b);
-// CHECK: smin {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vmin_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VMIN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.umin.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VMIN_I]]
 uint8x8_t test_vmin_u8(uint8x8_t a, uint8x8_t b) {
-// CHECK-LABEL: test_vmin_u8
   return vmin_u8(a, b);
-// CHECK: umin {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmin_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.umin.v4i16(<4 x i16> [[VMIN_I]], <4 x i16> [[VMIN1_I]]) #4
+// CHECK:   ret <4 x i16> [[VMIN2_I]]
 uint16x4_t test_vmin_u16(uint16x4_t a, uint16x4_t b) {
-// CHECK-LABEL: test_vmin_u16
   return vmin_u16(a, b);
-// CHECK: umin {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmin_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.umin.v2i32(<2 x i32> [[VMIN_I]], <2 x i32> [[VMIN1_I]]) #4
+// CHECK:   ret <2 x i32> [[VMIN2_I]]
 uint32x2_t test_vmin_u32(uint32x2_t a, uint32x2_t b) {
-// CHECK-LABEL: test_vmin_u32
   return vmin_u32(a, b);
-// CHECK: umin {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <2 x float> @test_vmin_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[VMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[VMIN2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmin.v2f32(<2 x float> [[VMIN_I]], <2 x float> [[VMIN1_I]]) #4
+// CHECK:   ret <2 x float> [[VMIN2_I]]
 float32x2_t test_vmin_f32(float32x2_t a, float32x2_t b) {
-// CHECK-LABEL: test_vmin_f32
   return vmin_f32(a, b);
-// CHECK: fmin {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vminq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VMIN_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.smin.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VMIN_I]]
 int8x16_t test_vminq_s8(int8x16_t a, int8x16_t b) {
-// CHECK-LABEL: test_vminq_s8
   return vminq_s8(a, b);
-// CHECK: smin {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vminq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VMIN2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smin.v8i16(<8 x i16> [[VMIN_I]], <8 x i16> [[VMIN1_I]]) #4
+// CHECK:   ret <8 x i16> [[VMIN2_I]]
 int16x8_t test_vminq_s16(int16x8_t a, int16x8_t b) {
-// CHECK-LABEL: test_vminq_s16
   return vminq_s16(a, b);
-// CHECK: smin {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vminq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VMIN2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smin.v4i32(<4 x i32> [[VMIN_I]], <4 x i32> [[VMIN1_I]]) #4
+// CHECK:   ret <4 x i32> [[VMIN2_I]]
 int32x4_t test_vminq_s32(int32x4_t a, int32x4_t b) {
-// CHECK-LABEL: test_vminq_s32
   return vminq_s32(a, b);
-// CHECK: smin {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vminq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VMIN_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.umin.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VMIN_I]]
 uint8x16_t test_vminq_u8(uint8x16_t a, uint8x16_t b) {
-// CHECK-LABEL: test_vminq_u8
   return vminq_u8(a, b);
-// CHECK: umin {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vminq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VMIN2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umin.v8i16(<8 x i16> [[VMIN_I]], <8 x i16> [[VMIN1_I]]) #4
+// CHECK:   ret <8 x i16> [[VMIN2_I]]
 uint16x8_t test_vminq_u16(uint16x8_t a, uint16x8_t b) {
-// CHECK-LABEL: test_vminq_u16
   return vminq_u16(a, b);
-// CHECK: umin {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vminq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VMIN2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umin.v4i32(<4 x i32> [[VMIN_I]], <4 x i32> [[VMIN1_I]]) #4
+// CHECK:   ret <4 x i32> [[VMIN2_I]]
 uint32x4_t test_vminq_u32(uint32x4_t a, uint32x4_t b) {
-// CHECK-LABEL: test_vminq_u32
   return vminq_u32(a, b);
-// CHECK: umin {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <4 x float> @test_vminq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[VMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[VMIN2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmin.v4f32(<4 x float> [[VMIN_I]], <4 x float> [[VMIN1_I]]) #4
+// CHECK:   ret <4 x float> [[VMIN2_I]]
 float32x4_t test_vminq_f32(float32x4_t a, float32x4_t b) {
-// CHECK-LABEL: test_vminq_f32
   return vminq_f32(a, b);
-// CHECK: fmin {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vminq_f64(<2 x double> %a, <2 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[VMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[VMIN2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmin.v2f64(<2 x double> [[VMIN_I]], <2 x double> [[VMIN1_I]]) #4
+// CHECK:   ret <2 x double> [[VMIN2_I]]
 float64x2_t test_vminq_f64(float64x2_t a, float64x2_t b) {
-// CHECK-LABEL: test_vminq_f64
   return vminq_f64(a, b);
-// CHECK: fmin {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x float> @test_vmaxnm_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[VMAXNM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VMAXNM1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[VMAXNM2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmaxnm.v2f32(<2 x float> [[VMAXNM_I]], <2 x float> [[VMAXNM1_I]]) #4
+// CHECK:   ret <2 x float> [[VMAXNM2_I]]
 float32x2_t test_vmaxnm_f32(float32x2_t a, float32x2_t b) {
-// CHECK-LABEL: test_vmaxnm_f32
   return vmaxnm_f32(a, b);
-// CHECK: fmaxnm {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x float> @test_vmaxnmq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[VMAXNM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VMAXNM1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[VMAXNM2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmaxnm.v4f32(<4 x float> [[VMAXNM_I]], <4 x float> [[VMAXNM1_I]]) #4
+// CHECK:   ret <4 x float> [[VMAXNM2_I]]
 float32x4_t test_vmaxnmq_f32(float32x4_t a, float32x4_t b) {
-// CHECK-LABEL: test_vmaxnmq_f32
   return vmaxnmq_f32(a, b);
-// CHECK: fmaxnm {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vmaxnmq_f64(<2 x double> %a, <2 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[VMAXNM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VMAXNM1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[VMAXNM2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmaxnm.v2f64(<2 x double> [[VMAXNM_I]], <2 x double> [[VMAXNM1_I]]) #4
+// CHECK:   ret <2 x double> [[VMAXNM2_I]]
 float64x2_t test_vmaxnmq_f64(float64x2_t a, float64x2_t b) {
-// CHECK-LABEL: test_vmaxnmq_f64
   return vmaxnmq_f64(a, b);
-// CHECK: fmaxnm {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x float> @test_vminnm_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[VMINNM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VMINNM1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[VMINNM2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fminnm.v2f32(<2 x float> [[VMINNM_I]], <2 x float> [[VMINNM1_I]]) #4
+// CHECK:   ret <2 x float> [[VMINNM2_I]]
 float32x2_t test_vminnm_f32(float32x2_t a, float32x2_t b) {
-// CHECK-LABEL: test_vminnm_f32
   return vminnm_f32(a, b);
-// CHECK: fminnm {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x float> @test_vminnmq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[VMINNM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VMINNM1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[VMINNM2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fminnm.v4f32(<4 x float> [[VMINNM_I]], <4 x float> [[VMINNM1_I]]) #4
+// CHECK:   ret <4 x float> [[VMINNM2_I]]
 float32x4_t test_vminnmq_f32(float32x4_t a, float32x4_t b) {
-// CHECK-LABEL: test_vminnmq_f32
   return vminnmq_f32(a, b);
-// CHECK: fminnm {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vminnmq_f64(<2 x double> %a, <2 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[VMINNM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VMINNM1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[VMINNM2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fminnm.v2f64(<2 x double> [[VMINNM_I]], <2 x double> [[VMINNM1_I]]) #4
+// CHECK:   ret <2 x double> [[VMINNM2_I]]
 float64x2_t test_vminnmq_f64(float64x2_t a, float64x2_t b) {
-// CHECK-LABEL: test_vminnmq_f64
   return vminnmq_f64(a, b);
-// CHECK: fminnm {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vpmax_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VPMAX_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.smaxp.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VPMAX_I]]
 int8x8_t test_vpmax_s8(int8x8_t a, int8x8_t b) {
-// CHECK-LABEL: test_vpmax_s8
   return vpmax_s8(a, b);
-// CHECK: smaxp {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vpmax_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VPMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VPMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VPMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.smaxp.v4i16(<4 x i16> [[VPMAX_I]], <4 x i16> [[VPMAX1_I]]) #4
+// CHECK:   ret <4 x i16> [[VPMAX2_I]]
 int16x4_t test_vpmax_s16(int16x4_t a, int16x4_t b) {
-// CHECK-LABEL: test_vpmax_s16
   return vpmax_s16(a, b);
-// CHECK: smaxp {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vpmax_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VPMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VPMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VPMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.smaxp.v2i32(<2 x i32> [[VPMAX_I]], <2 x i32> [[VPMAX1_I]]) #4
+// CHECK:   ret <2 x i32> [[VPMAX2_I]]
 int32x2_t test_vpmax_s32(int32x2_t a, int32x2_t b) {
-// CHECK-LABEL: test_vpmax_s32
   return vpmax_s32(a, b);
-// CHECK: smaxp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vpmax_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VPMAX_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.umaxp.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VPMAX_I]]
 uint8x8_t test_vpmax_u8(uint8x8_t a, uint8x8_t b) {
-// CHECK-LABEL: test_vpmax_u8
   return vpmax_u8(a, b);
-// CHECK: umaxp {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vpmax_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VPMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VPMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VPMAX2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.umaxp.v4i16(<4 x i16> [[VPMAX_I]], <4 x i16> [[VPMAX1_I]]) #4
+// CHECK:   ret <4 x i16> [[VPMAX2_I]]
 uint16x4_t test_vpmax_u16(uint16x4_t a, uint16x4_t b) {
-// CHECK-LABEL: test_vpmax_u16
   return vpmax_u16(a, b);
-// CHECK: umaxp {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vpmax_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VPMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VPMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VPMAX2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.umaxp.v2i32(<2 x i32> [[VPMAX_I]], <2 x i32> [[VPMAX1_I]]) #4
+// CHECK:   ret <2 x i32> [[VPMAX2_I]]
 uint32x2_t test_vpmax_u32(uint32x2_t a, uint32x2_t b) {
-// CHECK-LABEL: test_vpmax_u32
   return vpmax_u32(a, b);
-// CHECK: umaxp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <2 x float> @test_vpmax_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[VPMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VPMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[VPMAX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmaxp.v2f32(<2 x float> [[VPMAX_I]], <2 x float> [[VPMAX1_I]]) #4
+// CHECK:   ret <2 x float> [[VPMAX2_I]]
 float32x2_t test_vpmax_f32(float32x2_t a, float32x2_t b) {
-// CHECK-LABEL: test_vpmax_f32
   return vpmax_f32(a, b);
-// CHECK: fmaxp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vpmaxq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VPMAX_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.smaxp.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VPMAX_I]]
 int8x16_t test_vpmaxq_s8(int8x16_t a, int8x16_t b) {
-// CHECK-LABEL: test_vpmaxq_s8
   return vpmaxq_s8(a, b);
-// CHECK: smaxp {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vpmaxq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VPMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VPMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VPMAX2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smaxp.v8i16(<8 x i16> [[VPMAX_I]], <8 x i16> [[VPMAX1_I]]) #4
+// CHECK:   ret <8 x i16> [[VPMAX2_I]]
 int16x8_t test_vpmaxq_s16(int16x8_t a, int16x8_t b) {
-// CHECK-LABEL: test_vpmaxq_s16
   return vpmaxq_s16(a, b);
-// CHECK: smaxp {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vpmaxq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VPMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VPMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VPMAX2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smaxp.v4i32(<4 x i32> [[VPMAX_I]], <4 x i32> [[VPMAX1_I]]) #4
+// CHECK:   ret <4 x i32> [[VPMAX2_I]]
 int32x4_t test_vpmaxq_s32(int32x4_t a, int32x4_t b) {
-// CHECK-LABEL: test_vpmaxq_s32
   return vpmaxq_s32(a, b);
-// CHECK: smaxp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vpmaxq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VPMAX_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.umaxp.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VPMAX_I]]
 uint8x16_t test_vpmaxq_u8(uint8x16_t a, uint8x16_t b) {
-// CHECK-LABEL: test_vpmaxq_u8
   return vpmaxq_u8(a, b);
-// CHECK: umaxp {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vpmaxq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VPMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VPMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VPMAX2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umaxp.v8i16(<8 x i16> [[VPMAX_I]], <8 x i16> [[VPMAX1_I]]) #4
+// CHECK:   ret <8 x i16> [[VPMAX2_I]]
 uint16x8_t test_vpmaxq_u16(uint16x8_t a, uint16x8_t b) {
-// CHECK-LABEL: test_vpmaxq_u16
   return vpmaxq_u16(a, b);
-// CHECK: umaxp {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vpmaxq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VPMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VPMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VPMAX2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umaxp.v4i32(<4 x i32> [[VPMAX_I]], <4 x i32> [[VPMAX1_I]]) #4
+// CHECK:   ret <4 x i32> [[VPMAX2_I]]
 uint32x4_t test_vpmaxq_u32(uint32x4_t a, uint32x4_t b) {
-// CHECK-LABEL: test_vpmaxq_u32
   return vpmaxq_u32(a, b);
-// CHECK: umaxp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <4 x float> @test_vpmaxq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[VPMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VPMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[VPMAX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmaxp.v4f32(<4 x float> [[VPMAX_I]], <4 x float> [[VPMAX1_I]]) #4
+// CHECK:   ret <4 x float> [[VPMAX2_I]]
 float32x4_t test_vpmaxq_f32(float32x4_t a, float32x4_t b) {
-// CHECK-LABEL: test_vpmaxq_f32
   return vpmaxq_f32(a, b);
-// CHECK: fmaxp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vpmaxq_f64(<2 x double> %a, <2 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[VPMAX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VPMAX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[VPMAX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmaxp.v2f64(<2 x double> [[VPMAX_I]], <2 x double> [[VPMAX1_I]]) #4
+// CHECK:   ret <2 x double> [[VPMAX2_I]]
 float64x2_t test_vpmaxq_f64(float64x2_t a, float64x2_t b) {
-// CHECK-LABEL: test_vpmaxq_f64
   return vpmaxq_f64(a, b);
-// CHECK: fmaxp {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vpmin_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VPMIN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sminp.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VPMIN_I]]
 int8x8_t test_vpmin_s8(int8x8_t a, int8x8_t b) {
-// CHECK-LABEL: test_vpmin_s8
   return vpmin_s8(a, b);
-// CHECK: sminp {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vpmin_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VPMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VPMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VPMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sminp.v4i16(<4 x i16> [[VPMIN_I]], <4 x i16> [[VPMIN1_I]]) #4
+// CHECK:   ret <4 x i16> [[VPMIN2_I]]
 int16x4_t test_vpmin_s16(int16x4_t a, int16x4_t b) {
-// CHECK-LABEL: test_vpmin_s16
   return vpmin_s16(a, b);
-// CHECK: sminp {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vpmin_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VPMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VPMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VPMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sminp.v2i32(<2 x i32> [[VPMIN_I]], <2 x i32> [[VPMIN1_I]]) #4
+// CHECK:   ret <2 x i32> [[VPMIN2_I]]
 int32x2_t test_vpmin_s32(int32x2_t a, int32x2_t b) {
-// CHECK-LABEL: test_vpmin_s32
   return vpmin_s32(a, b);
-// CHECK: sminp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vpmin_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VPMIN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uminp.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VPMIN_I]]
 uint8x8_t test_vpmin_u8(uint8x8_t a, uint8x8_t b) {
-// CHECK-LABEL: test_vpmin_u8
   return vpmin_u8(a, b);
-// CHECK: uminp {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vpmin_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VPMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VPMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VPMIN2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uminp.v4i16(<4 x i16> [[VPMIN_I]], <4 x i16> [[VPMIN1_I]]) #4
+// CHECK:   ret <4 x i16> [[VPMIN2_I]]
 uint16x4_t test_vpmin_u16(uint16x4_t a, uint16x4_t b) {
-// CHECK-LABEL: test_vpmin_u16
   return vpmin_u16(a, b);
-// CHECK: uminp {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vpmin_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VPMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VPMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VPMIN2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uminp.v2i32(<2 x i32> [[VPMIN_I]], <2 x i32> [[VPMIN1_I]]) #4
+// CHECK:   ret <2 x i32> [[VPMIN2_I]]
 uint32x2_t test_vpmin_u32(uint32x2_t a, uint32x2_t b) {
-// CHECK-LABEL: test_vpmin_u32
   return vpmin_u32(a, b);
-// CHECK: uminp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <2 x float> @test_vpmin_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[VPMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VPMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[VPMIN2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fminp.v2f32(<2 x float> [[VPMIN_I]], <2 x float> [[VPMIN1_I]]) #4
+// CHECK:   ret <2 x float> [[VPMIN2_I]]
 float32x2_t test_vpmin_f32(float32x2_t a, float32x2_t b) {
-// CHECK-LABEL: test_vpmin_f32
   return vpmin_f32(a, b);
-// CHECK: fminp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vpminq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VPMIN_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sminp.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VPMIN_I]]
 int8x16_t test_vpminq_s8(int8x16_t a, int8x16_t b) {
-// CHECK-LABEL: test_vpminq_s8
   return vpminq_s8(a, b);
-// CHECK: sminp {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vpminq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VPMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VPMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VPMIN2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sminp.v8i16(<8 x i16> [[VPMIN_I]], <8 x i16> [[VPMIN1_I]]) #4
+// CHECK:   ret <8 x i16> [[VPMIN2_I]]
 int16x8_t test_vpminq_s16(int16x8_t a, int16x8_t b) {
-// CHECK-LABEL: test_vpminq_s16
   return vpminq_s16(a, b);
-// CHECK: sminp {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vpminq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VPMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VPMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VPMIN2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sminp.v4i32(<4 x i32> [[VPMIN_I]], <4 x i32> [[VPMIN1_I]]) #4
+// CHECK:   ret <4 x i32> [[VPMIN2_I]]
 int32x4_t test_vpminq_s32(int32x4_t a, int32x4_t b) {
-// CHECK-LABEL: test_vpminq_s32
   return vpminq_s32(a, b);
-// CHECK: sminp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vpminq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VPMIN_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.uminp.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VPMIN_I]]
 uint8x16_t test_vpminq_u8(uint8x16_t a, uint8x16_t b) {
-// CHECK-LABEL: test_vpminq_u8
   return vpminq_u8(a, b);
-// CHECK: uminp {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vpminq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VPMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VPMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VPMIN2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uminp.v8i16(<8 x i16> [[VPMIN_I]], <8 x i16> [[VPMIN1_I]]) #4
+// CHECK:   ret <8 x i16> [[VPMIN2_I]]
 uint16x8_t test_vpminq_u16(uint16x8_t a, uint16x8_t b) {
-// CHECK-LABEL: test_vpminq_u16
   return vpminq_u16(a, b);
-// CHECK: uminp {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vpminq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VPMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VPMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VPMIN2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uminp.v4i32(<4 x i32> [[VPMIN_I]], <4 x i32> [[VPMIN1_I]]) #4
+// CHECK:   ret <4 x i32> [[VPMIN2_I]]
 uint32x4_t test_vpminq_u32(uint32x4_t a, uint32x4_t b) {
-// CHECK-LABEL: test_vpminq_u32
   return vpminq_u32(a, b);
-// CHECK: uminp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <4 x float> @test_vpminq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[VPMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VPMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[VPMIN2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fminp.v4f32(<4 x float> [[VPMIN_I]], <4 x float> [[VPMIN1_I]]) #4
+// CHECK:   ret <4 x float> [[VPMIN2_I]]
 float32x4_t test_vpminq_f32(float32x4_t a, float32x4_t b) {
-// CHECK-LABEL: test_vpminq_f32
   return vpminq_f32(a, b);
-// CHECK: fminp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vpminq_f64(<2 x double> %a, <2 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[VPMIN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VPMIN1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[VPMIN2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fminp.v2f64(<2 x double> [[VPMIN_I]], <2 x double> [[VPMIN1_I]]) #4
+// CHECK:   ret <2 x double> [[VPMIN2_I]]
 float64x2_t test_vpminq_f64(float64x2_t a, float64x2_t b) {
-// CHECK-LABEL: test_vpminq_f64
   return vpminq_f64(a, b);
-// CHECK: fminp {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x float> @test_vpmaxnm_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[VPMAXNM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VPMAXNM1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[VPMAXNM2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmaxnmp.v2f32(<2 x float> [[VPMAXNM_I]], <2 x float> [[VPMAXNM1_I]]) #4
+// CHECK:   ret <2 x float> [[VPMAXNM2_I]]
 float32x2_t test_vpmaxnm_f32(float32x2_t a, float32x2_t b) {
-// CHECK-LABEL: test_vpmaxnm_f32
   return vpmaxnm_f32(a, b);
-// CHECK: fmaxnmp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x float> @test_vpmaxnmq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[VPMAXNM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VPMAXNM1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[VPMAXNM2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmaxnmp.v4f32(<4 x float> [[VPMAXNM_I]], <4 x float> [[VPMAXNM1_I]]) #4
+// CHECK:   ret <4 x float> [[VPMAXNM2_I]]
 float32x4_t test_vpmaxnmq_f32(float32x4_t a, float32x4_t b) {
-// CHECK-LABEL: test_vpmaxnmq_f32
   return vpmaxnmq_f32(a, b);
-// CHECK: fmaxnmp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vpmaxnmq_f64(<2 x double> %a, <2 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[VPMAXNM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VPMAXNM1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[VPMAXNM2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmaxnmp.v2f64(<2 x double> [[VPMAXNM_I]], <2 x double> [[VPMAXNM1_I]]) #4
+// CHECK:   ret <2 x double> [[VPMAXNM2_I]]
 float64x2_t test_vpmaxnmq_f64(float64x2_t a, float64x2_t b) {
-// CHECK-LABEL: test_vpmaxnmq_f64
   return vpmaxnmq_f64(a, b);
-// CHECK: fmaxnmp {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x float> @test_vpminnm_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[VPMINNM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VPMINNM1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[VPMINNM2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fminnmp.v2f32(<2 x float> [[VPMINNM_I]], <2 x float> [[VPMINNM1_I]]) #4
+// CHECK:   ret <2 x float> [[VPMINNM2_I]]
 float32x2_t test_vpminnm_f32(float32x2_t a, float32x2_t b) {
-// CHECK-LABEL: test_vpminnm_f32
   return vpminnm_f32(a, b);
-// CHECK: fminnmp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x float> @test_vpminnmq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[VPMINNM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VPMINNM1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[VPMINNM2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fminnmp.v4f32(<4 x float> [[VPMINNM_I]], <4 x float> [[VPMINNM1_I]]) #4
+// CHECK:   ret <4 x float> [[VPMINNM2_I]]
 float32x4_t test_vpminnmq_f32(float32x4_t a, float32x4_t b) {
-// CHECK-LABEL: test_vpminnmq_f32
   return vpminnmq_f32(a, b);
-// CHECK: fminnmp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vpminnmq_f64(<2 x double> %a, <2 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[VPMINNM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VPMINNM1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[VPMINNM2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fminnmp.v2f64(<2 x double> [[VPMINNM_I]], <2 x double> [[VPMINNM1_I]]) #4
+// CHECK:   ret <2 x double> [[VPMINNM2_I]]
 float64x2_t test_vpminnmq_f64(float64x2_t a, float64x2_t b) {
-// CHECK-LABEL: test_vpminnmq_f64
   return vpminnmq_f64(a, b);
-// CHECK: fminnmp {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vpadd_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VPADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VPADD_V_I]]
 int8x8_t test_vpadd_s8(int8x8_t a, int8x8_t b) {
-// CHECK-LABEL: test_vpadd_s8
   return vpadd_s8(a, b);
-// CHECK: addp {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vpadd_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> [[VPADD_V_I]], <4 x i16> [[VPADD_V1_I]]) #4
+// CHECK:   [[VPADD_V3_I:%.*]] = bitcast <4 x i16> [[VPADD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vpadd_s16(int16x4_t a, int16x4_t b) {
-// CHECK-LABEL: test_vpadd_s16
   return vpadd_s16(a, b);
-// CHECK: addp {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vpadd_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> [[VPADD_V_I]], <2 x i32> [[VPADD_V1_I]]) #4
+// CHECK:   [[VPADD_V3_I:%.*]] = bitcast <2 x i32> [[VPADD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vpadd_s32(int32x2_t a, int32x2_t b) {
-// CHECK-LABEL: test_vpadd_s32
   return vpadd_s32(a, b);
-// CHECK: addp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vpadd_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VPADD_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VPADD_V_I]]
 uint8x8_t test_vpadd_u8(uint8x8_t a, uint8x8_t b) {
-// CHECK-LABEL: test_vpadd_u8
   return vpadd_u8(a, b);
-// CHECK: addp {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vpadd_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> [[VPADD_V_I]], <4 x i16> [[VPADD_V1_I]]) #4
+// CHECK:   [[VPADD_V3_I:%.*]] = bitcast <4 x i16> [[VPADD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 uint16x4_t test_vpadd_u16(uint16x4_t a, uint16x4_t b) {
-// CHECK-LABEL: test_vpadd_u16
   return vpadd_u16(a, b);
-// CHECK: addp {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vpadd_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> [[VPADD_V_I]], <2 x i32> [[VPADD_V1_I]]) #4
+// CHECK:   [[VPADD_V3_I:%.*]] = bitcast <2 x i32> [[VPADD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 uint32x2_t test_vpadd_u32(uint32x2_t a, uint32x2_t b) {
-// CHECK-LABEL: test_vpadd_u32
   return vpadd_u32(a, b);
-// CHECK: addp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <2 x float> @test_vpadd_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[VPADD_V2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.addp.v2f32(<2 x float> [[VPADD_V_I]], <2 x float> [[VPADD_V1_I]]) #4
+// CHECK:   [[VPADD_V3_I:%.*]] = bitcast <2 x float> [[VPADD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <2 x float>
+// CHECK:   ret <2 x float> [[TMP2]]
 float32x2_t test_vpadd_f32(float32x2_t a, float32x2_t b) {
-// CHECK-LABEL: test_vpadd_f32
   return vpadd_f32(a, b);
-// CHECK: faddp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vpaddq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VPADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VPADDQ_V_I]]
 int8x16_t test_vpaddq_s8(int8x16_t a, int8x16_t b) {
-// CHECK-LABEL: test_vpaddq_s8
   return vpaddq_s8(a, b);
-// CHECK: addp {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vpaddq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VPADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VPADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VPADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16> [[VPADDQ_V_I]], <8 x i16> [[VPADDQ_V1_I]]) #4
+// CHECK:   [[VPADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VPADDQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VPADDQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vpaddq_s16(int16x8_t a, int16x8_t b) {
-// CHECK-LABEL: test_vpaddq_s16
   return vpaddq_s16(a, b);
-// CHECK: addp {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vpaddq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VPADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VPADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VPADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32> [[VPADDQ_V_I]], <4 x i32> [[VPADDQ_V1_I]]) #4
+// CHECK:   [[VPADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VPADDQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VPADDQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vpaddq_s32(int32x4_t a, int32x4_t b) {
-// CHECK-LABEL: test_vpaddq_s32
   return vpaddq_s32(a, b);
-// CHECK: addp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vpaddq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VPADDQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VPADDQ_V_I]]
 uint8x16_t test_vpaddq_u8(uint8x16_t a, uint8x16_t b) {
-// CHECK-LABEL: test_vpaddq_u8
   return vpaddq_u8(a, b);
-// CHECK: addp {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vpaddq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VPADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VPADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VPADDQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16> [[VPADDQ_V_I]], <8 x i16> [[VPADDQ_V1_I]]) #4
+// CHECK:   [[VPADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VPADDQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VPADDQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 uint16x8_t test_vpaddq_u16(uint16x8_t a, uint16x8_t b) {
-// CHECK-LABEL: test_vpaddq_u16
   return vpaddq_u16(a, b);
-// CHECK: addp {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vpaddq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VPADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VPADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VPADDQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32> [[VPADDQ_V_I]], <4 x i32> [[VPADDQ_V1_I]]) #4
+// CHECK:   [[VPADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VPADDQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VPADDQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 uint32x4_t test_vpaddq_u32(uint32x4_t a, uint32x4_t b) {
-// CHECK-LABEL: test_vpaddq_u32
   return vpaddq_u32(a, b);
-// CHECK: addp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <4 x float> @test_vpaddq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[VPADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VPADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[VPADDQ_V2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float> [[VPADDQ_V_I]], <4 x float> [[VPADDQ_V1_I]]) #4
+// CHECK:   [[VPADDQ_V3_I:%.*]] = bitcast <4 x float> [[VPADDQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VPADDQ_V3_I]] to <4 x float>
+// CHECK:   ret <4 x float> [[TMP2]]
 float32x4_t test_vpaddq_f32(float32x4_t a, float32x4_t b) {
-// CHECK-LABEL: test_vpaddq_f32
   return vpaddq_f32(a, b);
-// CHECK: faddp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vpaddq_f64(<2 x double> %a, <2 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[VPADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VPADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[VPADDQ_V2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.addp.v2f64(<2 x double> [[VPADDQ_V_I]], <2 x double> [[VPADDQ_V1_I]]) #4
+// CHECK:   [[VPADDQ_V3_I:%.*]] = bitcast <2 x double> [[VPADDQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VPADDQ_V3_I]] to <2 x double>
+// CHECK:   ret <2 x double> [[TMP2]]
 float64x2_t test_vpaddq_f64(float64x2_t a, float64x2_t b) {
-// CHECK-LABEL: test_vpaddq_f64
   return vpaddq_f64(a, b);
-// CHECK: faddp {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vqdmulh_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]]) #4
+// CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vqdmulh_s16(int16x4_t a, int16x4_t b) {
-// CHECK-LABEL: test_vqdmulh_s16
   return vqdmulh_s16(a, b);
-// CHECK: sqdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vqdmulh_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]]) #4
+// CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vqdmulh_s32(int32x2_t a, int32x2_t b) {
-// CHECK-LABEL: test_vqdmulh_s32
   return vqdmulh_s32(a, b);
-// CHECK: sqdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]]) #4
+// CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vqdmulhq_s16(int16x8_t a, int16x8_t b) {
-// CHECK-LABEL: test_vqdmulhq_s16
   return vqdmulhq_s16(a, b);
-// CHECK: sqdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]]) #4
+// CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqdmulhq_s32(int32x4_t a, int32x4_t b) {
-// CHECK-LABEL: test_vqdmulhq_s32
   return vqdmulhq_s32(a, b);
-// CHECK: sqdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]]) #4
+// CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vqrdmulh_s16(int16x4_t a, int16x4_t b) {
-// CHECK-LABEL: test_vqrdmulh_s16
   return vqrdmulh_s16(a, b);
-// CHECK: sqrdmulh {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]]) #4
+// CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vqrdmulh_s32(int32x2_t a, int32x2_t b) {
-// CHECK-LABEL: test_vqrdmulh_s32
   return vqrdmulh_s32(a, b);
-// CHECK: sqrdmulh {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V1_I]]) #4
+// CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vqrdmulhq_s16(int16x8_t a, int16x8_t b) {
-// CHECK-LABEL: test_vqrdmulhq_s16
   return vqrdmulhq_s16(a, b);
-// CHECK: sqrdmulh {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]]) #4
+// CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqrdmulhq_s32(int32x4_t a, int32x4_t b) {
-// CHECK-LABEL: test_vqrdmulhq_s32
   return vqrdmulhq_s32(a, b);
-// CHECK: sqrdmulh {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x float> @test_vmulx_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[VMULX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VMULX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[VMULX_I]], <2 x float> [[VMULX1_I]]) #4
+// CHECK:   ret <2 x float> [[VMULX2_I]]
 float32x2_t test_vmulx_f32(float32x2_t a, float32x2_t b) {
-// CHECK-LABEL: test_vmulx_f32
   return vmulx_f32(a, b);
-// CHECK: fmulx {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x float> @test_vmulxq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[VMULX_I]], <4 x float> [[VMULX1_I]]) #4
+// CHECK:   ret <4 x float> [[VMULX2_I]]
 float32x4_t test_vmulxq_f32(float32x4_t a, float32x4_t b) {
-// CHECK-LABEL: test_vmulxq_f32
   return vmulxq_f32(a, b);
-// CHECK: fmulx {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vmulxq_f64(<2 x double> %a, <2 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[VMULX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VMULX1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[VMULX_I]], <2 x double> [[VMULX1_I]]) #4
+// CHECK:   ret <2 x double> [[VMULX2_I]]
 float64x2_t test_vmulxq_f64(float64x2_t a, float64x2_t b) {
-// CHECK-LABEL: test_vmulxq_f64
   return vmulxq_f64(a, b);
-// CHECK: fmulx {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vshl_n_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VSHL_N:%.*]] = shl <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+// CHECK:   ret <8 x i8> [[VSHL_N]]
 int8x8_t test_vshl_n_s8(int8x8_t a) {
-// CHECK-LABEL: test_vshl_n_s8
   return vshl_n_s8(a, 3);
-// CHECK: shl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vshl_n_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VSHL_N:%.*]] = shl <4 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3>
+// CHECK:   ret <4 x i16> [[VSHL_N]]
 int16x4_t test_vshl_n_s16(int16x4_t a) {
-// CHECK-LABEL: test_vshl_n_s16
   return vshl_n_s16(a, 3);
-// CHECK: shl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vshl_n_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VSHL_N:%.*]] = shl <2 x i32> [[TMP1]], <i32 3, i32 3>
+// CHECK:   ret <2 x i32> [[VSHL_N]]
 int32x2_t test_vshl_n_s32(int32x2_t a) {
-// CHECK-LABEL: test_vshl_n_s32
   return vshl_n_s32(a, 3);
-// CHECK: shl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vshlq_n_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VSHL_N:%.*]] = shl <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+// CHECK:   ret <16 x i8> [[VSHL_N]]
 int8x16_t test_vshlq_n_s8(int8x16_t a) {
-// CHECK-LABEL: test_vshlq_n_s8
   return vshlq_n_s8(a, 3);
-// CHECK: shl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vshlq_n_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VSHL_N:%.*]] = shl <8 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+// CHECK:   ret <8 x i16> [[VSHL_N]]
 int16x8_t test_vshlq_n_s16(int16x8_t a) {
-// CHECK-LABEL: test_vshlq_n_s16
   return vshlq_n_s16(a, 3);
-// CHECK: shl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vshlq_n_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VSHL_N:%.*]] = shl <4 x i32> [[TMP1]], <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   ret <4 x i32> [[VSHL_N]]
 int32x4_t test_vshlq_n_s32(int32x4_t a) {
-// CHECK-LABEL: test_vshlq_n_s32
   return vshlq_n_s32(a, 3);
-// CHECK: shl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vshlq_n_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VSHL_N:%.*]] = shl <2 x i64> [[TMP1]], <i64 3, i64 3>
+// CHECK:   ret <2 x i64> [[VSHL_N]]
 int64x2_t test_vshlq_n_s64(int64x2_t a) {
-// CHECK-LABEL: test_vshlq_n_s64
   return vshlq_n_s64(a, 3);
-// CHECK: shl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vshl_n_u8(<8 x i8> %a) #0 {
+// CHECK:   [[VSHL_N:%.*]] = shl <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+// CHECK:   ret <8 x i8> [[VSHL_N]]
 int8x8_t test_vshl_n_u8(int8x8_t a) {
-// CHECK-LABEL: test_vshl_n_u8
   return vshl_n_u8(a, 3);
-// CHECK: shl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vshl_n_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VSHL_N:%.*]] = shl <4 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3>
+// CHECK:   ret <4 x i16> [[VSHL_N]]
 int16x4_t test_vshl_n_u16(int16x4_t a) {
-// CHECK-LABEL: test_vshl_n_u16
   return vshl_n_u16(a, 3);
-// CHECK: shl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vshl_n_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VSHL_N:%.*]] = shl <2 x i32> [[TMP1]], <i32 3, i32 3>
+// CHECK:   ret <2 x i32> [[VSHL_N]]
 int32x2_t test_vshl_n_u32(int32x2_t a) {
-// CHECK-LABEL: test_vshl_n_u32
   return vshl_n_u32(a, 3);
-// CHECK: shl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vshlq_n_u8(<16 x i8> %a) #0 {
+// CHECK:   [[VSHL_N:%.*]] = shl <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+// CHECK:   ret <16 x i8> [[VSHL_N]]
 int8x16_t test_vshlq_n_u8(int8x16_t a) {
-// CHECK-LABEL: test_vshlq_n_u8
   return vshlq_n_u8(a, 3);
-// CHECK: shl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vshlq_n_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VSHL_N:%.*]] = shl <8 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+// CHECK:   ret <8 x i16> [[VSHL_N]]
 int16x8_t test_vshlq_n_u16(int16x8_t a) {
-// CHECK-LABEL: test_vshlq_n_u16
   return vshlq_n_u16(a, 3);
-// CHECK: shl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vshlq_n_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VSHL_N:%.*]] = shl <4 x i32> [[TMP1]], <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   ret <4 x i32> [[VSHL_N]]
 int32x4_t test_vshlq_n_u32(int32x4_t a) {
-// CHECK-LABEL: test_vshlq_n_u32
   return vshlq_n_u32(a, 3);
-// CHECK: shl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vshlq_n_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VSHL_N:%.*]] = shl <2 x i64> [[TMP1]], <i64 3, i64 3>
+// CHECK:   ret <2 x i64> [[VSHL_N]]
 int64x2_t test_vshlq_n_u64(int64x2_t a) {
-// CHECK-LABEL: test_vshlq_n_u64
   return vshlq_n_u64(a, 3);
-// CHECK: shl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vshr_n_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VSHR_N:%.*]] = ashr <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+// CHECK:   ret <8 x i8> [[VSHR_N]]
 int8x8_t test_vshr_n_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vshr_n_s8
   return vshr_n_s8(a, 3);
-  // CHECK: sshr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vshr_n_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VSHR_N:%.*]] = ashr <4 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3>
+// CHECK:   ret <4 x i16> [[VSHR_N]]
 int16x4_t test_vshr_n_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vshr_n_s16
   return vshr_n_s16(a, 3);
-  // CHECK: sshr {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vshr_n_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VSHR_N:%.*]] = ashr <2 x i32> [[TMP1]], <i32 3, i32 3>
+// CHECK:   ret <2 x i32> [[VSHR_N]]
 int32x2_t test_vshr_n_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vshr_n_s32
   return vshr_n_s32(a, 3);
-  // CHECK: sshr {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vshrq_n_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VSHR_N:%.*]] = ashr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+// CHECK:   ret <16 x i8> [[VSHR_N]]
 int8x16_t test_vshrq_n_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vshrq_n_s8
   return vshrq_n_s8(a, 3);
-  // CHECK: sshr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vshrq_n_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VSHR_N:%.*]] = ashr <8 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+// CHECK:   ret <8 x i16> [[VSHR_N]]
 int16x8_t test_vshrq_n_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vshrq_n_s16
   return vshrq_n_s16(a, 3);
-  // CHECK: sshr {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vshrq_n_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VSHR_N:%.*]] = ashr <4 x i32> [[TMP1]], <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   ret <4 x i32> [[VSHR_N]]
 int32x4_t test_vshrq_n_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vshrq_n_s32
   return vshrq_n_s32(a, 3);
-  // CHECK: sshr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vshrq_n_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VSHR_N:%.*]] = ashr <2 x i64> [[TMP1]], <i64 3, i64 3>
+// CHECK:   ret <2 x i64> [[VSHR_N]]
 int64x2_t test_vshrq_n_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vshrq_n_s64
   return vshrq_n_s64(a, 3);
-  // CHECK: sshr {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vshr_n_u8(<8 x i8> %a) #0 {
+// CHECK:   [[VSHR_N:%.*]] = lshr <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+// CHECK:   ret <8 x i8> [[VSHR_N]]
 int8x8_t test_vshr_n_u8(int8x8_t a) {
-  // CHECK-LABEL: test_vshr_n_u8
   return vshr_n_u8(a, 3);
-  // CHECK: ushr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vshr_n_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VSHR_N:%.*]] = lshr <4 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3>
+// CHECK:   ret <4 x i16> [[VSHR_N]]
 int16x4_t test_vshr_n_u16(int16x4_t a) {
-  // CHECK-LABEL: test_vshr_n_u16
   return vshr_n_u16(a, 3);
-  // CHECK: ushr {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vshr_n_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VSHR_N:%.*]] = lshr <2 x i32> [[TMP1]], <i32 3, i32 3>
+// CHECK:   ret <2 x i32> [[VSHR_N]]
 int32x2_t test_vshr_n_u32(int32x2_t a) {
-  // CHECK-LABEL: test_vshr_n_u32
   return vshr_n_u32(a, 3);
-  // CHECK: ushr {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vshrq_n_u8(<16 x i8> %a) #0 {
+// CHECK:   [[VSHR_N:%.*]] = lshr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+// CHECK:   ret <16 x i8> [[VSHR_N]]
 int8x16_t test_vshrq_n_u8(int8x16_t a) {
-  // CHECK-LABEL: test_vshrq_n_u8
   return vshrq_n_u8(a, 3);
-  // CHECK: ushr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vshrq_n_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VSHR_N:%.*]] = lshr <8 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+// CHECK:   ret <8 x i16> [[VSHR_N]]
 int16x8_t test_vshrq_n_u16(int16x8_t a) {
-  // CHECK-LABEL: test_vshrq_n_u16
   return vshrq_n_u16(a, 3);
-  // CHECK: ushr {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vshrq_n_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VSHR_N:%.*]] = lshr <4 x i32> [[TMP1]], <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   ret <4 x i32> [[VSHR_N]]
 int32x4_t test_vshrq_n_u32(int32x4_t a) {
-  // CHECK-LABEL: test_vshrq_n_u32
   return vshrq_n_u32(a, 3);
-  // CHECK: ushr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vshrq_n_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VSHR_N:%.*]] = lshr <2 x i64> [[TMP1]], <i64 3, i64 3>
+// CHECK:   ret <2 x i64> [[VSHR_N]]
 int64x2_t test_vshrq_n_u64(int64x2_t a) {
-  // CHECK-LABEL: test_vshrq_n_u64
   return vshrq_n_u64(a, 3);
-  // CHECK: ushr {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vsra_n_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VSRA_N:%.*]] = ashr <8 x i8> %b, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+// CHECK:   [[TMP0:%.*]] = add <8 x i8> %a, [[VSRA_N]]
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vsra_n_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vsra_n_s8
   return vsra_n_s8(a, b, 3);
-  // CHECK: ssra {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vsra_n_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VSRA_N:%.*]] = ashr <4 x i16> [[TMP3]], <i16 3, i16 3, i16 3, i16 3>
+// CHECK:   [[TMP4:%.*]] = add <4 x i16> [[TMP2]], [[VSRA_N]]
+// CHECK:   ret <4 x i16> [[TMP4]]
 int16x4_t test_vsra_n_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vsra_n_s16
   return vsra_n_s16(a, b, 3);
-  // CHECK: ssra {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vsra_n_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VSRA_N:%.*]] = ashr <2 x i32> [[TMP3]], <i32 3, i32 3>
+// CHECK:   [[TMP4:%.*]] = add <2 x i32> [[TMP2]], [[VSRA_N]]
+// CHECK:   ret <2 x i32> [[TMP4]]
 int32x2_t test_vsra_n_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vsra_n_s32
   return vsra_n_s32(a, b, 3);
-  // CHECK: ssra {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vsraq_n_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VSRA_N:%.*]] = ashr <16 x i8> %b, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+// CHECK:   [[TMP0:%.*]] = add <16 x i8> %a, [[VSRA_N]]
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vsraq_n_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vsraq_n_s8
   return vsraq_n_s8(a, b, 3);
-  // CHECK: ssra {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vsraq_n_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VSRA_N:%.*]] = ashr <8 x i16> [[TMP3]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+// CHECK:   [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[VSRA_N]]
+// CHECK:   ret <8 x i16> [[TMP4]]
 int16x8_t test_vsraq_n_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vsraq_n_s16
   return vsraq_n_s16(a, b, 3);
-  // CHECK: ssra {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vsraq_n_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VSRA_N:%.*]] = ashr <4 x i32> [[TMP3]], <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[VSRA_N]]
+// CHECK:   ret <4 x i32> [[TMP4]]
 int32x4_t test_vsraq_n_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vsraq_n_s32
   return vsraq_n_s32(a, b, 3);
-  // CHECK: ssra {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vsraq_n_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSRA_N:%.*]] = ashr <2 x i64> [[TMP3]], <i64 3, i64 3>
+// CHECK:   [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[VSRA_N]]
+// CHECK:   ret <2 x i64> [[TMP4]]
 int64x2_t test_vsraq_n_s64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vsraq_n_s64
   return vsraq_n_s64(a, b, 3);
-  // CHECK: ssra {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vsra_n_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VSRA_N:%.*]] = lshr <8 x i8> %b, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+// CHECK:   [[TMP0:%.*]] = add <8 x i8> %a, [[VSRA_N]]
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vsra_n_u8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vsra_n_u8
   return vsra_n_u8(a, b, 3);
-  // CHECK: usra {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vsra_n_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VSRA_N:%.*]] = lshr <4 x i16> [[TMP3]], <i16 3, i16 3, i16 3, i16 3>
+// CHECK:   [[TMP4:%.*]] = add <4 x i16> [[TMP2]], [[VSRA_N]]
+// CHECK:   ret <4 x i16> [[TMP4]]
 int16x4_t test_vsra_n_u16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vsra_n_u16
   return vsra_n_u16(a, b, 3);
-  // CHECK: usra {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vsra_n_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VSRA_N:%.*]] = lshr <2 x i32> [[TMP3]], <i32 3, i32 3>
+// CHECK:   [[TMP4:%.*]] = add <2 x i32> [[TMP2]], [[VSRA_N]]
+// CHECK:   ret <2 x i32> [[TMP4]]
 int32x2_t test_vsra_n_u32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vsra_n_u32
   return vsra_n_u32(a, b, 3);
-  // CHECK: usra {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vsraq_n_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VSRA_N:%.*]] = lshr <16 x i8> %b, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
+// CHECK:   [[TMP0:%.*]] = add <16 x i8> %a, [[VSRA_N]]
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vsraq_n_u8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vsraq_n_u8
   return vsraq_n_u8(a, b, 3);
-  // CHECK: usra {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vsraq_n_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VSRA_N:%.*]] = lshr <8 x i16> [[TMP3]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+// CHECK:   [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[VSRA_N]]
+// CHECK:   ret <8 x i16> [[TMP4]]
 int16x8_t test_vsraq_n_u16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vsraq_n_u16
   return vsraq_n_u16(a, b, 3);
-  // CHECK: usra {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vsraq_n_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VSRA_N:%.*]] = lshr <4 x i32> [[TMP3]], <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[VSRA_N]]
+// CHECK:   ret <4 x i32> [[TMP4]]
 int32x4_t test_vsraq_n_u32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vsraq_n_u32
   return vsraq_n_u32(a, b, 3);
-  // CHECK: usra {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vsraq_n_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSRA_N:%.*]] = lshr <2 x i64> [[TMP3]], <i64 3, i64 3>
+// CHECK:   [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[VSRA_N]]
+// CHECK:   ret <2 x i64> [[TMP4]]
 int64x2_t test_vsraq_n_u64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vsraq_n_u64
   return vsraq_n_u64(a, b, 3);
-  // CHECK: usra {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vrshr_n_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VRSHR_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> %a, <8 x i8> <i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3>)
+// CHECK:   ret <8 x i8> [[VRSHR_N]]
 int8x8_t test_vrshr_n_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vrshr_n_s8
   return vrshr_n_s8(a, 3);
-  // CHECK: srshr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vrshr_n_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> <i16 -3, i16 -3, i16 -3, i16 -3>)
+// CHECK:   ret <4 x i16> [[VRSHR_N1]]
 int16x4_t test_vrshr_n_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vrshr_n_s16
   return vrshr_n_s16(a, 3);
-  // CHECK: srshr {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vrshr_n_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> <i32 -3, i32 -3>)
+// CHECK:   ret <2 x i32> [[VRSHR_N1]]
 int32x2_t test_vrshr_n_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vrshr_n_s32
   return vrshr_n_s32(a, 3);
-  // CHECK: srshr {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vrshrq_n_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VRSHR_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> %a, <16 x i8> <i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3>)
+// CHECK:   ret <16 x i8> [[VRSHR_N]]
 int8x16_t test_vrshrq_n_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vrshrq_n_s8
   return vrshrq_n_s8(a, 3);
-  // CHECK: srshr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vrshrq_n_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> <i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3>)
+// CHECK:   ret <8 x i16> [[VRSHR_N1]]
 int16x8_t test_vrshrq_n_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vrshrq_n_s16
   return vrshrq_n_s16(a, 3);
-  // CHECK: srshr {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vrshrq_n_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> <i32 -3, i32 -3, i32 -3, i32 -3>)
+// CHECK:   ret <4 x i32> [[VRSHR_N1]]
 int32x4_t test_vrshrq_n_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vrshrq_n_s32
   return vrshrq_n_s32(a, 3);
-  // CHECK: srshr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vrshrq_n_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> <i64 -3, i64 -3>)
+// CHECK:   ret <2 x i64> [[VRSHR_N1]]
 int64x2_t test_vrshrq_n_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vrshrq_n_s64
   return vrshrq_n_s64(a, 3);
-  // CHECK: srshr {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vrshr_n_u8(<8 x i8> %a) #0 {
+// CHECK:   [[VRSHR_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> %a, <8 x i8> <i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3>)
+// CHECK:   ret <8 x i8> [[VRSHR_N]]
 int8x8_t test_vrshr_n_u8(int8x8_t a) {
-  // CHECK-LABEL: test_vrshr_n_u8
   return vrshr_n_u8(a, 3);
-  // CHECK: urshr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vrshr_n_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> <i16 -3, i16 -3, i16 -3, i16 -3>)
+// CHECK:   ret <4 x i16> [[VRSHR_N1]]
 int16x4_t test_vrshr_n_u16(int16x4_t a) {
-  // CHECK-LABEL: test_vrshr_n_u16
   return vrshr_n_u16(a, 3);
-  // CHECK: urshr {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vrshr_n_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> <i32 -3, i32 -3>)
+// CHECK:   ret <2 x i32> [[VRSHR_N1]]
 int32x2_t test_vrshr_n_u32(int32x2_t a) {
-  // CHECK-LABEL: test_vrshr_n_u32
   return vrshr_n_u32(a, 3);
-  // CHECK: urshr {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vrshrq_n_u8(<16 x i8> %a) #0 {
+// CHECK:   [[VRSHR_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> %a, <16 x i8> <i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3>)
+// CHECK:   ret <16 x i8> [[VRSHR_N]]
 int8x16_t test_vrshrq_n_u8(int8x16_t a) {
-  // CHECK-LABEL: test_vrshrq_n_u8
   return vrshrq_n_u8(a, 3);
-  // CHECK: urshr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vrshrq_n_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> <i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3>)
+// CHECK:   ret <8 x i16> [[VRSHR_N1]]
 int16x8_t test_vrshrq_n_u16(int16x8_t a) {
-  // CHECK-LABEL: test_vrshrq_n_u16
   return vrshrq_n_u16(a, 3);
-  // CHECK: urshr {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vrshrq_n_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> <i32 -3, i32 -3, i32 -3, i32 -3>)
+// CHECK:   ret <4 x i32> [[VRSHR_N1]]
 int32x4_t test_vrshrq_n_u32(int32x4_t a) {
-  // CHECK-LABEL: test_vrshrq_n_u32
   return vrshrq_n_u32(a, 3);
-  // CHECK: urshr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vrshrq_n_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> <i64 -3, i64 -3>)
+// CHECK:   ret <2 x i64> [[VRSHR_N1]]
 int64x2_t test_vrshrq_n_u64(int64x2_t a) {
-  // CHECK-LABEL: test_vrshrq_n_u64
   return vrshrq_n_u64(a, 3);
-  // CHECK: urshr {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vrsra_n_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VRSHR_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> %b, <8 x i8> <i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3>)
+// CHECK:   [[TMP0:%.*]] = add <8 x i8> %a, [[VRSHR_N]]
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vrsra_n_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vrsra_n_s8
   return vrsra_n_s8(a, b, 3);
-  // CHECK: srsra {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vrsra_n_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> <i16 -3, i16 -3, i16 -3, i16 -3>)
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = add <4 x i16> [[TMP2]], [[VRSHR_N1]]
+// CHECK:   ret <4 x i16> [[TMP3]]
 int16x4_t test_vrsra_n_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vrsra_n_s16
   return vrsra_n_s16(a, b, 3);
-  // CHECK: srsra {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vrsra_n_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> <i32 -3, i32 -3>)
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = add <2 x i32> [[TMP2]], [[VRSHR_N1]]
+// CHECK:   ret <2 x i32> [[TMP3]]
 int32x2_t test_vrsra_n_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vrsra_n_s32
   return vrsra_n_s32(a, b, 3);
-  // CHECK: srsra {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vrsraq_n_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VRSHR_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> %b, <16 x i8> <i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3>)
+// CHECK:   [[TMP0:%.*]] = add <16 x i8> %a, [[VRSHR_N]]
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vrsraq_n_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vrsraq_n_s8
   return vrsraq_n_s8(a, b, 3);
-  // CHECK: srsra {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vrsraq_n_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> <i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3>)
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = add <8 x i16> [[TMP2]], [[VRSHR_N1]]
+// CHECK:   ret <8 x i16> [[TMP3]]
 int16x8_t test_vrsraq_n_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vrsraq_n_s16
   return vrsraq_n_s16(a, b, 3);
-  // CHECK: srsra {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vrsraq_n_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> <i32 -3, i32 -3, i32 -3, i32 -3>)
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = add <4 x i32> [[TMP2]], [[VRSHR_N1]]
+// CHECK:   ret <4 x i32> [[TMP3]]
 int32x4_t test_vrsraq_n_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vrsraq_n_s32
   return vrsraq_n_s32(a, b, 3);
-  // CHECK: srsra {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vrsraq_n_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> <i64 -3, i64 -3>)
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = add <2 x i64> [[TMP2]], [[VRSHR_N1]]
+// CHECK:   ret <2 x i64> [[TMP3]]
 int64x2_t test_vrsraq_n_s64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vrsraq_n_s64
   return vrsraq_n_s64(a, b, 3);
-  // CHECK: srsra {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vrsra_n_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VRSHR_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> %b, <8 x i8> <i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3>)
+// CHECK:   [[TMP0:%.*]] = add <8 x i8> %a, [[VRSHR_N]]
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vrsra_n_u8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vrsra_n_u8
   return vrsra_n_u8(a, b, 3);
-  // CHECK: ursra {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vrsra_n_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> <i16 -3, i16 -3, i16 -3, i16 -3>)
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = add <4 x i16> [[TMP2]], [[VRSHR_N1]]
+// CHECK:   ret <4 x i16> [[TMP3]]
 int16x4_t test_vrsra_n_u16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vrsra_n_u16
   return vrsra_n_u16(a, b, 3);
-  // CHECK: ursra {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vrsra_n_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> <i32 -3, i32 -3>)
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = add <2 x i32> [[TMP2]], [[VRSHR_N1]]
+// CHECK:   ret <2 x i32> [[TMP3]]
 int32x2_t test_vrsra_n_u32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vrsra_n_u32
   return vrsra_n_u32(a, b, 3);
-  // CHECK: ursra {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vrsraq_n_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VRSHR_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> %b, <16 x i8> <i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3, i8 -3>)
+// CHECK:   [[TMP0:%.*]] = add <16 x i8> %a, [[VRSHR_N]]
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vrsraq_n_u8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vrsraq_n_u8
   return vrsraq_n_u8(a, b, 3);
-  // CHECK: ursra {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vrsraq_n_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> <i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3>)
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = add <8 x i16> [[TMP2]], [[VRSHR_N1]]
+// CHECK:   ret <8 x i16> [[TMP3]]
 int16x8_t test_vrsraq_n_u16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vrsraq_n_u16
   return vrsraq_n_u16(a, b, 3);
-  // CHECK: ursra {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vrsraq_n_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> <i32 -3, i32 -3, i32 -3, i32 -3>)
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = add <4 x i32> [[TMP2]], [[VRSHR_N1]]
+// CHECK:   ret <4 x i32> [[TMP3]]
 int32x4_t test_vrsraq_n_u32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vrsraq_n_u32
   return vrsraq_n_u32(a, b, 3);
-  // CHECK: ursra {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vrsraq_n_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> <i64 -3, i64 -3>)
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = add <2 x i64> [[TMP2]], [[VRSHR_N1]]
+// CHECK:   ret <2 x i64> [[TMP3]]
 int64x2_t test_vrsraq_n_u64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vrsraq_n_u64
   return vrsraq_n_u64(a, b, 3);
-  // CHECK: ursra {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vsri_n_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VSRI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3)
+// CHECK:   ret <8 x i8> [[VSRI_N]]
 int8x8_t test_vsri_n_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vsri_n_s8
   return vsri_n_s8(a, b, 3);
-  // CHECK: sri {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vsri_n_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VSRI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsri.v4i16(<4 x i16> [[VSRI_N]], <4 x i16> [[VSRI_N1]], i32 3)
+// CHECK:   ret <4 x i16> [[VSRI_N2]]
 int16x4_t test_vsri_n_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vsri_n_s16
   return vsri_n_s16(a, b, 3);
-  // CHECK: sri {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vsri_n_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VSRI_N2:%.*]] = call <2 x i32> @llvm.aarch64.neon.vsri.v2i32(<2 x i32> [[VSRI_N]], <2 x i32> [[VSRI_N1]], i32 3)
+// CHECK:   ret <2 x i32> [[VSRI_N2]]
 int32x2_t test_vsri_n_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vsri_n_s32
   return vsri_n_s32(a, b, 3);
-  // CHECK: sri {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vsriq_n_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VSRI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3)
+// CHECK:   ret <16 x i8> [[VSRI_N]]
 int8x16_t test_vsriq_n_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vsriq_n_s8
   return vsriq_n_s8(a, b, 3);
-  // CHECK: sri {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vsriq_n_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VSRI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsri.v8i16(<8 x i16> [[VSRI_N]], <8 x i16> [[VSRI_N1]], i32 3)
+// CHECK:   ret <8 x i16> [[VSRI_N2]]
 int16x8_t test_vsriq_n_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vsriq_n_s16
   return vsriq_n_s16(a, b, 3);
-  // CHECK: sri {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vsriq_n_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VSRI_N2:%.*]] = call <4 x i32> @llvm.aarch64.neon.vsri.v4i32(<4 x i32> [[VSRI_N]], <4 x i32> [[VSRI_N1]], i32 3)
+// CHECK:   ret <4 x i32> [[VSRI_N2]]
 int32x4_t test_vsriq_n_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vsriq_n_s32
   return vsriq_n_s32(a, b, 3);
-  // CHECK: sri {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vsriq_n_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSRI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsri.v2i64(<2 x i64> [[VSRI_N]], <2 x i64> [[VSRI_N1]], i32 3)
+// CHECK:   ret <2 x i64> [[VSRI_N2]]
 int64x2_t test_vsriq_n_s64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vsriq_n_s64
   return vsriq_n_s64(a, b, 3);
-  // CHECK: sri {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vsri_n_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VSRI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3)
+// CHECK:   ret <8 x i8> [[VSRI_N]]
 int8x8_t test_vsri_n_u8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vsri_n_u8
   return vsri_n_u8(a, b, 3);
-  // CHECK: sri {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vsri_n_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VSRI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsri.v4i16(<4 x i16> [[VSRI_N]], <4 x i16> [[VSRI_N1]], i32 3)
+// CHECK:   ret <4 x i16> [[VSRI_N2]]
 int16x4_t test_vsri_n_u16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vsri_n_u16
   return vsri_n_u16(a, b, 3);
-  // CHECK: sri {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vsri_n_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VSRI_N2:%.*]] = call <2 x i32> @llvm.aarch64.neon.vsri.v2i32(<2 x i32> [[VSRI_N]], <2 x i32> [[VSRI_N1]], i32 3)
+// CHECK:   ret <2 x i32> [[VSRI_N2]]
 int32x2_t test_vsri_n_u32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vsri_n_u32
   return vsri_n_u32(a, b, 3);
-  // CHECK: sri {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vsriq_n_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VSRI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3)
+// CHECK:   ret <16 x i8> [[VSRI_N]]
 int8x16_t test_vsriq_n_u8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vsriq_n_u8
   return vsriq_n_u8(a, b, 3);
-  // CHECK: sri {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vsriq_n_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VSRI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsri.v8i16(<8 x i16> [[VSRI_N]], <8 x i16> [[VSRI_N1]], i32 3)
+// CHECK:   ret <8 x i16> [[VSRI_N2]]
 int16x8_t test_vsriq_n_u16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vsriq_n_u16
   return vsriq_n_u16(a, b, 3);
-  // CHECK: sri {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vsriq_n_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VSRI_N2:%.*]] = call <4 x i32> @llvm.aarch64.neon.vsri.v4i32(<4 x i32> [[VSRI_N]], <4 x i32> [[VSRI_N1]], i32 3)
+// CHECK:   ret <4 x i32> [[VSRI_N2]]
 int32x4_t test_vsriq_n_u32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vsriq_n_u32
   return vsriq_n_u32(a, b, 3);
-  // CHECK: sri {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vsriq_n_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSRI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsri.v2i64(<2 x i64> [[VSRI_N]], <2 x i64> [[VSRI_N1]], i32 3)
+// CHECK:   ret <2 x i64> [[VSRI_N2]]
 int64x2_t test_vsriq_n_u64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vsriq_n_u64
   return vsriq_n_u64(a, b, 3);
-  // CHECK: sri {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vsri_n_p8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VSRI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsri.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3)
+// CHECK:   ret <8 x i8> [[VSRI_N]]
 poly8x8_t test_vsri_n_p8(poly8x8_t a, poly8x8_t b) {
-  // CHECK-LABEL: test_vsri_n_p8
   return vsri_n_p8(a, b, 3);
-  // CHECK: sri {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vsri_n_p16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VSRI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsri.v4i16(<4 x i16> [[VSRI_N]], <4 x i16> [[VSRI_N1]], i32 15)
+// CHECK:   ret <4 x i16> [[VSRI_N2]]
 poly16x4_t test_vsri_n_p16(poly16x4_t a, poly16x4_t b) {
-  // CHECK-LABEL: test_vsri_n_p16
   return vsri_n_p16(a, b, 15);
-  // CHECK: sri {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #15
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vsriq_n_p8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VSRI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsri.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3)
+// CHECK:   ret <16 x i8> [[VSRI_N]]
 poly8x16_t test_vsriq_n_p8(poly8x16_t a, poly8x16_t b) {
-  // CHECK-LABEL: test_vsriq_n_p8
   return vsriq_n_p8(a, b, 3);
-  // CHECK: sri {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vsriq_n_p16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VSRI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsri.v8i16(<8 x i16> [[VSRI_N]], <8 x i16> [[VSRI_N1]], i32 15)
+// CHECK:   ret <8 x i16> [[VSRI_N2]]
 poly16x8_t test_vsriq_n_p16(poly16x8_t a, poly16x8_t b) {
-  // CHECK-LABEL: test_vsriq_n_p16
   return vsriq_n_p16(a, b, 15);
-  // CHECK: sri {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #15
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vsli_n_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VSLI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3)
+// CHECK:   ret <8 x i8> [[VSLI_N]]
 int8x8_t test_vsli_n_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vsli_n_s8
   return vsli_n_s8(a, b, 3);
-  // CHECK: sli {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vsli_n_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VSLI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], i32 3)
+// CHECK:   ret <4 x i16> [[VSLI_N2]]
 int16x4_t test_vsli_n_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vsli_n_s16
   return vsli_n_s16(a, b, 3);
-  // CHECK: sli {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vsli_n_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VSLI_N2:%.*]] = call <2 x i32> @llvm.aarch64.neon.vsli.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], i32 3)
+// CHECK:   ret <2 x i32> [[VSLI_N2]]
 int32x2_t test_vsli_n_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vsli_n_s32
   return vsli_n_s32(a, b, 3);
-  // CHECK: sli {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vsliq_n_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VSLI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3)
+// CHECK:   ret <16 x i8> [[VSLI_N]]
 int8x16_t test_vsliq_n_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vsliq_n_s8
   return vsliq_n_s8(a, b, 3);
-  // CHECK: sli {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vsliq_n_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VSLI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], i32 3)
+// CHECK:   ret <8 x i16> [[VSLI_N2]]
 int16x8_t test_vsliq_n_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vsliq_n_s16
   return vsliq_n_s16(a, b, 3);
-  // CHECK: sli {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vsliq_n_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VSLI_N2:%.*]] = call <4 x i32> @llvm.aarch64.neon.vsli.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], i32 3)
+// CHECK:   ret <4 x i32> [[VSLI_N2]]
 int32x4_t test_vsliq_n_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vsliq_n_s32
   return vsliq_n_s32(a, b, 3);
-  // CHECK: sli {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vsliq_n_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSLI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], i32 3)
+// CHECK:   ret <2 x i64> [[VSLI_N2]]
 int64x2_t test_vsliq_n_s64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vsliq_n_s64
   return vsliq_n_s64(a, b, 3);
-  // CHECK: sli {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vsli_n_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VSLI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3)
+// CHECK:   ret <8 x i8> [[VSLI_N]]
 uint8x8_t test_vsli_n_u8(uint8x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vsli_n_u8
   return vsli_n_u8(a, b, 3);
-  // CHECK: sli {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vsli_n_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VSLI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], i32 3)
+// CHECK:   ret <4 x i16> [[VSLI_N2]]
 uint16x4_t test_vsli_n_u16(uint16x4_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vsli_n_u16
   return vsli_n_u16(a, b, 3);
-  // CHECK: sli {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vsli_n_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VSLI_N2:%.*]] = call <2 x i32> @llvm.aarch64.neon.vsli.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], i32 3)
+// CHECK:   ret <2 x i32> [[VSLI_N2]]
 uint32x2_t test_vsli_n_u32(uint32x2_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vsli_n_u32
   return vsli_n_u32(a, b, 3);
-  // CHECK: sli {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vsliq_n_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VSLI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3)
+// CHECK:   ret <16 x i8> [[VSLI_N]]
 uint8x16_t test_vsliq_n_u8(uint8x16_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vsliq_n_u8
   return vsliq_n_u8(a, b, 3);
-  // CHECK: sli {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vsliq_n_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VSLI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], i32 3)
+// CHECK:   ret <8 x i16> [[VSLI_N2]]
 uint16x8_t test_vsliq_n_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vsliq_n_u16
   return vsliq_n_u16(a, b, 3);
-  // CHECK: sli {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vsliq_n_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VSLI_N2:%.*]] = call <4 x i32> @llvm.aarch64.neon.vsli.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], i32 3)
+// CHECK:   ret <4 x i32> [[VSLI_N2]]
 uint32x4_t test_vsliq_n_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vsliq_n_u32
   return vsliq_n_u32(a, b, 3);
-  // CHECK: sli {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vsliq_n_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSLI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsli.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], i32 3)
+// CHECK:   ret <2 x i64> [[VSLI_N2]]
 uint64x2_t test_vsliq_n_u64(uint64x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vsliq_n_u64
   return vsliq_n_u64(a, b, 3);
-  // CHECK: sli {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vsli_n_p8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VSLI_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.vsli.v8i8(<8 x i8> %a, <8 x i8> %b, i32 3)
+// CHECK:   ret <8 x i8> [[VSLI_N]]
 poly8x8_t test_vsli_n_p8(poly8x8_t a, poly8x8_t b) {
-  // CHECK-LABEL: test_vsli_n_p8
   return vsli_n_p8(a, b, 3);
-  // CHECK: sli {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vsli_n_p16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VSLI_N2:%.*]] = call <4 x i16> @llvm.aarch64.neon.vsli.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], i32 15)
+// CHECK:   ret <4 x i16> [[VSLI_N2]]
 poly16x4_t test_vsli_n_p16(poly16x4_t a, poly16x4_t b) {
-  // CHECK-LABEL: test_vsli_n_p16
   return vsli_n_p16(a, b, 15);
-  // CHECK: sli {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #15
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vsliq_n_p8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VSLI_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.vsli.v16i8(<16 x i8> %a, <16 x i8> %b, i32 3)
+// CHECK:   ret <16 x i8> [[VSLI_N]]
 poly8x16_t test_vsliq_n_p8(poly8x16_t a, poly8x16_t b) {
-  // CHECK-LABEL: test_vsliq_n_p8
   return vsliq_n_p8(a, b, 3);
-  // CHECK: sli {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vsliq_n_p16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VSLI_N2:%.*]] = call <8 x i16> @llvm.aarch64.neon.vsli.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], i32 15)
+// CHECK:   ret <8 x i16> [[VSLI_N2]]
 poly16x8_t test_vsliq_n_p16(poly16x8_t a, poly16x8_t b) {
-  // CHECK-LABEL: test_vsliq_n_p16
   return vsliq_n_p16(a, b, 15);
-  // CHECK: sli {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #15
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqshlu_n_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VQSHLU_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshlu.v8i8(<8 x i8> %a, <8 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+// CHECK:   ret <8 x i8> [[VQSHLU_N]]
 int8x8_t test_vqshlu_n_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vqshlu_n_s8
   return vqshlu_n_s8(a, 3);
-  // CHECK: sqshlu {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vqshlu_n_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQSHLU_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshlu.v4i16(<4 x i16> [[VQSHLU_N]], <4 x i16> <i16 3, i16 3, i16 3, i16 3>)
+// CHECK:   ret <4 x i16> [[VQSHLU_N1]]
 int16x4_t test_vqshlu_n_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vqshlu_n_s16
   return vqshlu_n_s16(a, 3);
-  // CHECK: sqshlu {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vqshlu_n_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQSHLU_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshlu.v2i32(<2 x i32> [[VQSHLU_N]], <2 x i32> <i32 3, i32 3>)
+// CHECK:   ret <2 x i32> [[VQSHLU_N1]]
 int32x2_t test_vqshlu_n_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vqshlu_n_s32
   return vqshlu_n_s32(a, 3);
-  // CHECK: sqshlu {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqshluq_n_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VQSHLU_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqshlu.v16i8(<16 x i8> %a, <16 x i8> <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>)
+// CHECK:   ret <16 x i8> [[VQSHLU_N]]
 int8x16_t test_vqshluq_n_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vqshluq_n_s8
   return vqshluq_n_s8(a, 3);
-  // CHECK: sqshlu {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vqshluq_n_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQSHLU_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqshlu.v8i16(<8 x i16> [[VQSHLU_N]], <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>)
+// CHECK:   ret <8 x i16> [[VQSHLU_N1]]
 int16x8_t test_vqshluq_n_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vqshluq_n_s16
   return vqshluq_n_s16(a, 3);
-  // CHECK: sqshlu {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqshluq_n_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQSHLU_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqshlu.v4i32(<4 x i32> [[VQSHLU_N]], <4 x i32> <i32 3, i32 3, i32 3, i32 3>)
+// CHECK:   ret <4 x i32> [[VQSHLU_N1]]
 int32x4_t test_vqshluq_n_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vqshluq_n_s32
   return vqshluq_n_s32(a, 3);
-  // CHECK: sqshlu {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqshluq_n_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQSHLU_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshlu.v2i64(<2 x i64> [[VQSHLU_N]], <2 x i64> <i64 3, i64 3>)
+// CHECK:   ret <2 x i64> [[VQSHLU_N1]]
 int64x2_t test_vqshluq_n_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vqshluq_n_s64
   return vqshluq_n_s64(a, 3);
-  // CHECK: sqshlu {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vshrn_n_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP2:%.*]] = ashr <8 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+// CHECK:   [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VSHRN_N]]
 int8x8_t test_vshrn_n_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vshrn_n_s16
   return vshrn_n_s16(a, 3);
-  // CHECK: shrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vshrn_n_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], <i32 9, i32 9, i32 9, i32 9>
+// CHECK:   [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VSHRN_N]]
 int16x4_t test_vshrn_n_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vshrn_n_s32
   return vshrn_n_s32(a, 9);
-  // CHECK: shrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vshrn_n_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP2:%.*]] = ashr <2 x i64> [[TMP1]], <i64 19, i64 19>
+// CHECK:   [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VSHRN_N]]
 int32x2_t test_vshrn_n_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vshrn_n_s64
   return vshrn_n_s64(a, 19);
-  // CHECK: shrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vshrn_n_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP2:%.*]] = lshr <8 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+// CHECK:   [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VSHRN_N]]
 uint8x8_t test_vshrn_n_u16(uint16x8_t a) {
-  // CHECK-LABEL: test_vshrn_n_u16
   return vshrn_n_u16(a, 3);
-  // CHECK: shrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vshrn_n_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP2:%.*]] = lshr <4 x i32> [[TMP1]], <i32 9, i32 9, i32 9, i32 9>
+// CHECK:   [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VSHRN_N]]
 uint16x4_t test_vshrn_n_u32(uint32x4_t a) {
-  // CHECK-LABEL: test_vshrn_n_u32
   return vshrn_n_u32(a, 9);
-  // CHECK: shrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vshrn_n_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP2:%.*]] = lshr <2 x i64> [[TMP1]], <i64 19, i64 19>
+// CHECK:   [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VSHRN_N]]
 uint32x2_t test_vshrn_n_u64(uint64x2_t a) {
-  // CHECK-LABEL: test_vshrn_n_u64
   return vshrn_n_u64(a, 19);
-  // CHECK: shrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP2:%.*]] = ashr <8 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+// CHECK:   [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8>
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VSHRN_N]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 int8x16_t test_vshrn_high_n_s16(int8x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vshrn_high_n_s16
   return vshrn_high_n_s16(a, b, 3);
-  // CHECK: shrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], <i32 9, i32 9, i32 9, i32 9>
+// CHECK:   [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16>
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VSHRN_N]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 int16x8_t test_vshrn_high_n_s32(int16x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vshrn_high_n_s32
   return vshrn_high_n_s32(a, b, 9);
-  // CHECK: shrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP2:%.*]] = ashr <2 x i64> [[TMP1]], <i64 19, i64 19>
+// CHECK:   [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VSHRN_N]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 int32x4_t test_vshrn_high_n_s64(int32x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vshrn_high_n_s64
   return vshrn_high_n_s64(a, b, 19);
-  // CHECK: shrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vshrn_high_n_u16(<8 x i8> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP2:%.*]] = lshr <8 x i16> [[TMP1]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+// CHECK:   [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8>
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VSHRN_N]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 uint8x16_t test_vshrn_high_n_u16(uint8x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vshrn_high_n_u16
   return vshrn_high_n_u16(a, b, 3);
-  // CHECK: shrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vshrn_high_n_u32(<4 x i16> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP2:%.*]] = lshr <4 x i32> [[TMP1]], <i32 9, i32 9, i32 9, i32 9>
+// CHECK:   [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16>
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VSHRN_N]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 uint16x8_t test_vshrn_high_n_u32(uint16x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vshrn_high_n_u32
   return vshrn_high_n_u32(a, b, 9);
-  // CHECK: shrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vshrn_high_n_u64(<2 x i32> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP2:%.*]] = lshr <2 x i64> [[TMP1]], <i64 19, i64 19>
+// CHECK:   [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VSHRN_N]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 uint32x4_t test_vshrn_high_n_u64(uint32x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vshrn_high_n_u64
   return vshrn_high_n_u64(a, b, 19);
-  // CHECK: shrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqshrun_n_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQSHRUN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> [[VQSHRUN_N]], i32 3)
+// CHECK:   ret <8 x i8> [[VQSHRUN_N1]]
 int8x8_t test_vqshrun_n_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vqshrun_n_s16
   return vqshrun_n_s16(a, 3);
-  // CHECK: sqshrun {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vqshrun_n_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQSHRUN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> [[VQSHRUN_N]], i32 9)
+// CHECK:   ret <4 x i16> [[VQSHRUN_N1]]
 int16x4_t test_vqshrun_n_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vqshrun_n_s32
   return vqshrun_n_s32(a, 9);
-  // CHECK: sqshrun {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vqshrun_n_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQSHRUN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64> [[VQSHRUN_N]], i32 19)
+// CHECK:   ret <2 x i32> [[VQSHRUN_N1]]
 int32x2_t test_vqshrun_n_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vqshrun_n_s64
   return vqshrun_n_s64(a, 19);
-  // CHECK: sqshrun {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqshrun_high_n_s16(<8 x i8> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQSHRUN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> [[VQSHRUN_N]], i32 3)
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQSHRUN_N1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 int8x16_t test_vqshrun_high_n_s16(int8x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vqshrun_high_n_s16
   return vqshrun_high_n_s16(a, b, 3);
-  // CHECK: sqshrun2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vqshrun_high_n_s32(<4 x i16> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQSHRUN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> [[VQSHRUN_N]], i32 9)
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQSHRUN_N1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 int16x8_t test_vqshrun_high_n_s32(int16x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vqshrun_high_n_s32
   return vqshrun_high_n_s32(a, b, 9);
-  // CHECK: sqshrun2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqshrun_high_n_s64(<2 x i32> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQSHRUN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrun.v2i32(<2 x i64> [[VQSHRUN_N]], i32 19)
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQSHRUN_N1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 int32x4_t test_vqshrun_high_n_s64(int32x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vqshrun_high_n_s64
   return vqshrun_high_n_s64(a, b, 19);
-  // CHECK: sqshrun2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vrshrn_n_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[VRSHRN_N]], i32 3)
+// CHECK:   ret <8 x i8> [[VRSHRN_N1]]
 int8x8_t test_vrshrn_n_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vrshrn_n_s16
   return vrshrn_n_s16(a, 3);
-  // CHECK: rshrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vrshrn_n_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[VRSHRN_N]], i32 9)
+// CHECK:   ret <4 x i16> [[VRSHRN_N1]]
 int16x4_t test_vrshrn_n_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vrshrn_n_s32
   return vrshrn_n_s32(a, 9);
-  // CHECK: rshrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vrshrn_n_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[VRSHRN_N]], i32 19)
+// CHECK:   ret <2 x i32> [[VRSHRN_N1]]
 int32x2_t test_vrshrn_n_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vrshrn_n_s64
   return vrshrn_n_s64(a, 19);
-  // CHECK: rshrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vrshrn_n_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[VRSHRN_N]], i32 3)
+// CHECK:   ret <8 x i8> [[VRSHRN_N1]]
 uint8x8_t test_vrshrn_n_u16(uint16x8_t a) {
-  // CHECK-LABEL: test_vrshrn_n_u16
   return vrshrn_n_u16(a, 3);
-  // CHECK: rshrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vrshrn_n_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[VRSHRN_N]], i32 9)
+// CHECK:   ret <4 x i16> [[VRSHRN_N1]]
 uint16x4_t test_vrshrn_n_u32(uint32x4_t a) {
-  // CHECK-LABEL: test_vrshrn_n_u32
   return vrshrn_n_u32(a, 9);
-  // CHECK: rshrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vrshrn_n_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[VRSHRN_N]], i32 19)
+// CHECK:   ret <2 x i32> [[VRSHRN_N1]]
 uint32x2_t test_vrshrn_n_u64(uint64x2_t a) {
-  // CHECK-LABEL: test_vrshrn_n_u64
   return vrshrn_n_u64(a, 19);
-  // CHECK: rshrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vrshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[VRSHRN_N]], i32 3)
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VRSHRN_N1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 int8x16_t test_vrshrn_high_n_s16(int8x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vrshrn_high_n_s16
   return vrshrn_high_n_s16(a, b, 3);
-  // CHECK: rshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vrshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[VRSHRN_N]], i32 9)
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VRSHRN_N1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 int16x8_t test_vrshrn_high_n_s32(int16x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vrshrn_high_n_s32
   return vrshrn_high_n_s32(a, b, 9);
-  // CHECK: rshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vrshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[VRSHRN_N]], i32 19)
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VRSHRN_N1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 int32x4_t test_vrshrn_high_n_s64(int32x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vrshrn_high_n_s64
   return vrshrn_high_n_s64(a, b, 19);
-  // CHECK: rshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vrshrn_high_n_u16(<8 x i8> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> [[VRSHRN_N]], i32 3)
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VRSHRN_N1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 uint8x16_t test_vrshrn_high_n_u16(uint8x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vrshrn_high_n_u16
   return vrshrn_high_n_u16(a, b, 3);
-  // CHECK: rshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vrshrn_high_n_u32(<4 x i16> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.rshrn.v4i16(<4 x i32> [[VRSHRN_N]], i32 9)
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VRSHRN_N1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 uint16x8_t test_vrshrn_high_n_u32(uint16x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vrshrn_high_n_u32
   return vrshrn_high_n_u32(a, b, 9);
-  // CHECK: rshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vrshrn_high_n_u64(<2 x i32> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.rshrn.v2i32(<2 x i64> [[VRSHRN_N]], i32 19)
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VRSHRN_N1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 uint32x4_t test_vrshrn_high_n_u64(uint32x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vrshrn_high_n_u64
   return vrshrn_high_n_u64(a, b, 19);
-  // CHECK: rshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqrshrun_n_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQRSHRUN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[VQRSHRUN_N]], i32 3)
+// CHECK:   ret <8 x i8> [[VQRSHRUN_N1]]
 int8x8_t test_vqrshrun_n_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vqrshrun_n_s16
   return vqrshrun_n_s16(a, 3);
-  // CHECK: sqrshrun {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vqrshrun_n_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQRSHRUN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[VQRSHRUN_N]], i32 9)
+// CHECK:   ret <4 x i16> [[VQRSHRUN_N1]]
 int16x4_t test_vqrshrun_n_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vqrshrun_n_s32
   return vqrshrun_n_s32(a, 9);
-  // CHECK: sqrshrun {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vqrshrun_n_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQRSHRUN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> [[VQRSHRUN_N]], i32 19)
+// CHECK:   ret <2 x i32> [[VQRSHRUN_N1]]
 int32x2_t test_vqrshrun_n_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vqrshrun_n_s64
   return vqrshrun_n_s64(a, 19);
-  // CHECK: sqrshrun {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqrshrun_high_n_s16(<8 x i8> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQRSHRUN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[VQRSHRUN_N]], i32 3)
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQRSHRUN_N1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 int8x16_t test_vqrshrun_high_n_s16(int8x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vqrshrun_high_n_s16
   return vqrshrun_high_n_s16(a, b, 3);
-  // CHECK: sqrshrun2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vqrshrun_high_n_s32(<4 x i16> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQRSHRUN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[VQRSHRUN_N]], i32 9)
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQRSHRUN_N1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 int16x8_t test_vqrshrun_high_n_s32(int16x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vqrshrun_high_n_s32
   return vqrshrun_high_n_s32(a, b, 9);
-  // CHECK: sqrshrun2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqrshrun_high_n_s64(<2 x i32> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQRSHRUN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrun.v2i32(<2 x i64> [[VQRSHRUN_N]], i32 19)
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQRSHRUN_N1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 int32x4_t test_vqrshrun_high_n_s64(int32x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vqrshrun_high_n_s64
   return vqrshrun_high_n_s64(a, b, 19);
-  // CHECK: sqrshrun2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqshrn_n_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> [[VQSHRN_N]], i32 3)
+// CHECK:   ret <8 x i8> [[VQSHRN_N1]]
 int8x8_t test_vqshrn_n_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vqshrn_n_s16
   return vqshrn_n_s16(a, 3);
-  // CHECK: sqshrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vqshrn_n_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> [[VQSHRN_N]], i32 9)
+// CHECK:   ret <4 x i16> [[VQSHRN_N1]]
 int16x4_t test_vqshrn_n_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vqshrn_n_s32
   return vqshrn_n_s32(a, 9);
-  // CHECK: sqshrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vqshrn_n_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64> [[VQSHRN_N]], i32 19)
+// CHECK:   ret <2 x i32> [[VQSHRN_N1]]
 int32x2_t test_vqshrn_n_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vqshrn_n_s64
   return vqshrn_n_s64(a, 19);
-  // CHECK: sqshrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqshrn_n_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> [[VQSHRN_N]], i32 3)
+// CHECK:   ret <8 x i8> [[VQSHRN_N1]]
 uint8x8_t test_vqshrn_n_u16(uint16x8_t a) {
-  // CHECK-LABEL: test_vqshrn_n_u16
   return vqshrn_n_u16(a, 3);
-  // CHECK: uqshrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vqshrn_n_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> [[VQSHRN_N]], i32 9)
+// CHECK:   ret <4 x i16> [[VQSHRN_N1]]
 uint16x4_t test_vqshrn_n_u32(uint32x4_t a) {
-  // CHECK-LABEL: test_vqshrn_n_u32
   return vqshrn_n_u32(a, 9);
-  // CHECK: uqshrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vqshrn_n_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64> [[VQSHRN_N]], i32 19)
+// CHECK:   ret <2 x i32> [[VQSHRN_N1]]
 uint32x2_t test_vqshrn_n_u64(uint64x2_t a) {
-  // CHECK-LABEL: test_vqshrn_n_u64
   return vqshrn_n_u64(a, 19);
-  // CHECK: uqshrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> [[VQSHRN_N]], i32 3)
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQSHRN_N1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 int8x16_t test_vqshrn_high_n_s16(int8x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vqshrn_high_n_s16
   return vqshrn_high_n_s16(a, b, 3);
-  // CHECK: sqshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vqshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> [[VQSHRN_N]], i32 9)
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQSHRN_N1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 int16x8_t test_vqshrn_high_n_s32(int16x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vqshrn_high_n_s32
   return vqshrn_high_n_s32(a, b, 9);
-  // CHECK: sqshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshrn.v2i32(<2 x i64> [[VQSHRN_N]], i32 19)
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQSHRN_N1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 int32x4_t test_vqshrn_high_n_s64(int32x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vqshrn_high_n_s64
   return vqshrn_high_n_s64(a, b, 19);
-  // CHECK: sqshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqshrn_high_n_u16(<8 x i8> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> [[VQSHRN_N]], i32 3)
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQSHRN_N1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 uint8x16_t test_vqshrn_high_n_u16(uint8x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vqshrn_high_n_u16
   return vqshrn_high_n_u16(a, b, 3);
-  // CHECK: uqshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vqshrn_high_n_u32(<4 x i16> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> [[VQSHRN_N]], i32 9)
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQSHRN_N1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 uint16x8_t test_vqshrn_high_n_u32(uint16x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vqshrn_high_n_u32
   return vqshrn_high_n_u32(a, b, 9);
-  // CHECK: uqshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqshrn_high_n_u64(<2 x i32> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshrn.v2i32(<2 x i64> [[VQSHRN_N]], i32 19)
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQSHRN_N1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 uint32x4_t test_vqshrn_high_n_u64(uint32x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vqshrn_high_n_u64
   return vqshrn_high_n_u64(a, b, 19);
-  // CHECK: uqshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqrshrn_n_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> [[VQRSHRN_N]], i32 3)
+// CHECK:   ret <8 x i8> [[VQRSHRN_N1]]
 int8x8_t test_vqrshrn_n_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vqrshrn_n_s16
   return vqrshrn_n_s16(a, 3);
-  // CHECK: sqrshrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vqrshrn_n_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> [[VQRSHRN_N]], i32 9)
+// CHECK:   ret <4 x i16> [[VQRSHRN_N1]]
 int16x4_t test_vqrshrn_n_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vqrshrn_n_s32
   return vqrshrn_n_s32(a, 9);
-  // CHECK: sqrshrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vqrshrn_n_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64> [[VQRSHRN_N]], i32 19)
+// CHECK:   ret <2 x i32> [[VQRSHRN_N1]]
 int32x2_t test_vqrshrn_n_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vqrshrn_n_s64
   return vqrshrn_n_s64(a, 19);
-  // CHECK: sqrshrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqrshrn_n_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> [[VQRSHRN_N]], i32 3)
+// CHECK:   ret <8 x i8> [[VQRSHRN_N1]]
 uint8x8_t test_vqrshrn_n_u16(uint16x8_t a) {
-  // CHECK-LABEL: test_vqrshrn_n_u16
   return vqrshrn_n_u16(a, 3);
-  // CHECK: uqrshrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vqrshrn_n_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> [[VQRSHRN_N]], i32 9)
+// CHECK:   ret <4 x i16> [[VQRSHRN_N1]]
 uint16x4_t test_vqrshrn_n_u32(uint32x4_t a) {
-  // CHECK-LABEL: test_vqrshrn_n_u32
   return vqrshrn_n_u32(a, 9);
-  // CHECK: uqrshrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vqrshrn_n_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64> [[VQRSHRN_N]], i32 19)
+// CHECK:   ret <2 x i32> [[VQRSHRN_N1]]
 uint32x2_t test_vqrshrn_n_u64(uint64x2_t a) {
-  // CHECK-LABEL: test_vqrshrn_n_u64
   return vqrshrn_n_u64(a, 19);
-  // CHECK: uqrshrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqrshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> [[VQRSHRN_N]], i32 3)
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQRSHRN_N1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 int8x16_t test_vqrshrn_high_n_s16(int8x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vqrshrn_high_n_s16
   return vqrshrn_high_n_s16(a, b, 3);
-  // CHECK: sqrshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vqrshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> [[VQRSHRN_N]], i32 9)
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQRSHRN_N1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 int16x8_t test_vqrshrn_high_n_s32(int16x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vqrshrn_high_n_s32
   return vqrshrn_high_n_s32(a, b, 9);
-  // CHECK: sqrshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqrshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqrshrn.v2i32(<2 x i64> [[VQRSHRN_N]], i32 19)
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQRSHRN_N1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 int32x4_t test_vqrshrn_high_n_s64(int32x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vqrshrn_high_n_s64
   return vqrshrn_high_n_s64(a, b, 19);
-  // CHECK: sqrshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqrshrn_high_n_u16(<8 x i8> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> [[VQRSHRN_N]], i32 3)
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQRSHRN_N1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 uint8x16_t test_vqrshrn_high_n_u16(uint8x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vqrshrn_high_n_u16
   return vqrshrn_high_n_u16(a, b, 3);
-  // CHECK: uqrshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vqrshrn_high_n_u32(<4 x i16> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> [[VQRSHRN_N]], i32 9)
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VQRSHRN_N1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 uint16x8_t test_vqrshrn_high_n_u32(uint16x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vqrshrn_high_n_u32
   return vqrshrn_high_n_u32(a, b, 9);
-  // CHECK: uqrshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqrshrn_high_n_u64(<2 x i32> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqrshrn.v2i32(<2 x i64> [[VQRSHRN_N]], i32 19)
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VQRSHRN_N1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 uint32x4_t test_vqrshrn_high_n_u64(uint32x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vqrshrn_high_n_u64
   return vqrshrn_high_n_u64(a, b, 19);
-  // CHECK: uqrshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vshll_n_s8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = sext <8 x i8> %a to <8 x i16>
+// CHECK:   [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+// CHECK:   ret <8 x i16> [[VSHLL_N]]
 int16x8_t test_vshll_n_s8(int8x8_t a) {
-// CHECK-LABEL: test_vshll_n_s8
   return vshll_n_s8(a, 3);
-// CHECK: sshll {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, #3
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vshll_n_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], <i32 9, i32 9, i32 9, i32 9>
+// CHECK:   ret <4 x i32> [[VSHLL_N]]
 int32x4_t test_vshll_n_s16(int16x4_t a) {
-// CHECK-LABEL: test_vshll_n_s16
   return vshll_n_s16(a, 9);
-// CHECK: sshll {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, #9
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vshll_n_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], <i64 19, i64 19>
+// CHECK:   ret <2 x i64> [[VSHLL_N]]
 int64x2_t test_vshll_n_s32(int32x2_t a) {
-// CHECK-LABEL: test_vshll_n_s32
   return vshll_n_s32(a, 19);
-// CHECK: sshll {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, #19
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vshll_n_u8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = zext <8 x i8> %a to <8 x i16>
+// CHECK:   [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+// CHECK:   ret <8 x i16> [[VSHLL_N]]
 uint16x8_t test_vshll_n_u8(uint8x8_t a) {
-// CHECK-LABEL: test_vshll_n_u8
   return vshll_n_u8(a, 3);
-// CHECK: ushll {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, #3
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vshll_n_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], <i32 9, i32 9, i32 9, i32 9>
+// CHECK:   ret <4 x i32> [[VSHLL_N]]
 uint32x4_t test_vshll_n_u16(uint16x4_t a) {
-// CHECK-LABEL: test_vshll_n_u16
   return vshll_n_u16(a, 9);
-// CHECK: ushll {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, #9
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vshll_n_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], <i64 19, i64 19>
+// CHECK:   ret <2 x i64> [[VSHLL_N]]
 uint64x2_t test_vshll_n_u32(uint32x2_t a) {
-// CHECK-LABEL: test_vshll_n_u32
   return vshll_n_u32(a, 19);
-// CHECK: ushll {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, #19
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vshll_high_n_s8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I]] to <8 x i16>
+// CHECK:   [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+// CHECK:   ret <8 x i16> [[VSHLL_N]]
 int16x8_t test_vshll_high_n_s8(int8x16_t a) {
-// CHECK-LABEL: test_vshll_high_n_s8
   return vshll_high_n_s8(a, 3);
-// CHECK: sshll2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, #3
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vshll_high_n_s16(<8 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], <i32 9, i32 9, i32 9, i32 9>
+// CHECK:   ret <4 x i32> [[VSHLL_N]]
 int32x4_t test_vshll_high_n_s16(int16x8_t a) {
-// CHECK-LABEL: test_vshll_high_n_s16
   return vshll_high_n_s16(a, 9);
-// CHECK: sshll2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, #9
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vshll_high_n_s32(<4 x i32> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], <i64 19, i64 19>
+// CHECK:   ret <2 x i64> [[VSHLL_N]]
 int64x2_t test_vshll_high_n_s32(int32x4_t a) {
-// CHECK-LABEL: test_vshll_high_n_s32
   return vshll_high_n_s32(a, 19);
-// CHECK: sshll2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, #19
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vshll_high_n_u8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I]] to <8 x i16>
+// CHECK:   [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
+// CHECK:   ret <8 x i16> [[VSHLL_N]]
 uint16x8_t test_vshll_high_n_u8(uint8x16_t a) {
-// CHECK-LABEL: test_vshll_high_n_u8
   return vshll_high_n_u8(a, 3);
-// CHECK: ushll2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, #3
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vshll_high_n_u16(<8 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], <i32 9, i32 9, i32 9, i32 9>
+// CHECK:   ret <4 x i32> [[VSHLL_N]]
 uint32x4_t test_vshll_high_n_u16(uint16x8_t a) {
-// CHECK-LABEL: test_vshll_high_n_u16
   return vshll_high_n_u16(a, 9);
-// CHECK: ushll2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, #9
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vshll_high_n_u32(<4 x i32> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], <i64 19, i64 19>
+// CHECK:   ret <2 x i64> [[VSHLL_N]]
 uint64x2_t test_vshll_high_n_u32(uint32x4_t a) {
-// CHECK-LABEL: test_vshll_high_n_u32
   return vshll_high_n_u32(a, 19);
-// CHECK: ushll2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, #19
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmovl_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VMOVL_I:%.*]] = sext <8 x i8> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[VMOVL_I]]
 int16x8_t test_vmovl_s8(int8x8_t a) {
-// CHECK-LABEL: test_vmovl_s8
   return vmovl_s8(a);
-// CHECK: sshll {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, #0
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmovl_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMOVL_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VMOVL_I]]
 int32x4_t test_vmovl_s16(int16x4_t a) {
-// CHECK-LABEL: test_vmovl_s16
   return vmovl_s16(a);
-// CHECK: sshll {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, #0
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmovl_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMOVL_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VMOVL_I]]
 int64x2_t test_vmovl_s32(int32x2_t a) {
-// CHECK-LABEL: test_vmovl_s32
   return vmovl_s32(a);
-// CHECK: sshll {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, #0
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmovl_u8(<8 x i8> %a) #0 {
+// CHECK:   [[VMOVL_I:%.*]] = zext <8 x i8> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[VMOVL_I]]
 uint16x8_t test_vmovl_u8(uint8x8_t a) {
-// CHECK-LABEL: test_vmovl_u8
   return vmovl_u8(a);
-// CHECK: ushll {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, #0
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmovl_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMOVL_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VMOVL_I]]
 uint32x4_t test_vmovl_u16(uint16x4_t a) {
-// CHECK-LABEL: test_vmovl_u16
   return vmovl_u16(a);
-// CHECK: ushll {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, #0
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmovl_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMOVL_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VMOVL_I]]
 uint64x2_t test_vmovl_u32(uint32x2_t a) {
-// CHECK-LABEL: test_vmovl_u32
   return vmovl_u32(a);
-// CHECK: ushll {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, #0
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmovl_high_s8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 int16x8_t test_vmovl_high_s8(int8x16_t a) {
-// CHECK-LABEL: test_vmovl_high_s8
   return vmovl_high_s8(a);
-// CHECK: sshll2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, #0
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmovl_high_s16(<8 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vmovl_high_s16(int16x8_t a) {
-// CHECK-LABEL: test_vmovl_high_s16
   return vmovl_high_s16(a);
-// CHECK: sshll2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, #0
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmovl_high_s32(<4 x i32> %a) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 int64x2_t test_vmovl_high_s32(int32x4_t a) {
-// CHECK-LABEL: test_vmovl_high_s32
   return vmovl_high_s32(a);
-// CHECK: sshll2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, #0
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmovl_high_u8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 uint16x8_t test_vmovl_high_u8(uint8x16_t a) {
-// CHECK-LABEL: test_vmovl_high_u8
   return vmovl_high_u8(a);
-// CHECK: ushll2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, #0
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmovl_high_u16(<8 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 uint32x4_t test_vmovl_high_u16(uint16x8_t a) {
-// CHECK-LABEL: test_vmovl_high_u16
   return vmovl_high_u16(a);
-// CHECK: ushll2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, #0
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmovl_high_u32(<4 x i32> %a) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 uint64x2_t test_vmovl_high_u32(uint32x4_t a) {
-// CHECK-LABEL: test_vmovl_high_u32
   return vmovl_high_u32(a);
-// CHECK: ushll2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, #0
 }
 
+// CHECK-LABEL: define <2 x float> @test_vcvt_n_f32_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VCVT_N1:%.*]] = call <2 x float> @llvm.aarch64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> [[VCVT_N]], i32 31)
+// CHECK:   ret <2 x float> [[VCVT_N1]]
 float32x2_t test_vcvt_n_f32_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vcvt_n_f32_s32
   return vcvt_n_f32_s32(a, 31);
-  // CHECK: scvtf {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #31
 }
 
+// CHECK-LABEL: define <4 x float> @test_vcvtq_n_f32_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VCVT_N1:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> [[VCVT_N]], i32 31)
+// CHECK:   ret <4 x float> [[VCVT_N1]]
 float32x4_t test_vcvtq_n_f32_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vcvtq_n_f32_s32
   return vcvtq_n_f32_s32(a, 31);
-  // CHECK: scvtf {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #31
 }
 
+// CHECK-LABEL: define <2 x double> @test_vcvtq_n_f64_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VCVT_N1:%.*]] = call <2 x double> @llvm.aarch64.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64> [[VCVT_N]], i32 50)
+// CHECK:   ret <2 x double> [[VCVT_N1]]
 float64x2_t test_vcvtq_n_f64_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vcvtq_n_f64_s64
   return vcvtq_n_f64_s64(a, 50);
-  // CHECK: scvtf {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #50
 }
 
+// CHECK-LABEL: define <2 x float> @test_vcvt_n_f32_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VCVT_N1:%.*]] = call <2 x float> @llvm.aarch64.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> [[VCVT_N]], i32 31)
+// CHECK:   ret <2 x float> [[VCVT_N1]]
 float32x2_t test_vcvt_n_f32_u32(uint32x2_t a) {
-  // CHECK-LABEL: test_vcvt_n_f32_u32
   return vcvt_n_f32_u32(a, 31);
-  // CHECK: ucvtf {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #31
 }
 
+// CHECK-LABEL: define <4 x float> @test_vcvtq_n_f32_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VCVT_N1:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> [[VCVT_N]], i32 31)
+// CHECK:   ret <4 x float> [[VCVT_N1]]
 float32x4_t test_vcvtq_n_f32_u32(uint32x4_t a) {
-  // CHECK-LABEL: test_vcvtq_n_f32_u32
   return vcvtq_n_f32_u32(a, 31);
-  // CHECK: ucvtf {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #31
 }
 
+// CHECK-LABEL: define <2 x double> @test_vcvtq_n_f64_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VCVT_N1:%.*]] = call <2 x double> @llvm.aarch64.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64> [[VCVT_N]], i32 50)
+// CHECK:   ret <2 x double> [[VCVT_N1]]
 float64x2_t test_vcvtq_n_f64_u64(uint64x2_t a) {
-  // CHECK-LABEL: test_vcvtq_n_f64_u64
   return vcvtq_n_f64_u64(a, 50);
-  // CHECK: ucvtf {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #50
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vcvt_n_s32_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VCVT_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> [[VCVT_N]], i32 31)
+// CHECK:   ret <2 x i32> [[VCVT_N1]]
 int32x2_t test_vcvt_n_s32_f32(float32x2_t a) {
-  // CHECK-LABEL: test_vcvt_n_s32_f32
   return vcvt_n_s32_f32(a, 31);
-  // CHECK: fcvtzs {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #31
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcvtq_n_s32_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VCVT_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float> [[VCVT_N]], i32 31)
+// CHECK:   ret <4 x i32> [[VCVT_N1]]
 int32x4_t test_vcvtq_n_s32_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vcvtq_n_s32_f32
   return vcvtq_n_s32_f32(a, 31);
-  // CHECK: fcvtzs {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #31
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vcvtq_n_s64_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VCVT_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.vcvtfp2fxs.v2i64.v2f64(<2 x double> [[VCVT_N]], i32 50)
+// CHECK:   ret <2 x i64> [[VCVT_N1]]
 int64x2_t test_vcvtq_n_s64_f64(float64x2_t a) {
-  // CHECK-LABEL: test_vcvtq_n_s64_f64
   return vcvtq_n_s64_f64(a, 50);
-  // CHECK: fcvtzs {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #50
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vcvt_n_u32_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VCVT_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float> [[VCVT_N]], i32 31)
+// CHECK:   ret <2 x i32> [[VCVT_N1]]
 uint32x2_t test_vcvt_n_u32_f32(float32x2_t a) {
-  // CHECK-LABEL: test_vcvt_n_u32_f32
   return vcvt_n_u32_f32(a, 31);
-  // CHECK: fcvtzu {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #31
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcvtq_n_u32_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VCVT_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float> [[VCVT_N]], i32 31)
+// CHECK:   ret <4 x i32> [[VCVT_N1]]
 uint32x4_t test_vcvtq_n_u32_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vcvtq_n_u32_f32
   return vcvtq_n_u32_f32(a, 31);
-  // CHECK: fcvtzu {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #31
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vcvtq_n_u64_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VCVT_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.vcvtfp2fxu.v2i64.v2f64(<2 x double> [[VCVT_N]], i32 50)
+// CHECK:   ret <2 x i64> [[VCVT_N1]]
 uint64x2_t test_vcvtq_n_u64_f64(float64x2_t a) {
-  // CHECK-LABEL: test_vcvtq_n_u64_f64
   return vcvtq_n_u64_f64(a, 50);
-  // CHECK: fcvtzu {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #50
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vaddl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VMOVL_I_I:%.*]] = sext <8 x i8> %a to <8 x i16>
+// CHECK:   [[VMOVL_I4_I:%.*]] = sext <8 x i8> %b to <8 x i16>
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 int16x8_t test_vaddl_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vaddl_s8
   return vaddl_s8(a, b);
-  // CHECK: saddl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vaddl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMOVL_I_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VMOVL_I4_I:%.*]] = sext <4 x i16> [[TMP3]] to <4 x i32>
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 int32x4_t test_vaddl_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vaddl_s16
   return vaddl_s16(a, b);
-  // CHECK: saddl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vaddl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMOVL_I_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VMOVL_I4_I:%.*]] = sext <2 x i32> [[TMP3]] to <2 x i64>
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 int64x2_t test_vaddl_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vaddl_s32
   return vaddl_s32(a, b);
-  // CHECK: saddl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vaddl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <8 x i8> %a to <8 x i16>
+// CHECK:   [[VMOVL_I4_I:%.*]] = zext <8 x i8> %b to <8 x i16>
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 uint16x8_t test_vaddl_u8(uint8x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vaddl_u8
   return vaddl_u8(a, b);
-  // CHECK: uaddl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vaddl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VMOVL_I4_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 uint32x4_t test_vaddl_u16(uint16x4_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vaddl_u16
   return vaddl_u16(a, b);
-  // CHECK: uaddl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vaddl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VMOVL_I4_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 uint64x2_t test_vaddl_u32(uint32x2_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vaddl_u32
   return vaddl_u32(a, b);
-  // CHECK: uaddl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vaddl_high_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
+// CHECK:   [[SHUFFLE_I_I10_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[TMP1:%.*]] = sext <8 x i8> [[SHUFFLE_I_I10_I]] to <8 x i16>
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> [[TMP0]], [[TMP1]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 int16x8_t test_vaddl_high_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vaddl_high_s8
   return vaddl_high_s8(a, b);
-  // CHECK: saddl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vaddl_high_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   [[SHUFFLE_I_I10_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I10_I]] to <8 x i8>
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK:   [[TMP5:%.*]] = sext <4 x i16> [[TMP4]] to <4 x i32>
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> [[TMP2]], [[TMP5]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 int32x4_t test_vaddl_high_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vaddl_high_s16
   return vaddl_high_s16(a, b);
-  // CHECK: saddl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vaddl_high_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   [[SHUFFLE_I_I10_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I10_I]] to <8 x i8>
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK:   [[TMP5:%.*]] = sext <2 x i32> [[TMP4]] to <2 x i64>
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> [[TMP2]], [[TMP5]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 int64x2_t test_vaddl_high_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vaddl_high_s32
   return vaddl_high_s32(a, b);
-  // CHECK: saddl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vaddl_high_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
+// CHECK:   [[SHUFFLE_I_I10_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[TMP1:%.*]] = zext <8 x i8> [[SHUFFLE_I_I10_I]] to <8 x i16>
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> [[TMP0]], [[TMP1]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 uint16x8_t test_vaddl_high_u8(uint8x16_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vaddl_high_u8
   return vaddl_high_u8(a, b);
-  // CHECK: uaddl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vaddl_high_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   [[SHUFFLE_I_I10_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I10_I]] to <8 x i8>
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK:   [[TMP5:%.*]] = zext <4 x i16> [[TMP4]] to <4 x i32>
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> [[TMP2]], [[TMP5]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 uint32x4_t test_vaddl_high_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vaddl_high_u16
   return vaddl_high_u16(a, b);
-  // CHECK: uaddl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vaddl_high_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   [[SHUFFLE_I_I10_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I10_I]] to <8 x i8>
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK:   [[TMP5:%.*]] = zext <2 x i32> [[TMP4]] to <2 x i64>
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> [[TMP2]], [[TMP5]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 uint64x2_t test_vaddl_high_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vaddl_high_u32
   return vaddl_high_u32(a, b);
-  // CHECK: uaddl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vaddw_s8(<8 x i16> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VMOVL_I_I:%.*]] = sext <8 x i8> %b to <8 x i16>
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 int16x8_t test_vaddw_s8(int16x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vaddw_s8
   return vaddw_s8(a, b);
-  // CHECK: saddw {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vaddw_s16(<4 x i32> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMOVL_I_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 int32x4_t test_vaddw_s16(int32x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vaddw_s16
   return vaddw_s16(a, b);
-  // CHECK: saddw {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vaddw_s32(<2 x i64> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMOVL_I_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 int64x2_t test_vaddw_s32(int64x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vaddw_s32
   return vaddw_s32(a, b);
-  // CHECK: saddw {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vaddw_u8(<8 x i16> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <8 x i8> %b to <8 x i16>
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 uint16x8_t test_vaddw_u8(uint16x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vaddw_u8
   return vaddw_u8(a, b);
-  // CHECK: uaddw {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vaddw_u16(<4 x i32> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 uint32x4_t test_vaddw_u16(uint32x4_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vaddw_u16
   return vaddw_u16(a, b);
-  // CHECK: uaddw {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vaddw_u32(<2 x i64> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 uint64x2_t test_vaddw_u32(uint64x2_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vaddw_u32
   return vaddw_u32(a, b);
-  // CHECK: uaddw {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vaddw_high_s8(<8 x i16> %a, <16 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[TMP0]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 int16x8_t test_vaddw_high_s8(int16x8_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vaddw_high_s8
   return vaddw_high_s8(a, b);
-  // CHECK: saddw2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vaddw_high_s16(<4 x i32> %a, <8 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[TMP2]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 int32x4_t test_vaddw_high_s16(int32x4_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vaddw_high_s16
   return vaddw_high_s16(a, b);
-  // CHECK: saddw2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vaddw_high_s32(<2 x i64> %a, <4 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[TMP2]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 int64x2_t test_vaddw_high_s32(int64x2_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vaddw_high_s32
   return vaddw_high_s32(a, b);
-  // CHECK: saddw2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vaddw_high_u8(<8 x i16> %a, <16 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[TMP0]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 uint16x8_t test_vaddw_high_u8(uint16x8_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vaddw_high_u8
   return vaddw_high_u8(a, b);
-  // CHECK: uaddw2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vaddw_high_u16(<4 x i32> %a, <8 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[TMP2]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 uint32x4_t test_vaddw_high_u16(uint32x4_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vaddw_high_u16
   return vaddw_high_u16(a, b);
-  // CHECK: uaddw2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vaddw_high_u32(<2 x i64> %a, <4 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[TMP2]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 uint64x2_t test_vaddw_high_u32(uint64x2_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vaddw_high_u32
   return vaddw_high_u32(a, b);
-  // CHECK: uaddw2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vsubl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VMOVL_I_I:%.*]] = sext <8 x i8> %a to <8 x i16>
+// CHECK:   [[VMOVL_I4_I:%.*]] = sext <8 x i8> %b to <8 x i16>
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK:   ret <8 x i16> [[SUB_I]]
 int16x8_t test_vsubl_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vsubl_s8
   return vsubl_s8(a, b);
-  // CHECK: ssubl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vsubl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMOVL_I_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VMOVL_I4_I:%.*]] = sext <4 x i16> [[TMP3]] to <4 x i32>
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 int32x4_t test_vsubl_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vsubl_s16
   return vsubl_s16(a, b);
-  // CHECK: ssubl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vsubl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMOVL_I_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VMOVL_I4_I:%.*]] = sext <2 x i32> [[TMP3]] to <2 x i64>
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK:   ret <2 x i64> [[SUB_I]]
 int64x2_t test_vsubl_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vsubl_s32
   return vsubl_s32(a, b);
-  // CHECK: ssubl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vsubl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <8 x i8> %a to <8 x i16>
+// CHECK:   [[VMOVL_I4_I:%.*]] = zext <8 x i8> %b to <8 x i16>
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK:   ret <8 x i16> [[SUB_I]]
 uint16x8_t test_vsubl_u8(uint8x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vsubl_u8
   return vsubl_u8(a, b);
-  // CHECK: usubl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vsubl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VMOVL_I4_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 uint32x4_t test_vsubl_u16(uint16x4_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vsubl_u16
   return vsubl_u16(a, b);
-  // CHECK: usubl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vsubl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VMOVL_I4_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK:   ret <2 x i64> [[SUB_I]]
 uint64x2_t test_vsubl_u32(uint32x2_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vsubl_u32
   return vsubl_u32(a, b);
-  // CHECK: usubl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vsubl_high_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
+// CHECK:   [[SHUFFLE_I_I10_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[TMP1:%.*]] = sext <8 x i8> [[SHUFFLE_I_I10_I]] to <8 x i16>
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> [[TMP0]], [[TMP1]]
+// CHECK:   ret <8 x i16> [[SUB_I]]
 int16x8_t test_vsubl_high_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vsubl_high_s8
   return vsubl_high_s8(a, b);
-  // CHECK: ssubl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vsubl_high_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   [[SHUFFLE_I_I10_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I10_I]] to <8 x i8>
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK:   [[TMP5:%.*]] = sext <4 x i16> [[TMP4]] to <4 x i32>
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> [[TMP2]], [[TMP5]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 int32x4_t test_vsubl_high_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vsubl_high_s16
   return vsubl_high_s16(a, b);
-  // CHECK: ssubl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vsubl_high_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   [[SHUFFLE_I_I10_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I10_I]] to <8 x i8>
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK:   [[TMP5:%.*]] = sext <2 x i32> [[TMP4]] to <2 x i64>
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> [[TMP2]], [[TMP5]]
+// CHECK:   ret <2 x i64> [[SUB_I]]
 int64x2_t test_vsubl_high_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vsubl_high_s32
   return vsubl_high_s32(a, b);
-  // CHECK: ssubl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vsubl_high_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
+// CHECK:   [[SHUFFLE_I_I10_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[TMP1:%.*]] = zext <8 x i8> [[SHUFFLE_I_I10_I]] to <8 x i16>
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> [[TMP0]], [[TMP1]]
+// CHECK:   ret <8 x i16> [[SUB_I]]
 uint16x8_t test_vsubl_high_u8(uint8x16_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vsubl_high_u8
   return vsubl_high_u8(a, b);
-  // CHECK: usubl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vsubl_high_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   [[SHUFFLE_I_I10_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I10_I]] to <8 x i8>
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK:   [[TMP5:%.*]] = zext <4 x i16> [[TMP4]] to <4 x i32>
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> [[TMP2]], [[TMP5]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 uint32x4_t test_vsubl_high_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vsubl_high_u16
   return vsubl_high_u16(a, b);
-  // CHECK: usubl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vsubl_high_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   [[SHUFFLE_I_I10_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I10_I]] to <8 x i8>
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK:   [[TMP5:%.*]] = zext <2 x i32> [[TMP4]] to <2 x i64>
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> [[TMP2]], [[TMP5]]
+// CHECK:   ret <2 x i64> [[SUB_I]]
 uint64x2_t test_vsubl_high_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vsubl_high_u32
   return vsubl_high_u32(a, b);
-  // CHECK: usubl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vsubw_s8(<8 x i16> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VMOVL_I_I:%.*]] = sext <8 x i8> %b to <8 x i16>
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMOVL_I_I]]
+// CHECK:   ret <8 x i16> [[SUB_I]]
 int16x8_t test_vsubw_s8(int16x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vsubw_s8
   return vsubw_s8(a, b);
-  // CHECK: ssubw {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vsubw_s16(<4 x i32> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMOVL_I_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMOVL_I_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 int32x4_t test_vsubw_s16(int32x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vsubw_s16
   return vsubw_s16(a, b);
-  // CHECK: ssubw {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vsubw_s32(<2 x i64> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMOVL_I_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMOVL_I_I]]
+// CHECK:   ret <2 x i64> [[SUB_I]]
 int64x2_t test_vsubw_s32(int64x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vsubw_s32
   return vsubw_s32(a, b);
-  // CHECK: ssubw {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vsubw_u8(<8 x i16> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <8 x i8> %b to <8 x i16>
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMOVL_I_I]]
+// CHECK:   ret <8 x i16> [[SUB_I]]
 uint16x8_t test_vsubw_u8(uint16x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vsubw_u8
   return vsubw_u8(a, b);
-  // CHECK: usubw {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vsubw_u16(<4 x i32> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMOVL_I_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 uint32x4_t test_vsubw_u16(uint32x4_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vsubw_u16
   return vsubw_u16(a, b);
-  // CHECK: usubw {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vsubw_u32(<2 x i64> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMOVL_I_I]]
+// CHECK:   ret <2 x i64> [[SUB_I]]
 uint64x2_t test_vsubw_u32(uint64x2_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vsubw_u32
   return vsubw_u32(a, b);
-  // CHECK: usubw {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vsubw_high_s8(<8 x i16> %a, <16 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[TMP0]]
+// CHECK:   ret <8 x i16> [[SUB_I]]
 int16x8_t test_vsubw_high_s8(int16x8_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vsubw_high_s8
   return vsubw_high_s8(a, b);
-  // CHECK: ssubw2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vsubw_high_s16(<4 x i32> %a, <8 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[TMP2]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 int32x4_t test_vsubw_high_s16(int32x4_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vsubw_high_s16
   return vsubw_high_s16(a, b);
-  // CHECK: ssubw2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vsubw_high_s32(<2 x i64> %a, <4 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[TMP2]]
+// CHECK:   ret <2 x i64> [[SUB_I]]
 int64x2_t test_vsubw_high_s32(int64x2_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vsubw_high_s32
   return vsubw_high_s32(a, b);
-  // CHECK: ssubw2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vsubw_high_u8(<8 x i16> %a, <16 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I_I_I]] to <8 x i16>
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[TMP0]]
+// CHECK:   ret <8 x i16> [[SUB_I]]
 uint16x8_t test_vsubw_high_u8(uint16x8_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vsubw_high_u8
   return vsubw_high_u8(a, b);
-  // CHECK: usubw2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vsubw_high_u16(<4 x i32> %a, <8 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[TMP2]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 uint32x4_t test_vsubw_high_u16(uint32x4_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vsubw_high_u16
   return vsubw_high_u16(a, b);
-  // CHECK: usubw2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vsubw_high_u32(<2 x i64> %a, <4 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[TMP2]]
+// CHECK:   ret <2 x i64> [[SUB_I]]
 uint64x2_t test_vsubw_high_u32(uint64x2_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vsubw_high_u32
   return vsubw_high_u32(a, b);
-  // CHECK: usubw2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vaddhn_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VADDHN_I:%.*]] = add <8 x i16> [[TMP2]], [[TMP3]]
+// CHECK:   [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+// CHECK:   [[VADDHN2_I:%.*]] = trunc <8 x i16> [[VADDHN1_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VADDHN2_I]]
 int8x8_t test_vaddhn_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vaddhn_s16
   return vaddhn_s16(a, b);
-  // CHECK: addhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vaddhn_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VADDHN_I:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
+// CHECK:   [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], <i32 16, i32 16, i32 16, i32 16>
+// CHECK:   [[VADDHN2_I:%.*]] = trunc <4 x i32> [[VADDHN1_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VADDHN2_I]]
 int16x4_t test_vaddhn_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vaddhn_s32
   return vaddhn_s32(a, b);
-  // CHECK: addhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vaddhn_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VADDHN_I:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]]
+// CHECK:   [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], <i64 32, i64 32>
+// CHECK:   [[VADDHN2_I:%.*]] = trunc <2 x i64> [[VADDHN1_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VADDHN2_I]]
 int32x2_t test_vaddhn_s64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vaddhn_s64
   return vaddhn_s64(a, b);
-  // CHECK: addhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vaddhn_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VADDHN_I:%.*]] = add <8 x i16> [[TMP2]], [[TMP3]]
+// CHECK:   [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+// CHECK:   [[VADDHN2_I:%.*]] = trunc <8 x i16> [[VADDHN1_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VADDHN2_I]]
 uint8x8_t test_vaddhn_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vaddhn_u16
   return vaddhn_u16(a, b);
-  // CHECK: addhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vaddhn_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VADDHN_I:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
+// CHECK:   [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], <i32 16, i32 16, i32 16, i32 16>
+// CHECK:   [[VADDHN2_I:%.*]] = trunc <4 x i32> [[VADDHN1_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VADDHN2_I]]
 uint16x4_t test_vaddhn_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vaddhn_u32
   return vaddhn_u32(a, b);
-  // CHECK: addhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vaddhn_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VADDHN_I:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]]
+// CHECK:   [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], <i64 32, i64 32>
+// CHECK:   [[VADDHN2_I:%.*]] = trunc <2 x i64> [[VADDHN1_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VADDHN2_I]]
 uint32x2_t test_vaddhn_u64(uint64x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vaddhn_u64
   return vaddhn_u64(a, b);
-  // CHECK: addhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vaddhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VADDHN_I_I:%.*]] = add <8 x i16> [[TMP2]], [[TMP3]]
+// CHECK:   [[VADDHN1_I_I:%.*]] = lshr <8 x i16> [[VADDHN_I_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+// CHECK:   [[VADDHN2_I_I:%.*]] = trunc <8 x i16> [[VADDHN1_I_I]] to <8 x i8>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VADDHN2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I_I]]
 int8x16_t test_vaddhn_high_s16(int8x8_t r, int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vaddhn_high_s16
   return vaddhn_high_s16(r, a, b);
-  // CHECK: addhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vaddhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VADDHN_I_I:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
+// CHECK:   [[VADDHN1_I_I:%.*]] = lshr <4 x i32> [[VADDHN_I_I]], <i32 16, i32 16, i32 16, i32 16>
+// CHECK:   [[VADDHN2_I_I:%.*]] = trunc <4 x i32> [[VADDHN1_I_I]] to <4 x i16>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[VADDHN2_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I_I]]
 int16x8_t test_vaddhn_high_s32(int16x4_t r, int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vaddhn_high_s32
   return vaddhn_high_s32(r, a, b);
-  // CHECK: addhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vaddhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VADDHN_I_I:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]]
+// CHECK:   [[VADDHN1_I_I:%.*]] = lshr <2 x i64> [[VADDHN_I_I]], <i64 32, i64 32>
+// CHECK:   [[VADDHN2_I_I:%.*]] = trunc <2 x i64> [[VADDHN1_I_I]] to <2 x i32>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[VADDHN2_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I_I]]
 int32x4_t test_vaddhn_high_s64(int32x2_t r, int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vaddhn_high_s64
   return vaddhn_high_s64(r, a, b);
-  // CHECK: addhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vaddhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VADDHN_I_I:%.*]] = add <8 x i16> [[TMP2]], [[TMP3]]
+// CHECK:   [[VADDHN1_I_I:%.*]] = lshr <8 x i16> [[VADDHN_I_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+// CHECK:   [[VADDHN2_I_I:%.*]] = trunc <8 x i16> [[VADDHN1_I_I]] to <8 x i8>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VADDHN2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I_I]]
 uint8x16_t test_vaddhn_high_u16(uint8x8_t r, uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vaddhn_high_u16
   return vaddhn_high_u16(r, a, b);
-  // CHECK: addhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vaddhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VADDHN_I_I:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
+// CHECK:   [[VADDHN1_I_I:%.*]] = lshr <4 x i32> [[VADDHN_I_I]], <i32 16, i32 16, i32 16, i32 16>
+// CHECK:   [[VADDHN2_I_I:%.*]] = trunc <4 x i32> [[VADDHN1_I_I]] to <4 x i16>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[VADDHN2_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I_I]]
 uint16x8_t test_vaddhn_high_u32(uint16x4_t r, uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vaddhn_high_u32
   return vaddhn_high_u32(r, a, b);
-  // CHECK: addhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vaddhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VADDHN_I_I:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]]
+// CHECK:   [[VADDHN1_I_I:%.*]] = lshr <2 x i64> [[VADDHN_I_I]], <i64 32, i64 32>
+// CHECK:   [[VADDHN2_I_I:%.*]] = trunc <2 x i64> [[VADDHN1_I_I]] to <2 x i32>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[VADDHN2_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I_I]]
 uint32x4_t test_vaddhn_high_u64(uint32x2_t r, uint64x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vaddhn_high_u64
   return vaddhn_high_u64(r, a, b);
-  // CHECK: addhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vraddhn_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> [[VRADDHN_V_I]], <8 x i16> [[VRADDHN_V1_I]]) #4
+// CHECK:   ret <8 x i8> [[VRADDHN_V2_I]]
 int8x8_t test_vraddhn_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vraddhn_s16
   return vraddhn_s16(a, b);
-  // CHECK: raddhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vraddhn_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> [[VRADDHN_V_I]], <4 x i32> [[VRADDHN_V1_I]]) #4
+// CHECK:   [[VRADDHN_V3_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vraddhn_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vraddhn_s32
   return vraddhn_s32(a, b);
-  // CHECK: raddhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vraddhn_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> [[VRADDHN_V_I]], <2 x i64> [[VRADDHN_V1_I]]) #4
+// CHECK:   [[VRADDHN_V3_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vraddhn_s64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vraddhn_s64
   return vraddhn_s64(a, b);
-  // CHECK: raddhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vraddhn_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> [[VRADDHN_V_I]], <8 x i16> [[VRADDHN_V1_I]]) #4
+// CHECK:   ret <8 x i8> [[VRADDHN_V2_I]]
 uint8x8_t test_vraddhn_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vraddhn_u16
   return vraddhn_u16(a, b);
-  // CHECK: raddhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vraddhn_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> [[VRADDHN_V_I]], <4 x i32> [[VRADDHN_V1_I]]) #4
+// CHECK:   [[VRADDHN_V3_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 uint16x4_t test_vraddhn_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vraddhn_u32
   return vraddhn_u32(a, b);
-  // CHECK: raddhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vraddhn_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> [[VRADDHN_V_I]], <2 x i64> [[VRADDHN_V1_I]]) #4
+// CHECK:   [[VRADDHN_V3_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 uint32x2_t test_vraddhn_u64(uint64x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vraddhn_u64
   return vraddhn_u64(a, b);
-  // CHECK: raddhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vraddhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VRADDHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VRADDHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VRADDHN_V2_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> [[VRADDHN_V_I_I]], <8 x i16> [[VRADDHN_V1_I_I]]) #4
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VRADDHN_V2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I_I]]
 int8x16_t test_vraddhn_high_s16(int8x8_t r, int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vraddhn_high_s16
   return vraddhn_high_s16(r, a, b);
-  // CHECK: raddhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vraddhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VRADDHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VRADDHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VRADDHN_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> [[VRADDHN_V_I_I]], <4 x i32> [[VRADDHN_V1_I_I]]) #4
+// CHECK:   [[VRADDHN_V3_I_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I_I]] to <4 x i16>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I_I]]
 int16x8_t test_vraddhn_high_s32(int16x4_t r, int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vraddhn_high_s32
   return vraddhn_high_s32(r, a, b);
-  // CHECK: raddhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vraddhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VRADDHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VRADDHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VRADDHN_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> [[VRADDHN_V_I_I]], <2 x i64> [[VRADDHN_V1_I_I]]) #4
+// CHECK:   [[VRADDHN_V3_I_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I_I]] to <2 x i32>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I_I]]
 int32x4_t test_vraddhn_high_s64(int32x2_t r, int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vraddhn_high_s64
   return vraddhn_high_s64(r, a, b);
-  // CHECK: raddhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vraddhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VRADDHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VRADDHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VRADDHN_V2_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> [[VRADDHN_V_I_I]], <8 x i16> [[VRADDHN_V1_I_I]]) #4
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VRADDHN_V2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I_I]]
 uint8x16_t test_vraddhn_high_u16(uint8x8_t r, uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vraddhn_high_u16
   return vraddhn_high_u16(r, a, b);
-  // CHECK: raddhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vraddhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VRADDHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VRADDHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VRADDHN_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> [[VRADDHN_V_I_I]], <4 x i32> [[VRADDHN_V1_I_I]]) #4
+// CHECK:   [[VRADDHN_V3_I_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I_I]] to <4 x i16>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I_I]]
 uint16x8_t test_vraddhn_high_u32(uint16x4_t r, uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vraddhn_high_u32
   return vraddhn_high_u32(r, a, b);
-  // CHECK: raddhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vraddhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VRADDHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VRADDHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VRADDHN_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> [[VRADDHN_V_I_I]], <2 x i64> [[VRADDHN_V1_I_I]]) #4
+// CHECK:   [[VRADDHN_V3_I_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I_I]] to <2 x i32>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I_I]]
 uint32x4_t test_vraddhn_high_u64(uint32x2_t r, uint64x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vraddhn_high_u64
   return vraddhn_high_u64(r, a, b);
-  // CHECK: raddhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vsubhn_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VSUBHN_I:%.*]] = sub <8 x i16> [[TMP2]], [[TMP3]]
+// CHECK:   [[VSUBHN1_I:%.*]] = lshr <8 x i16> [[VSUBHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+// CHECK:   [[VSUBHN2_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VSUBHN2_I]]
 int8x8_t test_vsubhn_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vsubhn_s16
   return vsubhn_s16(a, b);
-  // CHECK: subhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vsubhn_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VSUBHN_I:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
+// CHECK:   [[VSUBHN1_I:%.*]] = lshr <4 x i32> [[VSUBHN_I]], <i32 16, i32 16, i32 16, i32 16>
+// CHECK:   [[VSUBHN2_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VSUBHN2_I]]
 int16x4_t test_vsubhn_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vsubhn_s32
   return vsubhn_s32(a, b);
-  // CHECK: subhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vsubhn_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSUBHN_I:%.*]] = sub <2 x i64> [[TMP2]], [[TMP3]]
+// CHECK:   [[VSUBHN1_I:%.*]] = lshr <2 x i64> [[VSUBHN_I]], <i64 32, i64 32>
+// CHECK:   [[VSUBHN2_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VSUBHN2_I]]
 int32x2_t test_vsubhn_s64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vsubhn_s64
   return vsubhn_s64(a, b);
-  // CHECK: subhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vsubhn_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VSUBHN_I:%.*]] = sub <8 x i16> [[TMP2]], [[TMP3]]
+// CHECK:   [[VSUBHN1_I:%.*]] = lshr <8 x i16> [[VSUBHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+// CHECK:   [[VSUBHN2_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VSUBHN2_I]]
 uint8x8_t test_vsubhn_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vsubhn_u16
   return vsubhn_u16(a, b);
-  // CHECK: subhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vsubhn_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VSUBHN_I:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
+// CHECK:   [[VSUBHN1_I:%.*]] = lshr <4 x i32> [[VSUBHN_I]], <i32 16, i32 16, i32 16, i32 16>
+// CHECK:   [[VSUBHN2_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VSUBHN2_I]]
 uint16x4_t test_vsubhn_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vsubhn_u32
   return vsubhn_u32(a, b);
-  // CHECK: subhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vsubhn_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSUBHN_I:%.*]] = sub <2 x i64> [[TMP2]], [[TMP3]]
+// CHECK:   [[VSUBHN1_I:%.*]] = lshr <2 x i64> [[VSUBHN_I]], <i64 32, i64 32>
+// CHECK:   [[VSUBHN2_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VSUBHN2_I]]
 uint32x2_t test_vsubhn_u64(uint64x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vsubhn_u64
   return vsubhn_u64(a, b);
-  // CHECK: subhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vsubhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VSUBHN_I_I:%.*]] = sub <8 x i16> [[TMP2]], [[TMP3]]
+// CHECK:   [[VSUBHN1_I_I:%.*]] = lshr <8 x i16> [[VSUBHN_I_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+// CHECK:   [[VSUBHN2_I_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I_I]] to <8 x i8>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VSUBHN2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I_I]]
 int8x16_t test_vsubhn_high_s16(int8x8_t r, int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vsubhn_high_s16
   return vsubhn_high_s16(r, a, b);
-  // CHECK: subhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vsubhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VSUBHN_I_I:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
+// CHECK:   [[VSUBHN1_I_I:%.*]] = lshr <4 x i32> [[VSUBHN_I_I]], <i32 16, i32 16, i32 16, i32 16>
+// CHECK:   [[VSUBHN2_I_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I_I]] to <4 x i16>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[VSUBHN2_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I_I]]
 int16x8_t test_vsubhn_high_s32(int16x4_t r, int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vsubhn_high_s32
   return vsubhn_high_s32(r, a, b);
-  // CHECK: subhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vsubhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSUBHN_I_I:%.*]] = sub <2 x i64> [[TMP2]], [[TMP3]]
+// CHECK:   [[VSUBHN1_I_I:%.*]] = lshr <2 x i64> [[VSUBHN_I_I]], <i64 32, i64 32>
+// CHECK:   [[VSUBHN2_I_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I_I]] to <2 x i32>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[VSUBHN2_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I_I]]
 int32x4_t test_vsubhn_high_s64(int32x2_t r, int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vsubhn_high_s64
   return vsubhn_high_s64(r, a, b);
-  // CHECK: subhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vsubhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VSUBHN_I_I:%.*]] = sub <8 x i16> [[TMP2]], [[TMP3]]
+// CHECK:   [[VSUBHN1_I_I:%.*]] = lshr <8 x i16> [[VSUBHN_I_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+// CHECK:   [[VSUBHN2_I_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I_I]] to <8 x i8>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VSUBHN2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I_I]]
 uint8x16_t test_vsubhn_high_u16(uint8x8_t r, uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vsubhn_high_u16
   return vsubhn_high_u16(r, a, b);
-  // CHECK: subhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vsubhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VSUBHN_I_I:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
+// CHECK:   [[VSUBHN1_I_I:%.*]] = lshr <4 x i32> [[VSUBHN_I_I]], <i32 16, i32 16, i32 16, i32 16>
+// CHECK:   [[VSUBHN2_I_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I_I]] to <4 x i16>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[VSUBHN2_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I_I]]
 uint16x8_t test_vsubhn_high_u32(uint16x4_t r, uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vsubhn_high_u32
   return vsubhn_high_u32(r, a, b);
-  // CHECK: subhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vsubhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSUBHN_I_I:%.*]] = sub <2 x i64> [[TMP2]], [[TMP3]]
+// CHECK:   [[VSUBHN1_I_I:%.*]] = lshr <2 x i64> [[VSUBHN_I_I]], <i64 32, i64 32>
+// CHECK:   [[VSUBHN2_I_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I_I]] to <2 x i32>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[VSUBHN2_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I_I]]
 uint32x4_t test_vsubhn_high_u64(uint32x2_t r, uint64x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vsubhn_high_u64
   return vsubhn_high_u64(r, a, b);
-  // CHECK: subhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vrsubhn_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> [[VRSUBHN_V_I]], <8 x i16> [[VRSUBHN_V1_I]]) #4
+// CHECK:   ret <8 x i8> [[VRSUBHN_V2_I]]
 int8x8_t test_vrsubhn_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vrsubhn_s16
   return vrsubhn_s16(a, b);
-  // CHECK: rsubhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vrsubhn_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> [[VRSUBHN_V_I]], <4 x i32> [[VRSUBHN_V1_I]]) #4
+// CHECK:   [[VRSUBHN_V3_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vrsubhn_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vrsubhn_s32
   return vrsubhn_s32(a, b);
-  // CHECK: rsubhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vrsubhn_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> [[VRSUBHN_V_I]], <2 x i64> [[VRSUBHN_V1_I]]) #4
+// CHECK:   [[VRSUBHN_V3_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vrsubhn_s64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vrsubhn_s64
   return vrsubhn_s64(a, b);
-  // CHECK: rsubhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vrsubhn_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> [[VRSUBHN_V_I]], <8 x i16> [[VRSUBHN_V1_I]]) #4
+// CHECK:   ret <8 x i8> [[VRSUBHN_V2_I]]
 uint8x8_t test_vrsubhn_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vrsubhn_u16
   return vrsubhn_u16(a, b);
-  // CHECK: rsubhn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vrsubhn_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> [[VRSUBHN_V_I]], <4 x i32> [[VRSUBHN_V1_I]]) #4
+// CHECK:   [[VRSUBHN_V3_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 uint16x4_t test_vrsubhn_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vrsubhn_u32
   return vrsubhn_u32(a, b);
-  // CHECK: rsubhn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vrsubhn_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> [[VRSUBHN_V_I]], <2 x i64> [[VRSUBHN_V1_I]]) #4
+// CHECK:   [[VRSUBHN_V3_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 uint32x2_t test_vrsubhn_u64(uint64x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vrsubhn_u64
   return vrsubhn_u64(a, b);
-  // CHECK: rsubhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vrsubhn_high_s16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VRSUBHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VRSUBHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VRSUBHN_V2_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> [[VRSUBHN_V_I_I]], <8 x i16> [[VRSUBHN_V1_I_I]]) #4
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VRSUBHN_V2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I_I]]
 int8x16_t test_vrsubhn_high_s16(int8x8_t r, int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vrsubhn_high_s16
   return vrsubhn_high_s16(r, a, b);
-  // CHECK: rsubhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vrsubhn_high_s32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VRSUBHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VRSUBHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VRSUBHN_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> [[VRSUBHN_V_I_I]], <4 x i32> [[VRSUBHN_V1_I_I]]) #4
+// CHECK:   [[VRSUBHN_V3_I_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I_I]] to <4 x i16>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I_I]]
 int16x8_t test_vrsubhn_high_s32(int16x4_t r, int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vrsubhn_high_s32
   return vrsubhn_high_s32(r, a, b);
-  // CHECK: rsubhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vrsubhn_high_s64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VRSUBHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VRSUBHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VRSUBHN_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> [[VRSUBHN_V_I_I]], <2 x i64> [[VRSUBHN_V1_I_I]]) #4
+// CHECK:   [[VRSUBHN_V3_I_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I_I]] to <2 x i32>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I_I]]
 int32x4_t test_vrsubhn_high_s64(int32x2_t r, int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vrsubhn_high_s64
   return vrsubhn_high_s64(r, a, b);
-  // CHECK: rsubhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vrsubhn_high_u16(<8 x i8> %r, <8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VRSUBHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VRSUBHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VRSUBHN_V2_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rsubhn.v8i8(<8 x i16> [[VRSUBHN_V_I_I]], <8 x i16> [[VRSUBHN_V1_I_I]]) #4
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %r, <8 x i8> [[VRSUBHN_V2_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I_I]]
 uint8x16_t test_vrsubhn_high_u16(uint8x8_t r, uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vrsubhn_high_u16
   return vrsubhn_high_u16(r, a, b);
-  // CHECK: rsubhn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vrsubhn_high_u32(<4 x i16> %r, <4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VRSUBHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VRSUBHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VRSUBHN_V2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.rsubhn.v4i16(<4 x i32> [[VRSUBHN_V_I_I]], <4 x i32> [[VRSUBHN_V1_I_I]]) #4
+// CHECK:   [[VRSUBHN_V3_I_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I_I]] to <4 x i16>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %r, <4 x i16> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I_I]]
 uint16x8_t test_vrsubhn_high_u32(uint16x4_t r, uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vrsubhn_high_u32
   return vrsubhn_high_u32(r, a, b);
-  // CHECK: rsubhn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vrsubhn_high_u64(<2 x i32> %r, <2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VRSUBHN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VRSUBHN_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VRSUBHN_V2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.rsubhn.v2i32(<2 x i64> [[VRSUBHN_V_I_I]], <2 x i64> [[VRSUBHN_V1_I_I]]) #4
+// CHECK:   [[VRSUBHN_V3_I_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I_I]] to <2 x i32>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %r, <2 x i32> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I_I]]
 uint32x4_t test_vrsubhn_high_u64(uint32x2_t r, uint64x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vrsubhn_high_u64
   return vrsubhn_high_u64(r, a, b);
-  // CHECK: rsubhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vabdl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VABD_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_I_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[VMOVL_I_I]]
 int16x8_t test_vabdl_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vabdl_s8
   return vabdl_s8(a, b);
-  // CHECK: sabdl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
+// CHECK-LABEL: define <4 x i32> @test_vabdl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I_I]], <4 x i16> [[VABD1_I_I]]) #4
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VMOVL_I_I]]
 int32x4_t test_vabdl_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vabdl_s16
   return vabdl_s16(a, b);
-  // CHECK: sabdl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
+// CHECK-LABEL: define <2 x i64> @test_vabdl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I_I]], <2 x i32> [[VABD1_I_I]]) #4
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VMOVL_I_I]]
 int64x2_t test_vabdl_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vabdl_s32
   return vabdl_s32(a, b);
-  // CHECK: sabdl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
+// CHECK-LABEL: define <8 x i16> @test_vabdl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VABD_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_I_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[VMOVL_I_I]]
 uint16x8_t test_vabdl_u8(uint8x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vabdl_u8
   return vabdl_u8(a, b);
-  // CHECK: uabdl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
+// CHECK-LABEL: define <4 x i32> @test_vabdl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VABD2_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I_I]], <4 x i16> [[VABD1_I_I]]) #4
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VMOVL_I_I]]
 uint32x4_t test_vabdl_u16(uint16x4_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vabdl_u16
   return vabdl_u16(a, b);
-  // CHECK: uabdl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
+// CHECK-LABEL: define <2 x i64> @test_vabdl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VABD_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VABD1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VABD2_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I_I]], <2 x i32> [[VABD1_I_I]]) #4
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VMOVL_I_I]]
 uint64x2_t test_vabdl_u32(uint32x2_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vabdl_u32
   return vabdl_u32(a, b);
-  // CHECK: uabdl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vabal_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+// CHECK:   [[VABD_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %b, <8 x i8> %c) #4
+// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x i16>
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 int16x8_t test_vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c) {
-  // CHECK-LABEL: test_vabal_s8
   return vabal_s8(a, b, c);
-  // CHECK: sabal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
+// CHECK-LABEL: define <4 x i32> @test_vabal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK:   [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I_I_I]], <4 x i16> [[VABD1_I_I_I]]) #4
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 int32x4_t test_vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
-  // CHECK-LABEL: test_vabal_s16
   return vabal_s16(a, b, c);
-  // CHECK: sabal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
+// CHECK-LABEL: define <2 x i64> @test_vabal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK:   [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I_I_I]], <2 x i32> [[VABD1_I_I_I]]) #4
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 int64x2_t test_vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
-  // CHECK-LABEL: test_vabal_s32
   return vabal_s32(a, b, c);
-  // CHECK: sabal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
+// CHECK-LABEL: define <8 x i16> @test_vabal_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+// CHECK:   [[VABD_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> %b, <8 x i8> %c) #4
+// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x i16>
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 uint16x8_t test_vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) {
-  // CHECK-LABEL: test_vabal_u8
   return vabal_u8(a, b, c);
-  // CHECK: uabal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
+// CHECK-LABEL: define <4 x i32> @test_vabal_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK:   [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I_I_I]], <4 x i16> [[VABD1_I_I_I]]) #4
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 uint32x4_t test_vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
-  // CHECK-LABEL: test_vabal_u16
   return vabal_u16(a, b, c);
-  // CHECK: uabal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
+// CHECK-LABEL: define <2 x i64> @test_vabal_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK:   [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I_I_I]], <2 x i32> [[VABD1_I_I_I]]) #4
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 uint64x2_t test_vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
-  // CHECK-LABEL: test_vabal_u32
   return vabal_u32(a, b, c);
-  // CHECK: uabal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vabdl_high_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VABD_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4
+// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[VMOVL_I_I_I]]
 int16x8_t test_vabdl_high_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vabdl_high_s8
   return vabdl_high_s8(a, b);
-  // CHECK: sabdl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
+// CHECK-LABEL: define <4 x i32> @test_vabdl_high_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
+// CHECK:   [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I_I_I]], <4 x i16> [[VABD1_I_I_I]]) #4
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VMOVL_I_I_I]]
 int32x4_t test_vabdl_high_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vabdl_high_s16
   return vabdl_high_s16(a, b);
-  // CHECK: sabdl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
+// CHECK-LABEL: define <2 x i64> @test_vabdl_high_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
+// CHECK:   [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I_I_I]], <2 x i32> [[VABD1_I_I_I]]) #4
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VMOVL_I_I_I]]
 int64x2_t test_vabdl_high_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vabdl_high_s32
   return vabdl_high_s32(a, b);
-  // CHECK: sabdl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
+// CHECK-LABEL: define <8 x i16> @test_vabdl_high_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VABD_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4
+// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[VMOVL_I_I_I]]
 uint16x8_t test_vabdl_high_u8(uint8x16_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vabdl_high_u8
   return vabdl_high_u8(a, b);
-  // CHECK: uabdl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
+// CHECK-LABEL: define <4 x i32> @test_vabdl_high_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
+// CHECK:   [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VABD2_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I_I_I]], <4 x i16> [[VABD1_I_I_I]]) #4
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VMOVL_I_I_I]]
 uint32x4_t test_vabdl_high_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vabdl_high_u16
   return vabdl_high_u16(a, b);
-  // CHECK: uabdl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
+// CHECK-LABEL: define <2 x i64> @test_vabdl_high_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
+// CHECK:   [[VABD_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VABD1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VABD2_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I_I_I]], <2 x i32> [[VABD1_I_I_I]]) #4
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VMOVL_I_I_I]]
 uint64x2_t test_vabdl_high_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vabdl_high_u32
   return vabdl_high_u32(a, b);
-  // CHECK: uabdl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vabal_high_s8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %c, <16 x i8> %c, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VABD_I_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4
+// CHECK:   [[VMOVL_I_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I_I]] to <8 x i16>
+// CHECK:   [[ADD_I_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I_I]]
+// CHECK:   ret <8 x i16> [[ADD_I_I]]
 int16x8_t test_vabal_high_s8(int16x8_t a, int8x16_t b, int8x16_t c) {
-  // CHECK-LABEL: test_vabal_high_s8
   return vabal_high_s8(a, b, c);
-  // CHECK: sabal2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
+// CHECK-LABEL: define <4 x i32> @test_vabal_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
+// CHECK:   [[VABD_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VABD1_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VABD2_I_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> [[VABD_I_I_I_I]], <4 x i16> [[VABD1_I_I_I_I]]) #4
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I_I]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VMOVL_I_I_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
+// CHECK:   [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I_I]]
 int32x4_t test_vabal_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) {
-  // CHECK-LABEL: test_vabal_high_s16
   return vabal_high_s16(a, b, c);
-  // CHECK: sabal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
+// CHECK-LABEL: define <2 x i64> @test_vabal_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
+// CHECK:   [[VABD_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VABD1_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VABD2_I_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> [[VABD_I_I_I_I]], <2 x i32> [[VABD1_I_I_I_I]]) #4
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I_I]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VMOVL_I_I_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
+// CHECK:   [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I_I]]
+// CHECK:   ret <2 x i64> [[ADD_I_I]]
 int64x2_t test_vabal_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) {
-  // CHECK-LABEL: test_vabal_high_s32
   return vabal_high_s32(a, b, c);
-  // CHECK: sabal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
+// CHECK-LABEL: define <8 x i16> @test_vabal_high_u8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %c, <16 x i8> %c, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VABD_I_I_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uabd.v8i8(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4
+// CHECK:   [[VMOVL_I_I_I_I:%.*]] = zext <8 x i8> [[VABD_I_I_I_I]] to <8 x i16>
+// CHECK:   [[ADD_I_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I_I]]
+// CHECK:   ret <8 x i16> [[ADD_I_I]]
 uint16x8_t test_vabal_high_u8(uint16x8_t a, uint8x16_t b, uint8x16_t c) {
-  // CHECK-LABEL: test_vabal_high_u8
   return vabal_high_u8(a, b, c);
-  // CHECK: uabal2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
+// CHECK-LABEL: define <4 x i32> @test_vabal_high_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
+// CHECK:   [[VABD_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VABD1_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VABD2_I_I_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uabd.v4i16(<4 x i16> [[VABD_I_I_I_I]], <4 x i16> [[VABD1_I_I_I_I]]) #4
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VABD2_I_I_I_I]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VMOVL_I_I_I_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
+// CHECK:   [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I_I]]
 uint32x4_t test_vabal_high_u16(uint32x4_t a, uint16x8_t b, uint16x8_t c) {
-  // CHECK-LABEL: test_vabal_high_u16
   return vabal_high_u16(a, b, c);
-  // CHECK: uabal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
+// CHECK-LABEL: define <2 x i64> @test_vabal_high_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
+// CHECK:   [[VABD_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VABD1_I_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VABD2_I_I_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uabd.v2i32(<2 x i32> [[VABD_I_I_I_I]], <2 x i32> [[VABD1_I_I_I_I]]) #4
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VABD2_I_I_I_I]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VMOVL_I_I_I_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
+// CHECK:   [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I_I]]
+// CHECK:   ret <2 x i64> [[ADD_I_I]]
 uint64x2_t test_vabal_high_u32(uint64x2_t a, uint32x4_t b, uint32x4_t c) {
-  // CHECK-LABEL: test_vabal_high_u32
   return vabal_high_u32(a, b, c);
-  // CHECK: uabal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmull_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VMULL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i16> [[VMULL_I]]
 int16x8_t test_vmull_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vmull_s8
   return vmull_s8(a, b);
-  // CHECK: smull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
+// CHECK-LABEL: define <4 x i32> @test_vmull_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 int32x4_t test_vmull_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vmull_s16
   return vmull_s16(a, b);
-  // CHECK: smull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
+// CHECK-LABEL: define <2 x i64> @test_vmull_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 int64x2_t test_vmull_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vmull_s32
   return vmull_s32(a, b);
-  // CHECK: smull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
+// CHECK-LABEL: define <8 x i16> @test_vmull_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VMULL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i16> [[VMULL_I]]
 uint16x8_t test_vmull_u8(uint8x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vmull_u8
   return vmull_u8(a, b);
-  // CHECK: umull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
+// CHECK-LABEL: define <4 x i32> @test_vmull_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 uint32x4_t test_vmull_u16(uint16x4_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vmull_u16
   return vmull_u16(a, b);
-  // CHECK: umull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
+// CHECK-LABEL: define <2 x i64> @test_vmull_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 uint64x2_t test_vmull_u32(uint32x2_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vmull_u32
   return vmull_u32(a, b);
-  // CHECK: umull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmull_high_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4
+// CHECK:   ret <8 x i16> [[VMULL_I_I]]
 int16x8_t test_vmull_high_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vmull_high_s8
   return vmull_high_s8(a, b);
-  // CHECK: smull2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
+// CHECK-LABEL: define <4 x i32> @test_vmull_high_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4
+// CHECK:   ret <4 x i32> [[VMULL2_I_I]]
 int32x4_t test_vmull_high_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vmull_high_s16
   return vmull_high_s16(a, b);
-  // CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
+// CHECK-LABEL: define <2 x i64> @test_vmull_high_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4
+// CHECK:   ret <2 x i64> [[VMULL2_I_I]]
 int64x2_t test_vmull_high_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vmull_high_s32
   return vmull_high_s32(a, b);
-  // CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
+// CHECK-LABEL: define <8 x i16> @test_vmull_high_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4
+// CHECK:   ret <8 x i16> [[VMULL_I_I]]
 uint16x8_t test_vmull_high_u8(uint8x16_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vmull_high_u8
   return vmull_high_u8(a, b);
-  // CHECK: umull2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
+// CHECK-LABEL: define <4 x i32> @test_vmull_high_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4
+// CHECK:   ret <4 x i32> [[VMULL2_I_I]]
 uint32x4_t test_vmull_high_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vmull_high_u16
   return vmull_high_u16(a, b);
-  // CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
+// CHECK-LABEL: define <2 x i64> @test_vmull_high_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4
+// CHECK:   ret <2 x i64> [[VMULL2_I_I]]
 uint64x2_t test_vmull_high_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vmull_high_u32
   return vmull_high_u32(a, b);
-  // CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmlal_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+// CHECK:   [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %b, <8 x i8> %c) #4
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 int16x8_t test_vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c) {
-  // CHECK-LABEL: test_vmlal_s8
   return vmlal_s8(a, b, c);
-  // CHECK: smlal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
+// CHECK-LABEL: define <4 x i32> @test_vmlal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 int32x4_t test_vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
-  // CHECK-LABEL: test_vmlal_s16
   return vmlal_s16(a, b, c);
-  // CHECK: smlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
+// CHECK-LABEL: define <2 x i64> @test_vmlal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 int64x2_t test_vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
-  // CHECK-LABEL: test_vmlal_s32
   return vmlal_s32(a, b, c);
-  // CHECK: smlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
+// CHECK-LABEL: define <8 x i16> @test_vmlal_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+// CHECK:   [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %b, <8 x i8> %c) #4
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 uint16x8_t test_vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) {
-  // CHECK-LABEL: test_vmlal_u8
   return vmlal_u8(a, b, c);
-  // CHECK: umlal {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
+// CHECK-LABEL: define <4 x i32> @test_vmlal_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 uint32x4_t test_vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
-  // CHECK-LABEL: test_vmlal_u16
   return vmlal_u16(a, b, c);
-  // CHECK: umlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
+// CHECK-LABEL: define <2 x i64> @test_vmlal_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 uint64x2_t test_vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
-  // CHECK-LABEL: test_vmlal_u32
   return vmlal_u32(a, b, c);
-  // CHECK: umlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmlal_high_s8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %c, <16 x i8> %c, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VMULL_I_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4
+// CHECK:   [[ADD_I_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I_I]]
+// CHECK:   ret <8 x i16> [[ADD_I_I]]
 int16x8_t test_vmlal_high_s8(int16x8_t a, int8x16_t b, int8x16_t c) {
-  // CHECK-LABEL: test_vmlal_high_s8
   return vmlal_high_s8(a, b, c);
-  // CHECK: smlal2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
+// CHECK-LABEL: define <4 x i32> @test_vmlal_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) #4
+// CHECK:   [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I_I]]
 int32x4_t test_vmlal_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) {
-  // CHECK-LABEL: test_vmlal_high_s16
   return vmlal_high_s16(a, b, c);
-  // CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
+// CHECK-LABEL: define <2 x i64> @test_vmlal_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) #4
+// CHECK:   [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I_I]]
+// CHECK:   ret <2 x i64> [[ADD_I_I]]
 int64x2_t test_vmlal_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) {
-  // CHECK-LABEL: test_vmlal_high_s32
   return vmlal_high_s32(a, b, c);
-  // CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
+// CHECK-LABEL: define <8 x i16> @test_vmlal_high_u8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %c, <16 x i8> %c, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VMULL_I_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4
+// CHECK:   [[ADD_I_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I_I]]
+// CHECK:   ret <8 x i16> [[ADD_I_I]]
 uint16x8_t test_vmlal_high_u8(uint16x8_t a, uint8x16_t b, uint8x16_t c) {
-  // CHECK-LABEL: test_vmlal_high_u8
   return vmlal_high_u8(a, b, c);
-  // CHECK: umlal2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
+// CHECK-LABEL: define <4 x i32> @test_vmlal_high_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) #4
+// CHECK:   [[ADD_I_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I_I]]
 uint32x4_t test_vmlal_high_u16(uint32x4_t a, uint16x8_t b, uint16x8_t c) {
-  // CHECK-LABEL: test_vmlal_high_u16
   return vmlal_high_u16(a, b, c);
-  // CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
+// CHECK-LABEL: define <2 x i64> @test_vmlal_high_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) #4
+// CHECK:   [[ADD_I_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I_I]]
+// CHECK:   ret <2 x i64> [[ADD_I_I]]
 uint64x2_t test_vmlal_high_u32(uint64x2_t a, uint32x4_t b, uint32x4_t c) {
-  // CHECK-LABEL: test_vmlal_high_u32
   return vmlal_high_u32(a, b, c);
-  // CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmlsl_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+// CHECK:   [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %b, <8 x i8> %c) #4
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I]]
+// CHECK:   ret <8 x i16> [[SUB_I]]
 int16x8_t test_vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c) {
-  // CHECK-LABEL: test_vmlsl_s8
   return vmlsl_s8(a, b, c);
-  // CHECK: smlsl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 int32x4_t test_vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
-  // CHECK-LABEL: test_vmlsl_s16
   return vmlsl_s16(a, b, c);
-  // CHECK: smlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
+// CHECK:   ret <2 x i64> [[SUB_I]]
 int64x2_t test_vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
-  // CHECK-LABEL: test_vmlsl_s32
   return vmlsl_s32(a, b, c);
-  // CHECK: smlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
+// CHECK-LABEL: define <8 x i16> @test_vmlsl_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+// CHECK:   [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %b, <8 x i8> %c) #4
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I]]
+// CHECK:   ret <8 x i16> [[SUB_I]]
 uint16x8_t test_vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) {
-  // CHECK-LABEL: test_vmlsl_u8
   return vmlsl_u8(a, b, c);
-  // CHECK: umlsl {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 uint32x4_t test_vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
-  // CHECK-LABEL: test_vmlsl_u16
   return vmlsl_u16(a, b, c);
-  // CHECK: umlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
+// CHECK:   ret <2 x i64> [[SUB_I]]
 uint64x2_t test_vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
-  // CHECK-LABEL: test_vmlsl_u32
   return vmlsl_u32(a, b, c);
-  // CHECK: umlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmlsl_high_s8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %c, <16 x i8> %c, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VMULL_I_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4
+// CHECK:   [[SUB_I_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I_I]]
+// CHECK:   ret <8 x i16> [[SUB_I_I]]
 int16x8_t test_vmlsl_high_s8(int16x8_t a, int8x16_t b, int8x16_t c) {
-  // CHECK-LABEL: test_vmlsl_high_s8
   return vmlsl_high_s8(a, b, c);
-  // CHECK: smlsl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) #4
+// CHECK:   [[SUB_I_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I_I]]
+// CHECK:   ret <4 x i32> [[SUB_I_I]]
 int32x4_t test_vmlsl_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) {
-  // CHECK-LABEL: test_vmlsl_high_s16
   return vmlsl_high_s16(a, b, c);
-  // CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) #4
+// CHECK:   [[SUB_I_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I_I]]
+// CHECK:   ret <2 x i64> [[SUB_I_I]]
 int64x2_t test_vmlsl_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) {
-  // CHECK-LABEL: test_vmlsl_high_s32
   return vmlsl_high_s32(a, b, c);
-  // CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
+// CHECK-LABEL: define <8 x i16> @test_vmlsl_high_u8(<8 x i16> %a, <16 x i8> %b, <16 x i8> %c) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %c, <16 x i8> %c, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VMULL_I_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4
+// CHECK:   [[SUB_I_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I_I]]
+// CHECK:   ret <8 x i16> [[SUB_I_I]]
 uint16x8_t test_vmlsl_high_u8(uint16x8_t a, uint8x16_t b, uint8x16_t c) {
-  // CHECK-LABEL: test_vmlsl_high_u8
   return vmlsl_high_u8(a, b, c);
-  // CHECK: umlsl2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_high_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[VMULL_I_I_I]], <4 x i16> [[VMULL1_I_I_I]]) #4
+// CHECK:   [[SUB_I_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I_I]]
+// CHECK:   ret <4 x i32> [[SUB_I_I]]
 uint32x4_t test_vmlsl_high_u16(uint32x4_t a, uint16x8_t b, uint16x8_t c) {
-  // CHECK-LABEL: test_vmlsl_high_u16
   return vmlsl_high_u16(a, b, c);
-  // CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_high_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[VMULL_I_I_I]], <2 x i32> [[VMULL1_I_I_I]]) #4
+// CHECK:   [[SUB_I_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I_I]]
+// CHECK:   ret <2 x i64> [[SUB_I_I]]
 uint64x2_t test_vmlsl_high_u32(uint64x2_t a, uint32x4_t b, uint32x4_t c) {
-  // CHECK-LABEL: test_vmlsl_high_u32
   return vmlsl_high_u32(a, b, c);
-  // CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmull_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #4
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqdmull_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vqdmull_s16
   return vqdmull_s16(a, b);
-  // CHECK: sqdmull {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
+// CHECK-LABEL: define <2 x i64> @test_vqdmull_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #4
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 int64x2_t test_vqdmull_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vqdmull_s32
   return vqdmull_s32(a, b);
-  // CHECK: sqdmull {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmlal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #4
+// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #4
+// CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
 int32x4_t test_vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
-  // CHECK-LABEL: test_vqdmlal_s16
   return vqdmlal_s16(a, b, c);
-  // CHECK: sqdmlal {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmlal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #4
+// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #4
+// CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
 int64x2_t test_vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
-  // CHECK-LABEL: test_vqdmlal_s32
   return vqdmlal_s32(a, b, c);
-  // CHECK: sqdmlal {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #4
+// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #4
+// CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
 int32x4_t test_vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
-  // CHECK-LABEL: test_vqdmlsl_s16
   return vqdmlsl_s16(a, b, c);
-  // CHECK: sqdmlsl {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #4
+// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #4
+// CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
 int64x2_t test_vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
-  // CHECK-LABEL: test_vqdmlsl_s32
   return vqdmlsl_s32(a, b, c);
-  // CHECK: sqdmlsl {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmull_high_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
+// CHECK:   [[VQDMULL_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQDMULL_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMULL_V2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMULL_V_I_I]], <4 x i16> [[VQDMULL_V1_I_I]]) #4
+// CHECK:   [[VQDMULL_V3_I_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqdmull_high_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vqdmull_high_s16
   return vqdmull_high_s16(a, b);
-  // CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
+// CHECK-LABEL: define <2 x i64> @test_vqdmull_high_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
+// CHECK:   [[VQDMULL_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQDMULL_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMULL_V2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMULL_V_I_I]], <2 x i32> [[VQDMULL_V1_I_I]]) #4
+// CHECK:   [[VQDMULL_V3_I_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 int64x2_t test_vqdmull_high_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vqdmull_high_s32
   return vqdmull_high_s32(a, b);
-  // CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmlal_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMLAL1_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I_I]], <4 x i16> [[VQDMLAL1_I_I]]) #4
+// CHECK:   [[VQDMLAL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMLAL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[VQDMLAL_V_I_I]], <4 x i32> [[VQDMLAL2_I_I]]) #4
+// CHECK:   ret <4 x i32> [[VQDMLAL_V3_I_I]]
 int32x4_t test_vqdmlal_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) {
-  // CHECK-LABEL: test_vqdmlal_high_s16
   return vqdmlal_high_s16(a, b, c);
-  // CHECK: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmlal_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMLAL1_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I_I]], <2 x i32> [[VQDMLAL1_I_I]]) #4
+// CHECK:   [[VQDMLAL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQDMLAL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[VQDMLAL_V_I_I]], <2 x i64> [[VQDMLAL2_I_I]]) #4
+// CHECK:   ret <2 x i64> [[VQDMLAL_V3_I_I]]
 int64x2_t test_vqdmlal_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) {
-  // CHECK-LABEL: test_vqdmlal_high_s32
   return vqdmlal_high_s32(a, b, c);
-  // CHECK: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_high_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16> %c) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i16> %b, <8 x i16> %b, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <8 x i16> %c, <8 x i16> %c, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I7_I]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMLAL1_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VQDMLAL2_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[VQDMLAL_I_I]], <4 x i16> [[VQDMLAL1_I_I]]) #4
+// CHECK:   [[VQDMLSL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMLSL_V3_I_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[VQDMLSL_V_I_I]], <4 x i32> [[VQDMLAL2_I_I]]) #4
+// CHECK:   ret <4 x i32> [[VQDMLSL_V3_I_I]]
 int32x4_t test_vqdmlsl_high_s16(int32x4_t a, int16x8_t b, int16x8_t c) {
-  // CHECK-LABEL: test_vqdmlsl_high_s16
   return vqdmlsl_high_s16(a, b, c);
-  // CHECK: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_high_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32> %c) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i32> %b, <4 x i32> %b, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <4 x i32> %c, <4 x i32> %c, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I7_I]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMLAL1_I_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VQDMLAL2_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[VQDMLAL_I_I]], <2 x i32> [[VQDMLAL1_I_I]]) #4
+// CHECK:   [[VQDMLSL_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQDMLSL_V3_I_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[VQDMLSL_V_I_I]], <2 x i64> [[VQDMLAL2_I_I]]) #4
+// CHECK:   ret <2 x i64> [[VQDMLSL_V3_I_I]]
 int64x2_t test_vqdmlsl_high_s32(int64x2_t a, int32x4_t b, int32x4_t c) {
-  // CHECK-LABEL: test_vqdmlsl_high_s32
   return vqdmlsl_high_s32(a, b, c);
-  // CHECK: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmull_p8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VMULL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i16> [[VMULL_I]]
 poly16x8_t test_vmull_p8(poly8x8_t a, poly8x8_t b) {
-  // CHECK-LABEL: test_vmull_p8
   return vmull_p8(a, b);
-  // CHECK: pmull {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmull_high_p8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <16 x i8> %b, <16 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.pmull.v8i16(<8 x i8> [[SHUFFLE_I_I]], <8 x i8> [[SHUFFLE_I7_I]]) #4
+// CHECK:   ret <8 x i16> [[VMULL_I_I]]
 poly16x8_t test_vmull_high_p8(poly8x16_t a, poly8x16_t b) {
-  // CHECK-LABEL: test_vmull_high_p8
   return vmull_high_p8(a, b);
-  // CHECK: pmull2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define i64 @test_vaddd_s64(i64 %a, i64 %b) #0 {
+// CHECK:   [[VADDD_I:%.*]] = add i64 %a, %b
+// CHECK:   ret i64 [[VADDD_I]]
 int64_t test_vaddd_s64(int64_t a, int64_t b) {
-// CHECK-LABEL: test_vaddd_s64
   return vaddd_s64(a, b);
-// CHECK: add {{[xd][0-9]+}}, {{[xd][0-9]+}}, {{[xd][0-9]+}}
 }
 
+// CHECK-LABEL: define i64 @test_vaddd_u64(i64 %a, i64 %b) #0 {
+// CHECK:   [[VADDD_I:%.*]] = add i64 %a, %b
+// CHECK:   ret i64 [[VADDD_I]]
 uint64_t test_vaddd_u64(uint64_t a, uint64_t b) {
-// CHECK-LABEL: test_vaddd_u64
   return vaddd_u64(a, b);
-// CHECK: add {{[xd][0-9]+}}, {{[xd][0-9]+}}, {{[xd][0-9]+}}
 }
 
+// CHECK-LABEL: define i64 @test_vsubd_s64(i64 %a, i64 %b) #0 {
+// CHECK:   [[VSUBD_I:%.*]] = sub i64 %a, %b
+// CHECK:   ret i64 [[VSUBD_I]]
 int64_t test_vsubd_s64(int64_t a, int64_t b) {
-// CHECK-LABEL: test_vsubd_s64
   return vsubd_s64(a, b);
-// CHECK: sub {{[xd][0-9]+}}, {{[xd][0-9]+}}, {{[xd][0-9]+}}
 }
 
+// CHECK-LABEL: define i64 @test_vsubd_u64(i64 %a, i64 %b) #0 {
+// CHECK:   [[VSUBD_I:%.*]] = sub i64 %a, %b
+// CHECK:   ret i64 [[VSUBD_I]]
 uint64_t test_vsubd_u64(uint64_t a, uint64_t b) {
-// CHECK-LABEL: test_vsubd_u64
   return vsubd_u64(a, b);
-// CHECK: sub {{[xd][0-9]+}}, {{[xd][0-9]+}}, {{[xd][0-9]+}}
 }
 
+// CHECK-LABEL: define i8 @test_vqaddb_s8(i8 %a, i8 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0
+// CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 0
+// CHECK:   [[VQADDB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqadd.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4
+// CHECK:   [[TMP2:%.*]] = extractelement <8 x i8> [[VQADDB_S8_I]], i64 0
+// CHECK:   ret i8 [[TMP2]]
 int8_t test_vqaddb_s8(int8_t a, int8_t b) {
-// CHECK-LABEL: test_vqaddb_s8
   return vqaddb_s8(a, b);
-// CHECK: sqadd {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}
 }
 
+// CHECK-LABEL: define i16 @test_vqaddh_s16(i16 %a, i16 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
+// CHECK:   [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
+// CHECK:   [[VQADDH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4
+// CHECK:   [[TMP2:%.*]] = extractelement <4 x i16> [[VQADDH_S16_I]], i64 0
+// CHECK:   ret i16 [[TMP2]]
 int16_t test_vqaddh_s16(int16_t a, int16_t b) {
-// CHECK-LABEL: test_vqaddh_s16
   return vqaddh_s16(a, b);
-// CHECK: sqadd {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}
 }
 
+// CHECK-LABEL: define i32 @test_vqadds_s32(i32 %a, i32 %b) #0 {
+// CHECK:   [[VQADDS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %a, i32 %b) #4
+// CHECK:   ret i32 [[VQADDS_S32_I]]
 int32_t test_vqadds_s32(int32_t a, int32_t b) {
-// CHECK-LABEL: test_vqadds_s32
   return vqadds_s32(a, b);
-// CHECK: sqadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
 }
 
+// CHECK-LABEL: define i64 @test_vqaddd_s64(i64 %a, i64 %b) #0 {
+// CHECK:   [[VQADDD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %a, i64 %b) #4
+// CHECK:   ret i64 [[VQADDD_S64_I]]
 int64_t test_vqaddd_s64(int64_t a, int64_t b) {
-// CHECK-LABEL: test_vqaddd_s64
   return vqaddd_s64(a, b);
-// CHECK: sqadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: define i8 @test_vqaddb_u8(i8 %a, i8 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0
+// CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 0
+// CHECK:   [[VQADDB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqadd.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4
+// CHECK:   [[TMP2:%.*]] = extractelement <8 x i8> [[VQADDB_U8_I]], i64 0
+// CHECK:   ret i8 [[TMP2]]
 uint8_t test_vqaddb_u8(uint8_t a, uint8_t b) {
-// CHECK-LABEL: test_vqaddb_u8
   return vqaddb_u8(a, b);
-// CHECK: uqadd {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}
 }
 
+// CHECK-LABEL: define i16 @test_vqaddh_u16(i16 %a, i16 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
+// CHECK:   [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
+// CHECK:   [[VQADDH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqadd.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4
+// CHECK:   [[TMP2:%.*]] = extractelement <4 x i16> [[VQADDH_U16_I]], i64 0
+// CHECK:   ret i16 [[TMP2]]
 uint16_t test_vqaddh_u16(uint16_t a, uint16_t b) {
-// CHECK-LABEL: test_vqaddh_u16
   return vqaddh_u16(a, b);
-// CHECK: uqadd {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}
 }
 
+// CHECK-LABEL: define i32 @test_vqadds_u32(i32 %a, i32 %b) #0 {
+// CHECK:   [[VQADDS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uqadd.i32(i32 %a, i32 %b) #4
+// CHECK:   ret i32 [[VQADDS_U32_I]]
 uint32_t test_vqadds_u32(uint32_t a, uint32_t b) {
-// CHECK-LABEL: test_vqadds_u32
   return vqadds_u32(a, b);
-// CHECK: uqadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
 }
 
+// CHECK-LABEL: define i64 @test_vqaddd_u64(i64 %a, i64 %b) #0 {
+// CHECK:   [[VQADDD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uqadd.i64(i64 %a, i64 %b) #4
+// CHECK:   ret i64 [[VQADDD_U64_I]]
 uint64_t test_vqaddd_u64(uint64_t a, uint64_t b) {
-// CHECK-LABEL: test_vqaddd_u64
   return vqaddd_u64(a, b);
-// CHECK: uqadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: define i8 @test_vqsubb_s8(i8 %a, i8 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0
+// CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 0
+// CHECK:   [[VQSUBB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqsub.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4
+// CHECK:   [[TMP2:%.*]] = extractelement <8 x i8> [[VQSUBB_S8_I]], i64 0
+// CHECK:   ret i8 [[TMP2]]
 int8_t test_vqsubb_s8(int8_t a, int8_t b) {
-// CHECK-LABEL: test_vqsubb_s8
   return vqsubb_s8(a, b);
-// CHECK: sqsub {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}
 }
 
+// CHECK-LABEL: define i16 @test_vqsubh_s16(i16 %a, i16 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
+// CHECK:   [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
+// CHECK:   [[VQSUBH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4
+// CHECK:   [[TMP2:%.*]] = extractelement <4 x i16> [[VQSUBH_S16_I]], i64 0
+// CHECK:   ret i16 [[TMP2]]
 int16_t test_vqsubh_s16(int16_t a, int16_t b) {
-// CHECK-LABEL: test_vqsubh_s16
   return vqsubh_s16(a, b);
-// CHECK: sqsub {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}
 }
 
+// CHECK-LABEL: define i32 @test_vqsubs_s32(i32 %a, i32 %b) #0 {
+// CHECK:   [[VQSUBS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %a, i32 %b) #4
+// CHECK:   ret i32 [[VQSUBS_S32_I]]
 int32_t test_vqsubs_s32(int32_t a, int32_t b) {
-  // CHECK-LABEL: test_vqsubs_s32
   return vqsubs_s32(a, b);
-// CHECK: sqsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
 }
 
+// CHECK-LABEL: define i64 @test_vqsubd_s64(i64 %a, i64 %b) #0 {
+// CHECK:   [[VQSUBD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %a, i64 %b) #4
+// CHECK:   ret i64 [[VQSUBD_S64_I]]
 int64_t test_vqsubd_s64(int64_t a, int64_t b) {
-// CHECK-LABEL: test_vqsubd_s64
   return vqsubd_s64(a, b);
-// CHECK: sqsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: define i8 @test_vqsubb_u8(i8 %a, i8 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0
+// CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 0
+// CHECK:   [[VQSUBB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqsub.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4
+// CHECK:   [[TMP2:%.*]] = extractelement <8 x i8> [[VQSUBB_U8_I]], i64 0
+// CHECK:   ret i8 [[TMP2]]
 uint8_t test_vqsubb_u8(uint8_t a, uint8_t b) {
-// CHECK-LABEL: test_vqsubb_u8
   return vqsubb_u8(a, b);
-// CHECK: uqsub {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}
 }
 
+// CHECK-LABEL: define i16 @test_vqsubh_u16(i16 %a, i16 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
+// CHECK:   [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
+// CHECK:   [[VQSUBH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqsub.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4
+// CHECK:   [[TMP2:%.*]] = extractelement <4 x i16> [[VQSUBH_U16_I]], i64 0
+// CHECK:   ret i16 [[TMP2]]
 uint16_t test_vqsubh_u16(uint16_t a, uint16_t b) {
-// CHECK-LABEL: test_vqsubh_u16
   return vqsubh_u16(a, b);
-// CHECK: uqsub {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}
 }
 
+// CHECK-LABEL: define i32 @test_vqsubs_u32(i32 %a, i32 %b) #0 {
+// CHECK:   [[VQSUBS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uqsub.i32(i32 %a, i32 %b) #4
+// CHECK:   ret i32 [[VQSUBS_U32_I]]
 uint32_t test_vqsubs_u32(uint32_t a, uint32_t b) {
-// CHECK-LABEL: test_vqsubs_u32
   return vqsubs_u32(a, b);
-// CHECK: uqsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
 }
 
+// CHECK-LABEL: define i64 @test_vqsubd_u64(i64 %a, i64 %b) #0 {
+// CHECK:   [[VQSUBD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uqsub.i64(i64 %a, i64 %b) #4
+// CHECK:   ret i64 [[VQSUBD_U64_I]]
 uint64_t test_vqsubd_u64(uint64_t a, uint64_t b) {
-// CHECK-LABEL: test_vqsubd_u64
   return vqsubd_u64(a, b);
-// CHECK: uqsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: define i64 @test_vshld_s64(i64 %a, i64 %b) #0 {
+// CHECK:   [[VSHLD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sshl.i64(i64 %a, i64 %b) #4
+// CHECK:   ret i64 [[VSHLD_S64_I]]
 int64_t test_vshld_s64(int64_t a, int64_t b) {
-// CHECK-LABEL: test_vshld_s64
   return vshld_s64(a, b);
-// CHECK: sshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: define i64 @test_vshld_u64(i64 %a, i64 %b) #0 {
+// CHECK:   [[VSHLD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.ushl.i64(i64 %a, i64 %b) #4
+// CHECK:   ret i64 [[VSHLD_U64_I]]
 uint64_t test_vshld_u64(uint64_t a, uint64_t b) {
-// CHECK-LABEL: test_vshld_u64
   return vshld_u64(a, b);
-// CHECK: ushl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
-// CHECK-LABEL: test_vqshlb_s8
+// CHECK-LABEL: define i8 @test_vqshlb_s8(i8 %a, i8 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0
+// CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 0
+// CHECK:   [[VQSHLB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4
+// CHECK:   [[TMP2:%.*]] = extractelement <8 x i8> [[VQSHLB_S8_I]], i64 0
+// CHECK:   ret i8 [[TMP2]]
 int8_t test_vqshlb_s8(int8_t a, int8_t b) {
   return vqshlb_s8(a, b);
-// CHECK: sqshl {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}
 }
 
-// CHECK-LABEL: test_vqshlh_s16
+// CHECK-LABEL: define i16 @test_vqshlh_s16(i16 %a, i16 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
+// CHECK:   [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
+// CHECK:   [[VQSHLH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4
+// CHECK:   [[TMP2:%.*]] = extractelement <4 x i16> [[VQSHLH_S16_I]], i64 0
+// CHECK:   ret i16 [[TMP2]]
 int16_t test_vqshlh_s16(int16_t a, int16_t b) {
   return vqshlh_s16(a, b);
-// CHECK: sqshl {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}
 }
 
-// CHECK-LABEL: test_vqshls_s32
+// CHECK-LABEL: define i32 @test_vqshls_s32(i32 %a, i32 %b) #0 {
+// CHECK:   [[VQSHLS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqshl.i32(i32 %a, i32 %b) #4
+// CHECK:   ret i32 [[VQSHLS_S32_I]]
 int32_t test_vqshls_s32(int32_t a, int32_t b) {
   return vqshls_s32(a, b);
-// CHECK: sqshl {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
 }
 
-// CHECK-LABEL: test_vqshld_s64
+// CHECK-LABEL: define i64 @test_vqshld_s64(i64 %a, i64 %b) #0 {
+// CHECK:   [[VQSHLD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqshl.i64(i64 %a, i64 %b) #4
+// CHECK:   ret i64 [[VQSHLD_S64_I]]
 int64_t test_vqshld_s64(int64_t a, int64_t b) {
   return vqshld_s64(a, b);
-// CHECK: sqshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
-// CHECK-LABEL: test_vqshlb_u8
+// CHECK-LABEL: define i8 @test_vqshlb_u8(i8 %a, i8 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0
+// CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 0
+// CHECK:   [[VQSHLB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4
+// CHECK:   [[TMP2:%.*]] = extractelement <8 x i8> [[VQSHLB_U8_I]], i64 0
+// CHECK:   ret i8 [[TMP2]]
 uint8_t test_vqshlb_u8(uint8_t a, uint8_t b) {
   return vqshlb_u8(a, b);
-// CHECK: uqshl {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}
 }
 
-// CHECK-LABEL: test_vqshlh_u16
+// CHECK-LABEL: define i16 @test_vqshlh_u16(i16 %a, i16 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
+// CHECK:   [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
+// CHECK:   [[VQSHLH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4
+// CHECK:   [[TMP2:%.*]] = extractelement <4 x i16> [[VQSHLH_U16_I]], i64 0
+// CHECK:   ret i16 [[TMP2]]
 uint16_t test_vqshlh_u16(uint16_t a, uint16_t b) {
   return vqshlh_u16(a, b);
-// CHECK: uqshl {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}
 }
 
-// CHECK-LABEL: test_vqshls_u32
+// CHECK-LABEL: define i32 @test_vqshls_u32(i32 %a, i32 %b) #0 {
+// CHECK:   [[VQSHLS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uqshl.i32(i32 %a, i32 %b) #4
+// CHECK:   ret i32 [[VQSHLS_U32_I]]
 uint32_t test_vqshls_u32(uint32_t a, uint32_t b) {
   return vqshls_u32(a, b);
-// CHECK: uqshl {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
 }
 
-// CHECK-LABEL: test_vqshld_u64
+// CHECK-LABEL: define i64 @test_vqshld_u64(i64 %a, i64 %b) #0 {
+// CHECK:   [[VQSHLD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uqshl.i64(i64 %a, i64 %b) #4
+// CHECK:   ret i64 [[VQSHLD_U64_I]]
 uint64_t test_vqshld_u64(uint64_t a, uint64_t b) {
   return vqshld_u64(a, b);
-// CHECK: uqshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
-// CHECK-LABEL: test_vrshld_s64
+// CHECK-LABEL: define i64 @test_vrshld_s64(i64 %a, i64 %b) #0 {
+// CHECK:   [[VRSHLD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.srshl.i64(i64 %a, i64 %b) #4
+// CHECK:   ret i64 [[VRSHLD_S64_I]]
 int64_t test_vrshld_s64(int64_t a, int64_t b) {
   return vrshld_s64(a, b);
-// CHECK: srshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
 
-// CHECK-LABEL: test_vrshld_u64
+// CHECK-LABEL: define i64 @test_vrshld_u64(i64 %a, i64 %b) #0 {
+// CHECK:   [[VRSHLD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.urshl.i64(i64 %a, i64 %b) #4
+// CHECK:   ret i64 [[VRSHLD_U64_I]]
 uint64_t test_vrshld_u64(uint64_t a, uint64_t b) {
   return vrshld_u64(a, b);
-// CHECK: urshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
-// CHECK-LABEL: test_vqrshlb_s8
+// CHECK-LABEL: define i8 @test_vqrshlb_s8(i8 %a, i8 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0
+// CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 0
+// CHECK:   [[VQRSHLB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4
+// CHECK:   [[TMP2:%.*]] = extractelement <8 x i8> [[VQRSHLB_S8_I]], i64 0
+// CHECK:   ret i8 [[TMP2]]
 int8_t test_vqrshlb_s8(int8_t a, int8_t b) {
   return vqrshlb_s8(a, b);
-// CHECK: sqrshl {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}
 }
 
-// CHECK-LABEL: test_vqrshlh_s16
+// CHECK-LABEL: define i16 @test_vqrshlh_s16(i16 %a, i16 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
+// CHECK:   [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
+// CHECK:   [[VQRSHLH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4
+// CHECK:   [[TMP2:%.*]] = extractelement <4 x i16> [[VQRSHLH_S16_I]], i64 0
+// CHECK:   ret i16 [[TMP2]]
 int16_t test_vqrshlh_s16(int16_t a, int16_t b) {
   return vqrshlh_s16(a, b);
-// CHECK: sqrshl {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}
 }
 
-// CHECK-LABEL: test_vqrshls_s32
+// CHECK-LABEL: define i32 @test_vqrshls_s32(i32 %a, i32 %b) #0 {
+// CHECK:   [[VQRSHLS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrshl.i32(i32 %a, i32 %b) #4
+// CHECK:   ret i32 [[VQRSHLS_S32_I]]
 int32_t test_vqrshls_s32(int32_t a, int32_t b) {
   return vqrshls_s32(a, b);
-// CHECK: sqrshl {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
 }
 
-// CHECK-LABEL: test_vqrshld_s64
+// CHECK-LABEL: define i64 @test_vqrshld_s64(i64 %a, i64 %b) #0 {
+// CHECK:   [[VQRSHLD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqrshl.i64(i64 %a, i64 %b) #4
+// CHECK:   ret i64 [[VQRSHLD_S64_I]]
 int64_t test_vqrshld_s64(int64_t a, int64_t b) {
   return vqrshld_s64(a, b);
-// CHECK: sqrshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
-// CHECK-LABEL: test_vqrshlb_u8
+// CHECK-LABEL: define i8 @test_vqrshlb_u8(i8 %a, i8 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0
+// CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 0
+// CHECK:   [[VQRSHLB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4
+// CHECK:   [[TMP2:%.*]] = extractelement <8 x i8> [[VQRSHLB_U8_I]], i64 0
+// CHECK:   ret i8 [[TMP2]]
 uint8_t test_vqrshlb_u8(uint8_t a, uint8_t b) {
   return vqrshlb_u8(a, b);
-// CHECK: uqrshl {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}
 }
 
-// CHECK-LABEL: test_vqrshlh_u16
+// CHECK-LABEL: define i16 @test_vqrshlh_u16(i16 %a, i16 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
+// CHECK:   [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
+// CHECK:   [[VQRSHLH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4
+// CHECK:   [[TMP2:%.*]] = extractelement <4 x i16> [[VQRSHLH_U16_I]], i64 0
+// CHECK:   ret i16 [[TMP2]]
 uint16_t test_vqrshlh_u16(uint16_t a, uint16_t b) {
   return vqrshlh_u16(a, b);
-// CHECK: uqrshl {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}
 }
 
-// CHECK-LABEL: test_vqrshls_u32
+// CHECK-LABEL: define i32 @test_vqrshls_u32(i32 %a, i32 %b) #0 {
+// CHECK:   [[VQRSHLS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uqrshl.i32(i32 %a, i32 %b) #4
+// CHECK:   ret i32 [[VQRSHLS_U32_I]]
 uint32_t test_vqrshls_u32(uint32_t a, uint32_t b) {
   return vqrshls_u32(a, b);
-// CHECK: uqrshl {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
 }
 
-// CHECK-LABEL: test_vqrshld_u64
+// CHECK-LABEL: define i64 @test_vqrshld_u64(i64 %a, i64 %b) #0 {
+// CHECK:   [[VQRSHLD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uqrshl.i64(i64 %a, i64 %b) #4
+// CHECK:   ret i64 [[VQRSHLD_U64_I]]
 uint64_t test_vqrshld_u64(uint64_t a, uint64_t b) {
   return vqrshld_u64(a, b);
-// CHECK: uqrshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
-// CHECK-LABEL: test_vpaddd_s64
+// CHECK-LABEL: define i64 @test_vpaddd_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VPADDD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> [[TMP1]]) #4
+// CHECK:   ret i64 [[VPADDD_S64_I]]
 int64_t test_vpaddd_s64(int64x2_t a) {
   return vpaddd_s64(a);
-// CHECK: addp {{d[0-9]+}}, {{v[0-9]+}}.2d
 }
 
-// CHECK-LABEL: test_vpadds_f32
+// CHECK-LABEL: define float @test_vpadds_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[LANE0_I:%.*]] = extractelement <2 x float> [[TMP1]], i64 0
+// CHECK:   [[LANE1_I:%.*]] = extractelement <2 x float> [[TMP1]], i64 1
+// CHECK:   [[VPADDD_I:%.*]] = fadd float [[LANE0_I]], [[LANE1_I]]
+// CHECK:   ret float [[VPADDD_I]]
 float32_t test_vpadds_f32(float32x2_t a) {
   return vpadds_f32(a);
-// CHECK: faddp {{s[0-9]+}}, {{v[0-9]+}}.2s
 }
 
-// CHECK-LABEL: test_vpaddd_f64
+// CHECK-LABEL: define double @test_vpaddd_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[LANE0_I:%.*]] = extractelement <2 x double> [[TMP1]], i64 0
+// CHECK:   [[LANE1_I:%.*]] = extractelement <2 x double> [[TMP1]], i64 1
+// CHECK:   [[VPADDD_I:%.*]] = fadd double [[LANE0_I]], [[LANE1_I]]
+// CHECK:   ret double [[VPADDD_I]]
 float64_t test_vpaddd_f64(float64x2_t a) {
   return vpaddd_f64(a);
-// CHECK: faddp {{d[0-9]+}}, {{v[0-9]+}}.2d
 }
 
-// CHECK-LABEL: test_vpmaxnms_f32
+// CHECK-LABEL: define float @test_vpmaxnms_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VPMAXNMS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxnmv.f32.v2f32(<2 x float> [[TMP1]]) #4
+// CHECK:   ret float [[VPMAXNMS_F32_I]]
 float32_t test_vpmaxnms_f32(float32x2_t a) {
   return vpmaxnms_f32(a);
-// CHECK: fmaxnmp {{s[0-9]+}}, {{v[0-9]+}}.2s
 }
 
-// CHECK-LABEL: test_vpmaxnmqd_f64
+// CHECK-LABEL: define double @test_vpmaxnmqd_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VPMAXNMQD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmaxnmv.f64.v2f64(<2 x double> [[TMP1]]) #4
+// CHECK:   ret double [[VPMAXNMQD_F64_I]]
 float64_t test_vpmaxnmqd_f64(float64x2_t a) {
   return vpmaxnmqd_f64(a);
-// CHECK: fmaxnmp {{d[0-9]+}}, {{v[0-9]+}}.2d
 }
 
-// CHECK-LABEL: test_vpmaxs_f32
+// CHECK-LABEL: define float @test_vpmaxs_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VPMAXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxv.f32.v2f32(<2 x float> [[TMP1]]) #4
+// CHECK:   ret float [[VPMAXS_F32_I]]
 float32_t test_vpmaxs_f32(float32x2_t a) {
   return vpmaxs_f32(a);
-// CHECK: fmaxp {{s[0-9]+}}, {{v[0-9]+}}.2s
 }
 
-// CHECK-LABEL: test_vpmaxqd_f64
+// CHECK-LABEL: define double @test_vpmaxqd_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VPMAXQD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmaxv.f64.v2f64(<2 x double> [[TMP1]]) #4
+// CHECK:   ret double [[VPMAXQD_F64_I]]
 float64_t test_vpmaxqd_f64(float64x2_t a) {
   return vpmaxqd_f64(a);
-// CHECK: fmaxp {{d[0-9]+}}, {{v[0-9]+}}.2d
 }
 
-// CHECK-LABEL: test_vpminnms_f32
+// CHECK-LABEL: define float @test_vpminnms_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VPMINNMS_F32_I:%.*]] = call float @llvm.aarch64.neon.fminnmv.f32.v2f32(<2 x float> [[TMP1]]) #4
+// CHECK:   ret float [[VPMINNMS_F32_I]]
 float32_t test_vpminnms_f32(float32x2_t a) {
   return vpminnms_f32(a);
-// CHECK: fminnmp {{s[0-9]+}}, {{v[0-9]+}}.2s
 }
 
-// CHECK-LABEL: test_vpminnmqd_f64
+// CHECK-LABEL: define double @test_vpminnmqd_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VPMINNMQD_F64_I:%.*]] = call double @llvm.aarch64.neon.fminnmv.f64.v2f64(<2 x double> [[TMP1]]) #4
+// CHECK:   ret double [[VPMINNMQD_F64_I]]
 float64_t test_vpminnmqd_f64(float64x2_t a) {
   return vpminnmqd_f64(a);
-// CHECK: fminnmp {{d[0-9]+}}, {{v[0-9]+}}.2d
 }
 
-// CHECK-LABEL: test_vpmins_f32
+// CHECK-LABEL: define float @test_vpmins_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VPMINS_F32_I:%.*]] = call float @llvm.aarch64.neon.fminv.f32.v2f32(<2 x float> [[TMP1]]) #4
+// CHECK:   ret float [[VPMINS_F32_I]]
 float32_t test_vpmins_f32(float32x2_t a) {
   return vpmins_f32(a);
-// CHECK: fminp {{s[0-9]+}}, {{v[0-9]+}}.2s
 }
 
-// CHECK-LABEL: test_vpminqd_f64
+// CHECK-LABEL: define double @test_vpminqd_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VPMINQD_F64_I:%.*]] = call double @llvm.aarch64.neon.fminv.f64.v2f64(<2 x double> [[TMP1]]) #4
+// CHECK:   ret double [[VPMINQD_F64_I]]
 float64_t test_vpminqd_f64(float64x2_t a) {
   return vpminqd_f64(a);
-// CHECK: fminp {{d[0-9]+}}, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define i16 @test_vqdmulhh_s16(i16 %a, i16 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
+// CHECK:   [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
+// CHECK:   [[VQDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4
+// CHECK:   [[TMP2:%.*]] = extractelement <4 x i16> [[VQDMULHH_S16_I]], i64 0
+// CHECK:   ret i16 [[TMP2]]
 int16_t test_vqdmulhh_s16(int16_t a, int16_t b) {
-// CHECK-LABEL: test_vqdmulhh_s16
   return vqdmulhh_s16(a, b);
-// CHECK: sqdmulh {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}
 }
 
+// CHECK-LABEL: define i32 @test_vqdmulhs_s32(i32 %a, i32 %b) #0 {
+// CHECK:   [[VQDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %a, i32 %b) #4
+// CHECK:   ret i32 [[VQDMULHS_S32_I]]
 int32_t test_vqdmulhs_s32(int32_t a, int32_t b) {
-// CHECK-LABEL: test_vqdmulhs_s32
   return vqdmulhs_s32(a, b);
-// CHECK: sqdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
 }
 
+// CHECK-LABEL: define i16 @test_vqrdmulhh_s16(i16 %a, i16 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
+// CHECK:   [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
+// CHECK:   [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4
+// CHECK:   [[TMP2:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0
+// CHECK:   ret i16 [[TMP2]]
 int16_t test_vqrdmulhh_s16(int16_t a, int16_t b) {
-// CHECK-LABEL: test_vqrdmulhh_s16
   return vqrdmulhh_s16(a, b);
-// CHECK: sqrdmulh {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}
 }
 
+// CHECK-LABEL: define i32 @test_vqrdmulhs_s32(i32 %a, i32 %b) #0 {
+// CHECK:   [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %a, i32 %b) #4
+// CHECK:   ret i32 [[VQRDMULHS_S32_I]]
 int32_t test_vqrdmulhs_s32(int32_t a, int32_t b) {
-// CHECK-LABEL: test_vqrdmulhs_s32
   return vqrdmulhs_s32(a, b);
-// CHECK: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
 }
 
+// CHECK-LABEL: define float @test_vmulxs_f32(float %a, float %b) #0 {
+// CHECK:   [[VMULXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmulx.f32(float %a, float %b) #4
+// CHECK:   ret float [[VMULXS_F32_I]]
 float32_t test_vmulxs_f32(float32_t a, float32_t b) {
-// CHECK-LABEL: test_vmulxs_f32
   return vmulxs_f32(a, b);
-// CHECK: fmulx {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
 }
 
+// CHECK-LABEL: define double @test_vmulxd_f64(double %a, double %b) #0 {
+// CHECK:   [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double %a, double %b) #4
+// CHECK:   ret double [[VMULXD_F64_I]]
 float64_t test_vmulxd_f64(float64_t a, float64_t b) {
-// CHECK-LABEL: test_vmulxd_f64
   return vmulxd_f64(a, b);
-// CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: define <1 x double> @test_vmulx_f64(<1 x double> %a, <1 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
+// CHECK:   [[VMULX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[VMULX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK:   [[VMULX2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fmulx.v1f64(<1 x double> [[VMULX_I]], <1 x double> [[VMULX1_I]]) #4
+// CHECK:   ret <1 x double> [[VMULX2_I]]
 float64x1_t test_vmulx_f64(float64x1_t a, float64x1_t b) {
-// CHECK-LABEL: test_vmulx_f64
   return vmulx_f64(a, b);
-// CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: define float @test_vrecpss_f32(float %a, float %b) #0 {
+// CHECK:   [[VRECPS_I:%.*]] = call float @llvm.aarch64.neon.frecps.f32(float %a, float %b) #4
+// CHECK:   ret float [[VRECPS_I]]
 float32_t test_vrecpss_f32(float32_t a, float32_t b) {
-// CHECK-LABEL: test_vrecpss_f32
   return vrecpss_f32(a, b);
-// CHECK: frecps {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
 }
 
+// CHECK-LABEL: define double @test_vrecpsd_f64(double %a, double %b) #0 {
+// CHECK:   [[VRECPS_I:%.*]] = call double @llvm.aarch64.neon.frecps.f64(double %a, double %b) #4
+// CHECK:   ret double [[VRECPS_I]]
 float64_t test_vrecpsd_f64(float64_t a, float64_t b) {
-// CHECK-LABEL: test_vrecpsd_f64
   return vrecpsd_f64(a, b);
-// CHECK: frecps {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: define float @test_vrsqrtss_f32(float %a, float %b) #0 {
+// CHECK:   [[VRSQRTSS_F32_I:%.*]] = call float @llvm.aarch64.neon.frsqrts.f32(float %a, float %b) #4
+// CHECK:   ret float [[VRSQRTSS_F32_I]]
 float32_t test_vrsqrtss_f32(float32_t a, float32_t b) {
-// CHECK-LABEL: test_vrsqrtss_f32
   return vrsqrtss_f32(a, b);
-// CHECK: frsqrts {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
 }
 
+// CHECK-LABEL: define double @test_vrsqrtsd_f64(double %a, double %b) #0 {
+// CHECK:   [[VRSQRTSD_F64_I:%.*]] = call double @llvm.aarch64.neon.frsqrts.f64(double %a, double %b) #4
+// CHECK:   ret double [[VRSQRTSD_F64_I]]
 float64_t test_vrsqrtsd_f64(float64_t a, float64_t b) {
-// CHECK-LABEL: test_vrsqrtsd_f64
   return vrsqrtsd_f64(a, b);
-// CHECK: frsqrts {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: define float @test_vcvts_f32_s32(i32 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = sitofp i32 %a to float
+// CHECK:   ret float [[TMP0]]
 float32_t test_vcvts_f32_s32(int32_t a) {
-// CHECK-LABEL: test_vcvts_f32_s32
-// CHECK: scvtf {{s[0-9]+}}, {{[ws][0-9]+}}
   return vcvts_f32_s32(a);
 }
 
+// CHECK-LABEL: define double @test_vcvtd_f64_s64(i64 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = sitofp i64 %a to double
+// CHECK:   ret double [[TMP0]]
 float64_t test_vcvtd_f64_s64(int64_t a) {
-// CHECK-LABEL: test_vcvtd_f64_s64
-// CHECK: scvtf {{d[0-9]+}}, {{[dx][0-9]+}}
   return vcvtd_f64_s64(a);
 }
 
+// CHECK-LABEL: define float @test_vcvts_f32_u32(i32 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = uitofp i32 %a to float
+// CHECK:   ret float [[TMP0]]
 float32_t test_vcvts_f32_u32(uint32_t a) {
-// CHECK-LABEL: test_vcvts_f32_u32
-// CHECK: ucvtf {{s[0-9]+}}, {{[ws][0-9]+}}
   return vcvts_f32_u32(a);
 }
 
+// CHECK-LABEL: define double @test_vcvtd_f64_u64(i64 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = uitofp i64 %a to double
+// CHECK:   ret double [[TMP0]]
 float64_t test_vcvtd_f64_u64(uint64_t a) {
-// CHECK-LABEL: test_vcvtd_f64_u64
-// CHECK: ucvtf {{d[0-9]+}}, {{[xd][0-9]+}}
   return vcvtd_f64_u64(a);
 }
 
+// CHECK-LABEL: define float @test_vrecpes_f32(float %a) #0 {
+// CHECK:   [[VRECPES_F32_I:%.*]] = call float @llvm.aarch64.neon.frecpe.f32(float %a) #4
+// CHECK:   ret float [[VRECPES_F32_I]]
 float32_t test_vrecpes_f32(float32_t a) {
-// CHECK-LABEL: test_vrecpes_f32
-// CHECK: frecpe {{s[0-9]+}}, {{s[0-9]+}}
   return vrecpes_f32(a);
 }
  
+// CHECK-LABEL: define double @test_vrecped_f64(double %a) #0 {
+// CHECK:   [[VRECPED_F64_I:%.*]] = call double @llvm.aarch64.neon.frecpe.f64(double %a) #4
+// CHECK:   ret double [[VRECPED_F64_I]]
 float64_t test_vrecped_f64(float64_t a) {
-// CHECK-LABEL: test_vrecped_f64
-// CHECK: frecpe {{d[0-9]+}}, {{d[0-9]+}}
   return vrecped_f64(a);
 }
  
+// CHECK-LABEL: define float @test_vrecpxs_f32(float %a) #0 {
+// CHECK:   [[VRECPXS_F32_I:%.*]] = call float @llvm.aarch64.neon.frecpx.f32(float %a) #4
+// CHECK:   ret float [[VRECPXS_F32_I]]
 float32_t test_vrecpxs_f32(float32_t a) {
-// CHECK-LABEL: test_vrecpxs_f32
-// CHECK: frecpx {{s[0-9]+}}, {{s[0-9]+}}
   return vrecpxs_f32(a);
  }
  
+// CHECK-LABEL: define double @test_vrecpxd_f64(double %a) #0 {
+// CHECK:   [[VRECPXD_F64_I:%.*]] = call double @llvm.aarch64.neon.frecpx.f64(double %a) #4
+// CHECK:   ret double [[VRECPXD_F64_I]]
 float64_t test_vrecpxd_f64(float64_t a) {
-// CHECK-LABEL: test_vrecpxd_f64
-// CHECK: frecpx {{d[0-9]+}}, {{d[0-9]+}}
   return vrecpxd_f64(a);
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vrsqrte_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VRSQRTE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VRSQRTE_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.ursqrte.v2i32(<2 x i32> [[VRSQRTE_V_I]]) #4
+// CHECK:   ret <2 x i32> [[VRSQRTE_V1_I]]
 uint32x2_t test_vrsqrte_u32(uint32x2_t a) {
-// CHECK-LABEL: test_vrsqrte_u32
-// CHECK: ursqrte {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
   return vrsqrte_u32(a);
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vrsqrteq_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VRSQRTEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VRSQRTEQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.ursqrte.v4i32(<4 x i32> [[VRSQRTEQ_V_I]]) #4
+// CHECK:   ret <4 x i32> [[VRSQRTEQ_V1_I]]
 uint32x4_t test_vrsqrteq_u32(uint32x4_t a) {
-// CHECK-LABEL: test_vrsqrteq_u32
-// CHECK: ursqrte {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
   return vrsqrteq_u32(a);
 }
 
+// CHECK-LABEL: define float @test_vrsqrtes_f32(float %a) #0 {
+// CHECK:   [[VRSQRTES_F32_I:%.*]] = call float @llvm.aarch64.neon.frsqrte.f32(float %a) #4
+// CHECK:   ret float [[VRSQRTES_F32_I]]
 float32_t test_vrsqrtes_f32(float32_t a) {
-// CHECK: vrsqrtes_f32
-// CHECK: frsqrte {{s[0-9]+}}, {{s[0-9]+}}
   return vrsqrtes_f32(a);
 }
 
+// CHECK-LABEL: define double @test_vrsqrted_f64(double %a) #0 {
+// CHECK:   [[VRSQRTED_F64_I:%.*]] = call double @llvm.aarch64.neon.frsqrte.f64(double %a) #4
+// CHECK:   ret double [[VRSQRTED_F64_I]]
 float64_t test_vrsqrted_f64(float64_t a) {
-// CHECK: vrsqrted_f64
-// CHECK: frsqrte {{d[0-9]+}}, {{d[0-9]+}}
   return vrsqrted_f64(a);
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vld1q_u8(i8* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]]
+// CHECK:   ret <16 x i8> [[TMP1]]
 uint8x16_t test_vld1q_u8(uint8_t const *a) {
-  // CHECK-LABEL: test_vld1q_u8
   return vld1q_u8(a);
-  // CHECK: {{ld1 { v[0-9]+.16b }|ldr q[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vld1q_u16(i16* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]]
+// CHECK:   ret <8 x i16> [[TMP2]]
 uint16x8_t test_vld1q_u16(uint16_t const *a) {
-  // CHECK-LABEL: test_vld1q_u16
   return vld1q_u16(a);
-  // CHECK: {{ld1 { v[0-9]+.8h }|ldr q[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vld1q_u32(i32* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
+// CHECK:   [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]]
+// CHECK:   ret <4 x i32> [[TMP2]]
 uint32x4_t test_vld1q_u32(uint32_t const *a) {
-  // CHECK-LABEL: test_vld1q_u32
   return vld1q_u32(a);
-  // CHECK: {{ld1 { v[0-9]+.4s }|ldr q[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vld1q_u64(i64* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x i64>*
+// CHECK:   [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]]
+// CHECK:   ret <2 x i64> [[TMP2]]
 uint64x2_t test_vld1q_u64(uint64_t const *a) {
-  // CHECK-LABEL: test_vld1q_u64
   return vld1q_u64(a);
-  // CHECK: {{ld1 { v[0-9]+.2d }|ldr q[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vld1q_s8(i8* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]]
+// CHECK:   ret <16 x i8> [[TMP1]]
 int8x16_t test_vld1q_s8(int8_t const *a) {
-  // CHECK-LABEL: test_vld1q_s8
   return vld1q_s8(a);
-  // CHECK: {{ld1 { v[0-9]+.16b }|ldr q[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vld1q_s16(i16* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]]
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vld1q_s16(int16_t const *a) {
-  // CHECK-LABEL: test_vld1q_s16
   return vld1q_s16(a);
-  // CHECK: {{ld1 { v[0-9]+.8h }|ldr q[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vld1q_s32(i32* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
+// CHECK:   [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]]
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vld1q_s32(int32_t const *a) {
-  // CHECK-LABEL: test_vld1q_s32
   return vld1q_s32(a);
-  // CHECK: {{ld1 { v[0-9]+.4s }|ldr q[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vld1q_s64(i64* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x i64>*
+// CHECK:   [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]]
+// CHECK:   ret <2 x i64> [[TMP2]]
 int64x2_t test_vld1q_s64(int64_t const *a) {
-  // CHECK-LABEL: test_vld1q_s64
   return vld1q_s64(a);
-  // CHECK: {{ld1 { v[0-9]+.2d }|ldr q[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <8 x half> @test_vld1q_f16(half* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <8 x half>
+// CHECK:   ret <8 x half> [[TMP3]]
 float16x8_t test_vld1q_f16(float16_t const *a) {
-  // CHECK-LABEL: test_vld1q_f16
   return vld1q_f16(a);
-  // CHECK: {{ld1 { v[0-9]+.8h }|ldr q[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vld1q_f32(float* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x float>*
+// CHECK:   [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]]
+// CHECK:   ret <4 x float> [[TMP2]]
 float32x4_t test_vld1q_f32(float32_t const *a) {
-  // CHECK-LABEL: test_vld1q_f32
   return vld1q_f32(a);
-  // CHECK: {{ld1 { v[0-9]+.4s }|ldr q[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <2 x double> @test_vld1q_f64(double* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x double>*
+// CHECK:   [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]]
+// CHECK:   ret <2 x double> [[TMP2]]
 float64x2_t test_vld1q_f64(float64_t const *a) {
-  // CHECK-LABEL: test_vld1q_f64
   return vld1q_f64(a);
-  // CHECK: {{ld1 { v[0-9]+.2d }|ldr q[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vld1q_p8(i8* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]]
+// CHECK:   ret <16 x i8> [[TMP1]]
 poly8x16_t test_vld1q_p8(poly8_t const *a) {
-  // CHECK-LABEL: test_vld1q_p8
   return vld1q_p8(a);
-  // CHECK: {{ld1 { v[0-9]+.16b }|ldr q[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vld1q_p16(i16* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]]
+// CHECK:   ret <8 x i16> [[TMP2]]
 poly16x8_t test_vld1q_p16(poly16_t const *a) {
-  // CHECK-LABEL: test_vld1q_p16
   return vld1q_p16(a);
-  // CHECK: {{ld1 { v[0-9]+.8h }|ldr q[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vld1_u8(i8* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]]
+// CHECK:   ret <8 x i8> [[TMP1]]
 uint8x8_t test_vld1_u8(uint8_t const *a) {
-  // CHECK-LABEL: test_vld1_u8
   return vld1_u8(a);
-  // CHECK: {{ld1 { v[0-9]+.8b }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vld1_u16(i16* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]]
+// CHECK:   ret <4 x i16> [[TMP2]]
 uint16x4_t test_vld1_u16(uint16_t const *a) {
-  // CHECK-LABEL: test_vld1_u16
   return vld1_u16(a);
-  // CHECK: {{ld1 { v[0-9]+.4h }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vld1_u32(i32* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
+// CHECK:   [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]]
+// CHECK:   ret <2 x i32> [[TMP2]]
 uint32x2_t test_vld1_u32(uint32_t const *a) {
-  // CHECK-LABEL: test_vld1_u32
   return vld1_u32(a);
-  // CHECK: {{ld1 { v[0-9]+.2s }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vld1_u64(i64* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <1 x i64>*
+// CHECK:   [[TMP2:%.*]] = load <1 x i64>, <1 x i64>* [[TMP1]]
+// CHECK:   ret <1 x i64> [[TMP2]]
 uint64x1_t test_vld1_u64(uint64_t const *a) {
-  // CHECK-LABEL: test_vld1_u64
   return vld1_u64(a);
-  // CHECK: {{ld1 { v[0-9]+.1d }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vld1_s8(i8* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]]
+// CHECK:   ret <8 x i8> [[TMP1]]
 int8x8_t test_vld1_s8(int8_t const *a) {
-  // CHECK-LABEL: test_vld1_s8
   return vld1_s8(a);
-  // CHECK: {{ld1 { v[0-9]+.8b }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vld1_s16(i16* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]]
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vld1_s16(int16_t const *a) {
-  // CHECK-LABEL: test_vld1_s16
   return vld1_s16(a);
-  // CHECK: {{ld1 { v[0-9]+.4h }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vld1_s32(i32* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
+// CHECK:   [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]]
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vld1_s32(int32_t const *a) {
-  // CHECK-LABEL: test_vld1_s32
   return vld1_s32(a);
-  // CHECK: {{ld1 { v[0-9]+.2s }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vld1_s64(i64* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <1 x i64>*
+// CHECK:   [[TMP2:%.*]] = load <1 x i64>, <1 x i64>* [[TMP1]]
+// CHECK:   ret <1 x i64> [[TMP2]]
 int64x1_t test_vld1_s64(int64_t const *a) {
-  // CHECK-LABEL: test_vld1_s64
   return vld1_s64(a);
-  // CHECK: {{ld1 { v[0-9]+.1d }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <4 x half> @test_vld1_f16(half* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <4 x half>
+// CHECK:   ret <4 x half> [[TMP3]]
 float16x4_t test_vld1_f16(float16_t const *a) {
-  // CHECK-LABEL: test_vld1_f16
   return vld1_f16(a);
-  // CHECK: {{ld1 { v[0-9]+.4h }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vld1_f32(float* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x float>*
+// CHECK:   [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]]
+// CHECK:   ret <2 x float> [[TMP2]]
 float32x2_t test_vld1_f32(float32_t const *a) {
-  // CHECK-LABEL: test_vld1_f32
   return vld1_f32(a);
-  // CHECK: {{ld1 { v[0-9]+.2s }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <1 x double> @test_vld1_f64(double* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <1 x double>*
+// CHECK:   [[TMP2:%.*]] = load <1 x double>, <1 x double>* [[TMP1]]
+// CHECK:   ret <1 x double> [[TMP2]]
 float64x1_t test_vld1_f64(float64_t const *a) {
-  // CHECK-LABEL: test_vld1_f64
   return vld1_f64(a);
-  // CHECK: {{ld1 { v[0-9]+.1d }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vld1_p8(i8* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]]
+// CHECK:   ret <8 x i8> [[TMP1]]
 poly8x8_t test_vld1_p8(poly8_t const *a) {
-  // CHECK-LABEL: test_vld1_p8
   return vld1_p8(a);
-  // CHECK: {{ld1 { v[0-9]+.8b }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vld1_p16(i16* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[TMP2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP1]]
+// CHECK:   ret <4 x i16> [[TMP2]]
 poly16x4_t test_vld1_p16(poly16_t const *a) {
-  // CHECK-LABEL: test_vld1_p16
   return vld1_p16(a);
-  // CHECK: {{ld1 { v[0-9]+.4h }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint8x16x2_t @test_vld2q_u8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK:   [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0v16i8(<16 x i8>* [[TMP1]])
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP5:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint8x16x2_t [[TMP5]]
 uint8x16x2_t test_vld2q_u8(uint8_t const *a) {
-  // CHECK-LABEL: test_vld2q_u8
   return vld2q_u8(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint16x8x2_t @test_vld2q_u16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK:   [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint16x8x2_t [[TMP6]]
 uint16x8x2_t test_vld2q_u16(uint16_t const *a) {
-  // CHECK-LABEL: test_vld2q_u16
   return vld2q_u16(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint32x4x2_t @test_vld2q_u32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i32>*
+// CHECK:   [[VLD2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32> } [[VLD2]], { <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint32x4x2_t [[TMP6]]
 uint32x4x2_t test_vld2q_u32(uint32_t const *a) {
-  // CHECK-LABEL: test_vld2q_u32
   return vld2q_u32(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint64x2x2_t @test_vld2q_u64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x2x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x2x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i64>*
+// CHECK:   [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0v2i64(<2 x i64>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64> } [[VLD2]], { <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint64x2x2_t [[TMP6]]
 uint64x2x2_t test_vld2q_u64(uint64_t const *a) {
-  // CHECK-LABEL: test_vld2q_u64
   return vld2q_u64(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int8x16x2_t @test_vld2q_s8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK:   [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0v16i8(<16 x i8>* [[TMP1]])
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP5:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int8x16x2_t [[TMP5]]
 int8x16x2_t test_vld2q_s8(int8_t const *a) {
-  // CHECK-LABEL: test_vld2q_s8
   return vld2q_s8(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int16x8x2_t @test_vld2q_s16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK:   [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int16x8x2_t [[TMP6]]
 int16x8x2_t test_vld2q_s16(int16_t const *a) {
-  // CHECK-LABEL: test_vld2q_s16
   return vld2q_s16(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int32x4x2_t @test_vld2q_s32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i32>*
+// CHECK:   [[VLD2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32> } [[VLD2]], { <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int32x4x2_t [[TMP6]]
 int32x4x2_t test_vld2q_s32(int32_t const *a) {
-  // CHECK-LABEL: test_vld2q_s32
   return vld2q_s32(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int64x2x2_t @test_vld2q_s64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x2x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x2x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i64>*
+// CHECK:   [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0v2i64(<2 x i64>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64> } [[VLD2]], { <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int64x2x2_t, %struct.int64x2x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int64x2x2_t [[TMP6]]
 int64x2x2_t test_vld2q_s64(int64_t const *a) {
-  // CHECK-LABEL: test_vld2q_s64
   return vld2q_s64(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float16x8x2_t @test_vld2q_f16(half* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK:   [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float16x8x2_t, %struct.float16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float16x8x2_t [[TMP6]]
 float16x8x2_t test_vld2q_f16(float16_t const *a) {
-  // CHECK-LABEL: test_vld2q_f16
   return vld2q_f16(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float32x4x2_t @test_vld2q_f32(float* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x float>*
+// CHECK:   [[VLD2:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0v4f32(<4 x float>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float> }*
+// CHECK:   store { <4 x float>, <4 x float> } [[VLD2]], { <4 x float>, <4 x float> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float32x4x2_t [[TMP6]]
 float32x4x2_t test_vld2q_f32(float32_t const *a) {
-  // CHECK-LABEL: test_vld2q_f32
   return vld2q_f32(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float64x2x2_t @test_vld2q_f64(double* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x2x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x2x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x double>*
+// CHECK:   [[VLD2:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2.v2f64.p0v2f64(<2 x double>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x double>, <2 x double> }*
+// CHECK:   store { <2 x double>, <2 x double> } [[VLD2]], { <2 x double>, <2 x double> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float64x2x2_t, %struct.float64x2x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float64x2x2_t [[TMP6]]
 float64x2x2_t test_vld2q_f64(float64_t const *a) {
-  // CHECK-LABEL: test_vld2q_f64
   return vld2q_f64(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly8x16x2_t @test_vld2q_p8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK:   [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0v16i8(<16 x i8>* [[TMP1]])
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP5:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly8x16x2_t [[TMP5]]
 poly8x16x2_t test_vld2q_p8(poly8_t const *a) {
-  // CHECK-LABEL: test_vld2q_p8
   return vld2q_p8(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly16x8x2_t @test_vld2q_p16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK:   [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly16x8x2_t [[TMP6]]
 poly16x8x2_t test_vld2q_p16(poly16_t const *a) {
-  // CHECK-LABEL: test_vld2q_p16
   return vld2q_p16(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint8x8x2_t @test_vld2_u8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK:   [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>* [[TMP1]])
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP5:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint8x8x2_t [[TMP5]]
 uint8x8x2_t test_vld2_u8(uint8_t const *a) {
-  // CHECK-LABEL: test_vld2_u8
   return vld2_u8(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint16x4x2_t @test_vld2_u16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK:   [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint16x4x2_t [[TMP6]]
 uint16x4x2_t test_vld2_u16(uint16_t const *a) {
-  // CHECK-LABEL: test_vld2_u16
   return vld2_u16(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint32x2x2_t @test_vld2_u32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i32>*
+// CHECK:   [[VLD2:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0v2i32(<2 x i32>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32> } [[VLD2]], { <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint32x2x2_t [[TMP6]]
 uint32x2x2_t test_vld2_u32(uint32_t const *a) {
-  // CHECK-LABEL: test_vld2_u32
   return vld2_u32(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint64x1x2_t @test_vld2_u64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>*
+// CHECK:   [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2.v1i64.p0v1i64(<1 x i64>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint64x1x2_t [[TMP6]]
 uint64x1x2_t test_vld2_u64(uint64_t const *a) {
-  // CHECK-LABEL: test_vld2_u64
   return vld2_u64(a);
-  // CHECK: {{ld1|ld2}} {{{ ?v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int8x8x2_t @test_vld2_s8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK:   [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>* [[TMP1]])
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP5:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int8x8x2_t [[TMP5]]
 int8x8x2_t test_vld2_s8(int8_t const *a) {
-  // CHECK-LABEL: test_vld2_s8
   return vld2_s8(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int16x4x2_t @test_vld2_s16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK:   [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int16x4x2_t [[TMP6]]
 int16x4x2_t test_vld2_s16(int16_t const *a) {
-  // CHECK-LABEL: test_vld2_s16
   return vld2_s16(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int32x2x2_t @test_vld2_s32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i32>*
+// CHECK:   [[VLD2:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0v2i32(<2 x i32>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32> } [[VLD2]], { <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int32x2x2_t [[TMP6]]
 int32x2x2_t test_vld2_s32(int32_t const *a) {
-  // CHECK-LABEL: test_vld2_s32
   return vld2_s32(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int64x1x2_t @test_vld2_s64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>*
+// CHECK:   [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2.v1i64.p0v1i64(<1 x i64>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int64x1x2_t, %struct.int64x1x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int64x1x2_t [[TMP6]]
 int64x1x2_t test_vld2_s64(int64_t const *a) {
-  // CHECK-LABEL: test_vld2_s64
   return vld2_s64(a);
-  // CHECK: {{ld1|ld2}} {{{ ?v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float16x4x2_t @test_vld2_f16(half* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK:   [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float16x4x2_t, %struct.float16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float16x4x2_t [[TMP6]]
 float16x4x2_t test_vld2_f16(float16_t const *a) {
-  // CHECK-LABEL: test_vld2_f16
   return vld2_f16(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float32x2x2_t @test_vld2_f32(float* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x float>*
+// CHECK:   [[VLD2:%.*]] = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2.v2f32.p0v2f32(<2 x float>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float> }*
+// CHECK:   store { <2 x float>, <2 x float> } [[VLD2]], { <2 x float>, <2 x float> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float32x2x2_t [[TMP6]]
 float32x2x2_t test_vld2_f32(float32_t const *a) {
-  // CHECK-LABEL: test_vld2_f32
   return vld2_f32(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float64x1x2_t @test_vld2_f64(double* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x double>*
+// CHECK:   [[VLD2:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2.v1f64.p0v1f64(<1 x double>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x double>, <1 x double> }*
+// CHECK:   store { <1 x double>, <1 x double> } [[VLD2]], { <1 x double>, <1 x double> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x1x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float64x1x2_t, %struct.float64x1x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float64x1x2_t [[TMP6]]
 float64x1x2_t test_vld2_f64(float64_t const *a) {
-  // CHECK-LABEL: test_vld2_f64
   return vld2_f64(a);
-  // CHECK: {{ld1|ld2}} {{{ ?v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly8x8x2_t @test_vld2_p8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK:   [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2.v8i8.p0v8i8(<8 x i8>* [[TMP1]])
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP5:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly8x8x2_t [[TMP5]]
 poly8x8x2_t test_vld2_p8(poly8_t const *a) {
-  // CHECK-LABEL: test_vld2_p8
   return vld2_p8(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly16x4x2_t @test_vld2_p16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK:   [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly16x4x2_t [[TMP6]]
 poly16x4x2_t test_vld2_p16(poly16_t const *a) {
-  // CHECK-LABEL: test_vld2_p16
   return vld2_p16(a);
-  // CHECK: ld2 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint8x16x3_t @test_vld3q_u8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK:   [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0v16i8(<16 x i8>* [[TMP1]])
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP5:%.*]] = load %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint8x16x3_t [[TMP5]]
 uint8x16x3_t test_vld3q_u8(uint8_t const *a) {
-  // CHECK-LABEL: test_vld3q_u8
   return vld3q_u8(a);
-  // CHECK: ld3 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint16x8x3_t @test_vld3q_u16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK:   [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint16x8x3_t [[TMP6]]
 uint16x8x3_t test_vld3q_u16(uint16_t const *a) {
-  // CHECK-LABEL: test_vld3q_u16
   return vld3q_u16(a);
-  // CHECK: ld3 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint32x4x3_t @test_vld3q_u32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i32>*
+// CHECK:   [[VLD3:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint32x4x3_t [[TMP6]]
 uint32x4x3_t test_vld3q_u32(uint32_t const *a) {
-  // CHECK-LABEL: test_vld3q_u32
   return vld3q_u32(a);
-  // CHECK: ld3 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint64x2x3_t @test_vld3q_u64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x2x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x2x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i64>*
+// CHECK:   [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0v2i64(<2 x i64>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint64x2x3_t [[TMP6]]
 uint64x2x3_t test_vld3q_u64(uint64_t const *a) {
-  // CHECK-LABEL: test_vld3q_u64
   return vld3q_u64(a);
-  // CHECK: ld3 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int8x16x3_t @test_vld3q_s8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK:   [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0v16i8(<16 x i8>* [[TMP1]])
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP5:%.*]] = load %struct.int8x16x3_t, %struct.int8x16x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int8x16x3_t [[TMP5]]
 int8x16x3_t test_vld3q_s8(int8_t const *a) {
-  // CHECK-LABEL: test_vld3q_s8
   return vld3q_s8(a);
-  // CHECK: ld3 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int16x8x3_t @test_vld3q_s16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK:   [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int16x8x3_t, %struct.int16x8x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int16x8x3_t [[TMP6]]
 int16x8x3_t test_vld3q_s16(int16_t const *a) {
-  // CHECK-LABEL: test_vld3q_s16
   return vld3q_s16(a);
-  // CHECK: ld3 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int32x4x3_t @test_vld3q_s32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i32>*
+// CHECK:   [[VLD3:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int32x4x3_t, %struct.int32x4x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int32x4x3_t [[TMP6]]
 int32x4x3_t test_vld3q_s32(int32_t const *a) {
-  // CHECK-LABEL: test_vld3q_s32
   return vld3q_s32(a);
-  // CHECK: ld3 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int64x2x3_t @test_vld3q_s64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x2x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x2x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i64>*
+// CHECK:   [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0v2i64(<2 x i64>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int64x2x3_t, %struct.int64x2x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int64x2x3_t [[TMP6]]
 int64x2x3_t test_vld3q_s64(int64_t const *a) {
-  // CHECK-LABEL: test_vld3q_s64
   return vld3q_s64(a);
-  // CHECK: ld3 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float16x8x3_t @test_vld3q_f16(half* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK:   [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float16x8x3_t, %struct.float16x8x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float16x8x3_t [[TMP6]]
 float16x8x3_t test_vld3q_f16(float16_t const *a) {
-  // CHECK-LABEL: test_vld3q_f16
   return vld3q_f16(a);
-  // CHECK: ld3 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float32x4x3_t @test_vld3q_f32(float* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x float>*
+// CHECK:   [[VLD3:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3.v4f32.p0v4f32(<4 x float>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float> }*
+// CHECK:   store { <4 x float>, <4 x float>, <4 x float> } [[VLD3]], { <4 x float>, <4 x float>, <4 x float> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float32x4x3_t, %struct.float32x4x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float32x4x3_t [[TMP6]]
 float32x4x3_t test_vld3q_f32(float32_t const *a) {
-  // CHECK-LABEL: test_vld3q_f32
   return vld3q_f32(a);
-  // CHECK: ld3 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float64x2x3_t @test_vld3q_f64(double* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x2x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x2x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x double>*
+// CHECK:   [[VLD3:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3.v2f64.p0v2f64(<2 x double>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x double>, <2 x double>, <2 x double> }*
+// CHECK:   store { <2 x double>, <2 x double>, <2 x double> } [[VLD3]], { <2 x double>, <2 x double>, <2 x double> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float64x2x3_t, %struct.float64x2x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float64x2x3_t [[TMP6]]
 float64x2x3_t test_vld3q_f64(float64_t const *a) {
-  // CHECK-LABEL: test_vld3q_f64
   return vld3q_f64(a);
-  // CHECK: ld3 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly8x16x3_t @test_vld3q_p8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK:   [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3.v16i8.p0v16i8(<16 x i8>* [[TMP1]])
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP5:%.*]] = load %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly8x16x3_t [[TMP5]]
 poly8x16x3_t test_vld3q_p8(poly8_t const *a) {
-  // CHECK-LABEL: test_vld3q_p8
   return vld3q_p8(a);
-  // CHECK: ld3 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly16x8x3_t @test_vld3q_p16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK:   [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly16x8x3_t [[TMP6]]
 poly16x8x3_t test_vld3q_p16(poly16_t const *a) {
-  // CHECK-LABEL: test_vld3q_p16
   return vld3q_p16(a);
-  // CHECK: ld3 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint8x8x3_t @test_vld3_u8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK:   [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0v8i8(<8 x i8>* [[TMP1]])
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP5:%.*]] = load %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint8x8x3_t [[TMP5]]
 uint8x8x3_t test_vld3_u8(uint8_t const *a) {
-  // CHECK-LABEL: test_vld3_u8
   return vld3_u8(a);
-  // CHECK: ld3 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint16x4x3_t @test_vld3_u16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK:   [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint16x4x3_t [[TMP6]]
 uint16x4x3_t test_vld3_u16(uint16_t const *a) {
-  // CHECK-LABEL: test_vld3_u16
   return vld3_u16(a);
-  // CHECK: ld3 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint32x2x3_t @test_vld3_u32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i32>*
+// CHECK:   [[VLD3:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3.v2i32.p0v2i32(<2 x i32>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint32x2x3_t [[TMP6]]
 uint32x2x3_t test_vld3_u32(uint32_t const *a) {
-  // CHECK-LABEL: test_vld3_u32
   return vld3_u32(a);
-  // CHECK: ld3 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint64x1x3_t @test_vld3_u64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>*
+// CHECK:   [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3.v1i64.p0v1i64(<1 x i64>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint64x1x3_t [[TMP6]]
 uint64x1x3_t test_vld3_u64(uint64_t const *a) {
-  // CHECK-LABEL: test_vld3_u64
   return vld3_u64(a);
-  // CHECK: {{ld1|ld3}} {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int8x8x3_t @test_vld3_s8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK:   [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0v8i8(<8 x i8>* [[TMP1]])
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP5:%.*]] = load %struct.int8x8x3_t, %struct.int8x8x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int8x8x3_t [[TMP5]]
 int8x8x3_t test_vld3_s8(int8_t const *a) {
-  // CHECK-LABEL: test_vld3_s8
   return vld3_s8(a);
-  // CHECK: ld3 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int16x4x3_t @test_vld3_s16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK:   [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int16x4x3_t, %struct.int16x4x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int16x4x3_t [[TMP6]]
 int16x4x3_t test_vld3_s16(int16_t const *a) {
-  // CHECK-LABEL: test_vld3_s16
   return vld3_s16(a);
-  // CHECK: ld3 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int32x2x3_t @test_vld3_s32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i32>*
+// CHECK:   [[VLD3:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3.v2i32.p0v2i32(<2 x i32>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int32x2x3_t, %struct.int32x2x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int32x2x3_t [[TMP6]]
 int32x2x3_t test_vld3_s32(int32_t const *a) {
-  // CHECK-LABEL: test_vld3_s32
   return vld3_s32(a);
-  // CHECK: ld3 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int64x1x3_t @test_vld3_s64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>*
+// CHECK:   [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3.v1i64.p0v1i64(<1 x i64>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int64x1x3_t, %struct.int64x1x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int64x1x3_t [[TMP6]]
 int64x1x3_t test_vld3_s64(int64_t const *a) {
-  // CHECK-LABEL: test_vld3_s64
   return vld3_s64(a);
-  // CHECK: {{ld1|ld3}} {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float16x4x3_t @test_vld3_f16(half* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK:   [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float16x4x3_t, %struct.float16x4x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float16x4x3_t [[TMP6]]
 float16x4x3_t test_vld3_f16(float16_t const *a) {
-  // CHECK-LABEL: test_vld3_f16
   return vld3_f16(a);
-  // CHECK: ld3 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float32x2x3_t @test_vld3_f32(float* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x float>*
+// CHECK:   [[VLD3:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3.v2f32.p0v2f32(<2 x float>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float> }*
+// CHECK:   store { <2 x float>, <2 x float>, <2 x float> } [[VLD3]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float32x2x3_t, %struct.float32x2x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float32x2x3_t [[TMP6]]
 float32x2x3_t test_vld3_f32(float32_t const *a) {
-  // CHECK-LABEL: test_vld3_f32
   return vld3_f32(a);
-  // CHECK: ld3 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float64x1x3_t @test_vld3_f64(double* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x1x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x1x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x double>*
+// CHECK:   [[VLD3:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3.v1f64.p0v1f64(<1 x double>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x double>, <1 x double>, <1 x double> }*
+// CHECK:   store { <1 x double>, <1 x double>, <1 x double> } [[VLD3]], { <1 x double>, <1 x double>, <1 x double> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x1x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float64x1x3_t, %struct.float64x1x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float64x1x3_t [[TMP6]]
 float64x1x3_t test_vld3_f64(float64_t const *a) {
-  // CHECK-LABEL: test_vld3_f64
   return vld3_f64(a);
-  // CHECK: {{ld1|ld3}} {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly8x8x3_t @test_vld3_p8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK:   [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3.v8i8.p0v8i8(<8 x i8>* [[TMP1]])
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP5:%.*]] = load %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly8x8x3_t [[TMP5]]
 poly8x8x3_t test_vld3_p8(poly8_t const *a) {
-  // CHECK-LABEL: test_vld3_p8
   return vld3_p8(a);
-  // CHECK: ld3 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly16x4x3_t @test_vld3_p16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK:   [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly16x4x3_t [[TMP6]]
 poly16x4x3_t test_vld3_p16(poly16_t const *a) {
-  // CHECK-LABEL: test_vld3_p16
   return vld3_p16(a);
-  // CHECK: ld3 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint8x16x4_t @test_vld4q_u8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK:   [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0v16i8(<16 x i8>* [[TMP1]])
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP5:%.*]] = load %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint8x16x4_t [[TMP5]]
 uint8x16x4_t test_vld4q_u8(uint8_t const *a) {
-  // CHECK-LABEL: test_vld4q_u8
   return vld4q_u8(a);
-  // CHECK: ld4 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint16x8x4_t @test_vld4q_u16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK:   [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint16x8x4_t [[TMP6]]
 uint16x8x4_t test_vld4q_u16(uint16_t const *a) {
-  // CHECK-LABEL: test_vld4q_u16
   return vld4q_u16(a);
-  // CHECK: ld4 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint32x4x4_t @test_vld4q_u32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i32>*
+// CHECK:   [[VLD4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint32x4x4_t [[TMP6]]
 uint32x4x4_t test_vld4q_u32(uint32_t const *a) {
-  // CHECK-LABEL: test_vld4q_u32
   return vld4q_u32(a);
-  // CHECK: ld4 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint64x2x4_t @test_vld4q_u64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x2x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x2x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i64>*
+// CHECK:   [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0v2i64(<2 x i64>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint64x2x4_t [[TMP6]]
 uint64x2x4_t test_vld4q_u64(uint64_t const *a) {
-  // CHECK-LABEL: test_vld4q_u64
   return vld4q_u64(a);
-  // CHECK: ld4 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int8x16x4_t @test_vld4q_s8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK:   [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0v16i8(<16 x i8>* [[TMP1]])
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP5:%.*]] = load %struct.int8x16x4_t, %struct.int8x16x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int8x16x4_t [[TMP5]]
 int8x16x4_t test_vld4q_s8(int8_t const *a) {
-  // CHECK-LABEL: test_vld4q_s8
   return vld4q_s8(a);
-  // CHECK: ld4 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int16x8x4_t @test_vld4q_s16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK:   [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int16x8x4_t, %struct.int16x8x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int16x8x4_t [[TMP6]]
 int16x8x4_t test_vld4q_s16(int16_t const *a) {
-  // CHECK-LABEL: test_vld4q_s16
   return vld4q_s16(a);
-  // CHECK: ld4 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int32x4x4_t @test_vld4q_s32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i32>*
+// CHECK:   [[VLD4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int32x4x4_t, %struct.int32x4x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int32x4x4_t [[TMP6]]
 int32x4x4_t test_vld4q_s32(int32_t const *a) {
-  // CHECK-LABEL: test_vld4q_s32
   return vld4q_s32(a);
-  // CHECK: ld4 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int64x2x4_t @test_vld4q_s64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x2x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x2x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i64>*
+// CHECK:   [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0v2i64(<2 x i64>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int64x2x4_t, %struct.int64x2x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int64x2x4_t [[TMP6]]
 int64x2x4_t test_vld4q_s64(int64_t const *a) {
-  // CHECK-LABEL: test_vld4q_s64
   return vld4q_s64(a);
-  // CHECK: ld4 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float16x8x4_t @test_vld4q_f16(half* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK:   [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float16x8x4_t, %struct.float16x8x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float16x8x4_t [[TMP6]]
 float16x8x4_t test_vld4q_f16(float16_t const *a) {
-  // CHECK-LABEL: test_vld4q_f16
   return vld4q_f16(a);
-  // CHECK: ld4 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float32x4x4_t @test_vld4q_f32(float* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x float>*
+// CHECK:   [[VLD4:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4.v4f32.p0v4f32(<4 x float>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float>, <4 x float> }*
+// CHECK:   store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4]], { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float32x4x4_t, %struct.float32x4x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float32x4x4_t [[TMP6]]
 float32x4x4_t test_vld4q_f32(float32_t const *a) {
-  // CHECK-LABEL: test_vld4q_f32
   return vld4q_f32(a);
-  // CHECK: ld4 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float64x2x4_t @test_vld4q_f64(double* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x2x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x2x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x double>*
+// CHECK:   [[VLD4:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4.v2f64.p0v2f64(<2 x double>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x double>, <2 x double>, <2 x double>, <2 x double> }*
+// CHECK:   store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4]], { <2 x double>, <2 x double>, <2 x double>, <2 x double> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float64x2x4_t, %struct.float64x2x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float64x2x4_t [[TMP6]]
 float64x2x4_t test_vld4q_f64(float64_t const *a) {
-  // CHECK-LABEL: test_vld4q_f64
   return vld4q_f64(a);
-  // CHECK: ld4 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly8x16x4_t @test_vld4q_p8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK:   [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0v16i8(<16 x i8>* [[TMP1]])
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP5:%.*]] = load %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly8x16x4_t [[TMP5]]
 poly8x16x4_t test_vld4q_p8(poly8_t const *a) {
-  // CHECK-LABEL: test_vld4q_p8
   return vld4q_p8(a);
-  // CHECK: ld4 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly16x8x4_t @test_vld4q_p16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <8 x i16>*
+// CHECK:   [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4.v8i16.p0v8i16(<8 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly16x8x4_t [[TMP6]]
 poly16x8x4_t test_vld4q_p16(poly16_t const *a) {
-  // CHECK-LABEL: test_vld4q_p16
   return vld4q_p16(a);
-  // CHECK: ld4 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint8x8x4_t @test_vld4_u8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK:   [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0v8i8(<8 x i8>* [[TMP1]])
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP5:%.*]] = load %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint8x8x4_t [[TMP5]]
 uint8x8x4_t test_vld4_u8(uint8_t const *a) {
-  // CHECK-LABEL: test_vld4_u8
   return vld4_u8(a);
-  // CHECK: ld4 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint16x4x4_t @test_vld4_u16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK:   [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint16x4x4_t [[TMP6]]
 uint16x4x4_t test_vld4_u16(uint16_t const *a) {
-  // CHECK-LABEL: test_vld4_u16
   return vld4_u16(a);
-  // CHECK: ld4 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint32x2x4_t @test_vld4_u32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i32>*
+// CHECK:   [[VLD4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4.v2i32.p0v2i32(<2 x i32>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint32x2x4_t [[TMP6]]
 uint32x2x4_t test_vld4_u32(uint32_t const *a) {
-  // CHECK-LABEL: test_vld4_u32
   return vld4_u32(a);
-  // CHECK: ld4 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint64x1x4_t @test_vld4_u64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>*
+// CHECK:   [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4.v1i64.p0v1i64(<1 x i64>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint64x1x4_t [[TMP6]]
 uint64x1x4_t test_vld4_u64(uint64_t const *a) {
-  // CHECK-LABEL: test_vld4_u64
   return vld4_u64(a);
-  // CHECK: {{ld1|ld4}} {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int8x8x4_t @test_vld4_s8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK:   [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0v8i8(<8 x i8>* [[TMP1]])
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP5:%.*]] = load %struct.int8x8x4_t, %struct.int8x8x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int8x8x4_t [[TMP5]]
 int8x8x4_t test_vld4_s8(int8_t const *a) {
-  // CHECK-LABEL: test_vld4_s8
   return vld4_s8(a);
-  // CHECK: ld4 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int16x4x4_t @test_vld4_s16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK:   [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int16x4x4_t, %struct.int16x4x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int16x4x4_t [[TMP6]]
 int16x4x4_t test_vld4_s16(int16_t const *a) {
-  // CHECK-LABEL: test_vld4_s16
   return vld4_s16(a);
-  // CHECK: ld4 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int32x2x4_t @test_vld4_s32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i32>*
+// CHECK:   [[VLD4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4.v2i32.p0v2i32(<2 x i32>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int32x2x4_t, %struct.int32x2x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int32x2x4_t [[TMP6]]
 int32x2x4_t test_vld4_s32(int32_t const *a) {
-  // CHECK-LABEL: test_vld4_s32
   return vld4_s32(a);
-  // CHECK: ld4 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int64x1x4_t @test_vld4_s64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>*
+// CHECK:   [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4.v1i64.p0v1i64(<1 x i64>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int64x1x4_t, %struct.int64x1x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int64x1x4_t [[TMP6]]
 int64x1x4_t test_vld4_s64(int64_t const *a) {
-  // CHECK-LABEL: test_vld4_s64
   return vld4_s64(a);
-  // CHECK: {{ld1|ld4}} {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float16x4x4_t @test_vld4_f16(half* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK:   [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float16x4x4_t, %struct.float16x4x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float16x4x4_t [[TMP6]]
 float16x4x4_t test_vld4_f16(float16_t const *a) {
-  // CHECK-LABEL: test_vld4_f16
   return vld4_f16(a);
-  // CHECK: ld4 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float32x2x4_t @test_vld4_f32(float* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x float>*
+// CHECK:   [[VLD4:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4.v2f32.p0v2f32(<2 x float>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float>, <2 x float> }*
+// CHECK:   store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float32x2x4_t, %struct.float32x2x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float32x2x4_t [[TMP6]]
 float32x2x4_t test_vld4_f32(float32_t const *a) {
-  // CHECK-LABEL: test_vld4_f32
   return vld4_f32(a);
-  // CHECK: ld4 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float64x1x4_t @test_vld4_f64(double* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x1x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x1x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x double>*
+// CHECK:   [[VLD4:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4.v1f64.p0v1f64(<1 x double>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x double>, <1 x double>, <1 x double>, <1 x double> }*
+// CHECK:   store { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4]], { <1 x double>, <1 x double>, <1 x double>, <1 x double> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x1x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float64x1x4_t, %struct.float64x1x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float64x1x4_t [[TMP6]]
 float64x1x4_t test_vld4_f64(float64_t const *a) {
-  // CHECK-LABEL: test_vld4_f64
   return vld4_f64(a);
-  // CHECK: {{ld1|ld4}} {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly8x8x4_t @test_vld4_p8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK:   [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4.v8i8.p0v8i8(<8 x i8>* [[TMP1]])
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP5:%.*]] = load %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly8x8x4_t [[TMP5]]
 poly8x8x4_t test_vld4_p8(poly8_t const *a) {
-  // CHECK-LABEL: test_vld4_p8
   return vld4_p8(a);
-  // CHECK: ld4 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly16x4x4_t @test_vld4_p16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i16>*
+// CHECK:   [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0v4i16(<4 x i16>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly16x4x4_t [[TMP6]]
 poly16x4x4_t test_vld4_p16(poly16_t const *a) {
-  // CHECK-LABEL: test_vld4_p16
   return vld4_p16(a);
-  // CHECK: ld4 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_u8(i8* %a, <16 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK:   store <16 x i8> %b, <16 x i8>* [[TMP0]]
+// CHECK:   ret void
 void test_vst1q_u8(uint8_t *a, uint8x16_t b) {
-  // CHECK-LABEL: test_vst1q_u8
   vst1q_u8(a, b);
-  // CHECK: {{st1 { v[0-9]+.16b }|str q[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_u16(i16* %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   store <8 x i16> [[TMP3]], <8 x i16>* [[TMP2]]
+// CHECK:   ret void
 void test_vst1q_u16(uint16_t *a, uint16x8_t b) {
-  // CHECK-LABEL: test_vst1q_u16
   vst1q_u16(a, b);
-  // CHECK: {{st1 { v[0-9]+.8h }|str q[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_u32(i32* %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   store <4 x i32> [[TMP3]], <4 x i32>* [[TMP2]]
+// CHECK:   ret void
 void test_vst1q_u32(uint32_t *a, uint32x4_t b) {
-  // CHECK-LABEL: test_vst1q_u32
   vst1q_u32(a, b);
-  // CHECK: {{st1 { v[0-9]+.4s }|str q[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_u64(i64* %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <2 x i64>*
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   store <2 x i64> [[TMP3]], <2 x i64>* [[TMP2]]
+// CHECK:   ret void
 void test_vst1q_u64(uint64_t *a, uint64x2_t b) {
-  // CHECK-LABEL: test_vst1q_u64
   vst1q_u64(a, b);
-  // CHECK: {{st1 { v[0-9]+.2d }|str q[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_s8(i8* %a, <16 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK:   store <16 x i8> %b, <16 x i8>* [[TMP0]]
+// CHECK:   ret void
 void test_vst1q_s8(int8_t *a, int8x16_t b) {
-  // CHECK-LABEL: test_vst1q_s8
   vst1q_s8(a, b);
-  // CHECK: {{st1 { v[0-9]+.16b }|str q[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_s16(i16* %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   store <8 x i16> [[TMP3]], <8 x i16>* [[TMP2]]
+// CHECK:   ret void
 void test_vst1q_s16(int16_t *a, int16x8_t b) {
-  // CHECK-LABEL: test_vst1q_s16
   vst1q_s16(a, b);
-  // CHECK: {{st1 { v[0-9]+.8h }|str q[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_s32(i32* %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   store <4 x i32> [[TMP3]], <4 x i32>* [[TMP2]]
+// CHECK:   ret void
 void test_vst1q_s32(int32_t *a, int32x4_t b) {
-  // CHECK-LABEL: test_vst1q_s32
   vst1q_s32(a, b);
-  // CHECK: {{st1 { v[0-9]+.4s }|str q[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_s64(i64* %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <2 x i64>*
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   store <2 x i64> [[TMP3]], <2 x i64>* [[TMP2]]
+// CHECK:   ret void
 void test_vst1q_s64(int64_t *a, int64x2_t b) {
-  // CHECK-LABEL: test_vst1q_s64
   vst1q_s64(a, b);
-  // CHECK: {{st1 { v[0-9]+.2d }|str q[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_f16(half* %a, <8 x half> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   store <8 x i16> [[TMP3]], <8 x i16>* [[TMP2]]
+// CHECK:   ret void
 void test_vst1q_f16(float16_t *a, float16x8_t b) {
-  // CHECK-LABEL: test_vst1q_f16
   vst1q_f16(a, b);
-  // CHECK: {{st1 { v[0-9]+.8h }|str q[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_f32(float* %a, <4 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <4 x float>*
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   store <4 x float> [[TMP3]], <4 x float>* [[TMP2]]
+// CHECK:   ret void
 void test_vst1q_f32(float32_t *a, float32x4_t b) {
-  // CHECK-LABEL: test_vst1q_f32
   vst1q_f32(a, b);
-  // CHECK: {{st1 { v[0-9]+.4s }|str q[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_f64(double* %a, <2 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <2 x double>*
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   store <2 x double> [[TMP3]], <2 x double>* [[TMP2]]
+// CHECK:   ret void
 void test_vst1q_f64(float64_t *a, float64x2_t b) {
-  // CHECK-LABEL: test_vst1q_f64
   vst1q_f64(a, b);
-  // CHECK: {{st1 { v[0-9]+.2d }|str q[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_p8(i8* %a, <16 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i8* %a to <16 x i8>*
+// CHECK:   store <16 x i8> %b, <16 x i8>* [[TMP0]]
+// CHECK:   ret void
 void test_vst1q_p8(poly8_t *a, poly8x16_t b) {
-  // CHECK-LABEL: test_vst1q_p8
   vst1q_p8(a, b);
-  // CHECK: {{st1 { v[0-9]+.16b }|str q[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_p16(i16* %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   store <8 x i16> [[TMP3]], <8 x i16>* [[TMP2]]
+// CHECK:   ret void
 void test_vst1q_p16(poly16_t *a, poly16x8_t b) {
-  // CHECK-LABEL: test_vst1q_p16
   vst1q_p16(a, b);
-  // CHECK: {{st1 { v[0-9]+.8h }|str q[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_u8(i8* %a, <8 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK:   store <8 x i8> %b, <8 x i8>* [[TMP0]]
+// CHECK:   ret void
 void test_vst1_u8(uint8_t *a, uint8x8_t b) {
-  // CHECK-LABEL: test_vst1_u8
   vst1_u8(a, b);
-  // CHECK: {{st1 { v[0-9]+.8b }|str d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_u16(i16* %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   store <4 x i16> [[TMP3]], <4 x i16>* [[TMP2]]
+// CHECK:   ret void
 void test_vst1_u16(uint16_t *a, uint16x4_t b) {
-  // CHECK-LABEL: test_vst1_u16
   vst1_u16(a, b);
-  // CHECK: {{st1 { v[0-9]+.4h }|str d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_u32(i32* %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   store <2 x i32> [[TMP3]], <2 x i32>* [[TMP2]]
+// CHECK:   ret void
 void test_vst1_u32(uint32_t *a, uint32x2_t b) {
-  // CHECK-LABEL: test_vst1_u32
   vst1_u32(a, b);
-  // CHECK: {{st1 { v[0-9]+.2s }|str d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_u64(i64* %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <1 x i64>*
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   store <1 x i64> [[TMP3]], <1 x i64>* [[TMP2]]
+// CHECK:   ret void
 void test_vst1_u64(uint64_t *a, uint64x1_t b) {
-  // CHECK-LABEL: test_vst1_u64
   vst1_u64(a, b);
-  // CHECK: {{st1 { v[0-9]+.1d }|str d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_s8(i8* %a, <8 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK:   store <8 x i8> %b, <8 x i8>* [[TMP0]]
+// CHECK:   ret void
 void test_vst1_s8(int8_t *a, int8x8_t b) {
-  // CHECK-LABEL: test_vst1_s8
   vst1_s8(a, b);
-  // CHECK: {{st1 { v[0-9]+.8b }|str d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_s16(i16* %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   store <4 x i16> [[TMP3]], <4 x i16>* [[TMP2]]
+// CHECK:   ret void
 void test_vst1_s16(int16_t *a, int16x4_t b) {
-  // CHECK-LABEL: test_vst1_s16
   vst1_s16(a, b);
-  // CHECK: {{st1 { v[0-9]+.4h }|str d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_s32(i32* %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   store <2 x i32> [[TMP3]], <2 x i32>* [[TMP2]]
+// CHECK:   ret void
 void test_vst1_s32(int32_t *a, int32x2_t b) {
-  // CHECK-LABEL: test_vst1_s32
   vst1_s32(a, b);
-  // CHECK: {{st1 { v[0-9]+.2s }|str d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_s64(i64* %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <1 x i64>*
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   store <1 x i64> [[TMP3]], <1 x i64>* [[TMP2]]
+// CHECK:   ret void
 void test_vst1_s64(int64_t *a, int64x1_t b) {
-  // CHECK-LABEL: test_vst1_s64
   vst1_s64(a, b);
-  // CHECK: {{st1 { v[0-9]+.1d }|str d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_f16(half* %a, <4 x half> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   store <4 x i16> [[TMP3]], <4 x i16>* [[TMP2]]
+// CHECK:   ret void
 void test_vst1_f16(float16_t *a, float16x4_t b) {
-  // CHECK-LABEL: test_vst1_f16
   vst1_f16(a, b);
-  // CHECK: {{st1 { v[0-9]+.4h }|str d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_f32(float* %a, <2 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <2 x float>*
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   store <2 x float> [[TMP3]], <2 x float>* [[TMP2]]
+// CHECK:   ret void
 void test_vst1_f32(float32_t *a, float32x2_t b) {
-  // CHECK-LABEL: test_vst1_f32
   vst1_f32(a, b);
-  // CHECK: {{st1 { v[0-9]+.2s }|str d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_f64(double* %a, <1 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <1 x double>*
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK:   store <1 x double> [[TMP3]], <1 x double>* [[TMP2]]
+// CHECK:   ret void
 void test_vst1_f64(float64_t *a, float64x1_t b) {
-  // CHECK-LABEL: test_vst1_f64
   vst1_f64(a, b);
-  // CHECK: {{st1 { v[0-9]+.1d }|str d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_p8(i8* %a, <8 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i8* %a to <8 x i8>*
+// CHECK:   store <8 x i8> %b, <8 x i8>* [[TMP0]]
+// CHECK:   ret void
 void test_vst1_p8(poly8_t *a, poly8x8_t b) {
-  // CHECK-LABEL: test_vst1_p8
   vst1_p8(a, b);
-  // CHECK: {{st1 { v[0-9]+.8b }|str d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_p16(i16* %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   store <4 x i16> [[TMP3]], <4 x i16>* [[TMP2]]
+// CHECK:   ret void
 void test_vst1_p16(poly16_t *a, poly16x4_t b) {
-  // CHECK-LABEL: test_vst1_p16
   vst1_p16(a, b);
-  // CHECK: {{st1 { v[0-9]+.4h }|str d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2q_u8(i8* %a, [2 x <16 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st2.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i8* %a)
+// CHECK:   ret void
 void test_vst2q_u8(uint8_t *a, uint8x16x2_t b) {
-  // CHECK-LABEL: test_vst2q_u8
   vst2q_u8(a, b);
-  // CHECK: st2 {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2q_u16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st2.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2q_u16(uint16_t *a, uint16x8x2_t b) {
-  // CHECK-LABEL: test_vst2q_u16
   vst2q_u16(a, b);
-  // CHECK: st2 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2q_u32(i32* %a, [2 x <4 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2q_u32(uint32_t *a, uint32x4x2_t b) {
-  // CHECK-LABEL: test_vst2q_u32
   vst2q_u32(a, b);
-  // CHECK: st2 {{{ ?v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2q_u64(i64* %a, [2 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x2x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x2x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st2.v2i64.p0i8(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2q_u64(uint64_t *a, uint64x2x2_t b) {
-  // CHECK-LABEL: test_vst2q_u64
   vst2q_u64(a, b);
-  // CHECK: st2 {{{ ?v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2q_s8(i8* %a, [2 x <16 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st2.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i8* %a)
+// CHECK:   ret void
 void test_vst2q_s8(int8_t *a, int8x16x2_t b) {
-  // CHECK-LABEL: test_vst2q_s8
   vst2q_s8(a, b);
-  // CHECK: st2 {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2q_s16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st2.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2q_s16(int16_t *a, int16x8x2_t b) {
-  // CHECK-LABEL: test_vst2q_s16
   vst2q_s16(a, b);
-  // CHECK: st2 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2q_s32(i32* %a, [2 x <4 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st2.v4i32.p0i8(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2q_s32(int32_t *a, int32x4x2_t b) {
-  // CHECK-LABEL: test_vst2q_s32
   vst2q_s32(a, b);
-  // CHECK: st2 {{{ ?v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2q_s64(i64* %a, [2 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int64x2x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x2x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st2.v2i64.p0i8(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2q_s64(int64_t *a, int64x2x2_t b) {
-  // CHECK-LABEL: test_vst2q_s64
   vst2q_s64(a, b);
-  // CHECK: st2 {{{ ?v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2q_f16(half* %a, [2 x <8 x half>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x half>] [[B]].coerce, [2 x <8 x half>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st2.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2q_f16(float16_t *a, float16x8x2_t b) {
-  // CHECK-LABEL: test_vst2q_f16
   vst2q_f16(a, b);
-  // CHECK: st2 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2q_f32(float* %a, [2 x <4 x float>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x float>] [[B]].coerce, [2 x <4 x float>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
+// CHECK:   call void @llvm.aarch64.neon.st2.v4f32.p0i8(<4 x float> [[TMP7]], <4 x float> [[TMP8]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2q_f32(float32_t *a, float32x4x2_t b) {
-  // CHECK-LABEL: test_vst2q_f32
   vst2q_f32(a, b);
-  // CHECK: st2 {{{ ?v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2q_f64(double* %a, [2 x <2 x double>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float64x2x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x2x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x double>] [[B]].coerce, [2 x <2 x double>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
+// CHECK:   call void @llvm.aarch64.neon.st2.v2f64.p0i8(<2 x double> [[TMP7]], <2 x double> [[TMP8]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2q_f64(float64_t *a, float64x2x2_t b) {
-  // CHECK-LABEL: test_vst2q_f64
   vst2q_f64(a, b);
-  // CHECK: st2 {{{ ?v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2q_p8(i8* %a, [2 x <16 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st2.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i8* %a)
+// CHECK:   ret void
 void test_vst2q_p8(poly8_t *a, poly8x16x2_t b) {
-  // CHECK-LABEL: test_vst2q_p8
   vst2q_p8(a, b);
-  // CHECK: st2 {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2q_p16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st2.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2q_p16(poly16_t *a, poly16x8x2_t b) {
-  // CHECK-LABEL: test_vst2q_p16
   vst2q_p16(a, b);
-  // CHECK: st2 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2_u8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i8* %a)
+// CHECK:   ret void
 void test_vst2_u8(uint8_t *a, uint8x8x2_t b) {
-  // CHECK-LABEL: test_vst2_u8
   vst2_u8(a, b);
-  // CHECK: st2 {{{ ?v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2_u16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st2.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2_u16(uint16_t *a, uint16x4x2_t b) {
-  // CHECK-LABEL: test_vst2_u16
   vst2_u16(a, b);
-  // CHECK: st2 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2_u32(i32* %a, [2 x <2 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st2.v2i32.p0i8(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2_u32(uint32_t *a, uint32x2x2_t b) {
-  // CHECK-LABEL: test_vst2_u32
   vst2_u32(a, b);
-  // CHECK: st2 {{{ ?v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2_u64(i64* %a, [2 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st2.v1i64.p0i8(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2_u64(uint64_t *a, uint64x1x2_t b) {
-  // CHECK-LABEL: test_vst2_u64
   vst2_u64(a, b);
-  // CHECK: {{st1|st2}} {{{ ?v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2_s8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i8* %a)
+// CHECK:   ret void
 void test_vst2_s8(int8_t *a, int8x8x2_t b) {
-  // CHECK-LABEL: test_vst2_s8
   vst2_s8(a, b);
-  // CHECK: st2 {{{ ?v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2_s16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st2.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2_s16(int16_t *a, int16x4x2_t b) {
-  // CHECK-LABEL: test_vst2_s16
   vst2_s16(a, b);
-  // CHECK: st2 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2_s32(i32* %a, [2 x <2 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st2.v2i32.p0i8(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2_s32(int32_t *a, int32x2x2_t b) {
-  // CHECK-LABEL: test_vst2_s32
   vst2_s32(a, b);
-  // CHECK: st2 {{{ ?v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2_s64(i64* %a, [2 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st2.v1i64.p0i8(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2_s64(int64_t *a, int64x1x2_t b) {
-  // CHECK-LABEL: test_vst2_s64
   vst2_s64(a, b);
-  // CHECK: {{st1|st2}} {{{ ?v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2_f16(half* %a, [2 x <4 x half>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x half>] [[B]].coerce, [2 x <4 x half>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st2.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2_f16(float16_t *a, float16x4x2_t b) {
-  // CHECK-LABEL: test_vst2_f16
   vst2_f16(a, b);
-  // CHECK: st2 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2_f32(float* %a, [2 x <2 x float>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x float>] [[B]].coerce, [2 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
+// CHECK:   call void @llvm.aarch64.neon.st2.v2f32.p0i8(<2 x float> [[TMP7]], <2 x float> [[TMP8]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2_f32(float32_t *a, float32x2x2_t b) {
-  // CHECK-LABEL: test_vst2_f32
   vst2_f32(a, b);
-  // CHECK: st2 {{{ ?v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2_f64(double* %a, [2 x <1 x double>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <1 x double>] [[B]].coerce, [2 x <1 x double>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x1x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
+// CHECK:   call void @llvm.aarch64.neon.st2.v1f64.p0i8(<1 x double> [[TMP7]], <1 x double> [[TMP8]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2_f64(float64_t *a, float64x1x2_t b) {
-  // CHECK-LABEL: test_vst2_f64
   vst2_f64(a, b);
-  // CHECK: {{st1|st2}} {{{ ?v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2_p8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i8* %a)
+// CHECK:   ret void
 void test_vst2_p8(poly8_t *a, poly8x8x2_t b) {
-  // CHECK-LABEL: test_vst2_p8
   vst2_p8(a, b);
-  // CHECK: st2 {{{ ?v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2_p16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st2.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2_p16(poly16_t *a, poly16x4x2_t b) {
-  // CHECK-LABEL: test_vst2_p16
   vst2_p16(a, b);
-  // CHECK: st2 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3q_u8(i8* %a, [3 x <16 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i8* %a)
+// CHECK:   ret void
 void test_vst3q_u8(uint8_t *a, uint8x16x3_t b) {
-  // CHECK-LABEL: test_vst3q_u8
   vst3q_u8(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3q_u16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st3.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3q_u16(uint16_t *a, uint16x8x3_t b) {
-  // CHECK-LABEL: test_vst3q_u16
   vst3q_u16(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3q_u32(i32* %a, [3 x <4 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st3.v4i32.p0i8(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3q_u32(uint32_t *a, uint32x4x3_t b) {
-  // CHECK-LABEL: test_vst3q_u32
   vst3q_u32(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3q_u64(i64* %a, [3 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x2x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x2x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st3.v2i64.p0i8(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3q_u64(uint64_t *a, uint64x2x3_t b) {
-  // CHECK-LABEL: test_vst3q_u64
   vst3q_u64(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3q_s8(i8* %a, [3 x <16 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i8* %a)
+// CHECK:   ret void
 void test_vst3q_s8(int8_t *a, int8x16x3_t b) {
-  // CHECK-LABEL: test_vst3q_s8
   vst3q_s8(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3q_s16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st3.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3q_s16(int16_t *a, int16x8x3_t b) {
-  // CHECK-LABEL: test_vst3q_s16
   vst3q_s16(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3q_s32(i32* %a, [3 x <4 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st3.v4i32.p0i8(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3q_s32(int32_t *a, int32x4x3_t b) {
-  // CHECK-LABEL: test_vst3q_s32
   vst3q_s32(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3q_s64(i64* %a, [3 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int64x2x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x2x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st3.v2i64.p0i8(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3q_s64(int64_t *a, int64x2x3_t b) {
-  // CHECK-LABEL: test_vst3q_s64
   vst3q_s64(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3q_f16(half* %a, [3 x <8 x half>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x half>] [[B]].coerce, [3 x <8 x half>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st3.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3q_f16(float16_t *a, float16x8x3_t b) {
-  // CHECK-LABEL: test_vst3q_f16
   vst3q_f16(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3q_f32(float* %a, [3 x <4 x float>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x float>] [[B]].coerce, [3 x <4 x float>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
+// CHECK:   call void @llvm.aarch64.neon.st3.v4f32.p0i8(<4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x float> [[TMP11]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3q_f32(float32_t *a, float32x4x3_t b) {
-  // CHECK-LABEL: test_vst3q_f32
   vst3q_f32(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3q_f64(double* %a, [3 x <2 x double>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float64x2x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x2x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x double>] [[B]].coerce, [3 x <2 x double>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x double>
+// CHECK:   call void @llvm.aarch64.neon.st3.v2f64.p0i8(<2 x double> [[TMP9]], <2 x double> [[TMP10]], <2 x double> [[TMP11]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3q_f64(float64_t *a, float64x2x3_t b) {
-  // CHECK-LABEL: test_vst3q_f64
   vst3q_f64(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3q_p8(i8* %a, [3 x <16 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st3.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i8* %a)
+// CHECK:   ret void
 void test_vst3q_p8(poly8_t *a, poly8x16x3_t b) {
-  // CHECK-LABEL: test_vst3q_p8
   vst3q_p8(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3q_p16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st3.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3q_p16(poly16_t *a, poly16x8x3_t b) {
-  // CHECK-LABEL: test_vst3q_p16
   vst3q_p16(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3_u8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i8* %a)
+// CHECK:   ret void
 void test_vst3_u8(uint8_t *a, uint8x8x3_t b) {
-  // CHECK-LABEL: test_vst3_u8
   vst3_u8(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3_u16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st3.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3_u16(uint16_t *a, uint16x4x3_t b) {
-  // CHECK-LABEL: test_vst3_u16
   vst3_u16(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3_u32(i32* %a, [3 x <2 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st3.v2i32.p0i8(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3_u32(uint32_t *a, uint32x2x3_t b) {
-  // CHECK-LABEL: test_vst3_u32
   vst3_u32(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3_u64(i64* %a, [3 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st3.v1i64.p0i8(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3_u64(uint64_t *a, uint64x1x3_t b) {
-  // CHECK-LABEL: test_vst3_u64
   vst3_u64(a, b);
-  // CHECK: {{st1|st3}} {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3_s8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i8* %a)
+// CHECK:   ret void
 void test_vst3_s8(int8_t *a, int8x8x3_t b) {
-  // CHECK-LABEL: test_vst3_s8
   vst3_s8(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3_s16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st3.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3_s16(int16_t *a, int16x4x3_t b) {
-  // CHECK-LABEL: test_vst3_s16
   vst3_s16(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3_s32(i32* %a, [3 x <2 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st3.v2i32.p0i8(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3_s32(int32_t *a, int32x2x3_t b) {
-  // CHECK-LABEL: test_vst3_s32
   vst3_s32(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3_s64(i64* %a, [3 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st3.v1i64.p0i8(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3_s64(int64_t *a, int64x1x3_t b) {
-  // CHECK-LABEL: test_vst3_s64
   vst3_s64(a, b);
-  // CHECK: {{st1|st3}} {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3_f16(half* %a, [3 x <4 x half>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x half>] [[B]].coerce, [3 x <4 x half>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st3.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3_f16(float16_t *a, float16x4x3_t b) {
-  // CHECK-LABEL: test_vst3_f16
   vst3_f16(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3_f32(float* %a, [3 x <2 x float>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x float>] [[B]].coerce, [3 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
+// CHECK:   call void @llvm.aarch64.neon.st3.v2f32.p0i8(<2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x float> [[TMP11]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3_f32(float32_t *a, float32x2x3_t b) {
-  // CHECK-LABEL: test_vst3_f32
   vst3_f32(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3_f64(double* %a, [3 x <1 x double>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float64x1x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x1x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <1 x double>] [[B]].coerce, [3 x <1 x double>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x1x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <1 x double> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double>
+// CHECK:   call void @llvm.aarch64.neon.st3.v1f64.p0i8(<1 x double> [[TMP9]], <1 x double> [[TMP10]], <1 x double> [[TMP11]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3_f64(float64_t *a, float64x1x3_t b) {
-  // CHECK-LABEL: test_vst3_f64
   vst3_f64(a, b);
-  // CHECK: {{st1|st3}} {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3_p8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st3.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i8* %a)
+// CHECK:   ret void
 void test_vst3_p8(poly8_t *a, poly8x8x3_t b) {
-  // CHECK-LABEL: test_vst3_p8
   vst3_p8(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3_p16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st3.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3_p16(poly16_t *a, poly16x4x3_t b) {
-  // CHECK-LABEL: test_vst3_p16
   vst3_p16(a, b);
-  // CHECK: st3 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4q_u8(i8* %a, [4 x <16 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i8* %a)
+// CHECK:   ret void
 void test_vst4q_u8(uint8_t *a, uint8x16x4_t b) {
-  // CHECK-LABEL: test_vst4q_u8
   vst4q_u8(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4q_u16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st4.v8i16.p0i8(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4q_u16(uint16_t *a, uint16x8x4_t b) {
-  // CHECK-LABEL: test_vst4q_u16
   vst4q_u16(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4q_u32(i32* %a, [4 x <4 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st4.v4i32.p0i8(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4q_u32(uint32_t *a, uint32x4x4_t b) {
-  // CHECK-LABEL: test_vst4q_u32
   vst4q_u32(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4q_u64(i64* %a, [4 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x2x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x2x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st4.v2i64.p0i8(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4q_u64(uint64_t *a, uint64x2x4_t b) {
-  // CHECK-LABEL: test_vst4q_u64
   vst4q_u64(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4q_s8(i8* %a, [4 x <16 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i8* %a)
+// CHECK:   ret void
 void test_vst4q_s8(int8_t *a, int8x16x4_t b) {
-  // CHECK-LABEL: test_vst4q_s8
   vst4q_s8(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4q_s16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st4.v8i16.p0i8(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4q_s16(int16_t *a, int16x8x4_t b) {
-  // CHECK-LABEL: test_vst4q_s16
   vst4q_s16(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4q_s32(i32* %a, [4 x <4 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st4.v4i32.p0i8(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4q_s32(int32_t *a, int32x4x4_t b) {
-  // CHECK-LABEL: test_vst4q_s32
   vst4q_s32(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4q_s64(i64* %a, [4 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int64x2x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x2x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st4.v2i64.p0i8(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4q_s64(int64_t *a, int64x2x4_t b) {
-  // CHECK-LABEL: test_vst4q_s64
   vst4q_s64(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4q_f16(half* %a, [4 x <8 x half>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x half>] [[B]].coerce, [4 x <8 x half>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st4.v8i16.p0i8(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4q_f16(float16_t *a, float16x8x4_t b) {
-  // CHECK-LABEL: test_vst4q_f16
   vst4q_f16(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4q_f32(float* %a, [4 x <4 x float>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x float>] [[B]].coerce, [4 x <4 x float>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x float>
+// CHECK:   call void @llvm.aarch64.neon.st4.v4f32.p0i8(<4 x float> [[TMP11]], <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4q_f32(float32_t *a, float32x4x4_t b) {
-  // CHECK-LABEL: test_vst4q_f32
   vst4q_f32(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4q_f64(double* %a, [4 x <2 x double>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float64x2x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x2x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x double>] [[B]].coerce, [4 x <2 x double>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x double> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x double>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x double>
+// CHECK:   call void @llvm.aarch64.neon.st4.v2f64.p0i8(<2 x double> [[TMP11]], <2 x double> [[TMP12]], <2 x double> [[TMP13]], <2 x double> [[TMP14]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4q_f64(float64_t *a, float64x2x4_t b) {
-  // CHECK-LABEL: test_vst4q_f64
   vst4q_f64(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4q_p8(i8* %a, [4 x <16 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i8* %a)
+// CHECK:   ret void
 void test_vst4q_p8(poly8_t *a, poly8x16x4_t b) {
-  // CHECK-LABEL: test_vst4q_p8
   vst4q_p8(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4q_p16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st4.v8i16.p0i8(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4q_p16(poly16_t *a, poly16x8x4_t b) {
-  // CHECK-LABEL: test_vst4q_p16
   vst4q_p16(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4_u8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st4.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i8* %a)
+// CHECK:   ret void
 void test_vst4_u8(uint8_t *a, uint8x8x4_t b) {
-  // CHECK-LABEL: test_vst4_u8
   vst4_u8(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4_u16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st4.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4_u16(uint16_t *a, uint16x4x4_t b) {
-  // CHECK-LABEL: test_vst4_u16
   vst4_u16(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4_u32(i32* %a, [4 x <2 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st4.v2i32.p0i8(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4_u32(uint32_t *a, uint32x2x4_t b) {
-  // CHECK-LABEL: test_vst4_u32
   vst4_u32(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4_u64(i64* %a, [4 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st4.v1i64.p0i8(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4_u64(uint64_t *a, uint64x1x4_t b) {
-  // CHECK-LABEL: test_vst4_u64
   vst4_u64(a, b);
-  // CHECK: {{st1|st4}} {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4_s8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st4.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i8* %a)
+// CHECK:   ret void
 void test_vst4_s8(int8_t *a, int8x8x4_t b) {
-  // CHECK-LABEL: test_vst4_s8
   vst4_s8(a, b);
-// CHECK: st4 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4_s16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st4.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4_s16(int16_t *a, int16x4x4_t b) {
-  // CHECK-LABEL: test_vst4_s16
   vst4_s16(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4_s32(i32* %a, [4 x <2 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st4.v2i32.p0i8(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4_s32(int32_t *a, int32x2x4_t b) {
-  // CHECK-LABEL: test_vst4_s32
   vst4_s32(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4_s64(i64* %a, [4 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st4.v1i64.p0i8(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4_s64(int64_t *a, int64x1x4_t b) {
-  // CHECK-LABEL: test_vst4_s64
   vst4_s64(a, b);
-  // CHECK: {{st1|st4}} {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4_f16(half* %a, [4 x <4 x half>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x half>] [[B]].coerce, [4 x <4 x half>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st4.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4_f16(float16_t *a, float16x4x4_t b) {
-  // CHECK-LABEL: test_vst4_f16
   vst4_f16(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4_f32(float* %a, [4 x <2 x float>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x float>] [[B]].coerce, [4 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x float> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float>
+// CHECK:   call void @llvm.aarch64.neon.st4.v2f32.p0i8(<2 x float> [[TMP11]], <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4_f32(float32_t *a, float32x2x4_t b) {
-  // CHECK-LABEL: test_vst4_f32
   vst4_f32(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4_f64(double* %a, [4 x <1 x double>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float64x1x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x1x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <1 x double>] [[B]].coerce, [4 x <1 x double>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x1x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <1 x double> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <1 x double> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x double>
+// CHECK:   call void @llvm.aarch64.neon.st4.v1f64.p0i8(<1 x double> [[TMP11]], <1 x double> [[TMP12]], <1 x double> [[TMP13]], <1 x double> [[TMP14]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4_f64(float64_t *a, float64x1x4_t b) {
-  // CHECK-LABEL: test_vst4_f64
   vst4_f64(a, b);
-  // CHECK: {{st1|st4}} {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4_p8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st4.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i8* %a)
+// CHECK:   ret void
 void test_vst4_p8(poly8_t *a, poly8x8x4_t b) {
-  // CHECK-LABEL: test_vst4_p8
   vst4_p8(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4_p16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st4.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4_p16(poly16_t *a, poly16x4x4_t b) {
-  // CHECK-LABEL: test_vst4_p16
   vst4_p16(a, b);
-  // CHECK: st4 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint8x16x2_t @test_vld1q_u8_x2(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
+// CHECK:   [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x2.v16i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint8x16x2_t [[TMP4]]
 uint8x16x2_t test_vld1q_u8_x2(uint8_t const *a) {
-  // CHECK-LABEL: test_vld1q_u8_x2
   return vld1q_u8_x2(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint16x8x2_t @test_vld1q_u16_x2(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x2.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint16x8x2_t [[TMP6]]
 uint16x8x2_t test_vld1q_u16_x2(uint16_t const *a) {
-  // CHECK-LABEL: test_vld1q_u16_x2
   return vld1q_u16_x2(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint32x4x2_t @test_vld1q_u32_x2(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD1XN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x2.v4i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint32x4x2_t [[TMP6]]
 uint32x4x2_t test_vld1q_u32_x2(uint32_t const *a) {
-  // CHECK-LABEL: test_vld1q_u32_x2
   return vld1q_u32_x2(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint64x2x2_t @test_vld1q_u64_x2(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x2x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x2x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x2.v2i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint64x2x2_t [[TMP6]]
 uint64x2x2_t test_vld1q_u64_x2(uint64_t const *a) {
-  // CHECK-LABEL: test_vld1q_u64_x2
   return vld1q_u64_x2(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int8x16x2_t @test_vld1q_s8_x2(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
+// CHECK:   [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x2.v16i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int8x16x2_t [[TMP4]]
 int8x16x2_t test_vld1q_s8_x2(int8_t const *a) {
-  // CHECK-LABEL: test_vld1q_s8_x2
   return vld1q_s8_x2(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int16x8x2_t @test_vld1q_s16_x2(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x2.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int16x8x2_t [[TMP6]]
 int16x8x2_t test_vld1q_s16_x2(int16_t const *a) {
-  // CHECK-LABEL: test_vld1q_s16_x2
   return vld1q_s16_x2(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int32x4x2_t @test_vld1q_s32_x2(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD1XN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x2.v4i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int32x4x2_t [[TMP6]]
 int32x4x2_t test_vld1q_s32_x2(int32_t const *a) {
-  // CHECK-LABEL: test_vld1q_s32_x2
   return vld1q_s32_x2(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int64x2x2_t @test_vld1q_s64_x2(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x2x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x2x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x2.v2i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int64x2x2_t, %struct.int64x2x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int64x2x2_t [[TMP6]]
 int64x2x2_t test_vld1q_s64_x2(int64_t const *a) {
-  // CHECK-LABEL: test_vld1q_s64_x2
   return vld1q_s64_x2(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float16x8x2_t @test_vld1q_f16_x2(half* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x2.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float16x8x2_t, %struct.float16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float16x8x2_t [[TMP6]]
 float16x8x2_t test_vld1q_f16_x2(float16_t const *a) {
-  // CHECK-LABEL: test_vld1q_f16_x2
   return vld1q_f16_x2(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float32x4x2_t @test_vld1q_f32_x2(float* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
+// CHECK:   [[VLD1XN:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x2.v4f32.p0f32(float* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float> }*
+// CHECK:   store { <4 x float>, <4 x float> } [[VLD1XN]], { <4 x float>, <4 x float> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float32x4x2_t [[TMP6]]
 float32x4x2_t test_vld1q_f32_x2(float32_t const *a) {
-  // CHECK-LABEL: test_vld1q_f32_x2
   return vld1q_f32_x2(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float64x2x2_t @test_vld1q_f64_x2(double* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x2x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x2x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double*
+// CHECK:   [[VLD1XN:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x2.v2f64.p0f64(double* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x double>, <2 x double> }*
+// CHECK:   store { <2 x double>, <2 x double> } [[VLD1XN]], { <2 x double>, <2 x double> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float64x2x2_t, %struct.float64x2x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float64x2x2_t [[TMP6]]
 float64x2x2_t test_vld1q_f64_x2(float64_t const *a) {
-  // CHECK-LABEL: test_vld1q_f64_x2
   return vld1q_f64_x2(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly8x16x2_t @test_vld1q_p8_x2(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
+// CHECK:   [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x2.v16i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly8x16x2_t [[TMP4]]
 poly8x16x2_t test_vld1q_p8_x2(poly8_t const *a) {
-  // CHECK-LABEL: test_vld1q_p8_x2
   return vld1q_p8_x2(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly16x8x2_t @test_vld1q_p16_x2(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x2.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly16x8x2_t [[TMP6]]
 poly16x8x2_t test_vld1q_p16_x2(poly16_t const *a) {
-  // CHECK-LABEL: test_vld1q_p16_x2
   return vld1q_p16_x2(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly64x2x2_t @test_vld1q_p64_x2(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x2x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x2x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x2.v2i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly64x2x2_t [[TMP6]]
 poly64x2x2_t test_vld1q_p64_x2(poly64_t const *a) {
-  // CHECK-LABEL: test_vld1q_p64_x2
   return vld1q_p64_x2(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint8x8x2_t @test_vld1_u8_x2(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
+// CHECK:   [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x2.v8i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint8x8x2_t [[TMP4]]
 uint8x8x2_t test_vld1_u8_x2(uint8_t const *a) {
-  // CHECK-LABEL: test_vld1_u8_x2
   return vld1_u8_x2(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint16x4x2_t @test_vld1_u16_x2(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x2.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint16x4x2_t [[TMP6]]
 uint16x4x2_t test_vld1_u16_x2(uint16_t const *a) {
-  // CHECK-LABEL: test_vld1_u16_x2
   return vld1_u16_x2(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint32x2x2_t @test_vld1_u32_x2(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD1XN:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x2.v2i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint32x2x2_t [[TMP6]]
 uint32x2x2_t test_vld1_u32_x2(uint32_t const *a) {
-  // CHECK-LABEL: test_vld1_u32_x2
   return vld1_u32_x2(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint64x1x2_t @test_vld1_u64_x2(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x2.v1i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint64x1x2_t [[TMP6]]
 uint64x1x2_t test_vld1_u64_x2(uint64_t const *a) {
-  // CHECK-LABEL: test_vld1_u64_x2
   return vld1_u64_x2(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int8x8x2_t @test_vld1_s8_x2(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
+// CHECK:   [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x2.v8i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int8x8x2_t [[TMP4]]
 int8x8x2_t test_vld1_s8_x2(int8_t const *a) {
-  // CHECK-LABEL: test_vld1_s8_x2
   return vld1_s8_x2(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int16x4x2_t @test_vld1_s16_x2(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x2.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int16x4x2_t [[TMP6]]
 int16x4x2_t test_vld1_s16_x2(int16_t const *a) {
-  // CHECK-LABEL: test_vld1_s16_x2
   return vld1_s16_x2(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int32x2x2_t @test_vld1_s32_x2(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD1XN:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x2.v2i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int32x2x2_t [[TMP6]]
 int32x2x2_t test_vld1_s32_x2(int32_t const *a) {
-  // CHECK-LABEL: test_vld1_s32_x2
   return vld1_s32_x2(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int64x1x2_t @test_vld1_s64_x2(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x2.v1i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int64x1x2_t, %struct.int64x1x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int64x1x2_t [[TMP6]]
 int64x1x2_t test_vld1_s64_x2(int64_t const *a) {
-  // CHECK-LABEL: test_vld1_s64_x2
   return vld1_s64_x2(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float16x4x2_t @test_vld1_f16_x2(half* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x2.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float16x4x2_t, %struct.float16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float16x4x2_t [[TMP6]]
 float16x4x2_t test_vld1_f16_x2(float16_t const *a) {
-  // CHECK-LABEL: test_vld1_f16_x2
   return vld1_f16_x2(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float32x2x2_t @test_vld1_f32_x2(float* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
+// CHECK:   [[VLD1XN:%.*]] = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x2.v2f32.p0f32(float* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float> }*
+// CHECK:   store { <2 x float>, <2 x float> } [[VLD1XN]], { <2 x float>, <2 x float> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float32x2x2_t [[TMP6]]
 float32x2x2_t test_vld1_f32_x2(float32_t const *a) {
-  // CHECK-LABEL: test_vld1_f32_x2
   return vld1_f32_x2(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float64x1x2_t @test_vld1_f64_x2(double* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double*
+// CHECK:   [[VLD1XN:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x2.v1f64.p0f64(double* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x double>, <1 x double> }*
+// CHECK:   store { <1 x double>, <1 x double> } [[VLD1XN]], { <1 x double>, <1 x double> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x1x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float64x1x2_t, %struct.float64x1x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float64x1x2_t [[TMP6]]
 float64x1x2_t test_vld1_f64_x2(float64_t const *a) {
-  // CHECK-LABEL: test_vld1_f64_x2
   return vld1_f64_x2(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly8x8x2_t @test_vld1_p8_x2(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
+// CHECK:   [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x2.v8i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly8x8x2_t [[TMP4]]
 poly8x8x2_t test_vld1_p8_x2(poly8_t const *a) {
-  // CHECK-LABEL: test_vld1_p8_x2
   return vld1_p8_x2(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly16x4x2_t @test_vld1_p16_x2(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x2.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly16x4x2_t [[TMP6]]
 poly16x4x2_t test_vld1_p16_x2(poly16_t const *a) {
-  // CHECK-LABEL: test_vld1_p16_x2
   return vld1_p16_x2(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly64x1x2_t @test_vld1_p64_x2(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x1x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x1x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x2.v1i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x1x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly64x1x2_t [[TMP6]]
 poly64x1x2_t test_vld1_p64_x2(poly64_t const *a) {
-  // CHECK-LABEL: test_vld1_p64_x2
   return vld1_p64_x2(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint8x16x3_t @test_vld1q_u8_x3(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
+// CHECK:   [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x3.v16i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint8x16x3_t [[TMP4]]
 uint8x16x3_t test_vld1q_u8_x3(uint8_t const *a) {
-  // CHECK-LABEL: test_vld1q_u8_x3
   return vld1q_u8_x3(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint16x8x3_t @test_vld1q_u16_x3(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x3.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint16x8x3_t [[TMP6]]
 uint16x8x3_t test_vld1q_u16_x3(uint16_t const *a) {
-  // CHECK-LABEL: test_vld1q_u16_x3
   return vld1q_u16_x3(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint32x4x3_t @test_vld1q_u32_x3(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD1XN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x3.v4i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint32x4x3_t [[TMP6]]
 uint32x4x3_t test_vld1q_u32_x3(uint32_t const *a) {
-  // CHECK-LABEL: test_vld1q_u32_x3
   return vld1q_u32_x3(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint64x2x3_t @test_vld1q_u64_x3(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x2x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x2x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x3.v2i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint64x2x3_t [[TMP6]]
 uint64x2x3_t test_vld1q_u64_x3(uint64_t const *a) {
-  // CHECK-LABEL: test_vld1q_u64_x3
   return vld1q_u64_x3(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int8x16x3_t @test_vld1q_s8_x3(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
+// CHECK:   [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x3.v16i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.int8x16x3_t, %struct.int8x16x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int8x16x3_t [[TMP4]]
 int8x16x3_t test_vld1q_s8_x3(int8_t const *a) {
-  // CHECK-LABEL: test_vld1q_s8_x3
   return vld1q_s8_x3(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int16x8x3_t @test_vld1q_s16_x3(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x3.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int16x8x3_t, %struct.int16x8x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int16x8x3_t [[TMP6]]
 int16x8x3_t test_vld1q_s16_x3(int16_t const *a) {
-  // CHECK-LABEL: test_vld1q_s16_x3
   return vld1q_s16_x3(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int32x4x3_t @test_vld1q_s32_x3(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD1XN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x3.v4i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int32x4x3_t, %struct.int32x4x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int32x4x3_t [[TMP6]]
 int32x4x3_t test_vld1q_s32_x3(int32_t const *a) {
-  // CHECK-LABEL: test_vld1q_s32_x3
   return vld1q_s32_x3(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int64x2x3_t @test_vld1q_s64_x3(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x2x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x2x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x3.v2i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int64x2x3_t, %struct.int64x2x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int64x2x3_t [[TMP6]]
 int64x2x3_t test_vld1q_s64_x3(int64_t const *a) {
-  // CHECK-LABEL: test_vld1q_s64_x3
   return vld1q_s64_x3(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float16x8x3_t @test_vld1q_f16_x3(half* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x3.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float16x8x3_t, %struct.float16x8x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float16x8x3_t [[TMP6]]
 float16x8x3_t test_vld1q_f16_x3(float16_t const *a) {
-  // CHECK-LABEL: test_vld1q_f16_x3
   return vld1q_f16_x3(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float32x4x3_t @test_vld1q_f32_x3(float* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
+// CHECK:   [[VLD1XN:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x3.v4f32.p0f32(float* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float> }*
+// CHECK:   store { <4 x float>, <4 x float>, <4 x float> } [[VLD1XN]], { <4 x float>, <4 x float>, <4 x float> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float32x4x3_t, %struct.float32x4x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float32x4x3_t [[TMP6]]
 float32x4x3_t test_vld1q_f32_x3(float32_t const *a) {
-  // CHECK-LABEL: test_vld1q_f32_x3
   return vld1q_f32_x3(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float64x2x3_t @test_vld1q_f64_x3(double* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x2x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x2x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double*
+// CHECK:   [[VLD1XN:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x3.v2f64.p0f64(double* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x double>, <2 x double>, <2 x double> }*
+// CHECK:   store { <2 x double>, <2 x double>, <2 x double> } [[VLD1XN]], { <2 x double>, <2 x double>, <2 x double> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float64x2x3_t, %struct.float64x2x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float64x2x3_t [[TMP6]]
 float64x2x3_t test_vld1q_f64_x3(float64_t const *a) {
-  // CHECK-LABEL: test_vld1q_f64_x3
   return vld1q_f64_x3(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly8x16x3_t @test_vld1q_p8_x3(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
+// CHECK:   [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x3.v16i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly8x16x3_t [[TMP4]]
 poly8x16x3_t test_vld1q_p8_x3(poly8_t const *a) {
-  // CHECK-LABEL: test_vld1q_p8_x3
   return vld1q_p8_x3(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly16x8x3_t @test_vld1q_p16_x3(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x3.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly16x8x3_t [[TMP6]]
 poly16x8x3_t test_vld1q_p16_x3(poly16_t const *a) {
-  // CHECK-LABEL: test_vld1q_p16_x3
   return vld1q_p16_x3(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly64x2x3_t @test_vld1q_p64_x3(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x2x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x2x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x3.v2i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly64x2x3_t [[TMP6]]
 poly64x2x3_t test_vld1q_p64_x3(poly64_t const *a) {
-  // CHECK-LABEL: test_vld1q_p64_x3
   return vld1q_p64_x3(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint8x8x3_t @test_vld1_u8_x3(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
+// CHECK:   [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x3.v8i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint8x8x3_t [[TMP4]]
 uint8x8x3_t test_vld1_u8_x3(uint8_t const *a) {
-  // CHECK-LABEL: test_vld1_u8_x3
   return vld1_u8_x3(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint16x4x3_t @test_vld1_u16_x3(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x3.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint16x4x3_t [[TMP6]]
 uint16x4x3_t test_vld1_u16_x3(uint16_t const *a) {
-  // CHECK-LABEL: test_vld1_u16_x3
   return vld1_u16_x3(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint32x2x3_t @test_vld1_u32_x3(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD1XN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x3.v2i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint32x2x3_t [[TMP6]]
 uint32x2x3_t test_vld1_u32_x3(uint32_t const *a) {
-  // CHECK-LABEL: test_vld1_u32_x3
   return vld1_u32_x3(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint64x1x3_t @test_vld1_u64_x3(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x3.v1i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint64x1x3_t [[TMP6]]
 uint64x1x3_t test_vld1_u64_x3(uint64_t const *a) {
-  // CHECK-LABEL: test_vld1_u64_x3
   return vld1_u64_x3(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int8x8x3_t @test_vld1_s8_x3(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
+// CHECK:   [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x3.v8i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.int8x8x3_t, %struct.int8x8x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int8x8x3_t [[TMP4]]
 int8x8x3_t test_vld1_s8_x3(int8_t const *a) {
-  // CHECK-LABEL: test_vld1_s8_x3
   return vld1_s8_x3(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int16x4x3_t @test_vld1_s16_x3(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x3.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int16x4x3_t, %struct.int16x4x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int16x4x3_t [[TMP6]]
 int16x4x3_t test_vld1_s16_x3(int16_t const *a) {
-  // CHECK-LABEL: test_vld1_s16_x3
   return vld1_s16_x3(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int32x2x3_t @test_vld1_s32_x3(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD1XN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x3.v2i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int32x2x3_t, %struct.int32x2x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int32x2x3_t [[TMP6]]
 int32x2x3_t test_vld1_s32_x3(int32_t const *a) {
-  // CHECK-LABEL: test_vld1_s32_x3
   return vld1_s32_x3(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int64x1x3_t @test_vld1_s64_x3(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x3.v1i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int64x1x3_t, %struct.int64x1x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int64x1x3_t [[TMP6]]
 int64x1x3_t test_vld1_s64_x3(int64_t const *a) {
-  // CHECK-LABEL: test_vld1_s64_x3
   return vld1_s64_x3(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float16x4x3_t @test_vld1_f16_x3(half* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x3.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float16x4x3_t, %struct.float16x4x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float16x4x3_t [[TMP6]]
 float16x4x3_t test_vld1_f16_x3(float16_t const *a) {
-  // CHECK-LABEL: test_vld1_f16_x3
   return vld1_f16_x3(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float32x2x3_t @test_vld1_f32_x3(float* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
+// CHECK:   [[VLD1XN:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x3.v2f32.p0f32(float* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float> }*
+// CHECK:   store { <2 x float>, <2 x float>, <2 x float> } [[VLD1XN]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float32x2x3_t, %struct.float32x2x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float32x2x3_t [[TMP6]]
 float32x2x3_t test_vld1_f32_x3(float32_t const *a) {
-  // CHECK-LABEL: test_vld1_f32_x3
   return vld1_f32_x3(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float64x1x3_t @test_vld1_f64_x3(double* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x1x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x1x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double*
+// CHECK:   [[VLD1XN:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x3.v1f64.p0f64(double* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x double>, <1 x double>, <1 x double> }*
+// CHECK:   store { <1 x double>, <1 x double>, <1 x double> } [[VLD1XN]], { <1 x double>, <1 x double>, <1 x double> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x1x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float64x1x3_t, %struct.float64x1x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float64x1x3_t [[TMP6]]
 float64x1x3_t test_vld1_f64_x3(float64_t const *a) {
-  // CHECK-LABEL: test_vld1_f64_x3
   return vld1_f64_x3(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly8x8x3_t @test_vld1_p8_x3(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
+// CHECK:   [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x3.v8i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly8x8x3_t [[TMP4]]
 poly8x8x3_t test_vld1_p8_x3(poly8_t const *a) {
-  // CHECK-LABEL: test_vld1_p8_x3
   return vld1_p8_x3(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly16x4x3_t @test_vld1_p16_x3(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x3.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly16x4x3_t [[TMP6]]
 poly16x4x3_t test_vld1_p16_x3(poly16_t const *a) {
-  // CHECK-LABEL: test_vld1_p16_x3
   return vld1_p16_x3(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly64x1x3_t @test_vld1_p64_x3(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x1x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x1x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x3.v1i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x1x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly64x1x3_t [[TMP6]]
 poly64x1x3_t test_vld1_p64_x3(poly64_t const *a) {
-  // CHECK-LABEL: test_vld1_p64_x3
   return vld1_p64_x3(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint8x16x4_t @test_vld1q_u8_x4(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
+// CHECK:   [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x4.v16i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint8x16x4_t [[TMP4]]
 uint8x16x4_t test_vld1q_u8_x4(uint8_t const *a) {
-  // CHECK-LABEL: test_vld1q_u8_x4
   return vld1q_u8_x4(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint16x8x4_t @test_vld1q_u16_x4(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x4.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint16x8x4_t [[TMP6]]
 uint16x8x4_t test_vld1q_u16_x4(uint16_t const *a) {
-  // CHECK-LABEL: test_vld1q_u16_x4
   return vld1q_u16_x4(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint32x4x4_t @test_vld1q_u32_x4(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD1XN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x4.v4i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint32x4x4_t [[TMP6]]
 uint32x4x4_t test_vld1q_u32_x4(uint32_t const *a) {
-  // CHECK-LABEL: test_vld1q_u32_x4
   return vld1q_u32_x4(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint64x2x4_t @test_vld1q_u64_x4(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x2x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x2x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x4.v2i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint64x2x4_t [[TMP6]]
 uint64x2x4_t test_vld1q_u64_x4(uint64_t const *a) {
-  // CHECK-LABEL: test_vld1q_u64_x4
   return vld1q_u64_x4(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int8x16x4_t @test_vld1q_s8_x4(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
+// CHECK:   [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x4.v16i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.int8x16x4_t, %struct.int8x16x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int8x16x4_t [[TMP4]]
 int8x16x4_t test_vld1q_s8_x4(int8_t const *a) {
-  // CHECK-LABEL: test_vld1q_s8_x4
   return vld1q_s8_x4(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int16x8x4_t @test_vld1q_s16_x4(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x4.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int16x8x4_t, %struct.int16x8x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int16x8x4_t [[TMP6]]
 int16x8x4_t test_vld1q_s16_x4(int16_t const *a) {
-  // CHECK-LABEL: test_vld1q_s16_x4
   return vld1q_s16_x4(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int32x4x4_t @test_vld1q_s32_x4(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD1XN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld1x4.v4i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD1XN]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int32x4x4_t, %struct.int32x4x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int32x4x4_t [[TMP6]]
 int32x4x4_t test_vld1q_s32_x4(int32_t const *a) {
-  // CHECK-LABEL: test_vld1q_s32_x4
   return vld1q_s32_x4(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int64x2x4_t @test_vld1q_s64_x4(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x2x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x2x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x4.v2i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int64x2x4_t, %struct.int64x2x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int64x2x4_t [[TMP6]]
 int64x2x4_t test_vld1q_s64_x4(int64_t const *a) {
-  // CHECK-LABEL: test_vld1q_s64_x4
   return vld1q_s64_x4(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float16x8x4_t @test_vld1q_f16_x4(half* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x4.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float16x8x4_t, %struct.float16x8x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float16x8x4_t [[TMP6]]
 float16x8x4_t test_vld1q_f16_x4(float16_t const *a) {
-  // CHECK-LABEL: test_vld1q_f16_x4
   return vld1q_f16_x4(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float32x4x4_t @test_vld1q_f32_x4(float* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
+// CHECK:   [[VLD1XN:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld1x4.v4f32.p0f32(float* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float>, <4 x float> }*
+// CHECK:   store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD1XN]], { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float32x4x4_t, %struct.float32x4x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float32x4x4_t [[TMP6]]
 float32x4x4_t test_vld1q_f32_x4(float32_t const *a) {
-  // CHECK-LABEL: test_vld1q_f32_x4
   return vld1q_f32_x4(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float64x2x4_t @test_vld1q_f64_x4(double* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x2x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x2x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double*
+// CHECK:   [[VLD1XN:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld1x4.v2f64.p0f64(double* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x double>, <2 x double>, <2 x double>, <2 x double> }*
+// CHECK:   store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD1XN]], { <2 x double>, <2 x double>, <2 x double>, <2 x double> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float64x2x4_t, %struct.float64x2x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float64x2x4_t [[TMP6]]
 float64x2x4_t test_vld1q_f64_x4(float64_t const *a) {
-  // CHECK-LABEL: test_vld1q_f64_x4
   return vld1q_f64_x4(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly8x16x4_t @test_vld1q_p8_x4(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
+// CHECK:   [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld1x4.v16i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD1XN]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly8x16x4_t [[TMP4]]
 poly8x16x4_t test_vld1q_p8_x4(poly8_t const *a) {
-  // CHECK-LABEL: test_vld1q_p8_x4
   return vld1q_p8_x4(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly16x8x4_t @test_vld1q_p16_x4(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD1XN:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld1x4.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD1XN]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly16x8x4_t [[TMP6]]
 poly16x8x4_t test_vld1q_p16_x4(poly16_t const *a) {
-  // CHECK-LABEL: test_vld1q_p16_x4
   return vld1q_p16_x4(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly64x2x4_t @test_vld1q_p64_x4(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x2x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x2x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD1XN:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld1x4.v2i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD1XN]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly64x2x4_t [[TMP6]]
 poly64x2x4_t test_vld1q_p64_x4(poly64_t const *a) {
-  // CHECK-LABEL: test_vld1q_p64_x4
   return vld1q_p64_x4(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint8x8x4_t @test_vld1_u8_x4(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
+// CHECK:   [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x4.v8i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint8x8x4_t [[TMP4]]
 uint8x8x4_t test_vld1_u8_x4(uint8_t const *a) {
-  // CHECK-LABEL: test_vld1_u8_x4
   return vld1_u8_x4(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint16x4x4_t @test_vld1_u16_x4(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x4.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint16x4x4_t [[TMP6]]
 uint16x4x4_t test_vld1_u16_x4(uint16_t const *a) {
-  // CHECK-LABEL: test_vld1_u16_x4
   return vld1_u16_x4(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint32x2x4_t @test_vld1_u32_x4(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD1XN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x4.v2i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint32x2x4_t [[TMP6]]
 uint32x2x4_t test_vld1_u32_x4(uint32_t const *a) {
-  // CHECK-LABEL: test_vld1_u32_x4
   return vld1_u32_x4(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint64x1x4_t @test_vld1_u64_x4(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x4.v1i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint64x1x4_t [[TMP6]]
 uint64x1x4_t test_vld1_u64_x4(uint64_t const *a) {
-  // CHECK-LABEL: test_vld1_u64_x4
   return vld1_u64_x4(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int8x8x4_t @test_vld1_s8_x4(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
+// CHECK:   [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x4.v8i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.int8x8x4_t, %struct.int8x8x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int8x8x4_t [[TMP4]]
 int8x8x4_t test_vld1_s8_x4(int8_t const *a) {
-  // CHECK-LABEL: test_vld1_s8_x4
   return vld1_s8_x4(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int16x4x4_t @test_vld1_s16_x4(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x4.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int16x4x4_t, %struct.int16x4x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int16x4x4_t [[TMP6]]
 int16x4x4_t test_vld1_s16_x4(int16_t const *a) {
-  // CHECK-LABEL: test_vld1_s16_x4
   return vld1_s16_x4(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int32x2x4_t @test_vld1_s32_x4(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD1XN:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld1x4.v2i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD1XN]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int32x2x4_t, %struct.int32x2x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int32x2x4_t [[TMP6]]
 int32x2x4_t test_vld1_s32_x4(int32_t const *a) {
-  // CHECK-LABEL: test_vld1_s32_x4
   return vld1_s32_x4(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int64x1x4_t @test_vld1_s64_x4(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x4.v1i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int64x1x4_t, %struct.int64x1x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int64x1x4_t [[TMP6]]
 int64x1x4_t test_vld1_s64_x4(int64_t const *a) {
-  // CHECK-LABEL: test_vld1_s64_x4
   return vld1_s64_x4(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float16x4x4_t @test_vld1_f16_x4(half* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x4.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float16x4x4_t, %struct.float16x4x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float16x4x4_t [[TMP6]]
 float16x4x4_t test_vld1_f16_x4(float16_t const *a) {
-  // CHECK-LABEL: test_vld1_f16_x4
   return vld1_f16_x4(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float32x2x4_t @test_vld1_f32_x4(float* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
+// CHECK:   [[VLD1XN:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld1x4.v2f32.p0f32(float* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float>, <2 x float> }*
+// CHECK:   store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD1XN]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float32x2x4_t, %struct.float32x2x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float32x2x4_t [[TMP6]]
 float32x2x4_t test_vld1_f32_x4(float32_t const *a) {
-  // CHECK-LABEL: test_vld1_f32_x4
   return vld1_f32_x4(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float64x1x4_t @test_vld1_f64_x4(double* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x1x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x1x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double*
+// CHECK:   [[VLD1XN:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld1x4.v1f64.p0f64(double* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x double>, <1 x double>, <1 x double>, <1 x double> }*
+// CHECK:   store { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD1XN]], { <1 x double>, <1 x double>, <1 x double>, <1 x double> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x1x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float64x1x4_t, %struct.float64x1x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float64x1x4_t [[TMP6]]
 float64x1x4_t test_vld1_f64_x4(float64_t const *a) {
-  // CHECK-LABEL: test_vld1_f64_x4
   return vld1_f64_x4(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly8x8x4_t @test_vld1_p8_x4(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
+// CHECK:   [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld1x4.v8i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD1XN]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly8x8x4_t [[TMP4]]
 poly8x8x4_t test_vld1_p8_x4(poly8_t const *a) {
-  // CHECK-LABEL: test_vld1_p8_x4
   return vld1_p8_x4(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly16x4x4_t @test_vld1_p16_x4(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD1XN:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld1x4.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD1XN]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly16x4x4_t [[TMP6]]
 poly16x4x4_t test_vld1_p16_x4(poly16_t const *a) {
-  // CHECK-LABEL: test_vld1_p16_x4
   return vld1_p16_x4(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly64x1x4_t @test_vld1_p64_x4(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x1x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x1x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD1XN:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld1x4.v1i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD1XN]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x1x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly64x1x4_t [[TMP6]]
 poly64x1x4_t test_vld1_p64_x4(poly64_t const *a) {
-  // CHECK-LABEL: test_vld1_p64_x4
   return vld1_p64_x4(a);
-  // CHECK: ld1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_u8_x2(i8* %a, [2 x <16 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st1x2.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i8* %a)
+// CHECK:   ret void
 void test_vst1q_u8_x2(uint8_t *a, uint8x16x2_t b) {
-  // CHECK-LABEL: test_vst1q_u8_x2
   vst1q_u8_x2(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_u16_x2(i16* %a, [2 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK:   call void @llvm.aarch64.neon.st1x2.v8i16.p0i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i16* [[TMP9]])
+// CHECK:   ret void
 void test_vst1q_u16_x2(uint16_t *a, uint16x8x2_t b) {
-  // CHECK-LABEL: test_vst1q_u16_x2
   vst1q_u16_x2(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_u32_x2(i32* %a, [2 x <4 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i32*
+// CHECK:   call void @llvm.aarch64.neon.st1x2.v4i32.p0i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i32* [[TMP9]])
+// CHECK:   ret void
 void test_vst1q_u32_x2(uint32_t *a, uint32x4x2_t b) {
-  // CHECK-LABEL: test_vst1q_u32_x2
   vst1q_u32_x2(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_u64_x2(i64* %a, [2 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x2x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x2x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i64*
+// CHECK:   call void @llvm.aarch64.neon.st1x2.v2i64.p0i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64* [[TMP9]])
+// CHECK:   ret void
 void test_vst1q_u64_x2(uint64_t *a, uint64x2x2_t b) {
-  // CHECK-LABEL: test_vst1q_u64_x2
   vst1q_u64_x2(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_s8_x2(i8* %a, [2 x <16 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st1x2.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i8* %a)
+// CHECK:   ret void
 void test_vst1q_s8_x2(int8_t *a, int8x16x2_t b) {
-  // CHECK-LABEL: test_vst1q_s8_x2
   vst1q_s8_x2(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_s16_x2(i16* %a, [2 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK:   call void @llvm.aarch64.neon.st1x2.v8i16.p0i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i16* [[TMP9]])
+// CHECK:   ret void
 void test_vst1q_s16_x2(int16_t *a, int16x8x2_t b) {
-  // CHECK-LABEL: test_vst1q_s16_x2
   vst1q_s16_x2(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_s32_x2(i32* %a, [2 x <4 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i32*
+// CHECK:   call void @llvm.aarch64.neon.st1x2.v4i32.p0i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i32* [[TMP9]])
+// CHECK:   ret void
 void test_vst1q_s32_x2(int32_t *a, int32x4x2_t b) {
-  // CHECK-LABEL: test_vst1q_s32_x2
   vst1q_s32_x2(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_s64_x2(i64* %a, [2 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int64x2x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x2x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i64*
+// CHECK:   call void @llvm.aarch64.neon.st1x2.v2i64.p0i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64* [[TMP9]])
+// CHECK:   ret void
 void test_vst1q_s64_x2(int64_t *a, int64x2x2_t b) {
-  // CHECK-LABEL: test_vst1q_s64_x2
   vst1q_s64_x2(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_f16_x2(half* %a, [2 x <8 x half>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x half>] [[B]].coerce, [2 x <8 x half>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK:   call void @llvm.aarch64.neon.st1x2.v8i16.p0i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i16* [[TMP9]])
+// CHECK:   ret void
 void test_vst1q_f16_x2(float16_t *a, float16x8x2_t b) {
-  // CHECK-LABEL: test_vst1q_f16_x2
   vst1q_f16_x2(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_f32_x2(float* %a, [2 x <4 x float>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x float>] [[B]].coerce, [2 x <4 x float>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
+// CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to float*
+// CHECK:   call void @llvm.aarch64.neon.st1x2.v4f32.p0f32(<4 x float> [[TMP7]], <4 x float> [[TMP8]], float* [[TMP9]])
+// CHECK:   ret void
 void test_vst1q_f32_x2(float32_t *a, float32x4x2_t b) {
-  // CHECK-LABEL: test_vst1q_f32_x2
   vst1q_f32_x2(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_f64_x2(double* %a, [2 x <2 x double>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float64x2x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x2x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x double>] [[B]].coerce, [2 x <2 x double>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
+// CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to double*
+// CHECK:   call void @llvm.aarch64.neon.st1x2.v2f64.p0f64(<2 x double> [[TMP7]], <2 x double> [[TMP8]], double* [[TMP9]])
+// CHECK:   ret void
 void test_vst1q_f64_x2(float64_t *a, float64x2x2_t b) {
-  // CHECK-LABEL: test_vst1q_f64_x2
   vst1q_f64_x2(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_p8_x2(i8* %a, [2 x <16 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st1x2.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i8* %a)
+// CHECK:   ret void
 void test_vst1q_p8_x2(poly8_t *a, poly8x16x2_t b) {
-  // CHECK-LABEL: test_vst1q_p8_x2
   vst1q_p8_x2(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_p16_x2(i16* %a, [2 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK:   call void @llvm.aarch64.neon.st1x2.v8i16.p0i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i16* [[TMP9]])
+// CHECK:   ret void
 void test_vst1q_p16_x2(poly16_t *a, poly16x8x2_t b) {
-  // CHECK-LABEL: test_vst1q_p16_x2
   vst1q_p16_x2(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_p64_x2(i64* %a, [2 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly64x2x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x2x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i64*
+// CHECK:   call void @llvm.aarch64.neon.st1x2.v2i64.p0i64(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64* [[TMP9]])
+// CHECK:   ret void
 void test_vst1q_p64_x2(poly64_t *a, poly64x2x2_t b) {
-  // CHECK-LABEL: test_vst1q_p64_x2
   vst1q_p64_x2(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_u8_x2(i8* %a, [2 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st1x2.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i8* %a)
+// CHECK:   ret void
 void test_vst1_u8_x2(uint8_t *a, uint8x8x2_t b) {
-  // CHECK-LABEL: test_vst1_u8_x2
   vst1_u8_x2(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_u16_x2(i16* %a, [2 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK:   call void @llvm.aarch64.neon.st1x2.v4i16.p0i16(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i16* [[TMP9]])
+// CHECK:   ret void
 void test_vst1_u16_x2(uint16_t *a, uint16x4x2_t b) {
-  // CHECK-LABEL: test_vst1_u16_x2
   vst1_u16_x2(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_u32_x2(i32* %a, [2 x <2 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i32*
+// CHECK:   call void @llvm.aarch64.neon.st1x2.v2i32.p0i32(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i32* [[TMP9]])
+// CHECK:   ret void
 void test_vst1_u32_x2(uint32_t *a, uint32x2x2_t b) {
-  // CHECK-LABEL: test_vst1_u32_x2
   vst1_u32_x2(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_u64_x2(i64* %a, [2 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i64*
+// CHECK:   call void @llvm.aarch64.neon.st1x2.v1i64.p0i64(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64* [[TMP9]])
+// CHECK:   ret void
 void test_vst1_u64_x2(uint64_t *a, uint64x1x2_t b) {
-  // CHECK-LABEL: test_vst1_u64_x2
   vst1_u64_x2(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_s8_x2(i8* %a, [2 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st1x2.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i8* %a)
+// CHECK:   ret void
 void test_vst1_s8_x2(int8_t *a, int8x8x2_t b) {
-  // CHECK-LABEL: test_vst1_s8_x2
   vst1_s8_x2(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_s16_x2(i16* %a, [2 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK:   call void @llvm.aarch64.neon.st1x2.v4i16.p0i16(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i16* [[TMP9]])
+// CHECK:   ret void
 void test_vst1_s16_x2(int16_t *a, int16x4x2_t b) {
-  // CHECK-LABEL: test_vst1_s16_x2
   vst1_s16_x2(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_s32_x2(i32* %a, [2 x <2 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i32*
+// CHECK:   call void @llvm.aarch64.neon.st1x2.v2i32.p0i32(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i32* [[TMP9]])
+// CHECK:   ret void
 void test_vst1_s32_x2(int32_t *a, int32x2x2_t b) {
-  // CHECK-LABEL: test_vst1_s32_x2
   vst1_s32_x2(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_s64_x2(i64* %a, [2 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i64*
+// CHECK:   call void @llvm.aarch64.neon.st1x2.v1i64.p0i64(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64* [[TMP9]])
+// CHECK:   ret void
 void test_vst1_s64_x2(int64_t *a, int64x1x2_t b) {
-  // CHECK-LABEL: test_vst1_s64_x2
   vst1_s64_x2(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_f16_x2(half* %a, [2 x <4 x half>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x half>] [[B]].coerce, [2 x <4 x half>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK:   call void @llvm.aarch64.neon.st1x2.v4i16.p0i16(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i16* [[TMP9]])
+// CHECK:   ret void
 void test_vst1_f16_x2(float16_t *a, float16x4x2_t b) {
-  // CHECK-LABEL: test_vst1_f16_x2
   vst1_f16_x2(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_f32_x2(float* %a, [2 x <2 x float>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x float>] [[B]].coerce, [2 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
+// CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to float*
+// CHECK:   call void @llvm.aarch64.neon.st1x2.v2f32.p0f32(<2 x float> [[TMP7]], <2 x float> [[TMP8]], float* [[TMP9]])
+// CHECK:   ret void
 void test_vst1_f32_x2(float32_t *a, float32x2x2_t b) {
-  // CHECK-LABEL: test_vst1_f32_x2
   vst1_f32_x2(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_f64_x2(double* %a, [2 x <1 x double>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <1 x double>] [[B]].coerce, [2 x <1 x double>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x1x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
+// CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to double*
+// CHECK:   call void @llvm.aarch64.neon.st1x2.v1f64.p0f64(<1 x double> [[TMP7]], <1 x double> [[TMP8]], double* [[TMP9]])
+// CHECK:   ret void
 void test_vst1_f64_x2(float64_t *a, float64x1x2_t b) {
-  // CHECK-LABEL: test_vst1_f64_x2
   vst1_f64_x2(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_p8_x2(i8* %a, [2 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st1x2.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i8* %a)
+// CHECK:   ret void
 void test_vst1_p8_x2(poly8_t *a, poly8x8x2_t b) {
-  // CHECK-LABEL: test_vst1_p8_x2
   vst1_p8_x2(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_p16_x2(i16* %a, [2 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK:   call void @llvm.aarch64.neon.st1x2.v4i16.p0i16(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i16* [[TMP9]])
+// CHECK:   ret void
 void test_vst1_p16_x2(poly16_t *a, poly16x4x2_t b) {
-  // CHECK-LABEL: test_vst1_p16_x2
   vst1_p16_x2(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_p64_x2(i64* %a, [2 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly64x1x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x1x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x1x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP2]] to i64*
+// CHECK:   call void @llvm.aarch64.neon.st1x2.v1i64.p0i64(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64* [[TMP9]])
+// CHECK:   ret void
 void test_vst1_p64_x2(poly64_t *a, poly64x1x2_t b) {
-  // CHECK-LABEL: test_vst1_p64_x2
   vst1_p64_x2(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_u8_x3(i8* %a, [3 x <16 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st1x3.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i8* %a)
+// CHECK:   ret void
 void test_vst1q_u8_x3(uint8_t *a, uint8x16x3_t b) {
-  // CHECK-LABEL: test_vst1q_u8_x3
   vst1q_u8_x3(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_u16_x3(i16* %a, [3 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK:   call void @llvm.aarch64.neon.st1x3.v8i16.p0i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i16* [[TMP12]])
+// CHECK:   ret void
 void test_vst1q_u16_x3(uint16_t *a, uint16x8x3_t b) {
-  // CHECK-LABEL: test_vst1q_u16_x3
   vst1q_u16_x3(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_u32_x3(i32* %a, [3 x <4 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i32*
+// CHECK:   call void @llvm.aarch64.neon.st1x3.v4i32.p0i32(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i32* [[TMP12]])
+// CHECK:   ret void
 void test_vst1q_u32_x3(uint32_t *a, uint32x4x3_t b) {
-  // CHECK-LABEL: test_vst1q_u32_x3
   vst1q_u32_x3(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_u64_x3(i64* %a, [3 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x2x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x2x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i64*
+// CHECK:   call void @llvm.aarch64.neon.st1x3.v2i64.p0i64(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64* [[TMP12]])
+// CHECK:   ret void
 void test_vst1q_u64_x3(uint64_t *a, uint64x2x3_t b) {
-  // CHECK-LABEL: test_vst1q_u64_x3
   vst1q_u64_x3(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_s8_x3(i8* %a, [3 x <16 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st1x3.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i8* %a)
+// CHECK:   ret void
 void test_vst1q_s8_x3(int8_t *a, int8x16x3_t b) {
-  // CHECK-LABEL: test_vst1q_s8_x3
   vst1q_s8_x3(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_s16_x3(i16* %a, [3 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK:   call void @llvm.aarch64.neon.st1x3.v8i16.p0i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i16* [[TMP12]])
+// CHECK:   ret void
 void test_vst1q_s16_x3(int16_t *a, int16x8x3_t b) {
-  // CHECK-LABEL: test_vst1q_s16_x3
   vst1q_s16_x3(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_s32_x3(i32* %a, [3 x <4 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i32*
+// CHECK:   call void @llvm.aarch64.neon.st1x3.v4i32.p0i32(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i32* [[TMP12]])
+// CHECK:   ret void
 void test_vst1q_s32_x3(int32_t *a, int32x4x3_t b) {
-  // CHECK-LABEL: test_vst1q_s32_x3
   vst1q_s32_x3(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_s64_x3(i64* %a, [3 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int64x2x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x2x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i64*
+// CHECK:   call void @llvm.aarch64.neon.st1x3.v2i64.p0i64(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64* [[TMP12]])
+// CHECK:   ret void
 void test_vst1q_s64_x3(int64_t *a, int64x2x3_t b) {
-  // CHECK-LABEL: test_vst1q_s64_x3
   vst1q_s64_x3(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_f16_x3(half* %a, [3 x <8 x half>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x half>] [[B]].coerce, [3 x <8 x half>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK:   call void @llvm.aarch64.neon.st1x3.v8i16.p0i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i16* [[TMP12]])
+// CHECK:   ret void
 void test_vst1q_f16_x3(float16_t *a, float16x8x3_t b) {
-  // CHECK-LABEL: test_vst1q_f16_x3
   vst1q_f16_x3(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_f32_x3(float* %a, [3 x <4 x float>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x float>] [[B]].coerce, [3 x <4 x float>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
+// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to float*
+// CHECK:   call void @llvm.aarch64.neon.st1x3.v4f32.p0f32(<4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x float> [[TMP11]], float* [[TMP12]])
+// CHECK:   ret void
 void test_vst1q_f32_x3(float32_t *a, float32x4x3_t b) {
-  // CHECK-LABEL: test_vst1q_f32_x3
   vst1q_f32_x3(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_f64_x3(double* %a, [3 x <2 x double>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float64x2x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x2x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x double>] [[B]].coerce, [3 x <2 x double>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x double>
+// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to double*
+// CHECK:   call void @llvm.aarch64.neon.st1x3.v2f64.p0f64(<2 x double> [[TMP9]], <2 x double> [[TMP10]], <2 x double> [[TMP11]], double* [[TMP12]])
+// CHECK:   ret void
 void test_vst1q_f64_x3(float64_t *a, float64x2x3_t b) {
-  // CHECK-LABEL: test_vst1q_f64_x3
   vst1q_f64_x3(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_p8_x3(i8* %a, [3 x <16 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st1x3.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i8* %a)
+// CHECK:   ret void
 void test_vst1q_p8_x3(poly8_t *a, poly8x16x3_t b) {
-  // CHECK-LABEL: test_vst1q_p8_x3
   vst1q_p8_x3(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_p16_x3(i16* %a, [3 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK:   call void @llvm.aarch64.neon.st1x3.v8i16.p0i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i16* [[TMP12]])
+// CHECK:   ret void
 void test_vst1q_p16_x3(poly16_t *a, poly16x8x3_t b) {
-  // CHECK-LABEL: test_vst1q_p16_x3
   vst1q_p16_x3(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_p64_x3(i64* %a, [3 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly64x2x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x2x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i64*
+// CHECK:   call void @llvm.aarch64.neon.st1x3.v2i64.p0i64(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64* [[TMP12]])
+// CHECK:   ret void
 void test_vst1q_p64_x3(poly64_t *a, poly64x2x3_t b) {
-  // CHECK-LABEL: test_vst1q_p64_x3
   vst1q_p64_x3(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_u8_x3(i8* %a, [3 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st1x3.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i8* %a)
+// CHECK:   ret void
 void test_vst1_u8_x3(uint8_t *a, uint8x8x3_t b) {
-  // CHECK-LABEL: test_vst1_u8_x3
   vst1_u8_x3(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_u16_x3(i16* %a, [3 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK:   call void @llvm.aarch64.neon.st1x3.v4i16.p0i16(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i16* [[TMP12]])
+// CHECK:   ret void
 void test_vst1_u16_x3(uint16_t *a, uint16x4x3_t b) {
-  // CHECK-LABEL: test_vst1_u16_x3
   vst1_u16_x3(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_u32_x3(i32* %a, [3 x <2 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i32*
+// CHECK:   call void @llvm.aarch64.neon.st1x3.v2i32.p0i32(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i32* [[TMP12]])
+// CHECK:   ret void
 void test_vst1_u32_x3(uint32_t *a, uint32x2x3_t b) {
-  // CHECK-LABEL: test_vst1_u32_x3
   vst1_u32_x3(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_u64_x3(i64* %a, [3 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i64*
+// CHECK:   call void @llvm.aarch64.neon.st1x3.v1i64.p0i64(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64* [[TMP12]])
+// CHECK:   ret void
 void test_vst1_u64_x3(uint64_t *a, uint64x1x3_t b) {
-  // CHECK-LABEL: test_vst1_u64_x3
   vst1_u64_x3(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_s8_x3(i8* %a, [3 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st1x3.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i8* %a)
+// CHECK:   ret void
 void test_vst1_s8_x3(int8_t *a, int8x8x3_t b) {
-  // CHECK-LABEL: test_vst1_s8_x3
   vst1_s8_x3(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_s16_x3(i16* %a, [3 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK:   call void @llvm.aarch64.neon.st1x3.v4i16.p0i16(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i16* [[TMP12]])
+// CHECK:   ret void
 void test_vst1_s16_x3(int16_t *a, int16x4x3_t b) {
-  // CHECK-LABEL: test_vst1_s16_x3
   vst1_s16_x3(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_s32_x3(i32* %a, [3 x <2 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i32*
+// CHECK:   call void @llvm.aarch64.neon.st1x3.v2i32.p0i32(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i32* [[TMP12]])
+// CHECK:   ret void
 void test_vst1_s32_x3(int32_t *a, int32x2x3_t b) {
-  // CHECK-LABEL: test_vst1_s32_x3
   vst1_s32_x3(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_s64_x3(i64* %a, [3 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i64*
+// CHECK:   call void @llvm.aarch64.neon.st1x3.v1i64.p0i64(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64* [[TMP12]])
+// CHECK:   ret void
 void test_vst1_s64_x3(int64_t *a, int64x1x3_t b) {
-  // CHECK-LABEL: test_vst1_s64_x3
   vst1_s64_x3(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_f16_x3(half* %a, [3 x <4 x half>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x half>] [[B]].coerce, [3 x <4 x half>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK:   call void @llvm.aarch64.neon.st1x3.v4i16.p0i16(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i16* [[TMP12]])
+// CHECK:   ret void
 void test_vst1_f16_x3(float16_t *a, float16x4x3_t b) {
-  // CHECK-LABEL: test_vst1_f16_x3
   vst1_f16_x3(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_f32_x3(float* %a, [3 x <2 x float>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x float>] [[B]].coerce, [3 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
+// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to float*
+// CHECK:   call void @llvm.aarch64.neon.st1x3.v2f32.p0f32(<2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x float> [[TMP11]], float* [[TMP12]])
+// CHECK:   ret void
 void test_vst1_f32_x3(float32_t *a, float32x2x3_t b) {
-  // CHECK-LABEL: test_vst1_f32_x3
   vst1_f32_x3(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_f64_x3(double* %a, [3 x <1 x double>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float64x1x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x1x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <1 x double>] [[B]].coerce, [3 x <1 x double>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x1x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <1 x double> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double>
+// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to double*
+// CHECK:   call void @llvm.aarch64.neon.st1x3.v1f64.p0f64(<1 x double> [[TMP9]], <1 x double> [[TMP10]], <1 x double> [[TMP11]], double* [[TMP12]])
+// CHECK:   ret void
 void test_vst1_f64_x3(float64_t *a, float64x1x3_t b) {
-  // CHECK-LABEL: test_vst1_f64_x3
   vst1_f64_x3(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_p8_x3(i8* %a, [3 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st1x3.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i8* %a)
+// CHECK:   ret void
 void test_vst1_p8_x3(poly8_t *a, poly8x8x3_t b) {
-  // CHECK-LABEL: test_vst1_p8_x3
   vst1_p8_x3(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_p16_x3(i16* %a, [3 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK:   call void @llvm.aarch64.neon.st1x3.v4i16.p0i16(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i16* [[TMP12]])
+// CHECK:   ret void
 void test_vst1_p16_x3(poly16_t *a, poly16x4x3_t b) {
-  // CHECK-LABEL: test_vst1_p16_x3
   vst1_p16_x3(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_p64_x3(i64* %a, [3 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly64x1x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x1x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x1x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast i8* [[TMP2]] to i64*
+// CHECK:   call void @llvm.aarch64.neon.st1x3.v1i64.p0i64(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64* [[TMP12]])
+// CHECK:   ret void
 void test_vst1_p64_x3(poly64_t *a, poly64x1x3_t b) {
-  // CHECK-LABEL: test_vst1_p64_x3
   vst1_p64_x3(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_u8_x4(i8* %a, [4 x <16 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st1x4.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i8* %a)
+// CHECK:   ret void
 void test_vst1q_u8_x4(uint8_t *a, uint8x16x4_t b) {
-  // CHECK-LABEL: test_vst1q_u8_x4
   vst1q_u8_x4(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_u16_x4(i16* %a, [4 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK:   call void @llvm.aarch64.neon.st1x4.v8i16.p0i16(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i16* [[TMP15]])
+// CHECK:   ret void
 void test_vst1q_u16_x4(uint16_t *a, uint16x8x4_t b) {
-  // CHECK-LABEL: test_vst1q_u16_x4
   vst1q_u16_x4(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_u32_x4(i32* %a, [4 x <4 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
+// CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i32*
+// CHECK:   call void @llvm.aarch64.neon.st1x4.v4i32.p0i32(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], i32* [[TMP15]])
+// CHECK:   ret void
 void test_vst1q_u32_x4(uint32_t *a, uint32x4x4_t b) {
-  // CHECK-LABEL: test_vst1q_u32_x4
   vst1q_u32_x4(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_u64_x4(i64* %a, [4 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x2x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x2x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
+// CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i64*
+// CHECK:   call void @llvm.aarch64.neon.st1x4.v2i64.p0i64(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i64* [[TMP15]])
+// CHECK:   ret void
 void test_vst1q_u64_x4(uint64_t *a, uint64x2x4_t b) {
-  // CHECK-LABEL: test_vst1q_u64_x4
   vst1q_u64_x4(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_s8_x4(i8* %a, [4 x <16 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st1x4.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i8* %a)
+// CHECK:   ret void
 void test_vst1q_s8_x4(int8_t *a, int8x16x4_t b) {
-  // CHECK-LABEL: test_vst1q_s8_x4
   vst1q_s8_x4(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_s16_x4(i16* %a, [4 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK:   call void @llvm.aarch64.neon.st1x4.v8i16.p0i16(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i16* [[TMP15]])
+// CHECK:   ret void
 void test_vst1q_s16_x4(int16_t *a, int16x8x4_t b) {
-  // CHECK-LABEL: test_vst1q_s16_x4
   vst1q_s16_x4(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_s32_x4(i32* %a, [4 x <4 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
+// CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i32*
+// CHECK:   call void @llvm.aarch64.neon.st1x4.v4i32.p0i32(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], i32* [[TMP15]])
+// CHECK:   ret void
 void test_vst1q_s32_x4(int32_t *a, int32x4x4_t b) {
-  // CHECK-LABEL: test_vst1q_s32_x4
   vst1q_s32_x4(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_s64_x4(i64* %a, [4 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int64x2x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x2x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
+// CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i64*
+// CHECK:   call void @llvm.aarch64.neon.st1x4.v2i64.p0i64(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i64* [[TMP15]])
+// CHECK:   ret void
 void test_vst1q_s64_x4(int64_t *a, int64x2x4_t b) {
-  // CHECK-LABEL: test_vst1q_s64_x4
   vst1q_s64_x4(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_f16_x4(half* %a, [4 x <8 x half>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x half>] [[B]].coerce, [4 x <8 x half>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK:   call void @llvm.aarch64.neon.st1x4.v8i16.p0i16(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i16* [[TMP15]])
+// CHECK:   ret void
 void test_vst1q_f16_x4(float16_t *a, float16x8x4_t b) {
-  // CHECK-LABEL: test_vst1q_f16_x4
   vst1q_f16_x4(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_f32_x4(float* %a, [4 x <4 x float>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x float>] [[B]].coerce, [4 x <4 x float>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x float>
+// CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to float*
+// CHECK:   call void @llvm.aarch64.neon.st1x4.v4f32.p0f32(<4 x float> [[TMP11]], <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], float* [[TMP15]])
+// CHECK:   ret void
 void test_vst1q_f32_x4(float32_t *a, float32x4x4_t b) {
-  // CHECK-LABEL: test_vst1q_f32_x4
   vst1q_f32_x4(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_f64_x4(double* %a, [4 x <2 x double>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float64x2x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x2x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x double>] [[B]].coerce, [4 x <2 x double>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x double> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x double>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x double>
+// CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to double*
+// CHECK:   call void @llvm.aarch64.neon.st1x4.v2f64.p0f64(<2 x double> [[TMP11]], <2 x double> [[TMP12]], <2 x double> [[TMP13]], <2 x double> [[TMP14]], double* [[TMP15]])
+// CHECK:   ret void
 void test_vst1q_f64_x4(float64_t *a, float64x2x4_t b) {
-  // CHECK-LABEL: test_vst1q_f64_x4
   vst1q_f64_x4(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_p8_x4(i8* %a, [4 x <16 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st1x4.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i8* %a)
+// CHECK:   ret void
 void test_vst1q_p8_x4(poly8_t *a, poly8x16x4_t b) {
-  // CHECK-LABEL: test_vst1q_p8_x4
   vst1q_p8_x4(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_p16_x4(i16* %a, [4 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK:   call void @llvm.aarch64.neon.st1x4.v8i16.p0i16(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i16* [[TMP15]])
+// CHECK:   ret void
 void test_vst1q_p16_x4(poly16_t *a, poly16x8x4_t b) {
-  // CHECK-LABEL: test_vst1q_p16_x4
   vst1q_p16_x4(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_p64_x4(i64* %a, [4 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly64x2x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x2x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
+// CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i64*
+// CHECK:   call void @llvm.aarch64.neon.st1x4.v2i64.p0i64(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i64* [[TMP15]])
+// CHECK:   ret void
 void test_vst1q_p64_x4(poly64_t *a, poly64x2x4_t b) {
-  // CHECK-LABEL: test_vst1q_p64_x4
   vst1q_p64_x4(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_u8_x4(i8* %a, [4 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st1x4.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i8* %a)
+// CHECK:   ret void
 void test_vst1_u8_x4(uint8_t *a, uint8x8x4_t b) {
-  // CHECK-LABEL: test_vst1_u8_x4
   vst1_u8_x4(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_u16_x4(i16* %a, [4 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK:   call void @llvm.aarch64.neon.st1x4.v4i16.p0i16(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i16* [[TMP15]])
+// CHECK:   ret void
 void test_vst1_u16_x4(uint16_t *a, uint16x4x4_t b) {
-  // CHECK-LABEL: test_vst1_u16_x4
   vst1_u16_x4(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_u32_x4(i32* %a, [4 x <2 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
+// CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i32*
+// CHECK:   call void @llvm.aarch64.neon.st1x4.v2i32.p0i32(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], i32* [[TMP15]])
+// CHECK:   ret void
 void test_vst1_u32_x4(uint32_t *a, uint32x2x4_t b) {
-  // CHECK-LABEL: test_vst1_u32_x4
   vst1_u32_x4(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_u64_x4(i64* %a, [4 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
+// CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i64*
+// CHECK:   call void @llvm.aarch64.neon.st1x4.v1i64.p0i64(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i64* [[TMP15]])
+// CHECK:   ret void
 void test_vst1_u64_x4(uint64_t *a, uint64x1x4_t b) {
-  // CHECK-LABEL: test_vst1_u64_x4
   vst1_u64_x4(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_s8_x4(i8* %a, [4 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st1x4.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i8* %a)
+// CHECK:   ret void
 void test_vst1_s8_x4(int8_t *a, int8x8x4_t b) {
-  // CHECK-LABEL: test_vst1_s8_x4
   vst1_s8_x4(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_s16_x4(i16* %a, [4 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK:   call void @llvm.aarch64.neon.st1x4.v4i16.p0i16(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i16* [[TMP15]])
+// CHECK:   ret void
 void test_vst1_s16_x4(int16_t *a, int16x4x4_t b) {
-  // CHECK-LABEL: test_vst1_s16_x4
   vst1_s16_x4(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_s32_x4(i32* %a, [4 x <2 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
+// CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i32*
+// CHECK:   call void @llvm.aarch64.neon.st1x4.v2i32.p0i32(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], i32* [[TMP15]])
+// CHECK:   ret void
 void test_vst1_s32_x4(int32_t *a, int32x2x4_t b) {
-  // CHECK-LABEL: test_vst1_s32_x4
   vst1_s32_x4(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_s64_x4(i64* %a, [4 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
+// CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i64*
+// CHECK:   call void @llvm.aarch64.neon.st1x4.v1i64.p0i64(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i64* [[TMP15]])
+// CHECK:   ret void
 void test_vst1_s64_x4(int64_t *a, int64x1x4_t b) {
-  // CHECK-LABEL: test_vst1_s64_x4
   vst1_s64_x4(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_f16_x4(half* %a, [4 x <4 x half>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x half>] [[B]].coerce, [4 x <4 x half>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK:   call void @llvm.aarch64.neon.st1x4.v4i16.p0i16(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i16* [[TMP15]])
+// CHECK:   ret void
 void test_vst1_f16_x4(float16_t *a, float16x4x4_t b) {
-  // CHECK-LABEL: test_vst1_f16_x4
   vst1_f16_x4(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_f32_x4(float* %a, [4 x <2 x float>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x float>] [[B]].coerce, [4 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x float> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float>
+// CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to float*
+// CHECK:   call void @llvm.aarch64.neon.st1x4.v2f32.p0f32(<2 x float> [[TMP11]], <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], float* [[TMP15]])
+// CHECK:   ret void
 void test_vst1_f32_x4(float32_t *a, float32x2x4_t b) {
-  // CHECK-LABEL: test_vst1_f32_x4
   vst1_f32_x4(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_f64_x4(double* %a, [4 x <1 x double>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float64x1x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x1x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <1 x double>] [[B]].coerce, [4 x <1 x double>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x1x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <1 x double> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <1 x double> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x double>
+// CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to double*
+// CHECK:   call void @llvm.aarch64.neon.st1x4.v1f64.p0f64(<1 x double> [[TMP11]], <1 x double> [[TMP12]], <1 x double> [[TMP13]], <1 x double> [[TMP14]], double* [[TMP15]])
+// CHECK:   ret void
 void test_vst1_f64_x4(float64_t *a, float64x1x4_t b) {
-  // CHECK-LABEL: test_vst1_f64_x4
   vst1_f64_x4(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_p8_x4(i8* %a, [4 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st1x4.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i8* %a)
+// CHECK:   ret void
 void test_vst1_p8_x4(poly8_t *a, poly8x8x4_t b) {
-  // CHECK-LABEL: test_vst1_p8_x4
   vst1_p8_x4(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_p16_x4(i16* %a, [4 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i16*
+// CHECK:   call void @llvm.aarch64.neon.st1x4.v4i16.p0i16(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i16* [[TMP15]])
+// CHECK:   ret void
 void test_vst1_p16_x4(poly16_t *a, poly16x4x4_t b) {
-  // CHECK-LABEL: test_vst1_p16_x4
   vst1_p16_x4(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_p64_x4(i64* %a, [4 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly64x1x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x1x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x1x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
+// CHECK:   [[TMP15:%.*]] = bitcast i8* [[TMP2]] to i64*
+// CHECK:   call void @llvm.aarch64.neon.st1x4.v1i64.p0i64(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i64* [[TMP15]])
+// CHECK:   ret void
 void test_vst1_p64_x4(poly64_t *a, poly64x1x4_t b) {
-  // CHECK-LABEL: test_vst1_p64_x4
   vst1_p64_x4(a, b);
-  // CHECK: st1 {{{ ?v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d ?}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define i64 @test_vceqd_s64(i64 %a, i64 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = icmp eq i64 %a, %b
+// CHECK:   [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK:   ret i64 [[VCEQD_I]]
 int64_t test_vceqd_s64(int64_t a, int64_t b) {
-// CHECK-LABEL: test_vceqd_s64
-// CHECK: {{cmeq d[0-9]+, d[0-9]+, d[0-9]+|cmp x0, x1}}
   return (int64_t)vceqd_s64(a, b);
 }
 
+// CHECK-LABEL: define i64 @test_vceqd_u64(i64 %a, i64 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = icmp eq i64 %a, %b
+// CHECK:   [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK:   ret i64 [[VCEQD_I]]
 uint64_t test_vceqd_u64(uint64_t a, uint64_t b) {
-// CHECK-LABEL: test_vceqd_u64
-// CHECK: {{cmeq d[0-9]+, d[0-9]+, d[0-9]+|cmp x0, x1}}
   return (int64_t)vceqd_u64(a, b);
 }
 
+// CHECK-LABEL: define i64 @test_vceqzd_s64(i64 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = icmp eq i64 %a, 0
+// CHECK:   [[VCEQZ_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK:   ret i64 [[VCEQZ_I]]
 int64_t test_vceqzd_s64(int64_t a) {
-// CHECK-LABEL: test_vceqzd_s64
-// CHECK: {{cmeq d[0-9]+, d[0-9]+, #0x0|cmp x0, #0}}
   return (int64_t)vceqzd_s64(a);
 }
 
+// CHECK-LABEL: define i64 @test_vceqzd_u64(i64 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = icmp eq i64 %a, 0
+// CHECK:   [[VCEQZD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK:   ret i64 [[VCEQZD_I]]
 int64_t test_vceqzd_u64(int64_t a) {
-// CHECK-LABEL: test_vceqzd_u64
-// CHECK: {{cmeq d[0-9]+, d[0-9]+, #0x0|cmp x0, #0}}
   return (int64_t)vceqzd_u64(a);
 }
 
+// CHECK-LABEL: define i64 @test_vcged_s64(i64 %a, i64 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = icmp sge i64 %a, %b
+// CHECK:   [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK:   ret i64 [[VCEQD_I]]
 int64_t test_vcged_s64(int64_t a, int64_t b) {
-// CHECK-LABEL: test_vcged_s64
-// CHECK: {{cmge d[0-9]+, d[0-9]+, d[0-9]+|cmp x0, x1}}
   return (int64_t)vcged_s64(a, b);
 }
 
+// CHECK-LABEL: define i64 @test_vcged_u64(i64 %a, i64 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = icmp uge i64 %a, %b
+// CHECK:   [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK:   ret i64 [[VCEQD_I]]
 uint64_t test_vcged_u64(uint64_t a, uint64_t b) {
-// CHECK-LABEL: test_vcged_u64
-// CHECK: {{cmhs d[0-9]+, d[0-9]+, d[0-9]+|cmp x0, x1}}
     return (uint64_t)vcged_u64(a, b);
 }
 
+// CHECK-LABEL: define i64 @test_vcgezd_s64(i64 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = icmp sge i64 %a, 0
+// CHECK:   [[VCGEZ_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK:   ret i64 [[VCGEZ_I]]
 int64_t test_vcgezd_s64(int64_t a) {
-// CHECK-LABEL: test_vcgezd_s64
-// CHECK: {{cmge d[0-9]+, d[0-9]+, #0x0|eor x0, x[0-9]+, x0, asr #63}}
   return (int64_t)vcgezd_s64(a);
 }
 
+// CHECK-LABEL: define i64 @test_vcgtd_s64(i64 %a, i64 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = icmp sgt i64 %a, %b
+// CHECK:   [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK:   ret i64 [[VCEQD_I]]
 int64_t test_vcgtd_s64(int64_t a, int64_t b) {
-// CHECK-LABEL: test_vcgtd_s64
-// CHECK: {{cmgt d[0-9]+, d[0-9]+, d[0-9]+|cmp x0, x1}}
   return (int64_t)vcgtd_s64(a, b);
 }
 
+// CHECK-LABEL: define i64 @test_vcgtd_u64(i64 %a, i64 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = icmp ugt i64 %a, %b
+// CHECK:   [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK:   ret i64 [[VCEQD_I]]
 uint64_t test_vcgtd_u64(uint64_t a, uint64_t b) {
-// CHECK-LABEL: test_vcgtd_u64
-// CHECK: {{cmhi d[0-9]+, d[0-9]+, d[0-9]+|cmp x0, x1}}
   return (uint64_t)vcgtd_u64(a, b);
 }
 
+// CHECK-LABEL: define i64 @test_vcgtzd_s64(i64 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = icmp sgt i64 %a, 0
+// CHECK:   [[VCGTZ_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK:   ret i64 [[VCGTZ_I]]
 int64_t test_vcgtzd_s64(int64_t a) {
-// CHECK-LABEL: test_vcgtzd_s64
-// CHECK: {{cmgt d[0-9]+, d[0-9]+, #0x0|cmp x0, #0}}
   return (int64_t)vcgtzd_s64(a);
 }
 
+// CHECK-LABEL: define i64 @test_vcled_s64(i64 %a, i64 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = icmp sle i64 %a, %b
+// CHECK:   [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK:   ret i64 [[VCEQD_I]]
 int64_t test_vcled_s64(int64_t a, int64_t b) {
-// CHECK-LABEL: test_vcled_s64
-// CHECK: {{cmge d[0-9]+, d[0-9]+, d[0-9]+|cmp x0, x1}}
   return (int64_t)vcled_s64(a, b);
 }
 
+// CHECK-LABEL: define i64 @test_vcled_u64(i64 %a, i64 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = icmp ule i64 %a, %b
+// CHECK:   [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK:   ret i64 [[VCEQD_I]]
 uint64_t test_vcled_u64(uint64_t a, uint64_t b) {
-// CHECK-LABEL: test_vcled_u64
-// CHECK: {{cmhs d[0-9]+, d[0-9]+, d[0-9]+|cmp x0, x1}}
   return (uint64_t)vcled_u64(a, b);
 }
 
+// CHECK-LABEL: define i64 @test_vclezd_s64(i64 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = icmp sle i64 %a, 0
+// CHECK:   [[VCLEZ_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK:   ret i64 [[VCLEZ_I]]
 int64_t test_vclezd_s64(int64_t a) {
-// CHECK-LABEL: test_vclezd_s64
-// CHECK: {{cmle d[0-9]+, d[0-9]+, #0x0|cmp x0, #1}}
   return (int64_t)vclezd_s64(a);
 }
 
+// CHECK-LABEL: define i64 @test_vcltd_s64(i64 %a, i64 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = icmp slt i64 %a, %b
+// CHECK:   [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK:   ret i64 [[VCEQD_I]]
 int64_t test_vcltd_s64(int64_t a, int64_t b) {
-// CHECK-LABEL: test_vcltd_s64
-// CHECK: {{cmgt d[0-9]+, d[0-9]+, d[0-9]+|cmp x0, x1}}
   return (int64_t)vcltd_s64(a, b);
 }
 
+// CHECK-LABEL: define i64 @test_vcltd_u64(i64 %a, i64 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = icmp ult i64 %a, %b
+// CHECK:   [[VCEQD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK:   ret i64 [[VCEQD_I]]
 uint64_t test_vcltd_u64(uint64_t a, uint64_t b) {
-// CHECK-LABEL: test_vcltd_u64
-// CHECK: {{cmhi d[0-9]+, d[0-9]+, d[0-9]+|cmp x0, x1}}
   return (uint64_t)vcltd_u64(a, b);
 }
 
+// CHECK-LABEL: define i64 @test_vcltzd_s64(i64 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = icmp slt i64 %a, 0
+// CHECK:   [[VCLTZ_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK:   ret i64 [[VCLTZ_I]]
 int64_t test_vcltzd_s64(int64_t a) {
-// CHECK-LABEL: test_vcltzd_s64
-// CHECK: {{cmlt d[0-9]+, d[0-9]+, #0x0|asr x0, x0, #63}}
   return (int64_t)vcltzd_s64(a);
 }
 
+// CHECK-LABEL: define i64 @test_vtstd_s64(i64 %a, i64 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = and i64 %a, %b
+// CHECK:   [[TMP1:%.*]] = icmp ne i64 [[TMP0]], 0
+// CHECK:   [[VTSTD_I:%.*]] = sext i1 [[TMP1]] to i64
+// CHECK:   ret i64 [[VTSTD_I]]
 int64_t test_vtstd_s64(int64_t a, int64_t b) {
-// CHECK-LABEL: test_vtstd_s64
-// CHECK: {{cmtst d[0-9]+, d[0-9]+, d[0-9]+|tst x1, x0}}
   return (int64_t)vtstd_s64(a, b);
 }
 
+// CHECK-LABEL: define i64 @test_vtstd_u64(i64 %a, i64 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = and i64 %a, %b
+// CHECK:   [[TMP1:%.*]] = icmp ne i64 [[TMP0]], 0
+// CHECK:   [[VTSTD_I:%.*]] = sext i1 [[TMP1]] to i64
+// CHECK:   ret i64 [[VTSTD_I]]
 uint64_t test_vtstd_u64(uint64_t a, uint64_t b) {
-// CHECK-LABEL: test_vtstd_u64
-// CHECK: {{cmtst d[0-9]+, d[0-9]+, d[0-9]+|tst x1, x0}}
   return (uint64_t)vtstd_u64(a, b);
 }
 
+// CHECK-LABEL: define i64 @test_vabsd_s64(i64 %a) #0 {
+// CHECK:   [[VABSD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.abs.i64(i64 %a) #4
+// CHECK:   ret i64 [[VABSD_S64_I]]
 int64_t test_vabsd_s64(int64_t a) {
-// CHECK-LABEL: test_vabsd_s64
-// CHECK: abs {{d[0-9]+}}, {{d[0-9]+}}
   return (int64_t)vabsd_s64(a);
 }
 
+// CHECK-LABEL: define i8 @test_vqabsb_s8(i8 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0
+// CHECK:   [[VQABSB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8> [[TMP0]]) #4
+// CHECK:   [[TMP1:%.*]] = extractelement <8 x i8> [[VQABSB_S8_I]], i64 0
+// CHECK:   ret i8 [[TMP1]]
 int8_t test_vqabsb_s8(int8_t a) {
-// CHECK-LABEL: test_vqabsb_s8
-// CHECK: sqabs {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}
   return (int8_t)vqabsb_s8(a);
 }
 
+// CHECK-LABEL: define i16 @test_vqabsh_s16(i16 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
+// CHECK:   [[VQABSH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16> [[TMP0]]) #4
+// CHECK:   [[TMP1:%.*]] = extractelement <4 x i16> [[VQABSH_S16_I]], i64 0
+// CHECK:   ret i16 [[TMP1]]
 int16_t test_vqabsh_s16(int16_t a) {
-// CHECK-LABEL: test_vqabsh_s16
-// CHECK: sqabs {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}
   return (int16_t)vqabsh_s16(a);
 }
 
+// CHECK-LABEL: define i32 @test_vqabss_s32(i32 %a) #0 {
+// CHECK:   [[VQABSS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqabs.i32(i32 %a) #4
+// CHECK:   ret i32 [[VQABSS_S32_I]]
 int32_t test_vqabss_s32(int32_t a) {
-// CHECK-LABEL: test_vqabss_s32
-// CHECK: sqabs {{s[0-9]+}}, {{s[0-9]+}}
   return (int32_t)vqabss_s32(a);
 }
 
+// CHECK-LABEL: define i64 @test_vqabsd_s64(i64 %a) #0 {
+// CHECK:   [[VQABSD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqabs.i64(i64 %a) #4
+// CHECK:   ret i64 [[VQABSD_S64_I]]
 int64_t test_vqabsd_s64(int64_t a) {
-// CHECK-LABEL: test_vqabsd_s64
-// CHECK: sqabs {{d[0-9]+}}, {{d[0-9]+}}
   return (int64_t)vqabsd_s64(a);
 }
 
+// CHECK-LABEL: define i64 @test_vnegd_s64(i64 %a) #0 {
+// CHECK:   [[VNEGD_I:%.*]] = sub i64 0, %a
+// CHECK:   ret i64 [[VNEGD_I]]
 int64_t test_vnegd_s64(int64_t a) {
-// CHECK-LABEL: test_vnegd_s64
-// CHECK: neg {{[xd][0-9]+}}, {{[xd][0-9]+}}
   return (int64_t)vnegd_s64(a);
 }
 
+// CHECK-LABEL: define i8 @test_vqnegb_s8(i8 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0
+// CHECK:   [[VQNEGB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8> [[TMP0]]) #4
+// CHECK:   [[TMP1:%.*]] = extractelement <8 x i8> [[VQNEGB_S8_I]], i64 0
+// CHECK:   ret i8 [[TMP1]]
 int8_t test_vqnegb_s8(int8_t a) {
-// CHECK-LABEL: test_vqnegb_s8
-// CHECK: sqneg {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}
   return (int8_t)vqnegb_s8(a);
 }
 
+// CHECK-LABEL: define i16 @test_vqnegh_s16(i16 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
+// CHECK:   [[VQNEGH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16> [[TMP0]]) #4
+// CHECK:   [[TMP1:%.*]] = extractelement <4 x i16> [[VQNEGH_S16_I]], i64 0
+// CHECK:   ret i16 [[TMP1]]
 int16_t test_vqnegh_s16(int16_t a) {
-// CHECK-LABEL: test_vqnegh_s16
-// CHECK: sqneg {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}
   return (int16_t)vqnegh_s16(a);
 }
 
+// CHECK-LABEL: define i32 @test_vqnegs_s32(i32 %a) #0 {
+// CHECK:   [[VQNEGS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqneg.i32(i32 %a) #4
+// CHECK:   ret i32 [[VQNEGS_S32_I]]
 int32_t test_vqnegs_s32(int32_t a) {
-// CHECK-LABEL: test_vqnegs_s32
-// CHECK: sqneg {{s[0-9]+}}, {{s[0-9]+}}
   return (int32_t)vqnegs_s32(a);
 }
 
+// CHECK-LABEL: define i64 @test_vqnegd_s64(i64 %a) #0 {
+// CHECK:   [[VQNEGD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sqneg.i64(i64 %a) #4
+// CHECK:   ret i64 [[VQNEGD_S64_I]]
 int64_t test_vqnegd_s64(int64_t a) {
-// CHECK-LABEL: test_vqnegd_s64
-// CHECK: sqneg {{d[0-9]+}}, {{d[0-9]+}}
   return (int64_t)vqnegd_s64(a);
 }
 
+// CHECK-LABEL: define i8 @test_vuqaddb_s8(i8 %a, i8 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0
+// CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 0
+// CHECK:   [[VUQADDB_S8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.suqadd.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4
+// CHECK:   [[TMP2:%.*]] = extractelement <8 x i8> [[VUQADDB_S8_I]], i64 0
+// CHECK:   ret i8 [[TMP2]]
 int8_t test_vuqaddb_s8(int8_t a, int8_t b) {
-// CHECK-LABEL: test_vuqaddb_s8
-// CHECK: suqadd {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}
   return (int8_t)vuqaddb_s8(a, b);
 }
 
+// CHECK-LABEL: define i16 @test_vuqaddh_s16(i16 %a, i16 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
+// CHECK:   [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
+// CHECK:   [[VUQADDH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.suqadd.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4
+// CHECK:   [[TMP2:%.*]] = extractelement <4 x i16> [[VUQADDH_S16_I]], i64 0
+// CHECK:   ret i16 [[TMP2]]
 int16_t test_vuqaddh_s16(int16_t a, int16_t b) {
-// CHECK-LABEL: test_vuqaddh_s16
-// CHECK: suqadd {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}
   return (int16_t)vuqaddh_s16(a, b);
 }
 
+// CHECK-LABEL: define i32 @test_vuqadds_s32(i32 %a, i32 %b) #0 {
+// CHECK:   [[VUQADDS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.suqadd.i32(i32 %a, i32 %b) #4
+// CHECK:   ret i32 [[VUQADDS_S32_I]]
 int32_t test_vuqadds_s32(int32_t a, int32_t b) {
-// CHECK-LABEL: test_vuqadds_s32
-// CHECK: suqadd {{s[0-9]+}}, {{s[0-9]+}}
   return (int32_t)vuqadds_s32(a, b);
 }
 
+// CHECK-LABEL: define i64 @test_vuqaddd_s64(i64 %a, i64 %b) #0 {
+// CHECK:   [[VUQADDD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.suqadd.i64(i64 %a, i64 %b) #4
+// CHECK:   ret i64 [[VUQADDD_S64_I]]
 int64_t test_vuqaddd_s64(int64_t a, int64_t b) {
-// CHECK-LABEL: test_vuqaddd_s64
-// CHECK: suqadd {{d[0-9]+}}, {{d[0-9]+}}
   return (int64_t)vuqaddd_s64(a, b);
 }
 
+// CHECK-LABEL: define i8 @test_vsqaddb_u8(i8 %a, i8 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0
+// CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 %b, i64 0
+// CHECK:   [[VSQADDB_U8_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.usqadd.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]]) #4
+// CHECK:   [[TMP2:%.*]] = extractelement <8 x i8> [[VSQADDB_U8_I]], i64 0
+// CHECK:   ret i8 [[TMP2]]
 uint8_t test_vsqaddb_u8(uint8_t a, uint8_t b) {
-// CHECK-LABEL: test_vsqaddb_u8
-// CHECK: usqadd {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}
   return (uint8_t)vsqaddb_u8(a, b);
 }
 
+// CHECK-LABEL: define i16 @test_vsqaddh_u16(i16 %a, i16 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
+// CHECK:   [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
+// CHECK:   [[VSQADDH_U16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.usqadd.v4i16(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4
+// CHECK:   [[TMP2:%.*]] = extractelement <4 x i16> [[VSQADDH_U16_I]], i64 0
+// CHECK:   ret i16 [[TMP2]]
 uint16_t test_vsqaddh_u16(uint16_t a, uint16_t b) {
-// CHECK-LABEL: test_vsqaddh_u16
-// CHECK: usqadd {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}
   return (uint16_t)vsqaddh_u16(a, b);
 }
 
+// CHECK-LABEL: define i32 @test_vsqadds_u32(i32 %a, i32 %b) #0 {
+// CHECK:   [[VSQADDS_U32_I:%.*]] = call i32 @llvm.aarch64.neon.usqadd.i32(i32 %a, i32 %b) #4
+// CHECK:   ret i32 [[VSQADDS_U32_I]]
 uint32_t test_vsqadds_u32(uint32_t a, uint32_t b) {
-// CHECK-LABEL: test_vsqadds_u32
-// CHECK: usqadd {{s[0-9]+}}, {{s[0-9]+}}
   return (uint32_t)vsqadds_u32(a, b);
 }
 
+// CHECK-LABEL: define i64 @test_vsqaddd_u64(i64 %a, i64 %b) #0 {
+// CHECK:   [[VSQADDD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.usqadd.i64(i64 %a, i64 %b) #4
+// CHECK:   ret i64 [[VSQADDD_U64_I]]
 uint64_t test_vsqaddd_u64(uint64_t a, uint64_t b) {
-// CHECK-LABEL: test_vsqaddd_u64
-// CHECK: usqadd {{d[0-9]+}}, {{d[0-9]+}}
   return (uint64_t)vsqaddd_u64(a, b);
 }
 
+// CHECK-LABEL: define i32 @test_vqdmlalh_s16(i32 %a, i16 %b, i16 %c) #0 {
+// CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
+// CHECK:   [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %c, i64 0
+// CHECK:   [[VQDMLXL_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4
+// CHECK:   [[LANE0_I:%.*]] = extractelement <4 x i32> [[VQDMLXL_I]], i64 0
+// CHECK:   [[VQDMLXL1_I:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %a, i32 [[LANE0_I]]) #4
+// CHECK:   ret i32 [[VQDMLXL1_I]]
 int32_t test_vqdmlalh_s16(int32_t a, int16_t b, int16_t c) {
 
-// CHECK-ARM64-LABEL: test_vqdmlalh_s16
-// CHECK-ARM64: sqdmull v[[PROD:[0-9]+]].4s, {{v[0-9]+.4h}}, {{v[0-9]+.4h}}
-// CHECK-ARM64: sqadd {{s[0-9]+}}, {{s[0-9]+}}, s[[PROD]]
   return (int32_t)vqdmlalh_s16(a, b, c);
 }
 
+// CHECK-LABEL: define i64 @test_vqdmlals_s32(i64 %a, i32 %b, i32 %c) #0 {
+// CHECK:   [[VQDMLXL_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 %c) #4
+// CHECK:   [[VQDMLXL1_I:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %a, i64 [[VQDMLXL_I]]) #4
+// CHECK:   ret i64 [[VQDMLXL1_I]]
 int64_t test_vqdmlals_s32(int64_t a, int32_t b, int32_t c) {
-// CHECK-LABEL: test_vqdmlals_s32
-// CHECK: sqdmlal {{d[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
   return (int64_t)vqdmlals_s32(a, b, c);
 }
 
+// CHECK-LABEL: define i32 @test_vqdmlslh_s16(i32 %a, i16 %b, i16 %c) #0 {
+// CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
+// CHECK:   [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %c, i64 0
+// CHECK:   [[VQDMLXL_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4
+// CHECK:   [[LANE0_I:%.*]] = extractelement <4 x i32> [[VQDMLXL_I]], i64 0
+// CHECK:   [[VQDMLXL1_I:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %a, i32 [[LANE0_I]]) #4
+// CHECK:   ret i32 [[VQDMLXL1_I]]
 int32_t test_vqdmlslh_s16(int32_t a, int16_t b, int16_t c) {
 
-// CHECK-ARM64-LABEL: test_vqdmlslh_s16
-// CHECK-ARM64: sqdmull v[[PROD:[0-9]+]].4s, {{v[0-9]+.4h}}, {{v[0-9]+.4h}}
-// CHECK-ARM64: sqsub {{s[0-9]+}}, {{s[0-9]+}}, s[[PROD]]
   return (int32_t)vqdmlslh_s16(a, b, c);
 }
 
+// CHECK-LABEL: define i64 @test_vqdmlsls_s32(i64 %a, i32 %b, i32 %c) #0 {
+// CHECK:   [[VQDMLXL_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 %c) #4
+// CHECK:   [[VQDMLXL1_I:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %a, i64 [[VQDMLXL_I]]) #4
+// CHECK:   ret i64 [[VQDMLXL1_I]]
 int64_t test_vqdmlsls_s32(int64_t a, int32_t b, int32_t c) {
-// CHECK-LABEL: test_vqdmlsls_s32
-// CHECK: sqdmlsl {{d[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
   return (int64_t)vqdmlsls_s32(a, b, c);
 }
 
+// CHECK-LABEL: define i32 @test_vqdmullh_s16(i16 %a, i16 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
+// CHECK:   [[TMP1:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
+// CHECK:   [[VQDMULLH_S16_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP0]], <4 x i16> [[TMP1]]) #4
+// CHECK:   [[TMP2:%.*]] = extractelement <4 x i32> [[VQDMULLH_S16_I]], i64 0
+// CHECK:   ret i32 [[TMP2]]
 int32_t test_vqdmullh_s16(int16_t a, int16_t b) {
-// CHECK-LABEL: test_vqdmullh_s16
-// CHECK: sqdmull {{s[0-9]+|v[0-9]+.4s}}, {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}
   return (int32_t)vqdmullh_s16(a, b);
 }
 
+// CHECK-LABEL: define i64 @test_vqdmulls_s32(i32 %a, i32 %b) #0 {
+// CHECK:   [[VQDMULLS_S32_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %a, i32 %b) #4
+// CHECK:   ret i64 [[VQDMULLS_S32_I]]
 int64_t test_vqdmulls_s32(int32_t a, int32_t b) {
-// CHECK-LABEL: test_vqdmulls_s32
-// CHECK: sqdmull {{d[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
   return (int64_t)vqdmulls_s32(a, b);
 }
 
+// CHECK-LABEL: define i8 @test_vqmovunh_s16(i16 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = insertelement <8 x i16> undef, i16 %a, i64 0
+// CHECK:   [[VQMOVUNH_S16_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> [[TMP0]]) #4
+// CHECK:   [[TMP1:%.*]] = extractelement <8 x i8> [[VQMOVUNH_S16_I]], i64 0
+// CHECK:   ret i8 [[TMP1]]
 int8_t test_vqmovunh_s16(int16_t a) {
-// CHECK-LABEL: test_vqmovunh_s16
-// CHECK: sqxtun {{b[0-9]+|v[0-9]+.8b}}, {{h[0-9]+|v[0-9]+.8h}}
   return (int8_t)vqmovunh_s16(a);
 }
 
+// CHECK-LABEL: define i16 @test_vqmovuns_s32(i32 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 %a, i64 0
+// CHECK:   [[VQMOVUNS_S32_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> [[TMP0]]) #4
+// CHECK:   [[TMP1:%.*]] = extractelement <4 x i16> [[VQMOVUNS_S32_I]], i64 0
+// CHECK:   ret i16 [[TMP1]]
 int16_t test_vqmovuns_s32(int32_t a) {
-// CHECK-LABEL: test_vqmovuns_s32
-// CHECK: sqxtun {{h[0-9]+|v[0-9]+.4h}}, {{s[0-9]+|v[0-9]+.4s}}
   return (int16_t)vqmovuns_s32(a);
 }
 
+// CHECK-LABEL: define i32 @test_vqmovund_s64(i64 %a) #0 {
+// CHECK:   [[VQMOVUND_S64_I:%.*]] = call i32 @llvm.aarch64.neon.scalar.sqxtun.i32.i64(i64 %a) #4
+// CHECK:   ret i32 [[VQMOVUND_S64_I]]
 int32_t test_vqmovund_s64(int64_t a) {
-// CHECK-LABEL: test_vqmovund_s64
-// CHECK: sqxtun {{s[0-9]+}}, {{d[0-9]+}}
   return (int32_t)vqmovund_s64(a);
 }
 
+// CHECK-LABEL: define i8 @test_vqmovnh_s16(i16 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = insertelement <8 x i16> undef, i16 %a, i64 0
+// CHECK:   [[VQMOVNH_S16_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> [[TMP0]]) #4
+// CHECK:   [[TMP1:%.*]] = extractelement <8 x i8> [[VQMOVNH_S16_I]], i64 0
+// CHECK:   ret i8 [[TMP1]]
 int8_t test_vqmovnh_s16(int16_t a) {
-// CHECK-LABEL: test_vqmovnh_s16
-// CHECK: sqxtn {{b[0-9]+|v[0-9]+.8b}}, {{h[0-9]+|v[0-9]+.8h}}
   return (int8_t)vqmovnh_s16(a);
 }
 
+// CHECK-LABEL: define i16 @test_vqmovns_s32(i32 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 %a, i64 0
+// CHECK:   [[VQMOVNS_S32_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> [[TMP0]]) #4
+// CHECK:   [[TMP1:%.*]] = extractelement <4 x i16> [[VQMOVNS_S32_I]], i64 0
+// CHECK:   ret i16 [[TMP1]]
 int16_t test_vqmovns_s32(int32_t a) {
-// CHECK-LABEL: test_vqmovns_s32
-// CHECK: sqxtn {{h[0-9]+|v[0-9]+.4h}}, {{s[0-9]+|v[0-9]+.4s}}
   return (int16_t)vqmovns_s32(a);
 }
 
+// CHECK-LABEL: define i32 @test_vqmovnd_s64(i64 %a) #0 {
+// CHECK:   [[VQMOVND_S64_I:%.*]] = call i32 @llvm.aarch64.neon.scalar.sqxtn.i32.i64(i64 %a) #4
+// CHECK:   ret i32 [[VQMOVND_S64_I]]
 int32_t test_vqmovnd_s64(int64_t a) {
-// CHECK-LABEL: test_vqmovnd_s64
-// CHECK: sqxtn {{s[0-9]+}}, {{d[0-9]+}}
   return (int32_t)vqmovnd_s64(a);
 }
 
+// CHECK-LABEL: define i8 @test_vqmovnh_u16(i16 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = insertelement <8 x i16> undef, i16 %a, i64 0
+// CHECK:   [[VQMOVNH_U16_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> [[TMP0]]) #4
+// CHECK:   [[TMP1:%.*]] = extractelement <8 x i8> [[VQMOVNH_U16_I]], i64 0
+// CHECK:   ret i8 [[TMP1]]
 int8_t test_vqmovnh_u16(int16_t a) {
-// CHECK-LABEL: test_vqmovnh_u16
-// CHECK: uqxtn {{b[0-9]+|v[0-9]+.8b}}, {{h[0-9]+|v[0-9]+.8h}}
   return (int8_t)vqmovnh_u16(a);
 }
 
+// CHECK-LABEL: define i16 @test_vqmovns_u32(i32 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 %a, i64 0
+// CHECK:   [[VQMOVNS_U32_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> [[TMP0]]) #4
+// CHECK:   [[TMP1:%.*]] = extractelement <4 x i16> [[VQMOVNS_U32_I]], i64 0
+// CHECK:   ret i16 [[TMP1]]
 int16_t test_vqmovns_u32(int32_t a) {
-// CHECK-LABEL: test_vqmovns_u32
-// CHECK: uqxtn {{h[0-9]+|v[0-9]+.4h}}, {{s[0-9]+|v[0-9]+.4s}}
   return (int16_t)vqmovns_u32(a);
 }
 
+// CHECK-LABEL: define i32 @test_vqmovnd_u64(i64 %a) #0 {
+// CHECK:   [[VQMOVND_U64_I:%.*]] = call i32 @llvm.aarch64.neon.scalar.uqxtn.i32.i64(i64 %a) #4
+// CHECK:   ret i32 [[VQMOVND_U64_I]]
 int32_t test_vqmovnd_u64(int64_t a) {
-// CHECK-LABEL: test_vqmovnd_u64
-// CHECK: uqxtn {{s[0-9]+}}, {{d[0-9]+}}
   return (int32_t)vqmovnd_u64(a);
 }
 
+// CHECK-LABEL: define i32 @test_vceqs_f32(float %a, float %b) #0 {
+// CHECK:   [[TMP0:%.*]] = fcmp oeq float %a, %b
+// CHECK:   [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
+// CHECK:   ret i32 [[VCMPD_I]]
 uint32_t test_vceqs_f32(float32_t a, float32_t b) {
-// CHECK-LABEL: test_vceqs_f32
-// CHECK: {{fcmeq s0, s0, s1|fcmp s0, s1}}
   return (uint32_t)vceqs_f32(a, b);
 }
 
+// CHECK-LABEL: define i64 @test_vceqd_f64(double %a, double %b) #0 {
+// CHECK:   [[TMP0:%.*]] = fcmp oeq double %a, %b
+// CHECK:   [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK:   ret i64 [[VCMPD_I]]
 uint64_t test_vceqd_f64(float64_t a, float64_t b) {
-// CHECK-LABEL: test_vceqd_f64
-// CHECK: {{fcmeq d0, d0, d1|fcmp d0, d1}}
   return (uint64_t)vceqd_f64(a, b);
 }
 
+// CHECK-LABEL: define i32 @test_vceqzs_f32(float %a) #0 {
+// CHECK:   [[TMP0:%.*]] = fcmp oeq float %a, 0.000000e+00
+// CHECK:   [[VCEQZ_I:%.*]] = sext i1 [[TMP0]] to i32
+// CHECK:   ret i32 [[VCEQZ_I]]
 uint32_t test_vceqzs_f32(float32_t a) {
-// CHECK-LABEL: test_vceqzs_f32
-// CHECK: {{fcmeq s0, s0, #0.0|fcmp s0, #0.0}}
   return (uint32_t)vceqzs_f32(a);
 }
 
+// CHECK-LABEL: define i64 @test_vceqzd_f64(double %a) #0 {
+// CHECK:   [[TMP0:%.*]] = fcmp oeq double %a, 0.000000e+00
+// CHECK:   [[VCEQZ_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK:   ret i64 [[VCEQZ_I]]
 uint64_t test_vceqzd_f64(float64_t a) {
-// CHECK-LABEL: test_vceqzd_f64
-// CHECK: {{fcmeq d0, d0, #0.0|fcmp d0, #0.0}}
   return (uint64_t)vceqzd_f64(a);
 }
 
+// CHECK-LABEL: define i32 @test_vcges_f32(float %a, float %b) #0 {
+// CHECK:   [[TMP0:%.*]] = fcmp oge float %a, %b
+// CHECK:   [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
+// CHECK:   ret i32 [[VCMPD_I]]
 uint32_t test_vcges_f32(float32_t a, float32_t b) {
-// CHECK-LABEL: test_vcges_f32
-// CHECK: {{fcmge s0, s0, s1|fcmp s0, s1}}
   return (uint32_t)vcges_f32(a, b);
 }
 
+// CHECK-LABEL: define i64 @test_vcged_f64(double %a, double %b) #0 {
+// CHECK:   [[TMP0:%.*]] = fcmp oge double %a, %b
+// CHECK:   [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK:   ret i64 [[VCMPD_I]]
 uint64_t test_vcged_f64(float64_t a, float64_t b) {
-// CHECK-LABEL: test_vcged_f64
-// CHECK: {{fcmge d0, d0, d1|fcmp d0, d1}}
   return (uint64_t)vcged_f64(a, b);
 }
 
+// CHECK-LABEL: define i32 @test_vcgezs_f32(float %a) #0 {
+// CHECK:   [[TMP0:%.*]] = fcmp oge float %a, 0.000000e+00
+// CHECK:   [[VCGEZ_I:%.*]] = sext i1 [[TMP0]] to i32
+// CHECK:   ret i32 [[VCGEZ_I]]
 uint32_t test_vcgezs_f32(float32_t a) {
-// CHECK-LABEL: test_vcgezs_f32
-// CHECK: {{fcmge s0, s0, #0.0|fcmp s0, #0.0}}
   return (uint32_t)vcgezs_f32(a);
 }
 
+// CHECK-LABEL: define i64 @test_vcgezd_f64(double %a) #0 {
+// CHECK:   [[TMP0:%.*]] = fcmp oge double %a, 0.000000e+00
+// CHECK:   [[VCGEZ_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK:   ret i64 [[VCGEZ_I]]
 uint64_t test_vcgezd_f64(float64_t a) {
-// CHECK-LABEL: test_vcgezd_f64
-// CHECK: {{fcmge d0, d0, #0.0|fcmp d0, #0.0}}
   return (uint64_t)vcgezd_f64(a);
 }
 
+// CHECK-LABEL: define i32 @test_vcgts_f32(float %a, float %b) #0 {
+// CHECK:   [[TMP0:%.*]] = fcmp ogt float %a, %b
+// CHECK:   [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
+// CHECK:   ret i32 [[VCMPD_I]]
 uint32_t test_vcgts_f32(float32_t a, float32_t b) {
-// CHECK-LABEL: test_vcgts_f32
-// CHECK: {{fcmgt s0, s0, s1|fcmp s0, s1}}
   return (uint32_t)vcgts_f32(a, b);
 }
 
+// CHECK-LABEL: define i64 @test_vcgtd_f64(double %a, double %b) #0 {
+// CHECK:   [[TMP0:%.*]] = fcmp ogt double %a, %b
+// CHECK:   [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK:   ret i64 [[VCMPD_I]]
 uint64_t test_vcgtd_f64(float64_t a, float64_t b) {
-// CHECK-LABEL: test_vcgtd_f64
-// CHECK: {{fcmgt d0, d0, d1|fcmp d0, d1}}
   return (uint64_t)vcgtd_f64(a, b);
 }
 
+// CHECK-LABEL: define i32 @test_vcgtzs_f32(float %a) #0 {
+// CHECK:   [[TMP0:%.*]] = fcmp ogt float %a, 0.000000e+00
+// CHECK:   [[VCGTZ_I:%.*]] = sext i1 [[TMP0]] to i32
+// CHECK:   ret i32 [[VCGTZ_I]]
 uint32_t test_vcgtzs_f32(float32_t a) {
-// CHECK-LABEL: test_vcgtzs_f32
-// CHECK: {{fcmgt s0, s0, #0.0|fcmp s0, #0.0}}
   return (uint32_t)vcgtzs_f32(a);
 }
 
+// CHECK-LABEL: define i64 @test_vcgtzd_f64(double %a) #0 {
+// CHECK:   [[TMP0:%.*]] = fcmp ogt double %a, 0.000000e+00
+// CHECK:   [[VCGTZ_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK:   ret i64 [[VCGTZ_I]]
 uint64_t test_vcgtzd_f64(float64_t a) {
-// CHECK-LABEL: test_vcgtzd_f64
-// CHECK: {{fcmgt d0, d0, #0.0|fcmp d0, #0.0}}
   return (uint64_t)vcgtzd_f64(a);
 }
 
+// CHECK-LABEL: define i32 @test_vcles_f32(float %a, float %b) #0 {
+// CHECK:   [[TMP0:%.*]] = fcmp ole float %a, %b
+// CHECK:   [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
+// CHECK:   ret i32 [[VCMPD_I]]
 uint32_t test_vcles_f32(float32_t a, float32_t b) {
-// CHECK-LABEL: test_vcles_f32
-// CHECK: {{fcmge s0, s1, s0|fcmp s0, s1}}
   return (uint32_t)vcles_f32(a, b);
 }
 
+// CHECK-LABEL: define i64 @test_vcled_f64(double %a, double %b) #0 {
+// CHECK:   [[TMP0:%.*]] = fcmp ole double %a, %b
+// CHECK:   [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK:   ret i64 [[VCMPD_I]]
 uint64_t test_vcled_f64(float64_t a, float64_t b) {
-// CHECK-LABEL: test_vcled_f64
-// CHECK: {{fcmge d0, d1, d0|fcmp d0, d1}}
   return (uint64_t)vcled_f64(a, b);
 }
 
+// CHECK-LABEL: define i32 @test_vclezs_f32(float %a) #0 {
+// CHECK:   [[TMP0:%.*]] = fcmp ole float %a, 0.000000e+00
+// CHECK:   [[VCLEZ_I:%.*]] = sext i1 [[TMP0]] to i32
+// CHECK:   ret i32 [[VCLEZ_I]]
 uint32_t test_vclezs_f32(float32_t a) {
-// CHECK-LABEL: test_vclezs_f32
-// CHECK: {{fcmle s0, s0, #0.0|fcmp s0, #0.0}}
   return (uint32_t)vclezs_f32(a);
 }
 
+// CHECK-LABEL: define i64 @test_vclezd_f64(double %a) #0 {
+// CHECK:   [[TMP0:%.*]] = fcmp ole double %a, 0.000000e+00
+// CHECK:   [[VCLEZ_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK:   ret i64 [[VCLEZ_I]]
 uint64_t test_vclezd_f64(float64_t a) {
-// CHECK-LABEL: test_vclezd_f64
-// CHECK: {{fcmle d0, d0, #0.0|fcmp d0, #0.0}}
   return (uint64_t)vclezd_f64(a);
 }
 
+// CHECK-LABEL: define i32 @test_vclts_f32(float %a, float %b) #0 {
+// CHECK:   [[TMP0:%.*]] = fcmp olt float %a, %b
+// CHECK:   [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i32
+// CHECK:   ret i32 [[VCMPD_I]]
 uint32_t test_vclts_f32(float32_t a, float32_t b) {
-// CHECK-LABEL: test_vclts_f32
-// CHECK: {{fcmgt s0, s1, s0|fcmp s0, s1}}
   return (uint32_t)vclts_f32(a, b);
 }
 
+// CHECK-LABEL: define i64 @test_vcltd_f64(double %a, double %b) #0 {
+// CHECK:   [[TMP0:%.*]] = fcmp olt double %a, %b
+// CHECK:   [[VCMPD_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK:   ret i64 [[VCMPD_I]]
 uint64_t test_vcltd_f64(float64_t a, float64_t b) {
-// CHECK-LABEL: test_vcltd_f64
-// CHECK: {{fcmgt d0, d1, d0|fcmp d0, d1}}
   return (uint64_t)vcltd_f64(a, b);
 }
 
+// CHECK-LABEL: define i32 @test_vcltzs_f32(float %a) #0 {
+// CHECK:   [[TMP0:%.*]] = fcmp olt float %a, 0.000000e+00
+// CHECK:   [[VCLTZ_I:%.*]] = sext i1 [[TMP0]] to i32
+// CHECK:   ret i32 [[VCLTZ_I]]
 uint32_t test_vcltzs_f32(float32_t a) {
-// CHECK-LABEL: test_vcltzs_f32
-// CHECK: {{fcmlt s0, s0, #0.0|fcmp s0, #0.0}}
   return (uint32_t)vcltzs_f32(a);
 }
 
+// CHECK-LABEL: define i64 @test_vcltzd_f64(double %a) #0 {
+// CHECK:   [[TMP0:%.*]] = fcmp olt double %a, 0.000000e+00
+// CHECK:   [[VCLTZ_I:%.*]] = sext i1 [[TMP0]] to i64
+// CHECK:   ret i64 [[VCLTZ_I]]
 uint64_t test_vcltzd_f64(float64_t a) {
-// CHECK-LABEL: test_vcltzd_f64
-// CHECK: {{fcmlt d0, d0, #0.0|fcmp d0, #0.0}}
   return (uint64_t)vcltzd_f64(a);
 }
 
+// CHECK-LABEL: define i32 @test_vcages_f32(float %a, float %b) #0 {
+// CHECK:   [[VCAGES_F32_I:%.*]] = call i32 @llvm.aarch64.neon.facge.i32.f32(float %a, float %b) #4
+// CHECK:   ret i32 [[VCAGES_F32_I]]
 uint32_t test_vcages_f32(float32_t a, float32_t b) {
-// CHECK-LABEL: test_vcages_f32
-// CHECK: facge s0, s0, s1
   return (uint32_t)vcages_f32(a, b);
 }
 
+// CHECK-LABEL: define i64 @test_vcaged_f64(double %a, double %b) #0 {
+// CHECK:   [[VCAGED_F64_I:%.*]] = call i64 @llvm.aarch64.neon.facge.i64.f64(double %a, double %b) #4
+// CHECK:   ret i64 [[VCAGED_F64_I]]
 uint64_t test_vcaged_f64(float64_t a, float64_t b) {
-// CHECK-LABEL: test_vcaged_f64
-// CHECK: facge d0, d0, d1
   return (uint64_t)vcaged_f64(a, b);
 }
 
+// CHECK-LABEL: define i32 @test_vcagts_f32(float %a, float %b) #0 {
+// CHECK:   [[VCAGTS_F32_I:%.*]] = call i32 @llvm.aarch64.neon.facgt.i32.f32(float %a, float %b) #4
+// CHECK:   ret i32 [[VCAGTS_F32_I]]
 uint32_t test_vcagts_f32(float32_t a, float32_t b) {
-// CHECK-LABEL: test_vcagts_f32
-// CHECK: facgt s0, s0, s1
   return (uint32_t)vcagts_f32(a, b);
 }
 
+// CHECK-LABEL: define i64 @test_vcagtd_f64(double %a, double %b) #0 {
+// CHECK:   [[VCAGTD_F64_I:%.*]] = call i64 @llvm.aarch64.neon.facgt.i64.f64(double %a, double %b) #4
+// CHECK:   ret i64 [[VCAGTD_F64_I]]
 uint64_t test_vcagtd_f64(float64_t a, float64_t b) {
-// CHECK-LABEL: test_vcagtd_f64
-// CHECK: facgt d0, d0, d1
   return (uint64_t)vcagtd_f64(a, b);
 }
 
+// CHECK-LABEL: define i32 @test_vcales_f32(float %a, float %b) #0 {
+// CHECK:   [[VCALES_F32_I:%.*]] = call i32 @llvm.aarch64.neon.facge.i32.f32(float %b, float %a) #4
+// CHECK:   ret i32 [[VCALES_F32_I]]
 uint32_t test_vcales_f32(float32_t a, float32_t b) {
-// CHECK-LABEL: test_vcales_f32
-// CHECK: facge s0, s1, s0
   return (uint32_t)vcales_f32(a, b);
 }
 
+// CHECK-LABEL: define i64 @test_vcaled_f64(double %a, double %b) #0 {
+// CHECK:   [[VCALED_F64_I:%.*]] = call i64 @llvm.aarch64.neon.facge.i64.f64(double %b, double %a) #4
+// CHECK:   ret i64 [[VCALED_F64_I]]
 uint64_t test_vcaled_f64(float64_t a, float64_t b) {
-// CHECK-LABEL: test_vcaled_f64
-// CHECK: facge d0, d1, d0
   return (uint64_t)vcaled_f64(a, b);
 }
 
+// CHECK-LABEL: define i32 @test_vcalts_f32(float %a, float %b) #0 {
+// CHECK:   [[VCALTS_F32_I:%.*]] = call i32 @llvm.aarch64.neon.facgt.i32.f32(float %b, float %a) #4
+// CHECK:   ret i32 [[VCALTS_F32_I]]
 uint32_t test_vcalts_f32(float32_t a, float32_t b) {
-// CHECK-LABEL: test_vcalts_f32
-// CHECK: facgt s0, s1, s0
   return (uint32_t)vcalts_f32(a, b);
 }
 
+// CHECK-LABEL: define i64 @test_vcaltd_f64(double %a, double %b) #0 {
+// CHECK:   [[VCALTD_F64_I:%.*]] = call i64 @llvm.aarch64.neon.facgt.i64.f64(double %b, double %a) #4
+// CHECK:   ret i64 [[VCALTD_F64_I]]
 uint64_t test_vcaltd_f64(float64_t a, float64_t b) {
-// CHECK-LABEL: test_vcaltd_f64
-// CHECK: facgt d0, d1, d0
   return (uint64_t)vcaltd_f64(a, b);
 }
 
+// CHECK-LABEL: define i64 @test_vshrd_n_s64(i64 %a) #0 {
+// CHECK:   [[SHRD_N:%.*]] = ashr i64 %a, 1
+// CHECK:   ret i64 [[SHRD_N]]
 int64_t test_vshrd_n_s64(int64_t a) {
-// CHECK-LABEL: test_vshrd_n_s64
-// CHECK: {{sshr d[0-9]+, d[0-9]+, #1|asr x0, x0, #1}}
   return (int64_t)vshrd_n_s64(a, 1);
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vshr_n_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VSHR_N:%.*]] = ashr <1 x i64> [[TMP1]], <i64 1>
+// CHECK:   ret <1 x i64> [[VSHR_N]]
 int64x1_t test_vshr_n_s64(int64x1_t a) {
-// CHECK-LABEL: test_vshr_n_s64
-// CHECK: sshr {{d[0-9]+}}, {{d[0-9]+}}, #1
   return vshr_n_s64(a, 1);
 }
 
+// CHECK-LABEL: define i64 @test_vshrd_n_u64(i64 %a) #0 {
+// CHECK:   ret i64 0
 uint64_t test_vshrd_n_u64(uint64_t a) {
 
-// CHECK-ARM64-LABEL: test_vshrd_n_u64
-// CHECK-ARM64: mov x0, xzr
   return (uint64_t)vshrd_n_u64(a, 64);
 }
 
+// CHECK-LABEL: define i64 @test_vshrd_n_u64_2() #0 {
+// CHECK:   ret i64 0
 uint64_t test_vshrd_n_u64_2() {
 
-// CHECK-ARM64-LABEL: test_vshrd_n_u64_2
-// CHECK-ARM64: mov x0, xzr
   uint64_t a = UINT64_C(0xf000000000000000);
   return vshrd_n_u64(a, 64);
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vshr_n_u64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VSHR_N:%.*]] = lshr <1 x i64> [[TMP1]], <i64 1>
+// CHECK:   ret <1 x i64> [[VSHR_N]]
 uint64x1_t test_vshr_n_u64(uint64x1_t a) {
-// CHECK-LABEL: test_vshr_n_u64
-// CHECK: ushr {{d[0-9]+}}, {{d[0-9]+}}, #1
   return vshr_n_u64(a, 1);
 }
 
+// CHECK-LABEL: define i64 @test_vrshrd_n_s64(i64 %a) #0 {
+// CHECK:   [[VRSHR_N:%.*]] = call i64 @llvm.aarch64.neon.srshl.i64(i64 %a, i64 -63)
+// CHECK:   ret i64 [[VRSHR_N]]
 int64_t test_vrshrd_n_s64(int64_t a) {
-// CHECK-LABEL: test_vrshrd_n_s64
-// CHECK: srshr {{d[0-9]+}}, {{d[0-9]+}}, #63
   return (int64_t)vrshrd_n_s64(a, 63);
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vrshr_n_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> <i64 -1>)
+// CHECK:   ret <1 x i64> [[VRSHR_N1]]
 int64x1_t test_vrshr_n_s64(int64x1_t a) {
-// CHECK-LABEL: test_vrshr_n_s64
-// CHECK: srshr d{{[0-9]+}}, d{{[0-9]+}}, #1
   return vrshr_n_s64(a, 1);
 }
 
+// CHECK-LABEL: define i64 @test_vrshrd_n_u64(i64 %a) #0 {
+// CHECK:   [[VRSHR_N:%.*]] = call i64 @llvm.aarch64.neon.urshl.i64(i64 %a, i64 -63)
+// CHECK:   ret i64 [[VRSHR_N]]
 uint64_t test_vrshrd_n_u64(uint64_t a) {
-// CHECK-LABEL: test_vrshrd_n_u64
-// CHECK: urshr {{d[0-9]+}}, {{d[0-9]+}}, #63
   return (uint64_t)vrshrd_n_u64(a, 63);
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vrshr_n_u64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> <i64 -1>)
+// CHECK:   ret <1 x i64> [[VRSHR_N1]]
 uint64x1_t test_vrshr_n_u64(uint64x1_t a) {
-// CHECK-LABEL: test_vrshr_n_u64
-// CHECK: urshr d{{[0-9]+}}, d{{[0-9]+}}, #1
   return vrshr_n_u64(a, 1);
 }
 
+// CHECK-LABEL: define i64 @test_vsrad_n_s64(i64 %a, i64 %b) #0 {
+// CHECK:   [[SHRD_N:%.*]] = ashr i64 %b, 63
+// CHECK:   [[TMP0:%.*]] = add i64 %a, [[SHRD_N]]
+// CHECK:   ret i64 [[TMP0]]
 int64_t test_vsrad_n_s64(int64_t a, int64_t b) {
-// CHECK-LABEL: test_vsrad_n_s64
-// CHECK: {{ssra d[0-9]+, d[0-9]+, #63|add x0, x0, x1, asr #63}}
   return (int64_t)vsrad_n_s64(a, b, 63);
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vsra_n_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VSRA_N:%.*]] = ashr <1 x i64> [[TMP3]], <i64 1>
+// CHECK:   [[TMP4:%.*]] = add <1 x i64> [[TMP2]], [[VSRA_N]]
+// CHECK:   ret <1 x i64> [[TMP4]]
 int64x1_t test_vsra_n_s64(int64x1_t a, int64x1_t b) {
-// CHECK-LABEL: test_vsra_n_s64
-// CHECK: ssra d{{[0-9]+}}, d{{[0-9]+}}, #1
   return vsra_n_s64(a, b, 1);
 }
 
+// CHECK-LABEL: define i64 @test_vsrad_n_u64(i64 %a, i64 %b) #0 {
+// CHECK:   [[SHRD_N:%.*]] = lshr i64 %b, 63
+// CHECK:   [[TMP0:%.*]] = add i64 %a, [[SHRD_N]]
+// CHECK:   ret i64 [[TMP0]]
 uint64_t test_vsrad_n_u64(uint64_t a, uint64_t b) {
-// CHECK-LABEL: test_vsrad_n_u64
-// CHECK: {{usra d[0-9]+, d[0-9]+, #63|add x0, x0, x1, lsr #63}}
   return (uint64_t)vsrad_n_u64(a, b, 63);
 }
 
+// CHECK-LABEL: define i64 @test_vsrad_n_u64_2(i64 %a, i64 %b) #0 {
+// CHECK:   ret i64 %a
 uint64_t test_vsrad_n_u64_2(uint64_t a, uint64_t b) {
 
-// CHECK-ARM64-LABEL: test_vsrad_n_u64_2
-// CHECK-ARM64-NOT: add
   return (uint64_t)vsrad_n_u64(a, b, 64);
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vsra_n_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VSRA_N:%.*]] = lshr <1 x i64> [[TMP3]], <i64 1>
+// CHECK:   [[TMP4:%.*]] = add <1 x i64> [[TMP2]], [[VSRA_N]]
+// CHECK:   ret <1 x i64> [[TMP4]]
 uint64x1_t test_vsra_n_u64(uint64x1_t a, uint64x1_t b) {
-// CHECK-LABEL: test_vsra_n_u64
-// CHECK: usra d{{[0-9]+}}, d{{[0-9]+}}, #1
   return vsra_n_u64(a, b, 1);
 }
 
+// CHECK-LABEL: define i64 @test_vrsrad_n_s64(i64 %a, i64 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = call i64 @llvm.aarch64.neon.srshl.i64(i64 %b, i64 -63)
+// CHECK:   [[TMP1:%.*]] = add i64 %a, [[TMP0]]
+// CHECK:   ret i64 [[TMP1]]
 int64_t test_vrsrad_n_s64(int64_t a, int64_t b) {
-// CHECK-LABEL: test_vrsrad_n_s64
-// CHECK: {{srsra d[0-9]+, d[0-9]+, #63}}
   return (int64_t)vrsrad_n_s64(a, b, 63);
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vrsra_n_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> <i64 -1>)
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = add <1 x i64> [[TMP2]], [[VRSHR_N1]]
+// CHECK:   ret <1 x i64> [[TMP3]]
 int64x1_t test_vrsra_n_s64(int64x1_t a, int64x1_t b) {
-// CHECK-LABEL: test_vrsra_n_s64
-// CHECK: srsra d{{[0-9]+}}, d{{[0-9]+}}, #1
   return vrsra_n_s64(a, b, 1);
 }
 
+// CHECK-LABEL: define i64 @test_vrsrad_n_u64(i64 %a, i64 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = call i64 @llvm.aarch64.neon.urshl.i64(i64 %b, i64 -63)
+// CHECK:   [[TMP1:%.*]] = add i64 %a, [[TMP0]]
+// CHECK:   ret i64 [[TMP1]]
 uint64_t test_vrsrad_n_u64(uint64_t a, uint64_t b) {
-// CHECK-LABEL: test_vrsrad_n_u64
-// CHECK: ursra {{d[0-9]+}}, {{d[0-9]+}}, #63
   return (uint64_t)vrsrad_n_u64(a, b, 63);
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vrsra_n_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> <i64 -1>)
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = add <1 x i64> [[TMP2]], [[VRSHR_N1]]
+// CHECK:   ret <1 x i64> [[TMP3]]
 uint64x1_t test_vrsra_n_u64(uint64x1_t a, uint64x1_t b) {
-// CHECK-LABEL: test_vrsra_n_u64
-// CHECK: ursra d{{[0-9]+}}, d{{[0-9]+}}, #1
   return vrsra_n_u64(a, b, 1);
 }
 
+// CHECK-LABEL: define i64 @test_vshld_n_s64(i64 %a) #0 {
+// CHECK:   [[SHLD_N:%.*]] = shl i64 %a, 1
+// CHECK:   ret i64 [[SHLD_N]]
 int64_t test_vshld_n_s64(int64_t a) {
-// CHECK-LABEL: test_vshld_n_s64
-// CHECK: {{shl d[0-9]+, d[0-9]+, #1|lsl x0, x0, #1}}
   return (int64_t)vshld_n_s64(a, 1);
 }
+// CHECK-LABEL: define <1 x i64> @test_vshl_n_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VSHL_N:%.*]] = shl <1 x i64> [[TMP1]], <i64 1>
+// CHECK:   ret <1 x i64> [[VSHL_N]]
 int64x1_t test_vshl_n_s64(int64x1_t a) {
-// CHECK-LABEL: test_vshl_n_s64
-// CHECK: shl d{{[0-9]+}}, d{{[0-9]+}}, #1
   return vshl_n_s64(a, 1);
 }
 
+// CHECK-LABEL: define i64 @test_vshld_n_u64(i64 %a) #0 {
+// CHECK:   [[SHLD_N:%.*]] = shl i64 %a, 63
+// CHECK:   ret i64 [[SHLD_N]]
 uint64_t test_vshld_n_u64(uint64_t a) {
-// CHECK-LABEL: test_vshld_n_u64
-// CHECK: {{shl d[0-9]+, d[0-9]+, #63|lsl x0, x0, #63}}
   return (uint64_t)vshld_n_u64(a, 63);
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vshl_n_u64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VSHL_N:%.*]] = shl <1 x i64> [[TMP1]], <i64 1>
+// CHECK:   ret <1 x i64> [[VSHL_N]]
 uint64x1_t test_vshl_n_u64(uint64x1_t a) {
-// CHECK-LABEL: test_vshl_n_u64
-// CHECK: shl d{{[0-9]+}}, d{{[0-9]+}}, #1
   return vshl_n_u64(a, 1);
 }
 
+// CHECK-LABEL: define i8 @test_vqshlb_n_s8(i8 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0
+// CHECK:   [[VQSHLB_N_S8:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> <i8 7, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>)
+// CHECK:   [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHLB_N_S8]], i64 0
+// CHECK:   ret i8 [[TMP1]]
 int8_t test_vqshlb_n_s8(int8_t a) {
-// CHECK-LABEL: test_vqshlb_n_s8
-// CHECK: sqshl {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}, #7
   return (int8_t)vqshlb_n_s8(a, 7);
 }
 
+// CHECK-LABEL: define i16 @test_vqshlh_n_s16(i16 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
+// CHECK:   [[VQSHLH_N_S16:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> <i16 15, i16 undef, i16 undef, i16 undef>)
+// CHECK:   [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHLH_N_S16]], i64 0
+// CHECK:   ret i16 [[TMP1]]
 int16_t test_vqshlh_n_s16(int16_t a) {
-// CHECK-LABEL: test_vqshlh_n_s16
-// CHECK: sqshl {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, #15
   return (int16_t)vqshlh_n_s16(a, 15);
 }
 
+// CHECK-LABEL: define i32 @test_vqshls_n_s32(i32 %a) #0 {
+// CHECK:   [[VQSHLS_N_S32:%.*]] = call i32 @llvm.aarch64.neon.sqshl.i32(i32 %a, i32 31)
+// CHECK:   ret i32 [[VQSHLS_N_S32]]
 int32_t test_vqshls_n_s32(int32_t a) {
-// CHECK-LABEL: test_vqshls_n_s32
-// CHECK: sqshl {{s[0-9]+}}, {{s[0-9]+}}, #31
   return (int32_t)vqshls_n_s32(a, 31);
 }
 
+// CHECK-LABEL: define i64 @test_vqshld_n_s64(i64 %a) #0 {
+// CHECK:   [[VQSHL_N:%.*]] = call i64 @llvm.aarch64.neon.sqshl.i64(i64 %a, i64 63)
+// CHECK:   ret i64 [[VQSHL_N]]
 int64_t test_vqshld_n_s64(int64_t a) {
-// CHECK-LABEL: test_vqshld_n_s64
-// CHECK: sqshl {{d[0-9]+}}, {{d[0-9]+}}, #63
   return (int64_t)vqshld_n_s64(a, 63);
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqshl_n_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VQSHL_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> %a, <8 x i8> zeroinitializer)
+// CHECK:   ret <8 x i8> [[VQSHL_N]]
 int8x8_t test_vqshl_n_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vqshl_n_s8
   return vqshl_n_s8(a, 0);
-  // CHECK: sqshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqshlq_n_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VQSHL_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqshl.v16i8(<16 x i8> %a, <16 x i8> zeroinitializer)
+// CHECK:   ret <16 x i8> [[VQSHL_N]]
 int8x16_t test_vqshlq_n_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vqshlq_n_s8
   return vqshlq_n_s8(a, 0);
-  // CHECK: sqshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vqshl_n_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> [[VQSHL_N]], <4 x i16> zeroinitializer)
+// CHECK:   ret <4 x i16> [[VQSHL_N1]]
 int16x4_t test_vqshl_n_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vqshl_n_s16
   return vqshl_n_s16(a, 0);
-  // CHECK: sqshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vqshlq_n_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqshl.v8i16(<8 x i16> [[VQSHL_N]], <8 x i16> zeroinitializer)
+// CHECK:   ret <8 x i16> [[VQSHL_N1]]
 int16x8_t test_vqshlq_n_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vqshlq_n_s16
   return vqshlq_n_s16(a, 0);
-  // CHECK: sqshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vqshl_n_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i32> [[VQSHL_N]], <2 x i32> zeroinitializer)
+// CHECK:   ret <2 x i32> [[VQSHL_N1]]
 int32x2_t test_vqshl_n_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vqshl_n_s32
   return vqshl_n_s32(a, 0);
-  // CHECK: sqshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqshlq_n_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqshl.v4i32(<4 x i32> [[VQSHL_N]], <4 x i32> zeroinitializer)
+// CHECK:   ret <4 x i32> [[VQSHL_N1]]
 int32x4_t test_vqshlq_n_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vqshlq_n_s32
   return vqshlq_n_s32(a, 0);
-  // CHECK: sqshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqshlq_n_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> [[VQSHL_N]], <2 x i64> zeroinitializer)
+// CHECK:   ret <2 x i64> [[VQSHL_N1]]
 int64x2_t test_vqshlq_n_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vqshlq_n_s64
   return vqshlq_n_s64(a, 0);
-  // CHECK: sqshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqshl_n_u8(<8 x i8> %a) #0 {
+// CHECK:   [[VQSHL_N:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> %a, <8 x i8> zeroinitializer)
+// CHECK:   ret <8 x i8> [[VQSHL_N]]
 uint8x8_t test_vqshl_n_u8(uint8x8_t a) {
-  // CHECK-LABEL: test_vqshl_n_u8
   return vqshl_n_u8(a, 0);
-  // CHECK: uqshl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqshlq_n_u8(<16 x i8> %a) #0 {
+// CHECK:   [[VQSHL_N:%.*]] = call <16 x i8> @llvm.aarch64.neon.uqshl.v16i8(<16 x i8> %a, <16 x i8> zeroinitializer)
+// CHECK:   ret <16 x i8> [[VQSHL_N]]
 uint8x16_t test_vqshlq_n_u8(uint8x16_t a) {
-  // CHECK-LABEL: test_vqshlq_n_u8
   return vqshlq_n_u8(a, 0);
-  // CHECK: uqshl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vqshl_n_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> [[VQSHL_N]], <4 x i16> zeroinitializer)
+// CHECK:   ret <4 x i16> [[VQSHL_N1]]
 uint16x4_t test_vqshl_n_u16(uint16x4_t a) {
-  // CHECK-LABEL: test_vqshl_n_u16
   return vqshl_n_u16(a, 0);
-  // CHECK: uqshl {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vqshlq_n_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.aarch64.neon.uqshl.v8i16(<8 x i16> [[VQSHL_N]], <8 x i16> zeroinitializer)
+// CHECK:   ret <8 x i16> [[VQSHL_N1]]
 uint16x8_t test_vqshlq_n_u16(uint16x8_t a) {
-  // CHECK-LABEL: test_vqshlq_n_u16
   return vqshlq_n_u16(a, 0);
-  // CHECK: uqshl {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vqshl_n_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqshl.v2i32(<2 x i32> [[VQSHL_N]], <2 x i32> zeroinitializer)
+// CHECK:   ret <2 x i32> [[VQSHL_N1]]
 uint32x2_t test_vqshl_n_u32(uint32x2_t a) {
-  // CHECK-LABEL: test_vqshl_n_u32
   return vqshl_n_u32(a, 0);
-  // CHECK: uqshl {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqshlq_n_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.aarch64.neon.uqshl.v4i32(<4 x i32> [[VQSHL_N]], <4 x i32> zeroinitializer)
+// CHECK:   ret <4 x i32> [[VQSHL_N1]]
 uint32x4_t test_vqshlq_n_u32(uint32x4_t a) {
-  // CHECK-LABEL: test_vqshlq_n_u32
   return vqshlq_n_u32(a, 0);
-  // CHECK: uqshl {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqshlq_n_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> [[VQSHL_N]], <2 x i64> zeroinitializer)
+// CHECK:   ret <2 x i64> [[VQSHL_N1]]
 uint64x2_t test_vqshlq_n_u64(uint64x2_t a) {
-  // CHECK-LABEL: test_vqshlq_n_u64
   return vqshlq_n_u64(a, 0);
-  // CHECK: uqshl {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vqshl_n_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqshl.v1i64(<1 x i64> [[VQSHL_N]], <1 x i64> <i64 1>)
+// CHECK:   ret <1 x i64> [[VQSHL_N1]]
 int64x1_t test_vqshl_n_s64(int64x1_t a) {
-// CHECK-LABEL: test_vqshl_n_s64
-// CHECK: sqshl d{{[0-9]+}}, d{{[0-9]+}}, #1
   return vqshl_n_s64(a, 1);
 }
 
+// CHECK-LABEL: define i8 @test_vqshlb_n_u8(i8 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0
+// CHECK:   [[VQSHLB_N_U8:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> [[TMP0]], <8 x i8> <i8 7, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>)
+// CHECK:   [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHLB_N_U8]], i64 0
+// CHECK:   ret i8 [[TMP1]]
 uint8_t test_vqshlb_n_u8(uint8_t a) {
-// CHECK-LABEL: test_vqshlb_n_u8
-// CHECK: uqshl {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}, #7
   return (uint8_t)vqshlb_n_u8(a, 7);
 }
 
+// CHECK-LABEL: define i16 @test_vqshlh_n_u16(i16 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
+// CHECK:   [[VQSHLH_N_U16:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> [[TMP0]], <4 x i16> <i16 15, i16 undef, i16 undef, i16 undef>)
+// CHECK:   [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHLH_N_U16]], i64 0
+// CHECK:   ret i16 [[TMP1]]
 uint16_t test_vqshlh_n_u16(uint16_t a) {
-// CHECK-LABEL: test_vqshlh_n_u16
-// CHECK: uqshl {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, #15
   return (uint16_t)vqshlh_n_u16(a, 15);
 }
 
+// CHECK-LABEL: define i32 @test_vqshls_n_u32(i32 %a) #0 {
+// CHECK:   [[VQSHLS_N_U32:%.*]] = call i32 @llvm.aarch64.neon.uqshl.i32(i32 %a, i32 31)
+// CHECK:   ret i32 [[VQSHLS_N_U32]]
 uint32_t test_vqshls_n_u32(uint32_t a) {
-// CHECK-LABEL: test_vqshls_n_u32
-// CHECK: uqshl {{s[0-9]+}}, {{s[0-9]+}}, #31
   return (uint32_t)vqshls_n_u32(a, 31);
 }
 
+// CHECK-LABEL: define i64 @test_vqshld_n_u64(i64 %a) #0 {
+// CHECK:   [[VQSHL_N:%.*]] = call i64 @llvm.aarch64.neon.uqshl.i64(i64 %a, i64 63)
+// CHECK:   ret i64 [[VQSHL_N]]
 uint64_t test_vqshld_n_u64(uint64_t a) {
-// CHECK-LABEL: test_vqshld_n_u64
-// CHECK: uqshl {{d[0-9]+}}, {{d[0-9]+}}, #63
   return (uint64_t)vqshld_n_u64(a, 63);
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vqshl_n_u64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.uqshl.v1i64(<1 x i64> [[VQSHL_N]], <1 x i64> <i64 1>)
+// CHECK:   ret <1 x i64> [[VQSHL_N1]]
 uint64x1_t test_vqshl_n_u64(uint64x1_t a) {
-// CHECK-LABEL: test_vqshl_n_u64
-// CHECK: uqshl d{{[0-9]+}}, d{{[0-9]+}}, #1
   return vqshl_n_u64(a, 1);
 }
 
+// CHECK-LABEL: define i8 @test_vqshlub_n_s8(i8 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = insertelement <8 x i8> undef, i8 %a, i64 0
+// CHECK:   [[VQSHLUB_N_S8:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshlu.v8i8(<8 x i8> [[TMP0]], <8 x i8> <i8 7, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>)
+// CHECK:   [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHLUB_N_S8]], i64 0
+// CHECK:   ret i8 [[TMP1]]
 int8_t test_vqshlub_n_s8(int8_t a) {
-// CHECK-LABEL: test_vqshlub_n_s8
-// CHECK: sqshlu {{b[0-9]+|v[0-9]+.8b}}, {{b[0-9]+|v[0-9]+.8b}}, #7
   return (int8_t)vqshlub_n_s8(a, 7);
 }
 
+// CHECK-LABEL: define i16 @test_vqshluh_n_s16(i16 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
+// CHECK:   [[VQSHLUH_N_S16:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshlu.v4i16(<4 x i16> [[TMP0]], <4 x i16> <i16 15, i16 undef, i16 undef, i16 undef>)
+// CHECK:   [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHLUH_N_S16]], i64 0
+// CHECK:   ret i16 [[TMP1]]
 int16_t test_vqshluh_n_s16(int16_t a) {
-// CHECK-LABEL: test_vqshluh_n_s16
-// CHECK: sqshlu {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, #15
   return (int16_t)vqshluh_n_s16(a, 15);
 }
 
+// CHECK-LABEL: define i32 @test_vqshlus_n_s32(i32 %a) #0 {
+// CHECK:   [[VQSHLUS_N_S32:%.*]] = call i32 @llvm.aarch64.neon.sqshlu.i32(i32 %a, i32 31)
+// CHECK:   ret i32 [[VQSHLUS_N_S32]]
 int32_t test_vqshlus_n_s32(int32_t a) {
-// CHECK-LABEL: test_vqshlus_n_s32
-// CHECK: sqshlu {{s[0-9]+}}, {{s[0-9]+}}, #31
   return (int32_t)vqshlus_n_s32(a, 31);
 }
 
+// CHECK-LABEL: define i64 @test_vqshlud_n_s64(i64 %a) #0 {
+// CHECK:   [[VQSHLU_N:%.*]] = call i64 @llvm.aarch64.neon.sqshlu.i64(i64 %a, i64 63)
+// CHECK:   ret i64 [[VQSHLU_N]]
 int64_t test_vqshlud_n_s64(int64_t a) {
-// CHECK-LABEL: test_vqshlud_n_s64
-// CHECK: sqshlu {{d[0-9]+}}, {{d[0-9]+}}, #63
   return (int64_t)vqshlud_n_s64(a, 63);
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vqshlu_n_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VQSHLU_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqshlu.v1i64(<1 x i64> [[VQSHLU_N]], <1 x i64> <i64 1>)
+// CHECK:   ret <1 x i64> [[VQSHLU_N1]]
 uint64x1_t test_vqshlu_n_s64(int64x1_t a) {
-// CHECK-LABEL: test_vqshlu_n_s64
-// CHECK: sqshlu d{{[0-9]+}}, d{{[0-9]+}}, #1
   return vqshlu_n_s64(a, 1);
 }
 
+// CHECK-LABEL: define i64 @test_vsrid_n_s64(i64 %a, i64 %b) #0 {
+// CHECK:   [[VSRID_N_S64:%.*]] = bitcast i64 %a to <1 x i64>
+// CHECK:   [[VSRID_N_S641:%.*]] = bitcast i64 %b to <1 x i64>
+// CHECK:   [[VSRID_N_S642:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[VSRID_N_S64]], <1 x i64> [[VSRID_N_S641]], i32 63)
+// CHECK:   [[VSRID_N_S643:%.*]] = bitcast <1 x i64> [[VSRID_N_S642]] to i64
+// CHECK:   ret i64 [[VSRID_N_S643]]
 int64_t test_vsrid_n_s64(int64_t a, int64_t b) {
-// CHECK-LABEL: test_vsrid_n_s64
-// CHECK: sri {{d[0-9]+}}, {{d[0-9]+}}, #63
   return (int64_t)vsrid_n_s64(a, b, 63);
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vsri_n_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VSRI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[VSRI_N]], <1 x i64> [[VSRI_N1]], i32 1)
+// CHECK:   ret <1 x i64> [[VSRI_N2]]
 int64x1_t test_vsri_n_s64(int64x1_t a, int64x1_t b) {
-// CHECK-LABEL: test_vsri_n_s64
-// CHECK: sri d{{[0-9]+}}, d{{[0-9]+}}, #1
   return vsri_n_s64(a, b, 1);
 }
 
+// CHECK-LABEL: define i64 @test_vsrid_n_u64(i64 %a, i64 %b) #0 {
+// CHECK:   [[VSRID_N_U64:%.*]] = bitcast i64 %a to <1 x i64>
+// CHECK:   [[VSRID_N_U641:%.*]] = bitcast i64 %b to <1 x i64>
+// CHECK:   [[VSRID_N_U642:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[VSRID_N_U64]], <1 x i64> [[VSRID_N_U641]], i32 63)
+// CHECK:   [[VSRID_N_U643:%.*]] = bitcast <1 x i64> [[VSRID_N_U642]] to i64
+// CHECK:   ret i64 [[VSRID_N_U643]]
 uint64_t test_vsrid_n_u64(uint64_t a, uint64_t b) {
-// CHECK-LABEL: test_vsrid_n_u64
-// CHECK: sri {{d[0-9]+}}, {{d[0-9]+}}, #63
   return (uint64_t)vsrid_n_u64(a, b, 63);
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vsri_n_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VSRI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[VSRI_N]], <1 x i64> [[VSRI_N1]], i32 1)
+// CHECK:   ret <1 x i64> [[VSRI_N2]]
 uint64x1_t test_vsri_n_u64(uint64x1_t a, uint64x1_t b) {
-// CHECK-LABEL: test_vsri_n_u64
-// CHECK: sri d{{[0-9]+}}, d{{[0-9]+}}, #1
   return vsri_n_u64(a, b, 1);
 }
 
+// CHECK-LABEL: define i64 @test_vslid_n_s64(i64 %a, i64 %b) #0 {
+// CHECK:   [[VSLID_N_S64:%.*]] = bitcast i64 %a to <1 x i64>
+// CHECK:   [[VSLID_N_S641:%.*]] = bitcast i64 %b to <1 x i64>
+// CHECK:   [[VSLID_N_S642:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[VSLID_N_S64]], <1 x i64> [[VSLID_N_S641]], i32 63)
+// CHECK:   [[VSLID_N_S643:%.*]] = bitcast <1 x i64> [[VSLID_N_S642]] to i64
+// CHECK:   ret i64 [[VSLID_N_S643]]
 int64_t test_vslid_n_s64(int64_t a, int64_t b) {
-// CHECK-LABEL: test_vslid_n_s64
-// CHECK: sli {{d[0-9]+}}, {{d[0-9]+}}, #63
   return (int64_t)vslid_n_s64(a, b, 63);
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vsli_n_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VSLI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], i32 1)
+// CHECK:   ret <1 x i64> [[VSLI_N2]]
 int64x1_t test_vsli_n_s64(int64x1_t a, int64x1_t b) {
-// CHECK-LABEL: test_vsli_n_s64
-// CHECK: sli d{{[0-9]+}}, d{{[0-9]+}}, #1
   return vsli_n_s64(a, b, 1);
 }
 
+// CHECK-LABEL: define i64 @test_vslid_n_u64(i64 %a, i64 %b) #0 {
+// CHECK:   [[VSLID_N_U64:%.*]] = bitcast i64 %a to <1 x i64>
+// CHECK:   [[VSLID_N_U641:%.*]] = bitcast i64 %b to <1 x i64>
+// CHECK:   [[VSLID_N_U642:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[VSLID_N_U64]], <1 x i64> [[VSLID_N_U641]], i32 63)
+// CHECK:   [[VSLID_N_U643:%.*]] = bitcast <1 x i64> [[VSLID_N_U642]] to i64
+// CHECK:   ret i64 [[VSLID_N_U643]]
 uint64_t test_vslid_n_u64(uint64_t a, uint64_t b) {
-// CHECK-LABEL: test_vslid_n_u64
-// CHECK: sli {{d[0-9]+}}, {{d[0-9]+}}, #63
   return (uint64_t)vslid_n_u64(a, b, 63);
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vsli_n_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VSLI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsli.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], i32 1)
+// CHECK:   ret <1 x i64> [[VSLI_N2]]
 uint64x1_t test_vsli_n_u64(uint64x1_t a, uint64x1_t b) {
-// CHECK-LABEL: test_vsli_n_u64
-// CHECK: sli d{{[0-9]+}}, d{{[0-9]+}}, #1
   return vsli_n_u64(a, b, 1);
 }
 
+// CHECK-LABEL: define i8 @test_vqshrnh_n_s16(i16 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = insertelement <8 x i16> undef, i16 %a, i64 0
+// CHECK:   [[VQSHRNH_N_S16:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrn.v8i8(<8 x i16> [[TMP0]], i32 8)
+// CHECK:   [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHRNH_N_S16]], i64 0
+// CHECK:   ret i8 [[TMP1]]
 int8_t test_vqshrnh_n_s16(int16_t a) {
-// CHECK-LABEL: test_vqshrnh_n_s16
-// CHECK: sqshrn {{b[0-9]+|v[0-9]+.8b}}, {{h[0-9]+|v[0-9]+.8h}}, #8
   return (int8_t)vqshrnh_n_s16(a, 8);
 }
 
+// CHECK-LABEL: define i16 @test_vqshrns_n_s32(i32 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 %a, i64 0
+// CHECK:   [[VQSHRNS_N_S32:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrn.v4i16(<4 x i32> [[TMP0]], i32 16)
+// CHECK:   [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHRNS_N_S32]], i64 0
+// CHECK:   ret i16 [[TMP1]]
 int16_t test_vqshrns_n_s32(int32_t a) {
-// CHECK-LABEL: test_vqshrns_n_s32
-// CHECK: sqshrn {{h[0-9]+|v[0-9]+.4h}}, {{s[0-9]+|v[0-9]+.4s}}, #16
   return (int16_t)vqshrns_n_s32(a, 16);
 }
 
+// CHECK-LABEL: define i32 @test_vqshrnd_n_s64(i64 %a) #0 {
+// CHECK:   [[VQSHRND_N_S64:%.*]] = call i32 @llvm.aarch64.neon.sqshrn.i32(i64 %a, i32 32)
+// CHECK:   ret i32 [[VQSHRND_N_S64]]
 int32_t test_vqshrnd_n_s64(int64_t a) {
-// CHECK-LABEL: test_vqshrnd_n_s64
-// CHECK: sqshrn {{s[0-9]+}}, {{d[0-9]+}}, #32
   return (int32_t)vqshrnd_n_s64(a, 32);
 }
 
+// CHECK-LABEL: define i8 @test_vqshrnh_n_u16(i16 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = insertelement <8 x i16> undef, i16 %a, i64 0
+// CHECK:   [[VQSHRNH_N_U16:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqshrn.v8i8(<8 x i16> [[TMP0]], i32 8)
+// CHECK:   [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHRNH_N_U16]], i64 0
+// CHECK:   ret i8 [[TMP1]]
 uint8_t test_vqshrnh_n_u16(uint16_t a) {
-// CHECK-LABEL: test_vqshrnh_n_u16
-// CHECK: uqshrn {{b[0-9]+|v[0-9]+.8b}}, {{h[0-9]+|v[0-9]+.8h}}, #8
   return (uint8_t)vqshrnh_n_u16(a, 8);
 }
 
+// CHECK-LABEL: define i16 @test_vqshrns_n_u32(i32 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 %a, i64 0
+// CHECK:   [[VQSHRNS_N_U32:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqshrn.v4i16(<4 x i32> [[TMP0]], i32 16)
+// CHECK:   [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHRNS_N_U32]], i64 0
+// CHECK:   ret i16 [[TMP1]]
 uint16_t test_vqshrns_n_u32(uint32_t a) {
-// CHECK-LABEL: test_vqshrns_n_u32
-// CHECK: uqshrn {{h[0-9]+|v[0-9]+.4h}}, {{s[0-9]+|v[0-9]+.4s}}, #16
   return (uint16_t)vqshrns_n_u32(a, 16);
 }
 
+// CHECK-LABEL: define i32 @test_vqshrnd_n_u64(i64 %a) #0 {
+// CHECK:   [[VQSHRND_N_U64:%.*]] = call i32 @llvm.aarch64.neon.uqshrn.i32(i64 %a, i32 32)
+// CHECK:   ret i32 [[VQSHRND_N_U64]]
 uint32_t test_vqshrnd_n_u64(uint64_t a) {
-// CHECK-LABEL: test_vqshrnd_n_u64
-// CHECK: uqshrn {{s[0-9]+}}, {{d[0-9]+}}, #32
   return (uint32_t)vqshrnd_n_u64(a, 32);
 }
 
+// CHECK-LABEL: define i8 @test_vqrshrnh_n_s16(i16 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = insertelement <8 x i16> undef, i16 %a, i64 0
+// CHECK:   [[VQRSHRNH_N_S16:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrn.v8i8(<8 x i16> [[TMP0]], i32 8)
+// CHECK:   [[TMP1:%.*]] = extractelement <8 x i8> [[VQRSHRNH_N_S16]], i64 0
+// CHECK:   ret i8 [[TMP1]]
 int8_t test_vqrshrnh_n_s16(int16_t a) {
-// CHECK-LABEL: test_vqrshrnh_n_s16
-// CHECK: sqrshrn {{b[0-9]+|v[0-9]+.8b}}, {{h[0-9]+|v[0-9]+.8h}}, #8
   return (int8_t)vqrshrnh_n_s16(a, 8);
 }
 
+// CHECK-LABEL: define i16 @test_vqrshrns_n_s32(i32 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 %a, i64 0
+// CHECK:   [[VQRSHRNS_N_S32:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrn.v4i16(<4 x i32> [[TMP0]], i32 16)
+// CHECK:   [[TMP1:%.*]] = extractelement <4 x i16> [[VQRSHRNS_N_S32]], i64 0
+// CHECK:   ret i16 [[TMP1]]
 int16_t test_vqrshrns_n_s32(int32_t a) {
-// CHECK-LABEL: test_vqrshrns_n_s32
-// CHECK: sqrshrn {{h[0-9]+|v[0-9]+.4h}}, {{s[0-9]+|v[0-9]+.4s}}, #16
   return (int16_t)vqrshrns_n_s32(a, 16);
 }
 
+// CHECK-LABEL: define i32 @test_vqrshrnd_n_s64(i64 %a) #0 {
+// CHECK:   [[VQRSHRND_N_S64:%.*]] = call i32 @llvm.aarch64.neon.sqrshrn.i32(i64 %a, i32 32)
+// CHECK:   ret i32 [[VQRSHRND_N_S64]]
 int32_t test_vqrshrnd_n_s64(int64_t a) {
-// CHECK-LABEL: test_vqrshrnd_n_s64
-// CHECK: sqrshrn {{s[0-9]+}}, {{d[0-9]+}}, #32
   return (int32_t)vqrshrnd_n_s64(a, 32);
 }
 
+// CHECK-LABEL: define i8 @test_vqrshrnh_n_u16(i16 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = insertelement <8 x i16> undef, i16 %a, i64 0
+// CHECK:   [[VQRSHRNH_N_U16:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqrshrn.v8i8(<8 x i16> [[TMP0]], i32 8)
+// CHECK:   [[TMP1:%.*]] = extractelement <8 x i8> [[VQRSHRNH_N_U16]], i64 0
+// CHECK:   ret i8 [[TMP1]]
 uint8_t test_vqrshrnh_n_u16(uint16_t a) {
-// CHECK-LABEL: test_vqrshrnh_n_u16
-// CHECK: uqrshrn {{b[0-9]+|v[0-9]+.8b}}, {{h[0-9]+|v[0-9]+.8h}}, #8
   return (uint8_t)vqrshrnh_n_u16(a, 8);
 }
 
+// CHECK-LABEL: define i16 @test_vqrshrns_n_u32(i32 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 %a, i64 0
+// CHECK:   [[VQRSHRNS_N_U32:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqrshrn.v4i16(<4 x i32> [[TMP0]], i32 16)
+// CHECK:   [[TMP1:%.*]] = extractelement <4 x i16> [[VQRSHRNS_N_U32]], i64 0
+// CHECK:   ret i16 [[TMP1]]
 uint16_t test_vqrshrns_n_u32(uint32_t a) {
-// CHECK-LABEL: test_vqrshrns_n_u32
-// CHECK: uqrshrn {{h[0-9]+|v[0-9]+.4h}}, {{s[0-9]+|v[0-9]+.4s}}, #16
   return (uint16_t)vqrshrns_n_u32(a, 16);
 }
 
+// CHECK-LABEL: define i32 @test_vqrshrnd_n_u64(i64 %a) #0 {
+// CHECK:   [[VQRSHRND_N_U64:%.*]] = call i32 @llvm.aarch64.neon.uqrshrn.i32(i64 %a, i32 32)
+// CHECK:   ret i32 [[VQRSHRND_N_U64]]
 uint32_t test_vqrshrnd_n_u64(uint64_t a) {
-// CHECK-LABEL: test_vqrshrnd_n_u64
-// CHECK: uqrshrn {{s[0-9]+}}, {{d[0-9]+}}, #32
   return (uint32_t)vqrshrnd_n_u64(a, 32);
 }
 
+// CHECK-LABEL: define i8 @test_vqshrunh_n_s16(i16 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = insertelement <8 x i16> undef, i16 %a, i64 0
+// CHECK:   [[VQSHRUNH_N_S16:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqshrun.v8i8(<8 x i16> [[TMP0]], i32 8)
+// CHECK:   [[TMP1:%.*]] = extractelement <8 x i8> [[VQSHRUNH_N_S16]], i64 0
+// CHECK:   ret i8 [[TMP1]]
 int8_t test_vqshrunh_n_s16(int16_t a) {
-// CHECK-LABEL: test_vqshrunh_n_s16
-// CHECK: sqshrun {{b[0-9]+|v[0-9]+.8b}}, {{h[0-9]+|v[0-9]+.8h}}, #8
   return (int8_t)vqshrunh_n_s16(a, 8);
 }
 
+// CHECK-LABEL: define i16 @test_vqshruns_n_s32(i32 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 %a, i64 0
+// CHECK:   [[VQSHRUNS_N_S32:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqshrun.v4i16(<4 x i32> [[TMP0]], i32 16)
+// CHECK:   [[TMP1:%.*]] = extractelement <4 x i16> [[VQSHRUNS_N_S32]], i64 0
+// CHECK:   ret i16 [[TMP1]]
 int16_t test_vqshruns_n_s32(int32_t a) {
-// CHECK-LABEL: test_vqshruns_n_s32
-// CHECK: sqshrun {{h[0-9]+|v[0-9]+.4h}}, {{s[0-9]+|v[0-9]+.4s}}, #16
   return (int16_t)vqshruns_n_s32(a, 16);
 }
 
+// CHECK-LABEL: define i32 @test_vqshrund_n_s64(i64 %a) #0 {
+// CHECK:   [[VQSHRUND_N_S64:%.*]] = call i32 @llvm.aarch64.neon.sqshrun.i32(i64 %a, i32 32)
+// CHECK:   ret i32 [[VQSHRUND_N_S64]]
 int32_t test_vqshrund_n_s64(int64_t a) {
-// CHECK-LABEL: test_vqshrund_n_s64
-// CHECK: sqshrun {{s[0-9]+}}, {{d[0-9]+}}, #32
   return (int32_t)vqshrund_n_s64(a, 32);
 }
 
+// CHECK-LABEL: define i8 @test_vqrshrunh_n_s16(i16 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = insertelement <8 x i16> undef, i16 %a, i64 0
+// CHECK:   [[VQRSHRUNH_N_S16:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqrshrun.v8i8(<8 x i16> [[TMP0]], i32 8)
+// CHECK:   [[TMP1:%.*]] = extractelement <8 x i8> [[VQRSHRUNH_N_S16]], i64 0
+// CHECK:   ret i8 [[TMP1]]
 int8_t test_vqrshrunh_n_s16(int16_t a) {
-// CHECK-LABEL: test_vqrshrunh_n_s16
-// CHECK: sqrshrun {{b[0-9]+|v[0-9]+.8b}}, {{h[0-9]+|v[0-9]+.8h}}, #8
   return (int8_t)vqrshrunh_n_s16(a, 8);
 }
 
+// CHECK-LABEL: define i16 @test_vqrshruns_n_s32(i32 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 %a, i64 0
+// CHECK:   [[VQRSHRUNS_N_S32:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrshrun.v4i16(<4 x i32> [[TMP0]], i32 16)
+// CHECK:   [[TMP1:%.*]] = extractelement <4 x i16> [[VQRSHRUNS_N_S32]], i64 0
+// CHECK:   ret i16 [[TMP1]]
 int16_t test_vqrshruns_n_s32(int32_t a) {
-// CHECK-LABEL: test_vqrshruns_n_s32
-// CHECK: sqrshrun {{h[0-9]+|v[0-9]+.4h}}, {{s[0-9]+|v[0-9]+.4s}}, #16
   return (int16_t)vqrshruns_n_s32(a, 16);
 }
 
+// CHECK-LABEL: define i32 @test_vqrshrund_n_s64(i64 %a) #0 {
+// CHECK:   [[VQRSHRUND_N_S64:%.*]] = call i32 @llvm.aarch64.neon.sqrshrun.i32(i64 %a, i32 32)
+// CHECK:   ret i32 [[VQRSHRUND_N_S64]]
 int32_t test_vqrshrund_n_s64(int64_t a) {
-// CHECK-LABEL: test_vqrshrund_n_s64
-// CHECK: sqrshrun {{s[0-9]+}}, {{d[0-9]+}}, #32
   return (int32_t)vqrshrund_n_s64(a, 32);
 }
 
+// CHECK-LABEL: define float @test_vcvts_n_f32_s32(i32 %a) #0 {
+// CHECK:   [[VCVTS_N_F32_S32:%.*]] = call float @llvm.aarch64.neon.vcvtfxs2fp.f32.i32(i32 %a, i32 1)
+// CHECK:   ret float [[VCVTS_N_F32_S32]]
 float32_t test_vcvts_n_f32_s32(int32_t a) {
-// CHECK-LABEL: test_vcvts_n_f32_s32
-// CHECK: scvtf {{s[0-9]+}}, {{s[0-9]+}}, #1
   return vcvts_n_f32_s32(a, 1);
 }
 
+// CHECK-LABEL: define double @test_vcvtd_n_f64_s64(i64 %a) #0 {
+// CHECK:   [[VCVTD_N_F64_S64:%.*]] = call double @llvm.aarch64.neon.vcvtfxs2fp.f64.i64(i64 %a, i32 1)
+// CHECK:   ret double [[VCVTD_N_F64_S64]]
 float64_t test_vcvtd_n_f64_s64(int64_t a) {
-// CHECK-LABEL: test_vcvtd_n_f64_s64
-// CHECK: scvtf {{d[0-9]+}}, {{d[0-9]+}}, #1
   return vcvtd_n_f64_s64(a, 1);
 }
 
+// CHECK-LABEL: define float @test_vcvts_n_f32_u32(i32 %a) #0 {
+// CHECK:   [[VCVTS_N_F32_U32:%.*]] = call float @llvm.aarch64.neon.vcvtfxu2fp.f32.i32(i32 %a, i32 32)
+// CHECK:   ret float [[VCVTS_N_F32_U32]]
 float32_t test_vcvts_n_f32_u32(uint32_t a) {
-// CHECK-LABEL: test_vcvts_n_f32_u32
-// CHECK: ucvtf {{s[0-9]+}}, {{s[0-9]+}}, #32
   return vcvts_n_f32_u32(a, 32);
 }
 
+// CHECK-LABEL: define double @test_vcvtd_n_f64_u64(i64 %a) #0 {
+// CHECK:   [[VCVTD_N_F64_U64:%.*]] = call double @llvm.aarch64.neon.vcvtfxu2fp.f64.i64(i64 %a, i32 64)
+// CHECK:   ret double [[VCVTD_N_F64_U64]]
 float64_t test_vcvtd_n_f64_u64(uint64_t a) {
-// CHECK-LABEL: test_vcvtd_n_f64_u64
-// CHECK: ucvtf {{d[0-9]+}}, {{d[0-9]+}}, #64
   return vcvtd_n_f64_u64(a, 64);
 }
 
+// CHECK-LABEL: define i32 @test_vcvts_n_s32_f32(float %a) #0 {
+// CHECK:   [[VCVTS_N_S32_F32:%.*]] = call i32 @llvm.aarch64.neon.vcvtfp2fxs.i32.f32(float %a, i32 1)
+// CHECK:   ret i32 [[VCVTS_N_S32_F32]]
 int32_t test_vcvts_n_s32_f32(float32_t a) {
-// CHECK-LABEL: test_vcvts_n_s32_f32
-// CHECK: fcvtzs {{s[0-9]+}}, {{s[0-9]+}}, #1
   return (int32_t)vcvts_n_s32_f32(a, 1);
 }
 
+// CHECK-LABEL: define i64 @test_vcvtd_n_s64_f64(double %a) #0 {
+// CHECK:   [[VCVTD_N_S64_F64:%.*]] = call i64 @llvm.aarch64.neon.vcvtfp2fxs.i64.f64(double %a, i32 1)
+// CHECK:   ret i64 [[VCVTD_N_S64_F64]]
 int64_t test_vcvtd_n_s64_f64(float64_t a) {
-// CHECK-LABEL: test_vcvtd_n_s64_f64
-// CHECK: fcvtzs {{d[0-9]+}}, {{d[0-9]+}}, #1
   return (int64_t)vcvtd_n_s64_f64(a, 1);
 }
 
+// CHECK-LABEL: define i32 @test_vcvts_n_u32_f32(float %a) #0 {
+// CHECK:   [[VCVTS_N_U32_F32:%.*]] = call i32 @llvm.aarch64.neon.vcvtfp2fxu.i32.f32(float %a, i32 32)
+// CHECK:   ret i32 [[VCVTS_N_U32_F32]]
 uint32_t test_vcvts_n_u32_f32(float32_t a) {
-// CHECK-LABEL: test_vcvts_n_u32_f32
-// CHECK: fcvtzu {{s[0-9]+}}, {{s[0-9]+}}, #32
   return (uint32_t)vcvts_n_u32_f32(a, 32);
 }
 
+// CHECK-LABEL: define i64 @test_vcvtd_n_u64_f64(double %a) #0 {
+// CHECK:   [[VCVTD_N_U64_F64:%.*]] = call i64 @llvm.aarch64.neon.vcvtfp2fxu.i64.f64(double %a, i32 64)
+// CHECK:   ret i64 [[VCVTD_N_U64_F64]]
 uint64_t test_vcvtd_n_u64_f64(float64_t a) {
-// CHECK-LABEL: test_vcvtd_n_u64_f64
-// CHECK: fcvtzu {{d[0-9]+}}, {{d[0-9]+}}, #64
   return (uint64_t)vcvtd_n_u64_f64(a, 64);
 }
 
-// CHECK-LABEL: test_vreinterpret_s8_s16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vreinterpret_s8_s16(int16x4_t a) {
   return vreinterpret_s8_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s8_s32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vreinterpret_s8_s32(int32x2_t a) {
   return vreinterpret_s8_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s8_s64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vreinterpret_s8_s64(int64x1_t a) {
   return vreinterpret_s8_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s8_u8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u8(<8 x i8> %a) #0 {
+// CHECK:   ret <8 x i8> %a
 int8x8_t test_vreinterpret_s8_u8(uint8x8_t a) {
   return vreinterpret_s8_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s8_u16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vreinterpret_s8_u16(uint16x4_t a) {
   return vreinterpret_s8_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s8_u32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vreinterpret_s8_u32(uint32x2_t a) {
   return vreinterpret_s8_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s8_u64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vreinterpret_s8_u64(uint64x1_t a) {
   return vreinterpret_s8_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s8_f16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_f16(<4 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vreinterpret_s8_f16(float16x4_t a) {
   return vreinterpret_s8_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s8_f32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vreinterpret_s8_f32(float32x2_t a) {
   return vreinterpret_s8_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s8_f64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_f64(<1 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vreinterpret_s8_f64(float64x1_t a) {
   return vreinterpret_s8_f64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s8_p8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_p8(<8 x i8> %a) #0 {
+// CHECK:   ret <8 x i8> %a
 int8x8_t test_vreinterpret_s8_p8(poly8x8_t a) {
   return vreinterpret_s8_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s8_p16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_p16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vreinterpret_s8_p16(poly16x4_t a) {
   return vreinterpret_s8_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s8_p64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_p64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vreinterpret_s8_p64(poly64x1_t a) {
   return vreinterpret_s8_p64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s16_s8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_s8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 int16x4_t test_vreinterpret_s16_s8(int8x8_t a) {
   return vreinterpret_s16_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s16_s32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 int16x4_t test_vreinterpret_s16_s32(int32x2_t a) {
   return vreinterpret_s16_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s16_s64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 int16x4_t test_vreinterpret_s16_s64(int64x1_t a) {
   return vreinterpret_s16_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s16_u8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 int16x4_t test_vreinterpret_s16_u8(uint8x8_t a) {
   return vreinterpret_s16_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s16_u16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u16(<4 x i16> %a) #0 {
+// CHECK:   ret <4 x i16> %a
 int16x4_t test_vreinterpret_s16_u16(uint16x4_t a) {
   return vreinterpret_s16_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s16_u32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 int16x4_t test_vreinterpret_s16_u32(uint32x2_t a) {
   return vreinterpret_s16_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s16_u64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 int16x4_t test_vreinterpret_s16_u64(uint64x1_t a) {
   return vreinterpret_s16_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s16_f16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_f16(<4 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 int16x4_t test_vreinterpret_s16_f16(float16x4_t a) {
   return vreinterpret_s16_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s16_f32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 int16x4_t test_vreinterpret_s16_f32(float32x2_t a) {
   return vreinterpret_s16_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s16_f64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_f64(<1 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 int16x4_t test_vreinterpret_s16_f64(float64x1_t a) {
   return vreinterpret_s16_f64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s16_p8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_p8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 int16x4_t test_vreinterpret_s16_p8(poly8x8_t a) {
   return vreinterpret_s16_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s16_p16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_p16(<4 x i16> %a) #0 {
+// CHECK:   ret <4 x i16> %a
 int16x4_t test_vreinterpret_s16_p16(poly16x4_t a) {
   return vreinterpret_s16_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s16_p64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_p64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 int16x4_t test_vreinterpret_s16_p64(poly64x1_t a) {
   return vreinterpret_s16_p64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s32_s8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_s8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 int32x2_t test_vreinterpret_s32_s8(int8x8_t a) {
   return vreinterpret_s32_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s32_s16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 int32x2_t test_vreinterpret_s32_s16(int16x4_t a) {
   return vreinterpret_s32_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s32_s64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 int32x2_t test_vreinterpret_s32_s64(int64x1_t a) {
   return vreinterpret_s32_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s32_u8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 int32x2_t test_vreinterpret_s32_u8(uint8x8_t a) {
   return vreinterpret_s32_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s32_u16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 int32x2_t test_vreinterpret_s32_u16(uint16x4_t a) {
   return vreinterpret_s32_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s32_u32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u32(<2 x i32> %a) #0 {
+// CHECK:   ret <2 x i32> %a
 int32x2_t test_vreinterpret_s32_u32(uint32x2_t a) {
   return vreinterpret_s32_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s32_u64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 int32x2_t test_vreinterpret_s32_u64(uint64x1_t a) {
   return vreinterpret_s32_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s32_f16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_f16(<4 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 int32x2_t test_vreinterpret_s32_f16(float16x4_t a) {
   return vreinterpret_s32_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s32_f32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 int32x2_t test_vreinterpret_s32_f32(float32x2_t a) {
   return vreinterpret_s32_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s32_f64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_f64(<1 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 int32x2_t test_vreinterpret_s32_f64(float64x1_t a) {
   return vreinterpret_s32_f64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s32_p8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_p8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 int32x2_t test_vreinterpret_s32_p8(poly8x8_t a) {
   return vreinterpret_s32_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s32_p16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_p16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 int32x2_t test_vreinterpret_s32_p16(poly16x4_t a) {
   return vreinterpret_s32_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s32_p64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_p64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 int32x2_t test_vreinterpret_s32_p64(poly64x1_t a) {
   return vreinterpret_s32_p64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s64_s8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_s8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 int64x1_t test_vreinterpret_s64_s8(int8x8_t a) {
   return vreinterpret_s64_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s64_s16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 int64x1_t test_vreinterpret_s64_s16(int16x4_t a) {
   return vreinterpret_s64_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s64_s32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 int64x1_t test_vreinterpret_s64_s32(int32x2_t a) {
   return vreinterpret_s64_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s64_u8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 int64x1_t test_vreinterpret_s64_u8(uint8x8_t a) {
   return vreinterpret_s64_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s64_u16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 int64x1_t test_vreinterpret_s64_u16(uint16x4_t a) {
   return vreinterpret_s64_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s64_u32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 int64x1_t test_vreinterpret_s64_u32(uint32x2_t a) {
   return vreinterpret_s64_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s64_u64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u64(<1 x i64> %a) #0 {
+// CHECK:   ret <1 x i64> %a
 int64x1_t test_vreinterpret_s64_u64(uint64x1_t a) {
   return vreinterpret_s64_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s64_f16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_f16(<4 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 int64x1_t test_vreinterpret_s64_f16(float16x4_t a) {
   return vreinterpret_s64_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s64_f32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 int64x1_t test_vreinterpret_s64_f32(float32x2_t a) {
   return vreinterpret_s64_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s64_f64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_f64(<1 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 int64x1_t test_vreinterpret_s64_f64(float64x1_t a) {
   return vreinterpret_s64_f64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s64_p8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_p8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 int64x1_t test_vreinterpret_s64_p8(poly8x8_t a) {
   return vreinterpret_s64_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s64_p16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_p16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 int64x1_t test_vreinterpret_s64_p16(poly16x4_t a) {
   return vreinterpret_s64_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s64_p64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_p64(<1 x i64> %a) #0 {
+// CHECK:   ret <1 x i64> %a
 int64x1_t test_vreinterpret_s64_p64(poly64x1_t a) {
   return vreinterpret_s64_p64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u8_s8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s8(<8 x i8> %a) #0 {
+// CHECK:   ret <8 x i8> %a
 uint8x8_t test_vreinterpret_u8_s8(int8x8_t a) {
   return vreinterpret_u8_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u8_s16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 uint8x8_t test_vreinterpret_u8_s16(int16x4_t a) {
   return vreinterpret_u8_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u8_s32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 uint8x8_t test_vreinterpret_u8_s32(int32x2_t a) {
   return vreinterpret_u8_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u8_s64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 uint8x8_t test_vreinterpret_u8_s64(int64x1_t a) {
   return vreinterpret_u8_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u8_u16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 uint8x8_t test_vreinterpret_u8_u16(uint16x4_t a) {
   return vreinterpret_u8_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u8_u32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 uint8x8_t test_vreinterpret_u8_u32(uint32x2_t a) {
   return vreinterpret_u8_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u8_u64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_u64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 uint8x8_t test_vreinterpret_u8_u64(uint64x1_t a) {
   return vreinterpret_u8_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u8_f16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_f16(<4 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 uint8x8_t test_vreinterpret_u8_f16(float16x4_t a) {
   return vreinterpret_u8_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u8_f32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 uint8x8_t test_vreinterpret_u8_f32(float32x2_t a) {
   return vreinterpret_u8_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u8_f64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_f64(<1 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 uint8x8_t test_vreinterpret_u8_f64(float64x1_t a) {
   return vreinterpret_u8_f64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u8_p8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_p8(<8 x i8> %a) #0 {
+// CHECK:   ret <8 x i8> %a
 uint8x8_t test_vreinterpret_u8_p8(poly8x8_t a) {
   return vreinterpret_u8_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u8_p16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_p16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 uint8x8_t test_vreinterpret_u8_p16(poly16x4_t a) {
   return vreinterpret_u8_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u8_p64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_p64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 uint8x8_t test_vreinterpret_u8_p64(poly64x1_t a) {
   return vreinterpret_u8_p64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u16_s8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 uint16x4_t test_vreinterpret_u16_s8(int8x8_t a) {
   return vreinterpret_u16_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u16_s16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s16(<4 x i16> %a) #0 {
+// CHECK:   ret <4 x i16> %a
 uint16x4_t test_vreinterpret_u16_s16(int16x4_t a) {
   return vreinterpret_u16_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u16_s32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 uint16x4_t test_vreinterpret_u16_s32(int32x2_t a) {
   return vreinterpret_u16_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u16_s64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 uint16x4_t test_vreinterpret_u16_s64(int64x1_t a) {
   return vreinterpret_u16_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u16_u8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_u8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 uint16x4_t test_vreinterpret_u16_u8(uint8x8_t a) {
   return vreinterpret_u16_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u16_u32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 uint16x4_t test_vreinterpret_u16_u32(uint32x2_t a) {
   return vreinterpret_u16_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u16_u64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_u64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 uint16x4_t test_vreinterpret_u16_u64(uint64x1_t a) {
   return vreinterpret_u16_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u16_f16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_f16(<4 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 uint16x4_t test_vreinterpret_u16_f16(float16x4_t a) {
   return vreinterpret_u16_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u16_f32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 uint16x4_t test_vreinterpret_u16_f32(float32x2_t a) {
   return vreinterpret_u16_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u16_f64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_f64(<1 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 uint16x4_t test_vreinterpret_u16_f64(float64x1_t a) {
   return vreinterpret_u16_f64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u16_p8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_p8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 uint16x4_t test_vreinterpret_u16_p8(poly8x8_t a) {
   return vreinterpret_u16_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u16_p16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_p16(<4 x i16> %a) #0 {
+// CHECK:   ret <4 x i16> %a
 uint16x4_t test_vreinterpret_u16_p16(poly16x4_t a) {
   return vreinterpret_u16_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u16_p64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_p64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 uint16x4_t test_vreinterpret_u16_p64(poly64x1_t a) {
   return vreinterpret_u16_p64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u32_s8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 uint32x2_t test_vreinterpret_u32_s8(int8x8_t a) {
   return vreinterpret_u32_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u32_s16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 uint32x2_t test_vreinterpret_u32_s16(int16x4_t a) {
   return vreinterpret_u32_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u32_s32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s32(<2 x i32> %a) #0 {
+// CHECK:   ret <2 x i32> %a
 uint32x2_t test_vreinterpret_u32_s32(int32x2_t a) {
   return vreinterpret_u32_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u32_s64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 uint32x2_t test_vreinterpret_u32_s64(int64x1_t a) {
   return vreinterpret_u32_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u32_u8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_u8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 uint32x2_t test_vreinterpret_u32_u8(uint8x8_t a) {
   return vreinterpret_u32_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u32_u16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 uint32x2_t test_vreinterpret_u32_u16(uint16x4_t a) {
   return vreinterpret_u32_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u32_u64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_u64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 uint32x2_t test_vreinterpret_u32_u64(uint64x1_t a) {
   return vreinterpret_u32_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u32_f16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_f16(<4 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 uint32x2_t test_vreinterpret_u32_f16(float16x4_t a) {
   return vreinterpret_u32_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u32_f32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 uint32x2_t test_vreinterpret_u32_f32(float32x2_t a) {
   return vreinterpret_u32_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u32_f64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_f64(<1 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 uint32x2_t test_vreinterpret_u32_f64(float64x1_t a) {
   return vreinterpret_u32_f64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u32_p8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_p8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 uint32x2_t test_vreinterpret_u32_p8(poly8x8_t a) {
   return vreinterpret_u32_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u32_p16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_p16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 uint32x2_t test_vreinterpret_u32_p16(poly16x4_t a) {
   return vreinterpret_u32_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u32_p64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_p64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 uint32x2_t test_vreinterpret_u32_p64(poly64x1_t a) {
   return vreinterpret_u32_p64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u64_s8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 uint64x1_t test_vreinterpret_u64_s8(int8x8_t a) {
   return vreinterpret_u64_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u64_s16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 uint64x1_t test_vreinterpret_u64_s16(int16x4_t a) {
   return vreinterpret_u64_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u64_s32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 uint64x1_t test_vreinterpret_u64_s32(int32x2_t a) {
   return vreinterpret_u64_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u64_s64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s64(<1 x i64> %a) #0 {
+// CHECK:   ret <1 x i64> %a
 uint64x1_t test_vreinterpret_u64_s64(int64x1_t a) {
   return vreinterpret_u64_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u64_u8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_u8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 uint64x1_t test_vreinterpret_u64_u8(uint8x8_t a) {
   return vreinterpret_u64_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u64_u16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 uint64x1_t test_vreinterpret_u64_u16(uint16x4_t a) {
   return vreinterpret_u64_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u64_u32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 uint64x1_t test_vreinterpret_u64_u32(uint32x2_t a) {
   return vreinterpret_u64_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u64_f16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_f16(<4 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 uint64x1_t test_vreinterpret_u64_f16(float16x4_t a) {
   return vreinterpret_u64_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u64_f32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 uint64x1_t test_vreinterpret_u64_f32(float32x2_t a) {
   return vreinterpret_u64_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u64_f64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_f64(<1 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 uint64x1_t test_vreinterpret_u64_f64(float64x1_t a) {
   return vreinterpret_u64_f64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u64_p8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_p8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 uint64x1_t test_vreinterpret_u64_p8(poly8x8_t a) {
   return vreinterpret_u64_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u64_p16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_p16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 uint64x1_t test_vreinterpret_u64_p16(poly16x4_t a) {
   return vreinterpret_u64_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u64_p64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_p64(<1 x i64> %a) #0 {
+// CHECK:   ret <1 x i64> %a
 uint64x1_t test_vreinterpret_u64_p64(poly64x1_t a) {
   return vreinterpret_u64_p64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f16_s8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vreinterpret_f16_s8(int8x8_t a) {
   return vreinterpret_f16_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f16_s16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vreinterpret_f16_s16(int16x4_t a) {
   return vreinterpret_f16_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f16_s32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vreinterpret_f16_s32(int32x2_t a) {
   return vreinterpret_f16_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f16_s64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vreinterpret_f16_s64(int64x1_t a) {
   return vreinterpret_f16_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f16_u8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vreinterpret_f16_u8(uint8x8_t a) {
   return vreinterpret_f16_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f16_u16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vreinterpret_f16_u16(uint16x4_t a) {
   return vreinterpret_f16_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f16_u32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vreinterpret_f16_u32(uint32x2_t a) {
   return vreinterpret_f16_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f16_u64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vreinterpret_f16_u64(uint64x1_t a) {
   return vreinterpret_f16_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f16_f32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vreinterpret_f16_f32(float32x2_t a) {
   return vreinterpret_f16_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f16_f64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_f64(<1 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vreinterpret_f16_f64(float64x1_t a) {
   return vreinterpret_f16_f64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f16_p8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_p8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vreinterpret_f16_p8(poly8x8_t a) {
   return vreinterpret_f16_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f16_p16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_p16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vreinterpret_f16_p16(poly16x4_t a) {
   return vreinterpret_f16_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f16_p64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_p64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vreinterpret_f16_p64(poly64x1_t a) {
   return vreinterpret_f16_p64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f32_s8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vreinterpret_f32_s8(int8x8_t a) {
   return vreinterpret_f32_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f32_s16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vreinterpret_f32_s16(int16x4_t a) {
   return vreinterpret_f32_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f32_s32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vreinterpret_f32_s32(int32x2_t a) {
   return vreinterpret_f32_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f32_s64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vreinterpret_f32_s64(int64x1_t a) {
   return vreinterpret_f32_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f32_u8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vreinterpret_f32_u8(uint8x8_t a) {
   return vreinterpret_f32_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f32_u16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vreinterpret_f32_u16(uint16x4_t a) {
   return vreinterpret_f32_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f32_u32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vreinterpret_f32_u32(uint32x2_t a) {
   return vreinterpret_f32_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f32_u64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vreinterpret_f32_u64(uint64x1_t a) {
   return vreinterpret_f32_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f32_f16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_f16(<4 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vreinterpret_f32_f16(float16x4_t a) {
   return vreinterpret_f32_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f32_f64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_f64(<1 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vreinterpret_f32_f64(float64x1_t a) {
   return vreinterpret_f32_f64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f32_p8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_p8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vreinterpret_f32_p8(poly8x8_t a) {
   return vreinterpret_f32_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f32_p16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_p16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vreinterpret_f32_p16(poly16x4_t a) {
   return vreinterpret_f32_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f32_p64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_p64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vreinterpret_f32_p64(poly64x1_t a) {
   return vreinterpret_f32_p64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f64_s8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_s8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x double>
+// CHECK:   ret <1 x double> [[TMP0]]
 float64x1_t test_vreinterpret_f64_s8(int8x8_t a) {
   return vreinterpret_f64_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f64_s16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x double>
+// CHECK:   ret <1 x double> [[TMP0]]
 float64x1_t test_vreinterpret_f64_s16(int16x4_t a) {
   return vreinterpret_f64_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f64_s32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x double>
+// CHECK:   ret <1 x double> [[TMP0]]
 float64x1_t test_vreinterpret_f64_s32(int32x2_t a) {
   return vreinterpret_f64_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f64_s64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <1 x double>
+// CHECK:   ret <1 x double> [[TMP0]]
 float64x1_t test_vreinterpret_f64_s64(int64x1_t a) {
   return vreinterpret_f64_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f64_u8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_u8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x double>
+// CHECK:   ret <1 x double> [[TMP0]]
 float64x1_t test_vreinterpret_f64_u8(uint8x8_t a) {
   return vreinterpret_f64_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f64_u16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x double>
+// CHECK:   ret <1 x double> [[TMP0]]
 float64x1_t test_vreinterpret_f64_u16(uint16x4_t a) {
   return vreinterpret_f64_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f64_u32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x double>
+// CHECK:   ret <1 x double> [[TMP0]]
 float64x1_t test_vreinterpret_f64_u32(uint32x2_t a) {
   return vreinterpret_f64_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f64_u64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_u64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <1 x double>
+// CHECK:   ret <1 x double> [[TMP0]]
 float64x1_t test_vreinterpret_f64_u64(uint64x1_t a) {
   return vreinterpret_f64_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f64_f16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_f16(<4 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x double>
+// CHECK:   ret <1 x double> [[TMP0]]
 float64x1_t test_vreinterpret_f64_f16(float16x4_t a) {
   return vreinterpret_f64_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f64_f32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x double>
+// CHECK:   ret <1 x double> [[TMP0]]
 float64x1_t test_vreinterpret_f64_f32(float32x2_t a) {
   return vreinterpret_f64_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f64_p8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_p8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x double>
+// CHECK:   ret <1 x double> [[TMP0]]
 float64x1_t test_vreinterpret_f64_p8(poly8x8_t a) {
   return vreinterpret_f64_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f64_p16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_p16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x double>
+// CHECK:   ret <1 x double> [[TMP0]]
 float64x1_t test_vreinterpret_f64_p16(poly16x4_t a) {
   return vreinterpret_f64_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f64_p64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <1 x double> @test_vreinterpret_f64_p64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <1 x double>
+// CHECK:   ret <1 x double> [[TMP0]]
 float64x1_t test_vreinterpret_f64_p64(poly64x1_t a) {
   return vreinterpret_f64_p64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p8_s8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s8(<8 x i8> %a) #0 {
+// CHECK:   ret <8 x i8> %a
 poly8x8_t test_vreinterpret_p8_s8(int8x8_t a) {
   return vreinterpret_p8_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p8_s16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 poly8x8_t test_vreinterpret_p8_s16(int16x4_t a) {
   return vreinterpret_p8_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p8_s32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 poly8x8_t test_vreinterpret_p8_s32(int32x2_t a) {
   return vreinterpret_p8_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p8_s64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 poly8x8_t test_vreinterpret_p8_s64(int64x1_t a) {
   return vreinterpret_p8_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p8_u8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u8(<8 x i8> %a) #0 {
+// CHECK:   ret <8 x i8> %a
 poly8x8_t test_vreinterpret_p8_u8(uint8x8_t a) {
   return vreinterpret_p8_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p8_u16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 poly8x8_t test_vreinterpret_p8_u16(uint16x4_t a) {
   return vreinterpret_p8_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p8_u32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 poly8x8_t test_vreinterpret_p8_u32(uint32x2_t a) {
   return vreinterpret_p8_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p8_u64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 poly8x8_t test_vreinterpret_p8_u64(uint64x1_t a) {
   return vreinterpret_p8_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p8_f16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_f16(<4 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 poly8x8_t test_vreinterpret_p8_f16(float16x4_t a) {
   return vreinterpret_p8_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p8_f32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 poly8x8_t test_vreinterpret_p8_f32(float32x2_t a) {
   return vreinterpret_p8_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p8_f64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_f64(<1 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 poly8x8_t test_vreinterpret_p8_f64(float64x1_t a) {
   return vreinterpret_p8_f64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p8_p16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_p16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 poly8x8_t test_vreinterpret_p8_p16(poly16x4_t a) {
   return vreinterpret_p8_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p8_p64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_p64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 poly8x8_t test_vreinterpret_p8_p64(poly64x1_t a) {
   return vreinterpret_p8_p64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p16_s8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 poly16x4_t test_vreinterpret_p16_s8(int8x8_t a) {
   return vreinterpret_p16_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p16_s16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s16(<4 x i16> %a) #0 {
+// CHECK:   ret <4 x i16> %a
 poly16x4_t test_vreinterpret_p16_s16(int16x4_t a) {
   return vreinterpret_p16_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p16_s32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 poly16x4_t test_vreinterpret_p16_s32(int32x2_t a) {
   return vreinterpret_p16_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p16_s64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 poly16x4_t test_vreinterpret_p16_s64(int64x1_t a) {
   return vreinterpret_p16_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p16_u8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 poly16x4_t test_vreinterpret_p16_u8(uint8x8_t a) {
   return vreinterpret_p16_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p16_u16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u16(<4 x i16> %a) #0 {
+// CHECK:   ret <4 x i16> %a
 poly16x4_t test_vreinterpret_p16_u16(uint16x4_t a) {
   return vreinterpret_p16_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p16_u32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 poly16x4_t test_vreinterpret_p16_u32(uint32x2_t a) {
   return vreinterpret_p16_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p16_u64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 poly16x4_t test_vreinterpret_p16_u64(uint64x1_t a) {
   return vreinterpret_p16_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p16_f16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_f16(<4 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 poly16x4_t test_vreinterpret_p16_f16(float16x4_t a) {
   return vreinterpret_p16_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p16_f32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 poly16x4_t test_vreinterpret_p16_f32(float32x2_t a) {
   return vreinterpret_p16_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p16_f64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_f64(<1 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 poly16x4_t test_vreinterpret_p16_f64(float64x1_t a) {
   return vreinterpret_p16_f64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p16_p8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_p8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 poly16x4_t test_vreinterpret_p16_p8(poly8x8_t a) {
   return vreinterpret_p16_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p16_p64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_p64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 poly16x4_t test_vreinterpret_p16_p64(poly64x1_t a) {
   return vreinterpret_p16_p64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p64_s8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_s8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 poly64x1_t test_vreinterpret_p64_s8(int8x8_t a) {
   return vreinterpret_p64_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p64_s16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 poly64x1_t test_vreinterpret_p64_s16(int16x4_t a) {
   return vreinterpret_p64_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p64_s32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 poly64x1_t test_vreinterpret_p64_s32(int32x2_t a) {
   return vreinterpret_p64_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p64_s64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_s64(<1 x i64> %a) #0 {
+// CHECK:   ret <1 x i64> %a
 poly64x1_t test_vreinterpret_p64_s64(int64x1_t a) {
   return vreinterpret_p64_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p64_u8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_u8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 poly64x1_t test_vreinterpret_p64_u8(uint8x8_t a) {
   return vreinterpret_p64_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p64_u16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 poly64x1_t test_vreinterpret_p64_u16(uint16x4_t a) {
   return vreinterpret_p64_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p64_u32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 poly64x1_t test_vreinterpret_p64_u32(uint32x2_t a) {
   return vreinterpret_p64_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p64_u64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_u64(<1 x i64> %a) #0 {
+// CHECK:   ret <1 x i64> %a
 poly64x1_t test_vreinterpret_p64_u64(uint64x1_t a) {
   return vreinterpret_p64_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p64_f16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_f16(<4 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 poly64x1_t test_vreinterpret_p64_f16(float16x4_t a) {
   return vreinterpret_p64_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p64_f32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 poly64x1_t test_vreinterpret_p64_f32(float32x2_t a) {
   return vreinterpret_p64_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p64_f64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_f64(<1 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 poly64x1_t test_vreinterpret_p64_f64(float64x1_t a) {
   return vreinterpret_p64_f64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p64_p8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_p8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 poly64x1_t test_vreinterpret_p64_p8(poly8x8_t a) {
   return vreinterpret_p64_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p64_p16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_p64_p16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 poly64x1_t test_vreinterpret_p64_p16(poly16x4_t a) {
   return vreinterpret_p64_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_s16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vreinterpretq_s8_s16(int16x8_t a) {
   return vreinterpretq_s8_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_s32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vreinterpretq_s8_s32(int32x4_t a) {
   return vreinterpretq_s8_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_s64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vreinterpretq_s8_s64(int64x2_t a) {
   return vreinterpretq_s8_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_u8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u8(<16 x i8> %a) #0 {
+// CHECK:   ret <16 x i8> %a
 int8x16_t test_vreinterpretq_s8_u8(uint8x16_t a) {
   return vreinterpretq_s8_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_u16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vreinterpretq_s8_u16(uint16x8_t a) {
   return vreinterpretq_s8_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_u32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vreinterpretq_s8_u32(uint32x4_t a) {
   return vreinterpretq_s8_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_u64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vreinterpretq_s8_u64(uint64x2_t a) {
   return vreinterpretq_s8_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_f16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_f16(<8 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vreinterpretq_s8_f16(float16x8_t a) {
   return vreinterpretq_s8_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_f32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vreinterpretq_s8_f32(float32x4_t a) {
   return vreinterpretq_s8_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_f64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vreinterpretq_s8_f64(float64x2_t a) {
   return vreinterpretq_s8_f64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_p8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_p8(<16 x i8> %a) #0 {
+// CHECK:   ret <16 x i8> %a
 int8x16_t test_vreinterpretq_s8_p8(poly8x16_t a) {
   return vreinterpretq_s8_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_p16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_p16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vreinterpretq_s8_p16(poly16x8_t a) {
   return vreinterpretq_s8_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_p64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_p64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vreinterpretq_s8_p64(poly64x2_t a) {
   return vreinterpretq_s8_p64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_s8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_s8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 int16x8_t test_vreinterpretq_s16_s8(int8x16_t a) {
   return vreinterpretq_s16_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_s32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 int16x8_t test_vreinterpretq_s16_s32(int32x4_t a) {
   return vreinterpretq_s16_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_s64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 int16x8_t test_vreinterpretq_s16_s64(int64x2_t a) {
   return vreinterpretq_s16_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_u8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 int16x8_t test_vreinterpretq_s16_u8(uint8x16_t a) {
   return vreinterpretq_s16_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_u16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u16(<8 x i16> %a) #0 {
+// CHECK:   ret <8 x i16> %a
 int16x8_t test_vreinterpretq_s16_u16(uint16x8_t a) {
   return vreinterpretq_s16_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_u32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 int16x8_t test_vreinterpretq_s16_u32(uint32x4_t a) {
   return vreinterpretq_s16_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_u64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 int16x8_t test_vreinterpretq_s16_u64(uint64x2_t a) {
   return vreinterpretq_s16_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_f16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_f16(<8 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 int16x8_t test_vreinterpretq_s16_f16(float16x8_t a) {
   return vreinterpretq_s16_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_f32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 int16x8_t test_vreinterpretq_s16_f32(float32x4_t a) {
   return vreinterpretq_s16_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_f64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 int16x8_t test_vreinterpretq_s16_f64(float64x2_t a) {
   return vreinterpretq_s16_f64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_p8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_p8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 int16x8_t test_vreinterpretq_s16_p8(poly8x16_t a) {
   return vreinterpretq_s16_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_p16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_p16(<8 x i16> %a) #0 {
+// CHECK:   ret <8 x i16> %a
 int16x8_t test_vreinterpretq_s16_p16(poly16x8_t a) {
   return vreinterpretq_s16_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_p64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_p64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 int16x8_t test_vreinterpretq_s16_p64(poly64x2_t a) {
   return vreinterpretq_s16_p64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_s8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_s8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 int32x4_t test_vreinterpretq_s32_s8(int8x16_t a) {
   return vreinterpretq_s32_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_s16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 int32x4_t test_vreinterpretq_s32_s16(int16x8_t a) {
   return vreinterpretq_s32_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_s64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 int32x4_t test_vreinterpretq_s32_s64(int64x2_t a) {
   return vreinterpretq_s32_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_u8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 int32x4_t test_vreinterpretq_s32_u8(uint8x16_t a) {
   return vreinterpretq_s32_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_u16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 int32x4_t test_vreinterpretq_s32_u16(uint16x8_t a) {
   return vreinterpretq_s32_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_u32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u32(<4 x i32> %a) #0 {
+// CHECK:   ret <4 x i32> %a
 int32x4_t test_vreinterpretq_s32_u32(uint32x4_t a) {
   return vreinterpretq_s32_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_u64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 int32x4_t test_vreinterpretq_s32_u64(uint64x2_t a) {
   return vreinterpretq_s32_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_f16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_f16(<8 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 int32x4_t test_vreinterpretq_s32_f16(float16x8_t a) {
   return vreinterpretq_s32_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_f32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 int32x4_t test_vreinterpretq_s32_f32(float32x4_t a) {
   return vreinterpretq_s32_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_f64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 int32x4_t test_vreinterpretq_s32_f64(float64x2_t a) {
   return vreinterpretq_s32_f64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_p8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_p8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 int32x4_t test_vreinterpretq_s32_p8(poly8x16_t a) {
   return vreinterpretq_s32_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_p16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_p16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 int32x4_t test_vreinterpretq_s32_p16(poly16x8_t a) {
   return vreinterpretq_s32_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_p64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_p64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 int32x4_t test_vreinterpretq_s32_p64(poly64x2_t a) {
   return vreinterpretq_s32_p64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_s8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_s8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 int64x2_t test_vreinterpretq_s64_s8(int8x16_t a) {
   return vreinterpretq_s64_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_s16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 int64x2_t test_vreinterpretq_s64_s16(int16x8_t a) {
   return vreinterpretq_s64_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_s32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 int64x2_t test_vreinterpretq_s64_s32(int32x4_t a) {
   return vreinterpretq_s64_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_u8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 int64x2_t test_vreinterpretq_s64_u8(uint8x16_t a) {
   return vreinterpretq_s64_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_u16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 int64x2_t test_vreinterpretq_s64_u16(uint16x8_t a) {
   return vreinterpretq_s64_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_u32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 int64x2_t test_vreinterpretq_s64_u32(uint32x4_t a) {
   return vreinterpretq_s64_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_u64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u64(<2 x i64> %a) #0 {
+// CHECK:   ret <2 x i64> %a
 int64x2_t test_vreinterpretq_s64_u64(uint64x2_t a) {
   return vreinterpretq_s64_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_f16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_f16(<8 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 int64x2_t test_vreinterpretq_s64_f16(float16x8_t a) {
   return vreinterpretq_s64_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_f32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 int64x2_t test_vreinterpretq_s64_f32(float32x4_t a) {
   return vreinterpretq_s64_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_f64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 int64x2_t test_vreinterpretq_s64_f64(float64x2_t a) {
   return vreinterpretq_s64_f64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_p8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_p8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 int64x2_t test_vreinterpretq_s64_p8(poly8x16_t a) {
   return vreinterpretq_s64_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_p16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_p16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 int64x2_t test_vreinterpretq_s64_p16(poly16x8_t a) {
   return vreinterpretq_s64_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_p64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_p64(<2 x i64> %a) #0 {
+// CHECK:   ret <2 x i64> %a
 int64x2_t test_vreinterpretq_s64_p64(poly64x2_t a) {
   return vreinterpretq_s64_p64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_s8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s8(<16 x i8> %a) #0 {
+// CHECK:   ret <16 x i8> %a
 uint8x16_t test_vreinterpretq_u8_s8(int8x16_t a) {
   return vreinterpretq_u8_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_s16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 uint8x16_t test_vreinterpretq_u8_s16(int16x8_t a) {
   return vreinterpretq_u8_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_s32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 uint8x16_t test_vreinterpretq_u8_s32(int32x4_t a) {
   return vreinterpretq_u8_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_s64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 uint8x16_t test_vreinterpretq_u8_s64(int64x2_t a) {
   return vreinterpretq_u8_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_u16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 uint8x16_t test_vreinterpretq_u8_u16(uint16x8_t a) {
   return vreinterpretq_u8_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_u32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 uint8x16_t test_vreinterpretq_u8_u32(uint32x4_t a) {
   return vreinterpretq_u8_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_u64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 uint8x16_t test_vreinterpretq_u8_u64(uint64x2_t a) {
   return vreinterpretq_u8_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_f16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_f16(<8 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 uint8x16_t test_vreinterpretq_u8_f16(float16x8_t a) {
   return vreinterpretq_u8_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_f32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 uint8x16_t test_vreinterpretq_u8_f32(float32x4_t a) {
   return vreinterpretq_u8_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_f64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 uint8x16_t test_vreinterpretq_u8_f64(float64x2_t a) {
   return vreinterpretq_u8_f64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_p8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_p8(<16 x i8> %a) #0 {
+// CHECK:   ret <16 x i8> %a
 uint8x16_t test_vreinterpretq_u8_p8(poly8x16_t a) {
   return vreinterpretq_u8_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_p16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_p16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 uint8x16_t test_vreinterpretq_u8_p16(poly16x8_t a) {
   return vreinterpretq_u8_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_p64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_p64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 uint8x16_t test_vreinterpretq_u8_p64(poly64x2_t a) {
   return vreinterpretq_u8_p64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_s8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 uint16x8_t test_vreinterpretq_u16_s8(int8x16_t a) {
   return vreinterpretq_u16_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_s16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s16(<8 x i16> %a) #0 {
+// CHECK:   ret <8 x i16> %a
 uint16x8_t test_vreinterpretq_u16_s16(int16x8_t a) {
   return vreinterpretq_u16_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_s32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 uint16x8_t test_vreinterpretq_u16_s32(int32x4_t a) {
   return vreinterpretq_u16_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_s64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 uint16x8_t test_vreinterpretq_u16_s64(int64x2_t a) {
   return vreinterpretq_u16_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_u8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_u8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 uint16x8_t test_vreinterpretq_u16_u8(uint8x16_t a) {
   return vreinterpretq_u16_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_u32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 uint16x8_t test_vreinterpretq_u16_u32(uint32x4_t a) {
   return vreinterpretq_u16_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_u64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 uint16x8_t test_vreinterpretq_u16_u64(uint64x2_t a) {
   return vreinterpretq_u16_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_f16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_f16(<8 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 uint16x8_t test_vreinterpretq_u16_f16(float16x8_t a) {
   return vreinterpretq_u16_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_f32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 uint16x8_t test_vreinterpretq_u16_f32(float32x4_t a) {
   return vreinterpretq_u16_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_f64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 uint16x8_t test_vreinterpretq_u16_f64(float64x2_t a) {
   return vreinterpretq_u16_f64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_p8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_p8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 uint16x8_t test_vreinterpretq_u16_p8(poly8x16_t a) {
   return vreinterpretq_u16_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_p16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_p16(<8 x i16> %a) #0 {
+// CHECK:   ret <8 x i16> %a
 uint16x8_t test_vreinterpretq_u16_p16(poly16x8_t a) {
   return vreinterpretq_u16_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_p64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_p64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 uint16x8_t test_vreinterpretq_u16_p64(poly64x2_t a) {
   return vreinterpretq_u16_p64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_s8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 uint32x4_t test_vreinterpretq_u32_s8(int8x16_t a) {
   return vreinterpretq_u32_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_s16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 uint32x4_t test_vreinterpretq_u32_s16(int16x8_t a) {
   return vreinterpretq_u32_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_s32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s32(<4 x i32> %a) #0 {
+// CHECK:   ret <4 x i32> %a
 uint32x4_t test_vreinterpretq_u32_s32(int32x4_t a) {
   return vreinterpretq_u32_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_s64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 uint32x4_t test_vreinterpretq_u32_s64(int64x2_t a) {
   return vreinterpretq_u32_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_u8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_u8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 uint32x4_t test_vreinterpretq_u32_u8(uint8x16_t a) {
   return vreinterpretq_u32_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_u16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 uint32x4_t test_vreinterpretq_u32_u16(uint16x8_t a) {
   return vreinterpretq_u32_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_u64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 uint32x4_t test_vreinterpretq_u32_u64(uint64x2_t a) {
   return vreinterpretq_u32_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_f16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_f16(<8 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 uint32x4_t test_vreinterpretq_u32_f16(float16x8_t a) {
   return vreinterpretq_u32_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_f32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 uint32x4_t test_vreinterpretq_u32_f32(float32x4_t a) {
   return vreinterpretq_u32_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_f64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 uint32x4_t test_vreinterpretq_u32_f64(float64x2_t a) {
   return vreinterpretq_u32_f64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_p8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_p8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 uint32x4_t test_vreinterpretq_u32_p8(poly8x16_t a) {
   return vreinterpretq_u32_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_p16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_p16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 uint32x4_t test_vreinterpretq_u32_p16(poly16x8_t a) {
   return vreinterpretq_u32_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_p64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_p64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 uint32x4_t test_vreinterpretq_u32_p64(poly64x2_t a) {
   return vreinterpretq_u32_p64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_s8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 uint64x2_t test_vreinterpretq_u64_s8(int8x16_t a) {
   return vreinterpretq_u64_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_s16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 uint64x2_t test_vreinterpretq_u64_s16(int16x8_t a) {
   return vreinterpretq_u64_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_s32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 uint64x2_t test_vreinterpretq_u64_s32(int32x4_t a) {
   return vreinterpretq_u64_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_s64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s64(<2 x i64> %a) #0 {
+// CHECK:   ret <2 x i64> %a
 uint64x2_t test_vreinterpretq_u64_s64(int64x2_t a) {
   return vreinterpretq_u64_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_u8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_u8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 uint64x2_t test_vreinterpretq_u64_u8(uint8x16_t a) {
   return vreinterpretq_u64_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_u16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 uint64x2_t test_vreinterpretq_u64_u16(uint16x8_t a) {
   return vreinterpretq_u64_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_u32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 uint64x2_t test_vreinterpretq_u64_u32(uint32x4_t a) {
   return vreinterpretq_u64_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_f16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_f16(<8 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 uint64x2_t test_vreinterpretq_u64_f16(float16x8_t a) {
   return vreinterpretq_u64_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_f32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 uint64x2_t test_vreinterpretq_u64_f32(float32x4_t a) {
   return vreinterpretq_u64_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_f64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 uint64x2_t test_vreinterpretq_u64_f64(float64x2_t a) {
   return vreinterpretq_u64_f64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_p8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_p8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 uint64x2_t test_vreinterpretq_u64_p8(poly8x16_t a) {
   return vreinterpretq_u64_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_p16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_p16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 uint64x2_t test_vreinterpretq_u64_p16(poly16x8_t a) {
   return vreinterpretq_u64_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_p64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_p64(<2 x i64> %a) #0 {
+// CHECK:   ret <2 x i64> %a
 uint64x2_t test_vreinterpretq_u64_p64(poly64x2_t a) {
   return vreinterpretq_u64_p64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f16_s8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half>
+// CHECK:   ret <8 x half> [[TMP0]]
 float16x8_t test_vreinterpretq_f16_s8(int8x16_t a) {
   return vreinterpretq_f16_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f16_s16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half>
+// CHECK:   ret <8 x half> [[TMP0]]
 float16x8_t test_vreinterpretq_f16_s16(int16x8_t a) {
   return vreinterpretq_f16_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f16_s32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x half>
+// CHECK:   ret <8 x half> [[TMP0]]
 float16x8_t test_vreinterpretq_f16_s32(int32x4_t a) {
   return vreinterpretq_f16_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f16_s64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x half>
+// CHECK:   ret <8 x half> [[TMP0]]
 float16x8_t test_vreinterpretq_f16_s64(int64x2_t a) {
   return vreinterpretq_f16_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f16_u8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half>
+// CHECK:   ret <8 x half> [[TMP0]]
 float16x8_t test_vreinterpretq_f16_u8(uint8x16_t a) {
   return vreinterpretq_f16_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f16_u16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half>
+// CHECK:   ret <8 x half> [[TMP0]]
 float16x8_t test_vreinterpretq_f16_u16(uint16x8_t a) {
   return vreinterpretq_f16_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f16_u32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x half>
+// CHECK:   ret <8 x half> [[TMP0]]
 float16x8_t test_vreinterpretq_f16_u32(uint32x4_t a) {
   return vreinterpretq_f16_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f16_u64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x half>
+// CHECK:   ret <8 x half> [[TMP0]]
 float16x8_t test_vreinterpretq_f16_u64(uint64x2_t a) {
   return vreinterpretq_f16_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f16_f32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x half>
+// CHECK:   ret <8 x half> [[TMP0]]
 float16x8_t test_vreinterpretq_f16_f32(float32x4_t a) {
   return vreinterpretq_f16_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f16_f64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <8 x half>
+// CHECK:   ret <8 x half> [[TMP0]]
 float16x8_t test_vreinterpretq_f16_f64(float64x2_t a) {
   return vreinterpretq_f16_f64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f16_p8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_p8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half>
+// CHECK:   ret <8 x half> [[TMP0]]
 float16x8_t test_vreinterpretq_f16_p8(poly8x16_t a) {
   return vreinterpretq_f16_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f16_p16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_p16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half>
+// CHECK:   ret <8 x half> [[TMP0]]
 float16x8_t test_vreinterpretq_f16_p16(poly16x8_t a) {
   return vreinterpretq_f16_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f16_p64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_p64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x half>
+// CHECK:   ret <8 x half> [[TMP0]]
 float16x8_t test_vreinterpretq_f16_p64(poly64x2_t a) {
   return vreinterpretq_f16_p64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_s8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_s8(int8x16_t a) {
   return vreinterpretq_f32_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_s16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_s16(int16x8_t a) {
   return vreinterpretq_f32_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_s32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_s32(int32x4_t a) {
   return vreinterpretq_f32_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_s64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_s64(int64x2_t a) {
   return vreinterpretq_f32_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_u8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_u8(uint8x16_t a) {
   return vreinterpretq_f32_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_u16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_u16(uint16x8_t a) {
   return vreinterpretq_f32_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_u32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_u32(uint32x4_t a) {
   return vreinterpretq_f32_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_u64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_u64(uint64x2_t a) {
   return vreinterpretq_f32_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_f16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_f16(<8 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_f16(float16x8_t a) {
   return vreinterpretq_f32_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_f64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_f64(float64x2_t a) {
   return vreinterpretq_f32_f64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_p8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_p8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_p8(poly8x16_t a) {
   return vreinterpretq_f32_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_p16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_p16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_p16(poly16x8_t a) {
   return vreinterpretq_f32_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_p64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_p64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_p64(poly64x2_t a) {
   return vreinterpretq_f32_p64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f64_s8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_s8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x double>
+// CHECK:   ret <2 x double> [[TMP0]]
 float64x2_t test_vreinterpretq_f64_s8(int8x16_t a) {
   return vreinterpretq_f64_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f64_s16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x double>
+// CHECK:   ret <2 x double> [[TMP0]]
 float64x2_t test_vreinterpretq_f64_s16(int16x8_t a) {
   return vreinterpretq_f64_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f64_s32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x double>
+// CHECK:   ret <2 x double> [[TMP0]]
 float64x2_t test_vreinterpretq_f64_s32(int32x4_t a) {
   return vreinterpretq_f64_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f64_s64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <2 x double>
+// CHECK:   ret <2 x double> [[TMP0]]
 float64x2_t test_vreinterpretq_f64_s64(int64x2_t a) {
   return vreinterpretq_f64_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f64_u8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_u8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x double>
+// CHECK:   ret <2 x double> [[TMP0]]
 float64x2_t test_vreinterpretq_f64_u8(uint8x16_t a) {
   return vreinterpretq_f64_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f64_u16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x double>
+// CHECK:   ret <2 x double> [[TMP0]]
 float64x2_t test_vreinterpretq_f64_u16(uint16x8_t a) {
   return vreinterpretq_f64_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f64_u32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x double>
+// CHECK:   ret <2 x double> [[TMP0]]
 float64x2_t test_vreinterpretq_f64_u32(uint32x4_t a) {
   return vreinterpretq_f64_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f64_u64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <2 x double>
+// CHECK:   ret <2 x double> [[TMP0]]
 float64x2_t test_vreinterpretq_f64_u64(uint64x2_t a) {
   return vreinterpretq_f64_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f64_f16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_f16(<8 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x double>
+// CHECK:   ret <2 x double> [[TMP0]]
 float64x2_t test_vreinterpretq_f64_f16(float16x8_t a) {
   return vreinterpretq_f64_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f64_f32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x double>
+// CHECK:   ret <2 x double> [[TMP0]]
 float64x2_t test_vreinterpretq_f64_f32(float32x4_t a) {
   return vreinterpretq_f64_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f64_p8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_p8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x double>
+// CHECK:   ret <2 x double> [[TMP0]]
 float64x2_t test_vreinterpretq_f64_p8(poly8x16_t a) {
   return vreinterpretq_f64_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f64_p16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_p16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x double>
+// CHECK:   ret <2 x double> [[TMP0]]
 float64x2_t test_vreinterpretq_f64_p16(poly16x8_t a) {
   return vreinterpretq_f64_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f64_p64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_p64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <2 x double>
+// CHECK:   ret <2 x double> [[TMP0]]
 float64x2_t test_vreinterpretq_f64_p64(poly64x2_t a) {
   return vreinterpretq_f64_p64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_s8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s8(<16 x i8> %a) #0 {
+// CHECK:   ret <16 x i8> %a
 poly8x16_t test_vreinterpretq_p8_s8(int8x16_t a) {
   return vreinterpretq_p8_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_s16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 poly8x16_t test_vreinterpretq_p8_s16(int16x8_t a) {
   return vreinterpretq_p8_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_s32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 poly8x16_t test_vreinterpretq_p8_s32(int32x4_t a) {
   return vreinterpretq_p8_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_s64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 poly8x16_t test_vreinterpretq_p8_s64(int64x2_t a) {
   return vreinterpretq_p8_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_u8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u8(<16 x i8> %a) #0 {
+// CHECK:   ret <16 x i8> %a
 poly8x16_t test_vreinterpretq_p8_u8(uint8x16_t a) {
   return vreinterpretq_p8_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_u16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 poly8x16_t test_vreinterpretq_p8_u16(uint16x8_t a) {
   return vreinterpretq_p8_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_u32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 poly8x16_t test_vreinterpretq_p8_u32(uint32x4_t a) {
   return vreinterpretq_p8_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_u64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 poly8x16_t test_vreinterpretq_p8_u64(uint64x2_t a) {
   return vreinterpretq_p8_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_f16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_f16(<8 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 poly8x16_t test_vreinterpretq_p8_f16(float16x8_t a) {
   return vreinterpretq_p8_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_f32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 poly8x16_t test_vreinterpretq_p8_f32(float32x4_t a) {
   return vreinterpretq_p8_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_f64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 poly8x16_t test_vreinterpretq_p8_f64(float64x2_t a) {
   return vreinterpretq_p8_f64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_p16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_p16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 poly8x16_t test_vreinterpretq_p8_p16(poly16x8_t a) {
   return vreinterpretq_p8_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_p64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_p64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 poly8x16_t test_vreinterpretq_p8_p64(poly64x2_t a) {
   return vreinterpretq_p8_p64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_s8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 poly16x8_t test_vreinterpretq_p16_s8(int8x16_t a) {
   return vreinterpretq_p16_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_s16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s16(<8 x i16> %a) #0 {
+// CHECK:   ret <8 x i16> %a
 poly16x8_t test_vreinterpretq_p16_s16(int16x8_t a) {
   return vreinterpretq_p16_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_s32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 poly16x8_t test_vreinterpretq_p16_s32(int32x4_t a) {
   return vreinterpretq_p16_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_s64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 poly16x8_t test_vreinterpretq_p16_s64(int64x2_t a) {
   return vreinterpretq_p16_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_u8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 poly16x8_t test_vreinterpretq_p16_u8(uint8x16_t a) {
   return vreinterpretq_p16_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_u16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u16(<8 x i16> %a) #0 {
+// CHECK:   ret <8 x i16> %a
 poly16x8_t test_vreinterpretq_p16_u16(uint16x8_t a) {
   return vreinterpretq_p16_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_u32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 poly16x8_t test_vreinterpretq_p16_u32(uint32x4_t a) {
   return vreinterpretq_p16_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_u64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 poly16x8_t test_vreinterpretq_p16_u64(uint64x2_t a) {
   return vreinterpretq_p16_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_f16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_f16(<8 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 poly16x8_t test_vreinterpretq_p16_f16(float16x8_t a) {
   return vreinterpretq_p16_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_f32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 poly16x8_t test_vreinterpretq_p16_f32(float32x4_t a) {
   return vreinterpretq_p16_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_f64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 poly16x8_t test_vreinterpretq_p16_f64(float64x2_t a) {
   return vreinterpretq_p16_f64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_p8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_p8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 poly16x8_t test_vreinterpretq_p16_p8(poly8x16_t a) {
   return vreinterpretq_p16_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_p64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_p64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 poly16x8_t test_vreinterpretq_p16_p64(poly64x2_t a) {
   return vreinterpretq_p16_p64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p64_s8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_s8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 poly64x2_t test_vreinterpretq_p64_s8(int8x16_t a) {
   return vreinterpretq_p64_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p64_s16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 poly64x2_t test_vreinterpretq_p64_s16(int16x8_t a) {
   return vreinterpretq_p64_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p64_s32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 poly64x2_t test_vreinterpretq_p64_s32(int32x4_t a) {
   return vreinterpretq_p64_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p64_s64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_s64(<2 x i64> %a) #0 {
+// CHECK:   ret <2 x i64> %a
 poly64x2_t test_vreinterpretq_p64_s64(int64x2_t a) {
   return vreinterpretq_p64_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p64_u8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_u8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 poly64x2_t test_vreinterpretq_p64_u8(uint8x16_t a) {
   return vreinterpretq_p64_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p64_u16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 poly64x2_t test_vreinterpretq_p64_u16(uint16x8_t a) {
   return vreinterpretq_p64_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p64_u32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 poly64x2_t test_vreinterpretq_p64_u32(uint32x4_t a) {
   return vreinterpretq_p64_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p64_u64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_u64(<2 x i64> %a) #0 {
+// CHECK:   ret <2 x i64> %a
 poly64x2_t test_vreinterpretq_p64_u64(uint64x2_t a) {
   return vreinterpretq_p64_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p64_f16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_f16(<8 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 poly64x2_t test_vreinterpretq_p64_f16(float16x8_t a) {
   return vreinterpretq_p64_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p64_f32:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 poly64x2_t test_vreinterpretq_p64_f32(float32x4_t a) {
   return vreinterpretq_p64_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p64_f64:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 poly64x2_t test_vreinterpretq_p64_f64(float64x2_t a) {
   return vreinterpretq_p64_f64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p64_p8:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_p8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 poly64x2_t test_vreinterpretq_p64_p8(poly8x16_t a) {
   return vreinterpretq_p64_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p64_p16:
-// CHECK-NEXT: ret
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_p16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 poly64x2_t test_vreinterpretq_p64_p16(poly16x8_t a) {
   return vreinterpretq_p64_p16(a);
 }
 
+// CHECK-LABEL: define float @test_vabds_f32(float %a, float %b) #0 {
+// CHECK:   [[VABDS_F32_I:%.*]] = call float @llvm.aarch64.sisd.fabd.f32(float %a, float %b) #4
+// CHECK:   ret float [[VABDS_F32_I]]
 float32_t test_vabds_f32(float32_t a, float32_t b) {
-// CHECK-LABEL: test_vabds_f32
-// CHECK: fabd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
   return vabds_f32(a, b);
 }
 
+// CHECK-LABEL: define double @test_vabdd_f64(double %a, double %b) #0 {
+// CHECK:   [[VABDD_F64_I:%.*]] = call double @llvm.aarch64.sisd.fabd.f64(double %a, double %b) #4
+// CHECK:   ret double [[VABDD_F64_I]]
 float64_t test_vabdd_f64(float64_t a, float64_t b) {
-// CHECK-LABEL: test_vabdd_f64
-// CHECK: fabd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
   return vabdd_f64(a, b);
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vuqadd_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VUQADD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VUQADD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VUQADD2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.suqadd.v1i64(<1 x i64> [[VUQADD_I]], <1 x i64> [[VUQADD1_I]]) #4
+// CHECK:   ret <1 x i64> [[VUQADD2_I]]
 int64x1_t test_vuqadd_s64(int64x1_t a, uint64x1_t b) {
-  // CHECK-LABEL: test_vuqadd_s64
   return vuqadd_s64(a, b);
-  // CHECK: suqadd d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vsqadd_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VSQADD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VSQADD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VSQADD2_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.usqadd.v1i64(<1 x i64> [[VSQADD_I]], <1 x i64> [[VSQADD1_I]]) #4
+// CHECK:   ret <1 x i64> [[VSQADD2_I]]
 uint64x1_t test_vsqadd_u64(uint64x1_t a, int64x1_t b) {
-  // CHECK-LABEL: test_vsqadd_u64
   return vsqadd_u64(a, b);
-  // CHECK: usqadd d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vsqadd_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VSQADD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.usqadd.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VSQADD_I]]
 uint8x8_t test_vsqadd_u8(uint8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vsqadd_u8
   return vsqadd_u8(a, b);
-  // CHECK: usqadd {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vsqaddq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VSQADD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.usqadd.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VSQADD_I]]
 uint8x16_t test_vsqaddq_u8(uint8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vsqaddq_u8
   return vsqaddq_u8(a, b);
-  // CHECK: usqadd {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vsqadd_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VSQADD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VSQADD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VSQADD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.usqadd.v4i16(<4 x i16> [[VSQADD_I]], <4 x i16> [[VSQADD1_I]]) #4
+// CHECK:   ret <4 x i16> [[VSQADD2_I]]
 uint16x4_t test_vsqadd_u16(uint16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vsqadd_u16
   return vsqadd_u16(a, b);
-  // CHECK: usqadd {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vsqaddq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VSQADD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VSQADD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VSQADD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.usqadd.v8i16(<8 x i16> [[VSQADD_I]], <8 x i16> [[VSQADD1_I]]) #4
+// CHECK:   ret <8 x i16> [[VSQADD2_I]]
 uint16x8_t test_vsqaddq_u16(uint16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vsqaddq_u16
   return vsqaddq_u16(a, b);
-  // CHECK: usqadd {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vsqadd_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VSQADD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VSQADD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VSQADD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.usqadd.v2i32(<2 x i32> [[VSQADD_I]], <2 x i32> [[VSQADD1_I]]) #4
+// CHECK:   ret <2 x i32> [[VSQADD2_I]]
 uint32x2_t test_vsqadd_u32(uint32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vsqadd_u32
   return vsqadd_u32(a, b);
-  // CHECK: usqadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vsqaddq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VSQADD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VSQADD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VSQADD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.usqadd.v4i32(<4 x i32> [[VSQADD_I]], <4 x i32> [[VSQADD1_I]]) #4
+// CHECK:   ret <4 x i32> [[VSQADD2_I]]
 uint32x4_t test_vsqaddq_u32(uint32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vsqaddq_u32
   return vsqaddq_u32(a, b);
-  // CHECK: usqadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vsqaddq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VSQADD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VSQADD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSQADD2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.usqadd.v2i64(<2 x i64> [[VSQADD_I]], <2 x i64> [[VSQADD1_I]]) #4
+// CHECK:   ret <2 x i64> [[VSQADD2_I]]
 uint64x2_t test_vsqaddq_u64(uint64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vsqaddq_u64
   return vsqaddq_u64(a, b);
-  // CHECK: usqadd {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vabs_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[VABS_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VABS1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.abs.v1i64(<1 x i64> [[VABS_I]]) #4
+// CHECK:   ret <1 x i64> [[VABS1_I]]
 int64x1_t test_vabs_s64(int64x1_t a) {
-  // CHECK-LABEL: test_vabs_s64
   return vabs_s64(a);
-  // CHECK: abs d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vqabs_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[VQABS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VQABS_V1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqabs.v1i64(<1 x i64> [[VQABS_V_I]]) #4
+// CHECK:   [[VQABS_V2_I:%.*]] = bitcast <1 x i64> [[VQABS_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQABS_V2_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP1]]
 int64x1_t test_vqabs_s64(int64x1_t a) {
-  // CHECK-LABEL: test_vqabs_s64
   return vqabs_s64(a);
-  // CHECK: sqabs d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vqneg_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[VQNEG_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VQNEG_V1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.sqneg.v1i64(<1 x i64> [[VQNEG_V_I]]) #4
+// CHECK:   [[VQNEG_V2_I:%.*]] = bitcast <1 x i64> [[VQNEG_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQNEG_V2_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP1]]
 int64x1_t test_vqneg_s64(int64x1_t a) {
-  // CHECK-LABEL: test_vqneg_s64
   return vqneg_s64(a);
-  // CHECK: sqneg d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vneg_s64(<1 x i64> %a) #0 {
+// CHECK:   [[SUB_I:%.*]] = sub <1 x i64> zeroinitializer, %a
+// CHECK:   ret <1 x i64> [[SUB_I]]
 int64x1_t test_vneg_s64(int64x1_t a) {
-  // CHECK-LABEL: test_vneg_s64
   return vneg_s64(a);
-  // CHECK: neg d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: define float @test_vaddv_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VADDV_F32_I:%.*]] = call float @llvm.aarch64.neon.faddv.f32.v2f32(<2 x float> [[TMP1]]) #4
+// CHECK:   ret float [[VADDV_F32_I]]
 float32_t test_vaddv_f32(float32x2_t a) {
-  // CHECK-LABEL: test_vaddv_f32
   return vaddv_f32(a);
-  // CHECK: faddp {{s[0-9]+}}, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define float @test_vaddvq_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VADDVQ_F32_I:%.*]] = call float @llvm.aarch64.neon.faddv.f32.v4f32(<4 x float> [[TMP1]]) #4
+// CHECK:   ret float [[VADDVQ_F32_I]]
 float32_t test_vaddvq_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vaddvq_f32
   return vaddvq_f32(a);
-  // CHECK: faddp {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK: faddp {{s[0-9]+}}, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define double @test_vaddvq_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VADDVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.faddv.f64.v2f64(<2 x double> [[TMP1]]) #4
+// CHECK:   ret double [[VADDVQ_F64_I]]
 float64_t test_vaddvq_f64(float64x2_t a) {
-  // CHECK-LABEL: test_vaddvq_f64
   return vaddvq_f64(a);
-  // CHECK: faddp {{d[0-9]+}}, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define float @test_vmaxv_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VMAXV_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxv.f32.v2f32(<2 x float> [[TMP1]]) #4
+// CHECK:   ret float [[VMAXV_F32_I]]
 float32_t test_vmaxv_f32(float32x2_t a) {
-  // CHECK-LABEL: test_vmaxv_f32
   return vmaxv_f32(a);
-  // CHECK: fmaxp {{s[0-9]+}}, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define double @test_vmaxvq_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VMAXVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.fmaxv.f64.v2f64(<2 x double> [[TMP1]]) #4
+// CHECK:   ret double [[VMAXVQ_F64_I]]
 float64_t test_vmaxvq_f64(float64x2_t a) {
-  // CHECK-LABEL: test_vmaxvq_f64
   return vmaxvq_f64(a);
-  // CHECK: fmaxp {{d[0-9]+}}, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define float @test_vminv_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VMINV_F32_I:%.*]] = call float @llvm.aarch64.neon.fminv.f32.v2f32(<2 x float> [[TMP1]]) #4
+// CHECK:   ret float [[VMINV_F32_I]]
 float32_t test_vminv_f32(float32x2_t a) {
-  // CHECK-LABEL: test_vminv_f32
   return vminv_f32(a);
-  // CHECK: fminp {{s[0-9]+}}, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define double @test_vminvq_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VMINVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.fminv.f64.v2f64(<2 x double> [[TMP1]]) #4
+// CHECK:   ret double [[VMINVQ_F64_I]]
 float64_t test_vminvq_f64(float64x2_t a) {
-  // CHECK-LABEL: test_vminvq_f64
   return vminvq_f64(a);
-  // CHECK: fminp {{d[0-9]+}}, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define double @test_vmaxnmvq_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VMAXNMVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.fmaxnmv.f64.v2f64(<2 x double> [[TMP1]]) #4
+// CHECK:   ret double [[VMAXNMVQ_F64_I]]
 float64_t test_vmaxnmvq_f64(float64x2_t a) {
-  // CHECK-LABEL: test_vmaxnmvq_f64
   return vmaxnmvq_f64(a);
-  // CHECK: fmaxnmp {{d[0-9]+}}, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define float @test_vmaxnmv_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VMAXNMV_F32_I:%.*]] = call float @llvm.aarch64.neon.fmaxnmv.f32.v2f32(<2 x float> [[TMP1]]) #4
+// CHECK:   ret float [[VMAXNMV_F32_I]]
 float32_t test_vmaxnmv_f32(float32x2_t a) {
-  // CHECK-LABEL: test_vmaxnmv_f32
   return vmaxnmv_f32(a);
-  // CHECK: fmaxnmp {{s[0-9]+}}, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define double @test_vminnmvq_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VMINNMVQ_F64_I:%.*]] = call double @llvm.aarch64.neon.fminnmv.f64.v2f64(<2 x double> [[TMP1]]) #4
+// CHECK:   ret double [[VMINNMVQ_F64_I]]
 float64_t test_vminnmvq_f64(float64x2_t a) {
-  // CHECK-LABEL: test_vminnmvq_f64
   return vminnmvq_f64(a);
-  // CHECK: fminnmp {{d[0-9]+}}, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define float @test_vminnmv_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VMINNMV_F32_I:%.*]] = call float @llvm.aarch64.neon.fminnmv.f32.v2f32(<2 x float> [[TMP1]]) #4
+// CHECK:   ret float [[VMINNMV_F32_I]]
 float32_t test_vminnmv_f32(float32x2_t a) {
-  // CHECK-LABEL: test_vminnmv_f32
   return vminnmv_f32(a);
-  // CHECK: fminnmp {{s[0-9]+}}, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vpaddq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VPADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VPADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VPADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> [[VPADDQ_V_I]], <2 x i64> [[VPADDQ_V1_I]]) #4
+// CHECK:   [[VPADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VPADDQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VPADDQ_V3_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 int64x2_t test_vpaddq_s64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vpaddq_s64
   return vpaddq_s64(a, b);
-  // CHECK: addp {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vpaddq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VPADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VPADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VPADDQ_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> [[VPADDQ_V_I]], <2 x i64> [[VPADDQ_V1_I]]) #4
+// CHECK:   [[VPADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VPADDQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VPADDQ_V3_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 uint64x2_t test_vpaddq_u64(uint64x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vpaddq_u64
   return vpaddq_u64(a, b);
-  // CHECK: addp {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define i64 @test_vpaddd_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VPADDD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> [[TMP1]]) #4
+// CHECK:   ret i64 [[VPADDD_U64_I]]
 uint64_t test_vpaddd_u64(uint64x2_t a) {
-  // CHECK-LABEL: test_vpaddd_u64
   return vpaddd_u64(a);
-  // CHECK: addp {{d[0-9]+}}, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define i64 @test_vaddvq_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VADDVQ_S64_I:%.*]] = call i64 @llvm.aarch64.neon.saddv.i64.v2i64(<2 x i64> [[TMP1]]) #4
+// CHECK:   ret i64 [[VADDVQ_S64_I]]
 int64_t test_vaddvq_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vaddvq_s64
   return vaddvq_s64(a);
-  // CHECK: addp {{d[0-9]+}}, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define i64 @test_vaddvq_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VADDVQ_U64_I:%.*]] = call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> [[TMP1]]) #4
+// CHECK:   ret i64 [[VADDVQ_U64_I]]
 uint64_t test_vaddvq_u64(uint64x2_t a) {
-  // CHECK-LABEL: test_vaddvq_u64
   return vaddvq_u64(a);
-  // CHECK: addp {{d[0-9]+}}, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <1 x double> @test_vadd_f64(<1 x double> %a, <1 x double> %b) #0 {
+// CHECK:   [[ADD_I:%.*]] = fadd <1 x double> %a, %b
+// CHECK:   ret <1 x double> [[ADD_I]]
 float64x1_t test_vadd_f64(float64x1_t a, float64x1_t b) {
-  // CHECK-LABEL: test_vadd_f64
   return vadd_f64(a, b);
-  // CHECK: fadd d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: define <1 x double> @test_vmul_f64(<1 x double> %a, <1 x double> %b) #0 {
+// CHECK:   [[MUL_I:%.*]] = fmul <1 x double> %a, %b
+// CHECK:   ret <1 x double> [[MUL_I]]
 float64x1_t test_vmul_f64(float64x1_t a, float64x1_t b) {
-  // CHECK-LABEL: test_vmul_f64
   return vmul_f64(a, b);
-  // CHECK: fmul d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: define <1 x double> @test_vdiv_f64(<1 x double> %a, <1 x double> %b) #0 {
+// CHECK:   [[DIV_I:%.*]] = fdiv <1 x double> %a, %b
+// CHECK:   ret <1 x double> [[DIV_I]]
 float64x1_t test_vdiv_f64(float64x1_t a, float64x1_t b) {
-  // CHECK-LABEL: test_vdiv_f64
   return vdiv_f64(a, b);
-  // CHECK: fdiv d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: define <1 x double> @test_vmla_f64(<1 x double> %a, <1 x double> %b, <1 x double> %c) #0 {
+// CHECK:   [[MUL_I:%.*]] = fmul <1 x double> %b, %c
+// CHECK:   [[ADD_I:%.*]] = fadd <1 x double> %a, [[MUL_I]]
+// CHECK:   ret <1 x double> [[ADD_I]]
 float64x1_t test_vmla_f64(float64x1_t a, float64x1_t b, float64x1_t c) {
-  // CHECK-LABEL: test_vmla_f64
   return vmla_f64(a, b, c);
-  // CHECK: fmadd d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: define <1 x double> @test_vmls_f64(<1 x double> %a, <1 x double> %b, <1 x double> %c) #0 {
+// CHECK:   [[MUL_I:%.*]] = fmul <1 x double> %b, %c
+// CHECK:   [[SUB_I:%.*]] = fsub <1 x double> %a, [[MUL_I]]
+// CHECK:   ret <1 x double> [[SUB_I]]
 float64x1_t test_vmls_f64(float64x1_t a, float64x1_t b, float64x1_t c) {
-  // CHECK-LABEL: test_vmls_f64
   return vmls_f64(a, b, c);
-  // CHECK: fmsub d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: define <1 x double> @test_vfma_f64(<1 x double> %a, <1 x double> %b, <1 x double> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <1 x double> %c to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
+// CHECK:   [[TMP6:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[TMP4]], <1 x double> [[TMP5]], <1 x double> [[TMP3]]) #4
+// CHECK:   ret <1 x double> [[TMP6]]
 float64x1_t test_vfma_f64(float64x1_t a, float64x1_t b, float64x1_t c) {
-  // CHECK-LABEL: test_vfma_f64
   return vfma_f64(a, b, c);
-  // CHECK: fmadd d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: define <1 x double> @test_vfms_f64(<1 x double> %a, <1 x double> %b, <1 x double> %c) #0 {
+// CHECK:   [[SUB_I:%.*]] = fsub <1 x double> <double -0.000000e+00>, %b
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> [[SUB_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <1 x double> %c to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
+// CHECK:   [[TMP6:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[TMP4]], <1 x double> [[TMP5]], <1 x double> [[TMP3]]) #4
+// CHECK:   ret <1 x double> [[TMP6]]
 float64x1_t test_vfms_f64(float64x1_t a, float64x1_t b, float64x1_t c) {
-  // CHECK-LABEL: test_vfms_f64
   return vfms_f64(a, b, c);
-  // CHECK: fmsub d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: define <1 x double> @test_vsub_f64(<1 x double> %a, <1 x double> %b) #0 {
+// CHECK:   [[SUB_I:%.*]] = fsub <1 x double> %a, %b
+// CHECK:   ret <1 x double> [[SUB_I]]
 float64x1_t test_vsub_f64(float64x1_t a, float64x1_t b) {
-  // CHECK-LABEL: test_vsub_f64
   return vsub_f64(a, b);
-  // CHECK: fsub d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: define <1 x double> @test_vabd_f64(<1 x double> %a, <1 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
+// CHECK:   [[VABD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[VABD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK:   [[VABD2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fabd.v1f64(<1 x double> [[VABD_I]], <1 x double> [[VABD1_I]]) #4
+// CHECK:   ret <1 x double> [[VABD2_I]]
 float64x1_t test_vabd_f64(float64x1_t a, float64x1_t b) {
-  // CHECK-LABEL: test_vabd_f64
   return vabd_f64(a, b);
-  // CHECK: fabd d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: define <1 x double> @test_vmax_f64(<1 x double> %a, <1 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
+// CHECK:   [[VMAX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[VMAX1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK:   [[VMAX2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fmax.v1f64(<1 x double> [[VMAX_I]], <1 x double> [[VMAX1_I]]) #4
+// CHECK:   ret <1 x double> [[VMAX2_I]]
 float64x1_t test_vmax_f64(float64x1_t a, float64x1_t b) {
-// CHECK-LABEL: test_vmax_f64
   return vmax_f64(a, b);
-// CHECK: fmax d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: define <1 x double> @test_vmin_f64(<1 x double> %a, <1 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
+// CHECK:   [[VMIN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[VMIN1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK:   [[VMIN2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fmin.v1f64(<1 x double> [[VMIN_I]], <1 x double> [[VMIN1_I]]) #4
+// CHECK:   ret <1 x double> [[VMIN2_I]]
 float64x1_t test_vmin_f64(float64x1_t a, float64x1_t b) {
-// CHECK-LABEL: test_vmin_f64
   return vmin_f64(a, b);
-// CHECK: fmin d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: define <1 x double> @test_vmaxnm_f64(<1 x double> %a, <1 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
+// CHECK:   [[VMAXNM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[VMAXNM1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK:   [[VMAXNM2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fmaxnm.v1f64(<1 x double> [[VMAXNM_I]], <1 x double> [[VMAXNM1_I]]) #4
+// CHECK:   ret <1 x double> [[VMAXNM2_I]]
 float64x1_t test_vmaxnm_f64(float64x1_t a, float64x1_t b) {
-// CHECK-LABEL: test_vmaxnm_f64
   return vmaxnm_f64(a, b);
-// CHECK: fmaxnm d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: define <1 x double> @test_vminnm_f64(<1 x double> %a, <1 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
+// CHECK:   [[VMINNM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[VMINNM1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK:   [[VMINNM2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.fminnm.v1f64(<1 x double> [[VMINNM_I]], <1 x double> [[VMINNM1_I]]) #4
+// CHECK:   ret <1 x double> [[VMINNM2_I]]
 float64x1_t test_vminnm_f64(float64x1_t a, float64x1_t b) {
-// CHECK-LABEL: test_vminnm_f64
   return vminnm_f64(a, b);
-// CHECK: fminnm d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: define <1 x double> @test_vabs_f64(<1 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[VABS_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[VABS1_I:%.*]] = call <1 x double> @llvm.fabs.v1f64(<1 x double> [[VABS_I]]) #4
+// CHECK:   ret <1 x double> [[VABS1_I]]
 float64x1_t test_vabs_f64(float64x1_t a) {
-  // CHECK-LABEL: test_vabs_f64
   return vabs_f64(a);
-  // CHECK: fabs d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: define <1 x double> @test_vneg_f64(<1 x double> %a) #0 {
+// CHECK:   [[SUB_I:%.*]] = fsub <1 x double> <double -0.000000e+00>, %a
+// CHECK:   ret <1 x double> [[SUB_I]]
 float64x1_t test_vneg_f64(float64x1_t a) {
-  // CHECK-LABEL: test_vneg_f64
   return vneg_f64(a);
-  // CHECK: fneg d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vcvt_s64_f64(<1 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[TMP2:%.*]] = fptosi <1 x double> [[TMP1]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP2]]
 int64x1_t test_vcvt_s64_f64(float64x1_t a) {
-  // CHECK-LABEL: test_vcvt_s64_f64
   return vcvt_s64_f64(a);
-  // CHECK: fcvtzs {{[xd][0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vcvt_u64_f64(<1 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[TMP2:%.*]] = fptoui <1 x double> [[TMP1]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP2]]
 uint64x1_t test_vcvt_u64_f64(float64x1_t a) {
-  // CHECK-LABEL: test_vcvt_u64_f64
   return vcvt_u64_f64(a);
-  // CHECK: fcvtzu {{[xd][0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vcvtn_s64_f64(<1 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[VCVTN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[VCVTN1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtns.v1i64.v1f64(<1 x double> [[VCVTN_I]]) #4
+// CHECK:   ret <1 x i64> [[VCVTN1_I]]
 int64x1_t test_vcvtn_s64_f64(float64x1_t a) {
-  // CHECK-LABEL: test_vcvtn_s64_f64
   return vcvtn_s64_f64(a);
-  // CHECK: fcvtns d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vcvtn_u64_f64(<1 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[VCVTN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[VCVTN1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtnu.v1i64.v1f64(<1 x double> [[VCVTN_I]]) #4
+// CHECK:   ret <1 x i64> [[VCVTN1_I]]
 uint64x1_t test_vcvtn_u64_f64(float64x1_t a) {
-  // CHECK-LABEL: test_vcvtn_u64_f64
   return vcvtn_u64_f64(a);
-  // CHECK: fcvtnu d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vcvtp_s64_f64(<1 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[VCVTP_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[VCVTP1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtps.v1i64.v1f64(<1 x double> [[VCVTP_I]]) #4
+// CHECK:   ret <1 x i64> [[VCVTP1_I]]
 int64x1_t test_vcvtp_s64_f64(float64x1_t a) {
-  // CHECK-LABEL: test_vcvtp_s64_f64
   return vcvtp_s64_f64(a);
-  // CHECK: fcvtps d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vcvtp_u64_f64(<1 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[VCVTP_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[VCVTP1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtpu.v1i64.v1f64(<1 x double> [[VCVTP_I]]) #4
+// CHECK:   ret <1 x i64> [[VCVTP1_I]]
 uint64x1_t test_vcvtp_u64_f64(float64x1_t a) {
-  // CHECK-LABEL: test_vcvtp_u64_f64
   return vcvtp_u64_f64(a);
-  // CHECK: fcvtpu d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vcvtm_s64_f64(<1 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[VCVTM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[VCVTM1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtms.v1i64.v1f64(<1 x double> [[VCVTM_I]]) #4
+// CHECK:   ret <1 x i64> [[VCVTM1_I]]
 int64x1_t test_vcvtm_s64_f64(float64x1_t a) {
-  // CHECK-LABEL: test_vcvtm_s64_f64
   return vcvtm_s64_f64(a);
-  // CHECK: fcvtms d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vcvtm_u64_f64(<1 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[VCVTM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[VCVTM1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtmu.v1i64.v1f64(<1 x double> [[VCVTM_I]]) #4
+// CHECK:   ret <1 x i64> [[VCVTM1_I]]
 uint64x1_t test_vcvtm_u64_f64(float64x1_t a) {
-  // CHECK-LABEL: test_vcvtm_u64_f64
   return vcvtm_u64_f64(a);
-  // CHECK: fcvtmu d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vcvta_s64_f64(<1 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[VCVTA_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[VCVTA1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtas.v1i64.v1f64(<1 x double> [[VCVTA_I]]) #4
+// CHECK:   ret <1 x i64> [[VCVTA1_I]]
 int64x1_t test_vcvta_s64_f64(float64x1_t a) {
-  // CHECK-LABEL: test_vcvta_s64_f64
   return vcvta_s64_f64(a);
-  // CHECK: fcvtas d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vcvta_u64_f64(<1 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[VCVTA_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[VCVTA1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.fcvtau.v1i64.v1f64(<1 x double> [[VCVTA_I]]) #4
+// CHECK:   ret <1 x i64> [[VCVTA1_I]]
 uint64x1_t test_vcvta_u64_f64(float64x1_t a) {
-  // CHECK-LABEL: test_vcvta_u64_f64
   return vcvta_u64_f64(a);
-  // CHECK: fcvtau d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: define <1 x double> @test_vcvt_f64_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VCVT_I:%.*]] = sitofp <1 x i64> [[TMP1]] to <1 x double>
+// CHECK:   ret <1 x double> [[VCVT_I]]
 float64x1_t test_vcvt_f64_s64(int64x1_t a) {
-  // CHECK-LABEL: test_vcvt_f64_s64
   return vcvt_f64_s64(a);
-  // CHECK: scvtf d{{[0-9]+}}, {{[xd][0-9]+}}
 }
 
+// CHECK-LABEL: define <1 x double> @test_vcvt_f64_u64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VCVT_I:%.*]] = uitofp <1 x i64> [[TMP1]] to <1 x double>
+// CHECK:   ret <1 x double> [[VCVT_I]]
 float64x1_t test_vcvt_f64_u64(uint64x1_t a) {
-  // CHECK-LABEL: test_vcvt_f64_u64
   return vcvt_f64_u64(a);
-  // CHECK: ucvtf d{{[0-9]+}}, {{[xd][0-9]+}}
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vcvt_n_s64_f64(<1 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[VCVT_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.vcvtfp2fxs.v1i64.v1f64(<1 x double> [[VCVT_N]], i32 64)
+// CHECK:   ret <1 x i64> [[VCVT_N1]]
 int64x1_t test_vcvt_n_s64_f64(float64x1_t a) {
-  // CHECK-LABEL: test_vcvt_n_s64_f64
   return vcvt_n_s64_f64(a, 64);
-  // CHECK: fcvtzs d{{[0-9]+}}, d{{[0-9]+}}, #64
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vcvt_n_u64_f64(<1 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[VCVT_N1:%.*]] = call <1 x i64> @llvm.aarch64.neon.vcvtfp2fxu.v1i64.v1f64(<1 x double> [[VCVT_N]], i32 64)
+// CHECK:   ret <1 x i64> [[VCVT_N1]]
 uint64x1_t test_vcvt_n_u64_f64(float64x1_t a) {
-  // CHECK-LABEL: test_vcvt_n_u64_f64
   return vcvt_n_u64_f64(a, 64);
-  // CHECK: fcvtzu d{{[0-9]+}}, d{{[0-9]+}}, #64
 }
 
+// CHECK-LABEL: define <1 x double> @test_vcvt_n_f64_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VCVT_N1:%.*]] = call <1 x double> @llvm.aarch64.neon.vcvtfxs2fp.v1f64.v1i64(<1 x i64> [[VCVT_N]], i32 64)
+// CHECK:   ret <1 x double> [[VCVT_N1]]
 float64x1_t test_vcvt_n_f64_s64(int64x1_t a) {
-  // CHECK-LABEL: test_vcvt_n_f64_s64
   return vcvt_n_f64_s64(a, 64);
-  // CHECK: scvtf d{{[0-9]+}}, d{{[0-9]+}}, #64
 }
 
+// CHECK-LABEL: define <1 x double> @test_vcvt_n_f64_u64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VCVT_N1:%.*]] = call <1 x double> @llvm.aarch64.neon.vcvtfxu2fp.v1f64.v1i64(<1 x i64> [[VCVT_N]], i32 64)
+// CHECK:   ret <1 x double> [[VCVT_N1]]
 float64x1_t test_vcvt_n_f64_u64(uint64x1_t a) {
-  // CHECK-LABEL: test_vcvt_n_f64_u64
   return vcvt_n_f64_u64(a, 64);
-  // CHECK: ucvtf d{{[0-9]+}}, d{{[0-9]+}}, #64
 }
 
+// CHECK-LABEL: define <1 x double> @test_vrndn_f64(<1 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[VRNDN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[VRNDN1_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frintn.v1f64(<1 x double> [[VRNDN_I]]) #4
+// CHECK:   ret <1 x double> [[VRNDN1_I]]
 float64x1_t test_vrndn_f64(float64x1_t a) {
-  // CHECK-LABEL: test_vrndn_f64
   return vrndn_f64(a);
-  // CHECK: frintn d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: define <1 x double> @test_vrnda_f64(<1 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[VRNDA_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[VRNDA1_I:%.*]] = call <1 x double> @llvm.round.v1f64(<1 x double> [[VRNDA_I]]) #4
+// CHECK:   ret <1 x double> [[VRNDA1_I]]
 float64x1_t test_vrnda_f64(float64x1_t a) {
-  // CHECK-LABEL: test_vrnda_f64
   return vrnda_f64(a);
-  // CHECK: frinta d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: define <1 x double> @test_vrndp_f64(<1 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[VRNDP_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[VRNDP1_I:%.*]] = call <1 x double> @llvm.ceil.v1f64(<1 x double> [[VRNDP_I]]) #4
+// CHECK:   ret <1 x double> [[VRNDP1_I]]
 float64x1_t test_vrndp_f64(float64x1_t a) {
-  // CHECK-LABEL: test_vrndp_f64
   return vrndp_f64(a);
-  // CHECK: frintp d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: define <1 x double> @test_vrndm_f64(<1 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[VRNDM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[VRNDM1_I:%.*]] = call <1 x double> @llvm.floor.v1f64(<1 x double> [[VRNDM_I]]) #4
+// CHECK:   ret <1 x double> [[VRNDM1_I]]
 float64x1_t test_vrndm_f64(float64x1_t a) {
-  // CHECK-LABEL: test_vrndm_f64
   return vrndm_f64(a);
-  // CHECK: frintm d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: define <1 x double> @test_vrndx_f64(<1 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[VRNDX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[VRNDX1_I:%.*]] = call <1 x double> @llvm.rint.v1f64(<1 x double> [[VRNDX_I]]) #4
+// CHECK:   ret <1 x double> [[VRNDX1_I]]
 float64x1_t test_vrndx_f64(float64x1_t a) {
-  // CHECK-LABEL: test_vrndx_f64
   return vrndx_f64(a);
-  // CHECK: frintx d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: define <1 x double> @test_vrnd_f64(<1 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[VRNDZ_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[VRNDZ1_I:%.*]] = call <1 x double> @llvm.trunc.v1f64(<1 x double> [[VRNDZ_I]]) #4
+// CHECK:   ret <1 x double> [[VRNDZ1_I]]
 float64x1_t test_vrnd_f64(float64x1_t a) {
-  // CHECK-LABEL: test_vrnd_f64
   return vrnd_f64(a);
-  // CHECK: frintz d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: define <1 x double> @test_vrndi_f64(<1 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[VRNDI_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[VRNDI1_I:%.*]] = call <1 x double> @llvm.nearbyint.v1f64(<1 x double> [[VRNDI_I]]) #4
+// CHECK:   ret <1 x double> [[VRNDI1_I]]
 float64x1_t test_vrndi_f64(float64x1_t a) {
-  // CHECK-LABEL: test_vrndi_f64
   return vrndi_f64(a);
-  // CHECK: frinti d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: define <1 x double> @test_vrsqrte_f64(<1 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[VRSQRTE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[VRSQRTE_V1_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frsqrte.v1f64(<1 x double> [[VRSQRTE_V_I]]) #4
+// CHECK:   ret <1 x double> [[VRSQRTE_V1_I]]
 float64x1_t test_vrsqrte_f64(float64x1_t a) {
-  // CHECK-LABEL: test_vrsqrte_f64
   return vrsqrte_f64(a);
-  // CHECK: frsqrte d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: define <1 x double> @test_vrecpe_f64(<1 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[VRECPE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[VRECPE_V1_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frecpe.v1f64(<1 x double> [[VRECPE_V_I]]) #4
+// CHECK:   ret <1 x double> [[VRECPE_V1_I]]
 float64x1_t test_vrecpe_f64(float64x1_t a) {
-  // CHECK-LABEL: test_vrecpe_f64
   return vrecpe_f64(a);
-  // CHECK: frecpe d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: define <1 x double> @test_vsqrt_f64(<1 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[VSQRT_I:%.*]] = call <1 x double> @llvm.sqrt.v1f64(<1 x double> [[TMP1]]) #4
+// CHECK:   ret <1 x double> [[VSQRT_I]]
 float64x1_t test_vsqrt_f64(float64x1_t a) {
-  // CHECK-LABEL: test_vsqrt_f64
   return vsqrt_f64(a);
-  // CHECK: fsqrt d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: define <1 x double> @test_vrecps_f64(<1 x double> %a, <1 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
+// CHECK:   [[VRECPS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[VRECPS_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK:   [[VRECPS_V2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frecps.v1f64(<1 x double> [[VRECPS_V_I]], <1 x double> [[VRECPS_V1_I]]) #4
+// CHECK:   [[VRECPS_V3_I:%.*]] = bitcast <1 x double> [[VRECPS_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRECPS_V3_I]] to <1 x double>
+// CHECK:   ret <1 x double> [[TMP2]]
 float64x1_t test_vrecps_f64(float64x1_t a, float64x1_t b) {
-  // CHECK-LABEL: test_vrecps_f64
   return vrecps_f64(a, b);
-  // CHECK: frecps d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: define <1 x double> @test_vrsqrts_f64(<1 x double> %a, <1 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
+// CHECK:   [[VRSQRTS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[VRSQRTS_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK:   [[VRSQRTS_V2_I:%.*]] = call <1 x double> @llvm.aarch64.neon.frsqrts.v1f64(<1 x double> [[VRSQRTS_V_I]], <1 x double> [[VRSQRTS_V1_I]]) #4
+// CHECK:   [[VRSQRTS_V3_I:%.*]] = bitcast <1 x double> [[VRSQRTS_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSQRTS_V3_I]] to <1 x double>
+// CHECK:   ret <1 x double> [[TMP2]]
 float64x1_t test_vrsqrts_f64(float64x1_t a, float64x1_t b) {
-  // CHECK-LABEL: test_vrsqrts_f64
   return vrsqrts_f64(a, b);
-  // CHECK: frsqrts d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
 }
 
+// CHECK-LABEL: define i32 @test_vminv_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMINV_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sminv.i32.v2i32(<2 x i32> [[TMP1]]) #4
+// CHECK:   ret i32 [[VMINV_S32_I]]
 int32_t test_vminv_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vminv_s32
   return vminv_s32(a);
-  // CHECK: sminp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define i32 @test_vminv_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMINV_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uminv.i32.v2i32(<2 x i32> [[TMP1]]) #4
+// CHECK:   ret i32 [[VMINV_U32_I]]
 uint32_t test_vminv_u32(uint32x2_t a) {
-  // CHECK-LABEL: test_vminv_u32
   return vminv_u32(a);
-  // CHECK: uminp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define i32 @test_vmaxv_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMAXV_S32_I:%.*]] = call i32 @llvm.aarch64.neon.smaxv.i32.v2i32(<2 x i32> [[TMP1]]) #4
+// CHECK:   ret i32 [[VMAXV_S32_I]]
 int32_t test_vmaxv_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vmaxv_s32
   return vmaxv_s32(a);
-  // CHECK: smaxp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define i32 @test_vmaxv_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMAXV_U32_I:%.*]] = call i32 @llvm.aarch64.neon.umaxv.i32.v2i32(<2 x i32> [[TMP1]]) #4
+// CHECK:   ret i32 [[VMAXV_U32_I]]
 uint32_t test_vmaxv_u32(uint32x2_t a) {
-  // CHECK-LABEL: test_vmaxv_u32
   return vmaxv_u32(a);
-  // CHECK: umaxp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define i32 @test_vaddv_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VADDV_S32_I:%.*]] = call i32 @llvm.aarch64.neon.saddv.i32.v2i32(<2 x i32> [[TMP1]]) #4
+// CHECK:   ret i32 [[VADDV_S32_I]]
 int32_t test_vaddv_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vaddv_s32
   return vaddv_s32(a);
-  // CHECK: addp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define i32 @test_vaddv_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VADDV_U32_I:%.*]] = call i32 @llvm.aarch64.neon.uaddv.i32.v2i32(<2 x i32> [[TMP1]]) #4
+// CHECK:   ret i32 [[VADDV_U32_I]]
 uint32_t test_vaddv_u32(uint32x2_t a) {
-  // CHECK-LABEL: test_vaddv_u32
   return vaddv_u32(a);
-  // CHECK: addp {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define i64 @test_vaddlv_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VADDLV_S32_I:%.*]] = call i64 @llvm.aarch64.neon.saddlv.i64.v2i32(<2 x i32> [[TMP1]]) #4
+// CHECK:   ret i64 [[VADDLV_S32_I]]
 int64_t test_vaddlv_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vaddlv_s32
   return vaddlv_s32(a);
-  // CHECK: saddlp {{v[0-9]+}}.1d, {{v[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define i64 @test_vaddlv_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VADDLV_U32_I:%.*]] = call i64 @llvm.aarch64.neon.uaddlv.i64.v2i32(<2 x i32> [[TMP1]]) #4
+// CHECK:   ret i64 [[VADDLV_U32_I]]
 uint64_t test_vaddlv_u32(uint32x2_t a) {
-  // CHECK-LABEL: test_vaddlv_u32
   return vaddlv_u32(a);
-  // CHECK: uaddlp {{v[0-9]+}}.1d, {{v[0-9]+}}.2s
 }
diff --git a/test/CodeGen/aarch64-neon-ldst-one.c b/test/CodeGen/aarch64-neon-ldst-one.c
index dc888c2cf72d7..25bd797b92768 100644
--- a/test/CodeGen/aarch64-neon-ldst-one.c
+++ b/test/CodeGen/aarch64-neon-ldst-one.c
@@ -1,2049 +1,7977 @@
-// REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN:   -ffp-contract=fast -S -O3 -o - %s | FileCheck %s
+// RUN:   -fallow-half-arguments-and-returns -emit-llvm -o - %s \
+// RUN: | opt -S -mem2reg | FileCheck %s
 
 #include <arm_neon.h>
 
+// CHECK-LABEL: define <16 x i8> @test_vld1q_dup_u8(i8* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
+// CHECK:   [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[TMP0]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
+// CHECK:   ret <16 x i8> [[LANE]]
 uint8x16_t test_vld1q_dup_u8(uint8_t  *a) {
-  // CHECK-LABEL: test_vld1q_dup_u8
   return vld1q_dup_u8(a);
-  // CHECK: ld1r {{{ *v[0-9]+.16b *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vld1q_dup_u16(i16* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
+// CHECK:   ret <8 x i16> [[LANE]]
 uint16x8_t test_vld1q_dup_u16(uint16_t  *a) {
-  // CHECK-LABEL: test_vld1q_dup_u16
   return vld1q_dup_u16(a);
-  // CHECK: ld1r {{{ *v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vld1q_dup_u32(i32* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   [[TMP2:%.*]] = load i32, i32* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK:   ret <4 x i32> [[LANE]]
 uint32x4_t test_vld1q_dup_u32(uint32_t  *a) {
-  // CHECK-LABEL: test_vld1q_dup_u32
   return vld1q_dup_u32(a);
-  // CHECK: ld1r {{{ *v[0-9]+.4s *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vld1q_dup_u64(i64* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   [[TMP2:%.*]] = load i64, i64* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK:   ret <2 x i64> [[LANE]]
 uint64x2_t test_vld1q_dup_u64(uint64_t  *a) {
-  // CHECK-LABEL: test_vld1q_dup_u64
   return vld1q_dup_u64(a);
-  // CHECK: ld1r {{{ *v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vld1q_dup_s8(i8* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
+// CHECK:   [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[TMP0]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
+// CHECK:   ret <16 x i8> [[LANE]]
 int8x16_t test_vld1q_dup_s8(int8_t  *a) {
-  // CHECK-LABEL: test_vld1q_dup_s8
   return vld1q_dup_s8(a);
-  // CHECK: ld1r {{{ *v[0-9]+.16b *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vld1q_dup_s16(i16* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
+// CHECK:   ret <8 x i16> [[LANE]]
 int16x8_t test_vld1q_dup_s16(int16_t  *a) {
-  // CHECK-LABEL: test_vld1q_dup_s16
   return vld1q_dup_s16(a);
-  // CHECK: ld1r {{{ *v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vld1q_dup_s32(i32* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   [[TMP2:%.*]] = load i32, i32* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK:   ret <4 x i32> [[LANE]]
 int32x4_t test_vld1q_dup_s32(int32_t  *a) {
-  // CHECK-LABEL: test_vld1q_dup_s32
   return vld1q_dup_s32(a);
-  // CHECK: ld1r {{{ *v[0-9]+.4s *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vld1q_dup_s64(i64* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   [[TMP2:%.*]] = load i64, i64* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK:   ret <2 x i64> [[LANE]]
 int64x2_t test_vld1q_dup_s64(int64_t  *a) {
-  // CHECK-LABEL: test_vld1q_dup_s64
   return vld1q_dup_s64(a);
-  // CHECK: ld1r {{{ *v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <8 x half> @test_vld1q_dup_f16(half* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[LANE]] to <8 x half>
+// CHECK:   ret <8 x half> [[TMP4]]
 float16x8_t test_vld1q_dup_f16(float16_t  *a) {
-  // CHECK-LABEL: test_vld1q_dup_f16
   return vld1q_dup_f16(a);
-  // CHECK: ld1r {{{ *v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vld1q_dup_f32(float* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to float*
+// CHECK:   [[TMP2:%.*]] = load float, float* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK:   ret <4 x float> [[LANE]]
 float32x4_t test_vld1q_dup_f32(float32_t  *a) {
-  // CHECK-LABEL: test_vld1q_dup_f32
   return vld1q_dup_f32(a);
-  // CHECK: ld1r {{{ *v[0-9]+.4s *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <2 x double> @test_vld1q_dup_f64(double* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to double*
+// CHECK:   [[TMP2:%.*]] = load double, double* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK:   ret <2 x double> [[LANE]]
 float64x2_t test_vld1q_dup_f64(float64_t  *a) {
-  // CHECK-LABEL: test_vld1q_dup_f64
   return vld1q_dup_f64(a);
-  // CHECK: ld1r {{{ *v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vld1q_dup_p8(i8* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
+// CHECK:   [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[TMP0]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
+// CHECK:   ret <16 x i8> [[LANE]]
 poly8x16_t test_vld1q_dup_p8(poly8_t  *a) {
-  // CHECK-LABEL: test_vld1q_dup_p8
   return vld1q_dup_p8(a);
-  // CHECK: ld1r {{{ *v[0-9]+.16b *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vld1q_dup_p16(i16* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
+// CHECK:   ret <8 x i16> [[LANE]]
 poly16x8_t test_vld1q_dup_p16(poly16_t  *a) {
-  // CHECK-LABEL: test_vld1q_dup_p16
   return vld1q_dup_p16(a);
-  // CHECK: ld1r {{{ *v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vld1q_dup_p64(i64* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   [[TMP2:%.*]] = load i64, i64* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK:   ret <2 x i64> [[LANE]]
 poly64x2_t test_vld1q_dup_p64(poly64_t  *a) {
-  // CHECK-LABEL: test_vld1q_dup_p64
   return vld1q_dup_p64(a);
-  // CHECK: ld1r {{{ *v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vld1_dup_u8(i8* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
+// CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 [[TMP0]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK:   ret <8 x i8> [[LANE]]
 uint8x8_t test_vld1_dup_u8(uint8_t  *a) {
-  // CHECK-LABEL: test_vld1_dup_u8
   return vld1_dup_u8(a);
-  // CHECK: ld1r {{{ *v[0-9]+.8b *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vld1_dup_u16(i16* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK:   ret <4 x i16> [[LANE]]
 uint16x4_t test_vld1_dup_u16(uint16_t  *a) {
-  // CHECK-LABEL: test_vld1_dup_u16
   return vld1_dup_u16(a);
-  // CHECK: ld1r {{{ *v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vld1_dup_u32(i32* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   [[TMP2:%.*]] = load i32, i32* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK:   ret <2 x i32> [[LANE]]
 uint32x2_t test_vld1_dup_u32(uint32_t  *a) {
-  // CHECK-LABEL: test_vld1_dup_u32
   return vld1_dup_u32(a);
-  // CHECK: ld1r {{{ *v[0-9]+.2s *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vld1_dup_u64(i64* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   [[TMP2:%.*]] = load i64, i64* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK:   ret <1 x i64> [[LANE]]
 uint64x1_t test_vld1_dup_u64(uint64_t  *a) {
-  // CHECK-LABEL: test_vld1_dup_u64
   return vld1_dup_u64(a);
-  // CHECK: {{ld1r { v[0-9]+.1d }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vld1_dup_s8(i8* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
+// CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 [[TMP0]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK:   ret <8 x i8> [[LANE]]
 int8x8_t test_vld1_dup_s8(int8_t  *a) {
-  // CHECK-LABEL: test_vld1_dup_s8
   return vld1_dup_s8(a);
-  // CHECK: ld1r {{{ *v[0-9]+.8b *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vld1_dup_s16(i16* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK:   ret <4 x i16> [[LANE]]
 int16x4_t test_vld1_dup_s16(int16_t  *a) {
-  // CHECK-LABEL: test_vld1_dup_s16
   return vld1_dup_s16(a);
-  // CHECK: ld1r {{{ *v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vld1_dup_s32(i32* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   [[TMP2:%.*]] = load i32, i32* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK:   ret <2 x i32> [[LANE]]
 int32x2_t test_vld1_dup_s32(int32_t  *a) {
-  // CHECK-LABEL: test_vld1_dup_s32
   return vld1_dup_s32(a);
-  // CHECK: ld1r {{{ *v[0-9]+.2s *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vld1_dup_s64(i64* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   [[TMP2:%.*]] = load i64, i64* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK:   ret <1 x i64> [[LANE]]
 int64x1_t test_vld1_dup_s64(int64_t  *a) {
-  // CHECK-LABEL: test_vld1_dup_s64
   return vld1_dup_s64(a);
-  // CHECK: {{ld1r { v[0-9]+.1d }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <4 x half> @test_vld1_dup_f16(half* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <4 x half>
+// CHECK:   ret <4 x half> [[TMP4]]
 float16x4_t test_vld1_dup_f16(float16_t  *a) {
-  // CHECK-LABEL: test_vld1_dup_f16
   return vld1_dup_f16(a);
-  // CHECK: ld1r {{{ *v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vld1_dup_f32(float* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to float*
+// CHECK:   [[TMP2:%.*]] = load float, float* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <2 x float> undef, float [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK:   ret <2 x float> [[LANE]]
 float32x2_t test_vld1_dup_f32(float32_t  *a) {
-  // CHECK-LABEL: test_vld1_dup_f32
   return vld1_dup_f32(a);
-  // CHECK: ld1r {{{ *v[0-9]+.2s *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <1 x double> @test_vld1_dup_f64(double* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to double*
+// CHECK:   [[TMP2:%.*]] = load double, double* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <1 x double> undef, double [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK:   ret <1 x double> [[LANE]]
 float64x1_t test_vld1_dup_f64(float64_t  *a) {
-  // CHECK-LABEL: test_vld1_dup_f64
   return vld1_dup_f64(a);
-  // CHECK: {{ld1r { v[0-9]+.1d }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vld1_dup_p8(i8* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
+// CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 [[TMP0]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK:   ret <8 x i8> [[LANE]]
 poly8x8_t test_vld1_dup_p8(poly8_t  *a) {
-  // CHECK-LABEL: test_vld1_dup_p8
   return vld1_dup_p8(a);
-  // CHECK: ld1r {{{ *v[0-9]+.8b *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vld1_dup_p16(i16* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK:   ret <4 x i16> [[LANE]]
 poly16x4_t test_vld1_dup_p16(poly16_t  *a) {
-  // CHECK-LABEL: test_vld1_dup_p16
   return vld1_dup_p16(a);
-  // CHECK: ld1r {{{ *v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vld1_dup_p64(i64* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   [[TMP2:%.*]] = load i64, i64* [[TMP1]]
+// CHECK:   [[TMP3:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK:   ret <1 x i64> [[LANE]]
 poly64x1_t test_vld1_dup_p64(poly64_t  *a) {
-  // CHECK-LABEL: test_vld1_dup_p64
   return vld1_dup_p64(a);
-  // CHECK: {{ld1r { v[0-9]+.1d }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint8x16x2_t @test_vld2q_dup_u8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
+// CHECK:   [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2r.v16i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint8x16x2_t [[TMP4]]
 uint8x16x2_t test_vld2q_dup_u8(uint8_t  *a) {
-  // CHECK-LABEL: test_vld2q_dup_u8
   return vld2q_dup_u8(a);
-  // CHECK: ld2r {{{ *v[0-9]+.16b, v[0-9]+.16b *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint16x8x2_t @test_vld2q_dup_u16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2r.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint16x8x2_t [[TMP6]]
 uint16x8x2_t test_vld2q_dup_u16(uint16_t  *a) {
-  // CHECK-LABEL: test_vld2q_dup_u16
   return vld2q_dup_u16(a);
-  // CHECK: ld2r {{{ *v[0-9]+.8h, v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint32x4x2_t @test_vld2q_dup_u32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2r.v4i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32> } [[VLD2]], { <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint32x4x2_t [[TMP6]]
 uint32x4x2_t test_vld2q_dup_u32(uint32_t  *a) {
-  // CHECK-LABEL: test_vld2q_dup_u32
   return vld2q_dup_u32(a);
-  // CHECK: ld2r {{{ *v[0-9]+.4s, v[0-9]+.4s *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint64x2x2_t @test_vld2q_dup_u64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x2x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x2x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64> } [[VLD2]], { <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint64x2x2_t [[TMP6]]
 uint64x2x2_t test_vld2q_dup_u64(uint64_t  *a) {
-  // CHECK-LABEL: test_vld2q_dup_u64
   return vld2q_dup_u64(a);
-  // CHECK: ld2r {{{ *v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int8x16x2_t @test_vld2q_dup_s8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
+// CHECK:   [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2r.v16i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int8x16x2_t [[TMP4]]
 int8x16x2_t test_vld2q_dup_s8(int8_t  *a) {
-  // CHECK-LABEL: test_vld2q_dup_s8
   return vld2q_dup_s8(a);
-  // CHECK: ld2r {{{ *v[0-9]+.16b, v[0-9]+.16b *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int16x8x2_t @test_vld2q_dup_s16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2r.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int16x8x2_t [[TMP6]]
 int16x8x2_t test_vld2q_dup_s16(int16_t  *a) {
-  // CHECK-LABEL: test_vld2q_dup_s16
   return vld2q_dup_s16(a);
-  // CHECK: ld2r {{{ *v[0-9]+.8h, v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int32x4x2_t @test_vld2q_dup_s32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2r.v4i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32> } [[VLD2]], { <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int32x4x2_t [[TMP6]]
 int32x4x2_t test_vld2q_dup_s32(int32_t  *a) {
-  // CHECK-LABEL: test_vld2q_dup_s32
   return vld2q_dup_s32(a);
-  // CHECK: ld2r {{{ *v[0-9]+.4s, v[0-9]+.4s *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int64x2x2_t @test_vld2q_dup_s64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x2x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x2x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64> } [[VLD2]], { <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int64x2x2_t, %struct.int64x2x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int64x2x2_t [[TMP6]]
 int64x2x2_t test_vld2q_dup_s64(int64_t  *a) {
-  // CHECK-LABEL: test_vld2q_dup_s64
   return vld2q_dup_s64(a);
-  // CHECK: ld2r {{{ *v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float16x8x2_t @test_vld2q_dup_f16(half* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2r.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float16x8x2_t, %struct.float16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float16x8x2_t [[TMP6]]
 float16x8x2_t test_vld2q_dup_f16(float16_t  *a) {
-  // CHECK-LABEL: test_vld2q_dup_f16
   return vld2q_dup_f16(a);
-  // CHECK: ld2r {{{ *v[0-9]+.8h, v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float32x4x2_t @test_vld2q_dup_f32(float* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
+// CHECK:   [[VLD2:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2r.v4f32.p0f32(float* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float> }*
+// CHECK:   store { <4 x float>, <4 x float> } [[VLD2]], { <4 x float>, <4 x float> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float32x4x2_t [[TMP6]]
 float32x4x2_t test_vld2q_dup_f32(float32_t  *a) {
-  // CHECK-LABEL: test_vld2q_dup_f32
   return vld2q_dup_f32(a);
-  // CHECK: ld2r {{{ *v[0-9]+.4s, v[0-9]+.4s *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float64x2x2_t @test_vld2q_dup_f64(double* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x2x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x2x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double*
+// CHECK:   [[VLD2:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2r.v2f64.p0f64(double* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x double>, <2 x double> }*
+// CHECK:   store { <2 x double>, <2 x double> } [[VLD2]], { <2 x double>, <2 x double> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float64x2x2_t, %struct.float64x2x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float64x2x2_t [[TMP6]]
 float64x2x2_t test_vld2q_dup_f64(float64_t  *a) {
-  // CHECK-LABEL: test_vld2q_dup_f64
   return vld2q_dup_f64(a);
-  // CHECK: ld2r {{{ *v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly8x16x2_t @test_vld2q_dup_p8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
+// CHECK:   [[VLD2:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2r.v16i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8> } [[VLD2]], { <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly8x16x2_t [[TMP4]]
 poly8x16x2_t test_vld2q_dup_p8(poly8_t  *a) {
-  // CHECK-LABEL: test_vld2q_dup_p8
   return vld2q_dup_p8(a);
-  // CHECK: ld2r {{{ *v[0-9]+.16b, v[0-9]+.16b *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly16x8x2_t @test_vld2q_dup_p16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD2:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2r.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2]], { <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly16x8x2_t [[TMP6]]
 poly16x8x2_t test_vld2q_dup_p16(poly16_t  *a) {
-  // CHECK-LABEL: test_vld2q_dup_p16
   return vld2q_dup_p16(a);
-  // CHECK: ld2r {{{ *v[0-9]+.8h, v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly64x2x2_t @test_vld2q_dup_p64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x2x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x2x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2r.v2i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64> } [[VLD2]], { <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly64x2x2_t [[TMP6]]
 poly64x2x2_t test_vld2q_dup_p64(poly64_t  *a) {
-  // CHECK-LABEL: test_vld2q_dup_p64
   return vld2q_dup_p64(a);
-  // CHECK: ld2r {{{ *v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint8x8x2_t @test_vld2_dup_u8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
+// CHECK:   [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2r.v8i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint8x8x2_t [[TMP4]]
 uint8x8x2_t test_vld2_dup_u8(uint8_t  *a) {
-  // CHECK-LABEL: test_vld2_dup_u8
   return vld2_dup_u8(a);
-  // CHECK: ld2r {{{ *v[0-9]+.8b, v[0-9]+.8b *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint16x4x2_t @test_vld2_dup_u16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint16x4x2_t [[TMP6]]
 uint16x4x2_t test_vld2_dup_u16(uint16_t  *a) {
-  // CHECK-LABEL: test_vld2_dup_u16
   return vld2_dup_u16(a);
-  // CHECK: ld2r {{{ *v[0-9]+.4h, v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint32x2x2_t @test_vld2_dup_u32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD2:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2r.v2i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32> } [[VLD2]], { <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint32x2x2_t [[TMP6]]
 uint32x2x2_t test_vld2_dup_u32(uint32_t  *a) {
-  // CHECK-LABEL: test_vld2_dup_u32
   return vld2_dup_u32(a);
-  // CHECK: ld2r {{{ *v[0-9]+.2s, v[0-9]+.2s *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint64x1x2_t @test_vld2_dup_u64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2r.v1i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint64x1x2_t [[TMP6]]
 uint64x1x2_t test_vld2_dup_u64(uint64_t  *a) {
-  // CHECK-LABEL: test_vld2_dup_u64
   return vld2_dup_u64(a);
-  // CHECK: {{ld1|ld2r}} {{{ *v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int8x8x2_t @test_vld2_dup_s8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
+// CHECK:   [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2r.v8i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int8x8x2_t [[TMP4]]
 int8x8x2_t test_vld2_dup_s8(int8_t  *a) {
-  // CHECK-LABEL: test_vld2_dup_s8
   return vld2_dup_s8(a);
-  // CHECK: ld2r {{{ *v[0-9]+.8b, v[0-9]+.8b *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int16x4x2_t @test_vld2_dup_s16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int16x4x2_t [[TMP6]]
 int16x4x2_t test_vld2_dup_s16(int16_t  *a) {
-  // CHECK-LABEL: test_vld2_dup_s16
   return vld2_dup_s16(a);
-  // CHECK: ld2r {{{ *v[0-9]+.4h, v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int32x2x2_t @test_vld2_dup_s32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD2:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2r.v2i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32> } [[VLD2]], { <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int32x2x2_t [[TMP6]]
 int32x2x2_t test_vld2_dup_s32(int32_t  *a) {
-  // CHECK-LABEL: test_vld2_dup_s32
   return vld2_dup_s32(a);
-  // CHECK: ld2r {{{ *v[0-9]+.2s, v[0-9]+.2s *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int64x1x2_t @test_vld2_dup_s64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2r.v1i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int64x1x2_t, %struct.int64x1x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int64x1x2_t [[TMP6]]
 int64x1x2_t test_vld2_dup_s64(int64_t  *a) {
-  // CHECK-LABEL: test_vld2_dup_s64
   return vld2_dup_s64(a);
-  // CHECK: {{ld1|ld2r}} {{{ *v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float16x4x2_t @test_vld2_dup_f16(half* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float16x4x2_t, %struct.float16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float16x4x2_t [[TMP6]]
 float16x4x2_t test_vld2_dup_f16(float16_t  *a) {
-  // CHECK-LABEL: test_vld2_dup_f16
   return vld2_dup_f16(a);
-  // CHECK: ld2r {{{ *v[0-9]+.4h, v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float32x2x2_t @test_vld2_dup_f32(float* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
+// CHECK:   [[VLD2:%.*]] = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2r.v2f32.p0f32(float* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float> }*
+// CHECK:   store { <2 x float>, <2 x float> } [[VLD2]], { <2 x float>, <2 x float> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float32x2x2_t [[TMP6]]
 float32x2x2_t test_vld2_dup_f32(float32_t  *a) {
-  // CHECK-LABEL: test_vld2_dup_f32
   return vld2_dup_f32(a);
-  // CHECK: ld2r {{{ *v[0-9]+.2s, v[0-9]+.2s *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float64x1x2_t @test_vld2_dup_f64(double* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double*
+// CHECK:   [[VLD2:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2r.v1f64.p0f64(double* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x double>, <1 x double> }*
+// CHECK:   store { <1 x double>, <1 x double> } [[VLD2]], { <1 x double>, <1 x double> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x1x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float64x1x2_t, %struct.float64x1x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float64x1x2_t [[TMP6]]
 float64x1x2_t test_vld2_dup_f64(float64_t  *a) {
-  // CHECK-LABEL: test_vld2_dup_f64
   return vld2_dup_f64(a);
-  // CHECK: {{ld1|ld2r}} {{{ *v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly8x8x2_t @test_vld2_dup_p8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
+// CHECK:   [[VLD2:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2r.v8i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2]], { <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly8x8x2_t [[TMP4]]
 poly8x8x2_t test_vld2_dup_p8(poly8_t  *a) {
-  // CHECK-LABEL: test_vld2_dup_p8
   return vld2_dup_p8(a);
-  // CHECK: ld2r {{{ *v[0-9]+.8b, v[0-9]+.8b *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly16x4x2_t @test_vld2_dup_p16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD2:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2]], { <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly16x4x2_t [[TMP6]]
 poly16x4x2_t test_vld2_dup_p16(poly16_t  *a) {
-  // CHECK-LABEL: test_vld2_dup_p16
   return vld2_dup_p16(a);
-  // CHECK: ld2r {{{ *v[0-9]+.4h, v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly64x1x2_t @test_vld2_dup_p64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x1x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x1x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2r.v1i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x1x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly64x1x2_t [[TMP6]]
 poly64x1x2_t test_vld2_dup_p64(poly64_t  *a) {
-  // CHECK-LABEL: test_vld2_dup_p64
   return vld2_dup_p64(a);
-  // CHECK: {{ld1|ld2r}} {{{ *v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint8x16x3_t @test_vld3q_dup_u8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
+// CHECK:   [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3r.v16i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint8x16x3_t [[TMP4]]
 uint8x16x3_t test_vld3q_dup_u8(uint8_t  *a) {
-  // CHECK-LABEL: test_vld3q_dup_u8
   return vld3q_dup_u8(a);
-  // CHECK: ld3r {{{ *v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b *}}},
   // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint16x8x3_t @test_vld3q_dup_u16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3r.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint16x8x3_t [[TMP6]]
 uint16x8x3_t test_vld3q_dup_u16(uint16_t  *a) {
-  // CHECK-LABEL: test_vld3q_dup_u16
   return vld3q_dup_u16(a);
-  // CHECK: ld3r {{{ *v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h *}}},
   // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint32x4x3_t @test_vld3q_dup_u32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD3:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3r.v4i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint32x4x3_t [[TMP6]]
 uint32x4x3_t test_vld3q_dup_u32(uint32_t  *a) {
-  // CHECK-LABEL: test_vld3q_dup_u32
   return vld3q_dup_u32(a);
-  // CHECK: ld3r {{{ *v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s *}}},
   // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint64x2x3_t @test_vld3q_dup_u64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x2x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x2x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint64x2x3_t [[TMP6]]
 uint64x2x3_t test_vld3q_dup_u64(uint64_t  *a) {
-  // CHECK-LABEL: test_vld3q_dup_u64
   return vld3q_dup_u64(a);
-  // CHECK: ld3r {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}},
   // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int8x16x3_t @test_vld3q_dup_s8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
+// CHECK:   [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3r.v16i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.int8x16x3_t, %struct.int8x16x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int8x16x3_t [[TMP4]]
 int8x16x3_t test_vld3q_dup_s8(int8_t  *a) {
-  // CHECK-LABEL: test_vld3q_dup_s8
   return vld3q_dup_s8(a);
-  // CHECK: ld3r {{{ *v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b *}}},
   // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int16x8x3_t @test_vld3q_dup_s16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3r.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int16x8x3_t, %struct.int16x8x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int16x8x3_t [[TMP6]]
 int16x8x3_t test_vld3q_dup_s16(int16_t  *a) {
-  // CHECK-LABEL: test_vld3q_dup_s16
   return vld3q_dup_s16(a);
-  // CHECK: ld3r {{{ *v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h *}}},
   // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int32x4x3_t @test_vld3q_dup_s32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD3:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3r.v4i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int32x4x3_t, %struct.int32x4x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int32x4x3_t [[TMP6]]
 int32x4x3_t test_vld3q_dup_s32(int32_t  *a) {
-  // CHECK-LABEL: test_vld3q_dup_s32
   return vld3q_dup_s32(a);
-  // CHECK: ld3r {{{ *v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s *}}},
   // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int64x2x3_t @test_vld3q_dup_s64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x2x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x2x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int64x2x3_t, %struct.int64x2x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int64x2x3_t [[TMP6]]
 int64x2x3_t test_vld3q_dup_s64(int64_t  *a) {
-  // CHECK-LABEL: test_vld3q_dup_s64
   return vld3q_dup_s64(a);
-  // CHECK: ld3r {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}},
   // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float16x8x3_t @test_vld3q_dup_f16(half* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3r.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float16x8x3_t, %struct.float16x8x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float16x8x3_t [[TMP6]]
 float16x8x3_t test_vld3q_dup_f16(float16_t  *a) {
-  // CHECK-LABEL: test_vld3q_dup_f16
   return vld3q_dup_f16(a);
-  // CHECK: ld3r {{{ *v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h *}}},
   // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float32x4x3_t @test_vld3q_dup_f32(float* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
+// CHECK:   [[VLD3:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3r.v4f32.p0f32(float* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float> }*
+// CHECK:   store { <4 x float>, <4 x float>, <4 x float> } [[VLD3]], { <4 x float>, <4 x float>, <4 x float> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float32x4x3_t, %struct.float32x4x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float32x4x3_t [[TMP6]]
 float32x4x3_t test_vld3q_dup_f32(float32_t  *a) {
-  // CHECK-LABEL: test_vld3q_dup_f32
   return vld3q_dup_f32(a);
-  // CHECK: ld3r {{{ *v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s *}}},
   // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float64x2x3_t @test_vld3q_dup_f64(double* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x2x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x2x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double*
+// CHECK:   [[VLD3:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3r.v2f64.p0f64(double* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x double>, <2 x double>, <2 x double> }*
+// CHECK:   store { <2 x double>, <2 x double>, <2 x double> } [[VLD3]], { <2 x double>, <2 x double>, <2 x double> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float64x2x3_t, %struct.float64x2x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float64x2x3_t [[TMP6]]
 float64x2x3_t test_vld3q_dup_f64(float64_t  *a) {
-  // CHECK-LABEL: test_vld3q_dup_f64
   return vld3q_dup_f64(a);
-  // CHECK: ld3r {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}},
   // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly8x16x3_t @test_vld3q_dup_p8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
+// CHECK:   [[VLD3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3r.v16i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly8x16x3_t [[TMP4]]
 poly8x16x3_t test_vld3q_dup_p8(poly8_t  *a) {
-  // CHECK-LABEL: test_vld3q_dup_p8
   return vld3q_dup_p8(a);
-  // CHECK: ld3r {{{ *v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b *}}},
   // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly16x8x3_t @test_vld3q_dup_p16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD3:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3r.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly16x8x3_t [[TMP6]]
 poly16x8x3_t test_vld3q_dup_p16(poly16_t  *a) {
-  // CHECK-LABEL: test_vld3q_dup_p16
   return vld3q_dup_p16(a);
-  // CHECK: ld3r {{{ *v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h *}}},
   // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly64x2x3_t @test_vld3q_dup_p64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x2x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x2x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3r.v2i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly64x2x3_t [[TMP6]]
 poly64x2x3_t test_vld3q_dup_p64(poly64_t  *a) {
-  // CHECK-LABEL: test_vld3q_dup_p64
   return vld3q_dup_p64(a);
-  // CHECK: ld3r {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}},
   // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint8x8x3_t @test_vld3_dup_u8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
+// CHECK:   [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3r.v8i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint8x8x3_t [[TMP4]]
 uint8x8x3_t test_vld3_dup_u8(uint8_t  *a) {
-  // CHECK-LABEL: test_vld3_dup_u8
   return vld3_dup_u8(a);
-  // CHECK: ld3r {{{ *v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b *}}},
   // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint16x4x3_t @test_vld3_dup_u16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint16x4x3_t [[TMP6]]
 uint16x4x3_t test_vld3_dup_u16(uint16_t  *a) {
-  // CHECK-LABEL: test_vld3_dup_u16
   return vld3_dup_u16(a);
-  // CHECK: ld3r {{{ *v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h *}}},
   // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint32x2x3_t @test_vld3_dup_u32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD3:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3r.v2i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint32x2x3_t [[TMP6]]
 uint32x2x3_t test_vld3_dup_u32(uint32_t  *a) {
-  // CHECK-LABEL: test_vld3_dup_u32
   return vld3_dup_u32(a);
-  // CHECK: ld3r {{{ *v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s *}}},
   // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint64x1x3_t @test_vld3_dup_u64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3r.v1i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint64x1x3_t [[TMP6]]
 uint64x1x3_t test_vld3_dup_u64(uint64_t  *a) {
-  // CHECK-LABEL: test_vld3_dup_u64
   return vld3_dup_u64(a);
-  // CHECK: {{ld1|ld3r}} {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}},
   // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int8x8x3_t @test_vld3_dup_s8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
+// CHECK:   [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3r.v8i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.int8x8x3_t, %struct.int8x8x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int8x8x3_t [[TMP4]]
 int8x8x3_t test_vld3_dup_s8(int8_t  *a) {
-  // CHECK-LABEL: test_vld3_dup_s8
   return vld3_dup_s8(a);
-  // CHECK: ld3r {{{ *v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b *}}},
   // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int16x4x3_t @test_vld3_dup_s16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int16x4x3_t, %struct.int16x4x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int16x4x3_t [[TMP6]]
 int16x4x3_t test_vld3_dup_s16(int16_t  *a) {
-  // CHECK-LABEL: test_vld3_dup_s16
   return vld3_dup_s16(a);
-  // CHECK: ld3r {{{ *v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h *}}},
   // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int32x2x3_t @test_vld3_dup_s32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD3:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3r.v2i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int32x2x3_t, %struct.int32x2x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int32x2x3_t [[TMP6]]
 int32x2x3_t test_vld3_dup_s32(int32_t  *a) {
-  // CHECK-LABEL: test_vld3_dup_s32
   return vld3_dup_s32(a);
-  // CHECK: ld3r {{{ *v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s *}}},
   // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int64x1x3_t @test_vld3_dup_s64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3r.v1i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int64x1x3_t, %struct.int64x1x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int64x1x3_t [[TMP6]]
 int64x1x3_t test_vld3_dup_s64(int64_t  *a) {
-  // CHECK-LABEL: test_vld3_dup_s64
   return vld3_dup_s64(a);
-  // CHECK: {{ld1|ld3r}} {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}},
   // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float16x4x3_t @test_vld3_dup_f16(half* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float16x4x3_t, %struct.float16x4x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float16x4x3_t [[TMP6]]
 float16x4x3_t test_vld3_dup_f16(float16_t  *a) {
-  // CHECK-LABEL: test_vld3_dup_f16
   return vld3_dup_f16(a);
-  // CHECK: ld3r {{{ *v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h *}}},
   // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float32x2x3_t @test_vld3_dup_f32(float* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
+// CHECK:   [[VLD3:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3r.v2f32.p0f32(float* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float> }*
+// CHECK:   store { <2 x float>, <2 x float>, <2 x float> } [[VLD3]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float32x2x3_t, %struct.float32x2x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float32x2x3_t [[TMP6]]
 float32x2x3_t test_vld3_dup_f32(float32_t  *a) {
-  // CHECK-LABEL: test_vld3_dup_f32
   return vld3_dup_f32(a);
-  // CHECK: ld3r {{{ *v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s *}}},
   // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float64x1x3_t @test_vld3_dup_f64(double* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x1x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x1x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double*
+// CHECK:   [[VLD3:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3r.v1f64.p0f64(double* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x double>, <1 x double>, <1 x double> }*
+// CHECK:   store { <1 x double>, <1 x double>, <1 x double> } [[VLD3]], { <1 x double>, <1 x double>, <1 x double> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x1x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float64x1x3_t, %struct.float64x1x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float64x1x3_t [[TMP6]]
 float64x1x3_t test_vld3_dup_f64(float64_t  *a) {
-  // CHECK-LABEL: test_vld3_dup_f64
   return vld3_dup_f64(a);
-  // CHECK: {{ld1|ld3r}} {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}},
   // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly8x8x3_t @test_vld3_dup_p8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
+// CHECK:   [[VLD3:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3r.v8i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly8x8x3_t [[TMP4]]
 poly8x8x3_t test_vld3_dup_p8(poly8_t  *a) {
-  // CHECK-LABEL: test_vld3_dup_p8
   return vld3_dup_p8(a);
-  // CHECK: ld3r {{{ *v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b *}}},
   // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly16x4x3_t @test_vld3_dup_p16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD3:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly16x4x3_t [[TMP6]]
 poly16x4x3_t test_vld3_dup_p16(poly16_t  *a) {
-  // CHECK-LABEL: test_vld3_dup_p16
   return vld3_dup_p16(a);
-  // CHECK: ld3r {{{ *v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h *}}},
   // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly64x1x3_t @test_vld3_dup_p64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x1x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x1x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3r.v1i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x1x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly64x1x3_t [[TMP6]]
 poly64x1x3_t test_vld3_dup_p64(poly64_t  *a) {
-  // CHECK-LABEL: test_vld3_dup_p64
   return vld3_dup_p64(a);
-  // CHECK: {{ld1|ld3r}} {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}},
   // [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint8x16x4_t @test_vld4q_dup_u8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
+// CHECK:   [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4r.v16i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint8x16x4_t [[TMP4]]
 uint8x16x4_t test_vld4q_dup_u8(uint8_t  *a) {
-  // CHECK-LABEL: test_vld4q_dup_u8
   return vld4q_dup_u8(a);
-  // CHECK: ld4r {{{ *v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint16x8x4_t @test_vld4q_dup_u16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4r.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint16x8x4_t [[TMP6]]
 uint16x8x4_t test_vld4q_dup_u16(uint16_t  *a) {
-  // CHECK-LABEL: test_vld4q_dup_u16
   return vld4q_dup_u16(a);
-  // CHECK: ld4r {{{ *v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint32x4x4_t @test_vld4q_dup_u32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4r.v4i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint32x4x4_t [[TMP6]]
 uint32x4x4_t test_vld4q_dup_u32(uint32_t  *a) {
-  // CHECK-LABEL: test_vld4q_dup_u32
   return vld4q_dup_u32(a);
-  // CHECK: ld4r {{{ *v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint64x2x4_t @test_vld4q_dup_u64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x2x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x2x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint64x2x4_t [[TMP6]]
 uint64x2x4_t test_vld4q_dup_u64(uint64_t  *a) {
-  // CHECK-LABEL: test_vld4q_dup_u64
   return vld4q_dup_u64(a);
-  // CHECK: ld4r {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int8x16x4_t @test_vld4q_dup_s8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
+// CHECK:   [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4r.v16i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.int8x16x4_t, %struct.int8x16x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int8x16x4_t [[TMP4]]
 int8x16x4_t test_vld4q_dup_s8(int8_t  *a) {
-  // CHECK-LABEL: test_vld4q_dup_s8
   return vld4q_dup_s8(a);
-  // CHECK: ld4r {{{ *v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int16x8x4_t @test_vld4q_dup_s16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4r.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int16x8x4_t, %struct.int16x8x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int16x8x4_t [[TMP6]]
 int16x8x4_t test_vld4q_dup_s16(int16_t  *a) {
-  // CHECK-LABEL: test_vld4q_dup_s16
   return vld4q_dup_s16(a);
-  // CHECK: ld4r {{{ *v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int32x4x4_t @test_vld4q_dup_s32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD4:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4r.v4i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int32x4x4_t, %struct.int32x4x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int32x4x4_t [[TMP6]]
 int32x4x4_t test_vld4q_dup_s32(int32_t  *a) {
-  // CHECK-LABEL: test_vld4q_dup_s32
   return vld4q_dup_s32(a);
-  // CHECK: ld4r {{{ *v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int64x2x4_t @test_vld4q_dup_s64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x2x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x2x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int64x2x4_t, %struct.int64x2x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int64x2x4_t [[TMP6]]
 int64x2x4_t test_vld4q_dup_s64(int64_t  *a) {
-  // CHECK-LABEL: test_vld4q_dup_s64
   return vld4q_dup_s64(a);
-  // CHECK: ld4r {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float16x8x4_t @test_vld4q_dup_f16(half* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4r.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float16x8x4_t, %struct.float16x8x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float16x8x4_t [[TMP6]]
 float16x8x4_t test_vld4q_dup_f16(float16_t  *a) {
-  // CHECK-LABEL: test_vld4q_dup_f16
   return vld4q_dup_f16(a);
-  // CHECK: ld4r {{{ *v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float32x4x4_t @test_vld4q_dup_f32(float* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
+// CHECK:   [[VLD4:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4r.v4f32.p0f32(float* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float>, <4 x float> }*
+// CHECK:   store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4]], { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float32x4x4_t, %struct.float32x4x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float32x4x4_t [[TMP6]]
 float32x4x4_t test_vld4q_dup_f32(float32_t  *a) {
-  // CHECK-LABEL: test_vld4q_dup_f32
   return vld4q_dup_f32(a);
-  // CHECK: ld4r {{{ *v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float64x2x4_t @test_vld4q_dup_f64(double* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x2x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x2x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double*
+// CHECK:   [[VLD4:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4r.v2f64.p0f64(double* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x double>, <2 x double>, <2 x double>, <2 x double> }*
+// CHECK:   store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4]], { <2 x double>, <2 x double>, <2 x double>, <2 x double> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float64x2x4_t, %struct.float64x2x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float64x2x4_t [[TMP6]]
 float64x2x4_t test_vld4q_dup_f64(float64_t  *a) {
-  // CHECK-LABEL: test_vld4q_dup_f64
   return vld4q_dup_f64(a);
-  // CHECK: ld4r {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly8x16x4_t @test_vld4q_dup_p8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
+// CHECK:   [[VLD4:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4r.v16i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly8x16x4_t [[TMP4]]
 poly8x16x4_t test_vld4q_dup_p8(poly8_t  *a) {
-  // CHECK-LABEL: test_vld4q_dup_p8
   return vld4q_dup_p8(a);
-  // CHECK: ld4r {{{ *v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly16x8x4_t @test_vld4q_dup_p16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD4:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4r.v8i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly16x8x4_t [[TMP6]]
 poly16x8x4_t test_vld4q_dup_p16(poly16_t  *a) {
-  // CHECK-LABEL: test_vld4q_dup_p16
   return vld4q_dup_p16(a);
-  // CHECK: ld4r {{{ *v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly64x2x4_t @test_vld4q_dup_p64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x2x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x2x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4r.v2i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly64x2x4_t [[TMP6]]
 poly64x2x4_t test_vld4q_dup_p64(poly64_t  *a) {
-  // CHECK-LABEL: test_vld4q_dup_p64
   return vld4q_dup_p64(a);
-  // CHECK: ld4r {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint8x8x4_t @test_vld4_dup_u8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
+// CHECK:   [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4r.v8i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint8x8x4_t [[TMP4]]
 uint8x8x4_t test_vld4_dup_u8(uint8_t  *a) {
-  // CHECK-LABEL: test_vld4_dup_u8
   return vld4_dup_u8(a);
-  // CHECK: ld4r {{{ *v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint16x4x4_t @test_vld4_dup_u16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint16x4x4_t [[TMP6]]
 uint16x4x4_t test_vld4_dup_u16(uint16_t  *a) {
-  // CHECK-LABEL: test_vld4_dup_u16
   return vld4_dup_u16(a);
-  // CHECK: ld4r {{{ *v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint32x2x4_t @test_vld4_dup_u32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4r.v2i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint32x2x4_t [[TMP6]]
 uint32x2x4_t test_vld4_dup_u32(uint32_t  *a) {
-  // CHECK-LABEL: test_vld4_dup_u32
   return vld4_dup_u32(a);
-  // CHECK: ld4r {{{ *v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint64x1x4_t @test_vld4_dup_u64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4r.v1i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint64x1x4_t [[TMP6]]
 uint64x1x4_t test_vld4_dup_u64(uint64_t  *a) {
-  // CHECK-LABEL: test_vld4_dup_u64
   return vld4_dup_u64(a);
-  // CHECK: {{ld1|ld4r}} {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int8x8x4_t @test_vld4_dup_s8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
+// CHECK:   [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4r.v8i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.int8x8x4_t, %struct.int8x8x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int8x8x4_t [[TMP4]]
 int8x8x4_t test_vld4_dup_s8(int8_t  *a) {
-  // CHECK-LABEL: test_vld4_dup_s8
   return vld4_dup_s8(a);
-  // CHECK: ld4r {{{ *v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int16x4x4_t @test_vld4_dup_s16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int16x4x4_t, %struct.int16x4x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int16x4x4_t [[TMP6]]
 int16x4x4_t test_vld4_dup_s16(int16_t  *a) {
-  // CHECK-LABEL: test_vld4_dup_s16
   return vld4_dup_s16(a);
-  // CHECK: ld4r {{{ *v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int32x2x4_t @test_vld4_dup_s32(i32* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i32*
+// CHECK:   [[VLD4:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4r.v2i32.p0i32(i32* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int32x2x4_t, %struct.int32x2x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int32x2x4_t [[TMP6]]
 int32x2x4_t test_vld4_dup_s32(int32_t  *a) {
-  // CHECK-LABEL: test_vld4_dup_s32
   return vld4_dup_s32(a);
-  // CHECK: ld4r {{{ *v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int64x1x4_t @test_vld4_dup_s64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4r.v1i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.int64x1x4_t, %struct.int64x1x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int64x1x4_t [[TMP6]]
 int64x1x4_t test_vld4_dup_s64(int64_t  *a) {
-  // CHECK-LABEL: test_vld4_dup_s64
   return vld4_dup_s64(a);
-  // CHECK: {{ld1|ld4r}} {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float16x4x4_t @test_vld4_dup_f16(half* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float16x4x4_t, %struct.float16x4x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float16x4x4_t [[TMP6]]
 float16x4x4_t test_vld4_dup_f16(float16_t  *a) {
-  // CHECK-LABEL: test_vld4_dup_f16
   return vld4_dup_f16(a);
-  // CHECK: ld4r {{{ *v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float32x2x4_t @test_vld4_dup_f32(float* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to float*
+// CHECK:   [[VLD4:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4r.v2f32.p0f32(float* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float>, <2 x float> }*
+// CHECK:   store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float32x2x4_t, %struct.float32x2x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float32x2x4_t [[TMP6]]
 float32x2x4_t test_vld4_dup_f32(float32_t  *a) {
-  // CHECK-LABEL: test_vld4_dup_f32
   return vld4_dup_f32(a);
-  // CHECK: ld4r {{{ *v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s, v[0-9]+.2s *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float64x1x4_t @test_vld4_dup_f64(double* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x1x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x1x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to double*
+// CHECK:   [[VLD4:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4r.v1f64.p0f64(double* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x double>, <1 x double>, <1 x double>, <1 x double> }*
+// CHECK:   store { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4]], { <1 x double>, <1 x double>, <1 x double>, <1 x double> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float64x1x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.float64x1x4_t, %struct.float64x1x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float64x1x4_t [[TMP6]]
 float64x1x4_t test_vld4_dup_f64(float64_t  *a) {
-  // CHECK-LABEL: test_vld4_dup_f64
   return vld4_dup_f64(a);
-  // CHECK: {{ld1|ld4r}} {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly8x8x4_t @test_vld4_dup_p8(i8* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
+// CHECK:   [[VLD4:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4r.v8i8.p0i8(i8* %a)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP2]], i8* [[TMP3]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP4:%.*]] = load %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly8x8x4_t [[TMP4]]
 poly8x8x4_t test_vld4_dup_p8(poly8_t  *a) {
-  // CHECK-LABEL: test_vld4_dup_p8
   return vld4_dup_p8(a);
-  // CHECK: ld4r {{{ *v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly16x4x4_t @test_vld4_dup_p16(i16* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i16*
+// CHECK:   [[VLD4:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4r.v4i16.p0i16(i16* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly16x4x4_t [[TMP6]]
 poly16x4x4_t test_vld4_dup_p16(poly16_t  *a) {
-  // CHECK-LABEL: test_vld4_dup_p16
   return vld4_dup_p16(a);
-  // CHECK: ld4r {{{ *v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h, v[0-9]+.4h *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly64x1x4_t @test_vld4_dup_p64(i64* %a) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x1x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x1x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to i64*
+// CHECK:   [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4r.v1i64.p0i64(i64* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x1x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly64x1x4_t [[TMP6]]
 poly64x1x4_t test_vld4_dup_p64(poly64_t  *a) {
-  // CHECK-LABEL: test_vld4_dup_p64
   return vld4_dup_p64(a);
-  // CHECK: {{ld1|ld4r}} {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vld1q_lane_u8(i8* %a, <16 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
+// CHECK:   ret <16 x i8> [[VLD1_LANE]]
 uint8x16_t test_vld1q_lane_u8(uint8_t  *a, uint8x16_t b) {
-  // CHECK-LABEL: test_vld1q_lane_u8
   return vld1q_lane_u8(a, b, 15);
-  // CHECK: ld1 {{{ *v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vld1q_lane_u16(i16* %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
+// CHECK:   ret <8 x i16> [[VLD1_LANE]]
 uint16x8_t test_vld1q_lane_u16(uint16_t  *a, uint16x8_t b) {
-  // CHECK-LABEL: test_vld1q_lane_u16
   return vld1q_lane_u16(a, b, 7);
-  // CHECK: ld1 {{{ *v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vld1q_lane_u32(i32* %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   [[TMP4:%.*]] = load i32, i32* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP4]], i32 3
+// CHECK:   ret <4 x i32> [[VLD1_LANE]]
 uint32x4_t test_vld1q_lane_u32(uint32_t  *a, uint32x4_t b) {
-  // CHECK-LABEL: test_vld1q_lane_u32
   return vld1q_lane_u32(a, b, 3);
-  // CHECK: ld1 {{{ *v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vld1q_lane_u64(i64* %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   [[TMP4:%.*]] = load i64, i64* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[TMP4]], i32 1
+// CHECK:   ret <2 x i64> [[VLD1_LANE]]
 uint64x2_t test_vld1q_lane_u64(uint64_t  *a, uint64x2_t b) {
-  // CHECK-LABEL: test_vld1q_lane_u64
   return vld1q_lane_u64(a, b, 1);
-  // CHECK: ld1 {{{ *v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vld1q_lane_s8(i8* %a, <16 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
+// CHECK:   ret <16 x i8> [[VLD1_LANE]]
 int8x16_t test_vld1q_lane_s8(int8_t  *a, int8x16_t b) {
-  // CHECK-LABEL: test_vld1q_lane_s8
   return vld1q_lane_s8(a, b, 15);
-  // CHECK: ld1 {{{ *v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vld1q_lane_s16(i16* %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
+// CHECK:   ret <8 x i16> [[VLD1_LANE]]
 int16x8_t test_vld1q_lane_s16(int16_t  *a, int16x8_t b) {
-  // CHECK-LABEL: test_vld1q_lane_s16
   return vld1q_lane_s16(a, b, 7);
-  // CHECK: ld1 {{{ *v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vld1q_lane_s32(i32* %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   [[TMP4:%.*]] = load i32, i32* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP4]], i32 3
+// CHECK:   ret <4 x i32> [[VLD1_LANE]]
 int32x4_t test_vld1q_lane_s32(int32_t  *a, int32x4_t b) {
-  // CHECK-LABEL: test_vld1q_lane_s32
   return vld1q_lane_s32(a, b, 3);
-  // CHECK: ld1 {{{ *v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vld1q_lane_s64(i64* %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   [[TMP4:%.*]] = load i64, i64* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[TMP4]], i32 1
+// CHECK:   ret <2 x i64> [[VLD1_LANE]]
 int64x2_t test_vld1q_lane_s64(int64_t  *a, int64x2_t b) {
-  // CHECK-LABEL: test_vld1q_lane_s64
   return vld1q_lane_s64(a, b, 1);
-  // CHECK: ld1 {{{ *v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <8 x half> @test_vld1q_lane_f16(half* %a, <8 x half> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[VLD1_LANE]] to <8 x half>
+// CHECK:   ret <8 x half> [[TMP5]]
 float16x8_t test_vld1q_lane_f16(float16_t  *a, float16x8_t b) {
-  // CHECK-LABEL: test_vld1q_lane_f16
   return vld1q_lane_f16(a, b, 7);
-  // CHECK: ld1 {{{ *v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vld1q_lane_f32(float* %a, <4 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to float*
+// CHECK:   [[TMP4:%.*]] = load float, float* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP4]], i32 3
+// CHECK:   ret <4 x float> [[VLD1_LANE]]
 float32x4_t test_vld1q_lane_f32(float32_t  *a, float32x4_t b) {
-  // CHECK-LABEL: test_vld1q_lane_f32
   return vld1q_lane_f32(a, b, 3);
-  // CHECK: ld1 {{{ *v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <2 x double> @test_vld1q_lane_f64(double* %a, <2 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to double*
+// CHECK:   [[TMP4:%.*]] = load double, double* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <2 x double> [[TMP2]], double [[TMP4]], i32 1
+// CHECK:   ret <2 x double> [[VLD1_LANE]]
 float64x2_t test_vld1q_lane_f64(float64_t  *a, float64x2_t b) {
-  // CHECK-LABEL: test_vld1q_lane_f64
   return vld1q_lane_f64(a, b, 1);
-  // CHECK: ld1 {{{ *v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vld1q_lane_p8(i8* %a, <16 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
+// CHECK:   ret <16 x i8> [[VLD1_LANE]]
 poly8x16_t test_vld1q_lane_p8(poly8_t  *a, poly8x16_t b) {
-  // CHECK-LABEL: test_vld1q_lane_p8
   return vld1q_lane_p8(a, b, 15);
-  // CHECK: ld1 {{{ *v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vld1q_lane_p16(i16* %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
+// CHECK:   ret <8 x i16> [[VLD1_LANE]]
 poly16x8_t test_vld1q_lane_p16(poly16_t  *a, poly16x8_t b) {
-  // CHECK-LABEL: test_vld1q_lane_p16
   return vld1q_lane_p16(a, b, 7);
-  // CHECK: ld1 {{{ *v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vld1q_lane_p64(i64* %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   [[TMP4:%.*]] = load i64, i64* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[TMP4]], i32 1
+// CHECK:   ret <2 x i64> [[VLD1_LANE]]
 poly64x2_t test_vld1q_lane_p64(poly64_t  *a, poly64x2_t b) {
-  // CHECK-LABEL: test_vld1q_lane_p64
   return vld1q_lane_p64(a, b, 1);
-  // CHECK: ld1 {{{ *v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vld1_lane_u8(i8* %a, <8 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
+// CHECK:   ret <8 x i8> [[VLD1_LANE]]
 uint8x8_t test_vld1_lane_u8(uint8_t  *a, uint8x8_t b) {
-  // CHECK-LABEL: test_vld1_lane_u8
   return vld1_lane_u8(a, b, 7);
-  // CHECK: ld1 {{{ *v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vld1_lane_u16(i16* %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
+// CHECK:   ret <4 x i16> [[VLD1_LANE]]
 uint16x4_t test_vld1_lane_u16(uint16_t  *a, uint16x4_t b) {
-  // CHECK-LABEL: test_vld1_lane_u16
   return vld1_lane_u16(a, b, 3);
-  // CHECK: ld1 {{{ *v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vld1_lane_u32(i32* %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   [[TMP4:%.*]] = load i32, i32* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP4]], i32 1
+// CHECK:   ret <2 x i32> [[VLD1_LANE]]
 uint32x2_t test_vld1_lane_u32(uint32_t  *a, uint32x2_t b) {
-  // CHECK-LABEL: test_vld1_lane_u32
   return vld1_lane_u32(a, b, 1);
-  // CHECK: ld1 {{{ *v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vld1_lane_u64(i64* %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   [[TMP4:%.*]] = load i64, i64* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0
+// CHECK:   ret <1 x i64> [[VLD1_LANE]]
 uint64x1_t test_vld1_lane_u64(uint64_t  *a, uint64x1_t b) {
-  // CHECK-LABEL: test_vld1_lane_u64
   return vld1_lane_u64(a, b, 0);
-  // CHECK: {{ld1r { v[0-9]+.1d }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vld1_lane_s8(i8* %a, <8 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
+// CHECK:   ret <8 x i8> [[VLD1_LANE]]
 int8x8_t test_vld1_lane_s8(int8_t  *a, int8x8_t b) {
-  // CHECK-LABEL: test_vld1_lane_s8
   return vld1_lane_s8(a, b, 7);
-  // CHECK: ld1 {{{ *v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vld1_lane_s16(i16* %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
+// CHECK:   ret <4 x i16> [[VLD1_LANE]]
 int16x4_t test_vld1_lane_s16(int16_t  *a, int16x4_t b) {
-  // CHECK-LABEL: test_vld1_lane_s16
   return vld1_lane_s16(a, b, 3);
-  // CHECK: ld1 {{{ *v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vld1_lane_s32(i32* %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   [[TMP4:%.*]] = load i32, i32* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP4]], i32 1
+// CHECK:   ret <2 x i32> [[VLD1_LANE]]
 int32x2_t test_vld1_lane_s32(int32_t  *a, int32x2_t b) {
-  // CHECK-LABEL: test_vld1_lane_s32
   return vld1_lane_s32(a, b, 1);
-  // CHECK: ld1 {{{ *v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vld1_lane_s64(i64* %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   [[TMP4:%.*]] = load i64, i64* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0
+// CHECK:   ret <1 x i64> [[VLD1_LANE]]
 int64x1_t test_vld1_lane_s64(int64_t  *a, int64x1_t b) {
-  // CHECK-LABEL: test_vld1_lane_s64
   return vld1_lane_s64(a, b, 0);
-  // CHECK: {{ld1r { v[0-9]+.1d }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <4 x half> @test_vld1_lane_f16(half* %a, <4 x half> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[VLD1_LANE]] to <4 x half>
+// CHECK:   ret <4 x half> [[TMP5]]
 float16x4_t test_vld1_lane_f16(float16_t  *a, float16x4_t b) {
-  // CHECK-LABEL: test_vld1_lane_f16
   return vld1_lane_f16(a, b, 3);
-  // CHECK: ld1 {{{ *v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <2 x float> @test_vld1_lane_f32(float* %a, <2 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to float*
+// CHECK:   [[TMP4:%.*]] = load float, float* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP4]], i32 1
+// CHECK:   ret <2 x float> [[VLD1_LANE]]
 float32x2_t test_vld1_lane_f32(float32_t  *a, float32x2_t b) {
-  // CHECK-LABEL: test_vld1_lane_f32
   return vld1_lane_f32(a, b, 1);
-  // CHECK: ld1 {{{ *v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <1 x double> @test_vld1_lane_f64(double* %a, <1 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to double*
+// CHECK:   [[TMP4:%.*]] = load double, double* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <1 x double> [[TMP2]], double [[TMP4]], i32 0
+// CHECK:   ret <1 x double> [[VLD1_LANE]]
 float64x1_t test_vld1_lane_f64(float64_t  *a, float64x1_t b) {
-  // CHECK-LABEL: test_vld1_lane_f64
   return vld1_lane_f64(a, b, 0);
-  // CHECK: {{ld1r { v[0-9]+.1d }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vld1_lane_p8(i8* %a, <8 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
+// CHECK:   ret <8 x i8> [[VLD1_LANE]]
 poly8x8_t test_vld1_lane_p8(poly8_t  *a, poly8x8_t b) {
-  // CHECK-LABEL: test_vld1_lane_p8
   return vld1_lane_p8(a, b, 7);
-  // CHECK: ld1 {{{ *v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vld1_lane_p16(i16* %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
+// CHECK:   ret <4 x i16> [[VLD1_LANE]]
 poly16x4_t test_vld1_lane_p16(poly16_t  *a, poly16x4_t b) {
-  // CHECK-LABEL: test_vld1_lane_p16
   return vld1_lane_p16(a, b, 3);
-  // CHECK: ld1 {{{ *v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vld1_lane_p64(i64* %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   [[TMP4:%.*]] = load i64, i64* [[TMP3]]
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0
+// CHECK:   ret <1 x i64> [[VLD1_LANE]]
 poly64x1_t test_vld1_lane_p64(poly64_t  *a, poly64x1_t b) {
-  // CHECK-LABEL: test_vld1_lane_p64
   return vld1_lane_p64(a, b, 0);
-  // CHECK: {{ld1r { v[0-9]+.1d }|ldr d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int8x16x2_t @test_vld2q_lane_s8(i8* %ptr, [2 x <16 x i8>] %src.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[SRC:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[SRC]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[SRC]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x2_t* [[SRC]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VLD2_LANE:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, i8* %ptr)
+// CHECK:   [[TMP5:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8> } [[VLD2_LANE]], { <16 x i8>, <16 x i8> }* [[TMP5]]
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP6]], i8* [[TMP7]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP8:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int8x16x2_t [[TMP8]]
 int8x16x2_t test_vld2q_lane_s8(int8_t const * ptr, int8x16x2_t src) {
-  // CHECK-LABEL: test_vld2q_lane_s8
   return vld2q_lane_s8(ptr, src, 15);
-  // CHECK: ld2 {{{ *v[0-9]+.b,  v[0-9]+.b *}}}[15], [x0]
 }
 
+// CHECK-LABEL: define %struct.uint8x16x2_t @test_vld2q_lane_u8(i8* %ptr, [2 x <16 x i8>] %src.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[SRC:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[SRC]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[SRC]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x2_t* [[SRC]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VLD2_LANE:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, i8* %ptr)
+// CHECK:   [[TMP5:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8> } [[VLD2_LANE]], { <16 x i8>, <16 x i8> }* [[TMP5]]
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP6]], i8* [[TMP7]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP8:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint8x16x2_t [[TMP8]]
 uint8x16x2_t test_vld2q_lane_u8(uint8_t const * ptr, uint8x16x2_t src) {
-  // CHECK-LABEL: test_vld2q_lane_u8
   return vld2q_lane_u8(ptr, src, 15);
-  // CHECK: ld2 {{{ *v[0-9]+.b, v[0-9]+.b *}}}[15], [x0]
 }
 
+// CHECK-LABEL: define %struct.poly8x16x2_t @test_vld2q_lane_p8(i8* %ptr, [2 x <16 x i8>] %src.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[SRC:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[SRC]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[SRC]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x2_t* [[SRC]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VLD2_LANE:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, i8* %ptr)
+// CHECK:   [[TMP5:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8> } [[VLD2_LANE]], { <16 x i8>, <16 x i8> }* [[TMP5]]
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP6]], i8* [[TMP7]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP8:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly8x16x2_t [[TMP8]]
 poly8x16x2_t test_vld2q_lane_p8(poly8_t const * ptr, poly8x16x2_t src) {
-  // CHECK-LABEL: test_vld2q_lane_p8
   return vld2q_lane_p8(ptr, src, 15);
-  // CHECK: ld2 {{{ *v[0-9]+.b, v[0-9]+.b *}}}[15], [x0]
 }
 
+// CHECK-LABEL: define %struct.int8x16x3_t @test_vld3q_lane_s8(i8* %ptr, [3 x <16 x i8>] %src.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[SRC:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[SRC]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[SRC]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x3_t* [[SRC]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   [[VLD3_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, i8* %ptr)
+// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.int8x16x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP9:%.*]] = load %struct.int8x16x3_t, %struct.int8x16x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int8x16x3_t [[TMP9]]
 int8x16x3_t test_vld3q_lane_s8(int8_t const * ptr, int8x16x3_t src) {
-  // CHECK-LABEL: test_vld3q_lane_s8
   return vld3q_lane_s8(ptr, src, 15);
-  // CHECK: ld3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [x0]
 }
 
+// CHECK-LABEL: define %struct.uint8x16x3_t @test_vld3q_lane_u8(i8* %ptr, [3 x <16 x i8>] %src.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[SRC:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[SRC]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[SRC]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x3_t* [[SRC]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   [[VLD3_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, i8* %ptr)
+// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint8x16x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP9:%.*]] = load %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint8x16x3_t [[TMP9]]
 uint8x16x3_t test_vld3q_lane_u8(uint8_t const * ptr, uint8x16x3_t src) {
-  // CHECK-LABEL: test_vld3q_lane_u8
   return vld3q_lane_u8(ptr, src, 15);
-  // CHECK: ld3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [x0]
 }
 
+// CHECK-LABEL: define %struct.uint16x8x2_t @test_vld2q_lane_u16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0i8(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], { <8 x i16>, <8 x i16> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint16x8x2_t [[TMP13]]
 uint16x8x2_t test_vld2q_lane_u16(uint16_t  *a, uint16x8x2_t b) {
-  // CHECK-LABEL: test_vld2q_lane_u16
   return vld2q_lane_u16(a, b, 7);
-  // CHECK: ld2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint32x4x2_t @test_vld2q_lane_u32(i32* %a, [2 x <4 x i32>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0i8(<4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32> } [[VLD2_LANE]], { <4 x i32>, <4 x i32> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint32x4x2_t [[TMP13]]
 uint32x4x2_t test_vld2q_lane_u32(uint32_t  *a, uint32x4x2_t b) {
-  // CHECK-LABEL: test_vld2q_lane_u32
   return vld2q_lane_u32(a, b, 3);
-  // CHECK: ld2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint64x2x2_t @test_vld2q_lane_u64(i64* %a, [2 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x2x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x2x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x2x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x2x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0i8(<2 x i64> [[TMP8]], <2 x i64> [[TMP9]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64> } [[VLD2_LANE]], { <2 x i64>, <2 x i64> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.uint64x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint64x2x2_t [[TMP13]]
 uint64x2x2_t test_vld2q_lane_u64(uint64_t  *a, uint64x2x2_t b) {
-  // CHECK-LABEL: test_vld2q_lane_u64
   return vld2q_lane_u64(a, b, 1);
-  // CHECK: ld2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int16x8x2_t @test_vld2q_lane_s16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0i8(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], { <8 x i16>, <8 x i16> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int16x8x2_t [[TMP13]]
 int16x8x2_t test_vld2q_lane_s16(int16_t  *a, int16x8x2_t b) {
-  // CHECK-LABEL: test_vld2q_lane_s16
   return vld2q_lane_s16(a, b, 7);
-  // CHECK: ld2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int32x4x2_t @test_vld2q_lane_s32(i32* %a, [2 x <4 x i32>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2lane.v4i32.p0i8(<4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32> } [[VLD2_LANE]], { <4 x i32>, <4 x i32> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int32x4x2_t [[TMP13]]
 int32x4x2_t test_vld2q_lane_s32(int32_t  *a, int32x4x2_t b) {
-  // CHECK-LABEL: test_vld2q_lane_s32
   return vld2q_lane_s32(a, b, 3);
-  // CHECK: ld2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int64x2x2_t @test_vld2q_lane_s64(i64* %a, [2 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x2x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int64x2x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x2x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x2x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0i8(<2 x i64> [[TMP8]], <2 x i64> [[TMP9]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64> } [[VLD2_LANE]], { <2 x i64>, <2 x i64> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.int64x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.int64x2x2_t, %struct.int64x2x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int64x2x2_t [[TMP13]]
 int64x2x2_t test_vld2q_lane_s64(int64_t  *a, int64x2x2_t b) {
-  // CHECK-LABEL: test_vld2q_lane_s64
   return vld2q_lane_s64(a, b, 1);
-  // CHECK: ld2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float16x8x2_t @test_vld2q_lane_f16(half* %a, [2 x <8 x half>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x half>] [[B]].coerce, [2 x <8 x half>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0i8(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], { <8 x i16>, <8 x i16> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.float16x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.float16x8x2_t, %struct.float16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float16x8x2_t [[TMP13]]
 float16x8x2_t test_vld2q_lane_f16(float16_t  *a, float16x8x2_t b) {
-  // CHECK-LABEL: test_vld2q_lane_f16
   return vld2q_lane_f16(a, b, 7);
-  // CHECK: ld2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float32x4x2_t @test_vld2q_lane_f32(float* %a, [2 x <4 x float>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x float>] [[B]].coerce, [2 x <4 x float>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2lane.v4f32.p0i8(<4 x float> [[TMP8]], <4 x float> [[TMP9]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x float>, <4 x float> }*
+// CHECK:   store { <4 x float>, <4 x float> } [[VLD2_LANE]], { <4 x float>, <4 x float> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float32x4x2_t [[TMP13]]
 float32x4x2_t test_vld2q_lane_f32(float32_t  *a, float32x4x2_t b) {
-  // CHECK-LABEL: test_vld2q_lane_f32
   return vld2q_lane_f32(a, b, 3);
-  // CHECK: ld2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float64x2x2_t @test_vld2q_lane_f64(double* %a, [2 x <2 x double>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x2x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float64x2x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x2x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x2x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x double>] [[B]].coerce, [2 x <2 x double>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x double> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x double> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x double>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2lane.v2f64.p0i8(<2 x double> [[TMP8]], <2 x double> [[TMP9]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x double>, <2 x double> }*
+// CHECK:   store { <2 x double>, <2 x double> } [[VLD2_LANE]], { <2 x double>, <2 x double> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.float64x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.float64x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.float64x2x2_t, %struct.float64x2x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float64x2x2_t [[TMP13]]
 float64x2x2_t test_vld2q_lane_f64(float64_t  *a, float64x2x2_t b) {
-  // CHECK-LABEL: test_vld2q_lane_f64
   return vld2q_lane_f64(a, b, 1);
-  // CHECK: ld2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly16x8x2_t @test_vld2q_lane_p16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2lane.v8i16.p0i8(<8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i64 7, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2_LANE]], { <8 x i16>, <8 x i16> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly16x8x2_t [[TMP13]]
 poly16x8x2_t test_vld2q_lane_p16(poly16_t  *a, poly16x8x2_t b) {
-  // CHECK-LABEL: test_vld2q_lane_p16
   return vld2q_lane_p16(a, b, 7);
-  // CHECK: ld2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly64x2x2_t @test_vld2q_lane_p64(i64* %a, [2 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x2x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly64x2x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x2x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x2x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2lane.v2i64.p0i8(<2 x i64> [[TMP8]], <2 x i64> [[TMP9]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64> } [[VLD2_LANE]], { <2 x i64>, <2 x i64> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.poly64x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly64x2x2_t [[TMP13]]
 poly64x2x2_t test_vld2q_lane_p64(poly64_t  *a, poly64x2x2_t b) {
-  // CHECK-LABEL: test_vld2q_lane_p64
   return vld2q_lane_p64(a, b, 1);
-  // CHECK: ld2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint8x8x2_t @test_vld2_lane_u8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* %a)
+// CHECK:   [[TMP5:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], { <8 x i8>, <8 x i8> }* [[TMP5]]
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP6]], i8* [[TMP7]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP8:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint8x8x2_t [[TMP8]]
 uint8x8x2_t test_vld2_lane_u8(uint8_t  *a, uint8x8x2_t b) {
-  // CHECK-LABEL: test_vld2_lane_u8
   return vld2_lane_u8(a, b, 7);
-  // CHECK: ld2 {{{ *v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint16x4x2_t @test_vld2_lane_u16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0i8(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], { <4 x i16>, <4 x i16> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint16x4x2_t [[TMP13]]
 uint16x4x2_t test_vld2_lane_u16(uint16_t  *a, uint16x4x2_t b) {
-  // CHECK-LABEL: test_vld2_lane_u16
   return vld2_lane_u16(a, b, 3);
-  // CHECK: ld2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint32x2x2_t @test_vld2_lane_u32(i32* %a, [2 x <2 x i32>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2lane.v2i32.p0i8(<2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32> } [[VLD2_LANE]], { <2 x i32>, <2 x i32> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint32x2x2_t [[TMP13]]
 uint32x2x2_t test_vld2_lane_u32(uint32_t  *a, uint32x2x2_t b) {
-  // CHECK-LABEL: test_vld2_lane_u32
   return vld2_lane_u32(a, b, 1);
-  // CHECK: ld2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint64x1x2_t @test_vld2_lane_u64(i64* %a, [2 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0i8(<1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i64 0, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD2_LANE]], { <1 x i64>, <1 x i64> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.uint64x1x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint64x1x2_t [[TMP13]]
 uint64x1x2_t test_vld2_lane_u64(uint64_t  *a, uint64x1x2_t b) {
-  // CHECK-LABEL: test_vld2_lane_u64
   return vld2_lane_u64(a, b, 0);
-  // CHECK: ld2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int8x8x2_t @test_vld2_lane_s8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* %a)
+// CHECK:   [[TMP5:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], { <8 x i8>, <8 x i8> }* [[TMP5]]
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP6]], i8* [[TMP7]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP8:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int8x8x2_t [[TMP8]]
 int8x8x2_t test_vld2_lane_s8(int8_t  *a, int8x8x2_t b) {
-  // CHECK-LABEL: test_vld2_lane_s8
   return vld2_lane_s8(a, b, 7);
-  // CHECK: ld2 {{{ *v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int16x4x2_t @test_vld2_lane_s16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0i8(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], { <4 x i16>, <4 x i16> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int16x4x2_t [[TMP13]]
 int16x4x2_t test_vld2_lane_s16(int16_t  *a, int16x4x2_t b) {
-  // CHECK-LABEL: test_vld2_lane_s16
   return vld2_lane_s16(a, b, 3);
-  // CHECK: ld2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int32x2x2_t @test_vld2_lane_s32(i32* %a, [2 x <2 x i32>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2lane.v2i32.p0i8(<2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32> } [[VLD2_LANE]], { <2 x i32>, <2 x i32> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int32x2x2_t [[TMP13]]
 int32x2x2_t test_vld2_lane_s32(int32_t  *a, int32x2x2_t b) {
-  // CHECK-LABEL: test_vld2_lane_s32
   return vld2_lane_s32(a, b, 1);
-  // CHECK: ld2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int64x1x2_t @test_vld2_lane_s64(i64* %a, [2 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0i8(<1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i64 0, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD2_LANE]], { <1 x i64>, <1 x i64> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.int64x1x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.int64x1x2_t, %struct.int64x1x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int64x1x2_t [[TMP13]]
 int64x1x2_t test_vld2_lane_s64(int64_t  *a, int64x1x2_t b) {
-  // CHECK-LABEL: test_vld2_lane_s64
   return vld2_lane_s64(a, b, 0);
-  // CHECK: ld2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float16x4x2_t @test_vld2_lane_f16(half* %a, [2 x <4 x half>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x half>] [[B]].coerce, [2 x <4 x half>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0i8(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], { <4 x i16>, <4 x i16> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.float16x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.float16x4x2_t, %struct.float16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float16x4x2_t [[TMP13]]
 float16x4x2_t test_vld2_lane_f16(float16_t  *a, float16x4x2_t b) {
-  // CHECK-LABEL: test_vld2_lane_f16
   return vld2_lane_f16(a, b, 3);
-  // CHECK: ld2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float32x2x2_t @test_vld2_lane_f32(float* %a, [2 x <2 x float>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x float>] [[B]].coerce, [2 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <2 x float>, <2 x float> } @llvm.aarch64.neon.ld2lane.v2f32.p0i8(<2 x float> [[TMP8]], <2 x float> [[TMP9]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <2 x float>, <2 x float> }*
+// CHECK:   store { <2 x float>, <2 x float> } [[VLD2_LANE]], { <2 x float>, <2 x float> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float32x2x2_t [[TMP13]]
 float32x2x2_t test_vld2_lane_f32(float32_t  *a, float32x2x2_t b) {
-  // CHECK-LABEL: test_vld2_lane_f32
   return vld2_lane_f32(a, b, 1);
-  // CHECK: ld2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float64x1x2_t @test_vld2_lane_f64(double* %a, [2 x <1 x double>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <1 x double>] [[B]].coerce, [2 x <1 x double>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x1x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <1 x double> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <1 x double> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x double>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <1 x double>, <1 x double> } @llvm.aarch64.neon.ld2lane.v1f64.p0i8(<1 x double> [[TMP8]], <1 x double> [[TMP9]], i64 0, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <1 x double>, <1 x double> }*
+// CHECK:   store { <1 x double>, <1 x double> } [[VLD2_LANE]], { <1 x double>, <1 x double> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.float64x1x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.float64x1x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.float64x1x2_t, %struct.float64x1x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float64x1x2_t [[TMP13]]
 float64x1x2_t test_vld2_lane_f64(float64_t  *a, float64x1x2_t b) {
-  // CHECK-LABEL: test_vld2_lane_f64
   return vld2_lane_f64(a, b, 0);
-  // CHECK: ld2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly8x8x2_t @test_vld2_lane_p8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VLD2_LANE:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld2lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* %a)
+// CHECK:   [[TMP5:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2_LANE]], { <8 x i8>, <8 x i8> }* [[TMP5]]
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP6]], i8* [[TMP7]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP8:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly8x8x2_t [[TMP8]]
 poly8x8x2_t test_vld2_lane_p8(poly8_t  *a, poly8x8x2_t b) {
-  // CHECK-LABEL: test_vld2_lane_p8
   return vld2_lane_p8(a, b, 7);
-  // CHECK: ld2 {{{ *v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly16x4x2_t @test_vld2_lane_p16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld2lane.v4i16.p0i8(<4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2_LANE]], { <4 x i16>, <4 x i16> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly16x4x2_t [[TMP13]]
 poly16x4x2_t test_vld2_lane_p16(poly16_t  *a, poly16x4x2_t b) {
-  // CHECK-LABEL: test_vld2_lane_p16
   return vld2_lane_p16(a, b, 3);
-  // CHECK: ld2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly64x1x2_t @test_vld2_lane_p64(i64* %a, [2 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x1x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.poly64x1x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x1x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x1x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x1x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
+// CHECK:   [[VLD2_LANE:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2lane.v1i64.p0i8(<1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i64 0, i8* [[TMP3]])
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD2_LANE]], { <1 x i64>, <1 x i64> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.poly64x1x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP11]], i8* [[TMP12]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP13:%.*]] = load %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly64x1x2_t [[TMP13]]
 poly64x1x2_t test_vld2_lane_p64(poly64_t  *a, poly64x1x2_t b) {
-  // CHECK-LABEL: test_vld2_lane_p64
   return vld2_lane_p64(a, b, 0);
-  // CHECK: ld2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint16x8x3_t @test_vld3q_lane_u16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0i8(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i64 7, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.uint16x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint16x8x3_t [[TMP16]]
 uint16x8x3_t test_vld3q_lane_u16(uint16_t  *a, uint16x8x3_t b) {
-  // CHECK-LABEL: test_vld3q_lane_u16
   return vld3q_lane_u16(a, b, 7);
-  // CHECK: ld3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint32x4x3_t @test_vld3q_lane_u32(i32* %a, [3 x <4 x i32>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0i8(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3_LANE]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.uint32x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint32x4x3_t [[TMP16]]
 uint32x4x3_t test_vld3q_lane_u32(uint32_t  *a, uint32x4x3_t b) {
-  // CHECK-LABEL: test_vld3q_lane_u32
   return vld3q_lane_u32(a, b, 3);
-  // CHECK: ld3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint64x2x3_t @test_vld3q_lane_u64(i64* %a, [3 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x2x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x2x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x2x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x2x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0i8(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]], <2 x i64> [[TMP12]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.uint64x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint64x2x3_t [[TMP16]]
 uint64x2x3_t test_vld3q_lane_u64(uint64_t  *a, uint64x2x3_t b) {
-  // CHECK-LABEL: test_vld3q_lane_u64
   return vld3q_lane_u64(a, b, 1);
-  // CHECK: ld3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int16x8x3_t @test_vld3q_lane_s16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0i8(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i64 7, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.int16x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.int16x8x3_t, %struct.int16x8x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int16x8x3_t [[TMP16]]
 int16x8x3_t test_vld3q_lane_s16(int16_t  *a, int16x8x3_t b) {
-  // CHECK-LABEL: test_vld3q_lane_s16
   return vld3q_lane_s16(a, b, 7);
-  // CHECK: ld3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int32x4x3_t @test_vld3q_lane_s32(i32* %a, [3 x <4 x i32>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3lane.v4i32.p0i8(<4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3_LANE]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.int32x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.int32x4x3_t, %struct.int32x4x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int32x4x3_t [[TMP16]]
 int32x4x3_t test_vld3q_lane_s32(int32_t  *a, int32x4x3_t b) {
-  // CHECK-LABEL: test_vld3q_lane_s32
   return vld3q_lane_s32(a, b, 3);
-  // CHECK: ld3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int64x2x3_t @test_vld3q_lane_s64(i64* %a, [3 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x2x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int64x2x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x2x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x2x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0i8(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]], <2 x i64> [[TMP12]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.int64x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.int64x2x3_t, %struct.int64x2x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int64x2x3_t [[TMP16]]
 int64x2x3_t test_vld3q_lane_s64(int64_t  *a, int64x2x3_t b) {
-  // CHECK-LABEL: test_vld3q_lane_s64
   return vld3q_lane_s64(a, b, 1);
-  // CHECK: ld3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float16x8x3_t @test_vld3q_lane_f16(half* %a, [3 x <8 x half>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x half>] [[B]].coerce, [3 x <8 x half>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0i8(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i64 7, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.float16x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.float16x8x3_t, %struct.float16x8x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float16x8x3_t [[TMP16]]
 float16x8x3_t test_vld3q_lane_f16(float16_t  *a, float16x8x3_t b) {
-  // CHECK-LABEL: test_vld3q_lane_f16
   return vld3q_lane_f16(a, b, 7);
-  // CHECK: ld3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float32x4x3_t @test_vld3q_lane_f32(float* %a, [3 x <4 x float>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x float>] [[B]].coerce, [3 x <4 x float>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3lane.v4f32.p0i8(<4 x float> [[TMP10]], <4 x float> [[TMP11]], <4 x float> [[TMP12]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x float>, <4 x float>, <4 x float> }*
+// CHECK:   store { <4 x float>, <4 x float>, <4 x float> } [[VLD3_LANE]], { <4 x float>, <4 x float>, <4 x float> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.float32x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.float32x4x3_t, %struct.float32x4x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float32x4x3_t [[TMP16]]
 float32x4x3_t test_vld3q_lane_f32(float32_t  *a, float32x4x3_t b) {
-  // CHECK-LABEL: test_vld3q_lane_f32
   return vld3q_lane_f32(a, b, 3);
-  // CHECK: ld3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float64x2x3_t @test_vld3q_lane_f64(double* %a, [3 x <2 x double>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x2x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float64x2x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x2x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x2x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x double>] [[B]].coerce, [3 x <2 x double>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x double> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x double> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x double> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x double>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x double>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld3lane.v2f64.p0i8(<2 x double> [[TMP10]], <2 x double> [[TMP11]], <2 x double> [[TMP12]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x double>, <2 x double>, <2 x double> }*
+// CHECK:   store { <2 x double>, <2 x double>, <2 x double> } [[VLD3_LANE]], { <2 x double>, <2 x double>, <2 x double> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.float64x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.float64x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.float64x2x3_t, %struct.float64x2x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float64x2x3_t [[TMP16]]
 float64x2x3_t test_vld3q_lane_f64(float64_t  *a, float64x2x3_t b) {
-  // CHECK-LABEL: test_vld3q_lane_f64
   return vld3q_lane_f64(a, b, 1);
-  // CHECK: ld3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly8x16x3_t @test_vld3q_lane_p8(i8* %a, [3 x <16 x i8>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   [[VLD3_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld3lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, i8* %a)
+// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3_LANE]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly8x16x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP9:%.*]] = load %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly8x16x3_t [[TMP9]]
 poly8x16x3_t test_vld3q_lane_p8(poly8_t  *a, poly8x16x3_t b) {
-  // CHECK-LABEL: test_vld3q_lane_p8
   return vld3q_lane_p8(a, b, 15);
-  // CHECK: ld3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly16x8x3_t @test_vld3q_lane_p16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld3lane.v8i16.p0i8(<8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i64 7, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3_LANE]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.poly16x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly16x8x3_t [[TMP16]]
 poly16x8x3_t test_vld3q_lane_p16(poly16_t  *a, poly16x8x3_t b) {
-  // CHECK-LABEL: test_vld3q_lane_p16
   return vld3q_lane_p16(a, b, 7);
-  // CHECK: ld3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly64x2x3_t @test_vld3q_lane_p64(i64* %a, [3 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x2x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly64x2x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x2x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x2x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3lane.v2i64.p0i8(<2 x i64> [[TMP10]], <2 x i64> [[TMP11]], <2 x i64> [[TMP12]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3_LANE]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.poly64x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly64x2x3_t [[TMP16]]
 poly64x2x3_t test_vld3q_lane_p64(poly64_t  *a, poly64x2x3_t b) {
-  // CHECK-LABEL: test_vld3q_lane_p64
   return vld3q_lane_p64(a, b, 1);
-  // CHECK: ld3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint8x8x3_t @test_vld3_lane_u8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* %a)
+// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint8x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP9:%.*]] = load %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint8x8x3_t [[TMP9]]
 uint8x8x3_t test_vld3_lane_u8(uint8_t  *a, uint8x8x3_t b) {
-  // CHECK-LABEL: test_vld3_lane_u8
   return vld3_lane_u8(a, b, 7);
-  // CHECK: ld3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint16x4x3_t @test_vld3_lane_u16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0i8(<4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.uint16x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint16x4x3_t [[TMP16]]
 uint16x4x3_t test_vld3_lane_u16(uint16_t  *a, uint16x4x3_t b) {
-  // CHECK-LABEL: test_vld3_lane_u16
   return vld3_lane_u16(a, b, 3);
-  // CHECK: ld3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint32x2x3_t @test_vld3_lane_u32(i32* %a, [3 x <2 x i32>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3lane.v2i32.p0i8(<2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.uint32x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint32x2x3_t [[TMP16]]
 uint32x2x3_t test_vld3_lane_u32(uint32_t  *a, uint32x2x3_t b) {
-  // CHECK-LABEL: test_vld3_lane_u32
   return vld3_lane_u32(a, b, 1);
-  // CHECK: ld3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint64x1x3_t @test_vld3_lane_u64(i64* %a, [3 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0i8(<1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i64 0, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.uint64x1x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint64x1x3_t [[TMP16]]
 uint64x1x3_t test_vld3_lane_u64(uint64_t  *a, uint64x1x3_t b) {
-  // CHECK-LABEL: test_vld3_lane_u64
   return vld3_lane_u64(a, b, 0);
-  // CHECK: ld3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int8x8x3_t @test_vld3_lane_s8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* %a)
+// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.int8x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP9:%.*]] = load %struct.int8x8x3_t, %struct.int8x8x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int8x8x3_t [[TMP9]]
 int8x8x3_t test_vld3_lane_s8(int8_t  *a, int8x8x3_t b) {
-  // CHECK-LABEL: test_vld3_lane_s8
   return vld3_lane_s8(a, b, 7);
-  // CHECK: ld3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int16x4x3_t @test_vld3_lane_s16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0i8(<4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.int16x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.int16x4x3_t, %struct.int16x4x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int16x4x3_t [[TMP16]]
 int16x4x3_t test_vld3_lane_s16(int16_t  *a, int16x4x3_t b) {
-  // CHECK-LABEL: test_vld3_lane_s16
   return vld3_lane_s16(a, b, 3);
-  // CHECK: ld3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int32x2x3_t @test_vld3_lane_s32(i32* %a, [3 x <2 x i32>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld3lane.v2i32.p0i8(<2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.int32x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.int32x2x3_t, %struct.int32x2x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int32x2x3_t [[TMP16]]
 int32x2x3_t test_vld3_lane_s32(int32_t  *a, int32x2x3_t b) {
-  // CHECK-LABEL: test_vld3_lane_s32
   return vld3_lane_s32(a, b, 1);
-  // CHECK: ld3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int64x1x3_t @test_vld3_lane_s64(i64* %a, [3 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0i8(<1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i64 0, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.int64x1x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.int64x1x3_t, %struct.int64x1x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int64x1x3_t [[TMP16]]
 int64x1x3_t test_vld3_lane_s64(int64_t  *a, int64x1x3_t b) {
-  // CHECK-LABEL: test_vld3_lane_s64
   return vld3_lane_s64(a, b, 0);
-  // CHECK: ld3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float16x4x3_t @test_vld3_lane_f16(half* %a, [3 x <4 x half>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x half>] [[B]].coerce, [3 x <4 x half>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0i8(<4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.float16x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.float16x4x3_t, %struct.float16x4x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float16x4x3_t [[TMP16]]
 float16x4x3_t test_vld3_lane_f16(float16_t  *a, float16x4x3_t b) {
-  // CHECK-LABEL: test_vld3_lane_f16
   return vld3_lane_f16(a, b, 3);
-  // CHECK: ld3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float32x2x3_t @test_vld3_lane_f32(float* %a, [3 x <2 x float>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x float>] [[B]].coerce, [3 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld3lane.v2f32.p0i8(<2 x float> [[TMP10]], <2 x float> [[TMP11]], <2 x float> [[TMP12]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <2 x float>, <2 x float>, <2 x float> }*
+// CHECK:   store { <2 x float>, <2 x float>, <2 x float> } [[VLD3_LANE]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.float32x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.float32x2x3_t, %struct.float32x2x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float32x2x3_t [[TMP16]]
 float32x2x3_t test_vld3_lane_f32(float32_t  *a, float32x2x3_t b) {
-  // CHECK-LABEL: test_vld3_lane_f32
   return vld3_lane_f32(a, b, 1);
-  // CHECK: ld3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float64x1x3_t @test_vld3_lane_f64(double* %a, [3 x <1 x double>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x1x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.float64x1x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x1x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x1x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <1 x double>] [[B]].coerce, [3 x <1 x double>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x1x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <1 x double> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <1 x double> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <1 x double> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x double>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x double>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld3lane.v1f64.p0i8(<1 x double> [[TMP10]], <1 x double> [[TMP11]], <1 x double> [[TMP12]], i64 0, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <1 x double>, <1 x double>, <1 x double> }*
+// CHECK:   store { <1 x double>, <1 x double>, <1 x double> } [[VLD3_LANE]], { <1 x double>, <1 x double>, <1 x double> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.float64x1x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.float64x1x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.float64x1x3_t, %struct.float64x1x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float64x1x3_t [[TMP16]]
 float64x1x3_t test_vld3_lane_f64(float64_t  *a, float64x1x3_t b) {
-  // CHECK-LABEL: test_vld3_lane_f64
   return vld3_lane_f64(a, b, 0);
-  // CHECK: ld3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly8x8x3_t @test_vld3_lane_p8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VLD3_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld3lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* %a)
+// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly8x8x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP9:%.*]] = load %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly8x8x3_t [[TMP9]]
 poly8x8x3_t test_vld3_lane_p8(poly8_t  *a, poly8x8x3_t b) {
-  // CHECK-LABEL: test_vld3_lane_p8
   return vld3_lane_p8(a, b, 7);
-  // CHECK: ld3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly16x4x3_t @test_vld3_lane_p16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3lane.v4i16.p0i8(<4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.poly16x4x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly16x4x3_t [[TMP16]]
 poly16x4x3_t test_vld3_lane_p16(poly16_t  *a, poly16x4x3_t b) {
-  // CHECK-LABEL: test_vld3_lane_p16
   return vld3_lane_p16(a, b, 3);
-  // CHECK: ld3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly64x1x3_t @test_vld3_lane_p64(i64* %a, [3 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x1x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.poly64x1x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x1x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x1x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x1x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
+// CHECK:   [[VLD3_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3lane.v1i64.p0i8(<1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i64 0, i8* [[TMP3]])
+// CHECK:   [[TMP13:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_LANE]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP13]]
+// CHECK:   [[TMP14:%.*]] = bitcast %struct.poly64x1x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP14]], i8* [[TMP15]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP16:%.*]] = load %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly64x1x3_t [[TMP16]]
 poly64x1x3_t test_vld3_lane_p64(poly64_t  *a, poly64x1x3_t b) {
-  // CHECK-LABEL: test_vld3_lane_p64
   return vld3_lane_p64(a, b, 0);
-  // CHECK: ld3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint8x16x4_t @test_vld4q_lane_u8(i8* %a, [4 x <16 x i8>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK:   [[VLD4_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i64 15, i8* %a)
+// CHECK:   [[TMP7:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP7]]
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint8x16x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP9:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP8]], i8* [[TMP9]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP10:%.*]] = load %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint8x16x4_t [[TMP10]]
 uint8x16x4_t test_vld4q_lane_u8(uint8_t  *a, uint8x16x4_t b) {
-  // CHECK-LABEL: test_vld4q_lane_u8
   return vld4q_lane_u8(a, b, 15);
-  // CHECK: ld4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint16x8x4_t @test_vld4q_lane_u16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0i8(<8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i64 7, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.uint16x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint16x8x4_t [[TMP19]]
 uint16x8x4_t test_vld4q_lane_u16(uint16_t  *a, uint16x8x4_t b) {
-  // CHECK-LABEL: test_vld4q_lane_u16
   return vld4q_lane_u16(a, b, 7);
-  // CHECK: ld4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint32x4x4_t @test_vld4q_lane_u32(i32* %a, [4 x <4 x i32>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0i8(<4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.uint32x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint32x4x4_t [[TMP19]]
 uint32x4x4_t test_vld4q_lane_u32(uint32_t  *a, uint32x4x4_t b) {
-  // CHECK-LABEL: test_vld4q_lane_u32
   return vld4q_lane_u32(a, b, 3);
-  // CHECK: ld4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint64x2x4_t @test_vld4q_lane_u64(i64* %a, [4 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x2x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x2x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x2x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x2x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <2 x i64> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <2 x i64>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0i8(<2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], <2 x i64> [[TMP15]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.uint64x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint64x2x4_t [[TMP19]]
 uint64x2x4_t test_vld4q_lane_u64(uint64_t  *a, uint64x2x4_t b) {
-  // CHECK-LABEL: test_vld4q_lane_u64
   return vld4q_lane_u64(a, b, 1);
-  // CHECK: ld4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int8x16x4_t @test_vld4q_lane_s8(i8* %a, [4 x <16 x i8>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK:   [[VLD4_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i64 15, i8* %a)
+// CHECK:   [[TMP7:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP7]]
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.int8x16x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP9:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP8]], i8* [[TMP9]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP10:%.*]] = load %struct.int8x16x4_t, %struct.int8x16x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int8x16x4_t [[TMP10]]
 int8x16x4_t test_vld4q_lane_s8(int8_t  *a, int8x16x4_t b) {
-  // CHECK-LABEL: test_vld4q_lane_s8
   return vld4q_lane_s8(a, b, 15);
-  // CHECK: ld4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int16x8x4_t @test_vld4q_lane_s16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0i8(<8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i64 7, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.int16x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.int16x8x4_t, %struct.int16x8x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int16x8x4_t [[TMP19]]
 int16x8x4_t test_vld4q_lane_s16(int16_t  *a, int16x8x4_t b) {
-  // CHECK-LABEL: test_vld4q_lane_s16
   return vld4q_lane_s16(a, b, 7);
-  // CHECK: ld4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int32x4x4_t @test_vld4q_lane_s32(i32* %a, [4 x <4 x i32>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4lane.v4i32.p0i8(<4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4_LANE]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.int32x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.int32x4x4_t, %struct.int32x4x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int32x4x4_t [[TMP19]]
 int32x4x4_t test_vld4q_lane_s32(int32_t  *a, int32x4x4_t b) {
-  // CHECK-LABEL: test_vld4q_lane_s32
   return vld4q_lane_s32(a, b, 3);
-  // CHECK: ld4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int64x2x4_t @test_vld4q_lane_s64(i64* %a, [4 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x2x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int64x2x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x2x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x2x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <2 x i64> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <2 x i64>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0i8(<2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], <2 x i64> [[TMP15]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.int64x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.int64x2x4_t, %struct.int64x2x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int64x2x4_t [[TMP19]]
 int64x2x4_t test_vld4q_lane_s64(int64_t  *a, int64x2x4_t b) {
-  // CHECK-LABEL: test_vld4q_lane_s64
   return vld4q_lane_s64(a, b, 1);
-  // CHECK: ld4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float16x8x4_t @test_vld4q_lane_f16(half* %a, [4 x <8 x half>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x half>] [[B]].coerce, [4 x <8 x half>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x half> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0i8(<8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i64 7, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.float16x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.float16x8x4_t, %struct.float16x8x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float16x8x4_t [[TMP19]]
 float16x8x4_t test_vld4q_lane_f16(float16_t  *a, float16x8x4_t b) {
-  // CHECK-LABEL: test_vld4q_lane_f16
   return vld4q_lane_f16(a, b, 7);
-  // CHECK: ld4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float32x4x4_t @test_vld4q_lane_f32(float* %a, [4 x <4 x float>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x float>] [[B]].coerce, [4 x <4 x float>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <4 x float> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x float>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld4lane.v4f32.p0i8(<4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x float> [[TMP15]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x float>, <4 x float>, <4 x float>, <4 x float> }*
+// CHECK:   store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4_LANE]], { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.float32x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.float32x4x4_t, %struct.float32x4x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float32x4x4_t [[TMP19]]
 float32x4x4_t test_vld4q_lane_f32(float32_t  *a, float32x4x4_t b) {
-  // CHECK-LABEL: test_vld4q_lane_f32
   return vld4q_lane_f32(a, b, 3);
-  // CHECK: ld4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float64x2x4_t @test_vld4q_lane_f64(double* %a, [4 x <2 x double>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x2x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float64x2x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x2x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x2x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x double>] [[B]].coerce, [4 x <2 x double>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x double> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x double> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x double> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <2 x double> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x double>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x double>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x double>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <2 x double>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.aarch64.neon.ld4lane.v2f64.p0i8(<2 x double> [[TMP12]], <2 x double> [[TMP13]], <2 x double> [[TMP14]], <2 x double> [[TMP15]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x double>, <2 x double>, <2 x double>, <2 x double> }*
+// CHECK:   store { <2 x double>, <2 x double>, <2 x double>, <2 x double> } [[VLD4_LANE]], { <2 x double>, <2 x double>, <2 x double>, <2 x double> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.float64x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.float64x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.float64x2x4_t, %struct.float64x2x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float64x2x4_t [[TMP19]]
 float64x2x4_t test_vld4q_lane_f64(float64_t  *a, float64x2x4_t b) {
-  // CHECK-LABEL: test_vld4q_lane_f64
   return vld4q_lane_f64(a, b, 1);
-  // CHECK: ld4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly8x16x4_t @test_vld4q_lane_p8(i8* %a, [4 x <16 x i8>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK:   [[VLD4_LANE:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4lane.v16i8.p0i8(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i64 15, i8* %a)
+// CHECK:   [[TMP7:%.*]] = bitcast i8* [[TMP2]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4_LANE]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP7]]
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly8x16x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP9:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP8]], i8* [[TMP9]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP10:%.*]] = load %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly8x16x4_t [[TMP10]]
 poly8x16x4_t test_vld4q_lane_p8(poly8_t  *a, poly8x16x4_t b) {
-  // CHECK-LABEL: test_vld4q_lane_p8
   return vld4q_lane_p8(a, b, 15);
-  // CHECK: ld4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly16x8x4_t @test_vld4q_lane_p16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld4lane.v8i16.p0i8(<8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i64 7, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4_LANE]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.poly16x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly16x8x4_t [[TMP19]]
 poly16x8x4_t test_vld4q_lane_p16(poly16_t  *a, poly16x8x4_t b) {
-  // CHECK-LABEL: test_vld4q_lane_p16
   return vld4q_lane_p16(a, b, 7);
-  // CHECK: ld4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly64x2x4_t @test_vld4q_lane_p64(i64* %a, [4 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x2x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly64x2x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x2x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x2x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i64> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x i64> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <2 x i64> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <2 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <2 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <2 x i64>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <2 x i64>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4lane.v2i64.p0i8(<2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], <2 x i64> [[TMP15]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4_LANE]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.poly64x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly64x2x4_t [[TMP19]]
 poly64x2x4_t test_vld4q_lane_p64(poly64_t  *a, poly64x2x4_t b) {
-  // CHECK-LABEL: test_vld4q_lane_p64
   return vld4q_lane_p64(a, b, 1);
-  // CHECK: ld4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint8x8x4_t @test_vld4_lane_u8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK:   [[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i64 7, i8* %a)
+// CHECK:   [[TMP7:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]]
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint8x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP9:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP8]], i8* [[TMP9]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP10:%.*]] = load %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint8x8x4_t [[TMP10]]
 uint8x8x4_t test_vld4_lane_u8(uint8_t  *a, uint8x8x4_t b) {
-  // CHECK-LABEL: test_vld4_lane_u8
   return vld4_lane_u8(a, b, 7);
-  // CHECK: ld4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint16x4x4_t @test_vld4_lane_u16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0i8(<4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.uint16x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint16x4x4_t [[TMP19]]
 uint16x4x4_t test_vld4_lane_u16(uint16_t  *a, uint16x4x4_t b) {
-  // CHECK-LABEL: test_vld4_lane_u16
   return vld4_lane_u16(a, b, 3);
-  // CHECK: ld4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint32x2x4_t @test_vld4_lane_u32(i32* %a, [4 x <2 x i32>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4lane.v2i32.p0i8(<2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.uint32x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint32x2x4_t [[TMP19]]
 uint32x2x4_t test_vld4_lane_u32(uint32_t  *a, uint32x2x4_t b) {
-  // CHECK-LABEL: test_vld4_lane_u32
   return vld4_lane_u32(a, b, 1);
-  // CHECK: ld4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.uint64x1x4_t @test_vld4_lane_u64(i64* %a, [4 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0i8(<1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], <1 x i64> [[TMP15]], i64 0, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.uint64x1x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint64x1x4_t [[TMP19]]
 uint64x1x4_t test_vld4_lane_u64(uint64_t  *a, uint64x1x4_t b) {
-  // CHECK-LABEL: test_vld4_lane_u64
   return vld4_lane_u64(a, b, 0);
-  // CHECK: ld4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int8x8x4_t @test_vld4_lane_s8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK:   [[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i64 7, i8* %a)
+// CHECK:   [[TMP7:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]]
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.int8x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP9:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP8]], i8* [[TMP9]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP10:%.*]] = load %struct.int8x8x4_t, %struct.int8x8x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int8x8x4_t [[TMP10]]
 int8x8x4_t test_vld4_lane_s8(int8_t  *a, int8x8x4_t b) {
-  // CHECK-LABEL: test_vld4_lane_s8
   return vld4_lane_s8(a, b, 7);
-  // CHECK: ld4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int16x4x4_t @test_vld4_lane_s16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0i8(<4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.int16x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.int16x4x4_t, %struct.int16x4x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int16x4x4_t [[TMP19]]
 int16x4x4_t test_vld4_lane_s16(int16_t  *a, int16x4x4_t b) {
-  // CHECK-LABEL: test_vld4_lane_s16
   return vld4_lane_s16(a, b, 3);
-  // CHECK: ld4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int32x2x4_t @test_vld4_lane_s32(i32* %a, [4 x <2 x i32>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld4lane.v2i32.p0i8(<2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.int32x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.int32x2x4_t, %struct.int32x2x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int32x2x4_t [[TMP19]]
 int32x2x4_t test_vld4_lane_s32(int32_t  *a, int32x2x4_t b) {
-  // CHECK-LABEL: test_vld4_lane_s32
   return vld4_lane_s32(a, b, 1);
-  // CHECK: ld4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.int64x1x4_t @test_vld4_lane_s64(i64* %a, [4 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0i8(<1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], <1 x i64> [[TMP15]], i64 0, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.int64x1x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.int64x1x4_t, %struct.int64x1x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int64x1x4_t [[TMP19]]
 int64x1x4_t test_vld4_lane_s64(int64_t  *a, int64x1x4_t b) {
-  // CHECK-LABEL: test_vld4_lane_s64
   return vld4_lane_s64(a, b, 0);
-  // CHECK: ld4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float16x4x4_t @test_vld4_lane_f16(half* %a, [4 x <4 x half>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x half>] [[B]].coerce, [4 x <4 x half>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <4 x half> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0i8(<4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.float16x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.float16x4x4_t, %struct.float16x4x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float16x4x4_t [[TMP19]]
 float16x4x4_t test_vld4_lane_f16(float16_t  *a, float16x4x4_t b) {
-  // CHECK-LABEL: test_vld4_lane_f16
   return vld4_lane_f16(a, b, 3);
-  // CHECK: ld4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float32x2x4_t @test_vld4_lane_f32(float* %a, [4 x <2 x float>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x float>] [[B]].coerce, [4 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <2 x float> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x float>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.aarch64.neon.ld4lane.v2f32.p0i8(<2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], <2 x float> [[TMP15]], i64 1, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <2 x float>, <2 x float>, <2 x float>, <2 x float> }*
+// CHECK:   store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_LANE]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.float32x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.float32x2x4_t, %struct.float32x2x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float32x2x4_t [[TMP19]]
 float32x2x4_t test_vld4_lane_f32(float32_t  *a, float32x2x4_t b) {
-  // CHECK-LABEL: test_vld4_lane_f32
   return vld4_lane_f32(a, b, 1);
-  // CHECK: ld4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.float64x1x4_t @test_vld4_lane_f64(double* %a, [4 x <1 x double>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float64x1x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.float64x1x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x1x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float64x1x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <1 x double>] [[B]].coerce, [4 x <1 x double>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x1x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <1 x double> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <1 x double> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <1 x double> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <1 x double> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x double>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x double>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x double>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x double>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.aarch64.neon.ld4lane.v1f64.p0i8(<1 x double> [[TMP12]], <1 x double> [[TMP13]], <1 x double> [[TMP14]], <1 x double> [[TMP15]], i64 0, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <1 x double>, <1 x double>, <1 x double>, <1 x double> }*
+// CHECK:   store { <1 x double>, <1 x double>, <1 x double>, <1 x double> } [[VLD4_LANE]], { <1 x double>, <1 x double>, <1 x double>, <1 x double> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.float64x1x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.float64x1x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.float64x1x4_t, %struct.float64x1x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float64x1x4_t [[TMP19]]
 float64x1x4_t test_vld4_lane_f64(float64_t  *a, float64x1x4_t b) {
-  // CHECK-LABEL: test_vld4_lane_f64
   return vld4_lane_f64(a, b, 0);
-  // CHECK: ld4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly8x8x4_t @test_vld4_lane_p8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK:   [[VLD4_LANE:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.aarch64.neon.ld4lane.v8i8.p0i8(<8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i64 7, i8* %a)
+// CHECK:   [[TMP7:%.*]] = bitcast i8* [[TMP2]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]]
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly8x8x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP9:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP8]], i8* [[TMP9]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP10:%.*]] = load %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly8x8x4_t [[TMP10]]
 poly8x8x4_t test_vld4_lane_p8(poly8_t  *a, poly8x8x4_t b) {
-  // CHECK-LABEL: test_vld4_lane_p8
   return vld4_lane_p8(a, b, 7);
-  // CHECK: ld4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly16x4x4_t @test_vld4_lane_p16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4lane.v4i16.p0i8(<4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i64 3, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.poly16x4x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly16x4x4_t [[TMP19]]
 poly16x4x4_t test_vld4_lane_p16(poly16_t  *a, poly16x4x4_t b) {
-  // CHECK-LABEL: test_vld4_lane_p16
   return vld4_lane_p16(a, b, 3);
-  // CHECK: ld4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly64x1x4_t @test_vld4_lane_p64(i64* %a, [4 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x1x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.poly64x1x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x1x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x1x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x1x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP10:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
+// CHECK:   [[VLD4_LANE:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4lane.v1i64.p0i8(<1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], <1 x i64> [[TMP15]], i64 0, i8* [[TMP3]])
+// CHECK:   [[TMP16:%.*]] = bitcast i8* [[TMP2]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_LANE]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP16]]
+// CHECK:   [[TMP17:%.*]] = bitcast %struct.poly64x1x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP17]], i8* [[TMP18]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP19:%.*]] = load %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly64x1x4_t [[TMP19]]
 poly64x1x4_t test_vld4_lane_p64(poly64_t  *a, poly64x1x4_t b) {
-  // CHECK-LABEL: test_vld4_lane_p64
   return vld4_lane_p64(a, b, 0);
-  // CHECK: ld4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_lane_u8(i8* %a, <16 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
+// CHECK:   store i8 [[TMP0]], i8* %a
+// CHECK:   ret void
 void test_vst1q_lane_u8(uint8_t  *a, uint8x16_t b) {
-  // CHECK-LABEL: test_vst1q_lane_u8
   vst1q_lane_u8(a, b, 15);
-  // CHECK: st1 {{{ *v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_lane_u16(i16* %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   store i16 [[TMP3]], i16* [[TMP4]]
+// CHECK:   ret void
 void test_vst1q_lane_u16(uint16_t  *a, uint16x8_t b) {
-  // CHECK-LABEL: test_vst1q_lane_u16
   vst1q_lane_u16(a, b, 7);
-  // CHECK: st1 {{{ *v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_lane_u32(i32* %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   store i32 [[TMP3]], i32* [[TMP4]]
+// CHECK:   ret void
 void test_vst1q_lane_u32(uint32_t  *a, uint32x4_t b) {
-  // CHECK-LABEL: test_vst1q_lane_u32
   vst1q_lane_u32(a, b, 3);
-  // CHECK: st1 {{{ *v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_lane_u64(i64* %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   store i64 [[TMP3]], i64* [[TMP4]]
+// CHECK:   ret void
 void test_vst1q_lane_u64(uint64_t  *a, uint64x2_t b) {
-  // CHECK-LABEL: test_vst1q_lane_u64
   vst1q_lane_u64(a, b, 1);
-  // CHECK: st1 {{{ *v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_lane_s8(i8* %a, <16 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
+// CHECK:   store i8 [[TMP0]], i8* %a
+// CHECK:   ret void
 void test_vst1q_lane_s8(int8_t  *a, int8x16_t b) {
-  // CHECK-LABEL: test_vst1q_lane_s8
   vst1q_lane_s8(a, b, 15);
-  // CHECK: st1 {{{ *v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_lane_s16(i16* %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   store i16 [[TMP3]], i16* [[TMP4]]
+// CHECK:   ret void
 void test_vst1q_lane_s16(int16_t  *a, int16x8_t b) {
-  // CHECK-LABEL: test_vst1q_lane_s16
   vst1q_lane_s16(a, b, 7);
-  // CHECK: st1 {{{ *v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_lane_s32(i32* %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   store i32 [[TMP3]], i32* [[TMP4]]
+// CHECK:   ret void
 void test_vst1q_lane_s32(int32_t  *a, int32x4_t b) {
-  // CHECK-LABEL: test_vst1q_lane_s32
   vst1q_lane_s32(a, b, 3);
-  // CHECK: st1 {{{ *v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_lane_s64(i64* %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   store i64 [[TMP3]], i64* [[TMP4]]
+// CHECK:   ret void
 void test_vst1q_lane_s64(int64_t  *a, int64x2_t b) {
-  // CHECK-LABEL: test_vst1q_lane_s64
   vst1q_lane_s64(a, b, 1);
-  // CHECK: st1 {{{ *v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_lane_f16(half* %a, <8 x half> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   store i16 [[TMP3]], i16* [[TMP4]]
+// CHECK:   ret void
 void test_vst1q_lane_f16(float16_t  *a, float16x8_t b) {
-  // CHECK-LABEL: test_vst1q_lane_f16
   vst1q_lane_f16(a, b, 7);
-  // CHECK: st1 {{{ *v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_lane_f32(float* %a, <4 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to float*
+// CHECK:   store float [[TMP3]], float* [[TMP4]]
+// CHECK:   ret void
 void test_vst1q_lane_f32(float32_t  *a, float32x4_t b) {
-  // CHECK-LABEL: test_vst1q_lane_f32
   vst1q_lane_f32(a, b, 3);
-  // CHECK: st1 {{{ *v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_lane_f64(double* %a, <2 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x double>
+// CHECK:   [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to double*
+// CHECK:   store double [[TMP3]], double* [[TMP4]]
+// CHECK:   ret void
 void test_vst1q_lane_f64(float64_t  *a, float64x2_t b) {
-  // CHECK-LABEL: test_vst1q_lane_f64
   vst1q_lane_f64(a, b, 1);
-  // CHECK: st1 {{{ *v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_lane_p8(i8* %a, <16 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
+// CHECK:   store i8 [[TMP0]], i8* %a
+// CHECK:   ret void
 void test_vst1q_lane_p8(poly8_t  *a, poly8x16_t b) {
-  // CHECK-LABEL: test_vst1q_lane_p8
   vst1q_lane_p8(a, b, 15);
-  // CHECK: st1 {{{ *v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_lane_p16(i16* %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   store i16 [[TMP3]], i16* [[TMP4]]
+// CHECK:   ret void
 void test_vst1q_lane_p16(poly16_t  *a, poly16x8_t b) {
-  // CHECK-LABEL: test_vst1q_lane_p16
   vst1q_lane_p16(a, b, 7);
-  // CHECK: st1 {{{ *v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_lane_p64(i64* %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   store i64 [[TMP3]], i64* [[TMP4]]
+// CHECK:   ret void
 void test_vst1q_lane_p64(poly64_t  *a, poly64x2_t b) {
-  // CHECK-LABEL: test_vst1q_lane_p64
   vst1q_lane_p64(a, b, 1);
-  // CHECK: st1 {{{ *v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_lane_u8(i8* %a, <8 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
+// CHECK:   store i8 [[TMP0]], i8* %a
+// CHECK:   ret void
 void test_vst1_lane_u8(uint8_t  *a, uint8x8_t b) {
-  // CHECK-LABEL: test_vst1_lane_u8
   vst1_lane_u8(a, b, 7);
-  // CHECK: st1 {{{ *v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_lane_u16(i16* %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   store i16 [[TMP3]], i16* [[TMP4]]
+// CHECK:   ret void
 void test_vst1_lane_u16(uint16_t  *a, uint16x4_t b) {
-  // CHECK-LABEL: test_vst1_lane_u16
   vst1_lane_u16(a, b, 3);
-  // CHECK: st1 {{{ *v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_lane_u32(i32* %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   store i32 [[TMP3]], i32* [[TMP4]]
+// CHECK:   ret void
 void test_vst1_lane_u32(uint32_t  *a, uint32x2_t b) {
-  // CHECK-LABEL: test_vst1_lane_u32
   vst1_lane_u32(a, b, 1);
-  // CHECK: st1 {{{ *v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_lane_u64(i64* %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   store i64 [[TMP3]], i64* [[TMP4]]
+// CHECK:   ret void
 void test_vst1_lane_u64(uint64_t  *a, uint64x1_t b) {
-  // CHECK-LABEL: test_vst1_lane_u64
   vst1_lane_u64(a, b, 0);
-  // CHECK: st1 {{{ *v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_lane_s8(i8* %a, <8 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
+// CHECK:   store i8 [[TMP0]], i8* %a
+// CHECK:   ret void
 void test_vst1_lane_s8(int8_t  *a, int8x8_t b) {
-  // CHECK-LABEL: test_vst1_lane_s8
   vst1_lane_s8(a, b, 7);
-  // CHECK: st1 {{{ *v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_lane_s16(i16* %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   store i16 [[TMP3]], i16* [[TMP4]]
+// CHECK:   ret void
 void test_vst1_lane_s16(int16_t  *a, int16x4_t b) {
-  // CHECK-LABEL: test_vst1_lane_s16
   vst1_lane_s16(a, b, 3);
-  // CHECK: st1 {{{ *v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_lane_s32(i32* %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   store i32 [[TMP3]], i32* [[TMP4]]
+// CHECK:   ret void
 void test_vst1_lane_s32(int32_t  *a, int32x2_t b) {
-  // CHECK-LABEL: test_vst1_lane_s32
   vst1_lane_s32(a, b, 1);
-  // CHECK: st1 {{{ *v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_lane_s64(i64* %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   store i64 [[TMP3]], i64* [[TMP4]]
+// CHECK:   ret void
 void test_vst1_lane_s64(int64_t  *a, int64x1_t b) {
-  // CHECK-LABEL: test_vst1_lane_s64
   vst1_lane_s64(a, b, 0);
-  // CHECK: st1 {{{ *v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_lane_f16(half* %a, <4 x half> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   store i16 [[TMP3]], i16* [[TMP4]]
+// CHECK:   ret void
 void test_vst1_lane_f16(float16_t  *a, float16x4_t b) {
-  // CHECK-LABEL: test_vst1_lane_f16
   vst1_lane_f16(a, b, 3);
-  // CHECK: st1 {{{ *v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_lane_f32(float* %a, <2 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to float*
+// CHECK:   store float [[TMP3]], float* [[TMP4]]
+// CHECK:   ret void
 void test_vst1_lane_f32(float32_t  *a, float32x2_t b) {
-  // CHECK-LABEL: test_vst1_lane_f32
   vst1_lane_f32(a, b, 1);
-  // CHECK: st1 {{{ *v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_lane_f64(double* %a, <1 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK:   [[TMP3:%.*]] = extractelement <1 x double> [[TMP2]], i32 0
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to double*
+// CHECK:   store double [[TMP3]], double* [[TMP4]]
+// CHECK:   ret void
 void test_vst1_lane_f64(float64_t  *a, float64x1_t b) {
-  // CHECK-LABEL: test_vst1_lane_f64
   vst1_lane_f64(a, b, 0);
-  // CHECK: {{st1 { v[0-9]+.d }\[0]|str d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_lane_p8(i8* %a, <8 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
+// CHECK:   store i8 [[TMP0]], i8* %a
+// CHECK:   ret void
 void test_vst1_lane_p8(poly8_t  *a, poly8x8_t b) {
-  // CHECK-LABEL: test_vst1_lane_p8
   vst1_lane_p8(a, b, 7);
-  // CHECK: st1 {{{ *v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_lane_p16(i16* %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   store i16 [[TMP3]], i16* [[TMP4]]
+// CHECK:   ret void
 void test_vst1_lane_p16(poly16_t  *a, poly16x4_t b) {
-  // CHECK-LABEL: test_vst1_lane_p16
   vst1_lane_p16(a, b, 3);
-  // CHECK: st1 {{{ *v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_lane_p64(i64* %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   store i64 [[TMP3]], i64* [[TMP4]]
+// CHECK:   ret void
 void test_vst1_lane_p64(poly64_t  *a, poly64x1_t b) {
-  // CHECK-LABEL: test_vst1_lane_p64
   vst1_lane_p64(a, b, 0);
-  // CHECK: st1 {{{ *v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2q_lane_u8(i8* %a, [2 x <16 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i64 15, i8* %a)
+// CHECK:   ret void
 void test_vst2q_lane_u8(uint8_t  *a, uint8x16x2_t b) {
-  // CHECK-LABEL: test_vst2q_lane_u8
   vst2q_lane_u8(a, b, 15);
-  // CHECK: st2 {{{ *v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2q_lane_u16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i64 7, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2q_lane_u16(uint16_t  *a, uint16x8x2_t b) {
-  // CHECK-LABEL: test_vst2q_lane_u16
   vst2q_lane_u16(a, b, 7);
-  // CHECK: st2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2q_lane_u32(i32* %a, [2 x <4 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v4i32.p0i8(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2q_lane_u32(uint32_t  *a, uint32x4x2_t b) {
-  // CHECK-LABEL: test_vst2q_lane_u32
   vst2q_lane_u32(a, b, 3);
-  // CHECK: st2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2q_lane_u64(i64* %a, [2 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x2x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x2x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x2_t, %struct.uint64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v2i64.p0i8(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2q_lane_u64(uint64_t  *a, uint64x2x2_t b) {
-  // CHECK-LABEL: test_vst2q_lane_u64
   vst2q_lane_u64(a, b, 1);
-  // CHECK: st2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2q_lane_s8(i8* %a, [2 x <16 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i64 15, i8* %a)
+// CHECK:   ret void
 void test_vst2q_lane_s8(int8_t  *a, int8x16x2_t b) {
-  // CHECK-LABEL: test_vst2q_lane_s8
   vst2q_lane_s8(a, b, 15);
-  // CHECK: st2 {{{ *v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2q_lane_s16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i64 7, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2q_lane_s16(int16_t  *a, int16x8x2_t b) {
-  // CHECK-LABEL: test_vst2q_lane_s16
   vst2q_lane_s16(a, b, 7);
-  // CHECK: st2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2q_lane_s32(i32* %a, [2 x <4 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x i32>] [[B]].coerce, [2 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v4i32.p0i8(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2q_lane_s32(int32_t  *a, int32x4x2_t b) {
-  // CHECK-LABEL: test_vst2q_lane_s32
   vst2q_lane_s32(a, b, 3);
-  // CHECK: st2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2q_lane_s64(i64* %a, [2 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int64x2x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x2x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x2_t, %struct.int64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v2i64.p0i8(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2q_lane_s64(int64_t  *a, int64x2x2_t b) {
-  // CHECK-LABEL: test_vst2q_lane_s64
   vst2q_lane_s64(a, b, 1);
-  // CHECK: st2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2q_lane_f16(half* %a, [2 x <8 x half>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x half>] [[B]].coerce, [2 x <8 x half>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i64 7, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2q_lane_f16(float16_t  *a, float16x8x2_t b) {
-  // CHECK-LABEL: test_vst2q_lane_f16
   vst2q_lane_f16(a, b, 7);
-  // CHECK: st2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2q_lane_f32(float* %a, [2 x <4 x float>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x float>] [[B]].coerce, [2 x <4 x float>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v4f32.p0i8(<4 x float> [[TMP7]], <4 x float> [[TMP8]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2q_lane_f32(float32_t  *a, float32x4x2_t b) {
-  // CHECK-LABEL: test_vst2q_lane_f32
   vst2q_lane_f32(a, b, 3);
-  // CHECK: st2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2q_lane_f64(double* %a, [2 x <2 x double>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float64x2x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x2x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x double>] [[B]].coerce, [2 x <2 x double>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x2_t, %struct.float64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x double>], [2 x <2 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v2f64.p0i8(<2 x double> [[TMP7]], <2 x double> [[TMP8]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2q_lane_f64(float64_t  *a, float64x2x2_t b) {
-  // CHECK-LABEL: test_vst2q_lane_f64
   vst2q_lane_f64(a, b, 1);
-  // CHECK: st2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2q_lane_p8(i8* %a, [2 x <16 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], i64 15, i8* %a)
+// CHECK:   ret void
 void test_vst2q_lane_p8(poly8_t  *a, poly8x16x2_t b) {
-  // CHECK-LABEL: test_vst2q_lane_p8
   vst2q_lane_p8(a, b, 15);
-  // CHECK: st2 {{{ *v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2q_lane_p16(i16* %a, [2 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i16>] [[B]].coerce, [2 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v8i16.p0i8(<8 x i16> [[TMP7]], <8 x i16> [[TMP8]], i64 7, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2q_lane_p16(poly16_t  *a, poly16x8x2_t b) {
-  // CHECK-LABEL: test_vst2q_lane_p16
   vst2q_lane_p16(a, b, 7);
-  // CHECK: st2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2q_lane_p64(i64* %a, [2 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly64x2x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x2x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x i64>] [[B]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v2i64.p0i8(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2q_lane_p64(poly64_t  *a, poly64x2x2_t b) {
-  // CHECK-LABEL: test_vst2q_lane_p64
   vst2q_lane_p64(a, b, 1);
-  // CHECK: st2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2_lane_u8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, i8* %a)
+// CHECK:   ret void
 void test_vst2_lane_u8(uint8_t  *a, uint8x8x2_t b) {
-  // CHECK-LABEL: test_vst2_lane_u8
   vst2_lane_u8(a, b, 7);
-  // CHECK: st2 {{{ *v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2_lane_u16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2_lane_u16(uint16_t  *a, uint16x4x2_t b) {
-  // CHECK-LABEL: test_vst2_lane_u16
   vst2_lane_u16(a, b, 3);
-  // CHECK: st2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2_lane_u32(i32* %a, [2 x <2 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v2i32.p0i8(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2_lane_u32(uint32_t  *a, uint32x2x2_t b) {
-  // CHECK-LABEL: test_vst2_lane_u32
   vst2_lane_u32(a, b, 1);
-  // CHECK: st2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2_lane_u64(i64* %a, [2 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v1i64.p0i8(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64 0, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2_lane_u64(uint64_t  *a, uint64x1x2_t b) {
-  // CHECK-LABEL: test_vst2_lane_u64
   vst2_lane_u64(a, b, 0);
-  // CHECK: st2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2_lane_s8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, i8* %a)
+// CHECK:   ret void
 void test_vst2_lane_s8(int8_t  *a, int8x8x2_t b) {
-  // CHECK-LABEL: test_vst2_lane_s8
   vst2_lane_s8(a, b, 7);
-  // CHECK: st2 {{{ *v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2_lane_s16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2_lane_s16(int16_t  *a, int16x4x2_t b) {
-  // CHECK-LABEL: test_vst2_lane_s16
   vst2_lane_s16(a, b, 3);
-  // CHECK: st2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2_lane_s32(i32* %a, [2 x <2 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x i32>] [[B]].coerce, [2 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v2i32.p0i8(<2 x i32> [[TMP7]], <2 x i32> [[TMP8]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2_lane_s32(int32_t  *a, int32x2x2_t b) {
-  // CHECK-LABEL: test_vst2_lane_s32
   vst2_lane_s32(a, b, 1);
-  // CHECK: st2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2_lane_s64(i64* %a, [2 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v1i64.p0i8(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64 0, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2_lane_s64(int64_t  *a, int64x1x2_t b) {
-  // CHECK-LABEL: test_vst2_lane_s64
   vst2_lane_s64(a, b, 0);
-  // CHECK: st2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2_lane_f16(half* %a, [2 x <4 x half>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x half>] [[B]].coerce, [2 x <4 x half>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2_lane_f16(float16_t  *a, float16x4x2_t b) {
-  // CHECK-LABEL: test_vst2_lane_f16
   vst2_lane_f16(a, b, 3);
-  // CHECK: st2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2_lane_f32(float* %a, [2 x <2 x float>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <2 x float>] [[B]].coerce, [2 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v2f32.p0i8(<2 x float> [[TMP7]], <2 x float> [[TMP8]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2_lane_f32(float32_t  *a, float32x2x2_t b) {
-  // CHECK-LABEL: test_vst2_lane_f32
   vst2_lane_f32(a, b, 1);
-  // CHECK: st2 {{{ *v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2_lane_f64(double* %a, [2 x <1 x double>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x1x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <1 x double>] [[B]].coerce, [2 x <1 x double>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x1x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x2_t, %struct.float64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x double>], [2 x <1 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v1f64.p0i8(<1 x double> [[TMP7]], <1 x double> [[TMP8]], i64 0, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2_lane_f64(float64_t  *a, float64x1x2_t b) {
-  // CHECK-LABEL: test_vst2_lane_f64
   vst2_lane_f64(a, b, 0);
-  // CHECK: st2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2_lane_p8(i8* %a, [2 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], i64 7, i8* %a)
+// CHECK:   ret void
 void test_vst2_lane_p8(poly8_t  *a, poly8x8x2_t b) {
-  // CHECK-LABEL: test_vst2_lane_p8
   vst2_lane_p8(a, b, 7);
-  // CHECK: st2 {{{ *v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2_lane_p16(i16* %a, [2 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <4 x i16>] [[B]].coerce, [2 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v4i16.p0i8(<4 x i16> [[TMP7]], <4 x i16> [[TMP8]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2_lane_p16(poly16_t  *a, poly16x4x2_t b) {
-  // CHECK-LABEL: test_vst2_lane_p16
   vst2_lane_p16(a, b, 3);
-  // CHECK: st2 {{{ *v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2_lane_p64(i64* %a, [2 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly64x1x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x1x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <1 x i64>] [[B]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x1x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st2lane.v1i64.p0i8(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i64 0, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2_lane_p64(poly64_t  *a, poly64x1x2_t b) {
-  // CHECK-LABEL: test_vst2_lane_p64
   vst2_lane_p64(a, b, 0);
-  // CHECK: st2 {{{ *v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3q_lane_u8(i8* %a, [3 x <16 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, i8* %a)
+// CHECK:   ret void
 void test_vst3q_lane_u8(uint8_t  *a, uint8x16x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_u8
   vst3q_lane_u8(a, b, 15);
-  // CHECK: st3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3q_lane_u16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i64 7, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3q_lane_u16(uint16_t  *a, uint16x8x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_u16
   vst3q_lane_u16(a, b, 7);
-  // CHECK: st3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3q_lane_u32(i32* %a, [3 x <4 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v4i32.p0i8(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3q_lane_u32(uint32_t  *a, uint32x4x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_u32
   vst3q_lane_u32(a, b, 3);
-  // CHECK: st3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3q_lane_u64(i64* %a, [3 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x2x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x2x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x2x3_t, %struct.uint64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v2i64.p0i8(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3q_lane_u64(uint64_t  *a, uint64x2x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_u64
   vst3q_lane_u64(a, b, 1);
-  // CHECK: st3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3q_lane_s8(i8* %a, [3 x <16 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, i8* %a)
+// CHECK:   ret void
 void test_vst3q_lane_s8(int8_t  *a, int8x16x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_s8
   vst3q_lane_s8(a, b, 15);
-  // CHECK: st3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3q_lane_s16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i64 7, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3q_lane_s16(int16_t  *a, int16x8x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_s16
   vst3q_lane_s16(a, b, 7);
-  // CHECK: st3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3q_lane_s32(i32* %a, [3 x <4 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x i32>] [[B]].coerce, [3 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v4i32.p0i8(<4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3q_lane_s32(int32_t  *a, int32x4x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_s32
   vst3q_lane_s32(a, b, 3);
-  // CHECK: st3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3q_lane_s64(i64* %a, [3 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int64x2x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x2x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x2x3_t, %struct.int64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v2i64.p0i8(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3q_lane_s64(int64_t  *a, int64x2x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_s64
   vst3q_lane_s64(a, b, 1);
-  // CHECK: st3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3q_lane_f16(half* %a, [3 x <8 x half>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x half>] [[B]].coerce, [3 x <8 x half>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i64 7, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3q_lane_f16(float16_t  *a, float16x8x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_f16
   vst3q_lane_f16(a, b, 7);
-  // CHECK: st3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3q_lane_f32(float* %a, [3 x <4 x float>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x float>] [[B]].coerce, [3 x <4 x float>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v4f32.p0i8(<4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x float> [[TMP11]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3q_lane_f32(float32_t  *a, float32x4x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_f32
   vst3q_lane_f32(a, b, 3);
-  // CHECK: st3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3q_lane_f64(double* %a, [3 x <2 x double>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float64x2x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x2x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x double>] [[B]].coerce, [3 x <2 x double>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float64x2x3_t, %struct.float64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x double>], [3 x <2 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x double>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v2f64.p0i8(<2 x double> [[TMP9]], <2 x double> [[TMP10]], <2 x double> [[TMP11]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3q_lane_f64(float64_t  *a, float64x2x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_f64
   vst3q_lane_f64(a, b, 1);
-  // CHECK: st3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3q_lane_p8(i8* %a, [3 x <16 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i64 15, i8* %a)
+// CHECK:   ret void
 void test_vst3q_lane_p8(poly8_t  *a, poly8x16x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_p8
   vst3q_lane_p8(a, b, 15);
-  // CHECK: st3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3q_lane_p16(i16* %a, [3 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i16>] [[B]].coerce, [3 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v8i16.p0i8(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], i64 7, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3q_lane_p16(poly16_t  *a, poly16x8x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_p16
   vst3q_lane_p16(a, b, 7);
-  // CHECK: st3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3q_lane_p64(i64* %a, [3 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly64x2x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x2x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x i64>] [[B]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v2i64.p0i8(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3q_lane_p64(poly64_t  *a, poly64x2x3_t b) {
-  // CHECK-LABEL: test_vst3q_lane_p64
   vst3q_lane_p64(a, b, 1);
-  // CHECK: st3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3_lane_u8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* %a)
+// CHECK:   ret void
 void test_vst3_lane_u8(uint8_t  *a, uint8x8x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_u8
   vst3_lane_u8(a, b, 7);
-  // CHECK: st3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3_lane_u16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3_lane_u16(uint16_t  *a, uint16x4x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_u16
   vst3_lane_u16(a, b, 3);
-  // CHECK: st3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3_lane_u32(i32* %a, [3 x <2 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v2i32.p0i8(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3_lane_u32(uint32_t  *a, uint32x2x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_u32
   vst3_lane_u32(a, b, 1);
-  // CHECK: st3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3_lane_u64(i64* %a, [3 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v1i64.p0i8(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64 0, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3_lane_u64(uint64_t  *a, uint64x1x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_u64
   vst3_lane_u64(a, b, 0);
-  // CHECK: st3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3_lane_s8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* %a)
+// CHECK:   ret void
 void test_vst3_lane_s8(int8_t  *a, int8x8x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_s8
   vst3_lane_s8(a, b, 7);
-  // CHECK: st3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3_lane_s16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3_lane_s16(int16_t  *a, int16x4x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_s16
   vst3_lane_s16(a, b, 3);
-  // CHECK: st3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3_lane_s32(i32* %a, [3 x <2 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x i32>] [[B]].coerce, [3 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v2i32.p0i8(<2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3_lane_s32(int32_t  *a, int32x2x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_s32
   vst3_lane_s32(a, b, 1);
-  // CHECK: st3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3_lane_s64(i64* %a, [3 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v1i64.p0i8(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64 0, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3_lane_s64(int64_t  *a, int64x1x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_s64
   vst3_lane_s64(a, b, 0);
-  // CHECK: st3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3_lane_f16(half* %a, [3 x <4 x half>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x half>] [[B]].coerce, [3 x <4 x half>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3_lane_f16(float16_t  *a, float16x4x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_f16
   vst3_lane_f16(a, b, 3);
-  // CHECK: st3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3_lane_f32(float* %a, [3 x <2 x float>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <2 x float>] [[B]].coerce, [3 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v2f32.p0i8(<2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x float> [[TMP11]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3_lane_f32(float32_t  *a, float32x2x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_f32
   vst3_lane_f32(a, b, 1);
-  // CHECK: st3 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3_lane_f64(double* %a, [3 x <1 x double>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float64x1x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x1x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <1 x double>] [[B]].coerce, [3 x <1 x double>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x1x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float64x1x3_t, %struct.float64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x double>], [3 x <1 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <1 x double> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v1f64.p0i8(<1 x double> [[TMP9]], <1 x double> [[TMP10]], <1 x double> [[TMP11]], i64 0, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3_lane_f64(float64_t  *a, float64x1x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_f64
   vst3_lane_f64(a, b, 0);
-  // CHECK: st3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3_lane_p8(i8* %a, [3 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i64 7, i8* %a)
+// CHECK:   ret void
 void test_vst3_lane_p8(poly8_t  *a, poly8x8x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_p8
   vst3_lane_p8(a, b, 7);
-  // CHECK: st3 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3_lane_p16(i16* %a, [3 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <4 x i16>] [[B]].coerce, [3 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v4i16.p0i8(<4 x i16> [[TMP9]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3_lane_p16(poly16_t  *a, poly16x4x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_p16
   vst3_lane_p16(a, b, 3);
-  // CHECK: st3 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3_lane_p64(i64* %a, [3 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly64x1x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x1x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <1 x i64>] [[B]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x1x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st3lane.v1i64.p0i8(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i64 0, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3_lane_p64(poly64_t  *a, poly64x1x3_t b) {
-  // CHECK-LABEL: test_vst3_lane_p64
   vst3_lane_p64(a, b, 0);
-  // CHECK: st3 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4q_lane_u8(i8* %a, [4 x <16 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, i8* %a)
+// CHECK:   ret void
 void test_vst4q_lane_u8(uint8_t  *a, uint8x16x4_t b) {
-  // CHECK-LABEL: test_vst4q_lane_u8
   vst4q_lane_u8(a, b, 15);
-  // CHECK: st4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4q_lane_u16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v8i16.p0i8(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i64 7, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4q_lane_u16(uint16_t  *a, uint16x8x4_t b) {
-  // CHECK-LABEL: test_vst4q_lane_u16
   vst4q_lane_u16(a, b, 7);
-  // CHECK: st4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4q_lane_u32(i32* %a, [4 x <4 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v4i32.p0i8(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4q_lane_u32(uint32_t  *a, uint32x4x4_t b) {
-  // CHECK-LABEL: test_vst4q_lane_u32
   vst4q_lane_u32(a, b, 3);
-  // CHECK: st4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4q_lane_u64(i64* %a, [4 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x2x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x2x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x2x4_t, %struct.uint64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v2i64.p0i8(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4q_lane_u64(uint64_t  *a, uint64x2x4_t b) {
-  // CHECK-LABEL: test_vst4q_lane_u64
   vst4q_lane_u64(a, b, 1);
-  // CHECK: st4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4q_lane_s8(i8* %a, [4 x <16 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, i8* %a)
+// CHECK:   ret void
 void test_vst4q_lane_s8(int8_t  *a, int8x16x4_t b) {
-  // CHECK-LABEL: test_vst4q_lane_s8
   vst4q_lane_s8(a, b, 15);
-  // CHECK: st4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4q_lane_s16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v8i16.p0i8(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i64 7, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4q_lane_s16(int16_t  *a, int16x8x4_t b) {
-  // CHECK-LABEL: test_vst4q_lane_s16
   vst4q_lane_s16(a, b, 7);
-  // CHECK: st4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4q_lane_s32(i32* %a, [4 x <4 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x i32>] [[B]].coerce, [4 x <4 x i32>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i32> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v4i32.p0i8(<4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4q_lane_s32(int32_t  *a, int32x4x4_t b) {
-  // CHECK-LABEL: test_vst4q_lane_s32
   vst4q_lane_s32(a, b, 3);
-  // CHECK: st4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4q_lane_s64(i64* %a, [4 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int64x2x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x2x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int64x2x4_t, %struct.int64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v2i64.p0i8(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4q_lane_s64(int64_t  *a, int64x2x4_t b) {
-  // CHECK-LABEL: test_vst4q_lane_s64
   vst4q_lane_s64(a, b, 1);
-  // CHECK: st4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4q_lane_f16(half* %a, [4 x <8 x half>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x half>] [[B]].coerce, [4 x <8 x half>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x half> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v8i16.p0i8(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i64 7, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4q_lane_f16(float16_t  *a, float16x8x4_t b) {
-  // CHECK-LABEL: test_vst4q_lane_f16
   vst4q_lane_f16(a, b, 7);
-  // CHECK: st4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4q_lane_f32(float* %a, [4 x <4 x float>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x float>] [[B]].coerce, [4 x <4 x float>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x float> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <4 x float>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x float>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v4f32.p0i8(<4 x float> [[TMP11]], <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4q_lane_f32(float32_t  *a, float32x4x4_t b) {
-  // CHECK-LABEL: test_vst4q_lane_f32
   vst4q_lane_f32(a, b, 3);
-  // CHECK: st4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4q_lane_f64(double* %a, [4 x <2 x double>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float64x2x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x2x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x double>] [[B]].coerce, [4 x <2 x double>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x double> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x double> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x double> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float64x2x4_t, %struct.float64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <2 x double>, <2 x double>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x double> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x double>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x double>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x double>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v2f64.p0i8(<2 x double> [[TMP11]], <2 x double> [[TMP12]], <2 x double> [[TMP13]], <2 x double> [[TMP14]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4q_lane_f64(float64_t  *a, float64x2x4_t b) {
-  // CHECK-LABEL: test_vst4q_lane_f64
   vst4q_lane_f64(a, b, 1);
-  // CHECK: st4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4q_lane_p8(i8* %a, [4 x <16 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v16i8.p0i8(<16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i64 15, i8* %a)
+// CHECK:   ret void
 void test_vst4q_lane_p8(poly8_t  *a, poly8x16x4_t b) {
-  // CHECK-LABEL: test_vst4q_lane_p8
   vst4q_lane_p8(a, b, 15);
-  // CHECK: st4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[15], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4q_lane_p16(i16* %a, [4 x <8 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i16>] [[B]].coerce, [4 x <8 x i16>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v8i16.p0i8(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], i64 7, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4q_lane_p16(poly16_t  *a, poly16x8x4_t b) {
-  // CHECK-LABEL: test_vst4q_lane_p16
   vst4q_lane_p16(a, b, 7);
-  // CHECK: st4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4q_lane_p64(i64* %a, [4 x <2 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly64x2x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x2x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x i64>] [[B]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v2i64.p0i8(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4q_lane_p64(poly64_t  *a, poly64x2x4_t b) {
-  // CHECK-LABEL: test_vst4q_lane_p64
   vst4q_lane_p64(a, b, 1);
-  // CHECK: st4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4_lane_u8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* %a)
+// CHECK:   ret void
 void test_vst4_lane_u8(uint8_t  *a, uint8x8x4_t b) {
-  // CHECK-LABEL: test_vst4_lane_u8
   vst4_lane_u8(a, b, 7);
-  // CHECK: st4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4_lane_u16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4_lane_u16(uint16_t  *a, uint16x4x4_t b) {
-  // CHECK-LABEL: test_vst4_lane_u16
   vst4_lane_u16(a, b, 3);
-  // CHECK: st4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4_lane_u32(i32* %a, [4 x <2 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v2i32.p0i8(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4_lane_u32(uint32_t  *a, uint32x2x4_t b) {
-  // CHECK-LABEL: test_vst4_lane_u32
   vst4_lane_u32(a, b, 1);
-  // CHECK: st4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4_lane_u64(i64* %a, [4 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v1i64.p0i8(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i64 0, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4_lane_u64(uint64_t  *a, uint64x1x4_t b) {
-  // CHECK-LABEL: test_vst4_lane_u64
   vst4_lane_u64(a, b, 0);
-  // CHECK: st4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4_lane_s8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* %a)
+// CHECK:   ret void
 void test_vst4_lane_s8(int8_t  *a, int8x8x4_t b) {
-  // CHECK-LABEL: test_vst4_lane_s8
   vst4_lane_s8(a, b, 7);
-  // CHECK: st4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4_lane_s16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4_lane_s16(int16_t  *a, int16x4x4_t b) {
-  // CHECK-LABEL: test_vst4_lane_s16
   vst4_lane_s16(a, b, 3);
-  // CHECK: st4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4_lane_s32(i32* %a, [4 x <2 x i32>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x i32>] [[B]].coerce, [4 x <2 x i32>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v2i32.p0i8(<2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4_lane_s32(int32_t  *a, int32x2x4_t b) {
-  // CHECK-LABEL: test_vst4_lane_s32
   vst4_lane_s32(a, b, 1);
-  // CHECK: st4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4_lane_s64(i64* %a, [4 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v1i64.p0i8(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i64 0, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4_lane_s64(int64_t  *a, int64x1x4_t b) {
-  // CHECK-LABEL: test_vst4_lane_s64
   vst4_lane_s64(a, b, 0);
-  // CHECK: st4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4_lane_f16(half* %a, [4 x <4 x half>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x half>] [[B]].coerce, [4 x <4 x half>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x half> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4_lane_f16(float16_t  *a, float16x4x4_t b) {
-  // CHECK-LABEL: test_vst4_lane_f16
   vst4_lane_f16(a, b, 3);
-  // CHECK: st4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4_lane_f32(float* %a, [4 x <2 x float>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <2 x float>] [[B]].coerce, [4 x <2 x float>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x float> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x float> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <2 x float>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v2f32.p0i8(<2 x float> [[TMP11]], <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], i64 1, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4_lane_f32(float32_t  *a, float32x2x4_t b) {
-  // CHECK-LABEL: test_vst4_lane_f32
   vst4_lane_f32(a, b, 1);
-  // CHECK: st4 {{{ *v[0-9]+.s, v[0-9]+.s, v[0-9]+.s, v[0-9]+.s *}}}[1], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4_lane_f64(double* %a, [4 x <1 x double>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float64x1x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float64x1x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <1 x double>] [[B]].coerce, [4 x <1 x double>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float64x1x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float64x1x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast double* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x double> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x double> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <1 x double> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float64x1x4_t, %struct.float64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x double>], [4 x <1 x double>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <1 x double>, <1 x double>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <1 x double> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x double>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x double>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v1f64.p0i8(<1 x double> [[TMP11]], <1 x double> [[TMP12]], <1 x double> [[TMP13]], <1 x double> [[TMP14]], i64 0, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4_lane_f64(float64_t  *a, float64x1x4_t b) {
-  // CHECK-LABEL: test_vst4_lane_f64
   vst4_lane_f64(a, b, 0);
-  // CHECK: st4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4_lane_p8(i8* %a, [4 x <8 x i8>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v8i8.p0i8(<8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i64 7, i8* %a)
+// CHECK:   ret void
 void test_vst4_lane_p8(poly8_t  *a, poly8x8x4_t b) {
-  // CHECK-LABEL: test_vst4_lane_p8
   vst4_lane_p8(a, b, 7);
-  // CHECK: st4 {{{ *v[0-9]+.b, v[0-9]+.b, v[0-9]+.b, v[0-9]+.b *}}}[7], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4_lane_p16(i16* %a, [4 x <4 x i16>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <4 x i16>] [[B]].coerce, [4 x <4 x i16>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v4i16.p0i8(<4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], i64 3, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4_lane_p16(poly16_t  *a, poly16x4x4_t b) {
-  // CHECK-LABEL: test_vst4_lane_p16
   vst4_lane_p16(a, b, 3);
-  // CHECK: st4 {{{ *v[0-9]+.h, v[0-9]+.h, v[0-9]+.h, v[0-9]+.h *}}}[3], [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4_lane_p64(i64* %a, [4 x <1 x i64>] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly64x1x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x1x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <1 x i64>] [[B]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x1x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st4lane.v1i64.p0i8(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i64 0, i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4_lane_p64(poly64_t  *a, poly64x1x4_t b) {
-  // CHECK-LABEL: test_vst4_lane_p64
   vst4_lane_p64(a, b, 0);
-  // CHECK: st4 {{{ *v[0-9]+.d, v[0-9]+.d, v[0-9]+.d, v[0-9]+.d *}}}[0], [{{x[0-9]+|sp}}]
 }
diff --git a/test/CodeGen/aarch64-neon-misc.c b/test/CodeGen/aarch64-neon-misc.c
index a2511976a4fab..4ecf562a5d29d 100644
--- a/test/CodeGen/aarch64-neon-misc.c
+++ b/test/CodeGen/aarch64-neon-misc.c
@@ -1,2041 +1,3068 @@
-// REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN:   -ffp-contract=fast -S -O3 -o - %s | FileCheck %s
+// RUN:  -fallow-half-arguments-and-returns -emit-llvm -o - %s \
+// RUN: | opt -S -mem2reg | FileCheck %s
 
 // Test new aarch64 intrinsics and types
 
 #include <arm_neon.h>
 
-// CHECK-LABEL: test_vceqz_s8
-// CHECK: cmeq  {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x0|0}}
+// CHECK-LABEL: define <8 x i8> @test_vceqz_s8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = icmp eq <8 x i8> %a, zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VCEQZ_I]]
 uint8x8_t test_vceqz_s8(int8x8_t a) {
   return vceqz_s8(a);
 }
 
-// CHECK-LABEL: test_vceqz_s16
-// CHECK: cmeq  {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #{{0x0|0}}
+// CHECK-LABEL: define <4 x i16> @test_vceqz_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP2:%.*]] = icmp eq <4 x i16> [[TMP1]], zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VCEQZ_I]]
 uint16x4_t test_vceqz_s16(int16x4_t a) {
   return vceqz_s16(a);
 }
 
-// CHECK-LABEL: test_vceqz_s32
-// CHECK: cmeq  {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0x0|0}}
+// CHECK-LABEL: define <2 x i32> @test_vceqz_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP2:%.*]] = icmp eq <2 x i32> [[TMP1]], zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VCEQZ_I]]
 uint32x2_t test_vceqz_s32(int32x2_t a) {
   return vceqz_s32(a);
 }
 
-// CHECK-LABEL: test_vceqz_s64
-// CHECK: cmeq {{d[0-9]+}}, {{d[0-9]+}}, #{{0x0|0}}
+// CHECK-LABEL: define <1 x i64> @test_vceqz_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[TMP2:%.*]] = icmp eq <1 x i64> [[TMP1]], zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[VCEQZ_I]]
 uint64x1_t test_vceqz_s64(int64x1_t a) {
   return vceqz_s64(a);
 }
 
-// CHECK-LABEL: test_vceqz_u64
-// CHECK: cmeq {{d[0-9]+}}, {{d[0-9]+}}, #{{0x0|0}}
+// CHECK-LABEL: define <1 x i64> @test_vceqz_u64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[TMP2:%.*]] = icmp eq <1 x i64> [[TMP1]], zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[VCEQZ_I]]
 uint64x1_t test_vceqz_u64(uint64x1_t a) {
   return vceqz_u64(a);
 }
 
-// CHECK-LABEL: test_vceqz_p64
-// CHECK: cmeq {{d[0-9]+}}, {{d[0-9]+}}, #{{0x0|0}}
+// CHECK-LABEL: define <1 x i64> @test_vceqz_p64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[TMP2:%.*]] = icmp eq <1 x i64> [[TMP1]], zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[VCEQZ_I]]
 uint64x1_t test_vceqz_p64(poly64x1_t a) {
   return vceqz_p64(a);
 }
 
-// CHECK-LABEL: test_vceqzq_s8
-// CHECK: cmeq  {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x0|0}}
+// CHECK-LABEL: define <16 x i8> @test_vceqzq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = icmp eq <16 x i8> %a, zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[VCEQZ_I]]
 uint8x16_t test_vceqzq_s8(int8x16_t a) {
   return vceqzq_s8(a);
 }
 
-// CHECK-LABEL: test_vceqzq_s16
-// CHECK: cmeq  {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #{{0x0|0}}
+// CHECK-LABEL: define <8 x i16> @test_vceqzq_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP2:%.*]] = icmp eq <8 x i16> [[TMP1]], zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[VCEQZ_I]]
 uint16x8_t test_vceqzq_s16(int16x8_t a) {
   return vceqzq_s16(a);
 }
 
-// CHECK-LABEL: test_vceqzq_s32
-// CHECK: cmeq  {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0x0|0}}
+// CHECK-LABEL: define <4 x i32> @test_vceqzq_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP2:%.*]] = icmp eq <4 x i32> [[TMP1]], zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VCEQZ_I]]
 uint32x4_t test_vceqzq_s32(int32x4_t a) {
   return vceqzq_s32(a);
 }
 
-// CHECK-LABEL: test_vceqzq_s64
-// CHECK: cmeq  {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0x0|0}}
+// CHECK-LABEL: define <2 x i64> @test_vceqzq_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP2:%.*]] = icmp eq <2 x i64> [[TMP1]], zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VCEQZ_I]]
 uint64x2_t test_vceqzq_s64(int64x2_t a) {
   return vceqzq_s64(a);
 }
 
-// CHECK-LABEL: test_vceqz_u8
-// CHECK: cmeq  {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x0|0}}
+// CHECK-LABEL: define <8 x i8> @test_vceqz_u8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = icmp eq <8 x i8> %a, zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VCEQZ_I]]
 uint8x8_t test_vceqz_u8(uint8x8_t a) {
   return vceqz_u8(a);
 }
 
-// CHECK-LABEL: test_vceqz_u16
-// CHECK: cmeq  {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #{{0x0|0}}
+// CHECK-LABEL: define <4 x i16> @test_vceqz_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP2:%.*]] = icmp eq <4 x i16> [[TMP1]], zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VCEQZ_I]]
 uint16x4_t test_vceqz_u16(uint16x4_t a) {
   return vceqz_u16(a);
 }
 
-// CHECK-LABEL: test_vceqz_u32
-// CHECK: cmeq  {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0x0|0}}
+// CHECK-LABEL: define <2 x i32> @test_vceqz_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP2:%.*]] = icmp eq <2 x i32> [[TMP1]], zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VCEQZ_I]]
 uint32x2_t test_vceqz_u32(uint32x2_t a) {
   return vceqz_u32(a);
 }
 
-// CHECK-LABEL: test_vceqzq_u8
-// CHECK: cmeq  {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x0|0}}
+// CHECK-LABEL: define <16 x i8> @test_vceqzq_u8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = icmp eq <16 x i8> %a, zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[VCEQZ_I]]
 uint8x16_t test_vceqzq_u8(uint8x16_t a) {
   return vceqzq_u8(a);
 }
 
-// CHECK-LABEL: test_vceqzq_u16
-// CHECK: cmeq  {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #{{0x0|0}}
+// CHECK-LABEL: define <8 x i16> @test_vceqzq_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP2:%.*]] = icmp eq <8 x i16> [[TMP1]], zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[VCEQZ_I]]
 uint16x8_t test_vceqzq_u16(uint16x8_t a) {
   return vceqzq_u16(a);
 }
 
-// CHECK-LABEL: test_vceqzq_u32
-// CHECK: cmeq  {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0x0|0}}
+// CHECK-LABEL: define <4 x i32> @test_vceqzq_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP2:%.*]] = icmp eq <4 x i32> [[TMP1]], zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VCEQZ_I]]
 uint32x4_t test_vceqzq_u32(uint32x4_t a) {
   return vceqzq_u32(a);
 }
 
-// CHECK-LABEL: test_vceqzq_u64
-// CHECK: cmeq  {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0x0|0}}
+// CHECK-LABEL: define <2 x i64> @test_vceqzq_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP2:%.*]] = icmp eq <2 x i64> [[TMP1]], zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VCEQZ_I]]
 uint64x2_t test_vceqzq_u64(uint64x2_t a) {
   return vceqzq_u64(a);
 }
 
-// CHECK-LABEL: test_vceqz_f32
-// CHECK: fcmeq  {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0
+// CHECK-LABEL: define <2 x i32> @test_vceqz_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[TMP2:%.*]] = fcmp oeq <2 x float> [[TMP1]], zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VCEQZ_I]]
 uint32x2_t test_vceqz_f32(float32x2_t a) {
   return vceqz_f32(a);
 }
 
-// CHECK-LABEL: test_vceqz_f64
-// CHECK: fcmeq  {{d[0-9]+}}, {{d[0-9]+}}, #0
+// CHECK-LABEL: define <1 x i64> @test_vceqz_f64(<1 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[TMP2:%.*]] = fcmp oeq <1 x double> [[TMP1]], zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[VCEQZ_I]]
 uint64x1_t test_vceqz_f64(float64x1_t a) {
   return vceqz_f64(a);
 }
 
-// CHECK-LABEL: test_vceqzq_f32
-// CHECK: fcmeq  {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0
+// CHECK-LABEL: define <4 x i32> @test_vceqzq_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[TMP2:%.*]] = fcmp oeq <4 x float> [[TMP1]], zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VCEQZ_I]]
 uint32x4_t test_vceqzq_f32(float32x4_t a) {
   return vceqzq_f32(a);
 }
 
-// CHECK-LABEL: test_vceqz_p8
-// CHECK: cmeq  {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x0|0}}
+// CHECK-LABEL: define <8 x i8> @test_vceqz_p8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = icmp eq <8 x i8> %a, zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VCEQZ_I]]
 uint8x8_t test_vceqz_p8(poly8x8_t a) {
   return vceqz_p8(a);
 }
 
-// CHECK-LABEL: test_vceqzq_p8
-// CHECK: cmeq  {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x0|0}}
+// CHECK-LABEL: define <16 x i8> @test_vceqzq_p8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = icmp eq <16 x i8> %a, zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[VCEQZ_I]]
 uint8x16_t test_vceqzq_p8(poly8x16_t a) {
   return vceqzq_p8(a);
 }
 
-// CHECK-LABEL: test_vceqz_p16
-// CHECK: cmeq  {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #{{0x0|0}}
+// CHECK-LABEL: define <4 x i16> @test_vceqz_p16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP2:%.*]] = icmp eq <4 x i16> [[TMP1]], zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VCEQZ_I]]
 uint16x4_t test_vceqz_p16(poly16x4_t a) {
   return vceqz_p16(a);
 }
 
-// CHECK-LABEL: test_vceqzq_p16
-// CHECK: cmeq  {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #{{0x0|0}}
+// CHECK-LABEL: define <8 x i16> @test_vceqzq_p16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP2:%.*]] = icmp eq <8 x i16> [[TMP1]], zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[VCEQZ_I]]
 uint16x8_t test_vceqzq_p16(poly16x8_t a) {
   return vceqzq_p16(a);
 }
 
-// CHECK-LABEL: test_vceqzq_f64
-// CHECK: fcmeq  {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0
+// CHECK-LABEL: define <2 x i64> @test_vceqzq_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[TMP2:%.*]] = fcmp oeq <2 x double> [[TMP1]], zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VCEQZ_I]]
 uint64x2_t test_vceqzq_f64(float64x2_t a) {
   return vceqzq_f64(a);
 }
 
-// CHECK-LABEL: test_vceqzq_p64
-// CHECK: cmeq  {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0
+// CHECK-LABEL: define <2 x i64> @test_vceqzq_p64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP2:%.*]] = icmp eq <2 x i64> [[TMP1]], zeroinitializer
+// CHECK:   [[VCEQZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VCEQZ_I]]
 uint64x2_t test_vceqzq_p64(poly64x2_t a) {
   return vceqzq_p64(a);
 }
 
-// CHECK-LABEL: test_vcgez_s8
-// CHECK: cmge  {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x0|0}}
+// CHECK-LABEL: define <8 x i8> @test_vcgez_s8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = icmp sge <8 x i8> %a, zeroinitializer
+// CHECK:   [[VCGEZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VCGEZ_I]]
 uint8x8_t test_vcgez_s8(int8x8_t a) {
   return vcgez_s8(a);
 }
 
-// CHECK-LABEL: test_vcgez_s16
-// CHECK: cmge  {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #{{0x0|0}}
+// CHECK-LABEL: define <4 x i16> @test_vcgez_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP2:%.*]] = icmp sge <4 x i16> [[TMP1]], zeroinitializer
+// CHECK:   [[VCGEZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VCGEZ_I]]
 uint16x4_t test_vcgez_s16(int16x4_t a) {
   return vcgez_s16(a);
 }
 
-// CHECK-LABEL: test_vcgez_s32
-// CHECK: cmge  {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0x0|0}}
+// CHECK-LABEL: define <2 x i32> @test_vcgez_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP2:%.*]] = icmp sge <2 x i32> [[TMP1]], zeroinitializer
+// CHECK:   [[VCGEZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VCGEZ_I]]
 uint32x2_t test_vcgez_s32(int32x2_t a) {
   return vcgez_s32(a);
 }
 
-// CHECK-LABEL: test_vcgez_s64
-// CHECK: cmge {{d[0-9]+}}, {{d[0-9]+}}, #{{0x0|0}}
+// CHECK-LABEL: define <1 x i64> @test_vcgez_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[TMP2:%.*]] = icmp sge <1 x i64> [[TMP1]], zeroinitializer
+// CHECK:   [[VCGEZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[VCGEZ_I]]
 uint64x1_t test_vcgez_s64(int64x1_t a) {
   return vcgez_s64(a);
 }
 
-// CHECK-LABEL: test_vcgezq_s8
-// CHECK: cmge  {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x0|0}}
+// CHECK-LABEL: define <16 x i8> @test_vcgezq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = icmp sge <16 x i8> %a, zeroinitializer
+// CHECK:   [[VCGEZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[VCGEZ_I]]
 uint8x16_t test_vcgezq_s8(int8x16_t a) {
   return vcgezq_s8(a);
 }
 
-// CHECK-LABEL: test_vcgezq_s16
-// CHECK: cmge  {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #{{0x0|0}}
+// CHECK-LABEL: define <8 x i16> @test_vcgezq_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP2:%.*]] = icmp sge <8 x i16> [[TMP1]], zeroinitializer
+// CHECK:   [[VCGEZ_I:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[VCGEZ_I]]
 uint16x8_t test_vcgezq_s16(int16x8_t a) {
   return vcgezq_s16(a);
 }
 
-// CHECK-LABEL: test_vcgezq_s32
-// CHECK: cmge  {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0x0|0}}
+// CHECK-LABEL: define <4 x i32> @test_vcgezq_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP2:%.*]] = icmp sge <4 x i32> [[TMP1]], zeroinitializer
+// CHECK:   [[VCGEZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VCGEZ_I]]
 uint32x4_t test_vcgezq_s32(int32x4_t a) {
   return vcgezq_s32(a);
 }
 
-// CHECK-LABEL: test_vcgezq_s64
-// CHECK: cmge  {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0x0|0}}
+// CHECK-LABEL: define <2 x i64> @test_vcgezq_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP2:%.*]] = icmp sge <2 x i64> [[TMP1]], zeroinitializer
+// CHECK:   [[VCGEZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VCGEZ_I]]
 uint64x2_t test_vcgezq_s64(int64x2_t a) {
   return vcgezq_s64(a);
 }
 
-// CHECK-LABEL: test_vcgez_f32
-// CHECK: fcmge  {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0
+// CHECK-LABEL: define <2 x i32> @test_vcgez_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[TMP2:%.*]] = fcmp oge <2 x float> [[TMP1]], zeroinitializer
+// CHECK:   [[VCGEZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VCGEZ_I]]
 uint32x2_t test_vcgez_f32(float32x2_t a) {
   return vcgez_f32(a);
 }
 
-// CHECK-LABEL: test_vcgez_f64
-// CHECK: fcmge  {{d[0-9]+}}, {{d[0-9]+}}, #0
+// CHECK-LABEL: define <1 x i64> @test_vcgez_f64(<1 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[TMP2:%.*]] = fcmp oge <1 x double> [[TMP1]], zeroinitializer
+// CHECK:   [[VCGEZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[VCGEZ_I]]
 uint64x1_t test_vcgez_f64(float64x1_t a) {
   return vcgez_f64(a);
 }
 
-// CHECK-LABEL: test_vcgezq_f32
-// CHECK: fcmge  {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0
+// CHECK-LABEL: define <4 x i32> @test_vcgezq_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[TMP2:%.*]] = fcmp oge <4 x float> [[TMP1]], zeroinitializer
+// CHECK:   [[VCGEZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VCGEZ_I]]
 uint32x4_t test_vcgezq_f32(float32x4_t a) {
   return vcgezq_f32(a);
 }
 
-// CHECK-LABEL: test_vcgezq_f64
-// CHECK: fcmge  {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0
+// CHECK-LABEL: define <2 x i64> @test_vcgezq_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[TMP2:%.*]] = fcmp oge <2 x double> [[TMP1]], zeroinitializer
+// CHECK:   [[VCGEZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VCGEZ_I]]
 uint64x2_t test_vcgezq_f64(float64x2_t a) {
   return vcgezq_f64(a);
 }
 
-// CHECK-LABEL: test_vclez_s8
-// CHECK: cmle  {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x0|0}}
+// CHECK-LABEL: define <8 x i8> @test_vclez_s8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = icmp sle <8 x i8> %a, zeroinitializer
+// CHECK:   [[VCLEZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VCLEZ_I]]
 uint8x8_t test_vclez_s8(int8x8_t a) {
   return vclez_s8(a);
 }
 
-// CHECK-LABEL: test_vclez_s16
-// CHECK: cmle  {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #{{0x0|0}}
+// CHECK-LABEL: define <4 x i16> @test_vclez_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP2:%.*]] = icmp sle <4 x i16> [[TMP1]], zeroinitializer
+// CHECK:   [[VCLEZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VCLEZ_I]]
 uint16x4_t test_vclez_s16(int16x4_t a) {
   return vclez_s16(a);
 }
 
-// CHECK-LABEL: test_vclez_s32
-// CHECK: cmle  {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0x0|0}}
+// CHECK-LABEL: define <2 x i32> @test_vclez_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP2:%.*]] = icmp sle <2 x i32> [[TMP1]], zeroinitializer
+// CHECK:   [[VCLEZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VCLEZ_I]]
 uint32x2_t test_vclez_s32(int32x2_t a) {
   return vclez_s32(a);
 }
 
-// CHECK-LABEL: test_vclez_s64
-// CHECK: cmle {{d[0-9]+}}, {{d[0-9]+}}, #{{0x0|0}}
+// CHECK-LABEL: define <1 x i64> @test_vclez_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[TMP2:%.*]] = icmp sle <1 x i64> [[TMP1]], zeroinitializer
+// CHECK:   [[VCLEZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[VCLEZ_I]]
 uint64x1_t test_vclez_s64(int64x1_t a) {
   return vclez_s64(a);
 }
 
-// CHECK-LABEL: test_vclezq_s8
-// CHECK: cmle  {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x0|0}}
+// CHECK-LABEL: define <16 x i8> @test_vclezq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = icmp sle <16 x i8> %a, zeroinitializer
+// CHECK:   [[VCLEZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[VCLEZ_I]]
 uint8x16_t test_vclezq_s8(int8x16_t a) {
   return vclezq_s8(a);
 }
 
-// CHECK-LABEL: test_vclezq_s16
-// CHECK: cmle  {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #{{0x0|0}}
+// CHECK-LABEL: define <8 x i16> @test_vclezq_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP2:%.*]] = icmp sle <8 x i16> [[TMP1]], zeroinitializer
+// CHECK:   [[VCLEZ_I:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[VCLEZ_I]]
 uint16x8_t test_vclezq_s16(int16x8_t a) {
   return vclezq_s16(a);
 }
 
-// CHECK-LABEL: test_vclezq_s32
-// CHECK: cmle  {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0x0|0}}
+// CHECK-LABEL: define <4 x i32> @test_vclezq_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP2:%.*]] = icmp sle <4 x i32> [[TMP1]], zeroinitializer
+// CHECK:   [[VCLEZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VCLEZ_I]]
 uint32x4_t test_vclezq_s32(int32x4_t a) {
   return vclezq_s32(a);
 }
 
-// CHECK-LABEL: test_vclezq_s64
-// CHECK: cmle  {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0x0|0}}
+// CHECK-LABEL: define <2 x i64> @test_vclezq_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP2:%.*]] = icmp sle <2 x i64> [[TMP1]], zeroinitializer
+// CHECK:   [[VCLEZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VCLEZ_I]]
 uint64x2_t test_vclezq_s64(int64x2_t a) {
   return vclezq_s64(a);
 }
 
-// CHECK-LABEL: test_vclez_f32
-// CHECK: fcmle  {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0
+// CHECK-LABEL: define <2 x i32> @test_vclez_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[TMP2:%.*]] = fcmp ole <2 x float> [[TMP1]], zeroinitializer
+// CHECK:   [[VCLEZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VCLEZ_I]]
 uint32x2_t test_vclez_f32(float32x2_t a) {
   return vclez_f32(a);
 }
 
-// CHECK-LABEL: test_vclez_f64
-// CHECK: fcmle  {{d[0-9]+}}, {{d[0-9]+}}, #0
+// CHECK-LABEL: define <1 x i64> @test_vclez_f64(<1 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[TMP2:%.*]] = fcmp ole <1 x double> [[TMP1]], zeroinitializer
+// CHECK:   [[VCLEZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[VCLEZ_I]]
 uint64x1_t test_vclez_f64(float64x1_t a) {
   return vclez_f64(a);
 }
 
-// CHECK-LABEL: test_vclezq_f32
-// CHECK: fcmle  {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0
+// CHECK-LABEL: define <4 x i32> @test_vclezq_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[TMP2:%.*]] = fcmp ole <4 x float> [[TMP1]], zeroinitializer
+// CHECK:   [[VCLEZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VCLEZ_I]]
 uint32x4_t test_vclezq_f32(float32x4_t a) {
   return vclezq_f32(a);
 }
 
-// CHECK-LABEL: test_vclezq_f64
-// CHECK: fcmle  {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0
+// CHECK-LABEL: define <2 x i64> @test_vclezq_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[TMP2:%.*]] = fcmp ole <2 x double> [[TMP1]], zeroinitializer
+// CHECK:   [[VCLEZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VCLEZ_I]]
 uint64x2_t test_vclezq_f64(float64x2_t a) {
   return vclezq_f64(a);
 }
 
-// CHECK-LABEL: test_vcgtz_s8
-// CHECK: cmgt  {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #{{0x0|0}}
+// CHECK-LABEL: define <8 x i8> @test_vcgtz_s8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = icmp sgt <8 x i8> %a, zeroinitializer
+// CHECK:   [[VCGTZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VCGTZ_I]]
 uint8x8_t test_vcgtz_s8(int8x8_t a) {
   return vcgtz_s8(a);
 }
 
-// CHECK-LABEL: test_vcgtz_s16
-// CHECK: cmgt  {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #{{0x0|0}}
+// CHECK-LABEL: define <4 x i16> @test_vcgtz_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP2:%.*]] = icmp sgt <4 x i16> [[TMP1]], zeroinitializer
+// CHECK:   [[VCGTZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VCGTZ_I]]
 uint16x4_t test_vcgtz_s16(int16x4_t a) {
   return vcgtz_s16(a);
 }
 
-// CHECK-LABEL: test_vcgtz_s32
-// CHECK: cmgt  {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #{{0x0|0}}
+// CHECK-LABEL: define <2 x i32> @test_vcgtz_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP2:%.*]] = icmp sgt <2 x i32> [[TMP1]], zeroinitializer
+// CHECK:   [[VCGTZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VCGTZ_I]]
 uint32x2_t test_vcgtz_s32(int32x2_t a) {
   return vcgtz_s32(a);
 }
 
-// CHECK-LABEL: test_vcgtz_s64
-// CHECK: cmgt {{d[0-9]+}}, {{d[0-9]+}}, #{{0x0|0}}
+// CHECK-LABEL: define <1 x i64> @test_vcgtz_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[TMP2:%.*]] = icmp sgt <1 x i64> [[TMP1]], zeroinitializer
+// CHECK:   [[VCGTZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[VCGTZ_I]]
 uint64x1_t test_vcgtz_s64(int64x1_t a) {
   return vcgtz_s64(a);
 }
 
-// CHECK-LABEL: test_vcgtzq_s8
-// CHECK: cmgt  {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #{{0x0|0}}
+// CHECK-LABEL: define <16 x i8> @test_vcgtzq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = icmp sgt <16 x i8> %a, zeroinitializer
+// CHECK:   [[VCGTZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[VCGTZ_I]]
 uint8x16_t test_vcgtzq_s8(int8x16_t a) {
   return vcgtzq_s8(a);
 }
 
-// CHECK-LABEL: test_vcgtzq_s16
-// CHECK: cmgt  {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #{{0x0|0}}
+// CHECK-LABEL: define <8 x i16> @test_vcgtzq_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP2:%.*]] = icmp sgt <8 x i16> [[TMP1]], zeroinitializer
+// CHECK:   [[VCGTZ_I:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[VCGTZ_I]]
 uint16x8_t test_vcgtzq_s16(int16x8_t a) {
   return vcgtzq_s16(a);
 }
 
-// CHECK-LABEL: test_vcgtzq_s32
-// CHECK: cmgt  {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #{{0x0|0}}
+// CHECK-LABEL: define <4 x i32> @test_vcgtzq_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP2:%.*]] = icmp sgt <4 x i32> [[TMP1]], zeroinitializer
+// CHECK:   [[VCGTZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VCGTZ_I]]
 uint32x4_t test_vcgtzq_s32(int32x4_t a) {
   return vcgtzq_s32(a);
 }
 
-// CHECK-LABEL: test_vcgtzq_s64
-// CHECK: cmgt  {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #{{0x0|0}}
+// CHECK-LABEL: define <2 x i64> @test_vcgtzq_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP2:%.*]] = icmp sgt <2 x i64> [[TMP1]], zeroinitializer
+// CHECK:   [[VCGTZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VCGTZ_I]]
 uint64x2_t test_vcgtzq_s64(int64x2_t a) {
   return vcgtzq_s64(a);
 }
 
-// CHECK-LABEL: test_vcgtz_f32
-// CHECK: fcmgt  {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0
+// CHECK-LABEL: define <2 x i32> @test_vcgtz_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[TMP2:%.*]] = fcmp ogt <2 x float> [[TMP1]], zeroinitializer
+// CHECK:   [[VCGTZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VCGTZ_I]]
 uint32x2_t test_vcgtz_f32(float32x2_t a) {
   return vcgtz_f32(a);
 }
 
-// CHECK-LABEL: test_vcgtz_f64
-// CHECK: fcmgt  {{d[0-9]+}}, {{d[0-9]+}}, #0
+// CHECK-LABEL: define <1 x i64> @test_vcgtz_f64(<1 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[TMP2:%.*]] = fcmp ogt <1 x double> [[TMP1]], zeroinitializer
+// CHECK:   [[VCGTZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[VCGTZ_I]]
 uint64x1_t test_vcgtz_f64(float64x1_t a) {
   return vcgtz_f64(a);
 }
 
-// CHECK-LABEL: test_vcgtzq_f32
-// CHECK: fcmgt  {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0
+// CHECK-LABEL: define <4 x i32> @test_vcgtzq_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[TMP2:%.*]] = fcmp ogt <4 x float> [[TMP1]], zeroinitializer
+// CHECK:   [[VCGTZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VCGTZ_I]]
 uint32x4_t test_vcgtzq_f32(float32x4_t a) {
   return vcgtzq_f32(a);
 }
 
-// CHECK-LABEL: test_vcgtzq_f64
-// CHECK: fcmgt  {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0
+// CHECK-LABEL: define <2 x i64> @test_vcgtzq_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[TMP2:%.*]] = fcmp ogt <2 x double> [[TMP1]], zeroinitializer
+// CHECK:   [[VCGTZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VCGTZ_I]]
 uint64x2_t test_vcgtzq_f64(float64x2_t a) {
   return vcgtzq_f64(a);
 }
 
-// CHECK-LABEL: test_vcltz_s8
-// CHECK: sshr  {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #7
+// CHECK-LABEL: define <8 x i8> @test_vcltz_s8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = icmp slt <8 x i8> %a, zeroinitializer
+// CHECK:   [[VCLTZ_I:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VCLTZ_I]]
 uint8x8_t test_vcltz_s8(int8x8_t a) {
   return vcltz_s8(a);
 }
 
-// CHECK-LABEL: test_vcltz_s16
-// CHECK: sshr  {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #15
+// CHECK-LABEL: define <4 x i16> @test_vcltz_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP2:%.*]] = icmp slt <4 x i16> [[TMP1]], zeroinitializer
+// CHECK:   [[VCLTZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VCLTZ_I]]
 uint16x4_t test_vcltz_s16(int16x4_t a) {
   return vcltz_s16(a);
 }
 
-// CHECK-LABEL: test_vcltz_s32
-// CHECK: sshr  {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #31
+// CHECK-LABEL: define <2 x i32> @test_vcltz_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP2:%.*]] = icmp slt <2 x i32> [[TMP1]], zeroinitializer
+// CHECK:   [[VCLTZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VCLTZ_I]]
 uint32x2_t test_vcltz_s32(int32x2_t a) {
   return vcltz_s32(a);
 }
 
-// CHECK-LABEL: test_vcltz_s64
-// CHECK: sshr {{d[0-9]+}}, {{d[0-9]+}}, #63
+// CHECK-LABEL: define <1 x i64> @test_vcltz_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[TMP2:%.*]] = icmp slt <1 x i64> [[TMP1]], zeroinitializer
+// CHECK:   [[VCLTZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[VCLTZ_I]]
 uint64x1_t test_vcltz_s64(int64x1_t a) {
   return vcltz_s64(a);
 }
 
-// CHECK-LABEL: test_vcltzq_s8
-// CHECK: sshr  {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #7
+// CHECK-LABEL: define <16 x i8> @test_vcltzq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = icmp slt <16 x i8> %a, zeroinitializer
+// CHECK:   [[VCLTZ_I:%.*]] = sext <16 x i1> [[TMP0]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[VCLTZ_I]]
 uint8x16_t test_vcltzq_s8(int8x16_t a) {
   return vcltzq_s8(a);
 }
 
-// CHECK-LABEL: test_vcltzq_s16
-// CHECK: sshr  {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #15
+// CHECK-LABEL: define <8 x i16> @test_vcltzq_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP2:%.*]] = icmp slt <8 x i16> [[TMP1]], zeroinitializer
+// CHECK:   [[VCLTZ_I:%.*]] = sext <8 x i1> [[TMP2]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[VCLTZ_I]]
 uint16x8_t test_vcltzq_s16(int16x8_t a) {
   return vcltzq_s16(a);
 }
 
-// CHECK-LABEL: test_vcltzq_s32
-// CHECK: sshr  {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #31
+// CHECK-LABEL: define <4 x i32> @test_vcltzq_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP2:%.*]] = icmp slt <4 x i32> [[TMP1]], zeroinitializer
+// CHECK:   [[VCLTZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VCLTZ_I]]
 uint32x4_t test_vcltzq_s32(int32x4_t a) {
   return vcltzq_s32(a);
 }
 
-// CHECK-LABEL: test_vcltzq_s64
-// CHECK: sshr  {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #63
+// CHECK-LABEL: define <2 x i64> @test_vcltzq_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP2:%.*]] = icmp slt <2 x i64> [[TMP1]], zeroinitializer
+// CHECK:   [[VCLTZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VCLTZ_I]]
 uint64x2_t test_vcltzq_s64(int64x2_t a) {
   return vcltzq_s64(a);
 }
 
-// CHECK-LABEL: test_vcltz_f32
-// CHECK: fcmlt  {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0
+// CHECK-LABEL: define <2 x i32> @test_vcltz_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[TMP2:%.*]] = fcmp olt <2 x float> [[TMP1]], zeroinitializer
+// CHECK:   [[VCLTZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VCLTZ_I]]
 uint32x2_t test_vcltz_f32(float32x2_t a) {
   return vcltz_f32(a);
 }
  
-// CHECK-LABEL: test_vcltz_f64
-// CHECK: fcmlt  {{d[0-9]+}}, {{d[0-9]+}}, #0
+// CHECK-LABEL: define <1 x i64> @test_vcltz_f64(<1 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[TMP2:%.*]] = fcmp olt <1 x double> [[TMP1]], zeroinitializer
+// CHECK:   [[VCLTZ_I:%.*]] = sext <1 x i1> [[TMP2]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[VCLTZ_I]]
 uint64x1_t test_vcltz_f64(float64x1_t a) {
   return vcltz_f64(a);
 }
 
-// CHECK-LABEL: test_vcltzq_f32
-// CHECK: fcmlt  {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0
+// CHECK-LABEL: define <4 x i32> @test_vcltzq_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[TMP2:%.*]] = fcmp olt <4 x float> [[TMP1]], zeroinitializer
+// CHECK:   [[VCLTZ_I:%.*]] = sext <4 x i1> [[TMP2]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VCLTZ_I]]
 uint32x4_t test_vcltzq_f32(float32x4_t a) {
   return vcltzq_f32(a);
 }
 
-// CHECK-LABEL: test_vcltzq_f64
-// CHECK: fcmlt  {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0
+// CHECK-LABEL: define <2 x i64> @test_vcltzq_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[TMP2:%.*]] = fcmp olt <2 x double> [[TMP1]], zeroinitializer
+// CHECK:   [[VCLTZ_I:%.*]] = sext <2 x i1> [[TMP2]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VCLTZ_I]]
 uint64x2_t test_vcltzq_f64(float64x2_t a) {
   return vcltzq_f64(a);
 }
 
-// CHECK-LABEL: test_vrev16_s8
-// CHECK: rev16 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+// CHECK-LABEL: define <8 x i8> @test_vrev16_s8(<8 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 int8x8_t test_vrev16_s8(int8x8_t a) {
   return vrev16_s8(a);
 }
 
-// CHECK-LABEL: test_vrev16_u8
-// CHECK: rev16 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+// CHECK-LABEL: define <8 x i8> @test_vrev16_u8(<8 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 uint8x8_t test_vrev16_u8(uint8x8_t a) {
   return vrev16_u8(a);
 }
 
-// CHECK-LABEL: test_vrev16_p8
-// CHECK: rev16 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+// CHECK-LABEL: define <8 x i8> @test_vrev16_p8(<8 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 poly8x8_t test_vrev16_p8(poly8x8_t a) {
   return vrev16_p8(a);
 }
 
-// CHECK-LABEL: test_vrev16q_s8
-// CHECK: rev16 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+// CHECK-LABEL: define <16 x i8> @test_vrev16q_s8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 int8x16_t test_vrev16q_s8(int8x16_t a) {
   return vrev16q_s8(a);
 }
 
-// CHECK-LABEL: test_vrev16q_u8
-// CHECK: rev16 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+// CHECK-LABEL: define <16 x i8> @test_vrev16q_u8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 uint8x16_t test_vrev16q_u8(uint8x16_t a) {
   return vrev16q_u8(a);
 }
 
-// CHECK-LABEL: test_vrev16q_p8
-// CHECK: rev16 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+// CHECK-LABEL: define <16 x i8> @test_vrev16q_p8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 poly8x16_t test_vrev16q_p8(poly8x16_t a) {
   return vrev16q_p8(a);
 }
 
-// CHECK-LABEL: test_vrev32_s8
-// CHECK: rev32 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+// CHECK-LABEL: define <8 x i8> @test_vrev32_s8(<8 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 int8x8_t test_vrev32_s8(int8x8_t a) {
   return vrev32_s8(a);
 }
 
-// CHECK-LABEL: test_vrev32_s16
-// CHECK: rev32 v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
+// CHECK-LABEL: define <4 x i16> @test_vrev32_s16(<4 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 int16x4_t test_vrev32_s16(int16x4_t a) {
   return vrev32_s16(a);
 }
 
-// CHECK-LABEL: test_vrev32_u8
-// CHECK: rev32 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+// CHECK-LABEL: define <8 x i8> @test_vrev32_u8(<8 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 uint8x8_t test_vrev32_u8(uint8x8_t a) {
   return vrev32_u8(a);
 }
 
-// CHECK-LABEL: test_vrev32_u16
-// CHECK: rev32 v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
+// CHECK-LABEL: define <4 x i16> @test_vrev32_u16(<4 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 uint16x4_t test_vrev32_u16(uint16x4_t a) {
   return vrev32_u16(a);
 }
 
-// CHECK-LABEL: test_vrev32_p8
-// CHECK: rev32 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+// CHECK-LABEL: define <8 x i8> @test_vrev32_p8(<8 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 poly8x8_t test_vrev32_p8(poly8x8_t a) {
   return vrev32_p8(a);
 }
 
-// CHECK-LABEL: test_vrev32_p16
-// CHECK: rev32 v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
+// CHECK-LABEL: define <4 x i16> @test_vrev32_p16(<4 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 poly16x4_t test_vrev32_p16(poly16x4_t a) {
   return vrev32_p16(a);
 }
 
-// CHECK-LABEL: test_vrev32q_s8
-// CHECK: rev32 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+// CHECK-LABEL: define <16 x i8> @test_vrev32q_s8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 int8x16_t test_vrev32q_s8(int8x16_t a) {
   return vrev32q_s8(a);
 }
 
-// CHECK-LABEL: test_vrev32q_s16
-// CHECK: rev32 v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
+// CHECK-LABEL: define <8 x i16> @test_vrev32q_s16(<8 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 int16x8_t test_vrev32q_s16(int16x8_t a) {
   return vrev32q_s16(a);
 }
 
-// CHECK-LABEL: test_vrev32q_u8
-// CHECK: rev32 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+// CHECK-LABEL: define <16 x i8> @test_vrev32q_u8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 uint8x16_t test_vrev32q_u8(uint8x16_t a) {
   return vrev32q_u8(a);
 }
 
-// CHECK-LABEL: test_vrev32q_u16
-// CHECK: rev32 v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
+// CHECK-LABEL: define <8 x i16> @test_vrev32q_u16(<8 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 uint16x8_t test_vrev32q_u16(uint16x8_t a) {
   return vrev32q_u16(a);
 }
 
-// CHECK-LABEL: test_vrev32q_p8
-// CHECK: rev32 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+// CHECK-LABEL: define <16 x i8> @test_vrev32q_p8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 poly8x16_t test_vrev32q_p8(poly8x16_t a) {
   return vrev32q_p8(a);
 }
 
-// CHECK-LABEL: test_vrev32q_p16
-// CHECK: rev32 v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
+// CHECK-LABEL: define <8 x i16> @test_vrev32q_p16(<8 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 poly16x8_t test_vrev32q_p16(poly16x8_t a) {
   return vrev32q_p16(a);
 }
 
-// CHECK-LABEL: test_vrev64_s8
-// CHECK: rev64 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+// CHECK-LABEL: define <8 x i8> @test_vrev64_s8(<8 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 int8x8_t test_vrev64_s8(int8x8_t a) {
   return vrev64_s8(a);
 }
 
-// CHECK-LABEL: test_vrev64_s16
-// CHECK: rev64 v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
+// CHECK-LABEL: define <4 x i16> @test_vrev64_s16(<4 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 int16x4_t test_vrev64_s16(int16x4_t a) {
   return vrev64_s16(a);
 }
 
-// CHECK-LABEL: test_vrev64_s32
-// CHECK: rev64 v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+// CHECK-LABEL: define <2 x i32> @test_vrev64_s32(<2 x i32> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 0>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 int32x2_t test_vrev64_s32(int32x2_t a) {
   return vrev64_s32(a);
 }
 
-// CHECK-LABEL: test_vrev64_u8
-// CHECK: rev64 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+// CHECK-LABEL: define <8 x i8> @test_vrev64_u8(<8 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 uint8x8_t test_vrev64_u8(uint8x8_t a) {
   return vrev64_u8(a);
 }
 
-// CHECK-LABEL: test_vrev64_u16
-// CHECK: rev64 v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
+// CHECK-LABEL: define <4 x i16> @test_vrev64_u16(<4 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 uint16x4_t test_vrev64_u16(uint16x4_t a) {
   return vrev64_u16(a);
 }
 
-// CHECK-LABEL: test_vrev64_u32
-// CHECK: rev64 v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+// CHECK-LABEL: define <2 x i32> @test_vrev64_u32(<2 x i32> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 0>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 uint32x2_t test_vrev64_u32(uint32x2_t a) {
   return vrev64_u32(a);
 }
 
-// CHECK-LABEL: test_vrev64_p8
-// CHECK: rev64 v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
+// CHECK-LABEL: define <8 x i8> @test_vrev64_p8(<8 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 poly8x8_t test_vrev64_p8(poly8x8_t a) {
   return vrev64_p8(a);
 }
 
-// CHECK-LABEL: test_vrev64_p16
-// CHECK: rev64 v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
+// CHECK-LABEL: define <4 x i16> @test_vrev64_p16(<4 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 poly16x4_t test_vrev64_p16(poly16x4_t a) {
   return vrev64_p16(a);
 }
 
-// CHECK-LABEL: test_vrev64_f32
-// CHECK: rev64 v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
+// CHECK-LABEL: define <2 x float> @test_vrev64_f32(<2 x float> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %a, <2 x i32> <i32 1, i32 0>
+// CHECK:   ret <2 x float> [[SHUFFLE_I]]
 float32x2_t test_vrev64_f32(float32x2_t a) {
   return vrev64_f32(a);
 }
 
-// CHECK-LABEL: test_vrev64q_s8
-// CHECK: rev64 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+// CHECK-LABEL: define <16 x i8> @test_vrev64q_s8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 int8x16_t test_vrev64q_s8(int8x16_t a) {
   return vrev64q_s8(a);
 }
 
-// CHECK-LABEL: test_vrev64q_s16
-// CHECK: rev64 v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
+// CHECK-LABEL: define <8 x i16> @test_vrev64q_s16(<8 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 int16x8_t test_vrev64q_s16(int16x8_t a) {
   return vrev64q_s16(a);
 }
 
-// CHECK-LABEL: test_vrev64q_s32
-// CHECK: rev64 v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+// CHECK-LABEL: define <4 x i32> @test_vrev64q_s32(<4 x i32> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 int32x4_t test_vrev64q_s32(int32x4_t a) {
   return vrev64q_s32(a);
 }
 
-// CHECK-LABEL: test_vrev64q_u8
-// CHECK: rev64 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+// CHECK-LABEL: define <16 x i8> @test_vrev64q_u8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 uint8x16_t test_vrev64q_u8(uint8x16_t a) {
   return vrev64q_u8(a);
 }
 
-// CHECK-LABEL: test_vrev64q_u16
-// CHECK: rev64 v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
+// CHECK-LABEL: define <8 x i16> @test_vrev64q_u16(<8 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 uint16x8_t test_vrev64q_u16(uint16x8_t a) {
   return vrev64q_u16(a);
 }
 
-// CHECK-LABEL: test_vrev64q_u32
-// CHECK: rev64 v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+// CHECK-LABEL: define <4 x i32> @test_vrev64q_u32(<4 x i32> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 uint32x4_t test_vrev64q_u32(uint32x4_t a) {
   return vrev64q_u32(a);
 }
 
-// CHECK-LABEL: test_vrev64q_p8
-// CHECK: rev64 v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
+// CHECK-LABEL: define <16 x i8> @test_vrev64q_p8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 poly8x16_t test_vrev64q_p8(poly8x16_t a) {
   return vrev64q_p8(a);
 }
 
-// CHECK-LABEL: test_vrev64q_p16
-// CHECK: rev64 v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
+// CHECK-LABEL: define <8 x i16> @test_vrev64q_p16(<8 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 poly16x8_t test_vrev64q_p16(poly16x8_t a) {
   return vrev64q_p16(a);
 }
 
-// CHECK-LABEL: test_vrev64q_f32
-// CHECK: rev64 v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
+// CHECK-LABEL: define <4 x float> @test_vrev64q_f32(<4 x float> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK:   ret <4 x float> [[SHUFFLE_I]]
 float32x4_t test_vrev64q_f32(float32x4_t a) {
   return vrev64q_f32(a);
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vpaddl_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VPADDL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %a) #2
+// CHECK:   ret <4 x i16> [[VPADDL_I]]
 int16x4_t test_vpaddl_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vpaddl_s8
   return vpaddl_s8(a);
-  // CHECK: saddlp v{{[0-9]+}}.4h, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vpaddl_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> [[VPADDL_I]]) #2
+// CHECK:   ret <2 x i32> [[VPADDL1_I]]
 int32x2_t test_vpaddl_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vpaddl_s16
   return vpaddl_s16(a);
-  // CHECK: saddlp v{{[0-9]+}}.2s, v{{[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vpaddl_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32> [[VPADDL_I]]) #2
+// CHECK:   ret <1 x i64> [[VPADDL1_I]]
 int64x1_t test_vpaddl_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vpaddl_s32
   return vpaddl_s32(a);
-  // CHECK: saddlp v{{[0-9]+}}.1d, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vpaddl_u8(<8 x i8> %a) #0 {
+// CHECK:   [[VPADDL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %a) #2
+// CHECK:   ret <4 x i16> [[VPADDL_I]]
 uint16x4_t test_vpaddl_u8(uint8x8_t a) {
-  // CHECK-LABEL: test_vpaddl_u8
   return vpaddl_u8(a);
-  // CHECK: uaddlp v{{[0-9]+}}.4h, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vpaddl_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> [[VPADDL_I]]) #2
+// CHECK:   ret <2 x i32> [[VPADDL1_I]]
 uint32x2_t test_vpaddl_u16(uint16x4_t a) {
-  // CHECK-LABEL: test_vpaddl_u16
   return vpaddl_u16(a);
-  // CHECK: uaddlp v{{[0-9]+}}.2s, v{{[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vpaddl_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32> [[VPADDL_I]]) #2
+// CHECK:   ret <1 x i64> [[VPADDL1_I]]
 uint64x1_t test_vpaddl_u32(uint32x2_t a) {
-  // CHECK-LABEL: test_vpaddl_u32
   return vpaddl_u32(a);
-  // CHECK: uaddlp v{{[0-9]+}}.1d, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vpaddlq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VPADDL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %a) #2
+// CHECK:   ret <8 x i16> [[VPADDL_I]]
 int16x8_t test_vpaddlq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vpaddlq_s8
   return vpaddlq_s8(a);
-  // CHECK: saddlp v{{[0-9]+}}.8h, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vpaddlq_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> [[VPADDL_I]]) #2
+// CHECK:   ret <4 x i32> [[VPADDL1_I]]
 int32x4_t test_vpaddlq_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vpaddlq_s16
   return vpaddlq_s16(a);
-  // CHECK: saddlp v{{[0-9]+}}.4s, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vpaddlq_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> [[VPADDL_I]]) #2
+// CHECK:   ret <2 x i64> [[VPADDL1_I]]
 int64x2_t test_vpaddlq_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vpaddlq_s32
   return vpaddlq_s32(a);
-  // CHECK: saddlp v{{[0-9]+}}.2d, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vpaddlq_u8(<16 x i8> %a) #0 {
+// CHECK:   [[VPADDL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %a) #2
+// CHECK:   ret <8 x i16> [[VPADDL_I]]
 uint16x8_t test_vpaddlq_u8(uint8x16_t a) {
-  // CHECK-LABEL: test_vpaddlq_u8
   return vpaddlq_u8(a);
-  // CHECK: uaddlp v{{[0-9]+}}.8h, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vpaddlq_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> [[VPADDL_I]]) #2
+// CHECK:   ret <4 x i32> [[VPADDL1_I]]
 uint32x4_t test_vpaddlq_u16(uint16x8_t a) {
-  // CHECK-LABEL: test_vpaddlq_u16
   return vpaddlq_u16(a);
-  // CHECK: uaddlp v{{[0-9]+}}.4s, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vpaddlq_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> [[VPADDL_I]]) #2
+// CHECK:   ret <2 x i64> [[VPADDL1_I]]
 uint64x2_t test_vpaddlq_u32(uint32x4_t a) {
-  // CHECK-LABEL: test_vpaddlq_u32
   return vpaddlq_u32(a);
-  // CHECK: uaddlp v{{[0-9]+}}.2d, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vpadal_s8(<4 x i16> %a, <8 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VPADAL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %b) #2
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP2:%.*]] = add <4 x i16> [[VPADAL_I]], [[TMP1]]
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vpadal_s8(int16x4_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vpadal_s8
   return vpadal_s8(a, b);
-  // CHECK: sadalp v{{[0-9]+}}.4h, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vpadal_s16(<2 x i32> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VPADAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VPADAL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> [[VPADAL_I]]) #2
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = add <2 x i32> [[VPADAL1_I]], [[TMP2]]
+// CHECK:   ret <2 x i32> [[TMP3]]
 int32x2_t test_vpadal_s16(int32x2_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vpadal_s16
   return vpadal_s16(a, b);
-  // CHECK: sadalp v{{[0-9]+}}.2s, v{{[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vpadal_s32(<1 x i64> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VPADAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VPADAL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32> [[VPADAL_I]]) #2
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = add <1 x i64> [[VPADAL1_I]], [[TMP2]]
+// CHECK:   ret <1 x i64> [[TMP3]]
 int64x1_t test_vpadal_s32(int64x1_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vpadal_s32
   return vpadal_s32(a, b);
-  // CHECK: sadalp v{{[0-9]+}}.1d, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vpadal_u8(<4 x i16> %a, <8 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VPADAL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %b) #2
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP2:%.*]] = add <4 x i16> [[VPADAL_I]], [[TMP1]]
+// CHECK:   ret <4 x i16> [[TMP2]]
 uint16x4_t test_vpadal_u8(uint16x4_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vpadal_u8
   return vpadal_u8(a, b);
-  // CHECK: uadalp v{{[0-9]+}}.4h, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vpadal_u16(<2 x i32> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VPADAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VPADAL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> [[VPADAL_I]]) #2
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = add <2 x i32> [[VPADAL1_I]], [[TMP2]]
+// CHECK:   ret <2 x i32> [[TMP3]]
 uint32x2_t test_vpadal_u16(uint32x2_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vpadal_u16
   return vpadal_u16(a, b);
-  // CHECK: uadalp v{{[0-9]+}}.2s, v{{[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vpadal_u32(<1 x i64> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VPADAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VPADAL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32> [[VPADAL_I]]) #2
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = add <1 x i64> [[VPADAL1_I]], [[TMP2]]
+// CHECK:   ret <1 x i64> [[TMP3]]
 uint64x1_t test_vpadal_u32(uint64x1_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vpadal_u32
   return vpadal_u32(a, b);
-  // CHECK: uadalp v{{[0-9]+}}.1d, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vpadalq_s8(<8 x i16> %a, <16 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VPADAL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %b) #2
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP2:%.*]] = add <8 x i16> [[VPADAL_I]], [[TMP1]]
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vpadalq_s8(int16x8_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vpadalq_s8
   return vpadalq_s8(a, b);
-  // CHECK: sadalp v{{[0-9]+}}.8h, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vpadalq_s16(<4 x i32> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VPADAL_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VPADAL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> [[VPADAL_I]]) #2
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = add <4 x i32> [[VPADAL1_I]], [[TMP2]]
+// CHECK:   ret <4 x i32> [[TMP3]]
 int32x4_t test_vpadalq_s16(int32x4_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vpadalq_s16
   return vpadalq_s16(a, b);
-  // CHECK: sadalp v{{[0-9]+}}.4s, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vpadalq_s32(<2 x i64> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VPADAL_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VPADAL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> [[VPADAL_I]]) #2
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = add <2 x i64> [[VPADAL1_I]], [[TMP2]]
+// CHECK:   ret <2 x i64> [[TMP3]]
 int64x2_t test_vpadalq_s32(int64x2_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vpadalq_s32
   return vpadalq_s32(a, b);
-  // CHECK: sadalp v{{[0-9]+}}.2d, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vpadalq_u8(<8 x i16> %a, <16 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VPADAL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %b) #2
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP2:%.*]] = add <8 x i16> [[VPADAL_I]], [[TMP1]]
+// CHECK:   ret <8 x i16> [[TMP2]]
 uint16x8_t test_vpadalq_u8(uint16x8_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vpadalq_u8
   return vpadalq_u8(a, b);
-  // CHECK: uadalp v{{[0-9]+}}.8h, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vpadalq_u16(<4 x i32> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VPADAL_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VPADAL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> [[VPADAL_I]]) #2
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = add <4 x i32> [[VPADAL1_I]], [[TMP2]]
+// CHECK:   ret <4 x i32> [[TMP3]]
 uint32x4_t test_vpadalq_u16(uint32x4_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vpadalq_u16
   return vpadalq_u16(a, b);
-  // CHECK: uadalp v{{[0-9]+}}.4s, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vpadalq_u32(<2 x i64> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VPADAL_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VPADAL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> [[VPADAL_I]]) #2
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = add <2 x i64> [[VPADAL1_I]], [[TMP2]]
+// CHECK:   ret <2 x i64> [[TMP3]]
 uint64x2_t test_vpadalq_u32(uint64x2_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vpadalq_u32
   return vpadalq_u32(a, b);
-  // CHECK: uadalp v{{[0-9]+}}.2d, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqabs_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VQABS_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqabs.v8i8(<8 x i8> %a) #2
+// CHECK:   ret <8 x i8> [[VQABS_V_I]]
 int8x8_t test_vqabs_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vqabs_s8
   return vqabs_s8(a);
-  // CHECK: sqabs v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqabsq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VQABSQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqabs.v16i8(<16 x i8> %a) #2
+// CHECK:   ret <16 x i8> [[VQABSQ_V_I]]
 int8x16_t test_vqabsq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vqabsq_s8
   return vqabsq_s8(a);
-  // CHECK: sqabs v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vqabs_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VQABS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQABS_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqabs.v4i16(<4 x i16> [[VQABS_V_I]]) #2
+// CHECK:   [[VQABS_V2_I:%.*]] = bitcast <4 x i16> [[VQABS_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQABS_V2_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP1]]
 int16x4_t test_vqabs_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vqabs_s16
   return vqabs_s16(a);
-  // CHECK: sqabs v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vqabsq_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQABSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQABSQ_V1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqabs.v8i16(<8 x i16> [[VQABSQ_V_I]]) #2
+// CHECK:   [[VQABSQ_V2_I:%.*]] = bitcast <8 x i16> [[VQABSQ_V1_I]] to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VQABSQ_V2_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP1]]
 int16x8_t test_vqabsq_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vqabsq_s16
   return vqabsq_s16(a);
-  // CHECK: sqabs v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vqabs_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VQABS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQABS_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqabs.v2i32(<2 x i32> [[VQABS_V_I]]) #2
+// CHECK:   [[VQABS_V2_I:%.*]] = bitcast <2 x i32> [[VQABS_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQABS_V2_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP1]]
 int32x2_t test_vqabs_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vqabs_s32
   return vqabs_s32(a);
-  // CHECK: sqabs v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqabsq_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQABSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQABSQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqabs.v4i32(<4 x i32> [[VQABSQ_V_I]]) #2
+// CHECK:   [[VQABSQ_V2_I:%.*]] = bitcast <4 x i32> [[VQABSQ_V1_I]] to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VQABSQ_V2_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP1]]
 int32x4_t test_vqabsq_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vqabsq_s32
   return vqabsq_s32(a);
-  // CHECK: sqabs v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqabsq_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQABSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQABSQ_V1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqabs.v2i64(<2 x i64> [[VQABSQ_V_I]]) #2
+// CHECK:   [[VQABSQ_V2_I:%.*]] = bitcast <2 x i64> [[VQABSQ_V1_I]] to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VQABSQ_V2_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP1]]
 int64x2_t test_vqabsq_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vqabsq_s64
   return vqabsq_s64(a);
-  // CHECK: sqabs v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqneg_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VQNEG_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqneg.v8i8(<8 x i8> %a) #2
+// CHECK:   ret <8 x i8> [[VQNEG_V_I]]
 int8x8_t test_vqneg_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vqneg_s8
   return vqneg_s8(a);
-  // CHECK: sqneg v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqnegq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VQNEGQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.sqneg.v16i8(<16 x i8> %a) #2
+// CHECK:   ret <16 x i8> [[VQNEGQ_V_I]]
 int8x16_t test_vqnegq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vqnegq_s8
   return vqnegq_s8(a);
-  // CHECK: sqneg v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vqneg_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VQNEG_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQNEG_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqneg.v4i16(<4 x i16> [[VQNEG_V_I]]) #2
+// CHECK:   [[VQNEG_V2_I:%.*]] = bitcast <4 x i16> [[VQNEG_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQNEG_V2_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP1]]
 int16x4_t test_vqneg_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vqneg_s16
   return vqneg_s16(a);
-  // CHECK: sqneg v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vqnegq_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQNEGQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQNEGQ_V1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.sqneg.v8i16(<8 x i16> [[VQNEGQ_V_I]]) #2
+// CHECK:   [[VQNEGQ_V2_I:%.*]] = bitcast <8 x i16> [[VQNEGQ_V1_I]] to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VQNEGQ_V2_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP1]]
 int16x8_t test_vqnegq_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vqnegq_s16
   return vqnegq_s16(a);
-  // CHECK: sqneg v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vqneg_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VQNEG_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQNEG_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqneg.v2i32(<2 x i32> [[VQNEG_V_I]]) #2
+// CHECK:   [[VQNEG_V2_I:%.*]] = bitcast <2 x i32> [[VQNEG_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQNEG_V2_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP1]]
 int32x2_t test_vqneg_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vqneg_s32
   return vqneg_s32(a);
-  // CHECK: sqneg v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqnegq_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQNEGQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQNEGQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqneg.v4i32(<4 x i32> [[VQNEGQ_V_I]]) #2
+// CHECK:   [[VQNEGQ_V2_I:%.*]] = bitcast <4 x i32> [[VQNEGQ_V1_I]] to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VQNEGQ_V2_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP1]]
 int32x4_t test_vqnegq_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vqnegq_s32
   return vqnegq_s32(a);
-  // CHECK: sqneg v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vqnegq_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQNEGQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQNEGQ_V1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqneg.v2i64(<2 x i64> [[VQNEGQ_V_I]]) #2
+// CHECK:   [[VQNEGQ_V2_I:%.*]] = bitcast <2 x i64> [[VQNEGQ_V1_I]] to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VQNEGQ_V2_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP1]]
 int64x2_t test_vqnegq_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vqnegq_s64
   return vqnegq_s64(a);
-  // CHECK: sqneg v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vneg_s8(<8 x i8> %a) #0 {
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i8> zeroinitializer, %a
+// CHECK:   ret <8 x i8> [[SUB_I]]
 int8x8_t test_vneg_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vneg_s8
   return vneg_s8(a);
-  // CHECK: neg v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vnegq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[SUB_I:%.*]] = sub <16 x i8> zeroinitializer, %a
+// CHECK:   ret <16 x i8> [[SUB_I]]
 int8x16_t test_vnegq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vnegq_s8
   return vnegq_s8(a);
-  // CHECK: neg v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vneg_s16(<4 x i16> %a) #0 {
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> zeroinitializer, %a
+// CHECK:   ret <4 x i16> [[SUB_I]]
 int16x4_t test_vneg_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vneg_s16
   return vneg_s16(a);
-  // CHECK: neg v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vnegq_s16(<8 x i16> %a) #0 {
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> zeroinitializer, %a
+// CHECK:   ret <8 x i16> [[SUB_I]]
 int16x8_t test_vnegq_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vnegq_s16
   return vnegq_s16(a);
-  // CHECK: neg v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vneg_s32(<2 x i32> %a) #0 {
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> zeroinitializer, %a
+// CHECK:   ret <2 x i32> [[SUB_I]]
 int32x2_t test_vneg_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vneg_s32
   return vneg_s32(a);
-  // CHECK: neg v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vnegq_s32(<4 x i32> %a) #0 {
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> zeroinitializer, %a
+// CHECK:   ret <4 x i32> [[SUB_I]]
 int32x4_t test_vnegq_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vnegq_s32
   return vnegq_s32(a);
-  // CHECK: neg v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vnegq_s64(<2 x i64> %a) #0 {
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> zeroinitializer, %a
+// CHECK:   ret <2 x i64> [[SUB_I]]
 int64x2_t test_vnegq_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vnegq_s64
   return vnegq_s64(a);
-  // CHECK: neg v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x float> @test_vneg_f32(<2 x float> %a) #0 {
+// CHECK:   [[SUB_I:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %a
+// CHECK:   ret <2 x float> [[SUB_I]]
 float32x2_t test_vneg_f32(float32x2_t a) {
-  // CHECK-LABEL: test_vneg_f32
   return vneg_f32(a);
-  // CHECK: fneg v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x float> @test_vnegq_f32(<4 x float> %a) #0 {
+// CHECK:   [[SUB_I:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a
+// CHECK:   ret <4 x float> [[SUB_I]]
 float32x4_t test_vnegq_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vnegq_f32
   return vnegq_f32(a);
-  // CHECK: fneg v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vnegq_f64(<2 x double> %a) #0 {
+// CHECK:   [[SUB_I:%.*]] = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a
+// CHECK:   ret <2 x double> [[SUB_I]]
 float64x2_t test_vnegq_f64(float64x2_t a) {
-  // CHECK-LABEL: test_vnegq_f64
   return vnegq_f64(a);
-  // CHECK: fneg v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vabs_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VABS_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.abs.v8i8(<8 x i8> %a) #2
+// CHECK:   ret <8 x i8> [[VABS_I]]
 int8x8_t test_vabs_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vabs_s8
   return vabs_s8(a);
-  // CHECK: abs v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vabsq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VABS_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.abs.v16i8(<16 x i8> %a) #2
+// CHECK:   ret <16 x i8> [[VABS_I]]
 int8x16_t test_vabsq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vabsq_s8
   return vabsq_s8(a);
-  // CHECK: abs v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vabs_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VABS_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VABS1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.abs.v4i16(<4 x i16> [[VABS_I]]) #2
+// CHECK:   ret <4 x i16> [[VABS1_I]]
 int16x4_t test_vabs_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vabs_s16
   return vabs_s16(a);
-  // CHECK: abs v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vabsq_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VABS_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VABS1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.abs.v8i16(<8 x i16> [[VABS_I]]) #2
+// CHECK:   ret <8 x i16> [[VABS1_I]]
 int16x8_t test_vabsq_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vabsq_s16
   return vabsq_s16(a);
-  // CHECK: abs v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vabs_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VABS_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VABS1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.abs.v2i32(<2 x i32> [[VABS_I]]) #2
+// CHECK:   ret <2 x i32> [[VABS1_I]]
 int32x2_t test_vabs_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vabs_s32
   return vabs_s32(a);
-  // CHECK: abs v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vabsq_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VABS_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VABS1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.abs.v4i32(<4 x i32> [[VABS_I]]) #2
+// CHECK:   ret <4 x i32> [[VABS1_I]]
 int32x4_t test_vabsq_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vabsq_s32
   return vabsq_s32(a);
-  // CHECK: abs v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vabsq_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VABS_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VABS1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.abs.v2i64(<2 x i64> [[VABS_I]]) #2
+// CHECK:   ret <2 x i64> [[VABS1_I]]
 int64x2_t test_vabsq_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vabsq_s64
   return vabsq_s64(a);
-  // CHECK: abs v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x float> @test_vabs_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VABS_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VABS1_I:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> [[VABS_I]]) #2
+// CHECK:   ret <2 x float> [[VABS1_I]]
 float32x2_t test_vabs_f32(float32x2_t a) {
-  // CHECK-LABEL: test_vabs_f32
   return vabs_f32(a);
-  // CHECK: fabs v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x float> @test_vabsq_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VABS_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VABS1_I:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[VABS_I]]) #2
+// CHECK:   ret <4 x float> [[VABS1_I]]
 float32x4_t test_vabsq_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vabsq_f32
   return vabsq_f32(a);
-  // CHECK: fabs v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vabsq_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VABS_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VABS1_I:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[VABS_I]]) #2
+// CHECK:   ret <2 x double> [[VABS1_I]]
 float64x2_t test_vabsq_f64(float64x2_t a) {
-  // CHECK-LABEL: test_vabsq_f64
   return vabsq_f64(a);
-  // CHECK: fabs v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vuqadd_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VUQADD_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.suqadd.v8i8(<8 x i8> %a, <8 x i8> %b) #2
+// CHECK:   ret <8 x i8> [[VUQADD_I]]
 int8x8_t test_vuqadd_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vuqadd_s8
   return vuqadd_s8(a, b);
-  // CHECK: suqadd v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vuqaddq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VUQADD_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.suqadd.v16i8(<16 x i8> %a, <16 x i8> %b) #2
+// CHECK:   ret <16 x i8> [[VUQADD_I]]
 int8x16_t test_vuqaddq_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vuqaddq_s8
   return vuqaddq_s8(a, b);
-  // CHECK: suqadd v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vuqadd_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VUQADD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VUQADD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VUQADD2_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.suqadd.v4i16(<4 x i16> [[VUQADD_I]], <4 x i16> [[VUQADD1_I]]) #2
+// CHECK:   ret <4 x i16> [[VUQADD2_I]]
 int16x4_t test_vuqadd_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vuqadd_s16
   return vuqadd_s16(a, b);
-  // CHECK: suqadd v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vuqaddq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VUQADD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VUQADD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VUQADD2_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.suqadd.v8i16(<8 x i16> [[VUQADD_I]], <8 x i16> [[VUQADD1_I]]) #2
+// CHECK:   ret <8 x i16> [[VUQADD2_I]]
 int16x8_t test_vuqaddq_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vuqaddq_s16
   return vuqaddq_s16(a, b);
-  // CHECK: suqadd v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vuqadd_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VUQADD_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VUQADD1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VUQADD2_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.suqadd.v2i32(<2 x i32> [[VUQADD_I]], <2 x i32> [[VUQADD1_I]]) #2
+// CHECK:   ret <2 x i32> [[VUQADD2_I]]
 int32x2_t test_vuqadd_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vuqadd_s32
   return vuqadd_s32(a, b);
-  // CHECK: suqadd v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vuqaddq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VUQADD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VUQADD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VUQADD2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.suqadd.v4i32(<4 x i32> [[VUQADD_I]], <4 x i32> [[VUQADD1_I]]) #2
+// CHECK:   ret <4 x i32> [[VUQADD2_I]]
 int32x4_t test_vuqaddq_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vuqaddq_s32
   return vuqaddq_s32(a, b);
-  // CHECK: suqadd v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vuqaddq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VUQADD_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VUQADD1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VUQADD2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.suqadd.v2i64(<2 x i64> [[VUQADD_I]], <2 x i64> [[VUQADD1_I]]) #2
+// CHECK:   ret <2 x i64> [[VUQADD2_I]]
 int64x2_t test_vuqaddq_s64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vuqaddq_s64
   return vuqaddq_s64(a, b);
-  // CHECK: suqadd v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vcls_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VCLS_V_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.cls.v8i8(<8 x i8> %a) #2
+// CHECK:   ret <8 x i8> [[VCLS_V_I]]
 int8x8_t test_vcls_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vcls_s8
   return vcls_s8(a);
-  // CHECK: cls v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vclsq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VCLSQ_V_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.cls.v16i8(<16 x i8> %a) #2
+// CHECK:   ret <16 x i8> [[VCLSQ_V_I]]
 int8x16_t test_vclsq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vclsq_s8
   return vclsq_s8(a);
-  // CHECK: cls v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vcls_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VCLS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VCLS_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.cls.v4i16(<4 x i16> [[VCLS_V_I]]) #2
+// CHECK:   [[VCLS_V2_I:%.*]] = bitcast <4 x i16> [[VCLS_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VCLS_V2_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP1]]
 int16x4_t test_vcls_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vcls_s16
   return vcls_s16(a);
-  // CHECK: cls v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vclsq_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VCLSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VCLSQ_V1_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.cls.v8i16(<8 x i16> [[VCLSQ_V_I]]) #2
+// CHECK:   [[VCLSQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLSQ_V1_I]] to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VCLSQ_V2_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP1]]
 int16x8_t test_vclsq_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vclsq_s16
   return vclsq_s16(a);
-  // CHECK: cls v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vcls_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VCLS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VCLS_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.cls.v2i32(<2 x i32> [[VCLS_V_I]]) #2
+// CHECK:   [[VCLS_V2_I:%.*]] = bitcast <2 x i32> [[VCLS_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VCLS_V2_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP1]]
 int32x2_t test_vcls_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vcls_s32
   return vcls_s32(a);
-  // CHECK: cls v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vclsq_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VCLSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VCLSQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.cls.v4i32(<4 x i32> [[VCLSQ_V_I]]) #2
+// CHECK:   [[VCLSQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLSQ_V1_I]] to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VCLSQ_V2_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP1]]
 int32x4_t test_vclsq_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vclsq_s32
   return vclsq_s32(a);
-  // CHECK: cls v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vclz_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false) #2
+// CHECK:   ret <8 x i8> [[VCLZ_V_I]]
 int8x8_t test_vclz_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vclz_s8
   return vclz_s8(a);
-  // CHECK: clz v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vclzq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VCLZQ_V_I:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false) #2
+// CHECK:   ret <16 x i8> [[VCLZQ_V_I]]
 int8x16_t test_vclzq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vclzq_s8
   return vclzq_s8(a);
-  // CHECK: clz v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vclz_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[VCLZ_V_I]], i1 false) #2
+// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP1]]
 int16x4_t test_vclz_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vclz_s16
   return vclz_s16(a);
-  // CHECK: clz v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vclzq_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VCLZQ_V1_I:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[VCLZQ_V_I]], i1 false) #2
+// CHECK:   [[VCLZQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLZQ_V1_I]] to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VCLZQ_V2_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP1]]
 int16x8_t test_vclzq_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vclzq_s16
   return vclzq_s16(a);
-  // CHECK: clz v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vclz_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[VCLZ_V_I]], i1 false) #2
+// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP1]]
 int32x2_t test_vclz_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vclz_s32
   return vclz_s32(a);
-  // CHECK: clz v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vclzq_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VCLZQ_V1_I:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[VCLZQ_V_I]], i1 false) #2
+// CHECK:   [[VCLZQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLZQ_V1_I]] to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VCLZQ_V2_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP1]]
 int32x4_t test_vclzq_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vclzq_s32
   return vclzq_s32(a);
-  // CHECK: clz v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vclz_u8(<8 x i8> %a) #0 {
+// CHECK:   [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false) #2
+// CHECK:   ret <8 x i8> [[VCLZ_V_I]]
 uint8x8_t test_vclz_u8(uint8x8_t a) {
-  // CHECK-LABEL: test_vclz_u8
   return vclz_u8(a);
-  // CHECK: clz v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vclzq_u8(<16 x i8> %a) #0 {
+// CHECK:   [[VCLZQ_V_I:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false) #2
+// CHECK:   ret <16 x i8> [[VCLZQ_V_I]]
 uint8x16_t test_vclzq_u8(uint8x16_t a) {
-  // CHECK-LABEL: test_vclzq_u8
   return vclzq_u8(a);
-  // CHECK: clz v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vclz_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[VCLZ_V_I]], i1 false) #2
+// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP1]]
 uint16x4_t test_vclz_u16(uint16x4_t a) {
-  // CHECK-LABEL: test_vclz_u16
   return vclz_u16(a);
-  // CHECK: clz v{{[0-9]+}}.4h, v{{[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vclzq_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VCLZQ_V1_I:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[VCLZQ_V_I]], i1 false) #2
+// CHECK:   [[VCLZQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLZQ_V1_I]] to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VCLZQ_V2_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP1]]
 uint16x8_t test_vclzq_u16(uint16x8_t a) {
-  // CHECK-LABEL: test_vclzq_u16
   return vclzq_u16(a);
-  // CHECK: clz v{{[0-9]+}}.8h, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vclz_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[VCLZ_V_I]], i1 false) #2
+// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP1]]
 uint32x2_t test_vclz_u32(uint32x2_t a) {
-  // CHECK-LABEL: test_vclz_u32
   return vclz_u32(a);
-  // CHECK: clz v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vclzq_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VCLZQ_V1_I:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[VCLZQ_V_I]], i1 false) #2
+// CHECK:   [[VCLZQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLZQ_V1_I]] to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VCLZQ_V2_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP1]]
 uint32x4_t test_vclzq_u32(uint32x4_t a) {
-  // CHECK-LABEL: test_vclzq_u32
   return vclzq_u32(a);
-  // CHECK: clz v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vcnt_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) #2
+// CHECK:   ret <8 x i8> [[VCNT_V_I]]
 int8x8_t test_vcnt_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vcnt_s8
   return vcnt_s8(a);
-  // CHECK: cnt v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vcntq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) #2
+// CHECK:   ret <16 x i8> [[VCNTQ_V_I]]
 int8x16_t test_vcntq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vcntq_s8
   return vcntq_s8(a);
-  // CHECK: cnt v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vcnt_u8(<8 x i8> %a) #0 {
+// CHECK:   [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) #2
+// CHECK:   ret <8 x i8> [[VCNT_V_I]]
 uint8x8_t test_vcnt_u8(uint8x8_t a) {
-  // CHECK-LABEL: test_vcnt_u8
   return vcnt_u8(a);
-  // CHECK: cnt v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vcntq_u8(<16 x i8> %a) #0 {
+// CHECK:   [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) #2
+// CHECK:   ret <16 x i8> [[VCNTQ_V_I]]
 uint8x16_t test_vcntq_u8(uint8x16_t a) {
-  // CHECK-LABEL: test_vcntq_u8
   return vcntq_u8(a);
-  // CHECK: cnt v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vcnt_p8(<8 x i8> %a) #0 {
+// CHECK:   [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) #2
+// CHECK:   ret <8 x i8> [[VCNT_V_I]]
 poly8x8_t test_vcnt_p8(poly8x8_t a) {
-  // CHECK-LABEL: test_vcnt_p8
   return vcnt_p8(a);
-  // CHECK: cnt v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vcntq_p8(<16 x i8> %a) #0 {
+// CHECK:   [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) #2
+// CHECK:   ret <16 x i8> [[VCNTQ_V_I]]
 poly8x16_t test_vcntq_p8(poly8x16_t a) {
-  // CHECK-LABEL: test_vcntq_p8
   return vcntq_p8(a);
-  // CHECK: cnt v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vmvn_s8(<8 x i8> %a) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   ret <8 x i8> [[NEG_I]]
 int8x8_t test_vmvn_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vmvn_s8
   return vmvn_s8(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vmvnq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   ret <16 x i8> [[NEG_I]]
 int8x16_t test_vmvnq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vmvnq_s8
   return vmvnq_s8(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmvn_s16(<4 x i16> %a) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   ret <4 x i16> [[NEG_I]]
 int16x4_t test_vmvn_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vmvn_s16
   return vmvn_s16(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmvnq_s16(<8 x i16> %a) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   ret <8 x i16> [[NEG_I]]
 int16x8_t test_vmvnq_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vmvnq_s16
   return vmvnq_s16(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmvn_s32(<2 x i32> %a) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i32> %a, <i32 -1, i32 -1>
+// CHECK:   ret <2 x i32> [[NEG_I]]
 int32x2_t test_vmvn_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vmvn_s32
   return vmvn_s32(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmvnq_s32(<4 x i32> %a) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK:   ret <4 x i32> [[NEG_I]]
 int32x4_t test_vmvnq_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vmvnq_s32
   return vmvnq_s32(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vmvn_u8(<8 x i8> %a) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   ret <8 x i8> [[NEG_I]]
 uint8x8_t test_vmvn_u8(uint8x8_t a) {
-  // CHECK-LABEL: test_vmvn_u8
   return vmvn_u8(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vmvnq_u8(<16 x i8> %a) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   ret <16 x i8> [[NEG_I]]
 uint8x16_t test_vmvnq_u8(uint8x16_t a) {
-  // CHECK-LABEL: test_vmvnq_u8
   return vmvnq_u8(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmvn_u16(<4 x i16> %a) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   ret <4 x i16> [[NEG_I]]
 uint16x4_t test_vmvn_u16(uint16x4_t a) {
-  // CHECK-LABEL: test_vmvn_u16
   return vmvn_u16(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmvnq_u16(<8 x i16> %a) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   ret <8 x i16> [[NEG_I]]
 uint16x8_t test_vmvnq_u16(uint16x8_t a) {
-  // CHECK-LABEL: test_vmvnq_u16
   return vmvnq_u16(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmvn_u32(<2 x i32> %a) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i32> %a, <i32 -1, i32 -1>
+// CHECK:   ret <2 x i32> [[NEG_I]]
 uint32x2_t test_vmvn_u32(uint32x2_t a) {
-  // CHECK-LABEL: test_vmvn_u32
   return vmvn_u32(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmvnq_u32(<4 x i32> %a) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK:   ret <4 x i32> [[NEG_I]]
 uint32x4_t test_vmvnq_u32(uint32x4_t a) {
-  // CHECK-LABEL: test_vmvnq_u32
   return vmvnq_u32(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vmvn_p8(<8 x i8> %a) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   ret <8 x i8> [[NEG_I]]
 poly8x8_t test_vmvn_p8(poly8x8_t a) {
-  // CHECK-LABEL: test_vmvn_p8
   return vmvn_p8(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vmvnq_p8(<16 x i8> %a) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   ret <16 x i8> [[NEG_I]]
 poly8x16_t test_vmvnq_p8(poly8x16_t a) {
-  // CHECK-LABEL: test_vmvnq_p8
   return vmvnq_p8(a);
-  // CHECK: {{mvn|not}} v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vrbit_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VRBIT_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rbit.v8i8(<8 x i8> %a) #2
+// CHECK:   ret <8 x i8> [[VRBIT_I]]
 int8x8_t test_vrbit_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vrbit_s8
   return vrbit_s8(a);
-  // CHECK: rbit v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vrbitq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VRBIT_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.rbit.v16i8(<16 x i8> %a) #2
+// CHECK:   ret <16 x i8> [[VRBIT_I]]
 int8x16_t test_vrbitq_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vrbitq_s8
   return vrbitq_s8(a);
-  // CHECK: rbit v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vrbit_u8(<8 x i8> %a) #0 {
+// CHECK:   [[VRBIT_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rbit.v8i8(<8 x i8> %a) #2
+// CHECK:   ret <8 x i8> [[VRBIT_I]]
 uint8x8_t test_vrbit_u8(uint8x8_t a) {
-  // CHECK-LABEL: test_vrbit_u8
   return vrbit_u8(a);
-  // CHECK: rbit v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vrbitq_u8(<16 x i8> %a) #0 {
+// CHECK:   [[VRBIT_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.rbit.v16i8(<16 x i8> %a) #2
+// CHECK:   ret <16 x i8> [[VRBIT_I]]
 uint8x16_t test_vrbitq_u8(uint8x16_t a) {
-  // CHECK-LABEL: test_vrbitq_u8
   return vrbitq_u8(a);
-  // CHECK: rbit v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vrbit_p8(<8 x i8> %a) #0 {
+// CHECK:   [[VRBIT_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.rbit.v8i8(<8 x i8> %a) #2
+// CHECK:   ret <8 x i8> [[VRBIT_I]]
 poly8x8_t test_vrbit_p8(poly8x8_t a) {
-  // CHECK-LABEL: test_vrbit_p8
   return vrbit_p8(a);
-  // CHECK: rbit v{{[0-9]+}}.8b, v{{[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vrbitq_p8(<16 x i8> %a) #0 {
+// CHECK:   [[VRBIT_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.rbit.v16i8(<16 x i8> %a) #2
+// CHECK:   ret <16 x i8> [[VRBIT_I]]
 poly8x16_t test_vrbitq_p8(poly8x16_t a) {
-  // CHECK-LABEL: test_vrbitq_p8
   return vrbitq_p8(a);
-  // CHECK: rbit v{{[0-9]+}}.16b, v{{[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vmovn_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VMOVN_I:%.*]] = trunc <8 x i16> [[TMP1]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VMOVN_I]]
 int8x8_t test_vmovn_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vmovn_s16
   return vmovn_s16(a);
-  // CHECK: xtn v{{[0-9]+}}.8b, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmovn_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VMOVN_I:%.*]] = trunc <4 x i32> [[TMP1]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VMOVN_I]]
 int16x4_t test_vmovn_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vmovn_s32
   return vmovn_s32(a);
-  // CHECK: xtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmovn_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VMOVN_I:%.*]] = trunc <2 x i64> [[TMP1]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VMOVN_I]]
 int32x2_t test_vmovn_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vmovn_s64
   return vmovn_s64(a);
-  // CHECK: xtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vmovn_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VMOVN_I:%.*]] = trunc <8 x i16> [[TMP1]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VMOVN_I]]
 uint8x8_t test_vmovn_u16(uint16x8_t a) {
-  // CHECK-LABEL: test_vmovn_u16
   return vmovn_u16(a);
-  // CHECK: xtn v{{[0-9]+}}.8b, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vmovn_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VMOVN_I:%.*]] = trunc <4 x i32> [[TMP1]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VMOVN_I]]
 uint16x4_t test_vmovn_u32(uint32x4_t a) {
-  // CHECK-LABEL: test_vmovn_u32
   return vmovn_u32(a);
-  // CHECK: xtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vmovn_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VMOVN_I:%.*]] = trunc <2 x i64> [[TMP1]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VMOVN_I]]
 uint32x2_t test_vmovn_u64(uint64x2_t a) {
-  // CHECK-LABEL: test_vmovn_u64
   return vmovn_u64(a);
-  // CHECK: xtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vmovn_high_s16(<8 x i8> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VMOVN_I_I:%.*]] = trunc <8 x i16> [[TMP1]] to <8 x i8>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VMOVN_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I_I]]
 int8x16_t test_vmovn_high_s16(int8x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vmovn_high_s16
   return vmovn_high_s16(a, b);
-  // CHECK: xtn2 v{{[0-9]+}}.16b, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmovn_high_s32(<4 x i16> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VMOVN_I_I:%.*]] = trunc <4 x i32> [[TMP1]] to <4 x i16>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VMOVN_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I_I]]
 int16x8_t test_vmovn_high_s32(int16x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vmovn_high_s32
   return vmovn_high_s32(a, b);
-  // CHECK: xtn2 v{{[0-9]+}}.8h, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmovn_high_s64(<2 x i32> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VMOVN_I_I:%.*]] = trunc <2 x i64> [[TMP1]] to <2 x i32>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VMOVN_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I_I]]
 int32x4_t test_vmovn_high_s64(int32x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vmovn_high_s64
   return vmovn_high_s64(a, b);
-  // CHECK: xtn2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vmovn_high_u16(<8 x i8> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VMOVN_I_I:%.*]] = trunc <8 x i16> [[TMP1]] to <8 x i8>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VMOVN_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I_I]]
 int8x16_t test_vmovn_high_u16(int8x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vmovn_high_u16
   return vmovn_high_u16(a, b);
-  // CHECK: xtn2 v{{[0-9]+}}.16b, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vmovn_high_u32(<4 x i16> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VMOVN_I_I:%.*]] = trunc <4 x i32> [[TMP1]] to <4 x i16>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[VMOVN_I_I]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I_I]]
 int16x8_t test_vmovn_high_u32(int16x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vmovn_high_u32
   return vmovn_high_u32(a, b);
-  // CHECK: xtn2 v{{[0-9]+}}.8h, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vmovn_high_u64(<2 x i32> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VMOVN_I_I:%.*]] = trunc <2 x i64> [[TMP1]] to <2 x i32>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[VMOVN_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I_I]]
 int32x4_t test_vmovn_high_u64(int32x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vmovn_high_u64
   return vmovn_high_u64(a, b);
-  // CHECK: xtn2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqmovun_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQMOVUN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQMOVUN_V1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> [[VQMOVUN_V_I]]) #2
+// CHECK:   ret <8 x i8> [[VQMOVUN_V1_I]]
 int8x8_t test_vqmovun_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vqmovun_s16
   return vqmovun_s16(a);
-  // CHECK: sqxtun v{{[0-9]+}}.8b, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vqmovun_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQMOVUN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQMOVUN_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> [[VQMOVUN_V_I]]) #2
+// CHECK:   [[VQMOVUN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVUN_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVUN_V2_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP1]]
 int16x4_t test_vqmovun_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vqmovun_s32
   return vqmovun_s32(a);
-  // CHECK: sqxtun v{{[0-9]+}}.4h, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vqmovun_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQMOVUN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQMOVUN_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtun.v2i32(<2 x i64> [[VQMOVUN_V_I]]) #2
+// CHECK:   [[VQMOVUN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVUN_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVUN_V2_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP1]]
 int32x2_t test_vqmovun_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vqmovun_s64
   return vqmovun_s64(a);
-  // CHECK: sqxtun v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqmovun_high_s16(<8 x i8> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQMOVUN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQMOVUN_V1_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtun.v8i8(<8 x i16> [[VQMOVUN_V_I_I]]) #2
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQMOVUN_V1_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I_I]]
 int8x16_t test_vqmovun_high_s16(int8x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vqmovun_high_s16
   return vqmovun_high_s16(a, b);
-  // CHECK: sqxtun2 v{{[0-9]+}}.16b, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vqmovun_high_s32(<4 x i16> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQMOVUN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQMOVUN_V1_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtun.v4i16(<4 x i32> [[VQMOVUN_V_I_I]]) #2
+// CHECK:   [[VQMOVUN_V2_I_I:%.*]] = bitcast <4 x i16> [[VQMOVUN_V1_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVUN_V2_I_I]] to <4 x i16>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I_I]]
 int16x8_t test_vqmovun_high_s32(int16x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vqmovun_high_s32
   return vqmovun_high_s32(a, b);
-  // CHECK: sqxtun2 v{{[0-9]+}}.8h, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqmovun_high_s64(<2 x i32> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQMOVUN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQMOVUN_V1_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtun.v2i32(<2 x i64> [[VQMOVUN_V_I_I]]) #2
+// CHECK:   [[VQMOVUN_V2_I_I:%.*]] = bitcast <2 x i32> [[VQMOVUN_V1_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVUN_V2_I_I]] to <2 x i32>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I_I]]
 int32x4_t test_vqmovun_high_s64(int32x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vqmovun_high_s64
   return vqmovun_high_s64(a, b);
-  // CHECK: sqxtun2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqmovn_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> [[VQMOVN_V_I]]) #2
+// CHECK:   ret <8 x i8> [[VQMOVN_V1_I]]
 int8x8_t test_vqmovn_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vqmovn_s16
   return vqmovn_s16(a);
-  // CHECK: sqxtn v{{[0-9]+}}.8b, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vqmovn_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> [[VQMOVN_V_I]]) #2
+// CHECK:   [[VQMOVN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP1]]
 int16x4_t test_vqmovn_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vqmovn_s32
   return vqmovn_s32(a);
-  // CHECK: sqxtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vqmovn_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtn.v2i32(<2 x i64> [[VQMOVN_V_I]]) #2
+// CHECK:   [[VQMOVN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP1]]
 int32x2_t test_vqmovn_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vqmovn_s64
   return vqmovn_s64(a);
-  // CHECK: sqxtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqmovn_high_s16(<8 x i8> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQMOVN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQMOVN_V1_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.sqxtn.v8i8(<8 x i16> [[VQMOVN_V_I_I]]) #2
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQMOVN_V1_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I_I]]
 int8x16_t test_vqmovn_high_s16(int8x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vqmovn_high_s16
   return vqmovn_high_s16(a, b);
-  // CHECK: sqxtn2 v{{[0-9]+}}.16b, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vqmovn_high_s32(<4 x i16> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQMOVN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQMOVN_V1_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqxtn.v4i16(<4 x i32> [[VQMOVN_V_I_I]]) #2
+// CHECK:   [[VQMOVN_V2_I_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I_I]] to <4 x i16>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I_I]]
 int16x8_t test_vqmovn_high_s32(int16x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vqmovn_high_s32
   return vqmovn_high_s32(a, b);
-  // CHECK: sqxtn2 v{{[0-9]+}}.8h, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqmovn_high_s64(<2 x i32> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQMOVN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQMOVN_V1_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.sqxtn.v2i32(<2 x i64> [[VQMOVN_V_I_I]]) #2
+// CHECK:   [[VQMOVN_V2_I_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I_I]] to <2 x i32>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I_I]]
 int32x4_t test_vqmovn_high_s64(int32x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vqmovn_high_s64
   return vqmovn_high_s64(a, b);
-  // CHECK: sqxtn2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqmovn_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> [[VQMOVN_V_I]]) #2
+// CHECK:   ret <8 x i8> [[VQMOVN_V1_I]]
 uint8x8_t test_vqmovn_u16(uint16x8_t a) {
-  // CHECK-LABEL: test_vqmovn_u16
   return vqmovn_u16(a);
-  // CHECK: uqxtn v{{[0-9]+}}.8b, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vqmovn_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> [[VQMOVN_V_I]]) #2
+// CHECK:   [[VQMOVN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP1]]
 uint16x4_t test_vqmovn_u32(uint32x4_t a) {
-  // CHECK-LABEL: test_vqmovn_u32
   return vqmovn_u32(a);
-  // CHECK: uqxtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vqmovn_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqxtn.v2i32(<2 x i64> [[VQMOVN_V_I]]) #2
+// CHECK:   [[VQMOVN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP1]]
 uint32x2_t test_vqmovn_u64(uint64x2_t a) {
-  // CHECK-LABEL: test_vqmovn_u64
   return vqmovn_u64(a);
-  // CHECK: uqxtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqmovn_high_u16(<8 x i8> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQMOVN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQMOVN_V1_I_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.uqxtn.v8i8(<8 x i16> [[VQMOVN_V_I_I]]) #2
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> [[VQMOVN_V1_I_I]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I_I]]
 uint8x16_t test_vqmovn_high_u16(uint8x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vqmovn_high_u16
   return vqmovn_high_u16(a, b);
-  // CHECK: uqxtn2 v{{[0-9]+}}.16b, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vqmovn_high_u32(<4 x i16> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQMOVN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQMOVN_V1_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uqxtn.v4i16(<4 x i32> [[VQMOVN_V_I_I]]) #2
+// CHECK:   [[VQMOVN_V2_I_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I_I]] to <4 x i16>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I_I]]
 uint16x8_t test_vqmovn_high_u32(uint16x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vqmovn_high_u32
   return vqmovn_high_u32(a, b);
-  // CHECK: uqxtn2 v{{[0-9]+}}.8h, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vqmovn_high_u64(<2 x i32> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQMOVN_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQMOVN_V1_I_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uqxtn.v2i32(<2 x i64> [[VQMOVN_V_I_I]]) #2
+// CHECK:   [[VQMOVN_V2_I_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I_I]] to <2 x i32>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I_I]]
 uint32x4_t test_vqmovn_high_u64(uint32x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vqmovn_high_u64
   return vqmovn_high_u64(a, b);
-  // CHECK: uqxtn2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vshll_n_s8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = sext <8 x i8> %a to <8 x i16>
+// CHECK:   [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+// CHECK:   ret <8 x i16> [[VSHLL_N]]
 int16x8_t test_vshll_n_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vshll_n_s8
   return vshll_n_s8(a, 8);
-  // CHECK: shll {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, #8
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vshll_n_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], <i32 16, i32 16, i32 16, i32 16>
+// CHECK:   ret <4 x i32> [[VSHLL_N]]
 int32x4_t test_vshll_n_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vshll_n_s16
   return vshll_n_s16(a, 16);
-  // CHECK: shll {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, #16
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vshll_n_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], <i64 32, i64 32>
+// CHECK:   ret <2 x i64> [[VSHLL_N]]
 int64x2_t test_vshll_n_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vshll_n_s32
   return vshll_n_s32(a, 32);
-  // CHECK: shll {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, #32
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vshll_n_u8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = zext <8 x i8> %a to <8 x i16>
+// CHECK:   [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+// CHECK:   ret <8 x i16> [[VSHLL_N]]
 uint16x8_t test_vshll_n_u8(uint8x8_t a) {
-  // CHECK-LABEL: test_vshll_n_u8
   return vshll_n_u8(a, 8);
-  // CHECK: shll {{v[0-9]+}}.8h, {{v[0-9]+}}.8b, #8
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vshll_n_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], <i32 16, i32 16, i32 16, i32 16>
+// CHECK:   ret <4 x i32> [[VSHLL_N]]
 uint32x4_t test_vshll_n_u16(uint16x4_t a) {
-  // CHECK-LABEL: test_vshll_n_u16
   return vshll_n_u16(a, 16);
-  // CHECK: shll {{v[0-9]+}}.4s, {{v[0-9]+}}.4h, #16
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vshll_n_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], <i64 32, i64 32>
+// CHECK:   ret <2 x i64> [[VSHLL_N]]
 uint64x2_t test_vshll_n_u32(uint32x2_t a) {
-  // CHECK-LABEL: test_vshll_n_u32
   return vshll_n_u32(a, 32);
-  // CHECK: shll {{v[0-9]+}}.2d, {{v[0-9]+}}.2s, #32
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vshll_high_n_s8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[TMP0:%.*]] = sext <8 x i8> [[SHUFFLE_I]] to <8 x i16>
+// CHECK:   [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+// CHECK:   ret <8 x i16> [[VSHLL_N]]
 int16x8_t test_vshll_high_n_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vshll_high_n_s8
   return vshll_high_n_s8(a, 8);
-  // CHECK: shll2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, #8
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vshll_high_n_s16(<8 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], <i32 16, i32 16, i32 16, i32 16>
+// CHECK:   ret <4 x i32> [[VSHLL_N]]
 int32x4_t test_vshll_high_n_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vshll_high_n_s16
   return vshll_high_n_s16(a, 16);
-  // CHECK: shll2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, #16
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vshll_high_n_s32(<4 x i32> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], <i64 32, i64 32>
+// CHECK:   ret <2 x i64> [[VSHLL_N]]
 int64x2_t test_vshll_high_n_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vshll_high_n_s32
   return vshll_high_n_s32(a, 32);
-  // CHECK: shll2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, #32
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vshll_high_n_u8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[TMP0:%.*]] = zext <8 x i8> [[SHUFFLE_I]] to <8 x i16>
+// CHECK:   [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+// CHECK:   ret <8 x i16> [[VSHLL_N]]
 uint16x8_t test_vshll_high_n_u8(uint8x16_t a) {
-  // CHECK-LABEL: test_vshll_high_n_u8
   return vshll_high_n_u8(a, 8);
-  // CHECK: shll2 {{v[0-9]+}}.8h, {{v[0-9]+}}.16b, #8
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vshll_high_n_u16(<8 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], <i32 16, i32 16, i32 16, i32 16>
+// CHECK:   ret <4 x i32> [[VSHLL_N]]
 uint32x4_t test_vshll_high_n_u16(uint16x8_t a) {
-  // CHECK-LABEL: test_vshll_high_n_u16
   return vshll_high_n_u16(a, 16);
-  // CHECK: shll2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, #16
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vshll_high_n_u32(<4 x i32> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], <i64 32, i64 32>
+// CHECK:   ret <2 x i64> [[VSHLL_N]]
 uint64x2_t test_vshll_high_n_u32(uint32x4_t a) {
-  // CHECK-LABEL: test_vshll_high_n_u32
   return vshll_high_n_u32(a, 32);
-  // CHECK: shll2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, #32
 }
 
+// CHECK-LABEL: define <4 x half> @test_vcvt_f16_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VCVT_F16_F32_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VCVT_F16_F321_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.vcvtfp2hf(<4 x float> [[VCVT_F16_F32_I]]) #2
+// CHECK:   [[VCVT_F16_F322_I:%.*]] = bitcast <4 x i16> [[VCVT_F16_F321_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VCVT_F16_F322_I]] to <4 x half>
+// CHECK:   ret <4 x half> [[TMP1]]
 float16x4_t test_vcvt_f16_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vcvt_f16_f32
   return vcvt_f16_f32(a);
-  // CHECK: fcvtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <8 x half> @test_vcvt_high_f16_f32(<4 x half> %a, <4 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[VCVT_F16_F32_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VCVT_F16_F321_I_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.vcvtfp2hf(<4 x float> [[VCVT_F16_F32_I_I]]) #2
+// CHECK:   [[VCVT_F16_F322_I_I:%.*]] = bitcast <4 x i16> [[VCVT_F16_F321_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VCVT_F16_F322_I_I]] to <4 x half>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x half> %a, <4 x half> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x half> [[SHUFFLE_I_I]]
 float16x8_t test_vcvt_high_f16_f32(float16x4_t a, float32x4_t b) {
-  //CHECK-LABEL: test_vcvt_high_f16_f32
   return vcvt_high_f16_f32(a, b);
-  // CHECK: fcvtn2 v{{[0-9]+}}.8h, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x float> @test_vcvt_f32_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VCVT_I:%.*]] = fptrunc <2 x double> [[TMP1]] to <2 x float>
+// CHECK:   ret <2 x float> [[VCVT_I]]
 float32x2_t test_vcvt_f32_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vcvt_f32_f64
   return vcvt_f32_f64(a);
-  // CHECK: fcvtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <4 x float> @test_vcvt_high_f32_f64(<2 x float> %a, <2 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VCVT_I_I:%.*]] = fptrunc <2 x double> [[TMP1]] to <2 x float>
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <2 x float> %a, <2 x float> [[VCVT_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x float> [[SHUFFLE_I_I]]
 float32x4_t test_vcvt_high_f32_f64(float32x2_t a, float64x2_t b) {
-  //CHECK-LABEL: test_vcvt_high_f32_f64
   return vcvt_high_f32_f64(a, b);
-  // CHECK: fcvtn2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x float> @test_vcvtx_f32_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VCVTX_F32_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VCVTX_F32_V1_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> [[VCVTX_F32_V_I]]) #2
+// CHECK:   ret <2 x float> [[VCVTX_F32_V1_I]]
 float32x2_t test_vcvtx_f32_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vcvtx_f32_f64
   return vcvtx_f32_f64(a);
-  // CHECK: fcvtxn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <4 x float> @test_vcvtx_high_f32_f64(<2 x float> %a, <2 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[VCVTX_F32_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VCVTX_F32_V1_I_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> [[VCVTX_F32_V_I_I]]) #2
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <2 x float> %a, <2 x float> [[VCVTX_F32_V1_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x float> [[SHUFFLE_I_I]]
 float32x4_t test_vcvtx_high_f32_f64(float32x2_t a, float64x2_t b) {
-  //CHECK-LABEL: test_vcvtx_high_f32_f64
   return vcvtx_high_f32_f64(a, b);
-  // CHECK: fcvtxn2 v{{[0-9]+}}.4s, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <4 x float> @test_vcvt_f32_f16(<4 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
+// CHECK:   [[VCVT_F32_F16_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VCVT_F32_F161_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvthf2fp(<4 x i16> [[VCVT_F32_F16_I]]) #2
+// CHECK:   [[VCVT_F32_F162_I:%.*]] = bitcast <4 x float> [[VCVT_F32_F161_I]] to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VCVT_F32_F162_I]] to <4 x float>
+// CHECK:   ret <4 x float> [[TMP1]]
 float32x4_t test_vcvt_f32_f16(float16x4_t a) {
-  //CHECK-LABEL: test_vcvt_f32_f16
   return vcvt_f32_f16(a);
-  // CHECK: fcvtl v{{[0-9]+}}.4s, v{{[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <4 x float> @test_vcvt_high_f32_f16(<8 x half> %a) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[VCVT_F32_F16_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VCVT_F32_F161_I_I:%.*]] = call <4 x float> @llvm.aarch64.neon.vcvthf2fp(<4 x i16> [[VCVT_F32_F16_I_I]]) #2
+// CHECK:   [[VCVT_F32_F162_I_I:%.*]] = bitcast <4 x float> [[VCVT_F32_F161_I_I]] to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VCVT_F32_F162_I_I]] to <4 x float>
+// CHECK:   ret <4 x float> [[TMP1]]
 float32x4_t test_vcvt_high_f32_f16(float16x8_t a) {
-  //CHECK-LABEL: test_vcvt_high_f32_f16
   return vcvt_high_f32_f16(a);
-  // CHECK: fcvtl2 v{{[0-9]+}}.4s, v{{[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x double> @test_vcvt_f64_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VCVT_I:%.*]] = fpext <2 x float> [[TMP1]] to <2 x double>
+// CHECK:   ret <2 x double> [[VCVT_I]]
 float64x2_t test_vcvt_f64_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vcvt_f64_f32
   return vcvt_f64_f32(a);
-  // CHECK: fcvtl v{{[0-9]+}}.2d, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vcvt_high_f64_f32(<4 x float> %a) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> [[SHUFFLE_I_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VCVT_I_I:%.*]] = fpext <2 x float> [[TMP1]] to <2 x double>
+// CHECK:   ret <2 x double> [[VCVT_I_I]]
 float64x2_t test_vcvt_high_f64_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vcvt_high_f64_f32
   return vcvt_high_f64_f32(a);
-  // CHECK: fcvtl2 v{{[0-9]+}}.2d, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x float> @test_vrndn_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VRNDN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VRNDN1_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frintn.v2f32(<2 x float> [[VRNDN_I]]) #2
+// CHECK:   ret <2 x float> [[VRNDN1_I]]
 float32x2_t test_vrndn_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vrndn_f32
   return vrndn_f32(a);
-  // CHECK: frintn v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x float> @test_vrndnq_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VRNDN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VRNDN1_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frintn.v4f32(<4 x float> [[VRNDN_I]]) #2
+// CHECK:   ret <4 x float> [[VRNDN1_I]]
 float32x4_t test_vrndnq_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vrndnq_f32
   return vrndnq_f32(a);
-  // CHECK: frintn v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vrndnq_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VRNDN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VRNDN1_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frintn.v2f64(<2 x double> [[VRNDN_I]]) #2
+// CHECK:   ret <2 x double> [[VRNDN1_I]]
 float64x2_t test_vrndnq_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vrndnq_f64
   return vrndnq_f64(a);
-  // CHECK: frintn v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x float> @test_vrnda_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VRNDA_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VRNDA1_I:%.*]] = call <2 x float> @llvm.round.v2f32(<2 x float> [[VRNDA_I]]) #2
+// CHECK:   ret <2 x float> [[VRNDA1_I]]
 float32x2_t test_vrnda_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vrnda_f32
   return vrnda_f32(a);
-  // CHECK: frinta v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x float> @test_vrndaq_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VRNDA_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VRNDA1_I:%.*]] = call <4 x float> @llvm.round.v4f32(<4 x float> [[VRNDA_I]]) #2
+// CHECK:   ret <4 x float> [[VRNDA1_I]]
 float32x4_t test_vrndaq_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vrndaq_f32
   return vrndaq_f32(a);
-  // CHECK: frinta v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vrndaq_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VRNDA_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VRNDA1_I:%.*]] = call <2 x double> @llvm.round.v2f64(<2 x double> [[VRNDA_I]]) #2
+// CHECK:   ret <2 x double> [[VRNDA1_I]]
 float64x2_t test_vrndaq_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vrndaq_f64
   return vrndaq_f64(a);
-  // CHECK: frinta v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x float> @test_vrndp_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VRNDP_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VRNDP1_I:%.*]] = call <2 x float> @llvm.ceil.v2f32(<2 x float> [[VRNDP_I]]) #2
+// CHECK:   ret <2 x float> [[VRNDP1_I]]
 float32x2_t test_vrndp_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vrndp_f32
   return vrndp_f32(a);
-  // CHECK: frintp v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x float> @test_vrndpq_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VRNDP_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VRNDP1_I:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[VRNDP_I]]) #2
+// CHECK:   ret <4 x float> [[VRNDP1_I]]
 float32x4_t test_vrndpq_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vrndpq_f32
   return vrndpq_f32(a);
-  // CHECK: frintp v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vrndpq_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VRNDP_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VRNDP1_I:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[VRNDP_I]]) #2
+// CHECK:   ret <2 x double> [[VRNDP1_I]]
 float64x2_t test_vrndpq_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vrndpq_f64
   return vrndpq_f64(a);
-  // CHECK: frintp v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x float> @test_vrndm_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VRNDM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VRNDM1_I:%.*]] = call <2 x float> @llvm.floor.v2f32(<2 x float> [[VRNDM_I]]) #2
+// CHECK:   ret <2 x float> [[VRNDM1_I]]
 float32x2_t test_vrndm_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vrndm_f32
   return vrndm_f32(a);
-  // CHECK: frintm v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x float> @test_vrndmq_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VRNDM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VRNDM1_I:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[VRNDM_I]]) #2
+// CHECK:   ret <4 x float> [[VRNDM1_I]]
 float32x4_t test_vrndmq_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vrndmq_f32
   return vrndmq_f32(a);
-  // CHECK: frintm v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vrndmq_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VRNDM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VRNDM1_I:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[VRNDM_I]]) #2
+// CHECK:   ret <2 x double> [[VRNDM1_I]]
 float64x2_t test_vrndmq_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vrndmq_f64
   return vrndmq_f64(a);
-  // CHECK: frintm v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x float> @test_vrndx_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VRNDX_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VRNDX1_I:%.*]] = call <2 x float> @llvm.rint.v2f32(<2 x float> [[VRNDX_I]]) #2
+// CHECK:   ret <2 x float> [[VRNDX1_I]]
 float32x2_t test_vrndx_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vrndx_f32
   return vrndx_f32(a);
-  // CHECK: frintx v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x float> @test_vrndxq_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VRNDX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VRNDX1_I:%.*]] = call <4 x float> @llvm.rint.v4f32(<4 x float> [[VRNDX_I]]) #2
+// CHECK:   ret <4 x float> [[VRNDX1_I]]
 float32x4_t test_vrndxq_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vrndxq_f32
   return vrndxq_f32(a);
-  // CHECK: frintx v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vrndxq_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VRNDX_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VRNDX1_I:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[VRNDX_I]]) #2
+// CHECK:   ret <2 x double> [[VRNDX1_I]]
 float64x2_t test_vrndxq_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vrndxq_f64
   return vrndxq_f64(a);
-  // CHECK: frintx v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x float> @test_vrnd_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VRNDZ_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VRNDZ1_I:%.*]] = call <2 x float> @llvm.trunc.v2f32(<2 x float> [[VRNDZ_I]]) #2
+// CHECK:   ret <2 x float> [[VRNDZ1_I]]
 float32x2_t test_vrnd_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vrnd_f32
   return vrnd_f32(a);
-  // CHECK: frintz v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x float> @test_vrndq_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VRNDZ_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VRNDZ1_I:%.*]] = call <4 x float> @llvm.trunc.v4f32(<4 x float> [[VRNDZ_I]]) #2
+// CHECK:   ret <4 x float> [[VRNDZ1_I]]
 float32x4_t test_vrndq_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vrndq_f32
   return vrndq_f32(a);
-  // CHECK: frintz v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vrndq_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VRNDZ_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VRNDZ1_I:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[VRNDZ_I]]) #2
+// CHECK:   ret <2 x double> [[VRNDZ1_I]]
 float64x2_t test_vrndq_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vrndq_f64
   return vrndq_f64(a);
-  // CHECK: frintz v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x float> @test_vrndi_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VRNDI_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VRNDI1_I:%.*]] = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> [[VRNDI_I]]) #2
+// CHECK:   ret <2 x float> [[VRNDI1_I]]
 float32x2_t test_vrndi_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vrndi_f32
   return vrndi_f32(a);
-  // CHECK: frinti v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x float> @test_vrndiq_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VRNDI_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VRNDI1_I:%.*]] = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> [[VRNDI_I]]) #2
+// CHECK:   ret <4 x float> [[VRNDI1_I]]
 float32x4_t test_vrndiq_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vrndiq_f32
   return vrndiq_f32(a);
-  // CHECK: frinti v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vrndiq_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VRNDI_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VRNDI1_I:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[VRNDI_I]]) #2
+// CHECK:   ret <2 x double> [[VRNDI1_I]]
 float64x2_t test_vrndiq_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vrndiq_f64
   return vrndiq_f64(a);
-  // CHECK: frinti v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vcvt_s32_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[TMP2:%.*]] = fptosi <2 x float> [[TMP1]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vcvt_s32_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vcvt_s32_f32
   return vcvt_s32_f32(a);
-  // CHECK: fcvtzs v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcvtq_s32_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vcvtq_s32_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vcvtq_s32_f32
   return vcvtq_s32_f32(a);
-  // CHECK: fcvtzs v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vcvtq_s64_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[TMP2:%.*]] = fptosi <2 x double> [[TMP1]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 int64x2_t test_vcvtq_s64_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vcvtq_s64_f64
   return vcvtq_s64_f64(a);
-  // CHECK: fcvtzs v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vcvt_u32_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[TMP2:%.*]] = fptoui <2 x float> [[TMP1]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 uint32x2_t test_vcvt_u32_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vcvt_u32_f32
   return vcvt_u32_f32(a);
-  // CHECK: fcvtzu v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcvtq_u32_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[TMP2:%.*]] = fptoui <4 x float> [[TMP1]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 uint32x4_t test_vcvtq_u32_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vcvtq_u32_f32
   return vcvtq_u32_f32(a);
-  // CHECK: fcvtzu v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vcvtq_u64_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[TMP2:%.*]] = fptoui <2 x double> [[TMP1]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 uint64x2_t test_vcvtq_u64_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vcvtq_u64_f64
   return vcvtq_u64_f64(a);
-  // CHECK: fcvtzu v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vcvtn_s32_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VCVTN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VCVTN1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtns.v2i32.v2f32(<2 x float> [[VCVTN_I]]) #2
+// CHECK:   ret <2 x i32> [[VCVTN1_I]]
 int32x2_t test_vcvtn_s32_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vcvtn_s32_f32
   return vcvtn_s32_f32(a);
-  // CHECK: fcvtns v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcvtnq_s32_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VCVTN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VCVTN1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtns.v4i32.v4f32(<4 x float> [[VCVTN_I]]) #2
+// CHECK:   ret <4 x i32> [[VCVTN1_I]]
 int32x4_t test_vcvtnq_s32_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vcvtnq_s32_f32
   return vcvtnq_s32_f32(a);
-  // CHECK: fcvtns v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vcvtnq_s64_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VCVTN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VCVTN1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtns.v2i64.v2f64(<2 x double> [[VCVTN_I]]) #2
+// CHECK:   ret <2 x i64> [[VCVTN1_I]]
 int64x2_t test_vcvtnq_s64_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vcvtnq_s64_f64
   return vcvtnq_s64_f64(a);
-  // CHECK: fcvtns v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vcvtn_u32_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VCVTN_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VCVTN1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtnu.v2i32.v2f32(<2 x float> [[VCVTN_I]]) #2
+// CHECK:   ret <2 x i32> [[VCVTN1_I]]
 uint32x2_t test_vcvtn_u32_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vcvtn_u32_f32
   return vcvtn_u32_f32(a);
-  // CHECK: fcvtnu v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcvtnq_u32_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VCVTN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VCVTN1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtnu.v4i32.v4f32(<4 x float> [[VCVTN_I]]) #2
+// CHECK:   ret <4 x i32> [[VCVTN1_I]]
 uint32x4_t test_vcvtnq_u32_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vcvtnq_u32_f32
   return vcvtnq_u32_f32(a);
-  // CHECK: fcvtnu v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vcvtnq_u64_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VCVTN_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VCVTN1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtnu.v2i64.v2f64(<2 x double> [[VCVTN_I]]) #2
+// CHECK:   ret <2 x i64> [[VCVTN1_I]]
 uint64x2_t test_vcvtnq_u64_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vcvtnq_u64_f64
   return vcvtnq_u64_f64(a);
-  // CHECK: fcvtnu v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vcvtp_s32_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VCVTP_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VCVTP1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtps.v2i32.v2f32(<2 x float> [[VCVTP_I]]) #2
+// CHECK:   ret <2 x i32> [[VCVTP1_I]]
 int32x2_t test_vcvtp_s32_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vcvtp_s32_f32
   return vcvtp_s32_f32(a);
-  // CHECK: fcvtps v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcvtpq_s32_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VCVTP_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VCVTP1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtps.v4i32.v4f32(<4 x float> [[VCVTP_I]]) #2
+// CHECK:   ret <4 x i32> [[VCVTP1_I]]
 int32x4_t test_vcvtpq_s32_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vcvtpq_s32_f32
   return vcvtpq_s32_f32(a);
-  // CHECK: fcvtps v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vcvtpq_s64_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VCVTP_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VCVTP1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtps.v2i64.v2f64(<2 x double> [[VCVTP_I]]) #2
+// CHECK:   ret <2 x i64> [[VCVTP1_I]]
 int64x2_t test_vcvtpq_s64_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vcvtpq_s64_f64
   return vcvtpq_s64_f64(a);
-  // CHECK: fcvtps v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vcvtp_u32_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VCVTP_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VCVTP1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtpu.v2i32.v2f32(<2 x float> [[VCVTP_I]]) #2
+// CHECK:   ret <2 x i32> [[VCVTP1_I]]
 uint32x2_t test_vcvtp_u32_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vcvtp_u32_f32
   return vcvtp_u32_f32(a);
-  // CHECK: fcvtpu v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcvtpq_u32_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VCVTP_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VCVTP1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtpu.v4i32.v4f32(<4 x float> [[VCVTP_I]]) #2
+// CHECK:   ret <4 x i32> [[VCVTP1_I]]
 uint32x4_t test_vcvtpq_u32_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vcvtpq_u32_f32
   return vcvtpq_u32_f32(a);
-  // CHECK: fcvtpu v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vcvtpq_u64_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VCVTP_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VCVTP1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtpu.v2i64.v2f64(<2 x double> [[VCVTP_I]]) #2
+// CHECK:   ret <2 x i64> [[VCVTP1_I]]
 uint64x2_t test_vcvtpq_u64_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vcvtpq_u64_f64
   return vcvtpq_u64_f64(a);
-  // CHECK: fcvtpu v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vcvtm_s32_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VCVTM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VCVTM1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtms.v2i32.v2f32(<2 x float> [[VCVTM_I]]) #2
+// CHECK:   ret <2 x i32> [[VCVTM1_I]]
 int32x2_t test_vcvtm_s32_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vcvtm_s32_f32
   return vcvtm_s32_f32(a);
-  // CHECK: fcvtms v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcvtmq_s32_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VCVTM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VCVTM1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtms.v4i32.v4f32(<4 x float> [[VCVTM_I]]) #2
+// CHECK:   ret <4 x i32> [[VCVTM1_I]]
 int32x4_t test_vcvtmq_s32_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vcvtmq_s32_f32
   return vcvtmq_s32_f32(a);
-  // CHECK: fcvtms v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vcvtmq_s64_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VCVTM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VCVTM1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtms.v2i64.v2f64(<2 x double> [[VCVTM_I]]) #2
+// CHECK:   ret <2 x i64> [[VCVTM1_I]]
 int64x2_t test_vcvtmq_s64_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vcvtmq_s64_f64
   return vcvtmq_s64_f64(a);
-  // CHECK: fcvtms v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vcvtm_u32_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VCVTM_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VCVTM1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtmu.v2i32.v2f32(<2 x float> [[VCVTM_I]]) #2
+// CHECK:   ret <2 x i32> [[VCVTM1_I]]
 uint32x2_t test_vcvtm_u32_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vcvtm_u32_f32
   return vcvtm_u32_f32(a);
-  // CHECK: fcvtmu v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcvtmq_u32_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VCVTM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VCVTM1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtmu.v4i32.v4f32(<4 x float> [[VCVTM_I]]) #2
+// CHECK:   ret <4 x i32> [[VCVTM1_I]]
 uint32x4_t test_vcvtmq_u32_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vcvtmq_u32_f32
   return vcvtmq_u32_f32(a);
-  // CHECK: fcvtmu v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vcvtmq_u64_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VCVTM_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VCVTM1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtmu.v2i64.v2f64(<2 x double> [[VCVTM_I]]) #2
+// CHECK:   ret <2 x i64> [[VCVTM1_I]]
 uint64x2_t test_vcvtmq_u64_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vcvtmq_u64_f64
   return vcvtmq_u64_f64(a);
-  // CHECK: fcvtmu v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vcvta_s32_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VCVTA_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VCVTA1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtas.v2i32.v2f32(<2 x float> [[VCVTA_I]]) #2
+// CHECK:   ret <2 x i32> [[VCVTA1_I]]
 int32x2_t test_vcvta_s32_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vcvta_s32_f32
   return vcvta_s32_f32(a);
-  // CHECK: fcvtas v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcvtaq_s32_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VCVTA_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VCVTA1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtas.v4i32.v4f32(<4 x float> [[VCVTA_I]]) #2
+// CHECK:   ret <4 x i32> [[VCVTA1_I]]
 int32x4_t test_vcvtaq_s32_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vcvtaq_s32_f32
   return vcvtaq_s32_f32(a);
-  // CHECK: fcvtas v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vcvtaq_s64_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VCVTA_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VCVTA1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtas.v2i64.v2f64(<2 x double> [[VCVTA_I]]) #2
+// CHECK:   ret <2 x i64> [[VCVTA1_I]]
 int64x2_t test_vcvtaq_s64_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vcvtaq_s64_f64
   return vcvtaq_s64_f64(a);
-  // CHECK: fcvtas v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vcvta_u32_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VCVTA_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VCVTA1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.fcvtau.v2i32.v2f32(<2 x float> [[VCVTA_I]]) #2
+// CHECK:   ret <2 x i32> [[VCVTA1_I]]
 uint32x2_t test_vcvta_u32_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vcvta_u32_f32
   return vcvta_u32_f32(a);
-  // CHECK: fcvtau v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcvtaq_u32_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VCVTA_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VCVTA1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.fcvtau.v4i32.v4f32(<4 x float> [[VCVTA_I]]) #2
+// CHECK:   ret <4 x i32> [[VCVTA1_I]]
 uint32x4_t test_vcvtaq_u32_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vcvtaq_u32_f32
   return vcvtaq_u32_f32(a);
-  // CHECK: fcvtau v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vcvtaq_u64_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VCVTA_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VCVTA1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.fcvtau.v2i64.v2f64(<2 x double> [[VCVTA_I]]) #2
+// CHECK:   ret <2 x i64> [[VCVTA1_I]]
 uint64x2_t test_vcvtaq_u64_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vcvtaq_u64_f64
   return vcvtaq_u64_f64(a);
-  // CHECK: fcvtau v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x float> @test_vrsqrte_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VRSQRTE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VRSQRTE_V1_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frsqrte.v2f32(<2 x float> [[VRSQRTE_V_I]]) #2
+// CHECK:   ret <2 x float> [[VRSQRTE_V1_I]]
 float32x2_t test_vrsqrte_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vrsqrte_f32
   return vrsqrte_f32(a);
-  // CHECK: frsqrte v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x float> @test_vrsqrteq_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VRSQRTEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VRSQRTEQ_V1_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frsqrte.v4f32(<4 x float> [[VRSQRTEQ_V_I]]) #2
+// CHECK:   ret <4 x float> [[VRSQRTEQ_V1_I]]
 float32x4_t test_vrsqrteq_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vrsqrteq_f32
   return vrsqrteq_f32(a);
-  // CHECK: frsqrte v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vrsqrteq_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VRSQRTEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VRSQRTEQ_V1_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frsqrte.v2f64(<2 x double> [[VRSQRTEQ_V_I]]) #2
+// CHECK:   ret <2 x double> [[VRSQRTEQ_V1_I]]
 float64x2_t test_vrsqrteq_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vrsqrteq_f64
   return vrsqrteq_f64(a);
-  // CHECK: frsqrte v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x float> @test_vrecpe_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VRECPE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VRECPE_V1_I:%.*]] = call <2 x float> @llvm.aarch64.neon.frecpe.v2f32(<2 x float> [[VRECPE_V_I]]) #2
+// CHECK:   ret <2 x float> [[VRECPE_V1_I]]
 float32x2_t test_vrecpe_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vrecpe_f32
   return vrecpe_f32(a);
-  // CHECK: frecpe v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x float> @test_vrecpeq_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VRECPEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VRECPEQ_V1_I:%.*]] = call <4 x float> @llvm.aarch64.neon.frecpe.v4f32(<4 x float> [[VRECPEQ_V_I]]) #2
+// CHECK:   ret <4 x float> [[VRECPEQ_V1_I]]
 float32x4_t test_vrecpeq_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vrecpeq_f32
   return vrecpeq_f32(a);
-  // CHECK: frecpe v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vrecpeq_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[VRECPEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VRECPEQ_V1_I:%.*]] = call <2 x double> @llvm.aarch64.neon.frecpe.v2f64(<2 x double> [[VRECPEQ_V_I]]) #2
+// CHECK:   ret <2 x double> [[VRECPEQ_V1_I]]
 float64x2_t test_vrecpeq_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vrecpeq_f64
   return vrecpeq_f64(a);
-  // CHECK: frecpe v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vrecpe_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VRECPE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VRECPE_V1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.urecpe.v2i32(<2 x i32> [[VRECPE_V_I]]) #2
+// CHECK:   ret <2 x i32> [[VRECPE_V1_I]]
 uint32x2_t test_vrecpe_u32(uint32x2_t a) {
-  //CHECK-LABEL: test_vrecpe_u32
   return vrecpe_u32(a);
-  // CHECK: urecpe v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vrecpeq_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VRECPEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VRECPEQ_V1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.urecpe.v4i32(<4 x i32> [[VRECPEQ_V_I]]) #2
+// CHECK:   ret <4 x i32> [[VRECPEQ_V1_I]]
 uint32x4_t test_vrecpeq_u32(uint32x4_t a) {
-  //CHECK-LABEL: test_vrecpeq_u32
   return vrecpeq_u32(a);
-  // CHECK: urecpe v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x float> @test_vsqrt_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VSQRT_I:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[TMP1]]) #2
+// CHECK:   ret <2 x float> [[VSQRT_I]]
 float32x2_t test_vsqrt_f32(float32x2_t a) {
-  //CHECK-LABEL: test_vsqrt_f32
   return vsqrt_f32(a);
-  // CHECK: fsqrt v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x float> @test_vsqrtq_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VSQRT_I:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP1]]) #2
+// CHECK:   ret <4 x float> [[VSQRT_I]]
 float32x4_t test_vsqrtq_f32(float32x4_t a) {
-  //CHECK-LABEL: test_vsqrtq_f32
   return vsqrtq_f32(a);
-  // CHECK: fsqrt v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vsqrtq_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VSQRT_I:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP1]]) #2
+// CHECK:   ret <2 x double> [[VSQRT_I]]
 float64x2_t test_vsqrtq_f64(float64x2_t a) {
-  //CHECK-LABEL: test_vsqrtq_f64
   return vsqrtq_f64(a);
-  // CHECK: fsqrt v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x float> @test_vcvt_f32_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VCVT_I:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x float>
+// CHECK:   ret <2 x float> [[VCVT_I]]
 float32x2_t test_vcvt_f32_s32(int32x2_t a) {
-  //CHECK-LABEL: test_vcvt_f32_s32
   return vcvt_f32_s32(a);
-  //CHECK: scvtf v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <2 x float> @test_vcvt_f32_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VCVT_I:%.*]] = uitofp <2 x i32> [[TMP1]] to <2 x float>
+// CHECK:   ret <2 x float> [[VCVT_I]]
 float32x2_t test_vcvt_f32_u32(uint32x2_t a) {
-  //CHECK-LABEL: test_vcvt_f32_u32
   return vcvt_f32_u32(a);
-  //CHECK: ucvtf v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 }
 
+// CHECK-LABEL: define <4 x float> @test_vcvtq_f32_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VCVT_I:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float>
+// CHECK:   ret <4 x float> [[VCVT_I]]
 float32x4_t test_vcvtq_f32_s32(int32x4_t a) {
-  //CHECK-LABEL: test_vcvtq_f32_s32
   return vcvtq_f32_s32(a);
-  //CHECK: scvtf v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <4 x float> @test_vcvtq_f32_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VCVT_I:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x float>
+// CHECK:   ret <4 x float> [[VCVT_I]]
 float32x4_t test_vcvtq_f32_u32(uint32x4_t a) {
-  //CHECK-LABEL: test_vcvtq_f32_u32
   return vcvtq_f32_u32(a);
-  //CHECK: ucvtf v{{[0-9]+}}.4s, v{{[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vcvtq_f64_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VCVT_I:%.*]] = sitofp <2 x i64> [[TMP1]] to <2 x double>
+// CHECK:   ret <2 x double> [[VCVT_I]]
 float64x2_t test_vcvtq_f64_s64(int64x2_t a) {
-  //CHECK-LABEL: test_vcvtq_f64_s64
   return vcvtq_f64_s64(a);
-  //CHECK: scvtf v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x double> @test_vcvtq_f64_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VCVT_I:%.*]] = uitofp <2 x i64> [[TMP1]] to <2 x double>
+// CHECK:   ret <2 x double> [[VCVT_I]]
 float64x2_t test_vcvtq_f64_u64(uint64x2_t a) {
-  //CHECK-LABEL: test_vcvtq_f64_u64
   return vcvtq_f64_u64(a);
-  //CHECK: ucvtf v{{[0-9]+}}.2d, v{{[0-9]+}}.2d
 }
diff --git a/test/CodeGen/aarch64-neon-perm.c b/test/CodeGen/aarch64-neon-perm.c
index 07edc11a255a0..ca9f15d22c677 100644
--- a/test/CodeGen/aarch64-neon-perm.c
+++ b/test/CodeGen/aarch64-neon-perm.c
@@ -1,1092 +1,2279 @@
-// REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN:   -ffp-contract=fast -S -O3 -o - %s | FileCheck %s
+// RUN:  -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
 
 // Test new aarch64 intrinsics and types
 #include <arm_neon.h>
 
+// CHECK-LABEL: define <8 x i8> @test_vuzp1_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 int8x8_t test_vuzp1_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vuzp1_s8
   return vuzp1_s8(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vuzp1q_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 int8x16_t test_vuzp1q_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vuzp1q_s8
   return vuzp1q_s8(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vuzp1_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 int16x4_t test_vuzp1_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vuzp1_s16
   return vuzp1_s16(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vuzp1q_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 int16x8_t test_vuzp1q_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vuzp1q_s16
   return vuzp1q_s16(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vuzp1_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 int32x2_t test_vuzp1_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vuzp1_s32
   return vuzp1_s32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v0.2s, v0.2s, v1.2s}}
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vuzp1q_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 int32x4_t test_vuzp1q_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vuzp1q_s32
   return vuzp1q_s32(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vuzp1q_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 int64x2_t test_vuzp1q_s64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vuzp1q_s64
   return vuzp1q_s64(a, b);
-  // CHECK: {{ins v[0-9]+.d\[1\], v[0-9]+.d\[0\]|zip1 v0.2d, v0.2d, v1.2d}}
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vuzp1_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 uint8x8_t test_vuzp1_u8(uint8x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vuzp1_u8
   return vuzp1_u8(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vuzp1q_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 uint8x16_t test_vuzp1q_u8(uint8x16_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vuzp1q_u8
   return vuzp1q_u8(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vuzp1_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 uint16x4_t test_vuzp1_u16(uint16x4_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vuzp1_u16
   return vuzp1_u16(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vuzp1q_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 uint16x8_t test_vuzp1q_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vuzp1q_u16
   return vuzp1q_u16(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vuzp1_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 uint32x2_t test_vuzp1_u32(uint32x2_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vuzp1_u32
   return vuzp1_u32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v0.2s, v0.2s, v1.2s}}
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vuzp1q_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 uint32x4_t test_vuzp1q_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vuzp1q_u32
   return vuzp1q_u32(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vuzp1q_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 uint64x2_t test_vuzp1q_u64(uint64x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vuzp1q_u64
   return vuzp1q_u64(a, b);
-  // CHECK: {{ins v[0-9]+.d\[1\], v[0-9]+.d\[0\]|zip1 v0.2d, v0.2d, v1.2d}}
 }
 
+// CHECK-LABEL: define <2 x float> @test_vuzp1_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x float> [[SHUFFLE_I]]
 float32x2_t test_vuzp1_f32(float32x2_t a, float32x2_t b) {
-  // CHECK-LABEL: test_vuzp1_f32
   return vuzp1_f32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v0.2s, v0.2s, v1.2s}}
 }
 
+// CHECK-LABEL: define <4 x float> @test_vuzp1q_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK:   ret <4 x float> [[SHUFFLE_I]]
 float32x4_t test_vuzp1q_f32(float32x4_t a, float32x4_t b) {
-  // CHECK-LABEL: test_vuzp1q_f32
   return vuzp1q_f32(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vuzp1q_f64(<2 x double> %a, <2 x double> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x double> [[SHUFFLE_I]]
 float64x2_t test_vuzp1q_f64(float64x2_t a, float64x2_t b) {
-  // CHECK-LABEL: test_vuzp1q_f64
   return vuzp1q_f64(a, b);
-  // CHECK: {{ins v[0-9]+.d\[1\], v[0-9]+.d\[0\]|zip1 v0.2d, v0.2d, v1.2d}}
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vuzp1_p8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 poly8x8_t test_vuzp1_p8(poly8x8_t a, poly8x8_t b) {
-  // CHECK-LABEL: test_vuzp1_p8
   return vuzp1_p8(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vuzp1q_p8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 poly8x16_t test_vuzp1q_p8(poly8x16_t a, poly8x16_t b) {
-  // CHECK-LABEL: test_vuzp1q_p8
   return vuzp1q_p8(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vuzp1_p16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 poly16x4_t test_vuzp1_p16(poly16x4_t a, poly16x4_t b) {
-  // CHECK-LABEL: test_vuzp1_p16
   return vuzp1_p16(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vuzp1q_p16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 poly16x8_t test_vuzp1q_p16(poly16x8_t a, poly16x8_t b) {
-  // CHECK-LABEL: test_vuzp1q_p16
   return vuzp1q_p16(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vuzp2_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 int8x8_t test_vuzp2_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vuzp2_s8
   return vuzp2_s8(a, b);
-  // CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vuzp2q_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 int8x16_t test_vuzp2q_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vuzp2q_s8
   return vuzp2q_s8(a, b);
-  // CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vuzp2_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 int16x4_t test_vuzp2_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vuzp2_s16
   return vuzp2_s16(a, b);
-  // CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vuzp2q_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 int16x8_t test_vuzp2q_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vuzp2q_s16
   return vuzp2q_s16(a, b);
-  // CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vuzp2_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 int32x2_t test_vuzp2_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vuzp2_s32
   return vuzp2_s32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v0.2s, v0.2s, v1.2s}}
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vuzp2q_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 int32x4_t test_vuzp2q_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vuzp2q_s32
   return vuzp2q_s32(a, b);
-  // CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vuzp2q_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 int64x2_t test_vuzp2q_s64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vuzp2q_s64
   return vuzp2q_s64(a, b);
-  // CHECK: {{ins v[0-9]+.d\[0\], v[0-9]+.d\[1\]|zip2 v0.2d, v0.2d, v1.2d}}
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vuzp2_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 uint8x8_t test_vuzp2_u8(uint8x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vuzp2_u8
   return vuzp2_u8(a, b);
-  // CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vuzp2q_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 uint8x16_t test_vuzp2q_u8(uint8x16_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vuzp2q_u8
   return vuzp2q_u8(a, b);
-  // CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vuzp2_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 uint16x4_t test_vuzp2_u16(uint16x4_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vuzp2_u16
   return vuzp2_u16(a, b);
-  // CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vuzp2q_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 uint16x8_t test_vuzp2q_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vuzp2q_u16
   return vuzp2q_u16(a, b);
-  // CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vuzp2_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 uint32x2_t test_vuzp2_u32(uint32x2_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vuzp2_u32
   return vuzp2_u32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v0.2s, v0.2s, v1.2s}}
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vuzp2q_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 uint32x4_t test_vuzp2q_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vuzp2q_u32
   return vuzp2q_u32(a, b);
-  // CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vuzp2q_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 uint64x2_t test_vuzp2q_u64(uint64x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vuzp2q_u64
   return vuzp2q_u64(a, b);
-  // CHECK: {{ins v[0-9]+.d\[0\], v[0-9]+.d\[1\]|zip2 v0.2d, v0.2d, v1.2d}}
 }
 
+// CHECK-LABEL: define <2 x float> @test_vuzp2_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x float> [[SHUFFLE_I]]
 float32x2_t test_vuzp2_f32(float32x2_t a, float32x2_t b) {
-  // CHECK-LABEL: test_vuzp2_f32
   return vuzp2_f32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v0.2s, v0.2s, v1.2s}}
 }
 
+// CHECK-LABEL: define <4 x float> @test_vuzp2q_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK:   ret <4 x float> [[SHUFFLE_I]]
 float32x4_t test_vuzp2q_f32(float32x4_t a, float32x4_t b) {
-  // CHECK-LABEL: test_vuzp2q_f32
   return vuzp2q_f32(a, b);
-  // CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vuzp2q_f64(<2 x double> %a, <2 x double> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x double> [[SHUFFLE_I]]
 float64x2_t test_vuzp2q_f64(float64x2_t a, float64x2_t b) {
-  // CHECK-LABEL: test_vuzp2q_f64
   return vuzp2q_f64(a, b);
-  // CHECK: {{ins v[0-9]+.d\[0\], v[0-9]+.d\[1\]|zip2 v0.2d, v0.2d, v1.2d}}
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vuzp2_p8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 poly8x8_t test_vuzp2_p8(poly8x8_t a, poly8x8_t b) {
-  // CHECK-LABEL: test_vuzp2_p8
   return vuzp2_p8(a, b);
-  // CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vuzp2q_p8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 poly8x16_t test_vuzp2q_p8(poly8x16_t a, poly8x16_t b) {
-  // CHECK-LABEL: test_vuzp2q_p8
   return vuzp2q_p8(a, b);
-  // CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vuzp2_p16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 poly16x4_t test_vuzp2_p16(poly16x4_t a, poly16x4_t b) {
-  // CHECK-LABEL: test_vuzp2_p16
   return vuzp2_p16(a, b);
-  // CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vuzp2q_p16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 poly16x8_t test_vuzp2q_p16(poly16x8_t a, poly16x8_t b) {
-  // CHECK-LABEL: test_vuzp2q_p16
   return vuzp2q_p16(a, b);
-  // CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vzip1_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 int8x8_t test_vzip1_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vzip1_s8
   return vzip1_s8(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vzip1q_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 int8x16_t test_vzip1q_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vzip1q_s8
   return vzip1q_s8(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vzip1_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 int16x4_t test_vzip1_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vzip1_s16
   return vzip1_s16(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vzip1q_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 int16x8_t test_vzip1q_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vzip1q_s16
   return vzip1q_s16(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vzip1_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 int32x2_t test_vzip1_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vzip1_s32
   return vzip1_s32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v0.2s, v0.2s, v1.2s}}
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vzip1q_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 int32x4_t test_vzip1q_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vzip1q_s32
   return vzip1q_s32(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vzip1q_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 int64x2_t test_vzip1q_s64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vzip1q_s64
   return vzip1q_s64(a, b);
-  // CHECK: {{ins v[0-9]+.d\[1\], v[0-9]+.d\[0\]|zip1 v0.2d, v0.2d, v1.2d}}
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vzip1_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 uint8x8_t test_vzip1_u8(uint8x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vzip1_u8
   return vzip1_u8(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vzip1q_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 uint8x16_t test_vzip1q_u8(uint8x16_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vzip1q_u8
   return vzip1q_u8(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vzip1_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 uint16x4_t test_vzip1_u16(uint16x4_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vzip1_u16
   return vzip1_u16(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vzip1q_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 uint16x8_t test_vzip1q_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vzip1q_u16
   return vzip1q_u16(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vzip1_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 uint32x2_t test_vzip1_u32(uint32x2_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vzip1_u32
   return vzip1_u32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v0.2s, v0.2s, v1.2s}}
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vzip1q_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 uint32x4_t test_vzip1q_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vzip1q_u32
   return vzip1q_u32(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vzip1q_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 uint64x2_t test_vzip1q_u64(uint64x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vzip1q_u64
   return vzip1q_u64(a, b);
-  // CHECK: {{ins v[0-9]+.d\[1\], v[0-9]+.d\[0\]|zip1 v0.2d, v0.2d, v1.2d}}
 }
 
+// CHECK-LABEL: define <2 x float> @test_vzip1_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x float> [[SHUFFLE_I]]
 float32x2_t test_vzip1_f32(float32x2_t a, float32x2_t b) {
-  // CHECK-LABEL: test_vzip1_f32
   return vzip1_f32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v0.2s, v0.2s, v1.2s}}
 }
 
+// CHECK-LABEL: define <4 x float> @test_vzip1q_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK:   ret <4 x float> [[SHUFFLE_I]]
 float32x4_t test_vzip1q_f32(float32x4_t a, float32x4_t b) {
-  // CHECK-LABEL: test_vzip1q_f32
   return vzip1q_f32(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vzip1q_f64(<2 x double> %a, <2 x double> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x double> [[SHUFFLE_I]]
 float64x2_t test_vzip1q_f64(float64x2_t a, float64x2_t b) {
-  // CHECK-LABEL: test_vzip1q_f64
   return vzip1q_f64(a, b);
-  // CHECK: {{ins v[0-9]+.d\[1\], v[0-9]+.d\[0\]|zip1 v0.2d, v0.2d, v1.2d}}
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vzip1_p8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 poly8x8_t test_vzip1_p8(poly8x8_t a, poly8x8_t b) {
-  // CHECK-LABEL: test_vzip1_p8
   return vzip1_p8(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vzip1q_p8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 poly8x16_t test_vzip1q_p8(poly8x16_t a, poly8x16_t b) {
-  // CHECK-LABEL: test_vzip1q_p8
   return vzip1q_p8(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vzip1_p16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 poly16x4_t test_vzip1_p16(poly16x4_t a, poly16x4_t b) {
-  // CHECK-LABEL: test_vzip1_p16
   return vzip1_p16(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vzip1q_p16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 poly16x8_t test_vzip1q_p16(poly16x8_t a, poly16x8_t b) {
-  // CHECK-LABEL: test_vzip1q_p16
   return vzip1q_p16(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vzip2_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 int8x8_t test_vzip2_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vzip2_s8
   return vzip2_s8(a, b);
-  // CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vzip2q_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 int8x16_t test_vzip2q_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vzip2q_s8
   return vzip2q_s8(a, b);
-  // CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vzip2_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 int16x4_t test_vzip2_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vzip2_s16
   return vzip2_s16(a, b);
-  // CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vzip2q_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 int16x8_t test_vzip2q_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vzip2q_s16
   return vzip2q_s16(a, b);
-  // CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vzip2_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 int32x2_t test_vzip2_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vzip2_s32
   return vzip2_s32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v0.2s, v0.2s, v1.2s}}
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vzip2q_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 int32x4_t test_vzip2q_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vzip2q_s32
   return vzip2q_s32(a, b);
-  // CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vzip2q_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 int64x2_t test_vzip2q_s64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vzip2q_s64
   return vzip2q_s64(a, b);
-  // CHECK: {{ins v[0-9]+.d\[0\], v[0-9]+.d\[1\]|zip2 v0.2d, v0.2d, v1.2d}}
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vzip2_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 uint8x8_t test_vzip2_u8(uint8x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vzip2_u8
   return vzip2_u8(a, b);
-  // CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vzip2q_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 uint8x16_t test_vzip2q_u8(uint8x16_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vzip2q_u8
   return vzip2q_u8(a, b);
-  // CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vzip2_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 uint16x4_t test_vzip2_u16(uint16x4_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vzip2_u16
   return vzip2_u16(a, b);
-  // CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vzip2q_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 uint16x8_t test_vzip2q_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vzip2q_u16
   return vzip2q_u16(a, b);
-  // CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vzip2_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 uint32x2_t test_vzip2_u32(uint32x2_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vzip2_u32
   return vzip2_u32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v0.2s, v0.2s, v1.2s}}
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vzip2q_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 uint32x4_t test_vzip2q_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vzip2q_u32
   return vzip2q_u32(a, b);
-  // CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vzip2q_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 uint64x2_t test_vzip2q_u64(uint64x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vzip2q_u64
   return vzip2q_u64(a, b);
-  // CHECK: {{ins v[0-9]+.d\[0\], v[0-9]+.d\[1\]|zip2 v0.2d, v0.2d, v1.2d}}
 }
 
+// CHECK-LABEL: define <2 x float> @test_vzip2_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x float> [[SHUFFLE_I]]
 float32x2_t test_vzip2_f32(float32x2_t a, float32x2_t b) {
-  // CHECK-LABEL: test_vzip2_f32
   return vzip2_f32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v0.2s, v0.2s, v1.2s}}
 }
 
+// CHECK-LABEL: define <4 x float> @test_vzip2q_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK:   ret <4 x float> [[SHUFFLE_I]]
 float32x4_t test_vzip2q_f32(float32x4_t a, float32x4_t b) {
-  // CHECK-LABEL: test_vzip2q_f32
   return vzip2q_f32(a, b);
-  // CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vzip2q_f64(<2 x double> %a, <2 x double> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x double> [[SHUFFLE_I]]
 float64x2_t test_vzip2q_f64(float64x2_t a, float64x2_t b) {
-  // CHECK-LABEL: test_vzip2q_f64
   return vzip2q_f64(a, b);
-  // CHECK: {{ins v[0-9]+.d\[0\], v[0-9]+.d\[1\]|zip2 v0.2d, v0.2d, v1.2d}}
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vzip2_p8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 poly8x8_t test_vzip2_p8(poly8x8_t a, poly8x8_t b) {
-  // CHECK-LABEL: test_vzip2_p8
   return vzip2_p8(a, b);
-  // CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vzip2q_p8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 poly8x16_t test_vzip2q_p8(poly8x16_t a, poly8x16_t b) {
-  // CHECK-LABEL: test_vzip2q_p8
   return vzip2q_p8(a, b);
-  // CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vzip2_p16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 poly16x4_t test_vzip2_p16(poly16x4_t a, poly16x4_t b) {
-  // CHECK-LABEL: test_vzip2_p16
   return vzip2_p16(a, b);
-  // CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vzip2q_p16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 poly16x8_t test_vzip2q_p16(poly16x8_t a, poly16x8_t b) {
-  // CHECK-LABEL: test_vzip2q_p16
   return vzip2q_p16(a, b);
-  // CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vtrn1_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 int8x8_t test_vtrn1_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vtrn1_s8
   return vtrn1_s8(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vtrn1q_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 int8x16_t test_vtrn1q_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vtrn1q_s8
   return vtrn1q_s8(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vtrn1_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 int16x4_t test_vtrn1_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vtrn1_s16
   return vtrn1_s16(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vtrn1q_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 int16x8_t test_vtrn1q_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vtrn1q_s16
   return vtrn1q_s16(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vtrn1_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 int32x2_t test_vtrn1_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vtrn1_s32
   return vtrn1_s32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v0.2s, v0.2s, v1.2s}}
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vtrn1q_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 int32x4_t test_vtrn1q_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vtrn1q_s32
   return vtrn1q_s32(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vtrn1q_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 int64x2_t test_vtrn1q_s64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vtrn1q_s64
   return vtrn1q_s64(a, b);
-  // CHECK: {{ins v[0-9]+.d\[1\], v[0-9]+.d\[0\]|zip1 v0.2d, v0.2d, v1.2d}}
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vtrn1_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 uint8x8_t test_vtrn1_u8(uint8x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vtrn1_u8
   return vtrn1_u8(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vtrn1q_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 uint8x16_t test_vtrn1q_u8(uint8x16_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vtrn1q_u8
   return vtrn1q_u8(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vtrn1_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 uint16x4_t test_vtrn1_u16(uint16x4_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vtrn1_u16
   return vtrn1_u16(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vtrn1q_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 uint16x8_t test_vtrn1q_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vtrn1q_u16
   return vtrn1q_u16(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vtrn1_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 uint32x2_t test_vtrn1_u32(uint32x2_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vtrn1_u32
   return vtrn1_u32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v0.2s, v0.2s, v1.2s}}
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vtrn1q_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 uint32x4_t test_vtrn1q_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vtrn1q_u32
   return vtrn1q_u32(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vtrn1q_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 uint64x2_t test_vtrn1q_u64(uint64x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vtrn1q_u64
   return vtrn1q_u64(a, b);
-  // CHECK: {{ins v[0-9]+.d\[1\], v[0-9]+.d\[0\]|zip1 v0.2d, v0.2d, v1.2d}}
 }
 
+// CHECK-LABEL: define <2 x float> @test_vtrn1_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x float> [[SHUFFLE_I]]
 float32x2_t test_vtrn1_f32(float32x2_t a, float32x2_t b) {
-  // CHECK-LABEL: test_vtrn1_f32
   return vtrn1_f32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v0.2s, v0.2s, v1.2s}}
 }
 
+// CHECK-LABEL: define <4 x float> @test_vtrn1q_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK:   ret <4 x float> [[SHUFFLE_I]]
 float32x4_t test_vtrn1q_f32(float32x4_t a, float32x4_t b) {
-  // CHECK-LABEL: test_vtrn1q_f32
   return vtrn1q_f32(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vtrn1q_f64(<2 x double> %a, <2 x double> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x double> [[SHUFFLE_I]]
 float64x2_t test_vtrn1q_f64(float64x2_t a, float64x2_t b) {
-  // CHECK-LABEL: test_vtrn1q_f64
   return vtrn1q_f64(a, b);
-  // CHECK: {{ins v[0-9]+.d\[1\], v[0-9]+.d\[0\]|zip1 v0.2d, v0.2d, v1.2d}}
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vtrn1_p8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 poly8x8_t test_vtrn1_p8(poly8x8_t a, poly8x8_t b) {
-  // CHECK-LABEL: test_vtrn1_p8
   return vtrn1_p8(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vtrn1q_p8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 poly8x16_t test_vtrn1q_p8(poly8x16_t a, poly8x16_t b) {
-  // CHECK-LABEL: test_vtrn1q_p8
   return vtrn1q_p8(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vtrn1_p16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 poly16x4_t test_vtrn1_p16(poly16x4_t a, poly16x4_t b) {
-  // CHECK-LABEL: test_vtrn1_p16
   return vtrn1_p16(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vtrn1q_p16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 poly16x8_t test_vtrn1q_p16(poly16x8_t a, poly16x8_t b) {
-  // CHECK-LABEL: test_vtrn1q_p16
   return vtrn1q_p16(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vtrn2_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 int8x8_t test_vtrn2_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vtrn2_s8
   return vtrn2_s8(a, b);
-  // CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vtrn2q_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 int8x16_t test_vtrn2q_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vtrn2q_s8
   return vtrn2q_s8(a, b);
-  // CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vtrn2_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 int16x4_t test_vtrn2_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vtrn2_s16
   return vtrn2_s16(a, b);
-  // CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vtrn2q_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 int16x8_t test_vtrn2q_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vtrn2q_s16
   return vtrn2q_s16(a, b);
-  // CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vtrn2_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 int32x2_t test_vtrn2_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vtrn2_s32
   return vtrn2_s32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v0.2s, v0.2s, v1.2s}}
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vtrn2q_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 int32x4_t test_vtrn2q_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vtrn2q_s32
   return vtrn2q_s32(a, b);
-  // CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vtrn2q_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 int64x2_t test_vtrn2q_s64(int64x2_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vtrn2q_s64
   return vtrn2q_s64(a, b);
-  // CHECK: {{ins v[0-9]+.d\[0\], v[0-9]+.d\[1\]|zip2 v0.2d, v0.2d, v1.2d}}
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vtrn2_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 uint8x8_t test_vtrn2_u8(uint8x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vtrn2_u8
   return vtrn2_u8(a, b);
-  // CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vtrn2q_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 uint8x16_t test_vtrn2q_u8(uint8x16_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vtrn2q_u8
   return vtrn2q_u8(a, b);
-  // CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vtrn2_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 uint16x4_t test_vtrn2_u16(uint16x4_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vtrn2_u16
   return vtrn2_u16(a, b);
-  // CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vtrn2q_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 uint16x8_t test_vtrn2q_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vtrn2q_u16
   return vtrn2q_u16(a, b);
-  // CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vtrn2_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 uint32x2_t test_vtrn2_u32(uint32x2_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vtrn2_u32
   return vtrn2_u32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v0.2s, v0.2s, v1.2s}}
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vtrn2q_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 uint32x4_t test_vtrn2q_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vtrn2q_u32
   return vtrn2q_u32(a, b);
-  // CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vtrn2q_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 uint64x2_t test_vtrn2q_u64(uint64x2_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vtrn2q_u64
   return vtrn2q_u64(a, b);
-  // CHECK: {{ins v[0-9]+.d\[0\], v[0-9]+.d\[1\]|zip2 v0.2d, v0.2d, v1.2d}}
 }
 
+// CHECK-LABEL: define <2 x float> @test_vtrn2_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x float> [[SHUFFLE_I]]
 float32x2_t test_vtrn2_f32(float32x2_t a, float32x2_t b) {
-  // CHECK-LABEL: test_vtrn2_f32
   return vtrn2_f32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v0.2s, v0.2s, v1.2s}}
 }
 
+// CHECK-LABEL: define <4 x float> @test_vtrn2q_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK:   ret <4 x float> [[SHUFFLE_I]]
 float32x4_t test_vtrn2q_f32(float32x4_t a, float32x4_t b) {
-  // CHECK-LABEL: test_vtrn2q_f32
   return vtrn2q_f32(a, b);
-  // CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
 
+// CHECK-LABEL: define <2 x double> @test_vtrn2q_f64(<2 x double> %a, <2 x double> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x double> [[SHUFFLE_I]]
 float64x2_t test_vtrn2q_f64(float64x2_t a, float64x2_t b) {
-  // CHECK-LABEL: test_vtrn2q_f64
   return vtrn2q_f64(a, b);
-  // CHECK: {{ins v[0-9]+.d\[0\], v[0-9]+.d\[1\]|zip2 v0.2d, v0.2d, v1.2d}}
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vtrn2_p8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 poly8x8_t test_vtrn2_p8(poly8x8_t a, poly8x8_t b) {
-  // CHECK-LABEL: test_vtrn2_p8
   return vtrn2_p8(a, b);
-  // CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vtrn2q_p8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 poly8x16_t test_vtrn2q_p8(poly8x16_t a, poly8x16_t b) {
-  // CHECK-LABEL: test_vtrn2q_p8
   return vtrn2q_p8(a, b);
-  // CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vtrn2_p16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 poly16x4_t test_vtrn2_p16(poly16x4_t a, poly16x4_t b) {
-  // CHECK-LABEL: test_vtrn2_p16
   return vtrn2_p16(a, b);
-  // CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vtrn2q_p16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 poly16x8_t test_vtrn2q_p16(poly16x8_t a, poly16x8_t b) {
-  // CHECK-LABEL: test_vtrn2q_p16
   return vtrn2q_p16(a, b);
-  // CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define %struct.int8x8x2_t @test_vuzp_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK:   store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK:   store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP5:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP7:%.*]] = extractvalue %struct.int8x8x2_t [[TMP5]], 0
+// CHECK:   store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8
+// CHECK:   [[TMP8:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int8x8x2_t [[TMP8]]
 int8x8x2_t test_vuzp_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vuzp_s8
   return vuzp_s8(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-  // CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define %struct.int16x4x2_t @test_vuzp_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK:   store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK:   store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.int16x4x2_t [[TMP9]], 0
+// CHECK:   store [2 x <4 x i16>] [[TMP11]], [2 x <4 x i16>]* [[TMP10]], align 8
+// CHECK:   [[TMP12:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int16x4x2_t [[TMP12]]
 int16x4x2_t test_vuzp_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vuzp_s16
   return vuzp_s16(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
-  // CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
+// CHECK-LABEL: define %struct.int32x2x2_t @test_vuzp_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 2>
+// CHECK:   store <2 x i32> [[VUZP_I]], <2 x i32>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 1, i32 3>
+// CHECK:   store <2 x i32> [[VUZP1_I]], <2 x i32>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.int32x2x2_t [[TMP9]], 0
+// CHECK:   store [2 x <2 x i32>] [[TMP11]], [2 x <2 x i32>]* [[TMP10]], align 8
+// CHECK:   [[TMP12:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int32x2x2_t [[TMP12]]
 int32x2x2_t test_vuzp_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vuzp_s32
   return vuzp_s32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v2.2s, v0.2s, v1.2s}}
-  // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v1.2s, v0.2s, v1.2s}}
 }
+// CHECK-LABEL: define %struct.uint8x8x2_t @test_vuzp_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK:   store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK:   store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP5:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP7:%.*]] = extractvalue %struct.uint8x8x2_t [[TMP5]], 0
+// CHECK:   store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8
+// CHECK:   [[TMP8:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint8x8x2_t [[TMP8]]
 uint8x8x2_t test_vuzp_u8(uint8x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vuzp_u8
   return vuzp_u8(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-  // CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
+// CHECK-LABEL: define %struct.uint16x4x2_t @test_vuzp_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK:   store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK:   store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.uint16x4x2_t [[TMP9]], 0
+// CHECK:   store [2 x <4 x i16>] [[TMP11]], [2 x <4 x i16>]* [[TMP10]], align 8
+// CHECK:   [[TMP12:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint16x4x2_t [[TMP12]]
 uint16x4x2_t test_vuzp_u16(uint16x4_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vuzp_u16
   return vuzp_u16(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
-  // CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
+// CHECK-LABEL: define %struct.uint32x2x2_t @test_vuzp_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 2>
+// CHECK:   store <2 x i32> [[VUZP_I]], <2 x i32>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 1, i32 3>
+// CHECK:   store <2 x i32> [[VUZP1_I]], <2 x i32>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.uint32x2x2_t [[TMP9]], 0
+// CHECK:   store [2 x <2 x i32>] [[TMP11]], [2 x <2 x i32>]* [[TMP10]], align 8
+// CHECK:   [[TMP12:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint32x2x2_t [[TMP12]]
 uint32x2x2_t test_vuzp_u32(uint32x2_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vuzp_u32
   return vuzp_u32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v2.2s, v0.2s, v1.2s}}
-  // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v1.2s, v0.2s, v1.2s}}
 }
+// CHECK-LABEL: define %struct.float32x2x2_t @test_vuzp_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x float>*
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 0, i32 2>
+// CHECK:   store <2 x float> [[VUZP_I]], <2 x float>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 1, i32 3>
+// CHECK:   store <2 x float> [[VUZP1_I]], <2 x float>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.float32x2x2_t [[TMP9]], 0
+// CHECK:   store [2 x <2 x float>] [[TMP11]], [2 x <2 x float>]* [[TMP10]], align 8
+// CHECK:   [[TMP12:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float32x2x2_t [[TMP12]]
 float32x2x2_t test_vuzp_f32(float32x2_t a, float32x2_t b) {
-  // CHECK-LABEL: test_vuzp_f32
   return vuzp_f32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v2.2s, v0.2s, v1.2s}}
-  // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v1.2s, v0.2s, v1.2s}}
 }
+// CHECK-LABEL: define %struct.poly8x8x2_t @test_vuzp_p8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK:   store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK:   store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP5:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP7:%.*]] = extractvalue %struct.poly8x8x2_t [[TMP5]], 0
+// CHECK:   store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8
+// CHECK:   [[TMP8:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly8x8x2_t [[TMP8]]
 poly8x8x2_t test_vuzp_p8(poly8x8_t a, poly8x8_t b) {
-  // CHECK-LABEL: test_vuzp_p8
   return vuzp_p8(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-  // CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
+// CHECK-LABEL: define %struct.poly16x4x2_t @test_vuzp_p16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK:   store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK:   store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.poly16x4x2_t [[TMP9]], 0
+// CHECK:   store [2 x <4 x i16>] [[TMP11]], [2 x <4 x i16>]* [[TMP10]], align 8
+// CHECK:   [[TMP12:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly16x4x2_t [[TMP12]]
 poly16x4x2_t test_vuzp_p16(poly16x4_t a, poly16x4_t b) {
-  // CHECK-LABEL: test_vuzp_p16
   return vuzp_p16(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
-  // CHECK: uzp2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
+// CHECK-LABEL: define %struct.int8x16x2_t @test_vuzpq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+// CHECK:   store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+// CHECK:   store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP5:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP7:%.*]] = extractvalue %struct.int8x16x2_t [[TMP5]], 0
+// CHECK:   store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16
+// CHECK:   [[TMP8:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int8x16x2_t [[TMP8]]
 int8x16x2_t test_vuzpq_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vuzpq_s8
   return vuzpq_s8(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-  // CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
+// CHECK-LABEL: define %struct.int16x8x2_t @test_vuzpq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK:   store <8 x i16> [[VUZP_I]], <8 x i16>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK:   store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.int16x8x2_t [[TMP9]], 0
+// CHECK:   store [2 x <8 x i16>] [[TMP11]], [2 x <8 x i16>]* [[TMP10]], align 16
+// CHECK:   [[TMP12:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int16x8x2_t [[TMP12]]
 int16x8x2_t test_vuzpq_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vuzpq_s16
   return vuzpq_s16(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
-  // CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
+// CHECK-LABEL: define %struct.int32x4x2_t @test_vuzpq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK:   store <4 x i32> [[VUZP_I]], <4 x i32>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK:   store <4 x i32> [[VUZP1_I]], <4 x i32>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.int32x4x2_t [[TMP9]], 0
+// CHECK:   store [2 x <4 x i32>] [[TMP11]], [2 x <4 x i32>]* [[TMP10]], align 16
+// CHECK:   [[TMP12:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int32x4x2_t [[TMP12]]
 int32x4x2_t test_vuzpq_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vuzpq_s32
   return vuzpq_s32(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
+// CHECK-LABEL: define %struct.uint8x16x2_t @test_vuzpq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+// CHECK:   store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+// CHECK:   store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP5:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP7:%.*]] = extractvalue %struct.uint8x16x2_t [[TMP5]], 0
+// CHECK:   store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16
+// CHECK:   [[TMP8:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint8x16x2_t [[TMP8]]
 uint8x16x2_t test_vuzpq_u8(uint8x16_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vuzpq_u8
   return vuzpq_u8(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-  // CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
+// CHECK-LABEL: define %struct.uint16x8x2_t @test_vuzpq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK:   store <8 x i16> [[VUZP_I]], <8 x i16>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK:   store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.uint16x8x2_t [[TMP9]], 0
+// CHECK:   store [2 x <8 x i16>] [[TMP11]], [2 x <8 x i16>]* [[TMP10]], align 16
+// CHECK:   [[TMP12:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint16x8x2_t [[TMP12]]
 uint16x8x2_t test_vuzpq_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vuzpq_u16
   return vuzpq_u16(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
-  // CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
+// CHECK-LABEL: define %struct.uint32x4x2_t @test_vuzpq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK:   store <4 x i32> [[VUZP_I]], <4 x i32>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK:   store <4 x i32> [[VUZP1_I]], <4 x i32>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.uint32x4x2_t [[TMP9]], 0
+// CHECK:   store [2 x <4 x i32>] [[TMP11]], [2 x <4 x i32>]* [[TMP10]], align 16
+// CHECK:   [[TMP12:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint32x4x2_t [[TMP12]]
 uint32x4x2_t test_vuzpq_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vuzpq_u32
   return vuzpq_u32(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
+// CHECK-LABEL: define %struct.float32x4x2_t @test_vuzpq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x float>*
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK:   store <4 x float> [[VUZP_I]], <4 x float>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK:   store <4 x float> [[VUZP1_I]], <4 x float>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.float32x4x2_t [[TMP9]], 0
+// CHECK:   store [2 x <4 x float>] [[TMP11]], [2 x <4 x float>]* [[TMP10]], align 16
+// CHECK:   [[TMP12:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float32x4x2_t [[TMP12]]
 float32x4x2_t test_vuzpq_f32(float32x4_t a, float32x4_t b) {
-  // CHECK-LABEL: test_vuzpq_f32
   return vuzpq_f32(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK: uzp2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
+// CHECK-LABEL: define %struct.poly8x16x2_t @test_vuzpq_p8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+// CHECK:   store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+// CHECK:   store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP5:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP7:%.*]] = extractvalue %struct.poly8x16x2_t [[TMP5]], 0
+// CHECK:   store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16
+// CHECK:   [[TMP8:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly8x16x2_t [[TMP8]]
 poly8x16x2_t test_vuzpq_p8(poly8x16_t a, poly8x16_t b) {
-  // CHECK-LABEL: test_vuzpq_p8
   return vuzpq_p8(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-  // CHECK: uzp2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
+// CHECK-LABEL: define %struct.poly16x8x2_t @test_vuzpq_p16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK:   store <8 x i16> [[VUZP_I]], <8 x i16>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK:   store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.poly16x8x2_t [[TMP9]], 0
+// CHECK:   store [2 x <8 x i16>] [[TMP11]], [2 x <8 x i16>]* [[TMP10]], align 16
+// CHECK:   [[TMP12:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly16x8x2_t [[TMP12]]
 poly16x8x2_t test_vuzpq_p16(poly16x8_t a, poly16x8_t b) {
-  // CHECK-LABEL: test_vuzpq_p16
   return vuzpq_p16(a, b);
-  // CHECK: uzp1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
-  // CHECK: uzp2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define %struct.int8x8x2_t @test_vzip_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK:   store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK:   store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP5:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP7:%.*]] = extractvalue %struct.int8x8x2_t [[TMP5]], 0
+// CHECK:   store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8
+// CHECK:   [[TMP8:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int8x8x2_t [[TMP8]]
 int8x8x2_t test_vzip_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vzip_s8
   return vzip_s8(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-  // CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define %struct.int16x4x2_t @test_vzip_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK:   store <4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK:   store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.int16x4x2_t [[TMP9]], 0
+// CHECK:   store [2 x <4 x i16>] [[TMP11]], [2 x <4 x i16>]* [[TMP10]], align 8
+// CHECK:   [[TMP12:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int16x4x2_t [[TMP12]]
 int16x4x2_t test_vzip_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vzip_s16
   return vzip_s16(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
-  // CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
+// CHECK-LABEL: define %struct.int32x2x2_t @test_vzip_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 2>
+// CHECK:   store <2 x i32> [[VZIP_I]], <2 x i32>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 1, i32 3>
+// CHECK:   store <2 x i32> [[VZIP1_I]], <2 x i32>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.int32x2x2_t [[TMP9]], 0
+// CHECK:   store [2 x <2 x i32>] [[TMP11]], [2 x <2 x i32>]* [[TMP10]], align 8
+// CHECK:   [[TMP12:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int32x2x2_t [[TMP12]]
 int32x2x2_t test_vzip_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vzip_s32
   return vzip_s32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v2.2s, v0.2s, v1.2s}}
-  // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v1.2s, v0.2s, v1.2s}}
 }
+// CHECK-LABEL: define %struct.uint8x8x2_t @test_vzip_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK:   store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK:   store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP5:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP7:%.*]] = extractvalue %struct.uint8x8x2_t [[TMP5]], 0
+// CHECK:   store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8
+// CHECK:   [[TMP8:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint8x8x2_t [[TMP8]]
 uint8x8x2_t test_vzip_u8(uint8x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vzip_u8
   return vzip_u8(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-  // CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
+// CHECK-LABEL: define %struct.uint16x4x2_t @test_vzip_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK:   store <4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK:   store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.uint16x4x2_t [[TMP9]], 0
+// CHECK:   store [2 x <4 x i16>] [[TMP11]], [2 x <4 x i16>]* [[TMP10]], align 8
+// CHECK:   [[TMP12:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint16x4x2_t [[TMP12]]
 uint16x4x2_t test_vzip_u16(uint16x4_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vzip_u16
   return vzip_u16(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
-  // CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
+// CHECK-LABEL: define %struct.uint32x2x2_t @test_vzip_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 2>
+// CHECK:   store <2 x i32> [[VZIP_I]], <2 x i32>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 1, i32 3>
+// CHECK:   store <2 x i32> [[VZIP1_I]], <2 x i32>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.uint32x2x2_t [[TMP9]], 0
+// CHECK:   store [2 x <2 x i32>] [[TMP11]], [2 x <2 x i32>]* [[TMP10]], align 8
+// CHECK:   [[TMP12:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint32x2x2_t [[TMP12]]
 uint32x2x2_t test_vzip_u32(uint32x2_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vzip_u32
   return vzip_u32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v2.2s, v0.2s, v1.2s}}
-  // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v1.2s, v0.2s, v1.2s}}
 }
+// CHECK-LABEL: define %struct.float32x2x2_t @test_vzip_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x float>*
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 0, i32 2>
+// CHECK:   store <2 x float> [[VZIP_I]], <2 x float>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 1, i32 3>
+// CHECK:   store <2 x float> [[VZIP1_I]], <2 x float>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.float32x2x2_t [[TMP9]], 0
+// CHECK:   store [2 x <2 x float>] [[TMP11]], [2 x <2 x float>]* [[TMP10]], align 8
+// CHECK:   [[TMP12:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float32x2x2_t [[TMP12]]
 float32x2x2_t test_vzip_f32(float32x2_t a, float32x2_t b) {
-  // CHECK-LABEL: test_vzip_f32
   return vzip_f32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v2.2s, v0.2s, v1.2s}}
-  // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v1.2s, v0.2s, v1.2s}}
 }
+// CHECK-LABEL: define %struct.poly8x8x2_t @test_vzip_p8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK:   store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK:   store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP5:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP7:%.*]] = extractvalue %struct.poly8x8x2_t [[TMP5]], 0
+// CHECK:   store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8
+// CHECK:   [[TMP8:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly8x8x2_t [[TMP8]]
 poly8x8x2_t test_vzip_p8(poly8x8_t a, poly8x8_t b) {
-  // CHECK-LABEL: test_vzip_p8
   return vzip_p8(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-  // CHECK: zip2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
+// CHECK-LABEL: define %struct.poly16x4x2_t @test_vzip_p16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK:   store <4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK:   store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.poly16x4x2_t [[TMP9]], 0
+// CHECK:   store [2 x <4 x i16>] [[TMP11]], [2 x <4 x i16>]* [[TMP10]], align 8
+// CHECK:   [[TMP12:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly16x4x2_t [[TMP12]]
 poly16x4x2_t test_vzip_p16(poly16x4_t a, poly16x4_t b) {
-  // CHECK-LABEL: test_vzip_p16
   return vzip_p16(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
-  // CHECK: zip2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
+// CHECK-LABEL: define %struct.int8x16x2_t @test_vzipq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+// CHECK:   store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+// CHECK:   store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP5:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP7:%.*]] = extractvalue %struct.int8x16x2_t [[TMP5]], 0
+// CHECK:   store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16
+// CHECK:   [[TMP8:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int8x16x2_t [[TMP8]]
 int8x16x2_t test_vzipq_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vzipq_s8
   return vzipq_s8(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-  // CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
+// CHECK-LABEL: define %struct.int16x8x2_t @test_vzipq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK:   store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK:   store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.int16x8x2_t [[TMP9]], 0
+// CHECK:   store [2 x <8 x i16>] [[TMP11]], [2 x <8 x i16>]* [[TMP10]], align 16
+// CHECK:   [[TMP12:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int16x8x2_t [[TMP12]]
 int16x8x2_t test_vzipq_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vzipq_s16
   return vzipq_s16(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
-  // CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
+// CHECK-LABEL: define %struct.int32x4x2_t @test_vzipq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK:   store <4 x i32> [[VZIP_I]], <4 x i32>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK:   store <4 x i32> [[VZIP1_I]], <4 x i32>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.int32x4x2_t [[TMP9]], 0
+// CHECK:   store [2 x <4 x i32>] [[TMP11]], [2 x <4 x i32>]* [[TMP10]], align 16
+// CHECK:   [[TMP12:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int32x4x2_t [[TMP12]]
 int32x4x2_t test_vzipq_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vzipq_s32
   return vzipq_s32(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
+// CHECK-LABEL: define %struct.uint8x16x2_t @test_vzipq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+// CHECK:   store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+// CHECK:   store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP5:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP7:%.*]] = extractvalue %struct.uint8x16x2_t [[TMP5]], 0
+// CHECK:   store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16
+// CHECK:   [[TMP8:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint8x16x2_t [[TMP8]]
 uint8x16x2_t test_vzipq_u8(uint8x16_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vzipq_u8
   return vzipq_u8(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-  // CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
+// CHECK-LABEL: define %struct.uint16x8x2_t @test_vzipq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK:   store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK:   store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.uint16x8x2_t [[TMP9]], 0
+// CHECK:   store [2 x <8 x i16>] [[TMP11]], [2 x <8 x i16>]* [[TMP10]], align 16
+// CHECK:   [[TMP12:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint16x8x2_t [[TMP12]]
 uint16x8x2_t test_vzipq_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vzipq_u16
   return vzipq_u16(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
-  // CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
+// CHECK-LABEL: define %struct.uint32x4x2_t @test_vzipq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK:   store <4 x i32> [[VZIP_I]], <4 x i32>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK:   store <4 x i32> [[VZIP1_I]], <4 x i32>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.uint32x4x2_t [[TMP9]], 0
+// CHECK:   store [2 x <4 x i32>] [[TMP11]], [2 x <4 x i32>]* [[TMP10]], align 16
+// CHECK:   [[TMP12:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint32x4x2_t [[TMP12]]
 uint32x4x2_t test_vzipq_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vzipq_u32
   return vzipq_u32(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
+// CHECK-LABEL: define %struct.float32x4x2_t @test_vzipq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x float>*
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK:   store <4 x float> [[VZIP_I]], <4 x float>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK:   store <4 x float> [[VZIP1_I]], <4 x float>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.float32x4x2_t [[TMP9]], 0
+// CHECK:   store [2 x <4 x float>] [[TMP11]], [2 x <4 x float>]* [[TMP10]], align 16
+// CHECK:   [[TMP12:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float32x4x2_t [[TMP12]]
 float32x4x2_t test_vzipq_f32(float32x4_t a, float32x4_t b) {
-  // CHECK-LABEL: test_vzipq_f32
   return vzipq_f32(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK: zip2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
+// CHECK-LABEL: define %struct.poly8x16x2_t @test_vzipq_p8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+// CHECK:   store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+// CHECK:   store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP5:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP7:%.*]] = extractvalue %struct.poly8x16x2_t [[TMP5]], 0
+// CHECK:   store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16
+// CHECK:   [[TMP8:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly8x16x2_t [[TMP8]]
 poly8x16x2_t test_vzipq_p8(poly8x16_t a, poly8x16_t b) {
-  // CHECK-LABEL: test_vzipq_p8
   return vzipq_p8(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-  // CHECK: zip2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
+// CHECK-LABEL: define %struct.poly16x8x2_t @test_vzipq_p16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK:   store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK:   store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.poly16x8x2_t [[TMP9]], 0
+// CHECK:   store [2 x <8 x i16>] [[TMP11]], [2 x <8 x i16>]* [[TMP10]], align 16
+// CHECK:   [[TMP12:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly16x8x2_t [[TMP12]]
 poly16x8x2_t test_vzipq_p16(poly16x8_t a, poly16x8_t b) {
-  // CHECK-LABEL: test_vzipq_p16
   return vzipq_p16(a, b);
-  // CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
-  // CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
 
+// CHECK-LABEL: define %struct.int8x8x2_t @test_vtrn_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK:   store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK:   store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP5:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP7:%.*]] = extractvalue %struct.int8x8x2_t [[TMP5]], 0
+// CHECK:   store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8
+// CHECK:   [[TMP8:%.*]] = load %struct.int8x8x2_t, %struct.int8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int8x8x2_t [[TMP8]]
 int8x8x2_t test_vtrn_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vtrn_s8
   return vtrn_s8(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-  // CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define %struct.int16x4x2_t @test_vtrn_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK:   store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK:   store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.int16x4x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.int16x4x2_t [[TMP9]], 0
+// CHECK:   store [2 x <4 x i16>] [[TMP11]], [2 x <4 x i16>]* [[TMP10]], align 8
+// CHECK:   [[TMP12:%.*]] = load %struct.int16x4x2_t, %struct.int16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int16x4x2_t [[TMP12]]
 int16x4x2_t test_vtrn_s16(int16x4_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vtrn_s16
   return vtrn_s16(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
-  // CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
+// CHECK-LABEL: define %struct.int32x2x2_t @test_vtrn_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 2>
+// CHECK:   store <2 x i32> [[VTRN_I]], <2 x i32>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 1, i32 3>
+// CHECK:   store <2 x i32> [[VTRN1_I]], <2 x i32>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.int32x2x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.int32x2x2_t [[TMP9]], 0
+// CHECK:   store [2 x <2 x i32>] [[TMP11]], [2 x <2 x i32>]* [[TMP10]], align 8
+// CHECK:   [[TMP12:%.*]] = load %struct.int32x2x2_t, %struct.int32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.int32x2x2_t [[TMP12]]
 int32x2x2_t test_vtrn_s32(int32x2_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vtrn_s32
   return vtrn_s32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v2.2s, v0.2s, v1.2s}}
-  // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v1.2s, v0.2s, v1.2s}}
 }
+// CHECK-LABEL: define %struct.uint8x8x2_t @test_vtrn_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK:   store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK:   store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP5:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP7:%.*]] = extractvalue %struct.uint8x8x2_t [[TMP5]], 0
+// CHECK:   store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8
+// CHECK:   [[TMP8:%.*]] = load %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint8x8x2_t [[TMP8]]
 uint8x8x2_t test_vtrn_u8(uint8x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vtrn_u8
   return vtrn_u8(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-  // CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
+// CHECK-LABEL: define %struct.uint16x4x2_t @test_vtrn_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK:   store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK:   store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint16x4x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.uint16x4x2_t [[TMP9]], 0
+// CHECK:   store [2 x <4 x i16>] [[TMP11]], [2 x <4 x i16>]* [[TMP10]], align 8
+// CHECK:   [[TMP12:%.*]] = load %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint16x4x2_t [[TMP12]]
 uint16x4x2_t test_vtrn_u16(uint16x4_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vtrn_u16
   return vtrn_u16(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
-  // CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
+// CHECK-LABEL: define %struct.uint32x2x2_t @test_vtrn_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 2>
+// CHECK:   store <2 x i32> [[VTRN_I]], <2 x i32>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 1, i32 3>
+// CHECK:   store <2 x i32> [[VTRN1_I]], <2 x i32>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint32x2x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.uint32x2x2_t [[TMP9]], 0
+// CHECK:   store [2 x <2 x i32>] [[TMP11]], [2 x <2 x i32>]* [[TMP10]], align 8
+// CHECK:   [[TMP12:%.*]] = load %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.uint32x2x2_t [[TMP12]]
 uint32x2x2_t test_vtrn_u32(uint32x2_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vtrn_u32
   return vtrn_u32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v2.2s, v0.2s, v1.2s}}
-  // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v1.2s, v0.2s, v1.2s}}
 }
+// CHECK-LABEL: define %struct.float32x2x2_t @test_vtrn_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x float>*
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 0, i32 2>
+// CHECK:   store <2 x float> [[VTRN_I]], <2 x float>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 1, i32 3>
+// CHECK:   store <2 x float> [[VTRN1_I]], <2 x float>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.float32x2x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.float32x2x2_t [[TMP9]], 0
+// CHECK:   store [2 x <2 x float>] [[TMP11]], [2 x <2 x float>]* [[TMP10]], align 8
+// CHECK:   [[TMP12:%.*]] = load %struct.float32x2x2_t, %struct.float32x2x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.float32x2x2_t [[TMP12]]
 float32x2x2_t test_vtrn_f32(float32x2_t a, float32x2_t b) {
-  // CHECK-LABEL: test_vtrn_f32
   return vtrn_f32(a, b);
-  // CHECK: {{ins v[0-9]+.s\[1\], v[0-9]+.s\[0\]|zip1 v2.2s, v0.2s, v1.2s}}
-  // CHECK: {{ins v[0-9]+.s\[0\], v[0-9]+.s\[1\]|zip2 v1.2s, v0.2s, v1.2s}}
 }
+// CHECK-LABEL: define %struct.poly8x8x2_t @test_vtrn_p8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK:   store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK:   store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP5:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP7:%.*]] = extractvalue %struct.poly8x8x2_t [[TMP5]], 0
+// CHECK:   store [2 x <8 x i8>] [[TMP7]], [2 x <8 x i8>]* [[TMP6]], align 8
+// CHECK:   [[TMP8:%.*]] = load %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly8x8x2_t [[TMP8]]
 poly8x8x2_t test_vtrn_p8(poly8x8_t a, poly8x8_t b) {
-  // CHECK-LABEL: test_vtrn_p8
   return vtrn_p8(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-  // CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
+// CHECK-LABEL: define %struct.poly16x4x2_t @test_vtrn_p16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK:   store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK:   store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly16x4x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 16, i32 8, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL_I]], align 8
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.poly16x4x2_t [[TMP9]], 0
+// CHECK:   store [2 x <4 x i16>] [[TMP11]], [2 x <4 x i16>]* [[TMP10]], align 8
+// CHECK:   [[TMP12:%.*]] = load %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly16x4x2_t [[TMP12]]
 poly16x4x2_t test_vtrn_p16(poly16x4_t a, poly16x4_t b) {
-  // CHECK-LABEL: test_vtrn_p16
   return vtrn_p16(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
-  // CHECK: trn2 {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
 }
+// CHECK-LABEL: define %struct.int8x16x2_t @test_vtrnq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+// CHECK:   store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+// CHECK:   store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP5:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP7:%.*]] = extractvalue %struct.int8x16x2_t [[TMP5]], 0
+// CHECK:   store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16
+// CHECK:   [[TMP8:%.*]] = load %struct.int8x16x2_t, %struct.int8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int8x16x2_t [[TMP8]]
 int8x16x2_t test_vtrnq_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vtrnq_s8
   return vtrnq_s8(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-  // CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
+// CHECK-LABEL: define %struct.int16x8x2_t @test_vtrnq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK:   store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK:   store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.int16x8x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.int16x8x2_t [[TMP9]], 0
+// CHECK:   store [2 x <8 x i16>] [[TMP11]], [2 x <8 x i16>]* [[TMP10]], align 16
+// CHECK:   [[TMP12:%.*]] = load %struct.int16x8x2_t, %struct.int16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int16x8x2_t [[TMP12]]
 int16x8x2_t test_vtrnq_s16(int16x8_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vtrnq_s16
   return vtrnq_s16(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
-  // CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
+// CHECK-LABEL: define %struct.int32x4x2_t @test_vtrnq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK:   store <4 x i32> [[VTRN_I]], <4 x i32>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK:   store <4 x i32> [[VTRN1_I]], <4 x i32>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.int32x4x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.int32x4x2_t [[TMP9]], 0
+// CHECK:   store [2 x <4 x i32>] [[TMP11]], [2 x <4 x i32>]* [[TMP10]], align 16
+// CHECK:   [[TMP12:%.*]] = load %struct.int32x4x2_t, %struct.int32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.int32x4x2_t [[TMP12]]
 int32x4x2_t test_vtrnq_s32(int32x4_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vtrnq_s32
   return vtrnq_s32(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
+// CHECK-LABEL: define %struct.uint8x16x2_t @test_vtrnq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+// CHECK:   store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+// CHECK:   store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP5:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP7:%.*]] = extractvalue %struct.uint8x16x2_t [[TMP5]], 0
+// CHECK:   store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16
+// CHECK:   [[TMP8:%.*]] = load %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint8x16x2_t [[TMP8]]
 uint8x16x2_t test_vtrnq_u8(uint8x16_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vtrnq_u8
   return vtrnq_u8(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-  // CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
+// CHECK-LABEL: define %struct.uint16x8x2_t @test_vtrnq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK:   store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK:   store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint16x8x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.uint16x8x2_t [[TMP9]], 0
+// CHECK:   store [2 x <8 x i16>] [[TMP11]], [2 x <8 x i16>]* [[TMP10]], align 16
+// CHECK:   [[TMP12:%.*]] = load %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint16x8x2_t [[TMP12]]
 uint16x8x2_t test_vtrnq_u16(uint16x8_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vtrnq_u16
   return vtrnq_u16(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
-  // CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
+// CHECK-LABEL: define %struct.uint32x4x2_t @test_vtrnq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK:   store <4 x i32> [[VTRN_I]], <4 x i32>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK:   store <4 x i32> [[VTRN1_I]], <4 x i32>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint32x4x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.uint32x4x2_t [[TMP9]], 0
+// CHECK:   store [2 x <4 x i32>] [[TMP11]], [2 x <4 x i32>]* [[TMP10]], align 16
+// CHECK:   [[TMP12:%.*]] = load %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.uint32x4x2_t [[TMP12]]
 uint32x4x2_t test_vtrnq_u32(uint32x4_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vtrnq_u32
   return vtrnq_u32(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
+// CHECK-LABEL: define %struct.float32x4x2_t @test_vtrnq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x float>*
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK:   store <4 x float> [[VTRN_I]], <4 x float>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK:   store <4 x float> [[VTRN1_I]], <4 x float>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.float32x4x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.float32x4x2_t [[TMP9]], 0
+// CHECK:   store [2 x <4 x float>] [[TMP11]], [2 x <4 x float>]* [[TMP10]], align 16
+// CHECK:   [[TMP12:%.*]] = load %struct.float32x4x2_t, %struct.float32x4x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.float32x4x2_t [[TMP12]]
 float32x4x2_t test_vtrnq_f32(float32x4_t a, float32x4_t b) {
-  // CHECK-LABEL: test_vtrnq_f32
   return vtrnq_f32(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
-  // CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
 }
+// CHECK-LABEL: define %struct.poly8x16x2_t @test_vtrnq_p8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+// CHECK:   store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+// CHECK:   store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP3]], i8* [[TMP4]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP5:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP7:%.*]] = extractvalue %struct.poly8x16x2_t [[TMP5]], 0
+// CHECK:   store [2 x <16 x i8>] [[TMP7]], [2 x <16 x i8>]* [[TMP6]], align 16
+// CHECK:   [[TMP8:%.*]] = load %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly8x16x2_t [[TMP8]]
 poly8x16x2_t test_vtrnq_p8(poly8x16_t a, poly8x16_t b) {
-  // CHECK-LABEL: test_vtrnq_p8
   return vtrnq_p8(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
-  // CHECK: trn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
+// CHECK-LABEL: define %struct.poly16x8x2_t @test_vtrnq_p16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[RETVAL_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK:   store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK:   store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly16x8x2_t* [[RETVAL_I]] to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP7]], i8* [[TMP8]], i64 32, i32 16, i1 false) #2
+// CHECK:   [[TMP9:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL_I]], align 16
+// CHECK:   [[TMP10:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], i32 0, i32 0
+// CHECK:   [[TMP11:%.*]] = extractvalue %struct.poly16x8x2_t [[TMP9]], 0
+// CHECK:   store [2 x <8 x i16>] [[TMP11]], [2 x <8 x i16>]* [[TMP10]], align 16
+// CHECK:   [[TMP12:%.*]] = load %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly16x8x2_t [[TMP12]]
 poly16x8x2_t test_vtrnq_p16(poly16x8_t a, poly16x8_t b) {
-  // CHECK-LABEL: test_vtrnq_p16
   return vtrnq_p16(a, b);
-  // CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
-  // CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
 }
diff --git a/test/CodeGen/aarch64-neon-scalar-copy.c b/test/CodeGen/aarch64-neon-scalar-copy.c
index a50a0b964441e..90fceb44ed908 100644
--- a/test/CodeGen/aarch64-neon-scalar-copy.c
+++ b/test/CodeGen/aarch64-neon-scalar-copy.c
@@ -1,173 +1,228 @@
-// REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN:   -ffp-contract=fast -S -O3 -o - %s | FileCheck %s
+// RUN:  -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
 
 
 #include <arm_neon.h>
 
-// CHECK-LABEL: test_vdups_lane_f32
+// CHECK-LABEL: define float @test_vdups_lane_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VDUPS_LANE:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
+// CHECK:   ret float [[VDUPS_LANE]]
 float32_t test_vdups_lane_f32(float32x2_t a) {
   return vdups_lane_f32(a, 1);
-// CHECK: ret
-// CHECK-NOT: dup {{s[0-9]+}}, {{v[0-9]+}}.s[1]
 }
 
 
-// CHECK-LABEL: test_vdupd_lane_f64
+// CHECK-LABEL: define double @test_vdupd_lane_f64(<1 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[VDUPD_LANE:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
+// CHECK:   ret double [[VDUPD_LANE]]
 float64_t test_vdupd_lane_f64(float64x1_t a) {
   return vdupd_lane_f64(a, 0);
-// CHECK: ret
-// CHECK-NOT: dup {{d[0-9]+}}, {{v[0-9]+}}.d[0]
 }
 
 
-// CHECK-LABEL: test_vdups_laneq_f32
+// CHECK-LABEL: define float @test_vdups_laneq_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+// CHECK:   ret float [[VGETQ_LANE]]
 float32_t test_vdups_laneq_f32(float32x4_t a) {
   return vdups_laneq_f32(a, 3);
-// CHECK: ret
-// CHECK-NOT: dup {{s[0-9]+}}, {{v[0-9]+}}.s[3]
 }
 
 
-// CHECK-LABEL: test_vdupd_laneq_f64
+// CHECK-LABEL: define double @test_vdupd_laneq_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+// CHECK:   ret double [[VGETQ_LANE]]
 float64_t test_vdupd_laneq_f64(float64x2_t a) {
   return vdupd_laneq_f64(a, 1);
-// CHECK: ret
-// CHECK-NOT: dup {{d[0-9]+}}, {{v[0-9]+}}.d[1]
 }
 
 
-// CHECK-LABEL: test_vdupb_lane_s8
+// CHECK-LABEL: define i8 @test_vdupb_lane_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7
+// CHECK:   ret i8 [[VGET_LANE]]
 int8_t test_vdupb_lane_s8(int8x8_t a) {
   return vdupb_lane_s8(a, 7);
-// CHECK: {{umov|smov}} {{w[0-9]+}}, {{v[0-9]+}}.b[7]
 }
 
 
-// CHECK-LABEL: test_vduph_lane_s16
+// CHECK-LABEL: define i16 @test_vduph_lane_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
+// CHECK:   ret i16 [[VGET_LANE]]
 int16_t test_vduph_lane_s16(int16x4_t a) {
   return vduph_lane_s16(a, 3);
-// CHECK: {{umov|smov}} {{w[0-9]+}}, {{v[0-9]+}}.h[3]
 }
 
 
-// CHECK-LABEL: test_vdups_lane_s32
+// CHECK-LABEL: define i32 @test_vdups_lane_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+// CHECK:   ret i32 [[VGET_LANE]]
 int32_t test_vdups_lane_s32(int32x2_t a) {
   return vdups_lane_s32(a, 1);
-// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.s[1]
 }
 
 
-// CHECK-LABEL: test_vdupd_lane_s64
+// CHECK-LABEL: define i64 @test_vdupd_lane_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
+// CHECK:   ret i64 [[VGET_LANE]]
 int64_t test_vdupd_lane_s64(int64x1_t a) {
   return vdupd_lane_s64(a, 0);
-// CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}}
 }
 
 
-// CHECK-LABEL: test_vdupb_lane_u8
+// CHECK-LABEL: define i8 @test_vdupb_lane_u8(<8 x i8> %a) #0 {
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7
+// CHECK:   ret i8 [[VGET_LANE]]
 uint8_t test_vdupb_lane_u8(uint8x8_t a) {
   return vdupb_lane_u8(a, 7);
-// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.b[7]
 }
 
 
-// CHECK-LABEL: test_vduph_lane_u16
+// CHECK-LABEL: define i16 @test_vduph_lane_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
+// CHECK:   ret i16 [[VGET_LANE]]
 uint16_t test_vduph_lane_u16(uint16x4_t a) {
   return vduph_lane_u16(a, 3);
-// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.h[3]
 }
 
 
-// CHECK-LABEL: test_vdups_lane_u32
+// CHECK-LABEL: define i32 @test_vdups_lane_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+// CHECK:   ret i32 [[VGET_LANE]]
 uint32_t test_vdups_lane_u32(uint32x2_t a) {
   return vdups_lane_u32(a, 1);
-// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.s[1]
 }
 
 
-// CHECK-LABEL: test_vdupd_lane_u64
+// CHECK-LABEL: define i64 @test_vdupd_lane_u64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
+// CHECK:   ret i64 [[VGET_LANE]]
 uint64_t test_vdupd_lane_u64(uint64x1_t a) {
   return vdupd_lane_u64(a, 0);
-// CHECK: fmov {{x[0-9]+}}, {{d[0-9]+}}
 }
 
-// CHECK-LABEL: test_vdupb_laneq_s8
+// CHECK-LABEL: define i8 @test_vdupb_laneq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <16 x i8> %a, i32 15
+// CHECK:   ret i8 [[VGETQ_LANE]]
 int8_t test_vdupb_laneq_s8(int8x16_t a) {
   return vdupb_laneq_s8(a, 15);
-// CHECK: {{umov|smov}} {{w[0-9]+}}, {{v[0-9]+}}.b[15]
 }
 
 
-// CHECK-LABEL: test_vduph_laneq_s16
+// CHECK-LABEL: define i16 @test_vduph_laneq_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
+// CHECK:   ret i16 [[VGETQ_LANE]]
 int16_t test_vduph_laneq_s16(int16x8_t a) {
   return vduph_laneq_s16(a, 7);
-// CHECK: {{umov|smov}} {{w[0-9]+}}, {{v[0-9]+}}.h[7]
 }
 
 
-// CHECK-LABEL: test_vdups_laneq_s32
+// CHECK-LABEL: define i32 @test_vdups_laneq_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+// CHECK:   ret i32 [[VGETQ_LANE]]
 int32_t test_vdups_laneq_s32(int32x4_t a) {
   return vdups_laneq_s32(a, 3);
-// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.s[3]
 }
 
 
-// CHECK-LABEL: test_vdupd_laneq_s64
+// CHECK-LABEL: define i64 @test_vdupd_laneq_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
+// CHECK:   ret i64 [[VGETQ_LANE]]
 int64_t test_vdupd_laneq_s64(int64x2_t a) {
   return vdupd_laneq_s64(a, 1);
-// CHECK: {{mov|umov}} {{x[0-9]+}}, {{v[0-9]+}}.d[1]
 }
 
 
-// CHECK-LABEL: test_vdupb_laneq_u8
+// CHECK-LABEL: define i8 @test_vdupb_laneq_u8(<16 x i8> %a) #0 {
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <16 x i8> %a, i32 15
+// CHECK:   ret i8 [[VGETQ_LANE]]
 uint8_t test_vdupb_laneq_u8(uint8x16_t a) {
   return vdupb_laneq_u8(a, 15);
-// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.b[15]
 }
 
 
-// CHECK-LABEL: test_vduph_laneq_u16
+// CHECK-LABEL: define i16 @test_vduph_laneq_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
+// CHECK:   ret i16 [[VGETQ_LANE]]
 uint16_t test_vduph_laneq_u16(uint16x8_t a) {
   return vduph_laneq_u16(a, 7);
-// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.h[7]
 }
 
 
-// CHECK-LABEL: test_vdups_laneq_u32
+// CHECK-LABEL: define i32 @test_vdups_laneq_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+// CHECK:   ret i32 [[VGETQ_LANE]]
 uint32_t test_vdups_laneq_u32(uint32x4_t a) {
   return vdups_laneq_u32(a, 3);
-// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.s[3]
 }
 
 
-// CHECK-LABEL: test_vdupd_laneq_u64
+// CHECK-LABEL: define i64 @test_vdupd_laneq_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
+// CHECK:   ret i64 [[VGETQ_LANE]]
 uint64_t test_vdupd_laneq_u64(uint64x2_t a) {
   return vdupd_laneq_u64(a, 1);
-// CHECK: {{mov|umov}} {{x[0-9]+}}, {{v[0-9]+}}.d[1]
 }
 
-// CHECK-LABEL: test_vdupb_lane_p8
+// CHECK-LABEL: define i8 @test_vdupb_lane_p8(<8 x i8> %a) #0 {
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7
+// CHECK:   ret i8 [[VGET_LANE]]
 poly8_t test_vdupb_lane_p8(poly8x8_t a) {
   return vdupb_lane_p8(a, 7);
-// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.b[7]
 }
 
-// CHECK-LABEL: test_vduph_lane_p16
+// CHECK-LABEL: define i16 @test_vduph_lane_p16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
+// CHECK:   ret i16 [[VGET_LANE]]
 poly16_t test_vduph_lane_p16(poly16x4_t a) {
   return vduph_lane_p16(a, 3);
-// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.h[3]
 }
 
-// CHECK-LABEL: test_vdupb_laneq_p8
+// CHECK-LABEL: define i8 @test_vdupb_laneq_p8(<16 x i8> %a) #0 {
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <16 x i8> %a, i32 15
+// CHECK:   ret i8 [[VGETQ_LANE]]
 poly8_t test_vdupb_laneq_p8(poly8x16_t a) {
   return vdupb_laneq_p8(a, 15);
-// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.b[15]
 }
 
-// CHECK-LABEL: test_vduph_laneq_p16
+// CHECK-LABEL: define i16 @test_vduph_laneq_p16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
+// CHECK:   ret i16 [[VGETQ_LANE]]
 poly16_t test_vduph_laneq_p16(poly16x8_t a) {
   return vduph_laneq_p16(a, 7);
-// CHECK: {{mov|umov}} {{w[0-9]+}}, {{v[0-9]+}}.h[7]
 }
 
diff --git a/test/CodeGen/aarch64-neon-scalar-x-indexed-elem.c b/test/CodeGen/aarch64-neon-scalar-x-indexed-elem.c
index 4c2f4d72d8910..39aab2540e8ac 100644
--- a/test/CodeGen/aarch64-neon-scalar-x-indexed-elem.c
+++ b/test/CodeGen/aarch64-neon-scalar-x-indexed-elem.c
@@ -1,259 +1,509 @@
-// REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-cpu cyclone \
-// RUN:   -ffp-contract=fast -S -O3 -o - %s | FileCheck %s
+// RUN:  -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
 
 // Test new aarch64 intrinsics and types
 
 #include <arm_neon.h>
 
 
+// CHECK-LABEL: define float @test_vmuls_lane_f32(float %a, <2 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
+// CHECK:   [[MUL:%.*]] = fmul float %a, [[VGET_LANE]]
+// CHECK:   ret float [[MUL]]
 float32_t test_vmuls_lane_f32(float32_t a, float32x2_t b) {
-  // CHECK-LABEL: test_vmuls_lane_f32
   return vmuls_lane_f32(a, b, 1);
-  // CHECK: fmul {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define double @test_vmuld_lane_f64(double %a, <1 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
+// CHECK:   [[MUL:%.*]] = fmul double %a, [[VGET_LANE]]
+// CHECK:   ret double [[MUL]]
 float64_t test_vmuld_lane_f64(float64_t a, float64x1_t b) {
-  // CHECK-LABEL: test_vmuld_lane_f64
   return vmuld_lane_f64(a, b, 0);
-  // CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+.d\[0\]|d[0-9]+}}
 }
 
+// CHECK-LABEL: define float @test_vmuls_laneq_f32(float %a, <4 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+// CHECK:   [[MUL:%.*]] = fmul float %a, [[VGETQ_LANE]]
+// CHECK:   ret float [[MUL]]
 float32_t test_vmuls_laneq_f32(float32_t a, float32x4_t b) {
-  // CHECK-LABEL: test_vmuls_laneq_f32
   return vmuls_laneq_f32(a, b, 3);
-  // CHECK: fmul {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define double @test_vmuld_laneq_f64(double %a, <2 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+// CHECK:   [[MUL:%.*]] = fmul double %a, [[VGETQ_LANE]]
+// CHECK:   ret double [[MUL]]
 float64_t test_vmuld_laneq_f64(float64_t a, float64x2_t b) {
-  // CHECK-LABEL: test_vmuld_laneq_f64
   return vmuld_laneq_f64(a, b, 1);
-  // CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
 }
 
+// CHECK-LABEL: define <1 x double> @test_vmul_n_f64(<1 x double> %a, double %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[TMP2:%.*]] = bitcast <1 x double> [[TMP1]] to double
+// CHECK:   [[TMP3:%.*]] = fmul double [[TMP2]], %b
+// CHECK:   [[TMP4:%.*]] = bitcast double [[TMP3]] to <1 x double>
+// CHECK:   ret <1 x double> [[TMP4]]
 float64x1_t test_vmul_n_f64(float64x1_t a, float64_t b) {
-  // CHECK-LABEL: test_vmul_n_f64
   return vmul_n_f64(a, b);
-  // CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+.d\[0\]|d[0-9]+}}
 }
 
+// CHECK-LABEL: define float @test_vmulxs_lane_f32(float %a, <2 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
+// CHECK:   [[VMULXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmulx.f32(float %a, float [[VGET_LANE]]) #2
+// CHECK:   ret float [[VMULXS_F32_I]]
 float32_t test_vmulxs_lane_f32(float32_t a, float32x2_t b) {
-// CHECK-LABEL: test_vmulxs_lane_f32
   return vmulxs_lane_f32(a, b, 1);
-// CHECK: fmulx {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
 }
 
+// CHECK-LABEL: define float @test_vmulxs_laneq_f32(float %a, <4 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+// CHECK:   [[VMULXS_F32_I:%.*]] = call float @llvm.aarch64.neon.fmulx.f32(float %a, float [[VGETQ_LANE]]) #2
+// CHECK:   ret float [[VMULXS_F32_I]]
 float32_t test_vmulxs_laneq_f32(float32_t a, float32x4_t b) {
-// CHECK-LABEL: test_vmulxs_laneq_f32
   return vmulxs_laneq_f32(a, b, 3);
-// CHECK: fmulx {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
 }
 
+// CHECK-LABEL: define double @test_vmulxd_lane_f64(double %a, <1 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
+// CHECK:   [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double %a, double [[VGET_LANE]]) #2
+// CHECK:   ret double [[VMULXD_F64_I]]
 float64_t test_vmulxd_lane_f64(float64_t a, float64x1_t b) {
-// CHECK-LABEL: test_vmulxd_lane_f64
   return vmulxd_lane_f64(a, b, 0);
-// CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+.d\[0\]|d[0-9]+}}
 }
 
+// CHECK-LABEL: define double @test_vmulxd_laneq_f64(double %a, <2 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+// CHECK:   [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double %a, double [[VGETQ_LANE]]) #2
+// CHECK:   ret double [[VMULXD_F64_I]]
 float64_t test_vmulxd_laneq_f64(float64_t a, float64x2_t b) {
-// CHECK-LABEL: test_vmulxd_laneq_f64
   return vmulxd_laneq_f64(a, b, 1);
-// CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
 }
 
-// CHECK-LABEL: test_vmulx_lane_f64
+// CHECK-LABEL: define <1 x double> @test_vmulx_lane_f64(<1 x double> %a, <1 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
+// CHECK:   [[TMP2:%.*]] = bitcast <1 x double> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
+// CHECK:   [[VGET_LANE6:%.*]] = extractelement <1 x double> [[TMP3]], i32 0
+// CHECK:   [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE6]]) #2
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP5]], double [[VMULXD_F64_I]], i32 0
+// CHECK:   ret <1 x double> [[VSET_LANE]]
 float64x1_t test_vmulx_lane_f64(float64x1_t a, float64x1_t b) {
   return vmulx_lane_f64(a, b, 0);
-  // CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+.d\[0\]|d[0-9]+}}
 }
 
 
-// CHECK-LABEL: test_vmulx_laneq_f64_0
+// CHECK-LABEL: define <1 x double> @test_vmulx_laneq_f64_0(<1 x double> %a, <2 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
+// CHECK:   [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]]) #2
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP5]], double [[VMULXD_F64_I]], i32 0
+// CHECK:   ret <1 x double> [[VSET_LANE]]
 float64x1_t test_vmulx_laneq_f64_0(float64x1_t a, float64x2_t b) {
   return vmulx_laneq_f64(a, b, 0);
-  // CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
 }
 
-// CHECK-LABEL: test_vmulx_laneq_f64_1
+// CHECK-LABEL: define <1 x double> @test_vmulx_laneq_f64_1(<1 x double> %a, <2 x double> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
+// CHECK:   [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]]) #2
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP5]], double [[VMULXD_F64_I]], i32 0
+// CHECK:   ret <1 x double> [[VSET_LANE]]
 float64x1_t test_vmulx_laneq_f64_1(float64x1_t a, float64x2_t b) {
   return vmulx_laneq_f64(a, b, 1);
-  // CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
 }
 
 
-// CHECK-LABEL: test_vfmas_lane_f32
+// CHECK-LABEL: define float @test_vfmas_lane_f32(float %a, float %b, <2 x float> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %c to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[EXTRACT:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
+// CHECK:   [[TMP2:%.*]] = call float @llvm.fma.f32(float %b, float [[EXTRACT]], float %a)
+// CHECK:   ret float [[TMP2]]
 float32_t test_vfmas_lane_f32(float32_t a, float32_t b, float32x2_t c) {
   return vfmas_lane_f32(a, b, c, 1);
-  // CHECK: fmla {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
 }
 
-// CHECK-LABEL: test_vfmad_lane_f64
+// CHECK-LABEL: define double @test_vfmad_lane_f64(double %a, double %b, <1 x double> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %c to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[EXTRACT:%.*]] = extractelement <1 x double> [[TMP1]], i32 0
+// CHECK:   [[TMP2:%.*]] = call double @llvm.fma.f64(double %b, double [[EXTRACT]], double %a)
+// CHECK:   ret double [[TMP2]]
 float64_t test_vfmad_lane_f64(float64_t a, float64_t b, float64x1_t c) {
   return vfmad_lane_f64(a, b, c, 0);
-  // CHECK: {{fmla|fmadd}} {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+.d\[0\]|d[0-9]+}}
 }
 
-// CHECK-LABEL: test_vfmad_laneq_f64
+// CHECK-LABEL: define double @test_vfmad_laneq_f64(double %a, double %b, <2 x double> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %c to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+// CHECK:   [[TMP2:%.*]] = call double @llvm.fma.f64(double %b, double [[EXTRACT]], double %a)
+// CHECK:   ret double [[TMP2]]
 float64_t test_vfmad_laneq_f64(float64_t a, float64_t b, float64x2_t c) {
   return vfmad_laneq_f64(a, b, c, 1);
-  // CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[1]
 }
 
-// CHECK-LABEL: test_vfmss_lane_f32
+// CHECK-LABEL: define float @test_vfmss_lane_f32(float %a, float %b, <2 x float> %c) #0 {
+// CHECK:   [[SUB:%.*]] = fsub float -0.000000e+00, %b
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %c to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[EXTRACT:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
+// CHECK:   [[TMP2:%.*]] = call float @llvm.fma.f32(float [[SUB]], float [[EXTRACT]], float %a)
+// CHECK:   ret float [[TMP2]]
 float32_t test_vfmss_lane_f32(float32_t a, float32_t b, float32x2_t c) {
   return vfmss_lane_f32(a, b, c, 1);
-  // CHECK: fmls {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
 }
 
-// CHECK-LABEL: test_vfma_lane_f64
+// CHECK-LABEL: define <1 x double> @test_vfma_lane_f64(<1 x double> %a, <1 x double> %b, <1 x double> %v) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <1 x double> %v to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
+// CHECK:   [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK:   [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK:   [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]])
+// CHECK:   ret <1 x double> [[FMLA2]]
 float64x1_t test_vfma_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
   return vfma_lane_f64(a, b, v, 0);
-  // CHECK: {{fmla|fmadd}} {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+.d\[0\]|d[0-9]+}}
 }
 
-// CHECK-LABEL: test_vfms_lane_f64
+// CHECK-LABEL: define <1 x double> @test_vfms_lane_f64(<1 x double> %a, <1 x double> %b, <1 x double> %v) #0 {
+// CHECK:   [[SUB:%.*]] = fsub <1 x double> <double -0.000000e+00>, %b
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> [[SUB]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <1 x double> %v to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
+// CHECK:   [[LANE:%.*]] = shufflevector <1 x double> [[TMP3]], <1 x double> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK:   [[FMLA:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x double>
+// CHECK:   [[FMLA1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double>
+// CHECK:   [[FMLA2:%.*]] = call <1 x double> @llvm.fma.v1f64(<1 x double> [[FMLA]], <1 x double> [[LANE]], <1 x double> [[FMLA1]])
+// CHECK:   ret <1 x double> [[FMLA2]]
 float64x1_t test_vfms_lane_f64(float64x1_t a, float64x1_t b, float64x1_t v) {
   return vfms_lane_f64(a, b, v, 0);
-  // CHECK: {{fmls|fmsub}} {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+.d\[0\]|d[0-9]+}}
 }
 
-// CHECK-LABEL: test_vfma_laneq_f64
+// CHECK-LABEL: define <1 x double> @test_vfma_laneq_f64(<1 x double> %a, <1 x double> %b, <2 x double> %v) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to double
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to double
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
+// CHECK:   [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
+// CHECK:   [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]])
+// CHECK:   [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double>
+// CHECK:   ret <1 x double> [[TMP7]]
 float64x1_t test_vfma_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
   return vfma_laneq_f64(a, b, v, 0);
-  // CHECK: fmla {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
 }
 
-// CHECK-LABEL: test_vfms_laneq_f64
+// CHECK-LABEL: define <1 x double> @test_vfms_laneq_f64(<1 x double> %a, <1 x double> %b, <2 x double> %v) #0 {
+// CHECK:   [[SUB:%.*]] = fsub <1 x double> <double -0.000000e+00>, %b
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x double> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x double> [[SUB]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %v to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to double
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to double
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
+// CHECK:   [[EXTRACT:%.*]] = extractelement <2 x double> [[TMP5]], i32 0
+// CHECK:   [[TMP6:%.*]] = call double @llvm.fma.f64(double [[TMP4]], double [[EXTRACT]], double [[TMP3]])
+// CHECK:   [[TMP7:%.*]] = bitcast double [[TMP6]] to <1 x double>
+// CHECK:   ret <1 x double> [[TMP7]]
 float64x1_t test_vfms_laneq_f64(float64x1_t a, float64x1_t b, float64x2_t v) {
   return vfms_laneq_f64(a, b, v, 0);
-  // CHECK: fmls {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}.d[0]
 }
 
-// CHECK-LABEL: test_vqdmullh_lane_s16
+// CHECK-LABEL: define i32 @test_vqdmullh_lane_s16(i16 %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
+// CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGET_LANE]], i64 0
+// CHECK:   [[VQDMULLH_S16_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]]) #2
+// CHECK:   [[TMP4:%.*]] = extractelement <4 x i32> [[VQDMULLH_S16_I]], i64 0
+// CHECK:   ret i32 [[TMP4]]
 int32_t test_vqdmullh_lane_s16(int16_t a, int16x4_t b) {
   return vqdmullh_lane_s16(a, b, 3);
-  // CHECK: sqdmull {{s[0-9]+|v[0-9]+.4s}}, {{h[0-9]+|v[0-9].4h}}, {{v[0-9]+}}.h[3]
 }
 
-// CHECK-LABEL: test_vqdmulls_lane_s32
+// CHECK-LABEL: define i64 @test_vqdmulls_lane_s32(i32 %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+// CHECK:   [[VQDMULLS_S32_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %a, i32 [[VGET_LANE]]) #2
+// CHECK:   ret i64 [[VQDMULLS_S32_I]]
 int64_t test_vqdmulls_lane_s32(int32_t a, int32x2_t b) {
   return vqdmulls_lane_s32(a, b, 1);
-  // CHECK: sqdmull {{d[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
 }
 
-// CHECK-LABEL: test_vqdmullh_laneq_s16
+// CHECK-LABEL: define i32 @test_vqdmullh_laneq_s16(i16 %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
+// CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGETQ_LANE]], i64 0
+// CHECK:   [[VQDMULLH_S16_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]]) #2
+// CHECK:   [[TMP4:%.*]] = extractelement <4 x i32> [[VQDMULLH_S16_I]], i64 0
+// CHECK:   ret i32 [[TMP4]]
 int32_t test_vqdmullh_laneq_s16(int16_t a, int16x8_t b) {
   return vqdmullh_laneq_s16(a, b, 7);
-  // CHECK: sqdmull {{s[0-9]+|v[0-9]+.4s}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[7]
 }
 
-// CHECK-LABEL: test_vqdmulls_laneq_s32
+// CHECK-LABEL: define i64 @test_vqdmulls_laneq_s32(i32 %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+// CHECK:   [[VQDMULLS_S32_I:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %a, i32 [[VGETQ_LANE]]) #2
+// CHECK:   ret i64 [[VQDMULLS_S32_I]]
 int64_t test_vqdmulls_laneq_s32(int32_t a, int32x4_t b) {
   return vqdmulls_laneq_s32(a, b, 3);
-  // CHECK: sqdmull {{d[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
 }
 
-// CHECK-LABEL: test_vqdmulhh_lane_s16
+// CHECK-LABEL: define i16 @test_vqdmulhh_lane_s16(i16 %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
+// CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGET_LANE]], i64 0
+// CHECK:   [[VQDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]]) #2
+// CHECK:   [[TMP4:%.*]] = extractelement <4 x i16> [[VQDMULHH_S16_I]], i64 0
+// CHECK:   ret i16 [[TMP4]]
 int16_t test_vqdmulhh_lane_s16(int16_t a, int16x4_t b) {
   return vqdmulhh_lane_s16(a, b, 3);
-// CHECK: sqdmulh {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[3]
 }
 
-// CHECK-LABEL: test_vqdmulhs_lane_s32
+// CHECK-LABEL: define i32 @test_vqdmulhs_lane_s32(i32 %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+// CHECK:   [[VQDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %a, i32 [[VGET_LANE]]) #2
+// CHECK:   ret i32 [[VQDMULHS_S32_I]]
 int32_t test_vqdmulhs_lane_s32(int32_t a, int32x2_t b) {
   return vqdmulhs_lane_s32(a, b, 1);
-// CHECK: sqdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
 }
 
 
-// CHECK-LABEL: test_vqdmulhh_laneq_s16
+// CHECK-LABEL: define i16 @test_vqdmulhh_laneq_s16(i16 %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
+// CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGETQ_LANE]], i64 0
+// CHECK:   [[VQDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]]) #2
+// CHECK:   [[TMP4:%.*]] = extractelement <4 x i16> [[VQDMULHH_S16_I]], i64 0
+// CHECK:   ret i16 [[TMP4]]
 int16_t test_vqdmulhh_laneq_s16(int16_t a, int16x8_t b) {
   return vqdmulhh_laneq_s16(a, b, 7);
-// CHECK: sqdmulh {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[7]
 }
 
 
-// CHECK-LABEL: test_vqdmulhs_laneq_s32
+// CHECK-LABEL: define i32 @test_vqdmulhs_laneq_s32(i32 %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+// CHECK:   [[VQDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqdmulh.i32(i32 %a, i32 [[VGETQ_LANE]]) #2
+// CHECK:   ret i32 [[VQDMULHS_S32_I]]
 int32_t test_vqdmulhs_laneq_s32(int32_t a, int32x4_t b) {
   return vqdmulhs_laneq_s32(a, b, 3);
-// CHECK: sqdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
 }
 
-// CHECK-LABEL: test_vqrdmulhh_lane_s16
+// CHECK-LABEL: define i16 @test_vqrdmulhh_lane_s16(i16 %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
+// CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGET_LANE]], i64 0
+// CHECK:   [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]]) #2
+// CHECK:   [[TMP4:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0
+// CHECK:   ret i16 [[TMP4]]
 int16_t test_vqrdmulhh_lane_s16(int16_t a, int16x4_t b) {
   return vqrdmulhh_lane_s16(a, b, 3);
-// CHECK: sqrdmulh {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[3]
 }
 
-// CHECK-LABEL: test_vqrdmulhs_lane_s32
+// CHECK-LABEL: define i32 @test_vqrdmulhs_lane_s32(i32 %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+// CHECK:   [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %a, i32 [[VGET_LANE]]) #2
+// CHECK:   ret i32 [[VQRDMULHS_S32_I]]
 int32_t test_vqrdmulhs_lane_s32(int32_t a, int32x2_t b) {
   return vqrdmulhs_lane_s32(a, b, 1);
-// CHECK: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
 }
 
 
-// CHECK-LABEL: test_vqrdmulhh_laneq_s16
+// CHECK-LABEL: define i16 @test_vqrdmulhh_laneq_s16(i16 %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
+// CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %a, i64 0
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[VGETQ_LANE]], i64 0
+// CHECK:   [[VQRDMULHH_S16_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]]) #2
+// CHECK:   [[TMP4:%.*]] = extractelement <4 x i16> [[VQRDMULHH_S16_I]], i64 0
+// CHECK:   ret i16 [[TMP4]]
 int16_t test_vqrdmulhh_laneq_s16(int16_t a, int16x8_t b) {
   return vqrdmulhh_laneq_s16(a, b, 7);
-// CHECK: sqrdmulh {{h[0-9]+|v[0-9]+.4h}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[7]
 }
 
 
-// CHECK-LABEL: test_vqrdmulhs_laneq_s32
+// CHECK-LABEL: define i32 @test_vqrdmulhs_laneq_s32(i32 %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+// CHECK:   [[VQRDMULHS_S32_I:%.*]] = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %a, i32 [[VGETQ_LANE]]) #2
+// CHECK:   ret i32 [[VQRDMULHS_S32_I]]
 int32_t test_vqrdmulhs_laneq_s32(int32_t a, int32x4_t b) {
   return vqrdmulhs_laneq_s32(a, b, 3);
-// CHECK: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
 }
 
-// CHECK-LABEL: test_vqdmlalh_lane_s16
+// CHECK-LABEL: define i32 @test_vqdmlalh_lane_s16(i32 %a, i16 %b, <4 x i16> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
+// CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[LANE]], i64 0
+// CHECK:   [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
+// CHECK:   [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
+// CHECK:   [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %a, i32 [[LANE0]])
+// CHECK:   ret i32 [[VQDMLXL1]]
 int32_t test_vqdmlalh_lane_s16(int32_t a, int16_t b, int16x4_t c) {
   return vqdmlalh_lane_s16(a, b, c, 3);
-// CHECK: sqdmlal {{s[0-9]+|v[0-9]+.4s}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[3]
 }
 
-// CHECK-LABEL: test_vqdmlals_lane_s32
+// CHECK-LABEL: define i64 @test_vqdmlals_lane_s32(i64 %a, i32 %b, <2 x i32> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+// CHECK:   [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]])
+// CHECK:   [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %a, i64 [[VQDMLXL]])
+// CHECK:   ret i64 [[VQDMLXL1]]
 int64_t test_vqdmlals_lane_s32(int64_t a, int32_t b, int32x2_t c) {
   return vqdmlals_lane_s32(a, b, c, 1);
-// CHECK: sqdmlal {{d[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
 }
 
-// CHECK-LABEL: test_vqdmlalh_laneq_s16
+// CHECK-LABEL: define i32 @test_vqdmlalh_laneq_s16(i32 %a, i16 %b, <8 x i16> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %c to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
+// CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[LANE]], i64 0
+// CHECK:   [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
+// CHECK:   [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
+// CHECK:   [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %a, i32 [[LANE0]])
+// CHECK:   ret i32 [[VQDMLXL1]]
 int32_t test_vqdmlalh_laneq_s16(int32_t a, int16_t b, int16x8_t c) {
   return vqdmlalh_laneq_s16(a, b, c, 7);
-// CHECK: sqdmlal {{s[0-9]+|v[0-9]+.4s}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[7]
 }
 
-// CHECK-LABEL: test_vqdmlals_laneq_s32
+// CHECK-LABEL: define i64 @test_vqdmlals_laneq_s32(i64 %a, i32 %b, <4 x i32> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %c to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+// CHECK:   [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]])
+// CHECK:   [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqadd.i64(i64 %a, i64 [[VQDMLXL]])
+// CHECK:   ret i64 [[VQDMLXL1]]
 int64_t test_vqdmlals_laneq_s32(int64_t a, int32_t b, int32x4_t c) {
   return vqdmlals_laneq_s32(a, b, c, 3);
-// CHECK: sqdmlal {{d[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
 }
 
-// CHECK-LABEL: test_vqdmlslh_lane_s16
+// CHECK-LABEL: define i32 @test_vqdmlslh_lane_s16(i32 %a, i16 %b, <4 x i16> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
+// CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[LANE]], i64 0
+// CHECK:   [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
+// CHECK:   [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
+// CHECK:   [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %a, i32 [[LANE0]])
+// CHECK:   ret i32 [[VQDMLXL1]]
 int32_t test_vqdmlslh_lane_s16(int32_t a, int16_t b, int16x4_t c) {
   return vqdmlslh_lane_s16(a, b, c, 3);
-// CHECK: sqdmlsl {{s[0-9]+|v[0-9]+.4s}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[3]
 }
 
-// CHECK-LABEL: test_vqdmlsls_lane_s32
+// CHECK-LABEL: define i64 @test_vqdmlsls_lane_s32(i64 %a, i32 %b, <2 x i32> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+// CHECK:   [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]])
+// CHECK:   [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %a, i64 [[VQDMLXL]])
+// CHECK:   ret i64 [[VQDMLXL1]]
 int64_t test_vqdmlsls_lane_s32(int64_t a, int32_t b, int32x2_t c) {
   return vqdmlsls_lane_s32(a, b, c, 1);
-// CHECK: sqdmlsl {{d[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[1]
 }
 
-// CHECK-LABEL: test_vqdmlslh_laneq_s16
+// CHECK-LABEL: define i32 @test_vqdmlslh_laneq_s16(i32 %a, i16 %b, <8 x i16> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %c to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
+// CHECK:   [[TMP2:%.*]] = insertelement <4 x i16> undef, i16 %b, i64 0
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[LANE]], i64 0
+// CHECK:   [[VQDMLXL:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[TMP2]], <4 x i16> [[TMP3]])
+// CHECK:   [[LANE0:%.*]] = extractelement <4 x i32> [[VQDMLXL]], i64 0
+// CHECK:   [[VQDMLXL1:%.*]] = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %a, i32 [[LANE0]])
+// CHECK:   ret i32 [[VQDMLXL1]]
 int32_t test_vqdmlslh_laneq_s16(int32_t a, int16_t b, int16x8_t c) {
   return vqdmlslh_laneq_s16(a, b, c, 7);
-// CHECK: sqdmlsl {{s[0-9]+|v[0-9]+.4s}}, {{h[0-9]+|v[0-9]+.4h}}, {{v[0-9]+}}.h[7]
 }
 
-// CHECK-LABEL: test_vqdmlsls_laneq_s32
+// CHECK-LABEL: define i64 @test_vqdmlsls_laneq_s32(i64 %a, i32 %b, <4 x i32> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %c to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+// CHECK:   [[VQDMLXL:%.*]] = call i64 @llvm.aarch64.neon.sqdmulls.scalar(i32 %b, i32 [[LANE]])
+// CHECK:   [[VQDMLXL1:%.*]] = call i64 @llvm.aarch64.neon.sqsub.i64(i64 %a, i64 [[VQDMLXL]])
+// CHECK:   ret i64 [[VQDMLXL1]]
 int64_t test_vqdmlsls_laneq_s32(int64_t a, int32_t b, int32x4_t c) {
   return vqdmlsls_laneq_s32(a, b, c, 3);
-// CHECK: sqdmlsl {{d[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}.s[3]
 }
 
-// CHECK-LABEL: test_vmulx_lane_f64_0:
+// CHECK-LABEL: define <1 x double> @test_vmulx_lane_f64_0() #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64 4599917171378402754 to <1 x double>
+// CHECK:   [[TMP1:%.*]] = bitcast i64 4606655882138939123 to <1 x double>
+// CHECK:   [[TMP2:%.*]] = bitcast <1 x double> [[TMP0]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP3]], i32 0
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x double> [[TMP1]] to <8 x i8>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x double>
+// CHECK:   [[VGET_LANE7:%.*]] = extractelement <1 x double> [[TMP5]], i32 0
+// CHECK:   [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGET_LANE7]]) #2
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x double> [[TMP0]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP7]], double [[VMULXD_F64_I]], i32 0
+// CHECK:   ret <1 x double> [[VSET_LANE]]
 float64x1_t test_vmulx_lane_f64_0() {
       float64x1_t arg1;
       float64x1_t arg2;
@@ -262,15 +512,24 @@ float64x1_t test_vmulx_lane_f64_0() {
       arg1 = vcreate_f64(UINT64_C(0x3fd6304bc43ab5c2));
       arg2 = vcreate_f64(UINT64_C(0x3fee211e215aeef3));
       result = vmulx_lane_f64(arg1, arg2, 0);
-// CHECK: adrp x[[ADDRLO:[0-9]+]]
-// CHECK: ldr d0, [x[[ADDRLO]],
-// CHECK: adrp x[[ADDRLO:[0-9]+]]
-// CHECK: ldr d1, [x[[ADDRLO]],
-// CHECK: fmulx d0, d1, d0
       return result;
 }
 
-// CHECK-LABEL: test_vmulx_laneq_f64_2:
+// CHECK-LABEL: define <1 x double> @test_vmulx_laneq_f64_2() #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64 4599917171378402754 to <1 x double>
+// CHECK:   [[TMP1:%.*]] = bitcast i64 4606655882138939123 to <1 x double>
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <1 x double> [[TMP0]], <1 x double> [[TMP1]], <2 x i32> <i32 0, i32 1>
+// CHECK:   [[TMP2:%.*]] = bitcast <1 x double> [[TMP0]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x double>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x double> [[TMP3]], i32 0
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x double> [[SHUFFLE_I]] to <16 x i8>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x double>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP5]], i32 1
+// CHECK:   [[VMULXD_F64_I:%.*]] = call double @llvm.aarch64.neon.fmulx.f64(double [[VGET_LANE]], double [[VGETQ_LANE]]) #2
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x double> [[TMP0]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x double>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <1 x double> [[TMP7]], double [[VMULXD_F64_I]], i32 0
+// CHECK:   ret <1 x double> [[VSET_LANE]]
 float64x1_t test_vmulx_laneq_f64_2() {
       float64x1_t arg1;
       float64x1_t arg2;
@@ -281,10 +540,5 @@ float64x1_t test_vmulx_laneq_f64_2() {
       arg2 = vcreate_f64(UINT64_C(0x3fee211e215aeef3));
       arg3 = vcombine_f64(arg1, arg2);
       result = vmulx_laneq_f64(arg1, arg3, 1);
-// CHECK: adrp x[[ADDRLO:[0-9]+]]
-// CHECK: ldr d0, [x[[ADDRLO]],
-// CHECK: adrp x[[ADDRLO:[0-9]+]]
-// CHECK: ldr d1, [x[[ADDRLO]],
-// CHECK: fmulx d0, d1, d0
       return result;
 }
diff --git a/test/CodeGen/aarch64-neon-shifts.c b/test/CodeGen/aarch64-neon-shifts.c
index 02d8ca155649a..66449f7fef4ca 100644
--- a/test/CodeGen/aarch64-neon-shifts.c
+++ b/test/CodeGen/aarch64-neon-shifts.c
@@ -1,6 +1,5 @@
-// REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN:   -ffp-contract=fast -S -emit-llvm -O1 -o - %s | FileCheck %s
+// RUN:   -ffp-contract=fast -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
 
 #include <arm_neon.h>
 
@@ -25,19 +24,20 @@ uint8x8_t test_shift_vshr_umax(uint8x8_t a) {
 uint8x8_t test_shift_vsra(uint8x8_t a, uint8x8_t b) {
   // CHECK-LABEL: test_shift_vsra
   // CHECK: %[[SHR:.*]] = lshr <8 x i8> %b, <i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5>
-  // CHECK: %{{.*}} = add <8 x i8> %[[SHR]], %a
+  // CHECK: %{{.*}} = add <8 x i8> %a, %[[SHR]]
   return vsra_n_u8(a, b, 5);
 }
 
 int8x8_t test_shift_vsra_smax(int8x8_t a, int8x8_t b) {
   // CHECK-LABEL: test_shift_vsra_smax
   // CHECK: %[[SHR:.*]] = ashr <8 x i8> %b, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-  // CHECK: %{{.*}} = add <8 x i8> %[[SHR]], %a
+  // CHECK: %{{.*}} = add <8 x i8> %a, %[[SHR]]
   return vsra_n_s8(a, b, 8);
 }
 
 uint8x8_t test_shift_vsra_umax(uint8x8_t a, uint8x8_t b) {
   // CHECK-LABEL: test_shift_vsra_umax
-  // CHECK: ret <8 x i8> %a
+  // CHECK: [[RES:%.*]] = add <8 x i8> %a, zeroinitializer
+  // CHECK: ret <8 x i8> [[RES]]
   return vsra_n_u8(a, b, 8);
 }
diff --git a/test/CodeGen/aarch64-neon-tbl.c b/test/CodeGen/aarch64-neon-tbl.c
index 902fc45910870..0cc66453ac2b7 100644
--- a/test/CodeGen/aarch64-neon-tbl.c
+++ b/test/CodeGen/aarch64-neon-tbl.c
@@ -1,463 +1,1500 @@
-// REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN:   -ffp-contract=fast -S -O3 -o - %s | FileCheck %s
+// RUN:   -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
 
 // Test new aarch64 intrinsics and types
 
 #include <arm_neon.h>
 
+// CHECK-LABEL: define <8 x i8> @test_vtbl1_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VTBL1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %b) #2
+// CHECK:   ret <8 x i8> [[VTBL11_I]]
 int8x8_t test_vtbl1_s8(int8x8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vtbl1_s8
   return vtbl1_s8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqtbl1_s8(<16 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VTBL1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> %a, <8 x i8> %b) #2
+// CHECK:   ret <8 x i8> [[VTBL1_I]]
 int8x8_t test_vqtbl1_s8(int8x16_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vqtbl1_s8
   return vqtbl1_s8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vtbl2_s8([2 x <8 x i8>] %a.coerce, <8 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[A:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[A]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i8>] [[A]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [2 x <8 x i8>], [2 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i8>] [[TMP0]], [2 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VTBL1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBL13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %b) #2
+// CHECK:   ret <8 x i8> [[VTBL13_I]]
 int8x8_t test_vtbl2_s8(int8x8x2_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vtbl2_s8
   return vtbl2_s8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqtbl2_s8([2 x <16 x i8>] %a.coerce, <8 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[A:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[A]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[A]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VTBL2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> %b) #2
+// CHECK:   ret <8 x i8> [[VTBL2_I]]
 int8x8_t test_vqtbl2_s8(int8x16x2_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vqtbl2_s8
   return vqtbl2_s8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vtbl3_s8([3 x <8 x i8>] %a.coerce, <8 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[A:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[A]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i8>] [[A]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [3 x <8 x i8>], [3 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i8>] [[TMP0]], [3 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK:   [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> %b) #2
+// CHECK:   ret <8 x i8> [[VTBL26_I]]
 int8x8_t test_vtbl3_s8(int8x8x3_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vtbl3_s8
   return vtbl3_s8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqtbl3_s8([3 x <16 x i8>] %a.coerce, <8 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[A:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[A]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[A]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK:   [[VTBL3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> %b) #2
+// CHECK:   ret <8 x i8> [[VTBL3_I]]
 int8x8_t test_vqtbl3_s8(int8x16x3_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vqtbl3_s8
   return vqtbl3_s8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vtbl4_s8([4 x <8 x i8>] %a.coerce, <8 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[A:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[A]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i8>] [[A]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [4 x <8 x i8>], [4 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i8>] [[TMP0]], [4 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i64 0, i64 3
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
+// CHECK:   [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBL27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBL28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL27_I]], <8 x i8> %b) #2
+// CHECK:   ret <8 x i8> [[VTBL28_I]]
 int8x8_t test_vtbl4_s8(int8x8x4_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vtbl4_s8
   return vtbl4_s8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqtbl4_s8([4 x <16 x i8>] %a.coerce, <8 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[A:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[A]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[A]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
+// CHECK:   [[VTBL4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> %b) #2
+// CHECK:   ret <8 x i8> [[VTBL4_I]]
 int8x8_t test_vqtbl4_s8(int8x16x4_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vqtbl4_s8
   return vqtbl4_s8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqtbl1q_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VTBL1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> %b) #2
+// CHECK:   ret <16 x i8> [[VTBL1_I]]
 int8x16_t test_vqtbl1q_s8(int8x16_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vqtbl1q_s8
   return vqtbl1q_s8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqtbl2q_s8([2 x <16 x i8>] %a.coerce, <16 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[A:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[A]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[A]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VTBL2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> %b) #2
+// CHECK:   ret <16 x i8> [[VTBL2_I]]
 int8x16_t test_vqtbl2q_s8(int8x16x2_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vqtbl2q_s8
   return vqtbl2q_s8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqtbl3q_s8([3 x <16 x i8>] %a.coerce, <16 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[A:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[A]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[A]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK:   [[VTBL3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> %b) #2
+// CHECK:   ret <16 x i8> [[VTBL3_I]]
 int8x16_t test_vqtbl3q_s8(int8x16x3_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vqtbl3q_s8
   return vqtbl3q_s8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqtbl4q_s8([4 x <16 x i8>] %a.coerce, <16 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[A:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[A]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[A]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
+// CHECK:   [[VTBL4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> %b) #2
+// CHECK:   ret <16 x i8> [[VTBL4_I]]
 int8x16_t test_vqtbl4q_s8(int8x16x4_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vqtbl4q_s8
   return vqtbl4q_s8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vtbx1_s8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+// CHECK:   [[VTBL1_I:%.*]] = shufflevector <8 x i8> %b, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %c) #2
+// CHECK:   [[TMP0:%.*]] = icmp uge <8 x i8> %c, <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>
+// CHECK:   [[TMP1:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = and <8 x i8> [[TMP1]], %a
+// CHECK:   [[TMP3:%.*]] = xor <8 x i8> [[TMP1]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[TMP4:%.*]] = and <8 x i8> [[TMP3]], [[VTBL11_I]]
+// CHECK:   [[VTBX_I:%.*]] = or <8 x i8> [[TMP2]], [[TMP4]]
+// CHECK:   ret <8 x i8> [[VTBX_I]]
 int8x8_t test_vtbx1_s8(int8x8_t a, int8x8_t b, int8x8_t c) {
-  // CHECK-LABEL: test_vtbx1_s8
   return vtbx1_s8(a, b, c);
-  // CHECK: movi {{v[0-9]+.8b|d[0-9]+}}, #0
-  // CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
-  // CHECK: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-  // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vtbx2_s8(<8 x i8> %a, [2 x <8 x i8>] %b.coerce, <8 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [2 x <8 x i8>], [2 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i8>] [[TMP0]], [2 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VTBX1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBX13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %a, <16 x i8> [[VTBX1_I]], <8 x i8> %c) #2
+// CHECK:   ret <8 x i8> [[VTBX13_I]]
 int8x8_t test_vtbx2_s8(int8x8_t a, int8x8x2_t b, int8x8_t c) {
-  // CHECK-LABEL: test_vtbx2_s8
   return vtbx2_s8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vtbx3_s8(<8 x i8> %a, [3 x <8 x i8>] %b.coerce, <8 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [3 x <8 x i8>], [3 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i8>] [[TMP0]], [3 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK:   [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> %c) #2
+// CHECK:   [[TMP4:%.*]] = icmp uge <8 x i8> %c, <i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24>
+// CHECK:   [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i8>
+// CHECK:   [[TMP6:%.*]] = and <8 x i8> [[TMP5]], %a
+// CHECK:   [[TMP7:%.*]] = xor <8 x i8> [[TMP5]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[TMP8:%.*]] = and <8 x i8> [[TMP7]], [[VTBL26_I]]
+// CHECK:   [[VTBX_I:%.*]] = or <8 x i8> [[TMP6]], [[TMP8]]
+// CHECK:   ret <8 x i8> [[VTBX_I]]
 int8x8_t test_vtbx3_s8(int8x8_t a, int8x8x3_t b, int8x8_t c) {
-  // CHECK-LABEL: test_vtbx3_s8
   return vtbx3_s8(a, b, c);
-  // CHECK: movi {{v[0-9]+.8b|d[0-9]+}}, #0
-  // CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
-  // CHECK: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-  // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vtbx4_s8(<8 x i8> %a, [4 x <8 x i8>] %b.coerce, <8 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [4 x <8 x i8>], [4 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i8>] [[TMP0]], [4 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i64 0, i64 3
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
+// CHECK:   [[VTBX2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBX27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBX28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> %a, <16 x i8> [[VTBX2_I]], <16 x i8> [[VTBX27_I]], <8 x i8> %c) #2
+// CHECK:   ret <8 x i8> [[VTBX28_I]]
 int8x8_t test_vtbx4_s8(int8x8_t a, int8x8x4_t b, int8x8_t c) {
-  // CHECK-LABEL: test_vtbx4_s8
   return vtbx4_s8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqtbx1_s8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) #0 {
+// CHECK:   [[VTBX1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) #2
+// CHECK:   ret <8 x i8> [[VTBX1_I]]
 int8x8_t test_vqtbx1_s8(int8x8_t a, int8x16_t b, int8x8_t c) {
-  // CHECK-LABEL: test_vqtbx1_s8
   return vqtbx1_s8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqtbx2_s8(<8 x i8> %a, [2 x <16 x i8>] %b.coerce, <8 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VTBX2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> %c) #2
+// CHECK:   ret <8 x i8> [[VTBX2_I]]
 int8x8_t test_vqtbx2_s8(int8x8_t a, int8x16x2_t b, int8x8_t c) {
-  // CHECK-LABEL: test_vqtbx2_s8
   return vqtbx2_s8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqtbx3_s8(<8 x i8> %a, [3 x <16 x i8>] %b.coerce, <8 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK:   [[VTBX3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(<8 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> %c) #2
+// CHECK:   ret <8 x i8> [[VTBX3_I]]
 int8x8_t test_vqtbx3_s8(int8x8_t a, int8x16x3_t b, int8x8_t c) {
-  // CHECK-LABEL: test_vqtbx3_s8
   return vqtbx3_s8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqtbx4_s8(<8 x i8> %a, [4 x <16 x i8>] %b.coerce, <8 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
+// CHECK:   [[VTBX4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> %c) #2
+// CHECK:   ret <8 x i8> [[VTBX4_I]]
 int8x8_t test_vqtbx4_s8(int8x8_t a, int8x16x4_t b, int8x8_t c) {
-  // CHECK-LABEL: test_vqtbx4_s8
   return vqtbx4_s8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqtbx1q_s8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 {
+// CHECK:   [[VTBX1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #2
+// CHECK:   ret <16 x i8> [[VTBX1_I]]
 int8x16_t test_vqtbx1q_s8(int8x16_t a, int8x16_t b, int8x16_t c) {
-  // CHECK-LABEL: test_vqtbx1q_s8
   return vqtbx1q_s8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqtbx2q_s8(<16 x i8> %a, [2 x <16 x i8>] %b.coerce, <16 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VTBX2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> %c) #2
+// CHECK:   ret <16 x i8> [[VTBX2_I]]
 int8x16_t test_vqtbx2q_s8(int8x16_t a, int8x16x2_t b, int8x16_t c) {
-  // CHECK-LABEL: test_vqtbx2q_s8
   return vqtbx2q_s8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqtbx3q_s8(<16 x i8> %a, [3 x <16 x i8>] %b.coerce, <16 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK:   [[VTBX3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> %c) #2
+// CHECK:   ret <16 x i8> [[VTBX3_I]]
 int8x16_t test_vqtbx3q_s8(int8x16_t a, int8x16x3_t b, int8x16_t c) {
-  // CHECK-LABEL: test_vqtbx3q_s8
   return vqtbx3q_s8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqtbx4q_s8(<16 x i8> %a, [4 x <16 x i8>] %b.coerce, <16 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
+// CHECK:   [[VTBX4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> %c) #2
+// CHECK:   ret <16 x i8> [[VTBX4_I]]
 int8x16_t test_vqtbx4q_s8(int8x16_t a, int8x16x4_t b, int8x16_t c) {
-  // CHECK-LABEL: test_vqtbx4q_s8
   return vqtbx4q_s8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vtbl1_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VTBL1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %b) #2
+// CHECK:   ret <8 x i8> [[VTBL11_I]]
 uint8x8_t test_vtbl1_u8(uint8x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vtbl1_u8
   return vtbl1_u8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqtbl1_u8(<16 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VTBL1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> %a, <8 x i8> %b) #2
+// CHECK:   ret <8 x i8> [[VTBL1_I]]
 uint8x8_t test_vqtbl1_u8(uint8x16_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vqtbl1_u8
   return vqtbl1_u8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vtbl2_u8([2 x <8 x i8>] %a.coerce, <8 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[A:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[A]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i8>] [[A]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [2 x <8 x i8>], [2 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i8>] [[TMP0]], [2 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VTBL1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBL13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %b) #2
+// CHECK:   ret <8 x i8> [[VTBL13_I]]
 uint8x8_t test_vtbl2_u8(uint8x8x2_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vtbl2_u8
   return vtbl2_u8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqtbl2_u8([2 x <16 x i8>] %a.coerce, <8 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[A:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[A]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[A]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VTBL2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> %b) #2
+// CHECK:   ret <8 x i8> [[VTBL2_I]]
 uint8x8_t test_vqtbl2_u8(uint8x16x2_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vqtbl2_u8
   return vqtbl2_u8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vtbl3_u8([3 x <8 x i8>] %a.coerce, <8 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[A:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[A]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i8>] [[A]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [3 x <8 x i8>], [3 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i8>] [[TMP0]], [3 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK:   [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> %b) #2
+// CHECK:   ret <8 x i8> [[VTBL26_I]]
 uint8x8_t test_vtbl3_u8(uint8x8x3_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vtbl3_u8
   return vtbl3_u8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqtbl3_u8([3 x <16 x i8>] %a.coerce, <8 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[A:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[A]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[A]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK:   [[VTBL3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> %b) #2
+// CHECK:   ret <8 x i8> [[VTBL3_I]]
 uint8x8_t test_vqtbl3_u8(uint8x16x3_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vqtbl3_u8
   return vqtbl3_u8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vtbl4_u8([4 x <8 x i8>] %a.coerce, <8 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[A:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[A]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i8>] [[A]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [4 x <8 x i8>], [4 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i8>] [[TMP0]], [4 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i64 0, i64 3
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
+// CHECK:   [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBL27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBL28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL27_I]], <8 x i8> %b) #2
+// CHECK:   ret <8 x i8> [[VTBL28_I]]
 uint8x8_t test_vtbl4_u8(uint8x8x4_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vtbl4_u8
   return vtbl4_u8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqtbl4_u8([4 x <16 x i8>] %a.coerce, <8 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[A:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[A]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[A]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
+// CHECK:   [[VTBL4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> %b) #2
+// CHECK:   ret <8 x i8> [[VTBL4_I]]
 uint8x8_t test_vqtbl4_u8(uint8x16x4_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vqtbl4_u8
   return vqtbl4_u8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqtbl1q_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VTBL1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> %b) #2
+// CHECK:   ret <16 x i8> [[VTBL1_I]]
 uint8x16_t test_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vqtbl1q_u8
   return vqtbl1q_u8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqtbl2q_u8([2 x <16 x i8>] %a.coerce, <16 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[A:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[A]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[A]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VTBL2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> %b) #2
+// CHECK:   ret <16 x i8> [[VTBL2_I]]
 uint8x16_t test_vqtbl2q_u8(uint8x16x2_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vqtbl2q_u8
   return vqtbl2q_u8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqtbl3q_u8([3 x <16 x i8>] %a.coerce, <16 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[A:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[A]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[A]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK:   [[VTBL3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> %b) #2
+// CHECK:   ret <16 x i8> [[VTBL3_I]]
 uint8x16_t test_vqtbl3q_u8(uint8x16x3_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vqtbl3q_u8
   return vqtbl3q_u8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqtbl4q_u8([4 x <16 x i8>] %a.coerce, <16 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[A:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[A]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[A]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
+// CHECK:   [[VTBL4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> %b) #2
+// CHECK:   ret <16 x i8> [[VTBL4_I]]
 uint8x16_t test_vqtbl4q_u8(uint8x16x4_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vqtbl4q_u8
   return vqtbl4q_u8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vtbx1_u8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+// CHECK:   [[VTBL1_I:%.*]] = shufflevector <8 x i8> %b, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %c) #2
+// CHECK:   [[TMP0:%.*]] = icmp uge <8 x i8> %c, <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>
+// CHECK:   [[TMP1:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = and <8 x i8> [[TMP1]], %a
+// CHECK:   [[TMP3:%.*]] = xor <8 x i8> [[TMP1]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[TMP4:%.*]] = and <8 x i8> [[TMP3]], [[VTBL11_I]]
+// CHECK:   [[VTBX_I:%.*]] = or <8 x i8> [[TMP2]], [[TMP4]]
+// CHECK:   ret <8 x i8> [[VTBX_I]]
 uint8x8_t test_vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) {
-  // CHECK-LABEL: test_vtbx1_u8
   return vtbx1_u8(a, b, c);
-  // CHECK: movi {{v[0-9]+.8b|d[0-9]+}}, #0
-  // CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
-  // CHECK: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-  // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vtbx2_u8(<8 x i8> %a, [2 x <8 x i8>] %b.coerce, <8 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [2 x <8 x i8>], [2 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i8>] [[TMP0]], [2 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VTBX1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBX13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %a, <16 x i8> [[VTBX1_I]], <8 x i8> %c) #2
+// CHECK:   ret <8 x i8> [[VTBX13_I]]
 uint8x8_t test_vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c) {
-  // CHECK-LABEL: test_vtbx2_u8
   return vtbx2_u8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vtbx3_u8(<8 x i8> %a, [3 x <8 x i8>] %b.coerce, <8 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [3 x <8 x i8>], [3 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i8>] [[TMP0]], [3 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK:   [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> %c) #2
+// CHECK:   [[TMP4:%.*]] = icmp uge <8 x i8> %c, <i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24>
+// CHECK:   [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i8>
+// CHECK:   [[TMP6:%.*]] = and <8 x i8> [[TMP5]], %a
+// CHECK:   [[TMP7:%.*]] = xor <8 x i8> [[TMP5]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[TMP8:%.*]] = and <8 x i8> [[TMP7]], [[VTBL26_I]]
+// CHECK:   [[VTBX_I:%.*]] = or <8 x i8> [[TMP6]], [[TMP8]]
+// CHECK:   ret <8 x i8> [[VTBX_I]]
 uint8x8_t test_vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c) {
-  // CHECK-LABEL: test_vtbx3_u8
   return vtbx3_u8(a, b, c);
-  // CHECK: movi {{v[0-9]+.8b|d[0-9]+}}, #0
-  // CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
-  // CHECK: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-  // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vtbx4_u8(<8 x i8> %a, [4 x <8 x i8>] %b.coerce, <8 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [4 x <8 x i8>], [4 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i8>] [[TMP0]], [4 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i64 0, i64 3
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
+// CHECK:   [[VTBX2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBX27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBX28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> %a, <16 x i8> [[VTBX2_I]], <16 x i8> [[VTBX27_I]], <8 x i8> %c) #2
+// CHECK:   ret <8 x i8> [[VTBX28_I]]
 uint8x8_t test_vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c) {
-  // CHECK-LABEL: test_vtbx4_u8
   return vtbx4_u8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqtbx1_u8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) #0 {
+// CHECK:   [[VTBX1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) #2
+// CHECK:   ret <8 x i8> [[VTBX1_I]]
 uint8x8_t test_vqtbx1_u8(uint8x8_t a, uint8x16_t b, uint8x8_t c) {
-  // CHECK-LABEL: test_vqtbx1_u8
   return vqtbx1_u8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqtbx2_u8(<8 x i8> %a, [2 x <16 x i8>] %b.coerce, <8 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VTBX2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> %c) #2
+// CHECK:   ret <8 x i8> [[VTBX2_I]]
 uint8x8_t test_vqtbx2_u8(uint8x8_t a, uint8x16x2_t b, uint8x8_t c) {
-  // CHECK-LABEL: test_vqtbx2_u8
   return vqtbx2_u8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqtbx3_u8(<8 x i8> %a, [3 x <16 x i8>] %b.coerce, <8 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK:   [[VTBX3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(<8 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> %c) #2
+// CHECK:   ret <8 x i8> [[VTBX3_I]]
 uint8x8_t test_vqtbx3_u8(uint8x8_t a, uint8x16x3_t b, uint8x8_t c) {
-  // CHECK-LABEL: test_vqtbx3_u8
   return vqtbx3_u8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqtbx4_u8(<8 x i8> %a, [4 x <16 x i8>] %b.coerce, <8 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
+// CHECK:   [[VTBX4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> %c) #2
+// CHECK:   ret <8 x i8> [[VTBX4_I]]
 uint8x8_t test_vqtbx4_u8(uint8x8_t a, uint8x16x4_t b, uint8x8_t c) {
-  // CHECK-LABEL: test_vqtbx4_u8
   return vqtbx4_u8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqtbx1q_u8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 {
+// CHECK:   [[VTBX1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #2
+// CHECK:   ret <16 x i8> [[VTBX1_I]]
 uint8x16_t test_vqtbx1q_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) {
-  // CHECK-LABEL: test_vqtbx1q_u8
   return vqtbx1q_u8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqtbx2q_u8(<16 x i8> %a, [2 x <16 x i8>] %b.coerce, <16 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VTBX2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> %c) #2
+// CHECK:   ret <16 x i8> [[VTBX2_I]]
 uint8x16_t test_vqtbx2q_u8(uint8x16_t a, uint8x16x2_t b, uint8x16_t c) {
-  // CHECK-LABEL: test_vqtbx2q_u8
   return vqtbx2q_u8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqtbx3q_u8(<16 x i8> %a, [3 x <16 x i8>] %b.coerce, <16 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK:   [[VTBX3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> %c) #2
+// CHECK:   ret <16 x i8> [[VTBX3_I]]
 uint8x16_t test_vqtbx3q_u8(uint8x16_t a, uint8x16x3_t b, uint8x16_t c) {
-  // CHECK-LABEL: test_vqtbx3q_u8
   return vqtbx3q_u8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqtbx4q_u8(<16 x i8> %a, [4 x <16 x i8>] %b.coerce, <16 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
+// CHECK:   [[VTBX4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> %c) #2
+// CHECK:   ret <16 x i8> [[VTBX4_I]]
 uint8x16_t test_vqtbx4q_u8(uint8x16_t a, uint8x16x4_t b, uint8x16_t c) {
-  // CHECK-LABEL: test_vqtbx4q_u8
   return vqtbx4q_u8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vtbl1_p8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VTBL1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %b) #2
+// CHECK:   ret <8 x i8> [[VTBL11_I]]
 poly8x8_t test_vtbl1_p8(poly8x8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vtbl1_p8
   return vtbl1_p8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqtbl1_p8(<16 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VTBL1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> %a, <8 x i8> %b) #2
+// CHECK:   ret <8 x i8> [[VTBL1_I]]
 poly8x8_t test_vqtbl1_p8(poly8x16_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vqtbl1_p8
   return vqtbl1_p8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vtbl2_p8([2 x <8 x i8>] %a.coerce, <8 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[A:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[A]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i8>] [[A]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [2 x <8 x i8>], [2 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i8>] [[TMP0]], [2 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VTBL1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBL13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %b) #2
+// CHECK:   ret <8 x i8> [[VTBL13_I]]
 poly8x8_t test_vtbl2_p8(poly8x8x2_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vtbl2_p8
   return vtbl2_p8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqtbl2_p8([2 x <16 x i8>] %a.coerce, <8 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[A:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[A]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[A]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VTBL2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> %b) #2
+// CHECK:   ret <8 x i8> [[VTBL2_I]]
 poly8x8_t test_vqtbl2_p8(poly8x16x2_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vqtbl2_p8
   return vqtbl2_p8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vtbl3_p8([3 x <8 x i8>] %a.coerce, <8 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[A:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[A]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i8>] [[A]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [3 x <8 x i8>], [3 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i8>] [[TMP0]], [3 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK:   [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> %b) #2
+// CHECK:   ret <8 x i8> [[VTBL26_I]]
 poly8x8_t test_vtbl3_p8(poly8x8x3_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vtbl3_p8
   return vtbl3_p8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqtbl3_p8([3 x <16 x i8>] %a.coerce, <8 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[A:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[A]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[A]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK:   [[VTBL3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> %b) #2
+// CHECK:   ret <8 x i8> [[VTBL3_I]]
 poly8x8_t test_vqtbl3_p8(poly8x16x3_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vqtbl3_p8
   return vqtbl3_p8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vtbl4_p8([4 x <8 x i8>] %a.coerce, <8 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[A:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[A]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i8>] [[A]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [4 x <8 x i8>], [4 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i8>] [[TMP0]], [4 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i64 0, i64 3
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
+// CHECK:   [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBL27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBL28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL27_I]], <8 x i8> %b) #2
+// CHECK:   ret <8 x i8> [[VTBL28_I]]
 poly8x8_t test_vtbl4_p8(poly8x8x4_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vtbl4_p8
   return vtbl4_p8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqtbl4_p8([4 x <16 x i8>] %a.coerce, <8 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[A:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[A]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[A]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
+// CHECK:   [[VTBL4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> %b) #2
+// CHECK:   ret <8 x i8> [[VTBL4_I]]
 poly8x8_t test_vqtbl4_p8(poly8x16x4_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vqtbl4_p8
   return vqtbl4_p8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqtbl1q_p8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VTBL1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> %b) #2
+// CHECK:   ret <16 x i8> [[VTBL1_I]]
 poly8x16_t test_vqtbl1q_p8(poly8x16_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vqtbl1q_p8
   return vqtbl1q_p8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqtbl2q_p8([2 x <16 x i8>] %a.coerce, <16 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[A:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[A]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[A]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VTBL2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> %b) #2
+// CHECK:   ret <16 x i8> [[VTBL2_I]]
 poly8x16_t test_vqtbl2q_p8(poly8x16x2_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vqtbl2q_p8
   return vqtbl2q_p8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqtbl3q_p8([3 x <16 x i8>] %a.coerce, <16 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[A:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[A]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[A]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK:   [[VTBL3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> %b) #2
+// CHECK:   ret <16 x i8> [[VTBL3_I]]
 poly8x16_t test_vqtbl3q_p8(poly8x16x3_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vqtbl3q_p8
   return vqtbl3q_p8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqtbl4q_p8([4 x <16 x i8>] %a.coerce, <16 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[A:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[A]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[A]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
+// CHECK:   [[VTBL4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> %b) #2
+// CHECK:   ret <16 x i8> [[VTBL4_I]]
 poly8x16_t test_vqtbl4q_p8(poly8x16x4_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vqtbl4q_p8
   return vqtbl4q_p8(a, b);
-  // CHECK: tbl {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vtbx1_p8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+// CHECK:   [[VTBL1_I:%.*]] = shufflevector <8 x i8> %b, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> %c) #2
+// CHECK:   [[TMP0:%.*]] = icmp uge <8 x i8> %c, <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>
+// CHECK:   [[TMP1:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = and <8 x i8> [[TMP1]], %a
+// CHECK:   [[TMP3:%.*]] = xor <8 x i8> [[TMP1]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[TMP4:%.*]] = and <8 x i8> [[TMP3]], [[VTBL11_I]]
+// CHECK:   [[VTBX_I:%.*]] = or <8 x i8> [[TMP2]], [[TMP4]]
+// CHECK:   ret <8 x i8> [[VTBX_I]]
 poly8x8_t test_vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c) {
-  // CHECK-LABEL: test_vtbx1_p8
   return vtbx1_p8(a, b, c);
-  // CHECK: movi {{v[0-9]+.8b|d[0-9]+}}, #0
-  // CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
-  // CHECK: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-  // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vtbx2_p8(<8 x i8> %a, [2 x <8 x i8>] %b.coerce, <8 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i8>] [[B]].coerce, [2 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [2 x <8 x i8>], [2 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   store [2 x <8 x i8>] [[TMP0]], [2 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VTBX1_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBX13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %a, <16 x i8> [[VTBX1_I]], <8 x i8> %c) #2
+// CHECK:   ret <8 x i8> [[VTBX13_I]]
 poly8x8_t test_vtbx2_p8(poly8x8_t a, poly8x8x2_t b, uint8x8_t c) {
-  // CHECK-LABEL: test_vtbx2_p8
   return vtbx2_p8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vtbx3_p8(<8 x i8> %a, [3 x <8 x i8>] %b.coerce, <8 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i8>] [[B]].coerce, [3 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [3 x <8 x i8>], [3 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   store [3 x <8 x i8>] [[TMP0]], [3 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK:   [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> %c) #2
+// CHECK:   [[TMP4:%.*]] = icmp uge <8 x i8> %c, <i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24, i8 24>
+// CHECK:   [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i8>
+// CHECK:   [[TMP6:%.*]] = and <8 x i8> [[TMP5]], %a
+// CHECK:   [[TMP7:%.*]] = xor <8 x i8> [[TMP5]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[TMP8:%.*]] = and <8 x i8> [[TMP7]], [[VTBL26_I]]
+// CHECK:   [[VTBX_I:%.*]] = or <8 x i8> [[TMP6]], [[TMP8]]
+// CHECK:   ret <8 x i8> [[VTBX_I]]
 poly8x8_t test_vtbx3_p8(poly8x8_t a, poly8x8x3_t b, uint8x8_t c) {
-  // CHECK-LABEL: test_vtbx3_p8
   return vtbx3_p8(a, b, c);
-  // CHECK: movi {{v[0-9]+.8b|d[0-9]+}}, #0
-  // CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
-  // CHECK: tbl {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
-  // CHECK: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-  // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vtbx4_p8(<8 x i8> %a, [4 x <8 x i8>] %b.coerce, <8 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i8>] [[B]].coerce, [4 x <8 x i8>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [4 x <8 x i8>], [4 x <8 x i8>]* [[COERCE_DIVE1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   store [4 x <8 x i8>] [[TMP0]], [4 x <8 x i8>]* [[COERCE_DIVE_I]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i64 0, i64 3
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
+// CHECK:   [[VTBX2_I:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBX27_I:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   [[VTBX28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> %a, <16 x i8> [[VTBX2_I]], <16 x i8> [[VTBX27_I]], <8 x i8> %c) #2
+// CHECK:   ret <8 x i8> [[VTBX28_I]]
 poly8x8_t test_vtbx4_p8(poly8x8_t a, poly8x8x4_t b, uint8x8_t c) {
-  // CHECK-LABEL: test_vtbx4_p8
   return vtbx4_p8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqtbx1_p8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) #0 {
+// CHECK:   [[VTBX1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %a, <16 x i8> %b, <8 x i8> %c) #2
+// CHECK:   ret <8 x i8> [[VTBX1_I]]
 poly8x8_t test_vqtbx1_p8(poly8x8_t a, uint8x16_t b, uint8x8_t c) {
-  // CHECK-LABEL: test_vqtbx1_p8
   return vqtbx1_p8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqtbx2_p8(<8 x i8> %a, [2 x <16 x i8>] %b.coerce, <8 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VTBX2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <8 x i8> %c) #2
+// CHECK:   ret <8 x i8> [[VTBX2_I]]
 poly8x8_t test_vqtbx2_p8(poly8x8_t a, poly8x16x2_t b, uint8x8_t c) {
-  // CHECK-LABEL: test_vqtbx2_p8
   return vqtbx2_p8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqtbx3_p8(<8 x i8> %a, [3 x <16 x i8>] %b.coerce, <8 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK:   [[VTBX3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(<8 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <8 x i8> %c) #2
+// CHECK:   ret <8 x i8> [[VTBX3_I]]
 poly8x8_t test_vqtbx3_p8(poly8x8_t a, poly8x16x3_t b, uint8x8_t c) {
-  // CHECK-LABEL: test_vqtbx3_p8
   return vqtbx3_p8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vqtbx4_p8(<8 x i8> %a, [4 x <16 x i8>] %b.coerce, <8 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
+// CHECK:   [[VTBX4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <8 x i8> %c) #2
+// CHECK:   ret <8 x i8> [[VTBX4_I]]
 poly8x8_t test_vqtbx4_p8(poly8x8_t a, poly8x16x4_t b, uint8x8_t c) {
-  // CHECK-LABEL: test_vqtbx4_p8
   return vqtbx4_p8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.8b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqtbx1q_p8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 {
+// CHECK:   [[VTBX1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #2
+// CHECK:   ret <16 x i8> [[VTBX1_I]]
 poly8x16_t test_vqtbx1q_p8(poly8x16_t a, uint8x16_t b, uint8x16_t c) {
-  // CHECK-LABEL: test_vqtbx1q_p8
   return vqtbx1q_p8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqtbx2q_p8(<16 x i8> %a, [2 x <16 x i8>] %b.coerce, <16 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[B]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[B]].coerce, [2 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [2 x <16 x i8>], [2 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   store [2 x <16 x i8>] [[TMP0]], [2 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VTBX2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> %c) #2
+// CHECK:   ret <16 x i8> [[VTBX2_I]]
 poly8x16_t test_vqtbx2q_p8(poly8x16_t a, poly8x16x2_t b, uint8x16_t c) {
-  // CHECK-LABEL: test_vqtbx2q_p8
   return vqtbx2q_p8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqtbx3q_p8(<16 x i8> %a, [3 x <16 x i8>] %b.coerce, <16 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[B]].coerce, [3 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [3 x <16 x i8>], [3 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   store [3 x <16 x i8>] [[TMP0]], [3 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK:   [[VTBX3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> %c) #2
+// CHECK:   ret <16 x i8> [[VTBX3_I]]
 poly8x16_t test_vqtbx3q_p8(poly8x16_t a, poly8x16x3_t b, uint8x16_t c) {
-  // CHECK-LABEL: test_vqtbx3q_p8
   return vqtbx3q_p8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vqtbx4q_p8(<16 x i8> %a, [4 x <16 x i8>] %b.coerce, <16 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[B]].coerce, [4 x <16 x i8>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = load [4 x <16 x i8>], [4 x <16 x i8>]* [[COERCE_DIVE1]], align 16
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   store [4 x <16 x i8>] [[TMP0]], [4 x <16 x i8>]* [[COERCE_DIVE_I]], align 16
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL_I]], i64 0, i64 0
+// CHECK:   [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX_I]], align 16
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1_I]], i64 0, i64 1
+// CHECK:   [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2_I]], align 16
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3_I]], i64 0, i64 2
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4_I]], align 16
+// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5_I]], i64 0, i64 3
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6_I]], align 16
+// CHECK:   [[VTBX4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8> %a, <16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> %c) #2
+// CHECK:   ret <16 x i8> [[VTBX4_I]]
 poly8x16_t test_vqtbx4q_p8(poly8x16_t a, poly8x16x4_t b, uint8x16_t c) {
-  // CHECK-LABEL: test_vqtbx4q_p8
   return vqtbx4q_p8(a, b, c);
-  // CHECK: tbx {{v[0-9]+}}.16b, {{{ ?v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b ?}}}, {{v[0-9]+}}.16b
 }
diff --git a/test/CodeGen/aarch64-neon-vcombine.c b/test/CodeGen/aarch64-neon-vcombine.c
index a750b8e70e8f5..482463cacfe0d 100644
--- a/test/CodeGen/aarch64-neon-vcombine.c
+++ b/test/CodeGen/aarch64-neon-vcombine.c
@@ -1,90 +1,103 @@
-// REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -S -O3 -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -fallow-half-arguments-and-returns -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
 
 // Test new aarch64 intrinsics and types
 
 #include <arm_neon.h>
 
+// CHECK-LABEL: define <16 x i8> @test_vcombine_s8(<8 x i8> %low, <8 x i8> %high) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %low, <8 x i8> %high, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 int8x16_t test_vcombine_s8(int8x8_t low, int8x8_t high) {
-  // CHECK-LABEL: test_vcombine_s8:
   return vcombine_s8(low, high);
-  // CHECK: ins	v0.d[1], v1.d[0]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vcombine_s16(<4 x i16> %low, <4 x i16> %high) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %low, <4 x i16> %high, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 int16x8_t test_vcombine_s16(int16x4_t low, int16x4_t high) {
-  // CHECK-LABEL: test_vcombine_s16:
   return vcombine_s16(low, high);
-  // CHECK: ins	v0.d[1], v1.d[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcombine_s32(<2 x i32> %low, <2 x i32> %high) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %low, <2 x i32> %high, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 int32x4_t test_vcombine_s32(int32x2_t low, int32x2_t high) {
-  // CHECK-LABEL: test_vcombine_s32:
   return vcombine_s32(low, high);
-  // CHECK: ins	v0.d[1], v1.d[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vcombine_s64(<1 x i64> %low, <1 x i64> %high) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> %low, <1 x i64> %high, <2 x i32> <i32 0, i32 1>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 int64x2_t test_vcombine_s64(int64x1_t low, int64x1_t high) {
-  // CHECK-LABEL: test_vcombine_s64:
   return vcombine_s64(low, high);
-  // CHECK: ins	v0.d[1], v1.d[0]
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vcombine_u8(<8 x i8> %low, <8 x i8> %high) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %low, <8 x i8> %high, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 uint8x16_t test_vcombine_u8(uint8x8_t low, uint8x8_t high) {
-  // CHECK-LABEL: test_vcombine_u8:
   return vcombine_u8(low, high);
-  // CHECK: ins	v0.d[1], v1.d[0]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vcombine_u16(<4 x i16> %low, <4 x i16> %high) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %low, <4 x i16> %high, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 uint16x8_t test_vcombine_u16(uint16x4_t low, uint16x4_t high) {
-  // CHECK-LABEL: test_vcombine_u16:
   return vcombine_u16(low, high);
-  // CHECK: ins	v0.d[1], v1.d[0]
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcombine_u32(<2 x i32> %low, <2 x i32> %high) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %low, <2 x i32> %high, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 uint32x4_t test_vcombine_u32(uint32x2_t low, uint32x2_t high) {
-  // CHECK-LABEL: test_vcombine_u32:
   return vcombine_u32(low, high);
-  // CHECK: ins	v0.d[1], v1.d[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vcombine_u64(<1 x i64> %low, <1 x i64> %high) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> %low, <1 x i64> %high, <2 x i32> <i32 0, i32 1>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 uint64x2_t test_vcombine_u64(uint64x1_t low, uint64x1_t high) {
-  // CHECK-LABEL: test_vcombine_u64:
   return vcombine_u64(low, high);
-  // CHECK: ins	v0.d[1], v1.d[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vcombine_p64(<1 x i64> %low, <1 x i64> %high) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> %low, <1 x i64> %high, <2 x i32> <i32 0, i32 1>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 poly64x2_t test_vcombine_p64(poly64x1_t low, poly64x1_t high) {
-  // CHECK-LABEL: test_vcombine_p64:
   return vcombine_p64(low, high);
-  // CHECK: ins	v0.d[1], v1.d[0]
 }
 
+// CHECK-LABEL: define <8 x half> @test_vcombine_f16(<4 x half> %low, <4 x half> %high) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x half> %low, <4 x half> %high, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x half> [[SHUFFLE_I]]
 float16x8_t test_vcombine_f16(float16x4_t low, float16x4_t high) {
-  // CHECK-LABEL: test_vcombine_f16:
   return vcombine_f16(low, high);
-  // CHECK: ins	v0.d[1], v1.d[0]
 }
 
+// CHECK-LABEL: define <4 x float> @test_vcombine_f32(<2 x float> %low, <2 x float> %high) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %low, <2 x float> %high, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x float> [[SHUFFLE_I]]
 float32x4_t test_vcombine_f32(float32x2_t low, float32x2_t high) {
-  // CHECK-LABEL: test_vcombine_f32:
   return vcombine_f32(low, high);
-  // CHECK: ins	v0.d[1], v1.d[0]
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vcombine_p8(<8 x i8> %low, <8 x i8> %high) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %low, <8 x i8> %high, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 poly8x16_t test_vcombine_p8(poly8x8_t low, poly8x8_t high) {
-  // CHECK-LABEL: test_vcombine_p8:
   return vcombine_p8(low, high);
-  // CHECK: ins	v0.d[1], v1.d[0]
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vcombine_p16(<4 x i16> %low, <4 x i16> %high) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %low, <4 x i16> %high, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 poly16x8_t test_vcombine_p16(poly16x4_t low, poly16x4_t high) {
-  // CHECK-LABEL: test_vcombine_p16:
   return vcombine_p16(low, high);
-  // CHECK: ins	v0.d[1], v1.d[0]
 }
 
+// CHECK-LABEL: define <2 x double> @test_vcombine_f64(<1 x double> %low, <1 x double> %high) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <1 x double> %low, <1 x double> %high, <2 x i32> <i32 0, i32 1>
+// CHECK:   ret <2 x double> [[SHUFFLE_I]]
 float64x2_t test_vcombine_f64(float64x1_t low, float64x1_t high) {
-  // CHECK-LABEL: test_vcombine_f64:
   return vcombine_f64(low, high);
-  // CHECK: ins	v0.d[1], v1.d[0]
 }
diff --git a/test/CodeGen/aarch64-neon-vget-hilo.c b/test/CodeGen/aarch64-neon-vget-hilo.c
index 0959d097bdc7e..f66bac6a6a3a5 100644
--- a/test/CodeGen/aarch64-neon-vget-hilo.c
+++ b/test/CodeGen/aarch64-neon-vget-hilo.c
@@ -1,176 +1,203 @@
-// REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN:   -ffp-contract=fast -S -O3 -o - %s | FileCheck %s --check-prefix CHECK-COMMON --check-prefix CHECK-ARM64
-
+// RUN:  -fallow-half-arguments-and-returns -emit-llvm -o - %s \
+// RUN: | opt -S -mem2reg | FileCheck %s
 // Test new aarch64 intrinsics and types
 
 #include <arm_neon.h>
 
+// CHECK-LABEL: define <8 x i8> @test_vget_high_s8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 int8x8_t test_vget_high_s8(int8x16_t a) {
-  // CHECK-COMMON-LABEL: test_vget_high_s8:
   return vget_high_s8(a);
-  // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vget_high_s16(<8 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 int16x4_t test_vget_high_s16(int16x8_t a) {
-  // CHECK-COMMON-LABEL: test_vget_high_s16:
   return vget_high_s16(a);
-  // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vget_high_s32(<4 x i32> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 int32x2_t test_vget_high_s32(int32x4_t a) {
-  // CHECK-COMMON-LABEL: test_vget_high_s32:
   return vget_high_s32(a);
-  // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vget_high_s64(<2 x i64> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> <i32 1>
+// CHECK:   ret <1 x i64> [[SHUFFLE_I]]
 int64x1_t test_vget_high_s64(int64x2_t a) {
-  // CHECK-COMMON-LABEL: test_vget_high_s64:
   return vget_high_s64(a);
-  // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vget_high_u8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 uint8x8_t test_vget_high_u8(uint8x16_t a) {
-  // CHECK-COMMON-LABEL: test_vget_high_u8:
   return vget_high_u8(a);
-  // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vget_high_u16(<8 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 uint16x4_t test_vget_high_u16(uint16x8_t a) {
-  // CHECK-COMMON-LABEL: test_vget_high_u16:
   return vget_high_u16(a);
-  // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vget_high_u32(<4 x i32> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 uint32x2_t test_vget_high_u32(uint32x4_t a) {
-  // CHECK-COMMON-LABEL: test_vget_high_u32:
   return vget_high_u32(a);
-  // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vget_high_u64(<2 x i64> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> <i32 1>
+// CHECK:   ret <1 x i64> [[SHUFFLE_I]]
 uint64x1_t test_vget_high_u64(uint64x2_t a) {
-  // CHECK-COMMON-LABEL: test_vget_high_u64:
   return vget_high_u64(a);
-  // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vget_high_p64(<2 x i64> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> <i32 1>
+// CHECK:   ret <1 x i64> [[SHUFFLE_I]]
 poly64x1_t test_vget_high_p64(poly64x2_t a) {
-  // CHECK-COMMON-LABEL: test_vget_high_p64:
   return vget_high_p64(a);
-  // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8
 }
 
+// CHECK-LABEL: define <4 x half> @test_vget_high_f16(<8 x half> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <4 x half> [[SHUFFLE_I]]
 float16x4_t test_vget_high_f16(float16x8_t a) {
-  // CHECK-COMMON-LABEL: test_vget_high_f16:
   return vget_high_f16(a);
-  // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8
 }
 
+// CHECK-LABEL: define <2 x float> @test_vget_high_f32(<4 x float> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   ret <2 x float> [[SHUFFLE_I]]
 float32x2_t test_vget_high_f32(float32x4_t a) {
-  // CHECK-COMMON-LABEL: test_vget_high_f32:
   return vget_high_f32(a);
-  // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vget_high_p8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 poly8x8_t test_vget_high_p8(poly8x16_t a) {
-  // CHECK-COMMON-LABEL: test_vget_high_p8:
   return vget_high_p8(a);
-  // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vget_high_p16(<8 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 poly16x4_t test_vget_high_p16(poly16x8_t a) {
-  // CHECK-COMMON-LABEL: test_vget_high_p16
   return vget_high_p16(a);
-  // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8
 }
 
+// CHECK-LABEL: define <1 x double> @test_vget_high_f64(<2 x double> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %a, <1 x i32> <i32 1>
+// CHECK:   ret <1 x double> [[SHUFFLE_I]]
 float64x1_t test_vget_high_f64(float64x2_t a) {
-  // CHECK-COMMON-LABEL: test_vget_high_f64
   return vget_high_f64(a);
-  // CHECK-ARM64: ext v0.16b, v0.16b, v0.16b, #8
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vget_low_s8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 int8x8_t test_vget_low_s8(int8x16_t a) {
-  // CHECK-COMMON-LABEL: test_vget_low_s8:
   return vget_low_s8(a);
-  // CHECK-COMMON-NEXT: ret
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vget_low_s16(<8 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 int16x4_t test_vget_low_s16(int16x8_t a) {
-  // CHECK-COMMON-LABEL: test_vget_low_s16:
   return vget_low_s16(a);
-  // CHECK-COMMON-NEXT: ret
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vget_low_s32(<4 x i32> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 0, i32 1>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 int32x2_t test_vget_low_s32(int32x4_t a) {
-  // CHECK-COMMON-LABEL: test_vget_low_s32:
   return vget_low_s32(a);
-  // CHECK-COMMON-NEXT: ret
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vget_low_s64(<2 x i64> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> zeroinitializer
+// CHECK:   ret <1 x i64> [[SHUFFLE_I]]
 int64x1_t test_vget_low_s64(int64x2_t a) {
-  // CHECK-COMMON-LABEL: test_vget_low_s64:
   return vget_low_s64(a);
-  // CHECK-COMMON-NEXT: ret
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vget_low_u8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 uint8x8_t test_vget_low_u8(uint8x16_t a) {
-  // CHECK-COMMON-LABEL: test_vget_low_u8:
   return vget_low_u8(a);
-  // CHECK-COMMON-NEXT: ret
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vget_low_u16(<8 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 uint16x4_t test_vget_low_u16(uint16x8_t a) {
-  // CHECK-COMMON-LABEL: test_vget_low_u16:
   return vget_low_u16(a);
-  // CHECK-COMMON-NEXT: ret
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vget_low_u32(<4 x i32> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 0, i32 1>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 uint32x2_t test_vget_low_u32(uint32x4_t a) {
-  // CHECK-COMMON-LABEL: test_vget_low_u32:
   return vget_low_u32(a);
-  // CHECK-COMMON-NEXT: ret
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vget_low_u64(<2 x i64> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> zeroinitializer
+// CHECK:   ret <1 x i64> [[SHUFFLE_I]]
 uint64x1_t test_vget_low_u64(uint64x2_t a) {
-  // CHECK-COMMON-LABEL: test_vget_low_u64:
   return vget_low_u64(a);
-  // CHECK-COMMON-NEXT: ret
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vget_low_p64(<2 x i64> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> zeroinitializer
+// CHECK:   ret <1 x i64> [[SHUFFLE_I]]
 poly64x1_t test_vget_low_p64(poly64x2_t a) {
-  // CHECK-COMMON-LABEL: test_vget_low_p64:
   return vget_low_p64(a);
-  // CHECK-COMMON-NEXT: ret
 }
 
+// CHECK-LABEL: define <4 x half> @test_vget_low_f16(<8 x half> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x half> [[SHUFFLE_I]]
 float16x4_t test_vget_low_f16(float16x8_t a) {
-  // CHECK-COMMON-LABEL: test_vget_low_f16:
   return vget_low_f16(a);
-  // CHECK-COMMON-NEXT: ret
 }
 
+// CHECK-LABEL: define <2 x float> @test_vget_low_f32(<4 x float> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <2 x i32> <i32 0, i32 1>
+// CHECK:   ret <2 x float> [[SHUFFLE_I]]
 float32x2_t test_vget_low_f32(float32x4_t a) {
-  // CHECK-COMMON-LABEL: test_vget_low_f32:
   return vget_low_f32(a);
-  // CHECK-COMMON-NEXT: ret
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vget_low_p8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 poly8x8_t test_vget_low_p8(poly8x16_t a) {
-  // CHECK-COMMON-LABEL: test_vget_low_p8:
   return vget_low_p8(a);
-  // CHECK-COMMON-NEXT: ret
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vget_low_p16(<8 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 poly16x4_t test_vget_low_p16(poly16x8_t a) {
-  // CHECK-COMMON-LABEL: test_vget_low_p16:
   return vget_low_p16(a);
-  // CHECK-COMMON-NEXT: ret
 }
 
+// CHECK-LABEL: define <1 x double> @test_vget_low_f64(<2 x double> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x double> %a, <2 x double> %a, <1 x i32> zeroinitializer
+// CHECK:   ret <1 x double> [[SHUFFLE_I]]
 float64x1_t test_vget_low_f64(float64x2_t a) {
-  // CHECK-COMMON-LABEL: test_vget_low_f64:
   return vget_low_f64(a);
-  // CHECK-COMMON-NEXT: ret
 }
 
diff --git a/test/CodeGen/aarch64-neon-vget.c b/test/CodeGen/aarch64-neon-vget.c
index 83c64943daa96..87afcee9c8401 100644
--- a/test/CodeGen/aarch64-neon-vget.c
+++ b/test/CodeGen/aarch64-neon-vget.c
@@ -1,348 +1,458 @@
-// REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple arm64-apple-darwin -target-feature +neon \
-// RUN:   -ffp-contract=fast -S -O3 -o - %s | FileCheck %s
+// RUN:   -fallow-half-arguments-and-returns -emit-llvm -o - %s \
+// RUN: | opt -S -mem2reg | FileCheck %s
 
 #include <arm_neon.h>
 
+// CHECK-LABEL: define i8 @test_vget_lane_u8(<8 x i8> %a) #0 {
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7
+// CHECK:   ret i8 [[VGET_LANE]]
 uint8_t test_vget_lane_u8(uint8x8_t a) {
-  // CHECK-LABEL: test_vget_lane_u8:
-  // CHECK-NEXT:  umov.b w0, v0[7]
-  // CHECK-NEXT:  ret
   return vget_lane_u8(a, 7);
 }
 
+// CHECK-LABEL: define i16 @test_vget_lane_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
+// CHECK:   ret i16 [[VGET_LANE]]
 uint16_t test_vget_lane_u16(uint16x4_t a) {
-  // CHECK-LABEL: test_vget_lane_u16:
-  // CHECK-NEXT:  umov.h w0, v0[3]
-  // CHECK-NEXT:  ret
   return vget_lane_u16(a, 3);
 }
 
+// CHECK-LABEL: define i32 @test_vget_lane_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+// CHECK:   ret i32 [[VGET_LANE]]
 uint32_t test_vget_lane_u32(uint32x2_t a) {
-  // CHECK-LABEL: test_vget_lane_u32:
-  // CHECK-NEXT:  mov.s  w0, v0[1]
-  // CHECK-NEXT:  ret
   return vget_lane_u32(a, 1);
 }
 
+// CHECK-LABEL: define i8 @test_vget_lane_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7
+// CHECK:   ret i8 [[VGET_LANE]]
 int8_t test_vget_lane_s8(int8x8_t a) {
-  // CHECK-LABEL: test_vget_lane_s8:
-  // CHECK-NEXT:  umov.b w0, v0[7]
-  // CHECK-NEXT:  ret
   return vget_lane_s8(a, 7);
 }
 
+// CHECK-LABEL: define i16 @test_vget_lane_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
+// CHECK:   ret i16 [[VGET_LANE]]
 int16_t test_vget_lane_s16(int16x4_t a) {
-  // CHECK-LABEL: test_vget_lane_s16:
-  // CHECK-NEXT:  umov.h w0, v0[3]
-  // CHECK-NEXT:  ret
   return vget_lane_s16(a, 3);
 }
 
+// CHECK-LABEL: define i32 @test_vget_lane_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+// CHECK:   ret i32 [[VGET_LANE]]
 int32_t test_vget_lane_s32(int32x2_t a) {
-  // CHECK-LABEL: test_vget_lane_s32:
-  // CHECK-NEXT:  mov.s  w0, v0[1]
-  // CHECK-NEXT:  ret
   return vget_lane_s32(a, 1);
 }
 
+// CHECK-LABEL: define i8 @test_vget_lane_p8(<8 x i8> %a) #0 {
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7
+// CHECK:   ret i8 [[VGET_LANE]]
 poly8_t test_vget_lane_p8(poly8x8_t a) {
-  // CHECK-LABEL: test_vget_lane_p8:
-  // CHECK-NEXT:  umov.b w0, v0[7]
-  // CHECK-NEXT:  ret
   return vget_lane_p8(a, 7);
 }
 
+// CHECK-LABEL: define i16 @test_vget_lane_p16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
+// CHECK:   ret i16 [[VGET_LANE]]
 poly16_t test_vget_lane_p16(poly16x4_t a) {
-  // CHECK-LABEL: test_vget_lane_p16:
-  // CHECK-NEXT:  umov.h w0, v0[3]
-  // CHECK-NEXT:  ret
   return vget_lane_p16(a, 3);
 }
 
+// CHECK-LABEL: define float @test_vget_lane_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
+// CHECK:   ret float [[VGET_LANE]]
 float32_t test_vget_lane_f32(float32x2_t a) {
-  // CHECK-LABEL: test_vget_lane_f32:
-  // CHECK-NEXT:  mov s0, v0[1]
-  // CHECK-NEXT:  ret
   return vget_lane_f32(a, 1);
 }
 
+// CHECK-LABEL: define float @test_vget_lane_f16(<4 x half> %a) #0 {
+// CHECK:   [[__REINT_242:%.*]] = alloca <4 x half>, align 8
+// CHECK:   [[__REINT1_242:%.*]] = alloca i16, align 2
+// CHECK:   store <4 x half> %a, <4 x half>* [[__REINT_242]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half>* [[__REINT_242]] to <4 x i16>*
+// CHECK:   [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 8
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP3]], i32 1
+// CHECK:   store i16 [[VGET_LANE]], i16* [[__REINT1_242]], align 2
+// CHECK:   [[TMP4:%.*]] = bitcast i16* [[__REINT1_242]] to half*
+// CHECK:   [[TMP5:%.*]] = load half, half* [[TMP4]], align 2
+// CHECK:   [[CONV:%.*]] = fpext half [[TMP5]] to float
+// CHECK:   ret float [[CONV]]
 float32_t test_vget_lane_f16(float16x4_t a) {
-  // CHECK-LABEL: test_vget_lane_f16:
-  // CHECK-NEXT:  umov.h w8, v0[1]
-  // CHECK-NEXT:  fmov s0, w8
-  // CHECK-NEXT:  fcvt s0, h0
-  // CHECK-NEXT:  ret
   return vget_lane_f16(a, 1);
 }
 
+// CHECK-LABEL: define i8 @test_vgetq_lane_u8(<16 x i8> %a) #0 {
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <16 x i8> %a, i32 15
+// CHECK:   ret i8 [[VGETQ_LANE]]
 uint8_t test_vgetq_lane_u8(uint8x16_t a) {
-  // CHECK-LABEL: test_vgetq_lane_u8:
-  // CHECK-NEXT:  umov.b w0, v0[15]
-  // CHECK-NEXT:  ret
   return vgetq_lane_u8(a, 15);
 }
 
+// CHECK-LABEL: define i16 @test_vgetq_lane_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
+// CHECK:   ret i16 [[VGETQ_LANE]]
 uint16_t test_vgetq_lane_u16(uint16x8_t a) {
-  // CHECK-LABEL: test_vgetq_lane_u16:
-  // CHECK-NEXT:  umov.h w0, v0[7]
-  // CHECK-NEXT:  ret
   return vgetq_lane_u16(a, 7);
 }
 
+// CHECK-LABEL: define i32 @test_vgetq_lane_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+// CHECK:   ret i32 [[VGETQ_LANE]]
 uint32_t test_vgetq_lane_u32(uint32x4_t a) {
-  // CHECK-LABEL: test_vgetq_lane_u32:
-  // CHECK-NEXT:  mov.s  w0, v0[3]
-  // CHECK-NEXT:  ret
   return vgetq_lane_u32(a, 3);
 }
 
+// CHECK-LABEL: define i8 @test_vgetq_lane_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <16 x i8> %a, i32 15
+// CHECK:   ret i8 [[VGETQ_LANE]]
 int8_t test_vgetq_lane_s8(int8x16_t a) {
-  // CHECK-LABEL: test_vgetq_lane_s8:
-  // CHECK-NEXT:  umov.b w0, v0[15]
-  // CHECK-NEXT:  ret
   return vgetq_lane_s8(a, 15);
 }
 
+// CHECK-LABEL: define i16 @test_vgetq_lane_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
+// CHECK:   ret i16 [[VGETQ_LANE]]
 int16_t test_vgetq_lane_s16(int16x8_t a) {
-  // CHECK-LABEL: test_vgetq_lane_s16:
-  // CHECK-NEXT:  umov.h w0, v0[7]
-  // CHECK-NEXT:  ret
   return vgetq_lane_s16(a, 7);
 }
 
+// CHECK-LABEL: define i32 @test_vgetq_lane_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+// CHECK:   ret i32 [[VGETQ_LANE]]
 int32_t test_vgetq_lane_s32(int32x4_t a) {
-  // CHECK-LABEL: test_vgetq_lane_s32:
-  // CHECK-NEXT:  mov.s  w0, v0[3]
-  // CHECK-NEXT:  ret
   return vgetq_lane_s32(a, 3);
 }
 
+// CHECK-LABEL: define i8 @test_vgetq_lane_p8(<16 x i8> %a) #0 {
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <16 x i8> %a, i32 15
+// CHECK:   ret i8 [[VGETQ_LANE]]
 poly8_t test_vgetq_lane_p8(poly8x16_t a) {
-  // CHECK-LABEL: test_vgetq_lane_p8:
-  // CHECK-NEXT:  umov.b w0, v0[15]
-  // CHECK-NEXT:  ret
   return vgetq_lane_p8(a, 15);
 }
 
+// CHECK-LABEL: define i16 @test_vgetq_lane_p16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
+// CHECK:   ret i16 [[VGETQ_LANE]]
 poly16_t test_vgetq_lane_p16(poly16x8_t a) {
-  // CHECK-LABEL: test_vgetq_lane_p16:
-  // CHECK-NEXT:  umov.h w0, v0[7]
-  // CHECK-NEXT:  ret
   return vgetq_lane_p16(a, 7);
 }
 
+// CHECK-LABEL: define float @test_vgetq_lane_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+// CHECK:   ret float [[VGETQ_LANE]]
 float32_t test_vgetq_lane_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vgetq_lane_f32:
-  // CHECK-NEXT:  mov s0, v0[3]
-  // CHECK-NEXT:  ret
   return vgetq_lane_f32(a, 3);
 }
 
+// CHECK-LABEL: define float @test_vgetq_lane_f16(<8 x half> %a) #0 {
+// CHECK:   [[__REINT_244:%.*]] = alloca <8 x half>, align 16
+// CHECK:   [[__REINT1_244:%.*]] = alloca i16, align 2
+// CHECK:   store <8 x half> %a, <8 x half>* [[__REINT_244]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_244]] to <8 x i16>*
+// CHECK:   [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 16
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3
+// CHECK:   store i16 [[VGETQ_LANE]], i16* [[__REINT1_244]], align 2
+// CHECK:   [[TMP4:%.*]] = bitcast i16* [[__REINT1_244]] to half*
+// CHECK:   [[TMP5:%.*]] = load half, half* [[TMP4]], align 2
+// CHECK:   [[CONV:%.*]] = fpext half [[TMP5]] to float
+// CHECK:   ret float [[CONV]]
 float32_t test_vgetq_lane_f16(float16x8_t a) {
-  // CHECK-LABEL: test_vgetq_lane_f16:
-  // CHECK-NEXT:  umov.h w8, v0[3]
-  // CHECK-NEXT:  fmov s0, w8
-  // CHECK-NEXT:  fcvt s0, h0
-  // CHECK-NEXT:  ret
   return vgetq_lane_f16(a, 3);
 }
 
+// CHECK-LABEL: define i64 @test_vget_lane_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
+// CHECK:   ret i64 [[VGET_LANE]]
 int64_t test_vget_lane_s64(int64x1_t a) {
-  // CHECK-LABEL: test_vget_lane_s64:
-  // CHECK-NEXT:  fmov x0, d0
-  // CHECK-NEXT:  ret
   return vget_lane_s64(a, 0);
 }
 
+// CHECK-LABEL: define i64 @test_vget_lane_u64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
+// CHECK:   ret i64 [[VGET_LANE]]
 uint64_t test_vget_lane_u64(uint64x1_t a) {
-  // CHECK-LABEL: test_vget_lane_u64:
-  // CHECK-NEXT:  fmov x0, d0
-  // CHECK-NEXT:  ret
   return vget_lane_u64(a, 0);
 }
 
+// CHECK-LABEL: define i64 @test_vgetq_lane_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
+// CHECK:   ret i64 [[VGETQ_LANE]]
 int64_t test_vgetq_lane_s64(int64x2_t a) {
-  // CHECK-LABEL: test_vgetq_lane_s64:
-  // CHECK-NEXT:  mov.d  x0, v0[1]
-  // CHECK-NEXT:  ret
   return vgetq_lane_s64(a, 1);
 }
 
+// CHECK-LABEL: define i64 @test_vgetq_lane_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
+// CHECK:   ret i64 [[VGETQ_LANE]]
 uint64_t test_vgetq_lane_u64(uint64x2_t a) {
-  // CHECK-LABEL: test_vgetq_lane_u64:
-  // CHECK-NEXT:  mov.d  x0, v0[1]
-  // CHECK-NEXT:  ret
   return vgetq_lane_u64(a, 1);
 }
 
 
+// CHECK-LABEL: define <8 x i8> @test_vset_lane_u8(i8 %a, <8 x i8> %b) #0 {
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <8 x i8> %b, i8 %a, i32 7
+// CHECK:   ret <8 x i8> [[VSET_LANE]]
 uint8x8_t test_vset_lane_u8(uint8_t a, uint8x8_t b) {
-  // CHECK-LABEL: test_vset_lane_u8:
-  // CHECK-NEXT:  ins.b v0[7], w0
-  // CHECK-NEXT:  ret
   return vset_lane_u8(a, b, 7);
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vset_lane_u16(i16 %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP1]], i16 %a, i32 3
+// CHECK:   ret <4 x i16> [[VSET_LANE]]
 uint16x4_t test_vset_lane_u16(uint16_t a, uint16x4_t b) {
-  // CHECK-LABEL: test_vset_lane_u16:
-  // CHECK-NEXT:  ins.h v0[3], w0
-  // CHECK-NEXT:  ret
   return vset_lane_u16(a, b, 3);
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vset_lane_u32(i32 %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <2 x i32> [[TMP1]], i32 %a, i32 1
+// CHECK:   ret <2 x i32> [[VSET_LANE]]
 uint32x2_t test_vset_lane_u32(uint32_t a, uint32x2_t b) {
-  // CHECK-LABEL: test_vset_lane_u32:
-  // CHECK-NEXT:  ins.s v0[1], w0
-  // CHECK-NEXT:  ret
   return vset_lane_u32(a, b, 1);
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vset_lane_s8(i8 %a, <8 x i8> %b) #0 {
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <8 x i8> %b, i8 %a, i32 7
+// CHECK:   ret <8 x i8> [[VSET_LANE]]
 int8x8_t test_vset_lane_s8(int8_t a, int8x8_t b) {
-  // CHECK-LABEL: test_vset_lane_s8:
-  // CHECK-NEXT:  ins.b v0[7], w0
-  // CHECK-NEXT:  ret
   return vset_lane_s8(a, b, 7);
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vset_lane_s16(i16 %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP1]], i16 %a, i32 3
+// CHECK:   ret <4 x i16> [[VSET_LANE]]
 int16x4_t test_vset_lane_s16(int16_t a, int16x4_t b) {
-  // CHECK-LABEL: test_vset_lane_s16:
-  // CHECK-NEXT:  ins.h v0[3], w0
-  // CHECK-NEXT:  ret
   return vset_lane_s16(a, b, 3);
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vset_lane_s32(i32 %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <2 x i32> [[TMP1]], i32 %a, i32 1
+// CHECK:   ret <2 x i32> [[VSET_LANE]]
 int32x2_t test_vset_lane_s32(int32_t a, int32x2_t b) {
-  // CHECK-LABEL: test_vset_lane_s32:
-  // CHECK-NEXT:  ins.s v0[1], w0
-  // CHECK-NEXT:  ret
   return vset_lane_s32(a, b, 1);
 }
 
+// CHECK-LABEL: define <8 x i8> @test_vset_lane_p8(i8 %a, <8 x i8> %b) #0 {
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <8 x i8> %b, i8 %a, i32 7
+// CHECK:   ret <8 x i8> [[VSET_LANE]]
 poly8x8_t test_vset_lane_p8(poly8_t a, poly8x8_t b) {
-  // CHECK-LABEL: test_vset_lane_p8:
-  // CHECK-NEXT:  ins.b v0[7], w0
-  // CHECK-NEXT:  ret
   return vset_lane_p8(a, b, 7);
 }
 
+// CHECK-LABEL: define <4 x i16> @test_vset_lane_p16(i16 %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP1]], i16 %a, i32 3
+// CHECK:   ret <4 x i16> [[VSET_LANE]]
 poly16x4_t test_vset_lane_p16(poly16_t a, poly16x4_t b) {
-  // CHECK-LABEL: test_vset_lane_p16:
-  // CHECK-NEXT:  ins.h v0[3], w0
-  // CHECK-NEXT:  ret
   return vset_lane_p16(a, b, 3);
 }
 
+// CHECK-LABEL: define <2 x float> @test_vset_lane_f32(float %a, <2 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <2 x float> [[TMP1]], float %a, i32 1
+// CHECK:   ret <2 x float> [[VSET_LANE]]
 float32x2_t test_vset_lane_f32(float32_t a, float32x2_t b) {
-  // CHECK-LABEL: test_vset_lane_f32:
-  // CHECK-NEXT:  ins.s v1[1], v0[0]
-  // CHECK-NEXT:  mov.16b  v0, v1
-  // CHECK-NEXT:  ret
   return vset_lane_f32(a, b, 1);
 }
 
+// CHECK-LABEL: define <4 x half> @test_vset_lane_f16(half* %a, <4 x half> %b) #0 {
+// CHECK:   [[__REINT_246:%.*]] = alloca half, align 2
+// CHECK:   [[__REINT1_246:%.*]] = alloca <4 x half>, align 8
+// CHECK:   [[__REINT2_246:%.*]] = alloca <4 x i16>, align 8
+// CHECK:   [[TMP0:%.*]] = load half, half* %a, align 2
+// CHECK:   store half [[TMP0]], half* [[__REINT_246]], align 2
+// CHECK:   store <4 x half> %b, <4 x half>* [[__REINT1_246]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast half* [[__REINT_246]] to i16*
+// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK:   [[TMP3:%.*]] = bitcast <4 x half>* [[__REINT1_246]] to <4 x i16>*
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[TMP3]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP6]], i16 [[TMP2]], i32 3
+// CHECK:   store <4 x i16> [[VSET_LANE]], <4 x i16>* [[__REINT2_246]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16>* [[__REINT2_246]] to <4 x half>*
+// CHECK:   [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[TMP7]], align 8
+// CHECK:   ret <4 x half> [[TMP8]]
 float16x4_t test_vset_lane_f16(float16_t *a, float16x4_t b) {
-  // CHECK-LABEL: test_vset_lane_f16:
-  // CHECK-NEXT:  ld1.h { v0 }[3], [x0]
-  // CHECK-NEXT:  ret
   return vset_lane_f16(*a, b, 3);
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vsetq_lane_u8(i8 %a, <16 x i8> %b) #0 {
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <16 x i8> %b, i8 %a, i32 15
+// CHECK:   ret <16 x i8> [[VSET_LANE]]
 uint8x16_t test_vsetq_lane_u8(uint8_t a, uint8x16_t b) {
-  // CHECK-LABEL: test_vsetq_lane_u8:
-  // CHECK-NEXT:  ins.b v0[15], w0
-  // CHECK-NEXT:  ret
   return vsetq_lane_u8(a, b, 15);
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vsetq_lane_u16(i16 %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP1]], i16 %a, i32 7
+// CHECK:   ret <8 x i16> [[VSET_LANE]]
 uint16x8_t test_vsetq_lane_u16(uint16_t a, uint16x8_t b) {
-  // CHECK-LABEL: test_vsetq_lane_u16:
-  // CHECK-NEXT:  ins.h v0[7], w0
-  // CHECK-NEXT:  ret
   return vsetq_lane_u16(a, b, 7);
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vsetq_lane_u32(i32 %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <4 x i32> [[TMP1]], i32 %a, i32 3
+// CHECK:   ret <4 x i32> [[VSET_LANE]]
 uint32x4_t test_vsetq_lane_u32(uint32_t a, uint32x4_t b) {
-  // CHECK-LABEL: test_vsetq_lane_u32:
-  // CHECK-NEXT:  ins.s v0[3], w0
-  // CHECK-NEXT:  ret
   return vsetq_lane_u32(a, b, 3);
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vsetq_lane_s8(i8 %a, <16 x i8> %b) #0 {
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <16 x i8> %b, i8 %a, i32 15
+// CHECK:   ret <16 x i8> [[VSET_LANE]]
 int8x16_t test_vsetq_lane_s8(int8_t a, int8x16_t b) {
-  // CHECK-LABEL: test_vsetq_lane_s8:
-  // CHECK-NEXT:  ins.b v0[15], w0
-  // CHECK-NEXT:  ret
   return vsetq_lane_s8(a, b, 15);
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vsetq_lane_s16(i16 %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP1]], i16 %a, i32 7
+// CHECK:   ret <8 x i16> [[VSET_LANE]]
 int16x8_t test_vsetq_lane_s16(int16_t a, int16x8_t b) {
-  // CHECK-LABEL: test_vsetq_lane_s16:
-  // CHECK-NEXT:  ins.h v0[7], w0
-  // CHECK-NEXT:  ret
   return vsetq_lane_s16(a, b, 7);
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vsetq_lane_s32(i32 %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <4 x i32> [[TMP1]], i32 %a, i32 3
+// CHECK:   ret <4 x i32> [[VSET_LANE]]
 int32x4_t test_vsetq_lane_s32(int32_t a, int32x4_t b) {
-  // CHECK-LABEL: test_vsetq_lane_s32:
-  // CHECK-NEXT:  ins.s v0[3], w0
-  // CHECK-NEXT:  ret
   return vsetq_lane_s32(a, b, 3);
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vsetq_lane_p8(i8 %a, <16 x i8> %b) #0 {
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <16 x i8> %b, i8 %a, i32 15
+// CHECK:   ret <16 x i8> [[VSET_LANE]]
 poly8x16_t test_vsetq_lane_p8(poly8_t a, poly8x16_t b) {
-  // CHECK-LABEL: test_vsetq_lane_p8:
-  // CHECK-NEXT:  ins.b v0[15], w0
-  // CHECK-NEXT:  ret
   return vsetq_lane_p8(a, b, 15);
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vsetq_lane_p16(i16 %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP1]], i16 %a, i32 7
+// CHECK:   ret <8 x i16> [[VSET_LANE]]
 poly16x8_t test_vsetq_lane_p16(poly16_t a, poly16x8_t b) {
-  // CHECK-LABEL: test_vsetq_lane_p16:
-  // CHECK-NEXT:  ins.h v0[7], w0
-  // CHECK-NEXT:  ret
   return vsetq_lane_p16(a, b, 7);
 }
 
+// CHECK-LABEL: define <4 x float> @test_vsetq_lane_f32(float %a, <4 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <4 x float> [[TMP1]], float %a, i32 3
+// CHECK:   ret <4 x float> [[VSET_LANE]]
 float32x4_t test_vsetq_lane_f32(float32_t a, float32x4_t b) {
-  // CHECK-LABEL: test_vsetq_lane_f32:
-  // CHECK-NEXT:  ins.s v1[3], v0[0]
-  // CHECK-NEXT:  mov.16b  v0, v1
-  // CHECK-NEXT:  ret
   return vsetq_lane_f32(a, b, 3);
 }
 
+// CHECK-LABEL: define <8 x half> @test_vsetq_lane_f16(half* %a, <8 x half> %b) #0 {
+// CHECK:   [[__REINT_248:%.*]] = alloca half, align 2
+// CHECK:   [[__REINT1_248:%.*]] = alloca <8 x half>, align 16
+// CHECK:   [[__REINT2_248:%.*]] = alloca <8 x i16>, align 16
+// CHECK:   [[TMP0:%.*]] = load half, half* %a, align 2
+// CHECK:   store half [[TMP0]], half* [[__REINT_248]], align 2
+// CHECK:   store <8 x half> %b, <8 x half>* [[__REINT1_248]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast half* [[__REINT_248]] to i16*
+// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x half>* [[__REINT1_248]] to <8 x i16>*
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[TMP3]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP6]], i16 [[TMP2]], i32 7
+// CHECK:   store <8 x i16> [[VSET_LANE]], <8 x i16>* [[__REINT2_248]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16>* [[__REINT2_248]] to <8 x half>*
+// CHECK:   [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[TMP7]], align 16
+// CHECK:   ret <8 x half> [[TMP8]]
 float16x8_t test_vsetq_lane_f16(float16_t *a, float16x8_t b) {
-  // CHECK-LABEL: test_vsetq_lane_f16:
-  // CHECK-NEXT:  ld1.h { v0 }[7], [x0]
-  // CHECK-NEXT:  ret
   return vsetq_lane_f16(*a, b, 7);
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vset_lane_s64(i64 %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <1 x i64> [[TMP1]], i64 %a, i32 0
+// CHECK:   ret <1 x i64> [[VSET_LANE]]
 int64x1_t test_vset_lane_s64(int64_t a, int64x1_t b) {
-  // CHECK-LABEL: test_vset_lane_s64:
-  // CHECK-NEXT:  fmov d0, x0
-  // CHECK-NEXT:  ret
   return vset_lane_s64(a, b, 0);
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vset_lane_u64(i64 %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <1 x i64> [[TMP1]], i64 %a, i32 0
+// CHECK:   ret <1 x i64> [[VSET_LANE]]
 uint64x1_t test_vset_lane_u64(uint64_t a, uint64x1_t b) {
-  // CHECK-LABEL: test_vset_lane_u64:
-  // CHECK-NEXT:  fmov d0, x0
-  // CHECK-NEXT:  ret
   return vset_lane_u64(a, b, 0);
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vsetq_lane_s64(i64 %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP1]], i64 %a, i32 1
+// CHECK:   ret <2 x i64> [[VSET_LANE]]
 int64x2_t test_vsetq_lane_s64(int64_t a, int64x2_t b) {
-  // CHECK-LABEL: test_vsetq_lane_s64:
-  // CHECK-NEXT:  ins.d v0[1], x0
-  // CHECK-NEXT:  ret
   return vsetq_lane_s64(a, b, 1);
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vsetq_lane_u64(i64 %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP1]], i64 %a, i32 1
+// CHECK:   ret <2 x i64> [[VSET_LANE]]
 uint64x2_t test_vsetq_lane_u64(uint64_t a, uint64x2_t b) {
-  // CHECK-LABEL: test_vsetq_lane_u64:
-  // CHECK-NEXT:  ins.d v0[1], x0
-  // CHECK-NEXT:  ret
   return vsetq_lane_u64(a, b, 1);
 }
diff --git a/test/CodeGen/aarch64-poly128.c b/test/CodeGen/aarch64-poly128.c
index eebecf74a1567..01c509035f5c7 100644
--- a/test/CodeGen/aarch64-poly128.c
+++ b/test/CodeGen/aarch64-poly128.c
@@ -1,7 +1,7 @@
 // REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN:  -ffp-contract=fast -S -O3 -o - %s | FileCheck %s --check-prefix=CHECK \
-// RUN:  --check-prefix=CHECK-ARM64
+// RUN:  -ffp-contract=fast -emit-llvm -o - %s | opt -S -mem2reg \
+// RUN:  | FileCheck %s
 
 // Test new aarch64 intrinsics with poly128
 // FIXME: Currently, poly128_t equals to uint128, which will be spilt into
@@ -12,192 +12,238 @@
 
 #include <arm_neon.h>
 
+// CHECK-LABEL: define void @test_vstrq_p128(i128* %ptr, i128 %val) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i128* %ptr to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i128*
+// CHECK:   store i128 %val, i128* [[TMP1]]
+// CHECK:   ret void
 void test_vstrq_p128(poly128_t * ptr, poly128_t val) {
-  // CHECK-LABEL: test_vstrq_p128
   vstrq_p128(ptr, val);
 
-  // CHECK-ARM64: stp {{x[0-9]+}}, {{x[0-9]+}}, [x0]
 }
 
+// CHECK-LABEL: define i128 @test_vldrq_p128(i128* %ptr) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i128* %ptr to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i128*
+// CHECK:   [[TMP2:%.*]] = load i128, i128* [[TMP1]]
+// CHECK:   ret i128 [[TMP2]]
 poly128_t test_vldrq_p128(poly128_t * ptr) {
-  // CHECK-LABEL: test_vldrq_p128
   return vldrq_p128(ptr);
 
-  // CHECK-ARM64: ldp {{x[0-9]+}}, {{x[0-9]+}}, [x0]
 }
 
+// CHECK-LABEL: define void @test_ld_st_p128(i128* %ptr) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i128* %ptr to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i128*
+// CHECK:   [[TMP2:%.*]] = load i128, i128* [[TMP1]]
+// CHECK:   [[ADD_PTR:%.*]] = getelementptr inbounds i128, i128* %ptr, i64 1
+// CHECK:   [[TMP3:%.*]] = bitcast i128* [[ADD_PTR]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP3]] to i128*
+// CHECK:   store i128 [[TMP2]], i128* [[TMP4]]
+// CHECK:   ret void
 void test_ld_st_p128(poly128_t * ptr) {
-  // CHECK-LABEL: test_ld_st_p128
    vstrq_p128(ptr+1, vldrq_p128(ptr));
 
- // CHECK-ARM64: ldp [[PLO:x[0-9]+]], [[PHI:x[0-9]+]], [{{x[0-9]+}}]
- // CHECK-ARM64-NEXT: stp [[PLO]], [[PHI]], [{{x[0-9]+}}, #16]
 }
 
+// CHECK-LABEL: define i128 @test_vmull_p64(i64 %a, i64 %b) #0 {
+// CHECK:   [[VMULL_P64_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %a, i64 %b) #2
+// CHECK:   [[VMULL_P641_I:%.*]] = bitcast <16 x i8> [[VMULL_P64_I]] to i128
+// CHECK:   ret i128 [[VMULL_P641_I]]
 poly128_t test_vmull_p64(poly64_t a, poly64_t b) {
-  // CHECK-LABEL: test_vmull_p64
   return vmull_p64(a, b);
-  // CHECK: pmull {{v[0-9]+}}.1q, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d
 }
 
+// CHECK-LABEL: define i128 @test_vmull_high_p64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[SHUFFLE_I_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> <i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> [[SHUFFLE_I_I]] to i64
+// CHECK:   [[SHUFFLE_I7_I:%.*]] = shufflevector <2 x i64> %b, <2 x i64> %b, <1 x i32> <i32 1>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> [[SHUFFLE_I7_I]] to i64
+// CHECK:   [[VMULL_P64_I_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.pmull64(i64 [[TMP0]], i64 [[TMP1]]) #2
+// CHECK:   [[VMULL_P641_I_I:%.*]] = bitcast <16 x i8> [[VMULL_P64_I_I]] to i128
+// CHECK:   ret i128 [[VMULL_P641_I_I]]
 poly128_t test_vmull_high_p64(poly64x2_t a, poly64x2_t b) {
-  // CHECK-LABEL: test_vmull_high_p64
   return vmull_high_p64(a, b);
-  // CHECK: pmull2 {{v[0-9]+}}.1q, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
-// CHECK-LABEL: test_vreinterpretq_p128_s8
-// CHECK: ret
+// CHECK-LABEL: define i128 @test_vreinterpretq_p128_s8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to i128
+// CHECK:   ret i128 [[TMP0]]
 poly128_t test_vreinterpretq_p128_s8(int8x16_t a) {
   return vreinterpretq_p128_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p128_s16
-// CHECK: ret
+// CHECK-LABEL: define i128 @test_vreinterpretq_p128_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to i128
+// CHECK:   ret i128 [[TMP0]]
 poly128_t test_vreinterpretq_p128_s16(int16x8_t a) {
   return vreinterpretq_p128_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p128_s32
-// CHECK: ret
+// CHECK-LABEL: define i128 @test_vreinterpretq_p128_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to i128
+// CHECK:   ret i128 [[TMP0]]
 poly128_t test_vreinterpretq_p128_s32(int32x4_t a) {
   return vreinterpretq_p128_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p128_s64
-// CHECK: ret
+// CHECK-LABEL: define i128 @test_vreinterpretq_p128_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to i128
+// CHECK:   ret i128 [[TMP0]]
 poly128_t test_vreinterpretq_p128_s64(int64x2_t a) {
   return vreinterpretq_p128_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p128_u8
-// CHECK: ret
+// CHECK-LABEL: define i128 @test_vreinterpretq_p128_u8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to i128
+// CHECK:   ret i128 [[TMP0]]
 poly128_t test_vreinterpretq_p128_u8(uint8x16_t a) {
   return vreinterpretq_p128_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p128_u16
-// CHECK: ret
+// CHECK-LABEL: define i128 @test_vreinterpretq_p128_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to i128
+// CHECK:   ret i128 [[TMP0]]
 poly128_t test_vreinterpretq_p128_u16(uint16x8_t a) {
   return vreinterpretq_p128_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p128_u32
-// CHECK: ret
+// CHECK-LABEL: define i128 @test_vreinterpretq_p128_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to i128
+// CHECK:   ret i128 [[TMP0]]
 poly128_t test_vreinterpretq_p128_u32(uint32x4_t a) {
   return vreinterpretq_p128_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p128_u64
-// CHECK: ret
+// CHECK-LABEL: define i128 @test_vreinterpretq_p128_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to i128
+// CHECK:   ret i128 [[TMP0]]
 poly128_t test_vreinterpretq_p128_u64(uint64x2_t a) {
   return vreinterpretq_p128_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p128_f32
-// CHECK: ret
+// CHECK-LABEL: define i128 @test_vreinterpretq_p128_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to i128
+// CHECK:   ret i128 [[TMP0]]
 poly128_t test_vreinterpretq_p128_f32(float32x4_t a) {
   return vreinterpretq_p128_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p128_f64
-// CHECK: ret
+// CHECK-LABEL: define i128 @test_vreinterpretq_p128_f64(<2 x double> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to i128
+// CHECK:   ret i128 [[TMP0]]
 poly128_t test_vreinterpretq_p128_f64(float64x2_t a) {
   return vreinterpretq_p128_f64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p128_p8
-// CHECK: ret
+// CHECK-LABEL: define i128 @test_vreinterpretq_p128_p8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to i128
+// CHECK:   ret i128 [[TMP0]]
 poly128_t test_vreinterpretq_p128_p8(poly8x16_t a) {
   return vreinterpretq_p128_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p128_p16
-// CHECK: ret
+// CHECK-LABEL: define i128 @test_vreinterpretq_p128_p16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to i128
+// CHECK:   ret i128 [[TMP0]]
 poly128_t test_vreinterpretq_p128_p16(poly16x8_t a) {
   return vreinterpretq_p128_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p128_p64
-// CHECK: ret
+// CHECK-LABEL: define i128 @test_vreinterpretq_p128_p64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to i128
+// CHECK:   ret i128 [[TMP0]]
 poly128_t test_vreinterpretq_p128_p64(poly64x2_t a) {
   return vreinterpretq_p128_p64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_p128
-// CHECK: ret
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_p128(i128 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i128 %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vreinterpretq_s8_p128(poly128_t a) {
   return vreinterpretq_s8_p128(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_p128
-// CHECK: ret
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_p128(i128 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i128 %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 int16x8_t test_vreinterpretq_s16_p128(poly128_t  a) {
   return vreinterpretq_s16_p128(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_p128
-// CHECK: ret
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_p128(i128 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i128 %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 int32x4_t test_vreinterpretq_s32_p128(poly128_t a) {
   return vreinterpretq_s32_p128(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_p128
-// CHECK: ret
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_p128(i128 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i128 %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 int64x2_t test_vreinterpretq_s64_p128(poly128_t  a) {
   return vreinterpretq_s64_p128(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_p128
-// CHECK: ret
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_p128(i128 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i128 %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 uint8x16_t test_vreinterpretq_u8_p128(poly128_t  a) {
   return vreinterpretq_u8_p128(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_p128
-// CHECK: ret
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_p128(i128 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i128 %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 uint16x8_t test_vreinterpretq_u16_p128(poly128_t  a) {
   return vreinterpretq_u16_p128(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_p128
-// CHECK: ret
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_p128(i128 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i128 %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 uint32x4_t test_vreinterpretq_u32_p128(poly128_t  a) {
   return vreinterpretq_u32_p128(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_p128
-// CHECK: ret
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_p128(i128 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i128 %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 uint64x2_t test_vreinterpretq_u64_p128(poly128_t  a) {
   return vreinterpretq_u64_p128(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_p128
-// CHECK: ret
+// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_p128(i128 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i128 %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_p128(poly128_t  a) {
   return vreinterpretq_f32_p128(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f64_p128
-// CHECK: ret
+// CHECK-LABEL: define <2 x double> @test_vreinterpretq_f64_p128(i128 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i128 %a to <2 x double>
+// CHECK:   ret <2 x double> [[TMP0]]
 float64x2_t test_vreinterpretq_f64_p128(poly128_t  a) {
   return vreinterpretq_f64_p128(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_p128
-// CHECK: ret
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_p128(i128 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i128 %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 poly8x16_t test_vreinterpretq_p8_p128(poly128_t  a) {
   return vreinterpretq_p8_p128(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_p128
-// CHECK: ret
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_p128(i128 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i128 %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 poly16x8_t test_vreinterpretq_p16_p128(poly128_t  a) {
   return vreinterpretq_p16_p128(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p64_p128
-// CHECK: ret
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_p64_p128(i128 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i128 %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 poly64x2_t test_vreinterpretq_p64_p128(poly128_t  a) {
   return vreinterpretq_p64_p128(a);
 }
diff --git a/test/CodeGen/aarch64-poly64.c b/test/CodeGen/aarch64-poly64.c
index 6ea3a2c0d8bd3..762ca94e5e575 100644
--- a/test/CodeGen/aarch64-poly64.c
+++ b/test/CodeGen/aarch64-poly64.c
@@ -1,299 +1,634 @@
-// FIXME: This is a front-end test that depends on LLVM optimizations (-O3). 
-// It should be split into separate files for front/middle/back-end testing.
-
-// REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon \
-// RUN:  -ffp-contract=fast -S -O3 -o - %s | FileCheck %s --check-prefix=CHECK \
-// RUN:  --check-prefix=CHECK-ARM64
+// RUN:  -ffp-contract=fast -emit-llvm -o - %s | opt -S -mem2reg \
+// RUN:  | FileCheck %s
 
 // Test new aarch64 intrinsics with poly64
 
 #include <arm_neon.h>
 
+// CHECK-LABEL: define <1 x i64> @test_vceq_p64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp eq <1 x i64> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <1 x i1> [[CMP_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[SEXT_I]]
 uint64x1_t test_vceq_p64(poly64x1_t a, poly64x1_t b) {
-  // CHECK-LABEL: test_vceq_p64
   return vceq_p64(a, b);
-  // CHECK: cmeq {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vceqq_p64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp eq <2 x i64> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[SEXT_I]]
 uint64x2_t test_vceqq_p64(poly64x2_t a, poly64x2_t b) {
-  // CHECK-LABEL: test_vceqq_p64
   return vceqq_p64(a, b);
-  // CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vtst_p64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[TMP4:%.*]] = and <1 x i64> [[TMP2]], [[TMP3]]
+// CHECK:   [[TMP5:%.*]] = icmp ne <1 x i64> [[TMP4]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <1 x i1> [[TMP5]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[VTST_I]]
 uint64x1_t test_vtst_p64(poly64x1_t a, poly64x1_t b) {
-  // CHECK-LABEL: test_vtst_p64
   return vtst_p64(a, b);
-  // CHECK: cmtst {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vtstq_p64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[TMP4:%.*]] = and <2 x i64> [[TMP2]], [[TMP3]]
+// CHECK:   [[TMP5:%.*]] = icmp ne <2 x i64> [[TMP4]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <2 x i1> [[TMP5]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VTST_I]]
 uint64x2_t test_vtstq_p64(poly64x2_t a, poly64x2_t b) {
-  // CHECK-LABEL: test_vtstq_p64
   return vtstq_p64(a, b);
-  // CHECK: cmtst {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vbsl_p64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <1 x i64> %c to <8 x i8>
+// CHECK:   [[VBSL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VBSL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VBSL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64>
+// CHECK:   [[VBSL3_I:%.*]] = and <1 x i64> [[VBSL_I]], [[VBSL1_I]]
+// CHECK:   [[TMP3:%.*]] = xor <1 x i64> [[VBSL_I]], <i64 -1>
+// CHECK:   [[VBSL4_I:%.*]] = and <1 x i64> [[TMP3]], [[VBSL2_I]]
+// CHECK:   [[VBSL5_I:%.*]] = or <1 x i64> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK:   ret <1 x i64> [[VBSL5_I]]
 poly64x1_t test_vbsl_p64(poly64x1_t a, poly64x1_t b, poly64x1_t c) {
-  // CHECK-LABEL: test_vbsl_p64
   return vbsl_p64(a, b, c);
-  // CHECK: bsl {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vbslq_p64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i64> %c to <16 x i8>
+// CHECK:   [[VBSL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VBSL1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VBSL2_I:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+// CHECK:   [[VBSL3_I:%.*]] = and <2 x i64> [[VBSL_I]], [[VBSL1_I]]
+// CHECK:   [[TMP3:%.*]] = xor <2 x i64> [[VBSL_I]], <i64 -1, i64 -1>
+// CHECK:   [[VBSL4_I:%.*]] = and <2 x i64> [[TMP3]], [[VBSL2_I]]
+// CHECK:   [[VBSL5_I:%.*]] = or <2 x i64> [[VBSL3_I]], [[VBSL4_I]]
+// CHECK:   ret <2 x i64> [[VBSL5_I]]
 poly64x2_t test_vbslq_p64(poly64x2_t a, poly64x2_t b, poly64x2_t c) {
-  // CHECK-LABEL: test_vbslq_p64
   return vbslq_p64(a, b, c);
-  // CHECK: bsl {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
 }
 
+// CHECK-LABEL: define i64 @test_vget_lane_p64(<1 x i64> %v) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %v to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
+// CHECK:   ret i64 [[VGET_LANE]]
 poly64_t test_vget_lane_p64(poly64x1_t v) {
-  // CHECK-LABEL: test_vget_lane_p64
   return vget_lane_p64(v, 0);
-  // CHECK: fmov  {{x[0-9]+}}, {{d[0-9]+}}
 }
 
+// CHECK-LABEL: define i64 @test_vgetq_lane_p64(<2 x i64> %v) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %v to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
+// CHECK:   ret i64 [[VGETQ_LANE]]
 poly64_t test_vgetq_lane_p64(poly64x2_t v) {
-  // CHECK-LABEL: test_vgetq_lane_p64
   return vgetq_lane_p64(v, 1);
-  // CHECK: {{mov|umov}}  {{x[0-9]+}}, {{v[0-9]+}}.d[1]
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vset_lane_p64(i64 %a, <1 x i64> %v) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %v to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <1 x i64> [[TMP1]], i64 %a, i32 0
+// CHECK:   ret <1 x i64> [[VSET_LANE]]
 poly64x1_t test_vset_lane_p64(poly64_t a, poly64x1_t v) {
-  // CHECK-LABEL: test_vset_lane_p64
   return vset_lane_p64(a, v, 0);
-  // CHECK: fmov  {{d[0-9]+}}, {{x[0-9]+}}
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vsetq_lane_p64(i64 %a, <2 x i64> %v) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %v to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP1]], i64 %a, i32 1
+// CHECK:   ret <2 x i64> [[VSET_LANE]]
 poly64x2_t test_vsetq_lane_p64(poly64_t a, poly64x2_t v) {
-  // CHECK-LABEL: test_vsetq_lane_p64
   return vsetq_lane_p64(a, v, 1);
-  // CHECK: ins  {{v[0-9]+}}.d[1], {{x[0-9]+}}
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vcopy_lane_p64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
+// CHECK:   [[TMP2:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <1 x i64>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <1 x i64> [[TMP3]], i64 [[VGET_LANE]], i32 0
+// CHECK:   ret <1 x i64> [[VSET_LANE]]
 poly64x1_t test_vcopy_lane_p64(poly64x1_t a, poly64x1_t b) {
-  // CHECK-LABEL: test_vcopy_lane_p64
   return vcopy_lane_p64(a, 0, b, 0);
 
-  // CHECK-ARM64: mov v0.16b, v1.16b
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vcopyq_lane_p64(<2 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[VGET_LANE]], i32 1
+// CHECK:   ret <2 x i64> [[VSET_LANE]]
 poly64x2_t test_vcopyq_lane_p64(poly64x2_t a, poly64x1_t b) {
-  // CHECK-LABEL: test_vcopyq_lane_p64
   return vcopyq_lane_p64(a, 1, b, 0);
-  // CHECK: zip1 v0.2d, v0.2d, v1.2d
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vcopyq_laneq_p64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[VGETQ_LANE]], i32 1
+// CHECK:   ret <2 x i64> [[VSET_LANE]]
 poly64x2_t test_vcopyq_laneq_p64(poly64x2_t a, poly64x2_t b) {
-  // CHECK-LABEL: test_vcopyq_laneq_p64
   return vcopyq_laneq_p64(a, 1, b, 1);
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vcreate_p64(i64 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 poly64x1_t test_vcreate_p64(uint64_t a) {
-  // CHECK-LABEL: test_vcreate_p64
   return vcreate_p64(a);
-  // CHECK: fmov  {{d[0-9]+}}, {{x[0-9]+}}
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vdup_n_p64(i64 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <1 x i64> undef, i64 %a, i32 0
+// CHECK:   ret <1 x i64> [[VECINIT_I]]
 poly64x1_t test_vdup_n_p64(poly64_t a) {
-  // CHECK-LABEL: test_vdup_n_p64
   return vdup_n_p64(a);
-  // CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}}
 }
+// CHECK-LABEL: define <2 x i64> @test_vdupq_n_p64(i64 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i64> undef, i64 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1
+// CHECK:   ret <2 x i64> [[VECINIT1_I]]
 poly64x2_t test_vdupq_n_p64(poly64_t a) {
-  // CHECK-LABEL: test_vdupq_n_p64
   return vdupq_n_p64(a);
-  // CHECK: dup {{v[0-9]+}}.2d, {{x[0-9]+}}
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vmov_n_p64(i64 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <1 x i64> undef, i64 %a, i32 0
+// CHECK:   ret <1 x i64> [[VECINIT_I]]
 poly64x1_t test_vmov_n_p64(poly64_t a) {
-  // CHECK-LABEL: test_vmov_n_p64
   return vmov_n_p64(a);
-  // CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}}
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vmovq_n_p64(i64 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i64> undef, i64 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1
+// CHECK:   ret <2 x i64> [[VECINIT1_I]]
 poly64x2_t test_vmovq_n_p64(poly64_t a) {
-  // CHECK-LABEL: test_vmovq_n_p64
   return vmovq_n_p64(a);
-  // CHECK: dup {{v[0-9]+}}.2d, {{x[0-9]+}}
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vdup_lane_p64(<1 x i64> %vec) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x i64> %vec, <1 x i64> %vec, <1 x i32> zeroinitializer
+// CHECK:   ret <1 x i64> [[SHUFFLE]]
 poly64x1_t test_vdup_lane_p64(poly64x1_t vec) {
-  // CHECK-LABEL: test_vdup_lane_p64
   return vdup_lane_p64(vec, 0);
-  // CHECK: ret
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vdupq_lane_p64(<1 x i64> %vec) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x i64> %vec, <1 x i64> %vec, <2 x i32> zeroinitializer
+// CHECK:   ret <2 x i64> [[SHUFFLE]]
 poly64x2_t test_vdupq_lane_p64(poly64x1_t vec) {
-  // CHECK-LABEL: test_vdupq_lane_p64
   return vdupq_lane_p64(vec, 0);
-  // CHECK: dup {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vdupq_laneq_p64(<2 x i64> %vec) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i64> %vec, <2 x i64> %vec, <2 x i32> <i32 1, i32 1>
+// CHECK:   ret <2 x i64> [[SHUFFLE]]
 poly64x2_t test_vdupq_laneq_p64(poly64x2_t vec) {
-  // CHECK-LABEL: test_vdupq_laneq_p64
   return vdupq_laneq_p64(vec, 1);
-  // CHECK: dup {{v[0-9]+}}.2d, {{v[0-9]+}}.d[1]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vcombine_p64(<1 x i64> %low, <1 x i64> %high) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> %low, <1 x i64> %high, <2 x i32> <i32 0, i32 1>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 poly64x2_t test_vcombine_p64(poly64x1_t low, poly64x1_t high) {
-  // CHECK-LABEL: test_vcombine_p64
   return vcombine_p64(low, high);
-  // CHECK: ins {{v[0-9]+}}.d[1], {{v[0-9]+}}.d[0]
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vld1_p64(i64* %ptr) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %ptr to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <1 x i64>*
+// CHECK:   [[TMP2:%.*]] = load <1 x i64>, <1 x i64>* [[TMP1]]
+// CHECK:   ret <1 x i64> [[TMP2]]
 poly64x1_t test_vld1_p64(poly64_t const * ptr) {
-  // CHECK-LABEL: test_vld1_p64
   return vld1_p64(ptr);
-  // CHECK-ARM64: ldr {{d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vld1q_p64(i64* %ptr) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %ptr to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <2 x i64>*
+// CHECK:   [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]]
+// CHECK:   ret <2 x i64> [[TMP2]]
 poly64x2_t test_vld1q_p64(poly64_t const * ptr) {
-  // CHECK-LABEL: test_vld1q_p64
   return vld1q_p64(ptr);
-  // CHECK-ARM64: ldr {{q[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1_p64(i64* %ptr, <1 x i64> %val) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %ptr to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %val to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <1 x i64>*
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   store <1 x i64> [[TMP3]], <1 x i64>* [[TMP2]]
+// CHECK:   ret void
 void test_vst1_p64(poly64_t * ptr, poly64x1_t val) {
-  // CHECK-LABEL: test_vst1_p64
   return vst1_p64(ptr, val);
-  // CHECK-ARM64: str {{d[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst1q_p64(i64* %ptr, <2 x i64> %val) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %ptr to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %val to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to <2 x i64>*
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   store <2 x i64> [[TMP3]], <2 x i64>* [[TMP2]]
+// CHECK:   ret void
 void test_vst1q_p64(poly64_t * ptr, poly64x2_t val) {
-  // CHECK-LABEL: test_vst1q_p64
   return vst1q_p64(ptr, val);
-  // CHECK-ARM64: str {{q[0-9]+}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly64x1x2_t @test_vld2_p64(i64* %ptr) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x1x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x1x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %ptr to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>*
+// CHECK:   [[VLD2:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld2.v1i64.p0v1i64(<1 x i64>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD2]], { <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x1x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x1x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly64x1x2_t [[TMP6]]
 poly64x1x2_t test_vld2_p64(poly64_t const * ptr) {
-  // CHECK-LABEL: test_vld2_p64
   return vld2_p64(ptr);
-  // CHECK: ld1 {{{ *v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly64x2x2_t @test_vld2q_p64(i64* %ptr) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x2x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x2x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %ptr to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i64>*
+// CHECK:   [[VLD2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0v2i64(<2 x i64>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64> } [[VLD2]], { <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x2x2_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly64x2x2_t [[TMP6]]
 poly64x2x2_t test_vld2q_p64(poly64_t const * ptr) {
-  // CHECK-LABEL: test_vld2q_p64
   return vld2q_p64(ptr);
-  // CHECK: ld2 {{{ *v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly64x1x3_t @test_vld3_p64(i64* %ptr) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x1x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x1x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %ptr to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>*
+// CHECK:   [[VLD3:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld3.v1i64.p0v1i64(<1 x i64>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x1x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x1x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly64x1x3_t [[TMP6]]
 poly64x1x3_t test_vld3_p64(poly64_t const * ptr) {
-  // CHECK-LABEL: test_vld3_p64
   return vld3_p64(ptr);
-  // CHECK: ld1 {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly64x2x3_t @test_vld3q_p64(i64* %ptr) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x2x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x2x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %ptr to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i64>*
+// CHECK:   [[VLD3:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld3.v2i64.p0v2i64(<2 x i64>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64> } [[VLD3]], { <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x2x3_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly64x2x3_t [[TMP6]]
 poly64x2x3_t test_vld3q_p64(poly64_t const * ptr) {
-  // CHECK-LABEL: test_vld3q_p64
   return vld3q_p64(ptr);
-  // CHECK: ld3 {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly64x1x4_t @test_vld4_p64(i64* %ptr) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x1x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x1x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %ptr to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <1 x i64>*
+// CHECK:   [[VLD4:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.aarch64.neon.ld4.v1i64.p0v1i64(<1 x i64>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x1x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x1x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[RETVAL]], align 8
+// CHECK:   ret %struct.poly64x1x4_t [[TMP6]]
 poly64x1x4_t test_vld4_p64(poly64_t const * ptr) {
-  // CHECK-LABEL: test_vld4_p64
   return vld4_p64(ptr);
-  // CHECK: ld1 {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define %struct.poly64x2x4_t @test_vld4q_p64(i64* %ptr) #0 {
+// CHECK:   [[RETVAL:%.*]] = alloca %struct.poly64x2x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly64x2x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %ptr to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <2 x i64>*
+// CHECK:   [[VLD4:%.*]] = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld4.v2i64.p0v2i64(<2 x i64>* [[TMP2]])
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }*
+// CHECK:   store { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } [[VLD4]], { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* [[TMP3]]
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly64x2x4_t* [[RETVAL]] to i8*
+// CHECK:   [[TMP5:%.*]] = bitcast %struct.poly64x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP4]], i8* [[TMP5]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP6:%.*]] = load %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[RETVAL]], align 16
+// CHECK:   ret %struct.poly64x2x4_t [[TMP6]]
 poly64x2x4_t test_vld4q_p64(poly64_t const * ptr) {
-  // CHECK-LABEL: test_vld4q_p64
   return vld4q_p64(ptr);
-  // CHECK: ld4 {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2_p64(i64* %ptr, [2 x <1 x i64>] %val.coerce) #0 {
+// CHECK:   [[VAL:%.*]] = alloca %struct.poly64x1x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x1x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[VAL]], i32 0, i32 0
+// CHECK:   store [2 x <1 x i64>] [[VAL]].coerce, [2 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x1x2_t* [[VAL]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 16, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %ptr to i8*
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL2:%.*]] = getelementptr inbounds %struct.poly64x1x2_t, %struct.poly64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX3:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL2]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX3]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st2.v1i64.p0i8(<1 x i64> [[TMP7]], <1 x i64> [[TMP8]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2_p64(poly64_t * ptr, poly64x1x2_t val) {
-  // CHECK-LABEL: test_vst2_p64
   return vst2_p64(ptr, val);
-  // CHECK:  st1 {{{ *v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst2q_p64(i64* %ptr, [2 x <2 x i64>] %val.coerce) #0 {
+// CHECK:   [[VAL:%.*]] = alloca %struct.poly64x2x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x2x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[VAL]], i32 0, i32 0
+// CHECK:   store [2 x <2 x i64>] [[VAL]].coerce, [2 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x2x2_t* [[VAL]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %ptr to i8*
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL1]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL2:%.*]] = getelementptr inbounds %struct.poly64x2x2_t, %struct.poly64x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX3:%.*]] = getelementptr inbounds [2 x <2 x i64>], [2 x <2 x i64>]* [[VAL2]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX3]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[TMP7:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st2.v2i64.p0i8(<2 x i64> [[TMP7]], <2 x i64> [[TMP8]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst2q_p64(poly64_t * ptr, poly64x2x2_t val) {
-  // CHECK-LABEL: test_vst2q_p64
   return vst2q_p64(ptr, val);
-  // CHECK:  st2 {{{ *v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3_p64(i64* %ptr, [3 x <1 x i64>] %val.coerce) #0 {
+// CHECK:   [[VAL:%.*]] = alloca %struct.poly64x1x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x1x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[VAL]], i32 0, i32 0
+// CHECK:   store [3 x <1 x i64>] [[VAL]].coerce, [3 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x1x3_t* [[VAL]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 24, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %ptr to i8*
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL2:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX3:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL2]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX3]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL4:%.*]] = getelementptr inbounds %struct.poly64x1x3_t, %struct.poly64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX5:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL4]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX5]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st3.v1i64.p0i8(<1 x i64> [[TMP9]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3_p64(poly64_t * ptr, poly64x1x3_t val) {
-  // CHECK-LABEL: test_vst3_p64
   return vst3_p64(ptr, val);
-  // CHECK:  st1 {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst3q_p64(i64* %ptr, [3 x <2 x i64>] %val.coerce) #0 {
+// CHECK:   [[VAL:%.*]] = alloca %struct.poly64x2x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x2x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[VAL]], i32 0, i32 0
+// CHECK:   store [3 x <2 x i64>] [[VAL]].coerce, [3 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x2x3_t* [[VAL]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 48, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %ptr to i8*
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL1]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL2:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX3:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL2]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX3]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL4:%.*]] = getelementptr inbounds %struct.poly64x2x3_t, %struct.poly64x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX5:%.*]] = getelementptr inbounds [3 x <2 x i64>], [3 x <2 x i64>]* [[VAL4]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX5]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st3.v2i64.p0i8(<2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <2 x i64> [[TMP11]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst3q_p64(poly64_t * ptr, poly64x2x3_t val) {
-  // CHECK-LABEL: test_vst3q_p64
   return vst3q_p64(ptr, val);
-  // CHECK:  st3 {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4_p64(i64* %ptr, [4 x <1 x i64>] %val.coerce) #0 {
+// CHECK:   [[VAL:%.*]] = alloca %struct.poly64x1x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x1x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[VAL]], i32 0, i32 0
+// CHECK:   store [4 x <1 x i64>] [[VAL]].coerce, [4 x <1 x i64>]* [[COERCE_DIVE]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x1x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x1x4_t* [[VAL]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 32, i32 8, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %ptr to i8*
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to <8 x i8>
+// CHECK:   [[VAL2:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX3:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL2]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX3]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL4:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX5:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL4]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX5]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <1 x i64> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL6:%.*]] = getelementptr inbounds %struct.poly64x1x4_t, %struct.poly64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX7:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL6]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX7]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <1 x i64> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP4]] to <1 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP6]] to <1 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP8]] to <1 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP10]] to <1 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st4.v1i64.p0i8(<1 x i64> [[TMP11]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4_p64(poly64_t * ptr, poly64x1x4_t val) {
-  // CHECK-LABEL: test_vst4_p64
   return vst4_p64(ptr, val);
-  // CHECK:  st1 {{{ *v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d, v[0-9]+.1d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define void @test_vst4q_p64(i64* %ptr, [4 x <2 x i64>] %val.coerce) #0 {
+// CHECK:   [[VAL:%.*]] = alloca %struct.poly64x2x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly64x2x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[VAL]], i32 0, i32 0
+// CHECK:   store [4 x <2 x i64>] [[VAL]].coerce, [4 x <2 x i64>]* [[COERCE_DIVE]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly64x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly64x2x4_t* [[VAL]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[TMP0]], i8* [[TMP1]], i64 64, i32 16, i1 false)
+// CHECK:   [[TMP2:%.*]] = bitcast i64* %ptr to i8*
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL1]], i64 0, i64 0
+// CHECK:   [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = bitcast <2 x i64> [[TMP3]] to <16 x i8>
+// CHECK:   [[VAL2:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX3:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL2]], i64 0, i64 1
+// CHECK:   [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX3]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i64> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL4:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX5:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL4]], i64 0, i64 2
+// CHECK:   [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX5]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i64> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL6:%.*]] = getelementptr inbounds %struct.poly64x2x4_t, %struct.poly64x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX7:%.*]] = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* [[VAL6]], i64 0, i64 3
+// CHECK:   [[TMP9:%.*]] = load <2 x i64>, <2 x i64>* [[ARRAYIDX7]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP4]] to <2 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP6]] to <2 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP8]] to <2 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP10]] to <2 x i64>
+// CHECK:   call void @llvm.aarch64.neon.st4.v2i64.p0i8(<2 x i64> [[TMP11]], <2 x i64> [[TMP12]], <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], i8* [[TMP2]])
+// CHECK:   ret void
 void test_vst4q_p64(poly64_t * ptr, poly64x2x4_t val) {
-  // CHECK-LABEL: test_vst4q_p64
   return vst4q_p64(ptr, val);
-  // CHECK:  st4 {{{ *v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d *}}}, [{{x[0-9]+|sp}}]
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vext_p64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK:   ret <1 x i64> [[VEXT]]
 poly64x1_t test_vext_p64(poly64x1_t a, poly64x1_t b) {
-  // CHECK-LABEL: test_vext_p64
   return vext_u64(a, b, 0);
 
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vextq_p64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32> <i32 1, i32 2>
+// CHECK:   ret <2 x i64> [[VEXT]]
 poly64x2_t test_vextq_p64(poly64x2_t a, poly64x2_t b) {
-  // CHECK-LABEL: test_vextq_p64
   return vextq_p64(a, b, 1);
-  // CHECK: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{#0x8|#8}}
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vzip1q_p64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 poly64x2_t test_vzip1q_p64(poly64x2_t a, poly64x2_t b) {
-  // CHECK-LABEL: test_vzip1q_p64
   return vzip1q_p64(a, b);
-  // CHECK-ARM64: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vzip2q_p64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 poly64x2_t test_vzip2q_p64(poly64x2_t a, poly64x2_t b) {
-  // CHECK-LABEL: test_vzip2q_p64
   return vzip2q_u64(a, b);
-  // CHECK-ARM64: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vuzp1q_p64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 poly64x2_t test_vuzp1q_p64(poly64x2_t a, poly64x2_t b) {
-  // CHECK-LABEL: test_vuzp1q_p64
   return vuzp1q_p64(a, b);
-  // CHECK-ARM64: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vuzp2q_p64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 poly64x2_t test_vuzp2q_p64(poly64x2_t a, poly64x2_t b) {
-  // CHECK-LABEL: test_vuzp2q_p64
   return vuzp2q_u64(a, b);
-  // CHECK-ARM64: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vtrn1q_p64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 0, i32 2>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 poly64x2_t test_vtrn1q_p64(poly64x2_t a, poly64x2_t b) {
-  // CHECK-LABEL: test_vtrn1q_p64
   return vtrn1q_p64(a, b);
-  // CHECK-ARM64: zip1 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vtrn2q_p64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 3>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 poly64x2_t test_vtrn2q_p64(poly64x2_t a, poly64x2_t b) {
-  // CHECK-LABEL: test_vtrn2q_p64
   return vtrn2q_u64(a, b);
-  // CHECK-ARM64: zip2 {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
 }
 
+// CHECK-LABEL: define <1 x i64> @test_vsri_n_p64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VSRI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VSRI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VSRI_N2:%.*]] = call <1 x i64> @llvm.aarch64.neon.vsri.v1i64(<1 x i64> [[VSRI_N]], <1 x i64> [[VSRI_N1]], i32 33)
+// CHECK:   ret <1 x i64> [[VSRI_N2]]
 poly64x1_t test_vsri_n_p64(poly64x1_t a, poly64x1_t b) {
-  // CHECK-LABEL: test_vsri_n_p64
   return vsri_n_p64(a, b, 33);
-  // CHECK: sri {{d[0-9]+}}, {{d[0-9]+}}, #33
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vsriq_n_p64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VSRI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VSRI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSRI_N2:%.*]] = call <2 x i64> @llvm.aarch64.neon.vsri.v2i64(<2 x i64> [[VSRI_N]], <2 x i64> [[VSRI_N1]], i32 64)
+// CHECK:   ret <2 x i64> [[VSRI_N2]]
 poly64x2_t test_vsriq_n_p64(poly64x2_t a, poly64x2_t b) {
-  // CHECK-LABEL: test_vsriq_n_p64
   return vsriq_n_p64(a, b, 64);
-  // CHECK: sri {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #64
 }
 
diff --git a/test/CodeGen/aarch64-type-sizes.c b/test/CodeGen/aarch64-type-sizes.c
index 3ff8c4f0d4d03..ce8b51fc40856 100644
--- a/test/CodeGen/aarch64-type-sizes.c
+++ b/test/CodeGen/aarch64-type-sizes.c
@@ -1,8 +1,7 @@
-// RUN: %clang_cc1 -triple aarch64_be-none-linux-gnu -emit-llvm -w -o - %s | FileCheck --check-prefix=CHECK --check-prefix=CHECK-BE %s
+// RUN: %clang_cc1 -triple aarch64_be-none-linux-gnu -emit-llvm -w -o - %s | FileCheck --check-prefix=CHECK %s
 // char by definition has size 1
 
-// CHECK-LE: target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
-// CHECK-BE: target datalayout = "E-m:e-i64:64-i128:128-n32:64-S128"
+// CHECK: target datalayout = "E-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 
 int check_short() {
   return sizeof(short);
@@ -89,4 +88,3 @@ int foo() {
   return sizeof(enum Small);
 // CHECK: ret i32 4
 }
-
diff --git a/test/CodeGen/adc-builtins.c b/test/CodeGen/adc-builtins.c
index 5e5890595fb0d..0d8d6fa03476f 100644
--- a/test/CodeGen/adc-builtins.c
+++ b/test/CodeGen/adc-builtins.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown -target-feature +adx -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s | FileCheck %s
 
 #define __MM_MALLOC_H
 
diff --git a/test/CodeGen/alias.c b/test/CodeGen/alias.c
index a14bc0ecd5534..c34dcf5ca229c 100644
--- a/test/CodeGen/alias.c
+++ b/test/CodeGen/alias.c
@@ -24,20 +24,20 @@ extern const int __mod_usb_device_table __attribute__ ((alias("wacom_usb_ids")))
 // CHECKBASIC-DAG: @__mod_usb_device_table = alias i32, getelementptr inbounds ([8 x i32], [8 x i32]* @wacom_usb_ids, i32 0, i32 0)
 // CHECKASM-DAG: .globl __mod_usb_device_table
 // CHECKASM-DAG: __mod_usb_device_table = wacom_usb_ids
-// CHECKASM-DAG-NOT: .size __mod_usb_device_table
+// CHECKASM-NOT: .size __mod_usb_device_table
 
 extern int g1;
 extern int g1 __attribute((alias("g0")));
 // CHECKBASIC-DAG: @g1 = alias i32, i32* @g0
 // CHECKASM-DAG: .globl g1
 // CHECKASM-DAG: g1 = g0
-// CHECKASM-DAG-NOT: .size g1
+// CHECKASM-NOT: .size g1
 
 extern __thread int __libc_errno __attribute__ ((alias ("TL_WITH_ALIAS")));
 // CHECKBASIC-DAG: @__libc_errno = thread_local alias i32, i32* @TL_WITH_ALIAS
 // CHECKASM-DAG: .globl __libc_errno
 // CHECKASM-DAG: __libc_errno = TL_WITH_ALIAS
-// CHECKASM-DAG-NOT: .size __libc_errno
+// CHECKASM-NOT: .size __libc_errno
 
 void f0(void) { }
 extern void f1(void);
diff --git a/test/CodeGen/align_value.cpp b/test/CodeGen/align_value.cpp
index 6d0e48128cb2f..1601e3dcc570b 100644
--- a/test/CodeGen/align_value.cpp
+++ b/test/CodeGen/align_value.cpp
@@ -4,7 +4,7 @@ typedef double * __attribute__((align_value(64))) aligned_double;
 
 void foo(aligned_double x, double * y __attribute__((align_value(32))),
          double & z __attribute__((align_value(128)))) { };
-// CHECK: define void @_Z3fooPdS_Rd(double* align 64 %x, double* align 32 %y, double* dereferenceable(8) align 128 %z)
+// CHECK: define void @_Z3fooPdS_Rd(double* align 64 %x, double* align 32 %y, double* align 128 dereferenceable(8) %z)
 
 struct ad_struct {
   aligned_double a;
diff --git a/test/CodeGen/arm-bitfield-alignment.c b/test/CodeGen/arm-bitfield-alignment.c
index 66bbdae57bbc3..1c453b2e2d6ab 100644
--- a/test/CodeGen/arm-bitfield-alignment.c
+++ b/test/CodeGen/arm-bitfield-alignment.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -triple arm-none-eabi -ffreestanding -emit-llvm -o - -O3 %s | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64 -ffreestanding -emit-llvm -o - -O3 %s | FileCheck %s
+// RUN: %clang_cc1 -triple arm-none-eabi -ffreestanding -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -ffreestanding -emit-llvm -o - %s | FileCheck %s
 
 extern struct T {
   int b0 : 8;
diff --git a/test/CodeGen/arm-cc.c b/test/CodeGen/arm-cc.c
index 8e6aae78b66de..b506834d1506a 100644
--- a/test/CodeGen/arm-cc.c
+++ b/test/CodeGen/arm-cc.c
@@ -3,6 +3,9 @@
 // RUN: %clang_cc1 -triple armv7-apple-darwin9 -target-abi aapcs  -emit-llvm -w -o - %s | FileCheck -check-prefix=DARWIN-AAPCS %s
 // RUN: %clang_cc1 -triple arm-none-linux-gnueabi -target-abi apcs-gnu -emit-llvm -w -o - %s | FileCheck -check-prefix=LINUX-APCS %s
 // RUN: %clang_cc1 -triple arm-none-linux-gnueabi -target-abi aapcs  -emit-llvm -w -o - %s | FileCheck -check-prefix=LINUX-AAPCS %s
+// RUN: %clang_cc1 -triple arm-none-linux-musleabi -target-abi apcs-gnu -emit-llvm -w -o - %s | FileCheck -check-prefix=LINUX-APCS %s
+// RUN: %clang_cc1 -triple arm-none-linux-musleabi -target-abi aapcs  -emit-llvm -w -o - %s | FileCheck -check-prefix=LINUX-AAPCS %s
+// RUN: %clang_cc1 -triple armv7-none-eabihf -target-abi aapcs-vfp -emit-llvm -w -o - %s | FileCheck -check-prefix=BAREMETAL-AAPCS_VFP %s
 
 
 // DARWIN-APCS-LABEL: define void @f()
@@ -13,6 +16,9 @@
 // LINUX-APCS: call arm_apcscc void @g
 // LINUX-AAPCS-LABEL: define void @f()
 // LINUX-AAPCS: call void @g
+// BAREMETAL-AAPCS_VFP-LABEL: define void @f()
+// BAREMETAL-AAPCS_VFP: call void @g
+// BAREMETAL-AAPCS_VFP: declare void @g()
 void g(void);
 void f(void) {
   g();
diff --git a/test/CodeGen/arm-crc32.c b/test/CodeGen/arm-crc32.c
index d49f20eac74cb..8a70d8c78a958 100644
--- a/test/CodeGen/arm-crc32.c
+++ b/test/CodeGen/arm-crc32.c
@@ -1,6 +1,5 @@
-// REQUIRES: arm-registered-target
 // RUN: %clang_cc1 -triple armv8-none-linux-gnueabi \
-// RUN:   -O3 -S -emit-llvm -o - %s | FileCheck %s
+// RUN:   -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
 
 int crc32b(int a, char b)
 {
@@ -48,7 +47,7 @@ int crc32d(int a, long long b)
 // CHECK: [[T0:%[0-9]+]] = trunc i64 %b to i32
 // CHECK: [[T1:%[0-9]+]] = lshr i64 %b, 32
 // CHECK: [[T2:%[0-9]+]] = trunc i64 [[T1]] to i32
-// CHECK: [[T3:%[0-9]+]] = tail call i32 @llvm.arm.crc32w(i32 %a, i32 [[T0]])
+// CHECK: [[T3:%[0-9]+]] = call i32 @llvm.arm.crc32w(i32 %a, i32 [[T0]])
 // CHECK: call i32 @llvm.arm.crc32w(i32 [[T3]], i32 [[T2]])
 }
 
@@ -58,6 +57,6 @@ int crc32cd(int a, long long b)
 // CHECK: [[T0:%[0-9]+]] = trunc i64 %b to i32
 // CHECK: [[T1:%[0-9]+]] = lshr i64 %b, 32
 // CHECK: [[T2:%[0-9]+]] = trunc i64 [[T1]] to i32
-// CHECK: [[T3:%[0-9]+]] = tail call i32 @llvm.arm.crc32cw(i32 %a, i32 [[T0]])
+// CHECK: [[T3:%[0-9]+]] = call i32 @llvm.arm.crc32cw(i32 %a, i32 [[T0]])
 // CHECK: call i32 @llvm.arm.crc32cw(i32 [[T3]], i32 [[T2]])
 }
diff --git a/test/CodeGen/arm-eabi.c b/test/CodeGen/arm-eabi.c
index 0dc04f51d21f3..3a651feafbccd 100644
--- a/test/CodeGen/arm-eabi.c
+++ b/test/CodeGen/arm-eabi.c
@@ -7,6 +7,14 @@
 // RUN: %clang -target arm-none-gnueabi -S -meabi 5 -o - %s | FileCheck -check-prefix=CHECK-EABI %s
 // RUN: %clang -target arm-none-gnueabihf -S -o - %s | FileCheck -check-prefix=CHECK-GNUEABI %s
 // RUN: %clang -target arm-none-gnueabihf -S -meabi 5 -o - %s | FileCheck -check-prefix=CHECK-EABI %s
+// RUN: %clang -target arm-none-musleabi -S -o - %s \
+// RUN:   | FileCheck -check-prefix=CHECK-GNUEABI %s
+// RUN: %clang -target arm-none-musleabi -S -o - %s -meabi 5 \
+// RUN:   | FileCheck -check-prefix=CHECK-EABI %s
+// RUN: %clang -target arm-none-musleabihf -S -o - %s \
+// RUN:   | FileCheck -check-prefix=CHECK-GNUEABI %s
+// RUN: %clang -target arm-none-musleabihf -S -o - %s -meabi 5 \
+// RUN:   | FileCheck -check-prefix=CHECK-EABI %s
 
 struct my_s {
   unsigned long a[18];
diff --git a/test/CodeGen/arm-fp16-arguments.c b/test/CodeGen/arm-fp16-arguments.c
index 15a9ceb94cfbb..65f076ac3ca87 100644
--- a/test/CodeGen/arm-fp16-arguments.c
+++ b/test/CodeGen/arm-fp16-arguments.c
@@ -1,5 +1,6 @@
 // RUN: %clang_cc1 -triple armv7a--none-eabi -target-abi aapcs -mfloat-abi soft -fallow-half-arguments-and-returns -emit-llvm -o - -O1 %s | FileCheck %s --check-prefix=CHECK --check-prefix=SOFT
 // RUN: %clang_cc1 -triple armv7a--none-eabi -target-abi aapcs -mfloat-abi hard -fallow-half-arguments-and-returns -emit-llvm -o - -O1 %s | FileCheck %s --check-prefix=CHECK --check-prefix=HARD
+// RUN: %clang_cc1 -triple armv7a--none-eabi -target-abi aapcs -mfloat-abi soft -fnative-half-arguments-and-returns -emit-llvm -o - -O1 %s | FileCheck %s --check-prefix=NATIVE
 
 __fp16 g;
 
@@ -10,12 +11,17 @@ void t1(__fp16 a) { g = a; }
 // HARD: [[BITCAST:%.*]] = bitcast float [[PARAM]] to i32
 // HARD: [[TRUNC:%.*]] = trunc i32 [[BITCAST]] to i16
 // CHECK: store i16 [[TRUNC]], i16* bitcast (half* @g to i16*)
+// NATIVE: define void @t1(half [[PARAM:%.*]])
+// NATIVE: store half [[PARAM]], half* @g
 
 __fp16 t2() { return g; }
 // SOFT: define i32 @t2()
 // HARD: define arm_aapcs_vfpcc float @t2()
+// NATIVE: define half @t2()
 // CHECK: [[LOAD:%.*]] = load i16, i16* bitcast (half* @g to i16*)
 // CHECK: [[ZEXT:%.*]] = zext i16 [[LOAD]] to i32
 // SOFT: ret i32 [[ZEXT]]
 // HARD: [[BITCAST:%.*]] = bitcast i32 [[ZEXT]] to float
 // HARD: ret float [[BITCAST]]
+// NATIVE: [[LOAD:%.*]] = load half, half* @g
+// NATIVE: ret half [[LOAD]]
diff --git a/test/CodeGen/arm-neon-directed-rounding.c b/test/CodeGen/arm-neon-directed-rounding.c
index 84029318865ce..3625e63b17a0a 100644
--- a/test/CodeGen/arm-neon-directed-rounding.c
+++ b/test/CodeGen/arm-neon-directed-rounding.c
@@ -1,75 +1,135 @@
-// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 -ffreestanding -O1 -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 -ffreestanding -emit-llvm %s -o - | opt -S -mem2reg | FileCheck %s
 
 #include <arm_neon.h>
 
+// CHECK-LABEL: define <2 x float> @test_vrnda_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VRNDA_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VRNDA_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrinta.v2f32(<2 x float> [[VRNDA_V_I]]) #2
+// CHECK:   [[VRNDA_V2_I:%.*]] = bitcast <2 x float> [[VRNDA_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VRNDA_V2_I]] to <2 x float>
+// CHECK:   ret <2 x float> [[TMP1]]
 float32x2_t test_vrnda_f32(float32x2_t a) {
-  // CHECK-LABEL: test_vrnda_f32
-  // CHECK: call <2 x float> @llvm.arm.neon.vrinta.v2f32(<2 x float> %a)
   return vrnda_f32(a);
 }
 
+// CHECK-LABEL: define <4 x float> @test_vrndaq_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VRNDAQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VRNDAQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrinta.v4f32(<4 x float> [[VRNDAQ_V_I]]) #2
+// CHECK:   [[VRNDAQ_V2_I:%.*]] = bitcast <4 x float> [[VRNDAQ_V1_I]] to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VRNDAQ_V2_I]] to <4 x float>
+// CHECK:   ret <4 x float> [[TMP1]]
 float32x4_t test_vrndaq_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vrndaq_f32
-  // CHECK: call <4 x float> @llvm.arm.neon.vrinta.v4f32(<4 x float> %a)
   return vrndaq_f32(a);
 }
 
+// CHECK-LABEL: define <2 x float> @test_vrndm_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VRNDM_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VRNDM_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintm.v2f32(<2 x float> [[VRNDM_V_I]]) #2
+// CHECK:   [[VRNDM_V2_I:%.*]] = bitcast <2 x float> [[VRNDM_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VRNDM_V2_I]] to <2 x float>
+// CHECK:   ret <2 x float> [[TMP1]]
 float32x2_t test_vrndm_f32(float32x2_t a) {
-  // CHECK-LABEL: test_vrndm_f32
-  // CHECK: call <2 x float> @llvm.arm.neon.vrintm.v2f32(<2 x float> %a)
   return vrndm_f32(a);
 }
 
+// CHECK-LABEL: define <4 x float> @test_vrndmq_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VRNDMQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VRNDMQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintm.v4f32(<4 x float> [[VRNDMQ_V_I]]) #2
+// CHECK:   [[VRNDMQ_V2_I:%.*]] = bitcast <4 x float> [[VRNDMQ_V1_I]] to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VRNDMQ_V2_I]] to <4 x float>
+// CHECK:   ret <4 x float> [[TMP1]]
 float32x4_t test_vrndmq_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vrndmq_f32
-  // CHECK: call <4 x float> @llvm.arm.neon.vrintm.v4f32(<4 x float> %a)
   return vrndmq_f32(a);
 }
 
+// CHECK-LABEL: define <2 x float> @test_vrndn_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VRNDN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VRNDN_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintn.v2f32(<2 x float> [[VRNDN_V_I]]) #2
+// CHECK:   [[VRNDN_V2_I:%.*]] = bitcast <2 x float> [[VRNDN_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VRNDN_V2_I]] to <2 x float>
+// CHECK:   ret <2 x float> [[TMP1]]
 float32x2_t test_vrndn_f32(float32x2_t a) {
-  // CHECK-LABEL: test_vrndn_f32
-  // CHECK: call <2 x float> @llvm.arm.neon.vrintn.v2f32(<2 x float> %a)
   return vrndn_f32(a);
 }
 
+// CHECK-LABEL: define <4 x float> @test_vrndnq_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VRNDNQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VRNDNQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintn.v4f32(<4 x float> [[VRNDNQ_V_I]]) #2
+// CHECK:   [[VRNDNQ_V2_I:%.*]] = bitcast <4 x float> [[VRNDNQ_V1_I]] to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VRNDNQ_V2_I]] to <4 x float>
+// CHECK:   ret <4 x float> [[TMP1]]
 float32x4_t test_vrndnq_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vrndnq_f32
-  // CHECK: call <4 x float> @llvm.arm.neon.vrintn.v4f32(<4 x float> %a)
   return vrndnq_f32(a);
 }
 
+// CHECK-LABEL: define <2 x float> @test_vrndp_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VRNDP_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VRNDP_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintp.v2f32(<2 x float> [[VRNDP_V_I]]) #2
+// CHECK:   [[VRNDP_V2_I:%.*]] = bitcast <2 x float> [[VRNDP_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VRNDP_V2_I]] to <2 x float>
+// CHECK:   ret <2 x float> [[TMP1]]
 float32x2_t test_vrndp_f32(float32x2_t a) {
-  // CHECK-LABEL: test_vrndp_f32
-  // CHECK: call <2 x float> @llvm.arm.neon.vrintp.v2f32(<2 x float> %a)
   return vrndp_f32(a);
 }
 
+// CHECK-LABEL: define <4 x float> @test_vrndpq_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VRNDPQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VRNDPQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintp.v4f32(<4 x float> [[VRNDPQ_V_I]]) #2
+// CHECK:   [[VRNDPQ_V2_I:%.*]] = bitcast <4 x float> [[VRNDPQ_V1_I]] to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VRNDPQ_V2_I]] to <4 x float>
+// CHECK:   ret <4 x float> [[TMP1]]
 float32x4_t test_vrndpq_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vrndpq_f32
-  // CHECK: call <4 x float> @llvm.arm.neon.vrintp.v4f32(<4 x float> %a)
   return vrndpq_f32(a);
 }
 
+// CHECK-LABEL: define <2 x float> @test_vrndx_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VRNDX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VRNDX_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintx.v2f32(<2 x float> [[VRNDX_V_I]]) #2
+// CHECK:   [[VRNDX_V2_I:%.*]] = bitcast <2 x float> [[VRNDX_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VRNDX_V2_I]] to <2 x float>
+// CHECK:   ret <2 x float> [[TMP1]]
 float32x2_t test_vrndx_f32(float32x2_t a) {
-  // CHECK-LABEL: test_vrndx_f32
-  // CHECK: call <2 x float> @llvm.arm.neon.vrintx.v2f32(<2 x float> %a)
   return vrndx_f32(a);
 }
 
+// CHECK-LABEL: define <4 x float> @test_vrndxq_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VRNDXQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VRNDXQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintx.v4f32(<4 x float> [[VRNDXQ_V_I]]) #2
+// CHECK:   [[VRNDXQ_V2_I:%.*]] = bitcast <4 x float> [[VRNDXQ_V1_I]] to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VRNDXQ_V2_I]] to <4 x float>
+// CHECK:   ret <4 x float> [[TMP1]]
 float32x4_t test_vrndxq_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vrndxq_f32
-  // CHECK: call <4 x float> @llvm.arm.neon.vrintx.v4f32(<4 x float> %a)
   return vrndxq_f32(a);
 }
 
+// CHECK-LABEL: define <2 x float> @test_vrnd_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VRND_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VRND_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrintz.v2f32(<2 x float> [[VRND_V_I]]) #2
+// CHECK:   [[VRND_V2_I:%.*]] = bitcast <2 x float> [[VRND_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VRND_V2_I]] to <2 x float>
+// CHECK:   ret <2 x float> [[TMP1]]
 float32x2_t test_vrnd_f32(float32x2_t a) {
-  // CHECK-LABEL: test_vrnd_f32
-  // CHECK: call <2 x float> @llvm.arm.neon.vrintz.v2f32(<2 x float> %a)
   return vrnd_f32(a);
 }
 
+// CHECK-LABEL: define <4 x float> @test_vrndq_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VRNDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VRNDQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrintz.v4f32(<4 x float> [[VRNDQ_V_I]]) #2
+// CHECK:   [[VRNDQ_V2_I:%.*]] = bitcast <4 x float> [[VRNDQ_V1_I]] to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VRNDQ_V2_I]] to <4 x float>
+// CHECK:   ret <4 x float> [[TMP1]]
 float32x4_t test_vrndq_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vrndq_f32
-  // CHECK: call <4 x float> @llvm.arm.neon.vrintz.v4f32(<4 x float> %a)
   return vrndq_f32(a);
 }
diff --git a/test/CodeGen/arm-neon-fma.c b/test/CodeGen/arm-neon-fma.c
index 994702d7469a6..ff6acbcc2e917 100644
--- a/test/CodeGen/arm-neon-fma.c
+++ b/test/CodeGen/arm-neon-fma.c
@@ -1,19 +1,34 @@
-// REQUIRES: arm-registered-target
 // RUN: %clang_cc1 -triple thumbv7-none-linux-gnueabihf \
 // RUN:   -target-abi aapcs \
-// RUN:   -target-cpu cortex-a8 \
+// RUN:   -target-cpu cortex-a7 \
 // RUN:   -mfloat-abi hard \
 // RUN:   -ffreestanding \
-// RUN:   -O3 -S -emit-llvm -o - %s | FileCheck %s
+// RUN:   -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
 
 #include <arm_neon.h>
 
+// CHECK-LABEL: define <2 x float> @test_fma_order(<2 x float> %accum, <2 x float> %lhs, <2 x float> %rhs) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %accum to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %lhs to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %rhs to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK:   [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x float> [[TMP3]]) #2
+// CHECK:   ret <2 x float> [[TMP6]]
 float32x2_t test_fma_order(float32x2_t accum, float32x2_t lhs, float32x2_t rhs) {
   return vfma_f32(accum, lhs, rhs);
-// CHECK: call <2 x float> @llvm.fma.v2f32(<2 x float> %lhs, <2 x float> %rhs, <2 x float> %accum)
 }
 
+// CHECK-LABEL: define <4 x float> @test_fmaq_order(<4 x float> %accum, <4 x float> %lhs, <4 x float> %rhs) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %accum to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %lhs to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %rhs to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK:   [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x float> [[TMP3]]) #2
+// CHECK:   ret <4 x float> [[TMP6]]
 float32x4_t test_fmaq_order(float32x4_t accum, float32x4_t lhs, float32x4_t rhs) {
   return vfmaq_f32(accum, lhs, rhs);
-// CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %lhs, <4 x float> %rhs, <4 x float> %accum)
 }
diff --git a/test/CodeGen/arm-neon-numeric-maxmin.c b/test/CodeGen/arm-neon-numeric-maxmin.c
index 615a854b5e213..6e385b9c49efa 100644
--- a/test/CodeGen/arm-neon-numeric-maxmin.c
+++ b/test/CodeGen/arm-neon-numeric-maxmin.c
@@ -1,27 +1,55 @@
-// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 -ffreestanding -O1 -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 -ffreestanding -emit-llvm %s -o - | opt -S -mem2reg | FileCheck %s
 
 #include <arm_neon.h>
 
+// CHECK-LABEL: define <2 x float> @test_vmaxnm_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[VMAXNM_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VMAXNM_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[VMAXNM_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vmaxnm.v2f32(<2 x float> [[VMAXNM_V_I]], <2 x float> [[VMAXNM_V1_I]]) #2
+// CHECK:   [[VMAXNM_V3_I:%.*]] = bitcast <2 x float> [[VMAXNM_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VMAXNM_V3_I]] to <2 x float>
+// CHECK:   ret <2 x float> [[TMP2]]
 float32x2_t test_vmaxnm_f32(float32x2_t a, float32x2_t b) {
-  // CHECK-LABEL: test_vmaxnm_f32
-  // CHECK: call <2 x float> @llvm.arm.neon.vmaxnm.v2f32(<2 x float> %a, <2 x float> %b)
   return vmaxnm_f32(a, b);
 }
 
+// CHECK-LABEL: define <4 x float> @test_vmaxnmq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[VMAXNMQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VMAXNMQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[VMAXNMQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vmaxnm.v4f32(<4 x float> [[VMAXNMQ_V_I]], <4 x float> [[VMAXNMQ_V1_I]]) #2
+// CHECK:   [[VMAXNMQ_V3_I:%.*]] = bitcast <4 x float> [[VMAXNMQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VMAXNMQ_V3_I]] to <4 x float>
+// CHECK:   ret <4 x float> [[TMP2]]
 float32x4_t test_vmaxnmq_f32(float32x4_t a, float32x4_t b) {
-  // CHECK-LABEL: test_vmaxnmq_f32
-  // CHECK: call <4 x float> @llvm.arm.neon.vmaxnm.v4f32(<4 x float> %a, <4 x float> %b)
   return vmaxnmq_f32(a, b);
 }
 
+// CHECK-LABEL: define <2 x float> @test_vminnm_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[VMINNM_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VMINNM_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[VMINNM_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vminnm.v2f32(<2 x float> [[VMINNM_V_I]], <2 x float> [[VMINNM_V1_I]]) #2
+// CHECK:   [[VMINNM_V3_I:%.*]] = bitcast <2 x float> [[VMINNM_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VMINNM_V3_I]] to <2 x float>
+// CHECK:   ret <2 x float> [[TMP2]]
 float32x2_t test_vminnm_f32(float32x2_t a, float32x2_t b) {
-  // CHECK-LABEL: test_vminnm_f32
-  // CHECK: call <2 x float> @llvm.arm.neon.vminnm.v2f32(<2 x float> %a, <2 x float> %b)
   return vminnm_f32(a, b);
 }
 
+// CHECK-LABEL: define <4 x float> @test_vminnmq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[VMINNMQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VMINNMQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[VMINNMQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vminnm.v4f32(<4 x float> [[VMINNMQ_V_I]], <4 x float> [[VMINNMQ_V1_I]]) #2
+// CHECK:   [[VMINNMQ_V3_I:%.*]] = bitcast <4 x float> [[VMINNMQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VMINNMQ_V3_I]] to <4 x float>
+// CHECK:   ret <4 x float> [[TMP2]]
 float32x4_t test_vminnmq_f32(float32x4_t a, float32x4_t b) {
-  // CHECK-LABEL: test_vminnmq_f32
-  // CHECK: call <4 x float> @llvm.arm.neon.vminnm.v4f32(<4 x float> %a, <4 x float> %b)
   return vminnmq_f32(a, b);
 }
diff --git a/test/CodeGen/arm-neon-shifts.c b/test/CodeGen/arm-neon-shifts.c
index 7acfb894e9768..ebaa97fe38843 100644
--- a/test/CodeGen/arm-neon-shifts.c
+++ b/test/CodeGen/arm-neon-shifts.c
@@ -2,7 +2,7 @@
 // RUN: %clang_cc1 -triple thumbv7-apple-darwin \
 // RUN:   -target-cpu cortex-a8 \
 // RUN:   -ffreestanding \
-// RUN:   -emit-llvm -w -O1 -o - %s | FileCheck %s
+// RUN:   -emit-llvm -w -o - %s | opt -S -mem2reg | FileCheck %s
 
 #include <arm_neon.h>
 
@@ -27,19 +27,20 @@ uint8x8_t test_shift_vshr_umax(uint8x8_t a) {
 uint8x8_t test_shift_vsra(uint8x8_t a, uint8x8_t b) {
   // CHECK-LABEL: test_shift_vsra
   // CHECK: %[[SHR:.*]] = lshr <8 x i8> %b, <i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5>
-  // CHECK: %{{.*}} = add <8 x i8> %[[SHR]], %a
+  // CHECK: %{{.*}} = add <8 x i8> %a, %[[SHR]]
   return vsra_n_u8(a, b, 5);
 }
 
 int8x8_t test_shift_vsra_smax(int8x8_t a, int8x8_t b) {
   // CHECK-LABEL: test_shift_vsra_smax
   // CHECK: %[[SHR:.*]] = ashr <8 x i8> %b, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
-  // CHECK: %{{.*}} = add <8 x i8> %[[SHR]], %a
+  // CHECK: %{{.*}} = add <8 x i8> %a, %[[SHR]]
   return vsra_n_s8(a, b, 8);
 }
 
 uint8x8_t test_shift_vsra_umax(uint8x8_t a, uint8x8_t b) {
   // CHECK-LABEL: test_shift_vsra_umax
-  // CHECK: ret <8 x i8> %a
+  // CHECK: [[RES:%.*]] = add <8 x i8> %a, zeroinitializer
+  // CHECK: ret <8 x i8> [[RES]]
   return vsra_n_u8(a, b, 8);
 }
diff --git a/test/CodeGen/arm-neon-vcvtX.c b/test/CodeGen/arm-neon-vcvtX.c
index ff8ce7ea3e2a3..20cd97c858cb1 100644
--- a/test/CodeGen/arm-neon-vcvtX.c
+++ b/test/CodeGen/arm-neon-vcvtX.c
@@ -1,99 +1,147 @@
-// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 -ffreestanding -O1 -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 -ffreestanding -emit-llvm %s -o - | opt -S -mem2reg | FileCheck %s
 
 #include <arm_neon.h>
 
+// CHECK-LABEL: define <2 x i32> @test_vcvta_s32_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VCVTA_S32_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VCVTA_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtas.v2i32.v2f32(<2 x float> [[VCVTA_S32_V_I]]) #2
+// CHECK:   ret <2 x i32> [[VCVTA_S32_V1_I]]
 int32x2_t test_vcvta_s32_f32(float32x2_t a) {
-  // CHECK-LABEL: test_vcvta_s32_f32
-  // CHECK-LABEL: call <2 x i32> @llvm.arm.neon.vcvtas.v2i32.v2f32(<2 x float> %a)
   return vcvta_s32_f32(a);
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vcvta_u32_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VCVTA_U32_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VCVTA_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtau.v2i32.v2f32(<2 x float> [[VCVTA_U32_V_I]]) #2
+// CHECK:   ret <2 x i32> [[VCVTA_U32_V1_I]]
 uint32x2_t test_vcvta_u32_f32(float32x2_t a) {
-  // CHECK-LABEL: test_vcvta_u32_f32
-  // CHECK-LABEL: call <2 x i32> @llvm.arm.neon.vcvtau.v2i32.v2f32(<2 x float> %a)
   return vcvta_u32_f32(a);
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcvtaq_s32_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VCVTAQ_S32_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VCVTAQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtas.v4i32.v4f32(<4 x float> [[VCVTAQ_S32_V_I]]) #2
+// CHECK:   ret <4 x i32> [[VCVTAQ_S32_V1_I]]
 int32x4_t test_vcvtaq_s32_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vcvtaq_s32_f32
-  // CHECK-LABEL: call <4 x i32> @llvm.arm.neon.vcvtas.v4i32.v4f32(<4 x float> %a)
   return vcvtaq_s32_f32(a);
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcvtaq_u32_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VCVTAQ_U32_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VCVTAQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtau.v4i32.v4f32(<4 x float> [[VCVTAQ_U32_V_I]]) #2
+// CHECK:   ret <4 x i32> [[VCVTAQ_U32_V1_I]]
 uint32x4_t test_vcvtaq_u32_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vcvtaq_u32_f32
-  // CHECK-LABEL: call <4 x i32> @llvm.arm.neon.vcvtau.v4i32.v4f32(<4 x float> %a)
   return vcvtaq_u32_f32(a);
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vcvtn_s32_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VCVTN_S32_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VCVTN_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtns.v2i32.v2f32(<2 x float> [[VCVTN_S32_V_I]]) #2
+// CHECK:   ret <2 x i32> [[VCVTN_S32_V1_I]]
 int32x2_t test_vcvtn_s32_f32(float32x2_t a) {
-  // CHECK-LABEL: test_vcvtn_s32_f32
-  // CHECK-LABEL: call <2 x i32> @llvm.arm.neon.vcvtns.v2i32.v2f32(<2 x float> %a)
   return vcvtn_s32_f32(a);
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vcvtn_u32_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VCVTN_U32_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VCVTN_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtnu.v2i32.v2f32(<2 x float> [[VCVTN_U32_V_I]]) #2
+// CHECK:   ret <2 x i32> [[VCVTN_U32_V1_I]]
 uint32x2_t test_vcvtn_u32_f32(float32x2_t a) {
-  // CHECK-LABEL: test_vcvtn_u32_f32
-  // CHECK-LABEL: call <2 x i32> @llvm.arm.neon.vcvtnu.v2i32.v2f32(<2 x float> %a)
   return vcvtn_u32_f32(a);
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcvtnq_s32_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VCVTNQ_S32_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VCVTNQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtns.v4i32.v4f32(<4 x float> [[VCVTNQ_S32_V_I]]) #2
+// CHECK:   ret <4 x i32> [[VCVTNQ_S32_V1_I]]
 int32x4_t test_vcvtnq_s32_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vcvtnq_s32_f32
-  // CHECK-LABEL: call <4 x i32> @llvm.arm.neon.vcvtns.v4i32.v4f32(<4 x float> %a)
   return vcvtnq_s32_f32(a);
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcvtnq_u32_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VCVTNQ_U32_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VCVTNQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtnu.v4i32.v4f32(<4 x float> [[VCVTNQ_U32_V_I]]) #2
+// CHECK:   ret <4 x i32> [[VCVTNQ_U32_V1_I]]
 uint32x4_t test_vcvtnq_u32_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vcvtnq_u32_f32
-  // CHECK-LABEL: call <4 x i32> @llvm.arm.neon.vcvtnu.v4i32.v4f32(<4 x float> %a)
   return vcvtnq_u32_f32(a);
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vcvtp_s32_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VCVTP_S32_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VCVTP_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtps.v2i32.v2f32(<2 x float> [[VCVTP_S32_V_I]]) #2
+// CHECK:   ret <2 x i32> [[VCVTP_S32_V1_I]]
 int32x2_t test_vcvtp_s32_f32(float32x2_t a) {
-  // CHECK-LABEL: test_vcvtp_s32_f32
-  // CHECK-LABEL: call <2 x i32> @llvm.arm.neon.vcvtps.v2i32.v2f32(<2 x float> %a)
   return vcvtp_s32_f32(a);
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vcvtp_u32_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VCVTP_U32_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VCVTP_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtpu.v2i32.v2f32(<2 x float> [[VCVTP_U32_V_I]]) #2
+// CHECK:   ret <2 x i32> [[VCVTP_U32_V1_I]]
 uint32x2_t test_vcvtp_u32_f32(float32x2_t a) {
-  // CHECK-LABEL: test_vcvtp_u32_f32
-  // CHECK-LABEL: call <2 x i32> @llvm.arm.neon.vcvtpu.v2i32.v2f32(<2 x float> %a)
   return vcvtp_u32_f32(a);
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcvtpq_s32_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VCVTPQ_S32_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VCVTPQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtps.v4i32.v4f32(<4 x float> [[VCVTPQ_S32_V_I]]) #2
+// CHECK:   ret <4 x i32> [[VCVTPQ_S32_V1_I]]
 int32x4_t test_vcvtpq_s32_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vcvtpq_s32_f32
-  // CHECK-LABEL: call <4 x i32> @llvm.arm.neon.vcvtps.v4i32.v4f32(<4 x float> %a)
   return vcvtpq_s32_f32(a);
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcvtpq_u32_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VCVTPQ_U32_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VCVTPQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtpu.v4i32.v4f32(<4 x float> [[VCVTPQ_U32_V_I]]) #2
+// CHECK:   ret <4 x i32> [[VCVTPQ_U32_V1_I]]
 uint32x4_t test_vcvtpq_u32_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vcvtpq_u32_f32
-  // CHECK-LABEL: call <4 x i32> @llvm.arm.neon.vcvtpu.v4i32.v4f32(<4 x float> %a)
   return vcvtpq_u32_f32(a);
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vcvtm_s32_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VCVTM_S32_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VCVTM_S32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtms.v2i32.v2f32(<2 x float> [[VCVTM_S32_V_I]]) #2
+// CHECK:   ret <2 x i32> [[VCVTM_S32_V1_I]]
 int32x2_t test_vcvtm_s32_f32(float32x2_t a) {
-  // CHECK-LABEL: test_vcvtm_s32_f32
-  // CHECK-LABEL: call <2 x i32> @llvm.arm.neon.vcvtms.v2i32.v2f32(<2 x float> %a)
   return vcvtm_s32_f32(a);
 }
 
+// CHECK-LABEL: define <2 x i32> @test_vcvtm_u32_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VCVTM_U32_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VCVTM_U32_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtmu.v2i32.v2f32(<2 x float> [[VCVTM_U32_V_I]]) #2
+// CHECK:   ret <2 x i32> [[VCVTM_U32_V1_I]]
 uint32x2_t test_vcvtm_u32_f32(float32x2_t a) {
-  // CHECK-LABEL: test_vcvtm_u32_f32
-  // CHECK-LABEL: call <2 x i32> @llvm.arm.neon.vcvtmu.v2i32.v2f32(<2 x float> %a)
   return vcvtm_u32_f32(a);
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcvtmq_s32_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VCVTMQ_S32_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VCVTMQ_S32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtms.v4i32.v4f32(<4 x float> [[VCVTMQ_S32_V_I]]) #2
+// CHECK:   ret <4 x i32> [[VCVTMQ_S32_V1_I]]
 int32x4_t test_vcvtmq_s32_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vcvtmq_s32_f32
-  // CHECK-LABEL: call <4 x i32> @llvm.arm.neon.vcvtms.v4i32.v4f32(<4 x float> %a)
   return vcvtmq_s32_f32(a);
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcvtmq_u32_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VCVTMQ_U32_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VCVTMQ_U32_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtmu.v4i32.v4f32(<4 x float> [[VCVTMQ_U32_V_I]]) #2
+// CHECK:   ret <4 x i32> [[VCVTMQ_U32_V1_I]]
 uint32x4_t test_vcvtmq_u32_f32(float32x4_t a) {
-  // CHECK-LABEL: test_vcvtmq_u32_f32
-  // CHECK-LABEL: call <4 x i32> @llvm.arm.neon.vcvtmu.v4i32.v4f32(<4 x float> %a)
   return vcvtmq_u32_f32(a);
 }
diff --git a/test/CodeGen/arm-neon-vget.c b/test/CodeGen/arm-neon-vget.c
index 4a710a2ad8e4e..3bf8905f395f1 100644
--- a/test/CodeGen/arm-neon-vget.c
+++ b/test/CodeGen/arm-neon-vget.c
@@ -1,124 +1,123 @@
-// REQUIRES: arm-registered-target
 // RUN: %clang_cc1 -triple thumbv7-apple-darwin \
 // RUN:   -target-abi apcs-gnu \
 // RUN:   -target-cpu cortex-a8 \
 // RUN:   -mfloat-abi soft \
 // RUN:   -target-feature +soft-float-abi \
 // RUN:   -ffreestanding \
-// RUN:   -emit-llvm -w -O1 -o - %s | FileCheck %s
+// RUN:   -emit-llvm -w -o - %s | opt -S -mem2reg | FileCheck %s
 
 #include <arm_neon.h>
 
 // Check that the vget_low/vget_high intrinsics generate a single shuffle
 // without any bitcasting.
 int8x8_t low_s8(int8x16_t a) {
-// CHECK: shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK: shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   return vget_low_s8(a);
 }
 
 uint8x8_t low_u8 (uint8x16_t a) {
-// CHECK: shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK: shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   return vget_low_u8(a);
 }
 
 int16x4_t low_s16( int16x8_t a) {
-// CHECK: shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK: shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   return vget_low_s16(a);
 }
 
 uint16x4_t low_u16(uint16x8_t a) {
-// CHECK: shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK: shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   return vget_low_u16(a);
 }
 
 int32x2_t low_s32( int32x4_t a) {
-// CHECK: shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+// CHECK: shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 0, i32 1>
   return vget_low_s32(a);
 }
 
 uint32x2_t low_u32(uint32x4_t a) {
-// CHECK: shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+// CHECK: shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 0, i32 1>
   return vget_low_u32(a);
 }
 
 int64x1_t low_s64( int64x2_t a) {
-// CHECK: shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> zeroinitializer
+// CHECK: shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> zeroinitializer
   return vget_low_s64(a);
 }
 
 uint64x1_t low_u64(uint64x2_t a) {
-// CHECK: shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> zeroinitializer
+// CHECK: shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> zeroinitializer
   return vget_low_u64(a);
 }
 
 poly8x8_t low_p8 (poly8x16_t a) {
-// CHECK: shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK: shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   return vget_low_p8(a);
 }
 
 poly16x4_t low_p16(poly16x8_t a) {
-// CHECK: shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK: shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   return vget_low_p16(a);
 }
 
 float32x2_t low_f32(float32x4_t a) {
-// CHECK: shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+// CHECK: shufflevector <4 x float> %a, <4 x float> %a, <2 x i32> <i32 0, i32 1>
   return vget_low_f32(a);
 }
 
 
 int8x8_t high_s8(int8x16_t a) {
-// CHECK: shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK: shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   return vget_high_s8(a);
 }
 
 uint8x8_t high_u8 (uint8x16_t a) {
-// CHECK: shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK: shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   return vget_high_u8(a);
 }
 
 int16x4_t high_s16( int16x8_t a) {
-// CHECK: shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK: shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   return vget_high_s16(a);
 }
 
 uint16x4_t high_u16(uint16x8_t a) {
-// CHECK: shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK: shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   return vget_high_u16(a);
 }
 
 int32x2_t high_s32( int32x4_t a) {
-// CHECK: shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+// CHECK: shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
   return vget_high_s32(a);
 }
 
 uint32x2_t high_u32(uint32x4_t a) {
-// CHECK: shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+// CHECK: shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
   return vget_high_u32(a);
 }
 
 int64x1_t high_s64( int64x2_t a) {
-// CHECK: shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> <i32 1>
+// CHECK: shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> <i32 1>
   return vget_high_s64(a);
 }
 
 uint64x1_t high_u64(uint64x2_t a) {
-// CHECK: shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> <i32 1>
+// CHECK: shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> <i32 1>
   return vget_high_u64(a);
 }
 
 poly8x8_t high_p8 (poly8x16_t a) {
-// CHECK: shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK: shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   return vget_high_p8(a);
 }
 
 poly16x4_t high_p16(poly16x8_t a) {
-// CHECK: shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK: shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   return vget_high_p16(a);
 }
 
 float32x2_t high_f32(float32x4_t a) {
-// CHECK: shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 2, i32 3>
+// CHECK: shufflevector <4 x float> %a, <4 x float> %a, <2 x i32> <i32 2, i32 3>
   return vget_high_f32(a);
 }
 
diff --git a/test/CodeGen/arm-swiftcall.c b/test/CodeGen/arm-swiftcall.c
new file mode 100644
index 0000000000000..d54a31337085f
--- /dev/null
+++ b/test/CodeGen/arm-swiftcall.c
@@ -0,0 +1,504 @@
+// RUN: %clang_cc1 -triple armv7-apple-darwin9 -emit-llvm -o - %s | FileCheck %s
+
+// This isn't really testing anything ARM-specific; it's just a convenient
+// 32-bit platform.
+
+#define SWIFTCALL __attribute__((swiftcall))
+#define OUT __attribute__((swift_indirect_result))
+#define ERROR __attribute__((swift_error_result))
+#define CONTEXT __attribute__((swift_context))
+
+/*****************************************************************************/
+/****************************** PARAMETER ABIS *******************************/
+/*****************************************************************************/
+
+SWIFTCALL void indirect_result_1(OUT int *arg0, OUT float *arg1) {}
+// CHECK-LABEL: define {{.*}} void @indirect_result_1(i32* noalias sret align 4 dereferenceable(4){{.*}}, float* noalias align 4 dereferenceable(4){{.*}})
+
+// TODO: maybe this shouldn't suppress sret.
+SWIFTCALL int indirect_result_2(OUT int *arg0, OUT float *arg1) {  __builtin_unreachable(); }
+// CHECK-LABEL: define {{.*}} i32 @indirect_result_2(i32* noalias align 4 dereferenceable(4){{.*}}, float* noalias align 4 dereferenceable(4){{.*}})
+
+typedef struct { char array[1024]; } struct_reallybig;
+SWIFTCALL struct_reallybig indirect_result_3(OUT int *arg0, OUT float *arg1) { __builtin_unreachable(); }
+// CHECK-LABEL: define {{.*}} void @indirect_result_3({{.*}}* noalias sret {{.*}}, i32* noalias align 4 dereferenceable(4){{.*}}, float* noalias align 4 dereferenceable(4){{.*}})
+
+SWIFTCALL void context_1(CONTEXT void *self) {}
+// CHECK-LABEL: define {{.*}} void @context_1(i8* swiftself
+
+SWIFTCALL void context_2(void *arg0, CONTEXT void *self) {}
+// CHECK-LABEL: define {{.*}} void @context_2(i8*{{.*}}, i8* swiftself
+
+SWIFTCALL void context_error_1(CONTEXT int *self, ERROR float **error) {}
+// CHECK-LABEL: define {{.*}} void @context_error_1(i32* swiftself{{.*}}, float** swifterror)
+// CHECK:       [[TEMP:%.*]] = alloca float*, align 4
+// CHECK:       [[T0:%.*]] = load float*, float** [[ERRORARG:%.*]], align 4
+// CHECK:       store float* [[T0]], float** [[TEMP]], align 4
+// CHECK:       [[T0:%.*]] = load float*, float** [[TEMP]], align 4
+// CHECK:       store float* [[T0]], float** [[ERRORARG]], align 4
+void test_context_error_1() {
+  int x;
+  float *error;
+  context_error_1(&x, &error);
+}
+// CHECK-LABEL: define void @test_context_error_1()
+// CHECK:       [[X:%.*]] = alloca i32, align 4
+// CHECK:       [[ERROR:%.*]] = alloca float*, align 4
+// CHECK:       [[TEMP:%.*]] = alloca swifterror float*, align 4
+// CHECK:       [[T0:%.*]] = load float*, float** [[ERROR]], align 4
+// CHECK:       store float* [[T0]], float** [[TEMP]], align 4
+// CHECK:       call [[SWIFTCC:swiftcc]] void @context_error_1(i32* swiftself [[X]], float** swifterror [[TEMP]])
+// CHECK:       [[T0:%.*]] = load float*, float** [[TEMP]], align 4
+// CHECK:       store float* [[T0]], float** [[ERROR]], align 4
+
+SWIFTCALL void context_error_2(short s, CONTEXT int *self, ERROR float **error) {}
+// CHECK-LABEL: define {{.*}} void @context_error_2(i16{{.*}}, i32* swiftself{{.*}}, float** swifterror)
+
+/*****************************************************************************/
+/********************************** LOWERING *********************************/
+/*****************************************************************************/
+
+typedef float float4 __attribute__((ext_vector_type(4)));
+typedef float float8 __attribute__((ext_vector_type(8)));
+typedef double double2 __attribute__((ext_vector_type(2)));
+typedef double double4 __attribute__((ext_vector_type(4)));
+typedef int int3 __attribute__((ext_vector_type(3)));
+typedef int int4 __attribute__((ext_vector_type(4)));
+typedef int int5 __attribute__((ext_vector_type(5)));
+typedef int int8 __attribute__((ext_vector_type(8)));
+
+#define TEST(TYPE)                       \
+  SWIFTCALL TYPE return_##TYPE(void) {   \
+    TYPE result = {};                    \
+    return result;                       \
+  }                                      \
+  SWIFTCALL void take_##TYPE(TYPE v) {   \
+  }                                      \
+  void test_##TYPE() {                   \
+    take_##TYPE(return_##TYPE());        \
+  }
+
+/*****************************************************************************/
+/*********************************** STRUCTS *********************************/
+/*****************************************************************************/
+
+typedef struct {
+} struct_empty;
+TEST(struct_empty);
+// CHECK-LABEL: define {{.*}} @return_struct_empty()
+// CHECK:   ret void
+// CHECK-LABEL: define {{.*}} @take_struct_empty()
+// CHECK:   ret void
+
+typedef struct {
+  int x;
+  char c0;
+  char c1;
+  float f0;
+  float f1;
+} struct_1;
+TEST(struct_1);
+// CHECK-LABEL: define {{.*}} @return_struct_1()
+// CHECK:   [[RET:%.*]] = alloca [[REC:%.*]], align 4
+// CHECK:   [[VAR:%.*]] = alloca [[REC]], align 4
+// CHECK:   @llvm.memset
+// CHECK:   @llvm.memcpy
+// CHECK:   [[CAST_TMP:%.*]] = bitcast [[REC]]* [[RET]] to [[AGG:{ i32, i16, \[2 x i8\], float, float }]]*
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0
+// CHECK:   [[FIRST:%.*]] = load i32, i32* [[T0]], align 4
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 1
+// CHECK:   [[SECOND:%.*]] = load i16, i16* [[T0]], align 4
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 3
+// CHECK:   [[THIRD:%.*]] = load float, float* [[T0]], align
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 4
+// CHECK:   [[FOURTH:%.*]] = load float, float* [[T0]], align
+// CHECK:   [[T0:%.*]] = insertvalue [[UAGG:{ i32, i16, float, float }]] undef, i32 [[FIRST]], 0
+// CHECK:   [[T1:%.*]] = insertvalue [[UAGG]] [[T0]], i16 [[SECOND]], 1
+// CHECK:   [[T2:%.*]] = insertvalue [[UAGG]] [[T1]], float [[THIRD]], 2
+// CHECK:   [[T3:%.*]] = insertvalue [[UAGG]] [[T2]], float [[FOURTH]], 3
+// CHECK:   ret [[UAGG]] [[T3]]
+// CHECK-LABEL: define {{.*}} @take_struct_1(i32, i16, float, float)
+// CHECK:   [[V:%.*]] = alloca [[REC]], align 4
+// CHECK:   [[CAST_TMP:%.*]] = bitcast [[REC]]* [[V]] to [[AGG]]*
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0
+// CHECK:   store i32 %0, i32* [[T0]], align 4
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 1
+// CHECK:   store i16 %1, i16* [[T0]], align 4
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 3
+// CHECK:   store float %2, float* [[T0]], align 4
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 4
+// CHECK:   store float %3, float* [[T0]], align 4
+// CHECK:   ret void
+// CHECK-LABEL: define void @test_struct_1()
+// CHECK:   [[TMP:%.*]] = alloca [[REC]], align 4
+// CHECK:   [[CALL:%.*]] = call [[SWIFTCC]] [[UAGG]] @return_struct_1()
+// CHECK:   [[CAST_TMP:%.*]] = bitcast [[REC]]* [[TMP]] to [[AGG]]*
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0
+// CHECK:   [[T1:%.*]] = extractvalue [[UAGG]] [[CALL]], 0
+// CHECK:   store i32 [[T1]], i32* [[T0]], align 4
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 1
+// CHECK:   [[T1:%.*]] = extractvalue [[UAGG]] [[CALL]], 1
+// CHECK:   store i16 [[T1]], i16* [[T0]], align 4
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 3
+// CHECK:   [[T1:%.*]] = extractvalue [[UAGG]] [[CALL]], 2
+// CHECK:   store float [[T1]], float* [[T0]], align 4
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 4
+// CHECK:   [[T1:%.*]] = extractvalue [[UAGG]] [[CALL]], 3
+// CHECK:   store float [[T1]], float* [[T0]], align 4
+// CHECK:   [[CAST_TMP:%.*]] = bitcast [[REC]]* [[TMP]] to [[AGG]]*
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0
+// CHECK:   [[FIRST:%.*]] = load i32, i32* [[T0]], align 4
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 1
+// CHECK:   [[SECOND:%.*]] = load i16, i16* [[T0]], align 4
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 3
+// CHECK:   [[THIRD:%.*]] = load float, float* [[T0]], align 4
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 4
+// CHECK:   [[FOURTH:%.*]] = load float, float* [[T0]], align 4
+// CHECK:   call [[SWIFTCC]] void @take_struct_1(i32 [[FIRST]], i16 [[SECOND]], float [[THIRD]], float [[FOURTH]])
+// CHECK:   ret void
+
+typedef struct {
+  int x;
+  char c0;
+  __attribute__((aligned(2))) char c1;
+  float f0;
+  float f1;
+} struct_2;
+TEST(struct_2);
+// CHECK-LABEL: define {{.*}} @return_struct_2()
+// CHECK:   [[RET:%.*]] = alloca [[REC:%.*]], align 4
+// CHECK:   [[VAR:%.*]] = alloca [[REC]], align 4
+// CHECK:   @llvm.memcpy
+// CHECK:   @llvm.memcpy
+// CHECK:   [[CAST_TMP:%.*]] = bitcast [[REC]]* [[RET]] to [[AGG:{ i32, i32, float, float }]]*
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0
+// CHECK:   [[FIRST:%.*]] = load i32, i32* [[T0]], align 4
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 1
+// CHECK:   [[SECOND:%.*]] = load i32, i32* [[T0]], align 4
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 2
+// CHECK:   [[THIRD:%.*]] = load float, float* [[T0]], align
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 3
+// CHECK:   [[FOURTH:%.*]] = load float, float* [[T0]], align
+// CHECK:   [[T0:%.*]] = insertvalue [[UAGG:{ i32, i32, float, float }]] undef, i32 [[FIRST]], 0
+// CHECK:   [[T1:%.*]] = insertvalue [[UAGG]] [[T0]], i32 [[SECOND]], 1
+// CHECK:   [[T2:%.*]] = insertvalue [[UAGG]] [[T1]], float [[THIRD]], 2
+// CHECK:   [[T3:%.*]] = insertvalue [[UAGG]] [[T2]], float [[FOURTH]], 3
+// CHECK:   ret [[UAGG]] [[T3]]
+// CHECK-LABEL: define {{.*}} @take_struct_2(i32, i32, float, float)
+// CHECK:   [[V:%.*]] = alloca [[REC]], align 4
+// CHECK:   [[CAST_TMP:%.*]] = bitcast [[REC]]* [[V]] to [[AGG]]*
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0
+// CHECK:   store i32 %0, i32* [[T0]], align 4
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 1
+// CHECK:   store i32 %1, i32* [[T0]], align 4
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 2
+// CHECK:   store float %2, float* [[T0]], align 4
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 3
+// CHECK:   store float %3, float* [[T0]], align 4
+// CHECK:   ret void
+// CHECK-LABEL: define void @test_struct_2()
+// CHECK:   [[TMP:%.*]] = alloca [[REC]], align 4
+// CHECK:   [[CALL:%.*]] = call [[SWIFTCC]] [[UAGG]] @return_struct_2()
+// CHECK:   [[CAST_TMP:%.*]] = bitcast [[REC]]* [[TMP]] to [[AGG]]*
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0
+// CHECK:   [[T1:%.*]] = extractvalue [[UAGG]] [[CALL]], 0
+// CHECK:   store i32 [[T1]], i32* [[T0]], align 4
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 1
+// CHECK:   [[T1:%.*]] = extractvalue [[UAGG]] [[CALL]], 1
+// CHECK:   store i32 [[T1]], i32* [[T0]], align 4
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 2
+// CHECK:   [[T1:%.*]] = extractvalue [[UAGG]] [[CALL]], 2
+// CHECK:   store float [[T1]], float* [[T0]], align 4
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 3
+// CHECK:   [[T1:%.*]] = extractvalue [[UAGG]] [[CALL]], 3
+// CHECK:   store float [[T1]], float* [[T0]], align 4
+// CHECK:   [[CAST_TMP:%.*]] = bitcast [[REC]]* [[TMP]] to [[AGG]]*
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0
+// CHECK:   [[FIRST:%.*]] = load i32, i32* [[T0]], align 4
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 1
+// CHECK:   [[SECOND:%.*]] = load i32, i32* [[T0]], align 4
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 2
+// CHECK:   [[THIRD:%.*]] = load float, float* [[T0]], align 4
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 3
+// CHECK:   [[FOURTH:%.*]] = load float, float* [[T0]], align 4
+// CHECK:   call [[SWIFTCC]] void @take_struct_2(i32 [[FIRST]], i32 [[SECOND]], float [[THIRD]], float [[FOURTH]])
+// CHECK:   ret void
+
+// There's no way to put a field randomly in the middle of an otherwise
+// empty storage unit in C, so that case has to be tested in C++, which
+// can use empty structs to introduce arbitrary padding.  (In C, they end up
+// with size 0 and so don't affect layout.)
+
+// Misaligned data rule.
+typedef struct {
+  char c0;
+  __attribute__((packed)) float f;
+} struct_misaligned_1;
+TEST(struct_misaligned_1)
+// CHECK-LABEL: define {{.*}} @return_struct_misaligned_1()
+// CHECK:   [[RET:%.*]] = alloca [[REC:%.*]], align
+// CHECK:   [[VAR:%.*]] = alloca [[REC]], align
+// CHECK:   @llvm.memset
+// CHECK:   @llvm.memcpy
+// CHECK:   [[CAST_TMP:%.*]] = bitcast [[REC]]* [[RET]] to [[AGG:{ i32, i8 }]]*
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0
+// CHECK:   [[FIRST:%.*]] = load i32, i32* [[T0]], align
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 1
+// CHECK:   [[SECOND:%.*]] = load i8, i8* [[T0]], align
+// CHECK:   [[T0:%.*]] = insertvalue [[UAGG:{ i32, i8 }]] undef, i32 [[FIRST]], 0
+// CHECK:   [[T1:%.*]] = insertvalue [[UAGG]] [[T0]], i8 [[SECOND]], 1
+// CHECK:   ret [[UAGG]] [[T1]]
+// CHECK-LABEL: define {{.*}} @take_struct_misaligned_1(i32, i8)
+// CHECK:   [[V:%.*]] = alloca [[REC]], align
+// CHECK:   [[CAST_TMP:%.*]] = bitcast [[REC]]* [[V]] to [[AGG]]*
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0
+// CHECK:   store i32 %0, i32* [[T0]], align
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 1
+// CHECK:   store i8 %1, i8* [[T0]], align
+// CHECK:   ret void
+
+// Too many scalars.
+typedef struct {
+  int x[5];
+} struct_big_1;
+TEST(struct_big_1)
+
+// CHECK-LABEL: define {{.*}} void @return_struct_big_1({{.*}} noalias sret
+
+// Should not be byval.
+// CHECK-LABEL: define {{.*}} void @take_struct_big_1({{.*}}*{{( %.*)?}})
+
+/*****************************************************************************/
+/********************************* TYPE MERGING ******************************/
+/*****************************************************************************/
+
+typedef union {
+  float f;
+  double d;
+} union_het_fp;
+TEST(union_het_fp)
+// CHECK-LABEL: define {{.*}} @return_union_het_fp()
+// CHECK:   [[RET:%.*]] = alloca [[REC:%.*]], align 4
+// CHECK:   [[VAR:%.*]] = alloca [[REC]], align 4
+// CHECK:   @llvm.memcpy
+// CHECK:   @llvm.memcpy
+// CHECK:   [[CAST_TMP:%.*]] = bitcast [[REC]]* [[RET]] to [[AGG:{ i32, i32 }]]*
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0
+// CHECK:   [[FIRST:%.*]] = load i32, i32* [[T0]], align 4
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 1
+// CHECK:   [[SECOND:%.*]] = load i32, i32* [[T0]], align 4
+// CHECK:   [[T0:%.*]] = insertvalue [[UAGG:{ i32, i32 }]] undef, i32 [[FIRST]], 0
+// CHECK:   [[T1:%.*]] = insertvalue [[UAGG]] [[T0]], i32 [[SECOND]], 1
+// CHECK:   ret [[UAGG]] [[T1]]
+// CHECK-LABEL: define {{.*}} @take_union_het_fp(i32, i32)
+// CHECK:   [[V:%.*]] = alloca [[REC]], align 4
+// CHECK:   [[CAST_TMP:%.*]] = bitcast [[REC]]* [[V]] to [[AGG]]*
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0
+// CHECK:   store i32 %0, i32* [[T0]], align 4
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 1
+// CHECK:   store i32 %1, i32* [[T0]], align 4
+// CHECK:   ret void
+// CHECK-LABEL: define void @test_union_het_fp()
+// CHECK:   [[TMP:%.*]] = alloca [[REC]], align 4
+// CHECK:   [[CALL:%.*]] = call [[SWIFTCC]] [[UAGG]] @return_union_het_fp()
+// CHECK:   [[CAST_TMP:%.*]] = bitcast [[REC]]* [[TMP]] to [[AGG]]*
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0
+// CHECK:   [[T1:%.*]] = extractvalue [[UAGG]] [[CALL]], 0
+// CHECK:   store i32 [[T1]], i32* [[T0]], align 4
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 1
+// CHECK:   [[T1:%.*]] = extractvalue [[UAGG]] [[CALL]], 1
+// CHECK:   store i32 [[T1]], i32* [[T0]], align 4
+// CHECK:   [[CAST_TMP:%.*]] = bitcast [[REC]]* [[TMP]] to [[AGG]]*
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0
+// CHECK:   [[FIRST:%.*]] = load i32, i32* [[T0]], align 4
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 1
+// CHECK:   [[SECOND:%.*]] = load i32, i32* [[T0]], align 4
+// CHECK:   call [[SWIFTCC]] void @take_union_het_fp(i32 [[FIRST]], i32 [[SECOND]])
+// CHECK:   ret void
+
+
+typedef union {
+  float f1;
+  float f2;
+} union_hom_fp;
+TEST(union_hom_fp)
+// CHECK-LABEL: define void @test_union_hom_fp()
+// CHECK:   [[TMP:%.*]] = alloca [[REC:%.*]], align 4
+// CHECK:   [[CALL:%.*]] = call [[SWIFTCC]] float @return_union_hom_fp()
+// CHECK:   [[CAST_TMP:%.*]] = bitcast [[REC]]* [[TMP]] to [[AGG:{ float }]]*
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0
+// CHECK:   store float [[CALL]], float* [[T0]], align 4
+// CHECK:   [[CAST_TMP:%.*]] = bitcast [[REC]]* [[TMP]] to [[AGG]]*
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0
+// CHECK:   [[FIRST:%.*]] = load float, float* [[T0]], align 4
+// CHECK:   call [[SWIFTCC]] void @take_union_hom_fp(float [[FIRST]])
+// CHECK:   ret void
+
+typedef union {
+  float f1;
+  float4 fv2;
+} union_hom_fp_partial;
+TEST(union_hom_fp_partial)
+// CHECK-LABEL: define void @test_union_hom_fp_partial()
+// CHECK:   [[TMP:%.*]] = alloca [[REC:%.*]], align 16
+// CHECK:   [[CALL:%.*]] = call [[SWIFTCC]] [[UAGG:{ float, float, float, float }]] @return_union_hom_fp_partial()
+// CHECK:   [[CAST_TMP:%.*]] = bitcast [[REC]]* [[TMP]] to [[AGG:{ float, float, float, float }]]*
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0
+// CHECK:   [[T1:%.*]] = extractvalue [[UAGG]] [[CALL]], 0
+// CHECK:   store float [[T1]], float* [[T0]], align
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 1
+// CHECK:   [[T1:%.*]] = extractvalue [[UAGG]] [[CALL]], 1
+// CHECK:   store float [[T1]], float* [[T0]], align
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 2
+// CHECK:   [[T1:%.*]] = extractvalue [[UAGG]] [[CALL]], 2
+// CHECK:   store float [[T1]], float* [[T0]], align
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 3
+// CHECK:   [[T1:%.*]] = extractvalue [[UAGG]] [[CALL]], 3
+// CHECK:   store float [[T1]], float* [[T0]], align
+// CHECK:   [[CAST_TMP:%.*]] = bitcast [[REC]]* [[TMP]] to [[AGG]]*
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0
+// CHECK:   [[FIRST:%.*]] = load float, float* [[T0]], align
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 1
+// CHECK:   [[SECOND:%.*]] = load float, float* [[T0]], align
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 2
+// CHECK:   [[THIRD:%.*]] = load float, float* [[T0]], align
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 3
+// CHECK:   [[FOURTH:%.*]] = load float, float* [[T0]], align
+// CHECK:   call [[SWIFTCC]] void @take_union_hom_fp_partial(float [[FIRST]], float [[SECOND]], float [[THIRD]], float [[FOURTH]])
+// CHECK:   ret void
+
+typedef union {
+  struct { int x, y; } f1;
+  float4 fv2;
+} union_het_fpv_partial;
+TEST(union_het_fpv_partial)
+// CHECK-LABEL: define void @test_union_het_fpv_partial()
+// CHECK:   [[TMP:%.*]] = alloca [[REC:%.*]], align 16
+// CHECK:   [[CALL:%.*]] = call [[SWIFTCC]] [[UAGG:{ i32, i32, float, float }]] @return_union_het_fpv_partial()
+// CHECK:   [[CAST_TMP:%.*]] = bitcast [[REC]]* [[TMP]] to [[AGG:{ i32, i32, float, float }]]*
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0
+// CHECK:   [[T1:%.*]] = extractvalue [[UAGG]] [[CALL]], 0
+// CHECK:   store i32 [[T1]], i32* [[T0]], align
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 1
+// CHECK:   [[T1:%.*]] = extractvalue [[UAGG]] [[CALL]], 1
+// CHECK:   store i32 [[T1]], i32* [[T0]], align
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 2
+// CHECK:   [[T1:%.*]] = extractvalue [[UAGG]] [[CALL]], 2
+// CHECK:   store float [[T1]], float* [[T0]], align
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 3
+// CHECK:   [[T1:%.*]] = extractvalue [[UAGG]] [[CALL]], 3
+// CHECK:   store float [[T1]], float* [[T0]], align
+// CHECK:   [[CAST_TMP:%.*]] = bitcast [[REC]]* [[TMP]] to [[AGG]]*
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0
+// CHECK:   [[FIRST:%.*]] = load i32, i32* [[T0]], align
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 1
+// CHECK:   [[SECOND:%.*]] = load i32, i32* [[T0]], align
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 2
+// CHECK:   [[THIRD:%.*]] = load float, float* [[T0]], align
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 3
+// CHECK:   [[FOURTH:%.*]] = load float, float* [[T0]], align
+// CHECK:   call [[SWIFTCC]] void @take_union_het_fpv_partial(i32 [[FIRST]], i32 [[SECOND]], float [[THIRD]], float [[FOURTH]])
+// CHECK:   ret void
+
+/*****************************************************************************/
+/****************************** VECTOR LEGALIZATION **************************/
+/*****************************************************************************/
+
+TEST(int4)
+// CHECK-LABEL: define {{.*}} <4 x i32> @return_int4()
+// CHECK-LABEL: define {{.*}} @take_int4(<4 x i32>
+
+TEST(int8)
+// CHECK-LABEL: define {{.*}} @return_int8()
+// CHECK:   [[RET:%.*]] = alloca [[REC:<8 x i32>]], align 32
+// CHECK:   [[VAR:%.*]] = alloca [[REC]], align
+// CHECK:   store
+// CHECK:   load
+// CHECK:   store
+// CHECK:   [[CAST_TMP:%.*]] = bitcast [[REC]]* [[RET]] to [[AGG:{ <4 x i32>, <4 x i32> }]]*
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0
+// CHECK:   [[FIRST:%.*]] = load <4 x i32>, <4 x i32>* [[T0]], align
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 1
+// CHECK:   [[SECOND:%.*]] = load <4 x i32>, <4 x i32>* [[T0]], align
+// CHECK:   [[T0:%.*]] = insertvalue [[UAGG:{ <4 x i32>, <4 x i32> }]] undef, <4 x i32> [[FIRST]], 0
+// CHECK:   [[T1:%.*]] = insertvalue [[UAGG]] [[T0]], <4 x i32> [[SECOND]], 1
+// CHECK:   ret [[UAGG]] [[T1]]
+// CHECK-LABEL: define {{.*}} @take_int8(<4 x i32>, <4 x i32>)
+// CHECK:   [[V:%.*]] = alloca [[REC]], align
+// CHECK:   [[CAST_TMP:%.*]] = bitcast [[REC]]* [[V]] to [[AGG]]*
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0
+// CHECK:   store <4 x i32> %0, <4 x i32>* [[T0]], align
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 1
+// CHECK:   store <4 x i32> %1, <4 x i32>* [[T0]], align
+// CHECK:   ret void
+// CHECK-LABEL: define void @test_int8()
+// CHECK:   [[TMP1:%.*]] = alloca [[REC]], align
+// CHECK:   [[TMP2:%.*]] = alloca [[REC]], align
+// CHECK:   [[CALL:%.*]] = call [[SWIFTCC]] [[UAGG]] @return_int8()
+// CHECK:   [[CAST_TMP:%.*]] = bitcast [[REC]]* [[TMP1]] to [[AGG]]*
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0
+// CHECK:   [[T1:%.*]] = extractvalue [[UAGG]] [[CALL]], 0
+// CHECK:   store <4 x i32> [[T1]], <4 x i32>* [[T0]], align
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 1
+// CHECK:   [[T1:%.*]] = extractvalue [[UAGG]] [[CALL]], 1
+// CHECK:   store <4 x i32> [[T1]], <4 x i32>* [[T0]], align
+// CHECK:   [[V:%.*]] = load [[REC]], [[REC]]* [[TMP1]], align
+// CHECK:   store [[REC]] [[V]], [[REC]]* [[TMP2]], align
+// CHECK:   [[CAST_TMP:%.*]] = bitcast [[REC]]* [[TMP2]] to [[AGG]]*
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0
+// CHECK:   [[FIRST:%.*]] = load <4 x i32>, <4 x i32>* [[T0]], align
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 1
+// CHECK:   [[SECOND:%.*]] = load <4 x i32>, <4 x i32>* [[T0]], align
+// CHECK:   call [[SWIFTCC]] void @take_int8(<4 x i32> [[FIRST]], <4 x i32> [[SECOND]])
+// CHECK:   ret void
+
+TEST(int5)
+// CHECK-LABEL: define {{.*}} @return_int5()
+// CHECK:   [[RET:%.*]] = alloca [[REC:<5 x i32>]], align 32
+// CHECK:   [[VAR:%.*]] = alloca [[REC]], align
+// CHECK:   store
+// CHECK:   load
+// CHECK:   store
+// CHECK:   [[CAST_TMP:%.*]] = bitcast [[REC]]* [[RET]] to [[AGG:{ <4 x i32>, i32 }]]*
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0
+// CHECK:   [[FIRST:%.*]] = load <4 x i32>, <4 x i32>* [[T0]], align
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 1
+// CHECK:   [[SECOND:%.*]] = load i32, i32* [[T0]], align
+// CHECK:   [[T0:%.*]] = insertvalue [[UAGG:{ <4 x i32>, i32 }]] undef, <4 x i32> [[FIRST]], 0
+// CHECK:   [[T1:%.*]] = insertvalue [[UAGG]] [[T0]], i32 [[SECOND]], 1
+// CHECK:   ret [[UAGG]] [[T1]]
+// CHECK-LABEL: define {{.*}} @take_int5(<4 x i32>, i32)
+// CHECK:   [[V:%.*]] = alloca [[REC]], align
+// CHECK:   [[CAST_TMP:%.*]] = bitcast [[REC]]* [[V]] to [[AGG]]*
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0
+// CHECK:   store <4 x i32> %0, <4 x i32>* [[T0]], align
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 1
+// CHECK:   store i32 %1, i32* [[T0]], align
+// CHECK:   ret void
+// CHECK-LABEL: define void @test_int5()
+// CHECK:   [[TMP1:%.*]] = alloca [[REC]], align
+// CHECK:   [[TMP2:%.*]] = alloca [[REC]], align
+// CHECK:   [[CALL:%.*]] = call [[SWIFTCC]] [[UAGG]] @return_int5()
+// CHECK:   [[CAST_TMP:%.*]] = bitcast [[REC]]* [[TMP1]] to [[AGG]]*
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0
+// CHECK:   [[T1:%.*]] = extractvalue [[UAGG]] [[CALL]], 0
+// CHECK:   store <4 x i32> [[T1]], <4 x i32>* [[T0]], align
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 1
+// CHECK:   [[T1:%.*]] = extractvalue [[UAGG]] [[CALL]], 1
+// CHECK:   store i32 [[T1]], i32* [[T0]], align
+// CHECK:   [[V:%.*]] = load [[REC]], [[REC]]* [[TMP1]], align
+// CHECK:   store [[REC]] [[V]], [[REC]]* [[TMP2]], align
+// CHECK:   [[CAST_TMP:%.*]] = bitcast [[REC]]* [[TMP2]] to [[AGG]]*
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0
+// CHECK:   [[FIRST:%.*]] = load <4 x i32>, <4 x i32>* [[T0]], align
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 1
+// CHECK:   [[SECOND:%.*]] = load i32, i32* [[T0]], align
+// CHECK:   call [[SWIFTCC]] void @take_int5(<4 x i32> [[FIRST]], i32 [[SECOND]])
+// CHECK:   ret void
+
+typedef struct {
+  int x;
+  int3 v __attribute__((packed));
+} misaligned_int3;
+TEST(misaligned_int3)
+// CHECK-LABEL: define {{.*}} @take_misaligned_int3(i32, i32, i32, i32)
diff --git a/test/CodeGen/arm-target-features.c b/test/CodeGen/arm-target-features.c
index 7829edff654e9..189c6f7010599 100644
--- a/test/CodeGen/arm-target-features.c
+++ b/test/CodeGen/arm-target-features.c
@@ -22,10 +22,12 @@
 
 
 // RUN: %clang_cc1 -triple thumbv7s-apple-ios7.0 -target-cpu cyclone -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-BASIC-V8
+// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a32 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-BASIC-V8
 // RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a35 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-BASIC-V8
 // RUN: %clang_cc1 -triple armv8-linux-gnueabi -target-cpu cortex-a53 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-BASIC-V8
 // RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a57 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-BASIC-V8
 // RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a72 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-BASIC-V8
+// RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a73 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-BASIC-V8
 // RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu exynos-m1 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-BASIC-V8
 // CHECK-BASIC-V8: "target-features"="+crc,+crypto,+dsp,+fp-armv8,+hwdiv,+hwdiv-arm,+neon"
 
@@ -39,6 +41,7 @@
 
 
 // RUN: %clang_cc1 -triple thumbv7-linux-gnueabi -target-cpu cortex-r7 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-VFP3-D16-FP16-DIV
+// RUN: %clang_cc1 -triple thumbv7-linux-gnueabi -target-cpu cortex-r8 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-VFP3-D16-FP16-DIV
 // CHECK-VFP3-D16-FP16-DIV: "target-features"="+d16,+dsp,+fp16,+hwdiv,+hwdiv-arm,+vfp3"
 
 
diff --git a/test/CodeGen/arm64-abi-vector.c b/test/CodeGen/arm64-abi-vector.c
index 29aeadb66da40..fd828d99d28e1 100644
--- a/test/CodeGen/arm64-abi-vector.c
+++ b/test/CodeGen/arm64-abi-vector.c
@@ -1,7 +1,9 @@
 // RUN: %clang_cc1 -triple arm64-apple-ios7 -target-abi darwinpcs -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-linux-android -emit-llvm -o - %s | FileCheck -check-prefix=ANDROID %s
 
 #include <stdarg.h>
 
+typedef __attribute__(( ext_vector_type(2) ))  char __char2;
 typedef __attribute__(( ext_vector_type(3) ))  char __char3;
 typedef __attribute__(( ext_vector_type(4) ))  char __char4;
 typedef __attribute__(( ext_vector_type(5) ))  char __char5;
@@ -13,6 +15,26 @@ typedef __attribute__(( ext_vector_type(3) ))  int __int3;
 typedef __attribute__(( ext_vector_type(5) ))  int __int5;
 typedef __attribute__(( ext_vector_type(3) ))  double __double3;
 
+// Passing legal vector types as varargs. Check that we've allocated the appropriate size
+double varargs_vec_2c(int fixed, ...) {
+// ANDROID: varargs_vec_2c
+// ANDROID: [[VAR:%.*]] = alloca <2 x i8>, align 2
+// ANDROID: [[AP_NEXT:%.*]] = getelementptr inbounds i8, i8* [[AP_CUR:%.*]], i64 8
+// ANDROID: bitcast i8* [[AP_CUR]] to <2 x i8>*
+  va_list ap;
+  double sum = fixed;
+  va_start(ap, fixed);
+  __char2 c3 = va_arg(ap, __char2);
+  sum = sum + c3.x + c3.y;
+  va_end(ap);
+  return sum;
+}
+
+double test_2c(__char2 *in) {
+// ANDROID: call double (i32, ...) @varargs_vec_2c(i32 3, i16 {{%.*}})
+  return varargs_vec_2c(3, *in);
+}
+
 double varargs_vec_3c(int fixed, ...) {
 // CHECK: varargs_vec_3c
 // CHECK: alloca <3 x i8>, align 4
diff --git a/test/CodeGen/arm64-arguments.c b/test/CodeGen/arm64-arguments.c
index 93a1a198955e4..f90b8e3b93a8d 100644
--- a/test/CodeGen/arm64-arguments.c
+++ b/test/CodeGen/arm64-arguments.c
@@ -714,3 +714,34 @@ int32x4_t test_toobig_hva(int n, ...) {
   struct TooBigHVA h = __builtin_va_arg(thelist, struct TooBigHVA);
   return h.d;
 }
+
+typedef __attribute__((__ext_vector_type__(3))) float float32x3_t;
+typedef struct { float32x3_t arr[4]; } HFAv3;
+
+float32x3_t test_hva_v3(int n, ...) {
+// CHECK-LABEL: define <3 x float> @test_hva_v3(i32 %n, ...)
+// CHECK: [[THELIST:%.*]] = alloca i8*
+// CHECK: [[CURLIST:%.*]] = load i8*, i8** [[THELIST]]
+
+  // HVA is not indirect, so occupies its full 16 bytes on the stack. but it
+  // must be properly aligned.
+// CHECK: [[ALIGN0:%.*]] = ptrtoint i8* [[CURLIST]] to i64
+// CHECK: [[ALIGN1:%.*]] = add i64 [[ALIGN0]], 15
+// CHECK: [[ALIGN2:%.*]] = and i64 [[ALIGN1]], -16
+// CHECK: [[ALIGNED_LIST:%.*]] = inttoptr i64 [[ALIGN2]] to i8*
+
+// CHECK: [[NEXTLIST:%.*]] = getelementptr inbounds i8, i8* [[ALIGNED_LIST]], i64 64
+// CHECK: store i8* [[NEXTLIST]], i8** [[THELIST]]
+
+// CHECK: bitcast i8* [[ALIGNED_LIST]] to %struct.HFAv3*
+  __builtin_va_list l;
+  __builtin_va_start(l, n);
+  HFAv3 r = __builtin_va_arg(l, HFAv3);
+  return r.arr[2];
+}
+
+float32x3_t test_hva_v3_call(HFAv3 *a) {
+// CHECK-LABEL: define <3 x float> @test_hva_v3_call(%struct.HFAv3* %a)
+// CHECK: call <3 x float> (i32, ...) @test_hva_v3(i32 1, [4 x <4 x float>] {{.*}})
+  return test_hva_v3(1, *a);
+}
diff --git a/test/CodeGen/arm64-be-bitfield.c b/test/CodeGen/arm64-be-bitfield.c
index 132239ab83130..081eab81e905a 100644
--- a/test/CodeGen/arm64-be-bitfield.c
+++ b/test/CodeGen/arm64-be-bitfield.c
@@ -1,6 +1,4 @@
-// REQUIRES: aarch64-registered-target
 // RUN:  %clang_cc1 -triple aarch64_be-linux-gnu -ffreestanding -emit-llvm -O0 -o - %s | FileCheck --check-prefix IR %s
-// RUN:  %clang_cc1 -triple aarch64_be-linux-gnu -ffreestanding -S -O1 -o - %s | FileCheck --check-prefix ARM %s
 
 struct bt3 { signed b2:10; signed b3:10; } b16;
 
@@ -10,6 +8,5 @@ signed callee_b0f(struct bt3 bp11) {
 // IR: store i64 [[ARG]], i64* [[PTR:%.*]], align 8
 // IR: [[BITCAST:%.*]] = bitcast i64* [[PTR]] to i8*
 // IR: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}}, i8* [[BITCAST]], i64 4
-// ARM: asr x0, x0, #54
   return bp11.b2;
 }
diff --git a/test/CodeGen/arm64-crc32.c b/test/CodeGen/arm64-crc32.c
index 37ced18d76218..efb51ed407f8b 100644
--- a/test/CodeGen/arm64-crc32.c
+++ b/test/CodeGen/arm64-crc32.c
@@ -1,6 +1,6 @@
 // REQUIRES: aarch64-registered-target
 // RUN: %clang_cc1 -triple arm64-none-linux-gnu \
-// RUN:   -O3 -S -emit-llvm -o - %s | FileCheck %s
+// RUN:   -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
 
 int crc32b(int a, char b)
 {
diff --git a/test/CodeGen/arm64-lanes.c b/test/CodeGen/arm64-lanes.c
index 4e80df9d6c521..ea47bae69f470 100644
--- a/test/CodeGen/arm64-lanes.c
+++ b/test/CodeGen/arm64-lanes.c
@@ -1,74 +1,127 @@
-// RUN: %clang_cc1 -O3 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -O3 -triple aarch64_be-linux-gnu -target-feature +neon -ffreestanding -emit-llvm -o - %s | FileCheck %s --check-prefix CHECK-BE
+// RUN: %clang_cc1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64_be-linux-gnu -target-feature +neon -ffreestanding -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s --check-prefix CHECK-BE
 
 #include <arm_neon.h>
 
-// CHECK-LABEL: @test_vdupb_lane_s8
 int8_t test_vdupb_lane_s8(int8x8_t src) {
   return vdupb_lane_s8(src, 2);
+  // CHECK-LABEL: @test_vdupb_lane_s8
   // CHECK: extractelement <8 x i8> %src, i32 2
-  // CHECK-BE: extractelement <8 x i8> %src, i32 5
+
+  // CHECK-BE-LABEL: @test_vdupb_lane_s8
+  // CHECK-BE: [[REV:%.*]] = shufflevector <8 x i8> {{.*}}, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+  // CHECK-BE: extractelement <8 x i8> [[REV]], i32 2
 }
 
-// CHECK-LABEL: @test_vdupb_lane_u8
 uint8_t test_vdupb_lane_u8(uint8x8_t src) {
   return vdupb_lane_u8(src, 2);
+  // CHECK-LABEL: @test_vdupb_lane_u8
   // CHECK: extractelement <8 x i8> %src, i32 2
-  // CHECK-BE: extractelement <8 x i8> %src, i32 5
+
+  // CHECK-BE-LABEL: @test_vdupb_lane_u8
+  // CHECK-BE: [[REV:%.*]] = shufflevector <8 x i8> {{.*}}, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+  // CHECK-BE: extractelement <8 x i8> [[REV]], i32 2
 }
 
-// CHECK-LABEL: @test_vduph_lane_s16
 int16_t test_vduph_lane_s16(int16x4_t src) {
   return vduph_lane_s16(src, 2);
-  // CHECK: extractelement <4 x i16> %src, i32 2
-  // CHECK-BE: extractelement <4 x i16> %src, i32 1
+  // CHECK-LABEL: @test_vduph_lane_s16
+  // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %src to [[TYPE:.*]]
+  // CHECK: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <4 x i16>
+  // CHECK: extractelement <4 x i16> [[TMP2]], i32 2
+
+  // CHECK-BE-LABEL: @test_vduph_lane_s16
+  // CHECK-BE: [[REV:%.*]] = shufflevector <4 x i16> {{.*}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  // CHECK-BE: [[TMP1:%.*]] = bitcast <4 x i16> [[REV]] to [[TYPE:.*]]
+  // CHECK-BE: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <4 x i16>
+  // CHECK-BE: extractelement <4 x i16> [[TMP2]], i32 2
 }
 
-// CHECK-LABEL: @test_vduph_lane_u16
 uint16_t test_vduph_lane_u16(uint16x4_t src) {
   return vduph_lane_u16(src, 2);
-  // CHECK: extractelement <4 x i16> %src, i32 2
-  // CHECK-BE: extractelement <4 x i16> %src, i32 1
+  // CHECK-LABEL: @test_vduph_lane_u16
+  // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %src to [[TYPE:.*]]
+  // CHECK: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <4 x i16>
+  // CHECK: extractelement <4 x i16> [[TMP2]], i32 2
+
+  // CHECK-BE-LABEL: @test_vduph_lane_u16
+  // CHECK-BE: [[REV:%.*]] = shufflevector <4 x i16> {{.*}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  // CHECK-BE: [[TMP1:%.*]] = bitcast <4 x i16> [[REV]] to [[TYPE:.*]]
+  // CHECK-BE: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <4 x i16>
+  // CHECK-BE: extractelement <4 x i16> [[TMP2]], i32 2
 }
 
-// CHECK-LABEL: @test_vdups_lane_s32
 int32_t test_vdups_lane_s32(int32x2_t src) {
   return vdups_lane_s32(src, 0);
-  // CHECK: extractelement <2 x i32> %src, i32 0
-  // CHECK-BE: extractelement <2 x i32> %src, i32 1
+  // CHECK-LABEL: @test_vdups_lane_s32
+  // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %src to [[TYPE:.*]]
+  // CHECK: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <2 x i32>
+  // CHECK: extractelement <2 x i32> [[TMP2]], i32 0
+
+  // CHECK-BE-LABEL: @test_vdups_lane_s32
+  // CHECK-BE: [[REV:%.*]] = shufflevector <2 x i32> {{.*}}, <2 x i32> <i32 1, i32 0>
+  // CHECK-BE: [[TMP1:%.*]] = bitcast <2 x i32> [[REV]] to [[TYPE:.*]]
+  // CHECK-BE: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <2 x i32>
+  // CHECK-BE: extractelement <2 x i32> [[TMP2]], i32 0
 }
 
-// CHECK-LABEL: @test_vdups_lane_u32
 uint32_t test_vdups_lane_u32(uint32x2_t src) {
   return vdups_lane_u32(src, 0);
-  // CHECK: extractelement <2 x i32> %src, i32 0
-  // CHECK-BE: extractelement <2 x i32> %src, i32 1
+  // CHECK-LABEL: @test_vdups_lane_u32
+  // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %src to [[TYPE:.*]]
+  // CHECK: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <2 x i32>
+  // CHECK: extractelement <2 x i32> [[TMP2]], i32 0
+
+  // CHECK-BE-LABEL: @test_vdups_lane_u32
+  // CHECK-BE: [[REV:%.*]] = shufflevector <2 x i32> {{.*}}, <2 x i32> <i32 1, i32 0>
+  // CHECK-BE: [[TMP1:%.*]] = bitcast <2 x i32> [[REV]] to [[TYPE:.*]]
+  // CHECK-BE: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <2 x i32>
+  // CHECK-BE: extractelement <2 x i32> [[TMP2]], i32 0
 }
 
-// CHECK-LABEL: @test_vdups_lane_f32
 float32_t test_vdups_lane_f32(float32x2_t src) {
   return vdups_lane_f32(src, 0);
-  // CHECK: extractelement <2 x float> %src, i32 0
-  // CHECK-BE: extractelement <2 x float> %src, i32 1
+  // CHECK-LABEL: @test_vdups_lane_f32
+  // CHECK: [[TMP1:%.*]] = bitcast <2 x float> %src to [[TYPE:.*]]
+  // CHECK: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <2 x float>
+  // CHECK: extractelement <2 x float> [[TMP2]], i32 0
+
+  // CHECK-BE-LABEL: @test_vdups_lane_f32
+  // CHECK-BE: [[REV:%.*]] = shufflevector <2 x float> {{.*}}, <2 x i32> <i32 1, i32 0>
+  // CHECK-BE: [[TMP1:%.*]] = bitcast <2 x float> [[REV]] to [[TYPE:.*]]
+  // CHECK-BE: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <2 x float>
+  // CHECK-BE: extractelement <2 x float> [[TMP2]], i32 0
 }
 
-// CHECK-LABEL: @test_vdupd_lane_s64
 int64_t test_vdupd_lane_s64(int64x1_t src) {
   return vdupd_lane_s64(src, 0);
-  // CHECK: extractelement <1 x i64> %src, i32 0
-  // CHECK-BE: extractelement <1 x i64> %src, i32 0
+  // CHECK-LABEL: @test_vdupd_lane_s64
+  // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %src to [[TYPE:.*]]
+  // CHECK: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <1 x i64>
+  // CHECK: extractelement <1 x i64> [[TMP2]], i32 0
+
+  // CHECK-BE-LABEL: @test_vdupd_lane_s64
+  // CHECK-BE: extractelement <1 x i64> {{.*}}, i32 0
 }
 
-// CHECK-LABEL: @test_vdupd_lane_u64
 uint64_t test_vdupd_lane_u64(uint64x1_t src) {
   return vdupd_lane_u64(src, 0);
-  // CHECK: extractelement <1 x i64> %src, i32 0
-  // CHECK-BE: extractelement <1 x i64> %src, i32 0
+  // CHECK-LABEL: @test_vdupd_lane_u64
+  // CHECK: [[TMP1:%.*]] = bitcast <1 x i64> %src to [[TYPE:.*]]
+  // CHECK: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <1 x i64>
+  // CHECK: extractelement <1 x i64> [[TMP2]], i32 0
+
+  // CHECK-BE-LABEL: @test_vdupd_lane_u64
+  // CHECK-BE: extractelement <1 x i64> {{.*}}, i32 0
 }
 
-// CHECK-LABEL: @test_vdupd_lane_f64
 float64_t test_vdupd_lane_f64(float64x1_t src) {
   return vdupd_lane_f64(src, 0);
-  // CHECK: extractelement <1 x double> %src, i32 0
-  // CHECK-BE: extractelement <1 x double> %src, i32 0
+  // CHECK-LABEL: @test_vdupd_lane_f64
+  // CHECK: [[TMP1:%.*]] = bitcast <1 x double> %src to [[TYPE:.*]]
+  // CHECK: [[TMP2:%.*]] = bitcast [[TYPE]] [[TMP1]] to <1 x double>
+  // CHECK: extractelement <1 x double> [[TMP2]], i32 0
+
+  // CHECK-BE-LABEL: @test_vdupd_lane_f64
+  // CHECK-BE: extractelement <1 x double> {{.*}}, i32 0
 }
diff --git a/test/CodeGen/arm64-scalar-test.c b/test/CodeGen/arm64-scalar-test.c
deleted file mode 100644
index e2328b18ed016..0000000000000
--- a/test/CodeGen/arm64-scalar-test.c
+++ /dev/null
@@ -1,547 +0,0 @@
-// REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple arm64-apple-ios7.0 -target-feature +neon  \
-// RUN:   -S -O1 -o - -ffreestanding %s | FileCheck %s
-
-// We're explicitly using arm_neon.h here: some types probably don't match
-// the ACLE definitions, but we want to check current codegen.
-#include <arm_neon.h>
-
-float test_vrsqrtss_f32(float a, float b) {
-// CHECK: test_vrsqrtss_f32
-  return vrsqrtss_f32(a, b);
-// CHECK: frsqrts {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
-}
-
-double test_vrsqrtsd_f64(double a, double b) {
-// CHECK: test_vrsqrtsd_f64
-  return vrsqrtsd_f64(a, b);
-// CHECK: frsqrts {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-}
-
-int64x1_t test_vrshl_s64(int64x1_t a, int64x1_t b) {
-// CHECK: test_vrshl_s64
-  return vrshl_s64(a, b);
-// CHECK: srshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-}
-
-uint64x1_t test_vrshl_u64(uint64x1_t a, int64x1_t b) {
-// CHECK: test_vrshl_u64
-  return vrshl_u64(a, b);
-// CHECK: urshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-}
-
-// CHECK: test_vrshld_s64
-int64_t test_vrshld_s64(int64_t a, int64_t b) {
-  return vrshld_s64(a, b);
-// CHECK: srshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-}
-
-// CHECK: test_vrshld_u64
-uint64_t test_vrshld_u64(uint64_t a, uint64_t b) {
-  return vrshld_u64(a, b);
-// CHECK: urshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-}
-
-// CHECK: test_vqrshlb_s8
-int8_t test_vqrshlb_s8(int8_t a, int8_t b) {
-  return vqrshlb_s8(a, b);
-// CHECK: sqrshl.8b {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-}
-
-// CHECK: test_vqrshlh_s16
-int16_t test_vqrshlh_s16(int16_t a, int16_t b) {
-  return vqrshlh_s16(a, b);
-// CHECK: sqrshl.4h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-}
-
-// CHECK: test_vqrshls_s32
-int32_t test_vqrshls_s32(int32_t a, int32_t b) {
-  return vqrshls_s32(a, b);
-// CHECK: sqrshl {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
-}
-
-// CHECK: test_vqrshld_s64
-int64_t test_vqrshld_s64(int64_t a, int64_t b) {
-  return vqrshld_s64(a, b);
-// CHECK: sqrshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-}
-
-// CHECK: test_vqrshlb_u8
-uint8_t test_vqrshlb_u8(uint8_t a, uint8_t b) {
-  return vqrshlb_u8(a, b);
-// CHECK: uqrshl.8b {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-}
-
-// CHECK: test_vqrshlh_u16
-uint16_t test_vqrshlh_u16(uint16_t a, uint16_t b) {
-  return vqrshlh_u16(a, b);
-// CHECK: uqrshl.4h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-}
-
-// CHECK: test_vqrshls_u32
-uint32_t test_vqrshls_u32(uint32_t a, uint32_t b) {
-  return vqrshls_u32(a, b);
-// CHECK: uqrshl {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
-}
-
-// CHECK: test_vqrshld_u64
-uint64_t test_vqrshld_u64(uint64_t a, uint64_t b) {
-  return vqrshld_u64(a, b);
-// CHECK: uqrshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-}
-
-// CHECK: test_vqshlb_s8
-int8_t test_vqshlb_s8(int8_t a, int8_t b) {
-  return vqshlb_s8(a, b);
-// CHECK: sqshl.8b {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-}
-
-// CHECK: test_vqshlh_s16
-int16_t test_vqshlh_s16(int16_t a, int16_t b) {
-  return vqshlh_s16(a, b);
-// CHECK: sqshl.4h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-}
-
-// CHECK: test_vqshls_s32
-int32_t test_vqshls_s32(int32_t a, int32_t b) {
-  return vqshls_s32(a, b);
-// CHECK: sqshl {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
-}
-
-// CHECK: test_vqshld_s64
-int64_t test_vqshld_s64(int64_t a, int64_t b) {
-  return vqshld_s64(a, b);
-// CHECK: sqshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-}
-
-// CHECK: test_vqshld_s64_i
-int64_t test_vqshld_s64_i(int64_t a) {
-  return vqshld_s64(a, 36);
-// CHECK: sqshl {{d[0-9]+}}, {{d[0-9]+}}, #36
-}
-
-// CHECK: test_vqshlb_u8
-uint8_t test_vqshlb_u8(uint8_t a, uint8_t b) {
-  return vqshlb_u8(a, b);
-// CHECK: uqshl.8b {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-}
-
-// CHECK: test_vqshlh_u16
-uint16_t test_vqshlh_u16(uint16_t a, uint16_t b) {
-  return vqshlh_u16(a, b);
-// CHECK: uqshl.4h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-}
-
-// CHECK: test_vqshls_u32
-uint32_t test_vqshls_u32(uint32_t a, uint32_t b) {
-  return vqshls_u32(a, b);
-// CHECK: uqshl {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
-}
-
-// CHECK: test_vqshld_u64
-uint64_t test_vqshld_u64(uint64_t a, uint64_t b) {
-  return vqshld_u64(a, b);
-// CHECK: uqshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-}
-
-// CHECK: test_vqshld_u64_i
-uint64_t test_vqshld_u64_i(uint64_t a) {
-  return vqshld_u64(a, 36);
-// CHECK: uqshl {{d[0-9]+}}, {{d[0-9]+}}, #36
-}
-
-// CHECK: test_vshld_u64
-uint64_t test_vshld_u64(uint64_t a, uint64_t b) {
-  return vshld_u64(a, b);
-// CHECK: ushl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-}
-
-// CHECK: test_vshld_s64
-int64_t test_vshld_s64(int64_t a, int64_t b) {
-  return vshld_s64(a, b);
-// CHECK: sshl {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-}
-
-// CHECK: test_vqdmullh_s16
-int32_t test_vqdmullh_s16(int16_t a, int16_t b) {
-  return vqdmullh_s16(a, b);
-// CHECK: sqdmull.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-}
-
-// CHECK: test_vqdmulls_s32
-int64_t test_vqdmulls_s32(int32_t a, int32_t b) {
-  return vqdmulls_s32(a, b);
-// CHECK: sqdmull {{d[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
-}
-
-// CHECK: test_vqaddb_s8
-int8_t test_vqaddb_s8(int8_t a, int8_t b) {
-  return vqaddb_s8(a, b);
-// CHECK: sqadd.8b {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-}
-
-// CHECK: test_vqaddh_s16
-int16_t test_vqaddh_s16(int16_t a, int16_t b) {
-  return vqaddh_s16(a, b);
-// CHECK: sqadd.4h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-}
-
-// CHECK: test_vqadds_s32
-int32_t test_vqadds_s32(int32_t a, int32_t b) {
-  return vqadds_s32(a, b);
-// CHECK: sqadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
-}
-
-// CHECK: test_vqaddd_s64
-int64_t test_vqaddd_s64(int64_t a, int64_t b) {
-  return vqaddd_s64(a, b);
-// CHECK: sqadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-}
-
-// CHECK: test_vqaddb_u8
-uint8_t test_vqaddb_u8(uint8_t a, uint8_t b) {
-  return vqaddb_u8(a, b);
-// CHECK: uqadd.8b {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-}
-
-// CHECK: test_vqaddh_u16
-uint16_t test_vqaddh_u16(uint16_t a, uint16_t b) {
-  return vqaddh_u16(a, b);
-// CHECK: uqadd.4h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-}
-
-// CHECK: test_vqadds_u32
-uint32_t test_vqadds_u32(uint32_t a, uint32_t b) {
-  return vqadds_u32(a, b);
-// CHECK: uqadd {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
-}
-
-// CHECK: test_vqaddd_u64
-uint64_t test_vqaddd_u64(uint64_t a, uint64_t b) {
-  return vqaddd_u64(a, b);
-// CHECK: uqadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-}
-
-// CHECK: test_vqsubb_s8
-int8_t test_vqsubb_s8(int8_t a, int8_t b) {
-  return vqsubb_s8(a, b);
-// CHECK: sqsub.8b {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-}
-
-// CHECK: test_vqsubh_s16
-int16_t test_vqsubh_s16(int16_t a, int16_t b) {
-  return vqsubh_s16(a, b);
-// CHECK: sqsub.4h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-}
-
-// CHECK: test_vqsubs_s32
-int32_t test_vqsubs_s32(int32_t a, int32_t b) {
-  return vqsubs_s32(a, b);
-// CHECK: sqsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
-}
-
-// CHECK: test_vqsubd_s64
-int64_t test_vqsubd_s64(int64_t a, int64_t b) {
-  return vqsubd_s64(a, b);
-// CHECK: sqsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-}
-
-// CHECK: test_vqsubb_u8
-uint8_t test_vqsubb_u8(uint8_t a, uint8_t b) {
-  return vqsubb_u8(a, b);
-// CHECK: uqsub.8b {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-}
-
-// CHECK: test_vqsubh_u16
-uint16_t test_vqsubh_u16(uint16_t a, uint16_t b) {
-  return vqsubh_u16(a, b);
-// CHECK: uqsub.4h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-}
-
-// CHECK: test_vqsubs_u32
-uint32_t test_vqsubs_u32(uint32_t a, uint32_t b) {
-  return vqsubs_u32(a, b);
-// CHECK: uqsub {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
-}
-
-// CHECK: test_vqsubd_u64
-uint64_t test_vqsubd_u64(uint64_t a, uint64_t b) {
-  return vqsubd_u64(a, b);
-// CHECK: uqsub {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-}
-
-// CHECK: test_vqmovnh_s16
-int8_t test_vqmovnh_s16(int16_t a) {
-  return vqmovnh_s16(a);
-// CHECK: sqxtn.8b {{v[0-9]+}}, {{v[0-9]+}}
-}
-
-// CHECK: test_vqmovnh_u16
-uint8_t test_vqmovnh_u16(uint16_t a) {
-  return vqmovnh_u16(a);
-// CHECK: uqxtn.8b {{v[0-9]+}}, {{v[0-9]+}}
-}
-
-// CHECK: test_vqmovns_s32
-int16_t test_vqmovns_s32(int32_t a) {
-  return vqmovns_s32(a);
-// CHECK: sqxtn.4h {{v[0-9]+}}, {{v[0-9]+}}
-}
-
-// CHECK: test_vqmovns_u32
-uint16_t test_vqmovns_u32(uint32_t a) {
-  return vqmovns_u32(a);
-// CHECK: uqxtn.4h {{v[0-9]+}}, {{v[0-9]+}}
-}
-
-// CHECK: test_vqmovnd_s64
-int32_t test_vqmovnd_s64(int64_t a) {
-  return vqmovnd_s64(a);
-// CHECK: sqxtn {{s[0-9]+}}, {{d[0-9]+}}
-}
-
-// CHECK: test_vqmovnd_u64
-uint32_t test_vqmovnd_u64(uint64_t a) {
-  return vqmovnd_u64(a);
-// CHECK: uqxtn {{s[0-9]+}}, {{d[0-9]+}}
-}
-
-// CHECK: test_vqmovunh_s16
-int8_t test_vqmovunh_s16(int16_t a) {
-  return vqmovunh_s16(a);
-// CHECK: sqxtun.8b {{v[0-9]+}}, {{v[0-9]+}}
-}
-
-// CHECK: test_vqmovuns_s32
-int16_t test_vqmovuns_s32(int32_t a) {
-  return vqmovuns_s32(a);
-// CHECK: sqxtun.4h {{v[0-9]+}}, {{v[0-9]+}}
-}
-
-// CHECK: test_vqmovund_s64
-int32_t test_vqmovund_s64(int64_t a) {
-  return vqmovund_s64(a);
-// CHECK: sqxtun {{s[0-9]+}}, {{d[0-9]+}}
-}
-
-// CHECK: test_vqabsb_s8
-int8_t test_vqabsb_s8(int8_t a) {
-  return vqabsb_s8(a);
-// CHECK: sqabs.8b {{v[0-9]+}}, {{v[0-9]+}}
-}
-
-// CHECK: test_vqabsh_s16
-int16_t test_vqabsh_s16(int16_t a) {
-  return vqabsh_s16(a);
-// CHECK: sqabs.4h {{v[0-9]+}}, {{v[0-9]+}}
-}
-
-// CHECK: test_vqabss_s32
-int32_t test_vqabss_s32(int32_t a) {
-  return vqabss_s32(a);
-// CHECK: sqabs {{s[0-9]+}}, {{s[0-9]+}}
-}
-
-// CHECK: test_vqabsd_s64
-int64_t test_vqabsd_s64(int64_t a) {
-  return vqabsd_s64(a);
-// CHECK: sqabs {{d[0-9]+}}, {{d[0-9]+}}
-}
-
-// CHECK: test_vqnegb_s8
-int8_t test_vqnegb_s8(int8_t a) {
-  return vqnegb_s8(a);
-// CHECK: sqneg.8b {{v[0-9]+}}, {{v[0-9]+}}
-}
-
-// CHECK: test_vqnegh_s16
-int16_t test_vqnegh_s16(int16_t a) {
-  return vqnegh_s16(a);
-// CHECK: sqneg.4h {{v[0-9]+}}, {{v[0-9]+}}
-}
-
-// CHECK: test_vqnegs_s32
-int32_t test_vqnegs_s32(int32_t a) {
-  return vqnegs_s32(a);
-// CHECK: sqneg {{s[0-9]+}}, {{s[0-9]+}}
-}
-
-// CHECK: test_vqnegd_s64
-int64_t test_vqnegd_s64(int64_t a) {
-  return vqnegd_s64(a);
-// CHECK: sqneg {{d[0-9]+}}, {{d[0-9]+}}
-}
-
-// CHECK: test_vcvts_n_f32_s32
-float32_t test_vcvts_n_f32_s32(int32_t a) {
-  return vcvts_n_f32_s32(a, 3);
-// CHECK: scvtf {{s[0-9]+}}, {{s[0-9]+}}, #3
-}
-
-// CHECK: test_vcvts_n_f32_u32
-float32_t test_vcvts_n_f32_u32(uint32_t a) {
-  return vcvts_n_f32_u32(a, 3);
-// CHECK: ucvtf {{s[0-9]+}}, {{s[0-9]+}}, #3
-}
-
-// CHECK: test_vcvtd_n_f64_s64
-float64_t test_vcvtd_n_f64_s64(int64_t a) {
-  return vcvtd_n_f64_s64(a, 3);
-// CHECK: scvtf {{d[0-9]+}}, {{d[0-9]+}}, #3
-}
-
-// CHECK: test_vcvtd_n_f64_u64
-float64_t test_vcvtd_n_f64_u64(uint64_t a) {
-  return vcvtd_n_f64_u64(a, 3);
-// CHECK: ucvtf {{d[0-9]+}}, {{d[0-9]+}}, #3
-}
-
-// CHECK: test_vcvts_n_s32_f32
-int32_t test_vcvts_n_s32_f32(float32_t a) {
-  return vcvts_n_s32_f32(a, 3);
-// CHECK: fcvtzs {{s[0-9]+}}, {{s[0-9]+}}, #3
-}
-
-// CHECK: test_vcvts_n_u32_f32
-uint32_t test_vcvts_n_u32_f32(float32_t a) {
-  return vcvts_n_u32_f32(a, 3);
-// CHECK: fcvtzu {{s[0-9]+}}, {{s[0-9]+}}, #3
-}
-
-// CHECK: test_vcvtd_n_s64_f64
-int64_t test_vcvtd_n_s64_f64(float64_t a) {
-  return vcvtd_n_s64_f64(a, 3);
-// CHECK: fcvtzs {{d[0-9]+}}, {{d[0-9]+}}, #3
-}
-
-// CHECK: test_vcvtd_n_u64_f64
-uint64_t test_vcvtd_n_u64_f64(float64_t a) {
-  return vcvtd_n_u64_f64(a, 3);
-// CHECK: fcvtzu {{d[0-9]+}}, {{d[0-9]+}}, #3
-}
-
-// CHECK: test_vcvtas_s32_f32
-int32_t test_vcvtas_s32_f32(float32_t a) {
-  return vcvtas_s32_f32(a);
-// CHECK: fcvtas {{w[0-9]+}}, {{s[0-9]+}}
-}
-
-// CHECK: test_vcvtas_u32_f32
-uint32_t test_vcvtas_u32_f32(float32_t a) {
-  return vcvtas_u32_f32(a);
-// CHECK: fcvtau {{w[0-9]+}}, {{s[0-9]+}}
-}
-
-// CHECK: test_vcvtad_s64_f64
-int64_t test_vcvtad_s64_f64(float64_t a) {
-  return vcvtad_s64_f64(a);
-// CHECK: fcvtas {{x[0-9]+}}, {{d[0-9]+}}
-}
-
-// CHECK: test_vcvtad_u64_f64
-uint64_t test_vcvtad_u64_f64(float64_t a) {
-  return vcvtad_u64_f64(a);
-// CHECK: fcvtau {{x[0-9]+}}, {{d[0-9]+}}
-}
-
-// CHECK: test_vcvtms_s32_f32
-int32_t test_vcvtms_s32_f32(float32_t a) {
-  return vcvtms_s32_f32(a);
-// CHECK: fcvtms {{w[0-9]+}}, {{s[0-9]+}}
-}
-
-// CHECK: test_vcvtms_u32_f32
-uint32_t test_vcvtms_u32_f32(float32_t a) {
-  return vcvtms_u32_f32(a);
-// CHECK: fcvtmu {{w[0-9]+}}, {{s[0-9]+}}
-}
-
-// CHECK: test_vcvtmd_s64_f64
-int64_t test_vcvtmd_s64_f64(float64_t a) {
-  return vcvtmd_s64_f64(a);
-// CHECK: fcvtms {{x[0-9]+}}, {{d[0-9]+}}
-}
-
-// CHECK: test_vcvtmd_u64_f64
-uint64_t test_vcvtmd_u64_f64(float64_t a) {
-  return vcvtmd_u64_f64(a);
-// CHECK: fcvtmu {{x[0-9]+}}, {{d[0-9]+}}
-}
-
-// CHECK: test_vcvtns_s32_f32
-int32_t test_vcvtns_s32_f32(float32_t a) {
-  return vcvtns_s32_f32(a);
-// CHECK: fcvtns {{w[0-9]+}}, {{s[0-9]+}}
-}
-
-// CHECK: test_vcvtns_u32_f32
-uint32_t test_vcvtns_u32_f32(float32_t a) {
-  return vcvtns_u32_f32(a);
-// CHECK: fcvtnu {{w[0-9]+}}, {{s[0-9]+}}
-}
-
-// CHECK: test_vcvtnd_s64_f64
-int64_t test_vcvtnd_s64_f64(float64_t a) {
-  return vcvtnd_s64_f64(a);
-// CHECK: fcvtns {{x[0-9]+}}, {{d[0-9]+}}
-}
-
-// CHECK: test_vcvtnd_u64_f64
-uint64_t test_vcvtnd_u64_f64(float64_t a) {
-  return vcvtnd_u64_f64(a);
-// CHECK: fcvtnu {{x[0-9]+}}, {{d[0-9]+}}
-}
-
-// CHECK: test_vcvtps_s32_f32
-int32_t test_vcvtps_s32_f32(float32_t a) {
-  return vcvtps_s32_f32(a);
-// CHECK: fcvtps {{w[0-9]+}}, {{s[0-9]+}}
-}
-
-// CHECK: test_vcvtps_u32_f32
-uint32_t test_vcvtps_u32_f32(float32_t a) {
-  return vcvtps_u32_f32(a);
-// CHECK: fcvtpu {{w[0-9]+}}, {{s[0-9]+}}
-}
-
-// CHECK: test_vcvtpd_s64_f64
-int64_t test_vcvtpd_s64_f64(float64_t a) {
-  return vcvtpd_s64_f64(a);
-// CHECK: fcvtps {{x[0-9]+}}, {{d[0-9]+}}
-}
-
-// CHECK: test_vcvtpd_u64_f64
-uint64_t test_vcvtpd_u64_f64(float64_t a) {
-  return vcvtpd_u64_f64(a);
-// CHECK: fcvtpu {{x[0-9]+}}, {{d[0-9]+}}
-}
-
-// CHECK: test_vcvtxd_f32_f64
-float32_t test_vcvtxd_f32_f64(float64_t a) {
-  return vcvtxd_f32_f64(a);
-// CHECK: fcvtxn {{s[0-9]+}}, {{d[0-9]+}}
-}
-
-// CHECK: test_vabds_f32
-float32_t test_vabds_f32(float32_t a, float32_t b) {
-  return vabds_f32(a, b);
-  // CHECK: fabd {{s[0-9]+}}, {{s[0-9]+}}
-}
-
-// CHECK: test_vabdd_f64
-float64_t test_vabdd_f64(float64_t a, float64_t b) {
-  return vabdd_f64(a, b);
-  // CHECK: fabd {{d[0-9]+}}, {{d[0-9]+}}
-}
-
-// CHECK: test_vmulxs_f32
-float32_t test_vmulxs_f32(float32_t a, float32_t b) {
-  return vmulxs_f32(a, b);
-  // CHECK: fmulx {{s[0-9]+}}, {{s[0-9]+}}
-}
-
-// CHECK: test_vmulxd_f64
-float64_t test_vmulxd_f64(float64_t a, float64_t b) {
-  return vmulxd_f64(a, b);
-  // CHECK: fmulx {{d[0-9]+}}, {{d[0-9]+}}
-}
diff --git a/test/CodeGen/arm64-vrsqrt.c b/test/CodeGen/arm64-vrsqrt.c
deleted file mode 100644
index 821c23cbf04be..0000000000000
--- a/test/CodeGen/arm64-vrsqrt.c
+++ /dev/null
@@ -1,43 +0,0 @@
-// RUN: %clang_cc1 -triple arm64-apple-ios7.0 -target-feature +neon -ffreestanding -emit-llvm -O1 -o - %s | FileCheck %s
-
-#include <arm_neon.h>
-
-uint32x2_t test_vrsqrte_u32(uint32x2_t in) {
-  // CHECK-LABEL: @test_vrsqrte_u32
-  // CHECK: call <2 x i32> @llvm.aarch64.neon.ursqrte.v2i32(<2 x i32> %in)
-  return vrsqrte_u32(in);
-}
-
-float32x2_t test_vrsqrte_f32(float32x2_t in) {
-  // CHECK-LABEL: @test_vrsqrte_f32
-  // CHECK: call <2 x float> @llvm.aarch64.neon.frsqrte.v2f32(<2 x float> %in)
-  return vrsqrte_f32(in);
-}
-
-
-uint32x4_t test_vrsqrteq_u32(uint32x4_t in) {
-  // CHECK-LABEL: @test_vrsqrteq_u32
-  // CHECK: call <4 x i32> @llvm.aarch64.neon.ursqrte.v4i32(<4 x i32> %in)
-  return vrsqrteq_u32(in);
-}
-
-float32x4_t test_vrsqrteq_f32(float32x4_t in) {
-  // CHECK-LABEL: @test_vrsqrteq_f32
-  // CHECK: call <4 x float> @llvm.aarch64.neon.frsqrte.v4f32(<4 x float> %in)
-  return vrsqrteq_f32(in);
-}
-
-
-float32x2_t test_vrsqrts_f32(float32x2_t est, float32x2_t val) {
-  // CHECK-LABEL: @test_vrsqrts_f32
-  // CHECK: call <2 x float> @llvm.aarch64.neon.frsqrts.v2f32(<2 x float> %est, <2 x float> %val)
-  return vrsqrts_f32(est, val);
-}
-
-
-float32x4_t test_vrsqrtsq_f32(float32x4_t est, float32x4_t val) {
-  // CHECK-LABEL: @test_vrsqrtsq_f32
-  // CHECK: call <4 x float> @llvm.aarch64.neon.frsqrts.v4f32(<4 x float> %est, <4 x float> %val)
-  return vrsqrtsq_f32(est, val);
-}
-
diff --git a/test/CodeGen/arm64_neon_high_half.c b/test/CodeGen/arm64_neon_high_half.c
deleted file mode 100644
index 6008ba5e55c8b..0000000000000
--- a/test/CodeGen/arm64_neon_high_half.c
+++ /dev/null
@@ -1,559 +0,0 @@
-// RUN: %clang_cc1 -triple arm64-apple-ios7.0 -target-feature +neon -ffreestanding -Os -S -o - %s | FileCheck %s
-// REQUIRES: aarch64-registered-target
-
-#include <arm_neon.h>
-
-int16x8_t test_vaddw_high_s8(int16x8_t lhs, int8x16_t rhs) {
-  // CHECK: saddw2.8h
-  return vaddw_high_s8(lhs, rhs);
-}
-
-int32x4_t test_vaddw_high_s16(int32x4_t lhs, int16x8_t rhs) {
-  // CHECK: saddw2.4s
-  return vaddw_high_s16(lhs, rhs);
-}
-
-int64x2_t test_vaddw_high_s32(int64x2_t lhs, int32x4_t rhs) {
-  // CHECK: saddw2.2d
-  return vaddw_high_s32(lhs, rhs);
-}
-
-uint16x8_t test_vaddw_high_u8(uint16x8_t lhs, uint8x16_t rhs) {
-  // CHECK: uaddw2.8h
-  return vaddw_high_u8(lhs, rhs);
-}
-
-uint32x4_t test_vaddw_high_u16(uint32x4_t lhs, uint16x8_t rhs) {
-  // CHECK: uaddw2.4s
-  return vaddw_high_u16(lhs, rhs);
-}
-
-uint64x2_t test_vaddw_high_u32(uint64x2_t lhs, uint32x4_t rhs) {
-  // CHECK: uaddw2.2d
-  return vaddw_high_u32(lhs, rhs);
-}
-
-int16x8_t test_vsubw_high_s8(int16x8_t lhs, int8x16_t rhs) {
-  // CHECK: ssubw2.8h
-  return vsubw_high_s8(lhs, rhs);
-}
-
-int32x4_t test_vsubw_high_s16(int32x4_t lhs, int16x8_t rhs) {
-  // CHECK: ssubw2.4s
-  return vsubw_high_s16(lhs, rhs);
-}
-
-int64x2_t test_vsubw_high_s32(int64x2_t lhs, int32x4_t rhs) {
-  // CHECK: ssubw2.2d
-  return vsubw_high_s32(lhs, rhs);
-}
-
-uint16x8_t test_vsubw_high_u8(uint16x8_t lhs, uint8x16_t rhs) {
-  // CHECK: usubw2.8h
-  return vsubw_high_u8(lhs, rhs);
-}
-
-uint32x4_t test_vsubw_high_u16(uint32x4_t lhs, uint16x8_t rhs) {
-  // CHECK: usubw2.4s
-  return vsubw_high_u16(lhs, rhs);
-}
-
-uint64x2_t test_vsubw_high_u32(uint64x2_t lhs, uint32x4_t rhs) {
-  // CHECK: usubw2.2d
-  return vsubw_high_u32(lhs, rhs);
-}
-
-int16x8_t test_vabdl_high_s8(int8x16_t lhs, int8x16_t rhs) {
-  // CHECK: sabdl2.8h
-  return vabdl_high_s8(lhs, rhs);
-}
-
-int32x4_t test_vabdl_high_s16(int16x8_t lhs, int16x8_t rhs) {
-  // CHECK: sabdl2.4s
-  return vabdl_high_s16(lhs, rhs);
-}
-
-int64x2_t test_vabdl_high_s32(int32x4_t lhs, int32x4_t rhs) {
-  // CHECK: sabdl2.2d
-  return vabdl_high_s32(lhs, rhs);
-}
-
-uint16x8_t test_vabdl_high_u8(uint8x16_t lhs, uint8x16_t rhs) {
-  // CHECK: uabdl2.8h
-  return vabdl_high_u8(lhs, rhs);
-}
-
-uint32x4_t test_vabdl_high_u16(uint16x8_t lhs, uint16x8_t rhs) {
-  // CHECK: uabdl2.4s
-  return vabdl_high_u16(lhs, rhs);
-}
-
-uint64x2_t test_vabdl_high_u32(uint32x4_t lhs, uint32x4_t rhs) {
-  // CHECK: uabdl2.2d
-  return vabdl_high_u32(lhs, rhs);
-}
-
-int16x8_t test_vabal_high_s8(int16x8_t accum, int8x16_t lhs, int8x16_t rhs) {
-  // CHECK: sabal2.8h
-  return vabal_high_s8(accum, lhs, rhs);
-}
-
-int32x4_t test_vabal_high_s16(int32x4_t accum, int16x8_t lhs, int16x8_t rhs) {
-  // CHECK: sabal2.4s
-  return vabal_high_s16(accum, lhs, rhs);
-}
-
-int64x2_t test_vabal_high_s32(int64x2_t accum, int32x4_t lhs, int32x4_t rhs) {
-  // CHECK: sabal2.2d
-  return vabal_high_s32(accum, lhs, rhs);
-}
-
-uint16x8_t test_vabal_high_u8(uint16x8_t accum, uint8x16_t lhs, uint8x16_t rhs) {
-  // CHECK: uabal2.8h
-  return vabal_high_u8(accum, lhs, rhs);
-}
-
-uint32x4_t test_vabal_high_u16(uint32x4_t accum, uint16x8_t lhs, uint16x8_t rhs) {
-  // CHECK: uabal2.4s
-  return vabal_high_u16(accum, lhs, rhs);
-}
-
-uint64x2_t test_vabal_high_u32(uint64x2_t accum, uint32x4_t lhs, uint32x4_t rhs) {
-  // CHECK: uabal2.2d
-  return vabal_high_u32(accum, lhs, rhs);
-}
-
-int32x4_t test_vqdmlal_high_s16(int32x4_t accum, int16x8_t lhs, int16x8_t rhs) {
-  // CHECK: sqdmlal2.4s
-  return vqdmlal_high_s16(accum, lhs, rhs);
-}
-
-int64x2_t test_vqdmlal_high_s32(int64x2_t accum, int32x4_t lhs, int32x4_t rhs) {
-  // CHECK: sqdmlal2.2d
-  return vqdmlal_high_s32(accum, lhs, rhs);
-}
-
-int32x4_t test_vqdmlsl_high_s16(int32x4_t accum, int16x8_t lhs, int16x8_t rhs) {
-  // CHECK: sqdmlsl2.4s
-  return vqdmlsl_high_s16(accum, lhs, rhs);
-}
-
-int64x2_t test_vqdmlsl_high_s32(int64x2_t accum, int32x4_t lhs, int32x4_t rhs) {
-  // CHECK: sqdmlsl2.2d
-  return vqdmlsl_high_s32(accum, lhs, rhs);
-}
-
-int32x4_t test_vqdmull_high_s16(int16x8_t lhs, int16x8_t rhs) {
-  // CHECK: sqdmull2.4s
-  return vqdmull_high_s16(lhs, rhs);
-}
-
-int64x2_t test_vqdmull_high_s32(int32x4_t lhs, int32x4_t rhs) {
-  // CHECK: sqdmull2.2d
-  return vqdmull_high_s32(lhs, rhs);
-}
-
-int16x8_t test_vshll_high_n_s8(int8x16_t in) {
-  // CHECK: sshll2.8h
-  return vshll_high_n_s8(in, 7);
-}
-
-int32x4_t test_vshll_high_n_s16(int16x8_t in) {
-  // CHECK: sshll2.4s
-  return vshll_high_n_s16(in, 15);
-}
-
-int64x2_t test_vshll_high_n_s32(int32x4_t in) {
-  // CHECK: sshll2.2d
-  return vshll_high_n_s32(in, 31);
-}
-
-int16x8_t test_vshll_high_n_u8(int8x16_t in) {
-  // CHECK: ushll2.8h
-  return vshll_high_n_u8(in, 7);
-}
-
-int32x4_t test_vshll_high_n_u16(int16x8_t in) {
-  // CHECK: ushll2.4s
-  return vshll_high_n_u16(in, 15);
-}
-
-int64x2_t test_vshll_high_n_u32(int32x4_t in) {
-  // CHECK: ushll2.2d
-  return vshll_high_n_u32(in, 31);
-}
-
-int16x8_t test_vshll_high_n_s8_max(int8x16_t in) {
-  // CHECK: shll2.8h
-  return vshll_high_n_s8(in, 8);
-}
-
-int32x4_t test_vshll_high_n_s16_max(int16x8_t in) {
-  // CHECK: shll2.4s
-  return vshll_high_n_s16(in, 16);
-}
-
-int64x2_t test_vshll_high_n_s32_max(int32x4_t in) {
-  // CHECK: shll2.2d
-  return vshll_high_n_s32(in, 32);
-}
-
-int16x8_t test_vshll_high_n_u8_max(int8x16_t in) {
-  // CHECK: shll2.8h
-  return vshll_high_n_u8(in, 8);
-}
-
-int32x4_t test_vshll_high_n_u16_max(int16x8_t in) {
-  // CHECK: shll2.4s
-  return vshll_high_n_u16(in, 16);
-}
-
-int64x2_t test_vshll_high_n_u32_max(int32x4_t in) {
-  // CHECK: shll2.2d
-  return vshll_high_n_u32(in, 32);
-}
-
-int16x8_t test_vsubl_high_s8(int8x16_t lhs, int8x16_t rhs) {
-  // CHECK: ssubl2.8h
-  return vsubl_high_s8(lhs, rhs);
-}
-
-int32x4_t test_vsubl_high_s16(int16x8_t lhs, int16x8_t rhs) {
-  // CHECK: ssubl2.4s
-  return vsubl_high_s16(lhs, rhs);
-}
-
-int64x2_t test_vsubl_high_s32(int32x4_t lhs, int32x4_t rhs) {
-  // CHECK: ssubl2.2d
-  return vsubl_high_s32(lhs, rhs);
-}
-
-uint16x8_t test_vsubl_high_u8(uint8x16_t lhs, uint8x16_t rhs) {
-  // CHECK: usubl2.8h
-  return vsubl_high_u8(lhs, rhs);
-}
-
-uint32x4_t test_vsubl_high_u16(uint16x8_t lhs, uint16x8_t rhs) {
-  // CHECK: usubl2.4s
-  return vsubl_high_u16(lhs, rhs);
-}
-
-uint64x2_t test_vsubl_high_u32(uint32x4_t lhs, uint32x4_t rhs) {
-  // CHECK: usubl2.2d
-  return vsubl_high_u32(lhs, rhs);
-}
-
-int8x16_t test_vrshrn_high_n_s16(int8x8_t lowpart, int16x8_t input) {
-  // CHECK: rshrn2.16b
-  return vrshrn_high_n_s16(lowpart, input, 2);
-}
-
-int16x8_t test_vrshrn_high_n_s32(int16x4_t lowpart, int32x4_t input) {
-  // CHECK: rshrn2.8h
-  return vrshrn_high_n_s32(lowpart, input, 2);
-}
-
-int32x4_t test_vrshrn_high_n_s64(int32x2_t lowpart, int64x2_t input) {
-  // CHECK: shrn2.4s
-  return vrshrn_high_n_s64(lowpart, input, 2);
-}
-
-uint8x16_t test_vrshrn_high_n_u16(uint8x8_t lowpart, uint16x8_t input) {
-  // CHECK: rshrn2.16b
-  return vrshrn_high_n_u16(lowpart, input, 2);
-}
-
-uint16x8_t test_vrshrn_high_n_u32(uint16x4_t lowpart, uint32x4_t input) {
-  // CHECK: rshrn2.8h
-  return vrshrn_high_n_u32(lowpart, input, 2);
-}
-
-uint32x4_t test_vrshrn_high_n_u64(uint32x2_t lowpart, uint64x2_t input) {
-  // CHECK: rshrn2.4s
-  return vrshrn_high_n_u64(lowpart, input, 2);
-}
-
-int8x16_t test_vshrn_high_n_s16(int8x8_t lowpart, int16x8_t input) {
-  // CHECK: shrn2.16b
-  return vshrn_high_n_s16(lowpart, input, 2);
-}
-
-int16x8_t test_vshrn_high_n_s32(int16x4_t lowpart, int32x4_t input) {
-  // CHECK: shrn2.8h
-  return vshrn_high_n_s32(lowpart, input, 2);
-}
-
-int32x4_t test_vshrn_high_n_s64(int32x2_t lowpart, int64x2_t input) {
-  // CHECK: shrn2.4s
-  return vshrn_high_n_s64(lowpart, input, 2);
-}
-
-uint8x16_t test_vshrn_high_n_u16(uint8x8_t lowpart, uint16x8_t input) {
-  // CHECK: shrn2.16b
-  return vshrn_high_n_u16(lowpart, input, 2);
-}
-
-uint16x8_t test_vshrn_high_n_u32(uint16x4_t lowpart, uint32x4_t input) {
-  // CHECK: shrn2.8h
-  return vshrn_high_n_u32(lowpart, input, 2);
-}
-
-uint32x4_t test_vshrn_high_n_u64(uint32x2_t lowpart, uint64x2_t input) {
-  // CHECK: shrn2.4s
-  return vshrn_high_n_u64(lowpart, input, 2);
-}
-
-uint8x16_t test_vqshrun_high_n_s16(uint8x8_t lowpart, int16x8_t input) {
-  // CHECK: sqshrun2.16b
-  return vqshrun_high_n_s16(lowpart, input, 2);
-}
-
-uint16x8_t test_vqshrun_high_n_s32(uint16x4_t lowpart, int32x4_t input) {
-  // CHECK: sqshrun2.8h
-  return vqshrun_high_n_s32(lowpart, input, 2);
-}
-
-uint32x4_t test_vqshrun_high_n_s64(uint32x2_t lowpart, int64x2_t input) {
-  // CHECK: sqshrun2.4s
-  return vqshrun_high_n_s64(lowpart, input, 2);
-}
-
-uint8x16_t test_vqrshrun_high_n_s16(uint8x8_t lowpart, int16x8_t input) {
-  // CHECK: sqrshrun2.16b
-  return vqrshrun_high_n_s16(lowpart, input, 2);
-}
-
-uint16x8_t test_vqrshrun_high_n_s32(uint16x4_t lowpart, int32x4_t input) {
-  // CHECK: sqrshrun2.8h
-  return vqrshrun_high_n_s32(lowpart, input, 2);
-}
-
-uint32x4_t test_vqrshrun_high_n_s64(uint32x2_t lowpart, int64x2_t input) {
-  // CHECK: sqrshrun2.4s
-  return vqrshrun_high_n_s64(lowpart, input, 2);
-}
-
-int8x16_t test_vqshrn_high_n_s16(int8x8_t lowpart, int16x8_t input) {
-  // CHECK: sqshrn2.16b
-  return vqshrn_high_n_s16(lowpart, input, 2);
-}
-
-int16x8_t test_vqshrn_high_n_s32(int16x4_t lowpart, int32x4_t input) {
-  // CHECK: sqshrn2.8h
-  return vqshrn_high_n_s32(lowpart, input, 2);
-}
-
-int32x4_t test_vqshrn_high_n_s64(int32x2_t lowpart, int64x2_t input) {
-  // CHECK: sqshrn2.4s
-  return vqshrn_high_n_s64(lowpart, input, 2);
-}
-
-uint8x16_t test_vqshrn_high_n_u16(uint8x8_t lowpart, uint16x8_t input) {
-  // CHECK: uqshrn2.16b
-  return vqshrn_high_n_u16(lowpart, input, 2);
-}
-
-uint16x8_t test_vqshrn_high_n_u32(uint16x4_t lowpart, uint32x4_t input) {
-  // CHECK: uqshrn2.8h
-  return vqshrn_high_n_u32(lowpart, input, 2);
-}
-
-uint32x4_t test_vqshrn_high_n_u64(uint32x2_t lowpart, uint64x2_t input) {
-  // CHECK: uqshrn2.4s
-  return vqshrn_high_n_u64(lowpart, input, 2);
-}
-
-int8x16_t test_vqrshrn_high_n_s16(int8x8_t lowpart, int16x8_t input) {
-  // CHECK: sqrshrn2.16b
-  return vqrshrn_high_n_s16(lowpart, input, 2);
-}
-
-int16x8_t test_vqrshrn_high_n_s32(int16x4_t lowpart, int32x4_t input) {
-  // CHECK: sqrshrn2.8h
-  return vqrshrn_high_n_s32(lowpart, input, 2);
-}
-
-int32x4_t test_vqrshrn_high_n_s64(int32x2_t lowpart, int64x2_t input) {
-  // CHECK: sqrshrn2.4s
-  return vqrshrn_high_n_s64(lowpart, input, 2);
-}
-
-uint8x16_t test_vqrshrn_high_n_u16(uint8x8_t lowpart, uint16x8_t input) {
-  // CHECK: uqrshrn2.16b
-  return vqrshrn_high_n_u16(lowpart, input, 2);
-}
-
-uint16x8_t test_vqrshrn_high_n_u32(uint16x4_t lowpart, uint32x4_t input) {
-  // CHECK: uqrshrn2.8h
-  return vqrshrn_high_n_u32(lowpart, input, 2);
-}
-
-uint32x4_t test_vqrshrn_high_n_u64(uint32x2_t lowpart, uint64x2_t input) {
-  // CHECK: uqrshrn2.4s
-  return vqrshrn_high_n_u64(lowpart, input, 2);
-}
-
-int8x16_t test_vaddhn_high_s16(int8x8_t lowpart, int16x8_t lhs, int16x8_t rhs) {
-  // CHECK: addhn2.16b v0, {{v1, v2|v2, v1}}
-  return vaddhn_high_s16(lowpart, lhs, rhs);
-}
-
-int16x8_t test_vaddhn_high_s32(int16x4_t lowpart, int32x4_t lhs, int32x4_t rhs) {
-  // CHECK: addhn2.8h v0, {{v1, v2|v2, v1}}
-  return vaddhn_high_s32(lowpart, lhs, rhs);
-}
-
-int32x4_t test_vaddhn_high_s64(int32x2_t lowpart, int64x2_t lhs, int64x2_t rhs) {
-  // CHECK: addhn2.4s v0, {{v1, v2|v2, v1}}
-  return vaddhn_high_s64(lowpart, lhs, rhs);
-}
-
-uint8x16_t test_vaddhn_high_u16(uint8x8_t lowpart, uint16x8_t lhs, uint16x8_t rhs) {
-  // CHECK: addhn2.16b v0, {{v1, v2|v2, v1}}
-  return vaddhn_high_s16(lowpart, lhs, rhs);
-}
-
-uint16x8_t test_vaddhn_high_u32(uint16x4_t lowpart, uint32x4_t lhs, uint32x4_t rhs) {
-  // CHECK: addhn2.8h v0, {{v1, v2|v2, v1}}
-  return vaddhn_high_s32(lowpart, lhs, rhs);
-}
-
-uint32x4_t test_vaddhn_high_u64(uint32x2_t lowpart, uint64x2_t lhs, uint64x2_t rhs) {
-  // CHECK: addhn2.4s v0, {{v1, v2|v2, v1}}
-  return vaddhn_high_s64(lowpart, lhs, rhs);
-}
-
-int8x16_t test_vraddhn_high_s16(int8x8_t lowpart, int16x8_t lhs, int16x8_t rhs) {
-  // CHECK: raddhn2.16b v0, v1, v2
-  return vraddhn_high_s16(lowpart, lhs, rhs);
-}
-
-int16x8_t test_vraddhn_high_s32(int16x4_t lowpart, int32x4_t lhs, int32x4_t rhs) {
-  // CHECK: raddhn2.8h v0, v1, v2
-  return vraddhn_high_s32(lowpart, lhs, rhs);
-}
-
-int32x4_t test_vraddhn_high_s64(int32x2_t lowpart, int64x2_t lhs, int64x2_t rhs) {
-  // CHECK: raddhn2.4s v0, v1, v2
-  return vraddhn_high_s64(lowpart, lhs, rhs);
-}
-
-uint8x16_t test_vraddhn_high_u16(uint8x8_t lowpart, uint16x8_t lhs, uint16x8_t rhs) {
-  // CHECK: raddhn2.16b v0, v1, v2
-  return vraddhn_high_s16(lowpart, lhs, rhs);
-}
-
-uint16x8_t test_vraddhn_high_u32(uint16x4_t lowpart, uint32x4_t lhs, uint32x4_t rhs) {
-  // CHECK: raddhn2.8h v0, v1, v2
-  return vraddhn_high_s32(lowpart, lhs, rhs);
-}
-
-uint32x4_t test_vraddhn_high_u64(uint32x2_t lowpart, uint64x2_t lhs, uint64x2_t rhs) {
-  // CHECK: raddhn2.4s v0, v1, v2
-  return vraddhn_high_s64(lowpart, lhs, rhs);
-}
-
-int8x16_t test_vmovn_high_s16(int8x8_t lowpart, int16x8_t wide) {
-  // CHECK: xtn2.16b v0, v1
-  return vmovn_high_s16(lowpart, wide);
-}
-
-int16x8_t test_vmovn_high_s32(int16x4_t lowpart, int32x4_t wide) {
-  // CHECK: xtn2.8h v0, v1
-  return vmovn_high_s32(lowpart, wide);
-}
-
-int32x4_t test_vmovn_high_s64(int32x2_t lowpart, int64x2_t wide) {
-  // CHECK: xtn2.4s v0, v1
-  return vmovn_high_s64(lowpart, wide);
-}
-
-uint8x16_t test_vmovn_high_u16(uint8x8_t lowpart, uint16x8_t wide) {
-  // CHECK: xtn2.16b v0, v1
-  return vmovn_high_u16(lowpart, wide);
-}
-
-uint16x8_t test_vmovn_high_u32(uint16x4_t lowpart, uint32x4_t wide) {
-  // CHECK: xtn2.8h v0, v1
-  return vmovn_high_u32(lowpart, wide);
-}
-
-uint32x4_t test_vmovn_high_u64(uint32x2_t lowpart, uint64x2_t wide) {
-  // CHECK: xtn2.4s v0, v1
-  return vmovn_high_u64(lowpart, wide);
-}
-
-int8x16_t test_vqmovn_high_s16(int8x8_t lowpart, int16x8_t wide) {
-  // CHECK: sqxtn2.16b v0, v1
-  return vqmovn_high_s16(lowpart, wide);
-}
-
-int16x8_t test_vqmovn_high_s32(int16x4_t lowpart, int32x4_t wide) {
-  // CHECK: sqxtn2.8h v0, v1
-  return vqmovn_high_s32(lowpart, wide);
-}
-
-int32x4_t test_vqmovn_high_s64(int32x2_t lowpart, int64x2_t wide) {
-  // CHECK: sqxtn2.4s v0, v1
-  return vqmovn_high_s64(lowpart, wide);
-}
-
-uint8x16_t test_vqmovn_high_u16(uint8x8_t lowpart, int16x8_t wide) {
-  // CHECK: uqxtn2.16b v0, v1
-  return vqmovn_high_u16(lowpart, wide);
-}
-
-uint16x8_t test_vqmovn_high_u32(uint16x4_t lowpart, int32x4_t wide) {
-  // CHECK: uqxtn2.8h v0, v1
-  return vqmovn_high_u32(lowpart, wide);
-}
-
-uint32x4_t test_vqmovn_high_u64(uint32x2_t lowpart, int64x2_t wide) {
-  // CHECK: uqxtn2.4s v0, v1
-  return vqmovn_high_u64(lowpart, wide);
-}
-
-uint8x16_t test_vqmovun_high_s16(uint8x8_t lowpart, int16x8_t wide) {
-  // CHECK: sqxtun2.16b v0, v1
-  return vqmovun_high_s16(lowpart, wide);
-}
-
-uint16x8_t test_vqmovun_high_s32(uint16x4_t lowpart, int32x4_t wide) {
-  // CHECK: sqxtun2.8h v0, v1
-  return vqmovun_high_s32(lowpart, wide);
-}
-
-uint32x4_t test_vqmovun_high_s64(uint32x2_t lowpart, int64x2_t wide) {
-  // CHECK: sqxtun2.4s v0, v1
-  return vqmovun_high_s64(lowpart, wide);
-}
-
-float32x4_t test_vcvtx_high_f32_f64(float32x2_t lowpart, float64x2_t wide) {
-  // CHECK: fcvtxn2 v0.4s, v1.2d
-  return vcvtx_high_f32_f64(lowpart, wide);
-}
-
-float64x2_t test_vcvt_f64_f32(float32x2_t x) {
-  // CHECK: fcvtl v0.2d, v0.2s
-  return vcvt_f64_f32(x);
-}
-
-float64x2_t test_vcvt_high_f64_f32(float32x4_t x) {
-  // CHECK: fcvtl2 v0.2d, v0.4s
-  return vcvt_high_f64_f32(x);
-}
-
-float32x2_t test_vcvt_f32_f64(float64x2_t v) {
-  // CHECK: fcvtn v0.2s, v0.2d
-  return vcvt_f32_f64(v);
-}
-
-float32x4_t test_vcvt_high_f32_f64(float32x2_t x, float64x2_t v) {
-  // CHECK: fcvtn2 v0.4s, v1.2d
-  return vcvt_high_f32_f64(x, v);
-}
-
-float32x2_t test_vcvtx_f32_f64(float64x2_t v) {
-  // CHECK: fcvtxn v0.2s, v0.2d
-  return vcvtx_f32_f64(v);
-}
diff --git a/test/CodeGen/arm64_vCMP.c b/test/CodeGen/arm64_vCMP.c
deleted file mode 100644
index a302128af9b82..0000000000000
--- a/test/CodeGen/arm64_vCMP.c
+++ /dev/null
@@ -1,108 +0,0 @@
-// RUN: %clang_cc1 -O1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - -emit-llvm %s | FileCheck %s
-
-// Test ARM64 SIMD fused multiply add intrinsics
-
-#include <arm_neon.h>
-
-int64x2_t test_vabsq_s64(int64x2_t a1) {
-  // CHECK: test_vabsq_s64
-  return vabsq_s64(a1);
-  // CHECK: llvm.aarch64.neon.abs.v2i64
-  // CHECK-NEXT: ret
-}
-
-int64_t test_vceqd_s64(int64_t a1, int64_t a2) {
-  // CHECK: test_vceqd_s64
-  return vceqd_s64(a1, a2);
-  // CHECK: [[BIT:%[0-9a-zA-Z.]+]] = icmp eq i64 %a1, %a2
-  // CHECK: sext i1 [[BIT]] to i64
-}
-
-int64_t test_vceqd_f64(float64_t a1, float64_t a2) {
-  // CHECK: test_vceqd_f64
-  return vceqd_f64(a1, a2);
-  // CHECK: [[BIT:%[0-9a-zA-Z.]+]] = fcmp oeq double %a1, %a2
-  // CHECK: sext i1 [[BIT]] to i64
-}
-
-uint64_t test_vcgtd_u64(uint64_t a1, uint64_t a2) {
-  // CHECK: test_vcgtd_u64
-  return vcgtd_u64(a1, a2);
-  // CHECK: [[BIT:%[0-9a-zA-Z.]+]] = icmp ugt i64 %a1, %a2
-  // CHECK: sext i1 [[BIT]] to i64
-}
-
-uint64_t test_vcled_u64(uint64_t a1, uint64_t a2) {
-  // CHECK: test_vcled_u64
-  return vcled_u64(a1, a2);
-  // CHECK: [[BIT:%[0-9a-zA-Z.]+]] = icmp ule i64 %a1, %a2
-  // CHECK: sext i1 [[BIT]] to i64
-}
-
-int64_t test_vceqzd_s64(int64_t a1) {
-  // CHECK: test_vceqzd_s64
-  return vceqzd_s64(a1);
-  // CHECK: [[BIT:%[0-9a-zA-Z.]+]] = icmp eq i64 %a1, 0
-  // CHECK: sext i1 [[BIT]] to i64
-}
-
-uint64x2_t test_vceqq_u64(uint64x2_t a1, uint64x2_t a2) {
-  // CHECK: test_vceqq_u64
-  return vceqq_u64(a1, a2);
-  // CHECK:  icmp eq <2 x i64> %a1, %a2
-}
-
-uint64x2_t test_vcgeq_s64(int64x2_t a1, int64x2_t a2) {
-  // CHECK: test_vcgeq_s64
-  return vcgeq_s64(a1, a2);
-  // CHECK:  icmp sge <2 x i64> %a1, %a2
-}
-
-uint64x2_t test_vcgeq_u64(uint64x2_t a1, uint64x2_t a2) {
-  // CHECK: test_vcgeq_u64
-  return vcgeq_u64(a1, a2);
-  // CHECK:  icmp uge <2 x i64> %a1, %a2
-}
-
-uint64x2_t test_vcgtq_s64(int64x2_t a1, int64x2_t a2) {
-  // CHECK: test_vcgtq_s64
-  return vcgtq_s64(a1, a2);
-  // CHECK: icmp sgt <2 x i64> %a1, %a2
-}
-
-uint64x2_t test_vcgtq_u64(uint64x2_t a1, uint64x2_t a2) {
-  // CHECK: test_vcgtq_u64
-  return vcgtq_u64(a1, a2);
-  // CHECK: icmp ugt <2 x i64> %a1, %a2
-}
-
-uint64x2_t test_vcleq_s64(int64x2_t a1, int64x2_t a2) {
-  // CHECK: test_vcleq_s64
-  return vcleq_s64(a1, a2);
-  // CHECK: icmp sle <2 x i64> %a1, %a2
-}
-
-uint64x2_t test_vcleq_u64(uint64x2_t a1, uint64x2_t a2) {
-  // CHECK: test_vcleq_u64
-  return vcleq_u64(a1, a2);
-  // CHECK: icmp ule <2 x i64> %a1, %a2
-}
-
-uint64x2_t test_vcltq_s64(int64x2_t a1, int64x2_t a2) {
-  // CHECK: test_vcltq_s64
-  return vcltq_s64(a1, a2);
-  // CHECK: icmp slt <2 x i64> %a1, %a2
-}
-
-uint64x2_t test_vcltq_u64(uint64x2_t a1, uint64x2_t a2) {
-  // CHECK: test_vcltq_u64
-  return vcltq_u64(a1, a2);
-  // CHECK: icmp ult <2 x i64> %a1, %a2
-}
-
-int64x2_t test_vqabsq_s64(int64x2_t a1) {
-  // CHECK: test_vqabsq_s64
-  return vqabsq_s64(a1);
-  // CHECK: llvm.aarch64.neon.sqabs.v2i64(<2 x i64> %a1)
-  // CHECK-NEXT: ret
-}
diff --git a/test/CodeGen/arm64_vLdStNum_lane.c b/test/CodeGen/arm64_vLdStNum_lane.c
deleted file mode 100644
index 85229d5a57f6d..0000000000000
--- a/test/CodeGen/arm64_vLdStNum_lane.c
+++ /dev/null
@@ -1,141 +0,0 @@
-// RUN: %clang_cc1 -O1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - -emit-llvm %s | FileCheck %s
-// Test ARM64 SIMD load and stores of an N-element structure  intrinsics
-
-#include <arm_neon.h>
-
-int64x2x2_t test_vld2q_lane_s64(const void * a1, int64x2x2_t a2) {
-  // CHECK: test_vld2q_lane_s64
-  return vld2q_lane_s64(a1, a2, 1);
-  // CHECK: llvm.aarch64.neon.ld2lane.v2i64.p0i8
-}
-
-uint64x2x2_t test_vld2q_lane_u64(const void * a1, uint64x2x2_t a2) {
-  // CHECK: test_vld2q_lane_u64
-  return vld2q_lane_u64(a1, a2, 1);
-  // CHECK: llvm.aarch64.neon.ld2lane.v2i64.p0i8
-}
-
-int64x1x2_t test_vld2_lane_s64(const void * a1, int64x1x2_t a2) {
-  // CHECK: test_vld2_lane_s64
-  return vld2_lane_s64(a1, a2, 0);
-  // CHECK: llvm.aarch64.neon.ld2lane.v1i64.p0i8
-}
-
-uint64x1x2_t test_vld2_lane_u64(const void * a1, uint64x1x2_t a2) {
-  // CHECK: test_vld2_lane_u64
-  return vld2_lane_u64(a1, a2, 0);
-  // CHECK: llvm.aarch64.neon.ld2lane.v1i64.p0i8
-}
-
-poly8x16x2_t test_vld2q_lane_p8(const void * a1, poly8x16x2_t a2) {
-  // CHECK: test_vld2q_lane_p8
-  return vld2q_lane_p8(a1, a2, 0);
-  // CHECK: extractvalue {{.*}} 0{{ *$}}
-  // CHECK: extractvalue {{.*}} 1{{ *$}}
-}
-
-uint8x16x2_t test_vld2q_lane_u8(const void * a1, uint8x16x2_t a2) {
-  // CHECK: test_vld2q_lane_u8
-  return vld2q_lane_u8(a1, a2, 0);
-  // CHECK: llvm.aarch64.neon.ld2lane.v16i8.p0i8
-}
-
-int64x2x3_t test_vld3q_lane_s64(const void * a1, int64x2x3_t a2) {
-  // CHECK: test_vld3q_lane_s64
-  return vld3q_lane_s64(a1, a2, 1);
-  // CHECK: llvm.aarch64.neon.ld3lane.v2i64.p0i8
-}
-
-uint64x2x3_t test_vld3q_lane_u64(const void * a1, uint64x2x3_t a2) {
-  // CHECK: test_vld3q_lane_u64
-  return vld3q_lane_u64(a1, a2, 1);
-  // CHECK: llvm.aarch64.neon.ld3lane.v2i64.p0i8
-}
-
-int64x1x3_t test_vld3_lane_s64(const void * a1, int64x1x3_t a2) {
-  // CHECK: test_vld3_lane_s64
-  return vld3_lane_s64(a1, a2, 0);
-  // CHECK: llvm.aarch64.neon.ld3lane.v1i64.p0i8
-}
-
-uint64x1x3_t test_vld3_lane_u64(const void * a1, uint64x1x3_t a2) {
-  // CHECK: test_vld3_lane_u64
-  return vld3_lane_u64(a1, a2, 0);
-  // CHECK: llvm.aarch64.neon.ld3lane.v1i64.p0i8
-}
-
-int8x8x3_t test_vld3_lane_s8(const void * a1, int8x8x3_t a2) {
-  // CHECK: test_vld3_lane_s8
-  return vld3_lane_s8(a1, a2, 0);
-  // CHECK: llvm.aarch64.neon.ld3lane.v8i8.p0i8
-}
-
-poly8x16x3_t test_vld3q_lane_p8(const void * a1, poly8x16x3_t a2) {
-  // CHECK: test_vld3q_lane_p8
-  return vld3q_lane_p8(a1, a2, 0);
-  // CHECK: llvm.aarch64.neon.ld3lane.v16i8.p0i8
-}
-
-uint8x16x3_t test_vld3q_lane_u8(const void * a1, uint8x16x3_t a2) {
-  // CHECK: test_vld3q_lane_u8
-  return vld3q_lane_u8(a1, a2, 0);
-  // CHECK: llvm.aarch64.neon.ld3lane.v16i8.p0i8
-}
-
-int64x2x4_t test_vld4q_lane_s64(const void * a1, int64x2x4_t a2) {
-  // CHECK: test_vld4q_lane_s64
-  return vld4q_lane_s64(a1, a2, 0);
-  // CHECK: llvm.aarch64.neon.ld4lane.v2i64.p0i8
-}
-
-uint64x2x4_t test_vld4q_lane_u64(const void * a1, uint64x2x4_t a2) {
-  // CHECK: test_vld4q_lane_u64
-  return vld4q_lane_u64(a1, a2, 0);
-  // CHECK: llvm.aarch64.neon.ld4lane.v2i64.p0i8
-}
-
-int64x1x4_t test_vld4_lane_s64(const void * a1, int64x1x4_t a2) {
-  // CHECK: test_vld4_lane_s64
-  return vld4_lane_s64(a1, a2, 0);
-  // CHECK: llvm.aarch64.neon.ld4lane.v1i64.p0i8
-}
-
-uint64x1x4_t test_vld4_lane_u64(const void * a1, uint64x1x4_t a2) {
-  // CHECK: test_vld4_lane_u64
-  return vld4_lane_u64(a1, a2, 0);
-  // CHECK: llvm.aarch64.neon.ld4lane.v1i64.p0i8
-}
-
-int8x8x4_t test_vld4_lane_s8(const void * a1, int8x8x4_t a2) {
-  // CHECK: test_vld4_lane_s8
-  return vld4_lane_s8(a1, a2, 0);
-  // CHECK: llvm.aarch64.neon.ld4lane.v8i8.p0i8
-}
-
-uint8x8x4_t test_vld4_lane_u8(const void * a1, uint8x8x4_t a2) {
-  // CHECK: test_vld4_lane_u8
-  return vld4_lane_u8(a1, a2, 0);
-  // CHECK: llvm.aarch64.neon.ld4lane.v8i8.p0i8
-}
-
-poly8x16x4_t test_vld4q_lane_p8(const void * a1, poly8x16x4_t a2) {
-  // CHECK: test_vld4q_lane_p8
-  return vld4q_lane_p8(a1, a2, 0);
-  // CHECK: llvm.aarch64.neon.ld4lane.v16i8.p0i8
-}
-
-int8x16x4_t test_vld4q_lane_s8(const void * a1, int8x16x4_t a2) {
-  // CHECK: test_vld4q_lane_s8
-  return vld4q_lane_s8(a1, a2, 0);
-  // CHECK: extractvalue {{.*}} 0{{ *$}}
-  // CHECK: extractvalue {{.*}} 1{{ *$}}
-  // CHECK: extractvalue {{.*}} 2{{ *$}}
-  // CHECK: extractvalue {{.*}} 3{{ *$}}
-}
-
-uint8x16x4_t test_vld4q_lane_u8(const void * a1, uint8x16x4_t a2) {
-  // CHECK: test_vld4q_lane_u8
-  return vld4q_lane_u8(a1, a2, 0);
-  // CHECK: llvm.aarch64.neon.ld4lane.v16i8.p0i8
-}
-
diff --git a/test/CodeGen/arm64_vMaxMin.c b/test/CodeGen/arm64_vMaxMin.c
deleted file mode 100644
index a1dd2adb255b1..0000000000000
--- a/test/CodeGen/arm64_vMaxMin.c
+++ /dev/null
@@ -1,207 +0,0 @@
-// RUN: %clang_cc1 -O1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - -emit-llvm %s | FileCheck %s
-// RUN: %clang_cc1 -O1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - %s | FileCheck -check-prefix=CHECK-CODEGEN %s
-// REQUIRES: aarch64-registered-target
-// Test ARM64 SIMD max/min intrinsics
-
-#include <arm_neon.h>
-
-// Test a represntative sample of 8 and 16, signed and unsigned, 64 and 128 bit reduction
-int8_t test_vmaxv_s8(int8x8_t a1) {
-  // CHECK-LABEL: define i8 @test_vmaxv_s8(
-  return vmaxv_s8(a1);
-  // CHECK: call i32 @llvm.aarch64.neon.smaxv.i32.v8i8(
-}
-
-uint16_t test_vminvq_u16(uint16x8_t a1) {
-  // CHECK-LABEL: define i16 @test_vminvq_u16(
-  return vminvq_u16(a1);
-  // CHECK: call i32 @llvm.aarch64.neon.uminv.i32.v8i16(
-}
-
-// Test a represntative sample of 8 and 16, signed and unsigned, 64 and 128 bit pairwise
-uint8x8_t test_vmin_u8(uint8x8_t a1, uint8x8_t a2) {
-  // CHECK-LABEL: define <8 x i8> @test_vmin_u8(
-  return vmin_u8(a1, a2);
-  // CHECK: call <8 x i8> @llvm.aarch64.neon.umin.v8i8(
-}
-
-uint8x16_t test_vminq_u8(uint8x16_t a1, uint8x16_t a2) {
-  // CHECK-LABEL: define <16 x i8> @test_vminq_u8(
-  return vminq_u8(a1, a2);
-  // CHECK: call <16 x i8> @llvm.aarch64.neon.umin.v16i8(
-}
-
-int16x8_t test_vmaxq_s16(int16x8_t a1, int16x8_t a2) {
-  // CHECK-LABEL: define <8 x i16> @test_vmaxq_s16(
-  return vmaxq_s16(a1, a2);
-  // CHECK: call <8 x i16> @llvm.aarch64.neon.smax.v8i16(
-}
-
-// Test the more complicated cases of [suf]32 and f64
-float64x2_t test_vmaxq_f64(float64x2_t a1, float64x2_t a2) {
-  // CHECK-LABEL: define <2 x double> @test_vmaxq_f64(
-  return vmaxq_f64(a1, a2);
-  // CHECK: call <2 x double> @llvm.aarch64.neon.fmax.v2f64(
-}
-
-float32x4_t test_vmaxq_f32(float32x4_t a1, float32x4_t a2) {
-  // CHECK-LABEL: define <4 x float> @test_vmaxq_f32(
-  return vmaxq_f32(a1, a2);
-  // CHECK: call <4 x float> @llvm.aarch64.neon.fmax.v4f32(
-}
-
-float64x2_t test_vminq_f64(float64x2_t a1, float64x2_t a2) {
-  // CHECK-LABEL: define <2 x double> @test_vminq_f64(
-  return vminq_f64(a1, a2);
-  // CHECK: call <2 x double> @llvm.aarch64.neon.fmin.v2f64(
-}
-
-float32x2_t test_vmax_f32(float32x2_t a1, float32x2_t a2) {
-  // CHECK-LABEL: define <2 x float> @test_vmax_f32(
-  return vmax_f32(a1, a2);
-  // CHECK: call <2 x float> @llvm.aarch64.neon.fmax.v2f32(
-}
-
-int32x2_t test_vmax_s32(int32x2_t a1, int32x2_t a2) {
-  // CHECK-LABEL: define <2 x i32> @test_vmax_s32(
-  return vmax_s32(a1, a2);
-  // CHECK: call <2 x i32> @llvm.aarch64.neon.smax.v2i32(
-}
-
-uint32x2_t test_vmin_u32(uint32x2_t a1, uint32x2_t a2) {
-  // CHECK-LABEL: define <2 x i32> @test_vmin_u32(
-  return vmin_u32(a1, a2);
-  // CHECK: call <2 x i32> @llvm.aarch64.neon.umin.v2i32(
-}
-
-float32_t test_vmaxnmv_f32(float32x2_t a1) {
-  // CHECK-LABEL: define float @test_vmaxnmv_f32(
-  return vmaxnmv_f32(a1);
-  // CHECK: llvm.aarch64.neon.fmaxnmv.f32.v2f32
-  // CHECK-NEXT: ret
-}
-
-// this doesn't translate into a valid instruction, regardless of what the
-// ARM doc says.
-#if 0
-float64_t test_vmaxnmvq_f64(float64x2_t a1) {
-  // CHECK@ test_vmaxnmvq_f64
-  return vmaxnmvq_f64(a1);
-  // CHECK@ llvm.aarch64.neon.saddlv.i64.v2i32
-  // CHECK-NEXT@ ret
-}
-#endif
-
-float32_t test_vmaxnmvq_f32(float32x4_t a1) {
-  // CHECK-LABEL: define float @test_vmaxnmvq_f32(
-  return vmaxnmvq_f32(a1);
-  // CHECK: call float @llvm.aarch64.neon.fmaxnmv.f32.v4f32(
-  // CHECK-NEXT: ret
-}
-
-float32_t test_vmaxv_f32(float32x2_t a1) {
-  // CHECK-LABEL: define float @test_vmaxv_f32(
-  return vmaxv_f32(a1);
-  // CHECK: call float @llvm.aarch64.neon.fmaxv.f32.v2f32(
-  // FIXME check that the 2nd and 3rd arguments are the same V register below
-  // CHECK-CODEGEN: fmaxp.2s
-  // CHECK-NEXT: ret
-}
-
-int32_t test_vmaxv_s32(int32x2_t a1) {
-  // CHECK-LABEL: define i32 @test_vmaxv_s32(
-  return vmaxv_s32(a1);
-  // CHECK: call i32 @llvm.aarch64.neon.smaxv.i32.v2i32(
-  // FIXME check that the 2nd and 3rd arguments are the same V register below
-  // CHECK-CODEGEN: smaxp.2s
-  // CHECK-NEXT: ret
-}
-
-uint32_t test_vmaxv_u32(uint32x2_t a1) {
-  // CHECK-LABEL: define i32 @test_vmaxv_u32(
-  return vmaxv_u32(a1);
-  // CHECK: call i32 @llvm.aarch64.neon.umaxv.i32.v2i32(
-  // FIXME check that the 2nd and 3rd arguments are the same V register below
-  // CHECK-CODEGEN: umaxp.2s
-  // CHECK-NEXT: ret
-}
-
-// FIXME punt on this for now; don't forget to fix CHECKs
-#if 0
-float64_t test_vmaxvq_f64(float64x2_t a1) {
-  // CHECK@ test_vmaxvq_f64
-  return vmaxvq_f64(a1);
-  // CHECK@ llvm.aarch64.neon.fmaxv.i64.v2f64
-  // CHECK-NEXT@ ret
-}
-#endif
-
-float32_t test_vmaxvq_f32(float32x4_t a1) {
-  // CHECK-LABEL: define float @test_vmaxvq_f32(
-  return vmaxvq_f32(a1);
-  // CHECK: call float @llvm.aarch64.neon.fmaxv.f32.v4f32(
-  // CHECK-NEXT: ret
-}
-
-float32_t test_vminnmv_f32(float32x2_t a1) {
-  // CHECK-LABEL: define float @test_vminnmv_f32(
-  return vminnmv_f32(a1);
-  // CHECK: call float @llvm.aarch64.neon.fminnmv.f32.v2f32(
-  // CHECK-NEXT: ret
-}
-
-float32_t test_vminvq_f32(float32x4_t a1) {
-  // CHECK-LABEL: define float @test_vminvq_f32(
-  return vminvq_f32(a1);
-  // CHECK: call float @llvm.aarch64.neon.fminv.f32.v4f32(
-  // CHECK-NEXT: ret
-}
-
-// this doesn't translate into a valid instruction, regardless of what the ARM
-// doc says.
-#if 0
-float64_t test_vminnmvq_f64(float64x2_t a1) {
-  // CHECK@ test_vminnmvq_f64
-  return vminnmvq_f64(a1);
-  // CHECK@ llvm.aarch64.neon.saddlv.i64.v2i32
-  // CHECK-NEXT@ ret
-}
-#endif
-
-float32_t test_vminnmvq_f32(float32x4_t a1) {
-  // CHECK-LABEL: define float @test_vminnmvq_f32(
-  return vminnmvq_f32(a1);
-  // CHECK: call float @llvm.aarch64.neon.fminnmv.f32.v4f32(
-  // CHECK-NEXT: ret
-}
-
-float32_t test_vminv_f32(float32x2_t a1) {
-  // CHECK-LABEL: define float @test_vminv_f32(
-  return vminv_f32(a1);
-  // CHECK: call float @llvm.aarch64.neon.fminv.f32.v2f32(
-  // CHECK-NEXT: ret
-}
-
-int32_t test_vminv_s32(int32x2_t a1) {
-  // CHECK-LABEL: define i32 @test_vminv_s32(
-  return vminv_s32(a1);
-  // CHECK: call i32 @llvm.aarch64.neon.sminv.i32.v2i32(
-  // CHECK-CODEGEN: sminp.2s
-  // CHECK-NEXT: ret
-}
-
-uint32_t test_vminv_u32(uint32x2_t a1) {
-  // CHECK-LABEL: define i32 @test_vminv_u32(
-  return vminv_u32(a1);
-  // CHECK: call i32 @llvm.aarch64.neon.uminv.i32.v2i32(
-}
-
-// FIXME punt on this for now; don't forget to fix CHECKs
-#if 0
-float64_t test_vminvq_f64(float64x2_t a1) {
-  // CHECK@ test_vminvq_f64
-  return vminvq_f64(a1);
-  // CHECK@ llvm.aarch64.neon.saddlv.i64.v2i32
-  // CHECK-NEXT@ ret
-}
-#endif
diff --git a/test/CodeGen/arm64_vadd.c b/test/CodeGen/arm64_vadd.c
deleted file mode 100644
index 7b2913f1a391b..0000000000000
--- a/test/CodeGen/arm64_vadd.c
+++ /dev/null
@@ -1,102 +0,0 @@
-// RUN: %clang_cc1 -O1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - -emit-llvm %s | FileCheck %s
-// Test ARM64 SIMD add intrinsics
-
-#include <arm_neon.h>
-int64_t test_vaddlv_s32(int32x2_t a1) {
-  // CHECK: test_vaddlv_s32
-  return vaddlv_s32(a1);
-  // CHECK: llvm.aarch64.neon.saddlv.i64.v2i32
-  // CHECK-NEXT: ret
-}
-
-uint64_t test_vaddlv_u32(uint32x2_t a1) {
-  // CHECK: test_vaddlv_u32
-  return vaddlv_u32(a1);
-  // CHECK: llvm.aarch64.neon.uaddlv.i64.v2i32
-  // CHECK-NEXT: ret
-}
-
-int8_t test_vaddv_s8(int8x8_t a1) {
-  // CHECK: test_vaddv_s8
-  return vaddv_s8(a1);
-  // CHECK: llvm.aarch64.neon.saddv.i32.v8i8
-  // don't check for return here (there's a trunc?)
-}
-
-int16_t test_vaddv_s16(int16x4_t a1) {
-  // CHECK: test_vaddv_s16
-  return vaddv_s16(a1);
-  // CHECK: llvm.aarch64.neon.saddv.i32.v4i16
-  // don't check for return here (there's a trunc?)
-}
-
-int32_t test_vaddv_s32(int32x2_t a1) {
-  // CHECK: test_vaddv_s32
-  return vaddv_s32(a1);
-  // CHECK: llvm.aarch64.neon.saddv.i32.v2i32
-  // CHECK-NEXT: ret
-}
-
-uint8_t test_vaddv_u8(int8x8_t a1) {
-  // CHECK: test_vaddv_u8
-  return vaddv_u8(a1);
-  // CHECK: llvm.aarch64.neon.uaddv.i32.v8i8
-  // don't check for return here (there's a trunc?)
-}
-
-uint16_t test_vaddv_u16(int16x4_t a1) {
-  // CHECK: test_vaddv_u16
-  return vaddv_u16(a1);
-  // CHECK: llvm.aarch64.neon.uaddv.i32.v4i16
-  // don't check for return here (there's a trunc?)
-}
-
-uint32_t test_vaddv_u32(int32x2_t a1) {
-  // CHECK: test_vaddv_u32
-  return vaddv_u32(a1);
-  // CHECK: llvm.aarch64.neon.uaddv.i32.v2i32
-  // CHECK-NEXT: ret
-}
-
-int8_t test_vaddvq_s8(int8x16_t a1) {
-  // CHECK: test_vaddvq_s8
-  return vaddvq_s8(a1);
-  // CHECK: llvm.aarch64.neon.saddv.i32.v16i8
-  // don't check for return here (there's a trunc?)
-}
-
-int16_t test_vaddvq_s16(int16x8_t a1) {
-  // CHECK: test_vaddvq_s16
-  return vaddvq_s16(a1);
-  // CHECK: llvm.aarch64.neon.saddv.i32.v8i16
-  // don't check for return here (there's a trunc?)
-}
-
-int32_t test_vaddvq_s32(int32x4_t a1) {
-  // CHECK: test_vaddvq_s32
-  return vaddvq_s32(a1);
-  // CHECK: llvm.aarch64.neon.saddv.i32.v4i32
-  // CHECK-NEXT: ret
-}
-
-uint8_t test_vaddvq_u8(int8x16_t a1) {
-  // CHECK: test_vaddvq_u8
-  return vaddvq_u8(a1);
-  // CHECK: llvm.aarch64.neon.uaddv.i32.v16i8
-  // don't check for return here (there's a trunc?)
-}
-
-uint16_t test_vaddvq_u16(int16x8_t a1) {
-  // CHECK: test_vaddvq_u16
-  return vaddvq_u16(a1);
-  // CHECK: llvm.aarch64.neon.uaddv.i32.v8i16
-  // don't check for return here (there's a trunc?)
-}
-
-uint32_t test_vaddvq_u32(int32x4_t a1) {
-  // CHECK: test_vaddvq_u32
-  return vaddvq_u32(a1);
-  // CHECK: llvm.aarch64.neon.uaddv.i32.v4i32
-  // CHECK-NEXT: ret
-}
-
diff --git a/test/CodeGen/arm64_vca.c b/test/CodeGen/arm64_vca.c
deleted file mode 100644
index 00cc283063b8c..0000000000000
--- a/test/CodeGen/arm64_vca.c
+++ /dev/null
@@ -1,59 +0,0 @@
-// RUN: %clang_cc1 -O1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - -emit-llvm %s | FileCheck %s
-// Test ARM64 vector compare absolute intrinsics
-
-#include <arm_neon.h>
-
-uint32x2_t test_vcale_f32(float32x2_t a1, float32x2_t a2) {
-  // CHECK: test_vcale_f32
-  return vcale_f32(a1, a2);
-  // CHECK: llvm.aarch64.neon.facge.v2i32.v2f32
-  // no check for ret here, as there is a bitcast
-}
-
-uint32x4_t test_vcaleq_f32(float32x4_t a1, float32x4_t a2) {
-  // CHECK: test_vcaleq_f32
-  return vcaleq_f32(a1, a2);
-  // CHECK: llvm.aarch64.neon.facge.v4i32.v4f32{{.*a2,.*a1}}
-  // no check for ret here, as there is a bitcast
-}
-
-uint32x2_t test_vcalt_f32(float32x2_t a1, float32x2_t a2) {
-  // CHECK: test_vcalt_f32
-  return vcalt_f32(a1, a2);
-  // CHECK: llvm.aarch64.neon.facgt.v2i32.v2f32{{.*a2,.*a1}}
-  // no check for ret here, as there is a bitcast
-}
-
-uint32x4_t test_vcaltq_f32(float32x4_t a1, float32x4_t a2) {
-  // CHECK: test_vcaltq_f32
-  return vcaltq_f32(a1, a2);
-  // CHECK: llvm.aarch64.neon.facgt.v4i32.v4f32{{.*a2,.*a1}}
-}
-
-uint64x2_t test_vcagtq_f64(float64x2_t a1, float64x2_t a2) {
-  // CHECK: test_vcagtq_f64
-  return vcagtq_f64(a1, a2);
-  // CHECK: llvm.aarch64.neon.facgt.v2i64.v2f64{{.*a1,.*a2}}
-  // no check for ret here, as there is a bitcast
-}
-
-uint64x2_t test_vcaltq_f64(float64x2_t a1, float64x2_t a2) {
-  // CHECK: test_vcaltq_f64
-  return vcaltq_f64(a1, a2);
-  // CHECK: llvm.aarch64.neon.facgt.v2i64.v2f64{{.*a2,.*a1}}
-  // no check for ret here, as there is a bitcast
-}
-
-uint64x2_t test_vcageq_f64(float64x2_t a1, float64x2_t a2) {
-  // CHECK: test_vcageq_f64
-  return vcageq_f64(a1, a2);
-  // CHECK: llvm.aarch64.neon.facge.v2i64.v2f64{{.*a1,.*a2}}
-  // no check for ret here, as there is a bitcast
-}
-
-uint64x2_t test_vcaleq_f64(float64x2_t a1, float64x2_t a2) {
-  // CHECK: test_vcaleq_f64
-  return vcaleq_f64(a1, a2);
-  // CHECK: llvm.aarch64.neon.facge.v2i64.v2f64{{.*a2,.*a1}}
-  // no check for ret here, as there is a bitcast
-}
diff --git a/test/CodeGen/arm64_vcopy.c b/test/CodeGen/arm64_vcopy.c
index 990d4f658cbc6..4c0143016fbe8 100644
--- a/test/CodeGen/arm64_vcopy.c
+++ b/test/CodeGen/arm64_vcopy.c
@@ -1,69 +1,121 @@
-// RUN: %clang_cc1 -O1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - -emit-llvm %s | FileCheck %s
+// RUN: %clang_cc1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - -emit-llvm %s | opt -S -mem2reg | FileCheck %s
 
 // Test ARM64 SIMD copy vector element to vector element: vcopyq_lane*
 
 #include <arm_neon.h>
 
+// CHECK-LABEL: define <16 x i8> @test_vcopyq_laneq_s8(<16 x i8> %a1, <16 x i8> %a2) #0 {
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <16 x i8> %a2, i32 13
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <16 x i8> %a1, i8 [[VGETQ_LANE]], i32 3
+// CHECK:   ret <16 x i8> [[VSET_LANE]]
 int8x16_t test_vcopyq_laneq_s8(int8x16_t a1, int8x16_t a2) {
-  // CHECK-LABEL: test_vcopyq_laneq_s8
   return vcopyq_laneq_s8(a1, (int64_t) 3, a2, (int64_t) 13);
-  // CHECK: shufflevector <16 x i8> %a1, <16 x i8> %a2, <16 x i32> <i32 0, i32 1, i32 2, i32 29, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 }
 
+// CHECK-LABEL: define <16 x i8> @test_vcopyq_laneq_u8(<16 x i8> %a1, <16 x i8> %a2) #0 {
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <16 x i8> %a2, i32 13
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <16 x i8> %a1, i8 [[VGETQ_LANE]], i32 3
+// CHECK:   ret <16 x i8> [[VSET_LANE]]
 uint8x16_t test_vcopyq_laneq_u8(uint8x16_t a1, uint8x16_t a2) {
-  // CHECK-LABEL: test_vcopyq_laneq_u8
   return vcopyq_laneq_u8(a1, (int64_t) 3, a2, (int64_t) 13);
-  // CHECK: shufflevector <16 x i8> %a1, <16 x i8> %a2, <16 x i32> <i32 0, i32 1, i32 2, i32 29, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vcopyq_laneq_s16(<8 x i16> %a1, <8 x i16> %a2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a2 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %a1 to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP3]], i16 [[VGETQ_LANE]], i32 3
+// CHECK:   ret <8 x i16> [[VSET_LANE]]
 int16x8_t test_vcopyq_laneq_s16(int16x8_t a1, int16x8_t a2) {
-  // CHECK-LABEL: test_vcopyq_laneq_s16
   return vcopyq_laneq_s16(a1, (int64_t) 3, a2, (int64_t) 7);
-  // CHECK: shufflevector <8 x i16> %a1, <8 x i16> %a2, <8 x i32> <i32 0, i32 1, i32 2, i32 15, i32 4, i32 5, i32 6, i32 7>
 
 }
 
+// CHECK-LABEL: define <8 x i16> @test_vcopyq_laneq_u16(<8 x i16> %a1, <8 x i16> %a2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a2 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %a1 to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP3]], i16 [[VGETQ_LANE]], i32 3
+// CHECK:   ret <8 x i16> [[VSET_LANE]]
 uint16x8_t test_vcopyq_laneq_u16(uint16x8_t a1, uint16x8_t a2) {
-  // CHECK-LABEL: test_vcopyq_laneq_u16
   return vcopyq_laneq_u16(a1, (int64_t) 3, a2, (int64_t) 7);
-  // CHECK: shufflevector <8 x i16> %a1, <8 x i16> %a2, <8 x i32> <i32 0, i32 1, i32 2, i32 15, i32 4, i32 5, i32 6, i32 7>
 
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcopyq_laneq_s32(<4 x i32> %a1, <4 x i32> %a2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a2 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %a1 to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[VGETQ_LANE]], i32 3
+// CHECK:   ret <4 x i32> [[VSET_LANE]]
 int32x4_t test_vcopyq_laneq_s32(int32x4_t a1, int32x4_t a2) {
-  // CHECK-LABEL: test_vcopyq_laneq_s32
   return vcopyq_laneq_s32(a1, (int64_t) 3, a2, (int64_t) 3);
-  // CHECK: shufflevector <4 x i32> %a1, <4 x i32> %a2, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
 }
 
+// CHECK-LABEL: define <4 x i32> @test_vcopyq_laneq_u32(<4 x i32> %a1, <4 x i32> %a2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a2 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %a1 to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[VGETQ_LANE]], i32 3
+// CHECK:   ret <4 x i32> [[VSET_LANE]]
 uint32x4_t test_vcopyq_laneq_u32(uint32x4_t a1, uint32x4_t a2) {
-  // CHECK-LABEL: test_vcopyq_laneq_u32
   return vcopyq_laneq_u32(a1, (int64_t) 3, a2, (int64_t) 3);
-  // CHECK: shufflevector <4 x i32> %a1, <4 x i32> %a2, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vcopyq_laneq_s64(<2 x i64> %a1, <2 x i64> %a2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a2 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i64> %a1 to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[VGETQ_LANE]], i32 0
+// CHECK:   ret <2 x i64> [[VSET_LANE]]
 int64x2_t test_vcopyq_laneq_s64(int64x2_t a1, int64x2_t a2) {
-  // CHECK-LABEL: test_vcopyq_laneq_s64
   return vcopyq_laneq_s64(a1, (int64_t) 0, a2, (int64_t) 1);
-  // CHECK: shufflevector <2 x i64> %a1, <2 x i64> %a2, <2 x i32> <i32 3, i32 1>
 }
 
+// CHECK-LABEL: define <2 x i64> @test_vcopyq_laneq_u64(<2 x i64> %a1, <2 x i64> %a2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a2 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i64> %a1 to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[VGETQ_LANE]], i32 0
+// CHECK:   ret <2 x i64> [[VSET_LANE]]
 uint64x2_t test_vcopyq_laneq_u64(uint64x2_t a1, uint64x2_t a2) {
-  // CHECK-LABEL: test_vcopyq_laneq_u64
   return vcopyq_laneq_u64(a1, (int64_t) 0, a2, (int64_t) 1);
-  // CHECK: shufflevector <2 x i64> %a1, <2 x i64> %a2, <2 x i32> <i32 3, i32 1>
 }
 
+// CHECK-LABEL: define <4 x float> @test_vcopyq_laneq_f32(<4 x float> %a1, <4 x float> %a2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a2 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %a1 to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <4 x float> [[TMP3]], float [[VGETQ_LANE]], i32 0
+// CHECK:   ret <4 x float> [[VSET_LANE]]
 float32x4_t test_vcopyq_laneq_f32(float32x4_t a1, float32x4_t a2) {
-  // CHECK-LABEL: test_vcopyq_laneq_f32
   return vcopyq_laneq_f32(a1, 0, a2, 3);
-  // CHECK: shufflevector <4 x float> %a1, <4 x float> %a2, <4 x i32> <i32 7, i32 1, i32 2, i32 3>
 }
 
+// CHECK-LABEL: define <2 x double> @test_vcopyq_laneq_f64(<2 x double> %a1, <2 x double> %a2) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a2 to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double>
+// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x double> %a1 to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x double>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <2 x double> [[TMP3]], double [[VGETQ_LANE]], i32 0
+// CHECK:   ret <2 x double> [[VSET_LANE]]
 float64x2_t test_vcopyq_laneq_f64(float64x2_t a1, float64x2_t a2) {
-  // CHECK-LABEL: test_vcopyq_laneq_f64
   return vcopyq_laneq_f64(a1, 0, a2, 1);
-  // CHECK: shufflevector <2 x double> %a1, <2 x double> %a2, <2 x i32> <i32 3, i32 1>
 }
 
diff --git a/test/CodeGen/arm64_vcreate.c b/test/CodeGen/arm64_vcreate.c
index b9747525340ae..ddfa147705d50 100644
--- a/test/CodeGen/arm64_vcreate.c
+++ b/test/CodeGen/arm64_vcreate.c
@@ -1,7 +1,6 @@
-// RUN: %clang_cc1 -O1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - -emit-llvm %s | FileCheck %s
+// RUN: %clang_cc1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - -emit-llvm %s | opt -S -mem2reg | FileCheck %s
 // Test ARM64 SIMD vcreate intrinsics
 
-/*#include <arm_neon.h>*/
 #include <arm_neon.h>
 
 float32x2_t test_vcreate_f32(uint64_t a1) {
@@ -10,14 +9,3 @@ float32x2_t test_vcreate_f32(uint64_t a1) {
   // CHECK: bitcast {{.*}} to <2 x float>
   // CHECK-NEXT: ret
 }
-
-// FIXME enable when scalar_to_vector in backend is fixed.  Also, change
-// CHECK@ to CHECK<colon> and CHECK-NEXT@ to CHECK-NEXT<colon>
-/*
-float64x1_t test_vcreate_f64(uint64_t a1) {
-  // CHECK@ test_vcreate_f64
-  return vcreate_f64(a1);
-  // CHECK@ llvm.aarch64.neon.saddlv.i64.v2i32
-  // CHECK-NEXT@ ret
-}
-*/
diff --git a/test/CodeGen/arm64_vcvtfp.c b/test/CodeGen/arm64_vcvtfp.c
deleted file mode 100644
index e3dca8159931e..0000000000000
--- a/test/CodeGen/arm64_vcvtfp.c
+++ /dev/null
@@ -1,48 +0,0 @@
-// RUN: %clang_cc1 -O1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - -emit-llvm %s | FileCheck %s
-
-#include <arm_neon.h>
-
-float64x2_t test_vcvt_f64_f32(float32x2_t x) {
-  // CHECK-LABEL: test_vcvt_f64_f32
-  return vcvt_f64_f32(x);
-  // CHECK: fpext <2 x float> {{%.*}} to <2 x double>
-  // CHECK-NEXT: ret
-}
-
-float64x2_t test_vcvt_high_f64_f32(float32x4_t x) {
-  // CHECK-LABEL: test_vcvt_high_f64_f32
-  return vcvt_high_f64_f32(x);
-  // CHECK: [[HIGH:%.*]] = shufflevector <4 x float> {{%.*}}, <4 x float> undef, <2 x i32> <i32 2, i32 3>
-  // CHECK-NEXT: fpext <2 x float> [[HIGH]] to <2 x double>
-  // CHECK-NEXT: ret
-}
-
-float32x2_t test_vcvt_f32_f64(float64x2_t v) {
-  // CHECK: test_vcvt_f32_f64
-  return vcvt_f32_f64(v);
-  // CHECK: fptrunc <2 x double> {{%.*}} to <2 x float>
-  // CHECK-NEXT: ret
-}
-
-float32x4_t test_vcvt_high_f32_f64(float32x2_t x, float64x2_t v) {
-  // CHECK: test_vcvt_high_f32_f64
-  return vcvt_high_f32_f64(x, v);
-  // CHECK: [[TRUNC:%.*]] = fptrunc <2 x double> {{.*}} to <2 x float>
-  // CHECK-NEXT: shufflevector <2 x float> {{.*}}, <2 x float> [[TRUNC]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  // CHECK-NEXT: ret
-}
-
-float32x2_t test_vcvtx_f32_f64(float64x2_t v) {
-  // CHECK: test_vcvtx_f32_f64
-  return vcvtx_f32_f64(v);
-  // CHECK: llvm.aarch64.neon.fcvtxn.v2f32.v2f64
-  // CHECK-NEXT: ret
-}
-
-float32x4_t test_vcvtx_high_f32_f64(float32x2_t x, float64x2_t v) {
-  // CHECK: test_vcvtx_high_f32_f64
-  return vcvtx_high_f32_f64(x, v);
-  // CHECK: llvm.aarch64.neon.fcvtxn.v2f32.v2f64
-  // CHECK: shufflevector
-  // CHECK: ret
-}
diff --git a/test/CodeGen/arm64_vdupq_n_f64.c b/test/CodeGen/arm64_vdupq_n_f64.c
index ffba55cf8f560..58cc7f020e916 100644
--- a/test/CodeGen/arm64_vdupq_n_f64.c
+++ b/test/CodeGen/arm64_vdupq_n_f64.c
@@ -1,88 +1,78 @@
-// RUN: %clang_cc1 -O3 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - %s | FileCheck %s
-// RUN: %clang_cc1 -O3 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - -emit-llvm %s | \
-// RUN:   FileCheck -check-prefix=CHECK-IR %s
-// REQUIRES: aarch64-registered-target
-
-/// Test vdupq_n_f64 and vmovq_nf64 ARM64 intrinsics
-// <rdar://problem/11778405> ARM64: vdupq_n_f64 and vdupq_lane_f64 intrinsics
-// missing
-
+// RUN: %clang_cc1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -fallow-half-arguments-and-returns -S -o - -emit-llvm %s | opt -S -mem2reg | FileCheck %s
 
 #include <arm_neon.h>
 
 // vdupq_n_f64 -> dup.2d v0, v0[0]
 //
-float64x2_t test_vdupq_n_f64(float64_t w)
-{
+// CHECK-LABEL: define <2 x double> @test_vdupq_n_f64(double %w) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x double> undef, double %w, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double %w, i32 1
+// CHECK:   ret <2 x double> [[VECINIT1_I]]
+float64x2_t test_vdupq_n_f64(float64_t w) {
     return vdupq_n_f64(w);
-  // CHECK-LABEL: test_vdupq_n_f64:
-  // CHECK: dup.2d v0, v0[0]
-  // CHECK-NEXT: ret
 }
 
 // might as well test this while we're here
 // vdupq_n_f32 -> dup.4s v0, v0[0]
-float32x4_t test_vdupq_n_f32(float32_t w)
-{
+// CHECK-LABEL: define <4 x float> @test_vdupq_n_f32(float %w) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %w, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %w, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %w, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %w, i32 3
+// CHECK:   ret <4 x float> [[VECINIT3_I]]
+float32x4_t test_vdupq_n_f32(float32_t w) {
     return vdupq_n_f32(w);
-  // CHECK-LABEL: test_vdupq_n_f32:
-  // CHECK: dup.4s v0, v0[0]
-  // CHECK-NEXT: ret
 }
 
 // vdupq_lane_f64 -> dup.2d v0, v0[0]
 // this was in <rdar://problem/11778405>, but had already been implemented,
 // test anyway
-float64x2_t test_vdupq_lane_f64(float64x1_t V)
-{
+// CHECK-LABEL: define <2 x double> @test_vdupq_lane_f64(<1 x double> %V) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x double> %V, <1 x double> %V, <2 x i32> zeroinitializer
+// CHECK:   ret <2 x double> [[SHUFFLE]]
+float64x2_t test_vdupq_lane_f64(float64x1_t V) {
     return vdupq_lane_f64(V, 0);
-  // CHECK-LABEL: test_vdupq_lane_f64:
-  // CHECK: dup.2d v0, v0[0]
-  // CHECK-NEXT: ret
 }
 
 // vmovq_n_f64 -> dup Vd.2d,X0
 // this wasn't in <rdar://problem/11778405>, but it was between the vdups
-float64x2_t test_vmovq_n_f64(float64_t w)
-{
+// CHECK-LABEL: define <2 x double> @test_vmovq_n_f64(double %w) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x double> undef, double %w, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x double> [[VECINIT_I]], double %w, i32 1
+// CHECK:   ret <2 x double> [[VECINIT1_I]]
+float64x2_t test_vmovq_n_f64(float64_t w) {
   return vmovq_n_f64(w);
-  // CHECK-LABEL: test_vmovq_n_f64:
-  // CHECK: dup.2d v0, v0[0]
-  // CHECK-NEXT: ret
 }
 
-float16x4_t test_vmov_n_f16(float16_t *a1)
-{
-  // CHECK-IR-LABEL: test_vmov_n_f16
+// CHECK-LABEL: define <4 x half> @test_vmov_n_f16(half* %a1) #0 {
+// CHECK:   [[TMP0:%.*]] = load half, half* %a1, align 2
+// CHECK:   [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[TMP0]], i32 0
+// CHECK:   [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP0]], i32 1
+// CHECK:   [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[TMP0]], i32 2
+// CHECK:   [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[TMP0]], i32 3
+// CHECK:   ret <4 x half> [[VECINIT3]]
+float16x4_t test_vmov_n_f16(float16_t *a1) {
   return vmov_n_f16(*a1);
-  // CHECK-IR: insertelement {{.*}} i32 0{{ *$}}
-  // CHECK-IR: insertelement {{.*}} i32 1{{ *$}}
-  // CHECK-IR: insertelement {{.*}} i32 2{{ *$}}
-  // CHECK-IR: insertelement {{.*}} i32 3{{ *$}}
 }
 
-// Disable until scalar problem in backend is fixed. Change CHECK-IR@ to
-// CHECK-IR<colon>
 /*
-float64x1_t test_vmov_n_f64(float64_t a1)
-{
-  // CHECK-IR@ test_vmov_n_f64
+float64x1_t test_vmov_n_f64(float64_t a1) {
   return vmov_n_f64(a1);
-  // CHECK-IR@ insertelement {{.*}} i32 0{{ *$}}
 }
 */
 
-float16x8_t test_vmovq_n_f16(float16_t *a1)
-{
-  // CHECK-IR-LABEL: test_vmovq_n_f16
+// CHECK-LABEL: define <8 x half> @test_vmovq_n_f16(half* %a1) #0 {
+// CHECK:   [[TMP0:%.*]] = load half, half* %a1, align 2
+// CHECK:   [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[TMP0]], i32 0
+// CHECK:   [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP0]], i32 1
+// CHECK:   [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[TMP0]], i32 2
+// CHECK:   [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[TMP0]], i32 3
+// CHECK:   [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[TMP0]], i32 4
+// CHECK:   [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[TMP0]], i32 5
+// CHECK:   [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[TMP0]], i32 6
+// CHECK:   [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[TMP0]], i32 7
+// CHECK:   ret <8 x half> [[VECINIT7]]
+float16x8_t test_vmovq_n_f16(float16_t *a1) {
   return vmovq_n_f16(*a1);
-  // CHECK-IR: insertelement {{.*}} i32 0{{ *$}}
-  // CHECK-IR: insertelement {{.*}} i32 1{{ *$}}
-  // CHECK-IR: insertelement {{.*}} i32 2{{ *$}}
-  // CHECK-IR: insertelement {{.*}} i32 3{{ *$}}
-  // CHECK-IR: insertelement {{.*}} i32 4{{ *$}}
-  // CHECK-IR: insertelement {{.*}} i32 5{{ *$}}
-  // CHECK-IR: insertelement {{.*}} i32 6{{ *$}}
-  // CHECK-IR: insertelement {{.*}} i32 7{{ *$}}
 }
 
diff --git a/test/CodeGen/arm64_vecCmpBr.c b/test/CodeGen/arm64_vecCmpBr.c
deleted file mode 100644
index 3ae7433baed19..0000000000000
--- a/test/CodeGen/arm64_vecCmpBr.c
+++ /dev/null
@@ -1,111 +0,0 @@
-// RUN: %clang_cc1 -O3 -triple arm64-apple-ios7 -target-feature +neon -S -ffreestanding %s -o - -target-cpu cyclone | FileCheck %s
-// REQUIRES: aarch64-registered-target
-// test code generation for <rdar://problem/11487757>
-#include <arm_neon.h>
-
-unsigned bar();
-
-// Branch if any lane of V0 is zero; 64 bit => !min
-unsigned anyZero64(uint16x4_t a) {
-// CHECK: anyZero64:
-// CHECK: uminv.8b b[[REGNO1:[0-9]+]], v0
-// CHECK-NEXT: fmov w[[REGNO2:[0-9]+]], s[[REGNO1]]
-// CHECK-NEXT: cbz w[[REGNO2]], [[LABEL:[.A-Z_0-9]+]]
-// CHECK: [[LABEL]]:
-// CHECK-NEXT: b {{_bar|bar}}
-  if (!vminv_u8(a))
-    return bar();
-  return 0;
-}
-
-// Branch if any lane of V0 is zero; 128 bit => !min
-unsigned anyZero128(uint16x8_t a) {
-// CHECK: anyZero128:
-// CHECK: uminv.16b b[[REGNO1:[0-9]+]], v0
-// CHECK-NEXT: fmov w[[REGNO2:[0-9]+]], s[[REGNO1]]
-// CHECK-NEXT: cbz w[[REGNO2]], [[LABEL:[.A-Z_0-9]+]]
-// CHECK: [[LABEL]]:
-// CHECK-NEXT: b {{_bar|bar}}
-  if (!vminvq_u8(a))
-    return bar();
-  return 0;
-}
-
-// Branch if any lane of V0 is non-zero; 64 bit => max
-unsigned anyNonZero64(uint16x4_t a) {
-// CHECK: anyNonZero64:
-// CHECK: umaxv.8b b[[REGNO1:[0-9]+]], v0
-// CHECK-NEXT: fmov w[[REGNO2:[0-9]+]], s[[REGNO1]]
-// CHECK-NEXT: cbz w[[REGNO2]], [[LABEL:[.A-Z_0-9]+]]
-// CHECK: [[LABEL]]:
-// CHECK-NEXT: movz w0, #0
-  if (vmaxv_u8(a))
-    return bar();
-  return 0;
-}
-
-// Branch if any lane of V0 is non-zero; 128 bit => max
-unsigned anyNonZero128(uint16x8_t a) {
-// CHECK: anyNonZero128:
-// CHECK: umaxv.16b b[[REGNO1:[0-9]+]], v0
-// CHECK-NEXT: fmov w[[REGNO2:[0-9]+]], s[[REGNO1]]
-// CHECK-NEXT: cbz w[[REGNO2]], [[LABEL:[.A-Z_0-9]+]]
-// CHECK: [[LABEL]]:
-// CHECK-NEXT: movz w0, #0
-  if (vmaxvq_u8(a))
-    return bar();
-  return 0;
-}
-
-// Branch if all lanes of V0 are zero; 64 bit => !max
-unsigned allZero64(uint16x4_t a) {
-// CHECK: allZero64:
-// CHECK: umaxv.8b b[[REGNO1:[0-9]+]], v0
-// CHECK-NEXT: fmov w[[REGNO2:[0-9]+]], s[[REGNO1]]
-// CHECK-NEXT: cbz w[[REGNO2]], [[LABEL:[.A-Z_0-9]+]]
-// CHECK: [[LABEL]]:
-// CHECK-NEXT: b {{_bar|bar}}
-  if (!vmaxv_u8(a))
-    return bar();
-  return 0;
-}
-
-// Branch if all lanes of V0 are zero; 128 bit => !max
-unsigned allZero128(uint16x8_t a) {
-// CHECK: allZero128:
-// CHECK: umaxv.16b b[[REGNO1:[0-9]+]], v0
-// CHECK-NEXT: fmov w[[REGNO2:[0-9]+]], s[[REGNO1]]
-// CHECK-NEXT: cbz w[[REGNO2]], [[LABEL:[.A-Z_0-9]+]]
-// CHECK: [[LABEL]]:
-// CHECK-NEXT: b {{_bar|bar}}
-  if (!vmaxvq_u8(a))
-    return bar();
-  return 0;
-}
-
-// Branch if all lanes of V0 are non-zero; 64 bit => min
-unsigned allNonZero64(uint16x4_t a) {
-// CHECK: allNonZero64:
-// CHECK: uminv.8b b[[REGNO1:[0-9]+]], v0
-// CHECK-NEXT: fmov w[[REGNO2:[0-9]+]], s[[REGNO1]]
-// CHECK-NEXT: cbz w[[REGNO2]], [[LABEL:[.A-Z_0-9]+]]
-// CHECK: [[LABEL]]:
-// CHECK-NEXT: movz w0, #0
-  if (vminv_u8(a))
-    return bar();
-  return 0;
-}
-
-// Branch if all lanes of V0 are non-zero; 128 bit => min
-unsigned allNonZero128(uint16x8_t a) {
-// CHECK: allNonZero128:
-// CHECK: uminv.16b b[[REGNO1:[0-9]+]], v0
-// CHECK-NEXT: fmov w[[REGNO2:[0-9]+]], s[[REGNO1]]
-// CHECK-NEXT: cbz w[[REGNO2]], [[LABEL:[.A-Z_0-9]+]]
-// CHECK: [[LABEL]]:
-// CHECK-NEXT: movz w0, #0
-  if (vminvq_u8(a))
-    return bar();
-  return 0;
-}
-
diff --git a/test/CodeGen/arm64_vext.c b/test/CodeGen/arm64_vext.c
deleted file mode 100644
index 6c3fe73399a1a..0000000000000
--- a/test/CodeGen/arm64_vext.c
+++ /dev/null
@@ -1,239 +0,0 @@
-// RUN: %clang_cc1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - -emit-llvm %s | FileCheck %s
-
-// Test ARM64 extract intrinsics
-// can use as back end test by adding a run line with
-// -check-prefix=CHECK-CODEGEN on the FileCheck
-
-#include <arm_neon.h>
-
-void test_vext_s8()
-{
-  // CHECK: test_vext_s8
-  int8x8_t xS8x8;
-  xS8x8 = vext_s8(xS8x8, xS8x8, 1);
-  // CHECK: shufflevector
-  // CHECK-CODEGEN: test_vext_s8:
-  // CHECK-CODEGEN: {{ext.8.*#1}}
-}
-
-void test_vext_u8()
-{
-  // CHECK: test_vext_u8
-  uint8x8_t xU8x8;
-  xU8x8 = vext_u8(xU8x8, xU8x8, 2);
-  // CHECK: shufflevector
-  // CHECK-CODEGEN: test_vext_u8:
-  // CHECK-CODEGEN: {{ext.8.*#2}}
-}
-
-void test_vext_p8()
-{
-  // CHECK: test_vext_p8
-  poly8x8_t xP8x8;
-  xP8x8 = vext_p8(xP8x8, xP8x8, 3);
-  // CHECK: shufflevector
-  // CHECK-CODEGEN: test_vext_p8:
-  // CHECK-CODEGEN: {{ext.8.*#3}}
-}
-
-void test_vext_s16()
-{
-  // CHECK: test_vext_s16
-  int16x4_t xS16x4;
-  xS16x4 = vext_s16(xS16x4, xS16x4, 1);
-  // CHECK: shufflevector
-  // CHECK-CODEGEN: test_vext_s16:
-  // CHECK-CODEGEN: {{ext.8.*#2}}
-}
-
-void test_vext_u16()
-{
-  // CHECK: test_vext_u16
-  uint16x4_t xU16x4;
-  xU16x4 = vext_u16(xU16x4, xU16x4, 2);
-  // CHECK: shufflevector
-  // CHECK-CODEGEN: test_vext_u16:
-  // CHECK-CODEGEN: {{ext.8.*#4}}
-}
-
-void test_vext_p16()
-{
-  // CHECK: test_vext_p16
-  poly16x4_t xP16x4;
-  xP16x4 = vext_p16(xP16x4, xP16x4, 3);
-  // CHECK: shufflevector
-  // CHECK-CODEGEN: test_vext_p16:
-  // CHECK-CODEGEN: {{ext.8.*#6}}
-}
-
-void test_vext_s32()
-{
-  // CHECK: test_vext_s32
-  int32x2_t xS32x2;
-  xS32x2 = vext_s32(xS32x2, xS32x2, 1);
-  // CHECK: shufflevector
-  // CHECK-CODEGEN: test_vext_s32:
-  // CHECK-CODEGEN: {{ext.8.*#4}}
-}
-
-void test_vext_u32()
-{
-  // CHECK: test_vext_u32
-  uint32x2_t xU32x2;
-  xU32x2 = vext_u32(xU32x2, xU32x2, 1);
-  // CHECK: shufflevector
-  // CHECK-CODEGEN: test_vext_u32:
-  // CHECK-CODEGEN: {{ext.8.*#4}}
-}
-
-void test_vext_f32()
-{
-  // CHECK: test_vext_f32
-  float32x2_t xF32x2;
-  xF32x2 = vext_f32(xF32x2, xF32x2, 1);
-  // CHECK: shufflevector
-  // CHECK-CODEGEN: test_vext_f32:
-  // CHECK-CODEGEN: {{ext.8.*#4}}
-}
-
-void test_vext_s64()
-{
-  // CHECK: test_vext_s64
-  int64x1_t xS64x1;
-  // FIXME don't use 1 as index or check for now, clang has a bug?
-  xS64x1 = vext_s64(xS64x1, xS64x1, /*1*/0);
-  // CHECK: shufflevector
-  // CHECK-CODEGEN: test_vext_s64:
-  // CHECK_FIXME: {{ext.8.*#0}}
-}
-
-void test_vext_u64()
-{
-  // CHECK: test_vext_u64
-  uint64x1_t xU64x1;
-  // FIXME don't use 1 as index or check for now, clang has a bug?
-  xU64x1 = vext_u64(xU64x1, xU64x1, /*1*/0);
-  // CHECK: shufflevector
-  // CHECK-CODEGEN: test_vext_u64:
-  // CHECK_FIXME: {{ext.8.*#0}}
-}
-
-void test_vextq_s8()
-{
-  // CHECK: test_vextq_s8
-  int8x16_t xS8x16;
-  xS8x16 = vextq_s8(xS8x16, xS8x16, 4);
-  // CHECK: shufflevector
-  // CHECK-CODEGEN: test_vextq_s8:
-  // CHECK-CODEGEN: {{ext.16.*#4}}
-}
-
-void test_vextq_u8()
-{
-  // CHECK: test_vextq_u8
-  uint8x16_t xU8x16;
-  xU8x16 = vextq_u8(xU8x16, xU8x16, 5);
-  // CHECK: shufflevector
-  // CHECK-CODEGEN: test_vextq_u8:
-  // CHECK-CODEGEN: {{ext.16.*#5}}
-}
-
-void test_vextq_p8()
-{
-  // CHECK: test_vextq_p8
-  poly8x16_t xP8x16;
-  xP8x16 = vextq_p8(xP8x16, xP8x16, 6);
-  // CHECK: shufflevector
-  // CHECK-CODEGEN: test_vextq_p8:
-  // CHECK-CODEGEN: {{ext.16.*#6}}
-}
-
-void test_vextq_s16()
-{
-  // CHECK: test_vextq_s16
-  int16x8_t xS16x8;
-  xS16x8 = vextq_s16(xS16x8, xS16x8, 7);
-  // CHECK: shufflevector
-  // CHECK-CODEGEN: test_vextq_s16:
-  // CHECK-CODEGEN: {{ext.16.*#14}}
-}
-
-void test_vextq_u16()
-{
-  // CHECK: test_vextq_u16
-  uint16x8_t xU16x8;
-  xU16x8 = vextq_u16(xU16x8, xU16x8, 4);
-  // CHECK: shufflevector
-  // CHECK-CODEGEN: test_vextq_u16:
-  // CHECK-CODEGEN: {{ext.16.*#8}}
-}
-
-void test_vextq_p16()
-{
-  // CHECK: test_vextq_p16
-  poly16x8_t xP16x8;
-  xP16x8 = vextq_p16(xP16x8, xP16x8, 5);
-  // CHECK: shufflevector
-  // CHECK-CODEGEN: test_vextq_p16:
-  // CHECK-CODEGEN: {{ext.16.*#10}}
-}
-
-void test_vextq_s32()
-{
-  // CHECK: test_vextq_s32
-  int32x4_t xS32x4;
-  xS32x4 = vextq_s32(xS32x4, xS32x4, 1);
-  // CHECK: shufflevector
-  // CHECK-CODEGEN: test_vextq_s32:
-  // CHECK-CODEGEN: {{ext.16.*#4}}
-}
-
-void test_vextq_u32()
-{
-  // CHECK: test_vextq_u32
-  uint32x4_t xU32x4;
-  xU32x4 = vextq_u32(xU32x4, xU32x4, 2);
-  // CHECK: shufflevector
-  // CHECK-CODEGEN: test_vextq_u32:
-  // CHECK-CODEGEN: {{ext.16.*#8}}
-}
-
-void test_vextq_f32()
-{
-  // CHECK: test_vextq_f32
-  float32x4_t xF32x4;
-  xF32x4 = vextq_f32(xF32x4, xF32x4, 3);
-  // CHECK: shufflevector
-  // CHECK-CODEGEN: test_vextq_f32:
-  // CHECK-CODEGEN: {{ext.16.*#12}}
-}
-
-void test_vextq_s64()
-{
-  // CHECK: test_vextq_s64
-  int64x2_t xS64x2;
-  xS64x2 = vextq_s64(xS64x2, xS64x2, 1);
-  // CHECK: shufflevector
-  // CHECK-CODEGEN: test_vextq_s64:
-  // CHECK-CODEGEN: {{ext.16.*#8}}
-}
-
-void test_vextq_u64()
-{
-  // CHECK: test_vextq_u64
-  uint64x2_t xU64x2;
-  xU64x2 = vextq_u64(xU64x2, xU64x2, 1);
-  // CHECK: shufflevector
-  // CHECK-CODEGEN: test_vextq_u64:
-  // CHECK-CODEGEN: {{ext.16.*#8}}
-}
-
-void test_vextq_f64()
-{
-  // CHECK: test_vextq_f64
-  float64x2_t xF64x2;
-  xF64x2 = vextq_f64(xF64x2, xF64x2, 1);
-  // CHECK: shufflevector
-  // CHECK-CODEGEN: test_vextq_u64:
-  // CHECK-CODEGEN: {{ext.16.*#8}}
-}
diff --git a/test/CodeGen/arm64_vfma.c b/test/CodeGen/arm64_vfma.c
deleted file mode 100644
index bfa568779638e..0000000000000
--- a/test/CodeGen/arm64_vfma.c
+++ /dev/null
@@ -1,136 +0,0 @@
-// RUN: %clang_cc1 -O1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - -emit-llvm %s | FileCheck %s
-// Test ARM64 SIMD fused multiply add intrinsics
-
-#include <arm_neon.h>
-
-float32x2_t test_vfma_f32(float32x2_t a1, float32x2_t a2, float32x2_t a3) {
-  // CHECK: test_vfma_f32
-  return vfma_f32(a1, a2, a3);
-  // CHECK: llvm.fma.v2f32({{.*a2, .*a3, .*a1}})
-  // CHECK-NEXT: ret
-}
-
-float32x4_t test_vfmaq_f32(float32x4_t a1, float32x4_t a2, float32x4_t a3) {
-  // CHECK: test_vfmaq_f32
-  return vfmaq_f32(a1, a2, a3);
-  // CHECK: llvm.fma.v4f32({{.*a2, .*a3, .*a1}})
-  // CHECK-NEXT: ret
-}
-
-float64x2_t test_vfmaq_f64(float64x2_t a1, float64x2_t a2, float64x2_t a3) {
-  // CHECK: test_vfmaq_f64
-  return vfmaq_f64(a1, a2, a3);
-  // CHECK: llvm.fma.v2f64({{.*a2, .*a3, .*a1}})
-  // CHECK-NEXT: ret
-}
-
-float32x2_t test_vfma_lane_f32(float32x2_t a1, float32x2_t a2, float32x2_t a3) {
-  // CHECK: test_vfma_lane_f32
-  return vfma_lane_f32(a1, a2, a3, 1);
-  // NB: the test below is deliberately lose, so that we don't depend too much
-  // upon the exact IR used to select lane 1 (usually a shufflevector)
-  // CHECK: llvm.fma.v2f32(<2 x float> %a2, <2 x float> {{.*}}, <2 x float> %a1)
-  // CHECK-NEXT: ret
-}
-
-float32x4_t test_vfmaq_lane_f32(float32x4_t a1, float32x4_t a2, float32x2_t a3) {
-  // CHECK: test_vfmaq_lane_f32
-  return vfmaq_lane_f32(a1, a2, a3, 1);
-  // NB: the test below is deliberately lose, so that we don't depend too much
-  // upon the exact IR used to select lane 1 (usually a shufflevector)
-  // CHECK: llvm.fma.v4f32(<4 x float> %a2, <4 x float> {{.*}}, <4 x float> %a1)
-  // CHECK-NEXT: ret
-}
-
-float64x2_t test_vfmaq_lane_f64(float64x2_t a1, float64x2_t a2, float64x1_t a3) {
-  // CHECK: test_vfmaq_lane_f64
-  return vfmaq_lane_f64(a1, a2, a3, 0);
-  // NB: the test below is deliberately lose, so that we don't depend too much
-  // upon the exact IR used to select lane 1 (usually a shufflevector)
-  // CHECK: llvm.fma.v2f64(<2 x double> %a2, <2 x double> {{.*}}, <2 x double> %a1)
-  // CHECK-NEXT: ret
-}
-
-float32x2_t test_vfma_n_f32(float32x2_t a1, float32x2_t a2, float32_t a3) {
-  // CHECK: test_vfma_n_f32
-  return vfma_n_f32(a1, a2, a3);
-  // NB: the test below is deliberately lose, so that we don't depend too much
-  // upon the exact IR used to select lane 0 (usually two insertelements)
-  // CHECK: llvm.fma.v2f32
-  // CHECK-NEXT: ret
-}
-
-float32x4_t test_vfmaq_n_f32(float32x4_t a1, float32x4_t a2, float32_t a3) {
-  // CHECK: test_vfmaq_n_f32
-  return vfmaq_n_f32(a1, a2, a3);
-  // NB: the test below is deliberately lose, so that we don't depend too much
-  // upon the exact IR used to select lane 0 (usually four insertelements)
-  // CHECK: llvm.fma.v4f32
-  // CHECK-NEXT: ret
-}
-
-float64x2_t test_vfmaq_n_f64(float64x2_t a1, float64x2_t a2, float64_t a3) {
-  // CHECK: test_vfmaq_n_f64
-  return vfmaq_n_f64(a1, a2, a3);
-  // NB: the test below is deliberately lose, so that we don't depend too much
-  // upon the exact IR used to select lane 0 (usually two insertelements)
-  // CHECK: llvm.fma.v2f64
-  // CHECK-NEXT: ret
-}
-
-float32x2_t test_vfms_f32(float32x2_t a1, float32x2_t a2, float32x2_t a3) {
-  // CHECK: test_vfms_f32
-  return vfms_f32(a1, a2, a3);
-  // CHECK: [[NEG:%.*]] = fsub <2 x float> {{.*}}, %a2
-  // CHECK: llvm.fma.v2f32(<2 x float> %a3, <2 x float> [[NEG]], <2 x float> %a1)
-  // CHECK-NEXT: ret
-}
-
-float32x4_t test_vfmsq_f32(float32x4_t a1, float32x4_t a2, float32x4_t a3) {
-  // CHECK: test_vfmsq_f32
-  return vfmsq_f32(a1, a2, a3);
-  // CHECK: [[NEG:%.*]] = fsub <4 x float> {{.*}}, %a2
-  // CHECK: llvm.fma.v4f32(<4 x float> %a3, <4 x float> [[NEG]], <4 x float> %a1)
-  // CHECK-NEXT: ret
-}
-
-float64x2_t test_vfmsq_f64(float64x2_t a1, float64x2_t a2, float64x2_t a3) {
-  // CHECK: test_vfmsq_f64
-  return vfmsq_f64(a1, a2, a3);
-  // CHECK: [[NEG:%.*]] = fsub <2 x double> {{.*}}, %a2
-  // CHECK: llvm.fma.v2f64(<2 x double> %a3, <2 x double> [[NEG]], <2 x double> %a1)
-  // CHECK-NEXT: ret
-}
-
-float32x2_t test_vfms_lane_f32(float32x2_t a1, float32x2_t a2, float32x2_t a3) {
-  // CHECK: test_vfms_lane_f32
-  return vfms_lane_f32(a1, a2, a3, 1);
-  // NB: the test below is deliberately lose, so that we don't depend too much
-  // upon the exact IR used to select lane 1 (usually a shufflevector)
-  // CHECK: [[NEG:%.*]] = fsub <2 x float> {{.*}}, %a3
-  // CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[NEG]]
-  // CHECK: llvm.fma.v2f32(<2 x float> {{.*}}, <2 x float> [[LANE]], <2 x float> %a1)
-  // CHECK-NEXT: ret
-}
-
-float32x4_t test_vfmsq_lane_f32(float32x4_t a1, float32x4_t a2, float32x2_t a3) {
-  // CHECK: test_vfmsq_lane_f32
-  return vfmsq_lane_f32(a1, a2, a3, 1);
-  // NB: the test below is deliberately lose, so that we don't depend too much
-  // upon the exact IR used to select lane 1 (usually a shufflevector)
-  // CHECK: [[NEG:%.*]] = fsub <2 x float> {{.*}}, %a3
-  // CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[NEG]]
-  // CHECK: llvm.fma.v4f32(<4 x float> {{.*}}, <4 x float> [[LANE]], <4 x float> %a1)
-  // CHECK-NEXT: ret
-}
-
-float64x2_t test_vfmsq_lane_f64(float64x2_t a1, float64x2_t a2, float64x1_t a3) {
-  // CHECK: test_vfmsq_lane_f64
-  return vfmsq_lane_f64(a1, a2, a3, 0);
-  // NB: the test below is deliberately lose, so that we don't depend too much
-  // upon the exact IR used to select lane 1 (usually a shufflevector)
-  // CHECK: [[NEG:%.*]] = fsub <1 x double> {{.*}}, %a3
-  // CHECK: [[LANE:%.*]] = shufflevector <1 x double> [[NEG]]
-  // CHECK: llvm.fma.v2f64(<2 x double> {{.*}}, <2 x double> [[LANE]], <2 x double> %a1)
-  // CHECK-NEXT: ret
-}
diff --git a/test/CodeGen/arm64_vneg.c b/test/CodeGen/arm64_vneg.c
deleted file mode 100644
index d520ebd83af3e..0000000000000
--- a/test/CodeGen/arm64_vneg.c
+++ /dev/null
@@ -1,18 +0,0 @@
-// RUN: %clang_cc1 -O1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - -emit-llvm %s | FileCheck %s
-// Test ARM64 SIMD negate and saturating negate intrinsics
-
-#include <arm_neon.h>
-
-int64x2_t test_vnegq_s64(int64x2_t a1) {
-  // CHECK: test_vnegq_s64
-  return vnegq_s64(a1);
-  // CHECK: sub <2 x i64> zeroinitializer, %a1
-  // CHECK-NEXT: ret
-}
-
-int64x2_t test_vqnegq_s64(int64x2_t a1) {
-  // CHECK: test_vqnegq_s64
-  return vqnegq_s64(a1);
-  // CHECK: llvm.aarch64.neon.sqneg.v2i64
-  // CHECK-NEXT: ret
-}
diff --git a/test/CodeGen/arm64_vqmov.c b/test/CodeGen/arm64_vqmov.c
deleted file mode 100644
index 6480e669e5c7c..0000000000000
--- a/test/CodeGen/arm64_vqmov.c
+++ /dev/null
@@ -1,77 +0,0 @@
-// RUN: %clang_cc1 -O3 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - %s | FileCheck %s
-// REQUIRES: aarch64-registered-target
-/// Test vqmov[u]n_high_<su>{16,32,64) ARM64 intrinsics
-
-#include <arm_neon.h>
-
-// vqmovn_high_s16 -> UQXTN2 Vd.16b,Vn.8h
-int8x16_t test_vqmovn_high_s16(int8x8_t Vdlow, int16x8_t Vn)
-{
-    return vqmovn_high_s16(Vdlow, Vn);
-  // CHECK: test_vqmovn_high_s16:
-  // CHECK: sqxtn2.16b {{v[0-9][0-9]*}}, {{v[0-9][0-9]*}}
-}
-
-// vqmovun_high_s16 -> UQXTN2 Vd.16b,Vn.8h
-uint8x16_t test_vqmovun_high_s16(uint8x8_t Vdlow, uint16x8_t Vn)
-{
-    return vqmovun_high_s16(Vdlow, Vn);
-  // CHECK: test_vqmovun_high_s16:
-  // CHECK: sqxtun2.16b {{v[0-9][0-9]*}}, {{v[0-9][0-9]*}}
-}
-
-// vqmovn_high_s32 -> SQXTN2 Vd.8h,Vn.4s
-int16x8_t test_vqmovn_high_s32(int16x4_t Vdlow, int32x4_t Vn)
-{
-    return vqmovn_high_s32(Vdlow, Vn);
-  // CHECK: test_vqmovn_high_s32:
-  // CHECK: sqxtn2.8h {{v[0-9][0-9]*}}, {{v[0-9][0-9]*}}
-}
-
-// vqmovn_high_u32 -> UQXTN2 Vd.8h,Vn.4s
-uint16x8_t test_vqmovn_high_u32(uint16x4_t Vdlow, uint32x4_t Vn)
-{
-    return vqmovn_high_u32(Vdlow, Vn);
-  // CHECK: test_vqmovn_high_u32:
-  // CHECK: uqxtn2.8h {{v[0-9][0-9]*}}, {{v[0-9][0-9]*}}
-}
-
-// vqmovn_high_s64 -> SQXTN2 Vd.4s,Vn.2d
-int32x4_t test_vqmovn_high_s64(int32x2_t Vdlow, int64x2_t Vn)
-{
-    return vqmovn_high_s64(Vdlow, Vn);
-  // CHECK: test_vqmovn_high_s64:
-  // CHECK: sqxtn2.4s {{v[0-9][0-9]*}}, {{v[0-9][0-9]*}}
-}
-
-// vqmovn_high_u64 -> UQXTN2 Vd.4s,Vn.2d
-uint32x4_t test_vqmovn_high_u64(uint32x2_t Vdlow, uint64x2_t Vn)
-{
-    return vqmovn_high_u64(Vdlow, Vn);
-  // CHECK: test_vqmovn_high_u64:
-  // CHECK: uqxtn2.4s {{v[0-9][0-9]*}}, {{v[0-9][0-9]*}}
-}
-
-// vqmovn_high_u16 -> UQXTN2 Vd.16b,Vn.8h
-uint8x16_t test_vqmovn_high_u16(uint8x8_t Vdlow, uint16x8_t Vn)
-{
-    return vqmovn_high_u16(Vdlow, Vn);
-  // CHECK: test_vqmovn_high_u16:
-  // CHECK: uqxtn2.16b {{v[0-9][0-9]*}}, {{v[0-9][0-9]*}}
-}
-
-// vqmovun_high_s32 -> SQXTUN2 Vd.8h,Vn.4s
-uint16x8_t test_vqmovun_high_s32(uint16x4_t Vdlow, uint32x4_t Vn)
-{
-    return vqmovun_high_s32(Vdlow, Vn);
-  // CHECK: test_vqmovun_high_s32:
-  // CHECK: sqxtun2.8h {{v[0-9][0-9]*}}, {{v[0-9][0-9]*}}
-}
-
-// vqmovun_high_s64 -> SQXTUN2  Vd.4s,Vn.2d
-uint32x4_t test_vqmovun_high_s64(uint32x2_t Vdlow, uint64x2_t Vn)
-{
-    return vqmovun_high_s64(Vdlow, Vn);
-  // CHECK: test_vqmovun_high_s64:
-  // CHECK: sqxtun2.4s {{v[0-9][0-9]*}}, {{v[0-9][0-9]*}}
-}
diff --git a/test/CodeGen/arm64_vrecps.c b/test/CodeGen/arm64_vrecps.c
deleted file mode 100644
index a3af13c37fca6..0000000000000
--- a/test/CodeGen/arm64_vrecps.c
+++ /dev/null
@@ -1,26 +0,0 @@
-// RUN: %clang_cc1 -O3 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - %s | FileCheck %s
-// REQUIRES: aarch64-registered-target
-/// Test vrecpss_f32, vrecpsd_f64 ARM64 intrinsics
-
-
-#include <arm_neon.h>
-
-// vrecpss_f32 -> FRECPS Sd,Sa,Sb
-//
-float32_t test_vrecpss_f32(float32_t Vdlow, float32_t Vn)
-{
-    return vrecpss_f32(Vdlow, Vn);
-  // CHECK: test_vrecpss_f32:
-  // CHECK: frecps  s0, s0, s1
-  // CHECK-NEXT: ret
-}
-
-// vrecpsd_f64 -> FRECPS Dd,Da,Db
-//
-float64_t test_vrecpsd_f64(float64_t Vdlow, float64_t Vn)
-{
-    return vrecpsd_f64(Vdlow, Vn);
-  // CHECK: test_vrecpsd_f64:
-  // CHECK: frecps d0, d0, d1
-  // CHECK-NEXT: ret
-}
diff --git a/test/CodeGen/arm64_vshift.c b/test/CodeGen/arm64_vshift.c
deleted file mode 100644
index af028994f4a50..0000000000000
--- a/test/CodeGen/arm64_vshift.c
+++ /dev/null
@@ -1,357 +0,0 @@
-// RUN: %clang_cc1 -triple arm64-apple-ios7.0 -target-feature +neon -ffreestanding -emit-llvm -o - -O1 %s | FileCheck %s
-#include <arm_neon.h>
-
-int8x8_t test_vqshl_n_s8(int8x8_t in) {
-  // CHECK-LABEL: @test_vqshl_n_s8
-  // CHECK: call <8 x i8> @llvm.aarch64.neon.sqshl.v8i8(<8 x i8> %in, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
-  return vqshl_n_s8(in, 1);
-}
-
-int16x4_t test_vqshl_n_s16(int16x4_t in) {
-  // CHECK-LABEL: @test_vqshl_n_s16
-  // CHECK: call <4 x i16> @llvm.aarch64.neon.sqshl.v4i16(<4 x i16> %in, <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
-  return vqshl_n_s16(in, 1);
-}
-
-int32x2_t test_vqshl_n_s32(int32x2_t in) {
-  // CHECK-LABEL: @test_vqshl_n_s32
-  // CHECK: call <2 x i32> @llvm.aarch64.neon.sqshl.v2i32(<2 x i32> %in, <2 x i32> <i32 1, i32 1>)
-  return vqshl_n_s32(in, 1);
-}
-
-int64x1_t test_vqshl_n_s64(int64x1_t in) {
-  // CHECK-LABEL: @test_vqshl_n_s64
-  // CHECK: call <1 x i64> @llvm.aarch64.neon.sqshl.v1i64(<1 x i64> %in, <1 x i64> <i64 1>)
-  return vqshl_n_s64(in, 1);
-}
-
-
-int8x16_t test_vqshlq_n_s8(int8x16_t in) {
-  // CHECK-LABEL: @test_vqshlq_n_s8
-  // CHECK: call <16 x i8> @llvm.aarch64.neon.sqshl.v16i8(<16 x i8> %in, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
-  return vqshlq_n_s8(in, 1);
-}
-
-int16x8_t test_vqshlq_n_s16(int16x8_t in) {
-  // CHECK-LABEL: @test_vqshlq_n_s16
-  // CHECK: call <8 x i16> @llvm.aarch64.neon.sqshl.v8i16(<8 x i16> %in, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
-  return vqshlq_n_s16(in, 1);
-}
-
-int32x4_t test_vqshlq_n_s32(int32x4_t in) {
-  // CHECK-LABEL: @test_vqshlq_n_s32
-  // CHECK: call <4 x i32> @llvm.aarch64.neon.sqshl.v4i32(<4 x i32> %in, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
-  return vqshlq_n_s32(in, 1);
-}
-
-int64x2_t test_vqshlq_n_s64(int64x2_t in) {
-  // CHECK-LABEL: @test_vqshlq_n_s64
-  // CHECK: call <2 x i64> @llvm.aarch64.neon.sqshl.v2i64(<2 x i64> %in, <2 x i64> <i64 1, i64 1>
-  return vqshlq_n_s64(in, 1);
-}
-
-uint8x8_t test_vqshl_n_u8(uint8x8_t in) {
-  // CHECK-LABEL: @test_vqshl_n_u8
-  // CHECK: call <8 x i8> @llvm.aarch64.neon.uqshl.v8i8(<8 x i8> %in, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
-  return vqshl_n_u8(in, 1);
-}
-
-uint16x4_t test_vqshl_n_u16(uint16x4_t in) {
-  // CHECK-LABEL: @test_vqshl_n_u16
-  // CHECK: call <4 x i16> @llvm.aarch64.neon.uqshl.v4i16(<4 x i16> %in, <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
-  return vqshl_n_u16(in, 1);
-}
-
-uint32x2_t test_vqshl_n_u32(uint32x2_t in) {
-  // CHECK-LABEL: @test_vqshl_n_u32
-  // CHECK: call <2 x i32> @llvm.aarch64.neon.uqshl.v2i32(<2 x i32> %in, <2 x i32> <i32 1, i32 1>)
-  return vqshl_n_u32(in, 1);
-}
-
-uint64x1_t test_vqshl_n_u64(uint64x1_t in) {
-  // CHECK-LABEL: @test_vqshl_n_u64
-  // CHECK: call <1 x i64> @llvm.aarch64.neon.uqshl.v1i64(<1 x i64> %in, <1 x i64> <i64 1>)
-  return vqshl_n_u64(in, 1);
-}
-
-uint8x16_t test_vqshlq_n_u8(uint8x16_t in) {
-  // CHECK-LABEL: @test_vqshlq_n_u8
-  // CHECK: call <16 x i8> @llvm.aarch64.neon.uqshl.v16i8(<16 x i8> %in, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
-  return vqshlq_n_u8(in, 1);
-}
-
-uint16x8_t test_vqshlq_n_u16(uint16x8_t in) {
-  // CHECK-LABEL: @test_vqshlq_n_u16
-  // CHECK: call <8 x i16> @llvm.aarch64.neon.uqshl.v8i16(<8 x i16> %in, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
-  return vqshlq_n_u16(in, 1);
-}
-
-uint32x4_t test_vqshlq_n_u32(uint32x4_t in) {
-  // CHECK-LABEL: @test_vqshlq_n_u32
-  // CHECK: call <4 x i32> @llvm.aarch64.neon.uqshl.v4i32(<4 x i32> %in, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
-  return vqshlq_n_u32(in, 1);
-}
-
-uint64x2_t test_vqshlq_n_u64(uint64x2_t in) {
-  // CHECK-LABEL: @test_vqshlq_n_u64
-  // CHECK: call <2 x i64> @llvm.aarch64.neon.uqshl.v2i64(<2 x i64> %in, <2 x i64> <i64 1, i64 1>
-  return vqshlq_n_u64(in, 1);
-}
-
-int8x8_t test_vrshr_n_s8(int8x8_t in) {
-  // CHECK-LABEL: @test_vrshr_n_s8
-  // CHECK: call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> %in, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
-  return vrshr_n_s8(in, 1);
-}
-
-int16x4_t test_vrshr_n_s16(int16x4_t in) {
-  // CHECK-LABEL: @test_vrshr_n_s16
-  // CHECK: call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> %in, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
-  return vrshr_n_s16(in, 1);
-}
-
-int32x2_t test_vrshr_n_s32(int32x2_t in) {
-  // CHECK-LABEL: @test_vrshr_n_s32
-  // CHECK: call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> %in, <2 x i32> <i32 -1, i32 -1>)
-  return vrshr_n_s32(in, 1);
-}
-
-int64x1_t test_vrshr_n_s64(int64x1_t in) {
-  // CHECK-LABEL: @test_vrshr_n_s64
-  // CHECK: call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> %in, <1 x i64> <i64 -1>)
-  return vrshr_n_s64(in, 1);
-}
-
-
-int8x16_t test_vrshrq_n_s8(int8x16_t in) {
-  // CHECK-LABEL: @test_vrshrq_n_s8
-  // CHECK: call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> %in, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
-  return vrshrq_n_s8(in, 1);
-}
-
-int16x8_t test_vrshrq_n_s16(int16x8_t in) {
-  // CHECK-LABEL: @test_vrshrq_n_s16
-  // CHECK: call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> %in, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
-  return vrshrq_n_s16(in, 1);
-}
-
-int32x4_t test_vrshrq_n_s32(int32x4_t in) {
-  // CHECK-LABEL: @test_vrshrq_n_s32
-  // CHECK: call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> %in, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
-  return vrshrq_n_s32(in, 1);
-}
-
-int64x2_t test_vrshrq_n_s64(int64x2_t in) {
-  // CHECK-LABEL: @test_vrshrq_n_s64
-  // CHECK: call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> %in, <2 x i64> <i64 -1, i64 -1>
-  return vrshrq_n_s64(in, 1);
-}
-
-uint8x8_t test_vrshr_n_u8(uint8x8_t in) {
-  // CHECK-LABEL: @test_vrshr_n_u8
-  // CHECK: call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> %in, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
-  return vrshr_n_u8(in, 1);
-}
-
-uint16x4_t test_vrshr_n_u16(uint16x4_t in) {
-  // CHECK-LABEL: @test_vrshr_n_u16
-  // CHECK: call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> %in, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
-  return vrshr_n_u16(in, 1);
-}
-
-uint32x2_t test_vrshr_n_u32(uint32x2_t in) {
-  // CHECK-LABEL: @test_vrshr_n_u32
-  // CHECK: call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> %in, <2 x i32> <i32 -1, i32 -1>)
-  return vrshr_n_u32(in, 1);
-}
-
-uint64x1_t test_vrshr_n_u64(uint64x1_t in) {
-  // CHECK-LABEL: @test_vrshr_n_u64
-  // CHECK: call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> %in, <1 x i64> <i64 -1>)
-  return vrshr_n_u64(in, 1);
-}
-
-uint8x16_t test_vrshrq_n_u8(uint8x16_t in) {
-  // CHECK-LABEL: @test_vrshrq_n_u8
-  // CHECK: call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> %in, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
-  return vrshrq_n_u8(in, 1);
-}
-
-uint16x8_t test_vrshrq_n_u16(uint16x8_t in) {
-  // CHECK-LABEL: @test_vrshrq_n_u16
-  // CHECK: call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> %in, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
-  return vrshrq_n_u16(in, 1);
-}
-
-uint32x4_t test_vrshrq_n_u32(uint32x4_t in) {
-  // CHECK-LABEL: @test_vrshrq_n_u32
-  // CHECK: call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> %in, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
-  return vrshrq_n_u32(in, 1);
-}
-
-uint64x2_t test_vrshrq_n_u64(uint64x2_t in) {
-  // CHECK-LABEL: @test_vrshrq_n_u64
-  // CHECK: call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> %in, <2 x i64> <i64 -1, i64 -1>
-  return vrshrq_n_u64(in, 1);
-}
-
-int8x8_t test_vqshlu_n_s8(int8x8_t in) {
-  // CHECK-LABEL: @test_vqshlu_n_s8
-  // CHECK: call <8 x i8> @llvm.aarch64.neon.sqshlu.v8i8(<8 x i8> %in, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
-  return vqshlu_n_s8(in, 1);
-}
-
-int16x4_t test_vqshlu_n_s16(int16x4_t in) {
-  // CHECK-LABEL: @test_vqshlu_n_s16
-  // CHECK: call <4 x i16> @llvm.aarch64.neon.sqshlu.v4i16(<4 x i16> %in, <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
-  return vqshlu_n_s16(in, 1);
-}
-
-int32x2_t test_vqshlu_n_s32(int32x2_t in) {
-  // CHECK-LABEL: @test_vqshlu_n_s32
-  // CHECK: call <2 x i32> @llvm.aarch64.neon.sqshlu.v2i32(<2 x i32> %in, <2 x i32> <i32 1, i32 1>)
-  return vqshlu_n_s32(in, 1);
-}
-
-int64x1_t test_vqshlu_n_s64(int64x1_t in) {
-  // CHECK-LABEL: @test_vqshlu_n_s64
-  // CHECK: call <1 x i64> @llvm.aarch64.neon.sqshlu.v1i64(<1 x i64> %in, <1 x i64> <i64 1>)
-  return vqshlu_n_s64(in, 1);
-}
-
-
-int8x16_t test_vqshluq_n_s8(int8x16_t in) {
-  // CHECK-LABEL: @test_vqshluq_n_s8
-  // CHECK: call <16 x i8> @llvm.aarch64.neon.sqshlu.v16i8(<16 x i8> %in, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
-  return vqshluq_n_s8(in, 1);
-}
-
-int16x8_t test_vqshluq_n_s16(int16x8_t in) {
-  // CHECK-LABEL: @test_vqshluq_n_s16
-  // CHECK: call <8 x i16> @llvm.aarch64.neon.sqshlu.v8i16(<8 x i16> %in, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
-  return vqshluq_n_s16(in, 1);
-}
-
-int32x4_t test_vqshluq_n_s32(int32x4_t in) {
-  // CHECK-LABEL: @test_vqshluq_n_s32
-  // CHECK: call <4 x i32> @llvm.aarch64.neon.sqshlu.v4i32(<4 x i32> %in, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
-  return vqshluq_n_s32(in, 1);
-}
-
-int64x2_t test_vqshluq_n_s64(int64x2_t in) {
-  // CHECK-LABEL: @test_vqshluq_n_s64
-  // CHECK: call <2 x i64> @llvm.aarch64.neon.sqshlu.v2i64(<2 x i64> %in, <2 x i64> <i64 1, i64 1>
-  return vqshluq_n_s64(in, 1);
-}
-
-int8x8_t test_vrsra_n_s8(int8x8_t acc, int8x8_t in) {
-  // CHECK-LABEL: @test_vrsra_n_s8
-  // CHECK: [[TMP:%[0-9a-zA-Z._]+]] = tail call <8 x i8> @llvm.aarch64.neon.srshl.v8i8(<8 x i8> %in, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
-  // CHECK: add <8 x i8> [[TMP]], %acc
-  return vrsra_n_s8(acc, in, 1);
-}
-
-int16x4_t test_vrsra_n_s16(int16x4_t acc, int16x4_t in) {
-  // CHECK-LABEL: @test_vrsra_n_s16
-  // CHECK: [[TMP:%[0-9a-zA-Z._]+]] = tail call <4 x i16> @llvm.aarch64.neon.srshl.v4i16(<4 x i16> %in, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
-  // CHECK: add <4 x i16> [[TMP]], %acc
-  return vrsra_n_s16(acc, in, 1);
-}
-
-int32x2_t test_vrsra_n_s32(int32x2_t acc, int32x2_t in) {
-  // CHECK-LABEL: @test_vrsra_n_s32
-  // CHECK: [[TMP:%[0-9a-zA-Z._]+]] = tail call <2 x i32> @llvm.aarch64.neon.srshl.v2i32(<2 x i32> %in, <2 x i32> <i32 -1, i32 -1>)
-  // CHECK: add <2 x i32> [[TMP]], %acc
-  return vrsra_n_s32(acc, in, 1);
-}
-
-int64x1_t test_vrsra_n_s64(int64x1_t acc, int64x1_t in) {
-  // CHECK-LABEL: @test_vrsra_n_s64
-  // CHECK: [[TMP:%[0-9a-zA-Z._]+]] = tail call <1 x i64> @llvm.aarch64.neon.srshl.v1i64(<1 x i64> %in, <1 x i64> <i64 -1>)
-  // CHECK: add <1 x i64> [[TMP]], %acc
-  return vrsra_n_s64(acc, in, 1);
-}
-
-int8x16_t test_vrsraq_n_s8(int8x16_t acc, int8x16_t in) {
-  // CHECK-LABEL: @test_vrsraq_n_s8
-  // CHECK: [[TMP:%[0-9a-zA-Z._]+]] = tail call <16 x i8> @llvm.aarch64.neon.srshl.v16i8(<16 x i8> %in, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
-  // CHECK: add <16 x i8> [[TMP]], %acc
-  return vrsraq_n_s8(acc, in, 1);
-}
-
-int16x8_t test_vrsraq_n_s16(int16x8_t acc, int16x8_t in) {
-  // CHECK-LABEL: @test_vrsraq_n_s16
-  // CHECK: [[TMP:%[0-9a-zA-Z._]+]] = tail call <8 x i16> @llvm.aarch64.neon.srshl.v8i16(<8 x i16> %in, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
-  // CHECK: add <8 x i16> [[TMP]], %acc
-  return vrsraq_n_s16(acc, in, 1);
-}
-
-int32x4_t test_vrsraq_n_s32(int32x4_t acc, int32x4_t in) {
-  // CHECK-LABEL: @test_vrsraq_n_s32
-  // CHECK: [[TMP:%[0-9a-zA-Z._]+]] = tail call <4 x i32> @llvm.aarch64.neon.srshl.v4i32(<4 x i32> %in, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
-  // CHECK: add <4 x i32> [[TMP]], %acc
-  return vrsraq_n_s32(acc, in, 1);
-}
-
-int64x2_t test_vrsraq_n_s64(int64x2_t acc, int64x2_t in) {
-  // CHECK-LABEL: @test_vrsraq_n_s64
-  // CHECK: [[TMP:%[0-9a-zA-Z._]+]] = tail call <2 x i64> @llvm.aarch64.neon.srshl.v2i64(<2 x i64> %in, <2 x i64> <i64 -1, i64 -1>)
-  // CHECK: add <2 x i64> [[TMP]], %acc
-  return vrsraq_n_s64(acc, in, 1);
-}
-
-uint8x8_t test_vrsra_n_u8(uint8x8_t acc, uint8x8_t in) {
-  // CHECK-LABEL: @test_vrsra_n_u8
-  // CHECK: [[TMP:%[0-9a-zA-Z._]+]] = tail call <8 x i8> @llvm.aarch64.neon.urshl.v8i8(<8 x i8> %in, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
-  // CHECK: add <8 x i8> [[TMP]], %acc
-  return vrsra_n_u8(acc, in, 1);
-}
-
-uint16x4_t test_vrsra_n_u16(uint16x4_t acc, uint16x4_t in) {
-  // CHECK-LABEL: @test_vrsra_n_u16
-  // CHECK: [[TMP:%[0-9a-zA-Z._]+]] = tail call <4 x i16> @llvm.aarch64.neon.urshl.v4i16(<4 x i16> %in, <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
-  // CHECK: add <4 x i16> [[TMP]], %acc
-  return vrsra_n_u16(acc, in, 1);
-}
-
-uint32x2_t test_vrsra_n_u32(uint32x2_t acc, uint32x2_t in) {
-  // CHECK-LABEL: @test_vrsra_n_u32
-  // CHECK: [[TMP:%[0-9a-zA-Z._]+]] = tail call <2 x i32> @llvm.aarch64.neon.urshl.v2i32(<2 x i32> %in, <2 x i32> <i32 -1, i32 -1>)
-  // CHECK: add <2 x i32> [[TMP]], %acc
-  return vrsra_n_u32(acc, in, 1);
-}
-
-uint64x1_t test_vrsra_n_u64(uint64x1_t acc, uint64x1_t in) {
-  // CHECK-LABEL: @test_vrsra_n_u64
-  // CHECK: [[TMP:%[0-9a-zA-Z._]+]] = tail call <1 x i64> @llvm.aarch64.neon.urshl.v1i64(<1 x i64> %in, <1 x i64> <i64 -1>)
-  // CHECK: add <1 x i64> [[TMP]], %acc
-  return vrsra_n_u64(acc, in, 1);
-}
-
-uint8x16_t test_vrsraq_n_u8(uint8x16_t acc, uint8x16_t in) {
-  // CHECK-LABEL: @test_vrsraq_n_u8
-  // CHECK: [[TMP:%[0-9a-zA-Z._]+]] = tail call <16 x i8> @llvm.aarch64.neon.urshl.v16i8(<16 x i8> %in, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
-  // CHECK: add <16 x i8> [[TMP]], %acc
-  return vrsraq_n_u8(acc, in, 1);
-}
-
-uint16x8_t test_vrsraq_n_u16(uint16x8_t acc, uint16x8_t in) {
-  // CHECK-LABEL: @test_vrsraq_n_u16
-  // CHECK: [[TMP:%[0-9a-zA-Z._]+]] = tail call <8 x i16> @llvm.aarch64.neon.urshl.v8i16(<8 x i16> %in, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
-  // CHECK: add <8 x i16> [[TMP]], %acc
-  return vrsraq_n_u16(acc, in, 1);
-}
-
-uint32x4_t test_vrsraq_n_u32(uint32x4_t acc, uint32x4_t in) {
-  // CHECK-LABEL: @test_vrsraq_n_u32
-  // CHECK: [[TMP:%[0-9a-zA-Z._]+]] = tail call <4 x i32> @llvm.aarch64.neon.urshl.v4i32(<4 x i32> %in, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
-  // CHECK: add <4 x i32> [[TMP]], %acc
-  return vrsraq_n_u32(acc, in, 1);
-}
-
-uint64x2_t test_vrsraq_n_u64(uint64x2_t acc, uint64x2_t in) {
-  // CHECK-LABEL: @test_vrsraq_n_u64
-  // CHECK: [[TMP:%[0-9a-zA-Z._]+]] = tail call <2 x i64> @llvm.aarch64.neon.urshl.v2i64(<2 x i64> %in, <2 x i64> <i64 -1, i64 -1>)
-  // CHECK: add <2 x i64> [[TMP]], %acc
-  return vrsraq_n_u64(acc, in, 1);
-}
diff --git a/test/CodeGen/arm64_vsli.c b/test/CodeGen/arm64_vsli.c
deleted file mode 100644
index b2a30abe382aa..0000000000000
--- a/test/CodeGen/arm64_vsli.c
+++ /dev/null
@@ -1,148 +0,0 @@
-// RUN: %clang_cc1 -O1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - -emit-llvm %s | FileCheck %s
-// RUN: %clang_cc1 -O1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - %s | \
-// RUN:   FileCheck -check-prefix=CHECK_CODEGEN %s
-// REQUIRES: aarch64-registered-target
-// Test
-
-#include <arm_neon.h>
-
-int8x8_t test_vsli_n_s8(int8x8_t a1, int8x8_t a2) {
-  // CHECK: test_vsli_n_s8
-  return vsli_n_s8(a1, a2, 3);
-  // CHECK: llvm.aarch64.neon.vsli.v8i8
-  // CHECK_CODEGEN: sli.8b  v0, v1, #3
-}
-
-int16x4_t test_vsli_n_s16(int16x4_t a1, int16x4_t a2) {
-  // CHECK: test_vsli_n_s16
-  return vsli_n_s16(a1, a2, 3);
-  // CHECK: llvm.aarch64.neon.vsli.v4i16
-  // CHECK_CODEGEN: sli.4h  v0, v1, #3
-}
-
-int32x2_t test_vsli_n_s32(int32x2_t a1, int32x2_t a2) {
-  // CHECK: test_vsli_n_s32
-  return vsli_n_s32(a1, a2, 1);
-  // CHECK: llvm.aarch64.neon.vsli.v2i32
-  // CHECK_CODEGEN: sli.2s  v0, v1, #1
-}
-
-int64x1_t test_vsli_n_s64(int64x1_t a1, int64x1_t a2) {
-  // CHECK: test_vsli_n_s64
-  return vsli_n_s64(a1, a2, 1);
-  // CHECK: llvm.aarch64.neon.vsli.v1i64
-  // CHECK_CODEGEN: sli     d0, d1, #1
-}
-
-uint8x8_t test_vsli_n_u8(uint8x8_t a1, uint8x8_t a2) {
-  // CHECK: test_vsli_n_u8
-  return vsli_n_u8(a1, a2, 3);
-  // CHECK: llvm.aarch64.neon.vsli.v8i8
-  // CHECK_CODEGEN: sli.8b  v0, v1, #3
-}
-
-uint16x4_t test_vsli_n_u16(uint16x4_t a1, uint16x4_t a2) {
-  // CHECK: test_vsli_n_u16
-  return vsli_n_u16(a1, a2, 3);
-  // CHECK: llvm.aarch64.neon.vsli.v4i16
-  // CHECK_CODEGEN: sli.4h  v0, v1, #3
-}
-
-uint32x2_t test_vsli_n_u32(uint32x2_t a1, uint32x2_t a2) {
-  // CHECK: test_vsli_n_u32
-  return vsli_n_u32(a1, a2, 1);
-  // CHECK: llvm.aarch64.neon.vsli.v2i32
-  // CHECK_CODEGEN: sli.2s  v0, v1, #1
-}
-
-uint64x1_t test_vsli_n_u64(uint64x1_t a1, uint64x1_t a2) {
-  // CHECK: test_vsli_n_u64
-  return vsli_n_u64(a1, a2, 1);
-  // CHECK: llvm.aarch64.neon.vsli.v1i64
-  // CHECK_CODEGEN: sli     d0, d1, #1
-}
-
-poly8x8_t test_vsli_n_p8(poly8x8_t a1, poly8x8_t a2) {
-  // CHECK: test_vsli_n_p8
-  return vsli_n_p8(a1, a2, 1);
-  // CHECK: llvm.aarch64.neon.vsli.v8i8
-  // CHECK_CODEGEN: sli.8b  v0, v1, #1
-}
-
-poly16x4_t test_vsli_n_p16(poly16x4_t a1, poly16x4_t a2) {
-  // CHECK: test_vsli_n_p16
-  return vsli_n_p16(a1, a2, 1);
-  // CHECK: llvm.aarch64.neon.vsli.v4i16
-  // CHECK_CODEGEN: sli.4h  v0, v1, #1
-}
-
-int8x16_t test_vsliq_n_s8(int8x16_t a1, int8x16_t a2) {
-  // CHECK: test_vsliq_n_s8
-  return vsliq_n_s8(a1, a2, 3);
-  // CHECK: llvm.aarch64.neon.vsli.v16i8
-  // CHECK_CODEGEN: sli.16b v0, v1, #3
-}
-
-int16x8_t test_vsliq_n_s16(int16x8_t a1, int16x8_t a2) {
-  // CHECK: test_vsliq_n_s16
-  return vsliq_n_s16(a1, a2, 3);
-  // CHECK: llvm.aarch64.neon.vsli.v8i16
-  // CHECK_CODEGEN: sli.8h  v0, v1, #3
-}
-
-int32x4_t test_vsliq_n_s32(int32x4_t a1, int32x4_t a2) {
-  // CHECK: test_vsliq_n_s32
-  return vsliq_n_s32(a1, a2, 1);
-  // CHECK: llvm.aarch64.neon.vsli.v4i32
-  // CHECK_CODEGEN: sli.4s  v0, v1, #1
-}
-
-int64x2_t test_vsliq_n_s64(int64x2_t a1, int64x2_t a2) {
-  // CHECK: test_vsliq_n_s64
-  return vsliq_n_s64(a1, a2, 1);
-  // CHECK: llvm.aarch64.neon.vsli.v2i64
-  // CHECK_CODEGEN: sli.2d  v0, v1, #1
-}
-
-uint8x16_t test_vsliq_n_u8(uint8x16_t a1, uint8x16_t a2) {
-  // CHECK: test_vsliq_n_u8
-  return vsliq_n_u8(a1, a2, 3);
-  // CHECK: llvm.aarch64.neon.vsli.v16i8
-  // CHECK_CODEGEN: sli.16b v0, v1, #3
-}
-
-uint16x8_t test_vsliq_n_u16(uint16x8_t a1, uint16x8_t a2) {
-  // CHECK: test_vsliq_n_u16
-  return vsliq_n_u16(a1, a2, 3);
-  // CHECK: llvm.aarch64.neon.vsli.v8i16
-  // CHECK_CODEGEN: sli.8h  v0, v1, #3
-}
-
-uint32x4_t test_vsliq_n_u32(uint32x4_t a1, uint32x4_t a2) {
-  // CHECK: test_vsliq_n_u32
-  return vsliq_n_u32(a1, a2, 1);
-  // CHECK: llvm.aarch64.neon.vsli.v4i32
-  // CHECK_CODEGEN: sli.4s  v0, v1, #1
-}
-
-uint64x2_t test_vsliq_n_u64(uint64x2_t a1, uint64x2_t a2) {
-  // CHECK: test_vsliq_n_u64
-  return vsliq_n_u64(a1, a2, 1);
-  // CHECK: llvm.aarch64.neon.vsli.v2i64
-  // CHECK_CODEGEN: sli.2d  v0, v1, #1
-}
-
-poly8x16_t test_vsliq_n_p8(poly8x16_t a1, poly8x16_t a2) {
-  // CHECK: test_vsliq_n_p8
-  return vsliq_n_p8(a1, a2, 1);
-  // CHECK: llvm.aarch64.neon.vsli.v16i8
-  // CHECK_CODEGEN: sli.16b v0, v1, #1
-}
-
-poly16x8_t test_vsliq_n_p16(poly16x8_t a1, poly16x8_t a2) {
-  // CHECK: test_vsliq_n_p16
-  return vsliq_n_p16(a1, a2, 1);
-  // CHECK: llvm.aarch64.neon.vsli.v8i16
-  // CHECK_CODEGEN: sli.8h  v0, v1, #1
-}
-
diff --git a/test/CodeGen/arm64_vsri.c b/test/CodeGen/arm64_vsri.c
deleted file mode 100644
index 579431ddf0209..0000000000000
--- a/test/CodeGen/arm64_vsri.c
+++ /dev/null
@@ -1,149 +0,0 @@
-// RUN: %clang_cc1 -O1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - -emit-llvm %s | FileCheck %s
-// RUN: %clang_cc1 -O1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - %s | \
-// RUN:   FileCheck -check-prefix=CHECK_CODEGEN %s
-// REQUIRES: aarch64-registered-target
-
-// Test ARM64 SIMD vector shift right and insert: vsri[q]_n_*
-
-#include <arm_neon.h>
-
-int8x8_t test_vsri_n_s8(int8x8_t a1, int8x8_t a2) {
-  // CHECK: test_vsri_n_s8
-  return vsri_n_s8(a1, a2, 3);
-  // CHECK: llvm.aarch64.neon.vsri.v8i8
-  // CHECK_CODEGEN: sri.8b  v0, v1, #3
-}
-
-int16x4_t test_vsri_n_s16(int16x4_t a1, int16x4_t a2) {
-  // CHECK: test_vsri_n_s16
-  return vsri_n_s16(a1, a2, 3);
-  // CHECK: llvm.aarch64.neon.vsri.v4i16
-  // CHECK_CODEGEN: sri.4h  v0, v1, #3
-}
-
-int32x2_t test_vsri_n_s32(int32x2_t a1, int32x2_t a2) {
-  // CHECK: test_vsri_n_s32
-  return vsri_n_s32(a1, a2, 1);
-  // CHECK: llvm.aarch64.neon.vsri.v2i32
-  // CHECK_CODEGEN: sri.2s  v0, v1, #1
-}
-
-int64x1_t test_vsri_n_s64(int64x1_t a1, int64x1_t a2) {
-  // CHECK: test_vsri_n_s64
-  return vsri_n_s64(a1, a2, 1);
-  // CHECK: llvm.aarch64.neon.vsri.v1i64
-  // CHECK_CODEGEN: sri     d0, d1, #1
-}
-
-uint8x8_t test_vsri_n_u8(uint8x8_t a1, uint8x8_t a2) {
-  // CHECK: test_vsri_n_u8
-  return vsri_n_u8(a1, a2, 3);
-  // CHECK: llvm.aarch64.neon.vsri.v8i8
-  // CHECK_CODEGEN: sri.8b  v0, v1, #3
-}
-
-uint16x4_t test_vsri_n_u16(uint16x4_t a1, uint16x4_t a2) {
-  // CHECK: test_vsri_n_u16
-  return vsri_n_u16(a1, a2, 3);
-  // CHECK: llvm.aarch64.neon.vsri.v4i16
-  // CHECK_CODEGEN: sri.4h  v0, v1, #3
-}
-
-uint32x2_t test_vsri_n_u32(uint32x2_t a1, uint32x2_t a2) {
-  // CHECK: test_vsri_n_u32
-  return vsri_n_u32(a1, a2, 1);
-  // CHECK: llvm.aarch64.neon.vsri.v2i32
-  // CHECK_CODEGEN: sri.2s  v0, v1, #1
-}
-
-uint64x1_t test_vsri_n_u64(uint64x1_t a1, uint64x1_t a2) {
-  // CHECK: test_vsri_n_u64
-  return vsri_n_u64(a1, a2, 1);
-  // CHECK: llvm.aarch64.neon.vsri.v1i64
-  // CHECK_CODEGEN: sri     d0, d1, #1
-}
-
-poly8x8_t test_vsri_n_p8(poly8x8_t a1, poly8x8_t a2) {
-  // CHECK: test_vsri_n_p8
-  return vsri_n_p8(a1, a2, 1);
-  // CHECK: llvm.aarch64.neon.vsri.v8i8
-  // CHECK_CODEGEN: sri.8b  v0, v1, #1
-}
-
-poly16x4_t test_vsri_n_p16(poly16x4_t a1, poly16x4_t a2) {
-  // CHECK: test_vsri_n_p16
-  return vsri_n_p16(a1, a2, 1);
-  // CHECK: llvm.aarch64.neon.vsri.v4i16
-  // CHECK_CODEGEN: sri.4h  v0, v1, #1
-}
-
-int8x16_t test_vsriq_n_s8(int8x16_t a1, int8x16_t a2) {
-  // CHECK: test_vsriq_n_s8
-  return vsriq_n_s8(a1, a2, 3);
-  // CHECK: llvm.aarch64.neon.vsri.v16i8
-  // CHECK_CODEGEN: sri.16b v0, v1, #3
-}
-
-int16x8_t test_vsriq_n_s16(int16x8_t a1, int16x8_t a2) {
-  // CHECK: test_vsriq_n_s16
-  return vsriq_n_s16(a1, a2, 3);
-  // CHECK: llvm.aarch64.neon.vsri.v8i16
-  // CHECK_CODEGEN: sri.8h  v0, v1, #3
-}
-
-int32x4_t test_vsriq_n_s32(int32x4_t a1, int32x4_t a2) {
-  // CHECK: test_vsriq_n_s32
-  return vsriq_n_s32(a1, a2, 1);
-  // CHECK: llvm.aarch64.neon.vsri.v4i32
-  // CHECK_CODEGEN: sri.4s  v0, v1, #1
-}
-
-int64x2_t test_vsriq_n_s64(int64x2_t a1, int64x2_t a2) {
-  // CHECK: test_vsriq_n_s64
-  return vsriq_n_s64(a1, a2, 1);
-  // CHECK: llvm.aarch64.neon.vsri.v2i64
-  // CHECK_CODEGEN: sri.2d  v0, v1, #1
-}
-
-uint8x16_t test_vsriq_n_u8(uint8x16_t a1, uint8x16_t a2) {
-  // CHECK: test_vsriq_n_u8
-  return vsriq_n_u8(a1, a2, 3);
-  // CHECK: llvm.aarch64.neon.vsri.v16i8
-  // CHECK_CODEGEN: sri.16b v0, v1, #3
-}
-
-uint16x8_t test_vsriq_n_u16(uint16x8_t a1, uint16x8_t a2) {
-  // CHECK: test_vsriq_n_u16
-  return vsriq_n_u16(a1, a2, 3);
-  // CHECK: llvm.aarch64.neon.vsri.v8i16
-  // CHECK_CODEGEN: sri.8h  v0, v1, #3
-}
-
-uint32x4_t test_vsriq_n_u32(uint32x4_t a1, uint32x4_t a2) {
-  // CHECK: test_vsriq_n_u32
-  return vsriq_n_u32(a1, a2, 1);
-  // CHECK: llvm.aarch64.neon.vsri.v4i32
-  // CHECK_CODEGEN: sri.4s  v0, v1, #1
-}
-
-uint64x2_t test_vsriq_n_u64(uint64x2_t a1, uint64x2_t a2) {
-  // CHECK: test_vsriq_n_u64
-  return vsriq_n_u64(a1, a2, 1);
-  // CHECK: llvm.aarch64.neon.vsri.v2i64
-  // CHECK_CODEGEN: sri.2d  v0, v1, #1
-}
-
-poly8x16_t test_vsriq_n_p8(poly8x16_t a1, poly8x16_t a2) {
-  // CHECK: test_vsriq_n_p8
-  return vsriq_n_p8(a1, a2, 1);
-  // CHECK: llvm.aarch64.neon.vsri.v16i8
-  // CHECK_CODEGEN: sri.16b v0, v1, #1
-}
-
-poly16x8_t test_vsriq_n_p16(poly16x8_t a1, poly16x8_t a2) {
-  // CHECK: test_vsriq_n_p16
-  return vsriq_n_p16(a1, a2, 1);
-  // CHECK: llvm.aarch64.neon.vsri.v8i16
-  // CHECK_CODEGEN: sri.8h  v0, v1, #1
-}
-
diff --git a/test/CodeGen/arm64_vtst.c b/test/CodeGen/arm64_vtst.c
deleted file mode 100644
index 9f3ed84236451..0000000000000
--- a/test/CodeGen/arm64_vtst.c
+++ /dev/null
@@ -1,22 +0,0 @@
-// RUN: %clang_cc1 -O1 -triple arm64-apple-ios7 -target-feature +neon -ffreestanding -S -o - -emit-llvm %s | FileCheck %s
-// Test ARM64 SIMD comparison test intrinsics
-
-#include <arm_neon.h>
-
-uint64x2_t test_vtstq_s64(int64x2_t a1, int64x2_t a2) {
-  // CHECK-LABEL: test_vtstq_s64
-  return vtstq_s64(a1, a2);
-  // CHECK: [[COMMONBITS:%[A-Za-z0-9.]+]] = and <2 x i64> {{%a1, %a2|%a2, %a1}}
-  // CHECK: [[MASK:%[A-Za-z0-9.]+]] = icmp ne <2 x i64> [[COMMONBITS]], zeroinitializer
-  // CHECK: [[RES:%[A-Za-z0-9.]+]] = sext <2 x i1> [[MASK]] to <2 x i64>
-  // CHECK: ret <2 x i64> [[RES]]
-}
-
-uint64x2_t test_vtstq_u64(uint64x2_t a1, uint64x2_t a2) {
-  // CHECK-LABEL: test_vtstq_u64
-  return vtstq_u64(a1, a2);
-  // CHECK: [[COMMONBITS:%[A-Za-z0-9.]+]] = and <2 x i64> {{%a1, %a2|%a2, %a1}}
-  // CHECK: [[MASK:%[A-Za-z0-9.]+]] = icmp ne <2 x i64> [[COMMONBITS]], zeroinitializer
-  // CHECK: [[RES:%[A-Za-z0-9.]+]] = sext <2 x i1> [[MASK]] to <2 x i64>
-  // CHECK: ret <2 x i64> [[RES]]
-}
diff --git a/test/CodeGen/arm_neon_intrinsics.c b/test/CodeGen/arm_neon_intrinsics.c
index d92c32c476aea..4cc7eedffd5b7 100644
--- a/test/CodeGen/arm_neon_intrinsics.c
+++ b/test/CodeGen/arm_neon_intrinsics.c
@@ -1,1611 +1,2398 @@
 // RUN: %clang_cc1 -triple thumbv7s-apple-darwin -target-abi apcs-gnu\
-// RUN:  -target-cpu swift -ffreestanding -Os -S -o - %s\
-// RUN:  | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-SWIFT
-// RUN: %clang_cc1 -triple armv8-linux-gnu \
-// RUN:  -target-cpu cortex-a57 -mfloat-abi soft -ffreestanding -Os -S -o - %s\
-// RUN:  | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-A57
+// RUN:  -target-cpu swift -fallow-half-arguments-and-returns -ffreestanding -emit-llvm -o - %s \
+// RUN:  | opt -S -mem2reg | FileCheck %s
 
-// REQUIRES: long_tests
+// REQUIRES: long-tests
 
 #include <arm_neon.h>
 
-// CHECK-LABEL: test_vaba_s8
-// CHECK: vaba.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vaba_s8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+// CHECK:   [[VABD_V_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %b, <8 x i8> %c) #4
+// CHECK:   [[ADD_I:%.*]] = add <8 x i8> %a, [[VABD_V_I_I]]
+// CHECK:   ret <8 x i8> [[ADD_I]]
 int8x8_t test_vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c) {
   return vaba_s8(a, b, c);
 }
 
-// CHECK-LABEL: test_vaba_s16
-// CHECK: vaba.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vaba_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK:   [[VABD_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VABD_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> [[VABD_V_I_I]], <4 x i16> [[VABD_V1_I_I]]) #4
+// CHECK:   [[VABD_V3_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I]] to <4 x i16>
+// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, [[TMP2]]
+// CHECK:   ret <4 x i16> [[ADD_I]]
 int16x4_t test_vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
   return vaba_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vaba_s32
-// CHECK: vaba.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vaba_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK:   [[VABD_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VABD_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> [[VABD_V_I_I]], <2 x i32> [[VABD_V1_I_I]]) #4
+// CHECK:   [[VABD_V3_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I]] to <2 x i32>
+// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, [[TMP2]]
+// CHECK:   ret <2 x i32> [[ADD_I]]
 int32x2_t test_vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
   return vaba_s32(a, b, c);
 }
 
-// CHECK-LABEL: test_vaba_u8
-// CHECK: vaba.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vaba_u8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+// CHECK:   [[VABD_V_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %b, <8 x i8> %c) #4
+// CHECK:   [[ADD_I:%.*]] = add <8 x i8> %a, [[VABD_V_I_I]]
+// CHECK:   ret <8 x i8> [[ADD_I]]
 uint8x8_t test_vaba_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) {
   return vaba_u8(a, b, c);
 }
 
-// CHECK-LABEL: test_vaba_u16
-// CHECK: vaba.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vaba_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK:   [[VABD_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VABD_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> [[VABD_V_I_I]], <4 x i16> [[VABD_V1_I_I]]) #4
+// CHECK:   [[VABD_V3_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I]] to <4 x i16>
+// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, [[TMP2]]
+// CHECK:   ret <4 x i16> [[ADD_I]]
 uint16x4_t test_vaba_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
   return vaba_u16(a, b, c);
 }
 
-// CHECK-LABEL: test_vaba_u32
-// CHECK: vaba.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vaba_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK:   [[VABD_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VABD_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> [[VABD_V_I_I]], <2 x i32> [[VABD_V1_I_I]]) #4
+// CHECK:   [[VABD_V3_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I]] to <2 x i32>
+// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, [[TMP2]]
+// CHECK:   ret <2 x i32> [[ADD_I]]
 uint32x2_t test_vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
   return vaba_u32(a, b, c);
 }
 
-// CHECK-LABEL: test_vabaq_s8
-// CHECK: vaba.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vabaq_s8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 {
+// CHECK:   [[VABDQ_V_I_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8> %b, <16 x i8> %c) #4
+// CHECK:   [[ADD_I:%.*]] = add <16 x i8> %a, [[VABDQ_V_I_I]]
+// CHECK:   ret <16 x i8> [[ADD_I]]
 int8x16_t test_vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) {
   return vabaq_s8(a, b, c);
 }
 
-// CHECK-LABEL: test_vabaq_s16
-// CHECK: vaba.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vabaq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %c to <16 x i8>
+// CHECK:   [[VABDQ_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VABDQ_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VABDQ_V2_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> [[VABDQ_V_I_I]], <8 x i16> [[VABDQ_V1_I_I]]) #4
+// CHECK:   [[VABDQ_V3_I_I:%.*]] = bitcast <8 x i16> [[VABDQ_V2_I_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VABDQ_V3_I_I]] to <8 x i16>
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[TMP2]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 int16x8_t test_vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) {
   return vabaq_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vabaq_s32
-// CHECK: vaba.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vabaq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %c to <16 x i8>
+// CHECK:   [[VABDQ_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VABDQ_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VABDQ_V2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> [[VABDQ_V_I_I]], <4 x i32> [[VABDQ_V1_I_I]]) #4
+// CHECK:   [[VABDQ_V3_I_I:%.*]] = bitcast <4 x i32> [[VABDQ_V2_I_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VABDQ_V3_I_I]] to <4 x i32>
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[TMP2]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 int32x4_t test_vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) {
   return vabaq_s32(a, b, c);
 }
 
-// CHECK-LABEL: test_vabaq_u8
-// CHECK: vaba.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vabaq_u8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 {
+// CHECK:   [[VABDQ_V_I_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8> %b, <16 x i8> %c) #4
+// CHECK:   [[ADD_I:%.*]] = add <16 x i8> %a, [[VABDQ_V_I_I]]
+// CHECK:   ret <16 x i8> [[ADD_I]]
 uint8x16_t test_vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) {
   return vabaq_u8(a, b, c);
 }
 
-// CHECK-LABEL: test_vabaq_u16
-// CHECK: vaba.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vabaq_u16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %c to <16 x i8>
+// CHECK:   [[VABDQ_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VABDQ_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VABDQ_V2_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> [[VABDQ_V_I_I]], <8 x i16> [[VABDQ_V1_I_I]]) #4
+// CHECK:   [[VABDQ_V3_I_I:%.*]] = bitcast <8 x i16> [[VABDQ_V2_I_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VABDQ_V3_I_I]] to <8 x i16>
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[TMP2]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 uint16x8_t test_vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) {
   return vabaq_u16(a, b, c);
 }
 
-// CHECK-LABEL: test_vabaq_u32
-// CHECK: vaba.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vabaq_u32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %c to <16 x i8>
+// CHECK:   [[VABDQ_V_I_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VABDQ_V1_I_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VABDQ_V2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> [[VABDQ_V_I_I]], <4 x i32> [[VABDQ_V1_I_I]]) #4
+// CHECK:   [[VABDQ_V3_I_I:%.*]] = bitcast <4 x i32> [[VABDQ_V2_I_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VABDQ_V3_I_I]] to <4 x i32>
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[TMP2]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 uint32x4_t test_vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) {
   return vabaq_u32(a, b, c);
 }
 
 
-// CHECK-LABEL: test_vabal_s8
-// CHECK: vabal.s8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vabal_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+// CHECK:   [[VABD_V_I_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %b, <8 x i8> %c) #4
+// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_V_I_I_I]] to <8 x i16>
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 int16x8_t test_vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c) {
   return vabal_s8(a, b, c);
 }
 
-// CHECK-LABEL: test_vabal_s16
-// CHECK: vabal.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vabal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK:   [[VABD_V_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VABD_V1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VABD_V2_I_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> [[VABD_V_I_I_I]], <4 x i16> [[VABD_V1_I_I_I]]) #4
+// CHECK:   [[VABD_V3_I_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I_I]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[TMP4]] to <4 x i32>
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 int32x4_t test_vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
   return vabal_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vabal_s32
-// CHECK: vabal.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vabal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK:   [[VABD_V_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VABD_V1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VABD_V2_I_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> [[VABD_V_I_I_I]], <2 x i32> [[VABD_V1_I_I_I]]) #4
+// CHECK:   [[VABD_V3_I_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I_I]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[TMP4]] to <2 x i64>
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 int64x2_t test_vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
   return vabal_s32(a, b, c);
 }
 
-// CHECK-LABEL: test_vabal_u8
-// CHECK: vabal.u8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vabal_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+// CHECK:   [[VABD_V_I_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %b, <8 x i8> %c) #4
+// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <8 x i8> [[VABD_V_I_I_I]] to <8 x i16>
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 uint16x8_t test_vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) {
   return vabal_u8(a, b, c);
 }
 
-// CHECK-LABEL: test_vabal_u16
-// CHECK: vabal.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vabal_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK:   [[VABD_V_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VABD_V1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VABD_V2_I_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> [[VABD_V_I_I_I]], <4 x i16> [[VABD_V1_I_I_I]]) #4
+// CHECK:   [[VABD_V3_I_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I_I]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <4 x i16> [[TMP4]] to <4 x i32>
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 uint32x4_t test_vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
   return vabal_u16(a, b, c);
 }
 
-// CHECK-LABEL: test_vabal_u32
-// CHECK: vabal.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vabal_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK:   [[VABD_V_I_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VABD_V1_I_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VABD_V2_I_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> [[VABD_V_I_I_I]], <2 x i32> [[VABD_V1_I_I_I]]) #4
+// CHECK:   [[VABD_V3_I_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I_I]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK:   [[VMOVL_I_I_I:%.*]] = zext <2 x i32> [[TMP4]] to <2 x i64>
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I_I]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 uint64x2_t test_vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
   return vabal_u32(a, b, c);
 }
 
 
-// CHECK-LABEL: test_vabd_s8
-// CHECK: vabd.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vabd_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VABD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VABD_V_I]]
 int8x8_t test_vabd_s8(int8x8_t a, int8x8_t b) {
   return vabd_s8(a, b);
 }
 
-// CHECK-LABEL: test_vabd_s16
-// CHECK: vabd.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vabd_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VABD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VABD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VABD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> [[VABD_V_I]], <4 x i16> [[VABD_V1_I]]) #4
+// CHECK:   [[VABD_V3_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vabd_s16(int16x4_t a, int16x4_t b) {
   return vabd_s16(a, b);
 }
 
-// CHECK-LABEL: test_vabd_s32
-// CHECK: vabd.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vabd_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VABD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VABD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VABD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> [[VABD_V_I]], <2 x i32> [[VABD_V1_I]]) #4
+// CHECK:   [[VABD_V3_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vabd_s32(int32x2_t a, int32x2_t b) {
   return vabd_s32(a, b);
 }
 
-// CHECK-LABEL: test_vabd_u8
-// CHECK: vabd.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vabd_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VABD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VABD_V_I]]
 uint8x8_t test_vabd_u8(uint8x8_t a, uint8x8_t b) {
   return vabd_u8(a, b);
 }
 
-// CHECK-LABEL: test_vabd_u16
-// CHECK: vabd.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vabd_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VABD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VABD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VABD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> [[VABD_V_I]], <4 x i16> [[VABD_V1_I]]) #4
+// CHECK:   [[VABD_V3_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 uint16x4_t test_vabd_u16(uint16x4_t a, uint16x4_t b) {
   return vabd_u16(a, b);
 }
 
-// CHECK-LABEL: test_vabd_u32
-// CHECK: vabd.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vabd_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VABD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VABD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VABD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> [[VABD_V_I]], <2 x i32> [[VABD_V1_I]]) #4
+// CHECK:   [[VABD_V3_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 uint32x2_t test_vabd_u32(uint32x2_t a, uint32x2_t b) {
   return vabd_u32(a, b);
 }
 
-// CHECK-LABEL: test_vabd_f32
-// CHECK: vabd.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vabd_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[VABD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VABD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[VABD_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vabds.v2f32(<2 x float> [[VABD_V_I]], <2 x float> [[VABD_V1_I]]) #4
+// CHECK:   [[VABD_V3_I:%.*]] = bitcast <2 x float> [[VABD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I]] to <2 x float>
+// CHECK:   ret <2 x float> [[TMP2]]
 float32x2_t test_vabd_f32(float32x2_t a, float32x2_t b) {
   return vabd_f32(a, b);
 }
 
-// CHECK-LABEL: test_vabdq_s8
-// CHECK: vabd.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vabdq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VABDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VABDQ_V_I]]
 int8x16_t test_vabdq_s8(int8x16_t a, int8x16_t b) {
   return vabdq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vabdq_s16
-// CHECK: vabd.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vabdq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VABDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VABDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VABDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> [[VABDQ_V_I]], <8 x i16> [[VABDQ_V1_I]]) #4
+// CHECK:   [[VABDQ_V3_I:%.*]] = bitcast <8 x i16> [[VABDQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VABDQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vabdq_s16(int16x8_t a, int16x8_t b) {
   return vabdq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vabdq_s32
-// CHECK: vabd.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vabdq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VABDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VABDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VABDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> [[VABDQ_V_I]], <4 x i32> [[VABDQ_V1_I]]) #4
+// CHECK:   [[VABDQ_V3_I:%.*]] = bitcast <4 x i32> [[VABDQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VABDQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vabdq_s32(int32x4_t a, int32x4_t b) {
   return vabdq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vabdq_u8
-// CHECK: vabd.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vabdq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VABDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VABDQ_V_I]]
 uint8x16_t test_vabdq_u8(uint8x16_t a, uint8x16_t b) {
   return vabdq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vabdq_u16
-// CHECK: vabd.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vabdq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VABDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VABDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VABDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> [[VABDQ_V_I]], <8 x i16> [[VABDQ_V1_I]]) #4
+// CHECK:   [[VABDQ_V3_I:%.*]] = bitcast <8 x i16> [[VABDQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VABDQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 uint16x8_t test_vabdq_u16(uint16x8_t a, uint16x8_t b) {
   return vabdq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vabdq_u32
-// CHECK: vabd.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vabdq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VABDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VABDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VABDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> [[VABDQ_V_I]], <4 x i32> [[VABDQ_V1_I]]) #4
+// CHECK:   [[VABDQ_V3_I:%.*]] = bitcast <4 x i32> [[VABDQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VABDQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 uint32x4_t test_vabdq_u32(uint32x4_t a, uint32x4_t b) {
   return vabdq_u32(a, b);
 }
 
-// CHECK-LABEL: test_vabdq_f32
-// CHECK: vabd.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x float> @test_vabdq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[VABDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VABDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[VABDQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vabds.v4f32(<4 x float> [[VABDQ_V_I]], <4 x float> [[VABDQ_V1_I]]) #4
+// CHECK:   [[VABDQ_V3_I:%.*]] = bitcast <4 x float> [[VABDQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VABDQ_V3_I]] to <4 x float>
+// CHECK:   ret <4 x float> [[TMP2]]
 float32x4_t test_vabdq_f32(float32x4_t a, float32x4_t b) {
   return vabdq_f32(a, b);
 }
 
 
-// CHECK-LABEL: test_vabdl_s8
-// CHECK: vabdl.s8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vabdl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VABD_V_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_V_I_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[VMOVL_I_I]]
 int16x8_t test_vabdl_s8(int8x8_t a, int8x8_t b) {
   return vabdl_s8(a, b);
 }
 
-// CHECK-LABEL: test_vabdl_s16
-// CHECK: vabdl.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vabdl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VABD_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VABD_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> [[VABD_V_I_I]], <4 x i16> [[VABD_V1_I_I]]) #4
+// CHECK:   [[VABD_V3_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP4]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VMOVL_I_I]]
 int32x4_t test_vabdl_s16(int16x4_t a, int16x4_t b) {
   return vabdl_s16(a, b);
 }
 
-// CHECK-LABEL: test_vabdl_s32
-// CHECK: vabdl.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vabdl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VABD_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VABD_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> [[VABD_V_I_I]], <2 x i32> [[VABD_V1_I_I]]) #4
+// CHECK:   [[VABD_V3_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP4]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VMOVL_I_I]]
 int64x2_t test_vabdl_s32(int32x2_t a, int32x2_t b) {
   return vabdl_s32(a, b);
 }
 
-// CHECK-LABEL: test_vabdl_u8
-// CHECK: vabdl.u8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vabdl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VABD_V_I_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <8 x i8> [[VABD_V_I_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[VMOVL_I_I]]
 uint16x8_t test_vabdl_u8(uint8x8_t a, uint8x8_t b) {
   return vabdl_u8(a, b);
 }
 
-// CHECK-LABEL: test_vabdl_u16
-// CHECK: vabdl.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vabdl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VABD_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VABD_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VABD_V2_I_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> [[VABD_V_I_I]], <4 x i16> [[VABD_V1_I_I]]) #4
+// CHECK:   [[VABD_V3_I_I:%.*]] = bitcast <4 x i16> [[VABD_V2_I_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <4 x i16> [[TMP2]] to <8 x i8>
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <4 x i16>
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP4]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VMOVL_I_I]]
 uint32x4_t test_vabdl_u16(uint16x4_t a, uint16x4_t b) {
   return vabdl_u16(a, b);
 }
 
-// CHECK-LABEL: test_vabdl_u32
-// CHECK: vabdl.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vabdl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VABD_V_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VABD_V1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VABD_V2_I_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> [[VABD_V_I_I]], <2 x i32> [[VABD_V1_I_I]]) #4
+// CHECK:   [[VABD_V3_I_I:%.*]] = bitcast <2 x i32> [[VABD_V2_I_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VABD_V3_I_I]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <2 x i32> [[TMP2]] to <8 x i8>
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP3]] to <2 x i32>
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP4]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VMOVL_I_I]]
 uint64x2_t test_vabdl_u32(uint32x2_t a, uint32x2_t b) {
   return vabdl_u32(a, b);
 }
 
 
-// CHECK-LABEL: test_vabs_s8
-// CHECK: vabs.s8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vabs_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VABS_I:%.*]] = call <8 x i8> @llvm.arm.neon.vabs.v8i8(<8 x i8> %a) #4
+// CHECK:   ret <8 x i8> [[VABS_I]]
 int8x8_t test_vabs_s8(int8x8_t a) {
   return vabs_s8(a);
 }
 
-// CHECK-LABEL: test_vabs_s16
-// CHECK: vabs.s16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vabs_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VABS_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VABS1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vabs.v4i16(<4 x i16> [[VABS_I]]) #4
+// CHECK:   ret <4 x i16> [[VABS1_I]]
 int16x4_t test_vabs_s16(int16x4_t a) {
   return vabs_s16(a);
 }
 
-// CHECK-LABEL: test_vabs_s32
-// CHECK: vabs.s32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vabs_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VABS_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VABS1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vabs.v2i32(<2 x i32> [[VABS_I]]) #4
+// CHECK:   ret <2 x i32> [[VABS1_I]]
 int32x2_t test_vabs_s32(int32x2_t a) {
   return vabs_s32(a);
 }
 
-// CHECK-LABEL: test_vabs_f32
-// CHECK: vabs.f32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vabs_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VABS_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VABS1_I:%.*]] = call <2 x float> @llvm.fabs.v2f32(<2 x float> [[VABS_I]]) #4
+// CHECK:   ret <2 x float> [[VABS1_I]]
 float32x2_t test_vabs_f32(float32x2_t a) {
   return vabs_f32(a);
 }
 
-// CHECK-LABEL: test_vabsq_s8
-// CHECK: vabs.s8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vabsq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VABS_I:%.*]] = call <16 x i8> @llvm.arm.neon.vabs.v16i8(<16 x i8> %a) #4
+// CHECK:   ret <16 x i8> [[VABS_I]]
 int8x16_t test_vabsq_s8(int8x16_t a) {
   return vabsq_s8(a);
 }
 
-// CHECK-LABEL: test_vabsq_s16
-// CHECK: vabs.s16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vabsq_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VABS_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VABS1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> [[VABS_I]]) #4
+// CHECK:   ret <8 x i16> [[VABS1_I]]
 int16x8_t test_vabsq_s16(int16x8_t a) {
   return vabsq_s16(a);
 }
 
-// CHECK-LABEL: test_vabsq_s32
-// CHECK: vabs.s32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vabsq_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VABS_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VABS1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vabs.v4i32(<4 x i32> [[VABS_I]]) #4
+// CHECK:   ret <4 x i32> [[VABS1_I]]
 int32x4_t test_vabsq_s32(int32x4_t a) {
   return vabsq_s32(a);
 }
 
-// CHECK-LABEL: test_vabsq_f32
-// CHECK: vabs.f32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x float> @test_vabsq_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VABS_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VABS1_I:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[VABS_I]]) #4
+// CHECK:   ret <4 x float> [[VABS1_I]]
 float32x4_t test_vabsq_f32(float32x4_t a) {
   return vabsq_f32(a);
 }
 
 
-// CHECK-LABEL: test_vadd_s8
-// CHECK: vadd.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vadd_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[ADD_I:%.*]] = add <8 x i8> %a, %b
+// CHECK:   ret <8 x i8> [[ADD_I]]
 int8x8_t test_vadd_s8(int8x8_t a, int8x8_t b) {
   return vadd_s8(a, b);
 }
 
-// CHECK-LABEL: test_vadd_s16
-// CHECK: vadd.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vadd_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, %b
+// CHECK:   ret <4 x i16> [[ADD_I]]
 int16x4_t test_vadd_s16(int16x4_t a, int16x4_t b) {
   return vadd_s16(a, b);
 }
 
-// CHECK-LABEL: test_vadd_s32
-// CHECK: vadd.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vadd_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, %b
+// CHECK:   ret <2 x i32> [[ADD_I]]
 int32x2_t test_vadd_s32(int32x2_t a, int32x2_t b) {
   return vadd_s32(a, b);
 }
 
-// CHECK-LABEL: test_vadd_s64
-// CHECK: vadd.i64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vadd_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[ADD_I:%.*]] = add <1 x i64> %a, %b
+// CHECK:   ret <1 x i64> [[ADD_I]]
 int64x1_t test_vadd_s64(int64x1_t a, int64x1_t b) {
   return vadd_s64(a, b);
 }
 
-// CHECK-LABEL: test_vadd_f32
-// CHECK: vadd.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vadd_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[ADD_I:%.*]] = fadd <2 x float> %a, %b
+// CHECK:   ret <2 x float> [[ADD_I]]
 float32x2_t test_vadd_f32(float32x2_t a, float32x2_t b) {
   return vadd_f32(a, b);
 }
 
-// CHECK-LABEL: test_vadd_u8
-// CHECK: vadd.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vadd_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[ADD_I:%.*]] = add <8 x i8> %a, %b
+// CHECK:   ret <8 x i8> [[ADD_I]]
 uint8x8_t test_vadd_u8(uint8x8_t a, uint8x8_t b) {
   return vadd_u8(a, b);
 }
 
-// CHECK-LABEL: test_vadd_u16
-// CHECK: vadd.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vadd_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, %b
+// CHECK:   ret <4 x i16> [[ADD_I]]
 uint16x4_t test_vadd_u16(uint16x4_t a, uint16x4_t b) {
   return vadd_u16(a, b);
 }
 
-// CHECK-LABEL: test_vadd_u32
-// CHECK: vadd.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vadd_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, %b
+// CHECK:   ret <2 x i32> [[ADD_I]]
 uint32x2_t test_vadd_u32(uint32x2_t a, uint32x2_t b) {
   return vadd_u32(a, b);
 }
 
-// CHECK-LABEL: test_vadd_u64
-// CHECK: vadd.i64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vadd_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[ADD_I:%.*]] = add <1 x i64> %a, %b
+// CHECK:   ret <1 x i64> [[ADD_I]]
 uint64x1_t test_vadd_u64(uint64x1_t a, uint64x1_t b) {
   return vadd_u64(a, b);
 }
 
-// CHECK-LABEL: test_vaddq_s8
-// CHECK: vadd.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vaddq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[ADD_I:%.*]] = add <16 x i8> %a, %b
+// CHECK:   ret <16 x i8> [[ADD_I]]
 int8x16_t test_vaddq_s8(int8x16_t a, int8x16_t b) {
   return vaddq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vaddq_s16
-// CHECK: vadd.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vaddq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, %b
+// CHECK:   ret <8 x i16> [[ADD_I]]
 int16x8_t test_vaddq_s16(int16x8_t a, int16x8_t b) {
   return vaddq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vaddq_s32
-// CHECK: vadd.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vaddq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, %b
+// CHECK:   ret <4 x i32> [[ADD_I]]
 int32x4_t test_vaddq_s32(int32x4_t a, int32x4_t b) {
   return vaddq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vaddq_s64
-// CHECK: vadd.i64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vaddq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, %b
+// CHECK:   ret <2 x i64> [[ADD_I]]
 int64x2_t test_vaddq_s64(int64x2_t a, int64x2_t b) {
   return vaddq_s64(a, b);
 }
 
-// CHECK-LABEL: test_vaddq_f32
-// CHECK: vadd.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x float> @test_vaddq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[ADD_I:%.*]] = fadd <4 x float> %a, %b
+// CHECK:   ret <4 x float> [[ADD_I]]
 float32x4_t test_vaddq_f32(float32x4_t a, float32x4_t b) {
   return vaddq_f32(a, b);
 }
 
-// CHECK-LABEL: test_vaddq_u8
-// CHECK: vadd.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vaddq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[ADD_I:%.*]] = add <16 x i8> %a, %b
+// CHECK:   ret <16 x i8> [[ADD_I]]
 uint8x16_t test_vaddq_u8(uint8x16_t a, uint8x16_t b) {
   return vaddq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vaddq_u16
-// CHECK: vadd.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vaddq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, %b
+// CHECK:   ret <8 x i16> [[ADD_I]]
 uint16x8_t test_vaddq_u16(uint16x8_t a, uint16x8_t b) {
   return vaddq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vaddq_u32
-// CHECK: vadd.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vaddq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, %b
+// CHECK:   ret <4 x i32> [[ADD_I]]
 uint32x4_t test_vaddq_u32(uint32x4_t a, uint32x4_t b) {
   return vaddq_u32(a, b);
 }
 
-// CHECK-LABEL: test_vaddq_u64
-// CHECK: vadd.i64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vaddq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, %b
+// CHECK:   ret <2 x i64> [[ADD_I]]
 uint64x2_t test_vaddq_u64(uint64x2_t a, uint64x2_t b) {
   return vaddq_u64(a, b);
 }
 
 
-// CHECK-LABEL: test_vaddhn_s16
-// CHECK: vaddhn.i16 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vaddhn_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VADDHN_I:%.*]] = add <8 x i16> [[TMP2]], [[TMP3]]
+// CHECK:   [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+// CHECK:   [[VADDHN2_I:%.*]] = trunc <8 x i16> [[VADDHN1_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VADDHN2_I]]
 int8x8_t test_vaddhn_s16(int16x8_t a, int16x8_t b) {
   return vaddhn_s16(a, b);
 }
 
-// CHECK-LABEL: test_vaddhn_s32
-// CHECK: vaddhn.i32 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vaddhn_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VADDHN_I:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
+// CHECK:   [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], <i32 16, i32 16, i32 16, i32 16>
+// CHECK:   [[VADDHN2_I:%.*]] = trunc <4 x i32> [[VADDHN1_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VADDHN2_I]]
 int16x4_t test_vaddhn_s32(int32x4_t a, int32x4_t b) {
   return vaddhn_s32(a, b);
 }
 
-// CHECK-LABEL: test_vaddhn_s64
-// CHECK: vaddhn.i64 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vaddhn_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VADDHN_I:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]]
+// CHECK:   [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], <i64 32, i64 32>
+// CHECK:   [[VADDHN2_I:%.*]] = trunc <2 x i64> [[VADDHN1_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VADDHN2_I]]
 int32x2_t test_vaddhn_s64(int64x2_t a, int64x2_t b) {
   return vaddhn_s64(a, b);
 }
 
-// CHECK-LABEL: test_vaddhn_u16
-// CHECK: vaddhn.i16 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vaddhn_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VADDHN_I:%.*]] = add <8 x i16> [[TMP2]], [[TMP3]]
+// CHECK:   [[VADDHN1_I:%.*]] = lshr <8 x i16> [[VADDHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+// CHECK:   [[VADDHN2_I:%.*]] = trunc <8 x i16> [[VADDHN1_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VADDHN2_I]]
 uint8x8_t test_vaddhn_u16(uint16x8_t a, uint16x8_t b) {
   return vaddhn_u16(a, b);
 }
 
-// CHECK-LABEL: test_vaddhn_u32
-// CHECK: vaddhn.i32 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vaddhn_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VADDHN_I:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
+// CHECK:   [[VADDHN1_I:%.*]] = lshr <4 x i32> [[VADDHN_I]], <i32 16, i32 16, i32 16, i32 16>
+// CHECK:   [[VADDHN2_I:%.*]] = trunc <4 x i32> [[VADDHN1_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VADDHN2_I]]
 uint16x4_t test_vaddhn_u32(uint32x4_t a, uint32x4_t b) {
   return vaddhn_u32(a, b);
 }
 
-// CHECK-LABEL: test_vaddhn_u64
-// CHECK: vaddhn.i64 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vaddhn_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VADDHN_I:%.*]] = add <2 x i64> [[TMP2]], [[TMP3]]
+// CHECK:   [[VADDHN1_I:%.*]] = lshr <2 x i64> [[VADDHN_I]], <i64 32, i64 32>
+// CHECK:   [[VADDHN2_I:%.*]] = trunc <2 x i64> [[VADDHN1_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VADDHN2_I]]
 uint32x2_t test_vaddhn_u64(uint64x2_t a, uint64x2_t b) {
   return vaddhn_u64(a, b);
 }
 
 
-// CHECK-LABEL: test_vaddl_s8
-// CHECK: vaddl.s8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vaddl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VMOVL_I_I:%.*]] = sext <8 x i8> %a to <8 x i16>
+// CHECK:   [[VMOVL_I4_I:%.*]] = sext <8 x i8> %b to <8 x i16>
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 int16x8_t test_vaddl_s8(int8x8_t a, int8x8_t b) {
   return vaddl_s8(a, b);
 }
 
-// CHECK-LABEL: test_vaddl_s16
-// CHECK: vaddl.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vaddl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMOVL_I_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VMOVL_I4_I:%.*]] = sext <4 x i16> [[TMP3]] to <4 x i32>
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 int32x4_t test_vaddl_s16(int16x4_t a, int16x4_t b) {
   return vaddl_s16(a, b);
 }
 
-// CHECK-LABEL: test_vaddl_s32
-// CHECK: vaddl.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vaddl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMOVL_I_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VMOVL_I4_I:%.*]] = sext <2 x i32> [[TMP3]] to <2 x i64>
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 int64x2_t test_vaddl_s32(int32x2_t a, int32x2_t b) {
   return vaddl_s32(a, b);
 }
 
-// CHECK-LABEL: test_vaddl_u8
-// CHECK: vaddl.u8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vaddl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <8 x i8> %a to <8 x i16>
+// CHECK:   [[VMOVL_I4_I:%.*]] = zext <8 x i8> %b to <8 x i16>
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 uint16x8_t test_vaddl_u8(uint8x8_t a, uint8x8_t b) {
   return vaddl_u8(a, b);
 }
 
-// CHECK-LABEL: test_vaddl_u16
-// CHECK: vaddl.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vaddl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VMOVL_I4_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 uint32x4_t test_vaddl_u16(uint16x4_t a, uint16x4_t b) {
   return vaddl_u16(a, b);
 }
 
-// CHECK-LABEL: test_vaddl_u32
-// CHECK: vaddl.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vaddl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VMOVL_I4_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 uint64x2_t test_vaddl_u32(uint32x2_t a, uint32x2_t b) {
   return vaddl_u32(a, b);
 }
 
 
-// CHECK-LABEL: test_vaddw_s8
-// CHECK: vaddw.s8 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vaddw_s8(<8 x i16> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VMOVL_I_I:%.*]] = sext <8 x i8> %b to <8 x i16>
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 int16x8_t test_vaddw_s8(int16x8_t a, int8x8_t b) {
   return vaddw_s8(a, b);
 }
 
-// CHECK-LABEL: test_vaddw_s16
-// CHECK: vaddw.s16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vaddw_s16(<4 x i32> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMOVL_I_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 int32x4_t test_vaddw_s16(int32x4_t a, int16x4_t b) {
   return vaddw_s16(a, b);
 }
 
-// CHECK-LABEL: test_vaddw_s32
-// CHECK: vaddw.s32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vaddw_s32(<2 x i64> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMOVL_I_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 int64x2_t test_vaddw_s32(int64x2_t a, int32x2_t b) {
   return vaddw_s32(a, b);
 }
 
-// CHECK-LABEL: test_vaddw_u8
-// CHECK: vaddw.u8 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vaddw_u8(<8 x i16> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <8 x i8> %b to <8 x i16>
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[VMOVL_I_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 uint16x8_t test_vaddw_u8(uint16x8_t a, uint8x8_t b) {
   return vaddw_u8(a, b);
 }
 
-// CHECK-LABEL: test_vaddw_u16
-// CHECK: vaddw.u16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vaddw_u16(<4 x i32> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMOVL_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 uint32x4_t test_vaddw_u16(uint32x4_t a, uint16x4_t b) {
   return vaddw_u16(a, b);
 }
 
-// CHECK-LABEL: test_vaddw_u32
-// CHECK: vaddw.u32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vaddw_u32(<2 x i64> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMOVL_I_I]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 uint64x2_t test_vaddw_u32(uint64x2_t a, uint32x2_t b) {
   return vaddw_u32(a, b);
 }
 
 
-// CHECK-LABEL: test_vand_s8
-// CHECK: vand d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vand_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <8 x i8> %a, %b
+// CHECK:   ret <8 x i8> [[AND_I]]
 int8x8_t test_vand_s8(int8x8_t a, int8x8_t b) {
   return vand_s8(a, b);
 }
 
-// CHECK-LABEL: test_vand_s16
-// CHECK: vand d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vand_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <4 x i16> %a, %b
+// CHECK:   ret <4 x i16> [[AND_I]]
 int16x4_t test_vand_s16(int16x4_t a, int16x4_t b) {
   return vand_s16(a, b);
 }
 
-// CHECK-LABEL: test_vand_s32
-// CHECK: vand d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vand_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <2 x i32> %a, %b
+// CHECK:   ret <2 x i32> [[AND_I]]
 int32x2_t test_vand_s32(int32x2_t a, int32x2_t b) {
   return vand_s32(a, b);
 }
 
-// CHECK-LABEL: test_vand_s64
-// CHECK: vand d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vand_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <1 x i64> %a, %b
+// CHECK:   ret <1 x i64> [[AND_I]]
 int64x1_t test_vand_s64(int64x1_t a, int64x1_t b) {
   return vand_s64(a, b);
 }
 
-// CHECK-LABEL: test_vand_u8
-// CHECK: vand d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vand_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <8 x i8> %a, %b
+// CHECK:   ret <8 x i8> [[AND_I]]
 uint8x8_t test_vand_u8(uint8x8_t a, uint8x8_t b) {
   return vand_u8(a, b);
 }
 
-// CHECK-LABEL: test_vand_u16
-// CHECK: vand d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vand_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <4 x i16> %a, %b
+// CHECK:   ret <4 x i16> [[AND_I]]
 uint16x4_t test_vand_u16(uint16x4_t a, uint16x4_t b) {
   return vand_u16(a, b);
 }
 
-// CHECK-LABEL: test_vand_u32
-// CHECK: vand d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vand_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <2 x i32> %a, %b
+// CHECK:   ret <2 x i32> [[AND_I]]
 uint32x2_t test_vand_u32(uint32x2_t a, uint32x2_t b) {
   return vand_u32(a, b);
 }
 
-// CHECK-LABEL: test_vand_u64
-// CHECK: vand d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vand_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <1 x i64> %a, %b
+// CHECK:   ret <1 x i64> [[AND_I]]
 uint64x1_t test_vand_u64(uint64x1_t a, uint64x1_t b) {
   return vand_u64(a, b);
 }
 
-// CHECK-LABEL: test_vandq_s8
-// CHECK: vand q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vandq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <16 x i8> %a, %b
+// CHECK:   ret <16 x i8> [[AND_I]]
 int8x16_t test_vandq_s8(int8x16_t a, int8x16_t b) {
   return vandq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vandq_s16
-// CHECK: vand q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vandq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <8 x i16> %a, %b
+// CHECK:   ret <8 x i16> [[AND_I]]
 int16x8_t test_vandq_s16(int16x8_t a, int16x8_t b) {
   return vandq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vandq_s32
-// CHECK: vand q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vandq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <4 x i32> %a, %b
+// CHECK:   ret <4 x i32> [[AND_I]]
 int32x4_t test_vandq_s32(int32x4_t a, int32x4_t b) {
   return vandq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vandq_s64
-// CHECK: vand q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vandq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <2 x i64> %a, %b
+// CHECK:   ret <2 x i64> [[AND_I]]
 int64x2_t test_vandq_s64(int64x2_t a, int64x2_t b) {
   return vandq_s64(a, b);
 }
 
-// CHECK-LABEL: test_vandq_u8
-// CHECK: vand q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vandq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <16 x i8> %a, %b
+// CHECK:   ret <16 x i8> [[AND_I]]
 uint8x16_t test_vandq_u8(uint8x16_t a, uint8x16_t b) {
   return vandq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vandq_u16
-// CHECK: vand q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vandq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <8 x i16> %a, %b
+// CHECK:   ret <8 x i16> [[AND_I]]
 uint16x8_t test_vandq_u16(uint16x8_t a, uint16x8_t b) {
   return vandq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vandq_u32
-// CHECK: vand q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vandq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <4 x i32> %a, %b
+// CHECK:   ret <4 x i32> [[AND_I]]
 uint32x4_t test_vandq_u32(uint32x4_t a, uint32x4_t b) {
   return vandq_u32(a, b);
 }
 
-// CHECK-LABEL: test_vandq_u64
-// CHECK: vand q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vandq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[AND_I:%.*]] = and <2 x i64> %a, %b
+// CHECK:   ret <2 x i64> [[AND_I]]
 uint64x2_t test_vandq_u64(uint64x2_t a, uint64x2_t b) {
   return vandq_u64(a, b);
 }
 
 
-// CHECK-LABEL: test_vbic_s8
-// CHECK: vbic d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vbic_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[AND_I:%.*]] = and <8 x i8> %a, [[NEG_I]]
+// CHECK:   ret <8 x i8> [[AND_I]]
 int8x8_t test_vbic_s8(int8x8_t a, int8x8_t b) {
   return vbic_s8(a, b);
 }
 
-// CHECK-LABEL: test_vbic_s16
-// CHECK: vbic d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vbic_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[AND_I:%.*]] = and <4 x i16> %a, [[NEG_I]]
+// CHECK:   ret <4 x i16> [[AND_I]]
 int16x4_t test_vbic_s16(int16x4_t a, int16x4_t b) {
   return vbic_s16(a, b);
 }
 
-// CHECK-LABEL: test_vbic_s32
-// CHECK: vbic d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vbic_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i32> %b, <i32 -1, i32 -1>
+// CHECK:   [[AND_I:%.*]] = and <2 x i32> %a, [[NEG_I]]
+// CHECK:   ret <2 x i32> [[AND_I]]
 int32x2_t test_vbic_s32(int32x2_t a, int32x2_t b) {
   return vbic_s32(a, b);
 }
 
-// CHECK-LABEL: test_vbic_s64
-// CHECK: vbic d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vbic_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <1 x i64> %b, <i64 -1>
+// CHECK:   [[AND_I:%.*]] = and <1 x i64> %a, [[NEG_I]]
+// CHECK:   ret <1 x i64> [[AND_I]]
 int64x1_t test_vbic_s64(int64x1_t a, int64x1_t b) {
   return vbic_s64(a, b);
 }
 
-// CHECK-LABEL: test_vbic_u8
-// CHECK: vbic d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vbic_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[AND_I:%.*]] = and <8 x i8> %a, [[NEG_I]]
+// CHECK:   ret <8 x i8> [[AND_I]]
 uint8x8_t test_vbic_u8(uint8x8_t a, uint8x8_t b) {
   return vbic_u8(a, b);
 }
 
-// CHECK-LABEL: test_vbic_u16
-// CHECK: vbic d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vbic_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[AND_I:%.*]] = and <4 x i16> %a, [[NEG_I]]
+// CHECK:   ret <4 x i16> [[AND_I]]
 uint16x4_t test_vbic_u16(uint16x4_t a, uint16x4_t b) {
   return vbic_u16(a, b);
 }
 
-// CHECK-LABEL: test_vbic_u32
-// CHECK: vbic d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vbic_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i32> %b, <i32 -1, i32 -1>
+// CHECK:   [[AND_I:%.*]] = and <2 x i32> %a, [[NEG_I]]
+// CHECK:   ret <2 x i32> [[AND_I]]
 uint32x2_t test_vbic_u32(uint32x2_t a, uint32x2_t b) {
   return vbic_u32(a, b);
 }
 
-// CHECK-LABEL: test_vbic_u64
-// CHECK: vbic d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vbic_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <1 x i64> %b, <i64 -1>
+// CHECK:   [[AND_I:%.*]] = and <1 x i64> %a, [[NEG_I]]
+// CHECK:   ret <1 x i64> [[AND_I]]
 uint64x1_t test_vbic_u64(uint64x1_t a, uint64x1_t b) {
   return vbic_u64(a, b);
 }
 
-// CHECK-LABEL: test_vbicq_s8
-// CHECK: vbic q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vbicq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[AND_I:%.*]] = and <16 x i8> %a, [[NEG_I]]
+// CHECK:   ret <16 x i8> [[AND_I]]
 int8x16_t test_vbicq_s8(int8x16_t a, int8x16_t b) {
   return vbicq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vbicq_s16
-// CHECK: vbic q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vbicq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[AND_I:%.*]] = and <8 x i16> %a, [[NEG_I]]
+// CHECK:   ret <8 x i16> [[AND_I]]
 int16x8_t test_vbicq_s16(int16x8_t a, int16x8_t b) {
   return vbicq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vbicq_s32
-// CHECK: vbic q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vbicq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK:   [[AND_I:%.*]] = and <4 x i32> %a, [[NEG_I]]
+// CHECK:   ret <4 x i32> [[AND_I]]
 int32x4_t test_vbicq_s32(int32x4_t a, int32x4_t b) {
   return vbicq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vbicq_s64
-// CHECK: vbic q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vbicq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i64> %b, <i64 -1, i64 -1>
+// CHECK:   [[AND_I:%.*]] = and <2 x i64> %a, [[NEG_I]]
+// CHECK:   ret <2 x i64> [[AND_I]]
 int64x2_t test_vbicq_s64(int64x2_t a, int64x2_t b) {
   return vbicq_s64(a, b);
 }
 
-// CHECK-LABEL: test_vbicq_u8
-// CHECK: vbic q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vbicq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[AND_I:%.*]] = and <16 x i8> %a, [[NEG_I]]
+// CHECK:   ret <16 x i8> [[AND_I]]
 uint8x16_t test_vbicq_u8(uint8x16_t a, uint8x16_t b) {
   return vbicq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vbicq_u16
-// CHECK: vbic q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vbicq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[AND_I:%.*]] = and <8 x i16> %a, [[NEG_I]]
+// CHECK:   ret <8 x i16> [[AND_I]]
 uint16x8_t test_vbicq_u16(uint16x8_t a, uint16x8_t b) {
   return vbicq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vbicq_u32
-// CHECK: vbic q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vbicq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK:   [[AND_I:%.*]] = and <4 x i32> %a, [[NEG_I]]
+// CHECK:   ret <4 x i32> [[AND_I]]
 uint32x4_t test_vbicq_u32(uint32x4_t a, uint32x4_t b) {
   return vbicq_u32(a, b);
 }
 
-// CHECK-LABEL: test_vbicq_u64
-// CHECK: vbic q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vbicq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i64> %b, <i64 -1, i64 -1>
+// CHECK:   [[AND_I:%.*]] = and <2 x i64> %a, [[NEG_I]]
+// CHECK:   ret <2 x i64> [[AND_I]]
 uint64x2_t test_vbicq_u64(uint64x2_t a, uint64x2_t b) {
   return vbicq_u64(a, b);
 }
 
 
-// CHECK-LABEL: test_vbsl_s8
-// CHECK: vbsl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vbsl_s8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+// CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #4
+// CHECK:   ret <8 x i8> [[VBSL_V_I]]
 int8x8_t test_vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c) {
   return vbsl_s8(a, b, c);
 }
 
-// CHECK-LABEL: test_vbsl_s16
-// CHECK: vbsl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vbsl_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) #4
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP3]]
 int16x4_t test_vbsl_s16(uint16x4_t a, int16x4_t b, int16x4_t c) {
   return vbsl_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vbsl_s32
-// CHECK: vbsl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vbsl_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) #4
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP3]]
 int32x2_t test_vbsl_s32(uint32x2_t a, int32x2_t b, int32x2_t c) {
   return vbsl_s32(a, b, c);
 }
 
-// CHECK-LABEL: test_vbsl_s64
-// CHECK: vbsl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vbsl_s64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <1 x i64> %c to <8 x i8>
+// CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) #4
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP3]]
 int64x1_t test_vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c) {
   return vbsl_s64(a, b, c);
 }
 
-// CHECK-LABEL: test_vbsl_u8
-// CHECK: vbsl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vbsl_u8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+// CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #4
+// CHECK:   ret <8 x i8> [[VBSL_V_I]]
 uint8x8_t test_vbsl_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) {
   return vbsl_u8(a, b, c);
 }
 
-// CHECK-LABEL: test_vbsl_u16
-// CHECK: vbsl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vbsl_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) #4
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP3]]
 uint16x4_t test_vbsl_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
   return vbsl_u16(a, b, c);
 }
 
-// CHECK-LABEL: test_vbsl_u32
-// CHECK: vbsl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vbsl_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) #4
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP3]]
 uint32x2_t test_vbsl_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
   return vbsl_u32(a, b, c);
 }
 
-// CHECK-LABEL: test_vbsl_u64
-// CHECK: vbsl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vbsl_u64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <1 x i64> %c to <8 x i8>
+// CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) #4
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP3]]
 uint64x1_t test_vbsl_u64(uint64x1_t a, uint64x1_t b, uint64x1_t c) {
   return vbsl_u64(a, b, c);
 }
 
-// CHECK-LABEL: test_vbsl_f32
-// CHECK: vbsl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vbsl_f32(<2 x i32> %a, <2 x float> %b, <2 x float> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %c to <8 x i8>
+// CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) #4
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <2 x float>
+// CHECK:   ret <2 x float> [[TMP3]]
 float32x2_t test_vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c) {
   return vbsl_f32(a, b, c);
 }
 
-// CHECK-LABEL: test_vbsl_p8
-// CHECK: vbsl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vbsl_p8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+// CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #4
+// CHECK:   ret <8 x i8> [[VBSL_V_I]]
 poly8x8_t test_vbsl_p8(uint8x8_t a, poly8x8_t b, poly8x8_t c) {
   return vbsl_p8(a, b, c);
 }
 
-// CHECK-LABEL: test_vbsl_p16
-// CHECK: vbsl d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vbsl_p16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP0]], <8 x i8> [[TMP1]], <8 x i8> [[TMP2]]) #4
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP3]]
 poly16x4_t test_vbsl_p16(uint16x4_t a, poly16x4_t b, poly16x4_t c) {
   return vbsl_p16(a, b, c);
 }
 
-// CHECK-LABEL: test_vbslq_s8
-// CHECK: vbsl q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vbslq_s8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 {
+// CHECK:   [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #4
+// CHECK:   ret <16 x i8> [[VBSLQ_V_I]]
 int8x16_t test_vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c) {
   return vbslq_s8(a, b, c);
 }
 
-// CHECK-LABEL: test_vbslq_s16
-// CHECK: vbsl q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vbslq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %c to <16 x i8>
+// CHECK:   [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) #4
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP3]]
 int16x8_t test_vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c) {
   return vbslq_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vbslq_s32
-// CHECK: vbsl q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vbslq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %c to <16 x i8>
+// CHECK:   [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) #4
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP3]]
 int32x4_t test_vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c) {
   return vbslq_s32(a, b, c);
 }
 
-// CHECK-LABEL: test_vbslq_s64
-// CHECK: vbsl q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vbslq_s64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i64> %c to <16 x i8>
+// CHECK:   [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) #4
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP3]]
 int64x2_t test_vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c) {
   return vbslq_s64(a, b, c);
 }
 
-// CHECK-LABEL: test_vbslq_u8
-// CHECK: vbsl q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vbslq_u8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 {
+// CHECK:   [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #4
+// CHECK:   ret <16 x i8> [[VBSLQ_V_I]]
 uint8x16_t test_vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) {
   return vbslq_u8(a, b, c);
 }
 
-// CHECK-LABEL: test_vbslq_u16
-// CHECK: vbsl q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vbslq_u16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %c to <16 x i8>
+// CHECK:   [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) #4
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP3]]
 uint16x8_t test_vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) {
   return vbslq_u16(a, b, c);
 }
 
-// CHECK-LABEL: test_vbslq_u32
-// CHECK: vbsl q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vbslq_u32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %c to <16 x i8>
+// CHECK:   [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) #4
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP3]]
 uint32x4_t test_vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) {
   return vbslq_u32(a, b, c);
 }
 
-// CHECK-LABEL: test_vbslq_u64
-// CHECK: vbsl q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vbslq_u64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i64> %c to <16 x i8>
+// CHECK:   [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) #4
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP3]]
 uint64x2_t test_vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c) {
   return vbslq_u64(a, b, c);
 }
 
-// CHECK-LABEL: test_vbslq_f32
-// CHECK: vbsl q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x float> @test_vbslq_f32(<4 x i32> %a, <4 x float> %b, <4 x float> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %c to <16 x i8>
+// CHECK:   [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) #4
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <4 x float>
+// CHECK:   ret <4 x float> [[TMP3]]
 float32x4_t test_vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c) {
   return vbslq_f32(a, b, c);
 }
 
-// CHECK-LABEL: test_vbslq_p8
-// CHECK: vbsl q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vbslq_p8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 {
+// CHECK:   [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #4
+// CHECK:   ret <16 x i8> [[VBSLQ_V_I]]
 poly8x16_t test_vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c) {
   return vbslq_p8(a, b, c);
 }
 
-// CHECK-LABEL: test_vbslq_p16
-// CHECK: vbsl q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vbslq_p16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %c to <16 x i8>
+// CHECK:   [[VBSLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]], <16 x i8> [[TMP2]]) #4
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[VBSLQ_V_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP3]]
 poly16x8_t test_vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c) {
   return vbslq_p16(a, b, c);
 }
 
 
-// CHECK-LABEL: test_vcage_f32
-// CHECK: vacge.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vcage_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[VCAGE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VCAGE_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[VCAGE_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacge.v2i32.v2f32(<2 x float> [[VCAGE_V_I]], <2 x float> [[VCAGE_V1_I]]) #4
+// CHECK:   ret <2 x i32> [[VCAGE_V2_I]]
 uint32x2_t test_vcage_f32(float32x2_t a, float32x2_t b) {
   return vcage_f32(a, b);
 }
 
-// CHECK-LABEL: test_vcageq_f32
-// CHECK: vacge.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vcageq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[VCAGEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VCAGEQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[VCAGEQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacge.v4i32.v4f32(<4 x float> [[VCAGEQ_V_I]], <4 x float> [[VCAGEQ_V1_I]]) #4
+// CHECK:   ret <4 x i32> [[VCAGEQ_V2_I]]
 uint32x4_t test_vcageq_f32(float32x4_t a, float32x4_t b) {
   return vcageq_f32(a, b);
 }
 
 
-// CHECK-LABEL: test_vcagt_f32
-// CHECK: vacgt.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vcagt_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[VCAGT_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VCAGT_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[VCAGT_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacgt.v2i32.v2f32(<2 x float> [[VCAGT_V_I]], <2 x float> [[VCAGT_V1_I]]) #4
+// CHECK:   ret <2 x i32> [[VCAGT_V2_I]]
 uint32x2_t test_vcagt_f32(float32x2_t a, float32x2_t b) {
   return vcagt_f32(a, b);
 }
 
-// CHECK-LABEL: test_vcagtq_f32
-// CHECK: vacgt.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vcagtq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[VCAGTQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VCAGTQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[VCAGTQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacgt.v4i32.v4f32(<4 x float> [[VCAGTQ_V_I]], <4 x float> [[VCAGTQ_V1_I]]) #4
+// CHECK:   ret <4 x i32> [[VCAGTQ_V2_I]]
 uint32x4_t test_vcagtq_f32(float32x4_t a, float32x4_t b) {
   return vcagtq_f32(a, b);
 }
 
 
-// CHECK-LABEL: test_vcale_f32
-// CHECK: vacge.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vcale_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[VCALE_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[VCALE_V1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VCALE_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacge.v2i32.v2f32(<2 x float> [[VCALE_V_I]], <2 x float> [[VCALE_V1_I]]) #4
+// CHECK:   ret <2 x i32> [[VCALE_V2_I]]
 uint32x2_t test_vcale_f32(float32x2_t a, float32x2_t b) {
   return vcale_f32(a, b);
 }
 
-// CHECK-LABEL: test_vcaleq_f32
-// CHECK: vacge.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vcaleq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[VCALEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[VCALEQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VCALEQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacge.v4i32.v4f32(<4 x float> [[VCALEQ_V_I]], <4 x float> [[VCALEQ_V1_I]]) #4
+// CHECK:   ret <4 x i32> [[VCALEQ_V2_I]]
 uint32x4_t test_vcaleq_f32(float32x4_t a, float32x4_t b) {
   return vcaleq_f32(a, b);
 }
 
 
-// CHECK-LABEL: test_vcalt_f32
-// CHECK: vacgt.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vcalt_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[VCALT_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[VCALT_V1_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VCALT_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vacgt.v2i32.v2f32(<2 x float> [[VCALT_V_I]], <2 x float> [[VCALT_V1_I]]) #4
+// CHECK:   ret <2 x i32> [[VCALT_V2_I]]
 uint32x2_t test_vcalt_f32(float32x2_t a, float32x2_t b) {
   return vcalt_f32(a, b);
 }
 
-// CHECK-LABEL: test_vcaltq_f32
-// CHECK: vacgt.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vcaltq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[VCALTQ_V_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[VCALTQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VCALTQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vacgt.v4i32.v4f32(<4 x float> [[VCALTQ_V_I]], <4 x float> [[VCALTQ_V1_I]]) #4
+// CHECK:   ret <4 x i32> [[VCALTQ_V2_I]]
 uint32x4_t test_vcaltq_f32(float32x4_t a, float32x4_t b) {
   return vcaltq_f32(a, b);
 }
 
 
-// CHECK-LABEL: test_vceq_s8
-// CHECK: vceq.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vceq_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp eq <8 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[SEXT_I]]
 uint8x8_t test_vceq_s8(int8x8_t a, int8x8_t b) {
   return vceq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vceq_s16
-// CHECK: vceq.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vceq_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp eq <4 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[SEXT_I]]
 uint16x4_t test_vceq_s16(int16x4_t a, int16x4_t b) {
   return vceq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vceq_s32
-// CHECK: vceq.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vceq_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp eq <2 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vceq_s32(int32x2_t a, int32x2_t b) {
   return vceq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vceq_f32
-// CHECK: vceq.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vceq_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = fcmp oeq <2 x float> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vceq_f32(float32x2_t a, float32x2_t b) {
   return vceq_f32(a, b);
 }
 
-// CHECK-LABEL: test_vceq_u8
-// CHECK: vceq.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vceq_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp eq <8 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[SEXT_I]]
 uint8x8_t test_vceq_u8(uint8x8_t a, uint8x8_t b) {
   return vceq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vceq_u16
-// CHECK: vceq.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vceq_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp eq <4 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[SEXT_I]]
 uint16x4_t test_vceq_u16(uint16x4_t a, uint16x4_t b) {
   return vceq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vceq_u32
-// CHECK: vceq.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vceq_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp eq <2 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vceq_u32(uint32x2_t a, uint32x2_t b) {
   return vceq_u32(a, b);
 }
 
-// CHECK-LABEL: test_vceq_p8
-// CHECK: vceq.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vceq_p8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp eq <8 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[SEXT_I]]
 uint8x8_t test_vceq_p8(poly8x8_t a, poly8x8_t b) {
   return vceq_p8(a, b);
 }
 
-// CHECK-LABEL: test_vceqq_s8
-// CHECK: vceq.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vceqq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp eq <16 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[SEXT_I]]
 uint8x16_t test_vceqq_s8(int8x16_t a, int8x16_t b) {
   return vceqq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vceqq_s16
-// CHECK: vceq.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vceqq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp eq <8 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[SEXT_I]]
 uint16x8_t test_vceqq_s16(int16x8_t a, int16x8_t b) {
   return vceqq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vceqq_s32
-// CHECK: vceq.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vceqq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp eq <4 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vceqq_s32(int32x4_t a, int32x4_t b) {
   return vceqq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vceqq_f32
-// CHECK: vceq.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vceqq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = fcmp oeq <4 x float> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vceqq_f32(float32x4_t a, float32x4_t b) {
   return vceqq_f32(a, b);
 }
 
-// CHECK-LABEL: test_vceqq_u8
-// CHECK: vceq.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vceqq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp eq <16 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[SEXT_I]]
 uint8x16_t test_vceqq_u8(uint8x16_t a, uint8x16_t b) {
   return vceqq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vceqq_u16
-// CHECK: vceq.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vceqq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp eq <8 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[SEXT_I]]
 uint16x8_t test_vceqq_u16(uint16x8_t a, uint16x8_t b) {
   return vceqq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vceqq_u32
-// CHECK: vceq.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vceqq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp eq <4 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vceqq_u32(uint32x4_t a, uint32x4_t b) {
   return vceqq_u32(a, b);
 }
 
-// CHECK-LABEL: test_vceqq_p8
-// CHECK: vceq.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vceqq_p8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp eq <16 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[SEXT_I]]
 uint8x16_t test_vceqq_p8(poly8x16_t a, poly8x16_t b) {
   return vceqq_p8(a, b);
 }
 
 
-// CHECK-LABEL: test_vcge_s8
-// CHECK: vcge.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vcge_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sge <8 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[SEXT_I]]
 uint8x8_t test_vcge_s8(int8x8_t a, int8x8_t b) {
   return vcge_s8(a, b);
 }
 
-// CHECK-LABEL: test_vcge_s16
-// CHECK: vcge.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vcge_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sge <4 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[SEXT_I]]
 uint16x4_t test_vcge_s16(int16x4_t a, int16x4_t b) {
   return vcge_s16(a, b);
 }
 
-// CHECK-LABEL: test_vcge_s32
-// CHECK: vcge.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vcge_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sge <2 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vcge_s32(int32x2_t a, int32x2_t b) {
   return vcge_s32(a, b);
 }
 
-// CHECK-LABEL: test_vcge_f32
-// CHECK: vcge.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vcge_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = fcmp oge <2 x float> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vcge_f32(float32x2_t a, float32x2_t b) {
   return vcge_f32(a, b);
 }
 
-// CHECK-LABEL: test_vcge_u8
-// CHECK: vcge.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vcge_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp uge <8 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[SEXT_I]]
 uint8x8_t test_vcge_u8(uint8x8_t a, uint8x8_t b) {
   return vcge_u8(a, b);
 }
 
-// CHECK-LABEL: test_vcge_u16
-// CHECK: vcge.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vcge_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp uge <4 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[SEXT_I]]
 uint16x4_t test_vcge_u16(uint16x4_t a, uint16x4_t b) {
   return vcge_u16(a, b);
 }
 
-// CHECK-LABEL: test_vcge_u32
-// CHECK: vcge.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vcge_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp uge <2 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vcge_u32(uint32x2_t a, uint32x2_t b) {
   return vcge_u32(a, b);
 }
 
-// CHECK-LABEL: test_vcgeq_s8
-// CHECK: vcge.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vcgeq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sge <16 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[SEXT_I]]
 uint8x16_t test_vcgeq_s8(int8x16_t a, int8x16_t b) {
   return vcgeq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vcgeq_s16
-// CHECK: vcge.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vcgeq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sge <8 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[SEXT_I]]
 uint16x8_t test_vcgeq_s16(int16x8_t a, int16x8_t b) {
   return vcgeq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vcgeq_s32
-// CHECK: vcge.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vcgeq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sge <4 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcgeq_s32(int32x4_t a, int32x4_t b) {
   return vcgeq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vcgeq_f32
-// CHECK: vcge.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vcgeq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = fcmp oge <4 x float> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcgeq_f32(float32x4_t a, float32x4_t b) {
   return vcgeq_f32(a, b);
 }
 
-// CHECK-LABEL: test_vcgeq_u8
-// CHECK: vcge.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vcgeq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp uge <16 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[SEXT_I]]
 uint8x16_t test_vcgeq_u8(uint8x16_t a, uint8x16_t b) {
   return vcgeq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vcgeq_u16
-// CHECK: vcge.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vcgeq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp uge <8 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[SEXT_I]]
 uint16x8_t test_vcgeq_u16(uint16x8_t a, uint16x8_t b) {
   return vcgeq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vcgeq_u32
-// CHECK: vcge.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vcgeq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp uge <4 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcgeq_u32(uint32x4_t a, uint32x4_t b) {
   return vcgeq_u32(a, b);
 }
 
 
-// CHECK-LABEL: test_vcgt_s8
-// CHECK: vcgt.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vcgt_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sgt <8 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[SEXT_I]]
 uint8x8_t test_vcgt_s8(int8x8_t a, int8x8_t b) {
   return vcgt_s8(a, b);
 }
 
-// CHECK-LABEL: test_vcgt_s16
-// CHECK: vcgt.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vcgt_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sgt <4 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[SEXT_I]]
 uint16x4_t test_vcgt_s16(int16x4_t a, int16x4_t b) {
   return vcgt_s16(a, b);
 }
 
-// CHECK-LABEL: test_vcgt_s32
-// CHECK: vcgt.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vcgt_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sgt <2 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vcgt_s32(int32x2_t a, int32x2_t b) {
   return vcgt_s32(a, b);
 }
 
-// CHECK-LABEL: test_vcgt_f32
-// CHECK: vcgt.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vcgt_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = fcmp ogt <2 x float> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vcgt_f32(float32x2_t a, float32x2_t b) {
   return vcgt_f32(a, b);
 }
 
-// CHECK-LABEL: test_vcgt_u8
-// CHECK: vcgt.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vcgt_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ugt <8 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[SEXT_I]]
 uint8x8_t test_vcgt_u8(uint8x8_t a, uint8x8_t b) {
   return vcgt_u8(a, b);
 }
 
-// CHECK-LABEL: test_vcgt_u16
-// CHECK: vcgt.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vcgt_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ugt <4 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[SEXT_I]]
 uint16x4_t test_vcgt_u16(uint16x4_t a, uint16x4_t b) {
   return vcgt_u16(a, b);
 }
 
-// CHECK-LABEL: test_vcgt_u32
-// CHECK: vcgt.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vcgt_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ugt <2 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vcgt_u32(uint32x2_t a, uint32x2_t b) {
   return vcgt_u32(a, b);
 }
 
-// CHECK-LABEL: test_vcgtq_s8
-// CHECK: vcgt.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vcgtq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sgt <16 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[SEXT_I]]
 uint8x16_t test_vcgtq_s8(int8x16_t a, int8x16_t b) {
   return vcgtq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vcgtq_s16
-// CHECK: vcgt.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vcgtq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sgt <8 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[SEXT_I]]
 uint16x8_t test_vcgtq_s16(int16x8_t a, int16x8_t b) {
   return vcgtq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vcgtq_s32
-// CHECK: vcgt.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vcgtq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sgt <4 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcgtq_s32(int32x4_t a, int32x4_t b) {
   return vcgtq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vcgtq_f32
-// CHECK: vcgt.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vcgtq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = fcmp ogt <4 x float> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcgtq_f32(float32x4_t a, float32x4_t b) {
   return vcgtq_f32(a, b);
 }
 
-// CHECK-LABEL: test_vcgtq_u8
-// CHECK: vcgt.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vcgtq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ugt <16 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[SEXT_I]]
 uint8x16_t test_vcgtq_u8(uint8x16_t a, uint8x16_t b) {
   return vcgtq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vcgtq_u16
-// CHECK: vcgt.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vcgtq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ugt <8 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[SEXT_I]]
 uint16x8_t test_vcgtq_u16(uint16x8_t a, uint16x8_t b) {
   return vcgtq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vcgtq_u32
-// CHECK: vcgt.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vcgtq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ugt <4 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcgtq_u32(uint32x4_t a, uint32x4_t b) {
   return vcgtq_u32(a, b);
 }
 
 
-// CHECK-LABEL: test_vcle_s8
-// CHECK: vcge.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vcle_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sle <8 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[SEXT_I]]
 uint8x8_t test_vcle_s8(int8x8_t a, int8x8_t b) {
   return vcle_s8(a, b);
 }
 
-// CHECK-LABEL: test_vcle_s16
-// CHECK: vcge.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vcle_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sle <4 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[SEXT_I]]
 uint16x4_t test_vcle_s16(int16x4_t a, int16x4_t b) {
   return vcle_s16(a, b);
 }
 
-// CHECK-LABEL: test_vcle_s32
-// CHECK: vcge.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vcle_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sle <2 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vcle_s32(int32x2_t a, int32x2_t b) {
   return vcle_s32(a, b);
 }
 
-// CHECK-LABEL: test_vcle_f32
-// CHECK: vcge.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vcle_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = fcmp ole <2 x float> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vcle_f32(float32x2_t a, float32x2_t b) {
   return vcle_f32(a, b);
 }
 
-// CHECK-LABEL: test_vcle_u8
-// CHECK: vcge.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vcle_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ule <8 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[SEXT_I]]
 uint8x8_t test_vcle_u8(uint8x8_t a, uint8x8_t b) {
   return vcle_u8(a, b);
 }
 
-// CHECK-LABEL: test_vcle_u16
-// CHECK: vcge.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vcle_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ule <4 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[SEXT_I]]
 uint16x4_t test_vcle_u16(uint16x4_t a, uint16x4_t b) {
   return vcle_u16(a, b);
 }
 
-// CHECK-LABEL: test_vcle_u32
-// CHECK: vcge.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vcle_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ule <2 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vcle_u32(uint32x2_t a, uint32x2_t b) {
   return vcle_u32(a, b);
 }
 
-// CHECK-LABEL: test_vcleq_s8
-// CHECK: vcge.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vcleq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sle <16 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[SEXT_I]]
 uint8x16_t test_vcleq_s8(int8x16_t a, int8x16_t b) {
   return vcleq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vcleq_s16
-// CHECK: vcge.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vcleq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sle <8 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[SEXT_I]]
 uint16x8_t test_vcleq_s16(int16x8_t a, int16x8_t b) {
   return vcleq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vcleq_s32
-// CHECK: vcge.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vcleq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp sle <4 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcleq_s32(int32x4_t a, int32x4_t b) {
   return vcleq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vcleq_f32
-// CHECK: vcge.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vcleq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = fcmp ole <4 x float> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcleq_f32(float32x4_t a, float32x4_t b) {
   return vcleq_f32(a, b);
 }
 
-// CHECK-LABEL: test_vcleq_u8
-// CHECK: vcge.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vcleq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ule <16 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[SEXT_I]]
 uint8x16_t test_vcleq_u8(uint8x16_t a, uint8x16_t b) {
   return vcleq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vcleq_u16
-// CHECK: vcge.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vcleq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ule <8 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[SEXT_I]]
 uint16x8_t test_vcleq_u16(uint16x8_t a, uint16x8_t b) {
   return vcleq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vcleq_u32
-// CHECK: vcge.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vcleq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ule <4 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcleq_u32(uint32x4_t a, uint32x4_t b) {
   return vcleq_u32(a, b);
 }
 
 
-// CHECK-LABEL: test_vcls_s8
-// CHECK: vcls.s8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vcls_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VCLS_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8> %a) #4
+// CHECK:   ret <8 x i8> [[VCLS_V_I]]
 int8x8_t test_vcls_s8(int8x8_t a) {
   return vcls_s8(a);
 }
 
-// CHECK-LABEL: test_vcls_s16
-// CHECK: vcls.s16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vcls_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VCLS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VCLS_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> [[VCLS_V_I]]) #4
+// CHECK:   [[VCLS_V2_I:%.*]] = bitcast <4 x i16> [[VCLS_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VCLS_V2_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP1]]
 int16x4_t test_vcls_s16(int16x4_t a) {
   return vcls_s16(a);
 }
 
-// CHECK-LABEL: test_vcls_s32
-// CHECK: vcls.s32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vcls_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VCLS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VCLS_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> [[VCLS_V_I]]) #4
+// CHECK:   [[VCLS_V2_I:%.*]] = bitcast <2 x i32> [[VCLS_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VCLS_V2_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP1]]
 int32x2_t test_vcls_s32(int32x2_t a) {
   return vcls_s32(a);
 }
 
-// CHECK-LABEL: test_vclsq_s8
-// CHECK: vcls.s8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vclsq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VCLSQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8> %a) #4
+// CHECK:   ret <16 x i8> [[VCLSQ_V_I]]
 int8x16_t test_vclsq_s8(int8x16_t a) {
   return vclsq_s8(a);
 }
 
-// CHECK-LABEL: test_vclsq_s16
-// CHECK: vcls.s16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vclsq_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VCLSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VCLSQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> [[VCLSQ_V_I]]) #4
+// CHECK:   [[VCLSQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLSQ_V1_I]] to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VCLSQ_V2_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP1]]
 int16x8_t test_vclsq_s16(int16x8_t a) {
   return vclsq_s16(a);
 }
 
-// CHECK-LABEL: test_vclsq_s32
-// CHECK: vcls.s32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vclsq_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VCLSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VCLSQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> [[VCLSQ_V_I]]) #4
+// CHECK:   [[VCLSQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLSQ_V1_I]] to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VCLSQ_V2_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP1]]
 int32x4_t test_vclsq_s32(int32x4_t a) {
   return vclsq_s32(a);
 }
 
 
-// CHECK-LABEL: test_vclt_s8
-// CHECK: vcgt.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vclt_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp slt <8 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[SEXT_I]]
 uint8x8_t test_vclt_s8(int8x8_t a, int8x8_t b) {
   return vclt_s8(a, b);
 }
 
-// CHECK-LABEL: test_vclt_s16
-// CHECK: vcgt.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vclt_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp slt <4 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[SEXT_I]]
 uint16x4_t test_vclt_s16(int16x4_t a, int16x4_t b) {
   return vclt_s16(a, b);
 }
 
-// CHECK-LABEL: test_vclt_s32
-// CHECK: vcgt.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vclt_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp slt <2 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vclt_s32(int32x2_t a, int32x2_t b) {
   return vclt_s32(a, b);
 }
 
-// CHECK-LABEL: test_vclt_f32
-// CHECK: vcgt.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vclt_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = fcmp olt <2 x float> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vclt_f32(float32x2_t a, float32x2_t b) {
   return vclt_f32(a, b);
 }
 
-// CHECK-LABEL: test_vclt_u8
-// CHECK: vcgt.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vclt_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ult <8 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[SEXT_I]]
 uint8x8_t test_vclt_u8(uint8x8_t a, uint8x8_t b) {
   return vclt_u8(a, b);
 }
 
-// CHECK-LABEL: test_vclt_u16
-// CHECK: vcgt.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vclt_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ult <4 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[SEXT_I]]
 uint16x4_t test_vclt_u16(uint16x4_t a, uint16x4_t b) {
   return vclt_u16(a, b);
 }
 
-// CHECK-LABEL: test_vclt_u32
-// CHECK: vcgt.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vclt_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ult <2 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[SEXT_I]]
 uint32x2_t test_vclt_u32(uint32x2_t a, uint32x2_t b) {
   return vclt_u32(a, b);
 }
 
-// CHECK-LABEL: test_vcltq_s8
-// CHECK: vcgt.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vcltq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp slt <16 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[SEXT_I]]
 uint8x16_t test_vcltq_s8(int8x16_t a, int8x16_t b) {
   return vcltq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vcltq_s16
-// CHECK: vcgt.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vcltq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp slt <8 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[SEXT_I]]
 uint16x8_t test_vcltq_s16(int16x8_t a, int16x8_t b) {
   return vcltq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vcltq_s32
-// CHECK: vcgt.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vcltq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp slt <4 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcltq_s32(int32x4_t a, int32x4_t b) {
   return vcltq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vcltq_f32
-// CHECK: vcgt.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vcltq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = fcmp olt <4 x float> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcltq_f32(float32x4_t a, float32x4_t b) {
   return vcltq_f32(a, b);
 }
 
-// CHECK-LABEL: test_vcltq_u8
-// CHECK: vcgt.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vcltq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ult <16 x i8> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[SEXT_I]]
 uint8x16_t test_vcltq_u8(uint8x16_t a, uint8x16_t b) {
   return vcltq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vcltq_u16
-// CHECK: vcgt.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vcltq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ult <8 x i16> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[SEXT_I]]
 uint16x8_t test_vcltq_u16(uint16x8_t a, uint16x8_t b) {
   return vcltq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vcltq_u32
-// CHECK: vcgt.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vcltq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[CMP_I:%.*]] = icmp ult <4 x i32> %a, %b
+// CHECK:   [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[SEXT_I]]
 uint32x4_t test_vcltq_u32(uint32x4_t a, uint32x4_t b) {
   return vcltq_u32(a, b);
 }
 
 
-// CHECK-LABEL: test_vclz_s8
-// CHECK: vclz.i8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vclz_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false) #4
+// CHECK:   ret <8 x i8> [[VCLZ_V_I]]
 int8x8_t test_vclz_s8(int8x8_t a) {
   return vclz_s8(a);
 }
 
-// CHECK-LABEL: test_vclz_s16
-// CHECK: vclz.i16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vclz_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[VCLZ_V_I]], i1 false) #4
+// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP1]]
 int16x4_t test_vclz_s16(int16x4_t a) {
   return vclz_s16(a);
 }
 
-// CHECK-LABEL: test_vclz_s32
-// CHECK: vclz.i32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vclz_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[VCLZ_V_I]], i1 false) #4
+// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP1]]
 int32x2_t test_vclz_s32(int32x2_t a) {
   return vclz_s32(a);
 }
 
-// CHECK-LABEL: test_vclz_u8
-// CHECK: vclz.i8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vclz_u8(<8 x i8> %a) #0 {
+// CHECK:   [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %a, i1 false) #4
+// CHECK:   ret <8 x i8> [[VCLZ_V_I]]
 uint8x8_t test_vclz_u8(uint8x8_t a) {
   return vclz_u8(a);
 }
 
-// CHECK-LABEL: test_vclz_u16
-// CHECK: vclz.i16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vclz_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[VCLZ_V_I]], i1 false) #4
+// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP1]]
 uint16x4_t test_vclz_u16(uint16x4_t a) {
   return vclz_u16(a);
 }
 
-// CHECK-LABEL: test_vclz_u32
-// CHECK: vclz.i32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vclz_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[VCLZ_V_I]], i1 false) #4
+// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP1]]
 uint32x2_t test_vclz_u32(uint32x2_t a) {
   return vclz_u32(a);
 }
 
-// CHECK-LABEL: test_vclzq_s8
-// CHECK: vclz.i8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vclzq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VCLZQ_V_I:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false) #4
+// CHECK:   ret <16 x i8> [[VCLZQ_V_I]]
 int8x16_t test_vclzq_s8(int8x16_t a) {
   return vclzq_s8(a);
 }
 
-// CHECK-LABEL: test_vclzq_s16
-// CHECK: vclz.i16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vclzq_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VCLZQ_V1_I:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[VCLZQ_V_I]], i1 false) #4
+// CHECK:   [[VCLZQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLZQ_V1_I]] to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VCLZQ_V2_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP1]]
 int16x8_t test_vclzq_s16(int16x8_t a) {
   return vclzq_s16(a);
 }
 
-// CHECK-LABEL: test_vclzq_s32
-// CHECK: vclz.i32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vclzq_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VCLZQ_V1_I:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[VCLZQ_V_I]], i1 false) #4
+// CHECK:   [[VCLZQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLZQ_V1_I]] to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VCLZQ_V2_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP1]]
 int32x4_t test_vclzq_s32(int32x4_t a) {
   return vclzq_s32(a);
 }
 
-// CHECK-LABEL: test_vclzq_u8
-// CHECK: vclz.i8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vclzq_u8(<16 x i8> %a) #0 {
+// CHECK:   [[VCLZQ_V_I:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %a, i1 false) #4
+// CHECK:   ret <16 x i8> [[VCLZQ_V_I]]
 uint8x16_t test_vclzq_u8(uint8x16_t a) {
   return vclzq_u8(a);
 }
 
-// CHECK-LABEL: test_vclzq_u16
-// CHECK: vclz.i16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vclzq_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VCLZQ_V1_I:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[VCLZQ_V_I]], i1 false) #4
+// CHECK:   [[VCLZQ_V2_I:%.*]] = bitcast <8 x i16> [[VCLZQ_V1_I]] to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VCLZQ_V2_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP1]]
 uint16x8_t test_vclzq_u16(uint16x8_t a) {
   return vclzq_u16(a);
 }
 
-// CHECK-LABEL: test_vclzq_u32
-// CHECK: vclz.i32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vclzq_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VCLZQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VCLZQ_V1_I:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[VCLZQ_V_I]], i1 false) #4
+// CHECK:   [[VCLZQ_V2_I:%.*]] = bitcast <4 x i32> [[VCLZQ_V1_I]] to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VCLZQ_V2_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP1]]
 uint32x4_t test_vclzq_u32(uint32x4_t a) {
   return vclzq_u32(a);
 }
 
 
-// CHECK-LABEL: test_vcnt_u8
-// CHECK: vcnt.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vcnt_u8(<8 x i8> %a) #0 {
+// CHECK:   [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) #4
+// CHECK:   ret <8 x i8> [[VCNT_V_I]]
 uint8x8_t test_vcnt_u8(uint8x8_t a) {
   return vcnt_u8(a);
 }
 
-// CHECK-LABEL: test_vcnt_s8
-// CHECK: vcnt.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vcnt_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) #4
+// CHECK:   ret <8 x i8> [[VCNT_V_I]]
 int8x8_t test_vcnt_s8(int8x8_t a) {
   return vcnt_s8(a);
 }
 
-// CHECK-LABEL: test_vcnt_p8
-// CHECK: vcnt.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vcnt_p8(<8 x i8> %a) #0 {
+// CHECK:   [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %a) #4
+// CHECK:   ret <8 x i8> [[VCNT_V_I]]
 poly8x8_t test_vcnt_p8(poly8x8_t a) {
   return vcnt_p8(a);
 }
 
-// CHECK-LABEL: test_vcntq_u8
-// CHECK: vcnt.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vcntq_u8(<16 x i8> %a) #0 {
+// CHECK:   [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) #4
+// CHECK:   ret <16 x i8> [[VCNTQ_V_I]]
 uint8x16_t test_vcntq_u8(uint8x16_t a) {
   return vcntq_u8(a);
 }
 
-// CHECK-LABEL: test_vcntq_s8
-// CHECK: vcnt.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vcntq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) #4
+// CHECK:   ret <16 x i8> [[VCNTQ_V_I]]
 int8x16_t test_vcntq_s8(int8x16_t a) {
   return vcntq_s8(a);
 }
 
-// CHECK-LABEL: test_vcntq_p8
-// CHECK: vcnt.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vcntq_p8(<16 x i8> %a) #0 {
+// CHECK:   [[VCNTQ_V_I:%.*]] = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a) #4
+// CHECK:   ret <16 x i8> [[VCNTQ_V_I]]
 poly8x16_t test_vcntq_p8(poly8x16_t a) {
   return vcntq_p8(a);
 }
 
 
-// CHECK-LABEL: test_vcombine_s8
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vcombine_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 int8x16_t test_vcombine_s8(int8x8_t a, int8x8_t b) {
   return vcombine_s8(a, b);
 }
 
-// CHECK-LABEL: test_vcombine_s16
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vcombine_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 int16x8_t test_vcombine_s16(int16x4_t a, int16x4_t b) {
   return vcombine_s16(a, b);
 }
 
-// CHECK-LABEL: test_vcombine_s32
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vcombine_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 int32x4_t test_vcombine_s32(int32x2_t a, int32x2_t b) {
   return vcombine_s32(a, b);
 }
 
-// CHECK-LABEL: test_vcombine_s64
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vcombine_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %b, <2 x i32> <i32 0, i32 1>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 int64x2_t test_vcombine_s64(int64x1_t a, int64x1_t b) {
   return vcombine_s64(a, b);
 }
 
-// CHECK-LABEL: test_vcombine_f16
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
+// CHECK-LABEL: define <8 x half> @test_vcombine_f16(<4 x half> %a, <4 x half> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x half> %a, <4 x half> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x half> [[SHUFFLE_I]]
 float16x8_t test_vcombine_f16(float16x4_t a, float16x4_t b) {
   return vcombine_f16(a, b);
 }
 
-// CHECK-LABEL: test_vcombine_f32
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
+// CHECK-LABEL: define <4 x float> @test_vcombine_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x float> [[SHUFFLE_I]]
 float32x4_t test_vcombine_f32(float32x2_t a, float32x2_t b) {
   return vcombine_f32(a, b);
 }
 
-// CHECK-LABEL: test_vcombine_u8
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vcombine_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 uint8x16_t test_vcombine_u8(uint8x8_t a, uint8x8_t b) {
   return vcombine_u8(a, b);
 }
 
-// CHECK-LABEL: test_vcombine_u16
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vcombine_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 uint16x8_t test_vcombine_u16(uint16x4_t a, uint16x4_t b) {
   return vcombine_u16(a, b);
 }
 
-// CHECK-LABEL: test_vcombine_u32
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vcombine_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 uint32x4_t test_vcombine_u32(uint32x2_t a, uint32x2_t b) {
   return vcombine_u32(a, b);
 }
 
-// CHECK-LABEL: test_vcombine_u64
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vcombine_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %b, <2 x i32> <i32 0, i32 1>
+// CHECK:   ret <2 x i64> [[SHUFFLE_I]]
 uint64x2_t test_vcombine_u64(uint64x1_t a, uint64x1_t b) {
   return vcombine_u64(a, b);
 }
 
-// CHECK-LABEL: test_vcombine_p8
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vcombine_p8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 poly8x16_t test_vcombine_p8(poly8x8_t a, poly8x8_t b) {
   return vcombine_p8(a, b);
 }
 
-// CHECK-LABEL: test_vcombine_p16
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
-// CHECK: vmov d{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vcombine_p16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 poly16x8_t test_vcombine_p16(poly16x4_t a, poly16x4_t b) {
   return vcombine_p16(a, b);
 }
 
 
-// CHECK-LABEL: test_vcreate_s8
-// CHECK: vmov [[REG:d[0-9]+]], r0, r1
-// CHECK: vclz.i8 d{{[0-9]+}}, [[REG]]
+// CHECK-LABEL: define <8 x i8> @test_vcreate_s8(i64 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <8 x i8>
+// CHECK:   [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> [[TMP0]], i1 false) #4
+// CHECK:   ret <8 x i8> [[VCLZ_V_I]]
 int8x8_t test_vcreate_s8(uint64_t a) {
   return vclz_s8(vcreate_s8(a));
 }
 
-// CHECK-LABEL: test_vcreate_s16
-// CHECK: vmov [[REG:d[0-9]+]], r0, r1
-// CHECK: vclz.i16 d{{[0-9]+}}, [[REG]]
+// CHECK-LABEL: define <4 x i16> @test_vcreate_s16(i64 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <4 x i16>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK:   [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[VCLZ_V_I]], i1 false) #4
+// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vcreate_s16(uint64_t a) {
   return vclz_s16(vcreate_s16(a));
 }
 
-// CHECK-LABEL: test_vcreate_s32
-// CHECK: vmov [[REG:d[0-9]+]], r0, r1
-// CHECK: vclz.i32 d{{[0-9]+}}, [[REG]]
+// CHECK-LABEL: define <2 x i32> @test_vcreate_s32(i64 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <2 x i32>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK:   [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[VCLZ_V_I]], i1 false) #4
+// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vcreate_s32(uint64_t a) {
   return vclz_s32(vcreate_s32(a));
 }
 
-// CHECK-LABEL: test_vcreate_f16
+// CHECK-LABEL: define <4 x half> @test_vcreate_f16(i64 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vcreate_f16(uint64_t a) {
   return vcreate_f16(a);
 }
 
-// CHECK-LABEL: test_vcreate_f32
+// CHECK-LABEL: define <2 x float> @test_vcreate_f32(i64 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vcreate_f32(uint64_t a) {
   return vcreate_f32(a);
 }
 
-// CHECK-LABEL: test_vcreate_u8
-// CHECK: vmov [[REG:d[0-9]+]], r0, r1
-// CHECK: vclz.i8 d{{[0-9]+}}, [[REG]]
+// CHECK-LABEL: define <8 x i8> @test_vcreate_u8(i64 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <8 x i8>
+// CHECK:   [[VCLZ_V_I:%.*]] = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> [[TMP0]], i1 false) #4
+// CHECK:   ret <8 x i8> [[VCLZ_V_I]]
 uint8x8_t test_vcreate_u8(uint64_t a) {
   return vclz_s8(vcreate_u8(a));
 }
 
-// CHECK-LABEL: test_vcreate_u16
-// CHECK: vmov [[REG:d[0-9]+]], r0, r1
-// CHECK: vclz.i16 d{{[0-9]+}}, [[REG]]
+// CHECK-LABEL: define <4 x i16> @test_vcreate_u16(i64 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <4 x i16>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK:   [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VCLZ_V1_I:%.*]] = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> [[VCLZ_V_I]], i1 false) #4
+// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <4 x i16> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 uint16x4_t test_vcreate_u16(uint64_t a) {
   return vclz_s16(vcreate_u16(a));
 }
 
-// CHECK-LABEL: test_vcreate_u32
-// CHECK: vmov [[REG:d[0-9]+]], r0, r1
-// CHECK: vclz.i32 d{{[0-9]+}}, [[REG]]
+// CHECK-LABEL: define <2 x i32> @test_vcreate_u32(i64 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <2 x i32>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8>
+// CHECK:   [[VCLZ_V_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VCLZ_V1_I:%.*]] = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> [[VCLZ_V_I]], i1 false) #4
+// CHECK:   [[VCLZ_V2_I:%.*]] = bitcast <2 x i32> [[VCLZ_V1_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VCLZ_V2_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 uint32x2_t test_vcreate_u32(uint64_t a) {
   return vclz_s32(vcreate_u32(a));
 }
@@ -1614,10145 +2401,21596 @@ uint32x2_t test_vcreate_u32(uint64_t a) {
 // We have two ways of lowering that.  Either with one 'vmov d, r, r' or
 // with two 'vmov d[],r'.  LLVM does the latter. We may want to be less
 // strict about the matching pattern if it starts causing problem.
-// CHECK-LABEL: test_vcreate_u64
-// CHECK: vmov.32 [[REG:d[0-9]+]][0], r0
-// CHECK: vmov.32 [[REG]][1], r1
+// CHECK-LABEL: define <1 x i64> @test_vcreate_u64(i64 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <1 x i64>
+// CHECK:   [[ADD_I:%.*]] = add <1 x i64> [[TMP0]], [[TMP0]]
+// CHECK:   ret <1 x i64> [[ADD_I]]
 uint64x1_t test_vcreate_u64(uint64_t a) {
   uint64x1_t tmp = vcreate_u64(a);
   return vadd_u64(tmp, tmp);
 
 }
 
-// CHECK-LABEL: test_vcreate_p8
-// CHECK: vmov [[REG:d[0-9]+]], r0, r1
-// CHECK: vcnt.8 d{{[0-9]+}}, [[REG]]
+// CHECK-LABEL: define <8 x i8> @test_vcreate_p8(i64 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <8 x i8>
+// CHECK:   [[VCNT_V_I:%.*]] = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> [[TMP0]]) #4
+// CHECK:   ret <8 x i8> [[VCNT_V_I]]
 poly8x8_t test_vcreate_p8(uint64_t a) {
   return vcnt_p8(vcreate_p8(a));
 }
 
-// CHECK-LABEL: test_vcreate_p16
-// CHECK: vmov [[REG:d[0-9]+]], r0, r1
+// CHECK-LABEL: define <4 x i16> @test_vcreate_p16(i64 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <4 x i16>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <4 x i16> [[TMP0]] to <8 x i8>
+// CHECK:   [[VBSL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]], <8 x i8> [[TMP3]]) #4
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[VBSL_V_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP4]]
 poly16x4_t test_vcreate_p16(uint64_t a) {
   poly16x4_t tmp = vcreate_p16(a);
   return vbsl_p16(tmp, tmp, tmp);
 }
 
-// CHECK-LABEL: test_vcreate_s64
-// CHECK: vmov.32 [[REG:d[0-9]+]][0], r0
-// CHECK: vmov.32 [[REG]][1], r1
+// CHECK-LABEL: define <1 x i64> @test_vcreate_s64(i64 %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64 %a to <1 x i64>
+// CHECK:   [[ADD_I:%.*]] = add <1 x i64> [[TMP0]], [[TMP0]]
+// CHECK:   ret <1 x i64> [[ADD_I]]
 int64x1_t test_vcreate_s64(uint64_t a) {
   int64x1_t tmp = vcreate_s64(a);
   return vadd_s64(tmp, tmp);
 }
 
 
-// CHECK-LABEL: test_vcvt_f16_f32
-// CHECK: vcvt.f16.f32 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x half> @test_vcvt_f16_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VCVT_F16_F32_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VCVT_F16_F321_I:%.*]] = call <4 x i16> @llvm.arm.neon.vcvtfp2hf(<4 x float> [[VCVT_F16_F32_I]]) #4
+// CHECK:   [[VCVT_F16_F322_I:%.*]] = bitcast <4 x i16> [[VCVT_F16_F321_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VCVT_F16_F322_I]] to <4 x half>
+// CHECK:   ret <4 x half> [[TMP1]]
 float16x4_t test_vcvt_f16_f32(float32x4_t a) {
   return vcvt_f16_f32(a);
 }
 
 
-// CHECK-LABEL: test_vcvt_f32_s32
-// CHECK: vcvt.f32.s32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vcvt_f32_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VCVT_I:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x float>
+// CHECK:   ret <2 x float> [[VCVT_I]]
 float32x2_t test_vcvt_f32_s32(int32x2_t a) {
   return vcvt_f32_s32(a);
 }
 
-// CHECK-LABEL: test_vcvt_f32_u32
-// CHECK: vcvt.f32.u32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vcvt_f32_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VCVT_I:%.*]] = uitofp <2 x i32> [[TMP1]] to <2 x float>
+// CHECK:   ret <2 x float> [[VCVT_I]]
 float32x2_t test_vcvt_f32_u32(uint32x2_t a) {
   return vcvt_f32_u32(a);
 }
 
-// CHECK-LABEL: test_vcvtq_f32_s32
-// CHECK: vcvt.f32.s32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x float> @test_vcvtq_f32_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VCVT_I:%.*]] = sitofp <4 x i32> [[TMP1]] to <4 x float>
+// CHECK:   ret <4 x float> [[VCVT_I]]
 float32x4_t test_vcvtq_f32_s32(int32x4_t a) {
   return vcvtq_f32_s32(a);
 }
 
-// CHECK-LABEL: test_vcvtq_f32_u32
-// CHECK: vcvt.f32.u32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x float> @test_vcvtq_f32_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VCVT_I:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x float>
+// CHECK:   ret <4 x float> [[VCVT_I]]
 float32x4_t test_vcvtq_f32_u32(uint32x4_t a) {
   return vcvtq_f32_u32(a);
 }
 
 
-// CHECK-LABEL: test_vcvt_f32_f16
-// CHECK: vcvt.f32.f16
+// CHECK-LABEL: define <4 x float> @test_vcvt_f32_f16(<4 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
+// CHECK:   [[VCVT_F32_F16_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VCVT_F32_F161_I:%.*]] = call <4 x float> @llvm.arm.neon.vcvthf2fp(<4 x i16> [[VCVT_F32_F16_I]]) #4
+// CHECK:   [[VCVT_F32_F162_I:%.*]] = bitcast <4 x float> [[VCVT_F32_F161_I]] to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VCVT_F32_F162_I]] to <4 x float>
+// CHECK:   ret <4 x float> [[TMP1]]
 float32x4_t test_vcvt_f32_f16(float16x4_t a) {
   return vcvt_f32_f16(a);
 }
 
 
-// CHECK-LABEL: test_vcvt_n_f32_s32
-// CHECK: vcvt.f32.s32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vcvt_n_f32_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VCVT_N1:%.*]] = call <2 x float> @llvm.arm.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> [[VCVT_N]], i32 1)
+// CHECK:   ret <2 x float> [[VCVT_N1]]
 float32x2_t test_vcvt_n_f32_s32(int32x2_t a) {
   return vcvt_n_f32_s32(a, 1);
 }
 
-// CHECK-LABEL: test_vcvt_n_f32_u32
-// CHECK: vcvt.f32.u32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vcvt_n_f32_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VCVT_N1:%.*]] = call <2 x float> @llvm.arm.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> [[VCVT_N]], i32 1)
+// CHECK:   ret <2 x float> [[VCVT_N1]]
 float32x2_t test_vcvt_n_f32_u32(uint32x2_t a) {
   return vcvt_n_f32_u32(a, 1);
 }
 
-// CHECK-LABEL: test_vcvtq_n_f32_s32
-// CHECK: vcvt.f32.s32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x float> @test_vcvtq_n_f32_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VCVT_N1:%.*]] = call <4 x float> @llvm.arm.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> [[VCVT_N]], i32 3)
+// CHECK:   ret <4 x float> [[VCVT_N1]]
 float32x4_t test_vcvtq_n_f32_s32(int32x4_t a) {
   return vcvtq_n_f32_s32(a, 3);
 }
 
-// CHECK-LABEL: test_vcvtq_n_f32_u32
-// CHECK: vcvt.f32.u32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x float> @test_vcvtq_n_f32_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VCVT_N1:%.*]] = call <4 x float> @llvm.arm.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> [[VCVT_N]], i32 3)
+// CHECK:   ret <4 x float> [[VCVT_N1]]
 float32x4_t test_vcvtq_n_f32_u32(uint32x4_t a) {
   return vcvtq_n_f32_u32(a, 3);
 }
 
 
-// CHECK-LABEL: test_vcvt_n_s32_f32
-// CHECK: vcvt.s32.f32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vcvt_n_s32_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VCVT_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> [[VCVT_N]], i32 1)
+// CHECK:   ret <2 x i32> [[VCVT_N1]]
 int32x2_t test_vcvt_n_s32_f32(float32x2_t a) {
   return vcvt_n_s32_f32(a, 1);
 }
 
-// CHECK-LABEL: test_vcvtq_n_s32_f32
-// CHECK: vcvt.s32.f32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vcvtq_n_s32_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VCVT_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float> [[VCVT_N]], i32 3)
+// CHECK:   ret <4 x i32> [[VCVT_N1]]
 int32x4_t test_vcvtq_n_s32_f32(float32x4_t a) {
   return vcvtq_n_s32_f32(a, 3);
 }
 
 
-// CHECK-LABEL: test_vcvt_n_u32_f32
-// CHECK: vcvt.u32.f32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vcvt_n_u32_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VCVT_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VCVT_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float> [[VCVT_N]], i32 1)
+// CHECK:   ret <2 x i32> [[VCVT_N1]]
 uint32x2_t test_vcvt_n_u32_f32(float32x2_t a) {
   return vcvt_n_u32_f32(a, 1);
 }
 
-// CHECK-LABEL: test_vcvtq_n_u32_f32
-// CHECK: vcvt.u32.f32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vcvtq_n_u32_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VCVT_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VCVT_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float> [[VCVT_N]], i32 3)
+// CHECK:   ret <4 x i32> [[VCVT_N1]]
 uint32x4_t test_vcvtq_n_u32_f32(float32x4_t a) {
   return vcvtq_n_u32_f32(a, 3);
 }
 
 
-// CHECK-LABEL: test_vcvt_s32_f32
-// CHECK: vcvt.s32.f32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vcvt_s32_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VCVT_I:%.*]] = fptosi <2 x float> [[TMP1]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VCVT_I]]
 int32x2_t test_vcvt_s32_f32(float32x2_t a) {
   return vcvt_s32_f32(a);
 }
 
-// CHECK-LABEL: test_vcvtq_s32_f32
-// CHECK: vcvt.s32.f32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vcvtq_s32_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VCVT_I:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VCVT_I]]
 int32x4_t test_vcvtq_s32_f32(float32x4_t a) {
   return vcvtq_s32_f32(a);
 }
 
 
-// CHECK-LABEL: test_vcvt_u32_f32
-// CHECK: vcvt.u32.f32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vcvt_u32_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VCVT_I:%.*]] = fptoui <2 x float> [[TMP1]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VCVT_I]]
 uint32x2_t test_vcvt_u32_f32(float32x2_t a) {
   return vcvt_u32_f32(a);
 }
 
-// CHECK-LABEL: test_vcvtq_u32_f32
-// CHECK: vcvt.u32.f32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vcvtq_u32_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VCVT_I:%.*]] = fptoui <4 x float> [[TMP1]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VCVT_I]]
 uint32x4_t test_vcvtq_u32_f32(float32x4_t a) {
   return vcvtq_u32_f32(a);
 }
 
 
-// CHECK-LABEL: test_vdup_lane_u8
-// CHECK: vdup.8 d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <8 x i8> @test_vdup_lane_u8(<8 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK:   ret <8 x i8> [[SHUFFLE]]
 uint8x8_t test_vdup_lane_u8(uint8x8_t a) {
   return vdup_lane_u8(a, 7);
 }
 
-// CHECK-LABEL: test_vdup_lane_u16
-// CHECK: vdup.16 d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i16> @test_vdup_lane_u16(<4 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   ret <4 x i16> [[SHUFFLE]]
 uint16x4_t test_vdup_lane_u16(uint16x4_t a) {
   return vdup_lane_u16(a, 3);
 }
 
-// CHECK-LABEL: test_vdup_lane_u32
-// CHECK: vdup.32 d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <2 x i32> @test_vdup_lane_u32(<2 x i32> %a) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 1>
+// CHECK:   ret <2 x i32> [[SHUFFLE]]
 uint32x2_t test_vdup_lane_u32(uint32x2_t a) {
   return vdup_lane_u32(a, 1);
 }
 
-// CHECK-LABEL: test_vdup_lane_s8
-// CHECK: vdup.8 d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <8 x i8> @test_vdup_lane_s8(<8 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK:   ret <8 x i8> [[SHUFFLE]]
 int8x8_t test_vdup_lane_s8(int8x8_t a) {
   return vdup_lane_s8(a, 7);
 }
 
-// CHECK-LABEL: test_vdup_lane_s16
-// CHECK: vdup.16 d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i16> @test_vdup_lane_s16(<4 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   ret <4 x i16> [[SHUFFLE]]
 int16x4_t test_vdup_lane_s16(int16x4_t a) {
   return vdup_lane_s16(a, 3);
 }
 
-// CHECK-LABEL: test_vdup_lane_s32
-// CHECK: vdup.32 d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <2 x i32> @test_vdup_lane_s32(<2 x i32> %a) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 1>
+// CHECK:   ret <2 x i32> [[SHUFFLE]]
 int32x2_t test_vdup_lane_s32(int32x2_t a) {
   return vdup_lane_s32(a, 1);
 }
 
-// CHECK-LABEL: test_vdup_lane_p8
-// CHECK: vdup.8 d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <8 x i8> @test_vdup_lane_p8(<8 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK:   ret <8 x i8> [[SHUFFLE]]
 poly8x8_t test_vdup_lane_p8(poly8x8_t a) {
   return vdup_lane_p8(a, 7);
 }
 
-// CHECK-LABEL: test_vdup_lane_p16
-// CHECK: vdup.16 d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i16> @test_vdup_lane_p16(<4 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   ret <4 x i16> [[SHUFFLE]]
 poly16x4_t test_vdup_lane_p16(poly16x4_t a) {
   return vdup_lane_p16(a, 3);
 }
 
-// CHECK-LABEL: test_vdup_lane_f32
-// CHECK: vdup.32 d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <2 x float> @test_vdup_lane_f32(<2 x float> %a) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %a, <2 x float> %a, <2 x i32> <i32 1, i32 1>
+// CHECK:   ret <2 x float> [[SHUFFLE]]
 float32x2_t test_vdup_lane_f32(float32x2_t a) {
   return vdup_lane_f32(a, 1);
 }
 
-// CHECK-LABEL: test_vdupq_lane_u8
-// CHECK: vdup.8 q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <16 x i8> @test_vdupq_lane_u8(<8 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK:   ret <16 x i8> [[SHUFFLE]]
 uint8x16_t test_vdupq_lane_u8(uint8x8_t a) {
   return vdupq_lane_u8(a, 7);
 }
 
-// CHECK-LABEL: test_vdupq_lane_u16
-// CHECK: vdup.16 q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <8 x i16> @test_vdupq_lane_u16(<4 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   ret <8 x i16> [[SHUFFLE]]
 uint16x8_t test_vdupq_lane_u16(uint16x4_t a) {
   return vdupq_lane_u16(a, 3);
 }
 
-// CHECK-LABEL: test_vdupq_lane_u32
-// CHECK: vdup.32 q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i32> @test_vdupq_lane_u32(<2 x i32> %a) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   ret <4 x i32> [[SHUFFLE]]
 uint32x4_t test_vdupq_lane_u32(uint32x2_t a) {
   return vdupq_lane_u32(a, 1);
 }
 
-// CHECK-LABEL: test_vdupq_lane_s8
-// CHECK: vdup.8 q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <16 x i8> @test_vdupq_lane_s8(<8 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK:   ret <16 x i8> [[SHUFFLE]]
 int8x16_t test_vdupq_lane_s8(int8x8_t a) {
   return vdupq_lane_s8(a, 7);
 }
 
-// CHECK-LABEL: test_vdupq_lane_s16
-// CHECK: vdup.16 q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <8 x i16> @test_vdupq_lane_s16(<4 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   ret <8 x i16> [[SHUFFLE]]
 int16x8_t test_vdupq_lane_s16(int16x4_t a) {
   return vdupq_lane_s16(a, 3);
 }
 
-// CHECK-LABEL: test_vdupq_lane_s32
-// CHECK: vdup.32 q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i32> @test_vdupq_lane_s32(<2 x i32> %a) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   ret <4 x i32> [[SHUFFLE]]
 int32x4_t test_vdupq_lane_s32(int32x2_t a) {
   return vdupq_lane_s32(a, 1);
 }
 
-// CHECK-LABEL: test_vdupq_lane_p8
-// CHECK: vdup.8 q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <16 x i8> @test_vdupq_lane_p8(<8 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK:   ret <16 x i8> [[SHUFFLE]]
 poly8x16_t test_vdupq_lane_p8(poly8x8_t a) {
   return vdupq_lane_p8(a, 7);
 }
 
-// CHECK-LABEL: test_vdupq_lane_p16
-// CHECK: vdup.16 q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <8 x i16> @test_vdupq_lane_p16(<4 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   ret <8 x i16> [[SHUFFLE]]
 poly16x8_t test_vdupq_lane_p16(poly16x4_t a) {
   return vdupq_lane_p16(a, 3);
 }
 
-// CHECK-LABEL: test_vdupq_lane_f32
-// CHECK: vdup.32 q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x float> @test_vdupq_lane_f32(<2 x float> %a) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %a, <2 x float> %a, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   ret <4 x float> [[SHUFFLE]]
 float32x4_t test_vdupq_lane_f32(float32x2_t a) {
   return vdupq_lane_f32(a, 1);
 }
 
-// CHECK-LABEL: test_vdup_lane_s64
+// CHECK-LABEL: define <1 x i64> @test_vdup_lane_s64(<1 x i64> %a) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %a, <1 x i32> zeroinitializer
+// CHECK:   ret <1 x i64> [[SHUFFLE]]
 int64x1_t test_vdup_lane_s64(int64x1_t a) {
   return vdup_lane_s64(a, 0);
 }
 
-// CHECK-LABEL: test_vdup_lane_u64
+// CHECK-LABEL: define <1 x i64> @test_vdup_lane_u64(<1 x i64> %a) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %a, <1 x i32> zeroinitializer
+// CHECK:   ret <1 x i64> [[SHUFFLE]]
 uint64x1_t test_vdup_lane_u64(uint64x1_t a) {
   return vdup_lane_u64(a, 0);
 }
 
-// CHECK-LABEL: test_vdupq_lane_s64
-// CHECK: {{vmov|vdup}}
+// CHECK-LABEL: define <2 x i64> @test_vdupq_lane_s64(<1 x i64> %a) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %a, <2 x i32> zeroinitializer
+// CHECK:   ret <2 x i64> [[SHUFFLE]]
 int64x2_t test_vdupq_lane_s64(int64x1_t a) {
   return vdupq_lane_s64(a, 0);
 }
 
-// CHECK-LABEL: test_vdupq_lane_u64
-// CHECK: {{vmov|vdup}}
+// CHECK-LABEL: define <2 x i64> @test_vdupq_lane_u64(<1 x i64> %a) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %a, <2 x i32> zeroinitializer
+// CHECK:   ret <2 x i64> [[SHUFFLE]]
 uint64x2_t test_vdupq_lane_u64(uint64x1_t a) {
   return vdupq_lane_u64(a, 0);
 }
 
 
-// CHECK-LABEL: test_vdup_n_u8
-// CHECK: vmov 
+// CHECK-LABEL: define <8 x i8> @test_vdup_n_u8(i8 zeroext %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
+// CHECK:   ret <8 x i8> [[VECINIT7_I]]
 uint8x8_t test_vdup_n_u8(uint8_t a) {
   return vdup_n_u8(a);
 }
 
-// CHECK-LABEL: test_vdup_n_u16
-// CHECK: vmov 
+// CHECK-LABEL: define <4 x i16> @test_vdup_n_u16(i16 zeroext %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
+// CHECK:   ret <4 x i16> [[VECINIT3_I]]
 uint16x4_t test_vdup_n_u16(uint16_t a) {
   return vdup_n_u16(a);
 }
 
-// CHECK-LABEL: test_vdup_n_u32
-// CHECK: mov 
+// CHECK-LABEL: define <2 x i32> @test_vdup_n_u32(i32 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %a, i32 1
+// CHECK:   ret <2 x i32> [[VECINIT1_I]]
 uint32x2_t test_vdup_n_u32(uint32_t a) {
   return vdup_n_u32(a);
 }
 
-// CHECK-LABEL: test_vdup_n_s8
-// CHECK: vmov 
+// CHECK-LABEL: define <8 x i8> @test_vdup_n_s8(i8 signext %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
+// CHECK:   ret <8 x i8> [[VECINIT7_I]]
 int8x8_t test_vdup_n_s8(int8_t a) {
   return vdup_n_s8(a);
 }
 
-// CHECK-LABEL: test_vdup_n_s16
-// CHECK: vmov 
+// CHECK-LABEL: define <4 x i16> @test_vdup_n_s16(i16 signext %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
+// CHECK:   ret <4 x i16> [[VECINIT3_I]]
 int16x4_t test_vdup_n_s16(int16_t a) {
   return vdup_n_s16(a);
 }
 
-// CHECK-LABEL: test_vdup_n_s32
-// CHECK: mov 
+// CHECK-LABEL: define <2 x i32> @test_vdup_n_s32(i32 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %a, i32 1
+// CHECK:   ret <2 x i32> [[VECINIT1_I]]
 int32x2_t test_vdup_n_s32(int32_t a) {
   return vdup_n_s32(a);
 }
 
-// CHECK-LABEL: test_vdup_n_p8
-// CHECK: vmov 
+// CHECK-LABEL: define <8 x i8> @test_vdup_n_p8(i8 signext %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
+// CHECK:   ret <8 x i8> [[VECINIT7_I]]
 poly8x8_t test_vdup_n_p8(poly8_t a) {
   return vdup_n_p8(a);
 }
 
-// CHECK-LABEL: test_vdup_n_p16
-// CHECK: vmov 
+// CHECK-LABEL: define <4 x i16> @test_vdup_n_p16(i16 signext %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
+// CHECK:   ret <4 x i16> [[VECINIT3_I]]
 poly16x4_t test_vdup_n_p16(poly16_t a) {
   return vdup_n_p16(a);
 }
 
-// CHECK-LABEL: test_vdup_n_f16
-// CHECK: vld1.16 {{{d[0-9]+\[\]}}}
+// CHECK-LABEL: define <4 x half> @test_vdup_n_f16(half* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = load half, half* %a, align 2
+// CHECK:   [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[TMP0]], i32 0
+// CHECK:   [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP0]], i32 1
+// CHECK:   [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[TMP0]], i32 2
+// CHECK:   [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[TMP0]], i32 3
+// CHECK:   ret <4 x half> [[VECINIT3]]
 float16x4_t test_vdup_n_f16(float16_t *a) {
   return vdup_n_f16(*a);
 }
 
-// CHECK-LABEL: test_vdup_n_f32
-// CHECK: mov 
+// CHECK-LABEL: define <2 x float> @test_vdup_n_f32(float %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %a, i32 1
+// CHECK:   ret <2 x float> [[VECINIT1_I]]
 float32x2_t test_vdup_n_f32(float32_t a) {
   return vdup_n_f32(a);
 }
 
-// CHECK-LABEL: test_vdupq_n_u8
-// CHECK: vmov 
+// CHECK-LABEL: define <16 x i8> @test_vdupq_n_u8(i8 zeroext %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
+// CHECK:   [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
+// CHECK:   [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
+// CHECK:   [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
+// CHECK:   [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
+// CHECK:   [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
+// CHECK:   [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
+// CHECK:   [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
+// CHECK:   [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
+// CHECK:   ret <16 x i8> [[VECINIT15_I]]
 uint8x16_t test_vdupq_n_u8(uint8_t a) {
   return vdupq_n_u8(a);
 }
 
-// CHECK-LABEL: test_vdupq_n_u16
-// CHECK: vmov 
+// CHECK-LABEL: define <8 x i16> @test_vdupq_n_u16(i16 zeroext %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
+// CHECK:   ret <8 x i16> [[VECINIT7_I]]
 uint16x8_t test_vdupq_n_u16(uint16_t a) {
   return vdupq_n_u16(a);
 }
 
-// CHECK-LABEL: test_vdupq_n_u32
-// CHECK: vmov 
+// CHECK-LABEL: define <4 x i32> @test_vdupq_n_u32(i32 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %a, i32 3
+// CHECK:   ret <4 x i32> [[VECINIT3_I]]
 uint32x4_t test_vdupq_n_u32(uint32_t a) {
   return vdupq_n_u32(a);
 }
 
-// CHECK-LABEL: test_vdupq_n_s8
-// CHECK: vmov 
+// CHECK-LABEL: define <16 x i8> @test_vdupq_n_s8(i8 signext %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
+// CHECK:   [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
+// CHECK:   [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
+// CHECK:   [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
+// CHECK:   [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
+// CHECK:   [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
+// CHECK:   [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
+// CHECK:   [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
+// CHECK:   [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
+// CHECK:   ret <16 x i8> [[VECINIT15_I]]
 int8x16_t test_vdupq_n_s8(int8_t a) {
   return vdupq_n_s8(a);
 }
 
-// CHECK-LABEL: test_vdupq_n_s16
-// CHECK: vmov 
+// CHECK-LABEL: define <8 x i16> @test_vdupq_n_s16(i16 signext %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
+// CHECK:   ret <8 x i16> [[VECINIT7_I]]
 int16x8_t test_vdupq_n_s16(int16_t a) {
   return vdupq_n_s16(a);
 }
 
-// CHECK-LABEL: test_vdupq_n_s32
-// CHECK: vmov 
+// CHECK-LABEL: define <4 x i32> @test_vdupq_n_s32(i32 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %a, i32 3
+// CHECK:   ret <4 x i32> [[VECINIT3_I]]
 int32x4_t test_vdupq_n_s32(int32_t a) {
   return vdupq_n_s32(a);
 }
 
-// CHECK-LABEL: test_vdupq_n_p8
-// CHECK: vmov 
+// CHECK-LABEL: define <16 x i8> @test_vdupq_n_p8(i8 signext %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
+// CHECK:   [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
+// CHECK:   [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
+// CHECK:   [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
+// CHECK:   [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
+// CHECK:   [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
+// CHECK:   [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
+// CHECK:   [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
+// CHECK:   [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
+// CHECK:   ret <16 x i8> [[VECINIT15_I]]
 poly8x16_t test_vdupq_n_p8(poly8_t a) {
   return vdupq_n_p8(a);
 }
 
-// CHECK-LABEL: test_vdupq_n_p16
-// CHECK: vmov 
+// CHECK-LABEL: define <8 x i16> @test_vdupq_n_p16(i16 signext %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
+// CHECK:   ret <8 x i16> [[VECINIT7_I]]
 poly16x8_t test_vdupq_n_p16(poly16_t a) {
   return vdupq_n_p16(a);
 }
 
-// CHECK-LABEL: test_vdupq_n_f16
-// CHECK: vld1.16 {{{d[0-9]+\[\], d[0-9]+\[\]}}}
+// CHECK-LABEL: define <8 x half> @test_vdupq_n_f16(half* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = load half, half* %a, align 2
+// CHECK:   [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[TMP0]], i32 0
+// CHECK:   [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP0]], i32 1
+// CHECK:   [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[TMP0]], i32 2
+// CHECK:   [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[TMP0]], i32 3
+// CHECK:   [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[TMP0]], i32 4
+// CHECK:   [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[TMP0]], i32 5
+// CHECK:   [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[TMP0]], i32 6
+// CHECK:   [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[TMP0]], i32 7
+// CHECK:   ret <8 x half> [[VECINIT7]]
 float16x8_t test_vdupq_n_f16(float16_t *a) {
   return vdupq_n_f16(*a);
 }
 
-// CHECK-LABEL: test_vdupq_n_f32
-// CHECK: vmov 
+// CHECK-LABEL: define <4 x float> @test_vdupq_n_f32(float %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %a, i32 3
+// CHECK:   ret <4 x float> [[VECINIT3_I]]
 float32x4_t test_vdupq_n_f32(float32_t a) {
   return vdupq_n_f32(a);
 }
 
-// CHECK-LABEL: test_vdup_n_s64
-// CHECK: vmov
+// CHECK-LABEL: define <1 x i64> @test_vdup_n_s64(i64 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <1 x i64> undef, i64 %a, i32 0
+// CHECK:   [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]]
+// CHECK:   ret <1 x i64> [[ADD_I]]
 int64x1_t test_vdup_n_s64(int64_t a) {
   int64x1_t tmp = vdup_n_s64(a);
   return vadd_s64(tmp, tmp);
 }
 
-// CHECK-LABEL: test_vdup_n_u64
-// CHECK: vmov
+// CHECK-LABEL: define <1 x i64> @test_vdup_n_u64(i64 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <1 x i64> undef, i64 %a, i32 0
+// CHECK:   [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]]
+// CHECK:   ret <1 x i64> [[ADD_I]]
 uint64x1_t test_vdup_n_u64(uint64_t a) {
   int64x1_t tmp = vdup_n_u64(a);
   return vadd_s64(tmp, tmp);
 
 }
 
-// CHECK-LABEL: test_vdupq_n_s64
-// CHECK: vmov
+// CHECK-LABEL: define <2 x i64> @test_vdupq_n_s64(i64 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i64> undef, i64 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> [[VECINIT1_I]], [[VECINIT1_I]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 int64x2_t test_vdupq_n_s64(int64_t a) {
   int64x2_t tmp = vdupq_n_s64(a);
   return vaddq_s64(tmp, tmp);
 }
 
-// CHECK-LABEL: test_vdupq_n_u64
-// CHECK: vmov
+// CHECK-LABEL: define <2 x i64> @test_vdupq_n_u64(i64 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i64> undef, i64 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> [[VECINIT1_I]], [[VECINIT1_I]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 uint64x2_t test_vdupq_n_u64(uint64_t a) {
   int64x2_t tmp = vdupq_n_u64(a);
   return vaddq_u64(tmp, tmp);
 }
 
 
-// CHECK-LABEL: test_veor_s8
-// CHECK: veor d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_veor_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <8 x i8> %a, %b
+// CHECK:   ret <8 x i8> [[XOR_I]]
 int8x8_t test_veor_s8(int8x8_t a, int8x8_t b) {
   return veor_s8(a, b);
 }
 
-// CHECK-LABEL: test_veor_s16
-// CHECK: veor d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_veor_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <4 x i16> %a, %b
+// CHECK:   ret <4 x i16> [[XOR_I]]
 int16x4_t test_veor_s16(int16x4_t a, int16x4_t b) {
   return veor_s16(a, b);
 }
 
-// CHECK-LABEL: test_veor_s32
-// CHECK: veor d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_veor_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <2 x i32> %a, %b
+// CHECK:   ret <2 x i32> [[XOR_I]]
 int32x2_t test_veor_s32(int32x2_t a, int32x2_t b) {
   return veor_s32(a, b);
 }
 
-// CHECK-LABEL: test_veor_s64
-// CHECK: veor d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_veor_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <1 x i64> %a, %b
+// CHECK:   ret <1 x i64> [[XOR_I]]
 int64x1_t test_veor_s64(int64x1_t a, int64x1_t b) {
   return veor_s64(a, b);
 }
 
-// CHECK-LABEL: test_veor_u8
-// CHECK: veor d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_veor_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <8 x i8> %a, %b
+// CHECK:   ret <8 x i8> [[XOR_I]]
 uint8x8_t test_veor_u8(uint8x8_t a, uint8x8_t b) {
   return veor_u8(a, b);
 }
 
-// CHECK-LABEL: test_veor_u16
-// CHECK: veor d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_veor_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <4 x i16> %a, %b
+// CHECK:   ret <4 x i16> [[XOR_I]]
 uint16x4_t test_veor_u16(uint16x4_t a, uint16x4_t b) {
   return veor_u16(a, b);
 }
 
-// CHECK-LABEL: test_veor_u32
-// CHECK: veor d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_veor_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <2 x i32> %a, %b
+// CHECK:   ret <2 x i32> [[XOR_I]]
 uint32x2_t test_veor_u32(uint32x2_t a, uint32x2_t b) {
   return veor_u32(a, b);
 }
 
-// CHECK-LABEL: test_veor_u64
-// CHECK: veor d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_veor_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <1 x i64> %a, %b
+// CHECK:   ret <1 x i64> [[XOR_I]]
 uint64x1_t test_veor_u64(uint64x1_t a, uint64x1_t b) {
   return veor_u64(a, b);
 }
 
-// CHECK-LABEL: test_veorq_s8
-// CHECK: veor q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_veorq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <16 x i8> %a, %b
+// CHECK:   ret <16 x i8> [[XOR_I]]
 int8x16_t test_veorq_s8(int8x16_t a, int8x16_t b) {
   return veorq_s8(a, b);
 }
 
-// CHECK-LABEL: test_veorq_s16
-// CHECK: veor q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_veorq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <8 x i16> %a, %b
+// CHECK:   ret <8 x i16> [[XOR_I]]
 int16x8_t test_veorq_s16(int16x8_t a, int16x8_t b) {
   return veorq_s16(a, b);
 }
 
-// CHECK-LABEL: test_veorq_s32
-// CHECK: veor q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_veorq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <4 x i32> %a, %b
+// CHECK:   ret <4 x i32> [[XOR_I]]
 int32x4_t test_veorq_s32(int32x4_t a, int32x4_t b) {
   return veorq_s32(a, b);
 }
 
-// CHECK-LABEL: test_veorq_s64
-// CHECK: veor q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_veorq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <2 x i64> %a, %b
+// CHECK:   ret <2 x i64> [[XOR_I]]
 int64x2_t test_veorq_s64(int64x2_t a, int64x2_t b) {
   return veorq_s64(a, b);
 }
 
-// CHECK-LABEL: test_veorq_u8
-// CHECK: veor q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_veorq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <16 x i8> %a, %b
+// CHECK:   ret <16 x i8> [[XOR_I]]
 uint8x16_t test_veorq_u8(uint8x16_t a, uint8x16_t b) {
   return veorq_u8(a, b);
 }
 
-// CHECK-LABEL: test_veorq_u16
-// CHECK: veor q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_veorq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <8 x i16> %a, %b
+// CHECK:   ret <8 x i16> [[XOR_I]]
 uint16x8_t test_veorq_u16(uint16x8_t a, uint16x8_t b) {
   return veorq_u16(a, b);
 }
 
-// CHECK-LABEL: test_veorq_u32
-// CHECK: veor q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_veorq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <4 x i32> %a, %b
+// CHECK:   ret <4 x i32> [[XOR_I]]
 uint32x4_t test_veorq_u32(uint32x4_t a, uint32x4_t b) {
   return veorq_u32(a, b);
 }
 
-// CHECK-LABEL: test_veorq_u64
-// CHECK: veor q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_veorq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[XOR_I:%.*]] = xor <2 x i64> %a, %b
+// CHECK:   ret <2 x i64> [[XOR_I]]
 uint64x2_t test_veorq_u64(uint64x2_t a, uint64x2_t b) {
   return veorq_u64(a, b);
 }
 
 
-// CHECK-LABEL: test_vext_s8
-// CHECK: vext.8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vext_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
+// CHECK:   ret <8 x i8> [[VEXT]]
 int8x8_t test_vext_s8(int8x8_t a, int8x8_t b) {
   return vext_s8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vext_u8
-// CHECK: vext.8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vext_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
+// CHECK:   ret <8 x i8> [[VEXT]]
 uint8x8_t test_vext_u8(uint8x8_t a, uint8x8_t b) {
   return vext_u8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vext_p8
-// CHECK: vext.8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vext_p8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VEXT:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
+// CHECK:   ret <8 x i8> [[VEXT]]
 poly8x8_t test_vext_p8(poly8x8_t a, poly8x8_t b) {
   return vext_p8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vext_s16
-// CHECK: vext.16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vext_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+// CHECK:   ret <4 x i16> [[VEXT]]
 int16x4_t test_vext_s16(int16x4_t a, int16x4_t b) {
   return vext_s16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vext_u16
-// CHECK: vext.16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vext_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+// CHECK:   ret <4 x i16> [[VEXT]]
 uint16x4_t test_vext_u16(uint16x4_t a, uint16x4_t b) {
   return vext_u16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vext_p16
-// CHECK: vext.16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vext_p16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VEXT:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+// CHECK:   ret <4 x i16> [[VEXT]]
 poly16x4_t test_vext_p16(poly16x4_t a, poly16x4_t b) {
   return vext_p16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vext_s32
-// CHECK: vext.32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vext_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VEXT:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 1, i32 2>
+// CHECK:   ret <2 x i32> [[VEXT]]
 int32x2_t test_vext_s32(int32x2_t a, int32x2_t b) {
   return vext_s32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vext_u32
-// CHECK: vext.32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vext_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VEXT:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP3]], <2 x i32> <i32 1, i32 2>
+// CHECK:   ret <2 x i32> [[VEXT]]
 uint32x2_t test_vext_u32(uint32x2_t a, uint32x2_t b) {
   return vext_u32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vext_s64
+// CHECK-LABEL: define <1 x i64> @test_vext_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK:   ret <1 x i64> [[VEXT]]
 int64x1_t test_vext_s64(int64x1_t a, int64x1_t b) {
   return vext_s64(a, b, 0);
 }
 
-// CHECK-LABEL: test_vext_u64
+// CHECK-LABEL: define <1 x i64> @test_vext_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VEXT:%.*]] = shufflevector <1 x i64> [[TMP2]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK:   ret <1 x i64> [[VEXT]]
 uint64x1_t test_vext_u64(uint64x1_t a, uint64x1_t b) {
   return vext_u64(a, b, 0);
 }
 
-// CHECK-LABEL: test_vext_f32
-// CHECK: vext.32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vext_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[VEXT:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP3]], <2 x i32> <i32 1, i32 2>
+// CHECK:   ret <2 x float> [[VEXT]]
 float32x2_t test_vext_f32(float32x2_t a, float32x2_t b) {
   return vext_f32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vextq_s8
-// CHECK: vext.8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vextq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
+// CHECK:   ret <16 x i8> [[VEXT]]
 int8x16_t test_vextq_s8(int8x16_t a, int8x16_t b) {
   return vextq_s8(a, b, 15);
 }
 
-// CHECK-LABEL: test_vextq_u8
-// CHECK: vext.8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vextq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
+// CHECK:   ret <16 x i8> [[VEXT]]
 uint8x16_t test_vextq_u8(uint8x16_t a, uint8x16_t b) {
   return vextq_u8(a, b, 15);
 }
 
-// CHECK-LABEL: test_vextq_p8
-// CHECK: vext.8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vextq_p8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VEXT:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30>
+// CHECK:   ret <16 x i8> [[VEXT]]
 poly8x16_t test_vextq_p8(poly8x16_t a, poly8x16_t b) {
   return vextq_p8(a, b, 15);
 }
 
-// CHECK-LABEL: test_vextq_s16
-// CHECK: vext.16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vextq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
+// CHECK:   ret <8 x i16> [[VEXT]]
 int16x8_t test_vextq_s16(int16x8_t a, int16x8_t b) {
   return vextq_s16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vextq_u16
-// CHECK: vext.16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vextq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
+// CHECK:   ret <8 x i16> [[VEXT]]
 uint16x8_t test_vextq_u16(uint16x8_t a, uint16x8_t b) {
   return vextq_u16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vextq_p16
-// CHECK: vext.16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vextq_p16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VEXT:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> [[TMP3]], <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
+// CHECK:   ret <8 x i16> [[VEXT]]
 poly16x8_t test_vextq_p16(poly16x8_t a, poly16x8_t b) {
   return vextq_p16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vextq_s32
-// CHECK: vext.32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vextq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VEXT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+// CHECK:   ret <4 x i32> [[VEXT]]
 int32x4_t test_vextq_s32(int32x4_t a, int32x4_t b) {
   return vextq_s32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vextq_u32
-// CHECK: vext.32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vextq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VEXT:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+// CHECK:   ret <4 x i32> [[VEXT]]
 uint32x4_t test_vextq_u32(uint32x4_t a, uint32x4_t b) {
   return vextq_u32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vextq_s64
-// CHECK: {{vmov|vdup}}
+// CHECK-LABEL: define <2 x i64> @test_vextq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32> <i32 1, i32 2>
+// CHECK:   ret <2 x i64> [[VEXT]]
 int64x2_t test_vextq_s64(int64x2_t a, int64x2_t b) {
   return vextq_s64(a, b, 1);
 }
 
-// CHECK-LABEL: test_vextq_u64
-// CHECK: {{vmov|vdup}}
+// CHECK-LABEL: define <2 x i64> @test_vextq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VEXT:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP3]], <2 x i32> <i32 1, i32 2>
+// CHECK:   ret <2 x i64> [[VEXT]]
 uint64x2_t test_vextq_u64(uint64x2_t a, uint64x2_t b) {
   return vextq_u64(a, b, 1);
 }
 
-// CHECK-LABEL: test_vextq_f32
-// CHECK: vext.32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x float> @test_vextq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[VEXT:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+// CHECK:   ret <4 x float> [[VEXT]]
 float32x4_t test_vextq_f32(float32x4_t a, float32x4_t b) {
   return vextq_f32(a, b, 3);
 }
 
 
-// CHECK-LABEL: test_vfma_f32
-// CHECK: vfma.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vfma_f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %c to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK:   [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x float> [[TMP3]]) #4
+// CHECK:   ret <2 x float> [[TMP6]]
 float32x2_t test_vfma_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
   return vfma_f32(a, b, c);
 }
 
-// CHECK-LABEL: test_vfmaq_f32
-// CHECK: vfma.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x float> @test_vfmaq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %c to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK:   [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x float> [[TMP3]]) #4
+// CHECK:   ret <4 x float> [[TMP6]]
 float32x4_t test_vfmaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {
   return vfmaq_f32(a, b, c);
 }
 
-
-// CHECK-LABEL: test_vget_high_s8
+// CHECK-LABEL: define <2 x float> @test_vfms_f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 {
+// CHECK:   [[SUB_I:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> [[SUB_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %c to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK:   [[TMP6:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x float> [[TMP3]]) #4
+// CHECK:   ret <2 x float> [[TMP6]]
+float32x2_t test_vfms_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
+  return vfms_f32(a, b, c);
+}
+
+// CHECK-LABEL: define <4 x float> @test_vfmsq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) #0 {
+// CHECK:   [[SUB_I:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> [[SUB_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %c to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK:   [[TMP6:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x float> [[TMP3]]) #4
+// CHECK:   ret <4 x float> [[TMP6]]
+float32x4_t test_vfmsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {
+  return vfmsq_f32(a, b, c);
+}
+
+
+// CHECK-LABEL: define <8 x i8> @test_vget_high_s8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 int8x8_t test_vget_high_s8(int8x16_t a) {
   return vget_high_s8(a);
 }
 
-// CHECK-LABEL: test_vget_high_s16
+// CHECK-LABEL: define <4 x i16> @test_vget_high_s16(<8 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 int16x4_t test_vget_high_s16(int16x8_t a) {
   return vget_high_s16(a);
 }
 
-// CHECK-LABEL: test_vget_high_s32
+// CHECK-LABEL: define <2 x i32> @test_vget_high_s32(<4 x i32> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 int32x2_t test_vget_high_s32(int32x4_t a) {
   return vget_high_s32(a);
 }
 
-// CHECK-LABEL: test_vget_high_s64
+// CHECK-LABEL: define <1 x i64> @test_vget_high_s64(<2 x i64> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> <i32 1>
+// CHECK:   ret <1 x i64> [[SHUFFLE_I]]
 int64x1_t test_vget_high_s64(int64x2_t a) {
   return vget_high_s64(a);
 }
 
-// CHECK-LABEL: test_vget_high_f16
+// CHECK-LABEL: define <4 x half> @test_vget_high_f16(<8 x half> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <4 x half> [[SHUFFLE_I]]
 float16x4_t test_vget_high_f16(float16x8_t a) {
   return vget_high_f16(a);
 }
 
-// CHECK-LABEL: test_vget_high_f32
+// CHECK-LABEL: define <2 x float> @test_vget_high_f32(<4 x float> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   ret <2 x float> [[SHUFFLE_I]]
 float32x2_t test_vget_high_f32(float32x4_t a) {
   return vget_high_f32(a);
 }
 
-// CHECK-LABEL: test_vget_high_u8
+// CHECK-LABEL: define <8 x i8> @test_vget_high_u8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 uint8x8_t test_vget_high_u8(uint8x16_t a) {
   return vget_high_u8(a);
 }
 
-// CHECK-LABEL: test_vget_high_u16
+// CHECK-LABEL: define <4 x i16> @test_vget_high_u16(<8 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 uint16x4_t test_vget_high_u16(uint16x8_t a) {
   return vget_high_u16(a);
 }
 
-// CHECK-LABEL: test_vget_high_u32
+// CHECK-LABEL: define <2 x i32> @test_vget_high_u32(<4 x i32> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 uint32x2_t test_vget_high_u32(uint32x4_t a) {
   return vget_high_u32(a);
 }
 
-// CHECK-LABEL: test_vget_high_u64
+// CHECK-LABEL: define <1 x i64> @test_vget_high_u64(<2 x i64> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> <i32 1>
+// CHECK:   ret <1 x i64> [[SHUFFLE_I]]
 uint64x1_t test_vget_high_u64(uint64x2_t a) {
   return vget_high_u64(a);
 }
 
-// CHECK-LABEL: test_vget_high_p8
+// CHECK-LABEL: define <8 x i8> @test_vget_high_p8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 poly8x8_t test_vget_high_p8(poly8x16_t a) {
   return vget_high_p8(a);
 }
 
-// CHECK-LABEL: test_vget_high_p16
+// CHECK-LABEL: define <4 x i16> @test_vget_high_p16(<8 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 poly16x4_t test_vget_high_p16(poly16x8_t a) {
   return vget_high_p16(a);
 }
 
 
-// CHECK-LABEL: test_vget_lane_u8
-// CHECK: vmov 
+// CHECK-LABEL: define zeroext i8 @test_vget_lane_u8(<8 x i8> %a) #0 {
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7
+// CHECK:   ret i8 [[VGET_LANE]]
 uint8_t test_vget_lane_u8(uint8x8_t a) {
   return vget_lane_u8(a, 7);
 }
 
-// CHECK-LABEL: test_vget_lane_u16
-// CHECK: vmov 
+// CHECK-LABEL: define zeroext i16 @test_vget_lane_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
+// CHECK:   ret i16 [[VGET_LANE]]
 uint16_t test_vget_lane_u16(uint16x4_t a) {
   return vget_lane_u16(a, 3);
 }
 
-// CHECK-LABEL: test_vget_lane_u32
-// CHECK: mov 
+// CHECK-LABEL: define i32 @test_vget_lane_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+// CHECK:   ret i32 [[VGET_LANE]]
 uint32_t test_vget_lane_u32(uint32x2_t a) {
   return vget_lane_u32(a, 1);
 }
 
-// CHECK-LABEL: test_vget_lane_s8
-// CHECK: vmov 
+// CHECK-LABEL: define signext i8 @test_vget_lane_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7
+// CHECK:   ret i8 [[VGET_LANE]]
 int8_t test_vget_lane_s8(int8x8_t a) {
   return vget_lane_s8(a, 7);
 }
 
-// CHECK-LABEL: test_vget_lane_s16
-// CHECK: vmov 
+// CHECK-LABEL: define signext i16 @test_vget_lane_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
+// CHECK:   ret i16 [[VGET_LANE]]
 int16_t test_vget_lane_s16(int16x4_t a) {
   return vget_lane_s16(a, 3);
 }
 
-// CHECK-LABEL: test_vget_lane_s32
-// CHECK: mov 
+// CHECK-LABEL: define i32 @test_vget_lane_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+// CHECK:   ret i32 [[VGET_LANE]]
 int32_t test_vget_lane_s32(int32x2_t a) {
   return vget_lane_s32(a, 1);
 }
 
-// CHECK-LABEL: test_vget_lane_p8
-// CHECK: vmov 
+// CHECK-LABEL: define signext i8 @test_vget_lane_p8(<8 x i8> %a) #0 {
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <8 x i8> %a, i32 7
+// CHECK:   ret i8 [[VGET_LANE]]
 poly8_t test_vget_lane_p8(poly8x8_t a) {
   return vget_lane_p8(a, 7);
 }
 
-// CHECK-LABEL: test_vget_lane_p16
-// CHECK: vmov 
+// CHECK-LABEL: define signext i16 @test_vget_lane_p16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP1]], i32 3
+// CHECK:   ret i16 [[VGET_LANE]]
 poly16_t test_vget_lane_p16(poly16x4_t a) {
   return vget_lane_p16(a, 3);
 }
 
-// CHECK-LABEL: test_vget_lane_f32
-// CHECK: vmov 
+// CHECK-LABEL: define float @test_vget_lane_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
+// CHECK:   ret float [[VGET_LANE]]
 float32_t test_vget_lane_f32(float32x2_t a) {
   return vget_lane_f32(a, 1);
 }
 
-// CHECK-LABEL: test_vget_lane_f16
-// CHECK: vmov 
+// CHECK-LABEL: define float @test_vget_lane_f16(<4 x half> %a) #0 {
+// CHECK:   [[__REINT_242:%.*]] = alloca <4 x half>, align 8
+// CHECK:   [[__REINT1_242:%.*]] = alloca i16, align 2
+// CHECK:   store <4 x half> %a, <4 x half>* [[__REINT_242]], align 8
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half>* [[__REINT_242]] to <4 x i16>*
+// CHECK:   [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 8
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[TMP1]] to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i16> [[TMP3]], i32 1
+// CHECK:   store i16 [[VGET_LANE]], i16* [[__REINT1_242]], align 2
+// CHECK:   [[TMP4:%.*]] = bitcast i16* [[__REINT1_242]] to half*
+// CHECK:   [[TMP5:%.*]] = load half, half* [[TMP4]], align 2
+// CHECK:   [[CONV:%.*]] = fpext half [[TMP5]] to float
+// CHECK:   ret float [[CONV]]
 float32_t test_vget_lane_f16(float16x4_t a) {
   return vget_lane_f16(a, 1);
 }
 
-// CHECK-LABEL: test_vgetq_lane_u8
-// CHECK: vmov 
+// CHECK-LABEL: define zeroext i8 @test_vgetq_lane_u8(<16 x i8> %a) #0 {
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <16 x i8> %a, i32 15
+// CHECK:   ret i8 [[VGET_LANE]]
 uint8_t test_vgetq_lane_u8(uint8x16_t a) {
   return vgetq_lane_u8(a, 15);
 }
 
-// CHECK-LABEL: test_vgetq_lane_u16
-// CHECK: vmov 
+// CHECK-LABEL: define zeroext i16 @test_vgetq_lane_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
+// CHECK:   ret i16 [[VGET_LANE]]
 uint16_t test_vgetq_lane_u16(uint16x8_t a) {
   return vgetq_lane_u16(a, 7);
 }
 
-// CHECK-LABEL: test_vgetq_lane_u32
-// CHECK: vmov 
+// CHECK-LABEL: define i32 @test_vgetq_lane_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+// CHECK:   ret i32 [[VGET_LANE]]
 uint32_t test_vgetq_lane_u32(uint32x4_t a) {
   return vgetq_lane_u32(a, 3);
 }
 
-// CHECK-LABEL: test_vgetq_lane_s8
-// CHECK: vmov 
+// CHECK-LABEL: define signext i8 @test_vgetq_lane_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <16 x i8> %a, i32 15
+// CHECK:   ret i8 [[VGET_LANE]]
 int8_t test_vgetq_lane_s8(int8x16_t a) {
   return vgetq_lane_s8(a, 15);
 }
 
-// CHECK-LABEL: test_vgetq_lane_s16
-// CHECK: vmov 
+// CHECK-LABEL: define signext i16 @test_vgetq_lane_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
+// CHECK:   ret i16 [[VGET_LANE]]
 int16_t test_vgetq_lane_s16(int16x8_t a) {
   return vgetq_lane_s16(a, 7);
 }
 
-// CHECK-LABEL: test_vgetq_lane_s32
-// CHECK: vmov 
+// CHECK-LABEL: define i32 @test_vgetq_lane_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
+// CHECK:   ret i32 [[VGET_LANE]]
 int32_t test_vgetq_lane_s32(int32x4_t a) {
   return vgetq_lane_s32(a, 3);
 }
 
-// CHECK-LABEL: test_vgetq_lane_p8
-// CHECK: vmov 
+// CHECK-LABEL: define signext i8 @test_vgetq_lane_p8(<16 x i8> %a) #0 {
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <16 x i8> %a, i32 15
+// CHECK:   ret i8 [[VGET_LANE]]
 poly8_t test_vgetq_lane_p8(poly8x16_t a) {
   return vgetq_lane_p8(a, 15);
 }
 
-// CHECK-LABEL: test_vgetq_lane_p16
-// CHECK: vmov 
+// CHECK-LABEL: define signext i16 @test_vgetq_lane_p16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
+// CHECK:   ret i16 [[VGET_LANE]]
 poly16_t test_vgetq_lane_p16(poly16x8_t a) {
   return vgetq_lane_p16(a, 7);
 }
 
-// CHECK-LABEL: test_vgetq_lane_f32
-// CHECK: vmov 
+// CHECK-LABEL: define float @test_vgetq_lane_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+// CHECK:   ret float [[VGET_LANE]]
 float32_t test_vgetq_lane_f32(float32x4_t a) {
   return vgetq_lane_f32(a, 3);
 }
 
-// CHECK-LABEL: test_vgetq_lane_f16
-// CHECK: vmov 
+// CHECK-LABEL: define float @test_vgetq_lane_f16(<8 x half> %a) #0 {
+// CHECK:   [[__REINT_244:%.*]] = alloca <8 x half>, align 16
+// CHECK:   [[__REINT1_244:%.*]] = alloca i16, align 2
+// CHECK:   store <8 x half> %a, <8 x half>* [[__REINT_244]], align 16
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_244]] to <8 x i16>*
+// CHECK:   [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 16
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3
+// CHECK:   store i16 [[VGET_LANE]], i16* [[__REINT1_244]], align 2
+// CHECK:   [[TMP4:%.*]] = bitcast i16* [[__REINT1_244]] to half*
+// CHECK:   [[TMP5:%.*]] = load half, half* [[TMP4]], align 2
+// CHECK:   [[CONV:%.*]] = fpext half [[TMP5]] to float
+// CHECK:   ret float [[CONV]]
 float32_t test_vgetq_lane_f16(float16x8_t a) {
   return vgetq_lane_f16(a, 3);
 }
 
-// CHECK-LABEL: test_vget_lane_s64
 // The optimizer is able to remove all moves now.
+// CHECK-LABEL: define i64 @test_vget_lane_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
+// CHECK:   ret i64 [[VGET_LANE]]
 int64_t test_vget_lane_s64(int64x1_t a) {
   return vget_lane_s64(a, 0);
 }
 
-// CHECK-LABEL: test_vget_lane_u64
 // The optimizer is able to remove all moves now.
+// CHECK-LABEL: define i64 @test_vget_lane_u64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <1 x i64> [[TMP1]], i32 0
+// CHECK:   ret i64 [[VGET_LANE]]
 uint64_t test_vget_lane_u64(uint64x1_t a) {
   return vget_lane_u64(a, 0);
 }
 
-// CHECK-LABEL: test_vgetq_lane_s64
-// CHECK: vmov 
+// CHECK-LABEL: define i64 @test_vgetq_lane_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
+// CHECK:   ret i64 [[VGET_LANE]]
 int64_t test_vgetq_lane_s64(int64x2_t a) {
   return vgetq_lane_s64(a, 1);
 }
 
-// CHECK-LABEL: test_vgetq_lane_u64
-// CHECK: vmov 
+// CHECK-LABEL: define i64 @test_vgetq_lane_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VGET_LANE:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
+// CHECK:   ret i64 [[VGET_LANE]]
 uint64_t test_vgetq_lane_u64(uint64x2_t a) {
   return vgetq_lane_u64(a, 1);
 }
 
 
-// CHECK-LABEL: test_vget_low_s8
+// CHECK-LABEL: define <8 x i8> @test_vget_low_s8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 int8x8_t test_vget_low_s8(int8x16_t a) {
   return vget_low_s8(a);
 }
 
-// CHECK-LABEL: test_vget_low_s16
+// CHECK-LABEL: define <4 x i16> @test_vget_low_s16(<8 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 int16x4_t test_vget_low_s16(int16x8_t a) {
   return vget_low_s16(a);
 }
 
-// CHECK-LABEL: test_vget_low_s32
+// CHECK-LABEL: define <2 x i32> @test_vget_low_s32(<4 x i32> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 0, i32 1>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 int32x2_t test_vget_low_s32(int32x4_t a) {
   return vget_low_s32(a);
 }
 
-// CHECK-LABEL: test_vget_low_s64
+// CHECK-LABEL: define <1 x i64> @test_vget_low_s64(<2 x i64> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> zeroinitializer
+// CHECK:   ret <1 x i64> [[SHUFFLE_I]]
 int64x1_t test_vget_low_s64(int64x2_t a) {
   return vget_low_s64(a);
 }
 
-// CHECK-LABEL: test_vget_low_f16
+// CHECK-LABEL: define <4 x half> @test_vget_low_f16(<8 x half> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x half> %a, <8 x half> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x half> [[SHUFFLE_I]]
 float16x4_t test_vget_low_f16(float16x8_t a) {
   return vget_low_f16(a);
 }
 
-// CHECK-LABEL: test_vget_low_f32
+// CHECK-LABEL: define <2 x float> @test_vget_low_f32(<4 x float> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <2 x i32> <i32 0, i32 1>
+// CHECK:   ret <2 x float> [[SHUFFLE_I]]
 float32x2_t test_vget_low_f32(float32x4_t a) {
   return vget_low_f32(a);
 }
 
-// CHECK-LABEL: test_vget_low_u8
+// CHECK-LABEL: define <8 x i8> @test_vget_low_u8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 uint8x8_t test_vget_low_u8(uint8x16_t a) {
   return vget_low_u8(a);
 }
 
-// CHECK-LABEL: test_vget_low_u16
+// CHECK-LABEL: define <4 x i16> @test_vget_low_u16(<8 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 uint16x4_t test_vget_low_u16(uint16x8_t a) {
   return vget_low_u16(a);
 }
 
-// CHECK-LABEL: test_vget_low_u32
+// CHECK-LABEL: define <2 x i32> @test_vget_low_u32(<4 x i32> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 0, i32 1>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 uint32x2_t test_vget_low_u32(uint32x4_t a) {
   return vget_low_u32(a);
 }
 
-// CHECK-LABEL: test_vget_low_u64
+// CHECK-LABEL: define <1 x i64> @test_vget_low_u64(<2 x i64> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> zeroinitializer
+// CHECK:   ret <1 x i64> [[SHUFFLE_I]]
 uint64x1_t test_vget_low_u64(uint64x2_t a) {
   return vget_low_u64(a);
 }
 
-// CHECK-LABEL: test_vget_low_p8
+// CHECK-LABEL: define <8 x i8> @test_vget_low_p8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 poly8x8_t test_vget_low_p8(poly8x16_t a) {
   return vget_low_p8(a);
 }
 
-// CHECK-LABEL: test_vget_low_p16
+// CHECK-LABEL: define <4 x i16> @test_vget_low_p16(<8 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 poly16x4_t test_vget_low_p16(poly16x8_t a) {
   return vget_low_p16(a);
 }
 
 
-// CHECK-LABEL: test_vhadd_s8
-// CHECK: vhadd.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vhadd_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VHADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VHADD_V_I]]
 int8x8_t test_vhadd_s8(int8x8_t a, int8x8_t b) {
   return vhadd_s8(a, b);
 }
 
-// CHECK-LABEL: test_vhadd_s16
-// CHECK: vhadd.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vhadd_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16> [[VHADD_V_I]], <4 x i16> [[VHADD_V1_I]]) #4
+// CHECK:   [[VHADD_V3_I:%.*]] = bitcast <4 x i16> [[VHADD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vhadd_s16(int16x4_t a, int16x4_t b) {
   return vhadd_s16(a, b);
 }
 
-// CHECK-LABEL: test_vhadd_s32
-// CHECK: vhadd.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vhadd_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhadds.v2i32(<2 x i32> [[VHADD_V_I]], <2 x i32> [[VHADD_V1_I]]) #4
+// CHECK:   [[VHADD_V3_I:%.*]] = bitcast <2 x i32> [[VHADD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vhadd_s32(int32x2_t a, int32x2_t b) {
   return vhadd_s32(a, b);
 }
 
-// CHECK-LABEL: test_vhadd_u8
-// CHECK: vhadd.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vhadd_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VHADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VHADD_V_I]]
 uint8x8_t test_vhadd_u8(uint8x8_t a, uint8x8_t b) {
   return vhadd_u8(a, b);
 }
 
-// CHECK-LABEL: test_vhadd_u16
-// CHECK: vhadd.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vhadd_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16> [[VHADD_V_I]], <4 x i16> [[VHADD_V1_I]]) #4
+// CHECK:   [[VHADD_V3_I:%.*]] = bitcast <4 x i16> [[VHADD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 uint16x4_t test_vhadd_u16(uint16x4_t a, uint16x4_t b) {
   return vhadd_u16(a, b);
 }
 
-// CHECK-LABEL: test_vhadd_u32
-// CHECK: vhadd.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vhadd_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhaddu.v2i32(<2 x i32> [[VHADD_V_I]], <2 x i32> [[VHADD_V1_I]]) #4
+// CHECK:   [[VHADD_V3_I:%.*]] = bitcast <2 x i32> [[VHADD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VHADD_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 uint32x2_t test_vhadd_u32(uint32x2_t a, uint32x2_t b) {
   return vhadd_u32(a, b);
 }
 
-// CHECK-LABEL: test_vhaddq_s8
-// CHECK: vhadd.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vhaddq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VHADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VHADDQ_V_I]]
 int8x16_t test_vhaddq_s8(int8x16_t a, int8x16_t b) {
   return vhaddq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vhaddq_s16
-// CHECK: vhadd.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vhaddq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> [[VHADDQ_V_I]], <8 x i16> [[VHADDQ_V1_I]]) #4
+// CHECK:   [[VHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VHADDQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vhaddq_s16(int16x8_t a, int16x8_t b) {
   return vhaddq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vhaddq_s32
-// CHECK: vhadd.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vhaddq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhadds.v4i32(<4 x i32> [[VHADDQ_V_I]], <4 x i32> [[VHADDQ_V1_I]]) #4
+// CHECK:   [[VHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VHADDQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vhaddq_s32(int32x4_t a, int32x4_t b) {
   return vhaddq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vhaddq_u8
-// CHECK: vhadd.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vhaddq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VHADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VHADDQ_V_I]]
 uint8x16_t test_vhaddq_u8(uint8x16_t a, uint8x16_t b) {
   return vhaddq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vhaddq_u16
-// CHECK: vhadd.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vhaddq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> [[VHADDQ_V_I]], <8 x i16> [[VHADDQ_V1_I]]) #4
+// CHECK:   [[VHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VHADDQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 uint16x8_t test_vhaddq_u16(uint16x8_t a, uint16x8_t b) {
   return vhaddq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vhaddq_u32
-// CHECK: vhadd.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vhaddq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhaddu.v4i32(<4 x i32> [[VHADDQ_V_I]], <4 x i32> [[VHADDQ_V1_I]]) #4
+// CHECK:   [[VHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VHADDQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VHADDQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 uint32x4_t test_vhaddq_u32(uint32x4_t a, uint32x4_t b) {
   return vhaddq_u32(a, b);
 }
 
 
-// CHECK-LABEL: test_vhsub_s8
-// CHECK: vhsub.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vhsub_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VHSUB_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vhsubs.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VHSUB_V_I]]
 int8x8_t test_vhsub_s8(int8x8_t a, int8x8_t b) {
   return vhsub_s8(a, b);
 }
 
-// CHECK-LABEL: test_vhsub_s16
-// CHECK: vhsub.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vhsub_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhsubs.v4i16(<4 x i16> [[VHSUB_V_I]], <4 x i16> [[VHSUB_V1_I]]) #4
+// CHECK:   [[VHSUB_V3_I:%.*]] = bitcast <4 x i16> [[VHSUB_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vhsub_s16(int16x4_t a, int16x4_t b) {
   return vhsub_s16(a, b);
 }
 
-// CHECK-LABEL: test_vhsub_s32
-// CHECK: vhsub.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vhsub_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhsubs.v2i32(<2 x i32> [[VHSUB_V_I]], <2 x i32> [[VHSUB_V1_I]]) #4
+// CHECK:   [[VHSUB_V3_I:%.*]] = bitcast <2 x i32> [[VHSUB_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vhsub_s32(int32x2_t a, int32x2_t b) {
   return vhsub_s32(a, b);
 }
 
-// CHECK-LABEL: test_vhsub_u8
-// CHECK: vhsub.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vhsub_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VHSUB_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vhsubu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VHSUB_V_I]]
 uint8x8_t test_vhsub_u8(uint8x8_t a, uint8x8_t b) {
   return vhsub_u8(a, b);
 }
 
-// CHECK-LABEL: test_vhsub_u16
-// CHECK: vhsub.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vhsub_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VHSUB_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vhsubu.v4i16(<4 x i16> [[VHSUB_V_I]], <4 x i16> [[VHSUB_V1_I]]) #4
+// CHECK:   [[VHSUB_V3_I:%.*]] = bitcast <4 x i16> [[VHSUB_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 uint16x4_t test_vhsub_u16(uint16x4_t a, uint16x4_t b) {
   return vhsub_u16(a, b);
 }
 
-// CHECK-LABEL: test_vhsub_u32
-// CHECK: vhsub.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vhsub_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VHSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VHSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VHSUB_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vhsubu.v2i32(<2 x i32> [[VHSUB_V_I]], <2 x i32> [[VHSUB_V1_I]]) #4
+// CHECK:   [[VHSUB_V3_I:%.*]] = bitcast <2 x i32> [[VHSUB_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VHSUB_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 uint32x2_t test_vhsub_u32(uint32x2_t a, uint32x2_t b) {
   return vhsub_u32(a, b);
 }
 
-// CHECK-LABEL: test_vhsubq_s8
-// CHECK: vhsub.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vhsubq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VHSUBQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vhsubs.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VHSUBQ_V_I]]
 int8x16_t test_vhsubq_s8(int8x16_t a, int8x16_t b) {
   return vhsubq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vhsubq_s16
-// CHECK: vhsub.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vhsubq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhsubs.v8i16(<8 x i16> [[VHSUBQ_V_I]], <8 x i16> [[VHSUBQ_V1_I]]) #4
+// CHECK:   [[VHSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VHSUBQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vhsubq_s16(int16x8_t a, int16x8_t b) {
   return vhsubq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vhsubq_s32
-// CHECK: vhsub.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vhsubq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhsubs.v4i32(<4 x i32> [[VHSUBQ_V_I]], <4 x i32> [[VHSUBQ_V1_I]]) #4
+// CHECK:   [[VHSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VHSUBQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vhsubq_s32(int32x4_t a, int32x4_t b) {
   return vhsubq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vhsubq_u8
-// CHECK: vhsub.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vhsubq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VHSUBQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vhsubu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VHSUBQ_V_I]]
 uint8x16_t test_vhsubq_u8(uint8x16_t a, uint8x16_t b) {
   return vhsubq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vhsubq_u16
-// CHECK: vhsub.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vhsubq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VHSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vhsubu.v8i16(<8 x i16> [[VHSUBQ_V_I]], <8 x i16> [[VHSUBQ_V1_I]]) #4
+// CHECK:   [[VHSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VHSUBQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 uint16x8_t test_vhsubq_u16(uint16x8_t a, uint16x8_t b) {
   return vhsubq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vhsubq_u32
-// CHECK: vhsub.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vhsubq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VHSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VHSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VHSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vhsubu.v4i32(<4 x i32> [[VHSUBQ_V_I]], <4 x i32> [[VHSUBQ_V1_I]]) #4
+// CHECK:   [[VHSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VHSUBQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VHSUBQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 uint32x4_t test_vhsubq_u32(uint32x4_t a, uint32x4_t b) {
   return vhsubq_u32(a, b);
 }
 
 
-// CHECK-LABEL: test_vld1q_u8
-// CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <16 x i8> @test_vld1q_u8(i8* %a) #0 {
+// CHECK:   [[VLD1:%.*]] = call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0i8(i8* %a, i32 1)
+// CHECK:   ret <16 x i8> [[VLD1]]
 uint8x16_t test_vld1q_u8(uint8_t const * a) {
   return vld1q_u8(a);
 }
 
-// CHECK-LABEL: test_vld1q_u16
-// CHECK: vld1.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <8 x i16> @test_vld1q_u16(i16* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD1:%.*]] = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* [[TMP0]], i32 2)
+// CHECK:   ret <8 x i16> [[VLD1]]
 uint16x8_t test_vld1q_u16(uint16_t const * a) {
   return vld1q_u16(a);
 }
 
-// CHECK-LABEL: test_vld1q_u32
-// CHECK: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <4 x i32> @test_vld1q_u32(i32* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VLD1:%.*]] = call <4 x i32> @llvm.arm.neon.vld1.v4i32.p0i8(i8* [[TMP0]], i32 4)
+// CHECK:   ret <4 x i32> [[VLD1]]
 uint32x4_t test_vld1q_u32(uint32_t const * a) {
   return vld1q_u32(a);
 }
 
-// CHECK-LABEL: test_vld1q_u64
-// CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}{{(:64)?}}]
+// CHECK-LABEL: define <2 x i64> @test_vld1q_u64(i64* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VLD1:%.*]] = call <2 x i64> @llvm.arm.neon.vld1.v2i64.p0i8(i8* [[TMP0]], i32 4)
+// CHECK:   ret <2 x i64> [[VLD1]]
 uint64x2_t test_vld1q_u64(uint64_t const * a) {
   return vld1q_u64(a);
 }
 
-// CHECK-LABEL: test_vld1q_s8
-// CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <16 x i8> @test_vld1q_s8(i8* %a) #0 {
+// CHECK:   [[VLD1:%.*]] = call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0i8(i8* %a, i32 1)
+// CHECK:   ret <16 x i8> [[VLD1]]
 int8x16_t test_vld1q_s8(int8_t const * a) {
   return vld1q_s8(a);
 }
 
-// CHECK-LABEL: test_vld1q_s16
-// CHECK: vld1.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <8 x i16> @test_vld1q_s16(i16* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD1:%.*]] = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* [[TMP0]], i32 2)
+// CHECK:   ret <8 x i16> [[VLD1]]
 int16x8_t test_vld1q_s16(int16_t const * a) {
   return vld1q_s16(a);
 }
 
-// CHECK-LABEL: test_vld1q_s32
-// CHECK: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <4 x i32> @test_vld1q_s32(i32* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VLD1:%.*]] = call <4 x i32> @llvm.arm.neon.vld1.v4i32.p0i8(i8* [[TMP0]], i32 4)
+// CHECK:   ret <4 x i32> [[VLD1]]
 int32x4_t test_vld1q_s32(int32_t const * a) {
   return vld1q_s32(a);
 }
 
-// CHECK-LABEL: test_vld1q_s64
-// CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}{{(:64)?}}]
+// CHECK-LABEL: define <2 x i64> @test_vld1q_s64(i64* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VLD1:%.*]] = call <2 x i64> @llvm.arm.neon.vld1.v2i64.p0i8(i8* [[TMP0]], i32 4)
+// CHECK:   ret <2 x i64> [[VLD1]]
 int64x2_t test_vld1q_s64(int64_t const * a) {
   return vld1q_s64(a);
 }
 
-// CHECK-LABEL: test_vld1q_f16
-// CHECK: vld1.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <8 x half> @test_vld1q_f16(half* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VLD1:%.*]] = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* [[TMP0]], i32 2)
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[VLD1]] to <8 x half>
+// CHECK:   ret <8 x half> [[TMP1]]
 float16x8_t test_vld1q_f16(float16_t const * a) {
   return vld1q_f16(a);
 }
 
-// CHECK-LABEL: test_vld1q_f32
-// CHECK: vld1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <4 x float> @test_vld1q_f32(float* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VLD1:%.*]] = call <4 x float> @llvm.arm.neon.vld1.v4f32.p0i8(i8* [[TMP0]], i32 4)
+// CHECK:   ret <4 x float> [[VLD1]]
 float32x4_t test_vld1q_f32(float32_t const * a) {
   return vld1q_f32(a);
 }
 
-// CHECK-LABEL: test_vld1q_p8
-// CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <16 x i8> @test_vld1q_p8(i8* %a) #0 {
+// CHECK:   [[VLD1:%.*]] = call <16 x i8> @llvm.arm.neon.vld1.v16i8.p0i8(i8* %a, i32 1)
+// CHECK:   ret <16 x i8> [[VLD1]]
 poly8x16_t test_vld1q_p8(poly8_t const * a) {
   return vld1q_p8(a);
 }
 
-// CHECK-LABEL: test_vld1q_p16
-// CHECK: vld1.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <8 x i16> @test_vld1q_p16(i16* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD1:%.*]] = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* [[TMP0]], i32 2)
+// CHECK:   ret <8 x i16> [[VLD1]]
 poly16x8_t test_vld1q_p16(poly16_t const * a) {
   return vld1q_p16(a);
 }
 
-// CHECK-LABEL: test_vld1_u8
-// CHECK: vld1.8 {d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <8 x i8> @test_vld1_u8(i8* %a) #0 {
+// CHECK:   [[VLD1:%.*]] = call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0i8(i8* %a, i32 1)
+// CHECK:   ret <8 x i8> [[VLD1]]
 uint8x8_t test_vld1_u8(uint8_t const * a) {
   return vld1_u8(a);
 }
 
-// CHECK-LABEL: test_vld1_u16
-// CHECK: vld1.16 {d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <4 x i16> @test_vld1_u16(i16* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD1:%.*]] = call <4 x i16> @llvm.arm.neon.vld1.v4i16.p0i8(i8* [[TMP0]], i32 2)
+// CHECK:   ret <4 x i16> [[VLD1]]
 uint16x4_t test_vld1_u16(uint16_t const * a) {
   return vld1_u16(a);
 }
 
-// CHECK-LABEL: test_vld1_u32
-// CHECK: vld1.32 {d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <2 x i32> @test_vld1_u32(i32* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VLD1:%.*]] = call <2 x i32> @llvm.arm.neon.vld1.v2i32.p0i8(i8* [[TMP0]], i32 4)
+// CHECK:   ret <2 x i32> [[VLD1]]
 uint32x2_t test_vld1_u32(uint32_t const * a) {
   return vld1_u32(a);
 }
 
-// CHECK-LABEL: test_vld1_u64
-// CHECK: vld1.64 {d{{[0-9]+}}}, [r{{[0-9]+}}{{(:64)?}}]
+// CHECK-LABEL: define <1 x i64> @test_vld1_u64(i64* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VLD1:%.*]] = call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0i8(i8* [[TMP0]], i32 4)
+// CHECK:   ret <1 x i64> [[VLD1]]
 uint64x1_t test_vld1_u64(uint64_t const * a) {
   return vld1_u64(a);
 }
 
-// CHECK-LABEL: test_vld1_s8
-// CHECK: vld1.8 {d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <8 x i8> @test_vld1_s8(i8* %a) #0 {
+// CHECK:   [[VLD1:%.*]] = call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0i8(i8* %a, i32 1)
+// CHECK:   ret <8 x i8> [[VLD1]]
 int8x8_t test_vld1_s8(int8_t const * a) {
   return vld1_s8(a);
 }
 
-// CHECK-LABEL: test_vld1_s16
-// CHECK: vld1.16 {d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <4 x i16> @test_vld1_s16(i16* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD1:%.*]] = call <4 x i16> @llvm.arm.neon.vld1.v4i16.p0i8(i8* [[TMP0]], i32 2)
+// CHECK:   ret <4 x i16> [[VLD1]]
 int16x4_t test_vld1_s16(int16_t const * a) {
   return vld1_s16(a);
 }
 
-// CHECK-LABEL: test_vld1_s32
-// CHECK: vld1.32 {d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <2 x i32> @test_vld1_s32(i32* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VLD1:%.*]] = call <2 x i32> @llvm.arm.neon.vld1.v2i32.p0i8(i8* [[TMP0]], i32 4)
+// CHECK:   ret <2 x i32> [[VLD1]]
 int32x2_t test_vld1_s32(int32_t const * a) {
   return vld1_s32(a);
 }
 
-// CHECK-LABEL: test_vld1_s64
-// CHECK: vld1.64 {d{{[0-9]+}}}, [r{{[0-9]+}}{{(:64)?}}]
+// CHECK-LABEL: define <1 x i64> @test_vld1_s64(i64* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VLD1:%.*]] = call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0i8(i8* [[TMP0]], i32 4)
+// CHECK:   ret <1 x i64> [[VLD1]]
 int64x1_t test_vld1_s64(int64_t const * a) {
   return vld1_s64(a);
 }
 
-// CHECK-LABEL: test_vld1_f16
-// CHECK: vld1.16 {d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <4 x half> @test_vld1_f16(half* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VLD1:%.*]] = call <4 x i16> @llvm.arm.neon.vld1.v4i16.p0i8(i8* [[TMP0]], i32 2)
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VLD1]] to <4 x half>
+// CHECK:   ret <4 x half> [[TMP1]]
 float16x4_t test_vld1_f16(float16_t const * a) {
   return vld1_f16(a);
 }
 
-// CHECK-LABEL: test_vld1_f32
-// CHECK: vld1.32 {d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <2 x float> @test_vld1_f32(float* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VLD1:%.*]] = call <2 x float> @llvm.arm.neon.vld1.v2f32.p0i8(i8* [[TMP0]], i32 4)
+// CHECK:   ret <2 x float> [[VLD1]]
 float32x2_t test_vld1_f32(float32_t const * a) {
   return vld1_f32(a);
 }
 
-// CHECK-LABEL: test_vld1_p8
-// CHECK: vld1.8 {d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <8 x i8> @test_vld1_p8(i8* %a) #0 {
+// CHECK:   [[VLD1:%.*]] = call <8 x i8> @llvm.arm.neon.vld1.v8i8.p0i8(i8* %a, i32 1)
+// CHECK:   ret <8 x i8> [[VLD1]]
 poly8x8_t test_vld1_p8(poly8_t const * a) {
   return vld1_p8(a);
 }
 
-// CHECK-LABEL: test_vld1_p16
-// CHECK: vld1.16 {d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <4 x i16> @test_vld1_p16(i16* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD1:%.*]] = call <4 x i16> @llvm.arm.neon.vld1.v4i16.p0i8(i8* [[TMP0]], i32 2)
+// CHECK:   ret <4 x i16> [[VLD1]]
 poly16x4_t test_vld1_p16(poly16_t const * a) {
   return vld1_p16(a);
 }
 
 
-// CHECK-LABEL: test_vld1q_dup_u8
-// CHECK: vld1.8 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <16 x i8> @test_vld1q_dup_u8(i8* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
+// CHECK:   [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[TMP0]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
+// CHECK:   ret <16 x i8> [[LANE]]
 uint8x16_t test_vld1q_dup_u8(uint8_t const * a) {
   return vld1q_dup_u8(a);
 }
 
-// CHECK-LABEL: test_vld1q_dup_u16
-// CHECK: vld1.16 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: define <8 x i16> @test_vld1q_dup_u16(i16* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK:   [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
+// CHECK:   ret <8 x i16> [[LANE]]
 uint16x8_t test_vld1q_dup_u16(uint16_t const * a) {
   return vld1q_dup_u16(a);
 }
 
-// CHECK-LABEL: test_vld1q_dup_u32
-// CHECK: vld1.32 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}:32]
+// CHECK-LABEL: define <4 x i32> @test_vld1q_dup_u32(i32* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK:   ret <4 x i32> [[LANE]]
 uint32x4_t test_vld1q_dup_u32(uint32_t const * a) {
   return vld1q_dup_u32(a);
 }
 
-// CHECK-LABEL: test_vld1q_dup_u64
-// CHECK: {{ldr|vldr|vmov}}
+// CHECK-LABEL: define <2 x i64> @test_vld1q_dup_u64(i64* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   [[TMP2:%.*]] = load i64, i64* [[TMP1]], align 4
+// CHECK:   [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK:   ret <2 x i64> [[LANE]]
 uint64x2_t test_vld1q_dup_u64(uint64_t const * a) {
   return vld1q_dup_u64(a);
 }
 
-// CHECK-LABEL: test_vld1q_dup_s8
-// CHECK: vld1.8 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <16 x i8> @test_vld1q_dup_s8(i8* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
+// CHECK:   [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[TMP0]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
+// CHECK:   ret <16 x i8> [[LANE]]
 int8x16_t test_vld1q_dup_s8(int8_t const * a) {
   return vld1q_dup_s8(a);
 }
 
-// CHECK-LABEL: test_vld1q_dup_s16
-// CHECK: vld1.16 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: define <8 x i16> @test_vld1q_dup_s16(i16* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK:   [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
+// CHECK:   ret <8 x i16> [[LANE]]
 int16x8_t test_vld1q_dup_s16(int16_t const * a) {
   return vld1q_dup_s16(a);
 }
 
-// CHECK-LABEL: test_vld1q_dup_s32
-// CHECK: vld1.32 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}:32]
+// CHECK-LABEL: define <4 x i32> @test_vld1q_dup_s32(i32* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK:   ret <4 x i32> [[LANE]]
 int32x4_t test_vld1q_dup_s32(int32_t const * a) {
   return vld1q_dup_s32(a);
 }
 
-// CHECK-LABEL: test_vld1q_dup_s64
-// CHECK: {{ldr|vldr|vmov}}
+// CHECK-LABEL: define <2 x i64> @test_vld1q_dup_s64(i64* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   [[TMP2:%.*]] = load i64, i64* [[TMP1]], align 4
+// CHECK:   [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK:   ret <2 x i64> [[LANE]]
 int64x2_t test_vld1q_dup_s64(int64_t const * a) {
   return vld1q_dup_s64(a);
 }
 
-// CHECK-LABEL: test_vld1q_dup_f16
-// CHECK: vld1.16 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: define <8 x half> @test_vld1q_dup_f16(half* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK:   [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[LANE]] to <8 x half>
+// CHECK:   ret <8 x half> [[TMP4]]
 float16x8_t test_vld1q_dup_f16(float16_t const * a) {
   return vld1q_dup_f16(a);
 }
 
-// CHECK-LABEL: test_vld1q_dup_f32
-// CHECK: vld1.32 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}:32]
+// CHECK-LABEL: define <4 x float> @test_vld1q_dup_f32(float* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to float*
+// CHECK:   [[TMP2:%.*]] = load float, float* [[TMP1]], align 4
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK:   ret <4 x float> [[LANE]]
 float32x4_t test_vld1q_dup_f32(float32_t const * a) {
   return vld1q_dup_f32(a);
 }
 
-// CHECK-LABEL: test_vld1q_dup_p8
-// CHECK: vld1.8 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <16 x i8> @test_vld1q_dup_p8(i8* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
+// CHECK:   [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[TMP0]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> [[TMP1]], <16 x i32> zeroinitializer
+// CHECK:   ret <16 x i8> [[LANE]]
 poly8x16_t test_vld1q_dup_p8(poly8_t const * a) {
   return vld1q_dup_p8(a);
 }
 
-// CHECK-LABEL: test_vld1q_dup_p16
-// CHECK: vld1.16 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: define <8 x i16> @test_vld1q_dup_p16(i16* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK:   [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
+// CHECK:   ret <8 x i16> [[LANE]]
 poly16x8_t test_vld1q_dup_p16(poly16_t const * a) {
   return vld1q_dup_p16(a);
 }
 
-// CHECK-LABEL: test_vld1_dup_u8
-// CHECK: vld1.8 {d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <8 x i8> @test_vld1_dup_u8(i8* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
+// CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 [[TMP0]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK:   ret <8 x i8> [[LANE]]
 uint8x8_t test_vld1_dup_u8(uint8_t const * a) {
   return vld1_dup_u8(a);
 }
 
-// CHECK-LABEL: test_vld1_dup_u16
-// CHECK: vld1.16 {d{{[0-9]+}}[]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: define <4 x i16> @test_vld1_dup_u16(i16* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK:   ret <4 x i16> [[LANE]]
 uint16x4_t test_vld1_dup_u16(uint16_t const * a) {
   return vld1_dup_u16(a);
 }
 
-// CHECK-LABEL: test_vld1_dup_u32
-// CHECK: vld1.32 {d{{[0-9]+}}[]}, [r{{[0-9]+}}:32]
+// CHECK-LABEL: define <2 x i32> @test_vld1_dup_u32(i32* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK:   [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK:   ret <2 x i32> [[LANE]]
 uint32x2_t test_vld1_dup_u32(uint32_t const * a) {
   return vld1_dup_u32(a);
 }
 
-// CHECK-LABEL: test_vld1_dup_u64
-// CHECK: {{ldr|vldr|vmov}}
+// CHECK-LABEL: define <1 x i64> @test_vld1_dup_u64(i64* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   [[TMP2:%.*]] = load i64, i64* [[TMP1]], align 4
+// CHECK:   [[TMP3:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK:   ret <1 x i64> [[LANE]]
 uint64x1_t test_vld1_dup_u64(uint64_t const * a) {
   return vld1_dup_u64(a);
 }
 
-// CHECK-LABEL: test_vld1_dup_s8
-// CHECK: vld1.8 {d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <8 x i8> @test_vld1_dup_s8(i8* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
+// CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 [[TMP0]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK:   ret <8 x i8> [[LANE]]
 int8x8_t test_vld1_dup_s8(int8_t const * a) {
   return vld1_dup_s8(a);
 }
 
-// CHECK-LABEL: test_vld1_dup_s16
-// CHECK: vld1.16 {d{{[0-9]+}}[]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: define <4 x i16> @test_vld1_dup_s16(i16* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK:   ret <4 x i16> [[LANE]]
 int16x4_t test_vld1_dup_s16(int16_t const * a) {
   return vld1_dup_s16(a);
 }
 
-// CHECK-LABEL: test_vld1_dup_s32
-// CHECK: vld1.32 {d{{[0-9]+}}[]}, [r{{[0-9]+}}:32]
+// CHECK-LABEL: define <2 x i32> @test_vld1_dup_s32(i32* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
+// CHECK:   [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK:   ret <2 x i32> [[LANE]]
 int32x2_t test_vld1_dup_s32(int32_t const * a) {
   return vld1_dup_s32(a);
 }
 
-// CHECK-LABEL: test_vld1_dup_s64
-// CHECK: {{ldr|vldr|vmov}}
+// CHECK-LABEL: define <1 x i64> @test_vld1_dup_s64(i64* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   [[TMP2:%.*]] = load i64, i64* [[TMP1]], align 4
+// CHECK:   [[TMP3:%.*]] = insertelement <1 x i64> undef, i64 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP3]], <1 x i32> zeroinitializer
+// CHECK:   ret <1 x i64> [[LANE]]
 int64x1_t test_vld1_dup_s64(int64_t const * a) {
   return vld1_dup_s64(a);
 }
 
-// CHECK-LABEL: test_vld1_dup_f16
-// CHECK: vld1.16 {d{{[0-9]+}}[]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: define <4 x half> @test_vld1_dup_f16(half* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <4 x half>
+// CHECK:   ret <4 x half> [[TMP4]]
 float16x4_t test_vld1_dup_f16(float16_t const * a) {
   return vld1_dup_f16(a);
 }
 
-// CHECK-LABEL: test_vld1_dup_f32
-// CHECK: vld1.32 {d{{[0-9]+}}[]}, [r{{[0-9]+}}:32]
+// CHECK-LABEL: define <2 x float> @test_vld1_dup_f32(float* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to float*
+// CHECK:   [[TMP2:%.*]] = load float, float* [[TMP1]], align 4
+// CHECK:   [[TMP3:%.*]] = insertelement <2 x float> undef, float [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> [[TMP3]], <2 x i32> zeroinitializer
+// CHECK:   ret <2 x float> [[LANE]]
 float32x2_t test_vld1_dup_f32(float32_t const * a) {
   return vld1_dup_f32(a);
 }
 
-// CHECK-LABEL: test_vld1_dup_p8
-// CHECK: vld1.8 {d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <8 x i8> @test_vld1_dup_p8(i8* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
+// CHECK:   [[TMP1:%.*]] = insertelement <8 x i8> undef, i8 [[TMP0]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK:   ret <8 x i8> [[LANE]]
 poly8x8_t test_vld1_dup_p8(poly8_t const * a) {
   return vld1_dup_p8(a);
 }
 
-// CHECK-LABEL: test_vld1_dup_p16
-// CHECK: vld1.16 {d{{[0-9]+}}[]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: define <4 x i16> @test_vld1_dup_p16(i16* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK:   ret <4 x i16> [[LANE]]
 poly16x4_t test_vld1_dup_p16(poly16_t const * a) {
   return vld1_dup_p16(a);
 }
 
 
-// CHECK-LABEL: test_vld1q_lane_u8
-// CHECK: vld1.8 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <16 x i8> @test_vld1q_lane_u8(i8* %a, <16 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
+// CHECK:   ret <16 x i8> [[VLD1_LANE]]
 uint8x16_t test_vld1q_lane_u8(uint8_t const * a, uint8x16_t b) {
   return vld1q_lane_u8(a, b, 15);
 }
 
-// CHECK-LABEL: test_vld1q_lane_u16
-// CHECK: vld1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: define <8 x i16> @test_vld1q_lane_u16(i16* %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]], align 2
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
+// CHECK:   ret <8 x i16> [[VLD1_LANE]]
 uint16x8_t test_vld1q_lane_u16(uint16_t const * a, uint16x8_t b) {
   return vld1q_lane_u16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld1q_lane_u32
-// CHECK: vld1.32 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:32]
+// CHECK-LABEL: define <4 x i32> @test_vld1q_lane_u32(i32* %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP4]], i32 3
+// CHECK:   ret <4 x i32> [[VLD1_LANE]]
 uint32x4_t test_vld1q_lane_u32(uint32_t const * a, uint32x4_t b) {
   return vld1q_lane_u32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld1q_lane_u64
-// CHECK: {{ldr|vldr|vmov}}
+// CHECK-LABEL: define <2 x i64> @test_vld1q_lane_u64(i64* %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP2]], <1 x i32> zeroinitializer
+// CHECK:   [[TMP4:%.*]] = call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0i8(i8* [[TMP0]], i32 4)
+// CHECK:   [[VLD1Q_LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], <2 x i32> <i32 0, i32 1>
+// CHECK:   ret <2 x i64> [[VLD1Q_LANE]]
 uint64x2_t test_vld1q_lane_u64(uint64_t const * a, uint64x2_t b) {
   return vld1q_lane_u64(a, b, 1);
 }
 
-// CHECK-LABEL: test_vld1q_lane_s8
-// CHECK: vld1.8 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <16 x i8> @test_vld1q_lane_s8(i8* %a, <16 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
+// CHECK:   ret <16 x i8> [[VLD1_LANE]]
 int8x16_t test_vld1q_lane_s8(int8_t const * a, int8x16_t b) {
   return vld1q_lane_s8(a, b, 15);
 }
 
-// CHECK-LABEL: test_vld1q_lane_s16
-// CHECK: vld1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: define <8 x i16> @test_vld1q_lane_s16(i16* %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]], align 2
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
+// CHECK:   ret <8 x i16> [[VLD1_LANE]]
 int16x8_t test_vld1q_lane_s16(int16_t const * a, int16x8_t b) {
   return vld1q_lane_s16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld1q_lane_s32
-// CHECK: vld1.32 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:32]
+// CHECK-LABEL: define <4 x i32> @test_vld1q_lane_s32(i32* %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP4]], i32 3
+// CHECK:   ret <4 x i32> [[VLD1_LANE]]
 int32x4_t test_vld1q_lane_s32(int32_t const * a, int32x4_t b) {
   return vld1q_lane_s32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld1q_lane_s64
-// CHECK: {{ldr|vldr|vmov}}
+// CHECK-LABEL: define <2 x i64> @test_vld1q_lane_s64(i64* %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP2]], <1 x i32> zeroinitializer
+// CHECK:   [[TMP4:%.*]] = call <1 x i64> @llvm.arm.neon.vld1.v1i64.p0i8(i8* [[TMP0]], i32 4)
+// CHECK:   [[VLD1Q_LANE:%.*]] = shufflevector <1 x i64> [[TMP3]], <1 x i64> [[TMP4]], <2 x i32> <i32 0, i32 1>
+// CHECK:   ret <2 x i64> [[VLD1Q_LANE]]
 int64x2_t test_vld1q_lane_s64(int64_t const * a, int64x2_t b) {
   return vld1q_lane_s64(a, b, 1);
 }
 
-// CHECK-LABEL: test_vld1q_lane_f16
-// CHECK: vld1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: define <8 x half> @test_vld1q_lane_f16(half* %a, <8 x half> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]], align 2
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[VLD1_LANE]] to <8 x half>
+// CHECK:   ret <8 x half> [[TMP5]]
 float16x8_t test_vld1q_lane_f16(float16_t const * a, float16x8_t b) {
   return vld1q_lane_f16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld1q_lane_f32
-// CHECK: vld1.32 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:32]
+// CHECK-LABEL: define <4 x float> @test_vld1q_lane_f32(float* %a, <4 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to float*
+// CHECK:   [[TMP4:%.*]] = load float, float* [[TMP3]], align 4
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP4]], i32 3
+// CHECK:   ret <4 x float> [[VLD1_LANE]]
 float32x4_t test_vld1q_lane_f32(float32_t const * a, float32x4_t b) {
   return vld1q_lane_f32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld1q_lane_p8
-// CHECK: vld1.8 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <16 x i8> @test_vld1q_lane_p8(i8* %a, <16 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <16 x i8> %b, i8 [[TMP0]], i32 15
+// CHECK:   ret <16 x i8> [[VLD1_LANE]]
 poly8x16_t test_vld1q_lane_p8(poly8_t const * a, poly8x16_t b) {
   return vld1q_lane_p8(a, b, 15);
 }
 
-// CHECK-LABEL: test_vld1q_lane_p16
-// CHECK: vld1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: define <8 x i16> @test_vld1q_lane_p16(i16* %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]], align 2
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
+// CHECK:   ret <8 x i16> [[VLD1_LANE]]
 poly16x8_t test_vld1q_lane_p16(poly16_t const * a, poly16x8_t b) {
   return vld1q_lane_p16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld1_lane_u8
-// CHECK: vld1.8 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <8 x i8> @test_vld1_lane_u8(i8* %a, <8 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
+// CHECK:   ret <8 x i8> [[VLD1_LANE]]
 uint8x8_t test_vld1_lane_u8(uint8_t const * a, uint8x8_t b) {
   return vld1_lane_u8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld1_lane_u16
-// CHECK: vld1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: define <4 x i16> @test_vld1_lane_u16(i16* %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]], align 2
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
+// CHECK:   ret <4 x i16> [[VLD1_LANE]]
 uint16x4_t test_vld1_lane_u16(uint16_t const * a, uint16x4_t b) {
   return vld1_lane_u16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld1_lane_u32
-// CHECK: vld1.32 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:32]
+// CHECK-LABEL: define <2 x i32> @test_vld1_lane_u32(i32* %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP4]], i32 1
+// CHECK:   ret <2 x i32> [[VLD1_LANE]]
 uint32x2_t test_vld1_lane_u32(uint32_t const * a, uint32x2_t b) {
   return vld1_lane_u32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vld1_lane_u64
-// CHECK: {{ldr|vldr|vmov}}
+// CHECK-LABEL: define <1 x i64> @test_vld1_lane_u64(i64* %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   [[TMP4:%.*]] = load i64, i64* [[TMP3]], align 4
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0
+// CHECK:   ret <1 x i64> [[VLD1_LANE]]
 uint64x1_t test_vld1_lane_u64(uint64_t const * a, uint64x1_t b) {
   return vld1_lane_u64(a, b, 0);
 }
 
-// CHECK-LABEL: test_vld1_lane_s8
-// CHECK: vld1.8 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <8 x i8> @test_vld1_lane_s8(i8* %a, <8 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
+// CHECK:   ret <8 x i8> [[VLD1_LANE]]
 int8x8_t test_vld1_lane_s8(int8_t const * a, int8x8_t b) {
   return vld1_lane_s8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld1_lane_s16
-// CHECK: vld1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: define <4 x i16> @test_vld1_lane_s16(i16* %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]], align 2
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
+// CHECK:   ret <4 x i16> [[VLD1_LANE]]
 int16x4_t test_vld1_lane_s16(int16_t const * a, int16x4_t b) {
   return vld1_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld1_lane_s32
-// CHECK: vld1.32 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:32]
+// CHECK-LABEL: define <2 x i32> @test_vld1_lane_s32(i32* %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   [[TMP4:%.*]] = load i32, i32* [[TMP3]], align 4
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[TMP4]], i32 1
+// CHECK:   ret <2 x i32> [[VLD1_LANE]]
 int32x2_t test_vld1_lane_s32(int32_t const * a, int32x2_t b) {
   return vld1_lane_s32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vld1_lane_s64
-// CHECK: {{ldr|vldr|vmov}}
+// CHECK-LABEL: define <1 x i64> @test_vld1_lane_s64(i64* %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   [[TMP4:%.*]] = load i64, i64* [[TMP3]], align 4
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <1 x i64> [[TMP2]], i64 [[TMP4]], i32 0
+// CHECK:   ret <1 x i64> [[VLD1_LANE]]
 int64x1_t test_vld1_lane_s64(int64_t const * a, int64x1_t b) {
   return vld1_lane_s64(a, b, 0);
 }
 
-// CHECK-LABEL: test_vld1_lane_f16
-// CHECK: vld1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: define <4 x half> @test_vld1_lane_f16(half* %a, <4 x half> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]], align 2
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[VLD1_LANE]] to <4 x half>
+// CHECK:   ret <4 x half> [[TMP5]]
 float16x4_t test_vld1_lane_f16(float16_t const * a, float16x4_t b) {
   return vld1_lane_f16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld1_lane_f32
-// CHECK: vld1.32 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:32]
+// CHECK-LABEL: define <2 x float> @test_vld1_lane_f32(float* %a, <2 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to float*
+// CHECK:   [[TMP4:%.*]] = load float, float* [[TMP3]], align 4
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP4]], i32 1
+// CHECK:   ret <2 x float> [[VLD1_LANE]]
 float32x2_t test_vld1_lane_f32(float32_t const * a, float32x2_t b) {
   return vld1_lane_f32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vld1_lane_p8
-// CHECK: vld1.8 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define <8 x i8> @test_vld1_lane_p8(i8* %a, <8 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = load i8, i8* %a, align 1
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i8> %b, i8 [[TMP0]], i32 7
+// CHECK:   ret <8 x i8> [[VLD1_LANE]]
 poly8x8_t test_vld1_lane_p8(poly8_t const * a, poly8x8_t b) {
   return vld1_lane_p8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld1_lane_p16
-// CHECK: vld1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: define <4 x i16> @test_vld1_lane_p16(i16* %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]], align 2
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
+// CHECK:   ret <4 x i16> [[VLD1_LANE]]
 poly16x4_t test_vld1_lane_p16(poly16_t const * a, poly16x4_t b) {
   return vld1_lane_p16(a, b, 3);
 }
 
 
-// CHECK-LABEL: test_vld2q_u8
-// CHECK: vld2.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld2q_u8(%struct.uint8x16x2_t* noalias sret %agg.result, i8* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
+// CHECK:   [[VLD2Q_V:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8.p0i8(i8* %a, i32 1)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8> } [[VLD2Q_V]], { <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x2_t* %agg.result to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 32, i32 16, i1 false)
+// CHECK:   ret void
 uint8x16x2_t test_vld2q_u8(uint8_t const * a) {
   return vld2q_u8(a);
 }
 
-// CHECK-LABEL: test_vld2q_u16
-// CHECK: vld2.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld2q_u16(%struct.uint16x8x2_t* noalias sret %agg.result, i16* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2.v8i16.p0i8(i8* [[TMP1]], i32 2)
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2Q_V]], { <8 x i16>, <8 x i16> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x8x2_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
+// CHECK:   ret void
 uint16x8x2_t test_vld2q_u16(uint16_t const * a) {
   return vld2q_u16(a);
 }
 
-// CHECK-LABEL: test_vld2q_u32
-// CHECK: vld2.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld2q_u32(%struct.uint32x4x2_t* noalias sret %agg.result, i32* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VLD2Q_V:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP1]], i32 4)
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32> } [[VLD2Q_V]], { <4 x i32>, <4 x i32> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x4x2_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
+// CHECK:   ret void
 uint32x4x2_t test_vld2q_u32(uint32_t const * a) {
   return vld2q_u32(a);
 }
 
-// CHECK-LABEL: test_vld2q_s8
-// CHECK: vld2.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld2q_s8(%struct.int8x16x2_t* noalias sret %agg.result, i8* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
+// CHECK:   [[VLD2Q_V:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8.p0i8(i8* %a, i32 1)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8> } [[VLD2Q_V]], { <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x2_t* %agg.result to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 32, i32 16, i1 false)
+// CHECK:   ret void
 int8x16x2_t test_vld2q_s8(int8_t const * a) {
   return vld2q_s8(a);
 }
 
-// CHECK-LABEL: test_vld2q_s16
-// CHECK: vld2.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld2q_s16(%struct.int16x8x2_t* noalias sret %agg.result, i16* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2.v8i16.p0i8(i8* [[TMP1]], i32 2)
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2Q_V]], { <8 x i16>, <8 x i16> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x8x2_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
+// CHECK:   ret void
 int16x8x2_t test_vld2q_s16(int16_t const * a) {
   return vld2q_s16(a);
 }
 
-// CHECK-LABEL: test_vld2q_s32
-// CHECK: vld2.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld2q_s32(%struct.int32x4x2_t* noalias sret %agg.result, i32* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VLD2Q_V:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP1]], i32 4)
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32> } [[VLD2Q_V]], { <4 x i32>, <4 x i32> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x4x2_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
+// CHECK:   ret void
 int32x4x2_t test_vld2q_s32(int32_t const * a) {
   return vld2q_s32(a);
 }
 
-// CHECK-LABEL: test_vld2q_f16
-// CHECK: vld2.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld2q_f16(%struct.float16x8x2_t* noalias sret %agg.result, half* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2.v8i16.p0i8(i8* [[TMP1]], i32 2)
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2Q_V]], { <8 x i16>, <8 x i16> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x8x2_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
+// CHECK:   ret void
 float16x8x2_t test_vld2q_f16(float16_t const * a) {
   return vld2q_f16(a);
 }
 
-// CHECK-LABEL: test_vld2q_f32
-// CHECK: vld2.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld2q_f32(%struct.float32x4x2_t* noalias sret %agg.result, float* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VLD2Q_V:%.*]] = call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32.p0i8(i8* [[TMP1]], i32 4)
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float> }*
+// CHECK:   store { <4 x float>, <4 x float> } [[VLD2Q_V]], { <4 x float>, <4 x float> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x4x2_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
+// CHECK:   ret void
 float32x4x2_t test_vld2q_f32(float32_t const * a) {
   return vld2q_f32(a);
 }
 
-// CHECK-LABEL: test_vld2q_p8
-// CHECK: vld2.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld2q_p8(%struct.poly8x16x2_t* noalias sret %agg.result, i8* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
+// CHECK:   [[VLD2Q_V:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8.p0i8(i8* %a, i32 1)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8> } [[VLD2Q_V]], { <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x2_t* %agg.result to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 32, i32 16, i1 false)
+// CHECK:   ret void
 poly8x16x2_t test_vld2q_p8(poly8_t const * a) {
   return vld2q_p8(a);
 }
 
-// CHECK-LABEL: test_vld2q_p16
-// CHECK: vld2.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld2q_p16(%struct.poly16x8x2_t* noalias sret %agg.result, i16* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2.v8i16.p0i8(i8* [[TMP1]], i32 2)
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2Q_V]], { <8 x i16>, <8 x i16> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x8x2_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
+// CHECK:   ret void
 poly16x8x2_t test_vld2q_p16(poly16_t const * a) {
   return vld2q_p16(a);
 }
 
-// CHECK-LABEL: test_vld2_u8
-// CHECK: vld2.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld2_u8(%struct.uint8x8x2_t* noalias sret %agg.result, i8* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
+// CHECK:   [[VLD2_V:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8.p0i8(i8* %a, i32 1)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2_V]], { <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* %agg.result to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 16, i32 8, i1 false)
+// CHECK:   ret void
 uint8x8x2_t test_vld2_u8(uint8_t const * a) {
   return vld2_u8(a);
 }
 
-// CHECK-LABEL: test_vld2_u16
-// CHECK: vld2.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld2_u16(%struct.uint16x4x2_t* noalias sret %agg.result, i16* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD2_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2.v4i16.p0i8(i8* [[TMP1]], i32 2)
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2_V]], { <4 x i16>, <4 x i16> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x4x2_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
+// CHECK:   ret void
 uint16x4x2_t test_vld2_u16(uint16_t const * a) {
   return vld2_u16(a);
 }
 
-// CHECK-LABEL: test_vld2_u32
-// CHECK: vld2.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld2_u32(%struct.uint32x2x2_t* noalias sret %agg.result, i32* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VLD2_V:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32.p0i8(i8* [[TMP1]], i32 4)
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32> } [[VLD2_V]], { <2 x i32>, <2 x i32> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x2x2_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
+// CHECK:   ret void
 uint32x2x2_t test_vld2_u32(uint32_t const * a) {
   return vld2_u32(a);
 }
 
-// CHECK-LABEL: test_vld2_u64
-// CHECK: vld1.64
+// CHECK-LABEL: define void @test_vld2_u64(%struct.uint64x1x2_t* noalias sret %agg.result, i64* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VLD2_V:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64.p0i8(i8* [[TMP1]], i32 4)
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD2_V]], { <1 x i64>, <1 x i64> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint64x1x2_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
+// CHECK:   ret void
 uint64x1x2_t test_vld2_u64(uint64_t const * a) {
   return vld2_u64(a);
 }
 
-// CHECK-LABEL: test_vld2_s8
-// CHECK: vld2.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld2_s8(%struct.int8x8x2_t* noalias sret %agg.result, i8* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
+// CHECK:   [[VLD2_V:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8.p0i8(i8* %a, i32 1)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2_V]], { <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* %agg.result to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 16, i32 8, i1 false)
+// CHECK:   ret void
 int8x8x2_t test_vld2_s8(int8_t const * a) {
   return vld2_s8(a);
 }
 
-// CHECK-LABEL: test_vld2_s16
-// CHECK: vld2.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld2_s16(%struct.int16x4x2_t* noalias sret %agg.result, i16* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD2_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2.v4i16.p0i8(i8* [[TMP1]], i32 2)
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2_V]], { <4 x i16>, <4 x i16> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x4x2_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
+// CHECK:   ret void
 int16x4x2_t test_vld2_s16(int16_t const * a) {
   return vld2_s16(a);
 }
 
-// CHECK-LABEL: test_vld2_s32
-// CHECK: vld2.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld2_s32(%struct.int32x2x2_t* noalias sret %agg.result, i32* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VLD2_V:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32.p0i8(i8* [[TMP1]], i32 4)
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32> } [[VLD2_V]], { <2 x i32>, <2 x i32> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x2x2_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
+// CHECK:   ret void
 int32x2x2_t test_vld2_s32(int32_t const * a) {
   return vld2_s32(a);
 }
 
-// CHECK-LABEL: test_vld2_s64
-// CHECK: vld1.64
+// CHECK-LABEL: define void @test_vld2_s64(%struct.int64x1x2_t* noalias sret %agg.result, i64* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VLD2_V:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64.p0i8(i8* [[TMP1]], i32 4)
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD2_V]], { <1 x i64>, <1 x i64> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int64x1x2_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
+// CHECK:   ret void
 int64x1x2_t test_vld2_s64(int64_t const * a) {
   return vld2_s64(a);
 }
 
-// CHECK-LABEL: test_vld2_f16
-// CHECK: vld2.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld2_f16(%struct.float16x4x2_t* noalias sret %agg.result, half* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VLD2_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2.v4i16.p0i8(i8* [[TMP1]], i32 2)
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2_V]], { <4 x i16>, <4 x i16> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x4x2_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
+// CHECK:   ret void
 float16x4x2_t test_vld2_f16(float16_t const * a) {
   return vld2_f16(a);
 }
 
-// CHECK-LABEL: test_vld2_f32
-// CHECK: vld2.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld2_f32(%struct.float32x2x2_t* noalias sret %agg.result, float* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VLD2_V:%.*]] = call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2.v2f32.p0i8(i8* [[TMP1]], i32 4)
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float> }*
+// CHECK:   store { <2 x float>, <2 x float> } [[VLD2_V]], { <2 x float>, <2 x float> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x2x2_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
+// CHECK:   ret void
 float32x2x2_t test_vld2_f32(float32_t const * a) {
   return vld2_f32(a);
 }
 
-// CHECK-LABEL: test_vld2_p8
-// CHECK: vld2.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld2_p8(%struct.poly8x8x2_t* noalias sret %agg.result, i8* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
+// CHECK:   [[VLD2_V:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8.p0i8(i8* %a, i32 1)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2_V]], { <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* %agg.result to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 16, i32 8, i1 false)
+// CHECK:   ret void
 poly8x8x2_t test_vld2_p8(poly8_t const * a) {
   return vld2_p8(a);
 }
 
-// CHECK-LABEL: test_vld2_p16
-// CHECK: vld2.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld2_p16(%struct.poly16x4x2_t* noalias sret %agg.result, i16* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD2_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2.v4i16.p0i8(i8* [[TMP1]], i32 2)
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2_V]], { <4 x i16>, <4 x i16> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x4x2_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
+// CHECK:   ret void
 poly16x4x2_t test_vld2_p16(poly16_t const * a) {
   return vld2_p16(a);
 }
 
 
-// CHECK-LABEL: test_vld2_dup_u8
-// CHECK: vld2.8 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld2_dup_u8(%struct.uint8x8x2_t* noalias sret %agg.result, i8* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %a, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
+// CHECK:   [[TMP1:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD_DUP]], 0
+// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK:   [[TMP2:%.*]] = insertvalue { <8 x i8>, <8 x i8> } [[VLD_DUP]], <8 x i8> [[LANE]], 0
+// CHECK:   [[TMP3:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[TMP2]], 1
+// CHECK:   [[LANE1:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP3]], <8 x i32> zeroinitializer
+// CHECK:   [[TMP4:%.*]] = insertvalue { <8 x i8>, <8 x i8> } [[TMP2]], <8 x i8> [[LANE1]], 1
+// CHECK:   [[TMP5:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8> } [[TMP4]], { <8 x i8>, <8 x i8> }* [[TMP5]]
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.uint8x8x2_t* %agg.result to i8*
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP6]], i8* [[TMP7]], i32 16, i32 8, i1 false)
+// CHECK:   ret void
 uint8x8x2_t test_vld2_dup_u8(uint8_t const * a) {
   return vld2_dup_u8(a);
 }
 
-// CHECK-LABEL: test_vld2_dup_u16
-// CHECK: vld2.16 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld2_dup_u16(%struct.uint16x4x2_t* noalias sret %agg.result, i16* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
+// CHECK:   [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD_DUP]], 0
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer
+// CHECK:   [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0
+// CHECK:   [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[TMP3]], 1
+// CHECK:   [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer
+// CHECK:   [[TMP5:%.*]] = insertvalue { <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1
+// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[TMP5]], { <4 x i16>, <4 x i16> }* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint16x4x2_t* %agg.result to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false)
+// CHECK:   ret void
 uint16x4x2_t test_vld2_dup_u16(uint16_t const * a) {
   return vld2_dup_u16(a);
 }
 
-// CHECK-LABEL: test_vld2_dup_u32
-// CHECK: vld2.32 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld2_dup_u32(%struct.uint32x2x2_t* noalias sret %agg.result, i32* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32.p0i8(i8* [[TMP1]], <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
+// CHECK:   [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLD_DUP]], 0
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer
+// CHECK:   [[TMP3:%.*]] = insertvalue { <2 x i32>, <2 x i32> } [[VLD_DUP]], <2 x i32> [[LANE]], 0
+// CHECK:   [[TMP4:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[TMP3]], 1
+// CHECK:   [[LANE1:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP4]], <2 x i32> zeroinitializer
+// CHECK:   [[TMP5:%.*]] = insertvalue { <2 x i32>, <2 x i32> } [[TMP3]], <2 x i32> [[LANE1]], 1
+// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32> } [[TMP5]], { <2 x i32>, <2 x i32> }* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint32x2x2_t* %agg.result to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false)
+// CHECK:   ret void
 uint32x2x2_t test_vld2_dup_u32(uint32_t const * a) {
   return vld2_dup_u32(a);
 }
 
-// CHECK-LABEL: test_vld2_dup_u64
-// CHECK: vld1.64
+// CHECK-LABEL: define void @test_vld2_dup_u64(%struct.uint64x1x2_t* noalias sret %agg.result, i64* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64.p0i8(i8* [[TMP1]], i32 4)
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD_DUP]], { <1 x i64>, <1 x i64> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint64x1x2_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
+// CHECK:   ret void
 uint64x1x2_t test_vld2_dup_u64(uint64_t const * a) {
   return vld2_dup_u64(a);
 }
 
-// CHECK-LABEL: test_vld2_dup_s8
-// CHECK: vld2.8 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld2_dup_s8(%struct.int8x8x2_t* noalias sret %agg.result, i8* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %a, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
+// CHECK:   [[TMP1:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD_DUP]], 0
+// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK:   [[TMP2:%.*]] = insertvalue { <8 x i8>, <8 x i8> } [[VLD_DUP]], <8 x i8> [[LANE]], 0
+// CHECK:   [[TMP3:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[TMP2]], 1
+// CHECK:   [[LANE1:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP3]], <8 x i32> zeroinitializer
+// CHECK:   [[TMP4:%.*]] = insertvalue { <8 x i8>, <8 x i8> } [[TMP2]], <8 x i8> [[LANE1]], 1
+// CHECK:   [[TMP5:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8> } [[TMP4]], { <8 x i8>, <8 x i8> }* [[TMP5]]
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.int8x8x2_t* %agg.result to i8*
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP6]], i8* [[TMP7]], i32 16, i32 8, i1 false)
+// CHECK:   ret void
 int8x8x2_t test_vld2_dup_s8(int8_t const * a) {
   return vld2_dup_s8(a);
 }
 
-// CHECK-LABEL: test_vld2_dup_s16
-// CHECK: vld2.16 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld2_dup_s16(%struct.int16x4x2_t* noalias sret %agg.result, i16* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
+// CHECK:   [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD_DUP]], 0
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer
+// CHECK:   [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0
+// CHECK:   [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[TMP3]], 1
+// CHECK:   [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer
+// CHECK:   [[TMP5:%.*]] = insertvalue { <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1
+// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[TMP5]], { <4 x i16>, <4 x i16> }* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.int16x4x2_t* %agg.result to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false)
+// CHECK:   ret void
 int16x4x2_t test_vld2_dup_s16(int16_t const * a) {
   return vld2_dup_s16(a);
 }
 
-// CHECK-LABEL: test_vld2_dup_s32
-// CHECK: vld2.32 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld2_dup_s32(%struct.int32x2x2_t* noalias sret %agg.result, i32* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32.p0i8(i8* [[TMP1]], <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
+// CHECK:   [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[VLD_DUP]], 0
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer
+// CHECK:   [[TMP3:%.*]] = insertvalue { <2 x i32>, <2 x i32> } [[VLD_DUP]], <2 x i32> [[LANE]], 0
+// CHECK:   [[TMP4:%.*]] = extractvalue { <2 x i32>, <2 x i32> } [[TMP3]], 1
+// CHECK:   [[LANE1:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP4]], <2 x i32> zeroinitializer
+// CHECK:   [[TMP5:%.*]] = insertvalue { <2 x i32>, <2 x i32> } [[TMP3]], <2 x i32> [[LANE1]], 1
+// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32> } [[TMP5]], { <2 x i32>, <2 x i32> }* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.int32x2x2_t* %agg.result to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false)
+// CHECK:   ret void
 int32x2x2_t test_vld2_dup_s32(int32_t const * a) {
   return vld2_dup_s32(a);
 }
 
-// CHECK-LABEL: test_vld2_dup_s64
-// CHECK: vld1.64
+// CHECK-LABEL: define void @test_vld2_dup_s64(%struct.int64x1x2_t* noalias sret %agg.result, i64* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64.p0i8(i8* [[TMP1]], i32 4)
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64> } [[VLD_DUP]], { <1 x i64>, <1 x i64> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int64x1x2_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false)
+// CHECK:   ret void
 int64x1x2_t test_vld2_dup_s64(int64_t const * a) {
   return vld2_dup_s64(a);
 }
 
-// CHECK-LABEL: test_vld2_dup_f16
-// CHECK: vld2.16 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld2_dup_f16(%struct.float16x4x2_t* noalias sret %agg.result, half* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
+// CHECK:   [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD_DUP]], 0
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer
+// CHECK:   [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0
+// CHECK:   [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[TMP3]], 1
+// CHECK:   [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer
+// CHECK:   [[TMP5:%.*]] = insertvalue { <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1
+// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[TMP5]], { <4 x i16>, <4 x i16> }* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.float16x4x2_t* %agg.result to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false)
+// CHECK:   ret void
 float16x4x2_t test_vld2_dup_f16(float16_t const * a) {
   return vld2_dup_f16(a);
 }
 
-// CHECK-LABEL: test_vld2_dup_f32
-// CHECK: vld2.32 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld2_dup_f32(%struct.float32x2x2_t* noalias sret %agg.result, float* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32.p0i8(i8* [[TMP1]], <2 x float> undef, <2 x float> undef, i32 0, i32 4)
+// CHECK:   [[TMP2:%.*]] = extractvalue { <2 x float>, <2 x float> } [[VLD_DUP]], 0
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP2]], <2 x i32> zeroinitializer
+// CHECK:   [[TMP3:%.*]] = insertvalue { <2 x float>, <2 x float> } [[VLD_DUP]], <2 x float> [[LANE]], 0
+// CHECK:   [[TMP4:%.*]] = extractvalue { <2 x float>, <2 x float> } [[TMP3]], 1
+// CHECK:   [[LANE1:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP4]], <2 x i32> zeroinitializer
+// CHECK:   [[TMP5:%.*]] = insertvalue { <2 x float>, <2 x float> } [[TMP3]], <2 x float> [[LANE1]], 1
+// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float> }*
+// CHECK:   store { <2 x float>, <2 x float> } [[TMP5]], { <2 x float>, <2 x float> }* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.float32x2x2_t* %agg.result to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false)
+// CHECK:   ret void
 float32x2x2_t test_vld2_dup_f32(float32_t const * a) {
   return vld2_dup_f32(a);
 }
 
-// CHECK-LABEL: test_vld2_dup_p8
-// CHECK: vld2.8 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld2_dup_p8(%struct.poly8x8x2_t* noalias sret %agg.result, i8* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %a, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
+// CHECK:   [[TMP1:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[VLD_DUP]], 0
+// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK:   [[TMP2:%.*]] = insertvalue { <8 x i8>, <8 x i8> } [[VLD_DUP]], <8 x i8> [[LANE]], 0
+// CHECK:   [[TMP3:%.*]] = extractvalue { <8 x i8>, <8 x i8> } [[TMP2]], 1
+// CHECK:   [[LANE1:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP3]], <8 x i32> zeroinitializer
+// CHECK:   [[TMP4:%.*]] = insertvalue { <8 x i8>, <8 x i8> } [[TMP2]], <8 x i8> [[LANE1]], 1
+// CHECK:   [[TMP5:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8> } [[TMP4]], { <8 x i8>, <8 x i8> }* [[TMP5]]
+// CHECK:   [[TMP6:%.*]] = bitcast %struct.poly8x8x2_t* %agg.result to i8*
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP6]], i8* [[TMP7]], i32 16, i32 8, i1 false)
+// CHECK:   ret void
 poly8x8x2_t test_vld2_dup_p8(poly8_t const * a) {
   return vld2_dup_p8(a);
 }
 
-// CHECK-LABEL: test_vld2_dup_p16
-// CHECK: vld2.16 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld2_dup_p16(%struct.poly16x4x2_t* noalias sret %agg.result, i16* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
+// CHECK:   [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[VLD_DUP]], 0
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer
+// CHECK:   [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0
+// CHECK:   [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16> } [[TMP3]], 1
+// CHECK:   [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer
+// CHECK:   [[TMP5:%.*]] = insertvalue { <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1
+// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[TMP5]], { <4 x i16>, <4 x i16> }* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly16x4x2_t* %agg.result to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false)
+// CHECK:   ret void
 poly16x4x2_t test_vld2_dup_p16(poly16_t const * a) {
   return vld2_dup_p16(a);
 }
 
 
-// CHECK-LABEL: test_vld2q_lane_u16
-// CHECK: vld2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld2q_lane_u16(%struct.uint16x8x2_t* noalias sret %agg.result, i16* %a, [4 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP9]], <8 x i16> [[TMP10]], i32 7, i32 2)
+// CHECK:   [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2Q_LANE_V]], { <8 x i16>, <8 x i16> }* [[TMP11]]
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.uint16x8x2_t* %agg.result to i8*
+// CHECK:   [[TMP13:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 32, i32 16, i1 false)
+// CHECK:   ret void
 uint16x8x2_t test_vld2q_lane_u16(uint16_t const * a, uint16x8x2_t b) {
   return vld2q_lane_u16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld2q_lane_u32
-// CHECK: vld2.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld2q_lane_u32(%struct.uint32x4x2_t* noalias sret %agg.result, i32* %a, [4 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK:   [[VLD2Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32.p0i8(i8* [[TMP4]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], i32 3, i32 4)
+// CHECK:   [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32> } [[VLD2Q_LANE_V]], { <4 x i32>, <4 x i32> }* [[TMP11]]
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.uint32x4x2_t* %agg.result to i8*
+// CHECK:   [[TMP13:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 32, i32 16, i1 false)
+// CHECK:   ret void
 uint32x4x2_t test_vld2q_lane_u32(uint32_t const * a, uint32x4x2_t b) {
   return vld2q_lane_u32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld2q_lane_s16
-// CHECK: vld2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld2q_lane_s16(%struct.int16x8x2_t* noalias sret %agg.result, i16* %a, [4 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP9]], <8 x i16> [[TMP10]], i32 7, i32 2)
+// CHECK:   [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2Q_LANE_V]], { <8 x i16>, <8 x i16> }* [[TMP11]]
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.int16x8x2_t* %agg.result to i8*
+// CHECK:   [[TMP13:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 32, i32 16, i1 false)
+// CHECK:   ret void
 int16x8x2_t test_vld2q_lane_s16(int16_t const * a, int16x8x2_t b) {
   return vld2q_lane_s16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld2q_lane_s32
-// CHECK: vld2.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld2q_lane_s32(%struct.int32x4x2_t* noalias sret %agg.result, i32* %a, [4 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK:   [[VLD2Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32.p0i8(i8* [[TMP4]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], i32 3, i32 4)
+// CHECK:   [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32> } [[VLD2Q_LANE_V]], { <4 x i32>, <4 x i32> }* [[TMP11]]
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.int32x4x2_t* %agg.result to i8*
+// CHECK:   [[TMP13:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 32, i32 16, i1 false)
+// CHECK:   ret void
 int32x4x2_t test_vld2q_lane_s32(int32_t const * a, int32x4x2_t b) {
   return vld2q_lane_s32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld2q_lane_f16
-// CHECK: vld2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld2q_lane_f16(%struct.float16x8x2_t* noalias sret %agg.result, half* %a, [4 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x half>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP9]], <8 x i16> [[TMP10]], i32 7, i32 2)
+// CHECK:   [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2Q_LANE_V]], { <8 x i16>, <8 x i16> }* [[TMP11]]
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.float16x8x2_t* %agg.result to i8*
+// CHECK:   [[TMP13:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 32, i32 16, i1 false)
+// CHECK:   ret void
 float16x8x2_t test_vld2q_lane_f16(float16_t const * a, float16x8x2_t b) {
   return vld2q_lane_f16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld2q_lane_f32
-// CHECK: vld2.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld2q_lane_f32(%struct.float32x4x2_t* noalias sret %agg.result, float* %a, [4 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x float>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
+// CHECK:   [[VLD2Q_LANE_V:%.*]] = call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32.p0i8(i8* [[TMP4]], <4 x float> [[TMP9]], <4 x float> [[TMP10]], i32 3, i32 4)
+// CHECK:   [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <4 x float>, <4 x float> }*
+// CHECK:   store { <4 x float>, <4 x float> } [[VLD2Q_LANE_V]], { <4 x float>, <4 x float> }* [[TMP11]]
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.float32x4x2_t* %agg.result to i8*
+// CHECK:   [[TMP13:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 32, i32 16, i1 false)
+// CHECK:   ret void
 float32x4x2_t test_vld2q_lane_f32(float32_t const * a, float32x4x2_t b) {
   return vld2q_lane_f32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld2q_lane_p16
-// CHECK: vld2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld2q_lane_p16(%struct.poly16x8x2_t* noalias sret %agg.result, i16* %a, [4 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP9]], <8 x i16> [[TMP10]], i32 7, i32 2)
+// CHECK:   [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16> } [[VLD2Q_LANE_V]], { <8 x i16>, <8 x i16> }* [[TMP11]]
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.poly16x8x2_t* %agg.result to i8*
+// CHECK:   [[TMP13:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 32, i32 16, i1 false)
+// CHECK:   ret void
 poly16x8x2_t test_vld2q_lane_p16(poly16_t const * a, poly16x8x2_t b) {
   return vld2q_lane_p16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld2_lane_u8
-// CHECK: vld2.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld2_lane_u8(%struct.uint8x8x2_t* noalias sret %agg.result, i8* %a, [2 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VLD2_LANE_V:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 7, i32 1)
+// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP3]] to { <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2_LANE_V]], { <8 x i8>, <8 x i8> }* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint8x8x2_t* %agg.result to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false)
+// CHECK:   ret void
 uint8x8x2_t test_vld2_lane_u8(uint8_t const * a, uint8x8x2_t b) {
   return vld2_lane_u8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld2_lane_u16
-// CHECK: vld2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld2_lane_u16(%struct.uint16x4x2_t* noalias sret %agg.result, i16* %a, [2 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i16>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], i32 3, i32 2)
+// CHECK:   [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2_LANE_V]], { <4 x i16>, <4 x i16> }* [[TMP11]]
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.uint16x4x2_t* %agg.result to i8*
+// CHECK:   [[TMP13:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 16, i32 8, i1 false)
+// CHECK:   ret void
 uint16x4x2_t test_vld2_lane_u16(uint16_t const * a, uint16x4x2_t b) {
   return vld2_lane_u16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld2_lane_u32
-// CHECK: vld2.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld2_lane_u32(%struct.uint32x2x2_t* noalias sret %agg.result, i32* %a, [2 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <2 x i32>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK:   [[VLD2_LANE_V:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32.p0i8(i8* [[TMP4]], <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], i32 1, i32 4)
+// CHECK:   [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32> } [[VLD2_LANE_V]], { <2 x i32>, <2 x i32> }* [[TMP11]]
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.uint32x2x2_t* %agg.result to i8*
+// CHECK:   [[TMP13:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 16, i32 8, i1 false)
+// CHECK:   ret void
 uint32x2x2_t test_vld2_lane_u32(uint32_t const * a, uint32x2x2_t b) {
   return vld2_lane_u32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vld2_lane_s8
-// CHECK: vld2.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld2_lane_s8(%struct.int8x8x2_t* noalias sret %agg.result, i8* %a, [2 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VLD2_LANE_V:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 7, i32 1)
+// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP3]] to { <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2_LANE_V]], { <8 x i8>, <8 x i8> }* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.int8x8x2_t* %agg.result to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false)
+// CHECK:   ret void
 int8x8x2_t test_vld2_lane_s8(int8_t const * a, int8x8x2_t b) {
   return vld2_lane_s8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld2_lane_s16
-// CHECK: vld2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld2_lane_s16(%struct.int16x4x2_t* noalias sret %agg.result, i16* %a, [2 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i16>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], i32 3, i32 2)
+// CHECK:   [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2_LANE_V]], { <4 x i16>, <4 x i16> }* [[TMP11]]
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.int16x4x2_t* %agg.result to i8*
+// CHECK:   [[TMP13:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 16, i32 8, i1 false)
+// CHECK:   ret void
 int16x4x2_t test_vld2_lane_s16(int16_t const * a, int16x4x2_t b) {
   return vld2_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld2_lane_s32
-// CHECK: vld2.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld2_lane_s32(%struct.int32x2x2_t* noalias sret %agg.result, i32* %a, [2 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <2 x i32>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK:   [[VLD2_LANE_V:%.*]] = call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32.p0i8(i8* [[TMP4]], <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], i32 1, i32 4)
+// CHECK:   [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32> } [[VLD2_LANE_V]], { <2 x i32>, <2 x i32> }* [[TMP11]]
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.int32x2x2_t* %agg.result to i8*
+// CHECK:   [[TMP13:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 16, i32 8, i1 false)
+// CHECK:   ret void
 int32x2x2_t test_vld2_lane_s32(int32_t const * a, int32x2x2_t b) {
   return vld2_lane_s32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vld2_lane_f16
-// CHECK: vld2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld2_lane_f16(%struct.float16x4x2_t* noalias sret %agg.result, half* %a, [2 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x half>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], i32 3, i32 2)
+// CHECK:   [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2_LANE_V]], { <4 x i16>, <4 x i16> }* [[TMP11]]
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.float16x4x2_t* %agg.result to i8*
+// CHECK:   [[TMP13:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 16, i32 8, i1 false)
+// CHECK:   ret void
 float16x4x2_t test_vld2_lane_f16(float16_t const * a, float16x4x2_t b) {
   return vld2_lane_f16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld2_lane_f32
-// CHECK: vld2.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld2_lane_f32(%struct.float32x2x2_t* noalias sret %agg.result, float* %a, [2 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <2 x float>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
+// CHECK:   [[VLD2_LANE_V:%.*]] = call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32.p0i8(i8* [[TMP4]], <2 x float> [[TMP9]], <2 x float> [[TMP10]], i32 1, i32 4)
+// CHECK:   [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <2 x float>, <2 x float> }*
+// CHECK:   store { <2 x float>, <2 x float> } [[VLD2_LANE_V]], { <2 x float>, <2 x float> }* [[TMP11]]
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.float32x2x2_t* %agg.result to i8*
+// CHECK:   [[TMP13:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 16, i32 8, i1 false)
+// CHECK:   ret void
 float32x2x2_t test_vld2_lane_f32(float32_t const * a, float32x2x2_t b) {
   return vld2_lane_f32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vld2_lane_p8
-// CHECK: vld2.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld2_lane_p8(%struct.poly8x8x2_t* noalias sret %agg.result, i8* %a, [2 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VLD2_LANE_V:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8.p0i8(i8* %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 7, i32 1)
+// CHECK:   [[TMP6:%.*]] = bitcast i8* [[TMP3]] to { <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8> } [[VLD2_LANE_V]], { <8 x i8>, <8 x i8> }* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly8x8x2_t* %agg.result to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false)
+// CHECK:   ret void
 poly8x8x2_t test_vld2_lane_p8(poly8_t const * a, poly8x8x2_t b) {
   return vld2_lane_p8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld2_lane_p16
-// CHECK: vld2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld2_lane_p16(%struct.poly16x4x2_t* noalias sret %agg.result, i16* %a, [2 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i16>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP9]], <4 x i16> [[TMP10]], i32 3, i32 2)
+// CHECK:   [[TMP11:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16> } [[VLD2_LANE_V]], { <4 x i16>, <4 x i16> }* [[TMP11]]
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.poly16x4x2_t* %agg.result to i8*
+// CHECK:   [[TMP13:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP12]], i8* [[TMP13]], i32 16, i32 8, i1 false)
+// CHECK:   ret void
 poly16x4x2_t test_vld2_lane_p16(poly16_t const * a, poly16x4x2_t b) {
   return vld2_lane_p16(a, b, 3);
 }
 
 
-// CHECK-LABEL: test_vld3q_u8
-// CHECK: vld3.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define void @test_vld3q_u8(%struct.uint8x16x3_t* noalias sret %agg.result, i8* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
+// CHECK:   [[VLD3Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8.p0i8(i8* %a, i32 1)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3Q_V]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x3_t* %agg.result to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 48, i32 16, i1 false)
+// CHECK:   ret void
 uint8x16x3_t test_vld3q_u8(uint8_t const * a) {
   return vld3q_u8(a);
 }
 
-// CHECK-LABEL: test_vld3q_u16
-// CHECK: vld3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define void @test_vld3q_u16(%struct.uint16x8x3_t* noalias sret %agg.result, i16* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3.v8i16.p0i8(i8* [[TMP1]], i32 2)
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_V]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x8x3_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 48, i32 16, i1 false)
+// CHECK:   ret void
 uint16x8x3_t test_vld3q_u16(uint16_t const * a) {
   return vld3q_u16(a);
 }
 
-// CHECK-LABEL: test_vld3q_u32
-// CHECK: vld3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define void @test_vld3q_u32(%struct.uint32x4x3_t* noalias sret %agg.result, i32* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VLD3Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32.p0i8(i8* [[TMP1]], i32 4)
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_V]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x4x3_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 48, i32 16, i1 false)
+// CHECK:   ret void
 uint32x4x3_t test_vld3q_u32(uint32_t const * a) {
   return vld3q_u32(a);
 }
 
-// CHECK-LABEL: test_vld3q_s8
-// CHECK: vld3.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define void @test_vld3q_s8(%struct.int8x16x3_t* noalias sret %agg.result, i8* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
+// CHECK:   [[VLD3Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8.p0i8(i8* %a, i32 1)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3Q_V]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x3_t* %agg.result to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 48, i32 16, i1 false)
+// CHECK:   ret void
 int8x16x3_t test_vld3q_s8(int8_t const * a) {
   return vld3q_s8(a);
 }
 
-// CHECK-LABEL: test_vld3q_s16
-// CHECK: vld3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define void @test_vld3q_s16(%struct.int16x8x3_t* noalias sret %agg.result, i16* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3.v8i16.p0i8(i8* [[TMP1]], i32 2)
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_V]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x8x3_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 48, i32 16, i1 false)
+// CHECK:   ret void
 int16x8x3_t test_vld3q_s16(int16_t const * a) {
   return vld3q_s16(a);
 }
 
-// CHECK-LABEL: test_vld3q_s32
-// CHECK: vld3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define void @test_vld3q_s32(%struct.int32x4x3_t* noalias sret %agg.result, i32* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VLD3Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32.p0i8(i8* [[TMP1]], i32 4)
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_V]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x4x3_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 48, i32 16, i1 false)
+// CHECK:   ret void
 int32x4x3_t test_vld3q_s32(int32_t const * a) {
   return vld3q_s32(a);
 }
 
-// CHECK-LABEL: test_vld3q_f16
-// CHECK: vld3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define void @test_vld3q_f16(%struct.float16x8x3_t* noalias sret %agg.result, half* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3.v8i16.p0i8(i8* [[TMP1]], i32 2)
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_V]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x8x3_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 48, i32 16, i1 false)
+// CHECK:   ret void
 float16x8x3_t test_vld3q_f16(float16_t const * a) {
   return vld3q_f16(a);
 }
 
-// CHECK-LABEL: test_vld3q_f32
-// CHECK: vld3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define void @test_vld3q_f32(%struct.float32x4x3_t* noalias sret %agg.result, float* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VLD3Q_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3.v4f32.p0i8(i8* [[TMP1]], i32 4)
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float> }*
+// CHECK:   store { <4 x float>, <4 x float>, <4 x float> } [[VLD3Q_V]], { <4 x float>, <4 x float>, <4 x float> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x4x3_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 48, i32 16, i1 false)
+// CHECK:   ret void
 float32x4x3_t test_vld3q_f32(float32_t const * a) {
   return vld3q_f32(a);
 }
 
-// CHECK-LABEL: test_vld3q_p8
-// CHECK: vld3.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define void @test_vld3q_p8(%struct.poly8x16x3_t* noalias sret %agg.result, i8* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
+// CHECK:   [[VLD3Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8.p0i8(i8* %a, i32 1)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8> } [[VLD3Q_V]], { <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x3_t* %agg.result to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 48, i32 16, i1 false)
+// CHECK:   ret void
 poly8x16x3_t test_vld3q_p8(poly8_t const * a) {
   return vld3q_p8(a);
 }
 
-// CHECK-LABEL: test_vld3q_p16
-// CHECK: vld3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define void @test_vld3q_p16(%struct.poly16x8x3_t* noalias sret %agg.result, i16* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3.v8i16.p0i8(i8* [[TMP1]], i32 2)
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_V]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x8x3_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 48, i32 16, i1 false)
+// CHECK:   ret void
 poly16x8x3_t test_vld3q_p16(poly16_t const * a) {
   return vld3q_p16(a);
 }
 
-// CHECK-LABEL: test_vld3_u8
-// CHECK: vld3.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld3_u8(%struct.uint8x8x3_t* noalias sret %agg.result, i8* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
+// CHECK:   [[VLD3_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8.p0i8(i8* %a, i32 1)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_V]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* %agg.result to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 24, i32 8, i1 false)
+// CHECK:   ret void
 uint8x8x3_t test_vld3_u8(uint8_t const * a) {
   return vld3_u8(a);
 }
 
-// CHECK-LABEL: test_vld3_u16
-// CHECK: vld3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld3_u16(%struct.uint16x4x3_t* noalias sret %agg.result, i16* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD3_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16.p0i8(i8* [[TMP1]], i32 2)
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_V]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x4x3_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 24, i32 8, i1 false)
+// CHECK:   ret void
 uint16x4x3_t test_vld3_u16(uint16_t const * a) {
   return vld3_u16(a);
 }
 
-// CHECK-LABEL: test_vld3_u32
-// CHECK: vld3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld3_u32(%struct.uint32x2x3_t* noalias sret %agg.result, i32* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VLD3_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32.p0i8(i8* [[TMP1]], i32 4)
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_V]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x2x3_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 24, i32 8, i1 false)
+// CHECK:   ret void
 uint32x2x3_t test_vld3_u32(uint32_t const * a) {
   return vld3_u32(a);
 }
 
-// CHECK-LABEL: test_vld3_u64
-// CHECK: vld1.64
+// CHECK-LABEL: define void @test_vld3_u64(%struct.uint64x1x3_t* noalias sret %agg.result, i64* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VLD3_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64.p0i8(i8* [[TMP1]], i32 4)
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_V]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint64x1x3_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 24, i32 8, i1 false)
+// CHECK:   ret void
 uint64x1x3_t test_vld3_u64(uint64_t const * a) {
   return vld3_u64(a);
 }
 
-// CHECK-LABEL: test_vld3_s8
-// CHECK: vld3.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld3_s8(%struct.int8x8x3_t* noalias sret %agg.result, i8* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
+// CHECK:   [[VLD3_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8.p0i8(i8* %a, i32 1)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_V]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* %agg.result to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 24, i32 8, i1 false)
+// CHECK:   ret void
 int8x8x3_t test_vld3_s8(int8_t const * a) {
   return vld3_s8(a);
 }
 
-// CHECK-LABEL: test_vld3_s16
-// CHECK: vld3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld3_s16(%struct.int16x4x3_t* noalias sret %agg.result, i16* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD3_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16.p0i8(i8* [[TMP1]], i32 2)
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_V]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x4x3_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 24, i32 8, i1 false)
+// CHECK:   ret void
 int16x4x3_t test_vld3_s16(int16_t const * a) {
   return vld3_s16(a);
 }
 
-// CHECK-LABEL: test_vld3_s32
-// CHECK: vld3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld3_s32(%struct.int32x2x3_t* noalias sret %agg.result, i32* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VLD3_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32.p0i8(i8* [[TMP1]], i32 4)
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_V]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x2x3_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 24, i32 8, i1 false)
+// CHECK:   ret void
 int32x2x3_t test_vld3_s32(int32_t const * a) {
   return vld3_s32(a);
 }
 
-// CHECK-LABEL: test_vld3_s64
-// CHECK: vld1.64
+// CHECK-LABEL: define void @test_vld3_s64(%struct.int64x1x3_t* noalias sret %agg.result, i64* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VLD3_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64.p0i8(i8* [[TMP1]], i32 4)
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD3_V]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int64x1x3_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 24, i32 8, i1 false)
+// CHECK:   ret void
 int64x1x3_t test_vld3_s64(int64_t const * a) {
   return vld3_s64(a);
 }
 
-// CHECK-LABEL: test_vld3_f16
-// CHECK: vld3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld3_f16(%struct.float16x4x3_t* noalias sret %agg.result, half* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VLD3_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16.p0i8(i8* [[TMP1]], i32 2)
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_V]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x4x3_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 24, i32 8, i1 false)
+// CHECK:   ret void
 float16x4x3_t test_vld3_f16(float16_t const * a) {
   return vld3_f16(a);
 }
 
-// CHECK-LABEL: test_vld3_f32
-// CHECK: vld3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld3_f32(%struct.float32x2x3_t* noalias sret %agg.result, float* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VLD3_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3.v2f32.p0i8(i8* [[TMP1]], i32 4)
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float> }*
+// CHECK:   store { <2 x float>, <2 x float>, <2 x float> } [[VLD3_V]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x2x3_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 24, i32 8, i1 false)
+// CHECK:   ret void
 float32x2x3_t test_vld3_f32(float32_t const * a) {
   return vld3_f32(a);
 }
 
-// CHECK-LABEL: test_vld3_p8
-// CHECK: vld3.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld3_p8(%struct.poly8x8x3_t* noalias sret %agg.result, i8* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
+// CHECK:   [[VLD3_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8.p0i8(i8* %a, i32 1)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_V]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* %agg.result to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 24, i32 8, i1 false)
+// CHECK:   ret void
 poly8x8x3_t test_vld3_p8(poly8_t const * a) {
   return vld3_p8(a);
 }
 
-// CHECK-LABEL: test_vld3_p16
-// CHECK: vld3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld3_p16(%struct.poly16x4x3_t* noalias sret %agg.result, i16* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD3_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16.p0i8(i8* [[TMP1]], i32 2)
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_V]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x4x3_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 24, i32 8, i1 false)
+// CHECK:   ret void
 poly16x4x3_t test_vld3_p16(poly16_t const * a) {
   return vld3_p16(a);
 }
 
 
-// CHECK-LABEL: test_vld3_dup_u8
-// CHECK: vld3.8 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld3_dup_u8(%struct.uint8x8x3_t* noalias sret %agg.result, i8* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8.p0i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
+// CHECK:   [[TMP1:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], 0
+// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK:   [[TMP2:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], <8 x i8> [[LANE]], 0
+// CHECK:   [[TMP3:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], 1
+// CHECK:   [[LANE1:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP3]], <8 x i32> zeroinitializer
+// CHECK:   [[TMP4:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], <8 x i8> [[LANE1]], 1
+// CHECK:   [[TMP5:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 2
+// CHECK:   [[LANE2:%.*]] = shufflevector <8 x i8> [[TMP5]], <8 x i8> [[TMP5]], <8 x i32> zeroinitializer
+// CHECK:   [[TMP6:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], <8 x i8> [[LANE2]], 2
+// CHECK:   [[TMP7:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP6]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]]
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint8x8x3_t* %agg.result to i8*
+// CHECK:   [[TMP9:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP8]], i8* [[TMP9]], i32 24, i32 8, i1 false)
+// CHECK:   ret void
 uint8x8x3_t test_vld3_dup_u8(uint8_t const * a) {
   return vld3_dup_u8(a);
 }
 
-// CHECK-LABEL: test_vld3_dup_u16
-// CHECK: vld3.16 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld3_dup_u16(%struct.uint16x4x3_t* noalias sret %agg.result, i16* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
+// CHECK:   [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], 0
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer
+// CHECK:   [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0
+// CHECK:   [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], 1
+// CHECK:   [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer
+// CHECK:   [[TMP5:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1
+// CHECK:   [[TMP6:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], 2
+// CHECK:   [[LANE2:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> [[TMP6]], <4 x i32> zeroinitializer
+// CHECK:   [[TMP7:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], <4 x i16> [[LANE2]], 2
+// CHECK:   [[TMP8:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP8]]
+// CHECK:   [[TMP9:%.*]] = bitcast %struct.uint16x4x3_t* %agg.result to i8*
+// CHECK:   [[TMP10:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP9]], i8* [[TMP10]], i32 24, i32 8, i1 false)
+// CHECK:   ret void
 uint16x4x3_t test_vld3_dup_u16(uint16_t const * a) {
   return vld3_dup_u16(a);
 }
 
-// CHECK-LABEL: test_vld3_dup_u32
-// CHECK: vld3.32 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld3_dup_u32(%struct.uint32x2x3_t* noalias sret %agg.result, i32* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32.p0i8(i8* [[TMP1]], <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
+// CHECK:   [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD_DUP]], 0
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer
+// CHECK:   [[TMP3:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD_DUP]], <2 x i32> [[LANE]], 0
+// CHECK:   [[TMP4:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP3]], 1
+// CHECK:   [[LANE1:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP4]], <2 x i32> zeroinitializer
+// CHECK:   [[TMP5:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP3]], <2 x i32> [[LANE1]], 1
+// CHECK:   [[TMP6:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP5]], 2
+// CHECK:   [[LANE2:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> [[TMP6]], <2 x i32> zeroinitializer
+// CHECK:   [[TMP7:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP5]], <2 x i32> [[LANE2]], 2
+// CHECK:   [[TMP8:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP7]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP8]]
+// CHECK:   [[TMP9:%.*]] = bitcast %struct.uint32x2x3_t* %agg.result to i8*
+// CHECK:   [[TMP10:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP9]], i8* [[TMP10]], i32 24, i32 8, i1 false)
+// CHECK:   ret void
 uint32x2x3_t test_vld3_dup_u32(uint32_t const * a) {
   return vld3_dup_u32(a);
 }
 
-// CHECK-LABEL: test_vld3_dup_u64
-// CHECK: vld1.64
+// CHECK-LABEL: define void @test_vld3_dup_u64(%struct.uint64x1x3_t* noalias sret %agg.result, i64* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64.p0i8(i8* [[TMP1]], i32 4)
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD_DUP]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint64x1x3_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 24, i32 8, i1 false)
+// CHECK:   ret void
 uint64x1x3_t test_vld3_dup_u64(uint64_t const * a) {
   return vld3_dup_u64(a);
 }
 
-// CHECK-LABEL: test_vld3_dup_s8
-// CHECK: vld3.8 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld3_dup_s8(%struct.int8x8x3_t* noalias sret %agg.result, i8* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8.p0i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
+// CHECK:   [[TMP1:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], 0
+// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK:   [[TMP2:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], <8 x i8> [[LANE]], 0
+// CHECK:   [[TMP3:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], 1
+// CHECK:   [[LANE1:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP3]], <8 x i32> zeroinitializer
+// CHECK:   [[TMP4:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], <8 x i8> [[LANE1]], 1
+// CHECK:   [[TMP5:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 2
+// CHECK:   [[LANE2:%.*]] = shufflevector <8 x i8> [[TMP5]], <8 x i8> [[TMP5]], <8 x i32> zeroinitializer
+// CHECK:   [[TMP6:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], <8 x i8> [[LANE2]], 2
+// CHECK:   [[TMP7:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP6]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]]
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.int8x8x3_t* %agg.result to i8*
+// CHECK:   [[TMP9:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP8]], i8* [[TMP9]], i32 24, i32 8, i1 false)
+// CHECK:   ret void
 int8x8x3_t test_vld3_dup_s8(int8_t const * a) {
   return vld3_dup_s8(a);
 }
 
-// CHECK-LABEL: test_vld3_dup_s16
-// CHECK: vld3.16 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld3_dup_s16(%struct.int16x4x3_t* noalias sret %agg.result, i16* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
+// CHECK:   [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], 0
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer
+// CHECK:   [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0
+// CHECK:   [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], 1
+// CHECK:   [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer
+// CHECK:   [[TMP5:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1
+// CHECK:   [[TMP6:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], 2
+// CHECK:   [[LANE2:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> [[TMP6]], <4 x i32> zeroinitializer
+// CHECK:   [[TMP7:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], <4 x i16> [[LANE2]], 2
+// CHECK:   [[TMP8:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP8]]
+// CHECK:   [[TMP9:%.*]] = bitcast %struct.int16x4x3_t* %agg.result to i8*
+// CHECK:   [[TMP10:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP9]], i8* [[TMP10]], i32 24, i32 8, i1 false)
+// CHECK:   ret void
 int16x4x3_t test_vld3_dup_s16(int16_t const * a) {
   return vld3_dup_s16(a);
 }
 
-// CHECK-LABEL: test_vld3_dup_s32
-// CHECK: vld3.32 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld3_dup_s32(%struct.int32x2x3_t* noalias sret %agg.result, i32* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32.p0i8(i8* [[TMP1]], <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
+// CHECK:   [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD_DUP]], 0
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer
+// CHECK:   [[TMP3:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD_DUP]], <2 x i32> [[LANE]], 0
+// CHECK:   [[TMP4:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP3]], 1
+// CHECK:   [[LANE1:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP4]], <2 x i32> zeroinitializer
+// CHECK:   [[TMP5:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP3]], <2 x i32> [[LANE1]], 1
+// CHECK:   [[TMP6:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP5]], 2
+// CHECK:   [[LANE2:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> [[TMP6]], <2 x i32> zeroinitializer
+// CHECK:   [[TMP7:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP5]], <2 x i32> [[LANE2]], 2
+// CHECK:   [[TMP8:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[TMP7]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP8]]
+// CHECK:   [[TMP9:%.*]] = bitcast %struct.int32x2x3_t* %agg.result to i8*
+// CHECK:   [[TMP10:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP9]], i8* [[TMP10]], i32 24, i32 8, i1 false)
+// CHECK:   ret void
 int32x2x3_t test_vld3_dup_s32(int32_t const * a) {
   return vld3_dup_s32(a);
 }
 
-// CHECK-LABEL: test_vld3_dup_s64
-// CHECK: vld1.64
+// CHECK-LABEL: define void @test_vld3_dup_s64(%struct.int64x1x3_t* noalias sret %agg.result, i64* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64.p0i8(i8* [[TMP1]], i32 4)
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64> } [[VLD_DUP]], { <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int64x1x3_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 24, i32 8, i1 false)
+// CHECK:   ret void
 int64x1x3_t test_vld3_dup_s64(int64_t const * a) {
   return vld3_dup_s64(a);
 }
 
-// CHECK-LABEL: test_vld3_dup_f16
-// CHECK: vld3.16 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld3_dup_f16(%struct.float16x4x3_t* noalias sret %agg.result, half* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
+// CHECK:   [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], 0
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer
+// CHECK:   [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0
+// CHECK:   [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], 1
+// CHECK:   [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer
+// CHECK:   [[TMP5:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1
+// CHECK:   [[TMP6:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], 2
+// CHECK:   [[LANE2:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> [[TMP6]], <4 x i32> zeroinitializer
+// CHECK:   [[TMP7:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], <4 x i16> [[LANE2]], 2
+// CHECK:   [[TMP8:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP8]]
+// CHECK:   [[TMP9:%.*]] = bitcast %struct.float16x4x3_t* %agg.result to i8*
+// CHECK:   [[TMP10:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP9]], i8* [[TMP10]], i32 24, i32 8, i1 false)
+// CHECK:   ret void
 float16x4x3_t test_vld3_dup_f16(float16_t const * a) {
   return vld3_dup_f16(a);
 }
 
-// CHECK-LABEL: test_vld3_dup_f32
-// CHECK: vld3.32 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld3_dup_f32(%struct.float32x2x3_t* noalias sret %agg.result, float* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32.p0i8(i8* [[TMP1]], <2 x float> undef, <2 x float> undef, <2 x float> undef, i32 0, i32 4)
+// CHECK:   [[TMP2:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float> } [[VLD_DUP]], 0
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP2]], <2 x i32> zeroinitializer
+// CHECK:   [[TMP3:%.*]] = insertvalue { <2 x float>, <2 x float>, <2 x float> } [[VLD_DUP]], <2 x float> [[LANE]], 0
+// CHECK:   [[TMP4:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float> } [[TMP3]], 1
+// CHECK:   [[LANE1:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP4]], <2 x i32> zeroinitializer
+// CHECK:   [[TMP5:%.*]] = insertvalue { <2 x float>, <2 x float>, <2 x float> } [[TMP3]], <2 x float> [[LANE1]], 1
+// CHECK:   [[TMP6:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float> } [[TMP5]], 2
+// CHECK:   [[LANE2:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP6]], <2 x i32> zeroinitializer
+// CHECK:   [[TMP7:%.*]] = insertvalue { <2 x float>, <2 x float>, <2 x float> } [[TMP5]], <2 x float> [[LANE2]], 2
+// CHECK:   [[TMP8:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float> }*
+// CHECK:   store { <2 x float>, <2 x float>, <2 x float> } [[TMP7]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP8]]
+// CHECK:   [[TMP9:%.*]] = bitcast %struct.float32x2x3_t* %agg.result to i8*
+// CHECK:   [[TMP10:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP9]], i8* [[TMP10]], i32 24, i32 8, i1 false)
+// CHECK:   ret void
 float32x2x3_t test_vld3_dup_f32(float32_t const * a) {
   return vld3_dup_f32(a);
 }
 
-// CHECK-LABEL: test_vld3_dup_p8
-// CHECK: vld3.8 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld3_dup_p8(%struct.poly8x8x3_t* noalias sret %agg.result, i8* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8.p0i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
+// CHECK:   [[TMP1:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], 0
+// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK:   [[TMP2:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], <8 x i8> [[LANE]], 0
+// CHECK:   [[TMP3:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], 1
+// CHECK:   [[LANE1:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP3]], <8 x i32> zeroinitializer
+// CHECK:   [[TMP4:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], <8 x i8> [[LANE1]], 1
+// CHECK:   [[TMP5:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 2
+// CHECK:   [[LANE2:%.*]] = shufflevector <8 x i8> [[TMP5]], <8 x i8> [[TMP5]], <8 x i32> zeroinitializer
+// CHECK:   [[TMP6:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], <8 x i8> [[LANE2]], 2
+// CHECK:   [[TMP7:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[TMP6]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]]
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly8x8x3_t* %agg.result to i8*
+// CHECK:   [[TMP9:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP8]], i8* [[TMP9]], i32 24, i32 8, i1 false)
+// CHECK:   ret void
 poly8x8x3_t test_vld3_dup_p8(poly8_t const * a) {
   return vld3_dup_p8(a);
 }
 
-// CHECK-LABEL: test_vld3_dup_p16
-// CHECK: vld3.16 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld3_dup_p16(%struct.poly16x4x3_t* noalias sret %agg.result, i16* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
+// CHECK:   [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], 0
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer
+// CHECK:   [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0
+// CHECK:   [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], 1
+// CHECK:   [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer
+// CHECK:   [[TMP5:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1
+// CHECK:   [[TMP6:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], 2
+// CHECK:   [[LANE2:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> [[TMP6]], <4 x i32> zeroinitializer
+// CHECK:   [[TMP7:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], <4 x i16> [[LANE2]], 2
+// CHECK:   [[TMP8:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP8]]
+// CHECK:   [[TMP9:%.*]] = bitcast %struct.poly16x4x3_t* %agg.result to i8*
+// CHECK:   [[TMP10:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP9]], i8* [[TMP10]], i32 24, i32 8, i1 false)
+// CHECK:   ret void
 poly16x4x3_t test_vld3_dup_p16(poly16_t const * a) {
   return vld3_dup_p16(a);
 }
 
 
-// CHECK-LABEL: test_vld3q_lane_u16
-// CHECK: vld3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: define void @test_vld3q_lane_u16(%struct.uint16x8x3_t* noalias sret %agg.result, i16* %a, [6 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK:   [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], i32 7, i32 2)
+// CHECK:   [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_LANE_V]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP14]]
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.uint16x8x3_t* %agg.result to i8*
+// CHECK:   [[TMP16:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 48, i32 16, i1 false)
+// CHECK:   ret void
 uint16x8x3_t test_vld3q_lane_u16(uint16_t const * a, uint16x8x3_t b) {
   return vld3q_lane_u16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld3q_lane_u32
-// CHECK: vld3.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: define void @test_vld3q_lane_u32(%struct.uint32x4x3_t* noalias sret %agg.result, i32* %a, [6 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i32>]* [[COERCE_DIVE]] to [6 x i64]*
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
+// CHECK:   [[VLD3Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32.p0i8(i8* [[TMP4]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], i32 3, i32 4)
+// CHECK:   [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_LANE_V]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP14]]
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.uint32x4x3_t* %agg.result to i8*
+// CHECK:   [[TMP16:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 48, i32 16, i1 false)
+// CHECK:   ret void
 uint32x4x3_t test_vld3q_lane_u32(uint32_t const * a, uint32x4x3_t b) {
   return vld3q_lane_u32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld3q_lane_s16
-// CHECK: vld3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: define void @test_vld3q_lane_s16(%struct.int16x8x3_t* noalias sret %agg.result, i16* %a, [6 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK:   [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], i32 7, i32 2)
+// CHECK:   [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_LANE_V]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP14]]
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.int16x8x3_t* %agg.result to i8*
+// CHECK:   [[TMP16:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 48, i32 16, i1 false)
+// CHECK:   ret void
 int16x8x3_t test_vld3q_lane_s16(int16_t const * a, int16x8x3_t b) {
   return vld3q_lane_s16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld3q_lane_s32
-// CHECK: vld3.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: define void @test_vld3q_lane_s32(%struct.int32x4x3_t* noalias sret %agg.result, i32* %a, [6 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i32>]* [[COERCE_DIVE]] to [6 x i64]*
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
+// CHECK:   [[VLD3Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32.p0i8(i8* [[TMP4]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], i32 3, i32 4)
+// CHECK:   [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32> } [[VLD3Q_LANE_V]], { <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP14]]
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.int32x4x3_t* %agg.result to i8*
+// CHECK:   [[TMP16:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 48, i32 16, i1 false)
+// CHECK:   ret void
 int32x4x3_t test_vld3q_lane_s32(int32_t const * a, int32x4x3_t b) {
   return vld3q_lane_s32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld3q_lane_f16
-// CHECK: vld3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: define void @test_vld3q_lane_f16(%struct.float16x8x3_t* noalias sret %agg.result, half* %a, [6 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x half>]* [[COERCE_DIVE]] to [6 x i64]*
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP9:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK:   [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], i32 7, i32 2)
+// CHECK:   [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_LANE_V]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP14]]
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.float16x8x3_t* %agg.result to i8*
+// CHECK:   [[TMP16:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 48, i32 16, i1 false)
+// CHECK:   ret void
 float16x8x3_t test_vld3q_lane_f16(float16_t const * a, float16x8x3_t b) {
   return vld3q_lane_f16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld3q_lane_f32
-// CHECK: vld3.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: define void @test_vld3q_lane_f32(%struct.float32x4x3_t* noalias sret %agg.result, float* %a, [6 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x float>]* [[COERCE_DIVE]] to [6 x i64]*
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP9:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x float>
+// CHECK:   [[VLD3Q_LANE_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32.p0i8(i8* [[TMP4]], <4 x float> [[TMP11]], <4 x float> [[TMP12]], <4 x float> [[TMP13]], i32 3, i32 4)
+// CHECK:   [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <4 x float>, <4 x float>, <4 x float> }*
+// CHECK:   store { <4 x float>, <4 x float>, <4 x float> } [[VLD3Q_LANE_V]], { <4 x float>, <4 x float>, <4 x float> }* [[TMP14]]
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.float32x4x3_t* %agg.result to i8*
+// CHECK:   [[TMP16:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 48, i32 16, i1 false)
+// CHECK:   ret void
 float32x4x3_t test_vld3q_lane_f32(float32_t const * a, float32x4x3_t b) {
   return vld3q_lane_f32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld3q_lane_p16
-// CHECK: vld3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: define void @test_vld3q_lane_p16(%struct.poly16x8x3_t* noalias sret %agg.result, i16* %a, [6 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK:   [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], i32 7, i32 2)
+// CHECK:   [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16> } [[VLD3Q_LANE_V]], { <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP14]]
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.poly16x8x3_t* %agg.result to i8*
+// CHECK:   [[TMP16:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 48, i32 16, i1 false)
+// CHECK:   ret void
 poly16x8x3_t test_vld3q_lane_p16(poly16_t const * a, poly16x8x3_t b) {
   return vld3q_lane_p16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld3_lane_u8
-// CHECK: vld3.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld3_lane_u8(%struct.uint8x8x3_t* noalias sret %agg.result, i8* %a, [3 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VLD3_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8.p0i8(i8* %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 7, i32 1)
+// CHECK:   [[TMP7:%.*]] = bitcast i8* [[TMP3]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE_V]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]]
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint8x8x3_t* %agg.result to i8*
+// CHECK:   [[TMP9:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP8]], i8* [[TMP9]], i32 24, i32 8, i1 false)
+// CHECK:   ret void
 uint8x8x3_t test_vld3_lane_u8(uint8_t const * a, uint8x8x3_t b) {
   return vld3_lane_u8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld3_lane_u16
-// CHECK: vld3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld3_lane_u16(%struct.uint16x4x3_t* noalias sret %agg.result, i16* %a, [3 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i16>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK:   [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], i32 3, i32 2)
+// CHECK:   [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE_V]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP14]]
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.uint16x4x3_t* %agg.result to i8*
+// CHECK:   [[TMP16:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 24, i32 8, i1 false)
+// CHECK:   ret void
 uint16x4x3_t test_vld3_lane_u16(uint16_t const * a, uint16x4x3_t b) {
   return vld3_lane_u16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld3_lane_u32
-// CHECK: vld3.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld3_lane_u32(%struct.uint32x2x3_t* noalias sret %agg.result, i32* %a, [3 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <2 x i32>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
+// CHECK:   [[VLD3_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32.p0i8(i8* [[TMP4]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], i32 1, i32 4)
+// CHECK:   [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE_V]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP14]]
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.uint32x2x3_t* %agg.result to i8*
+// CHECK:   [[TMP16:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 24, i32 8, i1 false)
+// CHECK:   ret void
 uint32x2x3_t test_vld3_lane_u32(uint32_t const * a, uint32x2x3_t b) {
   return vld3_lane_u32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vld3_lane_s8
-// CHECK: vld3.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld3_lane_s8(%struct.int8x8x3_t* noalias sret %agg.result, i8* %a, [3 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VLD3_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8.p0i8(i8* %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 7, i32 1)
+// CHECK:   [[TMP7:%.*]] = bitcast i8* [[TMP3]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE_V]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]]
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.int8x8x3_t* %agg.result to i8*
+// CHECK:   [[TMP9:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP8]], i8* [[TMP9]], i32 24, i32 8, i1 false)
+// CHECK:   ret void
 int8x8x3_t test_vld3_lane_s8(int8_t const * a, int8x8x3_t b) {
   return vld3_lane_s8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld3_lane_s16
-// CHECK: vld3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld3_lane_s16(%struct.int16x4x3_t* noalias sret %agg.result, i16* %a, [3 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i16>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK:   [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], i32 3, i32 2)
+// CHECK:   [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE_V]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP14]]
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.int16x4x3_t* %agg.result to i8*
+// CHECK:   [[TMP16:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 24, i32 8, i1 false)
+// CHECK:   ret void
 int16x4x3_t test_vld3_lane_s16(int16_t const * a, int16x4x3_t b) {
   return vld3_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld3_lane_s32
-// CHECK: vld3.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld3_lane_s32(%struct.int32x2x3_t* noalias sret %agg.result, i32* %a, [3 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <2 x i32>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
+// CHECK:   [[VLD3_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32.p0i8(i8* [[TMP4]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], i32 1, i32 4)
+// CHECK:   [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32> } [[VLD3_LANE_V]], { <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP14]]
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.int32x2x3_t* %agg.result to i8*
+// CHECK:   [[TMP16:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 24, i32 8, i1 false)
+// CHECK:   ret void
 int32x2x3_t test_vld3_lane_s32(int32_t const * a, int32x2x3_t b) {
   return vld3_lane_s32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vld3_lane_f16
-// CHECK: vld3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld3_lane_f16(%struct.float16x4x3_t* noalias sret %agg.result, half* %a, [3 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x half>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP9:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK:   [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], i32 3, i32 2)
+// CHECK:   [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE_V]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP14]]
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.float16x4x3_t* %agg.result to i8*
+// CHECK:   [[TMP16:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 24, i32 8, i1 false)
+// CHECK:   ret void
 float16x4x3_t test_vld3_lane_f16(float16_t const * a, float16x4x3_t b) {
   return vld3_lane_f16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld3_lane_f32
-// CHECK: vld3.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld3_lane_f32(%struct.float32x2x3_t* noalias sret %agg.result, float* %a, [3 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <2 x float>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP9:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x float> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float>
+// CHECK:   [[VLD3_LANE_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32.p0i8(i8* [[TMP4]], <2 x float> [[TMP11]], <2 x float> [[TMP12]], <2 x float> [[TMP13]], i32 1, i32 4)
+// CHECK:   [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <2 x float>, <2 x float>, <2 x float> }*
+// CHECK:   store { <2 x float>, <2 x float>, <2 x float> } [[VLD3_LANE_V]], { <2 x float>, <2 x float>, <2 x float> }* [[TMP14]]
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.float32x2x3_t* %agg.result to i8*
+// CHECK:   [[TMP16:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 24, i32 8, i1 false)
+// CHECK:   ret void
 float32x2x3_t test_vld3_lane_f32(float32_t const * a, float32x2x3_t b) {
   return vld3_lane_f32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vld3_lane_p8
-// CHECK: vld3.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld3_lane_p8(%struct.poly8x8x3_t* noalias sret %agg.result, i8* %a, [3 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VLD3_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8.p0i8(i8* %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 7, i32 1)
+// CHECK:   [[TMP7:%.*]] = bitcast i8* [[TMP3]] to { <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8> } [[VLD3_LANE_V]], { <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP7]]
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly8x8x3_t* %agg.result to i8*
+// CHECK:   [[TMP9:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP8]], i8* [[TMP9]], i32 24, i32 8, i1 false)
+// CHECK:   ret void
 poly8x8x3_t test_vld3_lane_p8(poly8_t const * a, poly8x8x3_t b) {
   return vld3_lane_p8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld3_lane_p16
-// CHECK: vld3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld3_lane_p16(%struct.poly16x4x3_t* noalias sret %agg.result, i16* %a, [3 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i16>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK:   [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], i32 3, i32 2)
+// CHECK:   [[TMP14:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16> } [[VLD3_LANE_V]], { <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP14]]
+// CHECK:   [[TMP15:%.*]] = bitcast %struct.poly16x4x3_t* %agg.result to i8*
+// CHECK:   [[TMP16:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP15]], i8* [[TMP16]], i32 24, i32 8, i1 false)
+// CHECK:   ret void
 poly16x4x3_t test_vld3_lane_p16(poly16_t const * a, poly16x4x3_t b) {
   return vld3_lane_p16(a, b, 3);
 }
 
 
-// CHECK-LABEL: test_vld4q_u8
-// CHECK: vld4.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define void @test_vld4q_u8(%struct.uint8x16x4_t* noalias sret %agg.result, i8* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
+// CHECK:   [[VLD4Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8.p0i8(i8* %a, i32 1)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4Q_V]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x4_t* %agg.result to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 64, i32 16, i1 false)
+// CHECK:   ret void
 uint8x16x4_t test_vld4q_u8(uint8_t const * a) {
   return vld4q_u8(a);
 }
 
-// CHECK-LABEL: test_vld4q_u16
-// CHECK: vld4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define void @test_vld4q_u16(%struct.uint16x8x4_t* noalias sret %agg.result, i16* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16.p0i8(i8* [[TMP1]], i32 2)
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x8x4_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 64, i32 16, i1 false)
+// CHECK:   ret void
 uint16x8x4_t test_vld4q_u16(uint16_t const * a) {
   return vld4q_u16(a);
 }
 
-// CHECK-LABEL: test_vld4q_u32
-// CHECK: vld4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define void @test_vld4q_u32(%struct.uint32x4x4_t* noalias sret %agg.result, i32* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VLD4Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32.p0i8(i8* [[TMP1]], i32 4)
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_V]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x4x4_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 64, i32 16, i1 false)
+// CHECK:   ret void
 uint32x4x4_t test_vld4q_u32(uint32_t const * a) {
   return vld4q_u32(a);
 }
 
-// CHECK-LABEL: test_vld4q_s8
-// CHECK: vld4.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define void @test_vld4q_s8(%struct.int8x16x4_t* noalias sret %agg.result, i8* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
+// CHECK:   [[VLD4Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8.p0i8(i8* %a, i32 1)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4Q_V]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x4_t* %agg.result to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 64, i32 16, i1 false)
+// CHECK:   ret void
 int8x16x4_t test_vld4q_s8(int8_t const * a) {
   return vld4q_s8(a);
 }
 
-// CHECK-LABEL: test_vld4q_s16
-// CHECK: vld4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define void @test_vld4q_s16(%struct.int16x8x4_t* noalias sret %agg.result, i16* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16.p0i8(i8* [[TMP1]], i32 2)
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x8x4_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 64, i32 16, i1 false)
+// CHECK:   ret void
 int16x8x4_t test_vld4q_s16(int16_t const * a) {
   return vld4q_s16(a);
 }
 
-// CHECK-LABEL: test_vld4q_s32
-// CHECK: vld4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define void @test_vld4q_s32(%struct.int32x4x4_t* noalias sret %agg.result, i32* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VLD4Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32.p0i8(i8* [[TMP1]], i32 4)
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_V]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x4x4_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 64, i32 16, i1 false)
+// CHECK:   ret void
 int32x4x4_t test_vld4q_s32(int32_t const * a) {
   return vld4q_s32(a);
 }
 
-// CHECK-LABEL: test_vld4q_f16
-// CHECK: vld4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define void @test_vld4q_f16(%struct.float16x8x4_t* noalias sret %agg.result, half* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16.p0i8(i8* [[TMP1]], i32 2)
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x8x4_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 64, i32 16, i1 false)
+// CHECK:   ret void
 float16x8x4_t test_vld4q_f16(float16_t const * a) {
   return vld4q_f16(a);
 }
 
-// CHECK-LABEL: test_vld4q_f32
-// CHECK: vld4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define void @test_vld4q_f32(%struct.float32x4x4_t* noalias sret %agg.result, float* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VLD4Q_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4.v4f32.p0i8(i8* [[TMP1]], i32 4)
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x float>, <4 x float>, <4 x float>, <4 x float> }*
+// CHECK:   store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4Q_V]], { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x4x4_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 64, i32 16, i1 false)
+// CHECK:   ret void
 float32x4x4_t test_vld4q_f32(float32_t const * a) {
   return vld4q_f32(a);
 }
 
-// CHECK-LABEL: test_vld4q_p8
-// CHECK: vld4.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define void @test_vld4q_p8(%struct.poly8x16x4_t* noalias sret %agg.result, i8* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
+// CHECK:   [[VLD4Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8.p0i8(i8* %a, i32 1)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }*
+// CHECK:   store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[VLD4Q_V]], { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x4_t* %agg.result to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 64, i32 16, i1 false)
+// CHECK:   ret void
 poly8x16x4_t test_vld4q_p8(poly8_t const * a) {
   return vld4q_p8(a);
 }
 
-// CHECK-LABEL: test_vld4q_p16
-// CHECK: vld4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define void @test_vld4q_p16(%struct.poly16x8x4_t* noalias sret %agg.result, i16* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16.p0i8(i8* [[TMP1]], i32 2)
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_V]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x8x4_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 64, i32 16, i1 false)
+// CHECK:   ret void
 poly16x8x4_t test_vld4q_p16(poly16_t const * a) {
   return vld4q_p16(a);
 }
 
-// CHECK-LABEL: test_vld4_u8
-// CHECK: vld4.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld4_u8(%struct.uint8x8x4_t* noalias sret %agg.result, i8* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
+// CHECK:   [[VLD4_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8.p0i8(i8* %a, i32 1)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_V]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* %agg.result to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 32, i32 8, i1 false)
+// CHECK:   ret void
 uint8x8x4_t test_vld4_u8(uint8_t const * a) {
   return vld4_u8(a);
 }
 
-// CHECK-LABEL: test_vld4_u16
-// CHECK: vld4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld4_u16(%struct.uint16x4x4_t* noalias sret %agg.result, i16* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD4_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16.p0i8(i8* [[TMP1]], i32 2)
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x4x4_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
+// CHECK:   ret void
 uint16x4x4_t test_vld4_u16(uint16_t const * a) {
   return vld4_u16(a);
 }
 
-// CHECK-LABEL: test_vld4_u32
-// CHECK: vld4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld4_u32(%struct.uint32x2x4_t* noalias sret %agg.result, i32* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VLD4_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4.v2i32.p0i8(i8* [[TMP1]], i32 4)
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_V]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x2x4_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
+// CHECK:   ret void
 uint32x2x4_t test_vld4_u32(uint32_t const * a) {
   return vld4_u32(a);
 }
 
-// CHECK-LABEL: test_vld4_u64
-// CHECK: vld1.64
+// CHECK-LABEL: define void @test_vld4_u64(%struct.uint64x1x4_t* noalias sret %agg.result, i64* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VLD4_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64.p0i8(i8* [[TMP1]], i32 4)
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_V]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint64x1x4_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
+// CHECK:   ret void
 uint64x1x4_t test_vld4_u64(uint64_t const * a) {
   return vld4_u64(a);
 }
 
-// CHECK-LABEL: test_vld4_s8
-// CHECK: vld4.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld4_s8(%struct.int8x8x4_t* noalias sret %agg.result, i8* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
+// CHECK:   [[VLD4_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8.p0i8(i8* %a, i32 1)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_V]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* %agg.result to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 32, i32 8, i1 false)
+// CHECK:   ret void
 int8x8x4_t test_vld4_s8(int8_t const * a) {
   return vld4_s8(a);
 }
 
-// CHECK-LABEL: test_vld4_s16
-// CHECK: vld4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld4_s16(%struct.int16x4x4_t* noalias sret %agg.result, i16* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD4_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16.p0i8(i8* [[TMP1]], i32 2)
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x4x4_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
+// CHECK:   ret void
 int16x4x4_t test_vld4_s16(int16_t const * a) {
   return vld4_s16(a);
 }
 
-// CHECK-LABEL: test_vld4_s32
-// CHECK: vld4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld4_s32(%struct.int32x2x4_t* noalias sret %agg.result, i32* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VLD4_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4.v2i32.p0i8(i8* [[TMP1]], i32 4)
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_V]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x2x4_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
+// CHECK:   ret void
 int32x2x4_t test_vld4_s32(int32_t const * a) {
   return vld4_s32(a);
 }
 
-// CHECK-LABEL: test_vld4_s64
-// CHECK: vld1.64
+// CHECK-LABEL: define void @test_vld4_s64(%struct.int64x1x4_t* noalias sret %agg.result, i64* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VLD4_V:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64.p0i8(i8* [[TMP1]], i32 4)
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD4_V]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int64x1x4_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
+// CHECK:   ret void
 int64x1x4_t test_vld4_s64(int64_t const * a) {
   return vld4_s64(a);
 }
 
-// CHECK-LABEL: test_vld4_f16
-// CHECK: vld4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld4_f16(%struct.float16x4x4_t* noalias sret %agg.result, half* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VLD4_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16.p0i8(i8* [[TMP1]], i32 2)
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x4x4_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
+// CHECK:   ret void
 float16x4x4_t test_vld4_f16(float16_t const * a) {
   return vld4_f16(a);
 }
 
-// CHECK-LABEL: test_vld4_f32
-// CHECK: vld4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld4_f32(%struct.float32x2x4_t* noalias sret %agg.result, float* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VLD4_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4.v2f32.p0i8(i8* [[TMP1]], i32 4)
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float>, <2 x float> }*
+// CHECK:   store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_V]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x2x4_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
+// CHECK:   ret void
 float32x2x4_t test_vld4_f32(float32_t const * a) {
   return vld4_f32(a);
 }
 
-// CHECK-LABEL: test_vld4_p8
-// CHECK: vld4.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld4_p8(%struct.poly8x8x4_t* noalias sret %agg.result, i8* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
+// CHECK:   [[VLD4_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8.p0i8(i8* %a, i32 1)
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_V]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* %agg.result to i8*
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP2]], i8* [[TMP3]], i32 32, i32 8, i1 false)
+// CHECK:   ret void
 poly8x8x4_t test_vld4_p8(poly8_t const * a) {
   return vld4_p8(a);
 }
 
-// CHECK-LABEL: test_vld4_p16
-// CHECK: vld4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld4_p16(%struct.poly16x4x4_t* noalias sret %agg.result, i16* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD4_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16.p0i8(i8* [[TMP1]], i32 2)
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_V]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x4x4_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
+// CHECK:   ret void
 poly16x4x4_t test_vld4_p16(poly16_t const * a) {
   return vld4_p16(a);
 }
 
 
-// CHECK-LABEL: test_vld4_dup_u8
-// CHECK: vld4.8 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld4_dup_u8(%struct.uint8x8x4_t* noalias sret %agg.result, i8* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8.p0i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
+// CHECK:   [[TMP1:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], 0
+// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK:   [[TMP2:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], <8 x i8> [[LANE]], 0
+// CHECK:   [[TMP3:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], 1
+// CHECK:   [[LANE1:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP3]], <8 x i32> zeroinitializer
+// CHECK:   [[TMP4:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], <8 x i8> [[LANE1]], 1
+// CHECK:   [[TMP5:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 2
+// CHECK:   [[LANE2:%.*]] = shufflevector <8 x i8> [[TMP5]], <8 x i8> [[TMP5]], <8 x i32> zeroinitializer
+// CHECK:   [[TMP6:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], <8 x i8> [[LANE2]], 2
+// CHECK:   [[TMP7:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP6]], 3
+// CHECK:   [[LANE3:%.*]] = shufflevector <8 x i8> [[TMP7]], <8 x i8> [[TMP7]], <8 x i32> zeroinitializer
+// CHECK:   [[TMP8:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP6]], <8 x i8> [[LANE3]], 3
+// CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP8]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP9]]
+// CHECK:   [[TMP10:%.*]] = bitcast %struct.uint8x8x4_t* %agg.result to i8*
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP10]], i8* [[TMP11]], i32 32, i32 8, i1 false)
+// CHECK:   ret void
 uint8x8x4_t test_vld4_dup_u8(uint8_t const * a) {
   return vld4_dup_u8(a);
 }
 
-// CHECK-LABEL: test_vld4_dup_u16
-// CHECK: vld4.16 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld4_dup_u16(%struct.uint16x4x4_t* noalias sret %agg.result, i16* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
+// CHECK:   [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], 0
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer
+// CHECK:   [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0
+// CHECK:   [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], 1
+// CHECK:   [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer
+// CHECK:   [[TMP5:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1
+// CHECK:   [[TMP6:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], 2
+// CHECK:   [[LANE2:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> [[TMP6]], <4 x i32> zeroinitializer
+// CHECK:   [[TMP7:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], <4 x i16> [[LANE2]], 2
+// CHECK:   [[TMP8:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], 3
+// CHECK:   [[LANE3:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP8]], <4 x i32> zeroinitializer
+// CHECK:   [[TMP9:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], <4 x i16> [[LANE3]], 3
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP9]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.uint16x4x4_t* %agg.result to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP11]], i8* [[TMP12]], i32 32, i32 8, i1 false)
+// CHECK:   ret void
 uint16x4x4_t test_vld4_dup_u16(uint16_t const * a) {
   return vld4_dup_u16(a);
 }
 
-// CHECK-LABEL: test_vld4_dup_u32
-// CHECK: vld4.32 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld4_dup_u32(%struct.uint32x2x4_t* noalias sret %agg.result, i32* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32.p0i8(i8* [[TMP1]], <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
+// CHECK:   [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD_DUP]], 0
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer
+// CHECK:   [[TMP3:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD_DUP]], <2 x i32> [[LANE]], 0
+// CHECK:   [[TMP4:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP3]], 1
+// CHECK:   [[LANE1:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP4]], <2 x i32> zeroinitializer
+// CHECK:   [[TMP5:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP3]], <2 x i32> [[LANE1]], 1
+// CHECK:   [[TMP6:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP5]], 2
+// CHECK:   [[LANE2:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> [[TMP6]], <2 x i32> zeroinitializer
+// CHECK:   [[TMP7:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP5]], <2 x i32> [[LANE2]], 2
+// CHECK:   [[TMP8:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP7]], 3
+// CHECK:   [[LANE3:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> [[TMP8]], <2 x i32> zeroinitializer
+// CHECK:   [[TMP9:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP7]], <2 x i32> [[LANE3]], 3
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP9]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.uint32x2x4_t* %agg.result to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP11]], i8* [[TMP12]], i32 32, i32 8, i1 false)
+// CHECK:   ret void
 uint32x2x4_t test_vld4_dup_u32(uint32_t const * a) {
   return vld4_dup_u32(a);
 }
 
-// CHECK-LABEL: test_vld4_dup_u64
-// CHECK: vld1.64
+// CHECK-LABEL: define void @test_vld4_dup_u64(%struct.uint64x1x4_t* noalias sret %agg.result, i64* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64.p0i8(i8* [[TMP1]], i32 4)
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD_DUP]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint64x1x4_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
+// CHECK:   ret void
 uint64x1x4_t test_vld4_dup_u64(uint64_t const * a) {
   return vld4_dup_u64(a);
 }
 
-// CHECK-LABEL: test_vld4_dup_s8
-// CHECK: vld4.8 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld4_dup_s8(%struct.int8x8x4_t* noalias sret %agg.result, i8* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8.p0i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
+// CHECK:   [[TMP1:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], 0
+// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK:   [[TMP2:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], <8 x i8> [[LANE]], 0
+// CHECK:   [[TMP3:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], 1
+// CHECK:   [[LANE1:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP3]], <8 x i32> zeroinitializer
+// CHECK:   [[TMP4:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], <8 x i8> [[LANE1]], 1
+// CHECK:   [[TMP5:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 2
+// CHECK:   [[LANE2:%.*]] = shufflevector <8 x i8> [[TMP5]], <8 x i8> [[TMP5]], <8 x i32> zeroinitializer
+// CHECK:   [[TMP6:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], <8 x i8> [[LANE2]], 2
+// CHECK:   [[TMP7:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP6]], 3
+// CHECK:   [[LANE3:%.*]] = shufflevector <8 x i8> [[TMP7]], <8 x i8> [[TMP7]], <8 x i32> zeroinitializer
+// CHECK:   [[TMP8:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP6]], <8 x i8> [[LANE3]], 3
+// CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP8]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP9]]
+// CHECK:   [[TMP10:%.*]] = bitcast %struct.int8x8x4_t* %agg.result to i8*
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP10]], i8* [[TMP11]], i32 32, i32 8, i1 false)
+// CHECK:   ret void
 int8x8x4_t test_vld4_dup_s8(int8_t const * a) {
   return vld4_dup_s8(a);
 }
 
-// CHECK-LABEL: test_vld4_dup_s16
-// CHECK: vld4.16 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld4_dup_s16(%struct.int16x4x4_t* noalias sret %agg.result, i16* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
+// CHECK:   [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], 0
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer
+// CHECK:   [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0
+// CHECK:   [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], 1
+// CHECK:   [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer
+// CHECK:   [[TMP5:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1
+// CHECK:   [[TMP6:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], 2
+// CHECK:   [[LANE2:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> [[TMP6]], <4 x i32> zeroinitializer
+// CHECK:   [[TMP7:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], <4 x i16> [[LANE2]], 2
+// CHECK:   [[TMP8:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], 3
+// CHECK:   [[LANE3:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP8]], <4 x i32> zeroinitializer
+// CHECK:   [[TMP9:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], <4 x i16> [[LANE3]], 3
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP9]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.int16x4x4_t* %agg.result to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP11]], i8* [[TMP12]], i32 32, i32 8, i1 false)
+// CHECK:   ret void
 int16x4x4_t test_vld4_dup_s16(int16_t const * a) {
   return vld4_dup_s16(a);
 }
 
-// CHECK-LABEL: test_vld4_dup_s32
-// CHECK: vld4.32 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld4_dup_s32(%struct.int32x2x4_t* noalias sret %agg.result, i32* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32.p0i8(i8* [[TMP1]], <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
+// CHECK:   [[TMP2:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD_DUP]], 0
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer
+// CHECK:   [[TMP3:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD_DUP]], <2 x i32> [[LANE]], 0
+// CHECK:   [[TMP4:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP3]], 1
+// CHECK:   [[LANE1:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP4]], <2 x i32> zeroinitializer
+// CHECK:   [[TMP5:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP3]], <2 x i32> [[LANE1]], 1
+// CHECK:   [[TMP6:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP5]], 2
+// CHECK:   [[LANE2:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> [[TMP6]], <2 x i32> zeroinitializer
+// CHECK:   [[TMP7:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP5]], <2 x i32> [[LANE2]], 2
+// CHECK:   [[TMP8:%.*]] = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP7]], 3
+// CHECK:   [[LANE3:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> [[TMP8]], <2 x i32> zeroinitializer
+// CHECK:   [[TMP9:%.*]] = insertvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP7]], <2 x i32> [[LANE3]], 3
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP0]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[TMP9]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.int32x2x4_t* %agg.result to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP11]], i8* [[TMP12]], i32 32, i32 8, i1 false)
+// CHECK:   ret void
 int32x2x4_t test_vld4_dup_s32(int32_t const * a) {
   return vld4_dup_s32(a);
 }
 
-// CHECK-LABEL: test_vld4_dup_s64
-// CHECK: vld1.64
+// CHECK-LABEL: define void @test_vld4_dup_s64(%struct.int64x1x4_t* noalias sret %agg.result, i64* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64.p0i8(i8* [[TMP1]], i32 4)
+// CHECK:   [[TMP2:%.*]] = bitcast i8* [[TMP0]] to { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }*
+// CHECK:   store { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } [[VLD_DUP]], { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> }* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int64x1x4_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
+// CHECK:   ret void
 int64x1x4_t test_vld4_dup_s64(int64_t const * a) {
   return vld4_dup_s64(a);
 }
 
-// CHECK-LABEL: test_vld4_dup_f16
-// CHECK: vld4.16 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld4_dup_f16(%struct.float16x4x4_t* noalias sret %agg.result, half* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
+// CHECK:   [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], 0
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer
+// CHECK:   [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0
+// CHECK:   [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], 1
+// CHECK:   [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer
+// CHECK:   [[TMP5:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1
+// CHECK:   [[TMP6:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], 2
+// CHECK:   [[LANE2:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> [[TMP6]], <4 x i32> zeroinitializer
+// CHECK:   [[TMP7:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], <4 x i16> [[LANE2]], 2
+// CHECK:   [[TMP8:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], 3
+// CHECK:   [[LANE3:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP8]], <4 x i32> zeroinitializer
+// CHECK:   [[TMP9:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], <4 x i16> [[LANE3]], 3
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP9]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.float16x4x4_t* %agg.result to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP11]], i8* [[TMP12]], i32 32, i32 8, i1 false)
+// CHECK:   ret void
 float16x4x4_t test_vld4_dup_f16(float16_t const * a) {
   return vld4_dup_f16(a);
 }
 
-// CHECK-LABEL: test_vld4_dup_f32
-// CHECK: vld4.32 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld4_dup_f32(%struct.float32x2x4_t* noalias sret %agg.result, float* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32.p0i8(i8* [[TMP1]], <2 x float> undef, <2 x float> undef, <2 x float> undef, <2 x float> undef, i32 0, i32 4)
+// CHECK:   [[TMP2:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD_DUP]], 0
+// CHECK:   [[LANE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> [[TMP2]], <2 x i32> zeroinitializer
+// CHECK:   [[TMP3:%.*]] = insertvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD_DUP]], <2 x float> [[LANE]], 0
+// CHECK:   [[TMP4:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[TMP3]], 1
+// CHECK:   [[LANE1:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP4]], <2 x i32> zeroinitializer
+// CHECK:   [[TMP5:%.*]] = insertvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[TMP3]], <2 x float> [[LANE1]], 1
+// CHECK:   [[TMP6:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[TMP5]], 2
+// CHECK:   [[LANE2:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> [[TMP6]], <2 x i32> zeroinitializer
+// CHECK:   [[TMP7:%.*]] = insertvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[TMP5]], <2 x float> [[LANE2]], 2
+// CHECK:   [[TMP8:%.*]] = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[TMP7]], 3
+// CHECK:   [[LANE3:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> [[TMP8]], <2 x i32> zeroinitializer
+// CHECK:   [[TMP9:%.*]] = insertvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[TMP7]], <2 x float> [[LANE3]], 3
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP0]] to { <2 x float>, <2 x float>, <2 x float>, <2 x float> }*
+// CHECK:   store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[TMP9]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.float32x2x4_t* %agg.result to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP11]], i8* [[TMP12]], i32 32, i32 8, i1 false)
+// CHECK:   ret void
 float32x2x4_t test_vld4_dup_f32(float32_t const * a) {
   return vld4_dup_f32(a);
 }
 
-// CHECK-LABEL: test_vld4_dup_p8
-// CHECK: vld4.8 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld4_dup_p8(%struct.poly8x8x4_t* noalias sret %agg.result, i8* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8.p0i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
+// CHECK:   [[TMP1:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], 0
+// CHECK:   [[LANE:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> [[TMP1]], <8 x i32> zeroinitializer
+// CHECK:   [[TMP2:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD_DUP]], <8 x i8> [[LANE]], 0
+// CHECK:   [[TMP3:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], 1
+// CHECK:   [[LANE1:%.*]] = shufflevector <8 x i8> [[TMP3]], <8 x i8> [[TMP3]], <8 x i32> zeroinitializer
+// CHECK:   [[TMP4:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP2]], <8 x i8> [[LANE1]], 1
+// CHECK:   [[TMP5:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], 2
+// CHECK:   [[LANE2:%.*]] = shufflevector <8 x i8> [[TMP5]], <8 x i8> [[TMP5]], <8 x i32> zeroinitializer
+// CHECK:   [[TMP6:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP4]], <8 x i8> [[LANE2]], 2
+// CHECK:   [[TMP7:%.*]] = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP6]], 3
+// CHECK:   [[LANE3:%.*]] = shufflevector <8 x i8> [[TMP7]], <8 x i8> [[TMP7]], <8 x i32> zeroinitializer
+// CHECK:   [[TMP8:%.*]] = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP6]], <8 x i8> [[LANE3]], 3
+// CHECK:   [[TMP9:%.*]] = bitcast i8* [[TMP0]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[TMP8]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP9]]
+// CHECK:   [[TMP10:%.*]] = bitcast %struct.poly8x8x4_t* %agg.result to i8*
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP10]], i8* [[TMP11]], i32 32, i32 8, i1 false)
+// CHECK:   ret void
 poly8x8x4_t test_vld4_dup_p8(poly8_t const * a) {
   return vld4_dup_p8(a);
 }
 
-// CHECK-LABEL: test_vld4_dup_p16
-// CHECK: vld4.16 {d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[], d{{[0-9]+}}[]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld4_dup_p16(%struct.poly16x4x4_t* noalias sret %agg.result, i16* %a) #0 {
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VLD_DUP:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16.p0i8(i8* [[TMP1]], <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
+// CHECK:   [[TMP2:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], 0
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP2]], <4 x i32> zeroinitializer
+// CHECK:   [[TMP3:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD_DUP]], <4 x i16> [[LANE]], 0
+// CHECK:   [[TMP4:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], 1
+// CHECK:   [[LANE1:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP4]], <4 x i32> zeroinitializer
+// CHECK:   [[TMP5:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP3]], <4 x i16> [[LANE1]], 1
+// CHECK:   [[TMP6:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], 2
+// CHECK:   [[LANE2:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> [[TMP6]], <4 x i32> zeroinitializer
+// CHECK:   [[TMP7:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP5]], <4 x i16> [[LANE2]], 2
+// CHECK:   [[TMP8:%.*]] = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], 3
+// CHECK:   [[LANE3:%.*]] = shufflevector <4 x i16> [[TMP8]], <4 x i16> [[TMP8]], <4 x i32> zeroinitializer
+// CHECK:   [[TMP9:%.*]] = insertvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP7]], <4 x i16> [[LANE3]], 3
+// CHECK:   [[TMP10:%.*]] = bitcast i8* [[TMP0]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[TMP9]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP10]]
+// CHECK:   [[TMP11:%.*]] = bitcast %struct.poly16x4x4_t* %agg.result to i8*
+// CHECK:   [[TMP12:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP11]], i8* [[TMP12]], i32 32, i32 8, i1 false)
+// CHECK:   ret void
 poly16x4x4_t test_vld4_dup_p16(poly16_t const * a) {
   return vld4_dup_p16(a);
 }
 
 
-// CHECK-LABEL: test_vld4q_lane_u16
-// CHECK: vld4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: define void @test_vld4q_lane_u16(%struct.uint16x8x4_t* noalias sret %agg.result, i16* %a, [8 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i16> [[TMP11]] to <16 x i8>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK:   [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <8 x i16>
+// CHECK:   [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], <8 x i16> [[TMP16]], i32 7, i32 2)
+// CHECK:   [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP17]]
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.uint16x8x4_t* %agg.result to i8*
+// CHECK:   [[TMP19:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 64, i32 16, i1 false)
+// CHECK:   ret void
 uint16x8x4_t test_vld4q_lane_u16(uint16_t const * a, uint16x8x4_t b) {
   return vld4q_lane_u16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld4q_lane_u32
-// CHECK: vld4.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: define void @test_vld4q_lane_u32(%struct.uint32x4x4_t* noalias sret %agg.result, i32* %a, [8 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i32>]* [[COERCE_DIVE]] to [8 x i64]*
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP12:%.*]] = bitcast <4 x i32> [[TMP11]] to <16 x i8>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
+// CHECK:   [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
+// CHECK:   [[VLD4Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32.p0i8(i8* [[TMP4]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], <4 x i32> [[TMP16]], i32 3, i32 4)
+// CHECK:   [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_LANE_V]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP17]]
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.uint32x4x4_t* %agg.result to i8*
+// CHECK:   [[TMP19:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 64, i32 16, i1 false)
+// CHECK:   ret void
 uint32x4x4_t test_vld4q_lane_u32(uint32_t const * a, uint32x4x4_t b) {
   return vld4q_lane_u32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld4q_lane_s16
-// CHECK: vld4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: define void @test_vld4q_lane_s16(%struct.int16x8x4_t* noalias sret %agg.result, i16* %a, [8 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i16> [[TMP11]] to <16 x i8>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK:   [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <8 x i16>
+// CHECK:   [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], <8 x i16> [[TMP16]], i32 7, i32 2)
+// CHECK:   [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP17]]
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.int16x8x4_t* %agg.result to i8*
+// CHECK:   [[TMP19:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 64, i32 16, i1 false)
+// CHECK:   ret void
 int16x8x4_t test_vld4q_lane_s16(int16_t const * a, int16x8x4_t b) {
   return vld4q_lane_s16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld4q_lane_s32
-// CHECK: vld4.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: define void @test_vld4q_lane_s32(%struct.int32x4x4_t* noalias sret %agg.result, i32* %a, [8 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i32>]* [[COERCE_DIVE]] to [8 x i64]*
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP12:%.*]] = bitcast <4 x i32> [[TMP11]] to <16 x i8>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x i32>
+// CHECK:   [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x i32>
+// CHECK:   [[VLD4Q_LANE_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32.p0i8(i8* [[TMP4]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], <4 x i32> [[TMP16]], i32 3, i32 4)
+// CHECK:   [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }*
+// CHECK:   store { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLD4Q_LANE_V]], { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }* [[TMP17]]
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.int32x4x4_t* %agg.result to i8*
+// CHECK:   [[TMP19:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 64, i32 16, i1 false)
+// CHECK:   ret void
 int32x4x4_t test_vld4q_lane_s32(int32_t const * a, int32x4x4_t b) {
   return vld4q_lane_s32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld4q_lane_f16
-// CHECK: vld4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: define void @test_vld4q_lane_f16(%struct.float16x8x4_t* noalias sret %agg.result, half* %a, [8 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x half>]* [[COERCE_DIVE]] to [8 x i64]*
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP9:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP11:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x half> [[TMP11]] to <16 x i8>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK:   [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <8 x i16>
+// CHECK:   [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], <8 x i16> [[TMP16]], i32 7, i32 2)
+// CHECK:   [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP17]]
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.float16x8x4_t* %agg.result to i8*
+// CHECK:   [[TMP19:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 64, i32 16, i1 false)
+// CHECK:   ret void
 float16x8x4_t test_vld4q_lane_f16(float16_t const * a, float16x8x4_t b) {
   return vld4q_lane_f16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld4q_lane_f32
-// CHECK: vld4.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: define void @test_vld4q_lane_f32(%struct.float32x4x4_t* noalias sret %agg.result, float* %a, [8 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x float>]* [[COERCE_DIVE]] to [8 x i64]*
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP9:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP11:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP12:%.*]] = bitcast <4 x float> [[TMP11]] to <16 x i8>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <4 x float>
+// CHECK:   [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <4 x float>
+// CHECK:   [[VLD4Q_LANE_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32.p0i8(i8* [[TMP4]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x float> [[TMP15]], <4 x float> [[TMP16]], i32 3, i32 4)
+// CHECK:   [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <4 x float>, <4 x float>, <4 x float>, <4 x float> }*
+// CHECK:   store { <4 x float>, <4 x float>, <4 x float>, <4 x float> } [[VLD4Q_LANE_V]], { <4 x float>, <4 x float>, <4 x float>, <4 x float> }* [[TMP17]]
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.float32x4x4_t* %agg.result to i8*
+// CHECK:   [[TMP19:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 64, i32 16, i1 false)
+// CHECK:   ret void
 float32x4x4_t test_vld4q_lane_f32(float32_t const * a, float32x4x4_t b) {
   return vld4q_lane_f32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld4q_lane_p16
-// CHECK: vld4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: define void @test_vld4q_lane_p16(%struct.poly16x8x4_t* noalias sret %agg.result, i16* %a, [8 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i16> [[TMP11]] to <16 x i8>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
+// CHECK:   [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <8 x i16>
+// CHECK:   [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16.p0i8(i8* [[TMP4]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], <8 x i16> [[TMP16]], i32 7, i32 2)
+// CHECK:   [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }*
+// CHECK:   store { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } [[VLD4Q_LANE_V]], { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }* [[TMP17]]
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.poly16x8x4_t* %agg.result to i8*
+// CHECK:   [[TMP19:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 64, i32 16, i1 false)
+// CHECK:   ret void
 poly16x8x4_t test_vld4q_lane_p16(poly16_t const * a, poly16x8x4_t b) {
   return vld4q_lane_p16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld4_lane_u8
-// CHECK: vld4.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld4_lane_u8(%struct.uint8x8x4_t* noalias sret %agg.result, i8* %a, [4 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK:   [[VLD4_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8.p0i8(i8* %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], i32 7, i32 1)
+// CHECK:   [[TMP8:%.*]] = bitcast i8* [[TMP3]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE_V]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP8]]
+// CHECK:   [[TMP9:%.*]] = bitcast %struct.uint8x8x4_t* %agg.result to i8*
+// CHECK:   [[TMP10:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP9]], i8* [[TMP10]], i32 32, i32 8, i1 false)
+// CHECK:   ret void
 uint8x8x4_t test_vld4_lane_u8(uint8_t const * a, uint8x8x4_t b) {
   return vld4_lane_u8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld4_lane_u16
-// CHECK: vld4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld4_lane_u16(%struct.uint16x4x4_t* noalias sret %agg.result, i16* %a, [4 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP11:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP12:%.*]] = bitcast <4 x i16> [[TMP11]] to <8 x i8>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK:   [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x i16>
+// CHECK:   [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], <4 x i16> [[TMP16]], i32 3, i32 2)
+// CHECK:   [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP17]]
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.uint16x4x4_t* %agg.result to i8*
+// CHECK:   [[TMP19:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 32, i32 8, i1 false)
+// CHECK:   ret void
 uint16x4x4_t test_vld4_lane_u16(uint16_t const * a, uint16x4x4_t b) {
   return vld4_lane_u16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld4_lane_u32
-// CHECK: vld4.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld4_lane_u32(%struct.uint32x2x4_t* noalias sret %agg.result, i32* %a, [4 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <2 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP11:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to <8 x i8>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
+// CHECK:   [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <2 x i32>
+// CHECK:   [[VLD4_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32.p0i8(i8* [[TMP4]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], <2 x i32> [[TMP16]], i32 1, i32 4)
+// CHECK:   [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE_V]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP17]]
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.uint32x2x4_t* %agg.result to i8*
+// CHECK:   [[TMP19:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 32, i32 8, i1 false)
+// CHECK:   ret void
 uint32x2x4_t test_vld4_lane_u32(uint32_t const * a, uint32x2x4_t b) {
   return vld4_lane_u32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vld4_lane_s8
-// CHECK: vld4.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld4_lane_s8(%struct.int8x8x4_t* noalias sret %agg.result, i8* %a, [4 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK:   [[VLD4_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8.p0i8(i8* %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], i32 7, i32 1)
+// CHECK:   [[TMP8:%.*]] = bitcast i8* [[TMP3]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE_V]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP8]]
+// CHECK:   [[TMP9:%.*]] = bitcast %struct.int8x8x4_t* %agg.result to i8*
+// CHECK:   [[TMP10:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP9]], i8* [[TMP10]], i32 32, i32 8, i1 false)
+// CHECK:   ret void
 int8x8x4_t test_vld4_lane_s8(int8_t const * a, int8x8x4_t b) {
   return vld4_lane_s8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld4_lane_s16
-// CHECK: vld4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld4_lane_s16(%struct.int16x4x4_t* noalias sret %agg.result, i16* %a, [4 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP11:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP12:%.*]] = bitcast <4 x i16> [[TMP11]] to <8 x i8>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK:   [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x i16>
+// CHECK:   [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], <4 x i16> [[TMP16]], i32 3, i32 2)
+// CHECK:   [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP17]]
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.int16x4x4_t* %agg.result to i8*
+// CHECK:   [[TMP19:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 32, i32 8, i1 false)
+// CHECK:   ret void
 int16x4x4_t test_vld4_lane_s16(int16_t const * a, int16x4x4_t b) {
   return vld4_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld4_lane_s32
-// CHECK: vld4.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld4_lane_s32(%struct.int32x2x4_t* noalias sret %agg.result, i32* %a, [4 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <2 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x i32> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x i32> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP9:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x i32> [[TMP9]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP11:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to <8 x i8>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x i32>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x i32>
+// CHECK:   [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <2 x i32>
+// CHECK:   [[VLD4_LANE_V:%.*]] = call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32.p0i8(i8* [[TMP4]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], <2 x i32> [[TMP16]], i32 1, i32 4)
+// CHECK:   [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }*
+// CHECK:   store { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } [[VLD4_LANE_V]], { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }* [[TMP17]]
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.int32x2x4_t* %agg.result to i8*
+// CHECK:   [[TMP19:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 32, i32 8, i1 false)
+// CHECK:   ret void
 int32x2x4_t test_vld4_lane_s32(int32_t const * a, int32x2x4_t b) {
   return vld4_lane_s32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vld4_lane_f16
-// CHECK: vld4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld4_lane_f16(%struct.float16x4x4_t* noalias sret %agg.result, half* %a, [4 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x half>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x half> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP9:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP11:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP12:%.*]] = bitcast <4 x half> [[TMP11]] to <8 x i8>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK:   [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x i16>
+// CHECK:   [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], <4 x i16> [[TMP16]], i32 3, i32 2)
+// CHECK:   [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP17]]
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.float16x4x4_t* %agg.result to i8*
+// CHECK:   [[TMP19:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 32, i32 8, i1 false)
+// CHECK:   ret void
 float16x4x4_t test_vld4_lane_f16(float16_t const * a, float16x4x4_t b) {
   return vld4_lane_f16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vld4_lane_f32
-// CHECK: vld4.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld4_lane_f32(%struct.float32x2x4_t* noalias sret %agg.result, float* %a, [4 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <2 x float>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <2 x float> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <2 x float> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP9:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <2 x float> [[TMP9]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP11:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP12:%.*]] = bitcast <2 x float> [[TMP11]] to <8 x i8>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <2 x float>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <2 x float>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <2 x float>
+// CHECK:   [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <2 x float>
+// CHECK:   [[VLD4_LANE_V:%.*]] = call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32.p0i8(i8* [[TMP4]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], <2 x float> [[TMP15]], <2 x float> [[TMP16]], i32 1, i32 4)
+// CHECK:   [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <2 x float>, <2 x float>, <2 x float>, <2 x float> }*
+// CHECK:   store { <2 x float>, <2 x float>, <2 x float>, <2 x float> } [[VLD4_LANE_V]], { <2 x float>, <2 x float>, <2 x float>, <2 x float> }* [[TMP17]]
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.float32x2x4_t* %agg.result to i8*
+// CHECK:   [[TMP19:%.*]] = bitcast %struct.float32x2x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 32, i32 8, i1 false)
+// CHECK:   ret void
 float32x2x4_t test_vld4_lane_f32(float32_t const * a, float32x2x4_t b) {
   return vld4_lane_f32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vld4_lane_p8
-// CHECK: vld4.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld4_lane_p8(%struct.poly8x8x4_t* noalias sret %agg.result, i8* %a, [4 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK:   [[VLD4_LANE_V:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8.p0i8(i8* %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], i32 7, i32 1)
+// CHECK:   [[TMP8:%.*]] = bitcast i8* [[TMP3]] to { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }*
+// CHECK:   store { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } [[VLD4_LANE_V]], { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }* [[TMP8]]
+// CHECK:   [[TMP9:%.*]] = bitcast %struct.poly8x8x4_t* %agg.result to i8*
+// CHECK:   [[TMP10:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP9]], i8* [[TMP10]], i32 32, i32 8, i1 false)
+// CHECK:   ret void
 poly8x8x4_t test_vld4_lane_p8(poly8_t const * a, poly8x8x4_t b) {
   return vld4_lane_p8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vld4_lane_p16
-// CHECK: vld4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vld4_lane_p16(%struct.poly16x4x4_t* noalias sret %agg.result, i16* %a, [4 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP5:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP7:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP8:%.*]] = bitcast <4 x i16> [[TMP7]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP9:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP10:%.*]] = bitcast <4 x i16> [[TMP9]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP11:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP12:%.*]] = bitcast <4 x i16> [[TMP11]] to <8 x i8>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
+// CHECK:   [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x i16>
+// CHECK:   [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16.p0i8(i8* [[TMP4]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], <4 x i16> [[TMP16]], i32 3, i32 2)
+// CHECK:   [[TMP17:%.*]] = bitcast i8* [[TMP3]] to { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }*
+// CHECK:   store { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } [[VLD4_LANE_V]], { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }* [[TMP17]]
+// CHECK:   [[TMP18:%.*]] = bitcast %struct.poly16x4x4_t* %agg.result to i8*
+// CHECK:   [[TMP19:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP18]], i8* [[TMP19]], i32 32, i32 8, i1 false)
+// CHECK:   ret void
 poly16x4x4_t test_vld4_lane_p16(poly16_t const * a, poly16x4x4_t b) {
   return vld4_lane_p16(a, b, 3);
 }
 
 
-// CHECK-LABEL: test_vmax_s8
-// CHECK: vmax.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vmax_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VMAX_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vmaxs.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VMAX_V_I]]
 int8x8_t test_vmax_s8(int8x8_t a, int8x8_t b) {
   return vmax_s8(a, b);
 }
 
-// CHECK-LABEL: test_vmax_s16
-// CHECK: vmax.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vmax_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vmaxs.v4i16(<4 x i16> [[VMAX_V_I]], <4 x i16> [[VMAX_V1_I]]) #4
+// CHECK:   [[VMAX_V3_I:%.*]] = bitcast <4 x i16> [[VMAX_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VMAX_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vmax_s16(int16x4_t a, int16x4_t b) {
   return vmax_s16(a, b);
 }
 
-// CHECK-LABEL: test_vmax_s32
-// CHECK: vmax.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vmax_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> [[VMAX_V_I]], <2 x i32> [[VMAX_V1_I]]) #4
+// CHECK:   [[VMAX_V3_I:%.*]] = bitcast <2 x i32> [[VMAX_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VMAX_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vmax_s32(int32x2_t a, int32x2_t b) {
   return vmax_s32(a, b);
 }
 
-// CHECK-LABEL: test_vmax_u8
-// CHECK: vmax.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vmax_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VMAX_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vmaxu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VMAX_V_I]]
 uint8x8_t test_vmax_u8(uint8x8_t a, uint8x8_t b) {
   return vmax_u8(a, b);
 }
 
-// CHECK-LABEL: test_vmax_u16
-// CHECK: vmax.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vmax_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vmaxu.v4i16(<4 x i16> [[VMAX_V_I]], <4 x i16> [[VMAX_V1_I]]) #4
+// CHECK:   [[VMAX_V3_I:%.*]] = bitcast <4 x i16> [[VMAX_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VMAX_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 uint16x4_t test_vmax_u16(uint16x4_t a, uint16x4_t b) {
   return vmax_u16(a, b);
 }
 
-// CHECK-LABEL: test_vmax_u32
-// CHECK: vmax.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vmax_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> [[VMAX_V_I]], <2 x i32> [[VMAX_V1_I]]) #4
+// CHECK:   [[VMAX_V3_I:%.*]] = bitcast <2 x i32> [[VMAX_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VMAX_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 uint32x2_t test_vmax_u32(uint32x2_t a, uint32x2_t b) {
   return vmax_u32(a, b);
 }
 
-// CHECK-LABEL: test_vmax_f32
-// CHECK: vmax.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vmax_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[VMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[VMAX_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> [[VMAX_V_I]], <2 x float> [[VMAX_V1_I]]) #4
+// CHECK:   [[VMAX_V3_I:%.*]] = bitcast <2 x float> [[VMAX_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VMAX_V3_I]] to <2 x float>
+// CHECK:   ret <2 x float> [[TMP2]]
 float32x2_t test_vmax_f32(float32x2_t a, float32x2_t b) {
   return vmax_f32(a, b);
 }
 
-// CHECK-LABEL: test_vmaxq_s8
-// CHECK: vmax.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vmaxq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VMAXQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vmaxs.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VMAXQ_V_I]]
 int8x16_t test_vmaxq_s8(int8x16_t a, int8x16_t b) {
   return vmaxq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vmaxq_s16
-// CHECK: vmax.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vmaxq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VMAXQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VMAXQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VMAXQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmaxs.v8i16(<8 x i16> [[VMAXQ_V_I]], <8 x i16> [[VMAXQ_V1_I]]) #4
+// CHECK:   [[VMAXQ_V3_I:%.*]] = bitcast <8 x i16> [[VMAXQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VMAXQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vmaxq_s16(int16x8_t a, int16x8_t b) {
   return vmaxq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vmaxq_s32
-// CHECK: vmax.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmaxq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VMAXQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VMAXQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VMAXQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> [[VMAXQ_V_I]], <4 x i32> [[VMAXQ_V1_I]]) #4
+// CHECK:   [[VMAXQ_V3_I:%.*]] = bitcast <4 x i32> [[VMAXQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VMAXQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vmaxq_s32(int32x4_t a, int32x4_t b) {
   return vmaxq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vmaxq_u8
-// CHECK: vmax.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vmaxq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VMAXQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vmaxu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VMAXQ_V_I]]
 uint8x16_t test_vmaxq_u8(uint8x16_t a, uint8x16_t b) {
   return vmaxq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vmaxq_u16
-// CHECK: vmax.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vmaxq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VMAXQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VMAXQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VMAXQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16> [[VMAXQ_V_I]], <8 x i16> [[VMAXQ_V1_I]]) #4
+// CHECK:   [[VMAXQ_V3_I:%.*]] = bitcast <8 x i16> [[VMAXQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VMAXQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 uint16x8_t test_vmaxq_u16(uint16x8_t a, uint16x8_t b) {
   return vmaxq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vmaxq_u32
-// CHECK: vmax.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmaxq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VMAXQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VMAXQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VMAXQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> [[VMAXQ_V_I]], <4 x i32> [[VMAXQ_V1_I]]) #4
+// CHECK:   [[VMAXQ_V3_I:%.*]] = bitcast <4 x i32> [[VMAXQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VMAXQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 uint32x4_t test_vmaxq_u32(uint32x4_t a, uint32x4_t b) {
   return vmaxq_u32(a, b);
 }
 
-// CHECK-LABEL: test_vmaxq_f32
-// CHECK: vmax.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x float> @test_vmaxq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[VMAXQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VMAXQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[VMAXQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> [[VMAXQ_V_I]], <4 x float> [[VMAXQ_V1_I]]) #4
+// CHECK:   [[VMAXQ_V3_I:%.*]] = bitcast <4 x float> [[VMAXQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VMAXQ_V3_I]] to <4 x float>
+// CHECK:   ret <4 x float> [[TMP2]]
 float32x4_t test_vmaxq_f32(float32x4_t a, float32x4_t b) {
   return vmaxq_f32(a, b);
 }
 
 
-// CHECK-LABEL: test_vmin_s8
-// CHECK: vmin.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vmin_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VMIN_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vmins.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VMIN_V_I]]
 int8x8_t test_vmin_s8(int8x8_t a, int8x8_t b) {
   return vmin_s8(a, b);
 }
 
-// CHECK-LABEL: test_vmin_s16
-// CHECK: vmin.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vmin_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16> [[VMIN_V_I]], <4 x i16> [[VMIN_V1_I]]) #4
+// CHECK:   [[VMIN_V3_I:%.*]] = bitcast <4 x i16> [[VMIN_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VMIN_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vmin_s16(int16x4_t a, int16x4_t b) {
   return vmin_s16(a, b);
 }
 
-// CHECK-LABEL: test_vmin_s32
-// CHECK: vmin.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vmin_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> [[VMIN_V_I]], <2 x i32> [[VMIN_V1_I]]) #4
+// CHECK:   [[VMIN_V3_I:%.*]] = bitcast <2 x i32> [[VMIN_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VMIN_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vmin_s32(int32x2_t a, int32x2_t b) {
   return vmin_s32(a, b);
 }
 
-// CHECK-LABEL: test_vmin_u8
-// CHECK: vmin.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vmin_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VMIN_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vminu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VMIN_V_I]]
 uint8x8_t test_vmin_u8(uint8x8_t a, uint8x8_t b) {
   return vmin_u8(a, b);
 }
 
-// CHECK-LABEL: test_vmin_u16
-// CHECK: vmin.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vmin_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16> [[VMIN_V_I]], <4 x i16> [[VMIN_V1_I]]) #4
+// CHECK:   [[VMIN_V3_I:%.*]] = bitcast <4 x i16> [[VMIN_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VMIN_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 uint16x4_t test_vmin_u16(uint16x4_t a, uint16x4_t b) {
   return vmin_u16(a, b);
 }
 
-// CHECK-LABEL: test_vmin_u32
-// CHECK: vmin.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vmin_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> [[VMIN_V_I]], <2 x i32> [[VMIN_V1_I]]) #4
+// CHECK:   [[VMIN_V3_I:%.*]] = bitcast <2 x i32> [[VMIN_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VMIN_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 uint32x2_t test_vmin_u32(uint32x2_t a, uint32x2_t b) {
   return vmin_u32(a, b);
 }
 
-// CHECK-LABEL: test_vmin_f32
-// CHECK: vmin.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vmin_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[VMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[VMIN_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> [[VMIN_V_I]], <2 x float> [[VMIN_V1_I]]) #4
+// CHECK:   [[VMIN_V3_I:%.*]] = bitcast <2 x float> [[VMIN_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VMIN_V3_I]] to <2 x float>
+// CHECK:   ret <2 x float> [[TMP2]]
 float32x2_t test_vmin_f32(float32x2_t a, float32x2_t b) {
   return vmin_f32(a, b);
 }
 
-// CHECK-LABEL: test_vminq_s8
-// CHECK: vmin.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vminq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VMINQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vmins.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VMINQ_V_I]]
 int8x16_t test_vminq_s8(int8x16_t a, int8x16_t b) {
   return vminq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vminq_s16
-// CHECK: vmin.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vminq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VMINQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VMINQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VMINQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmins.v8i16(<8 x i16> [[VMINQ_V_I]], <8 x i16> [[VMINQ_V1_I]]) #4
+// CHECK:   [[VMINQ_V3_I:%.*]] = bitcast <8 x i16> [[VMINQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VMINQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vminq_s16(int16x8_t a, int16x8_t b) {
   return vminq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vminq_s32
-// CHECK: vmin.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vminq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VMINQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VMINQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VMINQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> [[VMINQ_V_I]], <4 x i32> [[VMINQ_V1_I]]) #4
+// CHECK:   [[VMINQ_V3_I:%.*]] = bitcast <4 x i32> [[VMINQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VMINQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vminq_s32(int32x4_t a, int32x4_t b) {
   return vminq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vminq_u8
-// CHECK: vmin.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vminq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VMINQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vminu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VMINQ_V_I]]
 uint8x16_t test_vminq_u8(uint8x16_t a, uint8x16_t b) {
   return vminq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vminq_u16
-// CHECK: vmin.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vminq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VMINQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VMINQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VMINQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vminu.v8i16(<8 x i16> [[VMINQ_V_I]], <8 x i16> [[VMINQ_V1_I]]) #4
+// CHECK:   [[VMINQ_V3_I:%.*]] = bitcast <8 x i16> [[VMINQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VMINQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 uint16x8_t test_vminq_u16(uint16x8_t a, uint16x8_t b) {
   return vminq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vminq_u32
-// CHECK: vmin.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vminq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VMINQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VMINQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VMINQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> [[VMINQ_V_I]], <4 x i32> [[VMINQ_V1_I]]) #4
+// CHECK:   [[VMINQ_V3_I:%.*]] = bitcast <4 x i32> [[VMINQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VMINQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 uint32x4_t test_vminq_u32(uint32x4_t a, uint32x4_t b) {
   return vminq_u32(a, b);
 }
 
-// CHECK-LABEL: test_vminq_f32
-// CHECK: vmin.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x float> @test_vminq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[VMINQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VMINQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[VMINQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> [[VMINQ_V_I]], <4 x float> [[VMINQ_V1_I]]) #4
+// CHECK:   [[VMINQ_V3_I:%.*]] = bitcast <4 x float> [[VMINQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VMINQ_V3_I]] to <4 x float>
+// CHECK:   ret <4 x float> [[TMP2]]
 float32x4_t test_vminq_f32(float32x4_t a, float32x4_t b) {
   return vminq_f32(a, b);
 }
 
 
-// CHECK-LABEL: test_vmla_s8
-// CHECK: vmla.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vmla_s8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i8> %b, %c
+// CHECK:   [[ADD_I:%.*]] = add <8 x i8> %a, [[MUL_I]]
+// CHECK:   ret <8 x i8> [[ADD_I]]
 int8x8_t test_vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c) {
   return vmla_s8(a, b, c);
 }
 
-// CHECK-LABEL: test_vmla_s16
-// CHECK: vmla.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vmla_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, %c
+// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]]
+// CHECK:   ret <4 x i16> [[ADD_I]]
 int16x4_t test_vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
   return vmla_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmla_s32
-// CHECK: vmla.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vmla_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, %c
+// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]]
+// CHECK:   ret <2 x i32> [[ADD_I]]
 int32x2_t test_vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
   return vmla_s32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmla_f32
-// CHECK-SWIFT: vmul.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
-// CHECK-SWIFT: vadd.f32
-// CHECK-A57: vmla.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vmla_f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 {
+// CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %b, %c
+// CHECK:   [[ADD_I:%.*]] = fadd <2 x float> %a, [[MUL_I]]
+// CHECK:   ret <2 x float> [[ADD_I]]
 float32x2_t test_vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
   return vmla_f32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmla_u8
-// CHECK: vmla.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vmla_u8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i8> %b, %c
+// CHECK:   [[ADD_I:%.*]] = add <8 x i8> %a, [[MUL_I]]
+// CHECK:   ret <8 x i8> [[ADD_I]]
 uint8x8_t test_vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) {
   return vmla_u8(a, b, c);
 }
 
-// CHECK-LABEL: test_vmla_u16
-// CHECK: vmla.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vmla_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, %c
+// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]]
+// CHECK:   ret <4 x i16> [[ADD_I]]
 uint16x4_t test_vmla_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
   return vmla_u16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmla_u32
-// CHECK: vmla.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vmla_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, %c
+// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]]
+// CHECK:   ret <2 x i32> [[ADD_I]]
 uint32x2_t test_vmla_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
   return vmla_u32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlaq_s8
-// CHECK: vmla.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vmlaq_s8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <16 x i8> %b, %c
+// CHECK:   [[ADD_I:%.*]] = add <16 x i8> %a, [[MUL_I]]
+// CHECK:   ret <16 x i8> [[ADD_I]]
 int8x16_t test_vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) {
   return vmlaq_s8(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlaq_s16
-// CHECK: vmla.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vmlaq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, %c
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 int16x8_t test_vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) {
   return vmlaq_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlaq_s32
-// CHECK: vmla.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmlaq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, %c
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 int32x4_t test_vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) {
   return vmlaq_s32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlaq_f32
-// CHECK-SWIFT: vmul.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
-// CHECK-SWIFT: vadd.f32
-// CHECK-A57: vmla.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x float> @test_vmlaq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) #0 {
+// CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %b, %c
+// CHECK:   [[ADD_I:%.*]] = fadd <4 x float> %a, [[MUL_I]]
+// CHECK:   ret <4 x float> [[ADD_I]]
 float32x4_t test_vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {
   return vmlaq_f32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlaq_u8
-// CHECK: vmla.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vmlaq_u8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <16 x i8> %b, %c
+// CHECK:   [[ADD_I:%.*]] = add <16 x i8> %a, [[MUL_I]]
+// CHECK:   ret <16 x i8> [[ADD_I]]
 uint8x16_t test_vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) {
   return vmlaq_u8(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlaq_u16
-// CHECK: vmla.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vmlaq_u16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, %c
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 uint16x8_t test_vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) {
   return vmlaq_u16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlaq_u32
-// CHECK: vmla.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmlaq_u32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, %c
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 uint32x4_t test_vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) {
   return vmlaq_u32(a, b, c);
 }
 
 
-// CHECK-LABEL: test_vmlal_s8
-// CHECK: vmlal.s8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vmlal_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+// CHECK:   [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %b, <8 x i8> %c) #4
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 int16x8_t test_vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c) {
   return vmlal_s8(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlal_s16
-// CHECK: vmlal.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmlal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 int32x4_t test_vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
   return vmlal_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlal_s32
-// CHECK: vmlal.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vmlal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 int64x2_t test_vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
   return vmlal_s32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlal_u8
-// CHECK: vmlal.u8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vmlal_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+// CHECK:   [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %b, <8 x i8> %c) #4
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[VMULL_I_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 uint16x8_t test_vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) {
   return vmlal_u8(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlal_u16
-// CHECK: vmlal.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmlal_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 uint32x4_t test_vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
   return vmlal_u16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlal_u32
-// CHECK: vmlal.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vmlal_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 uint64x2_t test_vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
   return vmlal_u32(a, b, c);
 }
 
 
-// CHECK-LABEL: test_vmlal_lane_s16
-// CHECK: vmlal.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i32> @test_vmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
   return vmlal_lane_s16(a, b, c, 3);
 }
 
-// CHECK-LABEL: test_vmlal_lane_s32
-// CHECK: vmlal.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <2 x i64> @test_vmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4
+// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[ADD]]
 int64x2_t test_vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
   return vmlal_lane_s32(a, b, c, 1);
 }
 
-// CHECK-LABEL: test_vmlal_lane_u16
-// CHECK: vmlal.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i32> @test_vmlal_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[ADD]]
 uint32x4_t test_vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
   return vmlal_lane_u16(a, b, c, 3);
 }
 
-// CHECK-LABEL: test_vmlal_lane_u32
-// CHECK: vmlal.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <2 x i64> @test_vmlal_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4
+// CHECK:   [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[ADD]]
 uint64x2_t test_vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
   return vmlal_lane_u32(a, b, c, 1);
 }
 
 
-// CHECK-LABEL: test_vmlal_n_s16
-// CHECK: vmlal.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmlal_n_s16(<4 x i32> %a, <4 x i16> %b, i16 signext %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 int32x4_t test_vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
   return vmlal_n_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlal_n_s32
-// CHECK: vmlal.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vmlal_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 int64x2_t test_vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
   return vmlal_n_s32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlal_n_u16
-// CHECK: vmlal.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmlal_n_u16(<4 x i32> %a, <4 x i16> %b, i16 zeroext %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[VMULL2_I_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 uint32x4_t test_vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
   return vmlal_n_u16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlal_n_u32
-// CHECK: vmlal.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vmlal_n_u32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4
+// CHECK:   [[ADD_I:%.*]] = add <2 x i64> %a, [[VMULL2_I_I]]
+// CHECK:   ret <2 x i64> [[ADD_I]]
 uint64x2_t test_vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
   return vmlal_n_u32(a, b, c);
 }
 
 
-// CHECK-LABEL: test_vmla_lane_s16
-// CHECK: vmla.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i16> @test_vmla_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[ADD]]
 int16x4_t test_vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
   return vmla_lane_s16(a, b, c, 3);
 }
 
-// CHECK-LABEL: test_vmla_lane_s32
-// CHECK: vmla.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <2 x i32> @test_vmla_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[ADD]]
 int32x2_t test_vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
   return vmla_lane_s32(a, b, c, 1);
 }
 
-// CHECK-LABEL: test_vmla_lane_u16
-// CHECK: vmla.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i16> @test_vmla_lane_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[ADD]]
 uint16x4_t test_vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
   return vmla_lane_u16(a, b, c, 3);
 }
 
-// CHECK-LABEL: test_vmla_lane_u32
-// CHECK: vmla.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <2 x i32> @test_vmla_lane_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[ADD]]
 uint32x2_t test_vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
   return vmla_lane_u32(a, b, c, 1);
 }
 
-// CHECK-LABEL: test_vmla_lane_f32
-// CHECK-SWIFT: vmul.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
-// CHECK-SWIFT: vadd.f32
-// CHECK-A57: vmla.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <2 x float> @test_vmla_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %c, <2 x float> %c, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = fadd <2 x float> %a, [[MUL]]
+// CHECK:   ret <2 x float> [[ADD]]
 float32x2_t test_vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
   return vmla_lane_f32(a, b, c, 1);
 }
 
-// CHECK-LABEL: test_vmlaq_lane_s16
-// CHECK: vmla.i16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[ADD]]
 int16x8_t test_vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) {
   return vmlaq_lane_s16(a, b, c, 3);
 }
 
-// CHECK-LABEL: test_vmlaq_lane_s32
-// CHECK: vmla.i32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i32> @test_vmlaq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[ADD]]
 int32x4_t test_vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) {
   return vmlaq_lane_s32(a, b, c, 1);
 }
 
-// CHECK-LABEL: test_vmlaq_lane_u16
-// CHECK: vmla.i16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <8 x i16> @test_vmlaq_lane_u16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[ADD]]
 uint16x8_t test_vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t c) {
   return vmlaq_lane_u16(a, b, c, 3);
 }
 
-// CHECK-LABEL: test_vmlaq_lane_u32
-// CHECK: vmla.i32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i32> @test_vmlaq_lane_u32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = add <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[ADD]]
 uint32x4_t test_vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t c) {
   return vmlaq_lane_u32(a, b, c, 1);
 }
 
-// CHECK-LABEL: test_vmlaq_lane_f32
-// CHECK-SWIFT: vmul.f32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
-// CHECK-SWIFT: vadd.f32
-// CHECK-A57: vmla.f32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x float> @test_vmlaq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %c, <2 x float> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
+// CHECK:   [[ADD:%.*]] = fadd <4 x float> %a, [[MUL]]
+// CHECK:   ret <4 x float> [[ADD]]
 float32x4_t test_vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t c) {
   return vmlaq_lane_f32(a, b, c, 1);
 }
 
 
-// CHECK-LABEL: test_vmla_n_s16
-// CHECK: vmla.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vmla_n_s16(<4 x i16> %a, <4 x i16> %b, i16 signext %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
+// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]]
+// CHECK:   ret <4 x i16> [[ADD_I]]
 int16x4_t test_vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c) {
   return vmla_n_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmla_n_s32
-// CHECK: vmla.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vmla_n_s32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
+// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]]
+// CHECK:   ret <2 x i32> [[ADD_I]]
 int32x2_t test_vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c) {
   return vmla_n_s32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmla_n_u16
-// CHECK: vmla.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vmla_n_u16(<4 x i16> %a, <4 x i16> %b, i16 zeroext %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
+// CHECK:   [[ADD_I:%.*]] = add <4 x i16> %a, [[MUL_I]]
+// CHECK:   ret <4 x i16> [[ADD_I]]
 uint16x4_t test_vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) {
   return vmla_n_u16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmla_n_u32
-// CHECK: vmla.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vmla_n_u32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
+// CHECK:   [[ADD_I:%.*]] = add <2 x i32> %a, [[MUL_I]]
+// CHECK:   ret <2 x i32> [[ADD_I]]
 uint32x2_t test_vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) {
   return vmla_n_u32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmla_n_f32
-// CHECK-SWIFT: vmul.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
-// CHECK-SWIFT: vadd.f32
-// CHECK-A57: vmla.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vmla_n_f32(<2 x float> %a, <2 x float> %b, float %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %c, i32 1
+// CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %b, [[VECINIT1_I]]
+// CHECK:   [[ADD_I:%.*]] = fadd <2 x float> %a, [[MUL_I]]
+// CHECK:   ret <2 x float> [[ADD_I]]
 float32x2_t test_vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c) {
   return vmla_n_f32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlaq_n_s16
-// CHECK: vmla.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vmlaq_n_s16(<8 x i16> %a, <8 x i16> %b, i16 signext %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 int16x8_t test_vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
   return vmlaq_n_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlaq_n_s32
-// CHECK: vmla.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmlaq_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 int32x4_t test_vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
   return vmlaq_n_s32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlaq_n_u16
-// CHECK: vmla.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vmlaq_n_u16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
+// CHECK:   [[ADD_I:%.*]] = add <8 x i16> %a, [[MUL_I]]
+// CHECK:   ret <8 x i16> [[ADD_I]]
 uint16x8_t test_vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) {
   return vmlaq_n_u16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlaq_n_u32
-// CHECK: vmla.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmlaq_n_u32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
+// CHECK:   [[ADD_I:%.*]] = add <4 x i32> %a, [[MUL_I]]
+// CHECK:   ret <4 x i32> [[ADD_I]]
 uint32x4_t test_vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
   return vmlaq_n_u32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlaq_n_f32
-// CHECK-SWIFT: vmul.f32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[0]
-// CHECK-SWIFT: vadd.f32
-// CHECK-A57: vld1.32 {d{{[0-9]+}}[], d{{[0-9]+}}[]}, 
-// CHECK-A57: vmla.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x float> @test_vmlaq_n_f32(<4 x float> %a, <4 x float> %b, float %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %b, [[VECINIT3_I]]
+// CHECK:   [[ADD_I:%.*]] = fadd <4 x float> %a, [[MUL_I]]
+// CHECK:   ret <4 x float> [[ADD_I]]
 float32x4_t test_vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c) {
   return vmlaq_n_f32(a, b, c);
 }
 
 
-// CHECK-LABEL: test_vmls_s8
-// CHECK: vmls.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vmls_s8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i8> %b, %c
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i8> %a, [[MUL_I]]
+// CHECK:   ret <8 x i8> [[SUB_I]]
 int8x8_t test_vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c) {
   return vmls_s8(a, b, c);
 }
 
-// CHECK-LABEL: test_vmls_s16
-// CHECK: vmls.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vmls_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, %c
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
+// CHECK:   ret <4 x i16> [[SUB_I]]
 int16x4_t test_vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
   return vmls_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmls_s32
-// CHECK: vmls.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vmls_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, %c
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
+// CHECK:   ret <2 x i32> [[SUB_I]]
 int32x2_t test_vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
   return vmls_s32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmls_f32
-// CHECK-SWIFT: vmul.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
-// CHECK-SWIFT: vsub.f32
-// CHECK-A57: vmls.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vmls_f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 {
+// CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %b, %c
+// CHECK:   [[SUB_I:%.*]] = fsub <2 x float> %a, [[MUL_I]]
+// CHECK:   ret <2 x float> [[SUB_I]]
 float32x2_t test_vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
   return vmls_f32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmls_u8
-// CHECK: vmls.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vmls_u8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i8> %b, %c
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i8> %a, [[MUL_I]]
+// CHECK:   ret <8 x i8> [[SUB_I]]
 uint8x8_t test_vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) {
   return vmls_u8(a, b, c);
 }
 
-// CHECK-LABEL: test_vmls_u16
-// CHECK: vmls.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vmls_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, %c
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
+// CHECK:   ret <4 x i16> [[SUB_I]]
 uint16x4_t test_vmls_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
   return vmls_u16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmls_u32
-// CHECK: vmls.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vmls_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, %c
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
+// CHECK:   ret <2 x i32> [[SUB_I]]
 uint32x2_t test_vmls_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
   return vmls_u32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsq_s8
-// CHECK: vmls.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vmlsq_s8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <16 x i8> %b, %c
+// CHECK:   [[SUB_I:%.*]] = sub <16 x i8> %a, [[MUL_I]]
+// CHECK:   ret <16 x i8> [[SUB_I]]
 int8x16_t test_vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c) {
   return vmlsq_s8(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsq_s16
-// CHECK: vmls.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vmlsq_s16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, %c
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
+// CHECK:   ret <8 x i16> [[SUB_I]]
 int16x8_t test_vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c) {
   return vmlsq_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsq_s32
-// CHECK: vmls.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmlsq_s32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, %c
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 int32x4_t test_vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c) {
   return vmlsq_s32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsq_f32
-// CHECK-SWIFT: vmul.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
-// CHECK-SWIFT: vsub.f32
-// CHECK-A57: vmls.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x float> @test_vmlsq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) #0 {
+// CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %b, %c
+// CHECK:   [[SUB_I:%.*]] = fsub <4 x float> %a, [[MUL_I]]
+// CHECK:   ret <4 x float> [[SUB_I]]
 float32x4_t test_vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) {
   return vmlsq_f32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsq_u8
-// CHECK: vmls.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vmlsq_u8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <16 x i8> %b, %c
+// CHECK:   [[SUB_I:%.*]] = sub <16 x i8> %a, [[MUL_I]]
+// CHECK:   ret <16 x i8> [[SUB_I]]
 uint8x16_t test_vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) {
   return vmlsq_u8(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsq_u16
-// CHECK: vmls.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vmlsq_u16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, %c
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
+// CHECK:   ret <8 x i16> [[SUB_I]]
 uint16x8_t test_vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) {
   return vmlsq_u16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsq_u32
-// CHECK: vmls.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmlsq_u32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, %c
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 uint32x4_t test_vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) {
   return vmlsq_u32(a, b, c);
 }
 
 
-// CHECK-LABEL: test_vmlsl_s8
-// CHECK: vmlsl.s8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vmlsl_s8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+// CHECK:   [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %b, <8 x i8> %c) #4
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I]]
+// CHECK:   ret <8 x i16> [[SUB_I]]
 int16x8_t test_vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c) {
   return vmlsl_s8(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsl_s16
-// CHECK: vmlsl.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 int32x4_t test_vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
   return vmlsl_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsl_s32
-// CHECK: vmlsl.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
+// CHECK:   ret <2 x i64> [[SUB_I]]
 int64x2_t test_vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
   return vmlsl_s32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsl_u8
-// CHECK: vmlsl.u8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vmlsl_u8(<8 x i16> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+// CHECK:   [[VMULL_I_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %b, <8 x i8> %c) #4
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMULL_I_I]]
+// CHECK:   ret <8 x i16> [[SUB_I]]
 uint16x8_t test_vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) {
   return vmlsl_u8(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsl_u16
-// CHECK: vmlsl.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 uint32x4_t test_vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
   return vmlsl_u16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsl_u32
-// CHECK: vmlsl.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
+// CHECK:   ret <2 x i64> [[SUB_I]]
 uint64x2_t test_vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
   return vmlsl_u32(a, b, c);
 }
 
 
-// CHECK-LABEL: test_vmlsl_lane_s16
-// CHECK: vmlsl.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
   return vmlsl_lane_s16(a, b, c, 3);
 }
 
-// CHECK-LABEL: test_vmlsl_lane_s32
-// CHECK: vmlsl.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4
+// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[SUB]]
 int64x2_t test_vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
   return vmlsl_lane_s32(a, b, c, 1);
 }
 
-// CHECK-LABEL: test_vmlsl_lane_u16
-// CHECK: vmlsl.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_lane_u16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]]
+// CHECK:   ret <4 x i32> [[SUB]]
 uint32x4_t test_vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) {
   return vmlsl_lane_u16(a, b, c, 3);
 }
 
-// CHECK-LABEL: test_vmlsl_lane_u32
-// CHECK: vmlsl.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_lane_u32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4
+// CHECK:   [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]]
+// CHECK:   ret <2 x i64> [[SUB]]
 uint64x2_t test_vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) {
   return vmlsl_lane_u32(a, b, c, 1);
 }
 
 
-// CHECK-LABEL: test_vmlsl_n_s16
-// CHECK: vmlsl.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_n_s16(<4 x i32> %a, <4 x i16> %b, i16 signext %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 int32x4_t test_vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
   return vmlsl_n_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsl_n_s32
-// CHECK: vmlsl.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
+// CHECK:   ret <2 x i64> [[SUB_I]]
 int64x2_t test_vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
   return vmlsl_n_s32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsl_n_u16
-// CHECK: vmlsl.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmlsl_n_u16(<4 x i32> %a, <4 x i16> %b, i16 zeroext %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[VMULL_I_I]], <4 x i16> [[VMULL1_I_I]]) #4
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMULL2_I_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 uint32x4_t test_vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) {
   return vmlsl_n_u16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsl_n_u32
-// CHECK: vmlsl.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vmlsl_n_u32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VMULL_I_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[VMULL_I_I]], <2 x i32> [[VMULL1_I_I]]) #4
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMULL2_I_I]]
+// CHECK:   ret <2 x i64> [[SUB_I]]
 uint64x2_t test_vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) {
   return vmlsl_n_u32(a, b, c);
 }
 
 
-// CHECK-LABEL: test_vmls_lane_s16
-// CHECK: vmls.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i16> @test_vmls_lane_s16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[SUB]]
 int16x4_t test_vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) {
   return vmls_lane_s16(a, b, c, 3);
 }
 
-// CHECK-LABEL: test_vmls_lane_s32
-// CHECK: vmls.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <2 x i32> @test_vmls_lane_s32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[SUB]]
 int32x2_t test_vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) {
   return vmls_lane_s32(a, b, c, 1);
 }
 
-// CHECK-LABEL: test_vmls_lane_u16
-// CHECK: vmls.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i16> @test_vmls_lane_u16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]]
+// CHECK:   ret <4 x i16> [[SUB]]
 uint16x4_t test_vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) {
   return vmls_lane_u16(a, b, c, 3);
 }
 
-// CHECK-LABEL: test_vmls_lane_u32
-// CHECK: vmls.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <2 x i32> @test_vmls_lane_u32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]]
+// CHECK:   ret <2 x i32> [[SUB]]
 uint32x2_t test_vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) {
   return vmls_lane_u32(a, b, c, 1);
 }
 
-// CHECK-LABEL: test_vmls_lane_f32
-// CHECK-SWIFT: vmul.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
-// CHECK-SWIFT: vsub.f32
-// CHECK-A57: vmls.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <2 x float> @test_vmls_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %c, <2 x float> %c, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = fsub <2 x float> %a, [[MUL]]
+// CHECK:   ret <2 x float> [[SUB]]
 float32x2_t test_vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t c) {
   return vmls_lane_f32(a, b, c, 1);
 }
 
-// CHECK-LABEL: test_vmlsq_lane_s16
-// CHECK: vmls.i16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_s16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[SUB]]
 int16x8_t test_vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) {
   return vmlsq_lane_s16(a, b, c, 3);
 }
 
-// CHECK-LABEL: test_vmlsq_lane_s32
-// CHECK: vmls.i32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_s32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[SUB]]
 int32x4_t test_vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) {
   return vmlsq_lane_s32(a, b, c, 1);
 }
 
-// CHECK-LABEL: test_vmlsq_lane_u16
-// CHECK: vmls.i16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <8 x i16> @test_vmlsq_lane_u16(<8 x i16> %a, <8 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]]
+// CHECK:   ret <8 x i16> [[SUB]]
 uint16x8_t test_vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t c) {
   return vmlsq_lane_u16(a, b, c, 3);
 }
 
-// CHECK-LABEL: test_vmlsq_lane_u32
-// CHECK: vmls.i32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i32> @test_vmlsq_lane_u32(<4 x i32> %a, <4 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]]
+// CHECK:   ret <4 x i32> [[SUB]]
 uint32x4_t test_vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t c) {
   return vmlsq_lane_u32(a, b, c, 1);
 }
 
-// CHECK-LABEL: test_vmlsq_lane_f32
-// CHECK-SWIFT: vmul.f32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
-// CHECK-SWIFT: vsub.f32
-// CHECK-A57: vmls.f32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x float> @test_vmlsq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %c, <2 x float> %c, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]]
+// CHECK:   [[SUB:%.*]] = fsub <4 x float> %a, [[MUL]]
+// CHECK:   ret <4 x float> [[SUB]]
 float32x4_t test_vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t c) {
   return vmlsq_lane_f32(a, b, c, 1);
 }
 
 
-// CHECK-LABEL: test_vmls_n_s16
-// CHECK: vmls.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vmls_n_s16(<4 x i16> %a, <4 x i16> %b, i16 signext %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
+// CHECK:   ret <4 x i16> [[SUB_I]]
 int16x4_t test_vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c) {
   return vmls_n_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmls_n_s32
-// CHECK: vmls.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vmls_n_s32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
+// CHECK:   ret <2 x i32> [[SUB_I]]
 int32x2_t test_vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c) {
   return vmls_n_s32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmls_n_u16
-// CHECK: vmls.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vmls_n_u16(<4 x i16> %a, <4 x i16> %b, i16 zeroext %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %b, [[VECINIT3_I]]
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %a, [[MUL_I]]
+// CHECK:   ret <4 x i16> [[SUB_I]]
 uint16x4_t test_vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) {
   return vmls_n_u16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmls_n_u32
-// CHECK: vmls.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vmls_n_u32(<2 x i32> %a, <2 x i32> %b, i32 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %b, [[VECINIT1_I]]
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %a, [[MUL_I]]
+// CHECK:   ret <2 x i32> [[SUB_I]]
 uint32x2_t test_vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) {
   return vmls_n_u32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmls_n_f32
-// CHECK-SWIFT: vmul.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
-// CHECK-SWIFT: vsub.f32
-// CHECK-A57: vmls.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vmls_n_f32(<2 x float> %a, <2 x float> %b, float %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %c, i32 1
+// CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %b, [[VECINIT1_I]]
+// CHECK:   [[SUB_I:%.*]] = fsub <2 x float> %a, [[MUL_I]]
+// CHECK:   ret <2 x float> [[SUB_I]]
 float32x2_t test_vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c) {
   return vmls_n_f32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsq_n_s16
-// CHECK: vmls.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vmlsq_n_s16(<8 x i16> %a, <8 x i16> %b, i16 signext %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
+// CHECK:   ret <8 x i16> [[SUB_I]]
 int16x8_t test_vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c) {
   return vmlsq_n_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsq_n_s32
-// CHECK: vmls.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmlsq_n_s32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 int32x4_t test_vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) {
   return vmlsq_n_s32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsq_n_u16
-// CHECK: vmls.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vmlsq_n_u16(<8 x i16> %a, <8 x i16> %b, i16 zeroext %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %c, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %c, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %c, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %c, i32 7
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %b, [[VECINIT7_I]]
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[MUL_I]]
+// CHECK:   ret <8 x i16> [[SUB_I]]
 uint16x8_t test_vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) {
   return vmlsq_n_u16(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsq_n_u32
-// CHECK: vmls.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmlsq_n_u32(<4 x i32> %a, <4 x i32> %b, i32 %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %b, [[VECINIT3_I]]
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[MUL_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 uint32x4_t test_vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) {
   return vmlsq_n_u32(a, b, c);
 }
 
-// CHECK-LABEL: test_vmlsq_n_f32
-// CHECK-SWIFT: vmul.f32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[0]
-// CHECK-SWIFT: vsub.f32
-// CHECK-A57: vmls.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x float> @test_vmlsq_n_f32(<4 x float> %a, <4 x float> %b, float %c) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %c, i32 3
+// CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %b, [[VECINIT3_I]]
+// CHECK:   [[SUB_I:%.*]] = fsub <4 x float> %a, [[MUL_I]]
+// CHECK:   ret <4 x float> [[SUB_I]]
 float32x4_t test_vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c) {
   return vmlsq_n_f32(a, b, c);
 }
 
 
-// CHECK-LABEL: test_vmovl_s8
-// CHECK: vmovl.s8 q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vmovl_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VMOVL_I:%.*]] = sext <8 x i8> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[VMOVL_I]]
 int16x8_t test_vmovl_s8(int8x8_t a) {
   return vmovl_s8(a);
 }
 
-// CHECK-LABEL: test_vmovl_s16
-// CHECK: vmovl.s16 q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmovl_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMOVL_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VMOVL_I]]
 int32x4_t test_vmovl_s16(int16x4_t a) {
   return vmovl_s16(a);
 }
 
-// CHECK-LABEL: test_vmovl_s32
-// CHECK: vmovl.s32 q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vmovl_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMOVL_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VMOVL_I]]
 int64x2_t test_vmovl_s32(int32x2_t a) {
   return vmovl_s32(a);
 }
 
-// CHECK-LABEL: test_vmovl_u8
-// CHECK: vmovl.u8 q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vmovl_u8(<8 x i8> %a) #0 {
+// CHECK:   [[VMOVL_I:%.*]] = zext <8 x i8> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[VMOVL_I]]
 uint16x8_t test_vmovl_u8(uint8x8_t a) {
   return vmovl_u8(a);
 }
 
-// CHECK-LABEL: test_vmovl_u16
-// CHECK: vmovl.u16 q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmovl_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMOVL_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VMOVL_I]]
 uint32x4_t test_vmovl_u16(uint16x4_t a) {
   return vmovl_u16(a);
 }
 
-// CHECK-LABEL: test_vmovl_u32
-// CHECK: vmovl.u32 q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vmovl_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMOVL_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[VMOVL_I]]
 uint64x2_t test_vmovl_u32(uint32x2_t a) {
   return vmovl_u32(a);
 }
 
 
-// CHECK-LABEL: test_vmovn_s16
-// CHECK: vmovn.i16 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vmovn_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VMOVN_I:%.*]] = trunc <8 x i16> [[TMP1]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VMOVN_I]]
 int8x8_t test_vmovn_s16(int16x8_t a) {
   return vmovn_s16(a);
 }
 
-// CHECK-LABEL: test_vmovn_s32
-// CHECK: vmovn.i32 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vmovn_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VMOVN_I:%.*]] = trunc <4 x i32> [[TMP1]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VMOVN_I]]
 int16x4_t test_vmovn_s32(int32x4_t a) {
   return vmovn_s32(a);
 }
 
-// CHECK-LABEL: test_vmovn_s64
-// CHECK: vmovn.i64 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vmovn_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VMOVN_I:%.*]] = trunc <2 x i64> [[TMP1]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VMOVN_I]]
 int32x2_t test_vmovn_s64(int64x2_t a) {
   return vmovn_s64(a);
 }
 
-// CHECK-LABEL: test_vmovn_u16
-// CHECK: vmovn.i16 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vmovn_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VMOVN_I:%.*]] = trunc <8 x i16> [[TMP1]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VMOVN_I]]
 uint8x8_t test_vmovn_u16(uint16x8_t a) {
   return vmovn_u16(a);
 }
 
-// CHECK-LABEL: test_vmovn_u32
-// CHECK: vmovn.i32 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vmovn_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VMOVN_I:%.*]] = trunc <4 x i32> [[TMP1]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VMOVN_I]]
 uint16x4_t test_vmovn_u32(uint32x4_t a) {
   return vmovn_u32(a);
 }
 
-// CHECK-LABEL: test_vmovn_u64
-// CHECK: vmovn.i64 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vmovn_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VMOVN_I:%.*]] = trunc <2 x i64> [[TMP1]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VMOVN_I]]
 uint32x2_t test_vmovn_u64(uint64x2_t a) {
   return vmovn_u64(a);
 }
 
 
-// CHECK-LABEL: test_vmov_n_u8
-// CHECK: vmov {{r[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vmov_n_u8(i8 zeroext %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
+// CHECK:   ret <8 x i8> [[VECINIT7_I]]
 uint8x8_t test_vmov_n_u8(uint8_t a) {
   return vmov_n_u8(a);
 }
 
-// CHECK-LABEL: test_vmov_n_u16
-// CHECK: vmov {{r[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vmov_n_u16(i16 zeroext %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
+// CHECK:   ret <4 x i16> [[VECINIT3_I]]
 uint16x4_t test_vmov_n_u16(uint16_t a) {
   return vmov_n_u16(a);
 }
 
-// CHECK-LABEL: test_vmov_n_u32
-// CHECK: mov {{r[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vmov_n_u32(i32 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %a, i32 1
+// CHECK:   ret <2 x i32> [[VECINIT1_I]]
 uint32x2_t test_vmov_n_u32(uint32_t a) {
   return vmov_n_u32(a);
 }
 
-// CHECK-LABEL: test_vmov_n_s8
-// CHECK: vmov {{r[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vmov_n_s8(i8 signext %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
+// CHECK:   ret <8 x i8> [[VECINIT7_I]]
 int8x8_t test_vmov_n_s8(int8_t a) {
   return vmov_n_s8(a);
 }
 
-// CHECK-LABEL: test_vmov_n_s16
-// CHECK: vmov {{r[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vmov_n_s16(i16 signext %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
+// CHECK:   ret <4 x i16> [[VECINIT3_I]]
 int16x4_t test_vmov_n_s16(int16_t a) {
   return vmov_n_s16(a);
 }
 
-// CHECK-LABEL: test_vmov_n_s32
-// CHECK: mov {{r[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vmov_n_s32(i32 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %a, i32 1
+// CHECK:   ret <2 x i32> [[VECINIT1_I]]
 int32x2_t test_vmov_n_s32(int32_t a) {
   return vmov_n_s32(a);
 }
 
-// CHECK-LABEL: test_vmov_n_p8
-// CHECK: vmov {{r[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vmov_n_p8(i8 signext %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i8> undef, i8 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 %a, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 %a, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 %a, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 %a, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 %a, i32 7
+// CHECK:   ret <8 x i8> [[VECINIT7_I]]
 poly8x8_t test_vmov_n_p8(poly8_t a) {
   return vmov_n_p8(a);
 }
 
-// CHECK-LABEL: test_vmov_n_p16
-// CHECK: vmov {{r[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vmov_n_p16(i16 signext %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %a, i32 3
+// CHECK:   ret <4 x i16> [[VECINIT3_I]]
 poly16x4_t test_vmov_n_p16(poly16_t a) {
   return vmov_n_p16(a);
 }
 
-// CHECK-LABEL: test_vmov_n_f16
-// CHECK: vld1.16 {{{d[0-9]+\[\]}}}
+// CHECK-LABEL: define <4 x half> @test_vmov_n_f16(half* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = load half, half* %a, align 2
+// CHECK:   [[VECINIT:%.*]] = insertelement <4 x half> undef, half [[TMP0]], i32 0
+// CHECK:   [[VECINIT1:%.*]] = insertelement <4 x half> [[VECINIT]], half [[TMP0]], i32 1
+// CHECK:   [[VECINIT2:%.*]] = insertelement <4 x half> [[VECINIT1]], half [[TMP0]], i32 2
+// CHECK:   [[VECINIT3:%.*]] = insertelement <4 x half> [[VECINIT2]], half [[TMP0]], i32 3
+// CHECK:   ret <4 x half> [[VECINIT3]]
 float16x4_t test_vmov_n_f16(float16_t *a) {
   return vmov_n_f16(*a);
 }
 
-// CHECK-LABEL: test_vmov_n_f32
-// CHECK: mov {{r[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vmov_n_f32(float %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %a, i32 1
+// CHECK:   ret <2 x float> [[VECINIT1_I]]
 float32x2_t test_vmov_n_f32(float32_t a) {
   return vmov_n_f32(a);
 }
 
-// CHECK-LABEL: test_vmovq_n_u8
-// CHECK: vmov {{r[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vmovq_n_u8(i8 zeroext %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
+// CHECK:   [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
+// CHECK:   [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
+// CHECK:   [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
+// CHECK:   [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
+// CHECK:   [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
+// CHECK:   [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
+// CHECK:   [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
+// CHECK:   [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
+// CHECK:   ret <16 x i8> [[VECINIT15_I]]
 uint8x16_t test_vmovq_n_u8(uint8_t a) {
   return vmovq_n_u8(a);
 }
 
-// CHECK-LABEL: test_vmovq_n_u16
-// CHECK: vmov {{r[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vmovq_n_u16(i16 zeroext %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
+// CHECK:   ret <8 x i16> [[VECINIT7_I]]
 uint16x8_t test_vmovq_n_u16(uint16_t a) {
   return vmovq_n_u16(a);
 }
 
-// CHECK-LABEL: test_vmovq_n_u32
-// CHECK: vmov {{r[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmovq_n_u32(i32 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %a, i32 3
+// CHECK:   ret <4 x i32> [[VECINIT3_I]]
 uint32x4_t test_vmovq_n_u32(uint32_t a) {
   return vmovq_n_u32(a);
 }
 
-// CHECK-LABEL: test_vmovq_n_s8
-// CHECK: vmov {{r[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vmovq_n_s8(i8 signext %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
+// CHECK:   [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
+// CHECK:   [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
+// CHECK:   [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
+// CHECK:   [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
+// CHECK:   [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
+// CHECK:   [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
+// CHECK:   [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
+// CHECK:   [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
+// CHECK:   ret <16 x i8> [[VECINIT15_I]]
 int8x16_t test_vmovq_n_s8(int8_t a) {
   return vmovq_n_s8(a);
 }
 
-// CHECK-LABEL: test_vmovq_n_s16
-// CHECK: vmov {{r[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vmovq_n_s16(i16 signext %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
+// CHECK:   ret <8 x i16> [[VECINIT7_I]]
 int16x8_t test_vmovq_n_s16(int16_t a) {
   return vmovq_n_s16(a);
 }
 
-// CHECK-LABEL: test_vmovq_n_s32
-// CHECK: vmov {{r[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmovq_n_s32(i32 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %a, i32 3
+// CHECK:   ret <4 x i32> [[VECINIT3_I]]
 int32x4_t test_vmovq_n_s32(int32_t a) {
   return vmovq_n_s32(a);
 }
 
-// CHECK-LABEL: test_vmovq_n_p8
-// CHECK: vmov {{r[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vmovq_n_p8(i8 signext %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <16 x i8> undef, i8 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 %a, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 %a, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 %a, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 %a, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 %a, i32 7
+// CHECK:   [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 %a, i32 8
+// CHECK:   [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 %a, i32 9
+// CHECK:   [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 %a, i32 10
+// CHECK:   [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 %a, i32 11
+// CHECK:   [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 %a, i32 12
+// CHECK:   [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 %a, i32 13
+// CHECK:   [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 %a, i32 14
+// CHECK:   [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 %a, i32 15
+// CHECK:   ret <16 x i8> [[VECINIT15_I]]
 poly8x16_t test_vmovq_n_p8(poly8_t a) {
   return vmovq_n_p8(a);
 }
 
-// CHECK-LABEL: test_vmovq_n_p16
-// CHECK: vmov {{r[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vmovq_n_p16(i16 signext %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %a, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %a, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %a, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %a, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %a, i32 7
+// CHECK:   ret <8 x i16> [[VECINIT7_I]]
 poly16x8_t test_vmovq_n_p16(poly16_t a) {
   return vmovq_n_p16(a);
 }
 
-// CHECK-LABEL: test_vmovq_n_f16
-// CHECK: vld1.16 {{{d[0-9]+\[\], d[0-9]+\[\]}}}
+// CHECK-LABEL: define <8 x half> @test_vmovq_n_f16(half* %a) #0 {
+// CHECK:   [[TMP0:%.*]] = load half, half* %a, align 2
+// CHECK:   [[VECINIT:%.*]] = insertelement <8 x half> undef, half [[TMP0]], i32 0
+// CHECK:   [[VECINIT1:%.*]] = insertelement <8 x half> [[VECINIT]], half [[TMP0]], i32 1
+// CHECK:   [[VECINIT2:%.*]] = insertelement <8 x half> [[VECINIT1]], half [[TMP0]], i32 2
+// CHECK:   [[VECINIT3:%.*]] = insertelement <8 x half> [[VECINIT2]], half [[TMP0]], i32 3
+// CHECK:   [[VECINIT4:%.*]] = insertelement <8 x half> [[VECINIT3]], half [[TMP0]], i32 4
+// CHECK:   [[VECINIT5:%.*]] = insertelement <8 x half> [[VECINIT4]], half [[TMP0]], i32 5
+// CHECK:   [[VECINIT6:%.*]] = insertelement <8 x half> [[VECINIT5]], half [[TMP0]], i32 6
+// CHECK:   [[VECINIT7:%.*]] = insertelement <8 x half> [[VECINIT6]], half [[TMP0]], i32 7
+// CHECK:   ret <8 x half> [[VECINIT7]]
 float16x8_t test_vmovq_n_f16(float16_t *a) {
   return vmovq_n_f16(*a);
 }
 
-// CHECK-LABEL: test_vmovq_n_f32
-// CHECK: vmov {{r[0-9]+}}
+// CHECK-LABEL: define <4 x float> @test_vmovq_n_f32(float %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %a, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %a, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %a, i32 3
+// CHECK:   ret <4 x float> [[VECINIT3_I]]
 float32x4_t test_vmovq_n_f32(float32_t a) {
   return vmovq_n_f32(a);
 }
 
-// CHECK-LABEL: test_vmov_n_s64
-// CHECK: vmov.32 [[REG:d[0-9]+]][0], r0
-// CHECK: vmov.32 [[REG]][1], r1
+// CHECK-LABEL: define <1 x i64> @test_vmov_n_s64(i64 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <1 x i64> undef, i64 %a, i32 0
+// CHECK:   [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]]
+// CHECK:   ret <1 x i64> [[ADD_I]]
 int64x1_t test_vmov_n_s64(int64_t a) {
   int64x1_t tmp = vmov_n_s64(a);
   return vadd_s64(tmp, tmp);
 }
 
-// CHECK-LABEL: test_vmov_n_u64
-// CHECK: vmov.32 [[REG:d[0-9]+]][0], r0
-// CHECK: vmov.32 [[REG]][1], r1
+// CHECK-LABEL: define <1 x i64> @test_vmov_n_u64(i64 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <1 x i64> undef, i64 %a, i32 0
+// CHECK:   [[ADD_I:%.*]] = add <1 x i64> [[VECINIT_I]], [[VECINIT_I]]
+// CHECK:   ret <1 x i64> [[ADD_I]]
 uint64x1_t test_vmov_n_u64(uint64_t a) {
   uint64x1_t tmp = vmov_n_u64(a);
   return vadd_u64(tmp, tmp);
 }
 
-// CHECK-LABEL: test_vmovq_n_s64
-// CHECK: vmov {{r[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vmovq_n_s64(i64 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i64> undef, i64 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1
+// CHECK:   ret <2 x i64> [[VECINIT1_I]]
 int64x2_t test_vmovq_n_s64(int64_t a) {
   return vmovq_n_s64(a);
 }
 
-// CHECK-LABEL: test_vmovq_n_u64
-// CHECK: vmov {{r[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vmovq_n_u64(i64 %a) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i64> undef, i64 %a, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i64> [[VECINIT_I]], i64 %a, i32 1
+// CHECK:   ret <2 x i64> [[VECINIT1_I]]
 uint64x2_t test_vmovq_n_u64(uint64_t a) {
   return vmovq_n_u64(a);
 }
 
 
-// CHECK-LABEL: test_vmul_s8
-// CHECK: vmul.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vmul_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i8> %a, %b
+// CHECK:   ret <8 x i8> [[MUL_I]]
 int8x8_t test_vmul_s8(int8x8_t a, int8x8_t b) {
   return vmul_s8(a, b);
 }
 
-// CHECK-LABEL: test_vmul_s16
-// CHECK: vmul.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vmul_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %a, %b
+// CHECK:   ret <4 x i16> [[MUL_I]]
 int16x4_t test_vmul_s16(int16x4_t a, int16x4_t b) {
   return vmul_s16(a, b);
 }
 
-// CHECK-LABEL: test_vmul_s32
-// CHECK: vmul.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vmul_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %a, %b
+// CHECK:   ret <2 x i32> [[MUL_I]]
 int32x2_t test_vmul_s32(int32x2_t a, int32x2_t b) {
   return vmul_s32(a, b);
 }
 
-// CHECK-LABEL: test_vmul_f32
-// CHECK: vmul.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vmul_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %a, %b
+// CHECK:   ret <2 x float> [[MUL_I]]
 float32x2_t test_vmul_f32(float32x2_t a, float32x2_t b) {
   return vmul_f32(a, b);
 }
 
-// CHECK-LABEL: test_vmul_u8
-// CHECK: vmul.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vmul_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i8> %a, %b
+// CHECK:   ret <8 x i8> [[MUL_I]]
 uint8x8_t test_vmul_u8(uint8x8_t a, uint8x8_t b) {
   return vmul_u8(a, b);
 }
 
-// CHECK-LABEL: test_vmul_u16
-// CHECK: vmul.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vmul_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %a, %b
+// CHECK:   ret <4 x i16> [[MUL_I]]
 uint16x4_t test_vmul_u16(uint16x4_t a, uint16x4_t b) {
   return vmul_u16(a, b);
 }
 
-// CHECK-LABEL: test_vmul_u32
-// CHECK: vmul.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vmul_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %a, %b
+// CHECK:   ret <2 x i32> [[MUL_I]]
 uint32x2_t test_vmul_u32(uint32x2_t a, uint32x2_t b) {
   return vmul_u32(a, b);
 }
 
-// CHECK-LABEL: test_vmulq_s8
-// CHECK: vmul.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vmulq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <16 x i8> %a, %b
+// CHECK:   ret <16 x i8> [[MUL_I]]
 int8x16_t test_vmulq_s8(int8x16_t a, int8x16_t b) {
   return vmulq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vmulq_s16
-// CHECK: vmul.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vmulq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %a, %b
+// CHECK:   ret <8 x i16> [[MUL_I]]
 int16x8_t test_vmulq_s16(int16x8_t a, int16x8_t b) {
   return vmulq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vmulq_s32
-// CHECK: vmul.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmulq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %a, %b
+// CHECK:   ret <4 x i32> [[MUL_I]]
 int32x4_t test_vmulq_s32(int32x4_t a, int32x4_t b) {
   return vmulq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vmulq_f32
-// CHECK: vmul.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x float> @test_vmulq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %a, %b
+// CHECK:   ret <4 x float> [[MUL_I]]
 float32x4_t test_vmulq_f32(float32x4_t a, float32x4_t b) {
   return vmulq_f32(a, b);
 }
 
-// CHECK-LABEL: test_vmulq_u8
-// CHECK: vmul.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vmulq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <16 x i8> %a, %b
+// CHECK:   ret <16 x i8> [[MUL_I]]
 uint8x16_t test_vmulq_u8(uint8x16_t a, uint8x16_t b) {
   return vmulq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vmulq_u16
-// CHECK: vmul.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vmulq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %a, %b
+// CHECK:   ret <8 x i16> [[MUL_I]]
 uint16x8_t test_vmulq_u16(uint16x8_t a, uint16x8_t b) {
   return vmulq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vmulq_u32
-// CHECK: vmul.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmulq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %a, %b
+// CHECK:   ret <4 x i32> [[MUL_I]]
 uint32x4_t test_vmulq_u32(uint32x4_t a, uint32x4_t b) {
   return vmulq_u32(a, b);
 }
 
 
-// CHECK-LABEL: test_vmull_s8
-// CHECK: vmull.s8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vmull_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VMULL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i16> [[VMULL_I]]
 int16x8_t test_vmull_s8(int8x8_t a, int8x8_t b) {
   return vmull_s8(a, b);
 }
 
-// CHECK-LABEL: test_vmull_s16
-// CHECK: vmull.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmull_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 int32x4_t test_vmull_s16(int16x4_t a, int16x4_t b) {
   return vmull_s16(a, b);
 }
 
-// CHECK-LABEL: test_vmull_s32
-// CHECK: vmull.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vmull_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 int64x2_t test_vmull_s32(int32x2_t a, int32x2_t b) {
   return vmull_s32(a, b);
 }
 
-// CHECK-LABEL: test_vmull_u8
-// CHECK: vmull.u8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vmull_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VMULL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i16> [[VMULL_I]]
 uint16x8_t test_vmull_u8(uint8x8_t a, uint8x8_t b) {
   return vmull_u8(a, b);
 }
 
-// CHECK-LABEL: test_vmull_u16
-// CHECK: vmull.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmull_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 uint32x4_t test_vmull_u16(uint16x4_t a, uint16x4_t b) {
   return vmull_u16(a, b);
 }
 
-// CHECK-LABEL: test_vmull_u32
-// CHECK: vmull.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vmull_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 uint64x2_t test_vmull_u32(uint32x2_t a, uint32x2_t b) {
   return vmull_u32(a, b);
 }
 
-// CHECK-LABEL: test_vmull_p8
-// CHECK: vmull.p8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vmull_p8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VMULL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i16> [[VMULL_I]]
 poly16x8_t test_vmull_p8(poly8x8_t a, poly8x8_t b) {
   return vmull_p8(a, b);
 }
 
 
-// CHECK-LABEL: test_vmull_lane_s16
-// CHECK: vmull.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i32> @test_vmull_lane_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t b) {
   return vmull_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vmull_lane_s32
-// CHECK: vmull.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <2 x i64> @test_vmull_lane_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t b) {
   return vmull_lane_s32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vmull_lane_u16
-// CHECK: vmull.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i32> @test_vmull_lane_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL1_I]]) #4
+// CHECK:   ret <4 x i32> [[VMULL2_I]]
 uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t b) {
   return vmull_lane_u16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vmull_lane_u32
-// CHECK: vmull.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <2 x i64> @test_vmull_lane_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL1_I]]) #4
+// CHECK:   ret <2 x i64> [[VMULL2_I]]
 uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t b) {
   return vmull_lane_u32(a, b, 1);
 }
 
 
-// CHECK-LABEL: test_vmull_n_s16
-// CHECK: vmull.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmull_n_s16(<4 x i16> %a, i16 signext %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL4_I]]) #4
+// CHECK:   ret <4 x i32> [[VMULL5_I]]
 int32x4_t test_vmull_n_s16(int16x4_t a, int16_t b) {
   return vmull_n_s16(a, b);
 }
 
-// CHECK-LABEL: test_vmull_n_s32
-// CHECK: vmull.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vmull_n_s32(<2 x i32> %a, i32 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL2_I]]) #4
+// CHECK:   ret <2 x i64> [[VMULL3_I]]
 int64x2_t test_vmull_n_s32(int32x2_t a, int32_t b) {
   return vmull_n_s32(a, b);
 }
 
-// CHECK-LABEL: test_vmull_n_u16
-// CHECK: vmull.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmull_n_u16(<4 x i16> %a, i16 zeroext %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMULL4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VMULL5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[VMULL_I]], <4 x i16> [[VMULL4_I]]) #4
+// CHECK:   ret <4 x i32> [[VMULL5_I]]
 uint32x4_t test_vmull_n_u16(uint16x4_t a, uint16_t b) {
   return vmull_n_u16(a, b);
 }
 
-// CHECK-LABEL: test_vmull_n_u32
-// CHECK: vmull.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vmull_n_u32(<2 x i32> %a, i32 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VMULL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMULL2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VMULL3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[VMULL_I]], <2 x i32> [[VMULL2_I]]) #4
+// CHECK:   ret <2 x i64> [[VMULL3_I]]
 uint64x2_t test_vmull_n_u32(uint32x2_t a, uint32_t b) {
   return vmull_n_u32(a, b);
 }
 
 
-// CHECK-LABEL: test_vmul_p8
-// CHECK: vmul.p8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vmul_p8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VMUL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vmulp.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VMUL_V_I]]
 poly8x8_t test_vmul_p8(poly8x8_t a, poly8x8_t b) {
   return vmul_p8(a, b);
 }
 
-// CHECK-LABEL: test_vmulq_p8
-// CHECK: vmul.p8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vmulq_p8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VMULQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vmulp.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VMULQ_V_I]]
 poly8x16_t test_vmulq_p8(poly8x16_t a, poly8x16_t b) {
   return vmulq_p8(a, b);
 }
 
 
-// CHECK-LABEL: test_vmul_lane_s16
-// CHECK: vmul.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i16> @test_vmul_lane_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i16> [[MUL]]
 int16x4_t test_vmul_lane_s16(int16x4_t a, int16x4_t b) {
   return vmul_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vmul_lane_s32
-// CHECK: vmul.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <2 x i32> @test_vmul_lane_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <2 x i32> [[MUL]]
 int32x2_t test_vmul_lane_s32(int32x2_t a, int32x2_t b) {
   return vmul_lane_s32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vmul_lane_f32
-// CHECK: vmul.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <2 x float> @test_vmul_lane_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %b, <2 x float> %b, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]]
+// CHECK:   ret <2 x float> [[MUL]]
 float32x2_t test_vmul_lane_f32(float32x2_t a, float32x2_t b) {
   return vmul_lane_f32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vmul_lane_u16
-// CHECK: vmul.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i16> @test_vmul_lane_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i16> [[MUL]]
 uint16x4_t test_vmul_lane_u16(uint16x4_t a, uint16x4_t b) {
   return vmul_lane_u16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vmul_lane_u32
-// CHECK: vmul.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <2 x i32> @test_vmul_lane_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <2 x i32> [[MUL]]
 uint32x2_t test_vmul_lane_u32(uint32x2_t a, uint32x2_t b) {
   return vmul_lane_u32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vmulq_lane_s16
-// CHECK: vmul.i16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <8 x i16> @test_vmulq_lane_s16(<8 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <8 x i16> [[MUL]]
 int16x8_t test_vmulq_lane_s16(int16x8_t a, int16x4_t b) {
   return vmulq_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vmulq_lane_s32
-// CHECK: vmul.i32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i32> @test_vmulq_lane_s32(<4 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i32> [[MUL]]
 int32x4_t test_vmulq_lane_s32(int32x4_t a, int32x2_t b) {
   return vmulq_lane_s32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vmulq_lane_f32
-// CHECK: vmul.f32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x float> @test_vmulq_lane_f32(<4 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x float> %b, <2 x float> %b, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x float> [[MUL]]
 float32x4_t test_vmulq_lane_f32(float32x4_t a, float32x2_t b) {
   return vmulq_lane_f32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vmulq_lane_u16
-// CHECK: vmul.i16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <8 x i16> @test_vmulq_lane_u16(<8 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]]
+// CHECK:   ret <8 x i16> [[MUL]]
 uint16x8_t test_vmulq_lane_u16(uint16x8_t a, uint16x4_t b) {
   return vmulq_lane_u16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vmulq_lane_u32
-// CHECK: vmul.i32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i32> @test_vmulq_lane_u32(<4 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]]
+// CHECK:   ret <4 x i32> [[MUL]]
 uint32x4_t test_vmulq_lane_u32(uint32x4_t a, uint32x2_t b) {
   return vmulq_lane_u32(a, b, 1);
 }
 
 
-// CHECK-LABEL: test_vmul_n_s16
-// CHECK: vmul.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vmul_n_s16(<4 x i16> %a, i16 signext %b) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %a, [[VECINIT3_I]]
+// CHECK:   ret <4 x i16> [[MUL_I]]
 int16x4_t test_vmul_n_s16(int16x4_t a, int16_t b) {
   return vmul_n_s16(a, b);
 }
 
-// CHECK-LABEL: test_vmul_n_s32
-// CHECK: vmul.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vmul_n_s32(<2 x i32> %a, i32 %b) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %a, [[VECINIT1_I]]
+// CHECK:   ret <2 x i32> [[MUL_I]]
 int32x2_t test_vmul_n_s32(int32x2_t a, int32_t b) {
   return vmul_n_s32(a, b);
 }
 
-// CHECK-LABEL: test_vmul_n_f32
-// CHECK: vmul.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vmul_n_f32(<2 x float> %a, float %b) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x float> undef, float %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x float> [[VECINIT_I]], float %b, i32 1
+// CHECK:   [[MUL_I:%.*]] = fmul <2 x float> %a, [[VECINIT1_I]]
+// CHECK:   ret <2 x float> [[MUL_I]]
 float32x2_t test_vmul_n_f32(float32x2_t a, float32_t b) {
   return vmul_n_f32(a, b);
 }
 
-// CHECK-LABEL: test_vmul_n_u16
-// CHECK: vmul.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vmul_n_u16(<4 x i16> %a, i16 zeroext %b) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i16> %a, [[VECINIT3_I]]
+// CHECK:   ret <4 x i16> [[MUL_I]]
 uint16x4_t test_vmul_n_u16(uint16x4_t a, uint16_t b) {
   return vmul_n_u16(a, b);
 }
 
-// CHECK-LABEL: test_vmul_n_u32
-// CHECK: vmul.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vmul_n_u32(<2 x i32> %a, i32 %b) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[MUL_I:%.*]] = mul <2 x i32> %a, [[VECINIT1_I]]
+// CHECK:   ret <2 x i32> [[MUL_I]]
 uint32x2_t test_vmul_n_u32(uint32x2_t a, uint32_t b) {
   return vmul_n_u32(a, b);
 }
 
-// CHECK-LABEL: test_vmulq_n_s16
-// CHECK: vmul.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vmulq_n_s16(<8 x i16> %a, i16 signext %b) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %a, [[VECINIT7_I]]
+// CHECK:   ret <8 x i16> [[MUL_I]]
 int16x8_t test_vmulq_n_s16(int16x8_t a, int16_t b) {
   return vmulq_n_s16(a, b);
 }
 
-// CHECK-LABEL: test_vmulq_n_s32
-// CHECK: vmul.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmulq_n_s32(<4 x i32> %a, i32 %b) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %a, [[VECINIT3_I]]
+// CHECK:   ret <4 x i32> [[MUL_I]]
 int32x4_t test_vmulq_n_s32(int32x4_t a, int32_t b) {
   return vmulq_n_s32(a, b);
 }
 
-// CHECK-LABEL: test_vmulq_n_f32
-// CHECK: vmul.f32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[0]
+// CHECK-LABEL: define <4 x float> @test_vmulq_n_f32(<4 x float> %a, float %b) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x float> [[VECINIT_I]], float %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x float> [[VECINIT1_I]], float %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x float> [[VECINIT2_I]], float %b, i32 3
+// CHECK:   [[MUL_I:%.*]] = fmul <4 x float> %a, [[VECINIT3_I]]
+// CHECK:   ret <4 x float> [[MUL_I]]
 float32x4_t test_vmulq_n_f32(float32x4_t a, float32_t b) {
   return vmulq_n_f32(a, b);
 }
 
-// CHECK-LABEL: test_vmulq_n_u16
-// CHECK: vmul.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vmulq_n_u16(<8 x i16> %a, i16 zeroext %b) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
+// CHECK:   [[MUL_I:%.*]] = mul <8 x i16> %a, [[VECINIT7_I]]
+// CHECK:   ret <8 x i16> [[MUL_I]]
 uint16x8_t test_vmulq_n_u16(uint16x8_t a, uint16_t b) {
   return vmulq_n_u16(a, b);
 }
 
-// CHECK-LABEL: test_vmulq_n_u32
-// CHECK: vmul.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmulq_n_u32(<4 x i32> %a, i32 %b) #0 {
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
+// CHECK:   [[MUL_I:%.*]] = mul <4 x i32> %a, [[VECINIT3_I]]
+// CHECK:   ret <4 x i32> [[MUL_I]]
 uint32x4_t test_vmulq_n_u32(uint32x4_t a, uint32_t b) {
   return vmulq_n_u32(a, b);
 }
 
 
-// CHECK-LABEL: test_vmvn_s8
-// CHECK: vmvn d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vmvn_s8(<8 x i8> %a) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   ret <8 x i8> [[NEG_I]]
 int8x8_t test_vmvn_s8(int8x8_t a) {
   return vmvn_s8(a);
 }
 
-// CHECK-LABEL: test_vmvn_s16
-// CHECK: vmvn d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vmvn_s16(<4 x i16> %a) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   ret <4 x i16> [[NEG_I]]
 int16x4_t test_vmvn_s16(int16x4_t a) {
   return vmvn_s16(a);
 }
 
-// CHECK-LABEL: test_vmvn_s32
-// CHECK: vmvn d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vmvn_s32(<2 x i32> %a) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i32> %a, <i32 -1, i32 -1>
+// CHECK:   ret <2 x i32> [[NEG_I]]
 int32x2_t test_vmvn_s32(int32x2_t a) {
   return vmvn_s32(a);
 }
 
-// CHECK-LABEL: test_vmvn_u8
-// CHECK: vmvn d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vmvn_u8(<8 x i8> %a) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   ret <8 x i8> [[NEG_I]]
 uint8x8_t test_vmvn_u8(uint8x8_t a) {
   return vmvn_u8(a);
 }
 
-// CHECK-LABEL: test_vmvn_u16
-// CHECK: vmvn d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vmvn_u16(<4 x i16> %a) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   ret <4 x i16> [[NEG_I]]
 uint16x4_t test_vmvn_u16(uint16x4_t a) {
   return vmvn_u16(a);
 }
 
-// CHECK-LABEL: test_vmvn_u32
-// CHECK: vmvn d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vmvn_u32(<2 x i32> %a) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i32> %a, <i32 -1, i32 -1>
+// CHECK:   ret <2 x i32> [[NEG_I]]
 uint32x2_t test_vmvn_u32(uint32x2_t a) {
   return vmvn_u32(a);
 }
 
-// CHECK-LABEL: test_vmvn_p8
-// CHECK: vmvn d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vmvn_p8(<8 x i8> %a) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   ret <8 x i8> [[NEG_I]]
 poly8x8_t test_vmvn_p8(poly8x8_t a) {
   return vmvn_p8(a);
 }
 
-// CHECK-LABEL: test_vmvnq_s8
-// CHECK: vmvn q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vmvnq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   ret <16 x i8> [[NEG_I]]
 int8x16_t test_vmvnq_s8(int8x16_t a) {
   return vmvnq_s8(a);
 }
 
-// CHECK-LABEL: test_vmvnq_s16
-// CHECK: vmvn q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vmvnq_s16(<8 x i16> %a) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   ret <8 x i16> [[NEG_I]]
 int16x8_t test_vmvnq_s16(int16x8_t a) {
   return vmvnq_s16(a);
 }
 
-// CHECK-LABEL: test_vmvnq_s32
-// CHECK: vmvn q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmvnq_s32(<4 x i32> %a) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK:   ret <4 x i32> [[NEG_I]]
 int32x4_t test_vmvnq_s32(int32x4_t a) {
   return vmvnq_s32(a);
 }
 
-// CHECK-LABEL: test_vmvnq_u8
-// CHECK: vmvn q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vmvnq_u8(<16 x i8> %a) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   ret <16 x i8> [[NEG_I]]
 uint8x16_t test_vmvnq_u8(uint8x16_t a) {
   return vmvnq_u8(a);
 }
 
-// CHECK-LABEL: test_vmvnq_u16
-// CHECK: vmvn q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vmvnq_u16(<8 x i16> %a) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i16> %a, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   ret <8 x i16> [[NEG_I]]
 uint16x8_t test_vmvnq_u16(uint16x8_t a) {
   return vmvnq_u16(a);
 }
 
-// CHECK-LABEL: test_vmvnq_u32
-// CHECK: vmvn q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vmvnq_u32(<4 x i32> %a) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK:   ret <4 x i32> [[NEG_I]]
 uint32x4_t test_vmvnq_u32(uint32x4_t a) {
   return vmvnq_u32(a);
 }
 
-// CHECK-LABEL: test_vmvnq_p8
-// CHECK: vmvn q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vmvnq_p8(<16 x i8> %a) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %a, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   ret <16 x i8> [[NEG_I]]
 poly8x16_t test_vmvnq_p8(poly8x16_t a) {
   return vmvnq_p8(a);
 }
 
 
-// CHECK-LABEL: test_vneg_s8
-// CHECK: vneg.s8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vneg_s8(<8 x i8> %a) #0 {
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i8> zeroinitializer, %a
+// CHECK:   ret <8 x i8> [[SUB_I]]
 int8x8_t test_vneg_s8(int8x8_t a) {
   return vneg_s8(a);
 }
 
-// CHECK-LABEL: test_vneg_s16
-// CHECK: vneg.s16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vneg_s16(<4 x i16> %a) #0 {
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> zeroinitializer, %a
+// CHECK:   ret <4 x i16> [[SUB_I]]
 int16x4_t test_vneg_s16(int16x4_t a) {
   return vneg_s16(a);
 }
 
-// CHECK-LABEL: test_vneg_s32
-// CHECK: vneg.s32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vneg_s32(<2 x i32> %a) #0 {
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> zeroinitializer, %a
+// CHECK:   ret <2 x i32> [[SUB_I]]
 int32x2_t test_vneg_s32(int32x2_t a) {
   return vneg_s32(a);
 }
 
-// CHECK-LABEL: test_vneg_f32
-// CHECK: vneg.f32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vneg_f32(<2 x float> %a) #0 {
+// CHECK:   [[SUB_I:%.*]] = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %a
+// CHECK:   ret <2 x float> [[SUB_I]]
 float32x2_t test_vneg_f32(float32x2_t a) {
   return vneg_f32(a);
 }
 
-// CHECK-LABEL: test_vnegq_s8
-// CHECK: vneg.s8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vnegq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[SUB_I:%.*]] = sub <16 x i8> zeroinitializer, %a
+// CHECK:   ret <16 x i8> [[SUB_I]]
 int8x16_t test_vnegq_s8(int8x16_t a) {
   return vnegq_s8(a);
 }
 
-// CHECK-LABEL: test_vnegq_s16
-// CHECK: vneg.s16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vnegq_s16(<8 x i16> %a) #0 {
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> zeroinitializer, %a
+// CHECK:   ret <8 x i16> [[SUB_I]]
 int16x8_t test_vnegq_s16(int16x8_t a) {
   return vnegq_s16(a);
 }
 
-// CHECK-LABEL: test_vnegq_s32
-// CHECK: vneg.s32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vnegq_s32(<4 x i32> %a) #0 {
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> zeroinitializer, %a
+// CHECK:   ret <4 x i32> [[SUB_I]]
 int32x4_t test_vnegq_s32(int32x4_t a) {
   return vnegq_s32(a);
 }
 
-// CHECK-LABEL: test_vnegq_f32
-// CHECK: vneg.f32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x float> @test_vnegq_f32(<4 x float> %a) #0 {
+// CHECK:   [[SUB_I:%.*]] = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a
+// CHECK:   ret <4 x float> [[SUB_I]]
 float32x4_t test_vnegq_f32(float32x4_t a) {
   return vnegq_f32(a);
 }
 
 
-// CHECK-LABEL: test_vorn_s8
-// CHECK: vorn d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vorn_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[OR_I:%.*]] = or <8 x i8> %a, [[NEG_I]]
+// CHECK:   ret <8 x i8> [[OR_I]]
 int8x8_t test_vorn_s8(int8x8_t a, int8x8_t b) {
   return vorn_s8(a, b);
 }
 
-// CHECK-LABEL: test_vorn_s16
-// CHECK: vorn d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vorn_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[OR_I:%.*]] = or <4 x i16> %a, [[NEG_I]]
+// CHECK:   ret <4 x i16> [[OR_I]]
 int16x4_t test_vorn_s16(int16x4_t a, int16x4_t b) {
   return vorn_s16(a, b);
 }
 
-// CHECK-LABEL: test_vorn_s32
-// CHECK: vorn d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vorn_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i32> %b, <i32 -1, i32 -1>
+// CHECK:   [[OR_I:%.*]] = or <2 x i32> %a, [[NEG_I]]
+// CHECK:   ret <2 x i32> [[OR_I]]
 int32x2_t test_vorn_s32(int32x2_t a, int32x2_t b) {
   return vorn_s32(a, b);
 }
 
-// CHECK-LABEL: test_vorn_s64
-// CHECK: vorn d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vorn_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <1 x i64> %b, <i64 -1>
+// CHECK:   [[OR_I:%.*]] = or <1 x i64> %a, [[NEG_I]]
+// CHECK:   ret <1 x i64> [[OR_I]]
 int64x1_t test_vorn_s64(int64x1_t a, int64x1_t b) {
   return vorn_s64(a, b);
 }
 
-// CHECK-LABEL: test_vorn_u8
-// CHECK: vorn d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vorn_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[OR_I:%.*]] = or <8 x i8> %a, [[NEG_I]]
+// CHECK:   ret <8 x i8> [[OR_I]]
 uint8x8_t test_vorn_u8(uint8x8_t a, uint8x8_t b) {
   return vorn_u8(a, b);
 }
 
-// CHECK-LABEL: test_vorn_u16
-// CHECK: vorn d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vorn_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[OR_I:%.*]] = or <4 x i16> %a, [[NEG_I]]
+// CHECK:   ret <4 x i16> [[OR_I]]
 uint16x4_t test_vorn_u16(uint16x4_t a, uint16x4_t b) {
   return vorn_u16(a, b);
 }
 
-// CHECK-LABEL: test_vorn_u32
-// CHECK: vorn d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vorn_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i32> %b, <i32 -1, i32 -1>
+// CHECK:   [[OR_I:%.*]] = or <2 x i32> %a, [[NEG_I]]
+// CHECK:   ret <2 x i32> [[OR_I]]
 uint32x2_t test_vorn_u32(uint32x2_t a, uint32x2_t b) {
   return vorn_u32(a, b);
 }
 
-// CHECK-LABEL: test_vorn_u64
-// CHECK: vorn d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vorn_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <1 x i64> %b, <i64 -1>
+// CHECK:   [[OR_I:%.*]] = or <1 x i64> %a, [[NEG_I]]
+// CHECK:   ret <1 x i64> [[OR_I]]
 uint64x1_t test_vorn_u64(uint64x1_t a, uint64x1_t b) {
   return vorn_u64(a, b);
 }
 
-// CHECK-LABEL: test_vornq_s8
-// CHECK: vorn q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vornq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[OR_I:%.*]] = or <16 x i8> %a, [[NEG_I]]
+// CHECK:   ret <16 x i8> [[OR_I]]
 int8x16_t test_vornq_s8(int8x16_t a, int8x16_t b) {
   return vornq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vornq_s16
-// CHECK: vorn q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vornq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[OR_I:%.*]] = or <8 x i16> %a, [[NEG_I]]
+// CHECK:   ret <8 x i16> [[OR_I]]
 int16x8_t test_vornq_s16(int16x8_t a, int16x8_t b) {
   return vornq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vornq_s32
-// CHECK: vorn q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vornq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK:   [[OR_I:%.*]] = or <4 x i32> %a, [[NEG_I]]
+// CHECK:   ret <4 x i32> [[OR_I]]
 int32x4_t test_vornq_s32(int32x4_t a, int32x4_t b) {
   return vornq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vornq_s64
-// CHECK: vorn q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vornq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i64> %b, <i64 -1, i64 -1>
+// CHECK:   [[OR_I:%.*]] = or <2 x i64> %a, [[NEG_I]]
+// CHECK:   ret <2 x i64> [[OR_I]]
 int64x2_t test_vornq_s64(int64x2_t a, int64x2_t b) {
   return vornq_s64(a, b);
 }
 
-// CHECK-LABEL: test_vornq_u8
-// CHECK: vorn q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vornq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <16 x i8> %b, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+// CHECK:   [[OR_I:%.*]] = or <16 x i8> %a, [[NEG_I]]
+// CHECK:   ret <16 x i8> [[OR_I]]
 uint8x16_t test_vornq_u8(uint8x16_t a, uint8x16_t b) {
   return vornq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vornq_u16
-// CHECK: vorn q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vornq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <8 x i16> %b, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+// CHECK:   [[OR_I:%.*]] = or <8 x i16> %a, [[NEG_I]]
+// CHECK:   ret <8 x i16> [[OR_I]]
 uint16x8_t test_vornq_u16(uint16x8_t a, uint16x8_t b) {
   return vornq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vornq_u32
-// CHECK: vorn q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vornq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK:   [[OR_I:%.*]] = or <4 x i32> %a, [[NEG_I]]
+// CHECK:   ret <4 x i32> [[OR_I]]
 uint32x4_t test_vornq_u32(uint32x4_t a, uint32x4_t b) {
   return vornq_u32(a, b);
 }
 
-// CHECK-LABEL: test_vornq_u64
-// CHECK: vorn q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vornq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[NEG_I:%.*]] = xor <2 x i64> %b, <i64 -1, i64 -1>
+// CHECK:   [[OR_I:%.*]] = or <2 x i64> %a, [[NEG_I]]
+// CHECK:   ret <2 x i64> [[OR_I]]
 uint64x2_t test_vornq_u64(uint64x2_t a, uint64x2_t b) {
   return vornq_u64(a, b);
 }
 
 
-// CHECK-LABEL: test_vorr_s8
-// CHECK: vorr d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vorr_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <8 x i8> %a, %b
+// CHECK:   ret <8 x i8> [[OR_I]]
 int8x8_t test_vorr_s8(int8x8_t a, int8x8_t b) {
   return vorr_s8(a, b);
 }
 
-// CHECK-LABEL: test_vorr_s16
-// CHECK: vorr d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vorr_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <4 x i16> %a, %b
+// CHECK:   ret <4 x i16> [[OR_I]]
 int16x4_t test_vorr_s16(int16x4_t a, int16x4_t b) {
   return vorr_s16(a, b);
 }
 
-// CHECK-LABEL: test_vorr_s32
-// CHECK: vorr d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vorr_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <2 x i32> %a, %b
+// CHECK:   ret <2 x i32> [[OR_I]]
 int32x2_t test_vorr_s32(int32x2_t a, int32x2_t b) {
   return vorr_s32(a, b);
 }
 
-// CHECK-LABEL: test_vorr_s64
-// CHECK: vorr d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vorr_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <1 x i64> %a, %b
+// CHECK:   ret <1 x i64> [[OR_I]]
 int64x1_t test_vorr_s64(int64x1_t a, int64x1_t b) {
   return vorr_s64(a, b);
 }
 
-// CHECK-LABEL: test_vorr_u8
-// CHECK: vorr d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vorr_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <8 x i8> %a, %b
+// CHECK:   ret <8 x i8> [[OR_I]]
 uint8x8_t test_vorr_u8(uint8x8_t a, uint8x8_t b) {
   return vorr_u8(a, b);
 }
 
-// CHECK-LABEL: test_vorr_u16
-// CHECK: vorr d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vorr_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <4 x i16> %a, %b
+// CHECK:   ret <4 x i16> [[OR_I]]
 uint16x4_t test_vorr_u16(uint16x4_t a, uint16x4_t b) {
   return vorr_u16(a, b);
 }
 
-// CHECK-LABEL: test_vorr_u32
-// CHECK: vorr d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vorr_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <2 x i32> %a, %b
+// CHECK:   ret <2 x i32> [[OR_I]]
 uint32x2_t test_vorr_u32(uint32x2_t a, uint32x2_t b) {
   return vorr_u32(a, b);
 }
 
-// CHECK-LABEL: test_vorr_u64
-// CHECK: vorr d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vorr_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <1 x i64> %a, %b
+// CHECK:   ret <1 x i64> [[OR_I]]
 uint64x1_t test_vorr_u64(uint64x1_t a, uint64x1_t b) {
   return vorr_u64(a, b);
 }
 
-// CHECK-LABEL: test_vorrq_s8
-// CHECK: vorr q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vorrq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <16 x i8> %a, %b
+// CHECK:   ret <16 x i8> [[OR_I]]
 int8x16_t test_vorrq_s8(int8x16_t a, int8x16_t b) {
   return vorrq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vorrq_s16
-// CHECK: vorr q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vorrq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <8 x i16> %a, %b
+// CHECK:   ret <8 x i16> [[OR_I]]
 int16x8_t test_vorrq_s16(int16x8_t a, int16x8_t b) {
   return vorrq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vorrq_s32
-// CHECK: vorr q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vorrq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <4 x i32> %a, %b
+// CHECK:   ret <4 x i32> [[OR_I]]
 int32x4_t test_vorrq_s32(int32x4_t a, int32x4_t b) {
   return vorrq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vorrq_s64
-// CHECK: vorr q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vorrq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <2 x i64> %a, %b
+// CHECK:   ret <2 x i64> [[OR_I]]
 int64x2_t test_vorrq_s64(int64x2_t a, int64x2_t b) {
   return vorrq_s64(a, b);
 }
 
-// CHECK-LABEL: test_vorrq_u8
-// CHECK: vorr q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vorrq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <16 x i8> %a, %b
+// CHECK:   ret <16 x i8> [[OR_I]]
 uint8x16_t test_vorrq_u8(uint8x16_t a, uint8x16_t b) {
   return vorrq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vorrq_u16
-// CHECK: vorr q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vorrq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <8 x i16> %a, %b
+// CHECK:   ret <8 x i16> [[OR_I]]
 uint16x8_t test_vorrq_u16(uint16x8_t a, uint16x8_t b) {
   return vorrq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vorrq_u32
-// CHECK: vorr q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vorrq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <4 x i32> %a, %b
+// CHECK:   ret <4 x i32> [[OR_I]]
 uint32x4_t test_vorrq_u32(uint32x4_t a, uint32x4_t b) {
   return vorrq_u32(a, b);
 }
 
-// CHECK-LABEL: test_vorrq_u64
-// CHECK: vorr q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vorrq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[OR_I:%.*]] = or <2 x i64> %a, %b
+// CHECK:   ret <2 x i64> [[OR_I]]
 uint64x2_t test_vorrq_u64(uint64x2_t a, uint64x2_t b) {
   return vorrq_u64(a, b);
 }
 
 
-// CHECK-LABEL: test_vpadal_s8
-// CHECK: vpadal.s8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vpadal_s8(<4 x i16> %a, <8 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VPADAL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VPADAL_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadals.v4i16.v8i8(<4 x i16> [[VPADAL_V_I]], <8 x i8> %b) #4
+// CHECK:   ret <4 x i16> [[VPADAL_V1_I]]
 int16x4_t test_vpadal_s8(int16x4_t a, int8x8_t b) {
   return vpadal_s8(a, b);
 }
 
-// CHECK-LABEL: test_vpadal_s16
-// CHECK: vpadal.s16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vpadal_s16(<2 x i32> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VPADAL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VPADAL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VPADAL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadals.v2i32.v4i16(<2 x i32> [[VPADAL_V_I]], <4 x i16> [[VPADAL_V1_I]]) #4
+// CHECK:   ret <2 x i32> [[VPADAL_V2_I]]
 int32x2_t test_vpadal_s16(int32x2_t a, int16x4_t b) {
   return vpadal_s16(a, b);
 }
 
-// CHECK-LABEL: test_vpadal_s32
-// CHECK: vpadal.s32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vpadal_s32(<1 x i64> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VPADAL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VPADAL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VPADAL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpadals.v1i64.v2i32(<1 x i64> [[VPADAL_V_I]], <2 x i32> [[VPADAL_V1_I]]) #4
+// CHECK:   ret <1 x i64> [[VPADAL_V2_I]]
 int64x1_t test_vpadal_s32(int64x1_t a, int32x2_t b) {
   return vpadal_s32(a, b);
 }
 
-// CHECK-LABEL: test_vpadal_u8
-// CHECK: vpadal.u8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vpadal_u8(<4 x i16> %a, <8 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VPADAL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VPADAL_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadalu.v4i16.v8i8(<4 x i16> [[VPADAL_V_I]], <8 x i8> %b) #4
+// CHECK:   ret <4 x i16> [[VPADAL_V1_I]]
 uint16x4_t test_vpadal_u8(uint16x4_t a, uint8x8_t b) {
   return vpadal_u8(a, b);
 }
 
-// CHECK-LABEL: test_vpadal_u16
-// CHECK: vpadal.u16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vpadal_u16(<2 x i32> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VPADAL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VPADAL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VPADAL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadalu.v2i32.v4i16(<2 x i32> [[VPADAL_V_I]], <4 x i16> [[VPADAL_V1_I]]) #4
+// CHECK:   ret <2 x i32> [[VPADAL_V2_I]]
 uint32x2_t test_vpadal_u16(uint32x2_t a, uint16x4_t b) {
   return vpadal_u16(a, b);
 }
 
-// CHECK-LABEL: test_vpadal_u32
-// CHECK: vpadal.u32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vpadal_u32(<1 x i64> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VPADAL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VPADAL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VPADAL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpadalu.v1i64.v2i32(<1 x i64> [[VPADAL_V_I]], <2 x i32> [[VPADAL_V1_I]]) #4
+// CHECK:   ret <1 x i64> [[VPADAL_V2_I]]
 uint64x1_t test_vpadal_u32(uint64x1_t a, uint32x2_t b) {
   return vpadal_u32(a, b);
 }
 
-// CHECK-LABEL: test_vpadalq_s8
-// CHECK: vpadal.s8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vpadalq_s8(<8 x i16> %a, <16 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VPADALQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VPADALQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpadals.v8i16.v16i8(<8 x i16> [[VPADALQ_V_I]], <16 x i8> %b) #4
+// CHECK:   ret <8 x i16> [[VPADALQ_V1_I]]
 int16x8_t test_vpadalq_s8(int16x8_t a, int8x16_t b) {
   return vpadalq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vpadalq_s16
-// CHECK: vpadal.s16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vpadalq_s16(<4 x i32> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VPADALQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VPADALQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VPADALQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpadals.v4i32.v8i16(<4 x i32> [[VPADALQ_V_I]], <8 x i16> [[VPADALQ_V1_I]]) #4
+// CHECK:   ret <4 x i32> [[VPADALQ_V2_I]]
 int32x4_t test_vpadalq_s16(int32x4_t a, int16x8_t b) {
   return vpadalq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vpadalq_s32
-// CHECK: vpadal.s32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vpadalq_s32(<2 x i64> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VPADALQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VPADALQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VPADALQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpadals.v2i64.v4i32(<2 x i64> [[VPADALQ_V_I]], <4 x i32> [[VPADALQ_V1_I]]) #4
+// CHECK:   ret <2 x i64> [[VPADALQ_V2_I]]
 int64x2_t test_vpadalq_s32(int64x2_t a, int32x4_t b) {
   return vpadalq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vpadalq_u8
-// CHECK: vpadal.u8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vpadalq_u8(<8 x i16> %a, <16 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VPADALQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VPADALQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpadalu.v8i16.v16i8(<8 x i16> [[VPADALQ_V_I]], <16 x i8> %b) #4
+// CHECK:   ret <8 x i16> [[VPADALQ_V1_I]]
 uint16x8_t test_vpadalq_u8(uint16x8_t a, uint8x16_t b) {
   return vpadalq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vpadalq_u16
-// CHECK: vpadal.u16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vpadalq_u16(<4 x i32> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VPADALQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VPADALQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VPADALQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpadalu.v4i32.v8i16(<4 x i32> [[VPADALQ_V_I]], <8 x i16> [[VPADALQ_V1_I]]) #4
+// CHECK:   ret <4 x i32> [[VPADALQ_V2_I]]
 uint32x4_t test_vpadalq_u16(uint32x4_t a, uint16x8_t b) {
   return vpadalq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vpadalq_u32
-// CHECK: vpadal.u32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vpadalq_u32(<2 x i64> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VPADALQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VPADALQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VPADALQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpadalu.v2i64.v4i32(<2 x i64> [[VPADALQ_V_I]], <4 x i32> [[VPADALQ_V1_I]]) #4
+// CHECK:   ret <2 x i64> [[VPADALQ_V2_I]]
 uint64x2_t test_vpadalq_u32(uint64x2_t a, uint32x4_t b) {
   return vpadalq_u32(a, b);
 }
 
 
-// CHECK-LABEL: test_vpadd_s8
-// CHECK: vpadd.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vpadd_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VPADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpadd.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VPADD_V_I]]
 int8x8_t test_vpadd_s8(int8x8_t a, int8x8_t b) {
   return vpadd_s8(a, b);
 }
 
-// CHECK-LABEL: test_vpadd_s16
-// CHECK: vpadd.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vpadd_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16> [[VPADD_V_I]], <4 x i16> [[VPADD_V1_I]]) #4
+// CHECK:   [[VPADD_V3_I:%.*]] = bitcast <4 x i16> [[VPADD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vpadd_s16(int16x4_t a, int16x4_t b) {
   return vpadd_s16(a, b);
 }
 
-// CHECK-LABEL: test_vpadd_s32
-// CHECK: vpadd.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vpadd_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32> [[VPADD_V_I]], <2 x i32> [[VPADD_V1_I]]) #4
+// CHECK:   [[VPADD_V3_I:%.*]] = bitcast <2 x i32> [[VPADD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vpadd_s32(int32x2_t a, int32x2_t b) {
   return vpadd_s32(a, b);
 }
 
-// CHECK-LABEL: test_vpadd_u8
-// CHECK: vpadd.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vpadd_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VPADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpadd.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VPADD_V_I]]
 uint8x8_t test_vpadd_u8(uint8x8_t a, uint8x8_t b) {
   return vpadd_u8(a, b);
 }
 
-// CHECK-LABEL: test_vpadd_u16
-// CHECK: vpadd.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vpadd_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VPADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16> [[VPADD_V_I]], <4 x i16> [[VPADD_V1_I]]) #4
+// CHECK:   [[VPADD_V3_I:%.*]] = bitcast <4 x i16> [[VPADD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 uint16x4_t test_vpadd_u16(uint16x4_t a, uint16x4_t b) {
   return vpadd_u16(a, b);
 }
 
-// CHECK-LABEL: test_vpadd_u32
-// CHECK: vpadd.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vpadd_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VPADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32> [[VPADD_V_I]], <2 x i32> [[VPADD_V1_I]]) #4
+// CHECK:   [[VPADD_V3_I:%.*]] = bitcast <2 x i32> [[VPADD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 uint32x2_t test_vpadd_u32(uint32x2_t a, uint32x2_t b) {
   return vpadd_u32(a, b);
 }
 
-// CHECK-LABEL: test_vpadd_f32
-// CHECK: vpadd.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vpadd_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[VPADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VPADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[VPADD_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float> [[VPADD_V_I]], <2 x float> [[VPADD_V1_I]]) #4
+// CHECK:   [[VPADD_V3_I:%.*]] = bitcast <2 x float> [[VPADD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPADD_V3_I]] to <2 x float>
+// CHECK:   ret <2 x float> [[TMP2]]
 float32x2_t test_vpadd_f32(float32x2_t a, float32x2_t b) {
   return vpadd_f32(a, b);
 }
 
 
-// CHECK-LABEL: test_vpaddl_s8
-// CHECK: vpaddl.s8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vpaddl_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VPADDL_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %a) #4
+// CHECK:   ret <4 x i16> [[VPADDL_I]]
 int16x4_t test_vpaddl_s8(int8x8_t a) {
   return vpaddl_s8(a);
 }
 
-// CHECK-LABEL: test_vpaddl_s16
-// CHECK: vpaddl.s16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vpaddl_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16> [[VPADDL_I]]) #4
+// CHECK:   ret <2 x i32> [[VPADDL1_I]]
 int32x2_t test_vpaddl_s16(int16x4_t a) {
   return vpaddl_s16(a);
 }
 
-// CHECK-LABEL: test_vpaddl_s32
-// CHECK: vpaddl.s32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vpaddl_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32> [[VPADDL_I]]) #4
+// CHECK:   ret <1 x i64> [[VPADDL1_I]]
 int64x1_t test_vpaddl_s32(int32x2_t a) {
   return vpaddl_s32(a);
 }
 
-// CHECK-LABEL: test_vpaddl_u8
-// CHECK: vpaddl.u8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vpaddl_u8(<8 x i8> %a) #0 {
+// CHECK:   [[VPADDL_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpaddlu.v4i16.v8i8(<8 x i8> %a) #4
+// CHECK:   ret <4 x i16> [[VPADDL_I]]
 uint16x4_t test_vpaddl_u8(uint8x8_t a) {
   return vpaddl_u8(a);
 }
 
-// CHECK-LABEL: test_vpaddl_u16
-// CHECK: vpaddl.u16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vpaddl_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VPADDL1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> [[VPADDL_I]]) #4
+// CHECK:   ret <2 x i32> [[VPADDL1_I]]
 uint32x2_t test_vpaddl_u16(uint16x4_t a) {
   return vpaddl_u16(a);
 }
 
-// CHECK-LABEL: test_vpaddl_u32
-// CHECK: vpaddl.u32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vpaddl_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VPADDL_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VPADDL1_I:%.*]] = call <1 x i64> @llvm.arm.neon.vpaddlu.v1i64.v2i32(<2 x i32> [[VPADDL_I]]) #4
+// CHECK:   ret <1 x i64> [[VPADDL1_I]]
 uint64x1_t test_vpaddl_u32(uint32x2_t a) {
   return vpaddl_u32(a);
 }
 
-// CHECK-LABEL: test_vpaddlq_s8
-// CHECK: vpaddl.s8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vpaddlq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VPADDL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpaddls.v8i16.v16i8(<16 x i8> %a) #4
+// CHECK:   ret <8 x i16> [[VPADDL_I]]
 int16x8_t test_vpaddlq_s8(int8x16_t a) {
   return vpaddlq_s8(a);
 }
 
-// CHECK-LABEL: test_vpaddlq_s16
-// CHECK: vpaddl.s16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vpaddlq_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpaddls.v4i32.v8i16(<8 x i16> [[VPADDL_I]]) #4
+// CHECK:   ret <4 x i32> [[VPADDL1_I]]
 int32x4_t test_vpaddlq_s16(int16x8_t a) {
   return vpaddlq_s16(a);
 }
 
-// CHECK-LABEL: test_vpaddlq_s32
-// CHECK: vpaddl.s32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vpaddlq_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpaddls.v2i64.v4i32(<4 x i32> [[VPADDL_I]]) #4
+// CHECK:   ret <2 x i64> [[VPADDL1_I]]
 int64x2_t test_vpaddlq_s32(int32x4_t a) {
   return vpaddlq_s32(a);
 }
 
-// CHECK-LABEL: test_vpaddlq_u8
-// CHECK: vpaddl.u8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vpaddlq_u8(<16 x i8> %a) #0 {
+// CHECK:   [[VPADDL_I:%.*]] = call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %a) #4
+// CHECK:   ret <8 x i16> [[VPADDL_I]]
 uint16x8_t test_vpaddlq_u8(uint8x16_t a) {
   return vpaddlq_u8(a);
 }
 
-// CHECK-LABEL: test_vpaddlq_u16
-// CHECK: vpaddl.u16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vpaddlq_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VPADDL1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> [[VPADDL_I]]) #4
+// CHECK:   ret <4 x i32> [[VPADDL1_I]]
 uint32x4_t test_vpaddlq_u16(uint16x8_t a) {
   return vpaddlq_u16(a);
 }
 
-// CHECK-LABEL: test_vpaddlq_u32
-// CHECK: vpaddl.u32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vpaddlq_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VPADDL_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VPADDL1_I:%.*]] = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> [[VPADDL_I]]) #4
+// CHECK:   ret <2 x i64> [[VPADDL1_I]]
 uint64x2_t test_vpaddlq_u32(uint32x4_t a) {
   return vpaddlq_u32(a);
 }
 
 
-// CHECK-LABEL: test_vpmax_s8
-// CHECK: vpmax.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vpmax_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VPMAX_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpmaxs.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VPMAX_V_I]]
 int8x8_t test_vpmax_s8(int8x8_t a, int8x8_t b) {
   return vpmax_s8(a, b);
 }
 
-// CHECK-LABEL: test_vpmax_s16
-// CHECK: vpmax.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vpmax_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VPMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VPMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VPMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpmaxs.v4i16(<4 x i16> [[VPMAX_V_I]], <4 x i16> [[VPMAX_V1_I]]) #4
+// CHECK:   [[VPMAX_V3_I:%.*]] = bitcast <4 x i16> [[VPMAX_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPMAX_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vpmax_s16(int16x4_t a, int16x4_t b) {
   return vpmax_s16(a, b);
 }
 
-// CHECK-LABEL: test_vpmax_s32
-// CHECK: vpmax.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vpmax_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VPMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VPMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VPMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32> [[VPMAX_V_I]], <2 x i32> [[VPMAX_V1_I]]) #4
+// CHECK:   [[VPMAX_V3_I:%.*]] = bitcast <2 x i32> [[VPMAX_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPMAX_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vpmax_s32(int32x2_t a, int32x2_t b) {
   return vpmax_s32(a, b);
 }
 
-// CHECK-LABEL: test_vpmax_u8
-// CHECK: vpmax.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vpmax_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VPMAX_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpmaxu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VPMAX_V_I]]
 uint8x8_t test_vpmax_u8(uint8x8_t a, uint8x8_t b) {
   return vpmax_u8(a, b);
 }
 
-// CHECK-LABEL: test_vpmax_u16
-// CHECK: vpmax.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vpmax_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VPMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VPMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VPMAX_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpmaxu.v4i16(<4 x i16> [[VPMAX_V_I]], <4 x i16> [[VPMAX_V1_I]]) #4
+// CHECK:   [[VPMAX_V3_I:%.*]] = bitcast <4 x i16> [[VPMAX_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPMAX_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 uint16x4_t test_vpmax_u16(uint16x4_t a, uint16x4_t b) {
   return vpmax_u16(a, b);
 }
 
-// CHECK-LABEL: test_vpmax_u32
-// CHECK: vpmax.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vpmax_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VPMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VPMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VPMAX_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32> [[VPMAX_V_I]], <2 x i32> [[VPMAX_V1_I]]) #4
+// CHECK:   [[VPMAX_V3_I:%.*]] = bitcast <2 x i32> [[VPMAX_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPMAX_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 uint32x2_t test_vpmax_u32(uint32x2_t a, uint32x2_t b) {
   return vpmax_u32(a, b);
 }
 
-// CHECK-LABEL: test_vpmax_f32
-// CHECK: vpmax.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vpmax_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[VPMAX_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VPMAX_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[VPMAX_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float> [[VPMAX_V_I]], <2 x float> [[VPMAX_V1_I]]) #4
+// CHECK:   [[VPMAX_V3_I:%.*]] = bitcast <2 x float> [[VPMAX_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPMAX_V3_I]] to <2 x float>
+// CHECK:   ret <2 x float> [[TMP2]]
 float32x2_t test_vpmax_f32(float32x2_t a, float32x2_t b) {
   return vpmax_f32(a, b);
 }
 
 
-// CHECK-LABEL: test_vpmin_s8
-// CHECK: vpmin.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vpmin_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VPMIN_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpmins.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VPMIN_V_I]]
 int8x8_t test_vpmin_s8(int8x8_t a, int8x8_t b) {
   return vpmin_s8(a, b);
 }
 
-// CHECK-LABEL: test_vpmin_s16
-// CHECK: vpmin.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vpmin_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VPMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VPMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VPMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpmins.v4i16(<4 x i16> [[VPMIN_V_I]], <4 x i16> [[VPMIN_V1_I]]) #4
+// CHECK:   [[VPMIN_V3_I:%.*]] = bitcast <4 x i16> [[VPMIN_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPMIN_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vpmin_s16(int16x4_t a, int16x4_t b) {
   return vpmin_s16(a, b);
 }
 
-// CHECK-LABEL: test_vpmin_s32
-// CHECK: vpmin.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vpmin_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VPMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VPMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VPMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32> [[VPMIN_V_I]], <2 x i32> [[VPMIN_V1_I]]) #4
+// CHECK:   [[VPMIN_V3_I:%.*]] = bitcast <2 x i32> [[VPMIN_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPMIN_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vpmin_s32(int32x2_t a, int32x2_t b) {
   return vpmin_s32(a, b);
 }
 
-// CHECK-LABEL: test_vpmin_u8
-// CHECK: vpmin.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vpmin_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VPMIN_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vpminu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VPMIN_V_I]]
 uint8x8_t test_vpmin_u8(uint8x8_t a, uint8x8_t b) {
   return vpmin_u8(a, b);
 }
 
-// CHECK-LABEL: test_vpmin_u16
-// CHECK: vpmin.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vpmin_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VPMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VPMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VPMIN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vpminu.v4i16(<4 x i16> [[VPMIN_V_I]], <4 x i16> [[VPMIN_V1_I]]) #4
+// CHECK:   [[VPMIN_V3_I:%.*]] = bitcast <4 x i16> [[VPMIN_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPMIN_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 uint16x4_t test_vpmin_u16(uint16x4_t a, uint16x4_t b) {
   return vpmin_u16(a, b);
 }
 
-// CHECK-LABEL: test_vpmin_u32
-// CHECK: vpmin.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vpmin_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VPMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VPMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VPMIN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32> [[VPMIN_V_I]], <2 x i32> [[VPMIN_V1_I]]) #4
+// CHECK:   [[VPMIN_V3_I:%.*]] = bitcast <2 x i32> [[VPMIN_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPMIN_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 uint32x2_t test_vpmin_u32(uint32x2_t a, uint32x2_t b) {
   return vpmin_u32(a, b);
 }
 
-// CHECK-LABEL: test_vpmin_f32
-// CHECK: vpmin.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vpmin_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[VPMIN_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VPMIN_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[VPMIN_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float> [[VPMIN_V_I]], <2 x float> [[VPMIN_V1_I]]) #4
+// CHECK:   [[VPMIN_V3_I:%.*]] = bitcast <2 x float> [[VPMIN_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VPMIN_V3_I]] to <2 x float>
+// CHECK:   ret <2 x float> [[TMP2]]
 float32x2_t test_vpmin_f32(float32x2_t a, float32x2_t b) {
   return vpmin_f32(a, b);
 }
 
 
-// CHECK-LABEL: test_vqabs_s8
-// CHECK: vqabs.s8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vqabs_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VQABS_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqabs.v8i8(<8 x i8> %a) #4
+// CHECK:   ret <8 x i8> [[VQABS_V_I]]
 int8x8_t test_vqabs_s8(int8x8_t a) {
   return vqabs_s8(a);
 }
 
-// CHECK-LABEL: test_vqabs_s16
-// CHECK: vqabs.s16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqabs_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VQABS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQABS_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqabs.v4i16(<4 x i16> [[VQABS_V_I]]) #4
+// CHECK:   [[VQABS_V2_I:%.*]] = bitcast <4 x i16> [[VQABS_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQABS_V2_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP1]]
 int16x4_t test_vqabs_s16(int16x4_t a) {
   return vqabs_s16(a);
 }
 
-// CHECK-LABEL: test_vqabs_s32
-// CHECK: vqabs.s32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqabs_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VQABS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQABS_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqabs.v2i32(<2 x i32> [[VQABS_V_I]]) #4
+// CHECK:   [[VQABS_V2_I:%.*]] = bitcast <2 x i32> [[VQABS_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQABS_V2_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP1]]
 int32x2_t test_vqabs_s32(int32x2_t a) {
   return vqabs_s32(a);
 }
 
-// CHECK-LABEL: test_vqabsq_s8
-// CHECK: vqabs.s8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vqabsq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VQABSQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqabs.v16i8(<16 x i8> %a) #4
+// CHECK:   ret <16 x i8> [[VQABSQ_V_I]]
 int8x16_t test_vqabsq_s8(int8x16_t a) {
   return vqabsq_s8(a);
 }
 
-// CHECK-LABEL: test_vqabsq_s16
-// CHECK: vqabs.s16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vqabsq_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQABSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQABSQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqabs.v8i16(<8 x i16> [[VQABSQ_V_I]]) #4
+// CHECK:   [[VQABSQ_V2_I:%.*]] = bitcast <8 x i16> [[VQABSQ_V1_I]] to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VQABSQ_V2_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP1]]
 int16x8_t test_vqabsq_s16(int16x8_t a) {
   return vqabsq_s16(a);
 }
 
-// CHECK-LABEL: test_vqabsq_s32
-// CHECK: vqabs.s32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqabsq_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQABSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQABSQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqabs.v4i32(<4 x i32> [[VQABSQ_V_I]]) #4
+// CHECK:   [[VQABSQ_V2_I:%.*]] = bitcast <4 x i32> [[VQABSQ_V1_I]] to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VQABSQ_V2_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP1]]
 int32x4_t test_vqabsq_s32(int32x4_t a) {
   return vqabsq_s32(a);
 }
 
 
-// CHECK-LABEL: test_vqadd_s8
-// CHECK: vqadd.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vqadd_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqadds.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VQADD_V_I]]
 int8x8_t test_vqadd_s8(int8x8_t a, int8x8_t b) {
   return vqadd_s8(a, b);
 }
 
-// CHECK-LABEL: test_vqadd_s16
-// CHECK: vqadd.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqadd_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16> [[VQADD_V_I]], <4 x i16> [[VQADD_V1_I]]) #4
+// CHECK:   [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vqadd_s16(int16x4_t a, int16x4_t b) {
   return vqadd_s16(a, b);
 }
 
-// CHECK-LABEL: test_vqadd_s32
-// CHECK: vqadd.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqadd_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32> [[VQADD_V_I]], <2 x i32> [[VQADD_V1_I]]) #4
+// CHECK:   [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vqadd_s32(int32x2_t a, int32x2_t b) {
   return vqadd_s32(a, b);
 }
 
-// CHECK-LABEL: test_vqadd_s64
-// CHECK: vqadd.s64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vqadd_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64> [[VQADD_V_I]], <1 x i64> [[VQADD_V1_I]]) #4
+// CHECK:   [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP2]]
 int64x1_t test_vqadd_s64(int64x1_t a, int64x1_t b) {
   return vqadd_s64(a, b);
 }
 
-// CHECK-LABEL: test_vqadd_u8
-// CHECK: vqadd.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vqadd_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VQADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqaddu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VQADD_V_I]]
 uint8x8_t test_vqadd_u8(uint8x8_t a, uint8x8_t b) {
   return vqadd_u8(a, b);
 }
 
-// CHECK-LABEL: test_vqadd_u16
-// CHECK: vqadd.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqadd_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqaddu.v4i16(<4 x i16> [[VQADD_V_I]], <4 x i16> [[VQADD_V1_I]]) #4
+// CHECK:   [[VQADD_V3_I:%.*]] = bitcast <4 x i16> [[VQADD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 uint16x4_t test_vqadd_u16(uint16x4_t a, uint16x4_t b) {
   return vqadd_u16(a, b);
 }
 
-// CHECK-LABEL: test_vqadd_u32
-// CHECK: vqadd.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqadd_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqaddu.v2i32(<2 x i32> [[VQADD_V_I]], <2 x i32> [[VQADD_V1_I]]) #4
+// CHECK:   [[VQADD_V3_I:%.*]] = bitcast <2 x i32> [[VQADD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 uint32x2_t test_vqadd_u32(uint32x2_t a, uint32x2_t b) {
   return vqadd_u32(a, b);
 }
 
-// CHECK-LABEL: test_vqadd_u64
-// CHECK: vqadd.u64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vqadd_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VQADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VQADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VQADD_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqaddu.v1i64(<1 x i64> [[VQADD_V_I]], <1 x i64> [[VQADD_V1_I]]) #4
+// CHECK:   [[VQADD_V3_I:%.*]] = bitcast <1 x i64> [[VQADD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQADD_V3_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP2]]
 uint64x1_t test_vqadd_u64(uint64x1_t a, uint64x1_t b) {
   return vqadd_u64(a, b);
 }
 
-// CHECK-LABEL: test_vqaddq_s8
-// CHECK: vqadd.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vqaddq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqadds.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VQADDQ_V_I]]
 int8x16_t test_vqaddq_s8(int8x16_t a, int8x16_t b) {
   return vqaddq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vqaddq_s16
-// CHECK: vqadd.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vqaddq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16> [[VQADDQ_V_I]], <8 x i16> [[VQADDQ_V1_I]]) #4
+// CHECK:   [[VQADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vqaddq_s16(int16x8_t a, int16x8_t b) {
   return vqaddq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vqaddq_s32
-// CHECK: vqadd.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqaddq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> [[VQADDQ_V_I]], <4 x i32> [[VQADDQ_V1_I]]) #4
+// CHECK:   [[VQADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqaddq_s32(int32x4_t a, int32x4_t b) {
   return vqaddq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vqaddq_s64
-// CHECK: vqadd.s64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vqaddq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> [[VQADDQ_V_I]], <2 x i64> [[VQADDQ_V1_I]]) #4
+// CHECK:   [[VQADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 int64x2_t test_vqaddq_s64(int64x2_t a, int64x2_t b) {
   return vqaddq_s64(a, b);
 }
 
-// CHECK-LABEL: test_vqaddq_u8
-// CHECK: vqadd.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vqaddq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VQADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqaddu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VQADDQ_V_I]]
 uint8x16_t test_vqaddq_u8(uint8x16_t a, uint8x16_t b) {
   return vqaddq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vqaddq_u16
-// CHECK: vqadd.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vqaddq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VQADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqaddu.v8i16(<8 x i16> [[VQADDQ_V_I]], <8 x i16> [[VQADDQ_V1_I]]) #4
+// CHECK:   [[VQADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 uint16x8_t test_vqaddq_u16(uint16x8_t a, uint16x8_t b) {
   return vqaddq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vqaddq_u32
-// CHECK: vqadd.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqaddq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VQADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqaddu.v4i32(<4 x i32> [[VQADDQ_V_I]], <4 x i32> [[VQADDQ_V1_I]]) #4
+// CHECK:   [[VQADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 uint32x4_t test_vqaddq_u32(uint32x4_t a, uint32x4_t b) {
   return vqaddq_u32(a, b);
 }
 
-// CHECK-LABEL: test_vqaddq_u64
-// CHECK: vqadd.u64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vqaddq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VQADDQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqaddu.v2i64(<2 x i64> [[VQADDQ_V_I]], <2 x i64> [[VQADDQ_V1_I]]) #4
+// CHECK:   [[VQADDQ_V3_I:%.*]] = bitcast <2 x i64> [[VQADDQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQADDQ_V3_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 uint64x2_t test_vqaddq_u64(uint64x2_t a, uint64x2_t b) {
   return vqaddq_u64(a, b);
 }
 
 
-// CHECK-LABEL: test_vqdmlal_s16
-// CHECK: vqdmlal.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqdmlal_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #4
+// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #4
+// CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
 int32x4_t test_vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
   return vqdmlal_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vqdmlal_s32
-// CHECK: vqdmlal.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vqdmlal_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #4
+// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #4
+// CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
 int64x2_t test_vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
   return vqdmlal_s32(a, b, c);
 }
 
 
-// CHECK-LABEL: test_vqdmlal_lane_s16
-// CHECK: vqdmlal.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i32> @test_vqdmlal_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #4
+// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #4
+// CHECK:   ret <4 x i32> [[VQDMLAL_V3_I]]
 int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
   return vqdmlal_lane_s16(a, b, c, 3);
 }
 
-// CHECK-LABEL: test_vqdmlal_lane_s32
-// CHECK: vqdmlal.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <2 x i64> @test_vqdmlal_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #4
+// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #4
+// CHECK:   ret <2 x i64> [[VQDMLAL_V3_I]]
 int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
   return vqdmlal_lane_s32(a, b, c, 1);
 }
 
 
-// CHECK-LABEL: test_vqdmlal_n_s16
-// CHECK: vqdmlal.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqdmlal_n_s16(<4 x i32> %a, <4 x i16> %b, i16 signext %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMLAL4_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL4_I]]) #4
+// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMLAL_V6_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> [[VQDMLAL_V_I]], <4 x i32> [[VQDMLAL5_I]]) #4
+// CHECK:   ret <4 x i32> [[VQDMLAL_V6_I]]
 int32x4_t test_vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
   return vqdmlal_n_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vqdmlal_n_s32
-// CHECK: vqdmlal.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vqdmlal_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMLAL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL2_I]]) #4
+// CHECK:   [[VQDMLAL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQDMLAL_V4_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> [[VQDMLAL_V_I]], <2 x i64> [[VQDMLAL3_I]]) #4
+// CHECK:   ret <2 x i64> [[VQDMLAL_V4_I]]
 int64x2_t test_vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
   return vqdmlal_n_s32(a, b, c);
 }
 
 
-// CHECK-LABEL: test_vqdmlsl_s16
-// CHECK: vqdmlsl.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %c to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #4
+// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #4
+// CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
 int32x4_t test_vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
   return vqdmlsl_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vqdmlsl_s32
-// CHECK: vqdmlsl.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %c to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #4
+// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #4
+// CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
 int64x2_t test_vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
   return vqdmlsl_s32(a, b, c);
 }
 
 
-// CHECK-LABEL: test_vqdmlsl_lane_s16
-// CHECK: vqdmlsl.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_lane_s16(<4 x i32> %a, <4 x i16> %b, <4 x i16> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL1_I]]) #4
+// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL2_I]]) #4
+// CHECK:   ret <4 x i32> [[VQDMLSL_V3_I]]
 int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) {
   return vqdmlsl_lane_s16(a, b, c, 3);
 }
 
-// CHECK-LABEL: test_vqdmlsl_lane_s32
-// CHECK: vqdmlsl.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_lane_s32(<2 x i64> %a, <2 x i32> %b, <2 x i32> %c) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMLAL1_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL1_I]]) #4
+// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL2_I]]) #4
+// CHECK:   ret <2 x i64> [[VQDMLSL_V3_I]]
 int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) {
   return vqdmlsl_lane_s32(a, b, c, 1);
 }
 
 
-// CHECK-LABEL: test_vqdmlsl_n_s16
-// CHECK: vqdmlsl.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqdmlsl_n_s16(<4 x i32> %a, <4 x i16> %b, i16 signext %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %c, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %c, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %c, i32 3
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMLAL4_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VQDMLAL5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[VQDMLAL_I]], <4 x i16> [[VQDMLAL4_I]]) #4
+// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMLSL_V6_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> [[VQDMLSL_V_I]], <4 x i32> [[VQDMLAL5_I]]) #4
+// CHECK:   ret <4 x i32> [[VQDMLSL_V6_I]]
 int32x4_t test_vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) {
   return vqdmlsl_n_s16(a, b, c);
 }
 
-// CHECK-LABEL: test_vqdmlsl_n_s32
-// CHECK: vqdmlsl.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vqdmlsl_n_s32(<2 x i64> %a, <2 x i32> %b, i32 %c) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %c, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %c, i32 1
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VQDMLAL_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMLAL2_I:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VQDMLAL3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[VQDMLAL_I]], <2 x i32> [[VQDMLAL2_I]]) #4
+// CHECK:   [[VQDMLSL_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQDMLSL_V4_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> [[VQDMLSL_V_I]], <2 x i64> [[VQDMLAL3_I]]) #4
+// CHECK:   ret <2 x i64> [[VQDMLSL_V4_I]]
 int64x2_t test_vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) {
   return vqdmlsl_n_s32(a, b, c);
 }
 
 
-// CHECK-LABEL: test_vqdmulh_s16
-// CHECK: vqdmulh.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqdmulh_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]]) #4
+// CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vqdmulh_s16(int16x4_t a, int16x4_t b) {
   return vqdmulh_s16(a, b);
 }
 
-// CHECK-LABEL: test_vqdmulh_s32
-// CHECK: vqdmulh.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqdmulh_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]]) #4
+// CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vqdmulh_s32(int32x2_t a, int32x2_t b) {
   return vqdmulh_s32(a, b);
 }
 
-// CHECK-LABEL: test_vqdmulhq_s16
-// CHECK: vqdmulh.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]]) #4
+// CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vqdmulhq_s16(int16x8_t a, int16x8_t b) {
   return vqdmulhq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vqdmulhq_s32
-// CHECK: vqdmulh.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]]) #4
+// CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqdmulhq_s32(int32x4_t a, int32x4_t b) {
   return vqdmulhq_s32(a, b);
 }
 
 
-// CHECK-LABEL: test_vqdmulh_lane_s16
-// CHECK: vqdmulh.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i16> @test_vqdmulh_lane_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V1_I]]) #4
+// CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t b) {
   return vqdmulh_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vqdmulh_lane_s32
-// CHECK: vqdmulh.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <2 x i32> @test_vqdmulh_lane_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V1_I]]) #4
+// CHECK:   [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t b) {
   return vqdmulh_lane_s32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vqdmulhq_lane_s16
-// CHECK: vqdmulh.s16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V1_I]]) #4
+// CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t b) {
   return vqdmulhq_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vqdmulhq_lane_s32
-// CHECK: vqdmulh.s32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V1_I]]) #4
+// CHECK:   [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqdmulhq_lane_s32(int32x4_t a, int32x2_t b) {
   return vqdmulhq_lane_s32(a, b, 1);
 }
 
 
-// CHECK-LABEL: test_vqdmulh_n_s16
-// CHECK: vqdmulh.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqdmulh_n_s16(<4 x i16> %a, i16 signext %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQDMULH_V4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMULH_V5_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> [[VQDMULH_V_I]], <4 x i16> [[VQDMULH_V4_I]]) #4
+// CHECK:   [[VQDMULH_V6_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V5_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V6_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vqdmulh_n_s16(int16x4_t a, int16_t b) {
   return vqdmulh_n_s16(a, b);
 }
 
-// CHECK-LABEL: test_vqdmulh_n_s32
-// CHECK: vqdmulh.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqdmulh_n_s32(<2 x i32> %a, i32 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VQDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQDMULH_V2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMULH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> [[VQDMULH_V_I]], <2 x i32> [[VQDMULH_V2_I]]) #4
+// CHECK:   [[VQDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V3_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQDMULH_V4_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vqdmulh_n_s32(int32x2_t a, int32_t b) {
   return vqdmulh_n_s32(a, b);
 }
 
-// CHECK-LABEL: test_vqdmulhq_n_s16
-// CHECK: vqdmulh.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vqdmulhq_n_s16(<8 x i16> %a, i16 signext %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
+// CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQDMULHQ_V8_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VQDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> [[VQDMULHQ_V_I]], <8 x i16> [[VQDMULHQ_V8_I]]) #4
+// CHECK:   [[VQDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V9_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V10_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vqdmulhq_n_s16(int16x8_t a, int16_t b) {
   return vqdmulhq_n_s16(a, b);
 }
 
-// CHECK-LABEL: test_vqdmulhq_n_s32
-// CHECK: vqdmulh.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqdmulhq_n_s32(<4 x i32> %a, i32 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
+// CHECK:   [[VQDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQDMULHQ_V4_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VQDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> [[VQDMULHQ_V_I]], <4 x i32> [[VQDMULHQ_V4_I]]) #4
+// CHECK:   [[VQDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V5_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULHQ_V6_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqdmulhq_n_s32(int32x4_t a, int32_t b) {
   return vqdmulhq_n_s32(a, b);
 }
 
 
-// CHECK-LABEL: test_vqdmull_s16
-// CHECK: vqdmull.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqdmull_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #4
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqdmull_s16(int16x4_t a, int16x4_t b) {
   return vqdmull_s16(a, b);
 }
 
-// CHECK-LABEL: test_vqdmull_s32
-// CHECK: vqdmull.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vqdmull_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #4
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 int64x2_t test_vqdmull_s32(int32x2_t a, int32x2_t b) {
   return vqdmull_s32(a, b);
 }
 
 
-// CHECK-LABEL: test_vqdmull_lane_s16
-// CHECK: vqdmull.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i32> @test_vqdmull_lane_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V1_I]]) #4
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqdmull_lane_s16(int16x4_t a, int16x4_t b) {
   return vqdmull_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vqdmull_lane_s32
-// CHECK: vqdmull.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <2 x i64> @test_vqdmull_lane_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQDMULL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V1_I]]) #4
+// CHECK:   [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V3_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 int64x2_t test_vqdmull_lane_s32(int32x2_t a, int32x2_t b) {
   return vqdmull_lane_s32(a, b, 1);
 }
 
 
-// CHECK-LABEL: test_vqdmull_n_s16
-// CHECK: vqdmull.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqdmull_n_s16(<4 x i16> %a, i16 signext %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQDMULL_V4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQDMULL_V5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[VQDMULL_V_I]], <4 x i16> [[VQDMULL_V4_I]]) #4
+// CHECK:   [[VQDMULL_V6_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V5_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V6_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqdmull_n_s16(int16x4_t a, int16_t b) {
   return vqdmull_n_s16(a, b);
 }
 
-// CHECK-LABEL: test_vqdmull_n_s32
-// CHECK: vqdmull.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vqdmull_n_s32(<2 x i32> %a, i32 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VQDMULL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQDMULL_V2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQDMULL_V3_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[VQDMULL_V_I]], <2 x i32> [[VQDMULL_V2_I]]) #4
+// CHECK:   [[VQDMULL_V4_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V3_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQDMULL_V4_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 int64x2_t test_vqdmull_n_s32(int32x2_t a, int32_t b) {
   return vqdmull_n_s32(a, b);
 }
 
 
-// CHECK-LABEL: test_vqmovn_s16
-// CHECK: vqmovn.s16 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vqmovn_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqmovns.v8i8(<8 x i16> [[VQMOVN_V_I]]) #4
+// CHECK:   ret <8 x i8> [[VQMOVN_V1_I]]
 int8x8_t test_vqmovn_s16(int16x8_t a) {
   return vqmovn_s16(a);
 }
 
-// CHECK-LABEL: test_vqmovn_s32
-// CHECK: vqmovn.s32 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqmovn_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqmovns.v4i16(<4 x i32> [[VQMOVN_V_I]]) #4
+// CHECK:   [[VQMOVN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP1]]
 int16x4_t test_vqmovn_s32(int32x4_t a) {
   return vqmovn_s32(a);
 }
 
-// CHECK-LABEL: test_vqmovn_s64
-// CHECK: vqmovn.s64 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqmovn_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqmovns.v2i32(<2 x i64> [[VQMOVN_V_I]]) #4
+// CHECK:   [[VQMOVN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP1]]
 int32x2_t test_vqmovn_s64(int64x2_t a) {
   return vqmovn_s64(a);
 }
 
-// CHECK-LABEL: test_vqmovn_u16
-// CHECK: vqmovn.u16 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vqmovn_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQMOVN_V1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16> [[VQMOVN_V_I]]) #4
+// CHECK:   ret <8 x i8> [[VQMOVN_V1_I]]
 uint8x8_t test_vqmovn_u16(uint16x8_t a) {
   return vqmovn_u16(a);
 }
 
-// CHECK-LABEL: test_vqmovn_u32
-// CHECK: vqmovn.u32 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqmovn_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQMOVN_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqmovnu.v4i16(<4 x i32> [[VQMOVN_V_I]]) #4
+// CHECK:   [[VQMOVN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVN_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP1]]
 uint16x4_t test_vqmovn_u32(uint32x4_t a) {
   return vqmovn_u32(a);
 }
 
-// CHECK-LABEL: test_vqmovn_u64
-// CHECK: vqmovn.u64 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqmovn_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQMOVN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQMOVN_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqmovnu.v2i32(<2 x i64> [[VQMOVN_V_I]]) #4
+// CHECK:   [[VQMOVN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVN_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVN_V2_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP1]]
 uint32x2_t test_vqmovn_u64(uint64x2_t a) {
   return vqmovn_u64(a);
 }
 
 
-// CHECK-LABEL: test_vqmovun_s16
-// CHECK: vqmovun.s16 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vqmovun_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQMOVUN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQMOVUN_V1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqmovnsu.v8i8(<8 x i16> [[VQMOVUN_V_I]]) #4
+// CHECK:   ret <8 x i8> [[VQMOVUN_V1_I]]
 uint8x8_t test_vqmovun_s16(int16x8_t a) {
   return vqmovun_s16(a);
 }
 
-// CHECK-LABEL: test_vqmovun_s32
-// CHECK: vqmovun.s32 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqmovun_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQMOVUN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQMOVUN_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqmovnsu.v4i16(<4 x i32> [[VQMOVUN_V_I]]) #4
+// CHECK:   [[VQMOVUN_V2_I:%.*]] = bitcast <4 x i16> [[VQMOVUN_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVUN_V2_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP1]]
 uint16x4_t test_vqmovun_s32(int32x4_t a) {
   return vqmovun_s32(a);
 }
 
-// CHECK-LABEL: test_vqmovun_s64
-// CHECK: vqmovun.s64 d{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqmovun_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQMOVUN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQMOVUN_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqmovnsu.v2i32(<2 x i64> [[VQMOVUN_V_I]]) #4
+// CHECK:   [[VQMOVUN_V2_I:%.*]] = bitcast <2 x i32> [[VQMOVUN_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQMOVUN_V2_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP1]]
 uint32x2_t test_vqmovun_s64(int64x2_t a) {
   return vqmovun_s64(a);
 }
 
 
-// CHECK-LABEL: test_vqneg_s8
-// CHECK: vqneg.s8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vqneg_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VQNEG_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqneg.v8i8(<8 x i8> %a) #4
+// CHECK:   ret <8 x i8> [[VQNEG_V_I]]
 int8x8_t test_vqneg_s8(int8x8_t a) {
   return vqneg_s8(a);
 }
 
-// CHECK-LABEL: test_vqneg_s16
-// CHECK: vqneg.s16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqneg_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VQNEG_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQNEG_V1_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqneg.v4i16(<4 x i16> [[VQNEG_V_I]]) #4
+// CHECK:   [[VQNEG_V2_I:%.*]] = bitcast <4 x i16> [[VQNEG_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQNEG_V2_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP1]]
 int16x4_t test_vqneg_s16(int16x4_t a) {
   return vqneg_s16(a);
 }
 
-// CHECK-LABEL: test_vqneg_s32
-// CHECK: vqneg.s32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqneg_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VQNEG_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQNEG_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqneg.v2i32(<2 x i32> [[VQNEG_V_I]]) #4
+// CHECK:   [[VQNEG_V2_I:%.*]] = bitcast <2 x i32> [[VQNEG_V1_I]] to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[VQNEG_V2_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP1]]
 int32x2_t test_vqneg_s32(int32x2_t a) {
   return vqneg_s32(a);
 }
 
-// CHECK-LABEL: test_vqnegq_s8
-// CHECK: vqneg.s8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vqnegq_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VQNEGQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqneg.v16i8(<16 x i8> %a) #4
+// CHECK:   ret <16 x i8> [[VQNEGQ_V_I]]
 int8x16_t test_vqnegq_s8(int8x16_t a) {
   return vqnegq_s8(a);
 }
 
-// CHECK-LABEL: test_vqnegq_s16
-// CHECK: vqneg.s16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vqnegq_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQNEGQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQNEGQ_V1_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqneg.v8i16(<8 x i16> [[VQNEGQ_V_I]]) #4
+// CHECK:   [[VQNEGQ_V2_I:%.*]] = bitcast <8 x i16> [[VQNEGQ_V1_I]] to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VQNEGQ_V2_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP1]]
 int16x8_t test_vqnegq_s16(int16x8_t a) {
   return vqnegq_s16(a);
 }
 
-// CHECK-LABEL: test_vqnegq_s32
-// CHECK: vqneg.s32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqnegq_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQNEGQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQNEGQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqneg.v4i32(<4 x i32> [[VQNEGQ_V_I]]) #4
+// CHECK:   [[VQNEGQ_V2_I:%.*]] = bitcast <4 x i32> [[VQNEGQ_V1_I]] to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[VQNEGQ_V2_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP1]]
 int32x4_t test_vqnegq_s32(int32x4_t a) {
   return vqnegq_s32(a);
 }
 
 
-// CHECK-LABEL: test_vqrdmulh_s16
-// CHECK: vqrdmulh.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]]) #4
+// CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vqrdmulh_s16(int16x4_t a, int16x4_t b) {
   return vqrdmulh_s16(a, b);
 }
 
-// CHECK-LABEL: test_vqrdmulh_s32
-// CHECK: vqrdmulh.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]]) #4
+// CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vqrdmulh_s32(int32x2_t a, int32x2_t b) {
   return vqrdmulh_s32(a, b);
 }
 
-// CHECK-LABEL: test_vqrdmulhq_s16
-// CHECK: vqrdmulh.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V1_I]]) #4
+// CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vqrdmulhq_s16(int16x8_t a, int16x8_t b) {
   return vqrdmulhq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vqrdmulhq_s32
-// CHECK: vqrdmulh.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]]) #4
+// CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqrdmulhq_s32(int32x4_t a, int32x4_t b) {
   return vqrdmulhq_s32(a, b);
 }
 
 
-// CHECK-LABEL: test_vqrdmulh_lane_s16
-// CHECK: vqrdmulh.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_lane_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V1_I]]) #4
+// CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t b) {
   return vqrdmulh_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vqrdmulh_lane_s32
-// CHECK: vqrdmulh.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_lane_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> <i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8>
+// CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQRDMULH_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V1_I]]) #4
+// CHECK:   [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t b) {
   return vqrdmulh_lane_s32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vqrdmulhq_lane_s16
-// CHECK: vqrdmulh.s16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_lane_s16(<8 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V1_I]]) #4
+// CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t b) {
   return vqrdmulhq_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vqrdmulhq_lane_s32
-// CHECK: vqrdmulh.s32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}[{{[0-9]}}]
+// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_lane_s32(<4 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8>
+// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQRDMULHQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V1_I]]) #4
+// CHECK:   [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t b) {
   return vqrdmulhq_lane_s32(a, b, 1);
 }
 
 
-// CHECK-LABEL: test_vqrdmulh_n_s16
-// CHECK: vqrdmulh.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqrdmulh_n_s16(<4 x i16> %a, i16 signext %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VECINIT3_I]] to <8 x i8>
+// CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQRDMULH_V4_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQRDMULH_V5_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> [[VQRDMULH_V_I]], <4 x i16> [[VQRDMULH_V4_I]]) #4
+// CHECK:   [[VQRDMULH_V6_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V5_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V6_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vqrdmulh_n_s16(int16x4_t a, int16_t b) {
   return vqrdmulh_n_s16(a, b);
 }
 
-// CHECK-LABEL: test_vqrdmulh_n_s32
-// CHECK: vqrdmulh.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqrdmulh_n_s32(<2 x i32> %a, i32 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <2 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <2 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> [[VECINIT1_I]] to <8 x i8>
+// CHECK:   [[VQRDMULH_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQRDMULH_V2_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQRDMULH_V3_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> [[VQRDMULH_V_I]], <2 x i32> [[VQRDMULH_V2_I]]) #4
+// CHECK:   [[VQRDMULH_V4_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V3_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRDMULH_V4_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vqrdmulh_n_s32(int32x2_t a, int32_t b) {
   return vqrdmulh_n_s32(a, b);
 }
 
-// CHECK-LABEL: test_vqrdmulhq_n_s16
-// CHECK: vqrdmulh.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vqrdmulhq_n_s16(<8 x i16> %a, i16 signext %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <8 x i16> undef, i16 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <8 x i16> [[VECINIT_I]], i16 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I]], i16 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I]], i16 %b, i32 3
+// CHECK:   [[VECINIT4_I:%.*]] = insertelement <8 x i16> [[VECINIT3_I]], i16 %b, i32 4
+// CHECK:   [[VECINIT5_I:%.*]] = insertelement <8 x i16> [[VECINIT4_I]], i16 %b, i32 5
+// CHECK:   [[VECINIT6_I:%.*]] = insertelement <8 x i16> [[VECINIT5_I]], i16 %b, i32 6
+// CHECK:   [[VECINIT7_I:%.*]] = insertelement <8 x i16> [[VECINIT6_I]], i16 %b, i32 7
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[VECINIT7_I]] to <16 x i8>
+// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQRDMULHQ_V8_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VQRDMULHQ_V9_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> [[VQRDMULHQ_V_I]], <8 x i16> [[VQRDMULHQ_V8_I]]) #4
+// CHECK:   [[VQRDMULHQ_V10_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V9_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V10_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vqrdmulhq_n_s16(int16x8_t a, int16_t b) {
   return vqrdmulhq_n_s16(a, b);
 }
 
-// CHECK-LABEL: test_vqrdmulhq_n_s32
-// CHECK: vqrdmulh.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqrdmulhq_n_s32(<4 x i32> %a, i32 %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VECINIT_I:%.*]] = insertelement <4 x i32> undef, i32 %b, i32 0
+// CHECK:   [[VECINIT1_I:%.*]] = insertelement <4 x i32> [[VECINIT_I]], i32 %b, i32 1
+// CHECK:   [[VECINIT2_I:%.*]] = insertelement <4 x i32> [[VECINIT1_I]], i32 %b, i32 2
+// CHECK:   [[VECINIT3_I:%.*]] = insertelement <4 x i32> [[VECINIT2_I]], i32 %b, i32 3
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> [[VECINIT3_I]] to <16 x i8>
+// CHECK:   [[VQRDMULHQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQRDMULHQ_V4_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VQRDMULHQ_V5_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> [[VQRDMULHQ_V_I]], <4 x i32> [[VQRDMULHQ_V4_I]]) #4
+// CHECK:   [[VQRDMULHQ_V6_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V5_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRDMULHQ_V6_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqrdmulhq_n_s32(int32x4_t a, int32_t b) {
   return vqrdmulhq_n_s32(a, b);
 }
 
 
-// CHECK-LABEL: test_vqrshl_s8
-// CHECK: vqrshl.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vqrshl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VQRSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshifts.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VQRSHL_V_I]]
 int8x8_t test_vqrshl_s8(int8x8_t a, int8x8_t b) {
   return vqrshl_s8(a, b);
 }
 
-// CHECK-LABEL: test_vqrshl_s16
-// CHECK: vqrshl.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqrshl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshifts.v4i16(<4 x i16> [[VQRSHL_V_I]], <4 x i16> [[VQRSHL_V1_I]]) #4
+// CHECK:   [[VQRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQRSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vqrshl_s16(int16x4_t a, int16x4_t b) {
   return vqrshl_s16(a, b);
 }
 
-// CHECK-LABEL: test_vqrshl_s32
-// CHECK: vqrshl.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqrshl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshifts.v2i32(<2 x i32> [[VQRSHL_V_I]], <2 x i32> [[VQRSHL_V1_I]]) #4
+// CHECK:   [[VQRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQRSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vqrshl_s32(int32x2_t a, int32x2_t b) {
   return vqrshl_s32(a, b);
 }
 
-// CHECK-LABEL: test_vqrshl_s64
-// CHECK: vqrshl.s64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vqrshl_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64> [[VQRSHL_V_I]], <1 x i64> [[VQRSHL_V1_I]]) #4
+// CHECK:   [[VQRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQRSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP2]]
 int64x1_t test_vqrshl_s64(int64x1_t a, int64x1_t b) {
   return vqrshl_s64(a, b);
 }
 
-// CHECK-LABEL: test_vqrshl_u8
-// CHECK: vqrshl.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vqrshl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VQRSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VQRSHL_V_I]]
 uint8x8_t test_vqrshl_u8(uint8x8_t a, int8x8_t b) {
   return vqrshl_u8(a, b);
 }
 
-// CHECK-LABEL: test_vqrshl_u16
-// CHECK: vqrshl.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqrshl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftu.v4i16(<4 x i16> [[VQRSHL_V_I]], <4 x i16> [[VQRSHL_V1_I]]) #4
+// CHECK:   [[VQRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQRSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 uint16x4_t test_vqrshl_u16(uint16x4_t a, int16x4_t b) {
   return vqrshl_u16(a, b);
 }
 
-// CHECK-LABEL: test_vqrshl_u32
-// CHECK: vqrshl.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqrshl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftu.v2i32(<2 x i32> [[VQRSHL_V_I]], <2 x i32> [[VQRSHL_V1_I]]) #4
+// CHECK:   [[VQRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQRSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 uint32x2_t test_vqrshl_u32(uint32x2_t a, int32x2_t b) {
   return vqrshl_u32(a, b);
 }
 
-// CHECK-LABEL: test_vqrshl_u64
-// CHECK: vqrshl.u64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vqrshl_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VQRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VQRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VQRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64> [[VQRSHL_V_I]], <1 x i64> [[VQRSHL_V1_I]]) #4
+// CHECK:   [[VQRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQRSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQRSHL_V3_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP2]]
 uint64x1_t test_vqrshl_u64(uint64x1_t a, int64x1_t b) {
   return vqrshl_u64(a, b);
 }
 
-// CHECK-LABEL: test_vqrshlq_s8
-// CHECK: vqrshl.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vqrshlq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VQRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqrshifts.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VQRSHLQ_V_I]]
 int8x16_t test_vqrshlq_s8(int8x16_t a, int8x16_t b) {
   return vqrshlq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vqrshlq_s16
-// CHECK: vqrshl.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vqrshlq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrshifts.v8i16(<8 x i16> [[VQRSHLQ_V_I]], <8 x i16> [[VQRSHLQ_V1_I]]) #4
+// CHECK:   [[VQRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vqrshlq_s16(int16x8_t a, int16x8_t b) {
   return vqrshlq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vqrshlq_s32
-// CHECK: vqrshl.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqrshlq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrshifts.v4i32(<4 x i32> [[VQRSHLQ_V_I]], <4 x i32> [[VQRSHLQ_V1_I]]) #4
+// CHECK:   [[VQRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqrshlq_s32(int32x4_t a, int32x4_t b) {
   return vqrshlq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vqrshlq_s64
-// CHECK: vqrshl.s64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vqrshlq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqrshifts.v2i64(<2 x i64> [[VQRSHLQ_V_I]], <2 x i64> [[VQRSHLQ_V1_I]]) #4
+// CHECK:   [[VQRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQRSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 int64x2_t test_vqrshlq_s64(int64x2_t a, int64x2_t b) {
   return vqrshlq_s64(a, b);
 }
 
-// CHECK-LABEL: test_vqrshlq_u8
-// CHECK: vqrshl.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vqrshlq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VQRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqrshiftu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VQRSHLQ_V_I]]
 uint8x16_t test_vqrshlq_u8(uint8x16_t a, int8x16_t b) {
   return vqrshlq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vqrshlq_u16
-// CHECK: vqrshl.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vqrshlq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VQRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrshiftu.v8i16(<8 x i16> [[VQRSHLQ_V_I]], <8 x i16> [[VQRSHLQ_V1_I]]) #4
+// CHECK:   [[VQRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 uint16x8_t test_vqrshlq_u16(uint16x8_t a, int16x8_t b) {
   return vqrshlq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vqrshlq_u32
-// CHECK: vqrshl.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqrshlq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VQRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrshiftu.v4i32(<4 x i32> [[VQRSHLQ_V_I]], <4 x i32> [[VQRSHLQ_V1_I]]) #4
+// CHECK:   [[VQRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 uint32x4_t test_vqrshlq_u32(uint32x4_t a, int32x4_t b) {
   return vqrshlq_u32(a, b);
 }
 
-// CHECK-LABEL: test_vqrshlq_u64
-// CHECK: vqrshl.u64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vqrshlq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VQRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqrshiftu.v2i64(<2 x i64> [[VQRSHLQ_V_I]], <2 x i64> [[VQRSHLQ_V1_I]]) #4
+// CHECK:   [[VQRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQRSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQRSHLQ_V3_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 uint64x2_t test_vqrshlq_u64(uint64x2_t a, int64x2_t b) {
   return vqrshlq_u64(a, b);
 }
 
 
-// CHECK-LABEL: test_vqrshrn_n_s16
-// CHECK: vqrshrn.s16 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vqrshrn_n_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftns.v8i8(<8 x i16> [[VQRSHRN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
+// CHECK:   ret <8 x i8> [[VQRSHRN_N1]]
 int8x8_t test_vqrshrn_n_s16(int16x8_t a) {
   return vqrshrn_n_s16(a, 1);
 }
 
-// CHECK-LABEL: test_vqrshrn_n_s32
-// CHECK: vqrshrn.s32 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqrshrn_n_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftns.v4i16(<4 x i32> [[VQRSHRN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
+// CHECK:   ret <4 x i16> [[VQRSHRN_N1]]
 int16x4_t test_vqrshrn_n_s32(int32x4_t a) {
   return vqrshrn_n_s32(a, 1);
 }
 
-// CHECK-LABEL: test_vqrshrn_n_s64
-// CHECK: vqrshrn.s64 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqrshrn_n_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftns.v2i32(<2 x i64> [[VQRSHRN_N]], <2 x i64> <i64 -1, i64 -1>)
+// CHECK:   ret <2 x i32> [[VQRSHRN_N1]]
 int32x2_t test_vqrshrn_n_s64(int64x2_t a) {
   return vqrshrn_n_s64(a, 1);
 }
 
-// CHECK-LABEL: test_vqrshrn_n_u16
-// CHECK: vqrshrn.u16 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vqrshrn_n_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftnu.v8i8(<8 x i16> [[VQRSHRN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
+// CHECK:   ret <8 x i8> [[VQRSHRN_N1]]
 uint8x8_t test_vqrshrn_n_u16(uint16x8_t a) {
   return vqrshrn_n_u16(a, 1);
 }
 
-// CHECK-LABEL: test_vqrshrn_n_u32
-// CHECK: vqrshrn.u32 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqrshrn_n_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftnu.v4i16(<4 x i32> [[VQRSHRN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
+// CHECK:   ret <4 x i16> [[VQRSHRN_N1]]
 uint16x4_t test_vqrshrn_n_u32(uint32x4_t a) {
   return vqrshrn_n_u32(a, 1);
 }
 
-// CHECK-LABEL: test_vqrshrn_n_u64
-// CHECK: vqrshrn.u64 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqrshrn_n_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftnu.v2i32(<2 x i64> [[VQRSHRN_N]], <2 x i64> <i64 -1, i64 -1>)
+// CHECK:   ret <2 x i32> [[VQRSHRN_N1]]
 uint32x2_t test_vqrshrn_n_u64(uint64x2_t a) {
   return vqrshrn_n_u64(a, 1);
 }
 
 
-// CHECK-LABEL: test_vqrshrun_n_s16
-// CHECK: vqrshrun.s16 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vqrshrun_n_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQRSHRUN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqrshiftnsu.v8i8(<8 x i16> [[VQRSHRUN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
+// CHECK:   ret <8 x i8> [[VQRSHRUN_N1]]
 uint8x8_t test_vqrshrun_n_s16(int16x8_t a) {
   return vqrshrun_n_s16(a, 1);
 }
 
-// CHECK-LABEL: test_vqrshrun_n_s32
-// CHECK: vqrshrun.s32 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqrshrun_n_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQRSHRUN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqrshiftnsu.v4i16(<4 x i32> [[VQRSHRUN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
+// CHECK:   ret <4 x i16> [[VQRSHRUN_N1]]
 uint16x4_t test_vqrshrun_n_s32(int32x4_t a) {
   return vqrshrun_n_s32(a, 1);
 }
 
-// CHECK-LABEL: test_vqrshrun_n_s64
-// CHECK: vqrshrun.s64 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqrshrun_n_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQRSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQRSHRUN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqrshiftnsu.v2i32(<2 x i64> [[VQRSHRUN_N]], <2 x i64> <i64 -1, i64 -1>)
+// CHECK:   ret <2 x i32> [[VQRSHRUN_N1]]
 uint32x2_t test_vqrshrun_n_s64(int64x2_t a) {
   return vqrshrun_n_s64(a, 1);
 }
 
 
-// CHECK-LABEL: test_vqshl_s8
-// CHECK: vqshl.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vqshl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VQSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VQSHL_V_I]]
 int8x8_t test_vqshl_s8(int8x8_t a, int8x8_t b) {
   return vqshl_s8(a, b);
 }
 
-// CHECK-LABEL: test_vqshl_s16
-// CHECK: vqshl.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqshl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> [[VQSHL_V_I]], <4 x i16> [[VQSHL_V1_I]]) #4
+// CHECK:   [[VQSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vqshl_s16(int16x4_t a, int16x4_t b) {
   return vqshl_s16(a, b);
 }
 
-// CHECK-LABEL: test_vqshl_s32
-// CHECK: vqshl.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqshl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> [[VQSHL_V_I]], <2 x i32> [[VQSHL_V1_I]]) #4
+// CHECK:   [[VQSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vqshl_s32(int32x2_t a, int32x2_t b) {
   return vqshl_s32(a, b);
 }
 
-// CHECK-LABEL: test_vqshl_s64
-// CHECK: vqshl.s64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vqshl_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64> [[VQSHL_V_I]], <1 x i64> [[VQSHL_V1_I]]) #4
+// CHECK:   [[VQSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP2]]
 int64x1_t test_vqshl_s64(int64x1_t a, int64x1_t b) {
   return vqshl_s64(a, b);
 }
 
-// CHECK-LABEL: test_vqshl_u8
-// CHECK: vqshl.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vqshl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VQSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VQSHL_V_I]]
 uint8x8_t test_vqshl_u8(uint8x8_t a, int8x8_t b) {
   return vqshl_u8(a, b);
 }
 
-// CHECK-LABEL: test_vqshl_u16
-// CHECK: vqshl.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqshl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> [[VQSHL_V_I]], <4 x i16> [[VQSHL_V1_I]]) #4
+// CHECK:   [[VQSHL_V3_I:%.*]] = bitcast <4 x i16> [[VQSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 uint16x4_t test_vqshl_u16(uint16x4_t a, int16x4_t b) {
   return vqshl_u16(a, b);
 }
 
-// CHECK-LABEL: test_vqshl_u32
-// CHECK: vqshl.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqshl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> [[VQSHL_V_I]], <2 x i32> [[VQSHL_V1_I]]) #4
+// CHECK:   [[VQSHL_V3_I:%.*]] = bitcast <2 x i32> [[VQSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 uint32x2_t test_vqshl_u32(uint32x2_t a, int32x2_t b) {
   return vqshl_u32(a, b);
 }
 
-// CHECK-LABEL: test_vqshl_u64
-// CHECK: vqshl.u64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vqshl_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VQSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VQSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VQSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64> [[VQSHL_V_I]], <1 x i64> [[VQSHL_V1_I]]) #4
+// CHECK:   [[VQSHL_V3_I:%.*]] = bitcast <1 x i64> [[VQSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSHL_V3_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP2]]
 uint64x1_t test_vqshl_u64(uint64x1_t a, int64x1_t b) {
   return vqshl_u64(a, b);
 }
 
-// CHECK-LABEL: test_vqshlq_s8
-// CHECK: vqshl.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vqshlq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VQSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VQSHLQ_V_I]]
 int8x16_t test_vqshlq_s8(int8x16_t a, int8x16_t b) {
   return vqshlq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vqshlq_s16
-// CHECK: vqshl.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vqshlq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> [[VQSHLQ_V_I]], <8 x i16> [[VQSHLQ_V1_I]]) #4
+// CHECK:   [[VQSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vqshlq_s16(int16x8_t a, int16x8_t b) {
   return vqshlq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vqshlq_s32
-// CHECK: vqshl.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqshlq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> [[VQSHLQ_V_I]], <4 x i32> [[VQSHLQ_V1_I]]) #4
+// CHECK:   [[VQSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqshlq_s32(int32x4_t a, int32x4_t b) {
   return vqshlq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vqshlq_s64
-// CHECK: vqshl.s64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vqshlq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> [[VQSHLQ_V_I]], <2 x i64> [[VQSHLQ_V1_I]]) #4
+// CHECK:   [[VQSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 int64x2_t test_vqshlq_s64(int64x2_t a, int64x2_t b) {
   return vqshlq_s64(a, b);
 }
 
-// CHECK-LABEL: test_vqshlq_u8
-// CHECK: vqshl.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vqshlq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VQSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VQSHLQ_V_I]]
 uint8x16_t test_vqshlq_u8(uint8x16_t a, int8x16_t b) {
   return vqshlq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vqshlq_u16
-// CHECK: vqshl.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vqshlq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VQSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> [[VQSHLQ_V_I]], <8 x i16> [[VQSHLQ_V1_I]]) #4
+// CHECK:   [[VQSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 uint16x8_t test_vqshlq_u16(uint16x8_t a, int16x8_t b) {
   return vqshlq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vqshlq_u32
-// CHECK: vqshl.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqshlq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VQSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> [[VQSHLQ_V_I]], <4 x i32> [[VQSHLQ_V1_I]]) #4
+// CHECK:   [[VQSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 uint32x4_t test_vqshlq_u32(uint32x4_t a, int32x4_t b) {
   return vqshlq_u32(a, b);
 }
 
-// CHECK-LABEL: test_vqshlq_u64
-// CHECK: vqshl.u64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vqshlq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VQSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> [[VQSHLQ_V_I]], <2 x i64> [[VQSHLQ_V1_I]]) #4
+// CHECK:   [[VQSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSHLQ_V3_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 uint64x2_t test_vqshlq_u64(uint64x2_t a, int64x2_t b) {
   return vqshlq_u64(a, b);
 }
 
 
-// CHECK-LABEL: test_vqshlu_n_s8
-// CHECK: vqshlu.s8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vqshlu_n_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VQSHLU_N:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftsu.v8i8(<8 x i8> %a, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+// CHECK:   ret <8 x i8> [[VQSHLU_N]]
 uint8x8_t test_vqshlu_n_s8(int8x8_t a) {
   return vqshlu_n_s8(a, 1);
 }
 
-// CHECK-LABEL: test_vqshlu_n_s16
-// CHECK: vqshlu.s16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqshlu_n_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQSHLU_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftsu.v4i16(<4 x i16> [[VQSHLU_N]], <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
+// CHECK:   ret <4 x i16> [[VQSHLU_N1]]
 uint16x4_t test_vqshlu_n_s16(int16x4_t a) {
   return vqshlu_n_s16(a, 1);
 }
 
-// CHECK-LABEL: test_vqshlu_n_s32
-// CHECK: vqshlu.s32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqshlu_n_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQSHLU_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftsu.v2i32(<2 x i32> [[VQSHLU_N]], <2 x i32> <i32 1, i32 1>)
+// CHECK:   ret <2 x i32> [[VQSHLU_N1]]
 uint32x2_t test_vqshlu_n_s32(int32x2_t a) {
   return vqshlu_n_s32(a, 1);
 }
 
-// CHECK-LABEL: test_vqshlu_n_s64
-// CHECK: vqshlu.s64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vqshlu_n_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[VQSHLU_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VQSHLU_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vqshiftsu.v1i64(<1 x i64> [[VQSHLU_N]], <1 x i64> <i64 1>)
+// CHECK:   ret <1 x i64> [[VQSHLU_N1]]
 uint64x1_t test_vqshlu_n_s64(int64x1_t a) {
   return vqshlu_n_s64(a, 1);
 }
 
-// CHECK-LABEL: test_vqshluq_n_s8
-// CHECK: vqshlu.s8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vqshluq_n_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VQSHLU_N:%.*]] = call <16 x i8> @llvm.arm.neon.vqshiftsu.v16i8(<16 x i8> %a, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+// CHECK:   ret <16 x i8> [[VQSHLU_N]]
 uint8x16_t test_vqshluq_n_s8(int8x16_t a) {
   return vqshluq_n_s8(a, 1);
 }
 
-// CHECK-LABEL: test_vqshluq_n_s16
-// CHECK: vqshlu.s16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vqshluq_n_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQSHLU_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vqshiftsu.v8i16(<8 x i16> [[VQSHLU_N]], <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
+// CHECK:   ret <8 x i16> [[VQSHLU_N1]]
 uint16x8_t test_vqshluq_n_s16(int16x8_t a) {
   return vqshluq_n_s16(a, 1);
 }
 
-// CHECK-LABEL: test_vqshluq_n_s32
-// CHECK: vqshlu.s32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqshluq_n_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQSHLU_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vqshiftsu.v4i32(<4 x i32> [[VQSHLU_N]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
+// CHECK:   ret <4 x i32> [[VQSHLU_N1]]
 uint32x4_t test_vqshluq_n_s32(int32x4_t a) {
   return vqshluq_n_s32(a, 1);
 }
 
-// CHECK-LABEL: test_vqshluq_n_s64
-// CHECK: vqshlu.s64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vqshluq_n_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQSHLU_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQSHLU_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vqshiftsu.v2i64(<2 x i64> [[VQSHLU_N]], <2 x i64> <i64 1, i64 1>)
+// CHECK:   ret <2 x i64> [[VQSHLU_N1]]
 uint64x2_t test_vqshluq_n_s64(int64x2_t a) {
   return vqshluq_n_s64(a, 1);
 }
 
 
-// CHECK-LABEL: test_vqshl_n_s8
-// CHECK: vqshl.s8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vqshl_n_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VQSHL_N:%.*]] = call <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8> %a, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+// CHECK:   ret <8 x i8> [[VQSHL_N]]
 int8x8_t test_vqshl_n_s8(int8x8_t a) {
   return vqshl_n_s8(a, 1);
 }
 
-// CHECK-LABEL: test_vqshl_n_s16
-// CHECK: vqshl.s16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqshl_n_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> [[VQSHL_N]], <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
+// CHECK:   ret <4 x i16> [[VQSHL_N1]]
 int16x4_t test_vqshl_n_s16(int16x4_t a) {
   return vqshl_n_s16(a, 1);
 }
 
-// CHECK-LABEL: test_vqshl_n_s32
-// CHECK: vqshl.s32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqshl_n_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> [[VQSHL_N]], <2 x i32> <i32 1, i32 1>)
+// CHECK:   ret <2 x i32> [[VQSHL_N1]]
 int32x2_t test_vqshl_n_s32(int32x2_t a) {
   return vqshl_n_s32(a, 1);
 }
 
-// CHECK-LABEL: test_vqshl_n_s64
-// CHECK: vqshl.s64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vqshl_n_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64> [[VQSHL_N]], <1 x i64> <i64 1>)
+// CHECK:   ret <1 x i64> [[VQSHL_N1]]
 int64x1_t test_vqshl_n_s64(int64x1_t a) {
   return vqshl_n_s64(a, 1);
 }
 
-// CHECK-LABEL: test_vqshl_n_u8
-// CHECK: vqshl.u8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vqshl_n_u8(<8 x i8> %a) #0 {
+// CHECK:   [[VQSHL_N:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8> %a, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+// CHECK:   ret <8 x i8> [[VQSHL_N]]
 uint8x8_t test_vqshl_n_u8(uint8x8_t a) {
   return vqshl_n_u8(a, 1);
 }
 
-// CHECK-LABEL: test_vqshl_n_u16
-// CHECK: vqshl.u16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqshl_n_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQSHL_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> [[VQSHL_N]], <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
+// CHECK:   ret <4 x i16> [[VQSHL_N1]]
 uint16x4_t test_vqshl_n_u16(uint16x4_t a) {
   return vqshl_n_u16(a, 1);
 }
 
-// CHECK-LABEL: test_vqshl_n_u32
-// CHECK: vqshl.u32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqshl_n_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQSHL_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> [[VQSHL_N]], <2 x i32> <i32 1, i32 1>)
+// CHECK:   ret <2 x i32> [[VQSHL_N1]]
 uint32x2_t test_vqshl_n_u32(uint32x2_t a) {
   return vqshl_n_u32(a, 1);
 }
 
-// CHECK-LABEL: test_vqshl_n_u64
-// CHECK: vqshl.u64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vqshl_n_u64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VQSHL_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64> [[VQSHL_N]], <1 x i64> <i64 1>)
+// CHECK:   ret <1 x i64> [[VQSHL_N1]]
 uint64x1_t test_vqshl_n_u64(uint64x1_t a) {
   return vqshl_n_u64(a, 1);
 }
 
-// CHECK-LABEL: test_vqshlq_n_s8
-// CHECK: vqshl.s8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vqshlq_n_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VQSHL_N:%.*]] = call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> %a, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+// CHECK:   ret <16 x i8> [[VQSHL_N]]
 int8x16_t test_vqshlq_n_s8(int8x16_t a) {
   return vqshlq_n_s8(a, 1);
 }
 
-// CHECK-LABEL: test_vqshlq_n_s16
-// CHECK: vqshl.s16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vqshlq_n_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> [[VQSHL_N]], <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
+// CHECK:   ret <8 x i16> [[VQSHL_N1]]
 int16x8_t test_vqshlq_n_s16(int16x8_t a) {
   return vqshlq_n_s16(a, 1);
 }
 
-// CHECK-LABEL: test_vqshlq_n_s32
-// CHECK: vqshl.s32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqshlq_n_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> [[VQSHL_N]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
+// CHECK:   ret <4 x i32> [[VQSHL_N1]]
 int32x4_t test_vqshlq_n_s32(int32x4_t a) {
   return vqshlq_n_s32(a, 1);
 }
 
-// CHECK-LABEL: test_vqshlq_n_s64
-// CHECK: vqshl.s64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vqshlq_n_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> [[VQSHL_N]], <2 x i64> <i64 1, i64 1>)
+// CHECK:   ret <2 x i64> [[VQSHL_N1]]
 int64x2_t test_vqshlq_n_s64(int64x2_t a) {
   return vqshlq_n_s64(a, 1);
 }
 
-// CHECK-LABEL: test_vqshlq_n_u8
-// CHECK: vqshl.u8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vqshlq_n_u8(<16 x i8> %a) #0 {
+// CHECK:   [[VQSHL_N:%.*]] = call <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8> %a, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+// CHECK:   ret <16 x i8> [[VQSHL_N]]
 uint8x16_t test_vqshlq_n_u8(uint8x16_t a) {
   return vqshlq_n_u8(a, 1);
 }
 
-// CHECK-LABEL: test_vqshlq_n_u16
-// CHECK: vqshl.u16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vqshlq_n_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQSHL_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> [[VQSHL_N]], <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
+// CHECK:   ret <8 x i16> [[VQSHL_N1]]
 uint16x8_t test_vqshlq_n_u16(uint16x8_t a) {
   return vqshlq_n_u16(a, 1);
 }
 
-// CHECK-LABEL: test_vqshlq_n_u32
-// CHECK: vqshl.u32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqshlq_n_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQSHL_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> [[VQSHL_N]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
+// CHECK:   ret <4 x i32> [[VQSHL_N1]]
 uint32x4_t test_vqshlq_n_u32(uint32x4_t a) {
   return vqshlq_n_u32(a, 1);
 }
 
-// CHECK-LABEL: test_vqshlq_n_u64
-// CHECK: vqshl.u64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vqshlq_n_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQSHL_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQSHL_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> [[VQSHL_N]], <2 x i64> <i64 1, i64 1>)
+// CHECK:   ret <2 x i64> [[VQSHL_N1]]
 uint64x2_t test_vqshlq_n_u64(uint64x2_t a) {
   return vqshlq_n_u64(a, 1);
 }
 
 
-// CHECK-LABEL: test_vqshrn_n_s16
-// CHECK: vqshrn.s16 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vqshrn_n_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftns.v8i8(<8 x i16> [[VQSHRN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
+// CHECK:   ret <8 x i8> [[VQSHRN_N1]]
 int8x8_t test_vqshrn_n_s16(int16x8_t a) {
   return vqshrn_n_s16(a, 1);
 }
 
-// CHECK-LABEL: test_vqshrn_n_s32
-// CHECK: vqshrn.s32 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqshrn_n_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftns.v4i16(<4 x i32> [[VQSHRN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
+// CHECK:   ret <4 x i16> [[VQSHRN_N1]]
 int16x4_t test_vqshrn_n_s32(int32x4_t a) {
   return vqshrn_n_s32(a, 1);
 }
 
-// CHECK-LABEL: test_vqshrn_n_s64
-// CHECK: vqshrn.s64 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqshrn_n_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftns.v2i32(<2 x i64> [[VQSHRN_N]], <2 x i64> <i64 -1, i64 -1>)
+// CHECK:   ret <2 x i32> [[VQSHRN_N1]]
 int32x2_t test_vqshrn_n_s64(int64x2_t a) {
   return vqshrn_n_s64(a, 1);
 }
 
-// CHECK-LABEL: test_vqshrn_n_u16
-// CHECK: vqshrn.u16 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vqshrn_n_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftnu.v8i8(<8 x i16> [[VQSHRN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
+// CHECK:   ret <8 x i8> [[VQSHRN_N1]]
 uint8x8_t test_vqshrn_n_u16(uint16x8_t a) {
   return vqshrn_n_u16(a, 1);
 }
 
-// CHECK-LABEL: test_vqshrn_n_u32
-// CHECK: vqshrn.u32 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqshrn_n_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftnu.v4i16(<4 x i32> [[VQSHRN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
+// CHECK:   ret <4 x i16> [[VQSHRN_N1]]
 uint16x4_t test_vqshrn_n_u32(uint32x4_t a) {
   return vqshrn_n_u32(a, 1);
 }
 
-// CHECK-LABEL: test_vqshrn_n_u64
-// CHECK: vqshrn.u64 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqshrn_n_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftnu.v2i32(<2 x i64> [[VQSHRN_N]], <2 x i64> <i64 -1, i64 -1>)
+// CHECK:   ret <2 x i32> [[VQSHRN_N1]]
 uint32x2_t test_vqshrn_n_u64(uint64x2_t a) {
   return vqshrn_n_u64(a, 1);
 }
 
 
-// CHECK-LABEL: test_vqshrun_n_s16
-// CHECK: vqshrun.s16 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vqshrun_n_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQSHRUN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vqshiftnsu.v8i8(<8 x i16> [[VQSHRUN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
+// CHECK:   ret <8 x i8> [[VQSHRUN_N1]]
 uint8x8_t test_vqshrun_n_s16(int16x8_t a) {
   return vqshrun_n_s16(a, 1);
 }
 
-// CHECK-LABEL: test_vqshrun_n_s32
-// CHECK: vqshrun.s32 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqshrun_n_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQSHRUN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vqshiftnsu.v4i16(<4 x i32> [[VQSHRUN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
+// CHECK:   ret <4 x i16> [[VQSHRUN_N1]]
 uint16x4_t test_vqshrun_n_s32(int32x4_t a) {
   return vqshrun_n_s32(a, 1);
 }
 
-// CHECK-LABEL: test_vqshrun_n_s64
-// CHECK: vqshrun.s64 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqshrun_n_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VQSHRUN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQSHRUN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vqshiftnsu.v2i32(<2 x i64> [[VQSHRUN_N]], <2 x i64> <i64 -1, i64 -1>)
+// CHECK:   ret <2 x i32> [[VQSHRUN_N1]]
 uint32x2_t test_vqshrun_n_s64(int64x2_t a) {
   return vqshrun_n_s64(a, 1);
 }
 
 
-// CHECK-LABEL: test_vqsub_s8
-// CHECK: vqsub.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vqsub_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqsubs.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VQSUB_V_I]]
 int8x8_t test_vqsub_s8(int8x8_t a, int8x8_t b) {
   return vqsub_s8(a, b);
 }
 
-// CHECK-LABEL: test_vqsub_s16
-// CHECK: vqsub.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqsub_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16> [[VQSUB_V_I]], <4 x i16> [[VQSUB_V1_I]]) #4
+// CHECK:   [[VQSUB_V3_I:%.*]] = bitcast <4 x i16> [[VQSUB_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vqsub_s16(int16x4_t a, int16x4_t b) {
   return vqsub_s16(a, b);
 }
 
-// CHECK-LABEL: test_vqsub_s32
-// CHECK: vqsub.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqsub_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32> [[VQSUB_V_I]], <2 x i32> [[VQSUB_V1_I]]) #4
+// CHECK:   [[VQSUB_V3_I:%.*]] = bitcast <2 x i32> [[VQSUB_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vqsub_s32(int32x2_t a, int32x2_t b) {
   return vqsub_s32(a, b);
 }
 
-// CHECK-LABEL: test_vqsub_s64
-// CHECK: vqsub.s64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vqsub_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqsubs.v1i64(<1 x i64> [[VQSUB_V_I]], <1 x i64> [[VQSUB_V1_I]]) #4
+// CHECK:   [[VQSUB_V3_I:%.*]] = bitcast <1 x i64> [[VQSUB_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP2]]
 int64x1_t test_vqsub_s64(int64x1_t a, int64x1_t b) {
   return vqsub_s64(a, b);
 }
 
-// CHECK-LABEL: test_vqsub_u8
-// CHECK: vqsub.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vqsub_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VQSUB_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vqsubu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VQSUB_V_I]]
 uint8x8_t test_vqsub_u8(uint8x8_t a, uint8x8_t b) {
   return vqsub_u8(a, b);
 }
 
-// CHECK-LABEL: test_vqsub_u16
-// CHECK: vqsub.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vqsub_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VQSUB_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqsubu.v4i16(<4 x i16> [[VQSUB_V_I]], <4 x i16> [[VQSUB_V1_I]]) #4
+// CHECK:   [[VQSUB_V3_I:%.*]] = bitcast <4 x i16> [[VQSUB_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 uint16x4_t test_vqsub_u16(uint16x4_t a, uint16x4_t b) {
   return vqsub_u16(a, b);
 }
 
-// CHECK-LABEL: test_vqsub_u32
-// CHECK: vqsub.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vqsub_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VQSUB_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqsubu.v2i32(<2 x i32> [[VQSUB_V_I]], <2 x i32> [[VQSUB_V1_I]]) #4
+// CHECK:   [[VQSUB_V3_I:%.*]] = bitcast <2 x i32> [[VQSUB_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 uint32x2_t test_vqsub_u32(uint32x2_t a, uint32x2_t b) {
   return vqsub_u32(a, b);
 }
 
-// CHECK-LABEL: test_vqsub_u64
-// CHECK: vqsub.u64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vqsub_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VQSUB_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VQSUB_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VQSUB_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vqsubu.v1i64(<1 x i64> [[VQSUB_V_I]], <1 x i64> [[VQSUB_V1_I]]) #4
+// CHECK:   [[VQSUB_V3_I:%.*]] = bitcast <1 x i64> [[VQSUB_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VQSUB_V3_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP2]]
 uint64x1_t test_vqsub_u64(uint64x1_t a, uint64x1_t b) {
   return vqsub_u64(a, b);
 }
 
-// CHECK-LABEL: test_vqsubq_s8
-// CHECK: vqsub.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vqsubq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqsubs.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VQSUBQ_V_I]]
 int8x16_t test_vqsubq_s8(int8x16_t a, int8x16_t b) {
   return vqsubq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vqsubq_s16
-// CHECK: vqsub.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vqsubq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16> [[VQSUBQ_V_I]], <8 x i16> [[VQSUBQ_V1_I]]) #4
+// CHECK:   [[VQSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSUBQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vqsubq_s16(int16x8_t a, int16x8_t b) {
   return vqsubq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vqsubq_s32
-// CHECK: vqsub.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqsubq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> [[VQSUBQ_V_I]], <4 x i32> [[VQSUBQ_V1_I]]) #4
+// CHECK:   [[VQSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSUBQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vqsubq_s32(int32x4_t a, int32x4_t b) {
   return vqsubq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vqsubq_s64
-// CHECK: vqsub.s64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vqsubq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> [[VQSUBQ_V_I]], <2 x i64> [[VQSUBQ_V1_I]]) #4
+// CHECK:   [[VQSUBQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSUBQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 int64x2_t test_vqsubq_s64(int64x2_t a, int64x2_t b) {
   return vqsubq_s64(a, b);
 }
 
-// CHECK-LABEL: test_vqsubq_u8
-// CHECK: vqsub.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vqsubq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VQSUBQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vqsubu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VQSUBQ_V_I]]
 uint8x16_t test_vqsubq_u8(uint8x16_t a, uint8x16_t b) {
   return vqsubq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vqsubq_u16
-// CHECK: vqsub.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vqsubq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VQSUBQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16> [[VQSUBQ_V_I]], <8 x i16> [[VQSUBQ_V1_I]]) #4
+// CHECK:   [[VQSUBQ_V3_I:%.*]] = bitcast <8 x i16> [[VQSUBQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 uint16x8_t test_vqsubq_u16(uint16x8_t a, uint16x8_t b) {
   return vqsubq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vqsubq_u32
-// CHECK: vqsub.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vqsubq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VQSUBQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqsubu.v4i32(<4 x i32> [[VQSUBQ_V_I]], <4 x i32> [[VQSUBQ_V1_I]]) #4
+// CHECK:   [[VQSUBQ_V3_I:%.*]] = bitcast <4 x i32> [[VQSUBQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 uint32x4_t test_vqsubq_u32(uint32x4_t a, uint32x4_t b) {
   return vqsubq_u32(a, b);
 }
 
-// CHECK-LABEL: test_vqsubq_u64
-// CHECK: vqsub.u64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vqsubq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VQSUBQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VQSUBQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VQSUBQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqsubu.v2i64(<2 x i64> [[VQSUBQ_V_I]], <2 x i64> [[VQSUBQ_V1_I]]) #4
+// CHECK:   [[VQSUBQ_V3_I:%.*]] = bitcast <2 x i64> [[VQSUBQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VQSUBQ_V3_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 uint64x2_t test_vqsubq_u64(uint64x2_t a, uint64x2_t b) {
   return vqsubq_u64(a, b);
 }
 
 
-// CHECK-LABEL: test_vraddhn_s16
-// CHECK: vraddhn.i16 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vraddhn_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> [[VRADDHN_V_I]], <8 x i16> [[VRADDHN_V1_I]]) #4
+// CHECK:   ret <8 x i8> [[VRADDHN_V2_I]]
 int8x8_t test_vraddhn_s16(int16x8_t a, int16x8_t b) {
   return vraddhn_s16(a, b);
 }
 
-// CHECK-LABEL: test_vraddhn_s32
-// CHECK: vraddhn.i32 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vraddhn_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> [[VRADDHN_V_I]], <4 x i32> [[VRADDHN_V1_I]]) #4
+// CHECK:   [[VRADDHN_V3_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vraddhn_s32(int32x4_t a, int32x4_t b) {
   return vraddhn_s32(a, b);
 }
 
-// CHECK-LABEL: test_vraddhn_s64
-// CHECK: vraddhn.i64 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vraddhn_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> [[VRADDHN_V_I]], <2 x i64> [[VRADDHN_V1_I]]) #4
+// CHECK:   [[VRADDHN_V3_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vraddhn_s64(int64x2_t a, int64x2_t b) {
   return vraddhn_s64(a, b);
 }
 
-// CHECK-LABEL: test_vraddhn_u16
-// CHECK: vraddhn.i16 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vraddhn_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VRADDHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> [[VRADDHN_V_I]], <8 x i16> [[VRADDHN_V1_I]]) #4
+// CHECK:   ret <8 x i8> [[VRADDHN_V2_I]]
 uint8x8_t test_vraddhn_u16(uint16x8_t a, uint16x8_t b) {
   return vraddhn_u16(a, b);
 }
 
-// CHECK-LABEL: test_vraddhn_u32
-// CHECK: vraddhn.i32 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vraddhn_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VRADDHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> [[VRADDHN_V_I]], <4 x i32> [[VRADDHN_V1_I]]) #4
+// CHECK:   [[VRADDHN_V3_I:%.*]] = bitcast <4 x i16> [[VRADDHN_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 uint16x4_t test_vraddhn_u32(uint32x4_t a, uint32x4_t b) {
   return vraddhn_u32(a, b);
 }
 
-// CHECK-LABEL: test_vraddhn_u64
-// CHECK: vraddhn.i64 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vraddhn_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VRADDHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VRADDHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VRADDHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> [[VRADDHN_V_I]], <2 x i64> [[VRADDHN_V1_I]]) #4
+// CHECK:   [[VRADDHN_V3_I:%.*]] = bitcast <2 x i32> [[VRADDHN_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRADDHN_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 uint32x2_t test_vraddhn_u64(uint64x2_t a, uint64x2_t b) {
   return vraddhn_u64(a, b);
 }
 
 
-// CHECK-LABEL: test_vrecpe_f32
-// CHECK: vrecpe.f32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vrecpe_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VRECPE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VRECPE_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> [[VRECPE_V_I]]) #4
+// CHECK:   ret <2 x float> [[VRECPE_V1_I]]
 float32x2_t test_vrecpe_f32(float32x2_t a) {
   return vrecpe_f32(a);
 }
 
-// CHECK-LABEL: test_vrecpe_u32
-// CHECK: vrecpe.u32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vrecpe_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VRECPE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VRECPE_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrecpe.v2i32(<2 x i32> [[VRECPE_V_I]]) #4
+// CHECK:   ret <2 x i32> [[VRECPE_V1_I]]
 uint32x2_t test_vrecpe_u32(uint32x2_t a) {
   return vrecpe_u32(a);
 }
 
-// CHECK-LABEL: test_vrecpeq_f32
-// CHECK: vrecpe.f32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x float> @test_vrecpeq_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VRECPEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VRECPEQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> [[VRECPEQ_V_I]]) #4
+// CHECK:   ret <4 x float> [[VRECPEQ_V1_I]]
 float32x4_t test_vrecpeq_f32(float32x4_t a) {
   return vrecpeq_f32(a);
 }
 
-// CHECK-LABEL: test_vrecpeq_u32
-// CHECK: vrecpe.u32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vrecpeq_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VRECPEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VRECPEQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrecpe.v4i32(<4 x i32> [[VRECPEQ_V_I]]) #4
+// CHECK:   ret <4 x i32> [[VRECPEQ_V1_I]]
 uint32x4_t test_vrecpeq_u32(uint32x4_t a) {
   return vrecpeq_u32(a);
 }
 
 
-// CHECK-LABEL: test_vrecps_f32
-// CHECK: vrecps.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vrecps_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[VRECPS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VRECPS_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[VRECPS_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float> [[VRECPS_V_I]], <2 x float> [[VRECPS_V1_I]]) #4
+// CHECK:   [[VRECPS_V3_I:%.*]] = bitcast <2 x float> [[VRECPS_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRECPS_V3_I]] to <2 x float>
+// CHECK:   ret <2 x float> [[TMP2]]
 float32x2_t test_vrecps_f32(float32x2_t a, float32x2_t b) {
   return vrecps_f32(a, b);
 }
 
-// CHECK-LABEL: test_vrecpsq_f32
-// CHECK: vrecps.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x float> @test_vrecpsq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[VRECPSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VRECPSQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[VRECPSQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float> [[VRECPSQ_V_I]], <4 x float> [[VRECPSQ_V1_I]]) #4
+// CHECK:   [[VRECPSQ_V3_I:%.*]] = bitcast <4 x float> [[VRECPSQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRECPSQ_V3_I]] to <4 x float>
+// CHECK:   ret <4 x float> [[TMP2]]
 float32x4_t test_vrecpsq_f32(float32x4_t a, float32x4_t b) {
   return vrecpsq_f32(a, b);
 }
 
 
-// CHECK-LABEL: test_vreinterpret_s8_s16
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vreinterpret_s8_s16(int16x4_t a) {
   return vreinterpret_s8_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s8_s32
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vreinterpret_s8_s32(int32x2_t a) {
   return vreinterpret_s8_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s8_s64
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vreinterpret_s8_s64(int64x1_t a) {
   return vreinterpret_s8_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s8_u8
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u8(<8 x i8> %a) #0 {
+// CHECK:   ret <8 x i8> %a
 int8x8_t test_vreinterpret_s8_u8(uint8x8_t a) {
   return vreinterpret_s8_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s8_u16
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vreinterpret_s8_u16(uint16x4_t a) {
   return vreinterpret_s8_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s8_u32
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vreinterpret_s8_u32(uint32x2_t a) {
   return vreinterpret_s8_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s8_u64
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_u64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vreinterpret_s8_u64(uint64x1_t a) {
   return vreinterpret_s8_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s8_f16
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_f16(<4 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vreinterpret_s8_f16(float16x4_t a) {
   return vreinterpret_s8_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s8_f32
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vreinterpret_s8_f32(float32x2_t a) {
   return vreinterpret_s8_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s8_p8
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_p8(<8 x i8> %a) #0 {
+// CHECK:   ret <8 x i8> %a
 int8x8_t test_vreinterpret_s8_p8(poly8x8_t a) {
   return vreinterpret_s8_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s8_p16
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_s8_p16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vreinterpret_s8_p16(poly16x4_t a) {
   return vreinterpret_s8_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s16_s8
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_s8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 int16x4_t test_vreinterpret_s16_s8(int8x8_t a) {
   return vreinterpret_s16_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s16_s32
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 int16x4_t test_vreinterpret_s16_s32(int32x2_t a) {
   return vreinterpret_s16_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s16_s64
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 int16x4_t test_vreinterpret_s16_s64(int64x1_t a) {
   return vreinterpret_s16_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s16_u8
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 int16x4_t test_vreinterpret_s16_u8(uint8x8_t a) {
   return vreinterpret_s16_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s16_u16
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u16(<4 x i16> %a) #0 {
+// CHECK:   ret <4 x i16> %a
 int16x4_t test_vreinterpret_s16_u16(uint16x4_t a) {
   return vreinterpret_s16_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s16_u32
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 int16x4_t test_vreinterpret_s16_u32(uint32x2_t a) {
   return vreinterpret_s16_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s16_u64
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_u64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 int16x4_t test_vreinterpret_s16_u64(uint64x1_t a) {
   return vreinterpret_s16_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s16_f16
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_f16(<4 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 int16x4_t test_vreinterpret_s16_f16(float16x4_t a) {
   return vreinterpret_s16_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s16_f32
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 int16x4_t test_vreinterpret_s16_f32(float32x2_t a) {
   return vreinterpret_s16_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s16_p8
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_p8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 int16x4_t test_vreinterpret_s16_p8(poly8x8_t a) {
   return vreinterpret_s16_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s16_p16
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_s16_p16(<4 x i16> %a) #0 {
+// CHECK:   ret <4 x i16> %a
 int16x4_t test_vreinterpret_s16_p16(poly16x4_t a) {
   return vreinterpret_s16_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s32_s8
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_s8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 int32x2_t test_vreinterpret_s32_s8(int8x8_t a) {
   return vreinterpret_s32_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s32_s16
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 int32x2_t test_vreinterpret_s32_s16(int16x4_t a) {
   return vreinterpret_s32_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s32_s64
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 int32x2_t test_vreinterpret_s32_s64(int64x1_t a) {
   return vreinterpret_s32_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s32_u8
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 int32x2_t test_vreinterpret_s32_u8(uint8x8_t a) {
   return vreinterpret_s32_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s32_u16
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 int32x2_t test_vreinterpret_s32_u16(uint16x4_t a) {
   return vreinterpret_s32_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s32_u32
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u32(<2 x i32> %a) #0 {
+// CHECK:   ret <2 x i32> %a
 int32x2_t test_vreinterpret_s32_u32(uint32x2_t a) {
   return vreinterpret_s32_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s32_u64
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_u64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 int32x2_t test_vreinterpret_s32_u64(uint64x1_t a) {
   return vreinterpret_s32_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s32_f16
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_f16(<4 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 int32x2_t test_vreinterpret_s32_f16(float16x4_t a) {
   return vreinterpret_s32_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s32_f32
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 int32x2_t test_vreinterpret_s32_f32(float32x2_t a) {
   return vreinterpret_s32_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s32_p8
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_p8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 int32x2_t test_vreinterpret_s32_p8(poly8x8_t a) {
   return vreinterpret_s32_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s32_p16
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_s32_p16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 int32x2_t test_vreinterpret_s32_p16(poly16x4_t a) {
   return vreinterpret_s32_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s64_s8
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_s8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 int64x1_t test_vreinterpret_s64_s8(int8x8_t a) {
   return vreinterpret_s64_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s64_s16
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 int64x1_t test_vreinterpret_s64_s16(int16x4_t a) {
   return vreinterpret_s64_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s64_s32
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 int64x1_t test_vreinterpret_s64_s32(int32x2_t a) {
   return vreinterpret_s64_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s64_u8
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 int64x1_t test_vreinterpret_s64_u8(uint8x8_t a) {
   return vreinterpret_s64_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s64_u16
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 int64x1_t test_vreinterpret_s64_u16(uint16x4_t a) {
   return vreinterpret_s64_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s64_u32
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 int64x1_t test_vreinterpret_s64_u32(uint32x2_t a) {
   return vreinterpret_s64_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s64_u64
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_u64(<1 x i64> %a) #0 {
+// CHECK:   ret <1 x i64> %a
 int64x1_t test_vreinterpret_s64_u64(uint64x1_t a) {
   return vreinterpret_s64_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s64_f16
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_f16(<4 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 int64x1_t test_vreinterpret_s64_f16(float16x4_t a) {
   return vreinterpret_s64_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s64_f32
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 int64x1_t test_vreinterpret_s64_f32(float32x2_t a) {
   return vreinterpret_s64_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s64_p8
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_p8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 int64x1_t test_vreinterpret_s64_p8(poly8x8_t a) {
   return vreinterpret_s64_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_s64_p16
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_s64_p16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 int64x1_t test_vreinterpret_s64_p16(poly16x4_t a) {
   return vreinterpret_s64_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u8_s8
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s8(<8 x i8> %a) #0 {
+// CHECK:   ret <8 x i8> %a
 uint8x8_t test_vreinterpret_u8_s8(int8x8_t a) {
   return vreinterpret_u8_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u8_s16
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 uint8x8_t test_vreinterpret_u8_s16(int16x4_t a) {
   return vreinterpret_u8_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u8_s32
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 uint8x8_t test_vreinterpret_u8_s32(int32x2_t a) {
   return vreinterpret_u8_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u8_s64
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 uint8x8_t test_vreinterpret_u8_s64(int64x1_t a) {
   return vreinterpret_u8_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u8_u16
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 uint8x8_t test_vreinterpret_u8_u16(uint16x4_t a) {
   return vreinterpret_u8_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u8_u32
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 uint8x8_t test_vreinterpret_u8_u32(uint32x2_t a) {
   return vreinterpret_u8_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u8_u64
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_u64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 uint8x8_t test_vreinterpret_u8_u64(uint64x1_t a) {
   return vreinterpret_u8_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u8_f16
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_f16(<4 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 uint8x8_t test_vreinterpret_u8_f16(float16x4_t a) {
   return vreinterpret_u8_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u8_f32
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 uint8x8_t test_vreinterpret_u8_f32(float32x2_t a) {
   return vreinterpret_u8_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u8_p8
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_p8(<8 x i8> %a) #0 {
+// CHECK:   ret <8 x i8> %a
 uint8x8_t test_vreinterpret_u8_p8(poly8x8_t a) {
   return vreinterpret_u8_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u8_p16
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_u8_p16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 uint8x8_t test_vreinterpret_u8_p16(poly16x4_t a) {
   return vreinterpret_u8_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u16_s8
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 uint16x4_t test_vreinterpret_u16_s8(int8x8_t a) {
   return vreinterpret_u16_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u16_s16
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s16(<4 x i16> %a) #0 {
+// CHECK:   ret <4 x i16> %a
 uint16x4_t test_vreinterpret_u16_s16(int16x4_t a) {
   return vreinterpret_u16_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u16_s32
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 uint16x4_t test_vreinterpret_u16_s32(int32x2_t a) {
   return vreinterpret_u16_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u16_s64
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 uint16x4_t test_vreinterpret_u16_s64(int64x1_t a) {
   return vreinterpret_u16_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u16_u8
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_u8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 uint16x4_t test_vreinterpret_u16_u8(uint8x8_t a) {
   return vreinterpret_u16_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u16_u32
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 uint16x4_t test_vreinterpret_u16_u32(uint32x2_t a) {
   return vreinterpret_u16_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u16_u64
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_u64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 uint16x4_t test_vreinterpret_u16_u64(uint64x1_t a) {
   return vreinterpret_u16_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u16_f16
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_f16(<4 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 uint16x4_t test_vreinterpret_u16_f16(float16x4_t a) {
   return vreinterpret_u16_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u16_f32
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 uint16x4_t test_vreinterpret_u16_f32(float32x2_t a) {
   return vreinterpret_u16_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u16_p8
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_p8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 uint16x4_t test_vreinterpret_u16_p8(poly8x8_t a) {
   return vreinterpret_u16_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u16_p16
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_u16_p16(<4 x i16> %a) #0 {
+// CHECK:   ret <4 x i16> %a
 uint16x4_t test_vreinterpret_u16_p16(poly16x4_t a) {
   return vreinterpret_u16_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u32_s8
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 uint32x2_t test_vreinterpret_u32_s8(int8x8_t a) {
   return vreinterpret_u32_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u32_s16
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 uint32x2_t test_vreinterpret_u32_s16(int16x4_t a) {
   return vreinterpret_u32_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u32_s32
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s32(<2 x i32> %a) #0 {
+// CHECK:   ret <2 x i32> %a
 uint32x2_t test_vreinterpret_u32_s32(int32x2_t a) {
   return vreinterpret_u32_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u32_s64
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 uint32x2_t test_vreinterpret_u32_s64(int64x1_t a) {
   return vreinterpret_u32_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u32_u8
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_u8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 uint32x2_t test_vreinterpret_u32_u8(uint8x8_t a) {
   return vreinterpret_u32_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u32_u16
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 uint32x2_t test_vreinterpret_u32_u16(uint16x4_t a) {
   return vreinterpret_u32_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u32_u64
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_u64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 uint32x2_t test_vreinterpret_u32_u64(uint64x1_t a) {
   return vreinterpret_u32_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u32_f16
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_f16(<4 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 uint32x2_t test_vreinterpret_u32_f16(float16x4_t a) {
   return vreinterpret_u32_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u32_f32
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 uint32x2_t test_vreinterpret_u32_f32(float32x2_t a) {
   return vreinterpret_u32_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u32_p8
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_p8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 uint32x2_t test_vreinterpret_u32_p8(poly8x8_t a) {
   return vreinterpret_u32_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u32_p16
+// CHECK-LABEL: define <2 x i32> @test_vreinterpret_u32_p16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP0]]
 uint32x2_t test_vreinterpret_u32_p16(poly16x4_t a) {
   return vreinterpret_u32_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u64_s8
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 uint64x1_t test_vreinterpret_u64_s8(int8x8_t a) {
   return vreinterpret_u64_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u64_s16
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 uint64x1_t test_vreinterpret_u64_s16(int16x4_t a) {
   return vreinterpret_u64_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u64_s32
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 uint64x1_t test_vreinterpret_u64_s32(int32x2_t a) {
   return vreinterpret_u64_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u64_s64
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_s64(<1 x i64> %a) #0 {
+// CHECK:   ret <1 x i64> %a
 uint64x1_t test_vreinterpret_u64_s64(int64x1_t a) {
   return vreinterpret_u64_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u64_u8
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_u8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 uint64x1_t test_vreinterpret_u64_u8(uint8x8_t a) {
   return vreinterpret_u64_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u64_u16
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 uint64x1_t test_vreinterpret_u64_u16(uint16x4_t a) {
   return vreinterpret_u64_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u64_u32
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 uint64x1_t test_vreinterpret_u64_u32(uint32x2_t a) {
   return vreinterpret_u64_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u64_f16
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_f16(<4 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 uint64x1_t test_vreinterpret_u64_f16(float16x4_t a) {
   return vreinterpret_u64_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u64_f32
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 uint64x1_t test_vreinterpret_u64_f32(float32x2_t a) {
   return vreinterpret_u64_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u64_p8
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_p8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 uint64x1_t test_vreinterpret_u64_p8(poly8x8_t a) {
   return vreinterpret_u64_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_u64_p16
+// CHECK-LABEL: define <1 x i64> @test_vreinterpret_u64_p16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP0]]
 uint64x1_t test_vreinterpret_u64_p16(poly16x4_t a) {
   return vreinterpret_u64_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f16_s8
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vreinterpret_f16_s8(int8x8_t a) {
   return vreinterpret_f16_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f16_s16
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vreinterpret_f16_s16(int16x4_t a) {
   return vreinterpret_f16_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f16_s32
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vreinterpret_f16_s32(int32x2_t a) {
   return vreinterpret_f16_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f16_s64
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vreinterpret_f16_s64(int64x1_t a) {
   return vreinterpret_f16_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f16_u8
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vreinterpret_f16_u8(uint8x8_t a) {
   return vreinterpret_f16_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f16_u16
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vreinterpret_f16_u16(uint16x4_t a) {
   return vreinterpret_f16_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f16_u32
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vreinterpret_f16_u32(uint32x2_t a) {
   return vreinterpret_f16_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f16_u64
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_u64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vreinterpret_f16_u64(uint64x1_t a) {
   return vreinterpret_f16_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f16_f32
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vreinterpret_f16_f32(float32x2_t a) {
   return vreinterpret_f16_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f16_p8
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_p8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vreinterpret_f16_p8(poly8x8_t a) {
   return vreinterpret_f16_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f16_p16
+// CHECK-LABEL: define <4 x half> @test_vreinterpret_f16_p16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <4 x half>
+// CHECK:   ret <4 x half> [[TMP0]]
 float16x4_t test_vreinterpret_f16_p16(poly16x4_t a) {
   return vreinterpret_f16_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f32_s8
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vreinterpret_f32_s8(int8x8_t a) {
   return vreinterpret_f32_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f32_s16
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vreinterpret_f32_s16(int16x4_t a) {
   return vreinterpret_f32_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f32_s32
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vreinterpret_f32_s32(int32x2_t a) {
   return vreinterpret_f32_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f32_s64
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vreinterpret_f32_s64(int64x1_t a) {
   return vreinterpret_f32_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f32_u8
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vreinterpret_f32_u8(uint8x8_t a) {
   return vreinterpret_f32_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f32_u16
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vreinterpret_f32_u16(uint16x4_t a) {
   return vreinterpret_f32_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f32_u32
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vreinterpret_f32_u32(uint32x2_t a) {
   return vreinterpret_f32_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f32_u64
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_u64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vreinterpret_f32_u64(uint64x1_t a) {
   return vreinterpret_f32_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f32_f16
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_f16(<4 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vreinterpret_f32_f16(float16x4_t a) {
   return vreinterpret_f32_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f32_p8
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_p8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vreinterpret_f32_p8(poly8x8_t a) {
   return vreinterpret_f32_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_f32_p16
+// CHECK-LABEL: define <2 x float> @test_vreinterpret_f32_p16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <2 x float>
+// CHECK:   ret <2 x float> [[TMP0]]
 float32x2_t test_vreinterpret_f32_p16(poly16x4_t a) {
   return vreinterpret_f32_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p8_s8
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s8(<8 x i8> %a) #0 {
+// CHECK:   ret <8 x i8> %a
 poly8x8_t test_vreinterpret_p8_s8(int8x8_t a) {
   return vreinterpret_p8_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p8_s16
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 poly8x8_t test_vreinterpret_p8_s16(int16x4_t a) {
   return vreinterpret_p8_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p8_s32
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 poly8x8_t test_vreinterpret_p8_s32(int32x2_t a) {
   return vreinterpret_p8_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p8_s64
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 poly8x8_t test_vreinterpret_p8_s64(int64x1_t a) {
   return vreinterpret_p8_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p8_u8
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u8(<8 x i8> %a) #0 {
+// CHECK:   ret <8 x i8> %a
 poly8x8_t test_vreinterpret_p8_u8(uint8x8_t a) {
   return vreinterpret_p8_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p8_u16
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 poly8x8_t test_vreinterpret_p8_u16(uint16x4_t a) {
   return vreinterpret_p8_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p8_u32
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 poly8x8_t test_vreinterpret_p8_u32(uint32x2_t a) {
   return vreinterpret_p8_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p8_u64
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_u64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 poly8x8_t test_vreinterpret_p8_u64(uint64x1_t a) {
   return vreinterpret_p8_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p8_f16
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_f16(<4 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 poly8x8_t test_vreinterpret_p8_f16(float16x4_t a) {
   return vreinterpret_p8_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p8_f32
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 poly8x8_t test_vreinterpret_p8_f32(float32x2_t a) {
   return vreinterpret_p8_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p8_p16
+// CHECK-LABEL: define <8 x i8> @test_vreinterpret_p8_p16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   ret <8 x i8> [[TMP0]]
 poly8x8_t test_vreinterpret_p8_p16(poly16x4_t a) {
   return vreinterpret_p8_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p16_s8
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 poly16x4_t test_vreinterpret_p16_s8(int8x8_t a) {
   return vreinterpret_p16_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p16_s16
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s16(<4 x i16> %a) #0 {
+// CHECK:   ret <4 x i16> %a
 poly16x4_t test_vreinterpret_p16_s16(int16x4_t a) {
   return vreinterpret_p16_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p16_s32
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 poly16x4_t test_vreinterpret_p16_s32(int32x2_t a) {
   return vreinterpret_p16_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p16_s64
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 poly16x4_t test_vreinterpret_p16_s64(int64x1_t a) {
   return vreinterpret_p16_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p16_u8
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 poly16x4_t test_vreinterpret_p16_u8(uint8x8_t a) {
   return vreinterpret_p16_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p16_u16
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u16(<4 x i16> %a) #0 {
+// CHECK:   ret <4 x i16> %a
 poly16x4_t test_vreinterpret_p16_u16(uint16x4_t a) {
   return vreinterpret_p16_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p16_u32
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 poly16x4_t test_vreinterpret_p16_u32(uint32x2_t a) {
   return vreinterpret_p16_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p16_u64
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_u64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 poly16x4_t test_vreinterpret_p16_u64(uint64x1_t a) {
   return vreinterpret_p16_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p16_f16
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_f16(<4 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x half> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 poly16x4_t test_vreinterpret_p16_f16(float16x4_t a) {
   return vreinterpret_p16_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p16_f32
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 poly16x4_t test_vreinterpret_p16_f32(float32x2_t a) {
   return vreinterpret_p16_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpret_p16_p8
+// CHECK-LABEL: define <4 x i16> @test_vreinterpret_p16_p8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i8> %a to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP0]]
 poly16x4_t test_vreinterpret_p16_p8(poly8x8_t a) {
   return vreinterpret_p16_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_s16
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vreinterpretq_s8_s16(int16x8_t a) {
   return vreinterpretq_s8_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_s32
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vreinterpretq_s8_s32(int32x4_t a) {
   return vreinterpretq_s8_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_s64
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vreinterpretq_s8_s64(int64x2_t a) {
   return vreinterpretq_s8_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_u8
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u8(<16 x i8> %a) #0 {
+// CHECK:   ret <16 x i8> %a
 int8x16_t test_vreinterpretq_s8_u8(uint8x16_t a) {
   return vreinterpretq_s8_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_u16
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vreinterpretq_s8_u16(uint16x8_t a) {
   return vreinterpretq_s8_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_u32
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vreinterpretq_s8_u32(uint32x4_t a) {
   return vreinterpretq_s8_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_u64
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vreinterpretq_s8_u64(uint64x2_t a) {
   return vreinterpretq_s8_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_f16
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_f16(<8 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vreinterpretq_s8_f16(float16x8_t a) {
   return vreinterpretq_s8_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_f32
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vreinterpretq_s8_f32(float32x4_t a) {
   return vreinterpretq_s8_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_p8
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_p8(<16 x i8> %a) #0 {
+// CHECK:   ret <16 x i8> %a
 int8x16_t test_vreinterpretq_s8_p8(poly8x16_t a) {
   return vreinterpretq_s8_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s8_p16
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_s8_p16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vreinterpretq_s8_p16(poly16x8_t a) {
   return vreinterpretq_s8_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_s8
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_s8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 int16x8_t test_vreinterpretq_s16_s8(int8x16_t a) {
   return vreinterpretq_s16_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_s32
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 int16x8_t test_vreinterpretq_s16_s32(int32x4_t a) {
   return vreinterpretq_s16_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_s64
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 int16x8_t test_vreinterpretq_s16_s64(int64x2_t a) {
   return vreinterpretq_s16_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_u8
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 int16x8_t test_vreinterpretq_s16_u8(uint8x16_t a) {
   return vreinterpretq_s16_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_u16
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u16(<8 x i16> %a) #0 {
+// CHECK:   ret <8 x i16> %a
 int16x8_t test_vreinterpretq_s16_u16(uint16x8_t a) {
   return vreinterpretq_s16_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_u32
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 int16x8_t test_vreinterpretq_s16_u32(uint32x4_t a) {
   return vreinterpretq_s16_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_u64
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 int16x8_t test_vreinterpretq_s16_u64(uint64x2_t a) {
   return vreinterpretq_s16_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_f16
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_f16(<8 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 int16x8_t test_vreinterpretq_s16_f16(float16x8_t a) {
   return vreinterpretq_s16_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_f32
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 int16x8_t test_vreinterpretq_s16_f32(float32x4_t a) {
   return vreinterpretq_s16_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_p8
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_p8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 int16x8_t test_vreinterpretq_s16_p8(poly8x16_t a) {
   return vreinterpretq_s16_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s16_p16
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_s16_p16(<8 x i16> %a) #0 {
+// CHECK:   ret <8 x i16> %a
 int16x8_t test_vreinterpretq_s16_p16(poly16x8_t a) {
   return vreinterpretq_s16_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_s8
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_s8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 int32x4_t test_vreinterpretq_s32_s8(int8x16_t a) {
   return vreinterpretq_s32_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_s16
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 int32x4_t test_vreinterpretq_s32_s16(int16x8_t a) {
   return vreinterpretq_s32_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_s64
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 int32x4_t test_vreinterpretq_s32_s64(int64x2_t a) {
   return vreinterpretq_s32_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_u8
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 int32x4_t test_vreinterpretq_s32_u8(uint8x16_t a) {
   return vreinterpretq_s32_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_u16
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 int32x4_t test_vreinterpretq_s32_u16(uint16x8_t a) {
   return vreinterpretq_s32_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_u32
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u32(<4 x i32> %a) #0 {
+// CHECK:   ret <4 x i32> %a
 int32x4_t test_vreinterpretq_s32_u32(uint32x4_t a) {
   return vreinterpretq_s32_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_u64
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 int32x4_t test_vreinterpretq_s32_u64(uint64x2_t a) {
   return vreinterpretq_s32_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_f16
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_f16(<8 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 int32x4_t test_vreinterpretq_s32_f16(float16x8_t a) {
   return vreinterpretq_s32_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_f32
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 int32x4_t test_vreinterpretq_s32_f32(float32x4_t a) {
   return vreinterpretq_s32_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_p8
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_p8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 int32x4_t test_vreinterpretq_s32_p8(poly8x16_t a) {
   return vreinterpretq_s32_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s32_p16
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_s32_p16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 int32x4_t test_vreinterpretq_s32_p16(poly16x8_t a) {
   return vreinterpretq_s32_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_s8
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_s8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 int64x2_t test_vreinterpretq_s64_s8(int8x16_t a) {
   return vreinterpretq_s64_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_s16
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 int64x2_t test_vreinterpretq_s64_s16(int16x8_t a) {
   return vreinterpretq_s64_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_s32
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 int64x2_t test_vreinterpretq_s64_s32(int32x4_t a) {
   return vreinterpretq_s64_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_u8
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 int64x2_t test_vreinterpretq_s64_u8(uint8x16_t a) {
   return vreinterpretq_s64_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_u16
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 int64x2_t test_vreinterpretq_s64_u16(uint16x8_t a) {
   return vreinterpretq_s64_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_u32
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 int64x2_t test_vreinterpretq_s64_u32(uint32x4_t a) {
   return vreinterpretq_s64_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_u64
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_u64(<2 x i64> %a) #0 {
+// CHECK:   ret <2 x i64> %a
 int64x2_t test_vreinterpretq_s64_u64(uint64x2_t a) {
   return vreinterpretq_s64_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_f16
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_f16(<8 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 int64x2_t test_vreinterpretq_s64_f16(float16x8_t a) {
   return vreinterpretq_s64_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_f32
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 int64x2_t test_vreinterpretq_s64_f32(float32x4_t a) {
   return vreinterpretq_s64_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_p8
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_p8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 int64x2_t test_vreinterpretq_s64_p8(poly8x16_t a) {
   return vreinterpretq_s64_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_s64_p16
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_s64_p16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 int64x2_t test_vreinterpretq_s64_p16(poly16x8_t a) {
   return vreinterpretq_s64_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_s8
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s8(<16 x i8> %a) #0 {
+// CHECK:   ret <16 x i8> %a
 uint8x16_t test_vreinterpretq_u8_s8(int8x16_t a) {
   return vreinterpretq_u8_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_s16
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 uint8x16_t test_vreinterpretq_u8_s16(int16x8_t a) {
   return vreinterpretq_u8_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_s32
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 uint8x16_t test_vreinterpretq_u8_s32(int32x4_t a) {
   return vreinterpretq_u8_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_s64
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 uint8x16_t test_vreinterpretq_u8_s64(int64x2_t a) {
   return vreinterpretq_u8_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_u16
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 uint8x16_t test_vreinterpretq_u8_u16(uint16x8_t a) {
   return vreinterpretq_u8_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_u32
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 uint8x16_t test_vreinterpretq_u8_u32(uint32x4_t a) {
   return vreinterpretq_u8_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_u64
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 uint8x16_t test_vreinterpretq_u8_u64(uint64x2_t a) {
   return vreinterpretq_u8_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_f16
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_f16(<8 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 uint8x16_t test_vreinterpretq_u8_f16(float16x8_t a) {
   return vreinterpretq_u8_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_f32
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 uint8x16_t test_vreinterpretq_u8_f32(float32x4_t a) {
   return vreinterpretq_u8_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_p8
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_p8(<16 x i8> %a) #0 {
+// CHECK:   ret <16 x i8> %a
 uint8x16_t test_vreinterpretq_u8_p8(poly8x16_t a) {
   return vreinterpretq_u8_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u8_p16
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_u8_p16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 uint8x16_t test_vreinterpretq_u8_p16(poly16x8_t a) {
   return vreinterpretq_u8_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_s8
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 uint16x8_t test_vreinterpretq_u16_s8(int8x16_t a) {
   return vreinterpretq_u16_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_s16
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s16(<8 x i16> %a) #0 {
+// CHECK:   ret <8 x i16> %a
 uint16x8_t test_vreinterpretq_u16_s16(int16x8_t a) {
   return vreinterpretq_u16_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_s32
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 uint16x8_t test_vreinterpretq_u16_s32(int32x4_t a) {
   return vreinterpretq_u16_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_s64
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 uint16x8_t test_vreinterpretq_u16_s64(int64x2_t a) {
   return vreinterpretq_u16_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_u8
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_u8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 uint16x8_t test_vreinterpretq_u16_u8(uint8x16_t a) {
   return vreinterpretq_u16_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_u32
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 uint16x8_t test_vreinterpretq_u16_u32(uint32x4_t a) {
   return vreinterpretq_u16_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_u64
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 uint16x8_t test_vreinterpretq_u16_u64(uint64x2_t a) {
   return vreinterpretq_u16_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_f16
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_f16(<8 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 uint16x8_t test_vreinterpretq_u16_f16(float16x8_t a) {
   return vreinterpretq_u16_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_f32
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 uint16x8_t test_vreinterpretq_u16_f32(float32x4_t a) {
   return vreinterpretq_u16_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_p8
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_p8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 uint16x8_t test_vreinterpretq_u16_p8(poly8x16_t a) {
   return vreinterpretq_u16_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u16_p16
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_u16_p16(<8 x i16> %a) #0 {
+// CHECK:   ret <8 x i16> %a
 uint16x8_t test_vreinterpretq_u16_p16(poly16x8_t a) {
   return vreinterpretq_u16_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_s8
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 uint32x4_t test_vreinterpretq_u32_s8(int8x16_t a) {
   return vreinterpretq_u32_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_s16
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 uint32x4_t test_vreinterpretq_u32_s16(int16x8_t a) {
   return vreinterpretq_u32_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_s32
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s32(<4 x i32> %a) #0 {
+// CHECK:   ret <4 x i32> %a
 uint32x4_t test_vreinterpretq_u32_s32(int32x4_t a) {
   return vreinterpretq_u32_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_s64
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 uint32x4_t test_vreinterpretq_u32_s64(int64x2_t a) {
   return vreinterpretq_u32_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_u8
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_u8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 uint32x4_t test_vreinterpretq_u32_u8(uint8x16_t a) {
   return vreinterpretq_u32_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_u16
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 uint32x4_t test_vreinterpretq_u32_u16(uint16x8_t a) {
   return vreinterpretq_u32_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_u64
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 uint32x4_t test_vreinterpretq_u32_u64(uint64x2_t a) {
   return vreinterpretq_u32_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_f16
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_f16(<8 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 uint32x4_t test_vreinterpretq_u32_f16(float16x8_t a) {
   return vreinterpretq_u32_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_f32
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 uint32x4_t test_vreinterpretq_u32_f32(float32x4_t a) {
   return vreinterpretq_u32_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_p8
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_p8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 uint32x4_t test_vreinterpretq_u32_p8(poly8x16_t a) {
   return vreinterpretq_u32_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u32_p16
+// CHECK-LABEL: define <4 x i32> @test_vreinterpretq_u32_p16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP0]]
 uint32x4_t test_vreinterpretq_u32_p16(poly16x8_t a) {
   return vreinterpretq_u32_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_s8
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 uint64x2_t test_vreinterpretq_u64_s8(int8x16_t a) {
   return vreinterpretq_u64_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_s16
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 uint64x2_t test_vreinterpretq_u64_s16(int16x8_t a) {
   return vreinterpretq_u64_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_s32
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 uint64x2_t test_vreinterpretq_u64_s32(int32x4_t a) {
   return vreinterpretq_u64_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_s64
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_s64(<2 x i64> %a) #0 {
+// CHECK:   ret <2 x i64> %a
 uint64x2_t test_vreinterpretq_u64_s64(int64x2_t a) {
   return vreinterpretq_u64_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_u8
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_u8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 uint64x2_t test_vreinterpretq_u64_u8(uint8x16_t a) {
   return vreinterpretq_u64_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_u16
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 uint64x2_t test_vreinterpretq_u64_u16(uint16x8_t a) {
   return vreinterpretq_u64_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_u32
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 uint64x2_t test_vreinterpretq_u64_u32(uint32x4_t a) {
   return vreinterpretq_u64_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_f16
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_f16(<8 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 uint64x2_t test_vreinterpretq_u64_f16(float16x8_t a) {
   return vreinterpretq_u64_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_f32
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 uint64x2_t test_vreinterpretq_u64_f32(float32x4_t a) {
   return vreinterpretq_u64_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_p8
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_p8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 uint64x2_t test_vreinterpretq_u64_p8(poly8x16_t a) {
   return vreinterpretq_u64_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_u64_p16
+// CHECK-LABEL: define <2 x i64> @test_vreinterpretq_u64_p16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP0]]
 uint64x2_t test_vreinterpretq_u64_p16(poly16x8_t a) {
   return vreinterpretq_u64_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f16_s8
+// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half>
+// CHECK:   ret <8 x half> [[TMP0]]
 float16x8_t test_vreinterpretq_f16_s8(int8x16_t a) {
   return vreinterpretq_f16_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f16_s16
+// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half>
+// CHECK:   ret <8 x half> [[TMP0]]
 float16x8_t test_vreinterpretq_f16_s16(int16x8_t a) {
   return vreinterpretq_f16_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f16_s32
+// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x half>
+// CHECK:   ret <8 x half> [[TMP0]]
 float16x8_t test_vreinterpretq_f16_s32(int32x4_t a) {
   return vreinterpretq_f16_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f16_s64
+// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x half>
+// CHECK:   ret <8 x half> [[TMP0]]
 float16x8_t test_vreinterpretq_f16_s64(int64x2_t a) {
   return vreinterpretq_f16_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f16_u8
+// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half>
+// CHECK:   ret <8 x half> [[TMP0]]
 float16x8_t test_vreinterpretq_f16_u8(uint8x16_t a) {
   return vreinterpretq_f16_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f16_u16
+// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half>
+// CHECK:   ret <8 x half> [[TMP0]]
 float16x8_t test_vreinterpretq_f16_u16(uint16x8_t a) {
   return vreinterpretq_f16_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f16_u32
+// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x half>
+// CHECK:   ret <8 x half> [[TMP0]]
 float16x8_t test_vreinterpretq_f16_u32(uint32x4_t a) {
   return vreinterpretq_f16_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f16_u64
+// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x half>
+// CHECK:   ret <8 x half> [[TMP0]]
 float16x8_t test_vreinterpretq_f16_u64(uint64x2_t a) {
   return vreinterpretq_f16_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f16_f32
+// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x half>
+// CHECK:   ret <8 x half> [[TMP0]]
 float16x8_t test_vreinterpretq_f16_f32(float32x4_t a) {
   return vreinterpretq_f16_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f16_p8
+// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_p8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x half>
+// CHECK:   ret <8 x half> [[TMP0]]
 float16x8_t test_vreinterpretq_f16_p8(poly8x16_t a) {
   return vreinterpretq_f16_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f16_p16
+// CHECK-LABEL: define <8 x half> @test_vreinterpretq_f16_p16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <8 x half>
+// CHECK:   ret <8 x half> [[TMP0]]
 float16x8_t test_vreinterpretq_f16_p16(poly16x8_t a) {
   return vreinterpretq_f16_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_s8
+// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_s8(int8x16_t a) {
   return vreinterpretq_f32_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_s16
+// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_s16(int16x8_t a) {
   return vreinterpretq_f32_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_s32
+// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_s32(int32x4_t a) {
   return vreinterpretq_f32_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_s64
+// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_s64(int64x2_t a) {
   return vreinterpretq_f32_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_u8
+// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_u8(uint8x16_t a) {
   return vreinterpretq_f32_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_u16
+// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_u16(uint16x8_t a) {
   return vreinterpretq_f32_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_u32
+// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_u32(uint32x4_t a) {
   return vreinterpretq_f32_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_u64
+// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_u64(uint64x2_t a) {
   return vreinterpretq_f32_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_f16
+// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_f16(<8 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_f16(float16x8_t a) {
   return vreinterpretq_f32_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_p8
+// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_p8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_p8(poly8x16_t a) {
   return vreinterpretq_f32_p8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_f32_p16
+// CHECK-LABEL: define <4 x float> @test_vreinterpretq_f32_p16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <4 x float>
+// CHECK:   ret <4 x float> [[TMP0]]
 float32x4_t test_vreinterpretq_f32_p16(poly16x8_t a) {
   return vreinterpretq_f32_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_s8
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s8(<16 x i8> %a) #0 {
+// CHECK:   ret <16 x i8> %a
 poly8x16_t test_vreinterpretq_p8_s8(int8x16_t a) {
   return vreinterpretq_p8_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_s16
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 poly8x16_t test_vreinterpretq_p8_s16(int16x8_t a) {
   return vreinterpretq_p8_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_s32
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 poly8x16_t test_vreinterpretq_p8_s32(int32x4_t a) {
   return vreinterpretq_p8_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_s64
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 poly8x16_t test_vreinterpretq_p8_s64(int64x2_t a) {
   return vreinterpretq_p8_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_u8
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u8(<16 x i8> %a) #0 {
+// CHECK:   ret <16 x i8> %a
 poly8x16_t test_vreinterpretq_p8_u8(uint8x16_t a) {
   return vreinterpretq_p8_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_u16
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 poly8x16_t test_vreinterpretq_p8_u16(uint16x8_t a) {
   return vreinterpretq_p8_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_u32
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 poly8x16_t test_vreinterpretq_p8_u32(uint32x4_t a) {
   return vreinterpretq_p8_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_u64
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 poly8x16_t test_vreinterpretq_p8_u64(uint64x2_t a) {
   return vreinterpretq_p8_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_f16
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_f16(<8 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 poly8x16_t test_vreinterpretq_p8_f16(float16x8_t a) {
   return vreinterpretq_p8_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_f32
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 poly8x16_t test_vreinterpretq_p8_f32(float32x4_t a) {
   return vreinterpretq_p8_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p8_p16
+// CHECK-LABEL: define <16 x i8> @test_vreinterpretq_p8_p16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   ret <16 x i8> [[TMP0]]
 poly8x16_t test_vreinterpretq_p8_p16(poly16x8_t a) {
   return vreinterpretq_p8_p16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_s8
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 poly16x8_t test_vreinterpretq_p16_s8(int8x16_t a) {
   return vreinterpretq_p16_s8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_s16
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s16(<8 x i16> %a) #0 {
+// CHECK:   ret <8 x i16> %a
 poly16x8_t test_vreinterpretq_p16_s16(int16x8_t a) {
   return vreinterpretq_p16_s16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_s32
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 poly16x8_t test_vreinterpretq_p16_s32(int32x4_t a) {
   return vreinterpretq_p16_s32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_s64
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 poly16x8_t test_vreinterpretq_p16_s64(int64x2_t a) {
   return vreinterpretq_p16_s64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_u8
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 poly16x8_t test_vreinterpretq_p16_u8(uint8x16_t a) {
   return vreinterpretq_p16_u8(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_u16
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u16(<8 x i16> %a) #0 {
+// CHECK:   ret <8 x i16> %a
 poly16x8_t test_vreinterpretq_p16_u16(uint16x8_t a) {
   return vreinterpretq_p16_u16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_u32
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 poly16x8_t test_vreinterpretq_p16_u32(uint32x4_t a) {
   return vreinterpretq_p16_u32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_u64
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 poly16x8_t test_vreinterpretq_p16_u64(uint64x2_t a) {
   return vreinterpretq_p16_u64(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_f16
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_f16(<8 x half> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x half> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 poly16x8_t test_vreinterpretq_p16_f16(float16x8_t a) {
   return vreinterpretq_p16_f16(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_f32
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 poly16x8_t test_vreinterpretq_p16_f32(float32x4_t a) {
   return vreinterpretq_p16_f32(a);
 }
 
-// CHECK-LABEL: test_vreinterpretq_p16_p8
+// CHECK-LABEL: define <8 x i16> @test_vreinterpretq_p16_p8(<16 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <16 x i8> %a to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP0]]
 poly16x8_t test_vreinterpretq_p16_p8(poly8x16_t a) {
   return vreinterpretq_p16_p8(a);
 }
 
 
-// CHECK-LABEL: test_vrev16_s8
-// CHECK: vrev16.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vrev16_s8(<8 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 int8x8_t test_vrev16_s8(int8x8_t a) {
   return vrev16_s8(a);
 }
 
-// CHECK-LABEL: test_vrev16_u8
-// CHECK: vrev16.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vrev16_u8(<8 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 uint8x8_t test_vrev16_u8(uint8x8_t a) {
   return vrev16_u8(a);
 }
 
-// CHECK-LABEL: test_vrev16_p8
-// CHECK: vrev16.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vrev16_p8(<8 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 poly8x8_t test_vrev16_p8(poly8x8_t a) {
   return vrev16_p8(a);
 }
 
-// CHECK-LABEL: test_vrev16q_s8
-// CHECK: vrev16.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vrev16q_s8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 int8x16_t test_vrev16q_s8(int8x16_t a) {
   return vrev16q_s8(a);
 }
 
-// CHECK-LABEL: test_vrev16q_u8
-// CHECK: vrev16.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vrev16q_u8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 uint8x16_t test_vrev16q_u8(uint8x16_t a) {
   return vrev16q_u8(a);
 }
 
-// CHECK-LABEL: test_vrev16q_p8
-// CHECK: vrev16.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vrev16q_p8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 poly8x16_t test_vrev16q_p8(poly8x16_t a) {
   return vrev16q_p8(a);
 }
 
 
-// CHECK-LABEL: test_vrev32_s8
-// CHECK: vrev32.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vrev32_s8(<8 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 int8x8_t test_vrev32_s8(int8x8_t a) {
   return vrev32_s8(a);
 }
 
-// CHECK-LABEL: test_vrev32_s16
-// CHECK: vrev32.16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vrev32_s16(<4 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 int16x4_t test_vrev32_s16(int16x4_t a) {
   return vrev32_s16(a);
 }
 
-// CHECK-LABEL: test_vrev32_u8
-// CHECK: vrev32.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vrev32_u8(<8 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 uint8x8_t test_vrev32_u8(uint8x8_t a) {
   return vrev32_u8(a);
 }
 
-// CHECK-LABEL: test_vrev32_u16
-// CHECK: vrev32.16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vrev32_u16(<4 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 uint16x4_t test_vrev32_u16(uint16x4_t a) {
   return vrev32_u16(a);
 }
 
-// CHECK-LABEL: test_vrev32_p8
-// CHECK: vrev32.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vrev32_p8(<8 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 poly8x8_t test_vrev32_p8(poly8x8_t a) {
   return vrev32_p8(a);
 }
 
-// CHECK-LABEL: test_vrev32_p16
-// CHECK: vrev32.16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vrev32_p16(<4 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 poly16x4_t test_vrev32_p16(poly16x4_t a) {
   return vrev32_p16(a);
 }
 
-// CHECK-LABEL: test_vrev32q_s8
-// CHECK: vrev32.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vrev32q_s8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 int8x16_t test_vrev32q_s8(int8x16_t a) {
   return vrev32q_s8(a);
 }
 
-// CHECK-LABEL: test_vrev32q_s16
-// CHECK: vrev32.16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vrev32q_s16(<8 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 int16x8_t test_vrev32q_s16(int16x8_t a) {
   return vrev32q_s16(a);
 }
 
-// CHECK-LABEL: test_vrev32q_u8
-// CHECK: vrev32.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vrev32q_u8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 uint8x16_t test_vrev32q_u8(uint8x16_t a) {
   return vrev32q_u8(a);
 }
 
-// CHECK-LABEL: test_vrev32q_u16
-// CHECK: vrev32.16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vrev32q_u16(<8 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 uint16x8_t test_vrev32q_u16(uint16x8_t a) {
   return vrev32q_u16(a);
 }
 
-// CHECK-LABEL: test_vrev32q_p8
-// CHECK: vrev32.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vrev32q_p8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 poly8x16_t test_vrev32q_p8(poly8x16_t a) {
   return vrev32q_p8(a);
 }
 
-// CHECK-LABEL: test_vrev32q_p16
-// CHECK: vrev32.16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vrev32q_p16(<8 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 poly16x8_t test_vrev32q_p16(poly16x8_t a) {
   return vrev32q_p16(a);
 }
 
 
-// CHECK-LABEL: test_vrev64_s8
-// CHECK: vrev64.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vrev64_s8(<8 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 int8x8_t test_vrev64_s8(int8x8_t a) {
   return vrev64_s8(a);
 }
 
-// CHECK-LABEL: test_vrev64_s16
-// CHECK: vrev64.16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vrev64_s16(<4 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 int16x4_t test_vrev64_s16(int16x4_t a) {
   return vrev64_s16(a);
 }
 
-// CHECK-LABEL: test_vrev64_s32
-// CHECK: vrev64.32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vrev64_s32(<2 x i32> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 0>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 int32x2_t test_vrev64_s32(int32x2_t a) {
   return vrev64_s32(a);
 }
 
-// CHECK-LABEL: test_vrev64_u8
-// CHECK: vrev64.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vrev64_u8(<8 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 uint8x8_t test_vrev64_u8(uint8x8_t a) {
   return vrev64_u8(a);
 }
 
-// CHECK-LABEL: test_vrev64_u16
-// CHECK: vrev64.16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vrev64_u16(<4 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 uint16x4_t test_vrev64_u16(uint16x4_t a) {
   return vrev64_u16(a);
 }
 
-// CHECK-LABEL: test_vrev64_u32
-// CHECK: vrev64.32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vrev64_u32(<2 x i32> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 0>
+// CHECK:   ret <2 x i32> [[SHUFFLE_I]]
 uint32x2_t test_vrev64_u32(uint32x2_t a) {
   return vrev64_u32(a);
 }
 
-// CHECK-LABEL: test_vrev64_p8
-// CHECK: vrev64.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vrev64_p8(<8 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %a, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+// CHECK:   ret <8 x i8> [[SHUFFLE_I]]
 poly8x8_t test_vrev64_p8(poly8x8_t a) {
   return vrev64_p8(a);
 }
 
-// CHECK-LABEL: test_vrev64_p16
-// CHECK: vrev64.16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vrev64_p16(<4 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+// CHECK:   ret <4 x i16> [[SHUFFLE_I]]
 poly16x4_t test_vrev64_p16(poly16x4_t a) {
   return vrev64_p16(a);
 }
 
-// CHECK-LABEL: test_vrev64_f32
-// CHECK: vrev64.32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vrev64_f32(<2 x float> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <2 x float> %a, <2 x float> %a, <2 x i32> <i32 1, i32 0>
+// CHECK:   ret <2 x float> [[SHUFFLE_I]]
 float32x2_t test_vrev64_f32(float32x2_t a) {
   return vrev64_f32(a);
 }
 
-// CHECK-LABEL: test_vrev64q_s8
-// CHECK: vrev64.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vrev64q_s8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 int8x16_t test_vrev64q_s8(int8x16_t a) {
   return vrev64q_s8(a);
 }
 
-// CHECK-LABEL: test_vrev64q_s16
-// CHECK: vrev64.16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vrev64q_s16(<8 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 int16x8_t test_vrev64q_s16(int16x8_t a) {
   return vrev64q_s16(a);
 }
 
-// CHECK-LABEL: test_vrev64q_s32
-// CHECK: vrev64.32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vrev64q_s32(<4 x i32> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 int32x4_t test_vrev64q_s32(int32x4_t a) {
   return vrev64q_s32(a);
 }
 
-// CHECK-LABEL: test_vrev64q_u8
-// CHECK: vrev64.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vrev64q_u8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 uint8x16_t test_vrev64q_u8(uint8x16_t a) {
   return vrev64q_u8(a);
 }
 
-// CHECK-LABEL: test_vrev64q_u16
-// CHECK: vrev64.16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vrev64q_u16(<8 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 uint16x8_t test_vrev64q_u16(uint16x8_t a) {
   return vrev64q_u16(a);
 }
 
-// CHECK-LABEL: test_vrev64q_u32
-// CHECK: vrev64.32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vrev64q_u32(<4 x i32> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> %a, <4 x i32> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK:   ret <4 x i32> [[SHUFFLE_I]]
 uint32x4_t test_vrev64q_u32(uint32x4_t a) {
   return vrev64q_u32(a);
 }
 
-// CHECK-LABEL: test_vrev64q_p8
-// CHECK: vrev64.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vrev64q_p8(<16 x i8> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %a, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
+// CHECK:   ret <16 x i8> [[SHUFFLE_I]]
 poly8x16_t test_vrev64q_p8(poly8x16_t a) {
   return vrev64q_p8(a);
 }
 
-// CHECK-LABEL: test_vrev64q_p16
-// CHECK: vrev64.16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vrev64q_p16(<8 x i16> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> %a, <8 x i16> %a, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
 poly16x8_t test_vrev64q_p16(poly16x8_t a) {
   return vrev64q_p16(a);
 }
 
-// CHECK-LABEL: test_vrev64q_f32
-// CHECK: vrev64.32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x float> @test_vrev64q_f32(<4 x float> %a) #0 {
+// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+// CHECK:   ret <4 x float> [[SHUFFLE_I]]
 float32x4_t test_vrev64q_f32(float32x4_t a) {
   return vrev64q_f32(a);
 }
 
 
-// CHECK-LABEL: test_vrhadd_s8
-// CHECK: vrhadd.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vrhadd_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VRHADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VRHADD_V_I]]
 int8x8_t test_vrhadd_s8(int8x8_t a, int8x8_t b) {
   return vrhadd_s8(a, b);
 }
 
-// CHECK-LABEL: test_vrhadd_s16
-// CHECK: vrhadd.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vrhadd_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16> [[VRHADD_V_I]], <4 x i16> [[VRHADD_V1_I]]) #4
+// CHECK:   [[VRHADD_V3_I:%.*]] = bitcast <4 x i16> [[VRHADD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vrhadd_s16(int16x4_t a, int16x4_t b) {
   return vrhadd_s16(a, b);
 }
 
-// CHECK-LABEL: test_vrhadd_s32
-// CHECK: vrhadd.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vrhadd_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrhadds.v2i32(<2 x i32> [[VRHADD_V_I]], <2 x i32> [[VRHADD_V1_I]]) #4
+// CHECK:   [[VRHADD_V3_I:%.*]] = bitcast <2 x i32> [[VRHADD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vrhadd_s32(int32x2_t a, int32x2_t b) {
   return vrhadd_s32(a, b);
 }
 
-// CHECK-LABEL: test_vrhadd_u8
-// CHECK: vrhadd.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vrhadd_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VRHADD_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VRHADD_V_I]]
 uint8x8_t test_vrhadd_u8(uint8x8_t a, uint8x8_t b) {
   return vrhadd_u8(a, b);
 }
 
-// CHECK-LABEL: test_vrhadd_u16
-// CHECK: vrhadd.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vrhadd_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VRHADD_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16> [[VRHADD_V_I]], <4 x i16> [[VRHADD_V1_I]]) #4
+// CHECK:   [[VRHADD_V3_I:%.*]] = bitcast <4 x i16> [[VRHADD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 uint16x4_t test_vrhadd_u16(uint16x4_t a, uint16x4_t b) {
   return vrhadd_u16(a, b);
 }
 
-// CHECK-LABEL: test_vrhadd_u32
-// CHECK: vrhadd.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vrhadd_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VRHADD_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VRHADD_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VRHADD_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrhaddu.v2i32(<2 x i32> [[VRHADD_V_I]], <2 x i32> [[VRHADD_V1_I]]) #4
+// CHECK:   [[VRHADD_V3_I:%.*]] = bitcast <2 x i32> [[VRHADD_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRHADD_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 uint32x2_t test_vrhadd_u32(uint32x2_t a, uint32x2_t b) {
   return vrhadd_u32(a, b);
 }
 
-// CHECK-LABEL: test_vrhaddq_s8
-// CHECK: vrhadd.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vrhaddq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VRHADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VRHADDQ_V_I]]
 int8x16_t test_vrhaddq_s8(int8x16_t a, int8x16_t b) {
   return vrhaddq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vrhaddq_s16
-// CHECK: vrhadd.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vrhaddq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> [[VRHADDQ_V_I]], <8 x i16> [[VRHADDQ_V1_I]]) #4
+// CHECK:   [[VRHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VRHADDQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vrhaddq_s16(int16x8_t a, int16x8_t b) {
   return vrhaddq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vrhaddq_s32
-// CHECK: vrhadd.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vrhaddq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrhadds.v4i32(<4 x i32> [[VRHADDQ_V_I]], <4 x i32> [[VRHADDQ_V1_I]]) #4
+// CHECK:   [[VRHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VRHADDQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vrhaddq_s32(int32x4_t a, int32x4_t b) {
   return vrhaddq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vrhaddq_u8
-// CHECK: vrhadd.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vrhaddq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VRHADDQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VRHADDQ_V_I]]
 uint8x16_t test_vrhaddq_u8(uint8x16_t a, uint8x16_t b) {
   return vrhaddq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vrhaddq_u16
-// CHECK: vrhadd.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vrhaddq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VRHADDQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> [[VRHADDQ_V_I]], <8 x i16> [[VRHADDQ_V1_I]]) #4
+// CHECK:   [[VRHADDQ_V3_I:%.*]] = bitcast <8 x i16> [[VRHADDQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 uint16x8_t test_vrhaddq_u16(uint16x8_t a, uint16x8_t b) {
   return vrhaddq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vrhaddq_u32
-// CHECK: vrhadd.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vrhaddq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VRHADDQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VRHADDQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VRHADDQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrhaddu.v4i32(<4 x i32> [[VRHADDQ_V_I]], <4 x i32> [[VRHADDQ_V1_I]]) #4
+// CHECK:   [[VRHADDQ_V3_I:%.*]] = bitcast <4 x i32> [[VRHADDQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRHADDQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 uint32x4_t test_vrhaddq_u32(uint32x4_t a, uint32x4_t b) {
   return vrhaddq_u32(a, b);
 }
 
 
-// CHECK-LABEL: test_vrshl_s8
-// CHECK: vrshl.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vrshl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VRSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VRSHL_V_I]]
 int8x8_t test_vrshl_s8(int8x8_t a, int8x8_t b) {
   return vrshl_s8(a, b);
 }
 
-// CHECK-LABEL: test_vrshl_s16
-// CHECK: vrshl.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vrshl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> [[VRSHL_V_I]], <4 x i16> [[VRSHL_V1_I]]) #4
+// CHECK:   [[VRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VRSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vrshl_s16(int16x4_t a, int16x4_t b) {
   return vrshl_s16(a, b);
 }
 
-// CHECK-LABEL: test_vrshl_s32
-// CHECK: vrshl.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vrshl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> [[VRSHL_V_I]], <2 x i32> [[VRSHL_V1_I]]) #4
+// CHECK:   [[VRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VRSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vrshl_s32(int32x2_t a, int32x2_t b) {
   return vrshl_s32(a, b);
 }
 
-// CHECK-LABEL: test_vrshl_s64
-// CHECK: vrshl.s64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vrshl_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> [[VRSHL_V_I]], <1 x i64> [[VRSHL_V1_I]]) #4
+// CHECK:   [[VRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VRSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP2]]
 int64x1_t test_vrshl_s64(int64x1_t a, int64x1_t b) {
   return vrshl_s64(a, b);
 }
 
-// CHECK-LABEL: test_vrshl_u8
-// CHECK: vrshl.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vrshl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VRSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VRSHL_V_I]]
 uint8x8_t test_vrshl_u8(uint8x8_t a, int8x8_t b) {
   return vrshl_u8(a, b);
 }
 
-// CHECK-LABEL: test_vrshl_u16
-// CHECK: vrshl.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vrshl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VRSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> [[VRSHL_V_I]], <4 x i16> [[VRSHL_V1_I]]) #4
+// CHECK:   [[VRSHL_V3_I:%.*]] = bitcast <4 x i16> [[VRSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 uint16x4_t test_vrshl_u16(uint16x4_t a, int16x4_t b) {
   return vrshl_u16(a, b);
 }
 
-// CHECK-LABEL: test_vrshl_u32
-// CHECK: vrshl.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vrshl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VRSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> [[VRSHL_V_I]], <2 x i32> [[VRSHL_V1_I]]) #4
+// CHECK:   [[VRSHL_V3_I:%.*]] = bitcast <2 x i32> [[VRSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 uint32x2_t test_vrshl_u32(uint32x2_t a, int32x2_t b) {
   return vrshl_u32(a, b);
 }
 
-// CHECK-LABEL: test_vrshl_u64
-// CHECK: vrshl.u64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vrshl_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VRSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VRSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VRSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> [[VRSHL_V_I]], <1 x i64> [[VRSHL_V1_I]]) #4
+// CHECK:   [[VRSHL_V3_I:%.*]] = bitcast <1 x i64> [[VRSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSHL_V3_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP2]]
 uint64x1_t test_vrshl_u64(uint64x1_t a, int64x1_t b) {
   return vrshl_u64(a, b);
 }
 
-// CHECK-LABEL: test_vrshlq_s8
-// CHECK: vrshl.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vrshlq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VRSHLQ_V_I]]
 int8x16_t test_vrshlq_s8(int8x16_t a, int8x16_t b) {
   return vrshlq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vrshlq_s16
-// CHECK: vrshl.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vrshlq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> [[VRSHLQ_V_I]], <8 x i16> [[VRSHLQ_V1_I]]) #4
+// CHECK:   [[VRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VRSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vrshlq_s16(int16x8_t a, int16x8_t b) {
   return vrshlq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vrshlq_s32
-// CHECK: vrshl.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vrshlq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> [[VRSHLQ_V_I]], <4 x i32> [[VRSHLQ_V1_I]]) #4
+// CHECK:   [[VRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VRSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vrshlq_s32(int32x4_t a, int32x4_t b) {
   return vrshlq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vrshlq_s64
-// CHECK: vrshl.s64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vrshlq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> [[VRSHLQ_V_I]], <2 x i64> [[VRSHLQ_V1_I]]) #4
+// CHECK:   [[VRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VRSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 int64x2_t test_vrshlq_s64(int64x2_t a, int64x2_t b) {
   return vrshlq_s64(a, b);
 }
 
-// CHECK-LABEL: test_vrshlq_u8
-// CHECK: vrshl.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vrshlq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VRSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VRSHLQ_V_I]]
 uint8x16_t test_vrshlq_u8(uint8x16_t a, int8x16_t b) {
   return vrshlq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vrshlq_u16
-// CHECK: vrshl.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vrshlq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VRSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> [[VRSHLQ_V_I]], <8 x i16> [[VRSHLQ_V1_I]]) #4
+// CHECK:   [[VRSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VRSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 uint16x8_t test_vrshlq_u16(uint16x8_t a, int16x8_t b) {
   return vrshlq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vrshlq_u32
-// CHECK: vrshl.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vrshlq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VRSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> [[VRSHLQ_V_I]], <4 x i32> [[VRSHLQ_V1_I]]) #4
+// CHECK:   [[VRSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VRSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 uint32x4_t test_vrshlq_u32(uint32x4_t a, int32x4_t b) {
   return vrshlq_u32(a, b);
 }
 
-// CHECK-LABEL: test_vrshlq_u64
-// CHECK: vrshl.u64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vrshlq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VRSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VRSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VRSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> [[VRSHLQ_V_I]], <2 x i64> [[VRSHLQ_V1_I]]) #4
+// CHECK:   [[VRSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VRSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRSHLQ_V3_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 uint64x2_t test_vrshlq_u64(uint64x2_t a, int64x2_t b) {
   return vrshlq_u64(a, b);
 }
 
 
-// CHECK-LABEL: test_vrshrn_n_s16
-// CHECK: vrshrn.i16 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vrshrn_n_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftn.v8i8(<8 x i16> [[VRSHRN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
+// CHECK:   ret <8 x i8> [[VRSHRN_N1]]
 int8x8_t test_vrshrn_n_s16(int16x8_t a) {
   return vrshrn_n_s16(a, 1);
 }
 
-// CHECK-LABEL: test_vrshrn_n_s32
-// CHECK: vrshrn.i32 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vrshrn_n_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftn.v4i16(<4 x i32> [[VRSHRN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
+// CHECK:   ret <4 x i16> [[VRSHRN_N1]]
 int16x4_t test_vrshrn_n_s32(int32x4_t a) {
   return vrshrn_n_s32(a, 1);
 }
 
-// CHECK-LABEL: test_vrshrn_n_s64
-// CHECK: vrshrn.i64 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vrshrn_n_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftn.v2i32(<2 x i64> [[VRSHRN_N]], <2 x i64> <i64 -1, i64 -1>)
+// CHECK:   ret <2 x i32> [[VRSHRN_N1]]
 int32x2_t test_vrshrn_n_s64(int64x2_t a) {
   return vrshrn_n_s64(a, 1);
 }
 
-// CHECK-LABEL: test_vrshrn_n_u16
-// CHECK: vrshrn.i16 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vrshrn_n_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VRSHRN_N1:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftn.v8i8(<8 x i16> [[VRSHRN_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
+// CHECK:   ret <8 x i8> [[VRSHRN_N1]]
 uint8x8_t test_vrshrn_n_u16(uint16x8_t a) {
   return vrshrn_n_u16(a, 1);
 }
 
-// CHECK-LABEL: test_vrshrn_n_u32
-// CHECK: vrshrn.i32 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vrshrn_n_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VRSHRN_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftn.v4i16(<4 x i32> [[VRSHRN_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
+// CHECK:   ret <4 x i16> [[VRSHRN_N1]]
 uint16x4_t test_vrshrn_n_u32(uint32x4_t a) {
   return vrshrn_n_u32(a, 1);
 }
 
-// CHECK-LABEL: test_vrshrn_n_u64
-// CHECK: vrshrn.i64 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vrshrn_n_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VRSHRN_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VRSHRN_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftn.v2i32(<2 x i64> [[VRSHRN_N]], <2 x i64> <i64 -1, i64 -1>)
+// CHECK:   ret <2 x i32> [[VRSHRN_N1]]
 uint32x2_t test_vrshrn_n_u64(uint64x2_t a) {
   return vrshrn_n_u64(a, 1);
 }
 
 
-// CHECK-LABEL: test_vrshr_n_s8
-// CHECK: vrshr.s8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vrshr_n_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VRSHR_N:%.*]] = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %a, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+// CHECK:   ret <8 x i8> [[VRSHR_N]]
 int8x8_t test_vrshr_n_s8(int8x8_t a) {
   return vrshr_n_s8(a, 1);
 }
 
-// CHECK-LABEL: test_vrshr_n_s16
-// CHECK: vrshr.s16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vrshr_n_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
+// CHECK:   ret <4 x i16> [[VRSHR_N1]]
 int16x4_t test_vrshr_n_s16(int16x4_t a) {
   return vrshr_n_s16(a, 1);
 }
 
-// CHECK-LABEL: test_vrshr_n_s32
-// CHECK: vrshr.s32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vrshr_n_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> <i32 -1, i32 -1>)
+// CHECK:   ret <2 x i32> [[VRSHR_N1]]
 int32x2_t test_vrshr_n_s32(int32x2_t a) {
   return vrshr_n_s32(a, 1);
 }
 
-// CHECK-LABEL: test_vrshr_n_s64
-// CHECK: vrshr.s64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vrshr_n_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> <i64 -1>)
+// CHECK:   ret <1 x i64> [[VRSHR_N1]]
 int64x1_t test_vrshr_n_s64(int64x1_t a) {
   return vrshr_n_s64(a, 1);
 }
 
-// CHECK-LABEL: test_vrshr_n_u8
-// CHECK: vrshr.u8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vrshr_n_u8(<8 x i8> %a) #0 {
+// CHECK:   [[VRSHR_N:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %a, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+// CHECK:   ret <8 x i8> [[VRSHR_N]]
 uint8x8_t test_vrshr_n_u8(uint8x8_t a) {
   return vrshr_n_u8(a, 1);
 }
 
-// CHECK-LABEL: test_vrshr_n_u16
-// CHECK: vrshr.u16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vrshr_n_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VRSHR_N1:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> [[VRSHR_N]], <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
+// CHECK:   ret <4 x i16> [[VRSHR_N1]]
 uint16x4_t test_vrshr_n_u16(uint16x4_t a) {
   return vrshr_n_u16(a, 1);
 }
 
-// CHECK-LABEL: test_vrshr_n_u32
-// CHECK: vrshr.u32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vrshr_n_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VRSHR_N1:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> [[VRSHR_N]], <2 x i32> <i32 -1, i32 -1>)
+// CHECK:   ret <2 x i32> [[VRSHR_N1]]
 uint32x2_t test_vrshr_n_u32(uint32x2_t a) {
   return vrshr_n_u32(a, 1);
 }
 
-// CHECK-LABEL: test_vrshr_n_u64
-// CHECK: vrshr.u64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vrshr_n_u64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VRSHR_N1:%.*]] = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> [[VRSHR_N]], <1 x i64> <i64 -1>)
+// CHECK:   ret <1 x i64> [[VRSHR_N1]]
 uint64x1_t test_vrshr_n_u64(uint64x1_t a) {
   return vrshr_n_u64(a, 1);
 }
 
-// CHECK-LABEL: test_vrshrq_n_s8
-// CHECK: vrshr.s8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vrshrq_n_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VRSHR_N:%.*]] = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %a, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+// CHECK:   ret <16 x i8> [[VRSHR_N]]
 int8x16_t test_vrshrq_n_s8(int8x16_t a) {
   return vrshrq_n_s8(a, 1);
 }
 
-// CHECK-LABEL: test_vrshrq_n_s16
-// CHECK: vrshr.s16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vrshrq_n_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
+// CHECK:   ret <8 x i16> [[VRSHR_N1]]
 int16x8_t test_vrshrq_n_s16(int16x8_t a) {
   return vrshrq_n_s16(a, 1);
 }
 
-// CHECK-LABEL: test_vrshrq_n_s32
-// CHECK: vrshr.s32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vrshrq_n_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
+// CHECK:   ret <4 x i32> [[VRSHR_N1]]
 int32x4_t test_vrshrq_n_s32(int32x4_t a) {
   return vrshrq_n_s32(a, 1);
 }
 
-// CHECK-LABEL: test_vrshrq_n_s64
-// CHECK: vrshr.s64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vrshrq_n_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> <i64 -1, i64 -1>)
+// CHECK:   ret <2 x i64> [[VRSHR_N1]]
 int64x2_t test_vrshrq_n_s64(int64x2_t a) {
   return vrshrq_n_s64(a, 1);
 }
 
-// CHECK-LABEL: test_vrshrq_n_u8
-// CHECK: vrshr.u8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vrshrq_n_u8(<16 x i8> %a) #0 {
+// CHECK:   [[VRSHR_N:%.*]] = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %a, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+// CHECK:   ret <16 x i8> [[VRSHR_N]]
 uint8x16_t test_vrshrq_n_u8(uint8x16_t a) {
   return vrshrq_n_u8(a, 1);
 }
 
-// CHECK-LABEL: test_vrshrq_n_u16
-// CHECK: vrshr.u16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vrshrq_n_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VRSHR_N1:%.*]] = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> [[VRSHR_N]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
+// CHECK:   ret <8 x i16> [[VRSHR_N1]]
 uint16x8_t test_vrshrq_n_u16(uint16x8_t a) {
   return vrshrq_n_u16(a, 1);
 }
 
-// CHECK-LABEL: test_vrshrq_n_u32
-// CHECK: vrshr.u32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vrshrq_n_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VRSHR_N1:%.*]] = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> [[VRSHR_N]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
+// CHECK:   ret <4 x i32> [[VRSHR_N1]]
 uint32x4_t test_vrshrq_n_u32(uint32x4_t a) {
   return vrshrq_n_u32(a, 1);
 }
 
-// CHECK-LABEL: test_vrshrq_n_u64
-// CHECK: vrshr.u64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vrshrq_n_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[VRSHR_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VRSHR_N1:%.*]] = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> [[VRSHR_N]], <2 x i64> <i64 -1, i64 -1>)
+// CHECK:   ret <2 x i64> [[VRSHR_N1]]
 uint64x2_t test_vrshrq_n_u64(uint64x2_t a) {
   return vrshrq_n_u64(a, 1);
 }
 
 
-// CHECK-LABEL: test_vrsqrte_f32
-// CHECK: vrsqrte.f32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vrsqrte_f32(<2 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[VRSQRTE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VRSQRTE_V1_I:%.*]] = call <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float> [[VRSQRTE_V_I]]) #4
+// CHECK:   ret <2 x float> [[VRSQRTE_V1_I]]
 float32x2_t test_vrsqrte_f32(float32x2_t a) {
   return vrsqrte_f32(a);
 }
 
-// CHECK-LABEL: test_vrsqrte_u32
-// CHECK: vrsqrte.u32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vrsqrte_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[VRSQRTE_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VRSQRTE_V1_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrsqrte.v2i32(<2 x i32> [[VRSQRTE_V_I]]) #4
+// CHECK:   ret <2 x i32> [[VRSQRTE_V1_I]]
 uint32x2_t test_vrsqrte_u32(uint32x2_t a) {
   return vrsqrte_u32(a);
 }
 
-// CHECK-LABEL: test_vrsqrteq_f32
-// CHECK: vrsqrte.f32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x float> @test_vrsqrteq_f32(<4 x float> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[VRSQRTEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VRSQRTEQ_V1_I:%.*]] = call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> [[VRSQRTEQ_V_I]]) #4
+// CHECK:   ret <4 x float> [[VRSQRTEQ_V1_I]]
 float32x4_t test_vrsqrteq_f32(float32x4_t a) {
   return vrsqrteq_f32(a);
 }
 
-// CHECK-LABEL: test_vrsqrteq_u32
-// CHECK: vrsqrte.u32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vrsqrteq_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[VRSQRTEQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VRSQRTEQ_V1_I:%.*]] = call <4 x i32> @llvm.arm.neon.vrsqrte.v4i32(<4 x i32> [[VRSQRTEQ_V_I]]) #4
+// CHECK:   ret <4 x i32> [[VRSQRTEQ_V1_I]]
 uint32x4_t test_vrsqrteq_u32(uint32x4_t a) {
   return vrsqrteq_u32(a);
 }
 
 
-// CHECK-LABEL: test_vrsqrts_f32
-// CHECK: vrsqrts.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vrsqrts_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[VRSQRTS_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VRSQRTS_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[VRSQRTS_V2_I:%.*]] = call <2 x float> @llvm.arm.neon.vrsqrts.v2f32(<2 x float> [[VRSQRTS_V_I]], <2 x float> [[VRSQRTS_V1_I]]) #4
+// CHECK:   [[VRSQRTS_V3_I:%.*]] = bitcast <2 x float> [[VRSQRTS_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSQRTS_V3_I]] to <2 x float>
+// CHECK:   ret <2 x float> [[TMP2]]
 float32x2_t test_vrsqrts_f32(float32x2_t a, float32x2_t b) {
   return vrsqrts_f32(a, b);
 }
 
-// CHECK-LABEL: test_vrsqrtsq_f32
-// CHECK: vrsqrts.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x float> @test_vrsqrtsq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[VRSQRTSQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VRSQRTSQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[VRSQRTSQ_V2_I:%.*]] = call <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float> [[VRSQRTSQ_V_I]], <4 x float> [[VRSQRTSQ_V1_I]]) #4
+// CHECK:   [[VRSQRTSQ_V3_I:%.*]] = bitcast <4 x float> [[VRSQRTSQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VRSQRTSQ_V3_I]] to <4 x float>
+// CHECK:   ret <4 x float> [[TMP2]]
 float32x4_t test_vrsqrtsq_f32(float32x4_t a, float32x4_t b) {
   return vrsqrtsq_f32(a, b);
 }
 
 
-// CHECK-LABEL: test_vrsra_n_s8
-// CHECK: vrsra.s8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vrsra_n_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %b, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+// CHECK:   [[VRSRA_N:%.*]] = add <8 x i8> %a, [[TMP0]]
+// CHECK:   ret <8 x i8> [[VRSRA_N]]
 int8x8_t test_vrsra_n_s8(int8x8_t a, int8x8_t b) {
   return vrsra_n_s8(a, b, 1);
 }
 
-// CHECK-LABEL: test_vrsra_n_s16
-// CHECK: vrsra.s16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vrsra_n_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP4:%.*]] = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> [[TMP3]], <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
+// CHECK:   [[VRSRA_N:%.*]] = add <4 x i16> [[TMP2]], [[TMP4]]
+// CHECK:   ret <4 x i16> [[VRSRA_N]]
 int16x4_t test_vrsra_n_s16(int16x4_t a, int16x4_t b) {
   return vrsra_n_s16(a, b, 1);
 }
 
-// CHECK-LABEL: test_vrsra_n_s32
-// CHECK: vrsra.s32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vrsra_n_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[TMP4:%.*]] = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> [[TMP3]], <2 x i32> <i32 -1, i32 -1>)
+// CHECK:   [[VRSRA_N:%.*]] = add <2 x i32> [[TMP2]], [[TMP4]]
+// CHECK:   ret <2 x i32> [[VRSRA_N]]
 int32x2_t test_vrsra_n_s32(int32x2_t a, int32x2_t b) {
   return vrsra_n_s32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vrsra_n_s64
-// CHECK: vrsra.s64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vrsra_n_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[TMP4:%.*]] = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> [[TMP3]], <1 x i64> <i64 -1>)
+// CHECK:   [[VRSRA_N:%.*]] = add <1 x i64> [[TMP2]], [[TMP4]]
+// CHECK:   ret <1 x i64> [[VRSRA_N]]
 int64x1_t test_vrsra_n_s64(int64x1_t a, int64x1_t b) {
   return vrsra_n_s64(a, b, 1);
 }
 
-// CHECK-LABEL: test_vrsra_n_u8
-// CHECK: vrsra.u8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vrsra_n_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %b, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+// CHECK:   [[VRSRA_N:%.*]] = add <8 x i8> %a, [[TMP0]]
+// CHECK:   ret <8 x i8> [[VRSRA_N]]
 uint8x8_t test_vrsra_n_u8(uint8x8_t a, uint8x8_t b) {
   return vrsra_n_u8(a, b, 1);
 }
 
-// CHECK-LABEL: test_vrsra_n_u16
-// CHECK: vrsra.u16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vrsra_n_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP4:%.*]] = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> [[TMP3]], <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
+// CHECK:   [[VRSRA_N:%.*]] = add <4 x i16> [[TMP2]], [[TMP4]]
+// CHECK:   ret <4 x i16> [[VRSRA_N]]
 uint16x4_t test_vrsra_n_u16(uint16x4_t a, uint16x4_t b) {
   return vrsra_n_u16(a, b, 1);
 }
 
-// CHECK-LABEL: test_vrsra_n_u32
-// CHECK: vrsra.u32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vrsra_n_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[TMP4:%.*]] = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> [[TMP3]], <2 x i32> <i32 -1, i32 -1>)
+// CHECK:   [[VRSRA_N:%.*]] = add <2 x i32> [[TMP2]], [[TMP4]]
+// CHECK:   ret <2 x i32> [[VRSRA_N]]
 uint32x2_t test_vrsra_n_u32(uint32x2_t a, uint32x2_t b) {
   return vrsra_n_u32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vrsra_n_u64
-// CHECK: vrsra.u64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vrsra_n_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[TMP4:%.*]] = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> [[TMP3]], <1 x i64> <i64 -1>)
+// CHECK:   [[VRSRA_N:%.*]] = add <1 x i64> [[TMP2]], [[TMP4]]
+// CHECK:   ret <1 x i64> [[VRSRA_N]]
 uint64x1_t test_vrsra_n_u64(uint64x1_t a, uint64x1_t b) {
   return vrsra_n_u64(a, b, 1);
 }
 
-// CHECK-LABEL: test_vrsraq_n_s8
-// CHECK: vrsra.s8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vrsraq_n_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %b, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+// CHECK:   [[VRSRA_N:%.*]] = add <16 x i8> %a, [[TMP0]]
+// CHECK:   ret <16 x i8> [[VRSRA_N]]
 int8x16_t test_vrsraq_n_s8(int8x16_t a, int8x16_t b) {
   return vrsraq_n_s8(a, b, 1);
 }
 
-// CHECK-LABEL: test_vrsraq_n_s16
-// CHECK: vrsra.s16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vrsraq_n_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP4:%.*]] = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> [[TMP3]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
+// CHECK:   [[VRSRA_N:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]]
+// CHECK:   ret <8 x i16> [[VRSRA_N]]
 int16x8_t test_vrsraq_n_s16(int16x8_t a, int16x8_t b) {
   return vrsraq_n_s16(a, b, 1);
 }
 
-// CHECK-LABEL: test_vrsraq_n_s32
-// CHECK: vrsra.s32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vrsraq_n_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[TMP4:%.*]] = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> [[TMP3]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
+// CHECK:   [[VRSRA_N:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]]
+// CHECK:   ret <4 x i32> [[VRSRA_N]]
 int32x4_t test_vrsraq_n_s32(int32x4_t a, int32x4_t b) {
   return vrsraq_n_s32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vrsraq_n_s64
-// CHECK: vrsra.s64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vrsraq_n_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[TMP4:%.*]] = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> [[TMP3]], <2 x i64> <i64 -1, i64 -1>)
+// CHECK:   [[VRSRA_N:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
+// CHECK:   ret <2 x i64> [[VRSRA_N]]
 int64x2_t test_vrsraq_n_s64(int64x2_t a, int64x2_t b) {
   return vrsraq_n_s64(a, b, 1);
 }
 
-// CHECK-LABEL: test_vrsraq_n_u8
-// CHECK: vrsra.u8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vrsraq_n_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %b, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+// CHECK:   [[VRSRA_N:%.*]] = add <16 x i8> %a, [[TMP0]]
+// CHECK:   ret <16 x i8> [[VRSRA_N]]
 uint8x16_t test_vrsraq_n_u8(uint8x16_t a, uint8x16_t b) {
   return vrsraq_n_u8(a, b, 1);
 }
 
-// CHECK-LABEL: test_vrsraq_n_u16
-// CHECK: vrsra.u16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vrsraq_n_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP4:%.*]] = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> [[TMP3]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
+// CHECK:   [[VRSRA_N:%.*]] = add <8 x i16> [[TMP2]], [[TMP4]]
+// CHECK:   ret <8 x i16> [[VRSRA_N]]
 uint16x8_t test_vrsraq_n_u16(uint16x8_t a, uint16x8_t b) {
   return vrsraq_n_u16(a, b, 1);
 }
 
-// CHECK-LABEL: test_vrsraq_n_u32
-// CHECK: vrsra.u32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vrsraq_n_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[TMP4:%.*]] = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> [[TMP3]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
+// CHECK:   [[VRSRA_N:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]]
+// CHECK:   ret <4 x i32> [[VRSRA_N]]
 uint32x4_t test_vrsraq_n_u32(uint32x4_t a, uint32x4_t b) {
   return vrsraq_n_u32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vrsraq_n_u64
-// CHECK: vrsra.u64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vrsraq_n_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[TMP4:%.*]] = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> [[TMP3]], <2 x i64> <i64 -1, i64 -1>)
+// CHECK:   [[VRSRA_N:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
+// CHECK:   ret <2 x i64> [[VRSRA_N]]
 uint64x2_t test_vrsraq_n_u64(uint64x2_t a, uint64x2_t b) {
   return vrsraq_n_u64(a, b, 1);
 }
 
 
-// CHECK-LABEL: test_vrsubhn_s16
-// CHECK: vrsubhn.i16 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vrsubhn_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> [[VRSUBHN_V_I]], <8 x i16> [[VRSUBHN_V1_I]]) #4
+// CHECK:   ret <8 x i8> [[VRSUBHN_V2_I]]
 int8x8_t test_vrsubhn_s16(int16x8_t a, int16x8_t b) {
   return vrsubhn_s16(a, b);
 }
 
-// CHECK-LABEL: test_vrsubhn_s32
-// CHECK: vrsubhn.i32 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vrsubhn_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> [[VRSUBHN_V_I]], <4 x i32> [[VRSUBHN_V1_I]]) #4
+// CHECK:   [[VRSUBHN_V3_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vrsubhn_s32(int32x4_t a, int32x4_t b) {
   return vrsubhn_s32(a, b);
 }
 
-// CHECK-LABEL: test_vrsubhn_s64
-// CHECK: vrsubhn.i64 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vrsubhn_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> [[VRSUBHN_V_I]], <2 x i64> [[VRSUBHN_V1_I]]) #4
+// CHECK:   [[VRSUBHN_V3_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vrsubhn_s64(int64x2_t a, int64x2_t b) {
   return vrsubhn_s64(a, b);
 }
 
-// CHECK-LABEL: test_vrsubhn_u16
-// CHECK: vrsubhn.i16 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vrsubhn_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VRSUBHN_V2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> [[VRSUBHN_V_I]], <8 x i16> [[VRSUBHN_V1_I]]) #4
+// CHECK:   ret <8 x i8> [[VRSUBHN_V2_I]]
 uint8x8_t test_vrsubhn_u16(uint16x8_t a, uint16x8_t b) {
   return vrsubhn_u16(a, b);
 }
 
-// CHECK-LABEL: test_vrsubhn_u32
-// CHECK: vrsubhn.i32 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vrsubhn_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VRSUBHN_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> [[VRSUBHN_V_I]], <4 x i32> [[VRSUBHN_V1_I]]) #4
+// CHECK:   [[VRSUBHN_V3_I:%.*]] = bitcast <4 x i16> [[VRSUBHN_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 uint16x4_t test_vrsubhn_u32(uint32x4_t a, uint32x4_t b) {
   return vrsubhn_u32(a, b);
 }
 
-// CHECK-LABEL: test_vrsubhn_u64
-// CHECK: vrsubhn.i64 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vrsubhn_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VRSUBHN_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VRSUBHN_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VRSUBHN_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> [[VRSUBHN_V_I]], <2 x i64> [[VRSUBHN_V1_I]]) #4
+// CHECK:   [[VRSUBHN_V3_I:%.*]] = bitcast <2 x i32> [[VRSUBHN_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VRSUBHN_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 uint32x2_t test_vrsubhn_u64(uint64x2_t a, uint64x2_t b) {
   return vrsubhn_u64(a, b);
 }
 
 
-// CHECK-LABEL: test_vset_lane_u8
-// CHECK: vmov 
+// CHECK-LABEL: define <8 x i8> @test_vset_lane_u8(i8 zeroext %a, <8 x i8> %b) #0 {
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <8 x i8> %b, i8 %a, i32 7
+// CHECK:   ret <8 x i8> [[VSET_LANE]]
 uint8x8_t test_vset_lane_u8(uint8_t a, uint8x8_t b) {
   return vset_lane_u8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vset_lane_u16
-// CHECK: vmov 
+// CHECK-LABEL: define <4 x i16> @test_vset_lane_u16(i16 zeroext %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP1]], i16 %a, i32 3
+// CHECK:   ret <4 x i16> [[VSET_LANE]]
 uint16x4_t test_vset_lane_u16(uint16_t a, uint16x4_t b) {
   return vset_lane_u16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vset_lane_u32
-// CHECK: mov 
+// CHECK-LABEL: define <2 x i32> @test_vset_lane_u32(i32 %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <2 x i32> [[TMP1]], i32 %a, i32 1
+// CHECK:   ret <2 x i32> [[VSET_LANE]]
 uint32x2_t test_vset_lane_u32(uint32_t a, uint32x2_t b) {
   return vset_lane_u32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vset_lane_s8
-// CHECK: vmov 
+// CHECK-LABEL: define <8 x i8> @test_vset_lane_s8(i8 signext %a, <8 x i8> %b) #0 {
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <8 x i8> %b, i8 %a, i32 7
+// CHECK:   ret <8 x i8> [[VSET_LANE]]
 int8x8_t test_vset_lane_s8(int8_t a, int8x8_t b) {
   return vset_lane_s8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vset_lane_s16
-// CHECK: vmov 
+// CHECK-LABEL: define <4 x i16> @test_vset_lane_s16(i16 signext %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP1]], i16 %a, i32 3
+// CHECK:   ret <4 x i16> [[VSET_LANE]]
 int16x4_t test_vset_lane_s16(int16_t a, int16x4_t b) {
   return vset_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vset_lane_s32
-// CHECK: mov 
+// CHECK-LABEL: define <2 x i32> @test_vset_lane_s32(i32 %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <2 x i32> [[TMP1]], i32 %a, i32 1
+// CHECK:   ret <2 x i32> [[VSET_LANE]]
 int32x2_t test_vset_lane_s32(int32_t a, int32x2_t b) {
   return vset_lane_s32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vset_lane_p8
-// CHECK: vmov 
+// CHECK-LABEL: define <8 x i8> @test_vset_lane_p8(i8 signext %a, <8 x i8> %b) #0 {
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <8 x i8> %b, i8 %a, i32 7
+// CHECK:   ret <8 x i8> [[VSET_LANE]]
 poly8x8_t test_vset_lane_p8(poly8_t a, poly8x8_t b) {
   return vset_lane_p8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vset_lane_p16
-// CHECK: vmov 
+// CHECK-LABEL: define <4 x i16> @test_vset_lane_p16(i16 signext %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP1]], i16 %a, i32 3
+// CHECK:   ret <4 x i16> [[VSET_LANE]]
 poly16x4_t test_vset_lane_p16(poly16_t a, poly16x4_t b) {
   return vset_lane_p16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vset_lane_f32
-// CHECK: mov 
+// CHECK-LABEL: define <2 x float> @test_vset_lane_f32(float %a, <2 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <2 x float> [[TMP1]], float %a, i32 1
+// CHECK:   ret <2 x float> [[VSET_LANE]]
 float32x2_t test_vset_lane_f32(float32_t a, float32x2_t b) {
   return vset_lane_f32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vset_lane_f16
-// CHECK: mov 
+// CHECK-LABEL: define <4 x half> @test_vset_lane_f16(half* %a, <4 x half> %b) #0 {
+// CHECK:   [[__REINT_246:%.*]] = alloca half, align 2
+// CHECK:   [[__REINT1_246:%.*]] = alloca <4 x half>, align 8
+// CHECK:   [[__REINT2_246:%.*]] = alloca <4 x i16>, align 8
+// CHECK:   [[TMP0:%.*]] = load half, half* %a, align 2
+// CHECK:   store half [[TMP0]], half* [[__REINT_246]], align 2
+// CHECK:   store <4 x half> %b, <4 x half>* [[__REINT1_246]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast half* [[__REINT_246]] to i16*
+// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK:   [[TMP3:%.*]] = bitcast <4 x half>* [[__REINT1_246]] to <4 x i16>*
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[TMP3]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[TMP6:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <4 x i16> [[TMP6]], i16 [[TMP2]], i32 1
+// CHECK:   store <4 x i16> [[VSET_LANE]], <4 x i16>* [[__REINT2_246]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16>* [[__REINT2_246]] to <4 x half>*
+// CHECK:   [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[TMP7]], align 8
+// CHECK:   ret <4 x half> [[TMP8]]
 float16x4_t test_vset_lane_f16(float16_t *a, float16x4_t b) {
   return vset_lane_f16(*a, b, 1);
 }
 
-// CHECK-LABEL: test_vsetq_lane_u8
-// CHECK: vmov 
+// CHECK-LABEL: define <16 x i8> @test_vsetq_lane_u8(i8 zeroext %a, <16 x i8> %b) #0 {
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <16 x i8> %b, i8 %a, i32 15
+// CHECK:   ret <16 x i8> [[VSET_LANE]]
 uint8x16_t test_vsetq_lane_u8(uint8_t a, uint8x16_t b) {
   return vsetq_lane_u8(a, b, 15);
 }
 
-// CHECK-LABEL: test_vsetq_lane_u16
-// CHECK: vmov 
+// CHECK-LABEL: define <8 x i16> @test_vsetq_lane_u16(i16 zeroext %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP1]], i16 %a, i32 7
+// CHECK:   ret <8 x i16> [[VSET_LANE]]
 uint16x8_t test_vsetq_lane_u16(uint16_t a, uint16x8_t b) {
   return vsetq_lane_u16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vsetq_lane_u32
-// CHECK: vmov 
+// CHECK-LABEL: define <4 x i32> @test_vsetq_lane_u32(i32 %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <4 x i32> [[TMP1]], i32 %a, i32 3
+// CHECK:   ret <4 x i32> [[VSET_LANE]]
 uint32x4_t test_vsetq_lane_u32(uint32_t a, uint32x4_t b) {
   return vsetq_lane_u32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vsetq_lane_s8
-// CHECK: vmov 
+// CHECK-LABEL: define <16 x i8> @test_vsetq_lane_s8(i8 signext %a, <16 x i8> %b) #0 {
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <16 x i8> %b, i8 %a, i32 15
+// CHECK:   ret <16 x i8> [[VSET_LANE]]
 int8x16_t test_vsetq_lane_s8(int8_t a, int8x16_t b) {
   return vsetq_lane_s8(a, b, 15);
 }
 
-// CHECK-LABEL: test_vsetq_lane_s16
-// CHECK: vmov 
+// CHECK-LABEL: define <8 x i16> @test_vsetq_lane_s16(i16 signext %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP1]], i16 %a, i32 7
+// CHECK:   ret <8 x i16> [[VSET_LANE]]
 int16x8_t test_vsetq_lane_s16(int16_t a, int16x8_t b) {
   return vsetq_lane_s16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vsetq_lane_s32
-// CHECK: vmov 
+// CHECK-LABEL: define <4 x i32> @test_vsetq_lane_s32(i32 %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <4 x i32> [[TMP1]], i32 %a, i32 3
+// CHECK:   ret <4 x i32> [[VSET_LANE]]
 int32x4_t test_vsetq_lane_s32(int32_t a, int32x4_t b) {
   return vsetq_lane_s32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vsetq_lane_p8
-// CHECK: vmov 
+// CHECK-LABEL: define <16 x i8> @test_vsetq_lane_p8(i8 signext %a, <16 x i8> %b) #0 {
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <16 x i8> %b, i8 %a, i32 15
+// CHECK:   ret <16 x i8> [[VSET_LANE]]
 poly8x16_t test_vsetq_lane_p8(poly8_t a, poly8x16_t b) {
   return vsetq_lane_p8(a, b, 15);
 }
 
-// CHECK-LABEL: test_vsetq_lane_p16
-// CHECK: vmov 
+// CHECK-LABEL: define <8 x i16> @test_vsetq_lane_p16(i16 signext %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP1]], i16 %a, i32 7
+// CHECK:   ret <8 x i16> [[VSET_LANE]]
 poly16x8_t test_vsetq_lane_p16(poly16_t a, poly16x8_t b) {
   return vsetq_lane_p16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vsetq_lane_f32
-// CHECK: vmov 
+// CHECK-LABEL: define <4 x float> @test_vsetq_lane_f32(float %a, <4 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <4 x float> [[TMP1]], float %a, i32 3
+// CHECK:   ret <4 x float> [[VSET_LANE]]
 float32x4_t test_vsetq_lane_f32(float32_t a, float32x4_t b) {
   return vsetq_lane_f32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vsetq_lane_f16
-// CHECK: vmov 
+// CHECK-LABEL: define <8 x half> @test_vsetq_lane_f16(half* %a, <8 x half> %b) #0 {
+// CHECK:   [[__REINT_248:%.*]] = alloca half, align 2
+// CHECK:   [[__REINT1_248:%.*]] = alloca <8 x half>, align 16
+// CHECK:   [[__REINT2_248:%.*]] = alloca <8 x i16>, align 16
+// CHECK:   [[TMP0:%.*]] = load half, half* %a, align 2
+// CHECK:   store half [[TMP0]], half* [[__REINT_248]], align 2
+// CHECK:   store <8 x half> %b, <8 x half>* [[__REINT1_248]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast half* [[__REINT_248]] to i16*
+// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x half>* [[__REINT1_248]] to <8 x i16>*
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[TMP3]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP6]], i16 [[TMP2]], i32 3
+// CHECK:   store <8 x i16> [[VSET_LANE]], <8 x i16>* [[__REINT2_248]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16>* [[__REINT2_248]] to <8 x half>*
+// CHECK:   [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[TMP7]], align 16
+// CHECK:   ret <8 x half> [[TMP8]]
 float16x8_t test_vsetq_lane_f16(float16_t *a, float16x8_t b) {
   return vsetq_lane_f16(*a, b, 3);
 }
 
-// CHECK-LABEL: test_vset_lane_s64
 // The optimizer is able to get rid of all moves now.
+// CHECK-LABEL: define <1 x i64> @test_vset_lane_s64(i64 %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <1 x i64> [[TMP1]], i64 %a, i32 0
+// CHECK:   ret <1 x i64> [[VSET_LANE]]
 int64x1_t test_vset_lane_s64(int64_t a, int64x1_t b) {
   return vset_lane_s64(a, b, 0);
 }
 
-// CHECK-LABEL: test_vset_lane_u64
 // The optimizer is able to get rid of all moves now.
+// CHECK-LABEL: define <1 x i64> @test_vset_lane_u64(i64 %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <1 x i64> [[TMP1]], i64 %a, i32 0
+// CHECK:   ret <1 x i64> [[VSET_LANE]]
 uint64x1_t test_vset_lane_u64(uint64_t a, uint64x1_t b) {
   return vset_lane_u64(a, b, 0);
 }
 
-// CHECK-LABEL: test_vsetq_lane_s64
-// CHECK: vmov 
+// CHECK-LABEL: define <2 x i64> @test_vsetq_lane_s64(i64 %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP1]], i64 %a, i32 1
+// CHECK:   ret <2 x i64> [[VSET_LANE]]
 int64x2_t test_vsetq_lane_s64(int64_t a, int64x2_t b) {
   return vsetq_lane_s64(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsetq_lane_u64
-// CHECK: vmov 
+// CHECK-LABEL: define <2 x i64> @test_vsetq_lane_u64(i64 %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VSET_LANE:%.*]] = insertelement <2 x i64> [[TMP1]], i64 %a, i32 1
+// CHECK:   ret <2 x i64> [[VSET_LANE]]
 uint64x2_t test_vsetq_lane_u64(uint64_t a, uint64x2_t b) {
   return vsetq_lane_u64(a, b, 1);
 }
 
 
-// CHECK-LABEL: test_vshl_s8
-// CHECK: vshl.s8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vshl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vshifts.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VSHL_V_I]]
 int8x8_t test_vshl_s8(int8x8_t a, int8x8_t b) {
   return vshl_s8(a, b);
 }
 
-// CHECK-LABEL: test_vshl_s16
-// CHECK: vshl.s16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vshl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vshifts.v4i16(<4 x i16> [[VSHL_V_I]], <4 x i16> [[VSHL_V1_I]]) #4
+// CHECK:   [[VSHL_V3_I:%.*]] = bitcast <4 x i16> [[VSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 int16x4_t test_vshl_s16(int16x4_t a, int16x4_t b) {
   return vshl_s16(a, b);
 }
 
-// CHECK-LABEL: test_vshl_s32
-// CHECK: vshl.s32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vshl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vshifts.v2i32(<2 x i32> [[VSHL_V_I]], <2 x i32> [[VSHL_V1_I]]) #4
+// CHECK:   [[VSHL_V3_I:%.*]] = bitcast <2 x i32> [[VSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 int32x2_t test_vshl_s32(int32x2_t a, int32x2_t b) {
   return vshl_s32(a, b);
 }
 
-// CHECK-LABEL: test_vshl_s64
-// CHECK: vshl.s64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vshl_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64> [[VSHL_V_I]], <1 x i64> [[VSHL_V1_I]]) #4
+// CHECK:   [[VSHL_V3_I:%.*]] = bitcast <1 x i64> [[VSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP2]]
 int64x1_t test_vshl_s64(int64x1_t a, int64x1_t b) {
   return vshl_s64(a, b);
 }
 
-// CHECK-LABEL: test_vshl_u8
-// CHECK: vshl.u8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vshl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VSHL_V_I:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftu.v8i8(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VSHL_V_I]]
 uint8x8_t test_vshl_u8(uint8x8_t a, int8x8_t b) {
   return vshl_u8(a, b);
 }
 
-// CHECK-LABEL: test_vshl_u16
-// CHECK: vshl.u16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vshl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VSHL_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftu.v4i16(<4 x i16> [[VSHL_V_I]], <4 x i16> [[VSHL_V1_I]]) #4
+// CHECK:   [[VSHL_V3_I:%.*]] = bitcast <4 x i16> [[VSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[TMP2]]
 uint16x4_t test_vshl_u16(uint16x4_t a, int16x4_t b) {
   return vshl_u16(a, b);
 }
 
-// CHECK-LABEL: test_vshl_u32
-// CHECK: vshl.u32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vshl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VSHL_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftu.v2i32(<2 x i32> [[VSHL_V_I]], <2 x i32> [[VSHL_V1_I]]) #4
+// CHECK:   [[VSHL_V3_I:%.*]] = bitcast <2 x i32> [[VSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[TMP2]]
 uint32x2_t test_vshl_u32(uint32x2_t a, int32x2_t b) {
   return vshl_u32(a, b);
 }
 
-// CHECK-LABEL: test_vshl_u64
-// CHECK: vshl.u64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vshl_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VSHL_V_I:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VSHL_V1_I:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VSHL_V2_I:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64> [[VSHL_V_I]], <1 x i64> [[VSHL_V1_I]]) #4
+// CHECK:   [[VSHL_V3_I:%.*]] = bitcast <1 x i64> [[VSHL_V2_I]] to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[VSHL_V3_I]] to <1 x i64>
+// CHECK:   ret <1 x i64> [[TMP2]]
 uint64x1_t test_vshl_u64(uint64x1_t a, int64x1_t b) {
   return vshl_u64(a, b);
 }
 
-// CHECK-LABEL: test_vshlq_s8
-// CHECK: vshl.s8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vshlq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vshifts.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VSHLQ_V_I]]
 int8x16_t test_vshlq_s8(int8x16_t a, int8x16_t b) {
   return vshlq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vshlq_s16
-// CHECK: vshl.s16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vshlq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vshifts.v8i16(<8 x i16> [[VSHLQ_V_I]], <8 x i16> [[VSHLQ_V1_I]]) #4
+// CHECK:   [[VSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 int16x8_t test_vshlq_s16(int16x8_t a, int16x8_t b) {
   return vshlq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vshlq_s32
-// CHECK: vshl.s32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vshlq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vshifts.v4i32(<4 x i32> [[VSHLQ_V_I]], <4 x i32> [[VSHLQ_V1_I]]) #4
+// CHECK:   [[VSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 int32x4_t test_vshlq_s32(int32x4_t a, int32x4_t b) {
   return vshlq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vshlq_s64
-// CHECK: vshl.s64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vshlq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vshifts.v2i64(<2 x i64> [[VSHLQ_V_I]], <2 x i64> [[VSHLQ_V1_I]]) #4
+// CHECK:   [[VSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 int64x2_t test_vshlq_s64(int64x2_t a, int64x2_t b) {
   return vshlq_s64(a, b);
 }
 
-// CHECK-LABEL: test_vshlq_u8
-// CHECK: vshl.u8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vshlq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VSHLQ_V_I:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftu.v16i8(<16 x i8> %a, <16 x i8> %b) #4
+// CHECK:   ret <16 x i8> [[VSHLQ_V_I]]
 uint8x16_t test_vshlq_u8(uint8x16_t a, int8x16_t b) {
   return vshlq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vshlq_u16
-// CHECK: vshl.u16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vshlq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VSHLQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftu.v8i16(<8 x i16> [[VSHLQ_V_I]], <8 x i16> [[VSHLQ_V1_I]]) #4
+// CHECK:   [[VSHLQ_V3_I:%.*]] = bitcast <8 x i16> [[VSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[TMP2]]
 uint16x8_t test_vshlq_u16(uint16x8_t a, int16x8_t b) {
   return vshlq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vshlq_u32
-// CHECK: vshl.u32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vshlq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VSHLQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftu.v4i32(<4 x i32> [[VSHLQ_V_I]], <4 x i32> [[VSHLQ_V1_I]]) #4
+// CHECK:   [[VSHLQ_V3_I:%.*]] = bitcast <4 x i32> [[VSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[TMP2]]
 uint32x4_t test_vshlq_u32(uint32x4_t a, int32x4_t b) {
   return vshlq_u32(a, b);
 }
 
-// CHECK-LABEL: test_vshlq_u64
-// CHECK: vshl.u64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vshlq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VSHLQ_V_I:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VSHLQ_V1_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSHLQ_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64> [[VSHLQ_V_I]], <2 x i64> [[VSHLQ_V1_I]]) #4
+// CHECK:   [[VSHLQ_V3_I:%.*]] = bitcast <2 x i64> [[VSHLQ_V2_I]] to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[VSHLQ_V3_I]] to <2 x i64>
+// CHECK:   ret <2 x i64> [[TMP2]]
 uint64x2_t test_vshlq_u64(uint64x2_t a, int64x2_t b) {
   return vshlq_u64(a, b);
 }
 
 
-// CHECK-LABEL: test_vshll_n_s8
-// CHECK: vshll.s8 q{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vshll_n_s8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = sext <8 x i8> %a to <8 x i16>
+// CHECK:   [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+// CHECK:   ret <8 x i16> [[VSHLL_N]]
 int16x8_t test_vshll_n_s8(int8x8_t a) {
   return vshll_n_s8(a, 1);
 }
 
-// CHECK-LABEL: test_vshll_n_s16
-// CHECK: vshll.s16 q{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vshll_n_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   ret <4 x i32> [[VSHLL_N]]
 int32x4_t test_vshll_n_s16(int16x4_t a) {
   return vshll_n_s16(a, 1);
 }
 
-// CHECK-LABEL: test_vshll_n_s32
-// CHECK: vshll.s32 q{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vshll_n_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], <i64 1, i64 1>
+// CHECK:   ret <2 x i64> [[VSHLL_N]]
 int64x2_t test_vshll_n_s32(int32x2_t a) {
   return vshll_n_s32(a, 1);
 }
 
-// CHECK-LABEL: test_vshll_n_u8
-// CHECK: vshll.u8 q{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vshll_n_u8(<8 x i8> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = zext <8 x i8> %a to <8 x i16>
+// CHECK:   [[VSHLL_N:%.*]] = shl <8 x i16> [[TMP0]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+// CHECK:   ret <8 x i16> [[VSHLL_N]]
 uint16x8_t test_vshll_n_u8(uint8x8_t a) {
   return vshll_n_u8(a, 1);
 }
 
-// CHECK-LABEL: test_vshll_n_u16
-// CHECK: vshll.u16 q{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vshll_n_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   [[VSHLL_N:%.*]] = shl <4 x i32> [[TMP2]], <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   ret <4 x i32> [[VSHLL_N]]
 uint32x4_t test_vshll_n_u16(uint16x4_t a) {
   return vshll_n_u16(a, 1);
 }
 
-// CHECK-LABEL: test_vshll_n_u32
-// CHECK: vshll.u32 q{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vshll_n_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP2:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSHLL_N:%.*]] = shl <2 x i64> [[TMP2]], <i64 1, i64 1>
+// CHECK:   ret <2 x i64> [[VSHLL_N]]
 uint64x2_t test_vshll_n_u32(uint32x2_t a) {
   return vshll_n_u32(a, 1);
 }
 
 
-// CHECK-LABEL: test_vshl_n_s8
-// CHECK: vshl.i8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vshl_n_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VSHL_N:%.*]] = shl <8 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+// CHECK:   ret <8 x i8> [[VSHL_N]]
 int8x8_t test_vshl_n_s8(int8x8_t a) {
   return vshl_n_s8(a, 1);
 }
 
-// CHECK-LABEL: test_vshl_n_s16
-// CHECK: vshl.i16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vshl_n_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VSHL_N:%.*]] = shl <4 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1>
+// CHECK:   ret <4 x i16> [[VSHL_N]]
 int16x4_t test_vshl_n_s16(int16x4_t a) {
   return vshl_n_s16(a, 1);
 }
 
-// CHECK-LABEL: test_vshl_n_s32
-// CHECK: vshl.i32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vshl_n_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VSHL_N:%.*]] = shl <2 x i32> [[TMP1]], <i32 1, i32 1>
+// CHECK:   ret <2 x i32> [[VSHL_N]]
 int32x2_t test_vshl_n_s32(int32x2_t a) {
   return vshl_n_s32(a, 1);
 }
 
-// CHECK-LABEL: test_vshl_n_s64
-// CHECK: vshl.i64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vshl_n_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VSHL_N:%.*]] = shl <1 x i64> [[TMP1]], <i64 1>
+// CHECK:   ret <1 x i64> [[VSHL_N]]
 int64x1_t test_vshl_n_s64(int64x1_t a) {
   return vshl_n_s64(a, 1);
 }
 
-// CHECK-LABEL: test_vshl_n_u8
-// CHECK: vshl.i8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vshl_n_u8(<8 x i8> %a) #0 {
+// CHECK:   [[VSHL_N:%.*]] = shl <8 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+// CHECK:   ret <8 x i8> [[VSHL_N]]
 uint8x8_t test_vshl_n_u8(uint8x8_t a) {
   return vshl_n_u8(a, 1);
 }
 
-// CHECK-LABEL: test_vshl_n_u16
-// CHECK: vshl.i16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vshl_n_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VSHL_N:%.*]] = shl <4 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1>
+// CHECK:   ret <4 x i16> [[VSHL_N]]
 uint16x4_t test_vshl_n_u16(uint16x4_t a) {
   return vshl_n_u16(a, 1);
 }
 
-// CHECK-LABEL: test_vshl_n_u32
-// CHECK: vshl.i32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vshl_n_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VSHL_N:%.*]] = shl <2 x i32> [[TMP1]], <i32 1, i32 1>
+// CHECK:   ret <2 x i32> [[VSHL_N]]
 uint32x2_t test_vshl_n_u32(uint32x2_t a) {
   return vshl_n_u32(a, 1);
 }
 
-// CHECK-LABEL: test_vshl_n_u64
-// CHECK: vshl.i64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vshl_n_u64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VSHL_N:%.*]] = shl <1 x i64> [[TMP1]], <i64 1>
+// CHECK:   ret <1 x i64> [[VSHL_N]]
 uint64x1_t test_vshl_n_u64(uint64x1_t a) {
   return vshl_n_u64(a, 1);
 }
 
-// CHECK-LABEL: test_vshlq_n_s8
-// CHECK: vshl.i8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vshlq_n_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VSHL_N:%.*]] = shl <16 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+// CHECK:   ret <16 x i8> [[VSHL_N]]
 int8x16_t test_vshlq_n_s8(int8x16_t a) {
   return vshlq_n_s8(a, 1);
 }
 
-// CHECK-LABEL: test_vshlq_n_s16
-// CHECK: vshl.i16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vshlq_n_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VSHL_N:%.*]] = shl <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+// CHECK:   ret <8 x i16> [[VSHL_N]]
 int16x8_t test_vshlq_n_s16(int16x8_t a) {
   return vshlq_n_s16(a, 1);
 }
 
-// CHECK-LABEL: test_vshlq_n_s32
-// CHECK: vshl.i32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vshlq_n_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VSHL_N:%.*]] = shl <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   ret <4 x i32> [[VSHL_N]]
 int32x4_t test_vshlq_n_s32(int32x4_t a) {
   return vshlq_n_s32(a, 1);
 }
 
-// CHECK-LABEL: test_vshlq_n_s64
-// CHECK: vshl.i64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vshlq_n_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VSHL_N:%.*]] = shl <2 x i64> [[TMP1]], <i64 1, i64 1>
+// CHECK:   ret <2 x i64> [[VSHL_N]]
 int64x2_t test_vshlq_n_s64(int64x2_t a) {
   return vshlq_n_s64(a, 1);
 }
 
-// CHECK-LABEL: test_vshlq_n_u8
-// CHECK: vshl.i8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vshlq_n_u8(<16 x i8> %a) #0 {
+// CHECK:   [[VSHL_N:%.*]] = shl <16 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+// CHECK:   ret <16 x i8> [[VSHL_N]]
 uint8x16_t test_vshlq_n_u8(uint8x16_t a) {
   return vshlq_n_u8(a, 1);
 }
 
-// CHECK-LABEL: test_vshlq_n_u16
-// CHECK: vshl.i16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vshlq_n_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VSHL_N:%.*]] = shl <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+// CHECK:   ret <8 x i16> [[VSHL_N]]
 uint16x8_t test_vshlq_n_u16(uint16x8_t a) {
   return vshlq_n_u16(a, 1);
 }
 
-// CHECK-LABEL: test_vshlq_n_u32
-// CHECK: vshl.i32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vshlq_n_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VSHL_N:%.*]] = shl <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   ret <4 x i32> [[VSHL_N]]
 uint32x4_t test_vshlq_n_u32(uint32x4_t a) {
   return vshlq_n_u32(a, 1);
 }
 
-// CHECK-LABEL: test_vshlq_n_u64
-// CHECK: vshl.i64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vshlq_n_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VSHL_N:%.*]] = shl <2 x i64> [[TMP1]], <i64 1, i64 1>
+// CHECK:   ret <2 x i64> [[VSHL_N]]
 uint64x2_t test_vshlq_n_u64(uint64x2_t a) {
   return vshlq_n_u64(a, 1);
 }
 
 
-// CHECK-LABEL: test_vshrn_n_s16
-// CHECK: vshrn.i16 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vshrn_n_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP2:%.*]] = ashr <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+// CHECK:   [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VSHRN_N]]
 int8x8_t test_vshrn_n_s16(int16x8_t a) {
   return vshrn_n_s16(a, 1);
 }
 
-// CHECK-LABEL: test_vshrn_n_s32
-// CHECK: vshrn.i32 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vshrn_n_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VSHRN_N]]
 int16x4_t test_vshrn_n_s32(int32x4_t a) {
   return vshrn_n_s32(a, 1);
 }
 
-// CHECK-LABEL: test_vshrn_n_s64
-// CHECK: vshrn.i64 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vshrn_n_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP2:%.*]] = ashr <2 x i64> [[TMP1]], <i64 1, i64 1>
+// CHECK:   [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VSHRN_N]]
 int32x2_t test_vshrn_n_s64(int64x2_t a) {
   return vshrn_n_s64(a, 1);
 }
 
-// CHECK-LABEL: test_vshrn_n_u16
-// CHECK: vshrn.i16 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vshrn_n_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP2:%.*]] = lshr <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+// CHECK:   [[VSHRN_N:%.*]] = trunc <8 x i16> [[TMP2]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VSHRN_N]]
 uint8x8_t test_vshrn_n_u16(uint16x8_t a) {
   return vshrn_n_u16(a, 1);
 }
 
-// CHECK-LABEL: test_vshrn_n_u32
-// CHECK: vshrn.i32 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vshrn_n_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP2:%.*]] = lshr <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[VSHRN_N:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VSHRN_N]]
 uint16x4_t test_vshrn_n_u32(uint32x4_t a) {
   return vshrn_n_u32(a, 1);
 }
 
-// CHECK-LABEL: test_vshrn_n_u64
-// CHECK: vshrn.i64 d{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vshrn_n_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP2:%.*]] = lshr <2 x i64> [[TMP1]], <i64 1, i64 1>
+// CHECK:   [[VSHRN_N:%.*]] = trunc <2 x i64> [[TMP2]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VSHRN_N]]
 uint32x2_t test_vshrn_n_u64(uint64x2_t a) {
   return vshrn_n_u64(a, 1);
 }
 
 
-// CHECK-LABEL: test_vshr_n_s8
-// CHECK: vshr.s8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vshr_n_s8(<8 x i8> %a) #0 {
+// CHECK:   [[VSHR_N:%.*]] = ashr <8 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+// CHECK:   ret <8 x i8> [[VSHR_N]]
 int8x8_t test_vshr_n_s8(int8x8_t a) {
   return vshr_n_s8(a, 1);
 }
 
-// CHECK-LABEL: test_vshr_n_s16
-// CHECK: vshr.s16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vshr_n_s16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VSHR_N:%.*]] = ashr <4 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1>
+// CHECK:   ret <4 x i16> [[VSHR_N]]
 int16x4_t test_vshr_n_s16(int16x4_t a) {
   return vshr_n_s16(a, 1);
 }
 
-// CHECK-LABEL: test_vshr_n_s32
-// CHECK: vshr.s32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vshr_n_s32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VSHR_N:%.*]] = ashr <2 x i32> [[TMP1]], <i32 1, i32 1>
+// CHECK:   ret <2 x i32> [[VSHR_N]]
 int32x2_t test_vshr_n_s32(int32x2_t a) {
   return vshr_n_s32(a, 1);
 }
 
-// CHECK-LABEL: test_vshr_n_s64
-// CHECK: vshr.s64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vshr_n_s64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VSHR_N:%.*]] = ashr <1 x i64> [[TMP1]], <i64 1>
+// CHECK:   ret <1 x i64> [[VSHR_N]]
 int64x1_t test_vshr_n_s64(int64x1_t a) {
   return vshr_n_s64(a, 1);
 }
 
-// CHECK-LABEL: test_vshr_n_u8
-// CHECK: vshr.u8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vshr_n_u8(<8 x i8> %a) #0 {
+// CHECK:   [[VSHR_N:%.*]] = lshr <8 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+// CHECK:   ret <8 x i8> [[VSHR_N]]
 uint8x8_t test_vshr_n_u8(uint8x8_t a) {
   return vshr_n_u8(a, 1);
 }
 
-// CHECK-LABEL: test_vshr_n_u16
-// CHECK: vshr.u16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vshr_n_u16(<4 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VSHR_N:%.*]] = lshr <4 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1>
+// CHECK:   ret <4 x i16> [[VSHR_N]]
 uint16x4_t test_vshr_n_u16(uint16x4_t a) {
   return vshr_n_u16(a, 1);
 }
 
-// CHECK-LABEL: test_vshr_n_u32
-// CHECK: vshr.u32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vshr_n_u32(<2 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VSHR_N:%.*]] = lshr <2 x i32> [[TMP1]], <i32 1, i32 1>
+// CHECK:   ret <2 x i32> [[VSHR_N]]
 uint32x2_t test_vshr_n_u32(uint32x2_t a) {
   return vshr_n_u32(a, 1);
 }
 
-// CHECK-LABEL: test_vshr_n_u64
-// CHECK: vshr.u64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vshr_n_u64(<1 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VSHR_N:%.*]] = lshr <1 x i64> [[TMP1]], <i64 1>
+// CHECK:   ret <1 x i64> [[VSHR_N]]
 uint64x1_t test_vshr_n_u64(uint64x1_t a) {
   return vshr_n_u64(a, 1);
 }
 
-// CHECK-LABEL: test_vshrq_n_s8
-// CHECK: vshr.s8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vshrq_n_s8(<16 x i8> %a) #0 {
+// CHECK:   [[VSHR_N:%.*]] = ashr <16 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+// CHECK:   ret <16 x i8> [[VSHR_N]]
 int8x16_t test_vshrq_n_s8(int8x16_t a) {
   return vshrq_n_s8(a, 1);
 }
 
-// CHECK-LABEL: test_vshrq_n_s16
-// CHECK: vshr.s16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vshrq_n_s16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VSHR_N:%.*]] = ashr <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+// CHECK:   ret <8 x i16> [[VSHR_N]]
 int16x8_t test_vshrq_n_s16(int16x8_t a) {
   return vshrq_n_s16(a, 1);
 }
 
-// CHECK-LABEL: test_vshrq_n_s32
-// CHECK: vshr.s32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vshrq_n_s32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VSHR_N:%.*]] = ashr <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   ret <4 x i32> [[VSHR_N]]
 int32x4_t test_vshrq_n_s32(int32x4_t a) {
   return vshrq_n_s32(a, 1);
 }
 
-// CHECK-LABEL: test_vshrq_n_s64
-// CHECK: vshr.s64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vshrq_n_s64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VSHR_N:%.*]] = ashr <2 x i64> [[TMP1]], <i64 1, i64 1>
+// CHECK:   ret <2 x i64> [[VSHR_N]]
 int64x2_t test_vshrq_n_s64(int64x2_t a) {
   return vshrq_n_s64(a, 1);
 }
 
-// CHECK-LABEL: test_vshrq_n_u8
-// CHECK: vshr.u8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vshrq_n_u8(<16 x i8> %a) #0 {
+// CHECK:   [[VSHR_N:%.*]] = lshr <16 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+// CHECK:   ret <16 x i8> [[VSHR_N]]
 uint8x16_t test_vshrq_n_u8(uint8x16_t a) {
   return vshrq_n_u8(a, 1);
 }
 
-// CHECK-LABEL: test_vshrq_n_u16
-// CHECK: vshr.u16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vshrq_n_u16(<8 x i16> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VSHR_N:%.*]] = lshr <8 x i16> [[TMP1]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+// CHECK:   ret <8 x i16> [[VSHR_N]]
 uint16x8_t test_vshrq_n_u16(uint16x8_t a) {
   return vshrq_n_u16(a, 1);
 }
 
-// CHECK-LABEL: test_vshrq_n_u32
-// CHECK: vshr.u32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vshrq_n_u32(<4 x i32> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VSHR_N:%.*]] = lshr <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   ret <4 x i32> [[VSHR_N]]
 uint32x4_t test_vshrq_n_u32(uint32x4_t a) {
   return vshrq_n_u32(a, 1);
 }
 
-// CHECK-LABEL: test_vshrq_n_u64
-// CHECK: vshr.u64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vshrq_n_u64(<2 x i64> %a) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VSHR_N:%.*]] = lshr <2 x i64> [[TMP1]], <i64 1, i64 1>
+// CHECK:   ret <2 x i64> [[VSHR_N]]
 uint64x2_t test_vshrq_n_u64(uint64x2_t a) {
   return vshrq_n_u64(a, 1);
 }
 
 
-// CHECK-LABEL: test_vsli_n_s8
-// CHECK: vsli.8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vsli_n_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+// CHECK:   ret <8 x i8> [[VSLI_N]]
 int8x8_t test_vsli_n_s8(int8x8_t a, int8x8_t b) {
   return vsli_n_s8(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsli_n_s16
-// CHECK: vsli.16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vsli_n_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
+// CHECK:   ret <4 x i16> [[VSLI_N2]]
 int16x4_t test_vsli_n_s16(int16x4_t a, int16x4_t b) {
   return vsli_n_s16(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsli_n_s32
-// CHECK: vsli.32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vsli_n_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], <2 x i32> <i32 1, i32 1>)
+// CHECK:   ret <2 x i32> [[VSLI_N2]]
 int32x2_t test_vsli_n_s32(int32x2_t a, int32x2_t b) {
   return vsli_n_s32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsli_n_s64
-// CHECK: vsli.64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vsli_n_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], <1 x i64> <i64 1>)
+// CHECK:   ret <1 x i64> [[VSLI_N2]]
 int64x1_t test_vsli_n_s64(int64x1_t a, int64x1_t b) {
   return vsli_n_s64(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsli_n_u8
-// CHECK: vsli.8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vsli_n_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+// CHECK:   ret <8 x i8> [[VSLI_N]]
 uint8x8_t test_vsli_n_u8(uint8x8_t a, uint8x8_t b) {
   return vsli_n_u8(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsli_n_u16
-// CHECK: vsli.16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vsli_n_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
+// CHECK:   ret <4 x i16> [[VSLI_N2]]
 uint16x4_t test_vsli_n_u16(uint16x4_t a, uint16x4_t b) {
   return vsli_n_u16(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsli_n_u32
-// CHECK: vsli.32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vsli_n_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], <2 x i32> <i32 1, i32 1>)
+// CHECK:   ret <2 x i32> [[VSLI_N2]]
 uint32x2_t test_vsli_n_u32(uint32x2_t a, uint32x2_t b) {
   return vsli_n_u32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsli_n_u64
-// CHECK: vsli.64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vsli_n_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], <1 x i64> <i64 1>)
+// CHECK:   ret <1 x i64> [[VSLI_N2]]
 uint64x1_t test_vsli_n_u64(uint64x1_t a, uint64x1_t b) {
   return vsli_n_u64(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsli_n_p8
-// CHECK: vsli.8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vsli_n_p8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+// CHECK:   ret <8 x i8> [[VSLI_N]]
 poly8x8_t test_vsli_n_p8(poly8x8_t a, poly8x8_t b) {
   return vsli_n_p8(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsli_n_p16
-// CHECK: vsli.16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vsli_n_p16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> <i16 1, i16 1, i16 1, i16 1>)
+// CHECK:   ret <4 x i16> [[VSLI_N2]]
 poly16x4_t test_vsli_n_p16(poly16x4_t a, poly16x4_t b) {
   return vsli_n_p16(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsliq_n_s8
-// CHECK: vsli.8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vsliq_n_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+// CHECK:   ret <16 x i8> [[VSLI_N]]
 int8x16_t test_vsliq_n_s8(int8x16_t a, int8x16_t b) {
   return vsliq_n_s8(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsliq_n_s16
-// CHECK: vsli.16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vsliq_n_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
+// CHECK:   ret <8 x i16> [[VSLI_N2]]
 int16x8_t test_vsliq_n_s16(int16x8_t a, int16x8_t b) {
   return vsliq_n_s16(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsliq_n_s32
-// CHECK: vsli.32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vsliq_n_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
+// CHECK:   ret <4 x i32> [[VSLI_N2]]
 int32x4_t test_vsliq_n_s32(int32x4_t a, int32x4_t b) {
   return vsliq_n_s32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsliq_n_s64
-// CHECK: vsli.64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vsliq_n_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], <2 x i64> <i64 1, i64 1>)
+// CHECK:   ret <2 x i64> [[VSLI_N2]]
 int64x2_t test_vsliq_n_s64(int64x2_t a, int64x2_t b) {
   return vsliq_n_s64(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsliq_n_u8
-// CHECK: vsli.8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vsliq_n_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+// CHECK:   ret <16 x i8> [[VSLI_N]]
 uint8x16_t test_vsliq_n_u8(uint8x16_t a, uint8x16_t b) {
   return vsliq_n_u8(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsliq_n_u16
-// CHECK: vsli.16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vsliq_n_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
+// CHECK:   ret <8 x i16> [[VSLI_N2]]
 uint16x8_t test_vsliq_n_u16(uint16x8_t a, uint16x8_t b) {
   return vsliq_n_u16(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsliq_n_u32
-// CHECK: vsli.32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vsliq_n_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
+// CHECK:   ret <4 x i32> [[VSLI_N2]]
 uint32x4_t test_vsliq_n_u32(uint32x4_t a, uint32x4_t b) {
   return vsliq_n_u32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsliq_n_u64
-// CHECK: vsli.64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vsliq_n_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], <2 x i64> <i64 1, i64 1>)
+// CHECK:   ret <2 x i64> [[VSLI_N2]]
 uint64x2_t test_vsliq_n_u64(uint64x2_t a, uint64x2_t b) {
   return vsliq_n_u64(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsliq_n_p8
-// CHECK: vsli.8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vsliq_n_p8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>)
+// CHECK:   ret <16 x i8> [[VSLI_N]]
 poly8x16_t test_vsliq_n_p8(poly8x16_t a, poly8x16_t b) {
   return vsliq_n_p8(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsliq_n_p16
-// CHECK: vsli.16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vsliq_n_p16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
+// CHECK:   ret <8 x i16> [[VSLI_N2]]
 poly16x8_t test_vsliq_n_p16(poly16x8_t a, poly16x8_t b) {
   return vsliq_n_p16(a, b, 1);
 }
 
 
-// CHECK-LABEL: test_vsra_n_s8
-// CHECK: vsra.s8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vsra_n_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VSRA_N:%.*]] = ashr <8 x i8> %b, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+// CHECK:   [[TMP0:%.*]] = add <8 x i8> %a, [[VSRA_N]]
+// CHECK:   ret <8 x i8> [[TMP0]]
 int8x8_t test_vsra_n_s8(int8x8_t a, int8x8_t b) {
   return vsra_n_s8(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsra_n_s16
-// CHECK: vsra.s16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vsra_n_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VSRA_N:%.*]] = ashr <4 x i16> [[TMP3]], <i16 1, i16 1, i16 1, i16 1>
+// CHECK:   [[TMP4:%.*]] = add <4 x i16> [[TMP2]], [[VSRA_N]]
+// CHECK:   ret <4 x i16> [[TMP4]]
 int16x4_t test_vsra_n_s16(int16x4_t a, int16x4_t b) {
   return vsra_n_s16(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsra_n_s32
-// CHECK: vsra.s32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vsra_n_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VSRA_N:%.*]] = ashr <2 x i32> [[TMP3]], <i32 1, i32 1>
+// CHECK:   [[TMP4:%.*]] = add <2 x i32> [[TMP2]], [[VSRA_N]]
+// CHECK:   ret <2 x i32> [[TMP4]]
 int32x2_t test_vsra_n_s32(int32x2_t a, int32x2_t b) {
   return vsra_n_s32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsra_n_s64
-// CHECK: vsra.s64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vsra_n_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VSRA_N:%.*]] = ashr <1 x i64> [[TMP3]], <i64 1>
+// CHECK:   [[TMP4:%.*]] = add <1 x i64> [[TMP2]], [[VSRA_N]]
+// CHECK:   ret <1 x i64> [[TMP4]]
 int64x1_t test_vsra_n_s64(int64x1_t a, int64x1_t b) {
   return vsra_n_s64(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsra_n_u8
-// CHECK: vsra.u8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vsra_n_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VSRA_N:%.*]] = lshr <8 x i8> %b, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+// CHECK:   [[TMP0:%.*]] = add <8 x i8> %a, [[VSRA_N]]
+// CHECK:   ret <8 x i8> [[TMP0]]
 uint8x8_t test_vsra_n_u8(uint8x8_t a, uint8x8_t b) {
   return vsra_n_u8(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsra_n_u16
-// CHECK: vsra.u16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vsra_n_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VSRA_N:%.*]] = lshr <4 x i16> [[TMP3]], <i16 1, i16 1, i16 1, i16 1>
+// CHECK:   [[TMP4:%.*]] = add <4 x i16> [[TMP2]], [[VSRA_N]]
+// CHECK:   ret <4 x i16> [[TMP4]]
 uint16x4_t test_vsra_n_u16(uint16x4_t a, uint16x4_t b) {
   return vsra_n_u16(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsra_n_u32
-// CHECK: vsra.u32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vsra_n_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VSRA_N:%.*]] = lshr <2 x i32> [[TMP3]], <i32 1, i32 1>
+// CHECK:   [[TMP4:%.*]] = add <2 x i32> [[TMP2]], [[VSRA_N]]
+// CHECK:   ret <2 x i32> [[TMP4]]
 uint32x2_t test_vsra_n_u32(uint32x2_t a, uint32x2_t b) {
   return vsra_n_u32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsra_n_u64
-// CHECK: vsra.u64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vsra_n_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VSRA_N:%.*]] = lshr <1 x i64> [[TMP3]], <i64 1>
+// CHECK:   [[TMP4:%.*]] = add <1 x i64> [[TMP2]], [[VSRA_N]]
+// CHECK:   ret <1 x i64> [[TMP4]]
 uint64x1_t test_vsra_n_u64(uint64x1_t a, uint64x1_t b) {
   return vsra_n_u64(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsraq_n_s8
-// CHECK: vsra.s8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vsraq_n_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VSRA_N:%.*]] = ashr <16 x i8> %b, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+// CHECK:   [[TMP0:%.*]] = add <16 x i8> %a, [[VSRA_N]]
+// CHECK:   ret <16 x i8> [[TMP0]]
 int8x16_t test_vsraq_n_s8(int8x16_t a, int8x16_t b) {
   return vsraq_n_s8(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsraq_n_s16
-// CHECK: vsra.s16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vsraq_n_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VSRA_N:%.*]] = ashr <8 x i16> [[TMP3]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+// CHECK:   [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[VSRA_N]]
+// CHECK:   ret <8 x i16> [[TMP4]]
 int16x8_t test_vsraq_n_s16(int16x8_t a, int16x8_t b) {
   return vsraq_n_s16(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsraq_n_s32
-// CHECK: vsra.s32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vsraq_n_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VSRA_N:%.*]] = ashr <4 x i32> [[TMP3]], <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[VSRA_N]]
+// CHECK:   ret <4 x i32> [[TMP4]]
 int32x4_t test_vsraq_n_s32(int32x4_t a, int32x4_t b) {
   return vsraq_n_s32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsraq_n_s64
-// CHECK: vsra.s64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vsraq_n_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSRA_N:%.*]] = ashr <2 x i64> [[TMP3]], <i64 1, i64 1>
+// CHECK:   [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[VSRA_N]]
+// CHECK:   ret <2 x i64> [[TMP4]]
 int64x2_t test_vsraq_n_s64(int64x2_t a, int64x2_t b) {
   return vsraq_n_s64(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsraq_n_u8
-// CHECK: vsra.u8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vsraq_n_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VSRA_N:%.*]] = lshr <16 x i8> %b, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+// CHECK:   [[TMP0:%.*]] = add <16 x i8> %a, [[VSRA_N]]
+// CHECK:   ret <16 x i8> [[TMP0]]
 uint8x16_t test_vsraq_n_u8(uint8x16_t a, uint8x16_t b) {
   return vsraq_n_u8(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsraq_n_u16
-// CHECK: vsra.u16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vsraq_n_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VSRA_N:%.*]] = lshr <8 x i16> [[TMP3]], <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+// CHECK:   [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[VSRA_N]]
+// CHECK:   ret <8 x i16> [[TMP4]]
 uint16x8_t test_vsraq_n_u16(uint16x8_t a, uint16x8_t b) {
   return vsraq_n_u16(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsraq_n_u32
-// CHECK: vsra.u32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vsraq_n_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VSRA_N:%.*]] = lshr <4 x i32> [[TMP3]], <i32 1, i32 1, i32 1, i32 1>
+// CHECK:   [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[VSRA_N]]
+// CHECK:   ret <4 x i32> [[TMP4]]
 uint32x4_t test_vsraq_n_u32(uint32x4_t a, uint32x4_t b) {
   return vsraq_n_u32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsraq_n_u64
-// CHECK: vsra.u64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vsraq_n_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSRA_N:%.*]] = lshr <2 x i64> [[TMP3]], <i64 1, i64 1>
+// CHECK:   [[TMP4:%.*]] = add <2 x i64> [[TMP2]], [[VSRA_N]]
+// CHECK:   ret <2 x i64> [[TMP4]]
 uint64x2_t test_vsraq_n_u64(uint64x2_t a, uint64x2_t b) {
   return vsraq_n_u64(a, b, 1);
 }
 
 
-// CHECK-LABEL: test_vsri_n_s8
-// CHECK: vsri.8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vsri_n_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+// CHECK:   ret <8 x i8> [[VSLI_N]]
 int8x8_t test_vsri_n_s8(int8x8_t a, int8x8_t b) {
   return vsri_n_s8(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsri_n_s16
-// CHECK: vsri.16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vsri_n_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
+// CHECK:   ret <4 x i16> [[VSLI_N2]]
 int16x4_t test_vsri_n_s16(int16x4_t a, int16x4_t b) {
   return vsri_n_s16(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsri_n_s32
-// CHECK: vsri.32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vsri_n_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], <2 x i32> <i32 -1, i32 -1>)
+// CHECK:   ret <2 x i32> [[VSLI_N2]]
 int32x2_t test_vsri_n_s32(int32x2_t a, int32x2_t b) {
   return vsri_n_s32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsri_n_s64
-// CHECK: vsri.64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vsri_n_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], <1 x i64> <i64 -1>)
+// CHECK:   ret <1 x i64> [[VSLI_N2]]
 int64x1_t test_vsri_n_s64(int64x1_t a, int64x1_t b) {
   return vsri_n_s64(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsri_n_u8
-// CHECK: vsri.8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vsri_n_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+// CHECK:   ret <8 x i8> [[VSLI_N]]
 uint8x8_t test_vsri_n_u8(uint8x8_t a, uint8x8_t b) {
   return vsri_n_u8(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsri_n_u16
-// CHECK: vsri.16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vsri_n_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
+// CHECK:   ret <4 x i16> [[VSLI_N2]]
 uint16x4_t test_vsri_n_u16(uint16x4_t a, uint16x4_t b) {
   return vsri_n_u16(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsri_n_u32
-// CHECK: vsri.32 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vsri_n_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[VSLI_N2:%.*]] = call <2 x i32> @llvm.arm.neon.vshiftins.v2i32(<2 x i32> [[VSLI_N]], <2 x i32> [[VSLI_N1]], <2 x i32> <i32 -1, i32 -1>)
+// CHECK:   ret <2 x i32> [[VSLI_N2]]
 uint32x2_t test_vsri_n_u32(uint32x2_t a, uint32x2_t b) {
   return vsri_n_u32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsri_n_u64
-// CHECK: vsri.64 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vsri_n_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <1 x i64> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[VSLI_N2:%.*]] = call <1 x i64> @llvm.arm.neon.vshiftins.v1i64(<1 x i64> [[VSLI_N]], <1 x i64> [[VSLI_N1]], <1 x i64> <i64 -1>)
+// CHECK:   ret <1 x i64> [[VSLI_N2]]
 uint64x1_t test_vsri_n_u64(uint64x1_t a, uint64x1_t b) {
   return vsri_n_u64(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsri_n_p8
-// CHECK: vsri.8 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vsri_n_p8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VSLI_N:%.*]] = call <8 x i8> @llvm.arm.neon.vshiftins.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+// CHECK:   ret <8 x i8> [[VSLI_N]]
 poly8x8_t test_vsri_n_p8(poly8x8_t a, poly8x8_t b) {
   return vsri_n_p8(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsri_n_p16
-// CHECK: vsri.16 d{{[0-9]+}}, d{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vsri_n_p16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[VSLI_N2:%.*]] = call <4 x i16> @llvm.arm.neon.vshiftins.v4i16(<4 x i16> [[VSLI_N]], <4 x i16> [[VSLI_N1]], <4 x i16> <i16 -1, i16 -1, i16 -1, i16 -1>)
+// CHECK:   ret <4 x i16> [[VSLI_N2]]
 poly16x4_t test_vsri_n_p16(poly16x4_t a, poly16x4_t b) {
   return vsri_n_p16(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsriq_n_s8
-// CHECK: vsri.8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vsriq_n_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+// CHECK:   ret <16 x i8> [[VSLI_N]]
 int8x16_t test_vsriq_n_s8(int8x16_t a, int8x16_t b) {
   return vsriq_n_s8(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsriq_n_s16
-// CHECK: vsri.16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vsriq_n_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
+// CHECK:   ret <8 x i16> [[VSLI_N2]]
 int16x8_t test_vsriq_n_s16(int16x8_t a, int16x8_t b) {
   return vsriq_n_s16(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsriq_n_s32
-// CHECK: vsri.32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vsriq_n_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
+// CHECK:   ret <4 x i32> [[VSLI_N2]]
 int32x4_t test_vsriq_n_s32(int32x4_t a, int32x4_t b) {
   return vsriq_n_s32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsriq_n_s64
-// CHECK: vsri.64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vsriq_n_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], <2 x i64> <i64 -1, i64 -1>)
+// CHECK:   ret <2 x i64> [[VSLI_N2]]
 int64x2_t test_vsriq_n_s64(int64x2_t a, int64x2_t b) {
   return vsriq_n_s64(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsriq_n_u8
-// CHECK: vsri.8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vsriq_n_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+// CHECK:   ret <16 x i8> [[VSLI_N]]
 uint8x16_t test_vsriq_n_u8(uint8x16_t a, uint8x16_t b) {
   return vsriq_n_u8(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsriq_n_u16
-// CHECK: vsri.16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vsriq_n_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
+// CHECK:   ret <8 x i16> [[VSLI_N2]]
 uint16x8_t test_vsriq_n_u16(uint16x8_t a, uint16x8_t b) {
   return vsriq_n_u16(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsriq_n_u32
-// CHECK: vsri.32 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vsriq_n_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VSLI_N2:%.*]] = call <4 x i32> @llvm.arm.neon.vshiftins.v4i32(<4 x i32> [[VSLI_N]], <4 x i32> [[VSLI_N1]], <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>)
+// CHECK:   ret <4 x i32> [[VSLI_N2]]
 uint32x4_t test_vsriq_n_u32(uint32x4_t a, uint32x4_t b) {
   return vsriq_n_u32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsriq_n_u64
-// CHECK: vsri.64 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vsriq_n_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSLI_N2:%.*]] = call <2 x i64> @llvm.arm.neon.vshiftins.v2i64(<2 x i64> [[VSLI_N]], <2 x i64> [[VSLI_N1]], <2 x i64> <i64 -1, i64 -1>)
+// CHECK:   ret <2 x i64> [[VSLI_N2]]
 uint64x2_t test_vsriq_n_u64(uint64x2_t a, uint64x2_t b) {
   return vsriq_n_u64(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsriq_n_p8
-// CHECK: vsri.8 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vsriq_n_p8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[VSLI_N:%.*]] = call <16 x i8> @llvm.arm.neon.vshiftins.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
+// CHECK:   ret <16 x i8> [[VSLI_N]]
 poly8x16_t test_vsriq_n_p8(poly8x16_t a, poly8x16_t b) {
   return vsriq_n_p8(a, b, 1);
 }
 
-// CHECK-LABEL: test_vsriq_n_p16
-// CHECK: vsri.16 q{{[0-9]+}}, q{{[0-9]+}}, #{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vsriq_n_p16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[VSLI_N:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[VSLI_N1:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VSLI_N2:%.*]] = call <8 x i16> @llvm.arm.neon.vshiftins.v8i16(<8 x i16> [[VSLI_N]], <8 x i16> [[VSLI_N1]], <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>)
+// CHECK:   ret <8 x i16> [[VSLI_N2]]
 poly16x8_t test_vsriq_n_p16(poly16x8_t a, poly16x8_t b) {
   return vsriq_n_p16(a, b, 1);
 }
 
 
-// CHECK-LABEL: test_vst1q_u8
-// CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst1q_u8(i8* %a, <16 x i8> %b) #0 {
+// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v16i8(i8* %a, <16 x i8> %b, i32 1)
+// CHECK:   ret void
 void test_vst1q_u8(uint8_t * a, uint8x16_t b) {
   vst1q_u8(a, b);
 }
 
-// CHECK-LABEL: test_vst1q_u16
-// CHECK: vst1.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst1q_u16(i16* %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* [[TMP0]], <8 x i16> [[TMP2]], i32 2)
+// CHECK:   ret void
 void test_vst1q_u16(uint16_t * a, uint16x8_t b) {
   vst1q_u16(a, b);
 }
 
-// CHECK-LABEL: test_vst1q_u32
-// CHECK: vst1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst1q_u32(i32* %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v4i32(i8* [[TMP0]], <4 x i32> [[TMP2]], i32 4)
+// CHECK:   ret void
 void test_vst1q_u32(uint32_t * a, uint32x4_t b) {
   vst1q_u32(a, b);
 }
 
-// CHECK-LABEL: test_vst1q_u64
-// CHECK: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}{{(:64)?}}]
+// CHECK-LABEL: define void @test_vst1q_u64(i64* %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v2i64(i8* [[TMP0]], <2 x i64> [[TMP2]], i32 4)
+// CHECK:   ret void
 void test_vst1q_u64(uint64_t * a, uint64x2_t b) {
   vst1q_u64(a, b);
 }
 
-// CHECK-LABEL: test_vst1q_s8
-// CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst1q_s8(i8* %a, <16 x i8> %b) #0 {
+// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v16i8(i8* %a, <16 x i8> %b, i32 1)
+// CHECK:   ret void
 void test_vst1q_s8(int8_t * a, int8x16_t b) {
   vst1q_s8(a, b);
 }
 
-// CHECK-LABEL: test_vst1q_s16
-// CHECK: vst1.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst1q_s16(i16* %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* [[TMP0]], <8 x i16> [[TMP2]], i32 2)
+// CHECK:   ret void
 void test_vst1q_s16(int16_t * a, int16x8_t b) {
   vst1q_s16(a, b);
 }
 
-// CHECK-LABEL: test_vst1q_s32
-// CHECK: vst1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst1q_s32(i32* %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v4i32(i8* [[TMP0]], <4 x i32> [[TMP2]], i32 4)
+// CHECK:   ret void
 void test_vst1q_s32(int32_t * a, int32x4_t b) {
   vst1q_s32(a, b);
 }
 
-// CHECK-LABEL: test_vst1q_s64
-// CHECK: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}{{(:64)?}}]
+// CHECK-LABEL: define void @test_vst1q_s64(i64* %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v2i64(i8* [[TMP0]], <2 x i64> [[TMP2]], i32 4)
+// CHECK:   ret void
 void test_vst1q_s64(int64_t * a, int64x2_t b) {
   vst1q_s64(a, b);
 }
 
-// CHECK-LABEL: test_vst1q_f16
-// CHECK: vst1.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst1q_f16(half* %a, <8 x half> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* [[TMP0]], <8 x i16> [[TMP2]], i32 2)
+// CHECK:   ret void
 void test_vst1q_f16(float16_t * a, float16x8_t b) {
   vst1q_f16(a, b);
 }
 
-// CHECK-LABEL: test_vst1q_f32
-// CHECK: vst1.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst1q_f32(float* %a, <4 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v4f32(i8* [[TMP0]], <4 x float> [[TMP2]], i32 4)
+// CHECK:   ret void
 void test_vst1q_f32(float32_t * a, float32x4_t b) {
   vst1q_f32(a, b);
 }
 
-// CHECK-LABEL: test_vst1q_p8
-// CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst1q_p8(i8* %a, <16 x i8> %b) #0 {
+// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v16i8(i8* %a, <16 x i8> %b, i32 1)
+// CHECK:   ret void
 void test_vst1q_p8(poly8_t * a, poly8x16_t b) {
   vst1q_p8(a, b);
 }
 
-// CHECK-LABEL: test_vst1q_p16
-// CHECK: vst1.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst1q_p16(i16* %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* [[TMP0]], <8 x i16> [[TMP2]], i32 2)
+// CHECK:   ret void
 void test_vst1q_p16(poly16_t * a, poly16x8_t b) {
   vst1q_p16(a, b);
 }
 
-// CHECK-LABEL: test_vst1_u8
-// CHECK: vst1.8 {d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst1_u8(i8* %a, <8 x i8> %b) #0 {
+// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v8i8(i8* %a, <8 x i8> %b, i32 1)
+// CHECK:   ret void
 void test_vst1_u8(uint8_t * a, uint8x8_t b) {
   vst1_u8(a, b);
 }
 
-// CHECK-LABEL: test_vst1_u16
-// CHECK: vst1.16 {d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst1_u16(i16* %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v4i16(i8* [[TMP0]], <4 x i16> [[TMP2]], i32 2)
+// CHECK:   ret void
 void test_vst1_u16(uint16_t * a, uint16x4_t b) {
   vst1_u16(a, b);
 }
 
-// CHECK-LABEL: test_vst1_u32
-// CHECK: vst1.32 {d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst1_u32(i32* %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v2i32(i8* [[TMP0]], <2 x i32> [[TMP2]], i32 4)
+// CHECK:   ret void
 void test_vst1_u32(uint32_t * a, uint32x2_t b) {
   vst1_u32(a, b);
 }
 
-// CHECK-LABEL: test_vst1_u64
-// CHECK: vst1.64 {d{{[0-9]+}}}, [r{{[0-9]+}}{{(:64)?}}]
+// CHECK-LABEL: define void @test_vst1_u64(i64* %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v1i64(i8* [[TMP0]], <1 x i64> [[TMP2]], i32 4)
+// CHECK:   ret void
 void test_vst1_u64(uint64_t * a, uint64x1_t b) {
   vst1_u64(a, b);
 }
 
-// CHECK-LABEL: test_vst1_s8
-// CHECK: vst1.8 {d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst1_s8(i8* %a, <8 x i8> %b) #0 {
+// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v8i8(i8* %a, <8 x i8> %b, i32 1)
+// CHECK:   ret void
 void test_vst1_s8(int8_t * a, int8x8_t b) {
   vst1_s8(a, b);
 }
 
-// CHECK-LABEL: test_vst1_s16
-// CHECK: vst1.16 {d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst1_s16(i16* %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v4i16(i8* [[TMP0]], <4 x i16> [[TMP2]], i32 2)
+// CHECK:   ret void
 void test_vst1_s16(int16_t * a, int16x4_t b) {
   vst1_s16(a, b);
 }
 
-// CHECK-LABEL: test_vst1_s32
-// CHECK: vst1.32 {d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst1_s32(i32* %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v2i32(i8* [[TMP0]], <2 x i32> [[TMP2]], i32 4)
+// CHECK:   ret void
 void test_vst1_s32(int32_t * a, int32x2_t b) {
   vst1_s32(a, b);
 }
 
-// CHECK-LABEL: test_vst1_s64
-// CHECK: vst1.64 {d{{[0-9]+}}}, [r{{[0-9]+}}{{(:64)?}}]
+// CHECK-LABEL: define void @test_vst1_s64(i64* %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v1i64(i8* [[TMP0]], <1 x i64> [[TMP2]], i32 4)
+// CHECK:   ret void
 void test_vst1_s64(int64_t * a, int64x1_t b) {
   vst1_s64(a, b);
 }
 
-// CHECK-LABEL: test_vst1_f16
-// CHECK: vst1.16 {d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst1_f16(half* %a, <4 x half> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v4i16(i8* [[TMP0]], <4 x i16> [[TMP2]], i32 2)
+// CHECK:   ret void
 void test_vst1_f16(float16_t * a, float16x4_t b) {
   vst1_f16(a, b);
 }
 
-// CHECK-LABEL: test_vst1_f32
-// CHECK: vst1.32 {d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst1_f32(float* %a, <2 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v2f32(i8* [[TMP0]], <2 x float> [[TMP2]], i32 4)
+// CHECK:   ret void
 void test_vst1_f32(float32_t * a, float32x2_t b) {
   vst1_f32(a, b);
 }
 
-// CHECK-LABEL: test_vst1_p8
-// CHECK: vst1.8 {d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst1_p8(i8* %a, <8 x i8> %b) #0 {
+// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v8i8(i8* %a, <8 x i8> %b, i32 1)
+// CHECK:   ret void
 void test_vst1_p8(poly8_t * a, poly8x8_t b) {
   vst1_p8(a, b);
 }
 
-// CHECK-LABEL: test_vst1_p16
-// CHECK: vst1.16 {d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst1_p16(i16* %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v4i16(i8* [[TMP0]], <4 x i16> [[TMP2]], i32 2)
+// CHECK:   ret void
 void test_vst1_p16(poly16_t * a, poly16x4_t b) {
   vst1_p16(a, b);
 }
 
 
-// CHECK-LABEL: test_vst1q_lane_u8
-// CHECK: vst1.8 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst1q_lane_u8(i8* %a, <16 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
+// CHECK:   store i8 [[TMP0]], i8* %a, align 1
+// CHECK:   ret void
 void test_vst1q_lane_u8(uint8_t * a, uint8x16_t b) {
   vst1q_lane_u8(a, b, 15);
 }
 
-// CHECK-LABEL: test_vst1q_lane_u16
-// CHECK: vst1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: define void @test_vst1q_lane_u16(i16* %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   store i16 [[TMP3]], i16* [[TMP4]], align 2
+// CHECK:   ret void
 void test_vst1q_lane_u16(uint16_t * a, uint16x8_t b) {
   vst1q_lane_u16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vst1q_lane_u32
-// CHECK: vst1.32 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:32]
+// CHECK-LABEL: define void @test_vst1q_lane_u32(i32* %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   store i32 [[TMP3]], i32* [[TMP4]], align 4
+// CHECK:   ret void
 void test_vst1q_lane_u32(uint32_t * a, uint32x4_t b) {
   vst1q_lane_u32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vst1q_lane_u64
-// CHECK: {{str|vstr|vmov}}
+// CHECK-LABEL: define void @test_vst1q_lane_u64(i64* %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP2]], <1 x i32> <i32 1>
+// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v1i64(i8* [[TMP0]], <1 x i64> [[TMP3]], i32 4)
+// CHECK:   ret void
 void test_vst1q_lane_u64(uint64_t * a, uint64x2_t b) {
   vst1q_lane_u64(a, b, 1);
 }
 
-// CHECK-LABEL: test_vst1q_lane_s8
-// CHECK: vst1.8 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst1q_lane_s8(i8* %a, <16 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
+// CHECK:   store i8 [[TMP0]], i8* %a, align 1
+// CHECK:   ret void
 void test_vst1q_lane_s8(int8_t * a, int8x16_t b) {
   vst1q_lane_s8(a, b, 15);
 }
 
-// CHECK-LABEL: test_vst1q_lane_s16
-// CHECK: vst1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: define void @test_vst1q_lane_s16(i16* %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   store i16 [[TMP3]], i16* [[TMP4]], align 2
+// CHECK:   ret void
 void test_vst1q_lane_s16(int16_t * a, int16x8_t b) {
   vst1q_lane_s16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vst1q_lane_s32
-// CHECK: vst1.32 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:32]
+// CHECK-LABEL: define void @test_vst1q_lane_s32(i32* %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   store i32 [[TMP3]], i32* [[TMP4]], align 4
+// CHECK:   ret void
 void test_vst1q_lane_s32(int32_t * a, int32x4_t b) {
   vst1q_lane_s32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vst1q_lane_s64
-// CHECK: {{str|vstr|vmov}}
+// CHECK-LABEL: define void @test_vst1q_lane_s64(i64* %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP2]], <1 x i32> <i32 1>
+// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v1i64(i8* [[TMP0]], <1 x i64> [[TMP3]], i32 4)
+// CHECK:   ret void
 void test_vst1q_lane_s64(int64_t * a, int64x2_t b) {
   vst1q_lane_s64(a, b, 1);
 }
 
-// CHECK-LABEL: test_vst1q_lane_f16
-// CHECK: vst1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: define void @test_vst1q_lane_f16(half* %a, <8 x half> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   store i16 [[TMP3]], i16* [[TMP4]], align 2
+// CHECK:   ret void
 void test_vst1q_lane_f16(float16_t * a, float16x8_t b) {
   vst1q_lane_f16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vst1q_lane_f32
-// CHECK: vst1.32 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:32]
+// CHECK-LABEL: define void @test_vst1q_lane_f32(float* %a, <4 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to float*
+// CHECK:   store float [[TMP3]], float* [[TMP4]], align 4
+// CHECK:   ret void
 void test_vst1q_lane_f32(float32_t * a, float32x4_t b) {
   vst1q_lane_f32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vst1q_lane_p8
-// CHECK: vst1.8 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst1q_lane_p8(i8* %a, <16 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = extractelement <16 x i8> %b, i32 15
+// CHECK:   store i8 [[TMP0]], i8* %a, align 1
+// CHECK:   ret void
 void test_vst1q_lane_p8(poly8_t * a, poly8x16_t b) {
   vst1q_lane_p8(a, b, 15);
 }
 
-// CHECK-LABEL: test_vst1q_lane_p16
-// CHECK: vst1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: define void @test_vst1q_lane_p16(i16* %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   store i16 [[TMP3]], i16* [[TMP4]], align 2
+// CHECK:   ret void
 void test_vst1q_lane_p16(poly16_t * a, poly16x8_t b) {
   vst1q_lane_p16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vst1_lane_u8
-// CHECK: vst1.8 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst1_lane_u8(i8* %a, <8 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
+// CHECK:   store i8 [[TMP0]], i8* %a, align 1
+// CHECK:   ret void
 void test_vst1_lane_u8(uint8_t * a, uint8x8_t b) {
   vst1_lane_u8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vst1_lane_u16
-// CHECK: vst1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: define void @test_vst1_lane_u16(i16* %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   store i16 [[TMP3]], i16* [[TMP4]], align 2
+// CHECK:   ret void
 void test_vst1_lane_u16(uint16_t * a, uint16x4_t b) {
   vst1_lane_u16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vst1_lane_u32
-// CHECK: vst1.32 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:32]
+// CHECK-LABEL: define void @test_vst1_lane_u32(i32* %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   store i32 [[TMP3]], i32* [[TMP4]], align 4
+// CHECK:   ret void
 void test_vst1_lane_u32(uint32_t * a, uint32x2_t b) {
   vst1_lane_u32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vst1_lane_u64
-// CHECK: {{str|vstr|vmov}}
+// CHECK-LABEL: define void @test_vst1_lane_u64(i64* %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   store i64 [[TMP3]], i64* [[TMP4]], align 4
+// CHECK:   ret void
 void test_vst1_lane_u64(uint64_t * a, uint64x1_t b) {
   vst1_lane_u64(a, b, 0);
 }
 
-// CHECK-LABEL: test_vst1_lane_s8
-// CHECK: vst1.8 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst1_lane_s8(i8* %a, <8 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
+// CHECK:   store i8 [[TMP0]], i8* %a, align 1
+// CHECK:   ret void
 void test_vst1_lane_s8(int8_t * a, int8x8_t b) {
   vst1_lane_s8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vst1_lane_s16
-// CHECK: vst1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: define void @test_vst1_lane_s16(i16* %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   store i16 [[TMP3]], i16* [[TMP4]], align 2
+// CHECK:   ret void
 void test_vst1_lane_s16(int16_t * a, int16x4_t b) {
   vst1_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vst1_lane_s32
-// CHECK: vst1.32 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:32]
+// CHECK-LABEL: define void @test_vst1_lane_s32(i32* %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i32*
+// CHECK:   store i32 [[TMP3]], i32* [[TMP4]], align 4
+// CHECK:   ret void
 void test_vst1_lane_s32(int32_t * a, int32x2_t b) {
   vst1_lane_s32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vst1_lane_s64
-// CHECK: {{str|vstr|vmov}}
+// CHECK-LABEL: define void @test_vst1_lane_s64(i64* %a, <1 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <1 x i64> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64>
+// CHECK:   [[TMP3:%.*]] = extractelement <1 x i64> [[TMP2]], i32 0
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i64*
+// CHECK:   store i64 [[TMP3]], i64* [[TMP4]], align 4
+// CHECK:   ret void
 void test_vst1_lane_s64(int64_t * a, int64x1_t b) {
   vst1_lane_s64(a, b, 0);
 }
 
-// CHECK-LABEL: test_vst1_lane_f16
-// CHECK: vst1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: define void @test_vst1_lane_f16(half* %a, <4 x half> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   store i16 [[TMP3]], i16* [[TMP4]], align 2
+// CHECK:   ret void
 void test_vst1_lane_f16(float16_t * a, float16x4_t b) {
   vst1_lane_f16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vst1_lane_f32
-// CHECK: vst1.32 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:32]
+// CHECK-LABEL: define void @test_vst1_lane_f32(float* %a, <2 x float> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to float*
+// CHECK:   store float [[TMP3]], float* [[TMP4]], align 4
+// CHECK:   ret void
 void test_vst1_lane_f32(float32_t * a, float32x2_t b) {
   vst1_lane_f32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vst1_lane_p8
-// CHECK: vst1.8 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst1_lane_p8(i8* %a, <8 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = extractelement <8 x i8> %b, i32 7
+// CHECK:   store i8 [[TMP0]], i8* %a, align 1
+// CHECK:   ret void
 void test_vst1_lane_p8(poly8_t * a, poly8x8_t b) {
   vst1_lane_p8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vst1_lane_p16
-// CHECK: vst1.16 {d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}:16]
+// CHECK-LABEL: define void @test_vst1_lane_p16(i16* %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
+// CHECK:   store i16 [[TMP3]], i16* [[TMP4]], align 2
+// CHECK:   ret void
 void test_vst1_lane_p16(poly16_t * a, poly16x4_t b) {
   vst1_lane_p16(a, b, 3);
 }
 
 
-// CHECK-LABEL: test_vst2q_u8
-// CHECK: vst2.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst2q_u8(i8* %a, [4 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <16 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i32 1)
+// CHECK:   ret void
 void test_vst2q_u8(uint8_t * a, uint8x16x2_t b) {
   vst2q_u8(a, b);
 }
 
-// CHECK-LABEL: test_vst2q_u16
-// CHECK: vst2.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst2q_u16(i16* %a, [4 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 2)
+// CHECK:   ret void
 void test_vst2q_u16(uint16_t * a, uint16x8x2_t b) {
   vst2q_u16(a, b);
 }
 
-// CHECK-LABEL: test_vst2q_u32
-// CHECK: vst2.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst2q_u32(i32* %a, [4 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i32 4)
+// CHECK:   ret void
 void test_vst2q_u32(uint32_t * a, uint32x4x2_t b) {
   vst2q_u32(a, b);
 }
 
-// CHECK-LABEL: test_vst2q_s8
-// CHECK: vst2.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst2q_s8(i8* %a, [4 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <16 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i32 1)
+// CHECK:   ret void
 void test_vst2q_s8(int8_t * a, int8x16x2_t b) {
   vst2q_s8(a, b);
 }
 
-// CHECK-LABEL: test_vst2q_s16
-// CHECK: vst2.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst2q_s16(i16* %a, [4 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 2)
+// CHECK:   ret void
 void test_vst2q_s16(int16_t * a, int16x8x2_t b) {
   vst2q_s16(a, b);
 }
 
-// CHECK-LABEL: test_vst2q_s32
-// CHECK: vst2.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst2q_s32(i32* %a, [4 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i32 4)
+// CHECK:   ret void
 void test_vst2q_s32(int32_t * a, int32x4x2_t b) {
   vst2q_s32(a, b);
 }
 
-// CHECK-LABEL: test_vst2q_f16
-// CHECK: vst2.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst2q_f16(half* %a, [4 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x half>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 2)
+// CHECK:   ret void
 void test_vst2q_f16(float16_t * a, float16x8x2_t b) {
   vst2q_f16(a, b);
 }
 
-// CHECK-LABEL: test_vst2q_f32
-// CHECK: vst2.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst2q_f32(float* %a, [4 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x float>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
+// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v4f32(i8* [[TMP3]], <4 x float> [[TMP8]], <4 x float> [[TMP9]], i32 4)
+// CHECK:   ret void
 void test_vst2q_f32(float32_t * a, float32x4x2_t b) {
   vst2q_f32(a, b);
 }
 
-// CHECK-LABEL: test_vst2q_p8
-// CHECK: vst2.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst2q_p8(i8* %a, [4 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <16 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i32 1)
+// CHECK:   ret void
 void test_vst2q_p8(poly8_t * a, poly8x16x2_t b) {
   vst2q_p8(a, b);
 }
 
-// CHECK-LABEL: test_vst2q_p16
-// CHECK: vst2.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst2q_p16(i16* %a, [4 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 2)
+// CHECK:   ret void
 void test_vst2q_p16(poly16_t * a, poly16x8x2_t b) {
   vst2q_p16(a, b);
 }
 
-// CHECK-LABEL: test_vst2_u8
-// CHECK: vst2.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst2_u8(i8* %a, [2 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 1)
+// CHECK:   ret void
 void test_vst2_u8(uint8_t * a, uint8x8x2_t b) {
   vst2_u8(a, b);
 }
 
-// CHECK-LABEL: test_vst2_u16
-// CHECK: vst2.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst2_u16(i16* %a, [2 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i16>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 2)
+// CHECK:   ret void
 void test_vst2_u16(uint16_t * a, uint16x4x2_t b) {
   vst2_u16(a, b);
 }
 
-// CHECK-LABEL: test_vst2_u32
-// CHECK: vst2.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst2_u32(i32* %a, [2 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <2 x i32>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i32 4)
+// CHECK:   ret void
 void test_vst2_u32(uint32_t * a, uint32x2x2_t b) {
   vst2_u32(a, b);
 }
 
-// CHECK-LABEL: test_vst2_u64
-// CHECK: vst1.64
+// CHECK-LABEL: define void @test_vst2_u64(i64* %a, [2 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <1 x i64>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint64x1x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x2_t, %struct.uint64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
+// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v1i64(i8* [[TMP3]], <1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i32 4)
+// CHECK:   ret void
 void test_vst2_u64(uint64_t * a, uint64x1x2_t b) {
   vst2_u64(a, b);
 }
 
-// CHECK-LABEL: test_vst2_s8
-// CHECK: vst2.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst2_s8(i8* %a, [2 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 1)
+// CHECK:   ret void
 void test_vst2_s8(int8_t * a, int8x8x2_t b) {
   vst2_s8(a, b);
 }
 
-// CHECK-LABEL: test_vst2_s16
-// CHECK: vst2.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst2_s16(i16* %a, [2 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i16>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 2)
+// CHECK:   ret void
 void test_vst2_s16(int16_t * a, int16x4x2_t b) {
   vst2_s16(a, b);
 }
 
-// CHECK-LABEL: test_vst2_s32
-// CHECK: vst2.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst2_s32(i32* %a, [2 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <2 x i32>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i32 4)
+// CHECK:   ret void
 void test_vst2_s32(int32_t * a, int32x2x2_t b) {
   vst2_s32(a, b);
 }
 
-// CHECK-LABEL: test_vst2_s64
-// CHECK: vst1.64
+// CHECK-LABEL: define void @test_vst2_s64(i64* %a, [2 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <1 x i64>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int64x1x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x2_t, %struct.int64x1x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <1 x i64>], [2 x <1 x i64>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
+// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v1i64(i8* [[TMP3]], <1 x i64> [[TMP8]], <1 x i64> [[TMP9]], i32 4)
+// CHECK:   ret void
 void test_vst2_s64(int64_t * a, int64x1x2_t b) {
   vst2_s64(a, b);
 }
 
-// CHECK-LABEL: test_vst2_f16
-// CHECK: vst2.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst2_f16(half* %a, [2 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x half>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 2)
+// CHECK:   ret void
 void test_vst2_f16(float16_t * a, float16x4x2_t b) {
   vst2_f16(a, b);
 }
 
-// CHECK-LABEL: test_vst2_f32
-// CHECK: vst2.32 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst2_f32(float* %a, [2 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <2 x float>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
+// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v2f32(i8* [[TMP3]], <2 x float> [[TMP8]], <2 x float> [[TMP9]], i32 4)
+// CHECK:   ret void
 void test_vst2_f32(float32_t * a, float32x2x2_t b) {
   vst2_f32(a, b);
 }
 
-// CHECK-LABEL: test_vst2_p8
-// CHECK: vst2.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst2_p8(i8* %a, [2 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 1)
+// CHECK:   ret void
 void test_vst2_p8(poly8_t * a, poly8x8x2_t b) {
   vst2_p8(a, b);
 }
 
-// CHECK-LABEL: test_vst2_p16
-// CHECK: vst2.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst2_p16(i16* %a, [2 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i16>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 2)
+// CHECK:   ret void
 void test_vst2_p16(poly16_t * a, poly16x4x2_t b) {
   vst2_p16(a, b);
 }
 
 
-// CHECK-LABEL: test_vst2q_lane_u16
-// CHECK: vst2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst2q_lane_u16(i16* %a, [4 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 7, i32 2)
+// CHECK:   ret void
 void test_vst2q_lane_u16(uint16_t * a, uint16x8x2_t b) {
   vst2q_lane_u16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vst2q_lane_u32
-// CHECK: vst2.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst2q_lane_u32(i32* %a, [4 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i32 3, i32 4)
+// CHECK:   ret void
 void test_vst2q_lane_u32(uint32_t * a, uint32x4x2_t b) {
   vst2q_lane_u32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vst2q_lane_s16
-// CHECK: vst2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst2q_lane_s16(i16* %a, [4 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 7, i32 2)
+// CHECK:   ret void
 void test_vst2q_lane_s16(int16_t * a, int16x8x2_t b) {
   vst2q_lane_s16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vst2q_lane_s32
-// CHECK: vst2.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst2q_lane_s32(i32* %a, [4 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], i32 3, i32 4)
+// CHECK:   ret void
 void test_vst2q_lane_s32(int32_t * a, int32x4x2_t b) {
   vst2q_lane_s32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vst2q_lane_f16
-// CHECK: vst2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst2q_lane_f16(half* %a, [4 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x half>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 7, i32 2)
+// CHECK:   ret void
 void test_vst2q_lane_f16(float16_t * a, float16x8x2_t b) {
   vst2q_lane_f16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vst2q_lane_f32
-// CHECK: vst2.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst2q_lane_f32(float* %a, [4 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x float>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
+// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v4f32(i8* [[TMP3]], <4 x float> [[TMP8]], <4 x float> [[TMP9]], i32 3, i32 4)
+// CHECK:   ret void
 void test_vst2q_lane_f32(float32_t * a, float32x4x2_t b) {
   vst2q_lane_f32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vst2q_lane_p16
-// CHECK: vst2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst2q_lane_p16(i16* %a, [4 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 7, i32 2)
+// CHECK:   ret void
 void test_vst2q_lane_p16(poly16_t * a, poly16x8x2_t b) {
   vst2q_lane_p16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vst2_lane_u8
-// CHECK: vst2.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst2_lane_u8(i8* %a, [2 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 7, i32 1)
+// CHECK:   ret void
 void test_vst2_lane_u8(uint8_t * a, uint8x8x2_t b) {
   vst2_lane_u8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vst2_lane_u16
-// CHECK: vst2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst2_lane_u16(i16* %a, [2 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i16>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x2_t, %struct.uint16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 3, i32 2)
+// CHECK:   ret void
 void test_vst2_lane_u16(uint16_t * a, uint16x4x2_t b) {
   vst2_lane_u16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vst2_lane_u32
-// CHECK: vst2.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst2_lane_u32(i32* %a, [2 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <2 x i32>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x2_t, %struct.uint32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i32 1, i32 4)
+// CHECK:   ret void
 void test_vst2_lane_u32(uint32_t * a, uint32x2x2_t b) {
   vst2_lane_u32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vst2_lane_s8
-// CHECK: vst2.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst2_lane_s8(i8* %a, [2 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 7, i32 1)
+// CHECK:   ret void
 void test_vst2_lane_s8(int8_t * a, int8x8x2_t b) {
   vst2_lane_s8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vst2_lane_s16
-// CHECK: vst2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst2_lane_s16(i16* %a, [2 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i16>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x2_t, %struct.int16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 3, i32 2)
+// CHECK:   ret void
 void test_vst2_lane_s16(int16_t * a, int16x4x2_t b) {
   vst2_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vst2_lane_s32
-// CHECK: vst2.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst2_lane_s32(i32* %a, [2 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <2 x i32>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x2_t, %struct.int32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x i32>], [2 x <2 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], i32 1, i32 4)
+// CHECK:   ret void
 void test_vst2_lane_s32(int32_t * a, int32x2x2_t b) {
   vst2_lane_s32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vst2_lane_f16
-// CHECK: vst2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst2_lane_f16(half* %a, [2 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x half>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x2_t, %struct.float16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 3, i32 2)
+// CHECK:   ret void
 void test_vst2_lane_f16(float16_t * a, float16x4x2_t b) {
   vst2_lane_f16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vst2_lane_f32
-// CHECK: vst2.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst2_lane_f32(float* %a, [2 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <2 x float>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x2_t, %struct.float32x2x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <2 x float>], [2 x <2 x float>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
+// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v2f32(i8* [[TMP3]], <2 x float> [[TMP8]], <2 x float> [[TMP9]], i32 1, i32 4)
+// CHECK:   ret void
 void test_vst2_lane_f32(float32_t * a, float32x2x2_t b) {
   vst2_lane_f32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vst2_lane_p8
-// CHECK: vst2.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst2_lane_p8(i8* %a, [2 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], i32 7, i32 1)
+// CHECK:   ret void
 void test_vst2_lane_p8(poly8_t * a, poly8x8x2_t b) {
   vst2_lane_p8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vst2_lane_p16
-// CHECK: vst2.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst2_lane_p16(i16* %a, [2 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i16>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x2_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x2_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 16, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x2_t, %struct.poly16x4x2_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i16>], [2 x <4 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 3, i32 2)
+// CHECK:   ret void
 void test_vst2_lane_p16(poly16_t * a, poly16x4x2_t b) {
   vst2_lane_p16(a, b, 3);
 }
 
 
-// CHECK-LABEL: test_vst3q_u8
-// CHECK: vst3.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define void @test_vst3q_u8(i8* %a, [6 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <16 x i8>]* [[COERCE_DIVE]] to [6 x i64]*
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i32 1)
+// CHECK:   ret void
 void test_vst3q_u8(uint8_t * a, uint8x16x3_t b) {
   vst3q_u8(a, b);
 }
 
-// CHECK-LABEL: test_vst3q_u16
-// CHECK: vst3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define void @test_vst3q_u16(i16* %a, [6 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 2)
+// CHECK:   ret void
 void test_vst3q_u16(uint16_t * a, uint16x8x3_t b) {
   vst3q_u16(a, b);
 }
 
-// CHECK-LABEL: test_vst3q_u32
-// CHECK: vst3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define void @test_vst3q_u32(i32* %a, [6 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i32>]* [[COERCE_DIVE]] to [6 x i64]*
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
+// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i32 4)
+// CHECK:   ret void
 void test_vst3q_u32(uint32_t * a, uint32x4x3_t b) {
   vst3q_u32(a, b);
 }
 
-// CHECK-LABEL: test_vst3q_s8
-// CHECK: vst3.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define void @test_vst3q_s8(i8* %a, [6 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <16 x i8>]* [[COERCE_DIVE]] to [6 x i64]*
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i32 1)
+// CHECK:   ret void
 void test_vst3q_s8(int8_t * a, int8x16x3_t b) {
   vst3q_s8(a, b);
 }
 
-// CHECK-LABEL: test_vst3q_s16
-// CHECK: vst3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define void @test_vst3q_s16(i16* %a, [6 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 2)
+// CHECK:   ret void
 void test_vst3q_s16(int16_t * a, int16x8x3_t b) {
   vst3q_s16(a, b);
 }
 
-// CHECK-LABEL: test_vst3q_s32
-// CHECK: vst3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define void @test_vst3q_s32(i32* %a, [6 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i32>]* [[COERCE_DIVE]] to [6 x i64]*
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
+// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i32 4)
+// CHECK:   ret void
 void test_vst3q_s32(int32_t * a, int32x4x3_t b) {
   vst3q_s32(a, b);
 }
 
-// CHECK-LABEL: test_vst3q_f16
-// CHECK: vst3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define void @test_vst3q_f16(half* %a, [6 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x half>]* [[COERCE_DIVE]] to [6 x i64]*
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 2)
+// CHECK:   ret void
 void test_vst3q_f16(float16_t * a, float16x8x3_t b) {
   vst3q_f16(a, b);
 }
 
-// CHECK-LABEL: test_vst3q_f32
-// CHECK: vst3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define void @test_vst3q_f32(float* %a, [6 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x float>]* [[COERCE_DIVE]] to [6 x i64]*
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
+// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v4f32(i8* [[TMP3]], <4 x float> [[TMP10]], <4 x float> [[TMP11]], <4 x float> [[TMP12]], i32 4)
+// CHECK:   ret void
 void test_vst3q_f32(float32_t * a, float32x4x3_t b) {
   vst3q_f32(a, b);
 }
 
-// CHECK-LABEL: test_vst3q_p8
-// CHECK: vst3.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define void @test_vst3q_p8(i8* %a, [6 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <16 x i8>]* [[COERCE_DIVE]] to [6 x i64]*
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i32 1)
+// CHECK:   ret void
 void test_vst3q_p8(poly8_t * a, poly8x16x3_t b) {
   vst3q_p8(a, b);
 }
 
-// CHECK-LABEL: test_vst3q_p16
-// CHECK: vst3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define void @test_vst3q_p16(i16* %a, [6 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 2)
+// CHECK:   ret void
 void test_vst3q_p16(poly16_t * a, poly16x8x3_t b) {
   vst3q_p16(a, b);
 }
 
-// CHECK-LABEL: test_vst3_u8
-// CHECK: vst3.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst3_u8(i8* %a, [3 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 1)
+// CHECK:   ret void
 void test_vst3_u8(uint8_t * a, uint8x8x3_t b) {
   vst3_u8(a, b);
 }
 
-// CHECK-LABEL: test_vst3_u16
-// CHECK: vst3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst3_u16(i16* %a, [3 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i16>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 2)
+// CHECK:   ret void
 void test_vst3_u16(uint16_t * a, uint16x4x3_t b) {
   vst3_u16(a, b);
 }
 
-// CHECK-LABEL: test_vst3_u32
-// CHECK: vst3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst3_u32(i32* %a, [3 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <2 x i32>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
+// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i32 4)
+// CHECK:   ret void
 void test_vst3_u32(uint32_t * a, uint32x2x3_t b) {
   vst3_u32(a, b);
 }
 
-// CHECK-LABEL: test_vst3_u64
-// CHECK: vst1.64
+// CHECK-LABEL: define void @test_vst3_u64(i64* %a, [3 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <1 x i64>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint64x1x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x3_t, %struct.uint64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
+// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v1i64(i8* [[TMP3]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i32 4)
+// CHECK:   ret void
 void test_vst3_u64(uint64_t * a, uint64x1x3_t b) {
   vst3_u64(a, b);
 }
 
-// CHECK-LABEL: test_vst3_s8
-// CHECK: vst3.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst3_s8(i8* %a, [3 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 1)
+// CHECK:   ret void
 void test_vst3_s8(int8_t * a, int8x8x3_t b) {
   vst3_s8(a, b);
 }
 
-// CHECK-LABEL: test_vst3_s16
-// CHECK: vst3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst3_s16(i16* %a, [3 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i16>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 2)
+// CHECK:   ret void
 void test_vst3_s16(int16_t * a, int16x4x3_t b) {
   vst3_s16(a, b);
 }
 
-// CHECK-LABEL: test_vst3_s32
-// CHECK: vst3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst3_s32(i32* %a, [3 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <2 x i32>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
+// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i32 4)
+// CHECK:   ret void
 void test_vst3_s32(int32_t * a, int32x2x3_t b) {
   vst3_s32(a, b);
 }
 
-// CHECK-LABEL: test_vst3_s64
-// CHECK: vst1.64
+// CHECK-LABEL: define void @test_vst3_s64(i64* %a, [3 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <1 x i64>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int64x1x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x3_t, %struct.int64x1x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <1 x i64>], [3 x <1 x i64>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
+// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v1i64(i8* [[TMP3]], <1 x i64> [[TMP10]], <1 x i64> [[TMP11]], <1 x i64> [[TMP12]], i32 4)
+// CHECK:   ret void
 void test_vst3_s64(int64_t * a, int64x1x3_t b) {
   vst3_s64(a, b);
 }
 
-// CHECK-LABEL: test_vst3_f16
-// CHECK: vst3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst3_f16(half* %a, [3 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x half>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 2)
+// CHECK:   ret void
 void test_vst3_f16(float16_t * a, float16x4x3_t b) {
   vst3_f16(a, b);
 }
 
-// CHECK-LABEL: test_vst3_f32
-// CHECK: vst3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst3_f32(float* %a, [3 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <2 x float>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
+// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v2f32(i8* [[TMP3]], <2 x float> [[TMP10]], <2 x float> [[TMP11]], <2 x float> [[TMP12]], i32 4)
+// CHECK:   ret void
 void test_vst3_f32(float32_t * a, float32x2x3_t b) {
   vst3_f32(a, b);
 }
 
-// CHECK-LABEL: test_vst3_p8
-// CHECK: vst3.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst3_p8(i8* %a, [3 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 1)
+// CHECK:   ret void
 void test_vst3_p8(poly8_t * a, poly8x8x3_t b) {
   vst3_p8(a, b);
 }
 
-// CHECK-LABEL: test_vst3_p16
-// CHECK: vst3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst3_p16(i16* %a, [3 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i16>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 2)
+// CHECK:   ret void
 void test_vst3_p16(poly16_t * a, poly16x4x3_t b) {
   vst3_p16(a, b);
 }
 
 
-// CHECK-LABEL: test_vst3q_lane_u16
-// CHECK: vst3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: define void @test_vst3q_lane_u16(i16* %a, [6 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 7, i32 2)
+// CHECK:   ret void
 void test_vst3q_lane_u16(uint16_t * a, uint16x8x3_t b) {
   vst3q_lane_u16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vst3q_lane_u32
-// CHECK: vst3.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: define void @test_vst3q_lane_u32(i32* %a, [6 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i32>]* [[COERCE_DIVE]] to [6 x i64]*
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
+// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i32 3, i32 4)
+// CHECK:   ret void
 void test_vst3q_lane_u32(uint32_t * a, uint32x4x3_t b) {
   vst3q_lane_u32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vst3q_lane_s16
-// CHECK: vst3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: define void @test_vst3q_lane_s16(i16* %a, [6 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 7, i32 2)
+// CHECK:   ret void
 void test_vst3q_lane_s16(int16_t * a, int16x8x3_t b) {
   vst3q_lane_s16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vst3q_lane_s32
-// CHECK: vst3.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: define void @test_vst3q_lane_s32(i32* %a, [6 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i32>]* [[COERCE_DIVE]] to [6 x i64]*
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
+// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i32 3, i32 4)
+// CHECK:   ret void
 void test_vst3q_lane_s32(int32_t * a, int32x4x3_t b) {
   vst3q_lane_s32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vst3q_lane_f16
-// CHECK: vst3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: define void @test_vst3q_lane_f16(half* %a, [6 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x half>]* [[COERCE_DIVE]] to [6 x i64]*
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 7, i32 2)
+// CHECK:   ret void
 void test_vst3q_lane_f16(float16_t * a, float16x8x3_t b) {
   vst3q_lane_f16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vst3q_lane_f32
-// CHECK: vst3.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: define void @test_vst3q_lane_f32(float* %a, [6 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x float>]* [[COERCE_DIVE]] to [6 x i64]*
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
+// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v4f32(i8* [[TMP3]], <4 x float> [[TMP10]], <4 x float> [[TMP11]], <4 x float> [[TMP12]], i32 3, i32 4)
+// CHECK:   ret void
 void test_vst3q_lane_f32(float32_t * a, float32x4x3_t b) {
   vst3q_lane_f32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vst3q_lane_p16
-// CHECK: vst3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: define void @test_vst3q_lane_p16(i16* %a, [6 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 7, i32 2)
+// CHECK:   ret void
 void test_vst3q_lane_p16(poly16_t * a, poly16x8x3_t b) {
   vst3q_lane_p16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vst3_lane_u8
-// CHECK: vst3.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst3_lane_u8(i8* %a, [3 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 7, i32 1)
+// CHECK:   ret void
 void test_vst3_lane_u8(uint8_t * a, uint8x8x3_t b) {
   vst3_lane_u8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vst3_lane_u16
-// CHECK: vst3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst3_lane_u16(i16* %a, [3 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i16>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x3_t, %struct.uint16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 3, i32 2)
+// CHECK:   ret void
 void test_vst3_lane_u16(uint16_t * a, uint16x4x3_t b) {
   vst3_lane_u16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vst3_lane_u32
-// CHECK: vst3.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst3_lane_u32(i32* %a, [3 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <2 x i32>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x3_t, %struct.uint32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
+// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i32 1, i32 4)
+// CHECK:   ret void
 void test_vst3_lane_u32(uint32_t * a, uint32x2x3_t b) {
   vst3_lane_u32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vst3_lane_s8
-// CHECK: vst3.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst3_lane_s8(i8* %a, [3 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 7, i32 1)
+// CHECK:   ret void
 void test_vst3_lane_s8(int8_t * a, int8x8x3_t b) {
   vst3_lane_s8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vst3_lane_s16
-// CHECK: vst3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst3_lane_s16(i16* %a, [3 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i16>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x3_t, %struct.int16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 3, i32 2)
+// CHECK:   ret void
 void test_vst3_lane_s16(int16_t * a, int16x4x3_t b) {
   vst3_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vst3_lane_s32
-// CHECK: vst3.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst3_lane_s32(i32* %a, [3 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <2 x i32>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x3_t, %struct.int32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x i32>], [3 x <2 x i32>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
+// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], i32 1, i32 4)
+// CHECK:   ret void
 void test_vst3_lane_s32(int32_t * a, int32x2x3_t b) {
   vst3_lane_s32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vst3_lane_f16
-// CHECK: vst3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst3_lane_f16(half* %a, [3 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x half>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x3_t, %struct.float16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 3, i32 2)
+// CHECK:   ret void
 void test_vst3_lane_f16(float16_t * a, float16x4x3_t b) {
   vst3_lane_f16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vst3_lane_f32
-// CHECK: vst3.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst3_lane_f32(float* %a, [3 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <2 x float>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x3_t, %struct.float32x2x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <2 x float>], [3 x <2 x float>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
+// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v2f32(i8* [[TMP3]], <2 x float> [[TMP10]], <2 x float> [[TMP11]], <2 x float> [[TMP12]], i32 1, i32 4)
+// CHECK:   ret void
 void test_vst3_lane_f32(float32_t * a, float32x2x3_t b) {
   vst3_lane_f32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vst3_lane_p8
-// CHECK: vst3.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst3_lane_p8(i8* %a, [3 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], i32 7, i32 1)
+// CHECK:   ret void
 void test_vst3_lane_p8(poly8_t * a, poly8x8x3_t b) {
   vst3_lane_p8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vst3_lane_p16
-// CHECK: vst3.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst3_lane_p16(i16* %a, [3 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i16>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x3_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x3_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 24, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x3_t, %struct.poly16x4x3_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i16>], [3 x <4 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 3, i32 2)
+// CHECK:   ret void
 void test_vst3_lane_p16(poly16_t * a, poly16x4x3_t b) {
   vst3_lane_p16(a, b, 3);
 }
 
 
-// CHECK-LABEL: test_vst4q_u8
-// CHECK: vst4.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define void @test_vst4q_u8(i8* %a, [8 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <16 x i8>]* [[COERCE_DIVE]] to [8 x i64]*
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i32 1)
+// CHECK:   ret void
 void test_vst4q_u8(uint8_t * a, uint8x16x4_t b) {
   vst4q_u8(a, b);
 }
 
-// CHECK-LABEL: test_vst4q_u16
-// CHECK: vst4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define void @test_vst4q_u16(i16* %a, [8 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
+// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 2)
+// CHECK:   ret void
 void test_vst4q_u16(uint16_t * a, uint16x8x4_t b) {
   vst4q_u16(a, b);
 }
 
-// CHECK-LABEL: test_vst4q_u32
-// CHECK: vst4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define void @test_vst4q_u32(i32* %a, [8 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i32>]* [[COERCE_DIVE]] to [8 x i64]*
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
+// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i32 4)
+// CHECK:   ret void
 void test_vst4q_u32(uint32_t * a, uint32x4x4_t b) {
   vst4q_u32(a, b);
 }
 
-// CHECK-LABEL: test_vst4q_s8
-// CHECK: vst4.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define void @test_vst4q_s8(i8* %a, [8 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <16 x i8>]* [[COERCE_DIVE]] to [8 x i64]*
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i32 1)
+// CHECK:   ret void
 void test_vst4q_s8(int8_t * a, int8x16x4_t b) {
   vst4q_s8(a, b);
 }
 
-// CHECK-LABEL: test_vst4q_s16
-// CHECK: vst4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define void @test_vst4q_s16(i16* %a, [8 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
+// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 2)
+// CHECK:   ret void
 void test_vst4q_s16(int16_t * a, int16x8x4_t b) {
   vst4q_s16(a, b);
 }
 
-// CHECK-LABEL: test_vst4q_s32
-// CHECK: vst4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define void @test_vst4q_s32(i32* %a, [8 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i32>]* [[COERCE_DIVE]] to [8 x i64]*
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
+// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i32 4)
+// CHECK:   ret void
 void test_vst4q_s32(int32_t * a, int32x4x4_t b) {
   vst4q_s32(a, b);
 }
 
-// CHECK-LABEL: test_vst4q_f16
-// CHECK: vst4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define void @test_vst4q_f16(half* %a, [8 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x half>]* [[COERCE_DIVE]] to [8 x i64]*
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x half> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
+// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 2)
+// CHECK:   ret void
 void test_vst4q_f16(float16_t * a, float16x8x4_t b) {
   vst4q_f16(a, b);
 }
 
-// CHECK-LABEL: test_vst4q_f32
-// CHECK: vst4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define void @test_vst4q_f32(float* %a, [8 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x float>]* [[COERCE_DIVE]] to [8 x i64]*
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <4 x float> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x float>
+// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v4f32(i8* [[TMP3]], <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x float> [[TMP15]], i32 4)
+// CHECK:   ret void
 void test_vst4q_f32(float32_t * a, float32x4x4_t b) {
   vst4q_f32(a, b);
 }
 
-// CHECK-LABEL: test_vst4q_p8
-// CHECK: vst4.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define void @test_vst4q_p8(i8* %a, [8 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <16 x i8>]* [[COERCE_DIVE]] to [8 x i64]*
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i32 1)
+// CHECK:   ret void
 void test_vst4q_p8(poly8_t * a, poly8x16x4_t b) {
   vst4q_p8(a, b);
 }
 
-// CHECK-LABEL: test_vst4q_p16
-// CHECK: vst4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
+// CHECK-LABEL: define void @test_vst4q_p16(i16* %a, [8 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
+// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 2)
+// CHECK:   ret void
 void test_vst4q_p16(poly16_t * a, poly16x8x4_t b) {
   vst4q_p16(a, b);
 }
 
-// CHECK-LABEL: test_vst4_u8
-// CHECK: vst4.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_u8(i8* %a, [4 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 1)
+// CHECK:   ret void
 void test_vst4_u8(uint8_t * a, uint8x8x4_t b) {
   vst4_u8(a, b);
 }
 
-// CHECK-LABEL: test_vst4_u16
-// CHECK: vst4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_u16(i16* %a, [4 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
+// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 2)
+// CHECK:   ret void
 void test_vst4_u16(uint16_t * a, uint16x4x4_t b) {
   vst4_u16(a, b);
 }
 
-// CHECK-LABEL: test_vst4_u32
-// CHECK: vst4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_u32(i32* %a, [4 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <2 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32>
+// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i32 4)
+// CHECK:   ret void
 void test_vst4_u32(uint32_t * a, uint32x2x4_t b) {
   vst4_u32(a, b);
 }
 
-// CHECK-LABEL: test_vst4_u64
-// CHECK: vst1.64
+// CHECK-LABEL: define void @test_vst4_u64(i64* %a, [4 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint64x1x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <1 x i64>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint64x1x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint64x1x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint64x1x4_t, %struct.uint64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
+// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v1i64(i8* [[TMP3]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], <1 x i64> [[TMP15]], i32 4)
+// CHECK:   ret void
 void test_vst4_u64(uint64_t * a, uint64x1x4_t b) {
   vst4_u64(a, b);
 }
 
-// CHECK-LABEL: test_vst4_s8
-// CHECK: vst4.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_s8(i8* %a, [4 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 1)
+// CHECK:   ret void
 void test_vst4_s8(int8_t * a, int8x8x4_t b) {
   vst4_s8(a, b);
 }
 
-// CHECK-LABEL: test_vst4_s16
-// CHECK: vst4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_s16(i16* %a, [4 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
+// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 2)
+// CHECK:   ret void
 void test_vst4_s16(int16_t * a, int16x4x4_t b) {
   vst4_s16(a, b);
 }
 
-// CHECK-LABEL: test_vst4_s32
-// CHECK: vst4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_s32(i32* %a, [4 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <2 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32>
+// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i32 4)
+// CHECK:   ret void
 void test_vst4_s32(int32_t * a, int32x2x4_t b) {
   vst4_s32(a, b);
 }
 
-// CHECK-LABEL: test_vst4_s64
-// CHECK: vst1.64
+// CHECK-LABEL: define void @test_vst4_s64(i64* %a, [4 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int64x1x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <1 x i64>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int64x1x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int64x1x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i64* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <1 x i64> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <1 x i64> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int64x1x4_t, %struct.int64x1x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <1 x i64>], [4 x <1 x i64>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <1 x i64>, <1 x i64>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <1 x i64>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <1 x i64>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <1 x i64>
+// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v1i64(i8* [[TMP3]], <1 x i64> [[TMP12]], <1 x i64> [[TMP13]], <1 x i64> [[TMP14]], <1 x i64> [[TMP15]], i32 4)
+// CHECK:   ret void
 void test_vst4_s64(int64_t * a, int64x1x4_t b) {
   vst4_s64(a, b);
 }
 
-// CHECK-LABEL: test_vst4_f16
-// CHECK: vst4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_f16(half* %a, [4 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x half>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <4 x half> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
+// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 2)
+// CHECK:   ret void
 void test_vst4_f16(float16_t * a, float16x4x4_t b) {
   vst4_f16(a, b);
 }
 
-// CHECK-LABEL: test_vst4_f32
-// CHECK: vst4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_f32(float* %a, [4 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <2 x float>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <2 x float> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x float>
+// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v2f32(i8* [[TMP3]], <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], <2 x float> [[TMP15]], i32 4)
+// CHECK:   ret void
 void test_vst4_f32(float32_t * a, float32x2x4_t b) {
   vst4_f32(a, b);
 }
 
-// CHECK-LABEL: test_vst4_p8
-// CHECK: vst4.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_p8(i8* %a, [4 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 1)
+// CHECK:   ret void
 void test_vst4_p8(poly8_t * a, poly8x8x4_t b) {
   vst4_p8(a, b);
 }
 
-// CHECK-LABEL: test_vst4_p16
-// CHECK: vst4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_p16(i16* %a, [4 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
+// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 2)
+// CHECK:   ret void
 void test_vst4_p16(poly16_t * a, poly16x4x4_t b) {
   vst4_p16(a, b);
 }
 
 
-// CHECK-LABEL: test_vst4q_lane_u16
-// CHECK: vst4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: define void @test_vst4q_lane_u16(i16* %a, [8 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
+// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 7, i32 2)
+// CHECK:   ret void
 void test_vst4q_lane_u16(uint16_t * a, uint16x8x4_t b) {
   vst4q_lane_u16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vst4q_lane_u32
-// CHECK: vst4.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: define void @test_vst4q_lane_u32(i32* %a, [8 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i32>]* [[COERCE_DIVE]] to [8 x i64]*
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
+// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i32 3, i32 4)
+// CHECK:   ret void
 void test_vst4q_lane_u32(uint32_t * a, uint32x4x4_t b) {
   vst4q_lane_u32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vst4q_lane_s16
-// CHECK: vst4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: define void @test_vst4q_lane_s16(i16* %a, [8 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
+// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 7, i32 2)
+// CHECK:   ret void
 void test_vst4q_lane_s16(int16_t * a, int16x8x4_t b) {
   vst4q_lane_s16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vst4q_lane_s32
-// CHECK: vst4.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: define void @test_vst4q_lane_s32(i32* %a, [8 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i32>]* [[COERCE_DIVE]] to [8 x i64]*
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x i32>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x i32>
+// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v4i32(i8* [[TMP3]], <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> [[TMP14]], <4 x i32> [[TMP15]], i32 3, i32 4)
+// CHECK:   ret void
 void test_vst4q_lane_s32(int32_t * a, int32x4x4_t b) {
   vst4q_lane_s32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vst4q_lane_f16
-// CHECK: vst4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: define void @test_vst4q_lane_f16(half* %a, [8 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x half>]* [[COERCE_DIVE]] to [8 x i64]*
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x half> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
+// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 7, i32 2)
+// CHECK:   ret void
 void test_vst4q_lane_f16(float16_t * a, float16x8x4_t b) {
   vst4q_lane_f16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vst4q_lane_f32
-// CHECK: vst4.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: define void @test_vst4q_lane_f32(float* %a, [8 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x float>]* [[COERCE_DIVE]] to [8 x i64]*
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <4 x float> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <4 x float>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <4 x float>
+// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v4f32(i8* [[TMP3]], <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x float> [[TMP15]], i32 3, i32 4)
+// CHECK:   ret void
 void test_vst4q_lane_f32(float32_t * a, float32x4x4_t b) {
   vst4q_lane_f32(a, b, 3);
 }
 
-// CHECK-LABEL: test_vst4q_lane_p16
-// CHECK: vst4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}
+// CHECK-LABEL: define void @test_vst4q_lane_p16(i16* %a, [8 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
+// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 7, i32 2)
+// CHECK:   ret void
 void test_vst4q_lane_p16(poly16_t * a, poly16x8x4_t b) {
   vst4q_lane_p16(a, b, 7);
 }
 
-// CHECK-LABEL: test_vst4_lane_u8
-// CHECK: vst4.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_lane_u8(i8* %a, [4 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 7, i32 1)
+// CHECK:   ret void
 void test_vst4_lane_u8(uint8_t * a, uint8x8x4_t b) {
   vst4_lane_u8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vst4_lane_u16
-// CHECK: vst4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_lane_u16(i16* %a, [4 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x4x4_t, %struct.uint16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
+// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 3, i32 2)
+// CHECK:   ret void
 void test_vst4_lane_u16(uint16_t * a, uint16x4x4_t b) {
   vst4_lane_u16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vst4_lane_u32
-// CHECK: vst4.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_lane_u32(i32* %a, [4 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x2x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <2 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x2x4_t, %struct.uint32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32>
+// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i32 1, i32 4)
+// CHECK:   ret void
 void test_vst4_lane_u32(uint32_t * a, uint32x2x4_t b) {
   vst4_lane_u32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vst4_lane_s8
-// CHECK: vst4.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_lane_s8(i8* %a, [4 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 7, i32 1)
+// CHECK:   ret void
 void test_vst4_lane_s8(int8_t * a, int8x8x4_t b) {
   vst4_lane_s8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vst4_lane_s16
-// CHECK: vst4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_lane_s16(i16* %a, [4 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x4x4_t, %struct.int16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
+// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 3, i32 2)
+// CHECK:   ret void
 void test_vst4_lane_s16(int16_t * a, int16x4x4_t b) {
   vst4_lane_s16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vst4_lane_s32
-// CHECK: vst4.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_lane_s32(i32* %a, [4 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x2x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <2 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x i32> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x i32> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x2x4_t, %struct.int32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x i32>], [4 x <2 x i32>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <2 x i32>, <2 x i32>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <2 x i32> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x i32>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x i32>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x i32>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x i32>
+// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v2i32(i8* [[TMP3]], <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <2 x i32> [[TMP15]], i32 1, i32 4)
+// CHECK:   ret void
 void test_vst4_lane_s32(int32_t * a, int32x2x4_t b) {
   vst4_lane_s32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vst4_lane_f16
-// CHECK: vst4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_lane_f16(half* %a, [4 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x half>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x half> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x4x4_t, %struct.float16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <4 x half> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
+// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 3, i32 2)
+// CHECK:   ret void
 void test_vst4_lane_f16(float16_t * a, float16x4x4_t b) {
   vst4_lane_f16(a, b, 3);
 }
 
-// CHECK-LABEL: test_vst4_lane_f32
-// CHECK: vst4.32 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_lane_f32(float* %a, [4 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x2x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <2 x float>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x2x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x2x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <2 x float> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <2 x float> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <2 x float> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x2x4_t, %struct.float32x2x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <2 x float>], [4 x <2 x float>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <2 x float>, <2 x float>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <2 x float> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <2 x float>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <2 x float>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <2 x float>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <2 x float>
+// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v2f32(i8* [[TMP3]], <2 x float> [[TMP12]], <2 x float> [[TMP13]], <2 x float> [[TMP14]], <2 x float> [[TMP15]], i32 1, i32 4)
+// CHECK:   ret void
 void test_vst4_lane_f32(float32_t * a, float32x2x4_t b) {
   vst4_lane_f32(a, b, 1);
 }
 
-// CHECK-LABEL: test_vst4_lane_p8
-// CHECK: vst4.8 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_lane_p8(i8* %a, [4 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x8x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x8x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX]], align 8
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2]], align 8
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4]], align 8
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6]], align 8
+// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v8i8(i8* %a, <8 x i8> [[TMP3]], <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], i32 7, i32 1)
+// CHECK:   ret void
 void test_vst4_lane_p8(poly8_t * a, poly8x8x4_t b) {
   vst4_lane_p8(a, b, 7);
 }
 
-// CHECK-LABEL: test_vst4_lane_p16
-// CHECK: vst4.16 {d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}], d{{[0-9]+}}[{{[0-9]+}}]}, [r{{[0-9]+}}]
+// CHECK-LABEL: define void @test_vst4_lane_p16(i16* %a, [4 x i64] %b.coerce) #0 {
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x4x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x4x4_t* [[__S1]] to i8*
+// CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x4x4_t* [[B]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
+// CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
+// CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX]], align 8
+// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[TMP4]] to <8 x i8>
+// CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL1]], i32 0, i32 1
+// CHECK:   [[TMP6:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX2]], align 8
+// CHECK:   [[TMP7:%.*]] = bitcast <4 x i16> [[TMP6]] to <8 x i8>
+// CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL3]], i32 0, i32 2
+// CHECK:   [[TMP8:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX4]], align 8
+// CHECK:   [[TMP9:%.*]] = bitcast <4 x i16> [[TMP8]] to <8 x i8>
+// CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x4x4_t, %struct.poly16x4x4_t* [[__S1]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i16>], [4 x <4 x i16>]* [[VAL5]], i32 0, i32 3
+// CHECK:   [[TMP10:%.*]] = load <4 x i16>, <4 x i16>* [[ARRAYIDX6]], align 8
+// CHECK:   [[TMP11:%.*]] = bitcast <4 x i16> [[TMP10]] to <8 x i8>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
+// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 3, i32 2)
+// CHECK:   ret void
 void test_vst4_lane_p16(poly16_t * a, poly16x4x4_t b) {
   vst4_lane_p16(a, b, 3);
 }
 
 
-// CHECK-LABEL: test_vsub_s8
-// CHECK: vsub.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vsub_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i8> %a, %b
+// CHECK:   ret <8 x i8> [[SUB_I]]
 int8x8_t test_vsub_s8(int8x8_t a, int8x8_t b) {
   return vsub_s8(a, b);
 }
 
-// CHECK-LABEL: test_vsub_s16
-// CHECK: vsub.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vsub_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %a, %b
+// CHECK:   ret <4 x i16> [[SUB_I]]
 int16x4_t test_vsub_s16(int16x4_t a, int16x4_t b) {
   return vsub_s16(a, b);
 }
 
-// CHECK-LABEL: test_vsub_s32
-// CHECK: vsub.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vsub_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %a, %b
+// CHECK:   ret <2 x i32> [[SUB_I]]
 int32x2_t test_vsub_s32(int32x2_t a, int32x2_t b) {
   return vsub_s32(a, b);
 }
 
-// CHECK-LABEL: test_vsub_s64
-// CHECK: vsub.i64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vsub_s64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[SUB_I:%.*]] = sub <1 x i64> %a, %b
+// CHECK:   ret <1 x i64> [[SUB_I]]
 int64x1_t test_vsub_s64(int64x1_t a, int64x1_t b) {
   return vsub_s64(a, b);
 }
 
-// CHECK-LABEL: test_vsub_f32
-// CHECK: vsub.f32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x float> @test_vsub_f32(<2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[SUB_I:%.*]] = fsub <2 x float> %a, %b
+// CHECK:   ret <2 x float> [[SUB_I]]
 float32x2_t test_vsub_f32(float32x2_t a, float32x2_t b) {
   return vsub_f32(a, b);
 }
 
-// CHECK-LABEL: test_vsub_u8
-// CHECK: vsub.i8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vsub_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i8> %a, %b
+// CHECK:   ret <8 x i8> [[SUB_I]]
 uint8x8_t test_vsub_u8(uint8x8_t a, uint8x8_t b) {
   return vsub_u8(a, b);
 }
 
-// CHECK-LABEL: test_vsub_u16
-// CHECK: vsub.i16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vsub_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i16> %a, %b
+// CHECK:   ret <4 x i16> [[SUB_I]]
 uint16x4_t test_vsub_u16(uint16x4_t a, uint16x4_t b) {
   return vsub_u16(a, b);
 }
 
-// CHECK-LABEL: test_vsub_u32
-// CHECK: vsub.i32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vsub_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i32> %a, %b
+// CHECK:   ret <2 x i32> [[SUB_I]]
 uint32x2_t test_vsub_u32(uint32x2_t a, uint32x2_t b) {
   return vsub_u32(a, b);
 }
 
-// CHECK-LABEL: test_vsub_u64
-// CHECK: vsub.i64 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <1 x i64> @test_vsub_u64(<1 x i64> %a, <1 x i64> %b) #0 {
+// CHECK:   [[SUB_I:%.*]] = sub <1 x i64> %a, %b
+// CHECK:   ret <1 x i64> [[SUB_I]]
 uint64x1_t test_vsub_u64(uint64x1_t a, uint64x1_t b) {
   return vsub_u64(a, b);
 }
 
-// CHECK-LABEL: test_vsubq_s8
-// CHECK: vsub.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vsubq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[SUB_I:%.*]] = sub <16 x i8> %a, %b
+// CHECK:   ret <16 x i8> [[SUB_I]]
 int8x16_t test_vsubq_s8(int8x16_t a, int8x16_t b) {
   return vsubq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vsubq_s16
-// CHECK: vsub.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vsubq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, %b
+// CHECK:   ret <8 x i16> [[SUB_I]]
 int16x8_t test_vsubq_s16(int16x8_t a, int16x8_t b) {
   return vsubq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vsubq_s32
-// CHECK: vsub.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vsubq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, %b
+// CHECK:   ret <4 x i32> [[SUB_I]]
 int32x4_t test_vsubq_s32(int32x4_t a, int32x4_t b) {
   return vsubq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vsubq_s64
-// CHECK: vsub.i64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vsubq_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, %b
+// CHECK:   ret <2 x i64> [[SUB_I]]
 int64x2_t test_vsubq_s64(int64x2_t a, int64x2_t b) {
   return vsubq_s64(a, b);
 }
 
-// CHECK-LABEL: test_vsubq_f32
-// CHECK: vsub.f32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x float> @test_vsubq_f32(<4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[SUB_I:%.*]] = fsub <4 x float> %a, %b
+// CHECK:   ret <4 x float> [[SUB_I]]
 float32x4_t test_vsubq_f32(float32x4_t a, float32x4_t b) {
   return vsubq_f32(a, b);
 }
 
-// CHECK-LABEL: test_vsubq_u8
-// CHECK: vsub.i8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vsubq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[SUB_I:%.*]] = sub <16 x i8> %a, %b
+// CHECK:   ret <16 x i8> [[SUB_I]]
 uint8x16_t test_vsubq_u8(uint8x16_t a, uint8x16_t b) {
   return vsubq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vsubq_u16
-// CHECK: vsub.i16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vsubq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, %b
+// CHECK:   ret <8 x i16> [[SUB_I]]
 uint16x8_t test_vsubq_u16(uint16x8_t a, uint16x8_t b) {
   return vsubq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vsubq_u32
-// CHECK: vsub.i32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vsubq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, %b
+// CHECK:   ret <4 x i32> [[SUB_I]]
 uint32x4_t test_vsubq_u32(uint32x4_t a, uint32x4_t b) {
   return vsubq_u32(a, b);
 }
 
-// CHECK-LABEL: test_vsubq_u64
-// CHECK: vsub.i64 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vsubq_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, %b
+// CHECK:   ret <2 x i64> [[SUB_I]]
 uint64x2_t test_vsubq_u64(uint64x2_t a, uint64x2_t b) {
   return vsubq_u64(a, b);
 }
 
 
-// CHECK-LABEL: test_vsubhn_s16
-// CHECK: vsubhn.i16 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vsubhn_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VSUBHN_I:%.*]] = sub <8 x i16> [[TMP2]], [[TMP3]]
+// CHECK:   [[VSUBHN1_I:%.*]] = lshr <8 x i16> [[VSUBHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+// CHECK:   [[VSUBHN2_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VSUBHN2_I]]
 int8x8_t test_vsubhn_s16(int16x8_t a, int16x8_t b) {
   return vsubhn_s16(a, b);
 }
 
-// CHECK-LABEL: test_vsubhn_s32
-// CHECK: vsubhn.i32 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vsubhn_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VSUBHN_I:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
+// CHECK:   [[VSUBHN1_I:%.*]] = lshr <4 x i32> [[VSUBHN_I]], <i32 16, i32 16, i32 16, i32 16>
+// CHECK:   [[VSUBHN2_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VSUBHN2_I]]
 int16x4_t test_vsubhn_s32(int32x4_t a, int32x4_t b) {
   return vsubhn_s32(a, b);
 }
 
-// CHECK-LABEL: test_vsubhn_s64
-// CHECK: vsubhn.i64 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vsubhn_s64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSUBHN_I:%.*]] = sub <2 x i64> [[TMP2]], [[TMP3]]
+// CHECK:   [[VSUBHN1_I:%.*]] = lshr <2 x i64> [[VSUBHN_I]], <i64 32, i64 32>
+// CHECK:   [[VSUBHN2_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VSUBHN2_I]]
 int32x2_t test_vsubhn_s64(int64x2_t a, int64x2_t b) {
   return vsubhn_s64(a, b);
 }
 
-// CHECK-LABEL: test_vsubhn_u16
-// CHECK: vsubhn.i16 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vsubhn_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[VSUBHN_I:%.*]] = sub <8 x i16> [[TMP2]], [[TMP3]]
+// CHECK:   [[VSUBHN1_I:%.*]] = lshr <8 x i16> [[VSUBHN_I]], <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
+// CHECK:   [[VSUBHN2_I:%.*]] = trunc <8 x i16> [[VSUBHN1_I]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VSUBHN2_I]]
 uint8x8_t test_vsubhn_u16(uint16x8_t a, uint16x8_t b) {
   return vsubhn_u16(a, b);
 }
 
-// CHECK-LABEL: test_vsubhn_u32
-// CHECK: vsubhn.i32 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vsubhn_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[VSUBHN_I:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
+// CHECK:   [[VSUBHN1_I:%.*]] = lshr <4 x i32> [[VSUBHN_I]], <i32 16, i32 16, i32 16, i32 16>
+// CHECK:   [[VSUBHN2_I:%.*]] = trunc <4 x i32> [[VSUBHN1_I]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VSUBHN2_I]]
 uint16x4_t test_vsubhn_u32(uint32x4_t a, uint32x4_t b) {
   return vsubhn_u32(a, b);
 }
 
-// CHECK-LABEL: test_vsubhn_u64
-// CHECK: vsubhn.i64 d{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vsubhn_u64(<2 x i64> %a, <2 x i64> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i64> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <2 x i64>
+// CHECK:   [[VSUBHN_I:%.*]] = sub <2 x i64> [[TMP2]], [[TMP3]]
+// CHECK:   [[VSUBHN1_I:%.*]] = lshr <2 x i64> [[VSUBHN_I]], <i64 32, i64 32>
+// CHECK:   [[VSUBHN2_I:%.*]] = trunc <2 x i64> [[VSUBHN1_I]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VSUBHN2_I]]
 uint32x2_t test_vsubhn_u64(uint64x2_t a, uint64x2_t b) {
   return vsubhn_u64(a, b);
 }
 
 
-// CHECK-LABEL: test_vsubl_s8
-// CHECK: vsubl.s8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vsubl_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VMOVL_I_I:%.*]] = sext <8 x i8> %a to <8 x i16>
+// CHECK:   [[VMOVL_I4_I:%.*]] = sext <8 x i8> %b to <8 x i16>
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK:   ret <8 x i16> [[SUB_I]]
 int16x8_t test_vsubl_s8(int8x8_t a, int8x8_t b) {
   return vsubl_s8(a, b);
 }
 
-// CHECK-LABEL: test_vsubl_s16
-// CHECK: vsubl.s16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vsubl_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMOVL_I_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VMOVL_I4_I:%.*]] = sext <4 x i16> [[TMP3]] to <4 x i32>
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 int32x4_t test_vsubl_s16(int16x4_t a, int16x4_t b) {
   return vsubl_s16(a, b);
 }
 
-// CHECK-LABEL: test_vsubl_s32
-// CHECK: vsubl.s32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vsubl_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMOVL_I_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VMOVL_I4_I:%.*]] = sext <2 x i32> [[TMP3]] to <2 x i64>
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK:   ret <2 x i64> [[SUB_I]]
 int64x2_t test_vsubl_s32(int32x2_t a, int32x2_t b) {
   return vsubl_s32(a, b);
 }
 
-// CHECK-LABEL: test_vsubl_u8
-// CHECK: vsubl.u8 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vsubl_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <8 x i8> %a to <8 x i16>
+// CHECK:   [[VMOVL_I4_I:%.*]] = zext <8 x i8> %b to <8 x i16>
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK:   ret <8 x i16> [[SUB_I]]
 uint16x8_t test_vsubl_u8(uint8x8_t a, uint8x8_t b) {
   return vsubl_u8(a, b);
 }
 
-// CHECK-LABEL: test_vsubl_u16
-// CHECK: vsubl.u16 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vsubl_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VMOVL_I4_I:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32>
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 uint32x4_t test_vsubl_u16(uint16x4_t a, uint16x4_t b) {
   return vsubl_u16(a, b);
 }
 
-// CHECK-LABEL: test_vsubl_u32
-// CHECK: vsubl.u32 q{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vsubl_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VMOVL_I4_I:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> [[VMOVL_I_I]], [[VMOVL_I4_I]]
+// CHECK:   ret <2 x i64> [[SUB_I]]
 uint64x2_t test_vsubl_u32(uint32x2_t a, uint32x2_t b) {
   return vsubl_u32(a, b);
 }
 
 
-// CHECK-LABEL: test_vsubw_s8
-// CHECK: vsubw.s8 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vsubw_s8(<8 x i16> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VMOVL_I_I:%.*]] = sext <8 x i8> %b to <8 x i16>
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMOVL_I_I]]
+// CHECK:   ret <8 x i16> [[SUB_I]]
 int16x8_t test_vsubw_s8(int16x8_t a, int8x8_t b) {
   return vsubw_s8(a, b);
 }
 
-// CHECK-LABEL: test_vsubw_s16
-// CHECK: vsubw.s16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vsubw_s16(<4 x i32> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMOVL_I_I:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMOVL_I_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 int32x4_t test_vsubw_s16(int32x4_t a, int16x4_t b) {
   return vsubw_s16(a, b);
 }
 
-// CHECK-LABEL: test_vsubw_s32
-// CHECK: vsubw.s32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vsubw_s32(<2 x i64> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMOVL_I_I:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMOVL_I_I]]
+// CHECK:   ret <2 x i64> [[SUB_I]]
 int64x2_t test_vsubw_s32(int64x2_t a, int32x2_t b) {
   return vsubw_s32(a, b);
 }
 
-// CHECK-LABEL: test_vsubw_u8
-// CHECK: vsubw.u8 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vsubw_u8(<8 x i16> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <8 x i8> %b to <8 x i16>
+// CHECK:   [[SUB_I:%.*]] = sub <8 x i16> %a, [[VMOVL_I_I]]
+// CHECK:   ret <8 x i16> [[SUB_I]]
 uint16x8_t test_vsubw_u8(uint16x8_t a, uint8x8_t b) {
   return vsubw_u8(a, b);
 }
 
-// CHECK-LABEL: test_vsubw_u16
-// CHECK: vsubw.u16 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vsubw_u16(<4 x i32> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32>
+// CHECK:   [[SUB_I:%.*]] = sub <4 x i32> %a, [[VMOVL_I_I]]
+// CHECK:   ret <4 x i32> [[SUB_I]]
 uint32x4_t test_vsubw_u16(uint32x4_t a, uint16x4_t b) {
   return vsubw_u16(a, b);
 }
 
-// CHECK-LABEL: test_vsubw_u32
-// CHECK: vsubw.u32 q{{[0-9]+}}, q{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i64> @test_vsubw_u32(<2 x i64> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[VMOVL_I_I:%.*]] = zext <2 x i32> [[TMP1]] to <2 x i64>
+// CHECK:   [[SUB_I:%.*]] = sub <2 x i64> %a, [[VMOVL_I_I]]
+// CHECK:   ret <2 x i64> [[SUB_I]]
 uint64x2_t test_vsubw_u32(uint64x2_t a, uint32x2_t b) {
   return vsubw_u32(a, b);
 }
 
 
-// CHECK-LABEL: test_vtbl1_u8
-// CHECK: vtbl.8 d{{[0-9]+}}, {d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vtbl1_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VTBL1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VTBL1_I]]
 uint8x8_t test_vtbl1_u8(uint8x8_t a, uint8x8_t b) {
   return vtbl1_u8(a, b);
 }
 
-// CHECK-LABEL: test_vtbl1_s8
-// CHECK: vtbl.8 d{{[0-9]+}}, {d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vtbl1_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VTBL1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VTBL1_I]]
 int8x8_t test_vtbl1_s8(int8x8_t a, int8x8_t b) {
   return vtbl1_s8(a, b);
 }
 
-// CHECK-LABEL: test_vtbl1_p8
-// CHECK: vtbl.8 d{{[0-9]+}}, {d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vtbl1_p8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[VTBL1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VTBL1_I]]
 poly8x8_t test_vtbl1_p8(poly8x8_t a, uint8x8_t b) {
   return vtbl1_p8(a, b);
 }
 
 
-// CHECK-LABEL: test_vtbl2_u8
-// CHECK: vtbl.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vtbl2_u8([2 x i64] %a.coerce, <8 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[A:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[A]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP1:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE1]] to [2 x i64]*
+// CHECK:   [[TMP2:%.*]] = load [2 x i64], [2 x i64]* [[TMP1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE_I]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[TMP2]], [2 x i64]* [[TMP3]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VTBL2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VTBL2_I]]
 uint8x8_t test_vtbl2_u8(uint8x8x2_t a, uint8x8_t b) {
   return vtbl2_u8(a, b);
 }
 
-// CHECK-LABEL: test_vtbl2_s8
-// CHECK: vtbl.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vtbl2_s8([2 x i64] %a.coerce, <8 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[A:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[A]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP1:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE1]] to [2 x i64]*
+// CHECK:   [[TMP2:%.*]] = load [2 x i64], [2 x i64]* [[TMP1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE_I]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[TMP2]], [2 x i64]* [[TMP3]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VTBL2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VTBL2_I]]
 int8x8_t test_vtbl2_s8(int8x8x2_t a, int8x8_t b) {
   return vtbl2_s8(a, b);
 }
 
-// CHECK-LABEL: test_vtbl2_p8
-// CHECK: vtbl.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vtbl2_p8([2 x i64] %a.coerce, <8 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[A:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[A]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP1:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE1]] to [2 x i64]*
+// CHECK:   [[TMP2:%.*]] = load [2 x i64], [2 x i64]* [[TMP1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE_I]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[TMP2]], [2 x i64]* [[TMP3]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VTBL2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VTBL2_I]]
 poly8x8_t test_vtbl2_p8(poly8x8x2_t a, uint8x8_t b) {
   return vtbl2_p8(a, b);
 }
 
 
-// CHECK-LABEL: test_vtbl3_u8
-// CHECK: vtbl.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vtbl3_u8([3 x i64] %a.coerce, <8 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[A:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[A]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP1:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE1]] to [3 x i64]*
+// CHECK:   [[TMP2:%.*]] = load [3 x i64], [3 x i64]* [[TMP1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE_I]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[TMP2]], [3 x i64]* [[TMP3]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK:   [[VTBL3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VTBL3_I]]
 uint8x8_t test_vtbl3_u8(uint8x8x3_t a, uint8x8_t b) {
   return vtbl3_u8(a, b);
 }
 
-// CHECK-LABEL: test_vtbl3_s8
-// CHECK: vtbl.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vtbl3_s8([3 x i64] %a.coerce, <8 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[A:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[A]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP1:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE1]] to [3 x i64]*
+// CHECK:   [[TMP2:%.*]] = load [3 x i64], [3 x i64]* [[TMP1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE_I]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[TMP2]], [3 x i64]* [[TMP3]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK:   [[VTBL3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VTBL3_I]]
 int8x8_t test_vtbl3_s8(int8x8x3_t a, int8x8_t b) {
   return vtbl3_s8(a, b);
 }
 
-// CHECK-LABEL: test_vtbl3_p8
-// CHECK: vtbl.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vtbl3_p8([3 x i64] %a.coerce, <8 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[A:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[A]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP1:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE1]] to [3 x i64]*
+// CHECK:   [[TMP2:%.*]] = load [3 x i64], [3 x i64]* [[TMP1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE_I]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[TMP2]], [3 x i64]* [[TMP3]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK:   [[VTBL3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VTBL3_I]]
 poly8x8_t test_vtbl3_p8(poly8x8x3_t a, uint8x8_t b) {
   return vtbl3_p8(a, b);
 }
 
 
-// CHECK-LABEL: test_vtbl4_u8
-// CHECK: vtbl.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vtbl4_u8([4 x i64] %a.coerce, <8 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[A:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[A]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP1:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE1]] to [4 x i64]*
+// CHECK:   [[TMP2:%.*]] = load [4 x i64], [4 x i64]* [[TMP1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE_I]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[TMP2]], [4 x i64]* [[TMP3]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i32 0, i32 3
+// CHECK:   [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
+// CHECK:   [[VTBL4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VTBL4_I]]
 uint8x8_t test_vtbl4_u8(uint8x8x4_t a, uint8x8_t b) {
   return vtbl4_u8(a, b);
 }
 
-// CHECK-LABEL: test_vtbl4_s8
-// CHECK: vtbl.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vtbl4_s8([4 x i64] %a.coerce, <8 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[A:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[A]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP1:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE1]] to [4 x i64]*
+// CHECK:   [[TMP2:%.*]] = load [4 x i64], [4 x i64]* [[TMP1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE_I]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[TMP2]], [4 x i64]* [[TMP3]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i32 0, i32 3
+// CHECK:   [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
+// CHECK:   [[VTBL4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VTBL4_I]]
 int8x8_t test_vtbl4_s8(int8x8x4_t a, int8x8_t b) {
   return vtbl4_s8(a, b);
 }
 
-// CHECK-LABEL: test_vtbl4_p8
-// CHECK: vtbl.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vtbl4_p8([4 x i64] %a.coerce, <8 x i8> %b) #0 {
+// CHECK:   [[__P0_I:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[A:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[A]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[A]], i32 0, i32 0
+// CHECK:   [[TMP1:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE1]] to [4 x i64]*
+// CHECK:   [[TMP2:%.*]] = load [4 x i64], [4 x i64]* [[TMP1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE_I]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[TMP2]], [4 x i64]* [[TMP3]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P0_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i32 0, i32 3
+// CHECK:   [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
+// CHECK:   [[VTBL4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %b) #4
+// CHECK:   ret <8 x i8> [[VTBL4_I]]
 poly8x8_t test_vtbl4_p8(poly8x8x4_t a, uint8x8_t b) {
   return vtbl4_p8(a, b);
 }
 
 
-// CHECK-LABEL: test_vtbx1_u8
-// CHECK: vtbx.8 d{{[0-9]+}}, {d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vtbx1_u8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+// CHECK:   [[VTBX1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #4
+// CHECK:   ret <8 x i8> [[VTBX1_I]]
 uint8x8_t test_vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) {
   return vtbx1_u8(a, b, c);
 }
 
-// CHECK-LABEL: test_vtbx1_s8
-// CHECK: vtbx.8 d{{[0-9]+}}, {d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vtbx1_s8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+// CHECK:   [[VTBX1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #4
+// CHECK:   ret <8 x i8> [[VTBX1_I]]
 int8x8_t test_vtbx1_s8(int8x8_t a, int8x8_t b, int8x8_t c) {
   return vtbx1_s8(a, b, c);
 }
 
-// CHECK-LABEL: test_vtbx1_p8
-// CHECK: vtbx.8 d{{[0-9]+}}, {d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vtbx1_p8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+// CHECK:   [[VTBX1_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #4
+// CHECK:   ret <8 x i8> [[VTBX1_I]]
 poly8x8_t test_vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c) {
   return vtbx1_p8(a, b, c);
 }
 
 
-// CHECK-LABEL: test_vtbx2_u8
-// CHECK: vtbx.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vtbx2_u8(<8 x i8> %a, [2 x i64] %b.coerce, <8 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP1:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE1]] to [2 x i64]*
+// CHECK:   [[TMP2:%.*]] = load [2 x i64], [2 x i64]* [[TMP1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE_I]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[TMP2]], [2 x i64]* [[TMP3]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x2_t, %struct.uint8x8x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VTBX2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %c) #4
+// CHECK:   ret <8 x i8> [[VTBX2_I]]
 uint8x8_t test_vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c) {
   return vtbx2_u8(a, b, c);
 }
 
-// CHECK-LABEL: test_vtbx2_s8
-// CHECK: vtbx.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vtbx2_s8(<8 x i8> %a, [2 x i64] %b.coerce, <8 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP1:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE1]] to [2 x i64]*
+// CHECK:   [[TMP2:%.*]] = load [2 x i64], [2 x i64]* [[TMP1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE_I]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[TMP2]], [2 x i64]* [[TMP3]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x2_t, %struct.int8x8x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VTBX2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %c) #4
+// CHECK:   ret <8 x i8> [[VTBX2_I]]
 int8x8_t test_vtbx2_s8(int8x8_t a, int8x8x2_t b, int8x8_t c) {
   return vtbx2_s8(a, b, c);
 }
 
-// CHECK-LABEL: test_vtbx2_p8
-// CHECK: vtbx.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vtbx2_p8(<8 x i8> %a, [2 x i64] %b.coerce, <8 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[B]].coerce, [2 x i64]* [[TMP0]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP1:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE1]] to [2 x i64]*
+// CHECK:   [[TMP2:%.*]] = load [2 x i64], [2 x i64]* [[TMP1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = bitcast [2 x <8 x i8>]* [[COERCE_DIVE_I]] to [2 x i64]*
+// CHECK:   store [2 x i64] [[TMP2]], [2 x i64]* [[TMP3]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x2_t, %struct.poly8x8x2_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [2 x <8 x i8>], [2 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VTBX2_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> %c) #4
+// CHECK:   ret <8 x i8> [[VTBX2_I]]
 poly8x8_t test_vtbx2_p8(poly8x8_t a, poly8x8x2_t b, uint8x8_t c) {
   return vtbx2_p8(a, b, c);
 }
 
 
-// CHECK-LABEL: test_vtbx3_u8
-// CHECK: vtbx.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vtbx3_u8(<8 x i8> %a, [3 x i64] %b.coerce, <8 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP1:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE1]] to [3 x i64]*
+// CHECK:   [[TMP2:%.*]] = load [3 x i64], [3 x i64]* [[TMP1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE_I]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[TMP2]], [3 x i64]* [[TMP3]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x8x3_t, %struct.uint8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK:   [[VTBX3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx3(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %c) #4
+// CHECK:   ret <8 x i8> [[VTBX3_I]]
 uint8x8_t test_vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c) {
   return vtbx3_u8(a, b, c);
 }
 
-// CHECK-LABEL: test_vtbx3_s8
-// CHECK: vtbx.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vtbx3_s8(<8 x i8> %a, [3 x i64] %b.coerce, <8 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP1:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE1]] to [3 x i64]*
+// CHECK:   [[TMP2:%.*]] = load [3 x i64], [3 x i64]* [[TMP1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE_I]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[TMP2]], [3 x i64]* [[TMP3]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x8x3_t, %struct.int8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK:   [[VTBX3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx3(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %c) #4
+// CHECK:   ret <8 x i8> [[VTBX3_I]]
 int8x8_t test_vtbx3_s8(int8x8_t a, int8x8x3_t b, int8x8_t c) {
   return vtbx3_s8(a, b, c);
 }
 
-// CHECK-LABEL: test_vtbx3_p8
-// CHECK: vtbx.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vtbx3_p8(<8 x i8> %a, [3 x i64] %b.coerce, <8 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x3_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[B]].coerce, [3 x i64]* [[TMP0]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP1:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE1]] to [3 x i64]*
+// CHECK:   [[TMP2:%.*]] = load [3 x i64], [3 x i64]* [[TMP1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = bitcast [3 x <8 x i8>]* [[COERCE_DIVE_I]] to [3 x i64]*
+// CHECK:   store [3 x i64] [[TMP2]], [3 x i64]* [[TMP3]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x8x3_t, %struct.poly8x8x3_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [3 x <8 x i8>], [3 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK:   [[VTBX3_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx3(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> %c) #4
+// CHECK:   ret <8 x i8> [[VTBX3_I]]
 poly8x8_t test_vtbx3_p8(poly8x8_t a, poly8x8x3_t b, uint8x8_t c) {
   return vtbx3_p8(a, b, c);
 }
 
 
-// CHECK-LABEL: test_vtbx4_u8
-// CHECK: vtbx.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vtbx4_u8(<8 x i8> %a, [4 x i64] %b.coerce, <8 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP1:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE1]] to [4 x i64]*
+// CHECK:   [[TMP2:%.*]] = load [4 x i64], [4 x i64]* [[TMP1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE_I]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[TMP2]], [4 x i64]* [[TMP3]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.uint8x8x4_t, %struct.uint8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i32 0, i32 3
+// CHECK:   [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
+// CHECK:   [[VTBX4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %c) #4
+// CHECK:   ret <8 x i8> [[VTBX4_I]]
 uint8x8_t test_vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c) {
   return vtbx4_u8(a, b, c);
 }
 
-// CHECK-LABEL: test_vtbx4_s8
-// CHECK: vtbx.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vtbx4_s8(<8 x i8> %a, [4 x i64] %b.coerce, <8 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.int8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP1:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE1]] to [4 x i64]*
+// CHECK:   [[TMP2:%.*]] = load [4 x i64], [4 x i64]* [[TMP1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE_I]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[TMP2]], [4 x i64]* [[TMP3]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.int8x8x4_t, %struct.int8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i32 0, i32 3
+// CHECK:   [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
+// CHECK:   [[VTBX4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %c) #4
+// CHECK:   ret <8 x i8> [[VTBX4_I]]
 int8x8_t test_vtbx4_s8(int8x8_t a, int8x8x4_t b, int8x8_t c) {
   return vtbx4_s8(a, b, c);
 }
 
-// CHECK-LABEL: test_vtbx4_p8
-// CHECK: vtbx.8 d{{[0-9]+}}, {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vtbx4_p8(<8 x i8> %a, [4 x i64] %b.coerce, <8 x i8> %c) #0 {
+// CHECK:   [[__P1_I:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x8x4_t, align 8
+// CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
+// CHECK:   [[COERCE_DIVE1:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[B]], i32 0, i32 0
+// CHECK:   [[TMP1:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE1]] to [4 x i64]*
+// CHECK:   [[TMP2:%.*]] = load [4 x i64], [4 x i64]* [[TMP1]], align 8
+// CHECK:   [[COERCE_DIVE_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[TMP3:%.*]] = bitcast [4 x <8 x i8>]* [[COERCE_DIVE_I]] to [4 x i64]*
+// CHECK:   store [4 x i64] [[TMP2]], [4 x i64]* [[TMP3]], align 8
+// CHECK:   [[VAL_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL_I]], i32 0, i32 0
+// CHECK:   [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX_I]], align 8
+// CHECK:   [[VAL1_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX2_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL1_I]], i32 0, i32 1
+// CHECK:   [[TMP5:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX2_I]], align 8
+// CHECK:   [[VAL3_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX4_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL3_I]], i32 0, i32 2
+// CHECK:   [[TMP6:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX4_I]], align 8
+// CHECK:   [[VAL5_I:%.*]] = getelementptr inbounds %struct.poly8x8x4_t, %struct.poly8x8x4_t* [[__P1_I]], i32 0, i32 0
+// CHECK:   [[ARRAYIDX6_I:%.*]] = getelementptr inbounds [4 x <8 x i8>], [4 x <8 x i8>]* [[VAL5_I]], i32 0, i32 3
+// CHECK:   [[TMP7:%.*]] = load <8 x i8>, <8 x i8>* [[ARRAYIDX6_I]], align 8
+// CHECK:   [[VTBX4_I:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8> %a, <8 x i8> [[TMP4]], <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <8 x i8> %c) #4
+// CHECK:   ret <8 x i8> [[VTBX4_I]]
 poly8x8_t test_vtbx4_p8(poly8x8_t a, poly8x8x4_t b, uint8x8_t c) {
   return vtbx4_p8(a, b, c);
 }
 
 
-// CHECK-LABEL: test_vtrn_s8
-// CHECK: vtrn.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define void @test_vtrn_s8(%struct.int8x8x2_t* noalias sret %agg.result, <8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK:   store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK:   store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) #4
+// CHECK:   ret void
 int8x8x2_t test_vtrn_s8(int8x8_t a, int8x8_t b) {
   return vtrn_s8(a, b);
 }
 
-// CHECK-LABEL: test_vtrn_s16
-// CHECK: vtrn.16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define void @test_vtrn_s16(%struct.int16x4x2_t* noalias sret %agg.result, <4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK:   store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK:   store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.int16x4x2_t* %agg.result to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
+// CHECK:   ret void
 int16x4x2_t test_vtrn_s16(int16x4_t a, int16x4_t b) {
   return vtrn_s16(a, b);
 }
 
-// CHECK-LABEL: test_vtrn_s32
-// CHECK: vtrn.32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define void @test_vtrn_s32(%struct.int32x2x2_t* noalias sret %agg.result, <2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 2>
+// CHECK:   store <2 x i32> [[VTRN_I]], <2 x i32>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 1, i32 3>
+// CHECK:   store <2 x i32> [[VTRN1_I]], <2 x i32>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.int32x2x2_t* %agg.result to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
+// CHECK:   ret void
 int32x2x2_t test_vtrn_s32(int32x2_t a, int32x2_t b) {
   return vtrn_s32(a, b);
 }
 
-// CHECK-LABEL: test_vtrn_u8
-// CHECK: vtrn.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define void @test_vtrn_u8(%struct.uint8x8x2_t* noalias sret %agg.result, <8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK:   store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK:   store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) #4
+// CHECK:   ret void
 uint8x8x2_t test_vtrn_u8(uint8x8_t a, uint8x8_t b) {
   return vtrn_u8(a, b);
 }
 
-// CHECK-LABEL: test_vtrn_u16
-// CHECK: vtrn.16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define void @test_vtrn_u16(%struct.uint16x4x2_t* noalias sret %agg.result, <4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK:   store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK:   store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint16x4x2_t* %agg.result to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
+// CHECK:   ret void
 uint16x4x2_t test_vtrn_u16(uint16x4_t a, uint16x4_t b) {
   return vtrn_u16(a, b);
 }
 
-// CHECK-LABEL: test_vtrn_u32
-// CHECK: vtrn.32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define void @test_vtrn_u32(%struct.uint32x2x2_t* noalias sret %agg.result, <2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 2>
+// CHECK:   store <2 x i32> [[VTRN_I]], <2 x i32>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 1, i32 3>
+// CHECK:   store <2 x i32> [[VTRN1_I]], <2 x i32>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint32x2x2_t* %agg.result to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
+// CHECK:   ret void
 uint32x2x2_t test_vtrn_u32(uint32x2_t a, uint32x2_t b) {
   return vtrn_u32(a, b);
 }
 
-// CHECK-LABEL: test_vtrn_f32
-// CHECK: vtrn.32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define void @test_vtrn_f32(%struct.float32x2x2_t* noalias sret %agg.result, <2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x float>*
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 0, i32 2>
+// CHECK:   store <2 x float> [[VTRN_I]], <2 x float>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 1, i32 3>
+// CHECK:   store <2 x float> [[VTRN1_I]], <2 x float>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.float32x2x2_t* %agg.result to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
+// CHECK:   ret void
 float32x2x2_t test_vtrn_f32(float32x2_t a, float32x2_t b) {
   return vtrn_f32(a, b);
 }
 
-// CHECK-LABEL: test_vtrn_p8
-// CHECK: vtrn.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define void @test_vtrn_p8(%struct.poly8x8x2_t* noalias sret %agg.result, <8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK:   store <8 x i8> [[VTRN_I]], <8 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK:   store <8 x i8> [[VTRN1_I]], <8 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) #4
+// CHECK:   ret void
 poly8x8x2_t test_vtrn_p8(poly8x8_t a, poly8x8_t b) {
   return vtrn_p8(a, b);
 }
 
-// CHECK-LABEL: test_vtrn_p16
-// CHECK: vtrn.16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define void @test_vtrn_p16(%struct.poly16x4x2_t* noalias sret %agg.result, <4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK:   store <4 x i16> [[VTRN_I]], <4 x i16>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK:   store <4 x i16> [[VTRN1_I]], <4 x i16>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly16x4x2_t* %agg.result to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
+// CHECK:   ret void
 poly16x4x2_t test_vtrn_p16(poly16x4_t a, poly16x4_t b) {
   return vtrn_p16(a, b);
 }
 
-// CHECK-LABEL: test_vtrnq_s8
-// CHECK: vtrn.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define void @test_vtrnq_s8(%struct.int8x16x2_t* noalias sret %agg.result, <16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+// CHECK:   store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+// CHECK:   store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) #4
+// CHECK:   ret void
 int8x16x2_t test_vtrnq_s8(int8x16_t a, int8x16_t b) {
   return vtrnq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vtrnq_s16
-// CHECK: vtrn.16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define void @test_vtrnq_s16(%struct.int16x8x2_t* noalias sret %agg.result, <8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK:   store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK:   store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.int16x8x2_t* %agg.result to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
+// CHECK:   ret void
 int16x8x2_t test_vtrnq_s16(int16x8_t a, int16x8_t b) {
   return vtrnq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vtrnq_s32
-// CHECK: vtrn.32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define void @test_vtrnq_s32(%struct.int32x4x2_t* noalias sret %agg.result, <4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK:   store <4 x i32> [[VTRN_I]], <4 x i32>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK:   store <4 x i32> [[VTRN1_I]], <4 x i32>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.int32x4x2_t* %agg.result to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
+// CHECK:   ret void
 int32x4x2_t test_vtrnq_s32(int32x4_t a, int32x4_t b) {
   return vtrnq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vtrnq_u8
-// CHECK: vtrn.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define void @test_vtrnq_u8(%struct.uint8x16x2_t* noalias sret %agg.result, <16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+// CHECK:   store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+// CHECK:   store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) #4
+// CHECK:   ret void
 uint8x16x2_t test_vtrnq_u8(uint8x16_t a, uint8x16_t b) {
   return vtrnq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vtrnq_u16
-// CHECK: vtrn.16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define void @test_vtrnq_u16(%struct.uint16x8x2_t* noalias sret %agg.result, <8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK:   store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK:   store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint16x8x2_t* %agg.result to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
+// CHECK:   ret void
 uint16x8x2_t test_vtrnq_u16(uint16x8_t a, uint16x8_t b) {
   return vtrnq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vtrnq_u32
-// CHECK: vtrn.32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define void @test_vtrnq_u32(%struct.uint32x4x2_t* noalias sret %agg.result, <4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK:   store <4 x i32> [[VTRN_I]], <4 x i32>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK:   store <4 x i32> [[VTRN1_I]], <4 x i32>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint32x4x2_t* %agg.result to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
+// CHECK:   ret void
 uint32x4x2_t test_vtrnq_u32(uint32x4_t a, uint32x4_t b) {
   return vtrnq_u32(a, b);
 }
 
-// CHECK-LABEL: test_vtrnq_f32
-// CHECK: vtrn.32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define void @test_vtrnq_f32(%struct.float32x4x2_t* noalias sret %agg.result, <4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x float>*
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+// CHECK:   store <4 x float> [[VTRN_I]], <4 x float>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+// CHECK:   store <4 x float> [[VTRN1_I]], <4 x float>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.float32x4x2_t* %agg.result to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
+// CHECK:   ret void
 float32x4x2_t test_vtrnq_f32(float32x4_t a, float32x4_t b) {
   return vtrnq_f32(a, b);
 }
 
-// CHECK-LABEL: test_vtrnq_p8
-// CHECK: vtrn.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define void @test_vtrnq_p8(%struct.poly8x16x2_t* noalias sret %agg.result, <16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
+// CHECK:   store <16 x i8> [[VTRN_I]], <16 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 17, i32 3, i32 19, i32 5, i32 21, i32 7, i32 23, i32 9, i32 25, i32 11, i32 27, i32 13, i32 29, i32 15, i32 31>
+// CHECK:   store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) #4
+// CHECK:   ret void
 poly8x16x2_t test_vtrnq_p8(poly8x16_t a, poly8x16_t b) {
   return vtrnq_p8(a, b);
 }
 
-// CHECK-LABEL: test_vtrnq_p16
-// CHECK: vtrn.16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define void @test_vtrnq_p16(%struct.poly16x8x2_t* noalias sret %agg.result, <8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK:   [[VTRN_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+// CHECK:   store <8 x i16> [[VTRN_I]], <8 x i16>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VTRN1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+// CHECK:   store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly16x8x2_t* %agg.result to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
+// CHECK:   ret void
 poly16x8x2_t test_vtrnq_p16(poly16x8_t a, poly16x8_t b) {
   return vtrnq_p16(a, b);
 }
 
 
-// CHECK-LABEL: test_vtst_s8
-// CHECK: vtst.8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vtst_s8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = and <8 x i8> %a, %b
+// CHECK:   [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VTST_I]]
 uint8x8_t test_vtst_s8(int8x8_t a, int8x8_t b) {
   return vtst_s8(a, b);
 }
 
-// CHECK-LABEL: test_vtst_s16
-// CHECK: vtst.16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vtst_s16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP4:%.*]] = and <4 x i16> [[TMP2]], [[TMP3]]
+// CHECK:   [[TMP5:%.*]] = icmp ne <4 x i16> [[TMP4]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VTST_I]]
 uint16x4_t test_vtst_s16(int16x4_t a, int16x4_t b) {
   return vtst_s16(a, b);
 }
 
-// CHECK-LABEL: test_vtst_s32
-// CHECK: vtst.32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vtst_s32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[TMP4:%.*]] = and <2 x i32> [[TMP2]], [[TMP3]]
+// CHECK:   [[TMP5:%.*]] = icmp ne <2 x i32> [[TMP4]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <2 x i1> [[TMP5]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VTST_I]]
 uint32x2_t test_vtst_s32(int32x2_t a, int32x2_t b) {
   return vtst_s32(a, b);
 }
 
-// CHECK-LABEL: test_vtst_u8
-// CHECK: vtst.8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vtst_u8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = and <8 x i8> %a, %b
+// CHECK:   [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VTST_I]]
 uint8x8_t test_vtst_u8(uint8x8_t a, uint8x8_t b) {
   return vtst_u8(a, b);
 }
 
-// CHECK-LABEL: test_vtst_u16
-// CHECK: vtst.16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vtst_u16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP4:%.*]] = and <4 x i16> [[TMP2]], [[TMP3]]
+// CHECK:   [[TMP5:%.*]] = icmp ne <4 x i16> [[TMP4]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VTST_I]]
 uint16x4_t test_vtst_u16(uint16x4_t a, uint16x4_t b) {
   return vtst_u16(a, b);
 }
 
-// CHECK-LABEL: test_vtst_u32
-// CHECK: vtst.32 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <2 x i32> @test_vtst_u32(<2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[TMP4:%.*]] = and <2 x i32> [[TMP2]], [[TMP3]]
+// CHECK:   [[TMP5:%.*]] = icmp ne <2 x i32> [[TMP4]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <2 x i1> [[TMP5]] to <2 x i32>
+// CHECK:   ret <2 x i32> [[VTST_I]]
 uint32x2_t test_vtst_u32(uint32x2_t a, uint32x2_t b) {
   return vtst_u32(a, b);
 }
 
-// CHECK-LABEL: test_vtst_p8
-// CHECK: vtst.8 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <8 x i8> @test_vtst_p8(<8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = and <8 x i8> %a, %b
+// CHECK:   [[TMP1:%.*]] = icmp ne <8 x i8> [[TMP0]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i8>
+// CHECK:   ret <8 x i8> [[VTST_I]]
 uint8x8_t test_vtst_p8(poly8x8_t a, poly8x8_t b) {
   return vtst_p8(a, b);
 }
 
-// CHECK-LABEL: test_vtst_p16
-// CHECK: vtst.16 d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define <4 x i16> @test_vtst_p16(<4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP4:%.*]] = and <4 x i16> [[TMP2]], [[TMP3]]
+// CHECK:   [[TMP5:%.*]] = icmp ne <4 x i16> [[TMP4]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i16>
+// CHECK:   ret <4 x i16> [[VTST_I]]
 uint16x4_t test_vtst_p16(poly16x4_t a, poly16x4_t b) {
   return vtst_p16(a, b);
 }
 
-// CHECK-LABEL: test_vtstq_s8
-// CHECK: vtst.8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vtstq_s8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = and <16 x i8> %a, %b
+// CHECK:   [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[VTST_I]]
 uint8x16_t test_vtstq_s8(int8x16_t a, int8x16_t b) {
   return vtstq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vtstq_s16
-// CHECK: vtst.16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vtstq_s16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP4:%.*]] = and <8 x i16> [[TMP2]], [[TMP3]]
+// CHECK:   [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[VTST_I]]
 uint16x8_t test_vtstq_s16(int16x8_t a, int16x8_t b) {
   return vtstq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vtstq_s32
-// CHECK: vtst.32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vtstq_s32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[TMP4:%.*]] = and <4 x i32> [[TMP2]], [[TMP3]]
+// CHECK:   [[TMP5:%.*]] = icmp ne <4 x i32> [[TMP4]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VTST_I]]
 uint32x4_t test_vtstq_s32(int32x4_t a, int32x4_t b) {
   return vtstq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vtstq_u8
-// CHECK: vtst.8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vtstq_u8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = and <16 x i8> %a, %b
+// CHECK:   [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[VTST_I]]
 uint8x16_t test_vtstq_u8(uint8x16_t a, uint8x16_t b) {
   return vtstq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vtstq_u16
-// CHECK: vtst.16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vtstq_u16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP4:%.*]] = and <8 x i16> [[TMP2]], [[TMP3]]
+// CHECK:   [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[VTST_I]]
 uint16x8_t test_vtstq_u16(uint16x8_t a, uint16x8_t b) {
   return vtstq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vtstq_u32
-// CHECK: vtst.32 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <4 x i32> @test_vtstq_u32(<4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[TMP4:%.*]] = and <4 x i32> [[TMP2]], [[TMP3]]
+// CHECK:   [[TMP5:%.*]] = icmp ne <4 x i32> [[TMP4]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i32>
+// CHECK:   ret <4 x i32> [[VTST_I]]
 uint32x4_t test_vtstq_u32(uint32x4_t a, uint32x4_t b) {
   return vtstq_u32(a, b);
 }
 
-// CHECK-LABEL: test_vtstq_p8
-// CHECK: vtst.8 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <16 x i8> @test_vtstq_p8(<16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = and <16 x i8> %a, %b
+// CHECK:   [[TMP1:%.*]] = icmp ne <16 x i8> [[TMP0]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i8>
+// CHECK:   ret <16 x i8> [[VTST_I]]
 uint8x16_t test_vtstq_p8(poly8x16_t a, poly8x16_t b) {
   return vtstq_p8(a, b);
 }
 
-// CHECK-LABEL: test_vtstq_p16
-// CHECK: vtst.16 q{{[0-9]+}}, q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define <8 x i16> @test_vtstq_p16(<8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16>
+// CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP4:%.*]] = and <8 x i16> [[TMP2]], [[TMP3]]
+// CHECK:   [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer
+// CHECK:   [[VTST_I:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i16>
+// CHECK:   ret <8 x i16> [[VTST_I]]
 uint16x8_t test_vtstq_p16(poly16x8_t a, poly16x8_t b) {
   return vtstq_p16(a, b);
 }
 
 
-// CHECK-LABEL: test_vuzp_s8
-// CHECK: vuzp.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define void @test_vuzp_s8(%struct.int8x8x2_t* noalias sret %agg.result, <8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK:   store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK:   store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) #4
+// CHECK:   ret void
 int8x8x2_t test_vuzp_s8(int8x8_t a, int8x8_t b) {
   return vuzp_s8(a, b);
 }
 
-// CHECK-LABEL: test_vuzp_s16
-// CHECK: vuzp.16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define void @test_vuzp_s16(%struct.int16x4x2_t* noalias sret %agg.result, <4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK:   store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK:   store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.int16x4x2_t* %agg.result to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
+// CHECK:   ret void
 int16x4x2_t test_vuzp_s16(int16x4_t a, int16x4_t b) {
   return vuzp_s16(a, b);
 }
 
-// CHECK-LABEL: test_vuzp_s32
-// CHECK: {{vtrn|vuzp}}.32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define void @test_vuzp_s32(%struct.int32x2x2_t* noalias sret %agg.result, <2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 2>
+// CHECK:   store <2 x i32> [[VUZP_I]], <2 x i32>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 1, i32 3>
+// CHECK:   store <2 x i32> [[VUZP1_I]], <2 x i32>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.int32x2x2_t* %agg.result to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
+// CHECK:   ret void
 int32x2x2_t test_vuzp_s32(int32x2_t a, int32x2_t b) {
   return vuzp_s32(a, b);
 }
 
-// CHECK-LABEL: test_vuzp_u8
-// CHECK: vuzp.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define void @test_vuzp_u8(%struct.uint8x8x2_t* noalias sret %agg.result, <8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK:   store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK:   store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) #4
+// CHECK:   ret void
 uint8x8x2_t test_vuzp_u8(uint8x8_t a, uint8x8_t b) {
   return vuzp_u8(a, b);
 }
 
-// CHECK-LABEL: test_vuzp_u16
-// CHECK: vuzp.16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define void @test_vuzp_u16(%struct.uint16x4x2_t* noalias sret %agg.result, <4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK:   store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK:   store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint16x4x2_t* %agg.result to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
+// CHECK:   ret void
 uint16x4x2_t test_vuzp_u16(uint16x4_t a, uint16x4_t b) {
   return vuzp_u16(a, b);
 }
 
-// CHECK-LABEL: test_vuzp_u32
-// CHECK: {{vtrn|vuzp}}.32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define void @test_vuzp_u32(%struct.uint32x2x2_t* noalias sret %agg.result, <2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 2>
+// CHECK:   store <2 x i32> [[VUZP_I]], <2 x i32>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 1, i32 3>
+// CHECK:   store <2 x i32> [[VUZP1_I]], <2 x i32>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint32x2x2_t* %agg.result to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
+// CHECK:   ret void
 uint32x2x2_t test_vuzp_u32(uint32x2_t a, uint32x2_t b) {
   return vuzp_u32(a, b);
 }
 
-// CHECK-LABEL: test_vuzp_f32
-// CHECK: {{vtrn|vuzp}}.32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define void @test_vuzp_f32(%struct.float32x2x2_t* noalias sret %agg.result, <2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x float>*
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 0, i32 2>
+// CHECK:   store <2 x float> [[VUZP_I]], <2 x float>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 1, i32 3>
+// CHECK:   store <2 x float> [[VUZP1_I]], <2 x float>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.float32x2x2_t* %agg.result to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
+// CHECK:   ret void
 float32x2x2_t test_vuzp_f32(float32x2_t a, float32x2_t b) {
   return vuzp_f32(a, b);
 }
 
-// CHECK-LABEL: test_vuzp_p8
-// CHECK: vuzp.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define void @test_vuzp_p8(%struct.poly8x8x2_t* noalias sret %agg.result, <8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK:   store <8 x i8> [[VUZP_I]], <8 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK:   store <8 x i8> [[VUZP1_I]], <8 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) #4
+// CHECK:   ret void
 poly8x8x2_t test_vuzp_p8(poly8x8_t a, poly8x8_t b) {
   return vuzp_p8(a, b);
 }
 
-// CHECK-LABEL: test_vuzp_p16
-// CHECK: vuzp.16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define void @test_vuzp_p16(%struct.poly16x4x2_t* noalias sret %agg.result, <4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK:   store <4 x i16> [[VUZP_I]], <4 x i16>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK:   store <4 x i16> [[VUZP1_I]], <4 x i16>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly16x4x2_t* %agg.result to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
+// CHECK:   ret void
 poly16x4x2_t test_vuzp_p16(poly16x4_t a, poly16x4_t b) {
   return vuzp_p16(a, b);
 }
 
-// CHECK-LABEL: test_vuzpq_s8
-// CHECK: vuzp.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define void @test_vuzpq_s8(%struct.int8x16x2_t* noalias sret %agg.result, <16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+// CHECK:   store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+// CHECK:   store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) #4
+// CHECK:   ret void
 int8x16x2_t test_vuzpq_s8(int8x16_t a, int8x16_t b) {
   return vuzpq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vuzpq_s16
-// CHECK: vuzp.16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define void @test_vuzpq_s16(%struct.int16x8x2_t* noalias sret %agg.result, <8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK:   store <8 x i16> [[VUZP_I]], <8 x i16>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK:   store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.int16x8x2_t* %agg.result to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
+// CHECK:   ret void
 int16x8x2_t test_vuzpq_s16(int16x8_t a, int16x8_t b) {
   return vuzpq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vuzpq_s32
-// CHECK: {{vtrn|vuzp}}.32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define void @test_vuzpq_s32(%struct.int32x4x2_t* noalias sret %agg.result, <4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK:   store <4 x i32> [[VUZP_I]], <4 x i32>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK:   store <4 x i32> [[VUZP1_I]], <4 x i32>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.int32x4x2_t* %agg.result to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
+// CHECK:   ret void
 int32x4x2_t test_vuzpq_s32(int32x4_t a, int32x4_t b) {
   return vuzpq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vuzpq_u8
-// CHECK: vuzp.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define void @test_vuzpq_u8(%struct.uint8x16x2_t* noalias sret %agg.result, <16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+// CHECK:   store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+// CHECK:   store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) #4
+// CHECK:   ret void
 uint8x16x2_t test_vuzpq_u8(uint8x16_t a, uint8x16_t b) {
   return vuzpq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vuzpq_u16
-// CHECK: vuzp.16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define void @test_vuzpq_u16(%struct.uint16x8x2_t* noalias sret %agg.result, <8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK:   store <8 x i16> [[VUZP_I]], <8 x i16>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK:   store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint16x8x2_t* %agg.result to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
+// CHECK:   ret void
 uint16x8x2_t test_vuzpq_u16(uint16x8_t a, uint16x8_t b) {
   return vuzpq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vuzpq_u32
-// CHECK: {{vtrn|vuzp}}.32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define void @test_vuzpq_u32(%struct.uint32x4x2_t* noalias sret %agg.result, <4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK:   store <4 x i32> [[VUZP_I]], <4 x i32>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK:   store <4 x i32> [[VUZP1_I]], <4 x i32>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint32x4x2_t* %agg.result to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
+// CHECK:   ret void
 uint32x4x2_t test_vuzpq_u32(uint32x4_t a, uint32x4_t b) {
   return vuzpq_u32(a, b);
 }
 
-// CHECK-LABEL: test_vuzpq_f32
-// CHECK: {{vtrn|vuzp}}.32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define void @test_vuzpq_f32(%struct.float32x4x2_t* noalias sret %agg.result, <4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x float>*
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+// CHECK:   store <4 x float> [[VUZP_I]], <4 x float>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+// CHECK:   store <4 x float> [[VUZP1_I]], <4 x float>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.float32x4x2_t* %agg.result to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
+// CHECK:   ret void
 float32x4x2_t test_vuzpq_f32(float32x4_t a, float32x4_t b) {
   return vuzpq_f32(a, b);
 }
 
-// CHECK-LABEL: test_vuzpq_p8
-// CHECK: vuzp.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define void @test_vuzpq_p8(%struct.poly8x16x2_t* noalias sret %agg.result, <16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+// CHECK:   store <16 x i8> [[VUZP_I]], <16 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+// CHECK:   store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) #4
+// CHECK:   ret void
 poly8x16x2_t test_vuzpq_p8(poly8x16_t a, poly8x16_t b) {
   return vuzpq_p8(a, b);
 }
 
-// CHECK-LABEL: test_vuzpq_p16
-// CHECK: vuzp.16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define void @test_vuzpq_p16(%struct.poly16x8x2_t* noalias sret %agg.result, <8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK:   [[VUZP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+// CHECK:   store <8 x i16> [[VUZP_I]], <8 x i16>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VUZP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+// CHECK:   store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly16x8x2_t* %agg.result to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
+// CHECK:   ret void
 poly16x8x2_t test_vuzpq_p16(poly16x8_t a, poly16x8_t b) {
   return vuzpq_p16(a, b);
 }
 
 
-// CHECK-LABEL: test_vzip_s8
-// CHECK: vzip.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define void @test_vzip_s8(%struct.int8x8x2_t* noalias sret %agg.result, <8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK:   store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK:   store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x8x2_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) #4
+// CHECK:   ret void
 int8x8x2_t test_vzip_s8(int8x8_t a, int8x8_t b) {
   return vzip_s8(a, b);
 }
 
-// CHECK-LABEL: test_vzip_s16
-// CHECK: vzip.16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define void @test_vzip_s16(%struct.int16x4x2_t* noalias sret %agg.result, <4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK:   store <4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK:   store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.int16x4x2_t* %agg.result to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.int16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
+// CHECK:   ret void
 int16x4x2_t test_vzip_s16(int16x4_t a, int16x4_t b) {
   return vzip_s16(a, b);
 }
 
-// CHECK-LABEL: test_vzip_s32
-// CHECK: {{vtrn|vzip}}.32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define void @test_vzip_s32(%struct.int32x2x2_t* noalias sret %agg.result, <2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 2>
+// CHECK:   store <2 x i32> [[VZIP_I]], <2 x i32>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 1, i32 3>
+// CHECK:   store <2 x i32> [[VZIP1_I]], <2 x i32>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.int32x2x2_t* %agg.result to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.int32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
+// CHECK:   ret void
 int32x2x2_t test_vzip_s32(int32x2_t a, int32x2_t b) {
   return vzip_s32(a, b);
 }
 
-// CHECK-LABEL: test_vzip_u8
-// CHECK: vzip.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define void @test_vzip_u8(%struct.uint8x8x2_t* noalias sret %agg.result, <8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK:   store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK:   store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x8x2_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) #4
+// CHECK:   ret void
 uint8x8x2_t test_vzip_u8(uint8x8_t a, uint8x8_t b) {
   return vzip_u8(a, b);
 }
 
-// CHECK-LABEL: test_vzip_u16
-// CHECK: vzip.16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define void @test_vzip_u16(%struct.uint16x4x2_t* noalias sret %agg.result, <4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK:   store <4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK:   store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint16x4x2_t* %agg.result to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
+// CHECK:   ret void
 uint16x4x2_t test_vzip_u16(uint16x4_t a, uint16x4_t b) {
   return vzip_u16(a, b);
 }
 
-// CHECK-LABEL: test_vzip_u32
-// CHECK: {{vtrn|vzip}}.32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define void @test_vzip_u32(%struct.uint32x2x2_t* noalias sret %agg.result, <2 x i32> %a, <2 x i32> %b) #0 {
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x i32>*
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32>
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 0, i32 2>
+// CHECK:   store <2 x i32> [[VZIP_I]], <2 x i32>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <2 x i32>, <2 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> [[TMP5]], <2 x i32> <i32 1, i32 3>
+// CHECK:   store <2 x i32> [[VZIP1_I]], <2 x i32>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint32x2x2_t* %agg.result to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
+// CHECK:   ret void
 uint32x2x2_t test_vzip_u32(uint32x2_t a, uint32x2_t b) {
   return vzip_u32(a, b);
 }
 
-// CHECK-LABEL: test_vzip_f32
-// CHECK: {{vtrn|vzip}}.32 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define void @test_vzip_f32(%struct.float32x2x2_t* noalias sret %agg.result, <2 x float> %a, <2 x float> %b) #0 {
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x2x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <2 x float>*
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x float>
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 0, i32 2>
+// CHECK:   store <2 x float> [[VZIP_I]], <2 x float>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <2 x float>, <2 x float>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32> <i32 1, i32 3>
+// CHECK:   store <2 x float> [[VZIP1_I]], <2 x float>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.float32x2x2_t* %agg.result to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.float32x2x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
+// CHECK:   ret void
 float32x2x2_t test_vzip_f32(float32x2_t a, float32x2_t b) {
   return vzip_f32(a, b);
 }
 
-// CHECK-LABEL: test_vzip_p8
-// CHECK: vzip.8 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define void @test_vzip_p8(%struct.poly8x8x2_t* noalias sret %agg.result, <8 x i8> %a, <8 x i8> %b) #0 {
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x8x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK:   store <8 x i8> [[VZIP_I]], <8 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <8 x i8>, <8 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK:   store <8 x i8> [[VZIP1_I]], <8 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x8x2_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 16, i32 8, i1 false) #4
+// CHECK:   ret void
 poly8x8x2_t test_vzip_p8(poly8x8_t a, poly8x8_t b) {
   return vzip_p8(a, b);
 }
 
-// CHECK-LABEL: test_vzip_p16
-// CHECK: vzip.16 d{{[0-9]+}}, d{{[0-9]+}}
+// CHECK-LABEL: define void @test_vzip_p16(%struct.poly16x4x2_t* noalias sret %agg.result, <4 x i16> %a, <4 x i16> %b) #0 {
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x4x2_t, align 8
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i16>*
+// CHECK:   [[TMP4:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
+// CHECK:   [[TMP5:%.*]] = bitcast <8 x i8> [[TMP2]] to <4 x i16>
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK:   store <4 x i16> [[VZIP_I]], <4 x i16>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i16>, <4 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x i16> [[TMP4]], <4 x i16> [[TMP5]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK:   store <4 x i16> [[VZIP1_I]], <4 x i16>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly16x4x2_t* %agg.result to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 16, i32 8, i1 false) #4
+// CHECK:   ret void
 poly16x4x2_t test_vzip_p16(poly16x4_t a, poly16x4_t b) {
   return vzip_p16(a, b);
 }
 
-// CHECK-LABEL: test_vzipq_s8
-// CHECK: vzip.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define void @test_vzipq_s8(%struct.int8x16x2_t* noalias sret %agg.result, <16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+// CHECK:   store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+// CHECK:   store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) #4
+// CHECK:   ret void
 int8x16x2_t test_vzipq_s8(int8x16_t a, int8x16_t b) {
   return vzipq_s8(a, b);
 }
 
-// CHECK-LABEL: test_vzipq_s16
-// CHECK: vzip.16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define void @test_vzipq_s16(%struct.int16x8x2_t* noalias sret %agg.result, <8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK:   store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK:   store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.int16x8x2_t* %agg.result to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
+// CHECK:   ret void
 int16x8x2_t test_vzipq_s16(int16x8_t a, int16x8_t b) {
   return vzipq_s16(a, b);
 }
 
-// CHECK-LABEL: test_vzipq_s32
-// CHECK: {{vtrn|vzip}}.32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define void @test_vzipq_s32(%struct.int32x4x2_t* noalias sret %agg.result, <4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK:   store <4 x i32> [[VZIP_I]], <4 x i32>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK:   store <4 x i32> [[VZIP1_I]], <4 x i32>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.int32x4x2_t* %agg.result to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
+// CHECK:   ret void
 int32x4x2_t test_vzipq_s32(int32x4_t a, int32x4_t b) {
   return vzipq_s32(a, b);
 }
 
-// CHECK-LABEL: test_vzipq_u8
-// CHECK: vzip.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define void @test_vzipq_u8(%struct.uint8x16x2_t* noalias sret %agg.result, <16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+// CHECK:   store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+// CHECK:   store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) #4
+// CHECK:   ret void
 uint8x16x2_t test_vzipq_u8(uint8x16_t a, uint8x16_t b) {
   return vzipq_u8(a, b);
 }
 
-// CHECK-LABEL: test_vzipq_u16
-// CHECK: vzip.16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define void @test_vzipq_u16(%struct.uint16x8x2_t* noalias sret %agg.result, <8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK:   store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK:   store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint16x8x2_t* %agg.result to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
+// CHECK:   ret void
 uint16x8x2_t test_vzipq_u16(uint16x8_t a, uint16x8_t b) {
   return vzipq_u16(a, b);
 }
 
-// CHECK-LABEL: test_vzipq_u32
-// CHECK: {{vtrn|vzip}}.32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define void @test_vzipq_u32(%struct.uint32x4x2_t* noalias sret %agg.result, <4 x i32> %a, <4 x i32> %b) #0 {
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x i32>*
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK:   store <4 x i32> [[VZIP_I]], <4 x i32>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK:   store <4 x i32> [[VZIP1_I]], <4 x i32>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.uint32x4x2_t* %agg.result to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
+// CHECK:   ret void
 uint32x4x2_t test_vzipq_u32(uint32x4_t a, uint32x4_t b) {
   return vzipq_u32(a, b);
 }
 
-// CHECK-LABEL: test_vzipq_f32
-// CHECK: {{vtrn|vzip}}.32 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define void @test_vzipq_f32(%struct.float32x4x2_t* noalias sret %agg.result, <4 x float> %a, <4 x float> %b) #0 {
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <4 x float>*
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float>
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+// CHECK:   store <4 x float> [[VZIP_I]], <4 x float>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+// CHECK:   store <4 x float> [[VZIP1_I]], <4 x float>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.float32x4x2_t* %agg.result to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
+// CHECK:   ret void
 float32x4x2_t test_vzipq_f32(float32x4_t a, float32x4_t b) {
   return vzipq_f32(a, b);
 }
 
-// CHECK-LABEL: test_vzipq_p8
-// CHECK: vzip.8 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define void @test_vzipq_p8(%struct.poly8x16x2_t* noalias sret %agg.result, <16 x i8> %a, <16 x i8> %b) #0 {
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+// CHECK:   store <16 x i8> [[VZIP_I]], <16 x i8>* [[TMP1]]
+// CHECK:   [[TMP2:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[TMP1]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+// CHECK:   store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]]
+// CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* %agg.result to i8*
+// CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false) #4
+// CHECK:   ret void
 poly8x16x2_t test_vzipq_p8(poly8x16_t a, poly8x16_t b) {
   return vzipq_p8(a, b);
 }
 
-// CHECK-LABEL: test_vzipq_p16
-// CHECK: vzip.16 q{{[0-9]+}}, q{{[0-9]+}}
+// CHECK-LABEL: define void @test_vzipq_p16(%struct.poly16x8x2_t* noalias sret %agg.result, <8 x i16> %a, <8 x i16> %b) #0 {
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to <8 x i16>*
+// CHECK:   [[TMP4:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
+// CHECK:   [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
+// CHECK:   [[VZIP_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+// CHECK:   store <8 x i16> [[VZIP_I]], <8 x i16>* [[TMP3]]
+// CHECK:   [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[TMP3]], i32 1
+// CHECK:   [[VZIP1_I:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP5]], <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+// CHECK:   store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP6]]
+// CHECK:   [[TMP7:%.*]] = bitcast %struct.poly16x8x2_t* %agg.result to i8*
+// CHECK:   [[TMP8:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP7]], i8* [[TMP8]], i32 32, i32 16, i1 false) #4
+// CHECK:   ret void
 poly16x8x2_t test_vzipq_p16(poly16x8_t a, poly16x8_t b) {
   return vzipq_p16(a, b);
 }
diff --git a/test/CodeGen/asm-errors.c b/test/CodeGen/asm-errors.c
index 05751a8e690a2..a959896df1ed6 100644
--- a/test/CodeGen/asm-errors.c
+++ b/test/CodeGen/asm-errors.c
@@ -1,13 +1,12 @@
 // REQUIRES: x86-registered-target
 
-// RUN: true
-// UN: not %clang_cc1 -triple i386-apple-darwin10 -emit-obj %s -o /dev/null > %t 2>&1
-// UN: FileCheck %s < %t
-// RUN: %clang_cc1 -triple i386-apple-darwin10 -emit-llvm-bc %s -o %t.bc
-// RUN: %clang_cc1 -triple i386-apple-darwin10 -emit-obj %t.bc -o /dev/null 2>&1 | \
+// RUN: not %clang_cc1 -triple i386-apple-darwin10 -emit-obj %s -o /dev/null > %t 2>&1
+// RUN: FileCheck %s < %t
+// RUN: not %clang -target i386-apple-darwin10 -fembed-bitcode -c %s -o /dev/null 2>&1 | \
 // RUN:   FileCheck --check-prefix=CRASH-REPORT %s
 // CRASH-REPORT: <inline asm>:
 // CRASH-REPORT: error: invalid instruction mnemonic 'abc'
+// CRASH-REPORT: error: cannot compile inline asm
 // CRASH-REPORT-NOT: note: diagnostic msg:
 
 int test1(int X) {
diff --git a/test/CodeGen/atomics-inlining.c b/test/CodeGen/atomics-inlining.c
index 23a79a2d28f9e..4974f22e3a476 100644
--- a/test/CodeGen/atomics-inlining.c
+++ b/test/CodeGen/atomics-inlining.c
@@ -3,6 +3,7 @@
 // RUN: %clang_cc1 -triple powerpc64-linux-gnu -emit-llvm %s -o - | FileCheck %s -check-prefix=PPC64
 // RUN: %clang_cc1 -triple mipsel-linux-gnu -emit-llvm %s -o - | FileCheck %s -check-prefix=MIPS32
 // RUN: %clang_cc1 -triple mips64el-linux-gnu -emit-llvm %s -o - | FileCheck %s -check-prefix=MIPS64
+// RUN: %clang_cc1 -triple sparc-unknown-eabi -emit-llvm %s -o - | FileCheck %s -check-prefix=SPARC
 
 unsigned char c1, c2;
 unsigned short s1, s2;
@@ -90,4 +91,16 @@ void test1(void) {
 // MIPS64: store atomic i64 {{.*}}, i64* @ll1 seq_cst
 // MIPS64: call void @__atomic_load(i64 zeroext 100, i8* getelementptr inbounds ([100 x i8], [100 x i8]* @a1, i32 0, i32 0)
 // MIPS64: call void @__atomic_store(i64 zeroext 100, i8* getelementptr inbounds ([100 x i8], [100 x i8]* @a1, i32 0, i32 0), i8* getelementptr inbounds ([100 x i8], [100 x i8]* @a2, i32 0, i32 0)
+
+// SPARC-LABEL: define void @test1
+// SPARC: = load atomic i8, i8* @c1 seq_cst
+// SPARC: store atomic i8 {{.*}}, i8* @c1 seq_cst
+// SPARC: = load atomic i16, i16* @s1 seq_cst
+// SPARC: store atomic i16 {{.*}}, i16* @s1 seq_cst
+// SPARC: = load atomic i32, i32* @i1 seq_cst
+// SPARC: store atomic i32 {{.*}}, i32* @i1 seq_cst
+// SPARC: = load atomic i64, i64* @ll1 seq_cst
+// SPARC: store atomic i64 {{.*}}, i64* @ll1 seq_cst
+// SPARC: call void @__atomic_load(i32 100, i8* getelementptr inbounds ([100 x i8], [100 x i8]* @a1, i32 0, i32 0), i8* getelementptr inbounds ([100 x i8], [100 x i8]* @a2, i32 0, i32 0)
+// SPARC: call void @__atomic_store(i32 100, i8* getelementptr inbounds ([100 x i8], [100 x i8]* @a1, i32 0, i32 0), i8* getelementptr inbounds ([100 x i8], [100 x i8]* @a2, i32 0, i32 0)
 }
diff --git a/test/CodeGen/attr-func-def.c b/test/CodeGen/attr-func-def.c
index ceafa1220f1fe..a295488baa0dd 100644
--- a/test/CodeGen/attr-func-def.c
+++ b/test/CodeGen/attr-func-def.c
@@ -1,7 +1,7 @@
 // RUN: %clang_cc1 -triple x86_64-apple-macosx10.10.0 -emit-llvm -Oz -o - %s | FileCheck %s
 
-// CHECK: define i32 @foo2(i32 %a) [[ATTRS2:#[0-9]+]] {
-// CHECK: define i32 @foo1(i32 %a) [[ATTRS1:#[0-9]+]] {
+// CHECK: define i32 @foo2(i32 %a) local_unnamed_addr [[ATTRS2:#[0-9]+]] {
+// CHECK: define i32 @foo1(i32 %a) local_unnamed_addr [[ATTRS1:#[0-9]+]] {
 
 int foo1(int);
 
diff --git a/test/CodeGen/attr-mode-enums.c b/test/CodeGen/attr-mode-enums.c
new file mode 100644
index 0000000000000..4675f6c069cd5
--- /dev/null
+++ b/test/CodeGen/attr-mode-enums.c
@@ -0,0 +1,45 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck %s
+
+// Test checks that 'mode' attribute is handled correctly with enums, i. e. code
+//   1. "typedef enum { A } __attribute__((mode(HI))) T;" is accepted,
+//   2. "enum X __attribute__((mode(QI))) var;" forms a complete integer type.
+
+int main() {
+  // CHECK: [[X1:%.+]] = alloca i8
+  enum { A1, B1 } __attribute__((mode(QI))) x1 = A1;
+
+  // CHECK: [[X2:%.+]] = alloca i16
+  enum { A2, B2 } x2 __attribute__((mode(HI))) = B2;
+
+  // CHECK: [[X3:%.+]] = alloca i32
+  typedef enum { A3, B3 } __attribute__((mode(SI))) T3;
+  T3 x3 = A3;
+
+  // CHECK: [[X4:%.+]] = alloca i64
+  typedef enum { A4, B4 } T4 __attribute__((mode(DI)));
+  T4 x4 = B4;
+
+  // CHECK: [[X5:%.+]] = alloca i8
+  typedef enum __attribute__((mode(QI))) { A5, B5 } T5;
+  T5 x5 = A5;
+
+  // CHECK: [[X6:%.+]] = alloca i8
+  typedef enum X __attribute__((mode(QI))) T6;
+  T6 x6;
+
+  // CHECK: [[X7:%.+]] = alloca i128
+  enum { A7, B7 } __attribute__((mode(TI))) x7 = A7;
+
+  // CHECK: [[X8:%.+]] = alloca i8
+  enum __attribute__((mode(QI))) { A8, B8 } x8 = B8;
+
+  // CHECK: store i8 0, i8* [[X1]]
+  // CHECK: store i16 1, i16* [[X2]]
+  // CHECK: store i32 0, i32* [[X3]]
+  // CHECK: store i64 1, i64* [[X4]]
+  // CHECK: store i8 0, i8* [[X5]]
+  // CHECK: store i128 0, i128* [[X7]]
+  // CHECK: store i8 1, i8* [[X8]]
+
+  return x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8;
+}
diff --git a/test/CodeGen/attr-target-x86-mmx.c b/test/CodeGen/attr-target-x86-mmx.c
index 6720c6b7466ca..412e8e93af9db 100644
--- a/test/CodeGen/attr-target-x86-mmx.c
+++ b/test/CodeGen/attr-target-x86-mmx.c
@@ -19,4 +19,4 @@ void __attribute__((target("sse"))) shift(__m64 a, __m64 b, int c) {
   _mm_srai_pi32(a, c);
 }
 
-// CHECK: "target-features"="+mmx,+sse"
+// CHECK: "target-features"="+mmx,+sse,+x87"
diff --git a/test/CodeGen/attr-target-x86.c b/test/CodeGen/attr-target-x86.c
index 58e33d1cbfd84..7557ec7acdd50 100644
--- a/test/CodeGen/attr-target-x86.c
+++ b/test/CodeGen/attr-target-x86.c
@@ -18,6 +18,8 @@ int __attribute__((target("no-aes, arch=ivybridge"))) qax(int a) { return 4; }
 
 int __attribute__((target("no-mmx"))) qq(int a) { return 40; }
 
+int __attribute__((target("arch=lakemont"))) lake(int a) { return 4; }
+
 // Check that we emit the additional subtarget and cpu features for foo and not for baz or bar.
 // CHECK: baz{{.*}} #0
 // CHECK: foo{{.*}} #1
@@ -31,9 +33,11 @@ int __attribute__((target("no-mmx"))) qq(int a) { return 40; }
 // CHECK: qux{{.*}} #1
 // CHECK: qax{{.*}} #4
 // CHECK: qq{{.*}} #5
-// CHECK: #0 = {{.*}}"target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2"
-// CHECK: #1 = {{.*}}"target-cpu"="ivybridge" "target-features"="+aes,+avx,+cx16,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+xsave,+xsaveopt"
-// CHECK: #2 = {{.*}}"target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,-aes,-avx,-avx2,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512pf,-avx512vl,-f16c,-fma,-fma4,-pclmul,-sha,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-xop,-xsave,-xsaveopt"
-// CHECK: #3 = {{.*}}"target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3"
-// CHECK: #4 = {{.*}}"target-cpu"="ivybridge" "target-features"="+avx,+cx16,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+xsave,+xsaveopt,-aes"
-// CHECK: #5 = {{.*}}"target-cpu"="x86-64" "target-features"="+fxsr,+sse,+sse2,-3dnow,-3dnowa,-mmx"
+// CHECK: lake{{.*}} #6
+// CHECK: #0 = {{.*}}"target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87"
+// CHECK: #1 = {{.*}}"target-cpu"="ivybridge" "target-features"="+aes,+avx,+cx16,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt"
+// CHECK: #2 = {{.*}}"target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+x87,-aes,-avx,-avx2,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vl,-f16c,-fma,-fma4,-pclmul,-sha,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-xop,-xsave,-xsaveopt"
+// CHECK: #3 = {{.*}}"target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87"
+// CHECK: #4 = {{.*}}"target-cpu"="ivybridge" "target-features"="+avx,+cx16,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt,-aes"
+// CHECK: #5 = {{.*}}"target-cpu"="x86-64" "target-features"="+fxsr,+sse,+sse2,+x87,-3dnow,-3dnowa,-mmx"
+// CHECK: #6 = {{.*}}"target-cpu"="lakemont" "target-features"="+mmx,+sse,+sse2"
diff --git a/test/CodeGen/attr-target-x87-softfp.c b/test/CodeGen/attr-target-x87-softfp.c
new file mode 100644
index 0000000000000..16b7cfe8277fe
--- /dev/null
+++ b/test/CodeGen/attr-target-x87-softfp.c
@@ -0,0 +1,16 @@
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -target-cpu x86-64 -emit-llvm %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=HARD
+// RUN: %clang_cc1 -msoft-float -triple x86_64-linux-gnu -target-cpu x86-64 -emit-llvm %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=SOFT
+
+int __attribute__((target("x87"))) foo(int a) { return 4; }
+int __attribute__((target("no-x87"))) bar(int a) { return 4; }
+
+// CHECK: foo{{.*}} #0
+// CHECK: bar{{.*}} #1
+
+// CHECK: #0 = {{.*}}"target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87"
+// HARD: "use-soft-float"="false"
+// SOFT: "use-soft-float"="true"
+
+// CHECK: #1 = {{.*}}"target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,-x87"
+// HARD: "use-soft-float"="false"
+// SOFT: "use-soft-float"="true"
diff --git a/test/CodeGen/attr-used.c b/test/CodeGen/attr-used.c
index bc92b9435b32a..de38b51523a03 100644
--- a/test/CodeGen/attr-used.c
+++ b/test/CodeGen/attr-used.c
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -emit-llvm -o %t %s
+// RUN: grep '@llvm.used = .*@a0' %t
 // RUN: grep '@llvm.used = .*@g0' %t
 // RUN: grep '@llvm.used = .*@f0' %t
 // RUN: grep '@llvm.used = .*@f1.l0' %t
@@ -12,3 +13,6 @@ static void __attribute__((used)) f0(void) {
 void f1() { 
   static int l0 __attribute__((used)) = 5225; 
 }
+
+__attribute__((used)) int a0;
+void pr27535() { (void)a0; }
diff --git a/test/CodeGen/attr-x86-interrupt.c b/test/CodeGen/attr-x86-interrupt.c
new file mode 100644
index 0000000000000..0aca1f5b9c9cc
--- /dev/null
+++ b/test/CodeGen/attr-x86-interrupt.c
@@ -0,0 +1,34 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu %s -emit-llvm -o - | FileCheck %s --check-prefix=X86_64_LINUX
+// RUN: %clang_cc1 -triple i386-unknown-linux-gnu %s -emit-llvm -o - | FileCheck %s --check-prefix=X86_LINUX
+// RUN: %clang_cc1 -triple x86_64-pc-win32 %s -emit-llvm -o - | FileCheck %s --check-prefix=X86_64_WIN
+// RUN: %clang_cc1 -triple i386-pc-win32 %s -emit-llvm -o - | FileCheck %s --check-prefix=X86_WIN
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnux32 %s -emit-llvm -o - | FileCheck %s --check-prefix=X86_64_LINUX
+
+#ifdef __x86_64__
+typedef __UINT64_TYPE__ uword;
+#else
+typedef __UINT32_TYPE__ uword;
+#endif
+
+__attribute__((interrupt)) void foo7(int *a, uword b) {}
+__attribute__((interrupt)) void foo8(int *a) {}
+// X86_64_LINUX: @llvm.used = appending global [2 x i8*] [i8* bitcast (void (i32*, i64)* @foo7 to i8*), i8* bitcast (void (i32*)* @foo8 to i8*)], section "llvm.metadata"
+// X86_64_LINUX: define x86_intrcc void @foo7(i32* %{{.+}}, i64 %{{.+}})
+// X86_64_LINUX: define x86_intrcc void @foo8(i32* %{{.+}})
+// X86_64_LINUX: "disable-tail-calls"="true"
+// X86_64_LINUX-NOT: "disable-tail-calls"="false"
+// X86_LINUX: @llvm.used = appending global [2 x i8*] [i8* bitcast (void (i32*, i32)* @foo7 to i8*), i8* bitcast (void (i32*)* @foo8 to i8*)], section "llvm.metadata"
+// X86_LINUX: define x86_intrcc void @foo7(i32* %{{.+}}, i32 %{{.+}})
+// X86_LINUX: define x86_intrcc void @foo8(i32* %{{.+}})
+// X86_LINUX: "disable-tail-calls"="true"
+// X86_LINUX-NOT: "disable-tail-calls"="false"
+// X86_64_WIN: @llvm.used = appending global [2 x i8*] [i8* bitcast (void (i32*, i64)* @foo7 to i8*), i8* bitcast (void (i32*)* @foo8 to i8*)], section "llvm.metadata"
+// X86_64_WIN: define x86_intrcc void @foo7(i32* %{{.+}}, i64 %{{.+}})
+// X86_64_WIN: define x86_intrcc void @foo8(i32* %{{.+}})
+// X86_64_Win: "disable-tail-calls"="true"
+// X86_64_Win-NOT: "disable-tail-calls"="false"
+// X86_WIN: @llvm.used = appending global [2 x i8*] [i8* bitcast (void (i32*, i32)* @foo7 to i8*), i8* bitcast (void (i32*)* @foo8 to i8*)], section "llvm.metadata"
+// X86_WIN: define x86_intrcc void @foo7(i32* %{{.+}}, i32 %{{.+}})
+// X86_WIN: define x86_intrcc void @foo8(i32* %{{.+}})
+// X86_Win: "disable-tail-calls"="true"
+// X86_Win-NOT: "disable-tail-calls"="false"
diff --git a/test/CodeGen/avx-builtins.c b/test/CodeGen/avx-builtins.c
index ee0f58fc716f4..bf3e8cc5db60b 100644
--- a/test/CodeGen/avx-builtins.c
+++ b/test/CodeGen/avx-builtins.c
@@ -1,154 +1,1323 @@
-// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx -emit-llvm -o - -Werror | FileCheck %s
+// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx -fno-signed-char -emit-llvm -o - -Werror | FileCheck %s
 
 // Don't include mm_malloc.h, it's system specific.
 #define __MM_MALLOC_H
 
-#include <immintrin.h>
+#include <x86intrin.h>
 
-//
-// Test LLVM IR codegen of shuffle instructions
-//
+// NOTE: This should match the tests in llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
 
-__m256 test__mm256_loadu_ps(void* p) {
-  // CHECK: load <8 x float>, <8 x float>* %{{.*}}, align 1
-  return _mm256_loadu_ps(p);
+__m256d test_mm256_add_pd(__m256d A, __m256d B) {
+  // CHECK-LABEL: test_mm256_add_pd
+  // CHECK: fadd <4 x double>
+  return _mm256_add_pd(A, B);
 }
 
-__m256d test__mm256_loadu_pd(void* p) {
-  // CHECK: load <4 x double>, <4 x double>* %{{.*}}, align 1
-  return _mm256_loadu_pd(p);
+__m256 test_mm256_add_ps(__m256 A, __m256 B) {
+  // CHECK-LABEL: test_mm256_add_ps
+  // CHECK: fadd <8 x float>
+  return _mm256_add_ps(A, B);
 }
 
-__m256i test__mm256_loadu_si256(void* p) {
-  // CHECK: load <4 x i64>, <4 x i64>* %{{.+}}, align 1
-  return _mm256_loadu_si256(p);
+__m256d test_mm256_addsub_pd(__m256d A, __m256d B) {
+  // CHECK-LABEL: test_mm256_addsub_pd
+  // CHECK: call <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}})
+  return _mm256_addsub_pd(A, B);
 }
 
-__m128i test_mm_cmpestrm(__m128i A, int LA, __m128i B, int LB) {
-  // CHECK: @llvm.x86.sse42.pcmpestrm128
-  return _mm_cmpestrm(A, LA, B, LB, 7);
+__m256 test_mm256_addsub_ps(__m256 A, __m256 B) {
+  // CHECK-LABEL: test_mm256_addsub_ps
+  // CHECK: call <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}})
+  return _mm256_addsub_ps(A, B);
 }
 
-int test_mm_cmpestri(__m128i A, int LA, __m128i B, int LB) {
-  // CHECK: @llvm.x86.sse42.pcmpestri128
-  return _mm_cmpestri(A, LA, B, LB, 7);
+__m256d test_mm256_and_pd(__m256d A, __m256d B) {
+  // CHECK-LABEL: test_mm256_and_pd
+  // CHECK: and <4 x i64>
+  return _mm256_and_pd(A, B);
 }
 
-int test_mm_cmpestra(__m128i A, int LA, __m128i B, int LB) {
-  // CHECK: @llvm.x86.sse42.pcmpestria128
-  return _mm_cmpestra(A, LA, B, LB, 7);
+__m256 test_mm256_and_ps(__m256 A, __m256 B) {
+  // CHECK-LABEL: test_mm256_and_ps
+  // CHECK: and <8 x i32>
+  return _mm256_and_ps(A, B);
 }
 
-int test_mm_cmpestrc(__m128i A, int LA, __m128i B, int LB) {
-  // CHECK: @llvm.x86.sse42.pcmpestric128
-  return _mm_cmpestrc(A, LA, B, LB, 7);
+__m256d test_mm256_andnot_pd(__m256d A, __m256d B) {
+  // CHECK-LABEL: test_mm256_andnot_pd
+  // CHECK: xor <4 x i64> %{{.*}}, <i64 -1, i64 -1, i64 -1, i64 -1>
+  // CHECK: and <4 x i64>
+  return _mm256_andnot_pd(A, B);
 }
 
-int test_mm_cmpestro(__m128i A, int LA, __m128i B, int LB) {
-  // CHECK: @llvm.x86.sse42.pcmpestrio128
-  return _mm_cmpestro(A, LA, B, LB, 7);
+__m256 test_mm256_andnot_ps(__m256 A, __m256 B) {
+  // CHECK-LABEL: test_mm256_andnot_ps
+  // CHECK: xor <8 x i32> %{{.*}}, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+  // CHECK: and <8 x i32>
+  return _mm256_andnot_ps(A, B);
 }
 
-int test_mm_cmpestrs(__m128i A, int LA, __m128i B, int LB) {
-  // CHECK: @llvm.x86.sse42.pcmpestris128
-  return _mm_cmpestrs(A, LA, B, LB, 7);
+__m256d test_mm256_blend_pd(__m256d A, __m256d B) {
+  // CHECK-LABEL: test_mm256_blend_pd
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+  return _mm256_blend_pd(A, B, 0x35);
 }
 
-int test_mm_cmpestrz(__m128i A, int LA, __m128i B, int LB) {
-  // CHECK: @llvm.x86.sse42.pcmpestriz128
-  return _mm_cmpestrz(A, LA, B, LB, 7);
+__m256 test_mm256_blend_ps(__m256 A, __m256 B) {
+  // CHECK-LABEL: test_mm256_blend_ps
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 13, i32 6, i32 7>
+  return _mm256_blend_ps(A, B, 0x35);
 }
 
-__m128i test_mm_cmpistrm(__m128i A, __m128i B) {
-  // CHECK: @llvm.x86.sse42.pcmpistrm128
-  return _mm_cmpistrm(A, B, 7);
+__m256d test_mm256_blendv_pd(__m256d V1, __m256d V2, __m256d V3) {
+  // CHECK-LABEL: test_mm256_blendv_pd
+  // CHECK: call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}})
+  return _mm256_blendv_pd(V1, V2, V3);
 }
 
-int test_mm_cmpistri(__m128i A, __m128i B) {
-  // CHECK: @llvm.x86.sse42.pcmpistri128
-  return _mm_cmpistri(A, B, 7);
+__m256 test_mm256_blendv_ps(__m256 V1, __m256 V2, __m256 V3) {
+  // CHECK-LABEL: test_mm256_blendv_ps
+  // CHECK: call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}})
+  return _mm256_blendv_ps(V1, V2, V3);
 }
 
-int test_mm_cmpistra(__m128i A, __m128i B) {
-  // CHECK: @llvm.x86.sse42.pcmpistria128
-  return _mm_cmpistra(A, B, 7);
+__m256d test_mm256_broadcast_pd(__m128d* A) {
+  // CHECK-LABEL: test_mm256_broadcast_pd
+  // CHECK: call <4 x double> @llvm.x86.avx.vbroadcastf128.pd.256(i8* %{{.*}})
+  return _mm256_broadcast_pd(A);
 }
 
-int test_mm_cmpistrc(__m128i A, __m128i B) {
-  // CHECK: @llvm.x86.sse42.pcmpistric128
-  return _mm_cmpistrc(A, B, 7);
+__m256 test_mm256_broadcast_ps(__m128* A) {
+  // CHECK-LABEL: test_mm256_broadcast_ps
+  // CHECK: call <8 x float> @llvm.x86.avx.vbroadcastf128.ps.256(i8* %{{.*}})
+  return _mm256_broadcast_ps(A);
 }
 
-int test_mm_cmpistro(__m128i A, __m128i B) {
-  // CHECK: @llvm.x86.sse42.pcmpistrio128
-  return _mm_cmpistro(A, B, 7);
+__m256d test_mm256_broadcast_sd(double* A) {
+  // CHECK-LABEL: test_mm256_broadcast_sd
+  // CHECK: load double, double* %{{.*}}
+  // CHECK: insertelement <4 x double> undef, double %{{.*}}, i32 0
+  // CHECK: insertelement <4 x double> %{{.*}}, double %{{.*}}, i32 1
+  // CHECK: insertelement <4 x double> %{{.*}}, double %{{.*}}, i32 2
+  // CHECK: insertelement <4 x double> %{{.*}}, double %{{.*}}, i32 3
+  return _mm256_broadcast_sd(A);
 }
 
-int test_mm_cmpistrs(__m128i A, __m128i B) {
-  // CHECK: @llvm.x86.sse42.pcmpistris128
-  return _mm_cmpistrs(A, B, 7);
+__m128d test_mm_broadcast_ss(float* A) {
+  // CHECK-LABEL: test_mm_broadcast_ss
+  // CHECK: load float, float* %{{.*}}
+  // CHECK: insertelement <4 x float> undef, float %{{.*}}, i32 0
+  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 1
+  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 2
+  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 3
+  return _mm_broadcast_ss(A);
 }
 
-int test_mm_cmpistrz(__m128i A, __m128i B) {
-  // CHECK: @llvm.x86.sse42.pcmpistriz128
-  return _mm_cmpistrz(A, B, 7);
+__m256d test_mm256_broadcast_ss(float* A) {
+  // CHECK-LABEL: test_mm256_broadcast_ss
+  // CHECK: load float, float* %{{.*}}
+  // CHECK: insertelement <8 x float> undef, float %{{.*}}, i32 0
+  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 1
+  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 2
+  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 3
+  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 4
+  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 5
+  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 6
+  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 7
+  return _mm256_broadcast_ss(A);
 }
 
-int test_extract_epi32(__m256i __a) {
-  // CHECK-LABEL: @test_extract_epi32
-  // CHECK: [[SHIFT1:%[^ ]+]] = and i32 %{{.*}}, 7
-  // CHECK: extractelement <8 x i32> %{{.*}}, i32 [[SHIFT1]]
-  return _mm256_extract_epi32(__a, 8);
+__m256 test_mm256_castpd_ps(__m256d A) {
+  // CHECK-LABEL: test_mm256_castpd_ps
+  // CHECK: bitcast <4 x double> %{{.*}} to <8 x float>
+  return _mm256_castpd_ps(A);
 }
 
-int test_extract_epi16(__m256i __a) {
-  // CHECK-LABEL: @test_extract_epi16
-  // CHECK: [[SHIFT2:%[^ ]+]] = and i32 %{{.*}}, 15
-  // CHECK: extractelement <16 x i16> %{{.*}}, i32 [[SHIFT2]]
-  return _mm256_extract_epi16(__a, 16);
+__m256i test_mm256_castpd_si256(__m256d A) {
+  // CHECK-LABEL: test_mm256_castpd_si256
+  // CHECK: bitcast <4 x double> %{{.*}} to <4 x i64>
+  return _mm256_castpd_si256(A);
 }
 
-int test_extract_epi8(__m256i __a) {
-  // CHECK-LABEL: @test_extract_epi8
-  // CHECK: [[SHIFT3:%[^ ]+]] = and i32 %{{.*}}, 31
-  // CHECK: extractelement <32 x i8> %{{.*}}, i32 [[SHIFT3]]
-  return _mm256_extract_epi8(__a, 32);
+__m256d test_mm256_castpd128_pd256(__m128d A) {
+  // CHECK-LABEL: test_mm256_castpd128_pd256
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  return _mm256_castpd128_pd256(A);
 }
 
-__m256d test_256_blend_pd(__m256d __a, __m256d __b) {
-  // CHECK-LABEL: @test_256_blend_pd
-  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
-  return _mm256_blend_pd(__a, __b, 0x35);
+__m128d test_mm256_castpd256_pd128(__m256d A) {
+  // CHECK-LABEL: test_mm256_castpd256_pd128
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <2 x i32> <i32 0, i32 1>
+  return _mm256_castpd256_pd128(A);
 }
 
-__m256 test_256_blend_ps(__m256 __a, __m256 __b) {
-  // CHECK-LABEL: @test_256_blend_ps
-  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 13, i32 6, i32 7>
-  return _mm256_blend_ps(__a, __b, 0x35);
+__m256d test_mm256_castps_pd(__m256 A) {
+  // CHECK-LABEL: test_mm256_castps_pd
+  // CHECK: bitcast <8 x float> %{{.*}} to <4 x double>
+  return _mm256_castps_pd(A);
+}
+
+__m256i test_mm256_castps_si256(__m256 A) {
+  // CHECK-LABEL: test_mm256_castps_si256
+  // CHECK: bitcast <8 x float> %{{.*}} to <4 x i64>
+  return _mm256_castps_si256(A);
+}
+
+__m256 test_mm256_castps128_ps256(__m128 A) {
+  // CHECK-LABEL: test_mm256_castps128_ps256
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+  return _mm256_castps128_ps256(A);
+}
+
+__m128 test_mm256_castps256_ps128(__m256 A) {
+  // CHECK-LABEL: test_mm256_castps256_ps128
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  return _mm256_castps256_ps128(A);
+}
+
+__m256i test_mm256_castsi128_si256(__m128i A) {
+  // CHECK-LABEL: test_mm256_castsi128_si256
+  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  return _mm256_castsi128_si256(A);
+}
+
+__m256d test_mm256_castsi256_pd(__m256i A) {
+  // CHECK-LABEL: test_mm256_castsi256_pd
+  // CHECK: bitcast <4 x i64> %{{.*}} to <4 x double>
+  return _mm256_castsi256_pd(A);
+}
+
+__m256 test_mm256_castsi256_ps(__m256i A) {
+  // CHECK-LABEL: test_mm256_castsi256_ps
+  // CHECK: bitcast <4 x i64> %{{.*}} to <8 x float>
+  return _mm256_castsi256_ps(A);
+}
+
+__m128i test_mm256_castsi256_si128(__m256i A) {
+  // CHECK-LABEL: test_mm256_castsi256_si128
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <2 x i32> <i32 0, i32 1>
+  return _mm256_castsi256_si128(A);
+}
+
+__m256d test_mm256_ceil_pd(__m256d x) {
+  // CHECK-LABEL: test_mm256_ceil_pd
+  // CHECK: call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %{{.*}}, i32 2)
+  return _mm256_ceil_pd(x);
+}
+
+__m256 test_mm_ceil_ps(__m256 x) {
+  // CHECK-LABEL: test_mm_ceil_ps
+  // CHECK: call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %{{.*}}, i32 2)
+  return _mm256_ceil_ps(x);
+}
+
+__m128d test_mm_cmp_pd(__m128d A, __m128d B) {
+  // CHECK-LABEL: test_mm_cmp_pd
+  // CHECK: call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 13)
+  return _mm_cmp_pd(A, B, _CMP_GE_OS);
+}
+
+__m256d test_mm256_cmp_pd(__m256d A, __m256d B) {
+  // CHECK-LABEL: test_mm256_cmp_pd
+  // CHECK: call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}}, i8 13)
+  return _mm256_cmp_pd(A, B, _CMP_GE_OS);
+}
+
+__m128 test_mm_cmp_ps(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_cmp_ps
+  // CHECK: call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 13)
+  return _mm_cmp_ps(A, B, _CMP_GE_OS);
+}
+
+__m256 test_mm256_cmp_ps(__m256d A, __m256d B) {
+  // CHECK-LABEL: test_mm256_cmp_ps
+  // CHECK: call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}}, i8 13)
+  return _mm256_cmp_ps(A, B, _CMP_GE_OS);
+}
+
+__m128d test_mm_cmp_sd(__m128d A, __m128d B) {
+  // CHECK-LABEL: test_mm_cmp_sd
+  // CHECK: call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 13)
+  return _mm_cmp_sd(A, B, _CMP_GE_OS);
+}
+
+__m128 test_mm_cmp_ss(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_cmp_ss
+  // CHECK: call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 13)
+  return _mm_cmp_ss(A, B, _CMP_GE_OS);
+}
+
+__m256d test_mm256_cvtepi32_pd(__m128i A) {
+  // CHECK-LABEL: test_mm256_cvtepi32_pd
+  // CHECK: sitofp <4 x i32> %{{.*}} to <4 x double>
+  return _mm256_cvtepi32_pd(A);
+}
+
+__m256 test_mm256_cvtepi32_ps(__m256i A) {
+  // CHECK-LABEL: test_mm256_cvtepi32_ps
+  // CHECK: call <8 x float> @llvm.x86.avx.cvtdq2.ps.256(<8 x i32> %{{.*}})
+  return _mm256_cvtepi32_ps(A);
+}
+
+__m128i test_mm256_cvtpd_epi32(__m256d A) {
+  // CHECK-LABEL: test_mm256_cvtpd_epi32
+  // CHECK: call <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double> %{{.*}})
+  return _mm256_cvtpd_epi32(A);
+}
+
+__m128 test_mm256_cvtpd_ps(__m256d A) {
+  // CHECK-LABEL: test_mm256_cvtpd_ps
+  // CHECK: call <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double> %{{.*}})
+  return _mm256_cvtpd_ps(A);
+}
+
+__m256i test_mm256_cvtps_epi32(__m256 A) {
+  // CHECK-LABEL: test_mm256_cvtps_epi32
+  // CHECK: call <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float> %{{.*}})
+  return _mm256_cvtps_epi32(A);
+}
+
+__m256d test_mm256_cvtps_pd(__m128 A) {
+  // CHECK-LABEL: test_mm256_cvtps_pd
+  // CHECK: fpext <4 x float> %{{.*}} to <4 x double>
+  return _mm256_cvtps_pd(A);
+}
+
+__m128i test_mm256_cvttpd_epi32(__m256d A) {
+  // CHECK-LABEL: test_mm256_cvttpd_epi32
+  // CHECK: fptosi <4 x double> %{{.*}} to <4 x i32>
+  return _mm256_cvttpd_epi32(A);
+}
+
+__m256i test_mm256_cvttps_epi32(__m256 A) {
+  // CHECK-LABEL: test_mm256_cvttps_epi32
+  // CHECK: fptosi <8 x float> %{{.*}} to <8 x i32>
+  return _mm256_cvttps_epi32(A);
+}
+
+__m256d test_mm256_div_pd(__m256d A, __m256d B) {
+  // CHECK-LABEL: test_mm256_div_pd
+  // CHECK: fdiv <4 x double>
+  return _mm256_div_pd(A, B);
+}
+
+__m256 test_mm256_div_ps(__m256 A, __m256 B) {
+  // CHECK-LABEL: test_mm256_div_ps
+  // CHECK: fdiv <8 x float>
+  return _mm256_div_ps(A, B);
+}
+
+__m256 test_mm256_dp_ps(__m256 A, __m256 B) {
+  // CHECK-LABEL: test_mm256_dp_ps
+  // CHECK: call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> {{.*}}, <8 x float> {{.*}}, i8 7)
+  return _mm256_dp_ps(A, B, 7);
+}
+
+int test_mm256_extract_epi8(__m256i A) {
+  // CHECK-LABEL: test_mm256_extract_epi8
+  // CHECK: and i32 %{{.*}}, 31
+  // CHECK: extractelement <32 x i8> %{{.*}}, i32 %{{.*}}
+  // CHECK: zext i8 %{{.*}} to i32
+  return _mm256_extract_epi8(A, 32);
+}
+
+int test_mm256_extract_epi16(__m256i A) {
+  // CHECK-LABEL: test_mm256_extract_epi16
+  // CHECK: and i32 %{{.*}}, 15
+  // CHECK: extractelement <16 x i16> %{{.*}}, i32 %{{.*}}
+  // CHECK: zext i16 %{{.*}} to i32
+  return _mm256_extract_epi16(A, 16);
+}
+
+int test_mm256_extract_epi32(__m256i A) {
+  // CHECK-LABEL: test_mm256_extract_epi32
+  // CHECK: and i32 %{{.*}}, 7
+  // CHECK: extractelement <8 x i32> %{{.*}}, i32 %{{.*}}
+  return _mm256_extract_epi32(A, 8);
+}
+
+long long test_mm256_extract_epi64(__m256i A) {
+  // CHECK-LABEL: test_mm256_extract_epi64
+  // CHECK: and i32 %{{.*}}, 3
+  // CHECK: extractelement <4 x i64> %{{.*}}, i32 %{{.*}}
+  return _mm256_extract_epi64(A, 5);
+}
+
+__m128d test_mm256_extractf128_pd(__m256d A) {
+  // CHECK-LABEL: test_mm256_extractf128_pd
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 2, i32 3>
+  return _mm256_extractf128_pd(A, 1);
+}
+
+__m128 test_mm256_extractf128_ps(__m256 A) {
+  // CHECK-LABEL: test_mm256_extractf128_ps
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  return _mm256_extractf128_ps(A, 1);
+}
+
+__m128i test_mm256_extractf128_si256(__m256i A) {
+  // CHECK-LABEL: test_mm256_extractf128_si256
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+  return _mm256_extractf128_si256(A, 1);
+}
+
+__m256d test_mm256_floor_pd(__m256d x) {
+  // CHECK-LABEL: test_mm256_floor_pd
+  // CHECK: call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %{{.*}}, i32 1)
+  return _mm256_floor_pd(x);
+}
+
+__m256 test_mm_floor_ps(__m256 x) {
+  // CHECK-LABEL: test_mm_floor_ps
+  // CHECK: call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %{{.*}}, i32 1)
+  return _mm256_floor_ps(x);
+}
+
+__m256d test_mm256_hadd_pd(__m256d A, __m256d B) {
+  // CHECK-LABEL: test_mm256_hadd_pd
+  // CHECK: call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}})
+  return _mm256_hadd_pd(A, B);
+}
+
+__m256 test_mm256_hadd_ps(__m256 A, __m256 B) {
+  // CHECK-LABEL: test_mm256_hadd_ps
+  // CHECK: call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}})
+  return _mm256_hadd_ps(A, B);
+}
+
+__m256d test_mm256_hsub_pd(__m256d A, __m256d B) {
+  // CHECK-LABEL: test_mm256_hsub_pd
+  // CHECK: call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}})
+  return _mm256_hsub_pd(A, B);
+}
+
+__m256 test_mm256_hsub_ps(__m256 A, __m256 B) {
+  // CHECK-LABEL: test_mm256_hsub_ps
+  // CHECK: call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}})
+  return _mm256_hsub_ps(A, B);
+}
+
+__m256i test_mm256_insert_epi8(__m256i x, char b) {
+  // CHECK-LABEL: test_mm256_insert_epi8
+  // CHECK: and i32 %{{.*}}, 31
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 %{{.*}}
+  return _mm256_insert_epi8(x, b, 17);
+}
+
+__m256i test_mm256_insert_epi16(__m256i x, int b) {
+  // CHECK-LABEL: test_mm256_insert_epi16
+  // CHECK: and i32 %{{.*}}, 15
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 %{{.*}}
+  return _mm256_insert_epi16(x, b, 4);
+}
+
+__m256i test_mm256_insert_epi32(__m256i x, int b) {
+  // CHECK-LABEL: test_mm256_insert_epi32
+  // CHECK: and i32 %{{.*}}, 7
+  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}
+  return _mm256_insert_epi32(x, b, 5);
+}
+
+__m256i test_mm256_insert_epi64(__m256i x, long long b) {
+  // CHECK-LABEL: test_mm256_insert_epi64
+  // CHECK: and i32 %{{.*}}, 3
+  // CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i32 %{{.*}}
+  return _mm256_insert_epi64(x, b, 2);
+}
+
+__m256d test_mm256_insertf128_pd(__m256d A, __m128d B) {
+  // CHECK-LABEL: test_mm256_insertf128_pd
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+  return _mm256_insertf128_pd(A, B, 0);
+}
+
+__m256 test_mm256_insertf128_ps(__m256 A, __m128 B) {
+  // CHECK-LABEL: test_mm256_insertf128_ps
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+  return _mm256_insertf128_ps(A, B, 1);
+}
+
+__m256i test_mm256_insertf128_si256(__m256i A, __m128i B) {
+  // CHECK-LABEL: test_mm256_insertf128_si256
+  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+  return _mm256_insertf128_si256(A, B, 0);
+}
+
+__m256i test_mm256_lddqu_si256(__m256i* A) {
+  // CHECK-LABEL: test_mm256_lddqu_si256
+  // CHECK: call <32 x i8> @llvm.x86.avx.ldu.dq.256(i8* %{{.*}})
+  return _mm256_lddqu_si256(A);
+}
+
+__m256d test_mm256_load_pd(double* A) {
+  // CHECK-LABEL: test_mm256_load_pd
+  // CHECK: load <4 x double>, <4 x double>* %{{.*}}, align 32
+  return _mm256_load_pd(A);
+}
+
+__m256 test_mm256_load_ps(float* A) {
+  // CHECK-LABEL: test_mm256_load_ps
+  // CHECK: load <8 x float>, <8 x float>* %{{.*}}, align 32
+  return _mm256_load_ps(A);
+}
+
+__m256i test_mm256_load_si256(__m256i* A) {
+  // CHECK-LABEL: test_mm256_load_si256
+  // CHECK: load <4 x i64>, <4 x i64>* %{{.*}}, align 32
+  return _mm256_load_si256(A);
+}
+
+__m256d test_mm256_loadu_pd(double* A) {
+  // CHECK-LABEL: test_mm256_loadu_pd
+  // CHECK: load <4 x double>, <4 x double>* %{{.*}}, align 1{{$}}
+  return _mm256_loadu_pd(A);
+}
+
+__m256 test_mm256_loadu_ps(float* A) {
+  // CHECK-LABEL: test_mm256_loadu_ps
+  // CHECK: load <8 x float>, <8 x float>* %{{.*}}, align 1{{$}}
+  return _mm256_loadu_ps(A);
+}
+
+__m256i test_mm256_loadu_si256(__m256i* A) {
+  // CHECK-LABEL: test_mm256_loadu_si256
+  // CHECK: load <4 x i64>, <4 x i64>* %{{.+}}, align 1{{$}}
+  return _mm256_loadu_si256(A);
+}
+
+__m256 test_mm256_loadu2_m128(float* A, float* B) {
+  // CHECK-LABEL: test_mm256_loadu2_m128
+  // CHECK: load <4 x float>, <4 x float>* %{{.*}}, align 1{{$}}
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: load <4 x float>, <4 x float>* %{{.*}}, align 1{{$}}
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+  return _mm256_loadu2_m128(A, B);
+}
+
+__m256d test_mm256_loadu2_m128d(double* A, double* B) {
+  // CHECK-LABEL: test_mm256_loadu2_m128d
+  // CHECK: load <2 x double>, <2 x double>* %{{.*}}, align 1{{$}}
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  // CHECK: load <2 x double>, <2 x double>* %{{.*}}, align 1{{$}}
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  return _mm256_loadu2_m128d(A, B);
+}
+
+__m256i test_mm256_loadu2_m128i(__m128i* A, __m128i* B) {
+  // CHECK-LABEL: test_mm256_loadu2_m128i
+  // CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 1{{$}}
+  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  // CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 1{{$}}
+  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  return _mm256_loadu2_m128i(A, B);
+}
+
+__m128d test_mm_maskload_pd(double* A, __m128i B) {
+  // CHECK-LABEL: test_mm_maskload_pd
+  // CHECK: call <2 x double> @llvm.x86.avx.maskload.pd(i8* %{{.*}}, <2 x i64> %{{.*}})
+  return _mm_maskload_pd(A, B);
+}
+
+__m256d test_mm256_maskload_pd(double* A, __m256i B) {
+  // CHECK-LABEL: test_mm256_maskload_pd
+  // CHECK: call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %{{.*}}, <4 x i64> %{{.*}})
+  return _mm256_maskload_pd(A, B);
+}
+
+__m128 test_mm_maskload_ps(float* A, __m128i B) {
+  // CHECK-LABEL: test_mm_maskload_ps
+  // CHECK: call <4 x float> @llvm.x86.avx.maskload.ps(i8* %{{.*}}, <4 x i32> %{{.*}})
+  return _mm_maskload_ps(A, B);
+}
+
+__m256d test_mm256_maskload_ps(float* A, __m256i B) {
+  // CHECK-LABEL: test_mm256_maskload_ps
+  // CHECK: call <8 x float> @llvm.x86.avx.maskload.ps.256(i8* %{{.*}}, <8 x i32> %{{.*}})
+  return _mm256_maskload_ps(A, B);
+}
+
+void test_mm_maskstore_pd(double* A, __m128i B, __m128d C) {
+  // CHECK-LABEL: test_mm_maskstore_pd
+  // CHECK: call void @llvm.x86.avx.maskstore.pd(i8* %{{.*}}, <2 x i64> %{{.*}}, <2 x double> %{{.*}})
+  _mm_maskstore_pd(A, B, C);
+}
+
+void test_mm256_maskstore_pd(double* A, __m256i B, __m256d C) {
+  // CHECK-LABEL: test_mm256_maskstore_pd
+  // CHECK: call void @llvm.x86.avx.maskstore.pd.256(i8* %{{.*}}, <4 x i64> %{{.*}}, <4 x double> %{{.*}})
+  _mm256_maskstore_pd(A, B, C);
+}
+
+void test_mm_maskstore_ps(float* A, __m128i B, __m128 C) {
+  // CHECK-LABEL: test_mm_maskstore_ps
+  // CHECK: call void @llvm.x86.avx.maskstore.ps(i8* %{{.*}}, <4 x i32> %{{.*}}, <4 x float> %{{.*}})
+  _mm_maskstore_ps(A, B, C);
+}
+
+void test_mm256_maskstore_ps(float* A, __m256i B, __m256 C) {
+  // CHECK-LABEL: test_mm256_maskstore_ps
+  // CHECK: call void @llvm.x86.avx.maskstore.ps.256(i8* %{{.*}}, <8 x i32> %{{.*}}, <8 x float> %{{.*}})
+  _mm256_maskstore_ps(A, B, C);
+}
+
+__m256d test_mm256_max_pd(__m256d A, __m256d B) {
+  // CHECK-LABEL: test_mm256_max_pd
+  // CHECK: call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}})
+  return _mm256_max_pd(A, B);
+}
+
+__m256 test_mm256_max_ps(__m256 A, __m256 B) {
+  // CHECK-LABEL: test_mm256_max_ps
+  // CHECK: call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}})
+  return _mm256_max_ps(A, B);
+}
+
+__m256d test_mm256_min_pd(__m256d A, __m256d B) {
+  // CHECK-LABEL: test_mm256_min_pd
+  // CHECK: call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}})
+  return _mm256_min_pd(A, B);
+}
+
+__m256 test_mm256_min_ps(__m256 A, __m256 B) {
+  // CHECK-LABEL: test_mm256_min_ps
+  // CHECK: call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}})
+  return _mm256_min_ps(A, B);
+}
+
+__m256d test_mm256_movedup_pd(__m256d A) {
+  // CHECK-LABEL: test_mm256_movedup_pd
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  return _mm256_movedup_pd(A);
+}
+
+__m256 test_mm256_movehdup_ps(__m256 A) {
+  // CHECK-LABEL: test_mm256_movehdup_ps
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+  return _mm256_movehdup_ps(A);
+}
+
+__m256 test_mm256_moveldup_ps(__m256 A) {
+  // CHECK-LABEL: test_mm256_moveldup_ps
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+  return _mm256_moveldup_ps(A);
+}
+
+int test_mm256_movemask_pd(__m256d A) {
+  // CHECK-LABEL: test_mm256_movemask_pd
+  // CHECK: call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %{{.*}})
+  return _mm256_movemask_pd(A);
+}
+
+int test_mm256_movemask_ps(__m256 A) {
+  // CHECK-LABEL: test_mm256_movemask_ps
+  // CHECK: call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %{{.*}})
+  return _mm256_movemask_ps(A);
+}
+
+__m256d test_mm256_mul_pd(__m256d A, __m256d B) {
+  // CHECK-LABEL: test_mm256_mul_pd
+  // CHECK: fmul <4 x double>
+  return _mm256_mul_pd(A, B);
+}
+
+__m256 test_mm256_mul_ps(__m256 A, __m256 B) {
+  // CHECK-LABEL: test_mm256_mul_ps
+  // CHECK: fmul <8 x float>
+  return _mm256_mul_ps(A, B);
+}
+
+__m256d test_mm256_or_pd(__m256d A, __m256d B) {
+  // CHECK-LABEL: test_mm256_or_pd
+  // CHECK: or <4 x i64>
+  return _mm256_or_pd(A, B);
+}
+
+__m256 test_mm256_or_ps(__m256 A, __m256 B) {
+  // CHECK-LABEL: test_mm256_or_ps
+  // CHECK: or <8 x i32>
+  return _mm256_or_ps(A, B);
+}
+
+__m128d test_mm_permute_pd(__m128d A) {
+  // CHECK-LABEL: test_mm_permute_pd
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+  return _mm_permute_pd(A, 1);
+}
+
+__m256d test_mm256_permute_pd(__m256d A) {
+  // CHECK-LABEL: test_mm256_permute_pd
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+  return _mm256_permute_pd(A, 5);
 }
 
-__m256i test_256_insert_epi8(__m256i __a) {
-  // CHECK-LABEL: @test_256_insert_epi8
-  // CHECK: insertelement <32 x i8> {{.*}}, i8 {{.*}}, i32 {{.*}}
-  return _mm256_insert_epi8(__a, 42, 3);
+__m128 test_mm_permute_ps(__m128 A) {
+  // CHECK-LABEL: test_mm_permute_ps
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  return _mm_permute_ps(A, 0x1b);
 }
 
-__m256i test_256_insert_epi16(__m256i __a) {
-  // CHECK-LABEL: @test_256_insert_epi16
-  // CHECK: insertelement <16 x i16> {{.*}}, i16 {{.*}}, i32 {{.*}}
-  return _mm256_insert_epi16(__a, 42, 3);
+// Test case for PR12401
+__m128 test2_mm_permute_ps(__m128 a) {
+  // CHECK-LABEL: test2_mm_permute_ps
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> undef, <4 x i32> <i32 2, i32 1, i32 2, i32 3>
+  return _mm_permute_ps(a, 0xe6);
 }
 
-__m256i test_256_insert_epi32(__m256i __a) {
-  // CHECK-LABEL: @test_256_insert_epi32
-  // CHECK: insertelement <8 x i32> {{.*}}, i32 {{.*}}, i32 {{.*}}
-  return _mm256_insert_epi32(__a, 42, 3);
+__m256 test_mm256_permute_ps(__m256 A) {
+  // CHECK-LABEL: test_mm256_permute_ps
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+  return _mm256_permute_ps(A, 0x1b);
 }
 
-__m256i test_256_insert_epi64(__m256i __a) {
-  // CHECK-LABEL: @test_256_insert_epi64
-  // CHECK: insertelement <4 x i64> {{.*}}, i64 {{.*}}, i32 {{.*}}
-  return _mm256_insert_epi64(__a, 42, 3);
+__m256d test_mm256_permute2f128_pd(__m256d A, __m256d B) {
+  // CHECK-LABEL: test_mm256_permute2f128_pd
+  // CHECK: call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}}, i8 49)
+  return _mm256_permute2f128_pd(A, B, 0x31);
+}
+
+__m256 test_mm256_permute2f128_ps(__m256 A, __m256 B) {
+  // CHECK-LABEL: test_mm256_permute2f128_ps
+  // CHECK: call <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}}, i8 19)
+  return _mm256_permute2f128_ps(A, B, 0x13);
+}
+
+__m256i test_mm256_permute2f128_si256(__m256i A, __m256i B) {
+  // CHECK-LABEL: test_mm256_permute2f128_si256
+  // CHECK: call <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, i8 32)
+  return _mm256_permute2f128_si256(A, B, 0x20);
+}
+
+__m128d test_mm_permutevar_pd(__m128d A, __m128i B) {
+  // CHECK-LABEL: test_mm_permutevar_pd
+  // CHECK: call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %{{.*}}, <2 x i64> %{{.*}})
+  return _mm_permutevar_pd(A, B);
+}
+
+__m256d test_mm256_permutevar_pd(__m256d A, __m256i B) {
+  // CHECK-LABEL: test_mm256_permutevar_pd
+  // CHECK: call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %{{.*}}, <4 x i64> %{{.*}})
+  return _mm256_permutevar_pd(A, B);
+}
+
+__m128 test_mm_permutevar_ps(__m128 A, __m128i B) {
+  // CHECK-LABEL: test_mm_permutevar_ps
+  // CHECK: call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %{{.*}}, <4 x i32> %{{.*}})
+  return _mm_permutevar_ps(A, B);
+}
+
+__m256 test_mm256_permutevar_ps(__m256 A, __m256i B) {
+  // CHECK-LABEL: test_mm256_permutevar_ps
+  // CHECK: call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %{{.*}}, <8 x i32> %{{.*}})
+  return _mm256_permutevar_ps(A, B);
+}
+
+__m256 test_mm256_rcp_ps(__m256 A) {
+  // CHECK-LABEL: test_mm256_rcp_ps
+  // CHECK: call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %{{.*}})
+  return _mm256_rcp_ps(A);
+}
+
+__m256d test_mm256_round_pd(__m256d x) {
+  // CHECK-LABEL: test_mm256_round_pd
+  // CHECK: call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %{{.*}}, i32 4)
+  return _mm256_round_pd(x, 4);
+}
+
+__m256 test_mm256_round_ps(__m256 x) {
+  // CHECK-LABEL: test_mm256_round_ps
+  // CHECK: call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %{{.*}}, i32 4)
+  return _mm256_round_ps(x, 4);
+}
+
+__m256 test_mm256_rsqrt_ps(__m256 A) {
+  // CHECK-LABEL: test_mm256_rsqrt_ps
+  // CHECK: call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %{{.*}})
+  return _mm256_rsqrt_ps(A);
+}
+
+__m256i test_mm256_set_epi8(char A0, char A1, char A2, char A3, char A4, char A5, char A6, char A7,
+                            char A8, char A9, char A10, char A11, char A12, char A13, char A14, char A15,
+                            char A16, char A17, char A18, char A19, char A20, char A21, char A22, char A23,
+                            char A24, char A25, char A26, char A27, char A28, char A29, char A30, char A31) {
+  // CHECK-LABEL: test_mm256_set_epi8
+  // CHECK: insertelement <32 x i8> undef, i8 %{{.*}}, i32 0
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 1
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 2
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 3
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 4
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 5
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 6
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 7
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 8
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 9
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 10
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 11
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 12
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 13
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 14
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 15
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 16
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 17
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 18
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 19
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 20
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 21
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 22
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 23
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 24
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 25
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 26
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 27
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 28
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 29
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 30
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 31
+  return _mm256_set_epi8(A0, A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, A16, A17, A18, A19, A20, A21, A22, A23, A24, A25, A26, A27, A28, A29, A30, A31);
+}
+
+__m256i test_mm256_set_epi16(short A0, short A1, short A2, short A3, short A4, short A5, short A6, short A7,
+                             short A8, short A9, short A10, short A11, short A12, short A13, short A14, short A15) {
+  // CHECK-LABEL: test_mm256_set_epi16
+  // CHECK: insertelement <16 x i16> undef, i16 %{{.*}}, i32 0
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 1
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 2
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 3
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 4
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 5
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 6
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 7
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 8
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 9
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 10
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 11
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 12
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 13
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 14
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 15
+  return _mm256_set_epi16(A0, A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15);
+}
+
+__m256i test_mm256_set_epi32(int A0, int A1, int A2, int A3, int A4, int A5, int A6, int A7) {
+  // CHECK-LABEL: test_mm256_set_epi32
+  // CHECK: insertelement <8 x i32> undef, i32 %{{.*}}, i32 0
+  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 1
+  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 2
+  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 3
+  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 4
+  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 5
+  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 6
+  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 7
+  return _mm256_set_epi32(A0, A1, A2, A3, A4, A5, A6, A7);
+}
+
+__m256i test_mm256_set_epi64x(long long A0, long long A1, long long A2, long long A3) {
+  // CHECK-LABEL: test_mm256_set_epi64x
+  // CHECK: insertelement <4 x i64> undef, i64 %{{.*}}, i32 0
+  // CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i32 1
+  // CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i32 2
+  // CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i32 3
+  return _mm256_set_epi64x(A0, A1, A2, A3);
+}
+
+__m256 test_mm256_set_m128(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm256_set_m128
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  return _mm256_set_m128(A, B);
+}
+
+__m256d test_mm256_set_m128d(__m128d A, __m128d B) {
+  // CHECK-LABEL: test_mm256_set_m128d
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  return _mm256_set_m128d(A, B);
+}
+
+__m256i test_mm256_set_m128i(__m128i A, __m128i B) {
+  // CHECK-LABEL: test_mm256_set_m128i
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  return _mm256_set_m128i(A, B);
+}
+
+__m256d test_mm256_set_pd(double A0, double A1, double A2, double A3) {
+  // CHECK-LABEL: test_mm256_set_pd
+  // CHECK: insertelement <4 x double> undef, double %{{.*}}, i32 0
+  // CHECK: insertelement <4 x double> %{{.*}}, double %{{.*}}, i32 1
+  // CHECK: insertelement <4 x double> %{{.*}}, double %{{.*}}, i32 2
+  // CHECK: insertelement <4 x double> %{{.*}}, double %{{.*}}, i32 3
+  return _mm256_set_pd(A0, A1, A2, A3);
+}
+
+__m256 test_mm256_set_ps(float A0, float A1, float A2, float A3, float A4, float A5, float A6, float A7) {
+  // CHECK-LABEL: test_mm256_set_ps
+  // CHECK: insertelement <8 x float> undef, float %{{.*}}, i32 0
+  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 1
+  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 2
+  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 3
+  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 4
+  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 5
+  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 6
+  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 7
+  return _mm256_set_ps(A0, A1, A2, A3, A4, A5, A6, A7);
+}
+
+__m256i test_mm256_set1_epi8(char A) {
+  // CHECK-LABEL: test_mm256_set1_epi8
+  // CHECK: insertelement <32 x i8> undef, i8 %{{.*}}, i32 0
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 1
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 2
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 3
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 4
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 5
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 6
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 7
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 8
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 9
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 10
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 11
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 12
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 13
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 14
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 15
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 16
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 17
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 18
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 19
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 20
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 21
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 22
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 23
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 24
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 25
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 26
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 27
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 28
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 29
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 30
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 31
+  return _mm256_set1_epi8(A);
+}
+
+__m256i test_mm256_set1_epi16(short A) {
+  // CHECK-LABEL: test_mm256_set1_epi16
+  // CHECK: insertelement <16 x i16> undef, i16 %{{.*}}, i32 0
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 1
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 2
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 3
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 4
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 5
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 6
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 7
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 8
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 9
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 10
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 11
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 12
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 13
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 14
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 15
+  return _mm256_set1_epi16(A);
+}
+
+__m256i test_mm256_set1_epi32(int A) {
+  // CHECK-LABEL: test_mm256_set1_epi32
+  // CHECK: insertelement <8 x i32> undef, i32 %{{.*}}, i32 0
+  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 1
+  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 2
+  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 3
+  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 4
+  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 5
+  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 6
+  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 7
+  return _mm256_set1_epi32(A);
+}
+
+__m256i test_mm256_set1_epi64x(long long A) {
+  // CHECK-LABEL: test_mm256_set1_epi64x
+  // CHECK: insertelement <4 x i64> undef, i64 %{{.*}}, i32 0
+  // CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i32 1
+  // CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i32 2
+  // CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i32 3
+  return _mm256_set1_epi64x(A);
+}
+
+__m256d test_mm256_set1_pd(double A) {
+  // CHECK-LABEL: test_mm256_set1_pd
+  // CHECK: insertelement <4 x double> undef, double %{{.*}}, i32 0
+  // CHECK: insertelement <4 x double> %{{.*}}, double %{{.*}}, i32 1
+  // CHECK: insertelement <4 x double> %{{.*}}, double %{{.*}}, i32 2
+  // CHECK: insertelement <4 x double> %{{.*}}, double %{{.*}}, i32 3
+  return _mm256_set1_pd(A);
+}
+
+__m256 test_mm256_set1_ps(float A) {
+  // CHECK-LABEL: test_mm256_set1_ps
+  // CHECK: insertelement <8 x float> undef, float %{{.*}}, i32 0
+  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 1
+  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 2
+  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 3
+  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 4
+  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 5
+  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 6
+  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 7
+  return _mm256_set1_ps(A);
+}
+
+__m256i test_mm256_setr_epi8(char A0, char A1, char A2, char A3, char A4, char A5, char A6, char A7,
+                             char A8, char A9, char A10, char A11, char A12, char A13, char A14, char A15,
+                             char A16, char A17, char A18, char A19, char A20, char A21, char A22, char A23,
+                             char A24, char A25, char A26, char A27, char A28, char A29, char A30, char A31) {
+  // CHECK-LABEL: test_mm256_setr_epi8
+  // CHECK: insertelement <32 x i8> undef, i8 %{{.*}}, i32 0
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 1
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 2
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 3
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 4
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 5
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 6
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 7
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 8
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 9
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 10
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 11
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 12
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 13
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 14
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 15
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 16
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 17
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 18
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 19
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 20
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 21
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 22
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 23
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 24
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 25
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 26
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 27
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 28
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 29
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 30
+  // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 31
+  return _mm256_setr_epi8(A0, A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15, A16, A17, A18, A19, A20, A21, A22, A23, A24, A25, A26, A27, A28, A29, A30, A31);
+}
+
+__m256i test_mm256_setr_epi16(short A0, short A1, short A2, short A3, short A4, short A5, short A6, short A7,
+                              short A8, short A9, short A10, short A11, short A12, short A13, short A14, short A15) {
+  // CHECK-LABEL: test_mm256_setr_epi16
+  // CHECK: insertelement <16 x i16> undef, i16 %{{.*}}, i32 0
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 1
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 2
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 3
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 4
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 5
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 6
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 7
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 8
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 9
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 10
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 11
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 12
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 13
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 14
+  // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 15
+  return _mm256_setr_epi16(A0, A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, A11, A12, A13, A14, A15);
+}
+
+__m256i test_mm256_setr_epi32(int A0, int A1, int A2, int A3, int A4, int A5, int A6, int A7) {
+  // CHECK-LABEL: test_mm256_setr_epi32
+  // CHECK: insertelement <8 x i32> undef, i32 %{{.*}}, i32 0
+  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 1
+  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 2
+  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 3
+  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 4
+  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 5
+  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 6
+  // CHECK: insertelement <8 x i32> %{{.*}}, i32 %{{.*}}, i32 7
+  return _mm256_setr_epi32(A0, A1, A2, A3, A4, A5, A6, A7);
+}
+
+__m256i test_mm256_setr_epi64x(long long A0, long long A1, long long A2, long long A3) {
+  // CHECK-LABEL: test_mm256_setr_epi64x
+  // CHECK: insertelement <4 x i64> undef, i64 %{{.*}}, i32 0
+  // CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i32 1
+  // CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i32 2
+  // CHECK: insertelement <4 x i64> %{{.*}}, i64 %{{.*}}, i32 3
+  return _mm256_setr_epi64x(A0, A1, A2, A3);
+}
+
+__m256 test_mm256_setr_m128(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm256_setr_m128
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  return _mm256_setr_m128(A, B);
+}
+
+__m256d test_mm256_setr_m128d(__m128d A, __m128d B) {
+  // CHECK-LABEL: test_mm256_setr_m128d
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  return _mm256_setr_m128d(A, B);
+}
+
+__m256i test_mm256_setr_m128i(__m128i A, __m128i B) {
+  // CHECK-LABEL: test_mm256_setr_m128i
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  return _mm256_setr_m128i(A, B);
+}
+
+__m256d test_mm256_setr_pd(double A0, double A1, double A2, double A3) {
+  // CHECK-LABEL: test_mm256_setr_pd
+  // CHECK: insertelement <4 x double> undef, double %{{.*}}, i32 0
+  // CHECK: insertelement <4 x double> %{{.*}}, double %{{.*}}, i32 1
+  // CHECK: insertelement <4 x double> %{{.*}}, double %{{.*}}, i32 2
+  // CHECK: insertelement <4 x double> %{{.*}}, double %{{.*}}, i32 3
+  return _mm256_setr_pd(A0, A1, A2, A3);
+}
+
+__m256 test_mm256_setr_ps(float A0, float A1, float A2, float A3, float A4, float A5, float A6, float A7) {
+  // CHECK-LABEL: test_mm256_setr_ps
+  // CHECK: insertelement <8 x float> undef, float %{{.*}}, i32 0
+  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 1
+  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 2
+  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 3
+  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 4
+  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 5
+  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 6
+  // CHECK: insertelement <8 x float> %{{.*}}, float %{{.*}}, i32 7
+  return _mm256_setr_ps(A0, A1, A2, A3, A4, A5, A6, A7);
+}
+
+__m256d test_mm256_setzero_pd() {
+  // CHECK-LABEL: test_mm256_setzero_pd
+  // CHECK: store <4 x double> zeroinitializer
+  return _mm256_setzero_pd();
+}
+
+__m256 test_mm256_setzero_ps() {
+  // CHECK-LABEL: test_mm256_setzero_ps
+  // CHECK: store <8 x float> zeroinitializer
+  return _mm256_setzero_ps();
+}
+
+__m256i test_mm256_setzero_si256() {
+  // CHECK-LABEL: test_mm256_setzero_si256
+  // CHECK: store <4 x i64> zeroinitializer
+  return _mm256_setzero_si256();
+}
+
+__m256d test_mm256_shuffle_pd(__m256d A, __m256d B) {
+  // CHECK-LABEL: test_mm256_shuffle_pd
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  return _mm256_shuffle_pd(A, B, 0);
+}
+
+__m256 test_mm256_shuffle_ps(__m256 A, __m256 B) {
+  // CHECK-LABEL: test_mm256_shuffle_ps
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 0, i32 0, i32 8, i32 8, i32 4, i32 4, i32 12, i32 12>
+  return _mm256_shuffle_ps(A, B, 0);
+}
+
+__m256d test_mm256_sqrt_pd(__m256d A) {
+  // CHECK-LABEL: test_mm256_sqrt_pd
+  // CHECK: call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %{{.*}})
+  return _mm256_sqrt_pd(A);
+}
+
+__m256 test_mm256_sqrt_ps(__m256 A) {
+  // CHECK-LABEL: test_mm256_sqrt_ps
+  // CHECK: call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %{{.*}})
+  return _mm256_sqrt_ps(A);
+}
+
+void test_mm256_store_pd(double* A, __m256d B) {
+  // CHECK-LABEL: test_mm256_store_pd
+  // CHECK: store <4 x double> %{{.*}}, <4 x double>* %{{.*}}, align 32
+  _mm256_store_pd(A, B);
+}
+
+void test_mm256_store_ps(float* A, __m256 B) {
+  // CHECK-LABEL: test_mm256_store_ps
+  // CHECK: store <8 x float> %{{.*}}, <8 x float>* %{{.*}}, align 32
+  _mm256_store_ps(A, B);
+}
+
+void test_mm256_store_si256(__m256i* A, __m256i B) {
+  // CHECK-LABEL: test_mm256_store_si256
+  // CHECK: store <4 x i64> %{{.*}}, <4 x i64>* %{{.*}}, align 32
+  _mm256_store_si256(A, B);
+}
+
+void test_mm256_storeu_pd(double* A, __m256d B) {
+  // CHECK-LABEL: test_mm256_storeu_pd
+  // CHECK:   store <4 x double> %{{.*}}, <4 x double>* %{{.*}}, align 1{{$}}
+  // CHECK-NEXT: ret void
+  _mm256_storeu_pd(A, B);
+}
+
+void test_mm256_storeu_ps(float* A, __m256 B) {
+  // CHECK-LABEL: test_mm256_storeu_ps
+  // CHECK: store <8 x float> %{{.*}}, <8 x float>* %{{.*}}, align 1{{$}}
+  // CHECk-NEXT: ret void
+  _mm256_storeu_ps(A, B);
+}
+
+void test_mm256_storeu_si256(__m256i* A, __m256i B) {
+  // CHECK-LABEL: test_mm256_storeu_si256
+  // CHECK: store <4 x i64> %{{.*}}, <4 x i64>* %{{.*}}, align 1{{$}}
+  // CHECk-NEXT: ret void
+  _mm256_storeu_si256(A, B);
+}
+
+void test_mm256_storeu2_m128(float* A, float* B, __m256 C) {
+  // CHECK-LABEL: test_mm256_storeu2_m128
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  // CHECK: store <4 x float> %{{.*}}, <4 x float>* %{{.*}}, align 1{{$}}
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  // CHECK: store <4 x float> %{{.*}}, <4 x float>* %{{.*}}, align 1{{$}}
+  _mm256_storeu2_m128(A, B, C);
+}
+
+void test_mm256_storeu2_m128d(double* A, double* B, __m256d C) {
+  // CHECK-LABEL: test_mm256_storeu2_m128d
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <2 x i32> <i32 0, i32 1>
+  // CHECK: store <2 x double> %{{.*}}, <2 x double>* %{{.*}}, align 1{{$}}
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <2 x i32> <i32 2, i32 3>
+  // CHECK: store <2 x double> %{{.*}}, <2 x double>* %{{.*}}, align 1{{$}}
+  _mm256_storeu2_m128d(A, B, C);
+}
+
+void test_mm256_storeu2_m128i(__m128i* A, __m128i* B, __m256i C) {
+  // CHECK-LABEL: test_mm256_storeu2_m128i
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <2 x i32> <i32 0, i32 1>
+  // CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 1{{$}}
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+  // CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 1{{$}}
+  _mm256_storeu2_m128i(A, B, C);
+}
+
+void test_mm256_stream_pd(double* A, __m256d B) {
+  // CHECK-LABEL: test_mm256_stream_pd
+  // CHECK: store <4 x double> %{{.*}}, <4 x double>* %{{.*}}, align 32, !nontemporal
+  _mm256_stream_pd(A, B);
+}
+
+void test_mm256_stream_ps(float* A, __m256 B) {
+  // CHECK-LABEL: test_mm256_stream_ps
+  // CHECK: store <8 x float> %{{.*}}, <8 x float>* %{{.*}}, align 32, !nontemporal
+  _mm256_stream_ps(A, B);
+}
+
+void test_mm256_stream_si256(__m256i* A, __m256i B) {
+  // CHECK-LABEL: test_mm256_stream_si256
+  // CHECK: store <4 x i64> %{{.*}}, <4 x i64>* %{{.*}}, align 32, !nontemporal
+  _mm256_stream_si256(A, B);
+}
+
+__m256d test_mm256_sub_pd(__m256d A, __m256d B) {
+  // CHECK-LABEL: test_mm256_sub_pd
+  // CHECK: fsub <4 x double>
+  return _mm256_sub_pd(A, B);
+}
+
+__m256 test_mm256_sub_ps(__m256 A, __m256 B) {
+  // CHECK-LABEL: test_mm256_sub_ps
+  // CHECK: fsub <8 x float>
+  return _mm256_sub_ps(A, B);
+}
+
+int test_mm_testc_pd(__m128d A, __m128d B) {
+  // CHECK-LABEL: test_mm_testc_pd
+  // CHECK: call i32 @llvm.x86.avx.vtestc.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
+  return _mm_testc_pd(A, B);
+}
+
+int test_mm256_testc_pd(__m256d A, __m256d B) {
+  // CHECK-LABEL: test_mm256_testc_pd
+  // CHECK: call i32 @llvm.x86.avx.vtestc.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}})
+  return _mm256_testc_pd(A, B);
+}
+
+int test_mm_testc_ps(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_testc_ps
+  // CHECK: call i32 @llvm.x86.avx.vtestc.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+  return _mm_testc_ps(A, B);
+}
+
+int test_mm256_testc_ps(__m256 A, __m256 B) {
+  // CHECK-LABEL: test_mm256_testc_ps
+  // CHECK: call i32 @llvm.x86.avx.vtestc.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}})
+  return _mm256_testc_ps(A, B);
+}
+
+int test_mm256_testc_si256(__m256 A, __m256 B) {
+  // CHECK-LABEL: test_mm256_testc_si256
+  // CHECK: call i32 @llvm.x86.avx.ptestc.256(<4 x i64> %{{.*}}, <4 x i64> %{{.*}})
+  return _mm256_testc_si256(A, B);
+}
+
+int test_mm_testnzc_pd(__m128d A, __m128d B) {
+  // CHECK-LABEL: test_mm_testnzc_pd
+  // CHECK: call i32 @llvm.x86.avx.vtestnzc.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
+  return _mm_testnzc_pd(A, B);
+}
+
+int test_mm256_testnzc_pd(__m256d A, __m256d B) {
+  // CHECK-LABEL: test_mm256_testnzc_pd
+  // CHECK: call i32 @llvm.x86.avx.vtestnzc.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}})
+  return _mm256_testnzc_pd(A, B);
+}
+
+int test_mm_testnzc_ps(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_testnzc_ps
+  // CHECK: call i32 @llvm.x86.avx.vtestnzc.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+  return _mm_testnzc_ps(A, B);
+}
+
+int test_mm256_testnzc_ps(__m256 A, __m256 B) {
+  // CHECK-LABEL: test_mm256_testnzc_ps
+  // CHECK: call i32 @llvm.x86.avx.vtestnzc.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}})
+  return _mm256_testnzc_ps(A, B);
+}
+
+int test_mm256_testnzc_si256(__m256 A, __m256 B) {
+  // CHECK-LABEL: test_mm256_testnzc_si256
+  // CHECK: call i32 @llvm.x86.avx.ptestnzc.256(<4 x i64> %{{.*}}, <4 x i64> %{{.*}})
+  return _mm256_testnzc_si256(A, B);
+}
+
+int test_mm_testz_pd(__m128d A, __m128d B) {
+  // CHECK-LABEL: test_mm_testz_pd
+  // CHECK: call i32 @llvm.x86.avx.vtestz.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
+  return _mm_testz_pd(A, B);
+}
+
+int test_mm256_testz_pd(__m256d A, __m256d B) {
+  // CHECK-LABEL: test_mm256_testz_pd
+  // CHECK: call i32 @llvm.x86.avx.vtestz.pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}})
+  return _mm256_testz_pd(A, B);
+}
+
+int test_mm_testz_ps(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_testz_ps
+  // CHECK: call i32 @llvm.x86.avx.vtestz.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+  return _mm_testz_ps(A, B);
+}
+
+int test_mm256_testz_ps(__m256 A, __m256 B) {
+  // CHECK-LABEL: test_mm256_testz_ps
+  // CHECK: call i32 @llvm.x86.avx.vtestz.ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}})
+  return _mm256_testz_ps(A, B);
+}
+
+int test_mm256_testz_si256(__m256 A, __m256 B) {
+  // CHECK-LABEL: test_mm256_testz_si256
+  // CHECK: call i32 @llvm.x86.avx.ptestz.256(<4 x i64> %{{.*}}, <4 x i64> %{{.*}})
+  return _mm256_testz_si256(A, B);
 }
 
 __m256 test_mm256_undefined_ps() {
@@ -168,3 +1337,72 @@ __m256i test_mm256_undefined_si256() {
   // CHECK: ret <4 x i64> undef
   return _mm256_undefined_si256();
 }
+
+__m256d test_mm256_unpackhi_pd(__m256d A, __m256d B) {
+  // CHECK-LABEL: test_mm256_unpackhi_pd
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+  return _mm256_unpackhi_pd(A, B);
+}
+
+__m256 test_mm256_unpackhi_ps(__m256 A, __m256 B) {
+  // CHECK-LABEL: test_mm256_unpackhi_ps
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+  return _mm256_unpackhi_ps(A, B);
+}
+
+__m256d test_mm256_unpacklo_pd(__m256d A, __m256d B) {
+  // CHECK-LABEL: test_mm256_unpacklo_pd
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  return _mm256_unpacklo_pd(A, B);
+}
+
+__m256 test_mm256_unpacklo_ps(__m256 A, __m256 B) {
+  // CHECK-LABEL: test_mm256_unpacklo_ps
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+  return _mm256_unpacklo_ps(A, B);
+}
+
+__m256d test_mm256_xor_pd(__m256d A, __m256d B) {
+  // CHECK-LABEL: test_mm256_xor_pd
+  // CHECK: xor <4 x i64>
+  return _mm256_xor_pd(A, B);
+}
+
+__m256 test_mm256_xor_ps(__m256 A, __m256 B) {
+  // CHECK-LABEL: test_mm256_xor_ps
+  // CHECK: xor <8 x i32>
+  return _mm256_xor_ps(A, B);
+}
+
+void test_mm256_zeroall() {
+  // CHECK-LABEL: test_mm256_zeroall
+  // CHECK: call void @llvm.x86.avx.vzeroall()
+  return _mm256_zeroall();
+}
+
+void test_mm256_zeroupper() {
+  // CHECK-LABEL: test_mm256_zeroupper
+  // CHECK: call void @llvm.x86.avx.vzeroupper()
+  return _mm256_zeroupper();
+}
+
+double test_mm256_cvtsd_f64(__m256d __a)
+{
+ // CHECK-LABEL: @test_mm256_cvtsd_f64
+ // CHECK: extractelement <4 x double> %{{.*}}, i32 0
+ return _mm256_cvtsd_f64(__a);
+}
+
+int test_mm256_cvtsi256_si32(__m256i __a)
+{
+ // CHECK-LABEL: @test_mm256_cvtsi256_si32
+ // CHECK: extractelement <8 x i32> %{{.*}}, i32 0
+ return _mm256_cvtsi256_si32(__a);
+}
+
+float test_mm256_cvtss_f32(__m256 __a)
+{
+ // CHECK-LABEL: @test_mm256_cvtss_f32
+ // CHECK: extractelement <8 x float> %{{.*}}, i32 0
+ return _mm256_cvtss_f32(__a);
+}
diff --git a/test/CodeGen/avx2-builtins.c b/test/CodeGen/avx2-builtins.c
index 89981bb760f1f..1e71245373311 100644
--- a/test/CodeGen/avx2-builtins.c
+++ b/test/CodeGen/avx2-builtins.c
@@ -4,1036 +4,1223 @@
 // Don't include mm_malloc.h, it's system specific.
 #define __MM_MALLOC_H
 
-#include <immintrin.h>
+#include <x86intrin.h>
 
-__m256i test_mm256_mpsadbw_epu8(__m256i x, __m256i y) {
-  // CHECK: @llvm.x86.avx2.mpsadbw({{.*}}, {{.*}}, i8 3)
-  return _mm256_mpsadbw_epu8(x, y, 3);
-}
-
-__m256i test_mm256_sad_epu8(__m256i x, __m256i y) {
-  // CHECK: @llvm.x86.avx2.psad.bw
-  return _mm256_sad_epu8(x, y);
-}
+// NOTE: This should match the tests in llvm/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
 
 __m256i test_mm256_abs_epi8(__m256i a) {
-  // CHECK: @llvm.x86.avx2.pabs.b
+  // CHECK-LABEL: test_mm256_abs_epi8
+  // CHECK: call <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8> %{{.*}})
   return _mm256_abs_epi8(a);
 }
 
 __m256i test_mm256_abs_epi16(__m256i a) {
-  // CHECK: @llvm.x86.avx2.pabs.w
+  // CHECK-LABEL: test_mm256_abs_epi16
+  // CHECK: call <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16> %{{.*}})
   return _mm256_abs_epi16(a);
 }
 
 __m256i test_mm256_abs_epi32(__m256i a) {
-  // CHECK: @llvm.x86.avx2.pabs.d
+  // CHECK-LABEL: test_mm256_abs_epi32
+  // CHECK: call <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32> %{{.*}})
   return _mm256_abs_epi32(a);
 }
 
-__m256i test_mm256_packs_epi16(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.packsswb
-  return _mm256_packs_epi16(a, b);
-}
-
-__m256i test_mm256_packs_epi32(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.packssdw
-  return _mm256_packs_epi32(a, b);
-}
-
-__m256i test_mm256_packs_epu16(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.packuswb
-  return _mm256_packus_epi16(a, b);
-}
-
-__m256i test_mm256_packs_epu32(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.packusdw
-  return _mm256_packus_epi32(a, b);
-}
-
 __m256i test_mm256_add_epi8(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_add_epi8
   // CHECK: add <32 x i8>
   return _mm256_add_epi8(a, b);
 }
 
 __m256i test_mm256_add_epi16(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_add_epi16
   // CHECK: add <16 x i16>
   return _mm256_add_epi16(a, b);
 }
 
 __m256i test_mm256_add_epi32(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_add_epi32
   // CHECK: add <8 x i32>
   return _mm256_add_epi32(a, b);
 }
 
 __m256i test_mm256_add_epi64(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_add_epi64
   // CHECK: add <4 x i64>
   return _mm256_add_epi64(a, b);
 }
 
 __m256i test_mm256_adds_epi8(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.padds.b
+  // CHECK-LABEL: test_mm256_adds_epi8
+  // CHECK: call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> %{{.*}}, <32 x i8> %{{.*}})
   return _mm256_adds_epi8(a, b);
 }
 
 __m256i test_mm256_adds_epi16(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.padds.w
+  // CHECK-LABEL: test_mm256_adds_epi16
+  // CHECK: call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_adds_epi16(a, b);
 }
 
 __m256i test_mm256_adds_epu8(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.paddus.b
+  // CHECK-LABEL: test_mm256_adds_epu8
+  // CHECK: call <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8> %{{.*}}, <32 x i8> %{{.*}})
   return _mm256_adds_epu8(a, b);
 }
 
 __m256i test_mm256_adds_epu16(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.paddus.w
+  // CHECK-LABEL: test_mm256_adds_epu16
+  // CHECK: call <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_adds_epu16(a, b);
 }
 
 __m256i test_mm256_alignr_epi8(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_alignr_epi8
   // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49>
   return _mm256_alignr_epi8(a, b, 2);
 }
 
 __m256i test2_mm256_alignr_epi8(__m256i a, __m256i b) {
+  // CHECK-LABEL: test2_mm256_alignr_epi8
   // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> zeroinitializer, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48>
   return _mm256_alignr_epi8(a, b, 17);
 }
 
-__m256i test_mm256_sub_epi8(__m256i a, __m256i b) {
-  // CHECK: sub <32 x i8>
-  return _mm256_sub_epi8(a, b);
+__m256i test_mm256_and_si256(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_and_si256
+  // CHECK: and <4 x i64>
+  return _mm256_and_si256(a, b);
 }
 
-__m256i test_mm256_sub_epi16(__m256i a, __m256i b) {
-  // CHECK: sub <16 x i16>
-  return _mm256_sub_epi16(a, b);
+__m256i test_mm256_andnot_si256(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_andnot_si256
+  // CHECK: xor <4 x i64>
+  // CHECK: and <4 x i64>
+  return _mm256_andnot_si256(a, b);
 }
 
-__m256i test_mm256_sub_epi32(__m256i a, __m256i b) {
-  // CHECK: sub <8 x i32>
-  return _mm256_sub_epi32(a, b);
+__m256i test_mm256_avg_epu8(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_avg_epu8
+  // CHECK: call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %{{.*}}, <32 x i8> %{{.*}})
+  return _mm256_avg_epu8(a, b);
 }
 
-__m256i test_mm256_sub_epi64(__m256i a, __m256i b) {
-  // CHECK: sub <4 x i64>
-  return _mm256_sub_epi64(a, b);
+__m256i test_mm256_avg_epu16(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_avg_epu16
+  // CHECK: call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
+  return _mm256_avg_epu16(a, b);
 }
 
-__m256i test_mm256_subs_epi8(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.psubs.b
-  return _mm256_subs_epi8(a, b);
+// FIXME: We should also lower the __builtin_ia32_pblendw128 (and similar)
+// functions to this IR. In the future we could delete the corresponding
+// intrinsic in LLVM if it's not being used anymore.
+__m256i test_mm256_blend_epi16(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_blend_epi16
+  // CHECK-NOT: @llvm.x86.avx2.pblendw
+  // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 25, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  return _mm256_blend_epi16(a, b, 2);
 }
 
-__m256i test_mm256_subs_epi16(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.psubs.w
-  return _mm256_subs_epi16(a, b);
+__m128i test_mm_blend_epi32(__m128i a, __m128i b) {
+  // CHECK-LABEL: test_mm_blend_epi32
+  // CHECK-NOT: @llvm.x86.avx2.pblendd.128
+  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+  return _mm_blend_epi32(a, b, 0x35);
 }
 
-__m256i test_mm256_subs_epu8(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.psubus.b
-  return _mm256_subs_epu8(a, b);
+__m256i test_mm256_blend_epi32(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_blend_epi32
+  // CHECK-NOT: @llvm.x86.avx2.pblendd.256
+  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 13, i32 6, i32 7>
+  return _mm256_blend_epi32(a, b, 0x35);
 }
 
-__m256i test_mm256_subs_epu16(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.psubus.w
-  return _mm256_subs_epu16(a, b);
+__m256i test_mm256_blendv_epi8(__m256i a, __m256i b, __m256i m) {
+  // CHECK-LABEL: test_mm256_blendv_epi8
+  // CHECK: call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}})
+  return _mm256_blendv_epi8(a, b, m);
 }
 
-__m256i test_mm256_and_si256(__m256i a, __m256i b) {
-  // CHECK: and <4 x i64>
-  return _mm256_and_si256(a, b);
+__m128i test_mm_broadcastb_epi8(__m128i a) {
+  // CHECK-LABEL: test_mm_broadcastb_epi8
+  // CHECK-NOT: @llvm.x86.avx2.pbroadcastb.128
+  // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> zeroinitializer
+  return _mm_broadcastb_epi8(a);
 }
 
-__m256i test_mm256_andnot_si256(__m256i a, __m256i b) {
-  // CHECK: xor <4 x i64>
-  // CHECK: and <4 x i64>
-  return _mm256_andnot_si256(a, b);
+__m256i test_mm256_broadcastb_epi8(__m128i a) {
+  // CHECK-LABEL: test_mm256_broadcastb_epi8
+  // CHECK-NOT: @llvm.x86.avx2.pbroadcastb.256
+  // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <32 x i32> zeroinitializer
+  return _mm256_broadcastb_epi8(a);
 }
 
-__m256i test_mm256_or_si256(__m256i a, __m256i b) {
-  // CHECK: or <4 x i64>
-  return _mm256_or_si256(a, b);
+__m128i test_mm_broadcastd_epi32(__m128i a) {
+  // CHECK-LABEL: test_mm_broadcastd_epi32
+  // CHECK-NOT: @llvm.x86.avx2.pbroadcastd.128
+  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> zeroinitializer
+  return _mm_broadcastd_epi32(a);
 }
 
-__m256i test_mm256_xor_si256(__m256i a, __m256i b) {
-  // CHECK: xor <4 x i64>
-  return _mm256_xor_si256(a, b);
+__m256i test_mm256_broadcastd_epi32(__m128i a) {
+  // CHECK-LABEL: test_mm256_broadcastd_epi32
+  // CHECK-NOT: @llvm.x86.avx2.pbroadcastd.256
+  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <8 x i32> zeroinitializer
+  return _mm256_broadcastd_epi32(a);
 }
 
-__m256i test_mm256_avg_epu8(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.pavg.b
-  return _mm256_avg_epu8(a, b);
+__m128i test_mm_broadcastq_epi64(__m128i a) {
+  // CHECK-LABEL: test_mm_broadcastq_epi64
+  // CHECK-NOT: @llvm.x86.avx2.pbroadcastq.128
+  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i32> zeroinitializer
+  return _mm_broadcastq_epi64(a);
 }
 
-__m256i test_mm256_avg_epu16(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.pavg.w
-  return _mm256_avg_epu16(a, b);
+__m256i test_mm256_broadcastq_epi64(__m128i a) {
+  // CHECK-LABEL: test_mm256_broadcastq_epi64
+  // CHECK-NOT: @llvm.x86.avx2.pbroadcastq.256
+  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> zeroinitializer
+  return _mm256_broadcastq_epi64(a);
 }
 
-__m256i test_mm256_blendv_epi8(__m256i a, __m256i b, __m256i m) {
-  // CHECK: @llvm.x86.avx2.pblendvb
-  return _mm256_blendv_epi8(a, b, m);
+__m128d test_mm_broadcastsd_pd(__m128d a) {
+  // CHECK-LABEL: test_mm_broadcastsd_pd
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> zeroinitializer
+  return _mm_broadcastsd_pd(a);
 }
 
-// FIXME: We should also lower the __builtin_ia32_pblendw128 (and similar)
-// functions to this IR. In the future we could delete the corresponding
-// intrinsic in LLVM if it's not being used anymore.
-__m256i test_mm256_blend_epi16(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_blend_epi16
-  // CHECK-NOT: @llvm.x86.avx2.pblendw
-  // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 25, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  return _mm256_blend_epi16(a, b, 2);
+__m256d test_mm256_broadcastsd_pd(__m128d a) {
+  // CHECK-LABEL: test_mm256_broadcastsd_pd
+  // CHECK-NOT: @llvm.x86.avx2.vbroadcast.sd.pd.256
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <4 x i32> zeroinitializer
+  return _mm256_broadcastsd_pd(a);
+}
+
+__m256i test_mm256_broadcastsi128_si256(__m128i a) {
+  // CHECK-LABEL: test_mm256_broadcastsi128_si256
+  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+  return _mm256_broadcastsi128_si256(a);
+}
+
+__m128 test_mm_broadcastss_ps(__m128 a) {
+  // CHECK-LABEL: test_mm_broadcastss_ps
+  // CHECK-NOT: @llvm.x86.avx2.vbroadcast.ss.ps
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> zeroinitializer
+  return _mm_broadcastss_ps(a);
+}
+
+__m256 test_mm256_broadcastss_ps(__m128 a) {
+  // CHECK-LABEL: test_mm256_broadcastss_ps
+  // CHECK-NOT: @llvm.x86.avx2.vbroadcast.ss.ps.256
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> zeroinitializer
+  return _mm256_broadcastss_ps(a);
+}
+
+__m128i test_mm_broadcastw_epi16(__m128i a) {
+  // CHECK-LABEL: test_mm_broadcastw_epi16
+  // CHECK-NOT: @llvm.x86.avx2.pbroadcastw.128
+  // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> zeroinitializer
+  return _mm_broadcastw_epi16(a);
+}
+
+__m256i test_mm256_broadcastw_epi16(__m128i a) {
+  // CHECK-LABEL: test_mm256_broadcastw_epi16
+  // CHECK-NOT: @llvm.x86.avx2.pbroadcastw.256
+  // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <16 x i32> zeroinitializer
+  return _mm256_broadcastw_epi16(a);
+}
+
+__m256i test_mm256_bslli_epi128(__m256i a) {
+  // CHECK-LABEL: test_mm256_bslli_epi128
+  // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>
+  return _mm256_bslli_epi128(a, 3);
+}
+
+__m256i test_mm256_bsrli_epi128(__m256i a) {
+  // CHECK-LABEL: test_mm256_bsrli_epi128
+  // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50>
+  return _mm256_bsrli_epi128(a, 3);
 }
 
 __m256i test_mm256_cmpeq_epi8(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_cmpeq_epi8
   // CHECK: icmp eq <32 x i8>
   return _mm256_cmpeq_epi8(a, b);
 }
 
 __m256i test_mm256_cmpeq_epi16(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_cmpeq_epi16
   // CHECK: icmp eq <16 x i16>
   return _mm256_cmpeq_epi16(a, b);
 }
 
 __m256i test_mm256_cmpeq_epi32(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_cmpeq_epi32
   // CHECK: icmp eq <8 x i32>
   return _mm256_cmpeq_epi32(a, b);
 }
 
 __m256i test_mm256_cmpeq_epi64(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_cmpeq_epi64
   // CHECK: icmp eq <4 x i64>
   return _mm256_cmpeq_epi64(a, b);
 }
 
 __m256i test_mm256_cmpgt_epi8(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_cmpgt_epi8
   // CHECK: icmp sgt <32 x i8>
   return _mm256_cmpgt_epi8(a, b);
 }
 
 __m256i test_mm256_cmpgt_epi16(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_cmpgt_epi16
   // CHECK: icmp sgt <16 x i16>
   return _mm256_cmpgt_epi16(a, b);
 }
 
 __m256i test_mm256_cmpgt_epi32(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_cmpgt_epi32
   // CHECK: icmp sgt <8 x i32>
   return _mm256_cmpgt_epi32(a, b);
 }
 
 __m256i test_mm256_cmpgt_epi64(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_cmpgt_epi64
   // CHECK: icmp sgt <4 x i64>
   return _mm256_cmpgt_epi64(a, b);
 }
 
-__m256i test_mm256_hadd_epi16(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.phadd.w
-  return _mm256_hadd_epi16(a, b);
-}
-
-__m256i test_mm256_hadd_epi32(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.phadd.d
-  return _mm256_hadd_epi32(a, b);
+__m256i test_mm256_cvtepi8_epi16(__m128i a) {
+  // CHECK-LABEL: test_mm256_cvtepi8_epi16
+  // CHECK: sext <16 x i8> %{{.*}} to <16 x i16>
+  return _mm256_cvtepi8_epi16(a);
 }
 
-__m256i test_mm256_hadds_epi16(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.phadd.sw
-  return _mm256_hadds_epi16(a, b);
+__m256i test_mm256_cvtepi8_epi32(__m128i a) {
+  // CHECK-LABEL: test_mm256_cvtepi8_epi32
+  // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  // CHECK: sext <8 x i8> %{{.*}} to <8 x i32>
+  return _mm256_cvtepi8_epi32(a);
 }
 
-__m256i test_mm256_hsub_epi16(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.phsub.w
-  return _mm256_hsub_epi16(a, b);
+__m256i test_mm256_cvtepi8_epi64(__m128i a) {
+  // CHECK-LABEL: test_mm256_cvtepi8_epi64
+  // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  // CHECK: sext <4 x i8> %{{.*}} to <4 x i64>
+  return _mm256_cvtepi8_epi64(a);
 }
 
-__m256i test_mm256_hsub_epi32(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.phsub.d
-  return _mm256_hsub_epi32(a, b);
+__m256i test_mm256_cvtepi16_epi32(__m128i a) {
+  // CHECK-LABEL: test_mm256_cvtepi16_epi32
+  // CHECK: sext <8 x i16> %{{.*}} to <8 x i32>
+  return _mm256_cvtepi16_epi32(a);
 }
 
-__m256i test_mm256_hsubs_epi16(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.phsub.sw
-  return _mm256_hsubs_epi16(a, b);
+__m256i test_mm256_cvtepi16_epi64(__m128i a) {
+  // CHECK-LABEL: test_mm256_cvtepi16_epi64
+  // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  // CHECK: sext <4 x i16> %{{.*}} to <4 x i64>
+  return _mm256_cvtepi16_epi64(a);
 }
 
-__m256i test_mm256_maddubs_epi16(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.pmadd.ub.sw
-  return _mm256_maddubs_epi16(a, b);
+__m256i test_mm256_cvtepi32_epi64(__m128i a) {
+  // CHECK-LABEL: test_mm256_cvtepi32_epi64
+  // CHECK: sext <4 x i32> %{{.*}} to <4 x i64>
+  return _mm256_cvtepi32_epi64(a);
 }
 
-__m256i test_mm256_madd_epi16(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.pmadd.wd
-  return _mm256_madd_epi16(a, b);
+__m256i test_mm256_cvtepu8_epi16(__m128i a) {
+  // CHECK-LABEL: test_mm256_cvtepu8_epi16
+  // CHECK: zext <16 x i8> %{{.*}} to <16 x i16>
+  return _mm256_cvtepu8_epi16(a);
 }
 
-__m256i test_mm256_max_epi8(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.pmaxs.b
-  return _mm256_max_epi8(a, b);
+__m256i test_mm256_cvtepu8_epi32(__m128i a) {
+  // CHECK-LABEL: test_mm256_cvtepu8_epi32
+  // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  // CHECK: zext <8 x i8> %{{.*}} to <8 x i32>
+  return _mm256_cvtepu8_epi32(a);
 }
 
-__m256i test_mm256_max_epi16(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.pmaxs.w
-  return _mm256_max_epi16(a, b);
+__m256i test_mm256_cvtepu8_epi64(__m128i a) {
+  // CHECK-LABEL: test_mm256_cvtepu8_epi64
+  // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  // CHECK: zext <4 x i8> %{{.*}} to <4 x i64>
+  return _mm256_cvtepu8_epi64(a);
 }
 
-__m256i test_mm256_max_epi32(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.pmaxs.d
-  return _mm256_max_epi32(a, b);
+__m256i test_mm256_cvtepu16_epi32(__m128i a) {
+  // CHECK-LABEL: test_mm256_cvtepu16_epi32
+  // CHECK: zext <8 x i16> {{.*}} to <8 x i32>
+  return _mm256_cvtepu16_epi32(a);
 }
 
-__m256i test_mm256_max_epu8(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.pmaxu.b
-  return _mm256_max_epu8(a, b);
+__m256i test_mm256_cvtepu16_epi64(__m128i a) {
+  // CHECK-LABEL: test_mm256_cvtepu16_epi64
+  // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  // CHECK: zext <4 x i16> %{{.*}} to <4 x i64>
+  return _mm256_cvtepu16_epi64(a);
 }
 
-__m256i test_mm256_max_epu16(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.pmaxu.w
-  return _mm256_max_epu16(a, b);
+__m256i test_mm256_cvtepu32_epi64(__m128i a) {
+  // CHECK-LABEL: test_mm256_cvtepu32_epi64
+  // CHECK: zext <4 x i32> %{{.*}} to <4 x i64>
+  return _mm256_cvtepu32_epi64(a);
 }
 
-__m256i test_mm256_max_epu32(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.pmaxu.d
-  return _mm256_max_epu32(a, b);
+__m128i test0_mm256_extracti128_si256_0(__m256i a) {
+  // CHECK-LABEL: test0_mm256_extracti128_si256
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+  return _mm256_extracti128_si256(a, 0);
 }
 
-__m256i test_mm256_min_epi8(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.pmins.b
-  return _mm256_min_epi8(a, b);
+__m128i test1_mm256_extracti128_si256_1(__m256i a) {
+  // CHECK-LABEL: test1_mm256_extracti128_si256
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+  return _mm256_extracti128_si256(a, 1);
 }
 
-__m256i test_mm256_min_epi16(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.pmins.w
-  return _mm256_min_epi16(a, b);
+// Immediate should be truncated to one bit.
+__m128i test2_mm256_extracti128_si256(__m256i a) {
+  // CHECK-LABEL: test2_mm256_extracti128_si256
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
+  return _mm256_extracti128_si256(a, 2);
 }
 
-__m256i test_mm256_min_epi32(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.pmins.d
-  return _mm256_min_epi32(a, b);
+__m256i test_mm256_hadd_epi16(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_hadd_epi16
+  // CHECK: call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
+  return _mm256_hadd_epi16(a, b);
 }
 
-__m256i test_mm256_min_epu8(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.pminu.b
-  return _mm256_min_epu8(a, b);
+__m256i test_mm256_hadd_epi32(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_hadd_epi32
+  // CHECK: call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  return _mm256_hadd_epi32(a, b);
 }
 
-__m256i test_mm256_min_epu16(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.pminu.w
-  return _mm256_min_epu16(a, b);
+__m256i test_mm256_hadds_epi16(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_hadds_epi16
+  // CHECK:call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
+  return _mm256_hadds_epi16(a, b);
 }
 
-__m256i test_mm256_min_epu32(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.pminu.d
-  return _mm256_min_epu32(a, b);
+__m256i test_mm256_hsub_epi16(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_hsub_epi16
+  // CHECK: call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
+  return _mm256_hsub_epi16(a, b);
 }
 
-int test_mm256_movemask_epi8(__m256i a) {
-  // CHECK: @llvm.x86.avx2.pmovmskb
-  return _mm256_movemask_epi8(a);
+__m256i test_mm256_hsub_epi32(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_hsub_epi32
+  // CHECK: call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  return _mm256_hsub_epi32(a, b);
 }
 
-__m256i test_mm256_cvtepi8_epi16(__m128i a) {
-  // CHECK: @llvm.x86.avx2.pmovsxbw
-  return _mm256_cvtepi8_epi16(a);
+__m256i test_mm256_hsubs_epi16(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_hsubs_epi16
+  // CHECK:call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
+  return _mm256_hsubs_epi16(a, b);
 }
 
-__m256i test_mm256_cvtepi8_epi32(__m128i a) {
-  // CHECK: @llvm.x86.avx2.pmovsxbd
-  return _mm256_cvtepi8_epi32(a);
+__m128i test_mm_i32gather_epi32(int const *b, __m128i c) {
+  // CHECK-LABEL: test_mm_i32gather_epi32
+  // CHECK: call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> %{{.*}}, i8* %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, i8 2)
+  return _mm_i32gather_epi32(b, c, 2);
 }
 
-__m256i test_mm256_cvtepi8_epi64(__m128i a) {
-  // CHECK: @llvm.x86.avx2.pmovsxbq
-  return _mm256_cvtepi8_epi64(a);
+__m128i test_mm_mask_i32gather_epi32(__m128i a, int const *b, __m128i c, __m128i d) {
+  // CHECK-LABEL: test_mm_mask_i32gather_epi32
+  // CHECK: call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> %{{.*}}, i8* %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, i8 2)
+  return _mm_mask_i32gather_epi32(a, b, c, d, 2);
 }
 
-__m256i test_mm256_cvtepi16_epi32(__m128i a) {
-  // CHECK: @llvm.x86.avx2.pmovsxwd
-  return _mm256_cvtepi16_epi32(a);
+__m256i test_mm256_i32gather_epi32(int const *b, __m256i c) {
+  // CHECK-LABEL: test_mm256_i32gather_epi32
+  // CHECK: call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %{{.*}}, i8* %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, i8 2)
+  return _mm256_i32gather_epi32(b, c, 2);
 }
 
-__m256i test_mm256_cvtepi16_epi64(__m128i a) {
-  // CHECK: @llvm.x86.avx2.pmovsxwq
-  return _mm256_cvtepi16_epi64(a);
+__m256i test_mm256_mask_i32gather_epi32(__m256i a, int const *b, __m256i c, __m256i d) {
+  // CHECK-LABEL: test_mm256_mask_i32gather_epi32
+  // CHECK: call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %{{.*}}, i8* %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, i8 2)
+  return _mm256_mask_i32gather_epi32(a, b, c, d, 2);
 }
 
-__m256i test_mm256_cvtepi32_epi64(__m128i a) {
-  // CHECK: @llvm.x86.avx2.pmovsxdq
-  return _mm256_cvtepi32_epi64(a);
+__m128i test_mm_i32gather_epi64(long long const *b, __m128i c) {
+  // CHECK-LABEL: test_mm_i32gather_epi64
+  // CHECK: call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> undef, i8* %{{.*}}, <4 x i32> %{{.*}}, <2 x i64> %{{.*}}, i8 2)
+  return _mm_i32gather_epi64(b, c, 2);
 }
 
-__m256i test_mm256_cvtepu8_epi16(__m128i a) {
-  // CHECK: @llvm.x86.avx2.pmovzxbw
-  return _mm256_cvtepu8_epi16(a);
+__m128i test_mm_mask_i32gather_epi64(__m128i a, long long const *b, __m128i c, __m128i d) {
+  // CHECK-LABEL: test_mm_mask_i32gather_epi64
+  // CHECK: call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> %{{.*}}, i8* %{{.*}}, <4 x i32> %{{.*}}, <2 x i64> %{{.*}}, i8 2)
+  return _mm_mask_i32gather_epi64(a, b, c, d, 2);
 }
 
-__m256i test_mm256_cvtepu8_epi32(__m128i a) {
-  // CHECK: @llvm.x86.avx2.pmovzxbd
-  return _mm256_cvtepu8_epi32(a);
+__m256i test_mm256_i32gather_epi64(long long const *b, __m128i c) {
+  // CHECK-LABEL: test_mm256_i32gather_epi64
+  // CHECK: call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8* %{{.*}}, <4 x i32> %{{.*}}, <4 x i64> %{{.*}}, i8 2)
+  return _mm256_i32gather_epi64(b, c, 2);
 }
 
-__m256i test_mm256_cvtepu8_epi64(__m128i a) {
-  // CHECK: @llvm.x86.avx2.pmovzxbq
-  return _mm256_cvtepu8_epi64(a);
+__m256i test_mm256_mask_i32gather_epi64(__m256i a, long long const *b, __m128i c, __m256i d) {
+  // CHECK-LABEL: test_mm256_mask_i32gather_epi64
+  // CHECK: call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %{{.*}}, i8* %{{.*}}, <4 x i32> %{{.*}}, <4 x i64> %{{.*}}, i8 2)
+  return _mm256_mask_i32gather_epi64(a, b, c, d, 2);
 }
 
-__m256i test_mm256_cvtepu16_epi32(__m128i a) {
-  // CHECK: @llvm.x86.avx2.pmovzxwd
-  return _mm256_cvtepu16_epi32(a);
+__m128d test_mm_i32gather_pd(double const *b, __m128i c) {
+  // CHECK-LABEL: test_mm_i32gather_pd
+  // CHECK:         [[CMP:%.*]] = fcmp oeq <2 x double>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
+  // CHECK: call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> undef, i8* %{{.*}}, <4 x i32> %{{.*}}, <2 x double> %{{.*}}, i8 2)
+  return _mm_i32gather_pd(b, c, 2);
 }
 
-__m256i test_mm256_cvtepu16_epi64(__m128i a) {
-  // CHECK: @llvm.x86.avx2.pmovzxwq
-  return _mm256_cvtepu16_epi64(a);
+__m128d test_mm_mask_i32gather_pd(__m128d a, double const *b, __m128i c, __m128d d) {
+  // CHECK-LABEL: test_mm_mask_i32gather_pd
+  // CHECK: call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> %{{.*}}, i8* %{{.*}}, <4 x i32> %{{.*}}, <2 x double> %{{.*}}, i8 2)
+  return _mm_mask_i32gather_pd(a, b, c, d, 2);
 }
 
-__m256i test_mm256_cvtepu32_epi64(__m128i a) {
-  // CHECK: @llvm.x86.avx2.pmovzxdq
-  return _mm256_cvtepu32_epi64(a);
+__m256d test_mm256_i32gather_pd(double const *b, __m128i c) {
+  // CHECK-LABEL: test_mm256_i32gather_pd
+  // CHECK:         [[CMP:%.*]] = fcmp oeq <4 x double>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i64>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i64> [[SEXT]] to <4 x double>
+  // CHECK: call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8* %{{.*}}, <4 x i32> %{{.*}}, <4 x double> %{{.*}}, i8 2)
+  return _mm256_i32gather_pd(b, c, 2);
 }
 
-__m256i test_mm256_mul_epi32(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.pmul.dq
-  return _mm256_mul_epi32(a, b);
+__m256d test_mm256_mask_i32gather_pd(__m256d a, double const *b, __m128i c, __m256d d) {
+  // CHECK-LABEL: test_mm256_mask_i32gather_pd
+  // CHECK: call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %{{.*}}, i8* %{{.*}}, <4 x i32> %{{.*}}, <4 x double> %{{.*}}, i8 2)
+  return _mm256_mask_i32gather_pd(a, b, c, d, 2);
 }
 
-__m256i test_mm256_mulhrs_epi16(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.pmul.hr.sw
-  return _mm256_mulhrs_epi16(a, b);
+__m128 test_mm_i32gather_ps(float const *b, __m128i c) {
+  // CHECK-LABEL: test_mm_i32gather_ps
+  // CHECK:         [[CMP:%.*]] = fcmp oeq <4 x float>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
+  // CHECK: call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> undef, i8* %{{.*}}, <4 x i32> %{{.*}}, <4 x float> %{{.*}}, i8 2)
+  return _mm_i32gather_ps(b, c, 2);
 }
 
-__m256i test_mm256_mulhi_epu16(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.pmulhu.w
-  return _mm256_mulhi_epu16(a, b);
+__m128 test_mm_mask_i32gather_ps(__m128 a, float const *b, __m128i c, __m128 d) {
+  // CHECK-LABEL: test_mm_mask_i32gather_ps
+  // CHECK: call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> %{{.*}}, i8* %{{.*}}, <4 x i32> %{{.*}}, <4 x float> %{{.*}}, i8 2)
+  return _mm_mask_i32gather_ps(a, b, c, d, 2);
 }
 
-__m256i test_mm256_mulhi_epi16(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.pmulh.w
-  return _mm256_mulhi_epi16(a, b);
+__m256 test_mm256_i32gather_ps(float const *b, __m256i c) {
+  // CHECK-LABEL: test_mm256_i32gather_ps
+  // CHECK:         [[CMP:%.*]] = fcmp oeq <8 x float>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <8 x i1> [[CMP]] to <8 x i32>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <8 x i32> [[SEXT]] to <8 x float>
+  // CHECK: call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8* %{{.*}}, <8 x i32> %{{.*}}, <8 x float> %{{.*}}, i8 2)
+  return _mm256_i32gather_ps(b, c, 2);
 }
 
-__m256i test_mm256_mullo_epi16(__m256i a, __m256i b) {
-  // CHECK: mul <16 x i16>
-  return _mm256_mullo_epi16(a, b);
+__m256 test_mm256_mask_i32gather_ps(__m256 a, float const *b, __m256i c, __m256 d) {
+  // CHECK-LABEL: test_mm256_mask_i32gather_ps
+  // CHECK: call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %{{.*}}, i8* %{{.*}}, <8 x i32> %{{.*}}, <8 x float> %{{.*}}, i8 2)
+  return _mm256_mask_i32gather_ps(a, b, c, d, 2);
 }
 
-__m256i test_mm256_mullo_epi32(__m256i a, __m256i b) {
-  // CHECK: mul <8 x i32>
-  return _mm256_mullo_epi32(a, b);
+__m128i test_mm_i64gather_epi32(int const *b, __m128i c) {
+  // CHECK-LABEL: test_mm_i64gather_epi32
+  // CHECK: call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> %{{.*}}, i8* %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> %{{.*}}, i8 2)
+  return _mm_i64gather_epi32(b, c, 2);
 }
 
-__m256i test_mm256_mul_epu32(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.pmulu.dq
-  return _mm256_mul_epu32(a, b);
+__m128i test_mm_mask_i64gather_epi32(__m128i a, int const *b, __m128i c, __m128i d) {
+  // CHECK-LABEL: test_mm_mask_i64gather_epi32
+  // CHECK: call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> %{{.*}}, i8* %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> %{{.*}}, i8 2)
+  return _mm_mask_i64gather_epi32(a, b, c, d, 2);
 }
 
-__m256i test_mm256_shuffle_epi8(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.pshuf.b
-  return _mm256_shuffle_epi8(a, b);
+__m128i test_mm256_i64gather_epi32(int const *b, __m256i c) {
+  // CHECK-LABEL: test_mm256_i64gather_epi32
+  // CHECK: call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %{{.*}}, i8* %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> %{{.*}}, i8 2)
+  return _mm256_i64gather_epi32(b, c, 2);
 }
 
-__m256i test_mm256_shuffle_epi32(__m256i a) {
-  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 3, i32 3, i32 0, i32 0, i32 7, i32 7, i32 4, i32 4>
-  return _mm256_shuffle_epi32(a, 15);
+__m128i test_mm256_mask_i64gather_epi32(__m128i a, int const *b, __m256i c, __m128i d) {
+  // CHECK-LABEL: test_mm256_mask_i64gather_epi32
+  // CHECK: call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %{{.*}}, i8* %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> %{{.*}}, i8 2)
+  return _mm256_mask_i64gather_epi32(a, b, c, d, 2);
 }
 
-__m256i test_mm256_shufflehi_epi16(__m256i a) {
-  // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 14, i32 13>
-  return _mm256_shufflehi_epi16(a, 107);
+__m128i test_mm_i64gather_epi64(long long const *b, __m128i c) {
+  // CHECK-LABEL: test_mm_i64gather_epi64
+  // CHECK: call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> undef, i8* %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, i8 2)
+  return _mm_i64gather_epi64(b, c, 2);
 }
 
-__m256i test_mm256_shufflelo_epi16(__m256i a) {
-  // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i32> <i32 3, i32 0, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 9, i32 9, i32 12, i32 13, i32 14, i32 15>
-  return _mm256_shufflelo_epi16(a, 83);
+__m128i test_mm_mask_i64gather_epi64(__m128i a, long long const *b, __m128i c, __m128i d) {
+  // CHECK-LABEL: test_mm_mask_i64gather_epi64
+  // CHECK: call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> %{{.*}}, i8* %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, i8 2)
+  return _mm_mask_i64gather_epi64(a, b, c, d, 2);
 }
 
-__m256i test_mm256_sign_epi8(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.psign.b
-  return _mm256_sign_epi8(a, b);
+__m256i test_mm256_i64gather_epi64(long long const *b, __m256i c) {
+  // CHECK-LABEL: test_mm256_i64gather_epi64
+  // CHECK: call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8* %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, i8 2)
+  return _mm256_i64gather_epi64(b, c, 2);
 }
 
-__m256i test_mm256_sign_epi16(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.psign.w
-  return _mm256_sign_epi16(a, b);
+__m256i test_mm256_mask_i64gather_epi64(__m256i a, long long const *b, __m256i c, __m256i d) {
+  // CHECK-LABEL: test_mm256_mask_i64gather_epi64
+  // CHECK: call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %{{.*}}, i8* %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, i8 2)
+  return _mm256_mask_i64gather_epi64(a, b, c, d, 2);
 }
 
-__m256i test_mm256_sign_epi32(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.psign.d
-  return _mm256_sign_epi32(a, b);
+__m128d test_mm_i64gather_pd(double const *b, __m128i c) {
+  // CHECK-LABEL: test_mm_i64gather_pd
+  // CHECK:         [[CMP:%.*]] = fcmp oeq <2 x double>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
+  // CHECK: call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> undef, i8* %{{.*}}, <2 x i64> %{{.*}}, <2 x double> %{{.*}}, i8 2)
+  return _mm_i64gather_pd(b, c, 2);
 }
 
-__m256i test_mm256_slli_si256(__m256i a) {
-  // CHECK: shufflevector <32 x i8> zeroinitializer, <32 x i8> %{{.*}}, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>
-  return _mm256_slli_si256(a, 3);
+__m128d test_mm_mask_i64gather_pd(__m128d a, double const *b, __m128i c, __m128d d) {
+  // CHECK-LABEL: test_mm_mask_i64gather_pd
+  // CHECK: call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> %{{.*}}, i8* %{{.*}}, <2 x i64> %{{.*}}, <2 x double> %{{.*}}, i8 2)
+  return _mm_mask_i64gather_pd(a, b, c, d, 2);
 }
 
-__m256i test_mm256_bslli_epi128(__m256i a) {
-  // CHECK: shufflevector <32 x i8> zeroinitializer, <32 x i8> %{{.*}}, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>
-  return _mm256_bslli_epi128(a, 3);
+__m256d test_mm256_i64gather_pd(double const *b, __m256i c) {
+  // CHECK-LABEL: test_mm256_i64gather_pd
+  // CHECK:         [[CMP:%.*]] = fcmp oeq <4 x double>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i64>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i64> [[SEXT]] to <4 x double>
+  // CHECK: call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8* %{{.*}}, <4 x i64> %{{.*}}, <4 x double> %{{.*}}, i8 2)
+  return _mm256_i64gather_pd(b, c, 2);
 }
 
-__m256i test_mm256_slli_epi16(__m256i a) {
-  // CHECK: @llvm.x86.avx2.pslli.w
-  return _mm256_slli_epi16(a, 3);
+__m256d test_mm256_mask_i64gather_pd(__m256d a, double const *b, __m256i c, __m256d d) {
+  // CHECK-LABEL: test_mm256_mask_i64gather_pd
+  // CHECK: call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %{{.*}}, i8* %{{.*}}, <4 x i64> %{{.*}}, <4 x double> %{{.*}}, i8 2)
+  return _mm256_mask_i64gather_pd(a, b, c, d, 2);
 }
 
-__m256i test_mm256_sll_epi16(__m256i a, __m128i b) {
-  // CHECK: @llvm.x86.avx2.psll.w
-  return _mm256_sll_epi16(a, b);
+__m128 test_mm_i64gather_ps(float const *b, __m128i c) {
+  // CHECK-LABEL: test_mm_i64gather_ps
+  // CHECK:         [[CMP:%.*]] = fcmp oeq <4 x float>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
+  // CHECK: call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> undef, i8* %{{.*}}, <2 x i64> %{{.*}}, <4 x float> %{{.*}}, i8 2)
+  return _mm_i64gather_ps(b, c, 2);
 }
 
-__m256i test_mm256_slli_epi32(__m256i a) {
-  // CHECK: @llvm.x86.avx2.pslli.d
-  return _mm256_slli_epi32(a, 3);
+__m128 test_mm_mask_i64gather_ps(__m128 a, float const *b, __m128i c, __m128 d) {
+  // CHECK-LABEL: test_mm_mask_i64gather_ps
+  // CHECK: call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> %{{.*}}, i8* %{{.*}}, <2 x i64> %{{.*}}, <4 x float> %{{.*}}, i8 2)
+  return _mm_mask_i64gather_ps(a, b, c, d, 2);
 }
 
-__m256i test_mm256_sll_epi32(__m256i a, __m128i b) {
-  // CHECK: @llvm.x86.avx2.psll.d
-  return _mm256_sll_epi32(a, b);
+__m128 test_mm256_i64gather_ps(float const *b, __m256i c) {
+  // CHECK-LABEL: test_mm256_i64gather_ps
+  // CHECK:         [[CMP:%.*]] = fcmp oeq <4 x float>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
+  // CHECK: call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8* %{{.*}}, <4 x i64> %{{.*}}, <4 x float> %{{.*}}, i8 2)
+  return _mm256_i64gather_ps(b, c, 2);
 }
 
-__m256i test_mm256_slli_epi64(__m256i a) {
-  // CHECK: @llvm.x86.avx2.pslli.q
-  return _mm256_slli_epi64(a, 3);
+__m128 test_mm256_mask_i64gather_ps(__m128 a, float const *b, __m256i c, __m128 d) {
+  // CHECK-LABEL: test_mm256_mask_i64gather_ps
+  // CHECK: call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %{{.*}}, i8* %{{.*}}, <4 x i64> %{{.*}}, <4 x float> %{{.*}}, i8 2)
+  return _mm256_mask_i64gather_ps(a, b, c, d, 2);
 }
 
-__m256i test_mm256_sll_epi64(__m256i a, __m128i b) {
-  // CHECK: @llvm.x86.avx2.psll.q
-  return _mm256_sll_epi64(a, b);
+__m256i test0_mm256_inserti128_si256(__m256i a, __m128i b) {
+  // CHECK-LABEL: test0_mm256_inserti128_si256
+  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+  return _mm256_inserti128_si256(a, b, 0);
 }
 
-__m256i test_mm256_srai_epi16(__m256i a) {
-  // CHECK: @llvm.x86.avx2.psrai.w
-  return _mm256_srai_epi16(a, 3);
+__m256i test1_mm256_inserti128_si256(__m256i a, __m128i b) {
+  // CHECK-LABEL: test1_mm256_inserti128_si256
+  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  return _mm256_inserti128_si256(a, b, 1);
 }
 
-__m256i test_mm256_sra_epi16(__m256i a, __m128i b) {
-  // CHECK: @llvm.x86.avx2.psra.w
-  return _mm256_sra_epi16(a, b);
+// Immediate should be truncated to one bit.
+__m256i test2_mm256_inserti128_si256(__m256i a, __m128i b) {
+  // CHECK-LABEL: test2_mm256_inserti128_si256
+  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+  return _mm256_inserti128_si256(a, b, 2);
 }
 
-__m256i test_mm256_srai_epi32(__m256i a) {
-  // CHECK: @llvm.x86.avx2.psrai.d
-  return _mm256_srai_epi32(a, 3);
+__m256i test_mm256_madd_epi16(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_madd_epi16
+  // CHECK: call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
+  return _mm256_madd_epi16(a, b);
 }
 
-__m256i test_mm256_sra_epi32(__m256i a, __m128i b) {
-  // CHECK: @llvm.x86.avx2.psra.d
-  return _mm256_sra_epi32(a, b);
+__m256i test_mm256_maddubs_epi16(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_maddubs_epi16
+  // CHECK: call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %{{.*}}, <32 x i8> %{{.*}})
+  return _mm256_maddubs_epi16(a, b);
 }
 
-__m256i test_mm256_srli_si256(__m256i a) {
-  // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> zeroinitializer, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50>
-  return _mm256_srli_si256(a, 3);
+__m128i test_mm_maskload_epi32(int const *a, __m128i m) {
+  // CHECK-LABEL: test_mm_maskload_epi32
+  // CHECK: call <4 x i32> @llvm.x86.avx2.maskload.d(i8* %{{.*}}, <4 x i32> %{{.*}})
+  return _mm_maskload_epi32(a, m);
 }
 
-__m256i test_mm256_bsrli_epi128(__m256i a) {
-  // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> zeroinitializer, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50>
-  return _mm256_bsrli_epi128(a, 3);
+__m256i test_mm256_maskload_epi32(int const *a, __m256i m) {
+  // CHECK-LABEL: test_mm256_maskload_epi32
+  // CHECK: call <8 x i32> @llvm.x86.avx2.maskload.d.256(i8* %{{.*}}, <8 x i32> %{{.*}})
+  return _mm256_maskload_epi32(a, m);
 }
 
-__m256i test_mm256_srli_epi16(__m256i a) {
-  // CHECK: @llvm.x86.avx2.psrli.w
-  return _mm256_srli_epi16(a, 3);
+__m128i test_mm_maskload_epi64(long long const *a, __m128i m) {
+  // CHECK-LABEL: test_mm_maskload_epi64
+  // CHECK: call <2 x i64> @llvm.x86.avx2.maskload.q(i8* %{{.*}}, <2 x i64> %{{.*}})
+  return _mm_maskload_epi64(a, m);
 }
 
-__m256i test_mm256_srl_epi16(__m256i a, __m128i b) {
-  // CHECK: @llvm.x86.avx2.psrl.w
-  return _mm256_srl_epi16(a, b);
+__m256i test_mm256_maskload_epi64(long long const *a, __m256i m) {
+  // CHECK-LABEL: test_mm256_maskload_epi64
+  // CHECK: call <4 x i64> @llvm.x86.avx2.maskload.q.256(i8* %{{.*}}, <4 x i64> %{{.*}})
+  return _mm256_maskload_epi64(a, m);
 }
 
-__m256i test_mm256_srli_epi32(__m256i a) {
-  // CHECK: @llvm.x86.avx2.psrli.d
-  return _mm256_srli_epi32(a, 3);
+void test_mm_maskstore_epi32(int *a, __m128i m, __m128i b) {
+  // CHECK-LABEL: test_mm_maskstore_epi32
+  // CHECK: call void @llvm.x86.avx2.maskstore.d(i8* %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  _mm_maskstore_epi32(a, m, b);
 }
 
-__m256i test_mm256_srl_epi32(__m256i a, __m128i b) {
-  // CHECK: @llvm.x86.avx2.psrl.d
-  return _mm256_srl_epi32(a, b);
+void test_mm256_maskstore_epi32(int *a, __m256i m, __m256i b) {
+  // CHECK-LABEL: test_mm256_maskstore_epi32
+  // CHECK: call void @llvm.x86.avx2.maskstore.d.256(i8* %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  _mm256_maskstore_epi32(a, m, b);
 }
 
-__m256i test_mm256_srli_epi64(__m256i a) {
-  // CHECK: @llvm.x86.avx2.psrli.q
-  return _mm256_srli_epi64(a, 3);
+void test_mm_maskstore_epi64(long long *a, __m128i m, __m128i b) {
+  // CHECK-LABEL: test_mm_maskstore_epi64
+  // CHECK: call void @llvm.x86.avx2.maskstore.q(i8* %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}})
+  _mm_maskstore_epi64(a, m, b);
 }
 
-__m256i test_mm256_srl_epi64(__m256i a, __m128i b) {
-  // CHECK: @llvm.x86.avx2.psrl.q
-  return _mm256_srl_epi64(a, b);
+void test_mm256_maskstore_epi64(long long *a, __m256i m, __m256i b) {
+  // CHECK-LABEL: test_mm256_maskstore_epi64
+  // CHECK: call void @llvm.x86.avx2.maskstore.q.256(i8* %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}})
+  _mm256_maskstore_epi64(a, m, b);
 }
 
-__m256i test_mm256_unpackhi_epi8(__m256i a, __m256i b) {
-  // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
-  return _mm256_unpackhi_epi8(a, b);
+__m256i test_mm256_max_epi8(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_max_epi8
+  // CHECK:       [[CMP:%.*]] = icmp sgt <32 x i8> [[X:%.*]], [[Y:%.*]]
+  // CHECK-NEXT:  select <32 x i1> [[CMP]], <32 x i8> [[X]], <32 x i8> [[Y]]
+  return _mm256_max_epi8(a, b);
 }
 
-__m256i test_mm256_unpackhi_epi16(__m256i a, __m256i b) {
-  // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
-  return _mm256_unpackhi_epi16(a, b);
+__m256i test_mm256_max_epi16(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_max_epi16
+  // CHECK:       [[CMP:%.*]] = icmp sgt <16 x i16> [[X:%.*]], [[Y:%.*]]
+  // CHECK-NEXT:  select <16 x i1> [[CMP]], <16 x i16> [[X]], <16 x i16> [[Y]]
+  return _mm256_max_epi16(a, b);
 }
 
-__m256i test_mm256_unpackhi_epi32(__m256i a, __m256i b) {
-  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
-  return _mm256_unpackhi_epi32(a, b);
+__m256i test_mm256_max_epi32(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_max_epi32
+  // CHECK:       [[CMP:%.*]] = icmp sgt <8 x i32> [[X:%.*]], [[Y:%.*]]
+  // CHECK-NEXT:  select <8 x i1> [[CMP]], <8 x i32> [[X]], <8 x i32> [[Y]]
+  return _mm256_max_epi32(a, b);
 }
 
-__m256i test_mm256_unpackhi_epi64(__m256i a, __m256i b) {
-  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
-  return _mm256_unpackhi_epi64(a, b);
+__m256i test_mm256_max_epu8(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_max_epu8
+  // CHECK:       [[CMP:%.*]] = icmp ugt <32 x i8> [[X:%.*]], [[Y:%.*]]
+  // CHECK-NEXT:  select <32 x i1> [[CMP]], <32 x i8> [[X]], <32 x i8> [[Y]]
+  return _mm256_max_epu8(a, b);
 }
 
-__m256i test_mm256_unpacklo_epi8(__m256i a, __m256i b) {
-  // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
-  return _mm256_unpacklo_epi8(a, b);
+__m256i test_mm256_max_epu16(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_max_epu16
+  // CHECK:       [[CMP:%.*]] = icmp ugt <16 x i16> [[X:%.*]], [[Y:%.*]]
+  // CHECK-NEXT:  select <16 x i1> [[CMP]], <16 x i16> [[X]], <16 x i16> [[Y]]
+  return _mm256_max_epu16(a, b);
 }
 
-__m256i test_mm256_unpacklo_epi16(__m256i a, __m256i b) {
-  // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
-  return _mm256_unpacklo_epi16(a, b);
+__m256i test_mm256_max_epu32(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_max_epu32
+  // CHECK:       [[CMP:%.*]] = icmp ugt <8 x i32> [[X:%.*]], [[Y:%.*]]
+  // CHECK-NEXT:  select <8 x i1> [[CMP]], <8 x i32> [[X]], <8 x i32> [[Y]]
+  return _mm256_max_epu32(a, b);
 }
 
-__m256i test_mm256_unpacklo_epi32(__m256i a, __m256i b) {
-  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
-  return _mm256_unpacklo_epi32(a, b);
+__m256i test_mm256_min_epi8(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_min_epi8
+  // CHECK:       [[CMP:%.*]] = icmp slt <32 x i8> [[X:%.*]], [[Y:%.*]]
+  // CHECK-NEXT:  select <32 x i1> [[CMP]], <32 x i8> [[X]], <32 x i8> [[Y]]
+  return _mm256_min_epi8(a, b);
 }
 
-__m256i test_mm256_unpacklo_epi64(__m256i a, __m256i b) {
-  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
-  return _mm256_unpacklo_epi64(a, b);
+__m256i test_mm256_min_epi16(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_min_epi16
+  // CHECK:       [[CMP:%.*]] = icmp slt <16 x i16> [[X:%.*]], [[Y:%.*]]
+  // CHECK-NEXT:  select <16 x i1> [[CMP]], <16 x i16> [[X]], <16 x i16> [[Y]]
+  return _mm256_min_epi16(a, b);
 }
 
-__m256i test_mm256_stream_load_si256(__m256i const *a) {
-  // CHECK: @llvm.x86.avx2.movntdqa
-  return _mm256_stream_load_si256(a);
+__m256i test_mm256_min_epi32(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_min_epi32
+  // CHECK:       [[CMP:%.*]] = icmp slt <8 x i32> [[X:%.*]], [[Y:%.*]]
+  // CHECK-NEXT:  select <8 x i1> [[CMP]], <8 x i32> [[X]], <8 x i32> [[Y]]
+  return _mm256_min_epi32(a, b);
 }
 
-__m128 test_mm_broadcastss_ps(__m128 a) {
-  // CHECK-LABEL: test_mm_broadcastss_ps
-  // CHECK-NOT: @llvm.x86.avx2.vbroadcast.ss.ps
-  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> zeroinitializer
-  return _mm_broadcastss_ps(a);
+__m256i test_mm256_min_epu8(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_min_epu8
+  // CHECK:       [[CMP:%.*]] = icmp ult <32 x i8> [[X:%.*]], [[Y:%.*]]
+  // CHECK-NEXT:  select <32 x i1> [[CMP]], <32 x i8> [[X]], <32 x i8> [[Y]]
+  return _mm256_min_epu8(a, b);
 }
 
-__m128d test_mm_broadcastsd_pd(__m128d a) {
-  // CHECK-LABEL: test_mm_broadcastsd_pd
-  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> zeroinitializer
-  return _mm_broadcastsd_pd(a);
+__m256i test_mm256_min_epu16(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_min_epu16
+  // CHECK:       [[CMP:%.*]] = icmp ult <16 x i16> [[X:%.*]], [[Y:%.*]]
+  // CHECK-NEXT:  select <16 x i1> [[CMP]], <16 x i16> [[X]], <16 x i16> [[Y]]
+  return _mm256_min_epu16(a, b);
 }
 
-__m256 test_mm256_broadcastss_ps(__m128 a) {
-  // CHECK-LABEL: test_mm256_broadcastss_ps
-  // CHECK-NOT: @llvm.x86.avx2.vbroadcast.ss.ps.256
-  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> zeroinitializer
-  return _mm256_broadcastss_ps(a);
+__m256i test_mm256_min_epu32(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_min_epu32
+  // CHECK:       [[CMP:%.*]] = icmp ult <8 x i32> [[X:%.*]], [[Y:%.*]]
+  // CHECK-NEXT:  select <8 x i1> [[CMP]], <8 x i32> [[X]], <8 x i32> [[Y]]
+  return _mm256_min_epu32(a, b);
 }
 
-__m256d test_mm256_broadcastsd_pd(__m128d a) {
-  // CHECK-LABEL: test_mm256_broadcastsd_pd
-  // CHECK-NOT: @llvm.x86.avx2.vbroadcast.sd.pd.256
-  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <4 x i32> zeroinitializer
-  return _mm256_broadcastsd_pd(a);
+int test_mm256_movemask_epi8(__m256i a) {
+  // CHECK-LABEL: test_mm256_movemask_epi8
+  // CHECK: call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> %{{.*}})
+  return _mm256_movemask_epi8(a);
 }
 
-__m256i test_mm256_broadcastsi128_si256(__m128i a) {
-  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
-  return _mm256_broadcastsi128_si256(a);
+__m256i test_mm256_mpsadbw_epu8(__m256i x, __m256i y) {
+  // CHECK-LABEL: test_mm256_mpsadbw_epu8
+  // CHECK: call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %{{.*}}, <32 x i8> %{{.*}}, i8 3)
+  return _mm256_mpsadbw_epu8(x, y, 3);
 }
 
-__m128i test_mm_blend_epi32(__m128i a, __m128i b) {
-  // CHECK-LABEL: test_mm_blend_epi32
-  // CHECK-NOT: @llvm.x86.avx2.pblendd.128
-  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
-  return _mm_blend_epi32(a, b, 0x35);
+__m256i test_mm256_mul_epi32(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_mul_epi32
+  // CHECK: call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  return _mm256_mul_epi32(a, b);
 }
 
-__m256i test_mm256_blend_epi32(__m256i a, __m256i b) {
-  // CHECK-LABEL: test_mm256_blend_epi32
-  // CHECK-NOT: @llvm.x86.avx2.pblendd.256
-  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 13, i32 6, i32 7>
-  return _mm256_blend_epi32(a, b, 0x35);
+__m256i test_mm256_mul_epu32(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_mul_epu32
+  // CHECK: call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  return _mm256_mul_epu32(a, b);
 }
 
-__m256i test_mm256_broadcastb_epi8(__m128i a) {
-  // CHECK-LABEL: test_mm256_broadcastb_epi8
-  // CHECK-NOT: @llvm.x86.avx2.pbroadcastb.256
-  // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <32 x i32> zeroinitializer
-  return _mm256_broadcastb_epi8(a);
+__m256i test_mm256_mulhi_epu16(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_mulhi_epu16
+  // CHECK: call <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
+  return _mm256_mulhi_epu16(a, b);
 }
 
-__m256i test_mm256_broadcastw_epi16(__m128i a) {
-  // CHECK-LABEL: test_mm256_broadcastw_epi16
-  // CHECK-NOT: @llvm.x86.avx2.pbroadcastw.256
-  // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <16 x i32> zeroinitializer
-  return _mm256_broadcastw_epi16(a);
+__m256i test_mm256_mulhi_epi16(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_mulhi_epi16
+  // CHECK: call <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
+  return _mm256_mulhi_epi16(a, b);
 }
 
-__m256i test_mm256_broadcastd_epi32(__m128i a) {
-  // CHECK-LABEL: test_mm256_broadcastd_epi32
-  // CHECK-NOT: @llvm.x86.avx2.pbroadcastd.256
-  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <8 x i32> zeroinitializer
-  return _mm256_broadcastd_epi32(a);
+__m256i test_mm256_mulhrs_epi16(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_mulhrs_epi16
+  // CHECK: call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
+  return _mm256_mulhrs_epi16(a, b);
 }
 
-__m256i test_mm256_broadcastq_epi64(__m128i a) {
-  // CHECK-LABEL: test_mm256_broadcastq_epi64
-  // CHECK-NOT: @llvm.x86.avx2.pbroadcastq.256
-  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> zeroinitializer
-  return _mm256_broadcastq_epi64(a);
+__m256i test_mm256_mullo_epi16(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_mullo_epi16
+  // CHECK: mul <16 x i16>
+  return _mm256_mullo_epi16(a, b);
 }
 
-__m128i test_mm_broadcastb_epi8(__m128i a) {
-  // CHECK-LABEL: test_mm_broadcastb_epi8
-  // CHECK-NOT: @llvm.x86.avx2.pbroadcastb.128
-  // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> zeroinitializer
-  return _mm_broadcastb_epi8(a);
+__m256i test_mm256_mullo_epi32(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_mullo_epi32
+  // CHECK: mul <8 x i32>
+  return _mm256_mullo_epi32(a, b);
 }
 
-__m128i test_mm_broadcastw_epi16(__m128i a) {
-  // CHECK-LABEL: test_mm_broadcastw_epi16
-  // CHECK-NOT: @llvm.x86.avx2.pbroadcastw.128
-  // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> zeroinitializer
-  return _mm_broadcastw_epi16(a);
+__m256i test_mm256_or_si256(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_or_si256
+  // CHECK: or <4 x i64>
+  return _mm256_or_si256(a, b);
 }
 
-__m128i test_mm_broadcastd_epi32(__m128i a) {
-  // CHECK-LABEL: test_mm_broadcastd_epi32
-  // CHECK-NOT: @llvm.x86.avx2.pbroadcastd.128
-  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> zeroinitializer
-  return _mm_broadcastd_epi32(a);
+__m256i test_mm256_packs_epi16(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_packs_epi16
+  // CHECK: call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
+  return _mm256_packs_epi16(a, b);
 }
 
-__m128i test_mm_broadcastq_epi64(__m128i a) {
-  // CHECK-LABEL: test_mm_broadcastq_epi64
-  // CHECK-NOT: @llvm.x86.avx2.pbroadcastq.128
-  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i32> zeroinitializer
-  return _mm_broadcastq_epi64(a);
+__m256i test_mm256_packs_epi32(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_packs_epi32
+  // CHECK: call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  return _mm256_packs_epi32(a, b);
 }
 
-__m256i test_mm256_permutevar8x32_epi32(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.permd
-  return _mm256_permutevar8x32_epi32(a, b);
+__m256i test_mm256_packs_epu16(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_packs_epu16
+  // CHECK:  call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
+  return _mm256_packus_epi16(a, b);
 }
 
-__m256d test_mm256_permute4x64_pd(__m256d a) {
-  // CHECK: shufflevector{{.*}}<i32 1, i32 2, i32 1, i32 0>
-  return _mm256_permute4x64_pd(a, 25);
+__m256i test_mm256_packs_epu32(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_packs_epu32
+  // CHECK: call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  return _mm256_packus_epi32(a, b);
 }
 
-__m256 test_mm256_permutevar8x32_ps(__m256 a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.permps
-  return _mm256_permutevar8x32_ps(a, b);
+__m256i test_mm256_permute2x128_si256(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_permute2x128_si256
+  // CHECK: call <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64> %{{.*}}, <4 x i64> %{{.*}}, i8 49)
+  return _mm256_permute2x128_si256(a, b, 0x31);
 }
 
 __m256i test_mm256_permute4x64_epi64(__m256i a) {
-  // CHECK: shufflevector{{.*}}<i32 3, i32 0, i32 2, i32 0>
+  // CHECK-LABEL: test_mm256_permute4x64_epi64
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 0>
   return _mm256_permute4x64_epi64(a, 35);
 }
 
-__m256i test_mm256_permute2x128_si256(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.vperm2i128
-  return _mm256_permute2x128_si256(a, b, 0x31);
-}
-
-__m128i test_mm256_extracti128_si256_0(__m256i a) {
-  // CHECK-LABEL: @test_mm256_extracti128_si256_0
-  // CHECK: shufflevector{{.*}}<i32 0, i32 1>
-  return _mm256_extracti128_si256(a, 0);
+__m256d test_mm256_permute4x64_pd(__m256d a) {
+  // CHECK-LABEL: test_mm256_permute4x64_pd
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <4 x i32> <i32 1, i32 2, i32 1, i32 0>
+  return _mm256_permute4x64_pd(a, 25);
 }
 
-__m128i test_mm256_extracti128_si256_1(__m256i a) {
-  // CHECK-LABEL: @test_mm256_extracti128_si256_1
-  // CHECK: shufflevector{{.*}}<i32 2, i32 3>
-  return _mm256_extracti128_si256(a, 1);
+__m256i test_mm256_permutevar8x32_epi32(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_permutevar8x32_epi32
+  // CHECK: call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  return _mm256_permutevar8x32_epi32(a, b);
 }
 
-// Immediate should be truncated to one bit.
-__m128i test_mm256_extracti128_si256_2(__m256i a) {
-  // CHECK-LABEL: @test_mm256_extracti128_si256_2
-  // CHECK: shufflevector{{.*}}<i32 0, i32 1>
-  return _mm256_extracti128_si256(a, 2);
+__m256 test_mm256_permutevar8x32_ps(__m256 a, __m256i b) {
+  // CHECK-LABEL: test_mm256_permutevar8x32_ps
+  // CHECK: call <8 x float> @llvm.x86.avx2.permps(<8 x float> %{{.*}}, <8 x i32> %{{.*}})
+  return _mm256_permutevar8x32_ps(a, b);
 }
 
-__m256i test_mm256_inserti128_si256_0(__m256i a, __m128i b) {
-  // CHECK-LABEL: @test_mm256_inserti128_si256_0
-  // CHECK: shufflevector{{.*}}<i32 4, i32 5, i32 2, i32 3>
-  return _mm256_inserti128_si256(a, b, 0);
+__m256i test_mm256_sad_epu8(__m256i x, __m256i y) {
+  // CHECK-LABEL: test_mm256_sad_epu8
+  // CHECK: call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %{{.*}}, <32 x i8> %{{.*}})
+  return _mm256_sad_epu8(x, y);
 }
 
-__m256i test_mm256_inserti128_si256_1(__m256i a, __m128i b) {
-  // CHECK-LABEL: @test_mm256_inserti128_si256_1
-  // CHECK: shufflevector{{.*}}<i32 0, i32 1, i32 4, i32 5>
-  return _mm256_inserti128_si256(a, b, 1);
+__m256i test_mm256_shuffle_epi8(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_shuffle_epi8
+  // CHECK: call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %{{.*}}, <32 x i8> %{{.*}})
+  return _mm256_shuffle_epi8(a, b);
 }
 
-// Immediate should be truncated to one bit.
-__m256i test_mm256_inserti128_si256_2(__m256i a, __m128i b) {
-  // CHECK-LABEL: @test_mm256_inserti128_si256_2
-  // CHECK: shufflevector{{.*}}<i32 4, i32 5, i32 2, i32 3>
-  return _mm256_inserti128_si256(a, b, 2);
+__m256i test_mm256_shuffle_epi32(__m256i a) {
+  // CHECK-LABEL: test_mm256_shuffle_epi32
+  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 3, i32 3, i32 0, i32 0, i32 7, i32 7, i32 4, i32 4>
+  return _mm256_shuffle_epi32(a, 15);
 }
 
-__m256i test_mm256_maskload_epi32(int const *a, __m256i m) {
-  // CHECK: @llvm.x86.avx2.maskload.d.256
-  return _mm256_maskload_epi32(a, m);
+__m256i test_mm256_shufflehi_epi16(__m256i a) {
+  // CHECK-LABEL: test_mm256_shufflehi_epi16
+  // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 14, i32 13>
+  return _mm256_shufflehi_epi16(a, 107);
 }
 
-__m256i test_mm256_maskload_epi64(long long const *a, __m256i m) {
-  // CHECK: @llvm.x86.avx2.maskload.q.256
-  return _mm256_maskload_epi64(a, m);
+__m256i test_mm256_shufflelo_epi16(__m256i a) {
+  // CHECK-LABEL: test_mm256_shufflelo_epi16
+  // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i32> <i32 3, i32 0, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 9, i32 9, i32 12, i32 13, i32 14, i32 15>
+  return _mm256_shufflelo_epi16(a, 83);
 }
 
-__m128i test_mm_maskload_epi32(int const *a, __m128i m) {
-  // CHECK: @llvm.x86.avx2.maskload.d
-  return _mm_maskload_epi32(a, m);
+__m256i test_mm256_sign_epi8(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_sign_epi8
+  // CHECK: call <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8> %{{.*}}, <32 x i8> %{{.*}})
+  return _mm256_sign_epi8(a, b);
 }
 
-__m128i test_mm_maskload_epi64(long long const *a, __m128i m) {
-  // CHECK: @llvm.x86.avx2.maskload.q
-  return _mm_maskload_epi64(a, m);
+__m256i test_mm256_sign_epi16(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_sign_epi16
+  // CHECK: call <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
+  return _mm256_sign_epi16(a, b);
 }
 
-void test_mm256_maskstore_epi32(int *a, __m256i m, __m256i b) {
-  // CHECK: @llvm.x86.avx2.maskstore.d.256
-  _mm256_maskstore_epi32(a, m, b);
+__m256i test_mm256_sign_epi32(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_sign_epi32
+  // CHECK: call <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  return _mm256_sign_epi32(a, b);
 }
 
-void test_mm256_maskstore_epi64(long long *a, __m256i m, __m256i b) {
-  // CHECK: @llvm.x86.avx2.maskstore.q.256
-  _mm256_maskstore_epi64(a, m, b);
+__m256i test_mm256_slli_epi16(__m256i a) {
+  // CHECK-LABEL: test_mm256_slli_epi16
+  // CHECK: call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %{{.*}}, i32 %{{.*}})
+  return _mm256_slli_epi16(a, 3);
 }
 
-void test_mm_maskstore_epi32(int *a, __m128i m, __m128i b) {
-  // CHECK: @llvm.x86.avx2.maskstore.d
-  _mm_maskstore_epi32(a, m, b);
+__m256i test_mm256_slli_epi32(__m256i a) {
+  // CHECK-LABEL: test_mm256_slli_epi32
+  // CHECK: call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %{{.*}}, i32 %{{.*}})
+  return _mm256_slli_epi32(a, 3);
 }
 
-void test_mm_maskstore_epi64(long long *a, __m128i m, __m128i b) {
-  // CHECK: @llvm.x86.avx2.maskstore.q
-  _mm_maskstore_epi64(a, m, b);
+__m256i test_mm256_slli_epi64(__m256i a) {
+  // CHECK-LABEL: test_mm256_slli_epi64
+  // CHECK: call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %{{.*}}, i32 %{{.*}})
+  return _mm256_slli_epi64(a, 3);
 }
 
-__m256i test_mm256_sllv_epi32(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.psllv.d.256
-  return _mm256_sllv_epi32(a, b);
+__m256i test_mm256_slli_si256(__m256i a) {
+  // CHECK-LABEL: test_mm256_slli_si256
+  // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>
+  return _mm256_slli_si256(a, 3);
 }
 
 __m128i test_mm_sllv_epi32(__m128i a, __m128i b) {
-  // CHECK: @llvm.x86.avx2.psllv.d
+  // CHECK-LABEL: test_mm_sllv_epi32
+  // CHECK: call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_sllv_epi32(a, b);
 }
 
-__m256i test_mm256_sllv_epi64(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.psllv.q.256
-  return _mm256_sllv_epi64(a, b);
+__m256i test_mm256_sllv_epi32(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_sllv_epi32
+  // CHECK: call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  return _mm256_sllv_epi32(a, b);
 }
 
 __m128i test_mm_sllv_epi64(__m128i a, __m128i b) {
-  // CHECK: @llvm.x86.avx2.psllv.q
+  // CHECK-LABEL: test_mm_sllv_epi64
+  // CHECK: call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
   return _mm_sllv_epi64(a, b);
 }
 
-__m256i test_mm256_srav_epi32(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.psrav.d.256
-  return _mm256_srav_epi32(a, b);
-}
-
-__m128i test_mm_srav_epi32(__m128i a, __m128i b) {
-  // CHECK: @llvm.x86.avx2.psrav.d
-  return _mm_srav_epi32(a, b);
-}
-
-__m256i test_mm256_srlv_epi32(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.psrlv.d.256
-  return _mm256_srlv_epi32(a, b);
+__m256i test_mm256_sllv_epi64(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_sllv_epi64
+  // CHECK: call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %{{.*}}, <4 x i64> %{{.*}})
+  return _mm256_sllv_epi64(a, b);
 }
 
-__m128i test_mm_srlv_epi32(__m128i a, __m128i b) {
-  // CHECK: @llvm.x86.avx2.psrlv.d
-  return _mm_srlv_epi32(a, b);
+__m256i test_mm256_sra_epi16(__m256i a, __m128i b) {
+  // CHECK-LABEL: test_mm256_sra_epi16
+  // CHECK: call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %{{.*}}, <8 x i16> %{{.*}})
+  return _mm256_sra_epi16(a, b);
 }
 
-__m256i test_mm256_srlv_epi64(__m256i a, __m256i b) {
-  // CHECK: @llvm.x86.avx2.psrlv.q.256
-  return _mm256_srlv_epi64(a, b);
+__m256i test_mm256_sra_epi32(__m256i a, __m128i b) {
+  // CHECK-LABEL: test_mm256_sra_epi32
+  // CHECK: call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  return _mm256_sra_epi32(a, b);
 }
 
-__m128i test_mm_srlv_epi64(__m128i a, __m128i b) {
-  // CHECK: @llvm.x86.avx2.psrlv.q
-  return _mm_srlv_epi64(a, b);
+__m256i test_mm256_srai_epi16(__m256i a) {
+  // CHECK-LABEL: test_mm256_srai_epi16
+  // CHECK: call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %{{.*}}, i32 %{{.*}})
+  return _mm256_srai_epi16(a, 3);
 }
 
-__m128d test_mm_mask_i32gather_pd(__m128d a, double const *b, __m128i c,
-                                  __m128d d) {
-  // CHECK: @llvm.x86.avx2.gather.d.pd
-  return _mm_mask_i32gather_pd(a, b, c, d, 2);
+__m256i test_mm256_srai_epi32(__m256i a) {
+  // CHECK-LABEL: test_mm256_srai_epi32
+  // CHECK: call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %{{.*}}, i32 %{{.*}})
+  return _mm256_srai_epi32(a, 3);
 }
 
-__m256d test_mm256_mask_i32gather_pd(__m256d a, double const *b, __m128i c,
-                                      __m256d d) {
-  // CHECK: @llvm.x86.avx2.gather.d.pd.256
-  return _mm256_mask_i32gather_pd(a, b, c, d, 2);
+__m128i test_mm_srav_epi32(__m128i a, __m128i b) {
+  // CHECK-LABEL: test_mm_srav_epi32
+  // CHECK: call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  return _mm_srav_epi32(a, b);
 }
 
-__m128d test_mm_mask_i64gather_pd(__m128d a, double const *b, __m128i c,
-                                  __m128d d) {
-  // CHECK: @llvm.x86.avx2.gather.q.pd
-  return _mm_mask_i64gather_pd(a, b, c, d, 2);
+__m256i test_mm256_srav_epi32(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_srav_epi32
+  // CHECK: call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  return _mm256_srav_epi32(a, b);
 }
 
-__m256d test_mm256_mask_i64gather_pd(__m256d a, double const *b, __m256i c,
-                                      __m256d d) {
-  // CHECK: @llvm.x86.avx2.gather.q.pd.256
-  return _mm256_mask_i64gather_pd(a, b, c, d, 2);
+__m256i test_mm256_srl_epi16(__m256i a, __m128i b) {
+  // CHECK-LABEL: test_mm256_srl_epi16
+  // CHECK: call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %{{.*}}, <8 x i16> %{{.*}})
+  return _mm256_srl_epi16(a, b);
 }
 
-__m128 test_mm_mask_i32gather_ps(__m128 a, float const *b, __m128i c,
-                                 __m128 d) {
-  // CHECK: @llvm.x86.avx2.gather.d.ps
-  return _mm_mask_i32gather_ps(a, b, c, d, 2);
+__m256i test_mm256_srl_epi32(__m256i a, __m128i b) {
+  // CHECK-LABEL: test_mm256_srl_epi32
+  // CHECK:call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  return _mm256_srl_epi32(a, b);
 }
 
-__m256 test_mm256_mask_i32gather_ps(__m256 a, float const *b, __m256i c,
-                                     __m256 d) {
-  // CHECK: @llvm.x86.avx2.gather.d.ps.256
-  return _mm256_mask_i32gather_ps(a, b, c, d, 2);
+__m256i test_mm256_srl_epi64(__m256i a, __m128i b) {
+  // CHECK-LABEL: test_mm256_srl_epi64
+  // CHECK: call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %{{.*}}, <2 x i64> %{{.*}})
+  return _mm256_srl_epi64(a, b);
 }
 
-__m128 test_mm_mask_i64gather_ps(__m128 a, float const *b, __m128i c,
-                                 __m128 d) {
-  // CHECK: @llvm.x86.avx2.gather.q.ps
-  return _mm_mask_i64gather_ps(a, b, c, d, 2);
+__m256i test_mm256_srli_epi16(__m256i a) {
+  // CHECK-LABEL: test_mm256_srli_epi16
+  // CHECK: call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %{{.*}}, i32 %{{.*}})
+  return _mm256_srli_epi16(a, 3);
 }
 
-__m128 test_mm256_mask_i64gather_ps(__m128 a, float const *b, __m256i c,
-                                    __m128 d) {
-  // CHECK: @llvm.x86.avx2.gather.q.ps.256
-  return _mm256_mask_i64gather_ps(a, b, c, d, 2);
+__m256i test_mm256_srli_epi32(__m256i a) {
+  // CHECK-LABEL: test_mm256_srli_epi32
+  // CHECK: call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %{{.*}}, i32 %{{.*}})
+  return _mm256_srli_epi32(a, 3);
 }
 
-__m128i test_mm_mask_i32gather_epi32(__m128i a, int const *b, __m128i c,
-                                     __m128i d) {
-  // CHECK: @llvm.x86.avx2.gather.d.d
-  return _mm_mask_i32gather_epi32(a, b, c, d, 2);
+__m256i test_mm256_srli_epi64(__m256i a) {
+  // CHECK-LABEL: test_mm256_srli_epi64
+  // CHECK: call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %{{.*}}, i32 %{{.*}})
+  return _mm256_srli_epi64(a, 3);
 }
 
-__m256i test_mm256_mask_i32gather_epi32(__m256i a, int const *b, __m256i c,
-                                        __m256i d) {
-  // CHECK: @llvm.x86.avx2.gather.d.d.256
-  return _mm256_mask_i32gather_epi32(a, b, c, d, 2);
+__m256i test_mm256_srli_si256(__m256i a) {
+  // CHECK-LABEL: test_mm256_srli_si256
+  // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50>
+  return _mm256_srli_si256(a, 3);
 }
 
-__m128i test_mm_mask_i64gather_epi32(__m128i a, int const *b, __m128i c,
-                                     __m128i d) {
-  // CHECK: @llvm.x86.avx2.gather.q.d
-  return _mm_mask_i64gather_epi32(a, b, c, d, 2);
+__m128i test_mm_srlv_epi32(__m128i a, __m128i b) {
+  // CHECK-LABEL: test_mm_srlv_epi32
+  // CHECK: call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  return _mm_srlv_epi32(a, b);
 }
 
-__m128i test_mm256_mask_i64gather_epi32(__m128i a, int const *b, __m256i c,
-                                        __m128i d) {
-  // CHECK: @llvm.x86.avx2.gather.q.d.256
-  return _mm256_mask_i64gather_epi32(a, b, c, d, 2);
+__m256i test_mm256_srlv_epi32(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_srlv_epi32
+  // CHECK: call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  return _mm256_srlv_epi32(a, b);
 }
 
-__m128i test_mm_mask_i32gather_epi64(__m128i a, long long const *b, __m128i c,
-                                     __m128i d) {
-  // CHECK: @llvm.x86.avx2.gather.d.q
-  return _mm_mask_i32gather_epi64(a, b, c, d, 2);
+__m128i test_mm_srlv_epi64(__m128i a, __m128i b) {
+  // CHECK-LABEL: test_mm_srlv_epi64
+  // CHECK: call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
+  return _mm_srlv_epi64(a, b);
 }
 
-__m256i test_mm256_mask_i32gather_epi64(__m256i a, long long const *b, __m128i c,
-                                        __m256i d) {
-  // CHECK: @llvm.x86.avx2.gather.d.q.256
-  return _mm256_mask_i32gather_epi64(a, b, c, d, 2);
+__m256i test_mm256_srlv_epi64(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_srlv_epi64
+  // CHECK: call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %{{.*}}, <4 x i64> %{{.*}})
+  return _mm256_srlv_epi64(a, b);
 }
 
-__m128i test_mm_mask_i64gather_epi64(__m128i a, long long const *b, __m128i c,
-                                     __m128i d) {
-  // CHECK: @llvm.x86.avx2.gather.q.q
-  return _mm_mask_i64gather_epi64(a, b, c, d, 2);
+__m256i test_mm256_stream_load_si256(__m256i const *a) {
+  // CHECK-LABEL: test_mm256_stream_load_si256
+  // CHECK: call <4 x i64> @llvm.x86.avx2.movntdqa(i8* %{{.*}})
+  return _mm256_stream_load_si256(a);
 }
 
-__m256i test_mm256_mask_i64gather_epi64(__m256i a, long long const *b, __m256i c,
-                                        __m256i d) {
-  // CHECK: @llvm.x86.avx2.gather.q.q.256
-  return _mm256_mask_i64gather_epi64(a, b, c, d, 2);
+__m256i test_mm256_sub_epi8(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_sub_epi8
+  // CHECK: sub <32 x i8>
+  return _mm256_sub_epi8(a, b);
 }
 
-__m128d test_mm_i32gather_pd(double const *b, __m128i c) {
-  // CHECK: @llvm.x86.avx2.gather.d.pd
-  return _mm_i32gather_pd(b, c, 2);
+__m256i test_mm256_sub_epi16(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_sub_epi16
+  // CHECK: sub <16 x i16>
+  return _mm256_sub_epi16(a, b);
 }
 
-__m256d test_mm256_i32gather_pd(double const *b, __m128i c) {
-  // CHECK: @llvm.x86.avx2.gather.d.pd.256
-  return _mm256_i32gather_pd(b, c, 2);
+__m256i test_mm256_sub_epi32(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_sub_epi32
+  // CHECK: sub <8 x i32>
+  return _mm256_sub_epi32(a, b);
 }
 
-__m128d test_mm_i64gather_pd(double const *b, __m128i c) {
-  // CHECK: @llvm.x86.avx2.gather.q.pd
-  return _mm_i64gather_pd(b, c, 2);
+__m256i test_mm256_sub_epi64(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_sub_epi64
+  // CHECK: sub <4 x i64>
+  return _mm256_sub_epi64(a, b);
 }
 
-__m256d test_mm256_i64gather_pd(double const *b, __m256i c) {
-  // CHECK: @llvm.x86.avx2.gather.q.pd.256
-  return _mm256_i64gather_pd(b, c, 2);
+__m256i test_mm256_subs_epi8(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_subs_epi8
+  // CHECK: call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %{{.*}}, <32 x i8> %{{.*}})
+  return _mm256_subs_epi8(a, b);
 }
 
-__m128 test_mm_i32gather_ps(float const *b, __m128i c) {
-  // CHECK: @llvm.x86.avx2.gather.d.ps
-  return _mm_i32gather_ps(b, c, 2);
+__m256i test_mm256_subs_epi16(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_subs_epi16
+  // CHECK: call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
+  return _mm256_subs_epi16(a, b);
 }
 
-__m256 test_mm256_i32gather_ps(float const *b, __m256i c) {
-  // CHECK: @llvm.x86.avx2.gather.d.ps.256
-  return _mm256_i32gather_ps(b, c, 2);
+__m256i test_mm256_subs_epu8(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_subs_epu8
+  // CHECK: call <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8> %{{.*}}, <32 x i8> %{{.*}})
+  return _mm256_subs_epu8(a, b);
 }
 
-__m128 test_mm_i64gather_ps(float const *b, __m128i c) {
-  // CHECK: @llvm.x86.avx2.gather.q.ps
-  return _mm_i64gather_ps(b, c, 2);
+__m256i test_mm256_subs_epu16(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_subs_epu16
+  // CHECK: call <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
+  return _mm256_subs_epu16(a, b);
 }
 
-__m128 test_mm256_i64gather_ps(float const *b, __m256i c) {
-  // CHECK: @llvm.x86.avx2.gather.q.ps.256
-  return _mm256_i64gather_ps(b, c, 2);
+__m256i test_mm256_unpackhi_epi8(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_unpackhi_epi8
+  // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
+  return _mm256_unpackhi_epi8(a, b);
 }
 
-__m128i test_mm_i32gather_epi32(int const *b, __m128i c) {
-  // CHECK: @llvm.x86.avx2.gather.d.d
-  return _mm_i32gather_epi32(b, c, 2);
+__m256i test_mm256_unpackhi_epi16(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_unpackhi_epi16
+  // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+  return _mm256_unpackhi_epi16(a, b);
 }
 
-__m256i test_mm256_i32gather_epi32(int const *b, __m256i c) {
-  // CHECK: @llvm.x86.avx2.gather.d.d.256
-  return _mm256_i32gather_epi32(b, c, 2);
+__m256i test_mm256_unpackhi_epi32(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_unpackhi_epi32
+  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+  return _mm256_unpackhi_epi32(a, b);
 }
 
-__m128i test_mm_i64gather_epi32(int const *b, __m128i c) {
-  // CHECK: @llvm.x86.avx2.gather.q.d
-  return _mm_i64gather_epi32(b, c, 2);
+__m256i test_mm256_unpackhi_epi64(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_unpackhi_epi64
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+  return _mm256_unpackhi_epi64(a, b);
 }
 
-__m128i test_mm256_i64gather_epi32(int const *b, __m256i c) {
-  // CHECK: @llvm.x86.avx2.gather.q.d.256
-  return _mm256_i64gather_epi32(b, c, 2);
+__m256i test_mm256_unpacklo_epi8(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_unpacklo_epi8
+  // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
+  return _mm256_unpacklo_epi8(a, b);
 }
 
-__m128i test_mm_i32gather_epi64(long long const *b, __m128i c) {
-  // CHECK: @llvm.x86.avx2.gather.d.q
-  return _mm_i32gather_epi64(b, c, 2);
+__m256i test_mm256_unpacklo_epi16(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_unpacklo_epi16
+  // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
+  return _mm256_unpacklo_epi16(a, b);
 }
 
-__m256i test_mm256_i32gather_epi64(long long const *b, __m128i c) {
-  // CHECK: @llvm.x86.avx2.gather.d.q.256
-  return _mm256_i32gather_epi64(b, c, 2);
+__m256i test_mm256_unpacklo_epi32(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_unpacklo_epi32
+  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+  return _mm256_unpacklo_epi32(a, b);
 }
 
-__m128i test_mm_i64gather_epi64(long long const *b, __m128i c) {
-  // CHECK: @llvm.x86.avx2.gather.q.q
-  return _mm_i64gather_epi64(b, c, 2);
+__m256i test_mm256_unpacklo_epi64(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_unpacklo_epi64
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  return _mm256_unpacklo_epi64(a, b);
 }
 
-__m256i test_mm256_i64gather_epi64(long long const *b, __m256i c) {
-  // CHECK: @llvm.x86.avx2.gather.q.q.256
-  return _mm256_i64gather_epi64(b, c, 2);
+__m256i test_mm256_xor_si256(__m256i a, __m256i b) {
+  // CHECK-LABEL: test_mm256_xor_si256
+  // CHECK: xor <4 x i64>
+  return _mm256_xor_si256(a, b);
 }
diff --git a/test/CodeGen/avx512bw-builtins.c b/test/CodeGen/avx512bw-builtins.c
index 7addd98b112d4..1cd0a0ccb141a 100644
--- a/test/CodeGen/avx512bw-builtins.c
+++ b/test/CodeGen/avx512bw-builtins.c
@@ -8,338 +8,366 @@
 
 __mmask64 test_mm512_cmpeq_epi8_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpeq_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpeq.b.512
+  // CHECK: icmp eq <64 x i8> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_cmpeq_epi8_mask(__a, __b);
 }
 
 __mmask64 test_mm512_mask_cmpeq_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpeq_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpeq.b.512
+  // CHECK: icmp eq <64 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <64 x i1> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_mask_cmpeq_epi8_mask(__u, __a, __b);
 }
 
 __mmask32 test_mm512_cmpeq_epi16_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpeq_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpeq.w.512
+  // CHECK: icmp eq <32 x i16> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_cmpeq_epi16_mask(__a, __b);
 }
 
 __mmask32 test_mm512_mask_cmpeq_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpeq_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpeq.w.512
+  // CHECK: icmp eq <32 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_mask_cmpeq_epi16_mask(__u, __a, __b);
 }
 
 __mmask64 test_mm512_cmpgt_epi8_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpgt_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpgt.b.512
+  // CHECK: icmp sgt <64 x i8> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_cmpgt_epi8_mask(__a, __b);
 }
 
 __mmask64 test_mm512_mask_cmpgt_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpgt_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpgt.b.512
+  // CHECK: icmp sgt <64 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <64 x i1> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_mask_cmpgt_epi8_mask(__u, __a, __b);
 }
 
 __mmask32 test_mm512_cmpgt_epi16_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpgt_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpgt.w.512
+  // CHECK: icmp sgt <32 x i16> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_cmpgt_epi16_mask(__a, __b);
 }
 
 __mmask32 test_mm512_mask_cmpgt_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpgt_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpgt.w.512
+  // CHECK: icmp sgt <32 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_mask_cmpgt_epi16_mask(__u, __a, __b);
 }
 
 __mmask64 test_mm512_cmpeq_epu8_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpeq_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> {{.*}}, <64 x i8> {{.*}}, i32 0, i64 -1)
+  // CHECK: icmp eq <64 x i8> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_cmpeq_epu8_mask(__a, __b);
 }
 
 __mmask64 test_mm512_mask_cmpeq_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpeq_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> {{.*}}, <64 x i8> {{.*}}, i32 0, i64 {{.*}})
+  // CHECK: icmp eq <64 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <64 x i1> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_mask_cmpeq_epu8_mask(__u, __a, __b);
 }
 
 __mmask32 test_mm512_cmpeq_epu16_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpeq_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> {{.*}}, <32 x i16> {{.*}}, i32 0, i32 -1)
+  // CHECK: icmp eq <32 x i16> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_cmpeq_epu16_mask(__a, __b);
 }
 
 __mmask32 test_mm512_mask_cmpeq_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpeq_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> {{.*}}, <32 x i16> {{.*}}, i32 0, i32 {{.*}})
+  // CHECK: icmp eq <32 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_mask_cmpeq_epu16_mask(__u, __a, __b);
 }
 
 __mmask64 test_mm512_cmpgt_epu8_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpgt_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> {{.*}}, <64 x i8> {{.*}}, i32 6, i64 -1)
+  // CHECK: icmp ugt <64 x i8> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_cmpgt_epu8_mask(__a, __b);
 }
 
 __mmask64 test_mm512_mask_cmpgt_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpgt_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> {{.*}}, <64 x i8> {{.*}}, i32 6, i64 {{.*}})
+  // CHECK: icmp ugt <64 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <64 x i1> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_mask_cmpgt_epu8_mask(__u, __a, __b);
 }
 
 __mmask32 test_mm512_cmpgt_epu16_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpgt_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> {{.*}}, <32 x i16> {{.*}}, i32 6, i32 -1)
+  // CHECK: icmp ugt <32 x i16> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_cmpgt_epu16_mask(__a, __b);
 }
 
 __mmask32 test_mm512_mask_cmpgt_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpgt_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> {{.*}}, <32 x i16> {{.*}}, i32 6, i32 {{.*}})
+  // CHECK: icmp ugt <32 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_mask_cmpgt_epu16_mask(__u, __a, __b);
 }
 
 __mmask64 test_mm512_cmpge_epi8_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpge_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> {{.*}}, <64 x i8> {{.*}}, i32 5, i64 -1)
+  // CHECK: icmp sge <64 x i8> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_cmpge_epi8_mask(__a, __b);
 }
 
 __mmask64 test_mm512_mask_cmpge_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpge_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> {{.*}}, <64 x i8> {{.*}}, i32 5, i64 {{.*}})
+  // CHECK: icmp sge <64 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <64 x i1> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_mask_cmpge_epi8_mask(__u, __a, __b);
 }
 
 __mmask64 test_mm512_cmpge_epu8_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpge_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> {{.*}}, <64 x i8> {{.*}}, i32 5, i64 -1)
+  // CHECK: icmp uge <64 x i8> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_cmpge_epu8_mask(__a, __b);
 }
 
 __mmask64 test_mm512_mask_cmpge_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpge_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> {{.*}}, <64 x i8> {{.*}}, i32 5, i64 {{.*}})
+  // CHECK: icmp uge <64 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <64 x i1> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_mask_cmpge_epu8_mask(__u, __a, __b);
 }
 
 __mmask32 test_mm512_cmpge_epi16_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpge_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> {{.*}}, <32 x i16> {{.*}}, i32 5, i32 -1)
+  // CHECK: icmp sge <32 x i16> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_cmpge_epi16_mask(__a, __b);
 }
 
 __mmask32 test_mm512_mask_cmpge_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpge_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> {{.*}}, <32 x i16> {{.*}}, i32 5, i32 {{.*}})
+  // CHECK: icmp sge <32 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_mask_cmpge_epi16_mask(__u, __a, __b);
 }
 
 __mmask32 test_mm512_cmpge_epu16_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpge_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> {{.*}}, <32 x i16> {{.*}}, i32 5, i32 -1)
+  // CHECK: icmp uge <32 x i16> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_cmpge_epu16_mask(__a, __b);
 }
 
 __mmask32 test_mm512_mask_cmpge_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpge_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> {{.*}}, <32 x i16> {{.*}}, i32 5, i32 {{.*}})
+  // CHECK: icmp uge <32 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_mask_cmpge_epu16_mask(__u, __a, __b);
 }
 
 __mmask64 test_mm512_cmple_epi8_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmple_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> {{.*}}, <64 x i8> {{.*}}, i32 2, i64 -1)
+  // CHECK: icmp sle <64 x i8> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_cmple_epi8_mask(__a, __b);
 }
 
 __mmask64 test_mm512_mask_cmple_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmple_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> {{.*}}, <64 x i8> {{.*}}, i32 2, i64 {{.*}})
+  // CHECK: icmp sle <64 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <64 x i1> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_mask_cmple_epi8_mask(__u, __a, __b);
 }
 
 __mmask64 test_mm512_cmple_epu8_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmple_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> {{.*}}, <64 x i8> {{.*}}, i32 2, i64 -1)
+  // CHECK: icmp ule <64 x i8> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_cmple_epu8_mask(__a, __b);
 }
 
 __mmask64 test_mm512_mask_cmple_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmple_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> {{.*}}, <64 x i8> {{.*}}, i32 2, i64 {{.*}})
+  // CHECK: icmp ule <64 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <64 x i1> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_mask_cmple_epu8_mask(__u, __a, __b);
 }
 
 __mmask32 test_mm512_cmple_epi16_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmple_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> {{.*}}, <32 x i16> {{.*}}, i32 2, i32 -1)
+  // CHECK: icmp sle <32 x i16> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_cmple_epi16_mask(__a, __b);
 }
 
 __mmask32 test_mm512_mask_cmple_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmple_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> {{.*}}, <32 x i16> {{.*}}, i32 2, i32 {{.*}})
+  // CHECK: icmp sle <32 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_mask_cmple_epi16_mask(__u, __a, __b);
 }
 
 __mmask32 test_mm512_cmple_epu16_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmple_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> {{.*}}, <32 x i16> {{.*}}, i32 2, i32 -1)
+  // CHECK: icmp ule <32 x i16> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_cmple_epu16_mask(__a, __b);
 }
 
 __mmask32 test_mm512_mask_cmple_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmple_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> {{.*}}, <32 x i16> {{.*}}, i32 2, i32 {{.*}})
+  // CHECK: icmp ule <32 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_mask_cmple_epu16_mask(__u, __a, __b);
 }
 
 __mmask64 test_mm512_cmplt_epi8_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmplt_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> {{.*}}, <64 x i8> {{.*}}, i32 1, i64 -1)
+  // CHECK: icmp slt <64 x i8> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_cmplt_epi8_mask(__a, __b);
 }
 
 __mmask64 test_mm512_mask_cmplt_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmplt_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> {{.*}}, <64 x i8> {{.*}}, i32 1, i64 {{.*}})
+  // CHECK: icmp slt <64 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <64 x i1> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_mask_cmplt_epi8_mask(__u, __a, __b);
 }
 
 __mmask64 test_mm512_cmplt_epu8_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmplt_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> {{.*}}, <64 x i8> {{.*}}, i32 1, i64 -1)
+  // CHECK: icmp ult <64 x i8> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_cmplt_epu8_mask(__a, __b);
 }
 
 __mmask64 test_mm512_mask_cmplt_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmplt_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> {{.*}}, <64 x i8> {{.*}}, i32 1, i64 {{.*}})
+  // CHECK: icmp ult <64 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <64 x i1> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_mask_cmplt_epu8_mask(__u, __a, __b);
 }
 
 __mmask32 test_mm512_cmplt_epi16_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmplt_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> {{.*}}, <32 x i16> {{.*}}, i32 1, i32 -1)
+  // CHECK: icmp slt <32 x i16> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_cmplt_epi16_mask(__a, __b);
 }
 
 __mmask32 test_mm512_mask_cmplt_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmplt_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> {{.*}}, <32 x i16> {{.*}}, i32 1, i32 {{.*}})
+  // CHECK: icmp slt <32 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_mask_cmplt_epi16_mask(__u, __a, __b);
 }
 
 __mmask32 test_mm512_cmplt_epu16_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmplt_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> {{.*}}, <32 x i16> {{.*}}, i32 1, i32 -1)
+  // CHECK: icmp ult <32 x i16> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_cmplt_epu16_mask(__a, __b);
 }
 
 __mmask32 test_mm512_mask_cmplt_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmplt_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> {{.*}}, <32 x i16> {{.*}}, i32 1, i32 {{.*}})
+  // CHECK: icmp ult <32 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_mask_cmplt_epu16_mask(__u, __a, __b);
 }
 
 __mmask64 test_mm512_cmpneq_epi8_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpneq_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> {{.*}}, <64 x i8> {{.*}}, i32 4, i64 -1)
+  // CHECK: icmp ne <64 x i8> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_cmpneq_epi8_mask(__a, __b);
 }
 
 __mmask64 test_mm512_mask_cmpneq_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpneq_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> {{.*}}, <64 x i8> {{.*}}, i32 4, i64 {{.*}})
+  // CHECK: icmp ne <64 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <64 x i1> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_mask_cmpneq_epi8_mask(__u, __a, __b);
 }
 
 __mmask64 test_mm512_cmpneq_epu8_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpneq_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> {{.*}}, <64 x i8> {{.*}}, i32 4, i64 -1)
+  // CHECK: icmp ne <64 x i8> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_cmpneq_epu8_mask(__a, __b);
 }
 
 __mmask64 test_mm512_mask_cmpneq_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpneq_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> {{.*}}, <64 x i8> {{.*}}, i32 4, i64 {{.*}})
+  // CHECK: icmp ne <64 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <64 x i1> %{{.*}}, %{{.*}}
   return (__mmask64)_mm512_mask_cmpneq_epu8_mask(__u, __a, __b);
 }
 
 __mmask32 test_mm512_cmpneq_epi16_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpneq_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> {{.*}}, <32 x i16> {{.*}}, i32 4, i32 -1)
+  // CHECK: icmp ne <32 x i16> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_cmpneq_epi16_mask(__a, __b);
 }
 
 __mmask32 test_mm512_mask_cmpneq_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpneq_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> {{.*}}, <32 x i16> {{.*}}, i32 4, i32 {{.*}})
+  // CHECK: icmp ne <32 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_mask_cmpneq_epi16_mask(__u, __a, __b);
 }
 
 __mmask32 test_mm512_cmpneq_epu16_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpneq_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> {{.*}}, <32 x i16> {{.*}}, i32 4, i32 -1)
+  // CHECK: icmp ne <32 x i16> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_cmpneq_epu16_mask(__a, __b);
 }
 
 __mmask32 test_mm512_mask_cmpneq_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpneq_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> {{.*}}, <32 x i16> {{.*}}, i32 4, i32 {{.*}})
+  // CHECK: icmp ne <32 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
   return (__mmask32)_mm512_mask_cmpneq_epu16_mask(__u, __a, __b);
 }
 
 __mmask64 test_mm512_cmp_epi8_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmp_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> {{.*}}, <64 x i8> {{.*}}, i32 7, i64 -1)
-  return (__mmask64)_mm512_cmp_epi8_mask(__a, __b, 7);
+  // CHECK: icmp eq <64 x i8> %{{.*}}, %{{.*}}
+  return (__mmask64)_mm512_cmp_epi8_mask(__a, __b, 0);
 }
 
 __mmask64 test_mm512_mask_cmp_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmp_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> {{.*}}, <64 x i8> {{.*}}, i32 7, i64 {{.*}})
-  return (__mmask64)_mm512_mask_cmp_epi8_mask(__u, __a, __b, 7);
+  // CHECK: icmp eq <64 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <64 x i1> %{{.*}}, %{{.*}}
+  return (__mmask64)_mm512_mask_cmp_epi8_mask(__u, __a, __b, 0);
 }
 
 __mmask64 test_mm512_cmp_epu8_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmp_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> {{.*}}, <64 x i8> {{.*}}, i32 7, i64 -1)
-  return (__mmask64)_mm512_cmp_epu8_mask(__a, __b, 7);
+  // CHECK: icmp eq <64 x i8> %{{.*}}, %{{.*}}
+  return (__mmask64)_mm512_cmp_epu8_mask(__a, __b, 0);
 }
 
 __mmask64 test_mm512_mask_cmp_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmp_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> {{.*}}, <64 x i8> {{.*}}, i32 7, i64 {{.*}})
-  return (__mmask64)_mm512_mask_cmp_epu8_mask(__u, __a, __b, 7);
+  // CHECK: icmp eq <64 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <64 x i1> %{{.*}}, %{{.*}}
+  return (__mmask64)_mm512_mask_cmp_epu8_mask(__u, __a, __b, 0);
 }
 
 __mmask32 test_mm512_cmp_epi16_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmp_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> {{.*}}, <32 x i16> {{.*}}, i32 7, i32 -1)
-  return (__mmask32)_mm512_cmp_epi16_mask(__a, __b, 7);
+  // CHECK: icmp eq <32 x i16> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm512_cmp_epi16_mask(__a, __b, 0);
 }
 
 __mmask32 test_mm512_mask_cmp_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmp_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> {{.*}}, <32 x i16> {{.*}}, i32 7, i32 {{.*}})
-  return (__mmask32)_mm512_mask_cmp_epi16_mask(__u, __a, __b, 7);
+  // CHECK: icmp eq <32 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm512_mask_cmp_epi16_mask(__u, __a, __b, 0);
 }
 
 __mmask32 test_mm512_cmp_epu16_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmp_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> {{.*}}, <32 x i16> {{.*}}, i32 7, i32 -1)
-  return (__mmask32)_mm512_cmp_epu16_mask(__a, __b, 7);
+  // CHECK: icmp eq <32 x i16> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm512_cmp_epu16_mask(__a, __b, 0);
 }
 
 __mmask32 test_mm512_mask_cmp_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmp_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> {{.*}}, <32 x i16> {{.*}}, i32 7, i32 {{.*}})
-  return (__mmask32)_mm512_mask_cmp_epu16_mask(__u, __a, __b, 7);
+  // CHECK: icmp eq <32 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm512_mask_cmp_epu16_mask(__u, __a, __b, 0);
 }
 
 __m512i test_mm512_add_epi8 (__m512i __A, __m512i __B) {
@@ -434,12 +462,12 @@ __m512i test_mm512_maskz_mullo_epi16 (__mmask32 __U, __m512i __A, __m512i __B) {
 
 __m512i test_mm512_mask_blend_epi8(__mmask64 __U, __m512i __A, __m512i __W) {
   // CHECK-LABEL: @test_mm512_mask_blend_epi8
-  // CHECK: @llvm.x86.avx512.mask.blend.b.512
+  // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
   return _mm512_mask_blend_epi8(__U,__A,__W); 
 }
 __m512i test_mm512_mask_blend_epi16(__mmask32 __U, __m512i __A, __m512i __W) {
   // CHECK-LABEL: @test_mm512_mask_blend_epi16
-  // CHECK: @llvm.x86.avx512.mask.blend.w.512
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_mask_blend_epi16(__U,__A,__W); 
 }
 __m512i test_mm512_abs_epi8(__m512i __A) {
@@ -971,73 +999,617 @@ __m256i test_mm512_maskz_cvtepi16_epi8(__mmask32 __M, __m512i __A) {
 
 __m512i test_mm512_unpackhi_epi8(__m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_unpackhi_epi8
-  // CHECK: @llvm.x86.avx512.mask.punpckhb.w.512
+  // CHECK: shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
   return _mm512_unpackhi_epi8(__A, __B); 
 }
 
 __m512i test_mm512_mask_unpackhi_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_mask_unpackhi_epi8
-  // CHECK: @llvm.x86.avx512.mask.punpckhb.w.512
+  // CHECK: shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
+  // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
   return _mm512_mask_unpackhi_epi8(__W, __U, __A, __B); 
 }
 
 __m512i test_mm512_maskz_unpackhi_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_maskz_unpackhi_epi8
-  // CHECK: @llvm.x86.avx512.mask.punpckhb.w.512
+  // CHECK: shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
+  // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
   return _mm512_maskz_unpackhi_epi8(__U, __A, __B); 
 }
 
 __m512i test_mm512_unpackhi_epi16(__m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_unpackhi_epi16
-  // CHECK: @llvm.x86.avx512.mask.punpckhw.d.512
+  // CHECK: shufflevector <32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i32> <i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
   return _mm512_unpackhi_epi16(__A, __B); 
 }
 
 __m512i test_mm512_mask_unpackhi_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_mask_unpackhi_epi16
-  // CHECK: @llvm.x86.avx512.mask.punpckhw.d.512
+  // CHECK: shufflevector <32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i32> <i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_mask_unpackhi_epi16(__W, __U, __A, __B); 
 }
 
 __m512i test_mm512_maskz_unpackhi_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_maskz_unpackhi_epi16
-  // CHECK: @llvm.x86.avx512.mask.punpckhw.d.512
+  // CHECK: shufflevector <32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i32> <i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_maskz_unpackhi_epi16(__U, __A, __B); 
 }
 
 __m512i test_mm512_unpacklo_epi8(__m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_unpacklo_epi8
-  // CHECK: @llvm.x86.avx512.mask.punpcklb.w.512
+  // CHECK: shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <64 x i32> <i32 0, i32 64, i32 1, i32 65, i32 2, i32 66, i32 3, i32 67, i32 4, i32 68, i32 5, i32 69, i32 6, i32 70, i32 7, i32 71, i32 16, i32 80, i32 17, i32 81, i32 18, i32 82, i32 19, i32 83, i32 20, i32 84, i32 21, i32 85, i32 22, i32 86, i32 23, i32 87, i32 32, i32 96, i32 33, i32 97, i32 34, i32 98, i32 35, i32 99, i32 36, i32 100, i32 37, i32 101, i32 38, i32 102, i32 39, i32 103, i32 48, i32 112, i32 49, i32 113, i32 50, i32 114, i32 51, i32 115, i32 52, i32 116, i32 53, i32 117, i32 54, i32 118, i32 55, i32 119>
   return _mm512_unpacklo_epi8(__A, __B); 
 }
 
 __m512i test_mm512_mask_unpacklo_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_mask_unpacklo_epi8
-  // CHECK: @llvm.x86.avx512.mask.punpcklb.w.512
+  // CHECK: shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <64 x i32> <i32 0, i32 64, i32 1, i32 65, i32 2, i32 66, i32 3, i32 67, i32 4, i32 68, i32 5, i32 69, i32 6, i32 70, i32 7, i32 71, i32 16, i32 80, i32 17, i32 81, i32 18, i32 82, i32 19, i32 83, i32 20, i32 84, i32 21, i32 85, i32 22, i32 86, i32 23, i32 87, i32 32, i32 96, i32 33, i32 97, i32 34, i32 98, i32 35, i32 99, i32 36, i32 100, i32 37, i32 101, i32 38, i32 102, i32 39, i32 103, i32 48, i32 112, i32 49, i32 113, i32 50, i32 114, i32 51, i32 115, i32 52, i32 116, i32 53, i32 117, i32 54, i32 118, i32 55, i32 119>
+  // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
   return _mm512_mask_unpacklo_epi8(__W, __U, __A, __B); 
 }
 
 __m512i test_mm512_maskz_unpacklo_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_maskz_unpacklo_epi8
-  // CHECK: @llvm.x86.avx512.mask.punpcklb.w.512
+  // CHECK: shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <64 x i32> <i32 0, i32 64, i32 1, i32 65, i32 2, i32 66, i32 3, i32 67, i32 4, i32 68, i32 5, i32 69, i32 6, i32 70, i32 7, i32 71, i32 16, i32 80, i32 17, i32 81, i32 18, i32 82, i32 19, i32 83, i32 20, i32 84, i32 21, i32 85, i32 22, i32 86, i32 23, i32 87, i32 32, i32 96, i32 33, i32 97, i32 34, i32 98, i32 35, i32 99, i32 36, i32 100, i32 37, i32 101, i32 38, i32 102, i32 39, i32 103, i32 48, i32 112, i32 49, i32 113, i32 50, i32 114, i32 51, i32 115, i32 52, i32 116, i32 53, i32 117, i32 54, i32 118, i32 55, i32 119>
+  // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
   return _mm512_maskz_unpacklo_epi8(__U, __A, __B); 
 }
 
 __m512i test_mm512_unpacklo_epi16(__m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_unpacklo_epi16
-  // CHECK: @llvm.x86.avx512.mask.punpcklw.d.512
+  // CHECK: shufflevector <32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59>
   return _mm512_unpacklo_epi16(__A, __B); 
 }
 
 __m512i test_mm512_mask_unpacklo_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_mask_unpacklo_epi16
-  // CHECK: @llvm.x86.avx512.mask.punpcklw.d.512
+  // CHECK: shufflevector <32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59>
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_mask_unpacklo_epi16(__W, __U, __A, __B); 
 }
 
 __m512i test_mm512_maskz_unpacklo_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_maskz_unpacklo_epi16
-  // CHECK: @llvm.x86.avx512.mask.punpcklw.d.512
+  // CHECK: shufflevector <32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59>
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
   return _mm512_maskz_unpacklo_epi16(__U, __A, __B); 
 }
 
+__m512i test_mm512_cvtepi8_epi16(__m256i __A) {
+  // CHECK-LABEL: @test_mm512_cvtepi8_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovsxb.w.512
+  return _mm512_cvtepi8_epi16(__A); 
+}
+
+__m512i test_mm512_mask_cvtepi8_epi16(__m512i __W, __mmask32 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtepi8_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovsxb.w.512
+  return _mm512_mask_cvtepi8_epi16(__W, __U, __A); 
+}
+
+__m512i test_mm512_maskz_cvtepi8_epi16(__mmask32 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_cvtepi8_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovsxb.w.512
+  return _mm512_maskz_cvtepi8_epi16(__U, __A); 
+}
+
+__m512i test_mm512_cvtepu8_epi16(__m256i __A) {
+  // CHECK-LABEL: @test_mm512_cvtepu8_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovzxb.w.512
+  return _mm512_cvtepu8_epi16(__A); 
+}
+
+__m512i test_mm512_mask_cvtepu8_epi16(__m512i __W, __mmask32 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtepu8_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovzxb.w.512
+  return _mm512_mask_cvtepu8_epi16(__W, __U, __A); 
+}
+
+__m512i test_mm512_maskz_cvtepu8_epi16(__mmask32 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_cvtepu8_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovzxb.w.512
+  return _mm512_maskz_cvtepu8_epi16(__U, __A); 
+}
+
+__m512i test_mm512_shufflehi_epi16(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_shufflehi_epi16
+  // CHECK: shufflevector <32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 4, i32 4, i32 8, i32 9, i32 10, i32 11, i32 13, i32 13, i32 12, i32 12, i32 16, i32 17, i32 18, i32 19, i32 21, i32 21, i32 20, i32 20, i32 24, i32 25, i32 26, i32 27, i32 29, i32 29, i32 28, i32 28>
+  return _mm512_shufflehi_epi16(__A, 5); 
+}
+
+__m512i test_mm512_mask_shufflehi_epi16(__m512i __W, __mmask32 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_shufflehi_epi16
+  // CHECK: shufflevector <32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 4, i32 4, i32 8, i32 9, i32 10, i32 11, i32 13, i32 13, i32 12, i32 12, i32 16, i32 17, i32 18, i32 19, i32 21, i32 21, i32 20, i32 20, i32 24, i32 25, i32 26, i32 27, i32 29, i32 29, i32 28, i32 28>
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
+  return _mm512_mask_shufflehi_epi16(__W, __U, __A, 5); 
+}
+
+__m512i test_mm512_maskz_shufflehi_epi16(__mmask32 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_shufflehi_epi16
+  // CHECK: shufflevector <32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 4, i32 4, i32 8, i32 9, i32 10, i32 11, i32 13, i32 13, i32 12, i32 12, i32 16, i32 17, i32 18, i32 19, i32 21, i32 21, i32 20, i32 20, i32 24, i32 25, i32 26, i32 27, i32 29, i32 29, i32 28, i32 28>
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
+  return _mm512_maskz_shufflehi_epi16(__U, __A, 5); 
+}
+
+__m512i test_mm512_shufflelo_epi16(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_shufflelo_epi16
+  // CHECK: shufflevector <32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i32> <i32 1, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 9, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15, i32 17, i32 17, i32 16, i32 16, i32 20, i32 21, i32 22, i32 23, i32 25, i32 25, i32 24, i32 24, i32 28, i32 29, i32 30, i32 31>
+  return _mm512_shufflelo_epi16(__A, 5); 
+}
+
+__m512i test_mm512_mask_shufflelo_epi16(__m512i __W, __mmask32 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_shufflelo_epi16
+  // CHECK: shufflevector <32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i32> <i32 1, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 9, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15, i32 17, i32 17, i32 16, i32 16, i32 20, i32 21, i32 22, i32 23, i32 25, i32 25, i32 24, i32 24, i32 28, i32 29, i32 30, i32 31>
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
+  return _mm512_mask_shufflelo_epi16(__W, __U, __A, 5); 
+}
+
+__m512i test_mm512_maskz_shufflelo_epi16(__mmask32 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_shufflelo_epi16
+  // CHECK: shufflevector <32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i32> <i32 1, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 9, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15, i32 17, i32 17, i32 16, i32 16, i32 20, i32 21, i32 22, i32 23, i32 25, i32 25, i32 24, i32 24, i32 28, i32 29, i32 30, i32 31>
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
+  return _mm512_maskz_shufflelo_epi16(__U, __A, 5); 
+}
+
+__m512i test_mm512_sllv_epi16(__m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_sllv_epi16
+  // CHECK: @llvm.x86.avx512.mask.psllv
+  return _mm512_sllv_epi16(__A, __B); 
+}
+
+__m512i test_mm512_mask_sllv_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_mask_sllv_epi16
+  // CHECK: @llvm.x86.avx512.mask.psllv
+  return _mm512_mask_sllv_epi16(__W, __U, __A, __B); 
+}
+
+__m512i test_mm512_maskz_sllv_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_sllv_epi16
+  // CHECK: @llvm.x86.avx512.mask.psllv
+  return _mm512_maskz_sllv_epi16(__U, __A, __B); 
+}
+
+__m512i test_mm512_sll_epi16(__m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_sll_epi16
+  // CHECK: @llvm.x86.avx512.mask.psll.w.512
+  return _mm512_sll_epi16(__A, __B); 
+}
+
+__m512i test_mm512_mask_sll_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_mask_sll_epi16
+  // CHECK: @llvm.x86.avx512.mask.psll.w.512
+  return _mm512_mask_sll_epi16(__W, __U, __A, __B); 
+}
+
+__m512i test_mm512_maskz_sll_epi16(__mmask32 __U, __m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_sll_epi16
+  // CHECK: @llvm.x86.avx512.mask.psll.w.512
+  return _mm512_maskz_sll_epi16(__U, __A, __B); 
+}
+
+__m512i test_mm512_slli_epi16(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_slli_epi16
+  // CHECK: @llvm.x86.avx512.mask.psll.wi.512
+  return _mm512_slli_epi16(__A, 5); 
+}
+
+__m512i test_mm512_mask_slli_epi16(__m512i __W, __mmask32 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_slli_epi16
+  // CHECK: @llvm.x86.avx512.mask.psll.wi.512
+  return _mm512_mask_slli_epi16(__W, __U, __A, 5); 
+}
+
+__m512i test_mm512_maskz_slli_epi16(__mmask32 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_slli_epi16
+  // CHECK: @llvm.x86.avx512.mask.psll.wi.512
+  return _mm512_maskz_slli_epi16(__U, __A, 5); 
+}
+
+__m512i test_mm512_bslli_epi128(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_bslli_epi128
+  // CHECK: shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <64 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122>
+  return _mm512_bslli_epi128(__A, 5);
+}
+
+__m512i test_mm512_srlv_epi16(__m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_srlv_epi16
+  // CHECK: @llvm.x86.avx512.mask.psrlv
+  return _mm512_srlv_epi16(__A, __B); 
+}
+
+__m512i test_mm512_mask_srlv_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_mask_srlv_epi16
+  // CHECK: @llvm.x86.avx512.mask.psrlv
+  return _mm512_mask_srlv_epi16(__W, __U, __A, __B); 
+}
+
+__m512i test_mm512_maskz_srlv_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_srlv_epi16
+  // CHECK: @llvm.x86.avx512.mask.psrlv
+  return _mm512_maskz_srlv_epi16(__U, __A, __B); 
+}
+
+__m512i test_mm512_srav_epi16(__m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_srav_epi16
+  // CHECK: @llvm.x86.avx512.mask.psrav
+  return _mm512_srav_epi16(__A, __B); 
+}
+
+__m512i test_mm512_mask_srav_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_mask_srav_epi16
+  // CHECK: @llvm.x86.avx512.mask.psrav
+  return _mm512_mask_srav_epi16(__W, __U, __A, __B); 
+}
+
+__m512i test_mm512_maskz_srav_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_srav_epi16
+  // CHECK: @llvm.x86.avx512.mask.psrav
+  return _mm512_maskz_srav_epi16(__U, __A, __B); 
+}
+
+__m512i test_mm512_sra_epi16(__m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_sra_epi16
+  // CHECK: @llvm.x86.avx512.mask.psra.w.512
+  return _mm512_sra_epi16(__A, __B); 
+}
+
+__m512i test_mm512_mask_sra_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_mask_sra_epi16
+  // CHECK: @llvm.x86.avx512.mask.psra.w.512
+  return _mm512_mask_sra_epi16(__W, __U, __A, __B); 
+}
+
+__m512i test_mm512_maskz_sra_epi16(__mmask32 __U, __m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_sra_epi16
+  // CHECK: @llvm.x86.avx512.mask.psra.w.512
+  return _mm512_maskz_sra_epi16(__U, __A, __B); 
+}
+
+__m512i test_mm512_srai_epi16(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_srai_epi16
+  // CHECK: @llvm.x86.avx512.mask.psra.wi.512
+  return _mm512_srai_epi16(__A, 5); 
+}
+
+__m512i test_mm512_mask_srai_epi16(__m512i __W, __mmask32 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_srai_epi16
+  // CHECK: @llvm.x86.avx512.mask.psra.wi.512
+  return _mm512_mask_srai_epi16(__W, __U, __A, 5); 
+}
+
+__m512i test_mm512_maskz_srai_epi16(__mmask32 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_srai_epi16
+  // CHECK: @llvm.x86.avx512.mask.psra.wi.512
+  return _mm512_maskz_srai_epi16(__U, __A, 5); 
+}
+
+__m512i test_mm512_srl_epi16(__m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_srl_epi16
+  // CHECK: @llvm.x86.avx512.mask.psrl.w.512
+  return _mm512_srl_epi16(__A, __B); 
+}
+
+__m512i test_mm512_mask_srl_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_mask_srl_epi16
+  // CHECK: @llvm.x86.avx512.mask.psrl.w.512
+  return _mm512_mask_srl_epi16(__W, __U, __A, __B); 
+}
+
+__m512i test_mm512_maskz_srl_epi16(__mmask32 __U, __m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_srl_epi16
+  // CHECK: @llvm.x86.avx512.mask.psrl.w.512
+  return _mm512_maskz_srl_epi16(__U, __A, __B); 
+}
+
+__m512i test_mm512_srli_epi16(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_srli_epi16
+  // CHECK: @llvm.x86.avx512.mask.psrl.wi.512
+  return _mm512_srli_epi16(__A, 5); 
+}
+
+__m512i test_mm512_mask_srli_epi16(__m512i __W, __mmask32 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_srli_epi16
+  // CHECK: @llvm.x86.avx512.mask.psrl.wi.512
+  return _mm512_mask_srli_epi16(__W, __U, __A, 5); 
+}
+
+__m512i test_mm512_maskz_srli_epi16(__mmask32 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_srli_epi16
+  // CHECK: @llvm.x86.avx512.mask.psrl.wi.512
+  return _mm512_maskz_srli_epi16(__U, __A, 5); 
+}
+
+__m512i test_mm512_bsrli_epi128(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_bsrli_epi128
+  // CHECK: shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <64 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 81, i32 82, i32 83, i32 84, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 97, i32 98, i32 99, i32 100, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112, i32 113, i32 114, i32 115, i32 116>
+  return _mm512_bsrli_epi128(__A, 5);
+}
+__m512i test_mm512_mask_mov_epi16(__m512i __W, __mmask32 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_mov_epi16
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
+  return _mm512_mask_mov_epi16(__W, __U, __A); 
+}
+
+__m512i test_mm512_maskz_mov_epi16(__mmask32 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_mov_epi16
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
+  return _mm512_maskz_mov_epi16(__U, __A); 
+}
+
+__m512i test_mm512_mask_mov_epi8(__m512i __W, __mmask64 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_mov_epi8
+  // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
+  return _mm512_mask_mov_epi8(__W, __U, __A); 
+}
+
+__m512i test_mm512_maskz_mov_epi8(__mmask64 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_mov_epi8
+  // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
+  return _mm512_maskz_mov_epi8(__U, __A); 
+}
+
+__m512i test_mm512_mask_set1_epi8(__m512i __O, __mmask64 __M, char __A) {
+  // CHECK-LABEL: @test_mm512_mask_set1_epi8
+  // CHECK: @llvm.x86.avx512.mask.pbroadcast.b.gpr.512
+  return _mm512_mask_set1_epi8(__O, __M, __A); 
+}
+
+__m512i test_mm512_maskz_set1_epi8(__mmask64 __M, char __A) {
+  // CHECK-LABEL: @test_mm512_maskz_set1_epi8
+  // CHECK: @llvm.x86.avx512.mask.pbroadcast.b.gpr.512
+  return _mm512_maskz_set1_epi8(__M, __A); 
+}
+
+__mmask64 test_mm512_kunpackd(__mmask64 __A, __mmask64 __B) {
+  // CHECK-LABEL: @test_mm512_kunpackd
+  // CHECK: @llvm.x86.avx512.kunpck.dq
+  return _mm512_kunpackd(__A, __B); 
+}
+
+__mmask32 test_mm512_kunpackw(__mmask32 __A, __mmask32 __B) {
+  // CHECK-LABEL: @test_mm512_kunpackw
+  // CHECK: @llvm.x86.avx512.kunpck.wd
+  return _mm512_kunpackw(__A, __B); 
+}
+
+__m512i test_mm512_mask_loadu_epi16(__m512i __W, __mmask32 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm512_mask_loadu_epi16
+  // CHECK: @llvm.masked.load.v32i16.p0v32i16(<32 x i16>* %{{.*}}, i32 1, <32 x i1> %{{.*}}, <32 x i16> %{{.*}})
+  return _mm512_mask_loadu_epi16(__W, __U, __P); 
+}
+
+__m512i test_mm512_maskz_loadu_epi16(__mmask32 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm512_maskz_loadu_epi16
+  // CHECK: @llvm.masked.load.v32i16.p0v32i16(<32 x i16>* %{{.*}}, i32 1, <32 x i1> %{{.*}}, <32 x i16> %{{.*}})
+  return _mm512_maskz_loadu_epi16(__U, __P); 
+}
+
+__m512i test_mm512_mask_loadu_epi8(__m512i __W, __mmask64 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm512_mask_loadu_epi8
+  // CHECK: @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* %{{.*}}, i32 1, <64 x i1> %{{.*}}, <64 x i8> %{{.*}})
+  return _mm512_mask_loadu_epi8(__W, __U, __P); 
+}
+
+__m512i test_mm512_maskz_loadu_epi8(__mmask64 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm512_maskz_loadu_epi8
+  // CHECK: @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* %{{.*}}, i32 1, <64 x i1> %{{.*}}, <64 x i8> %{{.*}})
+  return _mm512_maskz_loadu_epi8(__U, __P); 
+}
+void test_mm512_mask_storeu_epi16(void *__P, __mmask32 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_storeu_epi16
+  // CHECK: @llvm.masked.store.v32i16.p0v32i16(<32 x i16> %{{.*}}, <32 x i16>* %{{.*}}, i32 1, <32 x i1> %{{.*}})
+  return _mm512_mask_storeu_epi16(__P, __U, __A); 
+}
+__mmask64 test_mm512_test_epi8_mask(__m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_test_epi8_mask
+  // CHECK: @llvm.x86.avx512.ptestm.b.512
+  return _mm512_test_epi8_mask(__A, __B); 
+}
+
+void test_mm512_mask_storeu_epi8(void *__P, __mmask64 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_storeu_epi8
+  // CHECK: @llvm.masked.store.v64i8.p0v64i8(<64 x i8> %{{.*}}, <64 x i8>* %{{.*}}, i32 1, <64 x i1> %{{.*}})
+  return _mm512_mask_storeu_epi8(__P, __U, __A); 
+}
+__mmask64 test_mm512_mask_test_epi8_mask(__mmask64 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_mask_test_epi8_mask
+  // CHECK: @llvm.x86.avx512.ptestm.b.512
+  return _mm512_mask_test_epi8_mask(__U, __A, __B); 
+}
+
+__mmask32 test_mm512_test_epi16_mask(__m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_test_epi16_mask
+  // CHECK: @llvm.x86.avx512.ptestm.w.
+  return _mm512_test_epi16_mask(__A, __B); 
+}
+
+__mmask32 test_mm512_mask_test_epi16_mask(__mmask32 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_mask_test_epi16_mask
+  // CHECK: @llvm.x86.avx512.ptestm.w.
+  return _mm512_mask_test_epi16_mask(__U, __A, __B); 
+}
+
+__mmask64 test_mm512_testn_epi8_mask(__m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_testn_epi8_mask
+  // CHECK: @llvm.x86.avx512.ptestnm.b.
+  return _mm512_testn_epi8_mask(__A, __B); 
+}
+
+__mmask64 test_mm512_mask_testn_epi8_mask(__mmask64 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_mask_testn_epi8_mask
+  // CHECK: @llvm.x86.avx512.ptestnm.b.
+  return _mm512_mask_testn_epi8_mask(__U, __A, __B); 
+}
+
+__mmask32 test_mm512_testn_epi16_mask(__m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_testn_epi16_mask
+  // CHECK: @llvm.x86.avx512.ptestnm.w.
+  return _mm512_testn_epi16_mask(__A, __B); 
+}
+
+__mmask32 test_mm512_mask_testn_epi16_mask(__mmask32 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_mask_testn_epi16_mask
+  // CHECK: @llvm.x86.avx512.ptestnm.w.
+  return _mm512_mask_testn_epi16_mask(__U, __A, __B); 
+}
+
+__mmask64 test_mm512_movepi8_mask(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_movepi8_mask
+  // CHECK: @llvm.x86.avx512.cvtb2mask.512
+  return _mm512_movepi8_mask(__A); 
+}
+
+__m512i test_mm512_movm_epi8(__mmask64 __A) {
+  // CHECK-LABEL: @test_mm512_movm_epi8
+  // CHECK: @llvm.x86.avx512.cvtmask2b.512
+  return _mm512_movm_epi8(__A); 
+}
+
+__m512i test_mm512_movm_epi16(__mmask32 __A) {
+  // CHECK-LABEL: @test_mm512_movm_epi16
+  // CHECK: @llvm.x86.avx512.cvtmask2w.512
+  return _mm512_movm_epi16(__A); 
+}
+
+__m512i test_mm512_broadcastb_epi8(__m128i __A) {
+  // CHECK-LABEL: @test_mm512_broadcastb_epi8
+  // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <64 x i32> zeroinitializer
+  return _mm512_broadcastb_epi8(__A);
+}
+
+__m512i test_mm512_mask_broadcastb_epi8(__m512i __O, __mmask64 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm512_mask_broadcastb_epi8
+  // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <64 x i32> zeroinitializer
+  // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
+  return _mm512_mask_broadcastb_epi8(__O, __M, __A);
+}
+
+__m512i test_mm512_maskz_broadcastb_epi8(__mmask64 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_broadcastb_epi8
+  // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> undef, <64 x i32> zeroinitializer
+  // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
+  return _mm512_maskz_broadcastb_epi8(__M, __A);
+}
+
+__m512i test_mm512_broadcastw_epi16(__m128i __A) {
+  // CHECK-LABEL: @test_mm512_broadcastw_epi16
+  // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> undef, <32 x i32> zeroinitializer
+  return _mm512_broadcastw_epi16(__A);
+}
+
+__m512i test_mm512_mask_broadcastw_epi16(__m512i __O, __mmask32 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm512_mask_broadcastw_epi16
+  // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> undef, <32 x i32> zeroinitializer
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
+  return _mm512_mask_broadcastw_epi16(__O, __M, __A);
+}
+
+__m512i test_mm512_maskz_broadcastw_epi16(__mmask32 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_broadcastw_epi16
+  // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> undef, <32 x i32> zeroinitializer
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
+  return _mm512_maskz_broadcastw_epi16(__M, __A);
+}
+
+__m512i test_mm512_mask_set1_epi16(__m512i __O, __mmask32 __M, short __A) {
+  // CHECK-LABEL: @test_mm512_mask_set1_epi16
+  // CHECK: @llvm.x86.avx512.mask.pbroadcast.w.gpr.512
+  return _mm512_mask_set1_epi16(__O, __M, __A); 
+}
+
+__m512i test_mm512_maskz_set1_epi16(__mmask32 __M, short __A) {
+  // CHECK-LABEL: @test_mm512_maskz_set1_epi16
+  // CHECK: @llvm.x86.avx512.mask.pbroadcast.w.gpr.512
+  return _mm512_maskz_set1_epi16(__M, __A); 
+}
+__m512i test_mm512_permutexvar_epi16(__m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_permutexvar_epi16
+  // CHECK: @llvm.x86.avx512.mask.permvar.hi.512
+ return _mm512_permutexvar_epi16(__A, __B); 
+}
+
+__m512i test_mm512_maskz_permutexvar_epi16(__mmask32 __M, __m512i __A, __m512i __B) {
+ // CHECK-LABEL: @test_mm512_maskz_permutexvar_epi16
+  // CHECK: @llvm.x86.avx512.mask.permvar.hi.512
+  return _mm512_maskz_permutexvar_epi16(__M, __A, __B); 
+}
+
+__m512i test_mm512_mask_permutexvar_epi16(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_mask_permutexvar_epi16
+  // CHECK: @llvm.x86.avx512.mask.permvar.hi.512
+  return _mm512_mask_permutexvar_epi16(__W, __M, __A, __B); 
+}
+__m512i test_mm512_alignr_epi8(__m512i __A,__m512i __B){
+    // CHECK-LABEL: @test_mm512_alignr_epi8
+    // CHECK: shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <64 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 81, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 97, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112, i32 113>
+    return _mm512_alignr_epi8(__A, __B, 2); 
+}
+
+__m512i test_mm512_mask_alignr_epi8(__m512i __W, __mmask64 __U, __m512i __A,__m512i __B){
+    // CHECK-LABEL: @test_mm512_mask_alignr_epi8
+    // CHECK: shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <64 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 81, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 97, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112, i32 113>
+    // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
+    return _mm512_mask_alignr_epi8(__W, __U, __A, __B, 2); 
+}
+
+__m512i test_mm512_maskz_alignr_epi8(__mmask64 __U, __m512i __A,__m512i __B){
+    // CHECK-LABEL: @test_mm512_maskz_alignr_epi8
+    // CHECK: shufflevector <64 x i8> %{{.*}}, <64 x i8> %{{.*}}, <64 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 81, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 97, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112, i32 113>
+    // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
+   return _mm512_maskz_alignr_epi8(__U, __A, __B, 2); 
+}
+
+
+
+__m512i test_mm512_mm_dbsad_epu8(__m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_mm_dbsad_epu8
+  // CHECK: @llvm.x86.avx512.mask.dbpsadbw.512
+  return _mm512_dbsad_epu8(__A, __B, 170); 
+}
+
+__m512i test_mm512_mm_mask_dbsad_epu8(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_mm_mask_dbsad_epu8
+  // CHECK: @llvm.x86.avx512.mask.dbpsadbw.512
+  return _mm512_mask_dbsad_epu8(__W, __U, __A, __B, 170); 
+}
+
+__m512i test_mm512_mm_maskz_dbsad_epu8(__mmask32 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_mm_maskz_dbsad_epu8
+  // CHECK: @llvm.x86.avx512.mask.dbpsadbw.512
+  return _mm512_maskz_dbsad_epu8(__U, __A, __B, 170); 
+}
+
+__m512i test_mm512_sad_epu8(__m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_sad_epu8
+  // CHECK: @llvm.x86.avx512.psad.bw.512
+  return _mm512_sad_epu8(__A, __B); 
+}
+
+__mmask32 test_mm512_movepi16_mask(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_movepi16_mask
+  // CHECK: @llvm.x86.avx512.cvtw2mask.512
+  return _mm512_movepi16_mask(__A); 
+}
+
+void test_mm512_mask_cvtepi16_storeu_epi8 (void * __P, __mmask32 __M, __m512i __A)
+{
+ // CHECK-LABEL: @test_mm512_mask_cvtepi16_storeu_epi8
+ // CHECK: @llvm.x86.avx512.mask.pmov.wb.mem.512
+ __builtin_ia32_pmovwb512mem_mask ( __P,  __A, __M);
+}
+
+void test_mm512_mask_cvtsepi16_storeu_epi8 (void * __P, __mmask32 __M, __m512i __A)
+{
+ // CHECK-LABEL: @test_mm512_mask_cvtsepi16_storeu_epi8
+ // CHECK: @llvm.x86.avx512.mask.pmovs.wb.mem.512
+ __builtin_ia32_pmovswb512mem_mask ( __P,  __A, __M);
+}
+
+void test_mm512_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask32 __M, __m512i __A)
+{
+ // CHECK-LABEL: @test_mm512_mask_cvtusepi16_storeu_epi8
+ // CHECK: @llvm.x86.avx512.mask.pmovus.wb.mem.512
+ __builtin_ia32_pmovuswb512mem_mask ( __P, __A, __M);
+}
diff --git a/test/CodeGen/avx512cdintrin.c b/test/CodeGen/avx512cdintrin.c
index 625a3d2bae915..415a82c2c1917 100644
--- a/test/CodeGen/avx512cdintrin.c
+++ b/test/CodeGen/avx512cdintrin.c
@@ -37,31 +37,47 @@ __m512i test_mm512_maskz_conflict_epi32(__mmask16 __U, __m512i __A) {
 }
 __m512i test_mm512_lzcnt_epi32(__m512i __A) {
   // CHECK-LABEL: @test_mm512_lzcnt_epi32
-  // CHECK: @llvm.x86.avx512.mask.lzcnt.d.512
+  // CHECK: call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %{{.*}}, i1 false)
   return _mm512_lzcnt_epi32(__A); 
 }
 __m512i test_mm512_mask_lzcnt_epi32(__m512i __W, __mmask16 __U, __m512i __A) {
   // CHECK-LABEL: @test_mm512_mask_lzcnt_epi32
-  // CHECK: @llvm.x86.avx512.mask.lzcnt.d.512
+  // CHECK: call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %{{.*}}, i1 false)
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_mask_lzcnt_epi32(__W,__U,__A); 
 }
 __m512i test_mm512_maskz_lzcnt_epi32(__mmask16 __U, __m512i __A) {
   // CHECK-LABEL: @test_mm512_maskz_lzcnt_epi32
-  // CHECK: @llvm.x86.avx512.mask.lzcnt.d.512
+  // CHECK: call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %{{.*}}, i1 false)
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_maskz_lzcnt_epi32(__U,__A); 
 }
 __m512i test_mm512_lzcnt_epi64(__m512i __A) {
   // CHECK-LABEL: @test_mm512_lzcnt_epi64
-  // CHECK: @llvm.x86.avx512.mask.lzcnt.q.512
+  // CHECK: call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %{{.*}}, i1 false)
   return _mm512_lzcnt_epi64(__A); 
 }
 __m512i test_mm512_mask_lzcnt_epi64(__m512i __W, __mmask8 __U, __m512i __A) {
   // CHECK-LABEL: @test_mm512_mask_lzcnt_epi64
-  // CHECK: @llvm.x86.avx512.mask.lzcnt.q.512
+  // CHECK: call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %{{.*}}, i1 false)
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
   return _mm512_mask_lzcnt_epi64(__W,__U,__A); 
 }
 __m512i test_mm512_maskz_lzcnt_epi64(__mmask8 __U, __m512i __A) {
   // CHECK-LABEL: @test_mm512_maskz_lzcnt_epi64
-  // CHECK: @llvm.x86.avx512.mask.lzcnt.q.512
+  // CHECK: call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %{{.*}}, i1 false)
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
   return _mm512_maskz_lzcnt_epi64(__U,__A); 
 }
+
+__m512i test_mm512_broadcastmb_epi64(__mmask8 __A) {
+  // CHECK-LABEL: @test_mm512_broadcastmb_epi64
+  // CHECK: @llvm.x86.avx512.broadcastmb.512
+  return _mm512_broadcastmb_epi64(__A); 
+}
+
+__m512i test_mm512_broadcastmw_epi32(__mmask16 __A) {
+  // CHECK-LABEL: @test_mm512_broadcastmw_epi32
+  // CHECK: @llvm.x86.avx512.broadcastmw.512
+  return _mm512_broadcastmw_epi32(__A); 
+}
diff --git a/test/CodeGen/avx512dq-builtins.c b/test/CodeGen/avx512dq-builtins.c
index fc09a28ac11c6..59a7cad7e3d10 100644
--- a/test/CodeGen/avx512dq-builtins.c
+++ b/test/CodeGen/avx512dq-builtins.c
@@ -635,6 +635,78 @@ __m512d test_mm512_maskz_range_round_pd(__mmask8 __U, __m512d __A, __m512d __B)
   return _mm512_maskz_range_round_pd(__U, __A, __B, 4, 8); 
 }
 
+__m128d test_mm512_range_round_sd(__m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm512_range_round_sd
+  // CHECK: @llvm.x86.avx512.mask.range.sd
+  return _mm_range_round_sd(__A, __B, 4, 8); 
+}
+
+__m128d test_mm512_mask_range_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  // CHECK-LABEL: test_mm512_mask_range_round_sd
+  // CHECK: @llvm.x86.avx512.mask.range.sd
+  return _mm_mask_range_round_sd(__W, __U, __A, __B, 4, 8); 
+}
+
+__m128d test_mm512_maskz_range_round_sd(__mmask8 __U, __m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm512_maskz_range_round_sd
+  // CHECK: @llvm.x86.avx512.mask.range.sd
+  return _mm_maskz_range_round_sd(__U, __A, __B, 4, 8); 
+}
+
+__m128d test_mm512_range_round_ss(__m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm512_range_round_ss
+  // CHECK: @llvm.x86.avx512.mask.range.ss
+  return _mm_range_round_ss(__A, __B, 4, 8); 
+}
+
+__m128d test_mm512_mask_range_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm512_mask_range_round_ss
+  // CHECK: @llvm.x86.avx512.mask.range.ss
+  return _mm_mask_range_round_ss(__W, __U, __A, __B, 4, 8); 
+}
+
+__m128 test_mm512_maskz_range_round_ss(__mmask8 __U, __m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm512_maskz_range_round_ss
+  // CHECK: @llvm.x86.avx512.mask.range.ss
+  return _mm_maskz_range_round_ss(__U, __A, __B, 4, 8); 
+}
+
+__m128d test_mm_range_sd(__m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_range_sd
+  // CHECK: @llvm.x86.avx512.mask.range.sd
+  return _mm_range_sd(__A, __B, 4); 
+}
+
+__m128d test_mm_mask_range_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  // CHECK-LABEL: test_mm_mask_range_sd
+  // CHECK: @llvm.x86.avx512.mask.range.sd
+  return _mm_mask_range_sd(__W, __U, __A, __B, 4); 
+}
+
+__m128d test_mm_maskz_range_sd(__mmask8 __U, __m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_maskz_range_sd
+  // CHECK: @llvm.x86.avx512.mask.range.sd
+  return _mm_maskz_range_sd(__U, __A, __B, 4); 
+}
+
+__m128d test_mm_range_ss(__m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_range_ss
+  // CHECK: @llvm.x86.avx512.mask.range.ss
+  return _mm_range_ss(__A, __B, 4); 
+}
+
+__m128d test_mm_mask_range_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_mask_range_ss
+  // CHECK: @llvm.x86.avx512.mask.range.ss
+  return _mm_mask_range_ss(__W, __U, __A, __B, 4); 
+}
+
+__m128 test_mm_maskz_range_ss(__mmask8 __U, __m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_maskz_range_ss
+  // CHECK: @llvm.x86.avx512.mask.range.ss
+  return _mm_maskz_range_ss(__U, __A, __B, 4); 
+}
+
 __m512 test_mm512_range_ps(__m512 __A, __m512 __B) {
   // CHECK-LABEL: @test_mm512_range_ps
   // CHECK: @llvm.x86.avx512.mask.range.ps.512
@@ -743,3 +815,397 @@ __m512 test_mm512_maskz_reduce_round_ps(__mmask16 __U, __m512 __A) {
   return _mm512_maskz_reduce_round_ps(__U, __A, 4, 8); 
 }
 
+__m128 test_mm_reduce_ss(__m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_reduce_ss
+  // CHECK: @llvm.x86.avx512.mask.reduce.ss
+  return _mm_reduce_ss(__A, __B, 4);
+}
+
+__m128 test_mm_mask_reduce_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_mask_reduce_ss
+  // CHECK: @llvm.x86.avx512.mask.reduce.ss
+  return _mm_mask_reduce_ss(__W, __U, __A, __B, 4);
+}
+
+__m128 test_mm_maskz_reduce_ss(__mmask8 __U, __m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_maskz_reduce_ss
+  // CHECK: @llvm.x86.avx512.mask.reduce.ss
+  return _mm_maskz_reduce_ss(__U, __A, __B, 4);
+}
+
+__m128 test_mm_reduce_round_ss(__m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_reduce_round_ss
+  // CHECK: @llvm.x86.avx512.mask.reduce.ss
+  return _mm_reduce_round_ss(__A, __B, 4, 8);
+}
+
+__m128 test_mm_mask_reduce_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_mask_reduce_round_ss
+  // CHECK: @llvm.x86.avx512.mask.reduce.ss
+  return _mm_mask_reduce_round_ss(__W, __U, __A, __B, 4, 8);
+}
+
+__m128 test_mm_maskz_reduce_round_ss(__mmask8 __U, __m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_maskz_reduce_round_ss
+  // CHECK: @llvm.x86.avx512.mask.reduce.ss
+  return _mm_maskz_reduce_round_ss(__U, __A, __B, 4, 8);
+}
+
+__m128d test_mm_reduce_sd(__m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_reduce_sd
+  // CHECK: @llvm.x86.avx512.mask.reduce.sd
+  return _mm_reduce_sd(__A, __B, 4);
+}
+
+__m128d test_mm_mask_reduce_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_mask_reduce_sd
+  // CHECK: @llvm.x86.avx512.mask.reduce.sd
+  return _mm_mask_reduce_sd(__W, __U, __A, __B, 4);
+}
+
+__m128d test_mm_maskz_reduce_sd(__mmask8 __U, __m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_maskz_reduce_sd
+  // CHECK: @llvm.x86.avx512.mask.reduce.sd
+  return _mm_maskz_reduce_sd(__U, __A, __B, 4);
+}
+
+__m128d test_mm_reduce_round_sd(__m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_reduce_round_sd
+  // CHECK: @llvm.x86.avx512.mask.reduce.sd
+  return _mm_reduce_round_sd(__A, __B, 4, 8);
+}
+
+__m128d test_mm_mask_reduce_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_mask_reduce_round_sd
+  // CHECK: @llvm.x86.avx512.mask.reduce.sd
+  return _mm_mask_reduce_round_sd(__W, __U, __A, __B, 4, 8);
+}
+
+__m128d test_mm_maskz_reduce_round_sd(__mmask8 __U, __m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_maskz_reduce_round_sd
+  // CHECK: @llvm.x86.avx512.mask.reduce.sd
+  return _mm_maskz_reduce_round_sd(__U, __A, __B, 4, 8);
+}
+
+__mmask16 test_mm512_movepi32_mask(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_movepi32_mask
+  // CHECK: @llvm.x86.avx512.cvtd2mask.512
+  return _mm512_movepi32_mask(__A); 
+}
+
+__m512i test_mm512_movm_epi32(__mmask16 __A) {
+  // CHECK-LABEL: @test_mm512_movm_epi32
+  // CHECK: @llvm.x86.avx512.cvtmask2d.512
+  return _mm512_movm_epi32(__A); 
+}
+
+__m512i test_mm512_movm_epi64(__mmask8 __A) {
+  // CHECK-LABEL: @test_mm512_movm_epi64
+  // CHECK: @llvm.x86.avx512.cvtmask2q.512
+  return _mm512_movm_epi64(__A); 
+}
+
+__mmask8 test_mm512_movepi64_mask(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_movepi64_mask
+  // CHECK: @llvm.x86.avx512.cvtq2mask.512
+  return _mm512_movepi64_mask(__A); 
+}
+
+__m512 test_mm512_broadcast_f32x2(__m128 __A) {
+  // CHECK-LABEL: @test_mm512_broadcast_f32x2
+  // CHECK: @llvm.x86.avx512.mask.broadcastf32x2
+  return _mm512_broadcast_f32x2(__A); 
+}
+
+__m512 test_mm512_mask_broadcast_f32x2(__m512 __O, __mmask16 __M, __m128 __A) {
+  // CHECK-LABEL: @test_mm512_mask_broadcast_f32x2
+  // CHECK: @llvm.x86.avx512.mask.broadcastf32x2
+  return _mm512_mask_broadcast_f32x2(__O, __M, __A); 
+}
+
+__m512 test_mm512_maskz_broadcast_f32x2(__mmask16 __M, __m128 __A) {
+  // CHECK-LABEL: @test_mm512_maskz_broadcast_f32x2
+  // CHECK: @llvm.x86.avx512.mask.broadcastf32x2
+  return _mm512_maskz_broadcast_f32x2(__M, __A); 
+}
+
+__m512 test_mm512_broadcast_f32x8(__m256 __A) {
+  // CHECK-LABEL: @test_mm512_broadcast_f32x8
+  // CHECK: @llvm.x86.avx512.mask.broadcastf32x8
+  return _mm512_broadcast_f32x8(__A); 
+}
+
+__m512 test_mm512_mask_broadcast_f32x8(__m512 __O, __mmask16 __M, __m256 __A) {
+  // CHECK-LABEL: @test_mm512_mask_broadcast_f32x8
+  // CHECK: @llvm.x86.avx512.mask.broadcastf32x8
+  return _mm512_mask_broadcast_f32x8(__O, __M, __A); 
+}
+
+__m512 test_mm512_maskz_broadcast_f32x8(__mmask16 __M, __m256 __A) {
+  // CHECK-LABEL: @test_mm512_maskz_broadcast_f32x8
+  // CHECK: @llvm.x86.avx512.mask.broadcastf32x8
+  return _mm512_maskz_broadcast_f32x8(__M, __A); 
+}
+
+__m512d test_mm512_broadcast_f64x2(__m128d __A) {
+  // CHECK-LABEL: @test_mm512_broadcast_f64x2
+  // CHECK: @llvm.x86.avx512.mask.broadcastf64x2
+  return _mm512_broadcast_f64x2(__A); 
+}
+
+__m512d test_mm512_mask_broadcast_f64x2(__m512d __O, __mmask8 __M, __m128d __A) {
+  // CHECK-LABEL: @test_mm512_mask_broadcast_f64x2
+  // CHECK: @llvm.x86.avx512.mask.broadcastf64x2
+  return _mm512_mask_broadcast_f64x2(__O, __M, __A); 
+}
+
+__m512d test_mm512_maskz_broadcast_f64x2(__mmask8 __M, __m128d __A) {
+  // CHECK-LABEL: @test_mm512_maskz_broadcast_f64x2
+  // CHECK: @llvm.x86.avx512.mask.broadcastf64x2
+  return _mm512_maskz_broadcast_f64x2(__M, __A); 
+}
+
+__m512i test_mm512_broadcast_i32x2(__m128i __A) {
+  // CHECK-LABEL: @test_mm512_broadcast_i32x2
+  // CHECK: @llvm.x86.avx512.mask.broadcasti32x2
+  return _mm512_broadcast_i32x2(__A); 
+}
+
+__m512i test_mm512_mask_broadcast_i32x2(__m512i __O, __mmask16 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm512_mask_broadcast_i32x2
+  // CHECK: @llvm.x86.avx512.mask.broadcasti32x2
+  return _mm512_mask_broadcast_i32x2(__O, __M, __A); 
+}
+
+__m512i test_mm512_maskz_broadcast_i32x2(__mmask16 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_broadcast_i32x2
+  // CHECK: @llvm.x86.avx512.mask.broadcasti32x2
+  return _mm512_maskz_broadcast_i32x2(__M, __A); 
+}
+
+__m512i test_mm512_broadcast_i32x8(__m256i __A) {
+  // CHECK-LABEL: @test_mm512_broadcast_i32x8
+  // CHECK: @llvm.x86.avx512.mask.broadcasti32x8
+  return _mm512_broadcast_i32x8(__A); 
+}
+
+__m512i test_mm512_mask_broadcast_i32x8(__m512i __O, __mmask16 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm512_mask_broadcast_i32x8
+  // CHECK: @llvm.x86.avx512.mask.broadcasti32x8
+  return _mm512_mask_broadcast_i32x8(__O, __M, __A); 
+}
+
+__m512i test_mm512_maskz_broadcast_i32x8(__mmask16 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_broadcast_i32x8
+  // CHECK: @llvm.x86.avx512.mask.broadcasti32x8
+  return _mm512_maskz_broadcast_i32x8(__M, __A); 
+}
+
+__m512i test_mm512_broadcast_i64x2(__m128i __A) {
+  // CHECK-LABEL: @test_mm512_broadcast_i64x2
+  // CHECK: @llvm.x86.avx512.mask.broadcasti64x2
+  return _mm512_broadcast_i64x2(__A); 
+}
+
+__m512i test_mm512_mask_broadcast_i64x2(__m512i __O, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm512_mask_broadcast_i64x2
+  // CHECK: @llvm.x86.avx512.mask.broadcasti64x2
+  return _mm512_mask_broadcast_i64x2(__O, __M, __A); 
+}
+
+__m512i test_mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_broadcast_i64x2
+  // CHECK: @llvm.x86.avx512.mask.broadcasti64x2
+  return _mm512_maskz_broadcast_i64x2(__M, __A); 
+}
+__m256 test_mm512_extractf32x8_ps(__m512 __A) {
+  // CHECK-LABEL: @test_mm512_extractf32x8_ps
+  // CHECK: @llvm.x86.avx512.mask.vextractf32x8
+  return _mm512_extractf32x8_ps(__A, 1); 
+}
+
+__m256 test_mm512_mask_extractf32x8_ps(__m256 __W, __mmask8 __U, __m512 __A) {
+  // CHECK-LABEL: @test_mm512_mask_extractf32x8_ps
+  // CHECK: @llvm.x86.avx512.mask.vextractf32x8
+  return _mm512_mask_extractf32x8_ps(__W, __U, __A, 1); 
+}
+
+__m256 test_mm512_maskz_extractf32x8_ps(__mmask8 __U, __m512 __A) {
+  // CHECK-LABEL: @test_mm512_maskz_extractf32x8_ps
+  // CHECK: @llvm.x86.avx512.mask.vextractf32x8
+  return _mm512_maskz_extractf32x8_ps(__U, __A, 1); 
+}
+
+__m128d test_mm512_extractf64x2_pd(__m512d __A) {
+  // CHECK-LABEL: @test_mm512_extractf64x2_pd
+  // CHECK: @llvm.x86.avx512.mask.vextractf64x2
+  return _mm512_extractf64x2_pd(__A, 3); 
+}
+
+__m128d test_mm512_mask_extractf64x2_pd(__m128d __W, __mmask8 __U, __m512d __A) {
+  // CHECK-LABEL: @test_mm512_mask_extractf64x2_pd
+  // CHECK: @llvm.x86.avx512.mask.vextractf64x2
+  return _mm512_mask_extractf64x2_pd(__W, __U, __A, 3); 
+}
+
+__m128d test_mm512_maskz_extractf64x2_pd(__mmask8 __U, __m512d __A) {
+  // CHECK-LABEL: @test_mm512_maskz_extractf64x2_pd
+  // CHECK: @llvm.x86.avx512.mask.vextractf64x2
+  return _mm512_maskz_extractf64x2_pd(__U, __A, 3); 
+}
+
+__m256i test_mm512_extracti32x8_epi32(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_extracti32x8_epi32
+  // CHECK: @llvm.x86.avx512.mask.vextracti32x8
+  return _mm512_extracti32x8_epi32(__A, 1); 
+}
+
+__m256i test_mm512_mask_extracti32x8_epi32(__m256i __W, __mmask8 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_extracti32x8_epi32
+  // CHECK: @llvm.x86.avx512.mask.vextracti32x8
+  return _mm512_mask_extracti32x8_epi32(__W, __U, __A, 1); 
+}
+
+__m256i test_mm512_maskz_extracti32x8_epi32(__mmask8 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_extracti32x8_epi32
+  // CHECK: @llvm.x86.avx512.mask.vextracti32x8
+  return _mm512_maskz_extracti32x8_epi32(__U, __A, 1); 
+}
+
+__m128i test_mm512_extracti64x2_epi64(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_extracti64x2_epi64
+  // CHECK: @llvm.x86.avx512.mask.vextracti64x2
+  return _mm512_extracti64x2_epi64(__A, 3); 
+}
+
+__m128i test_mm512_mask_extracti64x2_epi64(__m128i __W, __mmask8 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_extracti64x2_epi64
+  // CHECK: @llvm.x86.avx512.mask.vextracti64x2
+  return _mm512_mask_extracti64x2_epi64(__W, __U, __A, 3); 
+}
+
+__m128i test_mm512_maskz_extracti64x2_epi64(__mmask8 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_extracti64x2_epi64
+  // CHECK: @llvm.x86.avx512.mask.vextracti64x2
+  return _mm512_maskz_extracti64x2_epi64(__U, __A, 3); 
+}
+
+__m512 test_mm512_insertf32x8(__m512 __A, __m256 __B) {
+  // CHECK-LABEL: @test_mm512_insertf32x8
+  // CHECK: @llvm.x86.avx512.mask.insertf32x8
+  return _mm512_insertf32x8(__A, __B, 1); 
+}
+
+__m512 test_mm512_mask_insertf32x8(__m512 __W, __mmask16 __U, __m512 __A, __m256 __B) {
+  // CHECK-LABEL: @test_mm512_mask_insertf32x8
+  // CHECK: @llvm.x86.avx512.mask.insertf32x8
+  return _mm512_mask_insertf32x8(__W, __U, __A, __B, 1); 
+}
+
+__m512 test_mm512_maskz_insertf32x8(__mmask16 __U, __m512 __A, __m256 __B) {
+  // CHECK-LABEL: @test_mm512_maskz_insertf32x8
+  // CHECK: @llvm.x86.avx512.mask.insertf32x8
+  return _mm512_maskz_insertf32x8(__U, __A, __B, 1); 
+}
+
+__m512d test_mm512_insertf64x2(__m512d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm512_insertf64x2
+  // CHECK: @llvm.x86.avx512.mask.insertf64x2
+  return _mm512_insertf64x2(__A, __B, 3); 
+}
+
+__m512d test_mm512_mask_insertf64x2(__m512d __W, __mmask8 __U, __m512d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm512_mask_insertf64x2
+  // CHECK: @llvm.x86.avx512.mask.insertf64x2
+  return _mm512_mask_insertf64x2(__W, __U, __A, __B, 3); 
+}
+
+__m512d test_mm512_maskz_insertf64x2(__mmask8 __U, __m512d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm512_maskz_insertf64x2
+  // CHECK: @llvm.x86.avx512.mask.insertf64x2
+  return _mm512_maskz_insertf64x2(__U, __A, __B, 3); 
+}
+
+__m512i test_mm512_inserti32x8(__m512i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm512_inserti32x8
+  // CHECK: @llvm.x86.avx512.mask.inserti32x8
+  return _mm512_inserti32x8(__A, __B, 1); 
+}
+
+__m512i test_mm512_mask_inserti32x8(__m512i __W, __mmask16 __U, __m512i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm512_mask_inserti32x8
+  // CHECK: @llvm.x86.avx512.mask.inserti32x8
+  return _mm512_mask_inserti32x8(__W, __U, __A, __B, 1); 
+}
+
+__m512i test_mm512_maskz_inserti32x8(__mmask16 __U, __m512i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_inserti32x8
+  // CHECK: @llvm.x86.avx512.mask.inserti32x8
+  return _mm512_maskz_inserti32x8(__U, __A, __B, 1); 
+}
+
+__m512i test_mm512_inserti64x2(__m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_inserti64x2
+  // CHECK: @llvm.x86.avx512.mask.inserti64x2
+  return _mm512_inserti64x2(__A, __B, 1); 
+}
+
+__m512i test_mm512_mask_inserti64x2(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_mask_inserti64x2
+  // CHECK: @llvm.x86.avx512.mask.inserti64x2
+  return _mm512_mask_inserti64x2(__W, __U, __A, __B, 1); 
+}
+
+__m512i test_mm512_maskz_inserti64x2(__mmask8 __U, __m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_inserti64x2
+  // CHECK: @llvm.x86.avx512.mask.inserti64x2
+  return _mm512_maskz_inserti64x2(__U, __A, __B, 1); 
+}
+__mmask8 test_mm512_mask_fpclass_pd_mask(__mmask8 __U, __m512d __A) {
+  // CHECK-LABEL: @test_mm512_mask_fpclass_pd_mask
+  // CHECK: @llvm.x86.avx512.mask.fpclass.pd.512
+  return _mm512_mask_fpclass_pd_mask(__U, __A, 4); 
+}
+
+__mmask8 test_mm512_fpclass_pd_mask(__m512d __A) {
+  // CHECK-LABEL: @test_mm512_fpclass_pd_mask
+  // CHECK: @llvm.x86.avx512.mask.fpclass.pd.512
+  return _mm512_fpclass_pd_mask(__A, 4); 
+}
+
+__mmask16 test_mm512_mask_fpclass_ps_mask(__mmask16 __U, __m512 __A) {
+  // CHECK-LABEL: @test_mm512_mask_fpclass_ps_mask
+  // CHECK: @llvm.x86.avx512.mask.fpclass.ps.512
+  return _mm512_mask_fpclass_ps_mask(__U, __A, 4); 
+}
+
+__mmask16 test_mm512_fpclass_ps_mask(__m512 __A) {
+  // CHECK-LABEL: @test_mm512_fpclass_ps_mask
+  // CHECK: @llvm.x86.avx512.mask.fpclass.ps.512
+  return _mm512_fpclass_ps_mask(__A, 4); 
+}
+
+__mmask8 test_mm_fpclass_sd_mask(__m128 __A)  { 
+  // CHECK-LABEL: @test_mm_fpclass_sd_mask
+  // CHECK: @llvm.x86.avx512.mask.fpclass.sd
+ return _mm_fpclass_sd_mask (__A, 2);
+}
+
+__mmask8 test_mm_mask_fpclass_sd_mask(__mmask8 __U, __m128 __A)  {
+ // CHECK-LABEL: @test_mm_mask_fpclass_sd_mask
+ // CHECK: @llvm.x86.avx512.mask.fpclass.sd
+ return _mm_mask_fpclass_sd_mask (__U,  __A, 2);
+}
+
+__mmask8 test_mm_fpclass_ss_mask(__m128 __A)  { 
+ // CHECK-LABEL: @test_mm_fpclass_ss_mask
+ // CHECK: @llvm.x86.avx512.mask.fpclass.ss
+ return _mm_fpclass_ss_mask ( __A, 2);
+}
+
+__mmask8 test_mm_mask_fpclass_ss_mask(__mmask8 __U, __m128 __A)  {
+ // CHECK-LABEL: @test_mm_mask_fpclass_ss_mask
+ // CHECK: @llvm.x86.avx512.mask.fpclass.ss
+ return _mm_mask_fpclass_ss_mask (__U, __A, 2);
+}
+
diff --git a/test/CodeGen/avx512f-builtins.c b/test/CodeGen/avx512f-builtins.c
index c1f4c0ecc52db..556c06f532cf5 100644
--- a/test/CodeGen/avx512f-builtins.c
+++ b/test/CodeGen/avx512f-builtins.c
@@ -12,6 +12,41 @@ __m512d test_mm512_sqrt_pd(__m512d a)
   return _mm512_sqrt_pd(a);
 }
 
+__m512d test_mm512_mask_sqrt_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_sqrt_pd 
+  // CHECK: @llvm.x86.avx512.mask.sqrt.pd.512
+  return _mm512_mask_sqrt_pd (__W,__U,__A);
+}
+
+__m512d test_mm512_maskz_sqrt_pd (__mmask8 __U, __m512d __A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_sqrt_pd 
+  // CHECK: @llvm.x86.avx512.mask.sqrt.pd.512
+  return _mm512_maskz_sqrt_pd (__U,__A);
+}
+
+__m512d test_mm512_mask_sqrt_round_pd(__m512d __W,__mmask8 __U,__m512d __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_sqrt_round_pd
+  // CHECK: @llvm.x86.avx512.mask.sqrt.pd.512
+  return _mm512_mask_sqrt_round_pd(__W,__U,__A,_MM_FROUND_CUR_DIRECTION);
+}
+
+__m512d test_mm512_maskz_sqrt_round_pd(__mmask8 __U,__m512d __A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_sqrt_round_pd
+  // CHECK: @llvm.x86.avx512.mask.sqrt.pd.512
+  return _mm512_maskz_sqrt_round_pd(__U,__A,_MM_FROUND_CUR_DIRECTION);
+}
+
+__m512d test_mm512_sqrt_round_pd(__m512d __A)
+{
+  // CHECK-LABEL: @test_mm512_sqrt_round_pd
+  // CHECK: @llvm.x86.avx512.mask.sqrt.pd.512
+  return _mm512_sqrt_round_pd(__A,_MM_FROUND_CUR_DIRECTION);
+}
+
 __m512 test_mm512_sqrt_ps(__m512 a)
 {
   // CHECK-LABEL: @test_mm512_sqrt_ps
@@ -19,6 +54,41 @@ __m512 test_mm512_sqrt_ps(__m512 a)
   return _mm512_sqrt_ps(a);
 }
 
+__m512 test_mm512_mask_sqrt_ps(__m512 __W, __mmask16 __U, __m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_sqrt_ps
+  // CHECK: @llvm.x86.avx512.mask.sqrt.ps.512
+  return _mm512_mask_sqrt_ps( __W, __U, __A);
+}
+
+__m512 test_mm512_maskz_sqrt_ps( __mmask16 __U, __m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_sqrt_ps
+  // CHECK: @llvm.x86.avx512.mask.sqrt.ps.512
+  return _mm512_maskz_sqrt_ps(__U ,__A);
+}
+
+__m512 test_mm512_mask_sqrt_round_ps(__m512 __W,__mmask16 __U,__m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_sqrt_round_ps
+  // CHECK: @llvm.x86.avx512.mask.sqrt.ps.512
+  return _mm512_mask_sqrt_round_ps(__W,__U,__A,_MM_FROUND_CUR_DIRECTION);
+}
+
+__m512 test_mm512_maskz_sqrt_round_ps(__mmask16 __U,__m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_sqrt_round_ps
+  // CHECK: @llvm.x86.avx512.mask.sqrt.ps.512
+  return _mm512_maskz_sqrt_round_ps(__U,__A,_MM_FROUND_CUR_DIRECTION);
+}
+
+__m512 test_mm512_sqrt_round_ps(__m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_sqrt_round_ps
+  // CHECK: @llvm.x86.avx512.mask.sqrt.ps.512
+  return _mm512_sqrt_round_ps(__A,_MM_FROUND_CUR_DIRECTION);
+}
+
 __m512d test_mm512_rsqrt14_pd(__m512d a)
 {
   // CHECK-LABEL: @test_mm512_rsqrt14_pd
@@ -26,6 +96,20 @@ __m512d test_mm512_rsqrt14_pd(__m512d a)
   return _mm512_rsqrt14_pd(a);
 }
 
+__m512d test_mm512_mask_rsqrt14_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_rsqrt14_pd 
+  // CHECK: @llvm.x86.avx512.rsqrt14.pd.512
+  return _mm512_mask_rsqrt14_pd (__W,__U,__A);
+}
+
+__m512d test_mm512_maskz_rsqrt14_pd (__mmask8 __U, __m512d __A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_rsqrt14_pd 
+  // CHECK: @llvm.x86.avx512.rsqrt14.pd.512
+  return _mm512_maskz_rsqrt14_pd (__U,__A);
+}
+
 __m512 test_mm512_rsqrt14_ps(__m512 a)
 {
   // CHECK-LABEL: @test_mm512_rsqrt14_ps
@@ -33,6 +117,20 @@ __m512 test_mm512_rsqrt14_ps(__m512 a)
   return _mm512_rsqrt14_ps(a);
 }
 
+__m512 test_mm512_mask_rsqrt14_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_rsqrt14_ps 
+  // CHECK: @llvm.x86.avx512.rsqrt14.ps.512
+  return _mm512_mask_rsqrt14_ps (__W,__U,__A);
+}
+
+__m512 test_mm512_maskz_rsqrt14_ps (__mmask16 __U, __m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_rsqrt14_ps 
+  // CHECK: @llvm.x86.avx512.rsqrt14.ps.512
+  return _mm512_maskz_rsqrt14_ps (__U,__A);
+}
+
 __m512 test_mm512_add_ps(__m512 a, __m512 b)
 {
   // CHECK-LABEL: @test_mm512_add_ps
@@ -61,27 +159,67 @@ __m512d test_mm512_mul_pd(__m512d a, __m512d b)
   return _mm512_mul_pd(a, b);
 }
 
+void test_mm512_storeu_si512 (void *__P, __m512i __A)
+{
+  // CHECK-LABEL: @test_mm512_storeu_si512
+  // CHECK: store <16 x i32> %{{.*}}, <16 x i32>* %{{.*}}, align 1{{$}}
+  // CHECK-NEXT: ret void
+  _mm512_storeu_si512 ( __P,__A);
+}
+
 void test_mm512_storeu_ps(void *p, __m512 a)
 {
   // CHECK-LABEL: @test_mm512_storeu_ps
-  // CHECK: @llvm.x86.avx512.mask.storeu.ps.512
+  // CHECK: store <16 x float> %{{.*}}, <16 x float>* %{{.*}}, align 1{{$}}
+  // CHECK-NEXT: ret void
   _mm512_storeu_ps(p, a);
 }
 
 void test_mm512_storeu_pd(void *p, __m512d a)
 {
   // CHECK-LABEL: @test_mm512_storeu_pd
-  // CHECK: @llvm.x86.avx512.mask.storeu.pd.512
+  // CHECK: store <8 x double> %{{.*}}, <8 x double>* %{{.*}}, align 1{{$}}
+  // CHECK-NEXT: ret void
   _mm512_storeu_pd(p, a);
 }
 
 void test_mm512_mask_store_ps(void *p, __m512 a, __mmask16 m)
 {
   // CHECK-LABEL: @test_mm512_mask_store_ps
-  // CHECK: @llvm.x86.avx512.mask.store.ps.512
+  // CHECK: @llvm.masked.store.v16f32.p0v16f32(<16 x float> %{{.*}}, <16 x float>* %{{.*}}, i32 64, <16 x i1> %{{.*}})
   _mm512_mask_store_ps(p, m, a);
 }
 
+void test_mm512_store_si512 (void *__P, __m512i __A)
+{
+  // CHECK-LABEL: @test_mm512_store_si512 
+  // CHECK: load <8 x i64>, <8 x i64>* %__A.addr.i, align 64
+  // CHECK: [[SI512_3:%.+]] = load i8*, i8** %__P.addr.i, align 8
+  // CHECK: bitcast i8* [[SI512_3]] to <8 x i64>*
+  // CHECK: store <8 x i64>  
+  _mm512_store_si512 ( __P,__A);
+}
+
+void test_mm512_store_epi32 (void *__P, __m512i __A)
+{
+  // CHECK-LABEL: @test_mm512_store_epi32 
+  // CHECK: load <8 x i64>, <8 x i64>* %__A.addr.i, align 64
+  // CHECK: [[Si32_3:%.+]] = load i8*, i8** %__P.addr.i, align 8
+  // CHECK: bitcast i8* [[Si32_3]] to <8 x i64>*
+  // CHECK: store <8 x i64>  
+  _mm512_store_epi32 ( __P,__A);
+}
+
+void test_mm512_store_epi64 (void *__P, __m512i __A)
+{
+  // CHECK-LABEL: @test_mm512_store_epi64 
+  // CHECK: load <8 x i64>, <8 x i64>* %__A.addr.i, align 64
+  // CHECK: [[SI64_3:%.+]] = load i8*, i8** %__P.addr.i, align 8
+  // CHECK: bitcast i8* [[SI64_3]] to <8 x i64>*
+  // CHECK: store <8 x i64>  
+  _mm512_store_epi64 ( __P,__A);
+}
+
 void test_mm512_store_ps(void *p, __m512 a)
 {
   // CHECK-LABEL: @test_mm512_store_ps
@@ -89,18 +227,51 @@ void test_mm512_store_ps(void *p, __m512 a)
   _mm512_store_ps(p, a);
 }
 
+void test_mm512_store_pd(void *p, __m512d a)
+{
+  // CHECK-LABEL: @test_mm512_store_pd
+  // CHECK: store <8 x double>
+  _mm512_store_pd(p, a);
+}
+
 void test_mm512_mask_store_pd(void *p, __m512d a, __mmask8 m)
 {
   // CHECK-LABEL: @test_mm512_mask_store_pd
-  // CHECK: @llvm.x86.avx512.mask.store.pd.512
+  // CHECK: @llvm.masked.store.v8f64.p0v8f64(<8 x double> %{{.*}}, <8 x double>* %{{.*}}, i32 64, <8 x i1> %{{.*}})
   _mm512_mask_store_pd(p, m, a);
 }
 
-void test_mm512_store_pd(void *p, __m512d a)
+void test_mm512_mask_storeu_epi32(void *__P, __mmask16 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_storeu_epi32
+  // CHECK: @llvm.masked.store.v16i32.p0v16i32(<16 x i32> %{{.*}}, <16 x i32>* %{{.*}}, i32 1, <16 x i1> %{{.*}})
+  return _mm512_mask_storeu_epi32(__P, __U, __A); 
+}
+
+void test_mm512_mask_storeu_epi64(void *__P, __mmask8 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_storeu_epi64
+  // CHECK: @llvm.masked.store.v8i64.p0v8i64(<8 x i64> %{{.*}}, <8 x i64>* %{{.*}}, i32 1, <8 x i1> %{{.*}})
+  return _mm512_mask_storeu_epi64(__P, __U, __A); 
+}
+
+__m512i test_mm512_loadu_si512 (void *__P)
 {
-  // CHECK-LABEL: @test_mm512_store_pd
-  // CHECK: store <8 x double>
-  _mm512_store_pd(p, a);
+  // CHECK-LABEL: @test_mm512_loadu_si512 
+  // CHECK: load <16 x i32>, <16 x i32>* %{{.*}}, align 1{{$}}
+  return _mm512_loadu_si512 ( __P);
+}
+
+__m512i test_mm512_mask_loadu_epi32 (__m512i __W, __mmask16 __U, void *__P)
+{
+  // CHECK-LABEL: @test_mm512_mask_loadu_epi32 
+  // CHECK: @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* %{{.*}}, i32 1, <16 x i1> %{{.*}}, <16 x i32> %{{.*}})
+  return _mm512_mask_loadu_epi32 (__W,__U, __P);
+}
+
+__m512i test_mm512_mask_loadu_epi64 (__m512i __W, __mmask8 __U, void *__P)
+{
+  // CHECK-LABEL: @test_mm512_mask_loadu_epi64 
+  // CHECK: @llvm.masked.load.v8i64.p0v8i64(<8 x i64>* %{{.*}}, i32 1, <8 x i1> %{{.*}}, <8 x i64> %{{.*}})
+  return _mm512_mask_loadu_epi64 (__W,__U, __P);
 }
 
 __m512 test_mm512_loadu_ps(void *p)
@@ -110,6 +281,13 @@ __m512 test_mm512_loadu_ps(void *p)
   return _mm512_loadu_ps(p);
 }
 
+__m512 test_mm512_mask_loadu_ps (__m512 __W, __mmask16 __U, void *__P)
+{
+  // CHECK-LABEL: @test_mm512_mask_loadu_ps 
+  // CHECK: @llvm.masked.load.v16f32.p0v16f32(<16 x float>* %{{.*}}, i32 1, <16 x i1> %{{.*}}, <16 x float> %{{.*}})
+  return _mm512_mask_loadu_ps (__W,__U, __P);
+}
+
 __m512d test_mm512_loadu_pd(void *p)
 {
   // CHECK-LABEL: @test_mm512_loadu_pd
@@ -117,34 +295,82 @@ __m512d test_mm512_loadu_pd(void *p)
   return _mm512_loadu_pd(p);
 }
 
-__m512 test_mm512_maskz_load_ps(void *p, __mmask16 m)
+__m512d test_mm512_mask_loadu_pd (__m512d __W, __mmask8 __U, void *__P)
 {
-  // CHECK-LABEL: @test_mm512_maskz_load_ps
-  // CHECK: @llvm.x86.avx512.mask.load.ps.512
-  return _mm512_maskz_load_ps(m, p);
+  // CHECK-LABEL: @test_mm512_mask_loadu_pd 
+  // CHECK: @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %{{.*}}, i32 1, <8 x i1> %{{.*}}, <8 x double> %{{.*}})
+  return _mm512_mask_loadu_pd (__W,__U, __P);
+}
+
+__m512i test_mm512_load_si512 (void *__P)
+{
+  // CHECK-LABEL: @test_mm512_load_si512 
+  // CHECK: [[LI512_1:%.+]] = load i8*, i8** %__P.addr.i, align 8
+  // CHECK: [[LI512_2:%.+]] = bitcast i8* [[LI512_1]] to <8 x i64>*
+  // CHECK: load <8 x i64>, <8 x i64>* [[LI512_2]], align 64
+  return _mm512_load_si512 ( __P);
+}
+
+__m512i test_mm512_load_epi32 (void *__P)
+{
+  // CHECK-LABEL: @test_mm512_load_epi32 
+  // CHECK: [[LI32_1:%.+]] = load i8*, i8** %__P.addr.i, align 8
+  // CHECK: [[LI32_2:%.+]] = bitcast i8* [[LI32_1]] to <8 x i64>*
+  // CHECK: load <8 x i64>, <8 x i64>* [[LI32_2]], align 64
+  return _mm512_load_epi32 ( __P);
+}
+
+__m512i test_mm512_load_epi64 (void *__P)
+{
+  // CHECK-LABEL: @test_mm512_load_epi64 
+  // CHECK: [[LI64_1:%.+]] = load i8*, i8** %__P.addr.i, align 8
+  // CHECK: [[LI64_2:%.+]] = bitcast i8* [[LI64_1]] to <8 x i64>*
+  // CHECK: load <8 x i64>, <8 x i64>* [[LI64_2]], align 64
+  return _mm512_load_epi64 ( __P);
 }
 
 __m512 test_mm512_load_ps(void *p)
 {
   // CHECK-LABEL: @test_mm512_load_ps
-  // CHECK: @llvm.x86.avx512.mask.load.ps.512
+  // CHECK: load <16 x float>, <16 x float>* %{{.*}}, align 64
   return _mm512_load_ps(p);
 }
 
-__m512d test_mm512_maskz_load_pd(void *p, __mmask8 m)
+__m512 test_mm512_mask_load_ps (__m512 __W, __mmask16 __U, void *__P)
 {
-  // CHECK-LABEL: @test_mm512_maskz_load_pd
-  // CHECK: @llvm.x86.avx512.mask.load.pd.512
-  return _mm512_maskz_load_pd(m, p);
+  // CHECK-LABEL: @test_mm512_mask_load_ps 
+  // CHECK: @llvm.masked.load.v16f32.p0v16f32(<16 x float>* %{{.*}}, i32 64, <16 x i1> %{{.*}}, <16 x float> %{{.*}})
+  return _mm512_mask_load_ps (__W,__U, __P);
+}
+
+__m512 test_mm512_maskz_load_ps(__mmask16 __U, void *__P)
+{
+  // CHECK-LABEL: @test_mm512_maskz_load_ps
+  // CHECK: @llvm.masked.load.v16f32.p0v16f32(<16 x float>* %{{.*}}, i32 64, <16 x i1> %{{.*}}, <16 x float> %{{.*}})
+  return _mm512_maskz_load_ps(__U, __P);
 }
 
 __m512d test_mm512_load_pd(void *p)
 {
   // CHECK-LABEL: @test_mm512_load_pd
-  // CHECK: @llvm.x86.avx512.mask.load.pd.512
+  // CHECK: load <8 x double>, <8 x double>* %{{.*}}, align 64
   return _mm512_load_pd(p);
 }
 
+__m512d test_mm512_mask_load_pd (__m512d __W, __mmask8 __U, void *__P)
+{
+  // CHECK-LABEL: @test_mm512_mask_load_pd 
+  // CHECK: @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %{{.*}}, i32 64, <8 x i1> %{{.*}}, <8 x double> %{{.*}})
+  return _mm512_mask_load_pd (__W,__U, __P);
+}
+
+__m512d test_mm512_maskz_load_pd(__mmask8 __U, void *__P)
+{
+  // CHECK-LABEL: @test_mm512_maskz_load_pd
+  // CHECK: @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %{{.*}}, i32 64, <8 x i1> %{{.*}}, <8 x double> %{{.*}})
+  return _mm512_maskz_load_pd(__U, __P);
+}
+
 __m512d test_mm512_set1_pd(double d)
 {
   // CHECK-LABEL: @test_mm512_set1_pd
@@ -159,13 +385,6 @@ __m512d test_mm512_set1_pd(double d)
   return _mm512_set1_pd(d);
 }
 
-__m512d test_mm512_castpd256_pd512(__m256d a)
-{
-  // CHECK-LABEL: @test_mm512_castpd256_pd512
-  // CHECK: shufflevector <4 x double> {{.*}} <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-  return _mm512_castpd256_pd512(a);
-}
-
 __mmask16 test_mm512_knot(__mmask16 a)
 {
   // CHECK-LABEL: @test_mm512_knot
@@ -180,6 +399,20 @@ __m512i test_mm512_alignr_epi32(__m512i a, __m512i b)
   return _mm512_alignr_epi32(a, b, 2);
 }
 
+__m512i test_mm512_mask_alignr_epi32(__m512i w, __mmask16 u, __m512i a, __m512i b)
+{
+  // CHECK-LABEL: @test_mm512_mask_alignr_epi32
+  // CHECK: @llvm.x86.avx512.mask.valign.d.512
+  return _mm512_mask_alignr_epi32(w, u, a, b, 2);
+}
+
+__m512i test_mm512_maskz_alignr_epi32( __mmask16 u, __m512i a, __m512i b)
+{
+  // CHECK-LABEL: @test_mm512_maskz_alignr_epi32
+  // CHECK: @llvm.x86.avx512.mask.valign.d.512
+  return _mm512_maskz_alignr_epi32(u, a, b, 2);
+}
+
 __m512i test_mm512_alignr_epi64(__m512i a, __m512i b)
 {
   // CHECK-LABEL: @test_mm512_alignr_epi64
@@ -187,18 +420,18 @@ __m512i test_mm512_alignr_epi64(__m512i a, __m512i b)
   return _mm512_alignr_epi64(a, b, 2);
 }
 
-__m512d test_mm512_broadcastsd_pd(__m128d a)
+__m512i test_mm512_mask_alignr_epi64(__m512i w, __mmask8 u, __m512i a, __m512i b)
 {
-  // CHECK-LABEL: @test_mm512_broadcastsd_pd
-  // CHECK: insertelement <8 x double> {{.*}}, i32 0
-  // CHECK: insertelement <8 x double> {{.*}}, i32 1
-  // CHECK: insertelement <8 x double> {{.*}}, i32 2
-  // CHECK: insertelement <8 x double> {{.*}}, i32 3
-  // CHECK: insertelement <8 x double> {{.*}}, i32 4
-  // CHECK: insertelement <8 x double> {{.*}}, i32 5
-  // CHECK: insertelement <8 x double> {{.*}}, i32 6
-  // CHECK: insertelement <8 x double> {{.*}}, i32 7
-  return _mm512_broadcastsd_pd(a);
+  // CHECK-LABEL: @test_mm512_mask_alignr_epi64
+  // CHECK: @llvm.x86.avx512.mask.valign.q.512
+  return _mm512_mask_alignr_epi64(w, u, a, b, 2);
+}
+
+__m512i test_mm512_maskz_alignr_epi64( __mmask8 u, __m512i a, __m512i b)
+{
+  // CHECK-LABEL: @test_mm512_maskz_alignr_epi64
+  // CHECK: @llvm.x86.avx512.mask.valign.q.512
+  return _mm512_maskz_alignr_epi64(u, a, b, 2);
 }
 
 __m512d test_mm512_fmadd_round_pd(__m512d __A, __m512d __B, __m512d __C) {
@@ -685,49 +918,53 @@ __m512 test_mm512_mask3_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16
 
 __mmask16 test_mm512_cmpeq_epi32_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpeq_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpeq.d.512
+  // CHECK: icmp eq <16 x i32> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_cmpeq_epi32_mask(__a, __b);
 }
 
 __mmask16 test_mm512_mask_cmpeq_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpeq_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpeq.d.512
+  // CHECK: icmp eq <16 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_mask_cmpeq_epi32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm512_mask_cmpeq_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpeq_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpeq.q.512
+  // CHECK: icmp eq <8 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_mask_cmpeq_epi64_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm512_cmpeq_epi64_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpeq_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpeq.q.512
+  // CHECK: icmp eq <8 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_cmpeq_epi64_mask(__a, __b);
 }
 
 __mmask16 test_mm512_cmpgt_epi32_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpgt_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpgt.d.512
+  // CHECK: icmp sgt <16 x i32> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_cmpgt_epi32_mask(__a, __b);
 }
 
 __mmask16 test_mm512_mask_cmpgt_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpgt_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpgt.d.512
+  // CHECK: icmp sgt <16 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_mask_cmpgt_epi32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm512_mask_cmpgt_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpgt_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpgt.q.512
+  // CHECK: icmp sgt <8 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_mask_cmpgt_epi64_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm512_cmpgt_epi64_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpgt_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpgt.q.512
+  // CHECK: icmp sgt <8 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_cmpgt_epi64_mask(__a, __b);
 }
 
@@ -762,13 +999,13 @@ __m512 test_mm512_unpacklo_ps(__m512 a, __m512 b)
 __mmask16 test_mm512_cmp_round_ps_mask(__m512 a, __m512 b) {
   // CHECK-LABEL: @test_mm512_cmp_round_ps_mask
   // CHECK: @llvm.x86.avx512.mask.cmp.ps.512
-  return _mm512_cmp_round_ps_mask(a, b, 0, _MM_FROUND_TO_NEAREST_INT);
+  return _mm512_cmp_round_ps_mask(a, b, 0, _MM_FROUND_CUR_DIRECTION);
 }
 
 __mmask16 test_mm512_mask_cmp_round_ps_mask(__mmask16 m, __m512 a, __m512 b) {
   // CHECK-LABEL: @test_mm512_mask_cmp_round_ps_mask
   // CHECK: @llvm.x86.avx512.mask.cmp.ps.512
-  return _mm512_mask_cmp_round_ps_mask(m, a, b, 0, _MM_FROUND_TO_NEAREST_INT);
+  return _mm512_mask_cmp_round_ps_mask(m, a, b, 0, _MM_FROUND_CUR_DIRECTION);
 }
 
 __mmask16 test_mm512_cmp_ps_mask(__m512 a, __m512 b) {
@@ -786,13 +1023,13 @@ __mmask16 test_mm512_mask_cmp_ps_mask(__mmask16 m, __m512 a, __m512 b) {
 __mmask8 test_mm512_cmp_round_pd_mask(__m512d a, __m512d b) {
   // CHECK-LABEL: @test_mm512_cmp_round_pd_mask
   // CHECK: @llvm.x86.avx512.mask.cmp.pd.512
-  return _mm512_cmp_round_pd_mask(a, b, 0, _MM_FROUND_TO_NEAREST_INT);
+  return _mm512_cmp_round_pd_mask(a, b, 0, _MM_FROUND_CUR_DIRECTION);
 }
 
 __mmask8 test_mm512_mask_cmp_round_pd_mask(__mmask8 m, __m512d a, __m512d b) {
   // CHECK-LABEL: @test_mm512_mask_cmp_round_pd_mask
   // CHECK: @llvm.x86.avx512.mask.cmp.pd.512
-  return _mm512_mask_cmp_round_pd_mask(m, a, b, 0, _MM_FROUND_TO_NEAREST_INT);
+  return _mm512_mask_cmp_round_pd_mask(m, a, b, 0, _MM_FROUND_CUR_DIRECTION);
 }
 
 __mmask8 test_mm512_cmp_pd_mask(__m512d a, __m512d b) {
@@ -814,6 +1051,18 @@ __m256d test_mm512_extractf64x4_pd(__m512d a)
   return _mm512_extractf64x4_pd(a, 1);
 }
 
+__m256d test_mm512_mask_extractf64x4_pd(__m256d  __W,__mmask8  __U,__m512d __A){
+ //CHECK-LABEL:@test_mm512_mask_extractf64x4_pd
+ //CHECL:@llvm.x86.avx512.mask.vextractf64x4.512
+ return _mm512_mask_extractf64x4_pd( __W, __U, __A, 1);
+}
+
+__m256d test_mm512_maskz_extractf64x4_pd(__mmask8  __U,__m512d __A){
+ //CHECK-LABEL:@test_mm512_maskz_extractf64x4_pd
+ //CHECL:@llvm.x86.avx512.mask.vextractf64x4.512
+ return _mm512_maskz_extractf64x4_pd( __U, __A, 1);
+}
+
 __m128 test_mm512_extractf32x4_ps(__m512 a)
 {
   // CHECK-LABEL: @test_mm512_extractf32x4_ps
@@ -821,369 +1070,429 @@ __m128 test_mm512_extractf32x4_ps(__m512 a)
   return _mm512_extractf32x4_ps(a, 1);
 }
 
+__m128 test_mm512_mask_extractf32x4_ps(__m128 __W, __mmask8  __U,__m512d __A){
+ //CHECK-LABEL:@test_mm512_mask_extractf32x4_ps
+ //CHECL: @llvm.x86.avx512.mask.vextractf32x4.512
+ return _mm512_mask_extractf32x4_ps( __W, __U, __A, 1);
+}
+
+__m128 test_mm512_maskz_extractf32x4_ps( __mmask8  __U,__m512d __A){
+ //CHECK-LABEL:@test_mm512_maskz_extractf32x4_ps
+ //CHECL: @llvm.x86.avx512.mask.vextractf32x4.512
+ return _mm512_maskz_extractf32x4_ps(  __U, __A, 1);
+}
+
 __mmask16 test_mm512_cmpeq_epu32_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpeq_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> {{.*}}, <16 x i32> {{.*}}, i32 0, i16 -1)
+  // CHECK: icmp eq <16 x i32> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_cmpeq_epu32_mask(__a, __b);
 }
 
 __mmask16 test_mm512_mask_cmpeq_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpeq_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> {{.*}}, <16 x i32> {{.*}}, i32 0, i16 {{.*}})
+  // CHECK: icmp eq <16 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_mask_cmpeq_epu32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm512_cmpeq_epu64_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpeq_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> {{.*}}, <8 x i64> {{.*}}, i32 0, i8 -1)
+  // CHECK: icmp eq <8 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_cmpeq_epu64_mask(__a, __b);
 }
 
 __mmask8 test_mm512_mask_cmpeq_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpeq_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> {{.*}}, <8 x i64> {{.*}}, i32 0, i8 {{.*}})
+  // CHECK: icmp eq <8 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_mask_cmpeq_epu64_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm512_cmpge_epi32_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpge_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> {{.*}}, <16 x i32> {{.*}}, i32 5, i16 -1)
+  // CHECK: icmp sge <16 x i32> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_cmpge_epi32_mask(__a, __b);
 }
 
 __mmask16 test_mm512_mask_cmpge_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpge_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> {{.*}}, <16 x i32> {{.*}}, i32 5, i16 {{.*}})
+  // CHECK: icmp sge <16 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_mask_cmpge_epi32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm512_cmpge_epi64_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpge_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> {{.*}}, <8 x i64> {{.*}}, i32 5, i8 -1)
+  // CHECK: icmp sge <8 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_cmpge_epi64_mask(__a, __b);
 }
 
 __mmask8 test_mm512_mask_cmpge_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpge_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> {{.*}}, <8 x i64> {{.*}}, i32 5, i8 {{.*}})
+  // CHECK: icmp sge <8 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_mask_cmpge_epi64_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm512_cmpge_epu32_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpge_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> {{.*}}, <16 x i32> {{.*}}, i32 5, i16 -1)
+  // CHECK: icmp uge <16 x i32> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_cmpge_epu32_mask(__a, __b);
 }
 
 __mmask16 test_mm512_mask_cmpge_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpge_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> {{.*}}, <16 x i32> {{.*}}, i32 5, i16 {{.*}})
+  // CHECK: icmp uge <16 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_mask_cmpge_epu32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm512_cmpge_epu64_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpge_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> {{.*}}, <8 x i64> {{.*}}, i32 5, i8 -1)
+  // CHECK: icmp uge <8 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_cmpge_epu64_mask(__a, __b);
 }
 
 __mmask8 test_mm512_mask_cmpge_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpge_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> {{.*}}, <8 x i64> {{.*}}, i32 5, i8 {{.*}})
+  // CHECK: icmp uge <8 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_mask_cmpge_epu64_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm512_cmpgt_epu32_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpgt_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> {{.*}}, <16 x i32> {{.*}}, i32 6, i16 -1)
+  // CHECK: icmp ugt <16 x i32> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_cmpgt_epu32_mask(__a, __b);
 }
 
 __mmask16 test_mm512_mask_cmpgt_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpgt_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> {{.*}}, <16 x i32> {{.*}}, i32 6, i16 {{.*}})
+  // CHECK: icmp ugt <16 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_mask_cmpgt_epu32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm512_cmpgt_epu64_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpgt_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> {{.*}}, <8 x i64> {{.*}}, i32 6, i8 -1)
+  // CHECK: icmp ugt <8 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_cmpgt_epu64_mask(__a, __b);
 }
 
 __mmask8 test_mm512_mask_cmpgt_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpgt_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> {{.*}}, <8 x i64> {{.*}}, i32 6, i8 {{.*}})
+  // CHECK: icmp ugt <8 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_mask_cmpgt_epu64_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm512_cmple_epi32_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmple_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> {{.*}}, <16 x i32> {{.*}}, i32 2, i16 -1)
+  // CHECK: icmp sle <16 x i32> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_cmple_epi32_mask(__a, __b);
 }
 
 __mmask16 test_mm512_mask_cmple_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmple_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> {{.*}}, <16 x i32> {{.*}}, i32 2, i16 {{.*}})
+  // CHECK: icmp sle <16 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_mask_cmple_epi32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm512_cmple_epi64_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmple_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> {{.*}}, <8 x i64> {{.*}}, i32 2, i8 -1)
+  // CHECK: icmp sle <8 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_cmple_epi64_mask(__a, __b);
 }
 
 __mmask8 test_mm512_mask_cmple_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmple_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> {{.*}}, <8 x i64> {{.*}}, i32 2, i8 {{.*}})
+  // CHECK: icmp sle <8 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_mask_cmple_epi64_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm512_cmple_epu32_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmple_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> {{.*}}, <16 x i32> {{.*}}, i32 2, i16 -1)
+  // CHECK: icmp ule <16 x i32> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_cmple_epu32_mask(__a, __b);
 }
 
 __mmask16 test_mm512_mask_cmple_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmple_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> {{.*}}, <16 x i32> {{.*}}, i32 2, i16 {{.*}})
+  // CHECK: icmp ule <16 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_mask_cmple_epu32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm512_cmple_epu64_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmple_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> {{.*}}, <8 x i64> {{.*}}, i32 2, i8 -1)
+  // CHECK: icmp ule <8 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_cmple_epu64_mask(__a, __b);
 }
 
 __mmask8 test_mm512_mask_cmple_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmple_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> {{.*}}, <8 x i64> {{.*}}, i32 2, i8 {{.*}})
+  // CHECK: icmp ule <8 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_mask_cmple_epu64_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm512_cmplt_epi32_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmplt_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> {{.*}}, <16 x i32> {{.*}}, i32 1, i16 -1)
+  // CHECK: icmp slt <16 x i32> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_cmplt_epi32_mask(__a, __b);
 }
 
 __mmask16 test_mm512_mask_cmplt_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmplt_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> {{.*}}, <16 x i32> {{.*}}, i32 1, i16 {{.*}})
+  // CHECK: icmp slt <16 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_mask_cmplt_epi32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm512_cmplt_epi64_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmplt_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> {{.*}}, <8 x i64> {{.*}}, i32 1, i8 -1)
+  // CHECK: icmp slt <8 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_cmplt_epi64_mask(__a, __b);
 }
 
 __mmask8 test_mm512_mask_cmplt_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmplt_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> {{.*}}, <8 x i64> {{.*}}, i32 1, i8 {{.*}})
+  // CHECK: icmp slt <8 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_mask_cmplt_epi64_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm512_cmplt_epu32_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmplt_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> {{.*}}, <16 x i32> {{.*}}, i32 1, i16 -1)
+  // CHECK: icmp ult <16 x i32> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_cmplt_epu32_mask(__a, __b);
 }
 
 __mmask16 test_mm512_mask_cmplt_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmplt_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> {{.*}}, <16 x i32> {{.*}}, i32 1, i16 {{.*}})
+  // CHECK: icmp ult <16 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_mask_cmplt_epu32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm512_cmplt_epu64_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmplt_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> {{.*}}, <8 x i64> {{.*}}, i32 1, i8 -1)
+  // CHECK: icmp ult <8 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_cmplt_epu64_mask(__a, __b);
 }
 
 __mmask8 test_mm512_mask_cmplt_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmplt_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> {{.*}}, <8 x i64> {{.*}}, i32 1, i8 {{.*}})
+  // CHECK: icmp ult <8 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_mask_cmplt_epu64_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm512_cmpneq_epi32_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpneq_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> {{.*}}, <16 x i32> {{.*}}, i32 4, i16 -1)
+  // CHECK: icmp ne <16 x i32> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_cmpneq_epi32_mask(__a, __b);
 }
 
 __mmask16 test_mm512_mask_cmpneq_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpneq_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> {{.*}}, <16 x i32> {{.*}}, i32 4, i16 {{.*}})
+  // CHECK: icmp ne <16 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_mask_cmpneq_epi32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm512_cmpneq_epi64_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpneq_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> {{.*}}, <8 x i64> {{.*}}, i32 4, i8 -1)
+  // CHECK: icmp ne <8 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_cmpneq_epi64_mask(__a, __b);
 }
 
 __mmask8 test_mm512_mask_cmpneq_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpneq_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> {{.*}}, <8 x i64> {{.*}}, i32 4, i8 {{.*}})
+  // CHECK: icmp ne <8 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_mask_cmpneq_epi64_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm512_cmpneq_epu32_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpneq_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> {{.*}}, <16 x i32> {{.*}}, i32 4, i16 -1)
+  // CHECK: icmp ne <16 x i32> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_cmpneq_epu32_mask(__a, __b);
 }
 
 __mmask16 test_mm512_mask_cmpneq_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpneq_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> {{.*}}, <16 x i32> {{.*}}, i32 4, i16 {{.*}})
+  // CHECK: icmp ne <16 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
   return (__mmask16)_mm512_mask_cmpneq_epu32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm512_cmpneq_epu64_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmpneq_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> {{.*}}, <8 x i64> {{.*}}, i32 4, i8 -1)
+  // CHECK: icmp ne <8 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_cmpneq_epu64_mask(__a, __b);
 }
 
 __mmask8 test_mm512_mask_cmpneq_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmpneq_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> {{.*}}, <8 x i64> {{.*}}, i32 4, i8 {{.*}})
+  // CHECK: icmp ne <8 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm512_mask_cmpneq_epu64_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm512_cmp_epi32_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmp_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> {{.*}}, <16 x i32> {{.*}}, i32 3, i16 -1)
-  return (__mmask16)_mm512_cmp_epi32_mask(__a, __b, 3);
+  // CHECK: icmp eq <16 x i32> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm512_cmp_epi32_mask(__a, __b, 0);
 }
 
 __mmask16 test_mm512_mask_cmp_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmp_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> {{.*}}, <16 x i32> {{.*}}, i32 3, i16 {{.*}})
-  return (__mmask16)_mm512_mask_cmp_epi32_mask(__u, __a, __b, 3);
+  // CHECK: icmp eq <16 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm512_mask_cmp_epi32_mask(__u, __a, __b, 0);
 }
 
 __mmask8 test_mm512_cmp_epi64_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmp_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> {{.*}}, <8 x i64> {{.*}}, i32 3, i8 -1)
-  return (__mmask8)_mm512_cmp_epi64_mask(__a, __b, 3);
+  // CHECK: icmp eq <8 x i64> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm512_cmp_epi64_mask(__a, __b, 0);
 }
 
 __mmask8 test_mm512_mask_cmp_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmp_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> {{.*}}, <8 x i64> {{.*}}, i32 3, i8 {{.*}})
-  return (__mmask8)_mm512_mask_cmp_epi64_mask(__u, __a, __b, 3);
+  // CHECK: icmp eq <8 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm512_mask_cmp_epi64_mask(__u, __a, __b, 0);
 }
 
 __mmask16 test_mm512_cmp_epu32_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmp_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> {{.*}}, <16 x i32> {{.*}}, i32 3, i16 -1)
-  return (__mmask16)_mm512_cmp_epu32_mask(__a, __b, 3);
+  // CHECK: icmp eq <16 x i32> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm512_cmp_epu32_mask(__a, __b, 0);
 }
 
 __mmask16 test_mm512_mask_cmp_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmp_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> {{.*}}, <16 x i32> {{.*}}, i32 3, i16 {{.*}})
-  return (__mmask16)_mm512_mask_cmp_epu32_mask(__u, __a, __b, 3);
+  // CHECK: icmp eq <16 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm512_mask_cmp_epu32_mask(__u, __a, __b, 0);
 }
 
 __mmask8 test_mm512_cmp_epu64_mask(__m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_cmp_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> {{.*}}, <8 x i64> {{.*}}, i32 3, i8 -1)
-  return (__mmask8)_mm512_cmp_epu64_mask(__a, __b, 3);
+  // CHECK: icmp eq <8 x i64> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm512_cmp_epu64_mask(__a, __b, 0);
 }
 
 __mmask8 test_mm512_mask_cmp_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_cmp_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> {{.*}}, <8 x i64> {{.*}}, i32 3, i8 {{.*}})
-  return (__mmask8)_mm512_mask_cmp_epu64_mask(__u, __a, __b, 3);
+  // CHECK: icmp eq <8 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm512_mask_cmp_epu64_mask(__u, __a, __b, 0);
 }
 
 __m512i test_mm512_mask_and_epi32(__m512i __src,__mmask16 __k, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_and_epi32
-  // CHECK: @llvm.x86.avx512.mask.pand.d.512
+  // CHECK: and <16 x i32> 
+  // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1>
+  // CHECK: select <16 x i1> %[[MASK]], <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_mask_and_epi32(__src, __k,__a, __b);
 }
 
 __m512i test_mm512_maskz_and_epi32(__mmask16 __k, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_maskz_and_epi32
-  // CHECK: @llvm.x86.avx512.mask.pand.d.512
+  // CHECK: and <16 x i32> 
+  // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1>
+  // CHECK: select <16 x i1> %[[MASK]], <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_maskz_and_epi32(__k,__a, __b);
 }
 
 __m512i test_mm512_mask_and_epi64(__m512i __src,__mmask8 __k, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_and_epi64
-  // CHECK: @llvm.x86.avx512.mask.pand.q.512
+  // CHECK: %[[AND_RES:.*]] = and <8 x i64>
+  // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: select <8 x i1> %[[MASK]], <8 x i64> %[[AND_RES]], <8 x i64> %{{.*}}
   return _mm512_mask_and_epi64(__src, __k,__a, __b);
 }
 
 __m512i test_mm512_maskz_and_epi64(__mmask8 __k, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_maskz_and_epi64
-  // CHECK: @llvm.x86.avx512.mask.pand.q.512
+  // CHECK: %[[AND_RES:.*]] = and <8 x i64>
+  // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: select <8 x i1> %[[MASK]], <8 x i64> %[[AND_RES]], <8 x i64> %{{.*}}
   return _mm512_maskz_and_epi64(__k,__a, __b);
 }
 
 __m512i test_mm512_mask_or_epi32(__m512i __src,__mmask16 __k, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_or_epi32
-  // CHECK: @llvm.x86.avx512.mask.por.d.512
+  // CHECK: or <16 x i32> 
+  // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1>
+  // CHECK: select <16 x i1> %[[MASK]], <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_mask_or_epi32(__src, __k,__a, __b);
 }
 
 __m512i test_mm512_maskz_or_epi32(__mmask16 __k, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_maskz_or_epi32
-  // CHECK: @llvm.x86.avx512.mask.por.d.512
+  // CHECK: or <16 x i32> 
+  // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1>
+  // CHECK: select <16 x i1> %[[MASK]], <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_maskz_or_epi32(__k,__a, __b);
 }
 
 __m512i test_mm512_mask_or_epi64(__m512i __src,__mmask8 __k, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_or_epi64
-  // CHECK: @llvm.x86.avx512.mask.por.q.512
+  // CHECK: %[[OR_RES:.*]] = or <8 x i64>
+  // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: select <8 x i1> %[[MASK]], <8 x i64> %[[OR_RES]], <8 x i64> %{{.*}}
   return _mm512_mask_or_epi64(__src, __k,__a, __b);
 }
 
 __m512i test_mm512_maskz_or_epi64(__mmask8 __k, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_maskz_or_epi64
-  // CHECK: @llvm.x86.avx512.mask.por.q.512
+  // CHECK: %[[OR_RES:.*]] = or <8 x i64>
+  // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: select <8 x i1> %[[MASK]], <8 x i64> %[[OR_RES]], <8 x i64> %{{.*}}
   return _mm512_maskz_or_epi64(__k,__a, __b);
 }
 
 __m512i test_mm512_mask_xor_epi32(__m512i __src,__mmask16 __k, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_xor_epi32
-  // CHECK: @llvm.x86.avx512.mask.pxor.d.512
+  // CHECK: xor <16 x i32> 
+  // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1>
+  // CHECK: select <16 x i1> %[[MASK]], <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_mask_xor_epi32(__src, __k,__a, __b);
 }
 
 __m512i test_mm512_maskz_xor_epi32(__mmask16 __k, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_maskz_xor_epi32
-  // CHECK: @llvm.x86.avx512.mask.pxor.d.512
+  // CHECK: xor <16 x i32> 
+  // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1>
+  // CHECK: select <16 x i1> %[[MASK]], <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_maskz_xor_epi32(__k,__a, __b);
 }
 
 __m512i test_mm512_mask_xor_epi64(__m512i __src,__mmask8 __k, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_mask_xor_epi64
-  // CHECK: @llvm.x86.avx512.mask.pxor.q.512
+  // CHECK: %[[XOR_RES:.*]] = xor <8 x i64>
+  // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: select <8 x i1> %[[MASK]], <8 x i64> %[[XOR_RES]], <8 x i64> %{{.*}}
   return _mm512_mask_xor_epi64(__src, __k,__a, __b);
 }
 
 __m512i test_mm512_maskz_xor_epi64(__mmask8 __k, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_maskz_xor_epi64
-  // CHECK: @llvm.x86.avx512.mask.pxor.q.512
+  // CHECK: %[[XOR_RES:.*]] = xor <8 x i64>
+  // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: select <8 x i1> %[[MASK]], <8 x i64> %[[XOR_RES]], <8 x i64> %{{.*}}
   return _mm512_maskz_xor_epi64(__k,__a, __b);
 }
 
 __m512i test_mm512_and_epi32(__m512i __src,__mmask16 __k, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_and_epi32
-  // CHECK: and <8 x i64>
+  // CHECK: and <16 x i32>
   return _mm512_and_epi32(__a, __b);
 }
 
@@ -1195,7 +1504,7 @@ __m512i test_mm512_and_epi64(__m512i __src,__mmask8 __k, __m512i __a, __m512i __
 
 __m512i test_mm512_or_epi32(__m512i __src,__mmask16 __k, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_or_epi32
-  // CHECK: or <8 x i64>
+  // CHECK: or <16 x i32>
   return _mm512_or_epi32(__a, __b);
 }
 
@@ -1207,7 +1516,7 @@ __m512i test_mm512_or_epi64(__m512i __src,__mmask8 __k, __m512i __a, __m512i __b
 
 __m512i test_mm512_xor_epi32(__m512i __src,__mmask16 __k, __m512i __a, __m512i __b) {
   // CHECK-LABEL: @test_mm512_xor_epi32
-  // CHECK: xor <8 x i64>
+  // CHECK: xor <16 x i32>
   return _mm512_xor_epi32(__a, __b);
 }
 
@@ -1218,40 +1527,61 @@ __m512i test_mm512_xor_epi64(__m512i __src,__mmask8 __k, __m512i __a, __m512i __
 }
 
 __m512i test_mm512_maskz_andnot_epi32 (__mmask16 __k,__m512i __A, __m512i __B){
-  //CHECK-LABEL: @test_mm512_maskz_andnot_epi32
-  //CHECK: @llvm.x86.avx512.mask.pandn.d.512
+  // CHECK-LABEL: @test_mm512_maskz_andnot_epi32
+  // CHECK: xor <16 x i32> %{{.*}}, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+  // CHECK: and <16 x i32> %{{.*}}, %{{.*}}
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_maskz_andnot_epi32(__k,__A,__B);
 }
 
 __m512i test_mm512_mask_andnot_epi32 (__mmask16 __k,__m512i __A, __m512i __B,
                                       __m512i __src) {
-  //CHECK-LABEL: @test_mm512_mask_andnot_epi32
-  //CHECK: @llvm.x86.avx512.mask.pandn.d.512
+  // CHECK-LABEL: @test_mm512_mask_andnot_epi32
+  // CHECK: xor <16 x i32> %{{.*}}, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+  // CHECK: and <16 x i32> %{{.*}}, %{{.*}}
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_mask_andnot_epi32(__src,__k,__A,__B);
 }
 
+__m512i test_mm512_andnot_si512(__m512i __A, __m512i __B)
+{
+  //CHECK-LABEL: @test_mm512_andnot_si512
+  //CHECK: load {{.*}}%__A.addr.i, align 64
+  //CHECK: %neg.i = xor{{.*}}, <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
+  //CHECK: load {{.*}}%__B.addr.i, align 64
+  //CHECK: and <8 x i64> %neg.i,{{.*}}
+
+  return _mm512_andnot_si512(__A, __B);
+}
+
 __m512i test_mm512_andnot_epi32(__m512i __A, __m512i __B) {
-  //CHECK-LABEL: @test_mm512_andnot_epi32
-  //CHECK: @llvm.x86.avx512.mask.pandn.d.512
+  // CHECK-LABEL: @test_mm512_andnot_epi32
+  // CHECK: xor <16 x i32> %{{.*}}, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+  // CHECK: and <16 x i32> %{{.*}}, %{{.*}}
   return _mm512_andnot_epi32(__A,__B);
 }
 
 __m512i test_mm512_maskz_andnot_epi64 (__mmask8 __k,__m512i __A, __m512i __B) {
-  //CHECK-LABEL: @test_mm512_maskz_andnot_epi64
-  //CHECK: @llvm.x86.avx512.mask.pandn.q.512
+  // CHECK-LABEL: @test_mm512_maskz_andnot_epi64
+  // CHECK: xor <8 x i64> %{{.*}}, <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
+  // CHECK: and <8 x i64> %{{.*}}, %{{.*}}
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
   return _mm512_maskz_andnot_epi64(__k,__A,__B);
 }
 
 __m512i test_mm512_mask_andnot_epi64 (__mmask8 __k,__m512i __A, __m512i __B, 
                                       __m512i __src) {
   //CHECK-LABEL: @test_mm512_mask_andnot_epi64
-  //CHECK: @llvm.x86.avx512.mask.pandn.q.512
+  // CHECK: xor <8 x i64> %{{.*}}, <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
+  // CHECK: and <8 x i64> %{{.*}}, %{{.*}}
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
   return _mm512_mask_andnot_epi64(__src,__k,__A,__B);
 }
 
 __m512i test_mm512_andnot_epi64(__m512i __A, __m512i __B) {
   //CHECK-LABEL: @test_mm512_andnot_epi64
-  //CHECK: @llvm.x86.avx512.mask.pandn.q.512
+  // CHECK: xor <8 x i64> %{{.*}}, <i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1, i64 -1>
+  // CHECK: and <8 x i64> %{{.*}}, %{{.*}}
   return _mm512_andnot_epi64(__A,__B);
 }
 
@@ -1690,10 +2020,15 @@ __m512d test_mm512_maskz_div_round_pd(__mmask8 __U, __m512d __A, __m512d __B) {
   // CHECK: @llvm.x86.avx512.mask.div.pd.512
   return _mm512_maskz_div_round_pd(__U,__A,__B,_MM_FROUND_TO_NEAREST_INT); 
 }
-__m512d test_mm512_mask_div_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
-  // CHECK-LABEL: @test_mm512_mask_div_pd
+__m512d test_mm512_div_pd(__m512d __a, __m512d __b) {
+  // CHECK-LABLE: @test_mm512_div_pd
+  // CHECK: fdiv <8 x double>
+  return _mm512_div_pd(__a,__b); 
+}
+__m512d test_mm512_mask_div_pd(__m512d __w, __mmask8 __u, __m512d __a, __m512d __b) {
+  // CHECK-LABLE: @test_mm512_mask_div_pd
   // CHECK: @llvm.x86.avx512.mask.div.pd.512
-  return _mm512_mask_div_pd(__W,__U,__A,__B); 
+  return _mm512_mask_div_pd(__w,__u,__a,__b); 
 }
 __m512d test_mm512_maskz_div_pd(__mmask8 __U, __m512d __A, __m512d __B) {
   // CHECK-LABEL: @test_mm512_maskz_div_pd
@@ -1715,6 +2050,11 @@ __m512 test_mm512_maskz_div_round_ps(__mmask16 __U, __m512 __A, __m512 __B) {
   // CHECK: @llvm.x86.avx512.mask.div.ps.512
   return _mm512_maskz_div_round_ps(__U,__A,__B,_MM_FROUND_TO_NEAREST_INT); 
 }
+__m512 test_mm512_div_ps(__m512 __A, __m512 __B) {
+  // CHECK-LABEL: @test_mm512_div_ps
+  // CHECK: fdiv <16 x float>
+  return _mm512_div_ps(__A,__B); 
+}
 __m512 test_mm512_mask_div_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
   // CHECK-LABEL: @test_mm512_mask_div_ps
   // CHECK: @llvm.x86.avx512.mask.div.ps.512
@@ -1899,3 +2239,5341 @@ __m512i test_mm512_undefined_epi32() {
   // CHECK: ret <8 x i64> undef
   return _mm512_undefined_epi32();
 }
+
+__m512i test_mm512_cvtepi8_epi32(__m128i __A) {
+  // CHECK-LABEL: @test_mm512_cvtepi8_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovsxb.d.512
+  return _mm512_cvtepi8_epi32(__A); 
+}
+
+__m512i test_mm512_mask_cvtepi8_epi32(__m512i __W, __mmask16 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtepi8_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovsxb.d.512
+  return _mm512_mask_cvtepi8_epi32(__W, __U, __A); 
+}
+
+__m512i test_mm512_maskz_cvtepi8_epi32(__mmask16 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_cvtepi8_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovsxb.d.512
+  return _mm512_maskz_cvtepi8_epi32(__U, __A); 
+}
+
+__m512i test_mm512_cvtepi8_epi64(__m128i __A) {
+  // CHECK-LABEL: @test_mm512_cvtepi8_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovsxb.q.512
+  return _mm512_cvtepi8_epi64(__A); 
+}
+
+__m512i test_mm512_mask_cvtepi8_epi64(__m512i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtepi8_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovsxb.q.512
+  return _mm512_mask_cvtepi8_epi64(__W, __U, __A); 
+}
+
+__m512i test_mm512_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_cvtepi8_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovsxb.q.512
+  return _mm512_maskz_cvtepi8_epi64(__U, __A); 
+}
+
+__m512i test_mm512_cvtepi32_epi64(__m256i __X) {
+  // CHECK-LABEL: @test_mm512_cvtepi32_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovsxd.q.512
+  return _mm512_cvtepi32_epi64(__X); 
+}
+
+__m512i test_mm512_mask_cvtepi32_epi64(__m512i __W, __mmask8 __U, __m256i __X) {
+  // CHECK-LABEL: @test_mm512_mask_cvtepi32_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovsxd.q.512
+  return _mm512_mask_cvtepi32_epi64(__W, __U, __X); 
+}
+
+__m512i test_mm512_maskz_cvtepi32_epi64(__mmask8 __U, __m256i __X) {
+  // CHECK-LABEL: @test_mm512_maskz_cvtepi32_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovsxd.q.512
+  return _mm512_maskz_cvtepi32_epi64(__U, __X); 
+}
+
+__m512i test_mm512_cvtepi16_epi32(__m256i __A) {
+  // CHECK-LABEL: @test_mm512_cvtepi16_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovsxw.d.512
+  return _mm512_cvtepi16_epi32(__A); 
+}
+
+__m512i test_mm512_mask_cvtepi16_epi32(__m512i __W, __mmask16 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtepi16_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovsxw.d.512
+  return _mm512_mask_cvtepi16_epi32(__W, __U, __A); 
+}
+
+__m512i test_mm512_maskz_cvtepi16_epi32(__mmask16 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_cvtepi16_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovsxw.d.512
+  return _mm512_maskz_cvtepi16_epi32(__U, __A); 
+}
+
+__m512i test_mm512_cvtepi16_epi64(__m128i __A) {
+  // CHECK-LABEL: @test_mm512_cvtepi16_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovsxw.q.512
+  return _mm512_cvtepi16_epi64(__A); 
+}
+
+__m512i test_mm512_mask_cvtepi16_epi64(__m512i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtepi16_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovsxw.q.512
+  return _mm512_mask_cvtepi16_epi64(__W, __U, __A); 
+}
+
+__m512i test_mm512_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_cvtepi16_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovsxw.q.512
+  return _mm512_maskz_cvtepi16_epi64(__U, __A); 
+}
+
+__m512i test_mm512_cvtepu8_epi32(__m128i __A) {
+  // CHECK-LABEL: @test_mm512_cvtepu8_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovzxb.d.512
+  return _mm512_cvtepu8_epi32(__A); 
+}
+
+__m512i test_mm512_mask_cvtepu8_epi32(__m512i __W, __mmask16 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtepu8_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovzxb.d.512
+  return _mm512_mask_cvtepu8_epi32(__W, __U, __A); 
+}
+
+__m512i test_mm512_maskz_cvtepu8_epi32(__mmask16 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_cvtepu8_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovzxb.d.512
+  return _mm512_maskz_cvtepu8_epi32(__U, __A); 
+}
+
+__m512i test_mm512_cvtepu8_epi64(__m128i __A) {
+  // CHECK-LABEL: @test_mm512_cvtepu8_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxb.q.512
+  return _mm512_cvtepu8_epi64(__A); 
+}
+
+__m512i test_mm512_mask_cvtepu8_epi64(__m512i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtepu8_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxb.q.512
+  return _mm512_mask_cvtepu8_epi64(__W, __U, __A); 
+}
+
+__m512i test_mm512_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_cvtepu8_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxb.q.512
+  return _mm512_maskz_cvtepu8_epi64(__U, __A); 
+}
+
+__m512i test_mm512_cvtepu32_epi64(__m256i __X) {
+  // CHECK-LABEL: @test_mm512_cvtepu32_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxd.q.512
+  return _mm512_cvtepu32_epi64(__X); 
+}
+
+__m512i test_mm512_mask_cvtepu32_epi64(__m512i __W, __mmask8 __U, __m256i __X) {
+  // CHECK-LABEL: @test_mm512_mask_cvtepu32_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxd.q.512
+  return _mm512_mask_cvtepu32_epi64(__W, __U, __X); 
+}
+
+__m512i test_mm512_maskz_cvtepu32_epi64(__mmask8 __U, __m256i __X) {
+  // CHECK-LABEL: @test_mm512_maskz_cvtepu32_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxd.q.512
+  return _mm512_maskz_cvtepu32_epi64(__U, __X); 
+}
+
+__m512i test_mm512_cvtepu16_epi32(__m256i __A) {
+  // CHECK-LABEL: @test_mm512_cvtepu16_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovzxw.d.512
+  return _mm512_cvtepu16_epi32(__A); 
+}
+
+__m512i test_mm512_mask_cvtepu16_epi32(__m512i __W, __mmask16 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtepu16_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovzxw.d.512
+  return _mm512_mask_cvtepu16_epi32(__W, __U, __A); 
+}
+
+__m512i test_mm512_maskz_cvtepu16_epi32(__mmask16 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_cvtepu16_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovzxw.d.512
+  return _mm512_maskz_cvtepu16_epi32(__U, __A); 
+}
+
+__m512i test_mm512_cvtepu16_epi64(__m128i __A) {
+  // CHECK-LABEL: @test_mm512_cvtepu16_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxw.q.512
+  return _mm512_cvtepu16_epi64(__A); 
+}
+
+__m512i test_mm512_mask_cvtepu16_epi64(__m512i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtepu16_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxw.q.512
+  return _mm512_mask_cvtepu16_epi64(__W, __U, __A); 
+}
+
+__m512i test_mm512_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_cvtepu16_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxw.q.512
+  return _mm512_maskz_cvtepu16_epi64(__U, __A); 
+}
+
+
+__m512i test_mm512_rol_epi32(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_rol_epi32
+  // CHECK: @llvm.x86.avx512.mask.prol.d.512
+  return _mm512_rol_epi32(__A, 5); 
+}
+
+__m512i test_mm512_mask_rol_epi32(__m512i __W, __mmask16 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_rol_epi32
+  // CHECK: @llvm.x86.avx512.mask.prol.d.512
+  return _mm512_mask_rol_epi32(__W, __U, __A, 5); 
+}
+
+__m512i test_mm512_maskz_rol_epi32(__mmask16 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_rol_epi32
+  // CHECK: @llvm.x86.avx512.mask.prol.d.512
+  return _mm512_maskz_rol_epi32(__U, __A, 5); 
+}
+
+__m512i test_mm512_rol_epi64(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_rol_epi64
+  // CHECK: @llvm.x86.avx512.mask.prol.q.512
+  return _mm512_rol_epi64(__A, 5); 
+}
+
+__m512i test_mm512_mask_rol_epi64(__m512i __W, __mmask8 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_rol_epi64
+  // CHECK: @llvm.x86.avx512.mask.prol.q.512
+  return _mm512_mask_rol_epi64(__W, __U, __A, 5); 
+}
+
+__m512i test_mm512_maskz_rol_epi64(__mmask8 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_rol_epi64
+  // CHECK: @llvm.x86.avx512.mask.prol.q.512
+  return _mm512_maskz_rol_epi64(__U, __A, 5); 
+}
+
+__m512i test_mm512_rolv_epi32(__m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_rolv_epi32
+  // CHECK: @llvm.x86.avx512.mask.prolv.d.512
+  return _mm512_rolv_epi32(__A, __B); 
+}
+
+__m512i test_mm512_mask_rolv_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_mask_rolv_epi32
+  // CHECK: @llvm.x86.avx512.mask.prolv.d.512
+  return _mm512_mask_rolv_epi32(__W, __U, __A, __B); 
+}
+
+__m512i test_mm512_maskz_rolv_epi32(__mmask16 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_rolv_epi32
+  // CHECK: @llvm.x86.avx512.mask.prolv.d.512
+  return _mm512_maskz_rolv_epi32(__U, __A, __B); 
+}
+
+__m512i test_mm512_rolv_epi64(__m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_rolv_epi64
+  // CHECK: @llvm.x86.avx512.mask.prolv.q.512
+  return _mm512_rolv_epi64(__A, __B); 
+}
+
+__m512i test_mm512_mask_rolv_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_mask_rolv_epi64
+  // CHECK: @llvm.x86.avx512.mask.prolv.q.512
+  return _mm512_mask_rolv_epi64(__W, __U, __A, __B); 
+}
+
+__m512i test_mm512_maskz_rolv_epi64(__mmask8 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_rolv_epi64
+  // CHECK: @llvm.x86.avx512.mask.prolv.q.512
+  return _mm512_maskz_rolv_epi64(__U, __A, __B); 
+}
+
+__m512i test_mm512_ror_epi32(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_ror_epi32
+  // CHECK: @llvm.x86.avx512.mask.pror.d.512
+  return _mm512_ror_epi32(__A, 5); 
+}
+
+__m512i test_mm512_mask_ror_epi32(__m512i __W, __mmask16 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_ror_epi32
+  // CHECK: @llvm.x86.avx512.mask.pror.d.512
+  return _mm512_mask_ror_epi32(__W, __U, __A, 5); 
+}
+
+__m512i test_mm512_maskz_ror_epi32(__mmask16 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_ror_epi32
+  // CHECK: @llvm.x86.avx512.mask.pror.d.512
+  return _mm512_maskz_ror_epi32(__U, __A, 5); 
+}
+
+__m512i test_mm512_ror_epi64(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_ror_epi64
+  // CHECK: @llvm.x86.avx512.mask.pror.q.512
+  return _mm512_ror_epi64(__A, 5); 
+}
+
+__m512i test_mm512_mask_ror_epi64(__m512i __W, __mmask8 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_ror_epi64
+  // CHECK: @llvm.x86.avx512.mask.pror.q.512
+  return _mm512_mask_ror_epi64(__W, __U, __A, 5); 
+}
+
+__m512i test_mm512_maskz_ror_epi64(__mmask8 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_ror_epi64
+  // CHECK: @llvm.x86.avx512.mask.pror.q.512
+  return _mm512_maskz_ror_epi64(__U, __A, 5); 
+}
+
+
+__m512i test_mm512_rorv_epi32(__m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_rorv_epi32
+  // CHECK: @llvm.x86.avx512.mask.prorv.d.512
+  return _mm512_rorv_epi32(__A, __B); 
+}
+
+__m512i test_mm512_mask_rorv_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_mask_rorv_epi32
+  // CHECK: @llvm.x86.avx512.mask.prorv.d.512
+  return _mm512_mask_rorv_epi32(__W, __U, __A, __B); 
+}
+
+__m512i test_mm512_maskz_rorv_epi32(__mmask16 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_rorv_epi32
+  // CHECK: @llvm.x86.avx512.mask.prorv.d.512
+  return _mm512_maskz_rorv_epi32(__U, __A, __B); 
+}
+
+__m512i test_mm512_rorv_epi64(__m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_rorv_epi64
+  // CHECK: @llvm.x86.avx512.mask.prorv.q.512
+  return _mm512_rorv_epi64(__A, __B); 
+}
+
+__m512i test_mm512_mask_rorv_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_mask_rorv_epi64
+  // CHECK: @llvm.x86.avx512.mask.prorv.q.512
+  return _mm512_mask_rorv_epi64(__W, __U, __A, __B); 
+}
+
+__m512i test_mm512_maskz_rorv_epi64(__mmask8 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_rorv_epi64
+  // CHECK: @llvm.x86.avx512.mask.prorv.q.512
+  return _mm512_maskz_rorv_epi64(__U, __A, __B); 
+}
+
+__m512i test_mm512_slli_epi32(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_slli_epi32
+  // CHECK: @llvm.x86.avx512.mask.psll.di.512
+  return _mm512_slli_epi32(__A, 5); 
+}
+
+__m512i test_mm512_mask_slli_epi32(__m512i __W, __mmask16 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_slli_epi32
+  // CHECK: @llvm.x86.avx512.mask.psll.di.512
+  return _mm512_mask_slli_epi32(__W, __U, __A, 5); 
+}
+
+__m512i test_mm512_maskz_slli_epi32(__mmask16 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_slli_epi32
+  // CHECK: @llvm.x86.avx512.mask.psll.di.512
+  return _mm512_maskz_slli_epi32(__U, __A, 5); 
+}
+
+__m512i test_mm512_slli_epi64(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_slli_epi64
+  // CHECK: @llvm.x86.avx512.mask.psll.qi.512
+  return _mm512_slli_epi64(__A, 5); 
+}
+
+__m512i test_mm512_mask_slli_epi64(__m512i __W, __mmask8 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_slli_epi64
+  // CHECK: @llvm.x86.avx512.mask.psll.qi.512
+  return _mm512_mask_slli_epi64(__W, __U, __A, 5); 
+}
+
+__m512i test_mm512_maskz_slli_epi64(__mmask8 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_slli_epi64
+  // CHECK: @llvm.x86.avx512.mask.psll.qi.512
+  return _mm512_maskz_slli_epi64(__U, __A, 5); 
+}
+
+__m512i test_mm512_srli_epi32(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_srli_epi32
+  // CHECK: @llvm.x86.avx512.mask.psrl.di.512
+  return _mm512_srli_epi32(__A, 5); 
+}
+
+__m512i test_mm512_mask_srli_epi32(__m512i __W, __mmask16 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_srli_epi32
+  // CHECK: @llvm.x86.avx512.mask.psrl.di.512
+  return _mm512_mask_srli_epi32(__W, __U, __A, 5); 
+}
+
+__m512i test_mm512_maskz_srli_epi32(__mmask16 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_srli_epi32
+  // CHECK: @llvm.x86.avx512.mask.psrl.di.512
+  return _mm512_maskz_srli_epi32(__U, __A, 5); 
+}
+
+__m512i test_mm512_srli_epi64(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_srli_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrl.qi.512
+  return _mm512_srli_epi64(__A, 5); 
+}
+
+__m512i test_mm512_mask_srli_epi64(__m512i __W, __mmask8 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_srli_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrl.qi.512
+  return _mm512_mask_srli_epi64(__W, __U, __A, 5); 
+}
+
+__m512i test_mm512_maskz_srli_epi64(__mmask8 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_srli_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrl.qi.512
+  return _mm512_maskz_srli_epi64(__U, __A, 5); 
+}
+
+__m512i test_mm512_mask_load_epi32(__m512i __W, __mmask16 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm512_mask_load_epi32
+  // CHECK: @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* %{{.*}}, i32 64, <16 x i1> %{{.*}}, <16 x i32> %{{.*}})
+  return _mm512_mask_load_epi32(__W, __U, __P); 
+}
+
+__m512i test_mm512_maskz_load_epi32(__mmask16 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm512_maskz_load_epi32
+  // CHECK: @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* %{{.*}}, i32 64, <16 x i1> %{{.*}}, <16 x i32> %{{.*}})
+  return _mm512_maskz_load_epi32(__U, __P); 
+}
+
+__m512i test_mm512_mask_mov_epi32(__m512i __W, __mmask16 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_mov_epi32
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+  return _mm512_mask_mov_epi32(__W, __U, __A); 
+}
+
+__m512i test_mm512_maskz_mov_epi32(__mmask16 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_mov_epi32
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+  return _mm512_maskz_mov_epi32(__U, __A); 
+}
+
+__m512i test_mm512_mask_mov_epi64(__m512i __W, __mmask8 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_mov_epi64
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
+  return _mm512_mask_mov_epi64(__W, __U, __A); 
+}
+
+__m512i test_mm512_maskz_mov_epi64(__mmask8 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_mov_epi64
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
+  return _mm512_maskz_mov_epi64(__U, __A); 
+}
+
+__m512i test_mm512_mask_load_epi64(__m512i __W, __mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm512_mask_load_epi64
+  // CHECK: @llvm.masked.load.v8i64.p0v8i64(<8 x i64>* %{{.*}}, i32 64, <8 x i1> %{{.*}}, <8 x i64> %{{.*}})
+  return _mm512_mask_load_epi64(__W, __U, __P); 
+}
+
+__m512i test_mm512_maskz_load_epi64(__mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm512_maskz_load_epi64
+  // CHECK: @llvm.masked.load.v8i64.p0v8i64(<8 x i64>* %{{.*}}, i32 64, <8 x i1> %{{.*}}, <8 x i64> %{{.*}})
+  return _mm512_maskz_load_epi64(__U, __P); 
+}
+
+void test_mm512_mask_store_epi32(void *__P, __mmask16 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_store_epi32
+  // CHECK: @llvm.masked.store.v16i32.p0v16i32(<16 x i32> %{{.*}}, <16 x i32>* %{{.*}}, i32 64, <16 x i1> %{{.*}})
+  return _mm512_mask_store_epi32(__P, __U, __A); 
+}
+
+void test_mm512_mask_store_epi64(void *__P, __mmask8 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_store_epi64
+  // CHECK: @llvm.masked.store.v8i64.p0v8i64(<8 x i64> %{{.*}}, <8 x i64>* %{{.*}}, i32 64, <8 x i1> %{{.*}})
+  return _mm512_mask_store_epi64(__P, __U, __A); 
+}
+
+__m512d test_mm512_movedup_pd(__m512d __A) {
+  // CHECK-LABEL: @test_mm512_movedup_pd
+  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+  return _mm512_movedup_pd(__A);
+}
+
+__m512d test_mm512_mask_movedup_pd(__m512d __W, __mmask8 __U, __m512d __A) {
+  // CHECK-LABEL: @test_mm512_mask_movedup_pd
+  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+  return _mm512_mask_movedup_pd(__W, __U, __A);
+}
+
+__m512d test_mm512_maskz_movedup_pd(__mmask8 __U, __m512d __A) {
+  // CHECK-LABEL: @test_mm512_maskz_movedup_pd
+  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+  return _mm512_maskz_movedup_pd(__U, __A);
+}
+
+int test_mm_comi_round_sd(__m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_comi_round_sd
+  // CHECK: @llvm.x86.avx512.vcomi.sd
+  return _mm_comi_round_sd(__A, __B, 5, 3); 
+}
+
+int test_mm_comi_round_ss(__m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_comi_round_ss
+  // CHECK: @llvm.x86.avx512.vcomi.ss
+  return _mm_comi_round_ss(__A, __B, 5, 3); 
+}
+
+__m512d test_mm512_fixupimm_round_pd(__m512d __A, __m512d __B, __m512i __C) {
+  // CHECK-LABEL: @test_mm512_fixupimm_round_pd
+  // CHECK: @llvm.x86.avx512.mask.fixupimm.pd.512
+  return _mm512_fixupimm_round_pd(__A, __B, __C, 5, 8); 
+}
+
+__m512d test_mm512_mask_fixupimm_round_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512i __C) {
+  // CHECK-LABEL: @test_mm512_mask_fixupimm_round_pd
+  // CHECK: @llvm.x86.avx512.mask.fixupimm.pd.512
+  return _mm512_mask_fixupimm_round_pd(__A, __U, __B, __C, 5, 8); 
+}
+
+__m512d test_mm512_fixupimm_pd(__m512d __A, __m512d __B, __m512i __C) {
+  // CHECK-LABEL: @test_mm512_fixupimm_pd
+  // CHECK: @llvm.x86.avx512.mask.fixupimm.pd.512
+  return _mm512_fixupimm_pd(__A, __B, __C, 5); 
+}
+
+__m512d test_mm512_mask_fixupimm_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512i __C) {
+  // CHECK-LABEL: @test_mm512_mask_fixupimm_pd
+  // CHECK: @llvm.x86.avx512.mask.fixupimm.pd.512
+  return _mm512_mask_fixupimm_pd(__A, __U, __B, __C, 5); 
+}
+
+__m512d test_mm512_maskz_fixupimm_round_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512i __C) {
+  // CHECK-LABEL: @test_mm512_maskz_fixupimm_round_pd
+  // CHECK: @llvm.x86.avx512.maskz.fixupimm.pd.512
+  return _mm512_maskz_fixupimm_round_pd(__U, __A, __B, __C, 5, 8); 
+}
+
+__m512d test_mm512_maskz_fixupimm_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512i __C) {
+  // CHECK-LABEL: @test_mm512_maskz_fixupimm_pd
+  // CHECK: @llvm.x86.avx512.maskz.fixupimm.pd.512
+  return _mm512_maskz_fixupimm_pd(__U, __A, __B, __C, 5); 
+}
+
+__m512 test_mm512_fixupimm_round_ps(__m512 __A, __m512 __B, __m512i __C) {
+  // CHECK-LABEL: @test_mm512_fixupimm_round_ps
+  // CHECK: @llvm.x86.avx512.mask.fixupimm.ps.512
+  return _mm512_fixupimm_round_ps(__A, __B, __C, 5, 8); 
+}
+
+__m512 test_mm512_mask_fixupimm_round_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512i __C) {
+  // CHECK-LABEL: @test_mm512_mask_fixupimm_round_ps
+  // CHECK: @llvm.x86.avx512.mask.fixupimm.ps.512
+  return _mm512_mask_fixupimm_round_ps(__A, __U, __B, __C, 5, 8); 
+}
+
+__m512 test_mm512_fixupimm_ps(__m512 __A, __m512 __B, __m512i __C) {
+  // CHECK-LABEL: @test_mm512_fixupimm_ps
+  // CHECK: @llvm.x86.avx512.mask.fixupimm.ps.512
+  return _mm512_fixupimm_ps(__A, __B, __C, 5); 
+}
+
+__m512 test_mm512_mask_fixupimm_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512i __C) {
+  // CHECK-LABEL: @test_mm512_mask_fixupimm_ps
+  // CHECK: @llvm.x86.avx512.mask.fixupimm.ps.512
+  return _mm512_mask_fixupimm_ps(__A, __U, __B, __C, 5); 
+}
+
+__m512 test_mm512_maskz_fixupimm_round_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512i __C) {
+  // CHECK-LABEL: @test_mm512_maskz_fixupimm_round_ps
+  // CHECK: @llvm.x86.avx512.maskz.fixupimm.ps.512
+  return _mm512_maskz_fixupimm_round_ps(__U, __A, __B, __C, 5, 8); 
+}
+
+__m512 test_mm512_maskz_fixupimm_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512i __C) {
+  // CHECK-LABEL: @test_mm512_maskz_fixupimm_ps
+  // CHECK: @llvm.x86.avx512.maskz.fixupimm.ps.512
+  return _mm512_maskz_fixupimm_ps(__U, __A, __B, __C, 5); 
+}
+
+__m128d test_mm_fixupimm_round_sd(__m128d __A, __m128d __B, __m128i __C) {
+  // CHECK-LABEL: @test_mm_fixupimm_round_sd
+  // CHECK: @llvm.x86.avx512.mask.fixupimm
+  return _mm_fixupimm_round_sd(__A, __B, __C, 5, 8); 
+}
+
+__m128d test_mm_mask_fixupimm_round_sd(__m128d __A, __mmask8 __U, __m128d __B, __m128i __C) {
+  // CHECK-LABEL: @test_mm_mask_fixupimm_round_sd
+  // CHECK: @llvm.x86.avx512.mask.fixupimm
+  return _mm_mask_fixupimm_round_sd(__A, __U, __B, __C, 5, 8); 
+}
+
+__m128d test_mm_fixupimm_sd(__m128d __A, __m128d __B, __m128i __C) {
+  // CHECK-LABEL: @test_mm_fixupimm_sd
+  // CHECK: @llvm.x86.avx512.mask.fixupimm
+  return _mm_fixupimm_sd(__A, __B, __C, 5); 
+}
+
+__m128d test_mm_mask_fixupimm_sd(__m128d __A, __mmask8 __U, __m128d __B, __m128i __C) {
+  // CHECK-LABEL: @test_mm_mask_fixupimm_sd
+  // CHECK: @llvm.x86.avx512.mask.fixupimm
+  return _mm_mask_fixupimm_sd(__A, __U, __B, __C, 5); 
+}
+
+__m128d test_mm_maskz_fixupimm_round_sd(__mmask8 __U, __m128d __A, __m128d __B, __m128i __C) {
+  // CHECK-LABEL: @test_mm_maskz_fixupimm_round_sd
+  // CHECK: @llvm.x86.avx512.maskz.fixupimm
+  return _mm_maskz_fixupimm_round_sd(__U, __A, __B, __C, 5, 8); 
+}
+
+__m128d test_mm_maskz_fixupimm_sd(__mmask8 __U, __m128d __A, __m128d __B, __m128i __C) {
+  // CHECK-LABEL: @test_mm_maskz_fixupimm_sd
+  // CHECK: @llvm.x86.avx512.maskz.fixupimm
+  return _mm_maskz_fixupimm_sd(__U, __A, __B, __C, 5); 
+}
+
+__m128 test_mm_fixupimm_round_ss(__m128 __A, __m128 __B, __m128i __C) {
+  // CHECK-LABEL: @test_mm_fixupimm_round_ss
+  // CHECK: @llvm.x86.avx512.mask.fixupimm
+  return _mm_fixupimm_round_ss(__A, __B, __C, 5, 8); 
+}
+
+__m128 test_mm_mask_fixupimm_round_ss(__m128 __A, __mmask8 __U, __m128 __B, __m128i __C) {
+  // CHECK-LABEL: @test_mm_mask_fixupimm_round_ss
+  // CHECK: @llvm.x86.avx512.mask.fixupimm
+  return _mm_mask_fixupimm_round_ss(__A, __U, __B, __C, 5, 8); 
+}
+
+__m128 test_mm_fixupimm_ss(__m128 __A, __m128 __B, __m128i __C) {
+  // CHECK-LABEL: @test_mm_fixupimm_ss
+  // CHECK: @llvm.x86.avx512.mask.fixupimm
+  return _mm_fixupimm_ss(__A, __B, __C, 5); 
+}
+
+__m128 test_mm_mask_fixupimm_ss(__m128 __A, __mmask8 __U, __m128 __B, __m128i __C) {
+  // CHECK-LABEL: @test_mm_mask_fixupimm_ss
+  // CHECK: @llvm.x86.avx512.mask.fixupimm
+  return _mm_mask_fixupimm_ss(__A, __U, __B, __C, 5); 
+}
+
+__m128 test_mm_maskz_fixupimm_round_ss(__mmask8 __U, __m128 __A, __m128 __B, __m128i __C) {
+  // CHECK-LABEL: @test_mm_maskz_fixupimm_round_ss
+  // CHECK: @llvm.x86.avx512.maskz.fixupimm
+  return _mm_maskz_fixupimm_round_ss(__U, __A, __B, __C, 5, 8); 
+}
+
+__m128 test_mm_maskz_fixupimm_ss(__mmask8 __U, __m128 __A, __m128 __B, __m128i __C) {
+  // CHECK-LABEL: @test_mm_maskz_fixupimm_ss
+  // CHECK: @llvm.x86.avx512.maskz.fixupimm
+  return _mm_maskz_fixupimm_ss(__U, __A, __B, __C, 5); 
+}
+
+__m128d test_mm_getexp_round_sd(__m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_getexp_round_sd
+  // CHECK: @llvm.x86.avx512.mask.getexp.sd
+  return _mm_getexp_round_sd(__A, __B, 8); 
+}
+
+__m128d test_mm_getexp_sd(__m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_getexp_sd
+  // CHECK: @llvm.x86.avx512.mask.getexp.sd
+  return _mm_getexp_sd(__A, __B); 
+}
+
+__m128 test_mm_getexp_round_ss(__m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_getexp_round_ss
+  // CHECK: @llvm.x86.avx512.mask.getexp.ss
+  return _mm_getexp_round_ss(__A, __B, 8); 
+}
+
+__m128 test_mm_getexp_ss(__m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_getexp_ss
+  // CHECK: @llvm.x86.avx512.mask.getexp.ss
+  return _mm_getexp_ss(__A, __B); 
+}
+
+__m128d test_mm_getmant_round_sd(__m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_getmant_round_sd
+  // CHECK: @llvm.x86.avx512.mask.getmant.sd
+  return _mm_getmant_round_sd(__A, __B, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src, 8); 
+}
+
+__m128d test_mm_getmant_sd(__m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_getmant_sd
+  // CHECK: @llvm.x86.avx512.mask.getmant.sd
+  return _mm_getmant_sd(__A, __B, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src); 
+}
+
+__m128 test_mm_getmant_round_ss(__m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_getmant_round_ss
+  // CHECK: @llvm.x86.avx512.mask.getmant.ss
+  return _mm_getmant_round_ss(__A, __B, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src, 8); 
+}
+
+__m128 test_mm_getmant_ss(__m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_getmant_ss
+  // CHECK: @llvm.x86.avx512.mask.getmant.ss
+  return _mm_getmant_ss(__A, __B, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src); 
+}
+
+__mmask16 test_mm512_kmov(__mmask16 __A) {
+  // CHECK-LABEL: @test_mm512_kmov
+  // CHECK: load i16, i16* %__A.addr.i, align 2
+  return _mm512_kmov(__A); 
+}
+
+__m512d test_mm512_mask_unpackhi_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+  // CHECK-LABEL: @test_mm512_mask_unpackhi_pd
+  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+  return _mm512_mask_unpackhi_pd(__W, __U, __A, __B); 
+}
+unsigned long long test_mm_cvt_roundsd_si64(__m128d __A) {
+  // CHECK-LABEL: @test_mm_cvt_roundsd_si64
+  // CHECK: @llvm.x86.avx512.vcvtsd2si64
+  return _mm_cvt_roundsd_si64(__A, _MM_FROUND_CUR_DIRECTION); 
+}
+__m512i test_mm512_mask2_permutex2var_epi32(__m512i __A, __m512i __I, __mmask16 __U, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_mask2_permutex2var_epi32
+  // CHECK: @llvm.x86.avx512.mask.vpermi2var.d.512
+  return _mm512_mask2_permutex2var_epi32(__A, __I, __U, __B); 
+}
+__m512i test_mm512_unpackhi_epi32(__m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_unpackhi_epi32
+  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+  return _mm512_unpackhi_epi32(__A, __B); 
+}
+
+__m512d test_mm512_maskz_unpackhi_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+  // CHECK-LABEL: @test_mm512_maskz_unpackhi_pd
+  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+  return _mm512_maskz_unpackhi_pd(__U, __A, __B); 
+}
+long long test_mm_cvt_roundsd_i64(__m128d __A) {
+  // CHECK-LABEL: @test_mm_cvt_roundsd_i64
+  // CHECK: @llvm.x86.avx512.vcvtsd2si64
+  return _mm_cvt_roundsd_i64(__A, _MM_FROUND_CUR_DIRECTION); 
+}
+__m512d test_mm512_mask2_permutex2var_pd(__m512d __A, __m512i __I, __mmask8 __U, __m512d __B) {
+  // CHECK-LABEL: @test_mm512_mask2_permutex2var_pd
+  // CHECK: @llvm.x86.avx512.mask.vpermi2var.pd.512
+  return _mm512_mask2_permutex2var_pd(__A, __I, __U, __B); 
+}
+__m512i test_mm512_mask_unpackhi_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_mask_unpackhi_epi32
+  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+  return _mm512_mask_unpackhi_epi32(__W, __U, __A, __B); 
+}
+
+__m512 test_mm512_mask_unpackhi_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+  // CHECK-LABEL: @test_mm512_mask_unpackhi_ps
+  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+  return _mm512_mask_unpackhi_ps(__W, __U, __A, __B); 
+}
+
+__m512 test_mm512_maskz_unpackhi_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+  // CHECK-LABEL: @test_mm512_maskz_unpackhi_ps
+  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+  return _mm512_maskz_unpackhi_ps(__U, __A, __B); 
+}
+
+__m512d test_mm512_mask_unpacklo_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+  // CHECK-LABEL: @test_mm512_mask_unpacklo_pd
+  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+  return _mm512_mask_unpacklo_pd(__W, __U, __A, __B); 
+}
+
+__m512d test_mm512_maskz_unpacklo_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+  // CHECK-LABEL: @test_mm512_maskz_unpacklo_pd
+  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+  return _mm512_maskz_unpacklo_pd(__U, __A, __B); 
+}
+
+__m512 test_mm512_mask_unpacklo_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+  // CHECK-LABEL: @test_mm512_mask_unpacklo_ps
+  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+  return _mm512_mask_unpacklo_ps(__W, __U, __A, __B); 
+}
+
+__m512 test_mm512_maskz_unpacklo_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+  // CHECK-LABEL: @test_mm512_maskz_unpacklo_ps
+  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+  return _mm512_maskz_unpacklo_ps(__U, __A, __B); 
+}
+int test_mm_cvt_roundsd_si32(__m128d __A) {
+  // CHECK-LABEL: @test_mm_cvt_roundsd_si32
+  // CHECK: @llvm.x86.avx512.vcvtsd2si32
+  return _mm_cvt_roundsd_si32(__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+int test_mm_cvt_roundsd_i32(__m128d __A) {
+  // CHECK-LABEL: @test_mm_cvt_roundsd_i32
+  // CHECK: @llvm.x86.avx512.vcvtsd2si32
+  return _mm_cvt_roundsd_i32(__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+unsigned test_mm_cvt_roundsd_u32(__m128d __A) {
+  // CHECK-LABEL: @test_mm_cvt_roundsd_u32
+  // CHECK: @llvm.x86.avx512.vcvtsd2usi32
+  return _mm_cvt_roundsd_u32(__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+unsigned test_mm_cvtsd_u32(__m128d __A) {
+  // CHECK-LABEL: @test_mm_cvtsd_u32
+  // CHECK: @llvm.x86.avx512.vcvtsd2usi32
+  return _mm_cvtsd_u32(__A); 
+}
+
+unsigned long long test_mm_cvt_roundsd_u64(__m128d __A) {
+  // CHECK-LABEL: @test_mm_cvt_roundsd_u64
+  // CHECK: @llvm.x86.avx512.vcvtsd2usi64
+  return _mm_cvt_roundsd_u64(__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+unsigned long long test_mm_cvtsd_u64(__m128d __A) {
+  // CHECK-LABEL: @test_mm_cvtsd_u64
+  // CHECK: @llvm.x86.avx512.vcvtsd2usi64
+  return _mm_cvtsd_u64(__A); 
+}
+
+int test_mm_cvt_roundss_si32(__m128 __A) {
+  // CHECK-LABEL: @test_mm_cvt_roundss_si32
+  // CHECK: @llvm.x86.avx512.vcvtss2si32
+  return _mm_cvt_roundss_si32(__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+int test_mm_cvt_roundss_i32(__m128 __A) {
+  // CHECK-LABEL: @test_mm_cvt_roundss_i32
+  // CHECK: @llvm.x86.avx512.vcvtss2si32
+  return _mm_cvt_roundss_i32(__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+int test_mm_cvt_roundss_si64(__m128 __A) {
+  // CHECK-LABEL: @test_mm_cvt_roundss_si64
+  // CHECK: @llvm.x86.avx512.vcvtss2si64
+  return _mm_cvt_roundss_si64(__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+long long test_mm_cvt_roundss_i64(__m128 __A) {
+  // CHECK-LABEL: @test_mm_cvt_roundss_i64
+  // CHECK: @llvm.x86.avx512.vcvtss2si64
+  return _mm_cvt_roundss_i64(__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+unsigned test_mm_cvt_roundss_u32(__m128 __A) {
+  // CHECK-LABEL: @test_mm_cvt_roundss_u32
+  // CHECK: @llvm.x86.avx512.vcvtss2usi32
+  return _mm_cvt_roundss_u32(__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+unsigned test_mm_cvtss_u32(__m128 __A) {
+  // CHECK-LABEL: @test_mm_cvtss_u32
+  // CHECK: @llvm.x86.avx512.vcvtss2usi32
+  return _mm_cvtss_u32(__A); 
+}
+
+unsigned long long test_mm_cvt_roundss_u64(__m128 __A) {
+  // CHECK-LABEL: @test_mm_cvt_roundss_u64
+  // CHECK: @llvm.x86.avx512.vcvtss2usi64
+  return _mm_cvt_roundss_u64(__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+unsigned long long test_mm_cvtss_u64(__m128 __A) {
+  // CHECK-LABEL: @test_mm_cvtss_u64
+  // CHECK: @llvm.x86.avx512.vcvtss2usi64
+  return _mm_cvtss_u64(__A); 
+}
+
+int test_mm_cvtt_roundsd_i32(__m128d __A) {
+  // CHECK-LABEL: @test_mm_cvtt_roundsd_i32
+  // CHECK: @llvm.x86.avx512.cvttsd2si
+  return _mm_cvtt_roundsd_i32(__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+int test_mm_cvtt_roundsd_si32(__m128d __A) {
+  // CHECK-LABEL: @test_mm_cvtt_roundsd_si32
+  // CHECK: @llvm.x86.avx512.cvttsd2si
+  return _mm_cvtt_roundsd_si32(__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+int test_mm_cvttsd_i32(__m128d __A) {
+  // CHECK-LABEL: @test_mm_cvttsd_i32
+  // CHECK: @llvm.x86.avx512.cvttsd2si
+  return _mm_cvttsd_i32(__A); 
+}
+
+unsigned long long test_mm_cvtt_roundsd_si64(__m128d __A) {
+  // CHECK-LABEL: @test_mm_cvtt_roundsd_si64
+  // CHECK: @llvm.x86.avx512.cvttsd2si64
+  return _mm_cvtt_roundsd_si64(__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+long long test_mm_cvtt_roundsd_i64(__m128d __A) {
+  // CHECK-LABEL: @test_mm_cvtt_roundsd_i64
+  // CHECK: @llvm.x86.avx512.cvttsd2si64
+  return _mm_cvtt_roundsd_i64(__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+long long test_mm_cvttsd_i64(__m128d __A) {
+  // CHECK-LABEL: @test_mm_cvttsd_i64
+  // CHECK: @llvm.x86.avx512.cvttsd2si64
+  return _mm_cvttsd_i64(__A); 
+}
+
+unsigned test_mm_cvtt_roundsd_u32(__m128d __A) {
+  // CHECK-LABEL: @test_mm_cvtt_roundsd_u32
+  // CHECK: @llvm.x86.avx512.cvttsd2usi
+  return _mm_cvtt_roundsd_u32(__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+unsigned test_mm_cvttsd_u32(__m128d __A) {
+  // CHECK-LABEL: @test_mm_cvttsd_u32
+  // CHECK: @llvm.x86.avx512.cvttsd2usi
+  return _mm_cvttsd_u32(__A); 
+}
+
+unsigned long long test_mm_cvtt_roundsd_u64(__m128d __A) {
+  // CHECK-LABEL: @test_mm_cvtt_roundsd_u64
+  // CHECK: @llvm.x86.avx512.cvttsd2usi64
+  return _mm_cvtt_roundsd_u64(__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+unsigned long long test_mm_cvttsd_u64(__m128d __A) {
+  // CHECK-LABEL: @test_mm_cvttsd_u64
+  // CHECK: @llvm.x86.avx512.cvttsd2usi64
+  return _mm_cvttsd_u64(__A); 
+}
+
+int test_mm_cvtt_roundss_i32(__m128 __A) {
+  // CHECK-LABEL: @test_mm_cvtt_roundss_i32
+  // CHECK: @llvm.x86.avx512.cvttss2si
+  return _mm_cvtt_roundss_i32(__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+int test_mm_cvtt_roundss_si32(__m128 __A) {
+  // CHECK-LABEL: @test_mm_cvtt_roundss_si32
+  // CHECK: @llvm.x86.avx512.cvttss2si
+  return _mm_cvtt_roundss_si32(__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+int test_mm_cvttss_i32(__m128 __A) {
+  // CHECK-LABEL: @test_mm_cvttss_i32
+  // CHECK: @llvm.x86.avx512.cvttss2si
+  return _mm_cvttss_i32(__A); 
+}
+
+float test_mm_cvtt_roundss_i64(__m128 __A) {
+  // CHECK-LABEL: @test_mm_cvtt_roundss_i64
+  // CHECK: @llvm.x86.avx512.cvttss2si64
+  return _mm_cvtt_roundss_i64(__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+long long test_mm_cvtt_roundss_si64(__m128 __A) {
+  // CHECK-LABEL: @test_mm_cvtt_roundss_si64
+  // CHECK: @llvm.x86.avx512.cvttss2si64
+  return _mm_cvtt_roundss_si64(__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+long long test_mm_cvttss_i64(__m128 __A) {
+  // CHECK-LABEL: @test_mm_cvttss_i64
+  // CHECK: @llvm.x86.avx512.cvttss2si64
+  return _mm_cvttss_i64(__A); 
+}
+
+unsigned test_mm_cvtt_roundss_u32(__m128 __A) {
+  // CHECK-LABEL: @test_mm_cvtt_roundss_u32
+  // CHECK: @llvm.x86.avx512.cvttss2usi
+  return _mm_cvtt_roundss_u32(__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+unsigned test_mm_cvttss_u32(__m128 __A) {
+  // CHECK-LABEL: @test_mm_cvttss_u32
+  // CHECK: @llvm.x86.avx512.cvttss2usi
+  return _mm_cvttss_u32(__A); 
+}
+
+unsigned long long test_mm_cvtt_roundss_u64(__m128 __A) {
+  // CHECK-LABEL: @test_mm_cvtt_roundss_u64
+  // CHECK: @llvm.x86.avx512.cvttss2usi64
+  return _mm_cvtt_roundss_u64(__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+unsigned long long test_mm_cvttss_u64(__m128 __A) {
+  // CHECK-LABEL: @test_mm_cvttss_u64
+  // CHECK: @llvm.x86.avx512.cvttss2usi64
+  return _mm_cvttss_u64(__A); 
+}
+
+__m512i test_mm512_cvtt_roundps_epu32(__m512 __A) 
+{
+    // CHECK-LABEL: @test_mm512_cvtt_roundps_epu32
+    // CHECK: @llvm.x86.avx512.mask.cvttps2udq.512
+    return _mm512_cvtt_roundps_epu32(__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m512i test_mm512_mask_cvtt_roundps_epu32(__m512i __W, __mmask16 __U, __m512 __A)
+{
+    // CHECK-LABEL: @test_mm512_mask_cvtt_roundps_epu32
+    // CHECK: @llvm.x86.avx512.mask.cvttps2udq.512
+    return _mm512_mask_cvtt_roundps_epu32(__W, __U, __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m512i test_mm512_maskz_cvtt_roundps_epu32( __mmask16 __U, __m512 __A)
+{
+    // CHECK-LABEL: @test_mm512_maskz_cvtt_roundps_epu32
+    // CHECK: @llvm.x86.avx512.mask.cvttps2udq.512
+
+    return _mm512_maskz_cvtt_roundps_epu32(__U, __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m256i test_mm512_cvt_roundps_ph(__m512  __A)
+{
+    // CHECK-LABEL: @test_mm512_cvt_roundps_ph
+    // CHECK: @llvm.x86.avx512.mask.vcvtps2ph.512
+    return _mm512_cvt_roundps_ph(__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m256i test_mm512_mask_cvt_roundps_ph(__m256i __W , __mmask16 __U, __m512  __A)
+{
+    // CHECK-LABEL: @test_mm512_mask_cvt_roundps_ph
+    // CHECK: @llvm.x86.avx512.mask.vcvtps2ph.512
+    return _mm512_mask_cvt_roundps_ph(__W, __U, __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m256i test_mm512_maskz_cvt_roundps_ph(__mmask16 __U, __m512  __A)
+{
+    // CHECK-LABEL: @test_mm512_maskz_cvt_roundps_ph
+    // CHECK: @llvm.x86.avx512.mask.vcvtps2ph.512
+    return _mm512_maskz_cvt_roundps_ph(__U, __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m512 test_mm512_cvt_roundph_ps(__m256i __A)
+{
+    // CHECK-LABEL: @test_mm512_cvt_roundph_ps
+    // CHECK: @llvm.x86.avx512.mask.vcvtph2ps.512
+    return _mm512_cvt_roundph_ps(__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m512 test_mm512_mask_cvt_roundph_ps(__m512 __W, __mmask16 __U, __m256i __A)
+{
+    // CHECK-LABEL: @test_mm512_mask_cvt_roundph_ps
+    // CHECK: @llvm.x86.avx512.mask.vcvtph2ps.512
+    return _mm512_mask_cvt_roundph_ps(__W, __U, __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m512 test_mm512_maskz_cvt_roundph_ps(__mmask16 __U, __m256i __A)
+{
+    // CHECK-LABEL: @test_mm512_maskz_cvt_roundph_ps
+    // CHECK: @llvm.x86.avx512.mask.vcvtph2ps.512
+    return _mm512_maskz_cvt_roundph_ps(__U, __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m512 test_mm512_mask_cvt_roundepi32_ps(__m512 __W, __mmask16 __U, __m512i __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_cvt_roundepi32_ps
+  // CHECK: @llvm.x86.avx512.mask.cvtdq2ps.512
+  return _mm512_mask_cvt_roundepi32_ps(__W,__U,__A,4);
+}
+
+__m512 test_mm512_maskz_cvt_roundepi32_ps(__mmask16 __U, __m512i __A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_cvt_roundepi32_ps
+  // CHECK: @llvm.x86.avx512.mask.cvtdq2ps.512
+  return _mm512_maskz_cvt_roundepi32_ps(__U,__A,4);
+}
+
+__m512 test_mm512_mask_cvt_roundepu32_ps(__m512 __W, __mmask16 __U,__m512i __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_cvt_roundepu32_ps
+  // CHECK: @llvm.x86.avx512.mask.cvtudq2ps.512
+  return _mm512_mask_cvt_roundepu32_ps(__W,__U,__A,4);
+}
+
+__m512 test_mm512_maskz_cvt_roundepu32_ps(__mmask16 __U,__m512i __A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_cvt_roundepu32_ps
+  // CHECK: @llvm.x86.avx512.mask.cvtudq2ps.512
+  return _mm512_maskz_cvt_roundepu32_ps(__U,__A,4);
+}
+
+__m256 test_mm512_mask_cvt_roundpd_ps(__m256 W, __mmask8 U,__m512d A)
+{
+  // CHECK-LABEL: @test_mm512_mask_cvt_roundpd_ps
+  // CHECK: @llvm.x86.avx512.mask.cvtpd2ps.512
+  return _mm512_mask_cvt_roundpd_ps(W,U,A,4);
+}
+
+__m256 test_mm512_maskz_cvt_roundpd_ps(__mmask8 U, __m512d A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_cvt_roundpd_ps
+  // CHECK: @llvm.x86.avx512.mask.cvtpd2ps.512
+  return _mm512_maskz_cvt_roundpd_ps(U,A,4);
+}
+
+__m256i test_mm512_cvtt_roundpd_epi32(__m512d A)
+{
+  // CHECK-LABEL: @test_mm512_cvtt_roundpd_epi32
+  // CHECK: @llvm.x86.avx512.mask.cvttpd2dq.512
+  return _mm512_cvtt_roundpd_epi32(A,4);
+}
+
+__m256i test_mm512_mask_cvtt_roundpd_epi32(__m256i W, __mmask8 U, __m512d A)
+{
+  // CHECK-LABEL: @test_mm512_mask_cvtt_roundpd_epi32
+  // CHECK: @llvm.x86.avx512.mask.cvttpd2dq.512
+  return _mm512_mask_cvtt_roundpd_epi32(W,U,A,4);
+}
+
+__m256i test_mm512_maskz_cvtt_roundpd_epi32(__mmask8 U, __m512d A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_cvtt_roundpd_epi32
+  // CHECK: @llvm.x86.avx512.mask.cvttpd2dq.512
+  return _mm512_maskz_cvtt_roundpd_epi32(U,A,4);
+}
+
+__m512i test_mm512_mask_cvtt_roundps_epi32(__m512i W,__mmask16 U, __m512 A)
+{
+  // CHECK-LABEL: @test_mm512_mask_cvtt_roundps_epi32
+  // CHECK: @llvm.x86.avx512.mask.cvttps2dq.512
+  return _mm512_mask_cvtt_roundps_epi32(W,U,A,4);
+}
+
+__m512i test_mm512_maskz_cvtt_roundps_epi32(__mmask16 U, __m512 A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_cvtt_roundps_epi32
+  // CHECK: @llvm.x86.avx512.mask.cvttps2dq.512
+  return _mm512_maskz_cvtt_roundps_epi32(U,A,4);
+}
+
+__m512i test_mm512_mask_cvt_roundps_epi32(__m512i __W,__mmask16 __U,__m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_cvt_roundps_epi32
+  // CHECK: @llvm.x86.avx512.mask.cvtps2dq.512
+  return _mm512_mask_cvt_roundps_epi32(__W,__U,__A,4);
+}
+
+__m512i test_mm512_maskz_cvt_roundps_epi32(__mmask16 __U, __m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_cvt_roundps_epi32
+  // CHECK: @llvm.x86.avx512.mask.cvtps2dq.512
+  return _mm512_maskz_cvt_roundps_epi32(__U,__A,4);
+}
+
+__m256i test_mm512_mask_cvt_roundpd_epi32(__m256i W,__mmask8 U,__m512d A)
+{
+  // CHECK-LABEL: @test_mm512_mask_cvt_roundpd_epi32
+  // CHECK: @llvm.x86.avx512.mask.cvtpd2dq.512
+  return _mm512_mask_cvt_roundpd_epi32(W,U,A,4);
+}
+
+__m256i test_mm512_maskz_cvt_roundpd_epi32(__mmask8 U, __m512d A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_cvt_roundpd_epi32
+  // CHECK: @llvm.x86.avx512.mask.cvtpd2dq.512
+  return _mm512_maskz_cvt_roundpd_epi32(U,A,4);
+}
+
+__m512i test_mm512_mask_cvt_roundps_epu32(__m512i __W,__mmask16 __U,__m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_cvt_roundps_epu32
+  // CHECK: @llvm.x86.avx512.mask.cvtps2udq.512
+  return _mm512_mask_cvt_roundps_epu32(__W,__U,__A,4);
+}
+
+__m512i test_mm512_maskz_cvt_roundps_epu32(__mmask16 __U,__m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_cvt_roundps_epu32
+  // CHECK: @llvm.x86.avx512.mask.cvtps2udq.512
+  return _mm512_maskz_cvt_roundps_epu32(__U,__A, 4);
+}
+
+__m256i test_mm512_mask_cvt_roundpd_epu32(__m256i W, __mmask8 U, __m512d A)
+{
+  // CHECK-LABEL: @test_mm512_mask_cvt_roundpd_epu32
+  // CHECK: @llvm.x86.avx512.mask.cvtpd2udq.512
+  return _mm512_mask_cvt_roundpd_epu32(W,U,A,4);
+}
+
+__m256i test_mm512_maskz_cvt_roundpd_epu32(__mmask8 U, __m512d A) 
+{
+  // CHECK-LABEL: @test_mm512_maskz_cvt_roundpd_epu32
+  // CHECK: @llvm.x86.avx512.mask.cvtpd2udq.512
+  return _mm512_maskz_cvt_roundpd_epu32(U, A, 4);
+}
+
+__m512 test_mm512_mask2_permutex2var_ps(__m512 __A, __m512i __I, __mmask16 __U, __m512 __B) {
+  // CHECK-LABEL: @test_mm512_mask2_permutex2var_ps
+  // CHECK: @llvm.x86.avx512.mask.vpermi2var.ps.512
+  return _mm512_mask2_permutex2var_ps(__A, __I, __U, __B); 
+}
+
+__m512i test_mm512_mask2_permutex2var_epi64(__m512i __A, __m512i __I, __mmask8 __U, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_mask2_permutex2var_epi64
+  // CHECK: @llvm.x86.avx512.mask.vpermi2var.q.512
+  return _mm512_mask2_permutex2var_epi64(__A, __I, __U, __B); 
+}
+
+__m512d test_mm512_permute_pd(__m512d __X) {
+  // CHECK-LABEL: @test_mm512_permute_pd
+  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+  return _mm512_permute_pd(__X, 2);
+}
+
+__m512d test_mm512_mask_permute_pd(__m512d __W, __mmask8 __U, __m512d __X) {
+  // CHECK-LABEL: @test_mm512_mask_permute_pd
+  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+  return _mm512_mask_permute_pd(__W, __U, __X, 2);
+}
+
+__m512d test_mm512_maskz_permute_pd(__mmask8 __U, __m512d __X) {
+  // CHECK-LABEL: @test_mm512_maskz_permute_pd
+  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+  return _mm512_maskz_permute_pd(__U, __X, 2);
+}
+
+__m512 test_mm512_permute_ps(__m512 __X) {
+  // CHECK-LABEL: @test_mm512_permute_ps
+  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
+  return _mm512_permute_ps(__X, 2);
+}
+
+__m512 test_mm512_mask_permute_ps(__m512 __W, __mmask16 __U, __m512 __X) {
+  // CHECK-LABEL: @test_mm512_mask_permute_ps
+  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+  return _mm512_mask_permute_ps(__W, __U, __X, 2);
+}
+
+__m512 test_mm512_maskz_permute_ps(__mmask16 __U, __m512 __X) {
+  // CHECK-LABEL: @test_mm512_maskz_permute_ps
+  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> undef, <16 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4, i32 10, i32 8, i32 8, i32 8, i32 14, i32 12, i32 12, i32 12>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+  return _mm512_maskz_permute_ps(__U, __X, 2);
+}
+
+__m512d test_mm512_permutevar_pd(__m512d __A, __m512i __C) {
+  // CHECK-LABEL: @test_mm512_permutevar_pd
+  // CHECK: @llvm.x86.avx512.mask.vpermilvar.pd.512
+  return _mm512_permutevar_pd(__A, __C); 
+}
+
+__m512d test_mm512_mask_permutevar_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512i __C) {
+  // CHECK-LABEL: @test_mm512_mask_permutevar_pd
+  // CHECK: @llvm.x86.avx512.mask.vpermilvar.pd.512
+  return _mm512_mask_permutevar_pd(__W, __U, __A, __C); 
+}
+
+__m512d test_mm512_maskz_permutevar_pd(__mmask8 __U, __m512d __A, __m512i __C) {
+  // CHECK-LABEL: @test_mm512_maskz_permutevar_pd
+  // CHECK: @llvm.x86.avx512.mask.vpermilvar.pd.512
+  return _mm512_maskz_permutevar_pd(__U, __A, __C); 
+}
+
+__m512 test_mm512_permutevar_ps(__m512 __A, __m512i __C) {
+  // CHECK-LABEL: @test_mm512_permutevar_ps
+  // CHECK: @llvm.x86.avx512.mask.vpermilvar.ps.512
+  return _mm512_permutevar_ps(__A, __C); 
+}
+
+__m512 test_mm512_mask_permutevar_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512i __C) {
+  // CHECK-LABEL: @test_mm512_mask_permutevar_ps
+  // CHECK: @llvm.x86.avx512.mask.vpermilvar.ps.512
+  return _mm512_mask_permutevar_ps(__W, __U, __A, __C); 
+}
+
+__m512 test_mm512_maskz_permutevar_ps(__mmask16 __U, __m512 __A, __m512i __C) {
+  // CHECK-LABEL: @test_mm512_maskz_permutevar_ps
+  // CHECK: @llvm.x86.avx512.mask.vpermilvar.ps.512
+  return _mm512_maskz_permutevar_ps(__U, __A, __C); 
+}
+
+__m512i test_mm512_maskz_permutex2var_epi32(__mmask16 __U, __m512i __A, __m512i __I, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_permutex2var_epi32
+  // CHECK: @llvm.x86.avx512.maskz.vpermt2var.d.512
+  return _mm512_maskz_permutex2var_epi32(__U, __A, __I, __B); 
+}
+
+__m512i test_mm512_mask_permutex2var_epi32 (__m512i __A, __mmask16 __U, __m512i __I, __m512i __B)
+{
+  // CHECK-LABEL: @test_mm512_mask_permutex2var_epi32 
+  // CHECK: @llvm.x86.avx512.mask.vpermt2var.d.512
+  return _mm512_mask_permutex2var_epi32 (__A,__U,__I,__B);
+}
+
+__m512d test_mm512_permutex2var_pd (__m512d __A, __m512i __I, __m512d __B)
+{
+  // CHECK-LABEL: @test_mm512_permutex2var_pd 
+  // CHECK: @llvm.x86.avx512.mask.vpermt2var.pd.512
+  return _mm512_permutex2var_pd (__A, __I,__B);
+}
+
+__m512d test_mm512_mask_permutex2var_pd (__m512d __A, __mmask8 __U, __m512i __I, __m512d __B)
+{
+  // CHECK-LABEL: @test_mm512_mask_permutex2var_pd 
+  // CHECK: @llvm.x86.avx512.mask.vpermt2var.pd.512
+  return _mm512_mask_permutex2var_pd (__A,__U,__I,__B);
+}
+
+__m512d test_mm512_maskz_permutex2var_pd(__mmask8 __U, __m512d __A, __m512i __I, __m512d __B) {
+  // CHECK-LABEL: @test_mm512_maskz_permutex2var_pd
+  // CHECK: @llvm.x86.avx512.maskz.vpermt2var.pd.512
+  return _mm512_maskz_permutex2var_pd(__U, __A, __I, __B); 
+}
+
+__m512 test_mm512_permutex2var_ps (__m512 __A, __m512i __I, __m512 __B)
+{
+  // CHECK-LABEL: @test_mm512_permutex2var_ps 
+  // CHECK: @llvm.x86.avx512.mask.vpermt2var.ps.512
+  return _mm512_permutex2var_ps (__A, __I, __B);
+}
+
+__m512 test_mm512_mask_permutex2var_ps (__m512 __A, __mmask16 __U, __m512i __I, __m512 __B)
+{
+  // CHECK-LABEL: @test_mm512_mask_permutex2var_ps 
+  // CHECK: @llvm.x86.avx512.mask.vpermt2var.ps.512
+  return _mm512_mask_permutex2var_ps (__A,__U,__I,__B);
+}
+
+__m512 test_mm512_maskz_permutex2var_ps(__mmask16 __U, __m512 __A, __m512i __I, __m512 __B) {
+  // CHECK-LABEL: @test_mm512_maskz_permutex2var_ps
+  // CHECK: @llvm.x86.avx512.maskz.vpermt2var.ps.512
+  return _mm512_maskz_permutex2var_ps(__U, __A, __I, __B); 
+}
+
+__m512i test_mm512_mask_permutex2var_epi64 (__m512i __A, __mmask8 __U, __m512i __I, __m512i __B){
+  // CHECK-LABEL: @test_mm512_mask_permutex2var_epi64
+  // CHECK: @llvm.x86.avx512.mask.vpermt2var.q.512
+  return _mm512_mask_permutex2var_epi64(__A, __U, __I, __B);
+}
+
+__m512i test_mm512_maskz_permutex2var_epi64(__mmask8 __U, __m512i __A, __m512i __I, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_permutex2var_epi64
+  // CHECK: @llvm.x86.avx512.maskz.vpermt2var.q.512
+  return _mm512_maskz_permutex2var_epi64(__U, __A, __I, __B);
+}
+__mmask16 test_mm512_testn_epi32_mask(__m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_testn_epi32_mask
+  // CHECK: @llvm.x86.avx512.ptestnm.d.512
+  return _mm512_testn_epi32_mask(__A, __B); 
+}
+
+__mmask16 test_mm512_mask_testn_epi32_mask(__mmask16 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_mask_testn_epi32_mask
+  // CHECK: @llvm.x86.avx512.ptestnm.d.512
+  return _mm512_mask_testn_epi32_mask(__U, __A, __B); 
+}
+
+__mmask8 test_mm512_testn_epi64_mask(__m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_testn_epi64_mask
+  // CHECK: @llvm.x86.avx512.ptestnm.q.512
+  return _mm512_testn_epi64_mask(__A, __B); 
+}
+
+__mmask8 test_mm512_mask_testn_epi64_mask(__mmask8 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_mask_testn_epi64_mask
+  // CHECK: @llvm.x86.avx512.ptestnm.q.512
+  return _mm512_mask_testn_epi64_mask(__U, __A, __B); 
+}
+
+__mmask16 test_mm512_mask_test_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  // CHECK-LABEL: @test_mm512_mask_test_epi32_mask 
+  // CHECK: @llvm.x86.avx512.ptestm.d.512
+  return _mm512_mask_test_epi32_mask (__U,__A,__B);
+}
+
+__mmask8 test_mm512_mask_test_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  // CHECK-LABEL: @test_mm512_mask_test_epi64_mask 
+  // CHECK: @llvm.x86.avx512.ptestm.q.512
+  return _mm512_mask_test_epi64_mask (__U,__A,__B);
+}
+
+__m512i test_mm512_maskz_unpackhi_epi32(__mmask16 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_unpackhi_epi32
+  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+  return _mm512_maskz_unpackhi_epi32(__U, __A, __B); 
+}
+
+__m512i test_mm512_unpackhi_epi64(__m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_unpackhi_epi64
+  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+  return _mm512_unpackhi_epi64(__A, __B); 
+}
+
+__m512i test_mm512_mask_unpackhi_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_mask_unpackhi_epi64
+  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
+  return _mm512_mask_unpackhi_epi64(__W, __U, __A, __B); 
+}
+
+__m512i test_mm512_maskz_unpackhi_epi64(__mmask8 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_unpackhi_epi64
+  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
+  return _mm512_maskz_unpackhi_epi64(__U, __A, __B); 
+}
+
+__m512i test_mm512_unpacklo_epi32(__m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_unpacklo_epi32
+  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+  return _mm512_unpacklo_epi32(__A, __B); 
+}
+
+__m512i test_mm512_mask_unpacklo_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_mask_unpacklo_epi32
+  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+  return _mm512_mask_unpacklo_epi32(__W, __U, __A, __B); 
+}
+
+__m512i test_mm512_maskz_unpacklo_epi32(__mmask16 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_unpacklo_epi32
+  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+  return _mm512_maskz_unpacklo_epi32(__U, __A, __B); 
+}
+
+__m512i test_mm512_unpacklo_epi64(__m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_unpacklo_epi64
+  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  return _mm512_unpacklo_epi64(__A, __B); 
+}
+
+__m512i test_mm512_mask_unpacklo_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_mask_unpacklo_epi64
+  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
+  return _mm512_mask_unpacklo_epi64(__W, __U, __A, __B); 
+}
+
+__m512i test_mm512_maskz_unpacklo_epi64(__mmask8 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_unpacklo_epi64
+  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
+  return _mm512_maskz_unpacklo_epi64(__U, __A, __B); 
+}
+
+__m128d test_mm_roundscale_round_sd(__m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_roundscale_round_sd
+  // CHECK: @llvm.x86.avx512.mask.rndscale.sd
+  return _mm_roundscale_round_sd(__A, __B, 3, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m128d test_mm_roundscale_sd(__m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_roundscale_sd
+  // CHECK: @llvm.x86.avx512.mask.rndscale.sd
+  return _mm_roundscale_sd(__A, __B, 3); 
+}
+
+__m128d test_mm_mask_roundscale_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){
+  // CHECK: @llvm.x86.avx512.mask.rndscale.sd
+    return _mm_mask_roundscale_sd(__W,__U,__A,__B,3);
+}
+
+__m128d test_mm_mask_roundscale_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){
+  // CHECK: @llvm.x86.avx512.mask.rndscale.sd
+    return _mm_mask_roundscale_round_sd(__W,__U,__A,__B,3,_MM_FROUND_CUR_DIRECTION);
+}
+
+__m128d test_mm_maskz_roundscale_sd(__mmask8 __U, __m128d __A, __m128d __B){
+  // CHECK: @llvm.x86.avx512.mask.rndscale.sd
+    return _mm_maskz_roundscale_sd(__U,__A,__B,3);
+}
+
+__m128d test_mm_maskz_roundscale_round_sd(__mmask8 __U, __m128d __A, __m128d __B){
+  // CHECK: @llvm.x86.avx512.mask.rndscale.sd
+    return _mm_maskz_roundscale_round_sd(__U,__A,__B,3,_MM_FROUND_CUR_DIRECTION );
+}
+
+__m128 test_mm_roundscale_round_ss(__m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_roundscale_round_ss
+  // CHECK: @llvm.x86.avx512.mask.rndscale.ss
+  return _mm_roundscale_round_ss(__A, __B, 3, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m128 test_mm_roundscale_ss(__m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_roundscale_ss
+  // CHECK: @llvm.x86.avx512.mask.rndscale.ss
+  return _mm_roundscale_ss(__A, __B, 3); 
+}
+
+__m128 test_mm_mask_roundscale_ss(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){
+  // CHECK-LABEL: @test_mm_mask_roundscale_ss
+  // CHECK: @llvm.x86.avx512.mask.rndscale.ss
+    return _mm_mask_roundscale_ss(__W,__U,__A,__B,3);
+}
+
+__m128 test_mm_maskz_roundscale_round_ss( __mmask8 __U, __m128d __A, __m128d __B){
+  // CHECK-LABEL: @test_mm_maskz_roundscale_round_ss
+  // CHECK: @llvm.x86.avx512.mask.rndscale.ss
+    return _mm_maskz_roundscale_round_ss(__U,__A,__B,3,_MM_FROUND_CUR_DIRECTION);
+}
+
+__m128 test_mm_maskz_roundscale_ss(__mmask8 __U, __m128d __A, __m128d __B){
+  // CHECK-LABEL: @test_mm_maskz_roundscale_ss
+  // CHECK: @llvm.x86.avx512.mask.rndscale.ss
+    return _mm_maskz_roundscale_ss(__U,__A,__B,3);
+}
+
+__m512d test_mm512_scalef_round_pd(__m512d __A, __m512d __B) {
+  // CHECK-LABEL: @test_mm512_scalef_round_pd
+  // CHECK: @llvm.x86.avx512.mask.scalef.pd.512
+  return _mm512_scalef_round_pd(__A, __B, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m512d test_mm512_mask_scalef_round_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+  // CHECK-LABEL: @test_mm512_mask_scalef_round_pd
+  // CHECK: @llvm.x86.avx512.mask.scalef.pd.512
+  return _mm512_mask_scalef_round_pd(__W, __U, __A, __B, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m512d test_mm512_maskz_scalef_round_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+  // CHECK-LABEL: @test_mm512_maskz_scalef_round_pd
+  // CHECK: @llvm.x86.avx512.mask.scalef.pd.512
+  return _mm512_maskz_scalef_round_pd(__U, __A, __B, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m512d test_mm512_scalef_pd(__m512d __A, __m512d __B) {
+  // CHECK-LABEL: @test_mm512_scalef_pd
+  // CHECK: @llvm.x86.avx512.mask.scalef.pd.512
+  return _mm512_scalef_pd(__A, __B); 
+}
+
+__m512d test_mm512_mask_scalef_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+  // CHECK-LABEL: @test_mm512_mask_scalef_pd
+  // CHECK: @llvm.x86.avx512.mask.scalef.pd.512
+  return _mm512_mask_scalef_pd(__W, __U, __A, __B); 
+}
+
+__m512d test_mm512_maskz_scalef_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+  // CHECK-LABEL: @test_mm512_maskz_scalef_pd
+  // CHECK: @llvm.x86.avx512.mask.scalef.pd.512
+  return _mm512_maskz_scalef_pd(__U, __A, __B); 
+}
+
+__m512 test_mm512_scalef_round_ps(__m512 __A, __m512 __B) {
+  // CHECK-LABEL: @test_mm512_scalef_round_ps
+  // CHECK: @llvm.x86.avx512.mask.scalef.ps.512
+  return _mm512_scalef_round_ps(__A, __B, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m512 test_mm512_mask_scalef_round_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+  // CHECK-LABEL: @test_mm512_mask_scalef_round_ps
+  // CHECK: @llvm.x86.avx512.mask.scalef.ps.512
+  return _mm512_mask_scalef_round_ps(__W, __U, __A, __B, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m512 test_mm512_maskz_scalef_round_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+  // CHECK-LABEL: @test_mm512_maskz_scalef_round_ps
+  // CHECK: @llvm.x86.avx512.mask.scalef.ps.512
+  return _mm512_maskz_scalef_round_ps(__U, __A, __B, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m512 test_mm512_scalef_ps(__m512 __A, __m512 __B) {
+  // CHECK-LABEL: @test_mm512_scalef_ps
+  // CHECK: @llvm.x86.avx512.mask.scalef.ps.512
+  return _mm512_scalef_ps(__A, __B); 
+}
+
+__m512 test_mm512_mask_scalef_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+  // CHECK-LABEL: @test_mm512_mask_scalef_ps
+  // CHECK: @llvm.x86.avx512.mask.scalef.ps.512
+  return _mm512_mask_scalef_ps(__W, __U, __A, __B); 
+}
+
+__m512 test_mm512_maskz_scalef_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+  // CHECK-LABEL: @test_mm512_maskz_scalef_ps
+  // CHECK: @llvm.x86.avx512.mask.scalef.ps.512
+  return _mm512_maskz_scalef_ps(__U, __A, __B); 
+}
+
+__m128d test_mm_scalef_round_sd(__m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_scalef_round_sd
+  // CHECK: @llvm.x86.avx512.mask.scalef
+  return _mm_scalef_round_sd(__A, __B, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m128d test_mm_scalef_sd(__m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_scalef_sd
+  // CHECK: @llvm.x86.avx512.mask.scalef
+  return _mm_scalef_sd(__A, __B); 
+}
+
+__m128d test_mm_mask_scalef_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){
+  // CHECK-LABEL: @test_mm_mask_scalef_sd
+  // CHECK: @llvm.x86.avx512.mask.scalef.sd
+  return _mm_mask_scalef_sd(__W, __U, __A, __B);
+}
+
+__m128d test_mm_mask_scalef_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){
+  // CHECK-LABEL: @test_mm_mask_scalef_round_sd
+  // CHECK: @llvm.x86.avx512.mask.scalef.sd
+    return _mm_mask_scalef_round_sd(__W, __U, __A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128d test_mm_maskz_scalef_sd(__mmask8 __U, __m128d __A, __m128d __B){
+  // CHECK-LABEL: @test_mm_maskz_scalef_sd
+  // CHECK: @llvm.x86.avx512.mask.scalef.sd
+    return _mm_maskz_scalef_sd(__U, __A, __B);
+}
+
+__m128d test_mm_maskz_scalef_round_sd(__mmask8 __U, __m128d __A, __m128d __B){
+  // CHECK-LABEL: @test_mm_maskz_scalef_round_sd
+  // CHECK: @llvm.x86.avx512.mask.scalef.sd
+    return _mm_maskz_scalef_round_sd(__U, __A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128 test_mm_scalef_round_ss(__m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_scalef_round_ss
+  // CHECK: @llvm.x86.avx512.mask.scalef.ss
+  return _mm_scalef_round_ss(__A, __B, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m128 test_mm_scalef_ss(__m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_scalef_ss
+  // CHECK: @llvm.x86.avx512.mask.scalef.ss
+  return _mm_scalef_ss(__A, __B); 
+}
+
+__m128 test_mm_mask_scalef_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){
+  // CHECK-LABEL: @test_mm_mask_scalef_ss
+  // CHECK: @llvm.x86.avx512.mask.scalef.ss
+    return _mm_mask_scalef_ss(__W, __U, __A, __B);
+}
+
+__m128 test_mm_mask_scalef_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){
+  // CHECK-LABEL: @test_mm_mask_scalef_round_ss
+  // CHECK: @llvm.x86.avx512.mask.scalef.ss
+    return _mm_mask_scalef_round_ss(__W, __U, __A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128 test_mm_maskz_scalef_ss(__mmask8 __U, __m128 __A, __m128 __B){
+  // CHECK-LABEL: @test_mm_maskz_scalef_ss
+  // CHECK: @llvm.x86.avx512.mask.scalef.ss
+    return _mm_maskz_scalef_ss(__U, __A, __B);
+}
+
+__m128 test_mm_maskz_scalef_round_ss(__mmask8 __U, __m128 __A, __m128 __B){
+  // CHECK-LABEL: @test_mm_maskz_scalef_round_ss
+  // CHECK: @llvm.x86.avx512.mask.scalef.ss
+    return _mm_maskz_scalef_round_ss(__U, __A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m512i test_mm512_srai_epi32(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_srai_epi32
+  // CHECK: @llvm.x86.avx512.mask.psra.di.512
+  return _mm512_srai_epi32(__A, 5); 
+}
+
+__m512i test_mm512_mask_srai_epi32(__m512i __W, __mmask16 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_srai_epi32
+  // CHECK: @llvm.x86.avx512.mask.psra.di.512
+  return _mm512_mask_srai_epi32(__W, __U, __A, 5); 
+}
+
+__m512i test_mm512_maskz_srai_epi32(__mmask16 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_srai_epi32
+  // CHECK: @llvm.x86.avx512.mask.psra.di.512
+  return _mm512_maskz_srai_epi32(__U, __A, 5); 
+}
+
+__m512i test_mm512_srai_epi64(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_srai_epi64
+  // CHECK: @llvm.x86.avx512.mask.psra.qi.512
+  return _mm512_srai_epi64(__A, 5); 
+}
+
+__m512i test_mm512_mask_srai_epi64(__m512i __W, __mmask8 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_srai_epi64
+  // CHECK: @llvm.x86.avx512.mask.psra.qi.512
+  return _mm512_mask_srai_epi64(__W, __U, __A, 5); 
+}
+
+__m512i test_mm512_maskz_srai_epi64(__mmask8 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_srai_epi64
+  // CHECK: @llvm.x86.avx512.mask.psra.qi.512
+  return _mm512_maskz_srai_epi64(__U, __A, 5); 
+}
+
+__m512i test_mm512_sll_epi32(__m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_sll_epi32
+  // CHECK: @llvm.x86.avx512.mask.psll.d
+  return _mm512_sll_epi32(__A, __B); 
+}
+
+__m512i test_mm512_mask_sll_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_mask_sll_epi32
+  // CHECK: @llvm.x86.avx512.mask.psll.d
+  return _mm512_mask_sll_epi32(__W, __U, __A, __B); 
+}
+
+__m512i test_mm512_maskz_sll_epi32(__mmask16 __U, __m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_sll_epi32
+  // CHECK: @llvm.x86.avx512.mask.psll.d
+  return _mm512_maskz_sll_epi32(__U, __A, __B); 
+}
+
+__m512i test_mm512_sll_epi64(__m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_sll_epi64
+  // CHECK: @llvm.x86.avx512.mask.psll.q
+  return _mm512_sll_epi64(__A, __B); 
+}
+
+__m512i test_mm512_mask_sll_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_mask_sll_epi64
+  // CHECK: @llvm.x86.avx512.mask.psll.q
+  return _mm512_mask_sll_epi64(__W, __U, __A, __B); 
+}
+
+__m512i test_mm512_maskz_sll_epi64(__mmask8 __U, __m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_sll_epi64
+  // CHECK: @llvm.x86.avx512.mask.psll.q
+  return _mm512_maskz_sll_epi64(__U, __A, __B); 
+}
+
+__m512i test_mm512_sllv_epi32(__m512i __X, __m512i __Y) {
+  // CHECK-LABEL: @test_mm512_sllv_epi32
+  // CHECK: @llvm.x86.avx512.mask.psllv.d
+  return _mm512_sllv_epi32(__X, __Y); 
+}
+
+__m512i test_mm512_mask_sllv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y) {
+  // CHECK-LABEL: @test_mm512_mask_sllv_epi32
+  // CHECK: @llvm.x86.avx512.mask.psllv.d
+  return _mm512_mask_sllv_epi32(__W, __U, __X, __Y); 
+}
+
+__m512i test_mm512_maskz_sllv_epi32(__mmask16 __U, __m512i __X, __m512i __Y) {
+  // CHECK-LABEL: @test_mm512_maskz_sllv_epi32
+  // CHECK: @llvm.x86.avx512.mask.psllv.d
+  return _mm512_maskz_sllv_epi32(__U, __X, __Y); 
+}
+
+__m512i test_mm512_sllv_epi64(__m512i __X, __m512i __Y) {
+  // CHECK-LABEL: @test_mm512_sllv_epi64
+  // CHECK: @llvm.x86.avx512.mask.psllv.q
+  return _mm512_sllv_epi64(__X, __Y); 
+}
+
+__m512i test_mm512_mask_sllv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y) {
+  // CHECK-LABEL: @test_mm512_mask_sllv_epi64
+  // CHECK: @llvm.x86.avx512.mask.psllv.q
+  return _mm512_mask_sllv_epi64(__W, __U, __X, __Y); 
+}
+
+__m512i test_mm512_maskz_sllv_epi64(__mmask8 __U, __m512i __X, __m512i __Y) {
+  // CHECK-LABEL: @test_mm512_maskz_sllv_epi64
+  // CHECK: @llvm.x86.avx512.mask.psllv.q
+  return _mm512_maskz_sllv_epi64(__U, __X, __Y); 
+}
+
+__m512i test_mm512_sra_epi32(__m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_sra_epi32
+  // CHECK: @llvm.x86.avx512.mask.psra.d
+  return _mm512_sra_epi32(__A, __B); 
+}
+
+__m512i test_mm512_mask_sra_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_mask_sra_epi32
+  // CHECK: @llvm.x86.avx512.mask.psra.d
+  return _mm512_mask_sra_epi32(__W, __U, __A, __B); 
+}
+
+__m512i test_mm512_maskz_sra_epi32(__mmask16 __U, __m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_sra_epi32
+  // CHECK: @llvm.x86.avx512.mask.psra.d
+  return _mm512_maskz_sra_epi32(__U, __A, __B); 
+}
+
+__m512i test_mm512_sra_epi64(__m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_sra_epi64
+  // CHECK: @llvm.x86.avx512.mask.psra.q
+  return _mm512_sra_epi64(__A, __B); 
+}
+
+__m512i test_mm512_mask_sra_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_mask_sra_epi64
+  // CHECK: @llvm.x86.avx512.mask.psra.q
+  return _mm512_mask_sra_epi64(__W, __U, __A, __B); 
+}
+
+__m512i test_mm512_maskz_sra_epi64(__mmask8 __U, __m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_sra_epi64
+  // CHECK: @llvm.x86.avx512.mask.psra.q
+  return _mm512_maskz_sra_epi64(__U, __A, __B); 
+}
+
+__m512i test_mm512_srav_epi32(__m512i __X, __m512i __Y) {
+  // CHECK-LABEL: @test_mm512_srav_epi32
+  // CHECK: @llvm.x86.avx512.mask.psrav.d
+  return _mm512_srav_epi32(__X, __Y); 
+}
+
+__m512i test_mm512_mask_srav_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y) {
+  // CHECK-LABEL: @test_mm512_mask_srav_epi32
+  // CHECK: @llvm.x86.avx512.mask.psrav.d
+  return _mm512_mask_srav_epi32(__W, __U, __X, __Y); 
+}
+
+__m512i test_mm512_maskz_srav_epi32(__mmask16 __U, __m512i __X, __m512i __Y) {
+  // CHECK-LABEL: @test_mm512_maskz_srav_epi32
+  // CHECK: @llvm.x86.avx512.mask.psrav.d
+  return _mm512_maskz_srav_epi32(__U, __X, __Y); 
+}
+
+__m512i test_mm512_srav_epi64(__m512i __X, __m512i __Y) {
+  // CHECK-LABEL: @test_mm512_srav_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrav.q
+  return _mm512_srav_epi64(__X, __Y); 
+}
+
+__m512i test_mm512_mask_srav_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y) {
+  // CHECK-LABEL: @test_mm512_mask_srav_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrav.q
+  return _mm512_mask_srav_epi64(__W, __U, __X, __Y); 
+}
+
+__m512i test_mm512_maskz_srav_epi64(__mmask8 __U, __m512i __X, __m512i __Y) {
+  // CHECK-LABEL: @test_mm512_maskz_srav_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrav.q
+  return _mm512_maskz_srav_epi64(__U, __X, __Y); 
+}
+
+__m512i test_mm512_srl_epi32(__m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_srl_epi32
+  // CHECK: @llvm.x86.avx512.mask.psrl.d
+  return _mm512_srl_epi32(__A, __B); 
+}
+
+__m512i test_mm512_mask_srl_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_mask_srl_epi32
+  // CHECK: @llvm.x86.avx512.mask.psrl.d
+  return _mm512_mask_srl_epi32(__W, __U, __A, __B); 
+}
+
+__m512i test_mm512_maskz_srl_epi32(__mmask16 __U, __m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_srl_epi32
+  // CHECK: @llvm.x86.avx512.mask.psrl.d
+  return _mm512_maskz_srl_epi32(__U, __A, __B); 
+}
+
+__m512i test_mm512_srl_epi64(__m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_srl_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrl.q
+  return _mm512_srl_epi64(__A, __B); 
+}
+
+__m512i test_mm512_mask_srl_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_mask_srl_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrl.q
+  return _mm512_mask_srl_epi64(__W, __U, __A, __B); 
+}
+
+__m512i test_mm512_maskz_srl_epi64(__mmask8 __U, __m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_srl_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrl.q
+  return _mm512_maskz_srl_epi64(__U, __A, __B); 
+}
+
+__m512i test_mm512_srlv_epi32(__m512i __X, __m512i __Y) {
+  // CHECK-LABEL: @test_mm512_srlv_epi32
+  // CHECK: @llvm.x86.avx512.mask.psrlv.d
+  return _mm512_srlv_epi32(__X, __Y); 
+}
+
+__m512i test_mm512_mask_srlv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y) {
+  // CHECK-LABEL: @test_mm512_mask_srlv_epi32
+  // CHECK: @llvm.x86.avx512.mask.psrlv.d
+  return _mm512_mask_srlv_epi32(__W, __U, __X, __Y); 
+}
+
+__m512i test_mm512_maskz_srlv_epi32(__mmask16 __U, __m512i __X, __m512i __Y) {
+  // CHECK-LABEL: @test_mm512_maskz_srlv_epi32
+  // CHECK: @llvm.x86.avx512.mask.psrlv.d
+  return _mm512_maskz_srlv_epi32(__U, __X, __Y); 
+}
+
+__m512i test_mm512_srlv_epi64(__m512i __X, __m512i __Y) {
+  // CHECK-LABEL: @test_mm512_srlv_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrlv.q
+  return _mm512_srlv_epi64(__X, __Y); 
+}
+
+__m512i test_mm512_mask_srlv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y) {
+  // CHECK-LABEL: @test_mm512_mask_srlv_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrlv.q
+  return _mm512_mask_srlv_epi64(__W, __U, __X, __Y); 
+}
+
+__m512i test_mm512_maskz_srlv_epi64(__mmask8 __U, __m512i __X, __m512i __Y) {
+  // CHECK-LABEL: @test_mm512_maskz_srlv_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrlv.q
+  return _mm512_maskz_srlv_epi64(__U, __X, __Y); 
+}
+
+__m512i test_mm512_ternarylogic_epi32(__m512i __A, __m512i __B, __m512i __C) {
+  // CHECK-LABEL: @test_mm512_ternarylogic_epi32
+  // CHECK: @llvm.x86.avx512.mask.pternlog.d.512
+  return _mm512_ternarylogic_epi32(__A, __B, __C, 4); 
+}
+
+__m512i test_mm512_mask_ternarylogic_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
+  // CHECK-LABEL: @test_mm512_mask_ternarylogic_epi32
+  // CHECK: @llvm.x86.avx512.mask.pternlog.d.512
+  return _mm512_mask_ternarylogic_epi32(__A, __U, __B, __C, 4); 
+}
+
+__m512i test_mm512_maskz_ternarylogic_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C) {
+  // CHECK-LABEL: @test_mm512_maskz_ternarylogic_epi32
+  // CHECK: @llvm.x86.avx512.maskz.pternlog.d.512
+  return _mm512_maskz_ternarylogic_epi32(__U, __A, __B, __C, 4); 
+}
+
+__m512i test_mm512_ternarylogic_epi64(__m512i __A, __m512i __B, __m512i __C) {
+  // CHECK-LABEL: @test_mm512_ternarylogic_epi64
+  // CHECK: @llvm.x86.avx512.mask.pternlog.q.512
+  return _mm512_ternarylogic_epi64(__A, __B, __C, 4); 
+}
+
+__m512i test_mm512_mask_ternarylogic_epi64(__m512i __A, __mmask8 __U, __m512i __B, __m512i __C) {
+  // CHECK-LABEL: @test_mm512_mask_ternarylogic_epi64
+  // CHECK: @llvm.x86.avx512.mask.pternlog.q.512
+  return _mm512_mask_ternarylogic_epi64(__A, __U, __B, __C, 4); 
+}
+
+__m512i test_mm512_maskz_ternarylogic_epi64(__mmask8 __U, __m512i __A, __m512i __B, __m512i __C) {
+  // CHECK-LABEL: @test_mm512_maskz_ternarylogic_epi64
+  // CHECK: @llvm.x86.avx512.maskz.pternlog.q.512
+  return _mm512_maskz_ternarylogic_epi64(__U, __A, __B, __C, 4); 
+}
+
+__m512 test_mm512_shuffle_f32x4(__m512 __A, __m512 __B) {
+  // CHECK-LABEL: @test_mm512_shuffle_f32x4
+  // CHECK: @llvm.x86.avx512.mask.shuf.f32x4
+  return _mm512_shuffle_f32x4(__A, __B, 4); 
+}
+
+__m512 test_mm512_mask_shuffle_f32x4(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+  // CHECK-LABEL: @test_mm512_mask_shuffle_f32x4
+  // CHECK: @llvm.x86.avx512.mask.shuf.f32x4
+  return _mm512_mask_shuffle_f32x4(__W, __U, __A, __B, 4); 
+}
+
+__m512 test_mm512_maskz_shuffle_f32x4(__mmask16 __U, __m512 __A, __m512 __B) {
+  // CHECK-LABEL: @test_mm512_maskz_shuffle_f32x4
+  // CHECK: @llvm.x86.avx512.mask.shuf.f32x4
+  return _mm512_maskz_shuffle_f32x4(__U, __A, __B, 4); 
+}
+
+__m512d test_mm512_shuffle_f64x2(__m512d __A, __m512d __B) {
+  // CHECK-LABEL: @test_mm512_shuffle_f64x2
+  // CHECK: @llvm.x86.avx512.mask.shuf.f64x2
+  return _mm512_shuffle_f64x2(__A, __B, 4); 
+}
+
+__m512d test_mm512_mask_shuffle_f64x2(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+  // CHECK-LABEL: @test_mm512_mask_shuffle_f64x2
+  // CHECK: @llvm.x86.avx512.mask.shuf.f64x2
+  return _mm512_mask_shuffle_f64x2(__W, __U, __A, __B, 4); 
+}
+
+__m512d test_mm512_maskz_shuffle_f64x2(__mmask8 __U, __m512d __A, __m512d __B) {
+  // CHECK-LABEL: @test_mm512_maskz_shuffle_f64x2
+  // CHECK: @llvm.x86.avx512.mask.shuf.f64x2
+  return _mm512_maskz_shuffle_f64x2(__U, __A, __B, 4); 
+}
+
+__m512i test_mm512_shuffle_i32x4(__m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_shuffle_i32x4
+  // CHECK: @llvm.x86.avx512.mask.shuf.i32x4
+  return _mm512_shuffle_i32x4(__A, __B, 4); 
+}
+
+__m512i test_mm512_mask_shuffle_i32x4(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_mask_shuffle_i32x4
+  // CHECK: @llvm.x86.avx512.mask.shuf.i32x4
+  return _mm512_mask_shuffle_i32x4(__W, __U, __A, __B, 4); 
+}
+
+__m512i test_mm512_maskz_shuffle_i32x4(__mmask16 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_shuffle_i32x4
+  // CHECK: @llvm.x86.avx512.mask.shuf.i32x4
+  return _mm512_maskz_shuffle_i32x4(__U, __A, __B, 4); 
+}
+
+__m512i test_mm512_shuffle_i64x2(__m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_shuffle_i64x2
+  // CHECK: @llvm.x86.avx512.mask.shuf.i64x2
+  return _mm512_shuffle_i64x2(__A, __B, 4); 
+}
+
+__m512i test_mm512_mask_shuffle_i64x2(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_mask_shuffle_i64x2
+  // CHECK: @llvm.x86.avx512.mask.shuf.i64x2
+  return _mm512_mask_shuffle_i64x2(__W, __U, __A, __B, 4); 
+}
+
+__m512i test_mm512_maskz_shuffle_i64x2(__mmask8 __U, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_shuffle_i64x2
+  // CHECK: @llvm.x86.avx512.mask.shuf.i64x2
+  return _mm512_maskz_shuffle_i64x2(__U, __A, __B, 4); 
+}
+
+__m512d test_mm512_shuffle_pd(__m512d __M, __m512d __V) {
+  // CHECK-LABEL: @test_mm512_shuffle_pd
+  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> <i32 0, i32 8, i32 3, i32 10, i32 4, i32 12, i32 6, i32 14>
+  return _mm512_shuffle_pd(__M, __V, 4); 
+}
+
+__m512d test_mm512_mask_shuffle_pd(__m512d __W, __mmask8 __U, __m512d __M, __m512d __V) {
+  // CHECK-LABEL: @test_mm512_mask_shuffle_pd
+  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> <i32 0, i32 8, i32 3, i32 10, i32 4, i32 12, i32 6, i32 14>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+  return _mm512_mask_shuffle_pd(__W, __U, __M, __V, 4); 
+}
+
+__m512d test_mm512_maskz_shuffle_pd(__mmask8 __U, __m512d __M, __m512d __V) {
+  // CHECK-LABEL: @test_mm512_maskz_shuffle_pd
+  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> <i32 0, i32 8, i32 3, i32 10, i32 4, i32 12, i32 6, i32 14>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+  return _mm512_maskz_shuffle_pd(__U, __M, __V, 4); 
+}
+
+__m512 test_mm512_shuffle_ps(__m512 __M, __m512 __V) {
+  // CHECK-LABEL: @test_mm512_shuffle_ps
+  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 16, i32 16, i32 4, i32 5, i32 20, i32 20, i32 8, i32 9, i32 24, i32 24, i32 12, i32 13, i32 28, i32 28>
+  return _mm512_shuffle_ps(__M, __V, 4); 
+}
+
+__m512 test_mm512_mask_shuffle_ps(__m512 __W, __mmask16 __U, __m512 __M, __m512 __V) {
+  // CHECK-LABEL: @test_mm512_mask_shuffle_ps
+  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 16, i32 16, i32 4, i32 5, i32 20, i32 20, i32 8, i32 9, i32 24, i32 24, i32 12, i32 13, i32 28, i32 28>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+  return _mm512_mask_shuffle_ps(__W, __U, __M, __V, 4); 
+}
+
+__m512 test_mm512_maskz_shuffle_ps(__mmask16 __U, __m512 __M, __m512 __V) {
+  // CHECK-LABEL: @test_mm512_maskz_shuffle_ps
+  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 16, i32 16, i32 4, i32 5, i32 20, i32 20, i32 8, i32 9, i32 24, i32 24, i32 12, i32 13, i32 28, i32 28>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+  return _mm512_maskz_shuffle_ps(__U, __M, __V, 4); 
+}
+
+__m128d test_mm_sqrt_round_sd(__m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_sqrt_round_sd
+  // CHECK: @llvm.x86.avx512.mask.sqrt.sd
+  return _mm_sqrt_round_sd(__A, __B, 4); 
+}
+
+__m128d test_mm_mask_sqrt_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){
+  // CHECK: @llvm.x86.avx512.mask.sqrt.sd
+    return _mm_mask_sqrt_sd(__W,__U,__A,__B);
+}
+
+__m128d test_mm_mask_sqrt_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){
+  // CHECK: @llvm.x86.avx512.mask.sqrt.sd
+    return _mm_mask_sqrt_round_sd(__W,__U,__A,__B,_MM_FROUND_CUR_DIRECTION);
+}
+
+__m128d test_mm_maskz_sqrt_sd(__mmask8 __U, __m128d __A, __m128d __B){
+  // CHECK: @llvm.x86.avx512.mask.sqrt.sd
+    return _mm_maskz_sqrt_sd(__U,__A,__B);
+}
+
+__m128d test_mm_maskz_sqrt_round_sd(__mmask8 __U, __m128d __A, __m128d __B){
+  // CHECK: @llvm.x86.avx512.mask.sqrt.sd
+    return _mm_maskz_sqrt_round_sd(__U,__A,__B,_MM_FROUND_CUR_DIRECTION);
+}
+
+__m128 test_mm_sqrt_round_ss(__m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_sqrt_round_ss
+  // CHECK: @llvm.x86.avx512.mask.sqrt.ss
+  return _mm_sqrt_round_ss(__A, __B, 4); 
+}
+
+__m128 test_mm_mask_sqrt_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){
+  // CHECK: @llvm.x86.avx512.mask.sqrt.ss
+    return _mm_mask_sqrt_ss(__W,__U,__A,__B);
+}
+
+__m128 test_mm_mask_sqrt_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){
+  // CHECK: @llvm.x86.avx512.mask.sqrt.ss
+    return _mm_mask_sqrt_round_ss(__W,__U,__A,__B,_MM_FROUND_CUR_DIRECTION);
+}
+
+__m128 test_mm_maskz_sqrt_ss(__mmask8 __U, __m128 __A, __m128 __B){
+  // CHECK: @llvm.x86.avx512.mask.sqrt.ss
+    return _mm_maskz_sqrt_ss(__U,__A,__B);
+}
+
+__m128 test_mm_maskz_sqrt_round_ss(__mmask8 __U, __m128 __A, __m128 __B){
+  // CHECK: @llvm.x86.avx512.mask.sqrt.ss
+    return _mm_maskz_sqrt_round_ss(__U,__A,__B,_MM_FROUND_CUR_DIRECTION);
+}
+
+__m512 test_mm512_broadcast_f32x4(__m128 __A) {
+  // CHECK-LABEL: @test_mm512_broadcast_f32x4
+  // CHECK: @llvm.x86.avx512.mask.broadcastf32x4
+  return _mm512_broadcast_f32x4(__A); 
+}
+
+__m512 test_mm512_mask_broadcast_f32x4(__m512 __O, __mmask16 __M, __m128 __A) {
+  // CHECK-LABEL: @test_mm512_mask_broadcast_f32x4
+  // CHECK: @llvm.x86.avx512.mask.broadcastf32x4
+  return _mm512_mask_broadcast_f32x4(__O, __M, __A); 
+}
+
+__m512 test_mm512_maskz_broadcast_f32x4(__mmask16 __M, __m128 __A) {
+  // CHECK-LABEL: @test_mm512_maskz_broadcast_f32x4
+  // CHECK: @llvm.x86.avx512.mask.broadcastf32x4
+  return _mm512_maskz_broadcast_f32x4(__M, __A); 
+}
+
+__m512d test_mm512_broadcast_f64x4(__m256d __A) {
+  // CHECK-LABEL: @test_mm512_broadcast_f64x4
+  // CHECK: @llvm.x86.avx512.mask.broadcastf64x4
+  return _mm512_broadcast_f64x4(__A); 
+}
+
+__m512d test_mm512_mask_broadcast_f64x4(__m512d __O, __mmask8 __M, __m256d __A) {
+  // CHECK-LABEL: @test_mm512_mask_broadcast_f64x4
+  // CHECK: @llvm.x86.avx512.mask.broadcastf64x4
+  return _mm512_mask_broadcast_f64x4(__O, __M, __A); 
+}
+
+__m512d test_mm512_maskz_broadcast_f64x4(__mmask8 __M, __m256d __A) {
+  // CHECK-LABEL: @test_mm512_maskz_broadcast_f64x4
+  // CHECK: @llvm.x86.avx512.mask.broadcastf64x4
+  return _mm512_maskz_broadcast_f64x4(__M, __A); 
+}
+
+__m512i test_mm512_broadcast_i32x4(__m128i __A) {
+  // CHECK-LABEL: @test_mm512_broadcast_i32x4
+  // CHECK: @llvm.x86.avx512.mask.broadcasti32x4
+  return _mm512_broadcast_i32x4(__A); 
+}
+
+__m512i test_mm512_mask_broadcast_i32x4(__m512i __O, __mmask16 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm512_mask_broadcast_i32x4
+  // CHECK: @llvm.x86.avx512.mask.broadcasti32x4
+  return _mm512_mask_broadcast_i32x4(__O, __M, __A); 
+}
+
+__m512i test_mm512_maskz_broadcast_i32x4(__mmask16 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_broadcast_i32x4
+  // CHECK: @llvm.x86.avx512.mask.broadcasti32x4
+  return _mm512_maskz_broadcast_i32x4(__M, __A); 
+}
+
+__m512i test_mm512_broadcast_i64x4(__m256i __A) {
+  // CHECK-LABEL: @test_mm512_broadcast_i64x4
+  // CHECK: @llvm.x86.avx512.mask.broadcasti64x4
+  return _mm512_broadcast_i64x4(__A); 
+}
+
+__m512i test_mm512_mask_broadcast_i64x4(__m512i __O, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm512_mask_broadcast_i64x4
+  // CHECK: @llvm.x86.avx512.mask.broadcasti64x4
+  return _mm512_mask_broadcast_i64x4(__O, __M, __A); 
+}
+
+__m512i test_mm512_maskz_broadcast_i64x4(__mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_broadcast_i64x4
+  // CHECK: @llvm.x86.avx512.mask.broadcasti64x4
+  return _mm512_maskz_broadcast_i64x4(__M, __A); 
+}
+
+__m512d test_mm512_broadcastsd_pd(__m128d __A) {
+  // CHECK-LABEL: @test_mm512_broadcastsd_pd
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> undef, <8 x i32> zeroinitializer
+  return _mm512_broadcastsd_pd(__A);
+}
+
+__m512d test_mm512_mask_broadcastsd_pd(__m512d __O, __mmask8 __M, __m128d __A) {
+  // CHECK-LABEL: @test_mm512_mask_broadcastsd_pd
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> undef, <8 x i32> zeroinitializer
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+  return _mm512_mask_broadcastsd_pd(__O, __M, __A);
+}
+
+__m512d test_mm512_maskz_broadcastsd_pd(__mmask8 __M, __m128d __A) {
+  // CHECK-LABEL: @test_mm512_maskz_broadcastsd_pd
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> undef, <8 x i32> zeroinitializer
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+  return _mm512_maskz_broadcastsd_pd(__M, __A);
+}
+
+__m512 test_mm512_broadcastss_ps(__m128 __A) {
+  // CHECK-LABEL: @test_mm512_broadcastss_ps
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> undef, <16 x i32> zeroinitializer
+  return _mm512_broadcastss_ps(__A);
+}
+
+__m512 test_mm512_mask_broadcastss_ps(__m512 __O, __mmask16 __M, __m128 __A) {
+  // CHECK-LABEL: @test_mm512_mask_broadcastss_ps
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> undef, <16 x i32> zeroinitializer
+  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+  return _mm512_mask_broadcastss_ps(__O, __M, __A);
+}
+
+__m512 test_mm512_maskz_broadcastss_ps(__mmask16 __M, __m128 __A) {
+  // CHECK-LABEL: @test_mm512_maskz_broadcastss_ps
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> undef, <16 x i32> zeroinitializer
+  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+  return _mm512_maskz_broadcastss_ps(__M, __A);
+}
+
+__m512i test_mm512_broadcastd_epi32(__m128i __A) {
+  // CHECK-LABEL: @test_mm512_broadcastd_epi32
+  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <16 x i32> zeroinitializer
+  return _mm512_broadcastd_epi32(__A);
+}
+
+__m512i test_mm512_mask_broadcastd_epi32(__m512i __O, __mmask16 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm512_mask_broadcastd_epi32
+  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <16 x i32> zeroinitializer
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+  return _mm512_mask_broadcastd_epi32(__O, __M, __A);
+}
+
+__m512i test_mm512_maskz_broadcastd_epi32(__mmask16 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_broadcastd_epi32
+  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <16 x i32> zeroinitializer
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+  return _mm512_maskz_broadcastd_epi32(__M, __A);
+}
+
+__m512i test_mm512_broadcastq_epi64(__m128i __A) {
+  // CHECK-LABEL: @test_mm512_broadcastq_epi64
+  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> undef, <8 x i32> zeroinitializer
+  return _mm512_broadcastq_epi64(__A);
+}
+
+__m512i test_mm512_mask_broadcastq_epi64(__m512i __O, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm512_mask_broadcastq_epi64
+  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> undef, <8 x i32> zeroinitializer
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
+  return _mm512_mask_broadcastq_epi64(__O, __M, __A);
+}
+
+__m512i test_mm512_maskz_broadcastq_epi64(__mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_broadcastq_epi64
+  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> undef, <8 x i32> zeroinitializer
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
+  return _mm512_maskz_broadcastq_epi64(__M, __A);
+}
+
+__m128i test_mm512_cvtsepi32_epi8(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_cvtsepi32_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovs.db.512
+  return _mm512_cvtsepi32_epi8(__A); 
+}
+
+__m128i test_mm512_mask_cvtsepi32_epi8(__m128i __O, __mmask16 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtsepi32_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovs.db.512
+  return _mm512_mask_cvtsepi32_epi8(__O, __M, __A); 
+}
+
+__m128i test_mm512_maskz_cvtsepi32_epi8(__mmask16 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_cvtsepi32_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovs.db.512
+  return _mm512_maskz_cvtsepi32_epi8(__M, __A); 
+}
+
+void test_mm512_mask_cvtsepi32_storeu_epi8(void * __P, __mmask16 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtsepi32_storeu_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovs.db.mem.512
+  return _mm512_mask_cvtsepi32_storeu_epi8(__P, __M, __A); 
+}
+
+__m256i test_mm512_cvtsepi32_epi16(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_cvtsepi32_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovs.dw.512
+  return _mm512_cvtsepi32_epi16(__A); 
+}
+
+__m256i test_mm512_mask_cvtsepi32_epi16(__m256i __O, __mmask16 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtsepi32_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovs.dw.512
+  return _mm512_mask_cvtsepi32_epi16(__O, __M, __A); 
+}
+
+__m256i test_mm512_maskz_cvtsepi32_epi16(__mmask16 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_cvtsepi32_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovs.dw.512
+  return _mm512_maskz_cvtsepi32_epi16(__M, __A); 
+}
+
+void test_mm512_mask_cvtsepi32_storeu_epi16(void *__P, __mmask16 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtsepi32_storeu_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovs.dw.mem.512
+  return _mm512_mask_cvtsepi32_storeu_epi16(__P, __M, __A); 
+}
+
+__m128i test_mm512_cvtsepi64_epi8(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_cvtsepi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qb.512
+  return _mm512_cvtsepi64_epi8(__A); 
+}
+
+__m128i test_mm512_mask_cvtsepi64_epi8(__m128i __O, __mmask8 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtsepi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qb.512
+  return _mm512_mask_cvtsepi64_epi8(__O, __M, __A); 
+}
+
+__m128i test_mm512_maskz_cvtsepi64_epi8(__mmask8 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_cvtsepi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qb.512
+  return _mm512_maskz_cvtsepi64_epi8(__M, __A); 
+}
+
+void test_mm512_mask_cvtsepi64_storeu_epi8(void * __P, __mmask8 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtsepi64_storeu_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qb.mem.512
+  return _mm512_mask_cvtsepi64_storeu_epi8(__P, __M, __A); 
+}
+
+__m256i test_mm512_cvtsepi64_epi32(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_cvtsepi64_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qd.512
+  return _mm512_cvtsepi64_epi32(__A); 
+}
+
+__m256i test_mm512_mask_cvtsepi64_epi32(__m256i __O, __mmask8 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtsepi64_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qd.512
+  return _mm512_mask_cvtsepi64_epi32(__O, __M, __A); 
+}
+
+__m256i test_mm512_maskz_cvtsepi64_epi32(__mmask8 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_cvtsepi64_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qd.512
+  return _mm512_maskz_cvtsepi64_epi32(__M, __A); 
+}
+
+void test_mm512_mask_cvtsepi64_storeu_epi32(void *__P, __mmask8 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtsepi64_storeu_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qd.mem.512
+  return _mm512_mask_cvtsepi64_storeu_epi32(__P, __M, __A); 
+}
+
+__m128i test_mm512_cvtsepi64_epi16(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_cvtsepi64_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qw.512
+  return _mm512_cvtsepi64_epi16(__A); 
+}
+
+__m128i test_mm512_mask_cvtsepi64_epi16(__m128i __O, __mmask8 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtsepi64_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qw.512
+  return _mm512_mask_cvtsepi64_epi16(__O, __M, __A); 
+}
+
+__m128i test_mm512_maskz_cvtsepi64_epi16(__mmask8 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_cvtsepi64_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qw.512
+  return _mm512_maskz_cvtsepi64_epi16(__M, __A); 
+}
+
+void test_mm512_mask_cvtsepi64_storeu_epi16(void * __P, __mmask8 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtsepi64_storeu_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qw.mem.512
+  return _mm512_mask_cvtsepi64_storeu_epi16(__P, __M, __A); 
+}
+
+__m128i test_mm512_cvtusepi32_epi8(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_cvtusepi32_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovus.db.512
+  return _mm512_cvtusepi32_epi8(__A); 
+}
+
+__m128i test_mm512_mask_cvtusepi32_epi8(__m128i __O, __mmask16 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtusepi32_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovus.db.512
+  return _mm512_mask_cvtusepi32_epi8(__O, __M, __A); 
+}
+
+__m128i test_mm512_maskz_cvtusepi32_epi8(__mmask16 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_cvtusepi32_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovus.db.512
+  return _mm512_maskz_cvtusepi32_epi8(__M, __A); 
+}
+
+void test_mm512_mask_cvtusepi32_storeu_epi8(void * __P, __mmask16 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtusepi32_storeu_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovus.db.mem.512
+  return _mm512_mask_cvtusepi32_storeu_epi8(__P, __M, __A); 
+}
+
+__m256i test_mm512_cvtusepi32_epi16(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_cvtusepi32_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovus.dw.512
+  return _mm512_cvtusepi32_epi16(__A); 
+}
+
+__m256i test_mm512_mask_cvtusepi32_epi16(__m256i __O, __mmask16 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtusepi32_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovus.dw.512
+  return _mm512_mask_cvtusepi32_epi16(__O, __M, __A); 
+}
+
+__m256i test_mm512_maskz_cvtusepi32_epi16(__mmask16 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_cvtusepi32_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovus.dw.512
+  return _mm512_maskz_cvtusepi32_epi16(__M, __A); 
+}
+
+void test_mm512_mask_cvtusepi32_storeu_epi16(void *__P, __mmask16 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtusepi32_storeu_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovus.dw.mem.512
+  return _mm512_mask_cvtusepi32_storeu_epi16(__P, __M, __A); 
+}
+
+__m128i test_mm512_cvtusepi64_epi8(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_cvtusepi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qb.512
+  return _mm512_cvtusepi64_epi8(__A); 
+}
+
+__m128i test_mm512_mask_cvtusepi64_epi8(__m128i __O, __mmask8 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtusepi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qb.512
+  return _mm512_mask_cvtusepi64_epi8(__O, __M, __A); 
+}
+
+__m128i test_mm512_maskz_cvtusepi64_epi8(__mmask8 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_cvtusepi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qb.512
+  return _mm512_maskz_cvtusepi64_epi8(__M, __A); 
+}
+
+void test_mm512_mask_cvtusepi64_storeu_epi8(void * __P, __mmask8 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtusepi64_storeu_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qb.mem.512
+  return _mm512_mask_cvtusepi64_storeu_epi8(__P, __M, __A); 
+}
+
+__m256i test_mm512_cvtusepi64_epi32(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_cvtusepi64_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qd.512
+  return _mm512_cvtusepi64_epi32(__A); 
+}
+
+__m256i test_mm512_mask_cvtusepi64_epi32(__m256i __O, __mmask8 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtusepi64_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qd.512
+  return _mm512_mask_cvtusepi64_epi32(__O, __M, __A); 
+}
+
+__m256i test_mm512_maskz_cvtusepi64_epi32(__mmask8 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_cvtusepi64_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qd.512
+  return _mm512_maskz_cvtusepi64_epi32(__M, __A); 
+}
+
+void test_mm512_mask_cvtusepi64_storeu_epi32(void* __P, __mmask8 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtusepi64_storeu_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qd.mem.512
+  return _mm512_mask_cvtusepi64_storeu_epi32(__P, __M, __A); 
+}
+
+__m128i test_mm512_cvtusepi64_epi16(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_cvtusepi64_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qw.512
+  return _mm512_cvtusepi64_epi16(__A); 
+}
+
+__m128i test_mm512_mask_cvtusepi64_epi16(__m128i __O, __mmask8 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtusepi64_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qw.512
+  return _mm512_mask_cvtusepi64_epi16(__O, __M, __A); 
+}
+
+__m128i test_mm512_maskz_cvtusepi64_epi16(__mmask8 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_cvtusepi64_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qw.512
+  return _mm512_maskz_cvtusepi64_epi16(__M, __A); 
+}
+
+void test_mm512_mask_cvtusepi64_storeu_epi16(void *__P, __mmask8 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtusepi64_storeu_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qw.mem.512
+  return _mm512_mask_cvtusepi64_storeu_epi16(__P, __M, __A); 
+}
+
+__m128i test_mm512_cvtepi32_epi8(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_cvtepi32_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmov.db.512
+  return _mm512_cvtepi32_epi8(__A); 
+}
+
+__m128i test_mm512_mask_cvtepi32_epi8(__m128i __O, __mmask16 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtepi32_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmov.db.512
+  return _mm512_mask_cvtepi32_epi8(__O, __M, __A); 
+}
+
+__m128i test_mm512_maskz_cvtepi32_epi8(__mmask16 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_cvtepi32_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmov.db.512
+  return _mm512_maskz_cvtepi32_epi8(__M, __A); 
+}
+
+void test_mm512_mask_cvtepi32_storeu_epi8(void * __P, __mmask16 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtepi32_storeu_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmov.db.mem.512
+  return _mm512_mask_cvtepi32_storeu_epi8(__P, __M, __A); 
+}
+
+__m256i test_mm512_cvtepi32_epi16(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_cvtepi32_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmov.dw.512
+  return _mm512_cvtepi32_epi16(__A); 
+}
+
+__m256i test_mm512_mask_cvtepi32_epi16(__m256i __O, __mmask16 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtepi32_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmov.dw.512
+  return _mm512_mask_cvtepi32_epi16(__O, __M, __A); 
+}
+
+__m256i test_mm512_maskz_cvtepi32_epi16(__mmask16 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_cvtepi32_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmov.dw.512
+  return _mm512_maskz_cvtepi32_epi16(__M, __A); 
+}
+
+void test_mm512_mask_cvtepi32_storeu_epi16(void * __P, __mmask16 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtepi32_storeu_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmov.dw.mem.512
+  return _mm512_mask_cvtepi32_storeu_epi16(__P, __M, __A); 
+}
+
+__m128i test_mm512_cvtepi64_epi8(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_cvtepi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmov.qb.512
+  return _mm512_cvtepi64_epi8(__A); 
+}
+
+__m128i test_mm512_mask_cvtepi64_epi8(__m128i __O, __mmask8 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtepi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmov.qb.512
+  return _mm512_mask_cvtepi64_epi8(__O, __M, __A); 
+}
+
+__m128i test_mm512_maskz_cvtepi64_epi8(__mmask8 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_cvtepi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmov.qb.512
+  return _mm512_maskz_cvtepi64_epi8(__M, __A); 
+}
+
+void test_mm512_mask_cvtepi64_storeu_epi8(void * __P, __mmask8 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtepi64_storeu_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmov.qb.mem.512
+  return _mm512_mask_cvtepi64_storeu_epi8(__P, __M, __A); 
+}
+
+__m256i test_mm512_cvtepi64_epi32(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_cvtepi64_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmov.qd.512
+  return _mm512_cvtepi64_epi32(__A); 
+}
+
+__m256i test_mm512_mask_cvtepi64_epi32(__m256i __O, __mmask8 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtepi64_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmov.qd.512
+  return _mm512_mask_cvtepi64_epi32(__O, __M, __A); 
+}
+
+__m256i test_mm512_maskz_cvtepi64_epi32(__mmask8 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_cvtepi64_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmov.qd.512
+  return _mm512_maskz_cvtepi64_epi32(__M, __A); 
+}
+
+void test_mm512_mask_cvtepi64_storeu_epi32(void* __P, __mmask8 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtepi64_storeu_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmov.qd.mem.512
+  return _mm512_mask_cvtepi64_storeu_epi32(__P, __M, __A); 
+}
+
+__m128i test_mm512_cvtepi64_epi16(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_cvtepi64_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmov.qw.512
+  return _mm512_cvtepi64_epi16(__A); 
+}
+
+__m128i test_mm512_mask_cvtepi64_epi16(__m128i __O, __mmask8 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtepi64_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmov.qw.512
+  return _mm512_mask_cvtepi64_epi16(__O, __M, __A); 
+}
+
+__m128i test_mm512_maskz_cvtepi64_epi16(__mmask8 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_cvtepi64_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmov.qw.512
+  return _mm512_maskz_cvtepi64_epi16(__M, __A); 
+}
+
+void test_mm512_mask_cvtepi64_storeu_epi16(void *__P, __mmask8 __M, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtepi64_storeu_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmov.qw.mem.512
+  return _mm512_mask_cvtepi64_storeu_epi16(__P, __M, __A); 
+}
+
+__m128i test_mm512_extracti32x4_epi32(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_extracti32x4_epi32
+  // CHECK: @llvm.x86.avx512.mask.vextracti32x4
+  return _mm512_extracti32x4_epi32(__A, 3); 
+}
+
+__m128i test_mm512_mask_extracti32x4_epi32(__m128i __W, __mmask8 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_extracti32x4_epi32
+  // CHECK: @llvm.x86.avx512.mask.vextracti32x4
+  return _mm512_mask_extracti32x4_epi32(__W, __U, __A, 3); 
+}
+
+__m128i test_mm512_maskz_extracti32x4_epi32(__mmask8 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_extracti32x4_epi32
+  // CHECK: @llvm.x86.avx512.mask.vextracti32x4
+  return _mm512_maskz_extracti32x4_epi32(__U, __A, 3); 
+}
+
+__m256i test_mm512_extracti64x4_epi64(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_extracti64x4_epi64
+  // CHECK: @llvm.x86.avx512.mask.vextracti64x4
+  return _mm512_extracti64x4_epi64(__A, 1); 
+}
+
+__m256i test_mm512_mask_extracti64x4_epi64(__m256i __W, __mmask8 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_extracti64x4_epi64
+  // CHECK: @llvm.x86.avx512.mask.vextracti64x4
+  return _mm512_mask_extracti64x4_epi64(__W, __U, __A, 1); 
+}
+
+__m256i test_mm512_maskz_extracti64x4_epi64(__mmask8 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_extracti64x4_epi64
+  // CHECK: @llvm.x86.avx512.mask.vextracti64x4
+  return _mm512_maskz_extracti64x4_epi64(__U, __A, 1); 
+}
+
+__m512d test_mm512_insertf64x4(__m512d __A, __m256d __B) {
+  // CHECK-LABEL: @test_mm512_insertf64x4
+  // CHECK: @llvm.x86.avx512.mask.insertf64x4
+  return _mm512_insertf64x4(__A, __B, 1);
+}
+
+__m512d test_mm512_mask_insertf64x4(__m512d __W, __mmask8 __U, __m512d __A, __m256d __B) {
+  // CHECK-LABEL: @test_mm512_mask_insertf64x4
+  // CHECK: @llvm.x86.avx512.mask.insertf64x4
+  return _mm512_mask_insertf64x4(__W, __U, __A, __B, 1); 
+}
+
+__m512d test_mm512_maskz_insertf64x4(__mmask8 __U, __m512d __A, __m256d __B) {
+  // CHECK-LABEL: @test_mm512_maskz_insertf64x4
+  // CHECK: @llvm.x86.avx512.mask.insertf64x4
+  return _mm512_maskz_insertf64x4(__U, __A, __B, 1); 
+}
+
+__m512i test_mm512_inserti64x4(__m512i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm512_inserti64x4
+  // CHECK: @llvm.x86.avx512.mask.inserti64x4
+  return _mm512_inserti64x4(__A, __B, 1); 
+}
+
+__m512i test_mm512_mask_inserti64x4(__m512i __W, __mmask8 __U, __m512i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm512_mask_inserti64x4
+  // CHECK: @llvm.x86.avx512.mask.inserti64x4
+  return _mm512_mask_inserti64x4(__W, __U, __A, __B, 1); 
+}
+
+__m512i test_mm512_maskz_inserti64x4(__mmask8 __U, __m512i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_inserti64x4
+  // CHECK: @llvm.x86.avx512.mask.inserti64x4
+  return _mm512_maskz_inserti64x4(__U, __A, __B, 1); 
+}
+
+__m512 test_mm512_insertf32x4(__m512 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm512_insertf32x4
+  // CHECK: @llvm.x86.avx512.mask.insertf32x4
+  return _mm512_insertf32x4(__A, __B, 1);
+}
+
+__m512 test_mm512_mask_insertf32x4(__m512 __W, __mmask16 __U, __m512 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm512_mask_insertf32x4
+  // CHECK: @llvm.x86.avx512.mask.insertf32x4
+  return _mm512_mask_insertf32x4(__W, __U, __A, __B, 1); 
+}
+
+__m512 test_mm512_maskz_insertf32x4(__mmask16 __U, __m512 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm512_maskz_insertf32x4
+  // CHECK: @llvm.x86.avx512.mask.insertf32x4
+  return _mm512_maskz_insertf32x4(__U, __A, __B, 1); 
+}
+
+__m512i test_mm512_inserti32x4(__m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_inserti32x4
+  // CHECK: @llvm.x86.avx512.mask.inserti32x4
+  return _mm512_inserti32x4(__A, __B, 1); 
+}
+
+__m512i test_mm512_mask_inserti32x4(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_mask_inserti32x4
+  // CHECK: @llvm.x86.avx512.mask.inserti32x4
+  return _mm512_mask_inserti32x4(__W, __U, __A, __B, 1); 
+}
+
+__m512i test_mm512_maskz_inserti32x4(__mmask16 __U, __m512i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_inserti32x4
+  // CHECK: @llvm.x86.avx512.mask.inserti32x4
+  return _mm512_maskz_inserti32x4(__U, __A, __B, 1); 
+}
+
+__m512d test_mm512_getmant_round_pd(__m512d __A) {
+  // CHECK-LABEL: @test_mm512_getmant_round_pd
+  // CHECK: @llvm.x86.avx512.mask.getmant.pd.512
+  return _mm512_getmant_round_pd(__A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m512d test_mm512_mask_getmant_round_pd(__m512d __W, __mmask8 __U, __m512d __A) {
+  // CHECK-LABEL: @test_mm512_mask_getmant_round_pd
+  // CHECK: @llvm.x86.avx512.mask.getmant.pd.512
+  return _mm512_mask_getmant_round_pd(__W, __U, __A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m512d test_mm512_maskz_getmant_round_pd(__mmask8 __U, __m512d __A) {
+  // CHECK-LABEL: @test_mm512_maskz_getmant_round_pd
+  // CHECK: @llvm.x86.avx512.mask.getmant.pd.512
+  return _mm512_maskz_getmant_round_pd(__U, __A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m512d test_mm512_getmant_pd(__m512d __A) {
+  // CHECK-LABEL: @test_mm512_getmant_pd
+  // CHECK: @llvm.x86.avx512.mask.getmant.pd.512
+  return _mm512_getmant_pd(__A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); 
+}
+
+__m512d test_mm512_mask_getmant_pd(__m512d __W, __mmask8 __U, __m512d __A) {
+  // CHECK-LABEL: @test_mm512_mask_getmant_pd
+  // CHECK: @llvm.x86.avx512.mask.getmant.pd.512
+  return _mm512_mask_getmant_pd(__W, __U, __A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); 
+}
+
+__m512d test_mm512_maskz_getmant_pd(__mmask8 __U, __m512d __A) {
+  // CHECK-LABEL: @test_mm512_maskz_getmant_pd
+  // CHECK: @llvm.x86.avx512.mask.getmant.pd.512
+  return _mm512_maskz_getmant_pd(__U, __A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); 
+}
+
+__m512 test_mm512_getmant_round_ps(__m512 __A) {
+  // CHECK-LABEL: @test_mm512_getmant_round_ps
+  // CHECK: @llvm.x86.avx512.mask.getmant.ps.512
+  return _mm512_getmant_round_ps(__A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m512 test_mm512_mask_getmant_round_ps(__m512 __W, __mmask16 __U, __m512 __A) {
+  // CHECK-LABEL: @test_mm512_mask_getmant_round_ps
+  // CHECK: @llvm.x86.avx512.mask.getmant.ps.512
+  return _mm512_mask_getmant_round_ps(__W, __U, __A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m512 test_mm512_maskz_getmant_round_ps(__mmask16 __U, __m512 __A) {
+  // CHECK-LABEL: @test_mm512_maskz_getmant_round_ps
+  // CHECK: @llvm.x86.avx512.mask.getmant.ps.512
+  return _mm512_maskz_getmant_round_ps(__U, __A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m512 test_mm512_getmant_ps(__m512 __A) {
+  // CHECK-LABEL: @test_mm512_getmant_ps
+  // CHECK: @llvm.x86.avx512.mask.getmant.ps.512
+  return _mm512_getmant_ps(__A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); 
+}
+
+__m512 test_mm512_mask_getmant_ps(__m512 __W, __mmask16 __U, __m512 __A) {
+  // CHECK-LABEL: @test_mm512_mask_getmant_ps
+  // CHECK: @llvm.x86.avx512.mask.getmant.ps.512
+  return _mm512_mask_getmant_ps(__W, __U, __A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); 
+}
+
+__m512 test_mm512_maskz_getmant_ps(__mmask16 __U, __m512 __A) {
+  // CHECK-LABEL: @test_mm512_maskz_getmant_ps
+  // CHECK: @llvm.x86.avx512.mask.getmant.ps.512
+  return _mm512_maskz_getmant_ps(__U, __A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); 
+}
+
+__m512d test_mm512_getexp_round_pd(__m512d __A) {
+  // CHECK-LABEL: @test_mm512_getexp_round_pd
+  // CHECK: @llvm.x86.avx512.mask.getexp.pd.512
+  return _mm512_getexp_round_pd(__A, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m512d test_mm512_mask_getexp_round_pd(__m512d __W, __mmask8 __U, __m512d __A) {
+  // CHECK-LABEL: @test_mm512_mask_getexp_round_pd
+  // CHECK: @llvm.x86.avx512.mask.getexp.pd.512
+  return _mm512_mask_getexp_round_pd(__W, __U, __A, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m512d test_mm512_maskz_getexp_round_pd(__mmask8 __U, __m512d __A) {
+  // CHECK-LABEL: @test_mm512_maskz_getexp_round_pd
+  // CHECK: @llvm.x86.avx512.mask.getexp.pd.512
+  return _mm512_maskz_getexp_round_pd(__U, __A, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m512d test_mm512_getexp_pd(__m512d __A) {
+  // CHECK-LABEL: @test_mm512_getexp_pd
+  // CHECK: @llvm.x86.avx512.mask.getexp.pd.512
+  return _mm512_getexp_pd(__A); 
+}
+
+__m512d test_mm512_mask_getexp_pd(__m512d __W, __mmask8 __U, __m512d __A) {
+  // CHECK-LABEL: @test_mm512_mask_getexp_pd
+  // CHECK: @llvm.x86.avx512.mask.getexp.pd.512
+  return _mm512_mask_getexp_pd(__W, __U, __A); 
+}
+
+__m512d test_mm512_maskz_getexp_pd(__mmask8 __U, __m512d __A) {
+  // CHECK-LABEL: @test_mm512_maskz_getexp_pd
+  // CHECK: @llvm.x86.avx512.mask.getexp.pd.512
+  return _mm512_maskz_getexp_pd(__U, __A); 
+}
+
+__m512 test_mm512_getexp_round_ps(__m512 __A) {
+  // CHECK-LABEL: @test_mm512_getexp_round_ps
+  // CHECK: @llvm.x86.avx512.mask.getexp.ps.512
+  return _mm512_getexp_round_ps(__A, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m512 test_mm512_mask_getexp_round_ps(__m512 __W, __mmask16 __U, __m512 __A) {
+  // CHECK-LABEL: @test_mm512_mask_getexp_round_ps
+  // CHECK: @llvm.x86.avx512.mask.getexp.ps.512
+  return _mm512_mask_getexp_round_ps(__W, __U, __A, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m512 test_mm512_maskz_getexp_round_ps(__mmask16 __U, __m512 __A) {
+  // CHECK-LABEL: @test_mm512_maskz_getexp_round_ps
+  // CHECK: @llvm.x86.avx512.mask.getexp.ps.512
+  return _mm512_maskz_getexp_round_ps(__U, __A, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m512 test_mm512_getexp_ps(__m512 __A) {
+  // CHECK-LABEL: @test_mm512_getexp_ps
+  // CHECK: @llvm.x86.avx512.mask.getexp.ps.512
+  return _mm512_getexp_ps(__A); 
+}
+
+__m512 test_mm512_mask_getexp_ps(__m512 __W, __mmask16 __U, __m512 __A) {
+  // CHECK-LABEL: @test_mm512_mask_getexp_ps
+  // CHECK: @llvm.x86.avx512.mask.getexp.ps.512
+  return _mm512_mask_getexp_ps(__W, __U, __A); 
+}
+
+__m512 test_mm512_maskz_getexp_ps(__mmask16 __U, __m512 __A) {
+  // CHECK-LABEL: @test_mm512_maskz_getexp_ps
+  // CHECK: @llvm.x86.avx512.mask.getexp.ps.512
+  return _mm512_maskz_getexp_ps(__U, __A); 
+}
+
+__m256 test_mm512_i64gather_ps(__m512i __index, void const *__addr) {
+  // CHECK-LABEL: @test_mm512_i64gather_ps
+  // CHECK: @llvm.x86.avx512.gather.qps.512
+  return _mm512_i64gather_ps(__index, __addr, 2); 
+}
+
+__m256 test_mm512_mask_i64gather_ps(__m256 __v1_old, __mmask8 __mask, __m512i __index, void const *__addr) {
+  // CHECK-LABEL: @test_mm512_mask_i64gather_ps
+  // CHECK: @llvm.x86.avx512.gather.qps.512
+  return _mm512_mask_i64gather_ps(__v1_old, __mask, __index, __addr, 2); 
+}
+
+__m256i test_mm512_i64gather_epi32(__m512i __index, void const *__addr) {
+  // CHECK-LABEL: @test_mm512_i64gather_epi32
+  // CHECK: @llvm.x86.avx512.gather.qpi.512
+  return _mm512_i64gather_epi32(__index, __addr, 2); 
+}
+
+__m256i test_mm512_mask_i64gather_epi32(__m256i __v1_old, __mmask8 __mask, __m512i __index, void const *__addr) {
+  // CHECK-LABEL: @test_mm512_mask_i64gather_epi32
+  // CHECK: @llvm.x86.avx512.gather.qpi.512
+  return _mm512_mask_i64gather_epi32(__v1_old, __mask, __index, __addr, 2); 
+}
+
+__m512d test_mm512_i64gather_pd(__m512i __index, void const *__addr) {
+  // CHECK-LABEL: @test_mm512_i64gather_pd
+  // CHECK: @llvm.x86.avx512.gather.qpd.512
+  return _mm512_i64gather_pd(__index, __addr, 2); 
+}
+
+__m512d test_mm512_mask_i64gather_pd(__m512d __v1_old, __mmask8 __mask, __m512i __index, void const *__addr) {
+  // CHECK-LABEL: @test_mm512_mask_i64gather_pd
+  // CHECK: @llvm.x86.avx512.gather.qpd.512
+  return _mm512_mask_i64gather_pd(__v1_old, __mask, __index, __addr, 2); 
+}
+
+__m512i test_mm512_i64gather_epi64(__m512i __index, void const *__addr) {
+  // CHECK-LABEL: @test_mm512_i64gather_epi64
+  // CHECK: @llvm.x86.avx512.gather.qpq.512
+  return _mm512_i64gather_epi64(__index, __addr, 2); 
+}
+
+__m512i test_mm512_mask_i64gather_epi64(__m512i __v1_old, __mmask8 __mask, __m512i __index, void const *__addr) {
+  // CHECK-LABEL: @test_mm512_mask_i64gather_epi64
+  // CHECK: @llvm.x86.avx512.gather.qpq.512
+  return _mm512_mask_i64gather_epi64(__v1_old, __mask, __index, __addr, 2); 
+}
+
+__m512 test_mm512_i32gather_ps(__m512i __index, void const *__addr) {
+  // CHECK-LABEL: @test_mm512_i32gather_ps
+  // CHECK: @llvm.x86.avx512.gather.dps.512
+  return _mm512_i32gather_ps(__index, __addr, 2); 
+}
+
+__m512 test_mm512_mask_i32gather_ps(__m512 v1_old, __mmask16 __mask, __m512i __index, void const *__addr) {
+  // CHECK-LABEL: @test_mm512_mask_i32gather_ps
+  // CHECK: @llvm.x86.avx512.gather.dps.512
+  return _mm512_mask_i32gather_ps(v1_old, __mask, __index, __addr, 2); 
+}
+
+__m512i test_mm512_i32gather_epi32(__m512i __index, void const *__addr) {
+  // CHECK-LABEL: @test_mm512_i32gather_epi32
+  // CHECK: @llvm.x86.avx512.gather.dpi.512
+  return _mm512_i32gather_epi32(__index, __addr, 2); 
+}
+
+__m512i test_mm512_mask_i32gather_epi32(__m512i __v1_old, __mmask16 __mask, __m512i __index, void const *__addr) {
+  // CHECK-LABEL: @test_mm512_mask_i32gather_epi32
+  // CHECK: @llvm.x86.avx512.gather.dpi.512
+  return _mm512_mask_i32gather_epi32(__v1_old, __mask, __index, __addr, 2); 
+}
+
+__m512d test_mm512_i32gather_pd(__m256i __index, void const *__addr) {
+  // CHECK-LABEL: @test_mm512_i32gather_pd
+  // CHECK: @llvm.x86.avx512.gather.dpd.512
+  return _mm512_i32gather_pd(__index, __addr, 2); 
+}
+
+__m512d test_mm512_mask_i32gather_pd(__m512d __v1_old, __mmask8 __mask, __m256i __index, void const *__addr) {
+  // CHECK-LABEL: @test_mm512_mask_i32gather_pd
+  // CHECK: @llvm.x86.avx512.gather.dpd.512
+  return _mm512_mask_i32gather_pd(__v1_old, __mask, __index, __addr, 2); 
+}
+
+__m512i test_mm512_i32gather_epi64(__m256i __index, void const *__addr) {
+  // CHECK-LABEL: @test_mm512_i32gather_epi64
+  // CHECK: @llvm.x86.avx512.gather.dpq.512
+  return _mm512_i32gather_epi64(__index, __addr, 2); 
+}
+
+__m512i test_mm512_mask_i32gather_epi64(__m512i __v1_old, __mmask8 __mask, __m256i __index, void const *__addr) {
+  // CHECK-LABEL: @test_mm512_mask_i32gather_epi64
+  // CHECK: @llvm.x86.avx512.gather.dpq.512
+  return _mm512_mask_i32gather_epi64(__v1_old, __mask, __index, __addr, 2); 
+}
+
+void test_mm512_i64scatter_ps(void *__addr, __m512i __index, __m256 __v1) {
+  // CHECK-LABEL: @test_mm512_i64scatter_ps
+  // CHECK: @llvm.x86.avx512.scatter.qps.512
+  return _mm512_i64scatter_ps(__addr, __index, __v1, 2); 
+}
+
+void test_mm512_mask_i64scatter_ps(void *__addr, __mmask8 __mask, __m512i __index, __m256 __v1) {
+  // CHECK-LABEL: @test_mm512_mask_i64scatter_ps
+  // CHECK: @llvm.x86.avx512.scatter.qps.512
+  return _mm512_mask_i64scatter_ps(__addr, __mask, __index, __v1, 2); 
+}
+
+void test_mm512_i64scatter_epi32(void *__addr, __m512i __index, __m256i __v1) {
+  // CHECK-LABEL: @test_mm512_i64scatter_epi32
+  // CHECK: @llvm.x86.avx512.scatter.qpi.512
+  return _mm512_i64scatter_epi32(__addr, __index, __v1, 2); 
+}
+
+void test_mm512_mask_i64scatter_epi32(void *__addr, __mmask8 __mask, __m512i __index, __m256i __v1) {
+  // CHECK-LABEL: @test_mm512_mask_i64scatter_epi32
+  // CHECK: @llvm.x86.avx512.scatter.qpi.512
+  return _mm512_mask_i64scatter_epi32(__addr, __mask, __index, __v1, 2); 
+}
+
+void test_mm512_i64scatter_pd(void *__addr, __m512i __index, __m512d __v1) {
+  // CHECK-LABEL: @test_mm512_i64scatter_pd
+  // CHECK: @llvm.x86.avx512.scatter.qpd.512
+  return _mm512_i64scatter_pd(__addr, __index, __v1, 2); 
+}
+
+void test_mm512_mask_i64scatter_pd(void *__addr, __mmask8 __mask, __m512i __index, __m512d __v1) {
+  // CHECK-LABEL: @test_mm512_mask_i64scatter_pd
+  // CHECK: @llvm.x86.avx512.scatter.qpd.512
+  return _mm512_mask_i64scatter_pd(__addr, __mask, __index, __v1, 2); 
+}
+
+void test_mm512_i64scatter_epi64(void *__addr, __m512i __index, __m512i __v1) {
+  // CHECK-LABEL: @test_mm512_i64scatter_epi64
+  // CHECK: @llvm.x86.avx512.scatter.qpq.512
+  return _mm512_i64scatter_epi64(__addr, __index, __v1, 2); 
+}
+
+void test_mm512_mask_i64scatter_epi64(void *__addr, __mmask8 __mask, __m512i __index, __m512i __v1) {
+  // CHECK-LABEL: @test_mm512_mask_i64scatter_epi64
+  // CHECK: @llvm.x86.avx512.scatter.qpq.512
+  return _mm512_mask_i64scatter_epi64(__addr, __mask, __index, __v1, 2); 
+}
+
+void test_mm512_i32scatter_ps(void *__addr, __m512i __index, __m512 __v1) {
+  // CHECK-LABEL: @test_mm512_i32scatter_ps
+  // CHECK: @llvm.x86.avx512.scatter.dps.512
+  return _mm512_i32scatter_ps(__addr, __index, __v1, 2); 
+}
+
+void test_mm512_mask_i32scatter_ps(void *__addr, __mmask16 __mask, __m512i __index, __m512 __v1) {
+  // CHECK-LABEL: @test_mm512_mask_i32scatter_ps
+  // CHECK: @llvm.x86.avx512.scatter.dps.512
+  return _mm512_mask_i32scatter_ps(__addr, __mask, __index, __v1, 2); 
+}
+
+void test_mm512_i32scatter_epi32(void *__addr, __m512i __index, __m512i __v1) {
+  // CHECK-LABEL: @test_mm512_i32scatter_epi32
+  // CHECK: @llvm.x86.avx512.scatter.dpi.512
+  return _mm512_i32scatter_epi32(__addr, __index, __v1, 2); 
+}
+
+void test_mm512_mask_i32scatter_epi32(void *__addr, __mmask16 __mask, __m512i __index, __m512i __v1) {
+  // CHECK-LABEL: @test_mm512_mask_i32scatter_epi32
+  // CHECK: @llvm.x86.avx512.scatter.dpi.512
+  return _mm512_mask_i32scatter_epi32(__addr, __mask, __index, __v1, 2); 
+}
+
+void test_mm512_i32scatter_pd(void *__addr, __m256i __index, __m512d __v1) {
+  // CHECK-LABEL: @test_mm512_i32scatter_pd
+  // CHECK: @llvm.x86.avx512.scatter.dpd.512
+  return _mm512_i32scatter_pd(__addr, __index, __v1, 2); 
+}
+
+void test_mm512_mask_i32scatter_pd(void *__addr, __mmask8 __mask, __m256i __index, __m512d __v1) {
+  // CHECK-LABEL: @test_mm512_mask_i32scatter_pd
+  // CHECK: @llvm.x86.avx512.scatter.dpd.512
+  return _mm512_mask_i32scatter_pd(__addr, __mask, __index, __v1, 2); 
+}
+
+void test_mm512_i32scatter_epi64(void *__addr, __m256i __index, __m512i __v1) {
+  // CHECK-LABEL: @test_mm512_i32scatter_epi64
+  // CHECK: @llvm.x86.avx512.scatter.dpq.512
+  return _mm512_i32scatter_epi64(__addr, __index, __v1, 2); 
+}
+
+void test_mm512_mask_i32scatter_epi64(void *__addr, __mmask8 __mask, __m256i __index, __m512i __v1) {
+  // CHECK-LABEL: @test_mm512_mask_i32scatter_epi64
+  // CHECK: @llvm.x86.avx512.scatter.dpq.512
+  return _mm512_mask_i32scatter_epi64(__addr, __mask, __index, __v1, 2); 
+}
+
+__m128d test_mm_mask_rsqrt14_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){
+  // CHECK-LABEL: @test_mm_mask_rsqrt14_sd
+  // CHECK: @llvm.x86.avx512.rsqrt14.sd
+  return _mm_mask_rsqrt14_sd(__W, __U, __A, __B);
+}
+
+__m128d test_mm_maskz_rsqrt14_sd(__mmask8 __U, __m128d __A, __m128d __B){
+  // CHECK-LABEL: @test_mm_maskz_rsqrt14_sd
+  // CHECK: @llvm.x86.avx512.rsqrt14.sd
+  return _mm_maskz_rsqrt14_sd(__U, __A, __B);
+}
+
+__m128 test_mm_mask_rsqrt14_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){
+  // CHECK-LABEL: @test_mm_mask_rsqrt14_ss
+  // CHECK: @llvm.x86.avx512.rsqrt14.ss
+  return _mm_mask_rsqrt14_ss(__W, __U, __A, __B);
+}
+
+__m128 test_mm_maskz_rsqrt14_ss(__mmask8 __U, __m128 __A, __m128 __B){
+  // CHECK-LABEL: @test_mm_maskz_rsqrt14_ss
+  // CHECK: @llvm.x86.avx512.rsqrt14.ss
+  return _mm_maskz_rsqrt14_ss(__U, __A, __B);
+}
+
+__m512d test_mm512_mask_rcp14_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_rcp14_pd 
+  // CHECK: @llvm.x86.avx512.rcp14.pd.512
+  return _mm512_mask_rcp14_pd (__W,__U,__A);
+}
+
+__m512d test_mm512_maskz_rcp14_pd (__mmask8 __U, __m512d __A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_rcp14_pd 
+  // CHECK: @llvm.x86.avx512.rcp14.pd.512
+  return _mm512_maskz_rcp14_pd (__U,__A);
+}
+
+__m512 test_mm512_mask_rcp14_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_rcp14_ps 
+  // CHECK: @llvm.x86.avx512.rcp14.ps.512
+  return _mm512_mask_rcp14_ps (__W,__U,__A);
+}
+
+__m512 test_mm512_maskz_rcp14_ps (__mmask16 __U, __m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_rcp14_ps 
+  // CHECK: @llvm.x86.avx512.rcp14.ps.512
+  return _mm512_maskz_rcp14_ps (__U,__A);
+}
+
+__m128d test_mm_mask_rcp14_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){
+  // CHECK-LABEL: @test_mm_mask_rcp14_sd
+  // CHECK: @llvm.x86.avx512.rcp14.sd
+  return _mm_mask_rcp14_sd(__W, __U, __A, __B);
+}
+
+__m128d test_mm_maskz_rcp14_sd(__mmask8 __U, __m128d __A, __m128d __B){
+  // CHECK-LABEL: @test_mm_maskz_rcp14_sd
+  // CHECK: @llvm.x86.avx512.rcp14.sd
+  return _mm_maskz_rcp14_sd(__U, __A, __B);
+}
+
+__m128 test_mm_mask_rcp14_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){
+  // CHECK-LABEL: @test_mm_mask_rcp14_ss
+  // CHECK: @llvm.x86.avx512.rcp14.ss
+  return _mm_mask_rcp14_ss(__W, __U, __A, __B);
+}
+
+__m128 test_mm_maskz_rcp14_ss(__mmask8 __U, __m128 __A, __m128 __B){
+  // CHECK-LABEL: @test_mm_maskz_rcp14_ss
+  // CHECK: @llvm.x86.avx512.rcp14.ss
+  return _mm_maskz_rcp14_ss(__U, __A, __B);
+}
+
+__m128d test_mm_mask_getexp_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){
+  // CHECK-LABEL: @test_mm_mask_getexp_sd
+  // CHECK: @llvm.x86.avx512.mask.getexp.sd
+  return _mm_mask_getexp_sd(__W, __U, __A, __B);
+}
+
+__m128d test_mm_mask_getexp_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){
+  // CHECK-LABEL: @test_mm_mask_getexp_round_sd
+  // CHECK: @llvm.x86.avx512.mask.getexp.sd
+  return _mm_mask_getexp_round_sd(__W, __U, __A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128d test_mm_maskz_getexp_sd(__mmask8 __U, __m128d __A, __m128d __B){
+  // CHECK-LABEL: @test_mm_maskz_getexp_sd
+  // CHECK: @llvm.x86.avx512.mask.getexp.sd
+  return _mm_maskz_getexp_sd(__U, __A, __B);
+}
+
+__m128d test_mm_maskz_getexp_round_sd(__mmask8 __U, __m128d __A, __m128d __B){
+  // CHECK-LABEL: @test_mm_maskz_getexp_round_sd
+  // CHECK: @llvm.x86.avx512.mask.getexp.sd
+  return _mm_maskz_getexp_round_sd(__U, __A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128 test_mm_mask_getexp_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){
+  // CHECK-LABEL: @test_mm_mask_getexp_ss
+  // CHECK: @llvm.x86.avx512.mask.getexp.ss
+  return _mm_mask_getexp_ss(__W, __U, __A, __B);
+}
+
+__m128 test_mm_mask_getexp_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){
+  // CHECK-LABEL: @test_mm_mask_getexp_round_ss
+  // CHECK: @llvm.x86.avx512.mask.getexp.ss
+  return _mm_mask_getexp_round_ss(__W, __U, __A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128 test_mm_maskz_getexp_ss(__mmask8 __U, __m128 __A, __m128 __B){
+  // CHECK-LABEL: @test_mm_maskz_getexp_ss
+  // CHECK: @llvm.x86.avx512.mask.getexp.ss
+  return _mm_maskz_getexp_ss(__U, __A, __B);
+}
+
+__m128 test_mm_maskz_getexp_round_ss(__mmask8 __U, __m128 __A, __m128 __B){
+  // CHECK-LABEL: @test_mm_maskz_getexp_round_ss
+  // CHECK: @llvm.x86.avx512.mask.getexp.ss
+  return _mm_maskz_getexp_round_ss(__U, __A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128d test_mm_mask_getmant_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){
+  // CHECK-LABEL: @test_mm_mask_getmant_sd
+  // CHECK: @llvm.x86.avx512.mask.getmant.sd
+  return _mm_mask_getmant_sd(__W, __U, __A, __B, 1, 2);
+}
+
+__m128d test_mm_mask_getmant_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){
+  // CHECK-LABEL: @test_mm_mask_getmant_round_sd
+  // CHECK: @llvm.x86.avx512.mask.getmant.sd
+  return _mm_mask_getmant_round_sd(__W, __U, __A, __B, 1, 2, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128d test_mm_maskz_getmant_sd(__mmask8 __U, __m128d __A, __m128d __B){
+  // CHECK-LABEL: @test_mm_maskz_getmant_sd
+  // CHECK: @llvm.x86.avx512.mask.getmant.sd
+  return _mm_maskz_getmant_sd(__U, __A, __B, 1, 2);
+}
+
+__m128d test_mm_maskz_getmant_round_sd(__mmask8 __U, __m128d __A, __m128d __B){
+  // CHECK-LABEL: @test_mm_maskz_getmant_round_sd
+  // CHECK: @llvm.x86.avx512.mask.getmant.sd
+  return _mm_maskz_getmant_round_sd(__U, __A, __B, 1, 2, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128 test_mm_mask_getmant_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){
+  // CHECK-LABEL: @test_mm_mask_getmant_ss
+  // CHECK: @llvm.x86.avx512.mask.getmant.ss
+  return _mm_mask_getmant_ss(__W, __U, __A, __B, 1, 2);
+}
+
+__m128 test_mm_mask_getmant_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){
+  // CHECK-LABEL: @test_mm_mask_getmant_round_ss
+  // CHECK: @llvm.x86.avx512.mask.getmant.ss
+  return _mm_mask_getmant_round_ss(__W, __U, __A, __B, 1, 2, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128 test_mm_maskz_getmant_ss(__mmask8 __U, __m128 __A, __m128 __B){
+  // CHECK-LABEL: @test_mm_maskz_getmant_ss
+  // CHECK: @llvm.x86.avx512.mask.getmant.ss
+  return _mm_maskz_getmant_ss(__U, __A, __B, 1, 2);
+}
+
+__m128 test_mm_maskz_getmant_round_ss(__mmask8 __U, __m128 __A, __m128 __B){
+  // CHECK-LABEL: @test_mm_maskz_getmant_round_ss
+  // CHECK: @llvm.x86.avx512.mask.getmant.ss
+  return _mm_maskz_getmant_round_ss(__U, __A, __B, 1, 2, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128 test_mm_mask_fmadd_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){
+  // CHECK-LABEL: @test_mm_mask_fmadd_ss
+  // CHECK: @llvm.x86.avx512.mask.vfmadd.ss
+  return _mm_mask_fmadd_ss(__W, __U, __A, __B);
+}
+
+__m128 test_mm_mask_fmadd_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){
+  // CHECK-LABEL: @test_mm_mask_fmadd_round_ss
+  // CHECK: @llvm.x86.avx512.mask.vfmadd.ss
+  return _mm_mask_fmadd_round_ss(__W, __U, __A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128 test_mm_maskz_fmadd_ss(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C){
+  // CHECK-LABEL: @test_mm_maskz_fmadd_ss
+  // CHECK: @llvm.x86.avx512.maskz.vfmadd.ss
+  return _mm_maskz_fmadd_ss(__U, __A, __B, __C);
+}
+
+__m128 test_mm_maskz_fmadd_round_ss(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C){
+  // CHECK-LABEL: @test_mm_maskz_fmadd_round_ss
+  // CHECK: @llvm.x86.avx512.maskz.vfmadd.ss
+  return _mm_maskz_fmadd_round_ss(__U, __A, __B, __C, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128 test_mm_mask3_fmadd_ss(__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U){
+  // CHECK-LABEL: @test_mm_mask3_fmadd_ss
+  // CHECK: @llvm.x86.avx512.mask3.vfmadd.ss
+  return _mm_mask3_fmadd_ss(__W, __X, __Y, __U);
+}
+
+__m128 test_mm_mask3_fmadd_round_ss(__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U){
+  // CHECK-LABEL: @test_mm_mask3_fmadd_round_ss
+  // CHECK: @llvm.x86.avx512.mask3.vfmadd.ss
+  return _mm_mask3_fmadd_round_ss(__W, __X, __Y, __U, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128 test_mm_mask_fmsub_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){
+  // CHECK-LABEL: @test_mm_mask_fmsub_ss
+  // CHECK: @llvm.x86.avx512.mask.vfmadd.ss
+  return _mm_mask_fmsub_ss(__W, __U, __A, __B);
+}
+
+__m128 test_mm_mask_fmsub_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){
+  // CHECK-LABEL: @test_mm_mask_fmsub_round_ss
+  // CHECK: @llvm.x86.avx512.mask.vfmadd.ss
+  return _mm_mask_fmsub_round_ss(__W, __U, __A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128 test_mm_maskz_fmsub_ss(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C){
+  // CHECK-LABEL: @test_mm_maskz_fmsub_ss
+  // CHECK: @llvm.x86.avx512.maskz.vfmadd.ss
+  return _mm_maskz_fmsub_ss(__U, __A, __B, __C);
+}
+
+__m128 test_mm_maskz_fmsub_round_ss(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C){
+  // CHECK-LABEL: @test_mm_maskz_fmsub_round_ss
+  // CHECK: @llvm.x86.avx512.maskz.vfmadd.ss
+  return _mm_maskz_fmsub_round_ss(__U, __A, __B, __C, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128 test_mm_mask3_fmsub_ss(__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U){
+  // CHECK-LABEL: @test_mm_mask3_fmsub_ss
+  // CHECK: @llvm.x86.avx512.mask3.vfmadd.ss
+  return _mm_mask3_fmsub_ss(__W, __X, __Y, __U);
+}
+
+__m128 test_mm_mask3_fmsub_round_ss(__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U){
+  // CHECK-LABEL: @test_mm_mask3_fmsub_round_ss
+  // CHECK: @llvm.x86.avx512.mask3.vfmadd.ss
+  return _mm_mask3_fmsub_round_ss(__W, __X, __Y, __U, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128 test_mm_mask_fnmadd_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){
+  // CHECK-LABEL: @test_mm_mask_fnmadd_ss
+  // CHECK: @llvm.x86.avx512.mask.vfmadd.ss
+  return _mm_mask_fnmadd_ss(__W, __U, __A, __B);
+}
+
+__m128 test_mm_mask_fnmadd_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){
+  // CHECK-LABEL: @test_mm_mask_fnmadd_round_ss
+  // CHECK: @llvm.x86.avx512.mask.vfmadd.ss
+  return _mm_mask_fnmadd_round_ss(__W, __U, __A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128 test_mm_maskz_fnmadd_ss(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C){
+  // CHECK-LABEL: @test_mm_maskz_fnmadd_ss
+  // CHECK: @llvm.x86.avx512.maskz.vfmadd.ss
+  return _mm_maskz_fnmadd_ss(__U, __A, __B, __C);
+}
+
+__m128 test_mm_maskz_fnmadd_round_ss(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C){
+  // CHECK-LABEL: @test_mm_maskz_fnmadd_round_ss
+  // CHECK: @llvm.x86.avx512.maskz.vfmadd.ss
+  return _mm_maskz_fnmadd_round_ss(__U, __A, __B, __C, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128 test_mm_mask3_fnmadd_ss(__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U){
+  // CHECK-LABEL: @test_mm_mask3_fnmadd_ss
+  // CHECK: @llvm.x86.avx512.mask3.vfmadd.ss
+  return _mm_mask3_fnmadd_ss(__W, __X, __Y, __U);
+}
+
+__m128 test_mm_mask3_fnmadd_round_ss(__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U){
+  // CHECK-LABEL: @test_mm_mask3_fnmadd_round_ss
+  // CHECK: @llvm.x86.avx512.mask3.vfmadd.ss
+  return _mm_mask3_fnmadd_round_ss(__W, __X, __Y, __U, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128 test_mm_mask_fnmsub_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){
+  // CHECK-LABEL: @test_mm_mask_fnmsub_ss
+  // CHECK: @llvm.x86.avx512.mask.vfmadd.ss
+  return _mm_mask_fnmsub_ss(__W, __U, __A, __B);
+}
+
+__m128 test_mm_mask_fnmsub_round_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B){
+  // CHECK-LABEL: @test_mm_mask_fnmsub_round_ss
+  // CHECK: @llvm.x86.avx512.mask.vfmadd.ss
+  return _mm_mask_fnmsub_round_ss(__W, __U, __A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128 test_mm_maskz_fnmsub_ss(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C){
+  // CHECK-LABEL: @test_mm_maskz_fnmsub_ss
+  // CHECK: @llvm.x86.avx512.maskz.vfmadd.ss
+  return _mm_maskz_fnmsub_ss(__U, __A, __B, __C);
+}
+
+__m128 test_mm_maskz_fnmsub_round_ss(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C){
+  // CHECK-LABEL: @test_mm_maskz_fnmsub_round_ss
+  // CHECK: @llvm.x86.avx512.maskz.vfmadd.ss
+  return _mm_maskz_fnmsub_round_ss(__U, __A, __B, __C, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128 test_mm_mask3_fnmsub_ss(__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U){
+  // CHECK-LABEL: @test_mm_mask3_fnmsub_ss
+  // CHECK: @llvm.x86.avx512.mask3.vfmadd.ss
+  return _mm_mask3_fnmsub_ss(__W, __X, __Y, __U);
+}
+
+__m128 test_mm_mask3_fnmsub_round_ss(__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U){
+  // CHECK-LABEL: @test_mm_mask3_fnmsub_round_ss
+  // CHECK: @llvm.x86.avx512.mask3.vfmadd.ss
+  return _mm_mask3_fnmsub_round_ss(__W, __X, __Y, __U, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128d test_mm_mask_fmadd_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){
+  // CHECK-LABEL: @test_mm_mask_fmadd_sd
+  // CHECK: @llvm.x86.avx512.mask.vfmadd.sd
+  return _mm_mask_fmadd_sd(__W, __U, __A, __B);
+}
+
+__m128d test_mm_mask_fmadd_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){
+  // CHECK-LABEL: @test_mm_mask_fmadd_round_sd
+  // CHECK: @llvm.x86.avx512.mask.vfmadd.sd
+  return _mm_mask_fmadd_round_sd(__W, __U, __A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128d test_mm_maskz_fmadd_sd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C){
+  // CHECK-LABEL: @test_mm_maskz_fmadd_sd
+  // CHECK: @llvm.x86.avx512.maskz.vfmadd.sd
+  return _mm_maskz_fmadd_sd(__U, __A, __B, __C);
+}
+
+__m128d test_mm_maskz_fmadd_round_sd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C){
+  // CHECK-LABEL: @test_mm_maskz_fmadd_round_sd
+  // CHECK: @llvm.x86.avx512.maskz.vfmadd.sd
+  return _mm_maskz_fmadd_round_sd(__U, __A, __B, __C, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128d test_mm_mask3_fmadd_sd(__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U){
+  // CHECK-LABEL: @test_mm_mask3_fmadd_sd
+  // CHECK: @llvm.x86.avx512.mask3.vfmadd.sd
+  return _mm_mask3_fmadd_sd(__W, __X, __Y, __U);
+}
+
+__m128d test_mm_mask3_fmadd_round_sd(__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U){
+  // CHECK-LABEL: @test_mm_mask3_fmadd_round_sd
+  // CHECK: @llvm.x86.avx512.mask3.vfmadd.sd
+  return _mm_mask3_fmadd_round_sd(__W, __X, __Y, __U, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128d test_mm_mask_fmsub_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){
+  // CHECK-LABEL: @test_mm_mask_fmsub_sd
+  // CHECK: @llvm.x86.avx512.mask.vfmadd.sd
+  return _mm_mask_fmsub_sd(__W, __U, __A, __B);
+}
+
+__m128d test_mm_mask_fmsub_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){
+  // CHECK-LABEL: @test_mm_mask_fmsub_round_sd
+  // CHECK: @llvm.x86.avx512.mask.vfmadd.sd
+  return _mm_mask_fmsub_round_sd(__W, __U, __A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128d test_mm_maskz_fmsub_sd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C){
+  // CHECK-LABEL: @test_mm_maskz_fmsub_sd
+  // CHECK: @llvm.x86.avx512.maskz.vfmadd.sd
+  return _mm_maskz_fmsub_sd(__U, __A, __B, __C);
+}
+
+__m128d test_mm_maskz_fmsub_round_sd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C){
+  // CHECK-LABEL: @test_mm_maskz_fmsub_round_sd
+  // CHECK: @llvm.x86.avx512.maskz.vfmadd.sd
+  return _mm_maskz_fmsub_round_sd(__U, __A, __B, __C, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128d test_mm_mask3_fmsub_sd(__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U){
+  // CHECK-LABEL: @test_mm_mask3_fmsub_sd
+  // CHECK: @llvm.x86.avx512.mask3.vfmadd.sd
+  return _mm_mask3_fmsub_sd(__W, __X, __Y, __U);
+}
+
+__m128d test_mm_mask3_fmsub_round_sd(__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U){
+  // CHECK-LABEL: @test_mm_mask3_fmsub_round_sd
+  // CHECK: @llvm.x86.avx512.mask3.vfmadd.sd
+  return _mm_mask3_fmsub_round_sd(__W, __X, __Y, __U, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128d test_mm_mask_fnmadd_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){
+  // CHECK-LABEL: @test_mm_mask_fnmadd_sd
+  // CHECK: @llvm.x86.avx512.mask.vfmadd.sd
+  return _mm_mask_fnmadd_sd(__W, __U, __A, __B);
+}
+
+__m128d test_mm_mask_fnmadd_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){
+  // CHECK-LABEL: @test_mm_mask_fnmadd_round_sd
+  // CHECK: @llvm.x86.avx512.mask.vfmadd.sd
+  return _mm_mask_fnmadd_round_sd(__W, __U, __A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128d test_mm_maskz_fnmadd_sd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C){
+  // CHECK-LABEL: @test_mm_maskz_fnmadd_sd
+  // CHECK: @llvm.x86.avx512.maskz.vfmadd.sd
+  return _mm_maskz_fnmadd_sd(__U, __A, __B, __C);
+}
+
+__m128d test_mm_maskz_fnmadd_round_sd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C){
+  // CHECK-LABEL: @test_mm_maskz_fnmadd_round_sd
+  // CHECK: @llvm.x86.avx512.maskz.vfmadd.sd
+  return _mm_maskz_fnmadd_round_sd(__U, __A, __B, __C, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128d test_mm_mask3_fnmadd_sd(__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U){
+  // CHECK-LABEL: @test_mm_mask3_fnmadd_sd
+  // CHECK: @llvm.x86.avx512.mask3.vfmadd.sd
+  return _mm_mask3_fnmadd_sd(__W, __X, __Y, __U);
+}
+
+__m128d test_mm_mask3_fnmadd_round_sd(__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U){
+  // CHECK-LABEL: @test_mm_mask3_fnmadd_round_sd
+  // CHECK: @llvm.x86.avx512.mask3.vfmadd.sd
+  return _mm_mask3_fnmadd_round_sd(__W, __X, __Y, __U, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128d test_mm_mask_fnmsub_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){
+  // CHECK-LABEL: @test_mm_mask_fnmsub_sd
+  // CHECK: @llvm.x86.avx512.mask.vfmadd.sd
+  return _mm_mask_fnmsub_sd(__W, __U, __A, __B);
+}
+
+__m128d test_mm_mask_fnmsub_round_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B){
+  // CHECK-LABEL: @test_mm_mask_fnmsub_round_sd
+  // CHECK: @llvm.x86.avx512.mask.vfmadd.sd
+  return _mm_mask_fnmsub_round_sd(__W, __U, __A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128d test_mm_maskz_fnmsub_sd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C){
+  // CHECK-LABEL: @test_mm_maskz_fnmsub_sd
+  // CHECK: @llvm.x86.avx512.maskz.vfmadd.sd
+  return _mm_maskz_fnmsub_sd(__U, __A, __B, __C);
+}
+
+__m128d test_mm_maskz_fnmsub_round_sd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C){
+  // CHECK-LABEL: @test_mm_maskz_fnmsub_round_sd
+  // CHECK: @llvm.x86.avx512.maskz.vfmadd.sd
+  return _mm_maskz_fnmsub_round_sd(__U, __A, __B, __C, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128d test_mm_mask3_fnmsub_sd(__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U){
+  // CHECK-LABEL: @test_mm_mask3_fnmsub_sd
+  // CHECK: @llvm.x86.avx512.mask3.vfmadd.sd
+  return _mm_mask3_fnmsub_sd(__W, __X, __Y, __U);
+}
+
+__m128d test_mm_mask3_fnmsub_round_sd(__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U){
+  // CHECK-LABEL: @test_mm_mask3_fnmsub_round_sd
+  // CHECK: @llvm.x86.avx512.mask3.vfmadd.sd
+  return _mm_mask3_fnmsub_round_sd(__W, __X, __Y, __U, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m512d test_mm512_permutex_pd(__m512d __X) {
+  // CHECK-LABEL: @test_mm512_permutex_pd
+  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
+  return _mm512_permutex_pd(__X, 0);
+}
+
+__m512d test_mm512_mask_permutex_pd(__m512d __W, __mmask8 __U, __m512d __X) {
+  // CHECK-LABEL: @test_mm512_mask_permutex_pd
+  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+  return _mm512_mask_permutex_pd(__W, __U, __X, 0);
+}
+
+__m512d test_mm512_maskz_permutex_pd(__mmask8 __U, __m512d __X) {
+  // CHECK-LABEL: @test_mm512_maskz_permutex_pd
+  // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+  return _mm512_maskz_permutex_pd(__U, __X, 0);
+}
+
+__m512i test_mm512_permutex_epi64(__m512i __X) {
+  // CHECK-LABEL: @test_mm512_permutex_epi64
+  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
+  return _mm512_permutex_epi64(__X, 0);
+}
+
+__m512i test_mm512_mask_permutex_epi64(__m512i __W, __mmask8 __M, __m512i __X) {
+  // CHECK-LABEL: @test_mm512_mask_permutex_epi64
+  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
+  return _mm512_mask_permutex_epi64(__W, __M, __X, 0);
+}
+
+__m512i test_mm512_maskz_permutex_epi64(__mmask8 __M, __m512i __X) {
+  // CHECK-LABEL: @test_mm512_maskz_permutex_epi64
+  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 4, i32 4, i32 4>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
+  return _mm512_maskz_permutex_epi64(__M, __X, 0);
+}
+
+__m512d test_mm512_permutexvar_pd(__m512i __X, __m512d __Y) {
+  // CHECK-LABEL: @test_mm512_permutexvar_pd
+  // CHECK: @llvm.x86.avx512.mask.permvar.df.512
+  return _mm512_permutexvar_pd(__X, __Y); 
+}
+
+__m512d test_mm512_mask_permutexvar_pd(__m512d __W, __mmask8 __U, __m512i __X, __m512d __Y) {
+  // CHECK-LABEL: @test_mm512_mask_permutexvar_pd
+  // CHECK: @llvm.x86.avx512.mask.permvar.df.512
+  return _mm512_mask_permutexvar_pd(__W, __U, __X, __Y); 
+}
+
+__m512d test_mm512_maskz_permutexvar_pd(__mmask8 __U, __m512i __X, __m512d __Y) {
+  // CHECK-LABEL: @test_mm512_maskz_permutexvar_pd
+  // CHECK: @llvm.x86.avx512.mask.permvar.df.512
+  return _mm512_maskz_permutexvar_pd(__U, __X, __Y); 
+}
+
+__m512i test_mm512_maskz_permutexvar_epi64(__mmask8 __M, __m512i __X, __m512i __Y) {
+  // CHECK-LABEL: @test_mm512_maskz_permutexvar_epi64
+  // CHECK: @llvm.x86.avx512.mask.permvar.di.512
+  return _mm512_maskz_permutexvar_epi64(__M, __X, __Y); 
+}
+
+__m512i test_mm512_permutexvar_epi64(__m512i __X, __m512i __Y) {
+  // CHECK-LABEL: @test_mm512_permutexvar_epi64
+  // CHECK: @llvm.x86.avx512.mask.permvar.di.512
+  return _mm512_permutexvar_epi64(__X, __Y); 
+}
+
+__m512i test_mm512_mask_permutexvar_epi64(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) {
+  // CHECK-LABEL: @test_mm512_mask_permutexvar_epi64
+  // CHECK: @llvm.x86.avx512.mask.permvar.di.512
+  return _mm512_mask_permutexvar_epi64(__W, __M, __X, __Y); 
+}
+
+__m512 test_mm512_permutexvar_ps(__m512i __X, __m512 __Y) {
+  // CHECK-LABEL: @test_mm512_permutexvar_ps
+  // CHECK: @llvm.x86.avx512.mask.permvar.sf.512
+  return _mm512_permutexvar_ps(__X, __Y); 
+}
+
+__m512 test_mm512_mask_permutexvar_ps(__m512 __W, __mmask16 __U, __m512i __X, __m512 __Y) {
+  // CHECK-LABEL: @test_mm512_mask_permutexvar_ps
+  // CHECK: @llvm.x86.avx512.mask.permvar.sf.512
+  return _mm512_mask_permutexvar_ps(__W, __U, __X, __Y); 
+}
+
+__m512 test_mm512_maskz_permutexvar_ps(__mmask16 __U, __m512i __X, __m512 __Y) {
+  // CHECK-LABEL: @test_mm512_maskz_permutexvar_ps
+  // CHECK: @llvm.x86.avx512.mask.permvar.sf.512
+  return _mm512_maskz_permutexvar_ps(__U, __X, __Y); 
+}
+
+__m512i test_mm512_maskz_permutexvar_epi32(__mmask16 __M, __m512i __X, __m512i __Y) {
+  // CHECK-LABEL: @test_mm512_maskz_permutexvar_epi32
+  // CHECK: @llvm.x86.avx512.mask.permvar.si.512
+  return _mm512_maskz_permutexvar_epi32(__M, __X, __Y); 
+}
+
+__m512i test_mm512_permutexvar_epi32(__m512i __X, __m512i __Y) {
+  // CHECK-LABEL: @test_mm512_permutexvar_epi32
+  // CHECK: @llvm.x86.avx512.mask.permvar.si.512
+  return _mm512_permutexvar_epi32(__X, __Y); 
+}
+
+__m512i test_mm512_mask_permutexvar_epi32(__m512i __W, __mmask16 __M, __m512i __X, __m512i __Y) {
+  // CHECK-LABEL: @test_mm512_mask_permutexvar_epi32
+  // CHECK: @llvm.x86.avx512.mask.permvar.si.512
+  return _mm512_mask_permutexvar_epi32(__W, __M, __X, __Y); 
+}
+
+__mmask16 test_mm512_kand(__mmask16 __A, __mmask16 __B) {
+  // CHECK-LABEL: @test_mm512_kand
+  // CHECK: @llvm.x86.avx512.kand.w
+  return _mm512_kand(__A, __B); 
+}
+
+__mmask16 test_mm512_kandn(__mmask16 __A, __mmask16 __B) {
+  // CHECK-LABEL: @test_mm512_kandn
+  // CHECK: @llvm.x86.avx512.kandn.w
+  return _mm512_kandn(__A, __B); 
+}
+
+__mmask16 test_mm512_kor(__mmask16 __A, __mmask16 __B) {
+  // CHECK-LABEL: @test_mm512_kor
+  // CHECK: @llvm.x86.avx512.kor.w
+  return _mm512_kor(__A, __B); 
+}
+
+int test_mm512_kortestc(__mmask16 __A, __mmask16 __B) {
+  // CHECK-LABEL: @test_mm512_kortestc
+  // CHECK: @llvm.x86.avx512.kortestc.w
+  return _mm512_kortestc(__A, __B); 
+}
+
+int test_mm512_kortestz(__mmask16 __A, __mmask16 __B) {
+  // CHECK-LABEL: @test_mm512_kortestz
+  // CHECK: @llvm.x86.avx512.kortestz.w
+  return _mm512_kortestz(__A, __B); 
+}
+
+__mmask16 test_mm512_kunpackb(__mmask16 __A, __mmask16 __B) {
+  // CHECK-LABEL: @test_mm512_kunpackb
+  // CHECK: @llvm.x86.avx512.kunpck.bw
+  return _mm512_kunpackb(__A, __B); 
+}
+
+__mmask16 test_mm512_kxnor(__mmask16 __A, __mmask16 __B) {
+  // CHECK-LABEL: @test_mm512_kxnor
+  // CHECK: @llvm.x86.avx512.kxnor.w
+  return _mm512_kxnor(__A, __B); 
+}
+
+__mmask16 test_mm512_kxor(__mmask16 __A, __mmask16 __B) {
+  // CHECK-LABEL: @test_mm512_kxor
+  // CHECK: @llvm.x86.avx512.kxor.w
+  return _mm512_kxor(__A, __B); 
+}
+
+void test_mm512_stream_si512(__m512i * __P, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_stream_si512
+  // CHECK: store <8 x i64> %{{.*}}, <8 x i64>* %{{.*}}, align 64, !nontemporal
+  _mm512_stream_si512(__P, __A); 
+}
+
+__m512i test_mm512_stream_load_si512(void *__P) {
+  // CHECK-LABEL: @test_mm512_stream_load_si512
+  // CHECK: @llvm.x86.avx512.movntdqa
+  return _mm512_stream_load_si512(__P); 
+}
+
+void test_mm512_stream_pd(double *__P, __m512d __A) {
+  // CHECK-LABEL: @test_mm512_stream_pd
+  // CHECK: store <8 x double> %{{.*}}, <8 x double>* %{{.*}}, align 64, !nontemporal
+  return _mm512_stream_pd(__P, __A); 
+}
+
+void test_mm512_stream_ps(float *__P, __m512 __A) {
+  // CHECK-LABEL: @test_mm512_stream_ps
+  // CHECK: store <16 x float> %{{.*}}, <16 x float>* %{{.*}}, align 64, !nontemporal
+  _mm512_stream_ps(__P, __A); 
+}
+
+__m512d test_mm512_mask_compress_pd(__m512d __W, __mmask8 __U, __m512d __A) {
+  // CHECK-LABEL: @test_mm512_mask_compress_pd
+  // CHECK: @llvm.x86.avx512.mask.compress.pd.512
+  return _mm512_mask_compress_pd(__W, __U, __A); 
+}
+
+__m512d test_mm512_maskz_compress_pd(__mmask8 __U, __m512d __A) {
+  // CHECK-LABEL: @test_mm512_maskz_compress_pd
+  // CHECK: @llvm.x86.avx512.mask.compress.pd.512
+  return _mm512_maskz_compress_pd(__U, __A); 
+}
+
+__m512i test_mm512_mask_compress_epi64(__m512i __W, __mmask8 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_compress_epi64
+  // CHECK: @llvm.x86.avx512.mask.compress.q.512
+  return _mm512_mask_compress_epi64(__W, __U, __A); 
+}
+
+__m512i test_mm512_maskz_compress_epi64(__mmask8 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_compress_epi64
+  // CHECK: @llvm.x86.avx512.mask.compress.q.512
+  return _mm512_maskz_compress_epi64(__U, __A); 
+}
+
+__m512 test_mm512_mask_compress_ps(__m512 __W, __mmask16 __U, __m512 __A) {
+  // CHECK-LABEL: @test_mm512_mask_compress_ps
+  // CHECK: @llvm.x86.avx512.mask.compress.ps.512
+  return _mm512_mask_compress_ps(__W, __U, __A); 
+}
+
+__m512 test_mm512_maskz_compress_ps(__mmask16 __U, __m512 __A) {
+  // CHECK-LABEL: @test_mm512_maskz_compress_ps
+  // CHECK: @llvm.x86.avx512.mask.compress.ps.512
+  return _mm512_maskz_compress_ps(__U, __A); 
+}
+
+__m512i test_mm512_mask_compress_epi32(__m512i __W, __mmask16 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_compress_epi32
+  // CHECK: @llvm.x86.avx512.mask.compress.d.512
+  return _mm512_mask_compress_epi32(__W, __U, __A); 
+}
+
+__m512i test_mm512_maskz_compress_epi32(__mmask16 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_compress_epi32
+  // CHECK: @llvm.x86.avx512.mask.compress.d.512
+  return _mm512_maskz_compress_epi32(__U, __A); 
+}
+
+__mmask8 test_mm_cmp_round_ss_mask(__m128 __X, __m128 __Y) {
+  // CHECK-LABEL: @test_mm_cmp_round_ss_mask
+  // CHECK: @llvm.x86.avx512.mask.cmp
+  return _mm_cmp_round_ss_mask(__X, __Y, 5, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__mmask8 test_mm_mask_cmp_round_ss_mask(__mmask8 __M, __m128 __X, __m128 __Y) {
+  // CHECK-LABEL: @test_mm_mask_cmp_round_ss_mask
+  // CHECK: @llvm.x86.avx512.mask.cmp
+  return _mm_mask_cmp_round_ss_mask(__M, __X, __Y, 5, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__mmask8 test_mm_cmp_ss_mask(__m128 __X, __m128 __Y) {
+  // CHECK-LABEL: @test_mm_cmp_ss_mask
+  // CHECK: @llvm.x86.avx512.mask.cmp
+  return _mm_cmp_ss_mask(__X, __Y, 5); 
+}
+
+__mmask8 test_mm_mask_cmp_ss_mask(__mmask8 __M, __m128 __X, __m128 __Y) {
+  // CHECK-LABEL: @test_mm_mask_cmp_ss_mask
+  // CHECK: @llvm.x86.avx512.mask.cmp
+  return _mm_mask_cmp_ss_mask(__M, __X, __Y, 5); 
+}
+
+__mmask8 test_mm_cmp_round_sd_mask(__m128d __X, __m128d __Y) {
+  // CHECK-LABEL: @test_mm_cmp_round_sd_mask
+  // CHECK: @llvm.x86.avx512.mask.cmp
+  return _mm_cmp_round_sd_mask(__X, __Y, 5, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__mmask8 test_mm_mask_cmp_round_sd_mask(__mmask8 __M, __m128d __X, __m128d __Y) {
+  // CHECK-LABEL: @test_mm_mask_cmp_round_sd_mask
+  // CHECK: @llvm.x86.avx512.mask.cmp
+  return _mm_mask_cmp_round_sd_mask(__M, __X, __Y, 5, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__mmask8 test_mm_cmp_sd_mask(__m128d __X, __m128d __Y) {
+  // CHECK-LABEL: @test_mm_cmp_sd_mask
+  // CHECK: @llvm.x86.avx512.mask.cmp
+  return _mm_cmp_sd_mask(__X, __Y, 5); 
+}
+
+__mmask8 test_mm_mask_cmp_sd_mask(__mmask8 __M, __m128d __X, __m128d __Y) {
+  // CHECK-LABEL: @test_mm_mask_cmp_sd_mask
+  // CHECK: @llvm.x86.avx512.mask.cmp
+  return _mm_mask_cmp_sd_mask(__M, __X, __Y, 5); 
+}
+
+__m512 test_mm512_movehdup_ps(__m512 __A) {
+  // CHECK-LABEL: @test_mm512_movehdup_ps
+  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+  return _mm512_movehdup_ps(__A);
+}
+
+__m512 test_mm512_mask_movehdup_ps(__m512 __W, __mmask16 __U, __m512 __A) {
+  // CHECK-LABEL: @test_mm512_mask_movehdup_ps
+  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+  return _mm512_mask_movehdup_ps(__W, __U, __A);
+}
+
+__m512 test_mm512_maskz_movehdup_ps(__mmask16 __U, __m512 __A) {
+  // CHECK-LABEL: @test_mm512_maskz_movehdup_ps
+  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+  return _mm512_maskz_movehdup_ps(__U, __A);
+}
+
+__m512 test_mm512_moveldup_ps(__m512 __A) {
+  // CHECK-LABEL: @test_mm512_moveldup_ps
+  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+  return _mm512_moveldup_ps(__A);
+}
+
+__m512 test_mm512_mask_moveldup_ps(__m512 __W, __mmask16 __U, __m512 __A) {
+  // CHECK-LABEL: @test_mm512_mask_moveldup_ps
+  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+  return _mm512_mask_moveldup_ps(__W, __U, __A);
+}
+
+__m512 test_mm512_maskz_moveldup_ps(__mmask16 __U, __m512 __A) {
+  // CHECK-LABEL: @test_mm512_maskz_moveldup_ps
+  // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+  return _mm512_maskz_moveldup_ps(__U, __A);
+}
+
+__m512i test_mm512_shuffle_epi32(__m512i __A) {
+  // CHECK-LABEL: @test_mm512_shuffle_epi32
+  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
+  return _mm512_shuffle_epi32(__A, 1); 
+}
+
+__m512i test_mm512_mask_shuffle_epi32(__m512i __W, __mmask16 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_shuffle_epi32
+  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+  return _mm512_mask_shuffle_epi32(__W, __U, __A, 1); 
+}
+
+__m512i test_mm512_maskz_shuffle_epi32(__mmask16 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_shuffle_epi32
+  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 1, i32 0, i32 0, i32 0, i32 5, i32 4, i32 4, i32 4, i32 9, i32 8, i32 8, i32 8, i32 13, i32 12, i32 12, i32 12>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+  return _mm512_maskz_shuffle_epi32(__U, __A, 1); 
+}
+
+__m512d test_mm512_mask_expand_pd(__m512d __W, __mmask8 __U, __m512d __A) {
+  // CHECK-LABEL: @test_mm512_mask_expand_pd
+  // CHECK: @llvm.x86.avx512.mask.expand.pd.512
+  return _mm512_mask_expand_pd(__W, __U, __A); 
+}
+
+__m512d test_mm512_maskz_expand_pd(__mmask8 __U, __m512d __A) {
+  // CHECK-LABEL: @test_mm512_maskz_expand_pd
+  // CHECK: @llvm.x86.avx512.mask.expand.pd.512
+  return _mm512_maskz_expand_pd(__U, __A); 
+}
+
+__m512i test_mm512_mask_expand_epi64(__m512i __W, __mmask8 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_expand_epi64
+  // CHECK: @llvm.x86.avx512.mask.expand.q.512
+  return _mm512_mask_expand_epi64(__W, __U, __A); 
+}
+
+__m512i test_mm512_maskz_expand_epi64(__mmask8 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_expand_epi64
+  // CHECK: @llvm.x86.avx512.mask.expand.q.512
+  return _mm512_maskz_expand_epi64(__U, __A); 
+}
+__m512i test_mm512_mask_expandloadu_epi64(__m512i __W, __mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm512_mask_expandloadu_epi64
+  // CHECK: @llvm.x86.avx512.mask.expand.load.q.512
+  return _mm512_mask_expandloadu_epi64(__W, __U, __P); 
+}
+
+__m512i test_mm512_maskz_expandloadu_epi64(__mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm512_maskz_expandloadu_epi64
+  // CHECK: @llvm.x86.avx512.mask.expand.load.q.512
+  return _mm512_maskz_expandloadu_epi64(__U, __P); 
+}
+
+__m512d test_mm512_mask_expandloadu_pd(__m512d __W, __mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm512_mask_expandloadu_pd
+  // CHECK: @llvm.x86.avx512.mask.expand.load.pd.512
+  return _mm512_mask_expandloadu_pd(__W, __U, __P); 
+}
+
+__m512d test_mm512_maskz_expandloadu_pd(__mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm512_maskz_expandloadu_pd
+  // CHECK: @llvm.x86.avx512.mask.expand.load.pd.512
+  return _mm512_maskz_expandloadu_pd(__U, __P); 
+}
+
+__m512i test_mm512_mask_expandloadu_epi32(__m512i __W, __mmask16 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm512_mask_expandloadu_epi32
+  // CHECK: @llvm.x86.avx512.mask.expand.load.d.512
+  return _mm512_mask_expandloadu_epi32(__W, __U, __P); 
+}
+
+__m512i test_mm512_maskz_expandloadu_epi32(__mmask16 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm512_maskz_expandloadu_epi32
+  // CHECK: @llvm.x86.avx512.mask.expand.load.d.512
+  return _mm512_maskz_expandloadu_epi32(__U, __P); 
+}
+
+__m512 test_mm512_mask_expand_ps(__m512 __W, __mmask16 __U, __m512 __A) {
+  // CHECK-LABEL: @test_mm512_mask_expand_ps
+  // CHECK: @llvm.x86.avx512.mask.expand.ps.512
+  return _mm512_mask_expand_ps(__W, __U, __A); 
+}
+
+__m512 test_mm512_maskz_expand_ps(__mmask16 __U, __m512 __A) {
+  // CHECK-LABEL: @test_mm512_maskz_expand_ps
+  // CHECK: @llvm.x86.avx512.mask.expand.ps.512
+  return _mm512_maskz_expand_ps(__U, __A); 
+}
+
+__m512i test_mm512_mask_expand_epi32(__m512i __W, __mmask16 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_expand_epi32
+  // CHECK: @llvm.x86.avx512.mask.expand.d.512
+  return _mm512_mask_expand_epi32(__W, __U, __A); 
+}
+
+__m512i test_mm512_maskz_expand_epi32(__mmask16 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_maskz_expand_epi32
+  // CHECK: @llvm.x86.avx512.mask.expand.d.512
+  return _mm512_maskz_expand_epi32(__U, __A); 
+}
+__m512d test_mm512_cvt_roundps_pd(__m256 __A) {
+  // CHECK-LABEL: @test_mm512_cvt_roundps_pd
+  // CHECK: @llvm.x86.avx512.mask.cvtps2pd.512
+  return _mm512_cvt_roundps_pd(__A, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m512d test_mm512_mask_cvt_roundps_pd(__m512d __W, __mmask8 __U, __m256 __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvt_roundps_pd
+  // CHECK: @llvm.x86.avx512.mask.cvtps2pd.512
+  return _mm512_mask_cvt_roundps_pd(__W, __U, __A, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m512d test_mm512_maskz_cvt_roundps_pd(__mmask8 __U, __m256 __A) {
+  // CHECK-LABEL: @test_mm512_maskz_cvt_roundps_pd
+  // CHECK: @llvm.x86.avx512.mask.cvtps2pd.512
+  return _mm512_maskz_cvt_roundps_pd(__U, __A, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m512d test_mm512_cvtps_pd(__m256 __A) {
+  // CHECK-LABEL: @test_mm512_cvtps_pd
+  // CHECK: @llvm.x86.avx512.mask.cvtps2pd.512
+  return _mm512_cvtps_pd(__A); 
+}
+
+__m512d test_mm512_mask_cvtps_pd(__m512d __W, __mmask8 __U, __m256 __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtps_pd
+  // CHECK: @llvm.x86.avx512.mask.cvtps2pd.512
+  return _mm512_mask_cvtps_pd(__W, __U, __A); 
+}
+
+__m512d test_mm512_maskz_cvtps_pd(__mmask8 __U, __m256 __A) {
+  // CHECK-LABEL: @test_mm512_maskz_cvtps_pd
+  // CHECK: @llvm.x86.avx512.mask.cvtps2pd.512
+  return _mm512_maskz_cvtps_pd(__U, __A); 
+}
+__m512d test_mm512_mask_mov_pd(__m512d __W, __mmask8 __U, __m512d __A) {
+  // CHECK-LABEL: @test_mm512_mask_mov_pd
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+  return _mm512_mask_mov_pd(__W, __U, __A); 
+}
+
+__m512d test_mm512_maskz_mov_pd(__mmask8 __U, __m512d __A) {
+  // CHECK-LABEL: @test_mm512_maskz_mov_pd
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
+  return _mm512_maskz_mov_pd(__U, __A); 
+}
+
+__m512 test_mm512_mask_mov_ps(__m512 __W, __mmask16 __U, __m512 __A) {
+  // CHECK-LABEL: @test_mm512_mask_mov_ps
+  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+  return _mm512_mask_mov_ps(__W, __U, __A); 
+}
+
+__m512 test_mm512_maskz_mov_ps(__mmask16 __U, __m512 __A) {
+  // CHECK-LABEL: @test_mm512_maskz_mov_ps
+  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+  return _mm512_maskz_mov_ps(__U, __A); 
+}
+
+void test_mm512_mask_compressstoreu_pd(void *__P, __mmask8 __U, __m512d __A) {
+  // CHECK-LABEL: @test_mm512_mask_compressstoreu_pd
+  // CHECK: @llvm.x86.avx512.mask.compress.store.pd.512
+  return _mm512_mask_compressstoreu_pd(__P, __U, __A); 
+}
+
+void test_mm512_mask_compressstoreu_epi64(void *__P, __mmask8 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_compressstoreu_epi64
+  // CHECK: @llvm.x86.avx512.mask.compress.store.q.512
+  return _mm512_mask_compressstoreu_epi64(__P, __U, __A); 
+}
+
+void test_mm512_mask_compressstoreu_ps(void *__P, __mmask16 __U, __m512 __A) {
+  // CHECK-LABEL: @test_mm512_mask_compressstoreu_ps
+  // CHECK: @llvm.x86.avx512.mask.compress.store.ps.512
+  return _mm512_mask_compressstoreu_ps(__P, __U, __A); 
+}
+
+void test_mm512_mask_compressstoreu_epi32(void *__P, __mmask16 __U, __m512i __A) {
+  // CHECK-LABEL: @test_mm512_mask_compressstoreu_epi32
+  // CHECK: @llvm.x86.avx512.mask.compress.store.d.512
+  return _mm512_mask_compressstoreu_epi32(__P, __U, __A); 
+}
+
+__m256i test_mm512_cvtt_roundpd_epu32(__m512d __A) {
+  // CHECK-LABEL: @test_mm512_cvtt_roundpd_epu32
+  // CHECK: @llvm.x86.avx512.mask.cvttpd2udq.512
+  return _mm512_cvtt_roundpd_epu32(__A, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m256i test_mm512_mask_cvtt_roundpd_epu32(__m256i __W, __mmask8 __U, __m512d __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvtt_roundpd_epu32
+  // CHECK: @llvm.x86.avx512.mask.cvttpd2udq.512
+  return _mm512_mask_cvtt_roundpd_epu32(__W, __U, __A, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m256i test_mm512_maskz_cvtt_roundpd_epu32(__mmask8 __U, __m512d __A) {
+  // CHECK-LABEL: @test_mm512_maskz_cvtt_roundpd_epu32
+  // CHECK: @llvm.x86.avx512.mask.cvttpd2udq.512
+  return _mm512_maskz_cvtt_roundpd_epu32(__U, __A, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m256i test_mm512_cvttpd_epu32(__m512d __A) {
+  // CHECK-LABEL: @test_mm512_cvttpd_epu32
+  // CHECK: @llvm.x86.avx512.mask.cvttpd2udq.512
+  return _mm512_cvttpd_epu32(__A); 
+}
+
+__m256i test_mm512_mask_cvttpd_epu32(__m256i __W, __mmask8 __U, __m512d __A) {
+  // CHECK-LABEL: @test_mm512_mask_cvttpd_epu32
+  // CHECK: @llvm.x86.avx512.mask.cvttpd2udq.512
+  return _mm512_mask_cvttpd_epu32(__W, __U, __A); 
+}
+
+__m256i test_mm512_maskz_cvttpd_epu32(__mmask8 __U, __m512d __A) {
+  // CHECK-LABEL: @test_mm512_maskz_cvttpd_epu32
+  // CHECK: @llvm.x86.avx512.mask.cvttpd2udq.512
+  return _mm512_maskz_cvttpd_epu32(__U, __A); 
+}
+
+__m512 test_mm512_castpd_ps (__m512d __A)
+{
+  // CHECK-LABEL: @test_mm512_castpd_ps 
+  // CHECK: bitcast <8 x double> %{{.}} to <16 x float>
+  return _mm512_castpd_ps (__A);
+}
+
+__m512d test_mm512_castps_pd (__m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_castps_pd 
+  // CHECK: bitcast <16 x float> %{{.}} to <8 x double>
+  return _mm512_castps_pd (__A);
+}
+
+__m512i test_mm512_castpd_si512 (__m512d __A)
+{
+  // CHECK-LABEL: @test_mm512_castpd_si512 
+  // CHECK: bitcast <8 x double> %{{.}} to <8 x i64>
+  return _mm512_castpd_si512 (__A);
+}
+
+__m512 test_mm512_castps128_ps512(__m128 __A) {
+  // CHECK-LABEL: @test_mm512_castps128_ps512
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  return _mm512_castps128_ps512(__A); 
+}
+
+__m512d test_mm512_castpd128_pd512(__m128d __A) {
+  // CHECK-LABEL: @test_mm512_castpd128_pd512
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  return _mm512_castpd128_pd512(__A); 
+}
+
+__m512d test_mm512_set1_epi8(char d)
+{
+  // CHECK-LABEL: @test_mm512_set1_epi8
+  // CHECK: insertelement <64 x i8> {{.*}}, i32 0
+  // CHECK: insertelement <64 x i8> {{.*}}, i32 1
+  // CHECK: insertelement <64 x i8> {{.*}}, i32 2
+  // CHECK: insertelement <64 x i8> {{.*}}, i32 3
+  // CHECK: insertelement <64 x i8> {{.*}}, i32 4
+  // CHECK: insertelement <64 x i8> {{.*}}, i32 5
+  // CHECK: insertelement <64 x i8> {{.*}}, i32 6
+  // CHECK: insertelement <64 x i8> {{.*}}, i32 7
+  // CHECK: insertelement <64 x i8> {{.*}}, i32 63
+  return _mm512_set1_epi8(d);
+}
+
+__m512d test_mm512_set1_epi16(short d)
+{
+  // CHECK-LABEL: @test_mm512_set1_epi16
+  // CHECK: insertelement <32 x i16> {{.*}}, i32 0
+  // CHECK: insertelement <32 x i16> {{.*}}, i32 1
+  // CHECK: insertelement <32 x i16> {{.*}}, i32 2
+  // CHECK: insertelement <32 x i16> {{.*}}, i32 3
+  // CHECK: insertelement <32 x i16> {{.*}}, i32 4
+  // CHECK: insertelement <32 x i16> {{.*}}, i32 5
+  // CHECK: insertelement <32 x i16> {{.*}}, i32 6
+  // CHECK: insertelement <32 x i16> {{.*}}, i32 7
+  // CHECK: insertelement <32 x i16> {{.*}}, i32 31
+  return _mm512_set1_epi16(d);
+}
+
+__m512i test_mm512_set4_epi32 (int __A, int __B, int __C, int __D)
+{
+  // CHECK-LABEL: @test_mm512_set4_epi32 
+  // CHECK: insertelement <16 x i32> {{.*}}, i32 15
+  return _mm512_set4_epi32 (__A,__B,__C,__D);
+}
+
+__m512i test_mm512_set4_epi64 (long long __A, long long __B, long long __C, long long __D)
+{
+  // CHECK-LABEL: @test_mm512_set4_epi64 
+  // CHECK: insertelement <8 x i64> {{.*}}, i32 7
+  return _mm512_set4_epi64 (__A,__B,__C,__D);
+}
+
+__m512d test_mm512_set4_pd (double __A, double __B, double __C, double __D)
+{
+  // CHECK-LABEL: @test_mm512_set4_pd 
+  // CHECK: insertelement <8 x double> {{.*}}, i32 7
+  return _mm512_set4_pd (__A,__B,__C,__D);
+}
+
+__m512 test_mm512_set4_ps (float __A, float __B, float __C, float __D)
+{
+  // CHECK-LABEL: @test_mm512_set4_ps 
+  // CHECK: insertelement <16 x float> {{.*}}, i32 15
+  return _mm512_set4_ps (__A,__B,__C,__D);
+}
+
+__m512i test_mm512_setr4_epi32(int e0, int e1, int e2, int e3)
+{
+  // CHECK-LABEL: @test_mm512_setr4_epi32
+  // CHECK: insertelement <16 x i32> {{.*}}, i32 15
+  return _mm512_setr4_epi32(e0, e1, e2, e3);
+}
+
+ __m512i test_mm512_setr4_epi64(long long e0, long long e1, long long e2, long long e3)
+{
+  // CHECK-LABEL: @test_mm512_setr4_epi64
+  // CHECK: insertelement <8 x i64> {{.*}}, i32 7
+  return _mm512_setr4_epi64(e0, e1, e2, e3);
+}
+
+__m512i test_mm512_setr4_pd(double e0, double e1, double e2, double e3)
+{
+  // CHECK-LABEL: @test_mm512_setr4_pd
+  // CHECK: insertelement <8 x double> {{.*}}, i32 7
+  return _mm512_setr4_pd(e0,e1,e2,e3);
+}
+
+ __m512i test_mm512_setr4_ps(float e0, float e1, float e2, float e3)
+{
+  // CHECK-LABEL: @test_mm512_setr4_ps
+  // CHECK: insertelement <16 x float> {{.*}}, i32 15
+  return _mm512_setr4_ps(e0,e1,e2,e3);
+}
+
+__m512d test_mm512_castpd256_pd512(__m256d a)
+{
+  // CHECK-LABEL: @test_mm512_castpd256_pd512
+  // CHECK: shufflevector <4 x double> {{.*}} <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+  return _mm512_castpd256_pd512(a);
+}
+
+__m256d test_mm512_castpd512_pd256 (__m512d __A)
+{
+  // CHECK-LABEL: @test_mm512_castpd512_pd256 
+  // CHECK: shufflevector <8 x double> %{{.}}, <8 x double> %{{.}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  return _mm512_castpd512_pd256 (__A);
+}
+
+__m256 test_mm512_castps512_ps256 (__m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_castps512_ps256 
+  // CHECK: shufflevector <16 x float> %{{.}}, <16 x float> %{{.}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  return _mm512_castps512_ps256 (__A);
+}
+
+__m512i test_mm512_castps_si512 (__m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_castps_si512 
+  // CHECK: bitcast <16 x float> %{{.}} to <8 x i64>
+  return _mm512_castps_si512 (__A);
+}
+__m512i test_mm512_castsi128_si512(__m128i __A) {
+  // CHECK-LABEL: @test_mm512_castsi128_si512
+  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  return _mm512_castsi128_si512(__A); 
+}
+
+__m512i test_mm512_castsi256_si512(__m256i __A) {
+  // CHECK-LABEL: @test_mm512_castsi256_si512
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+  return _mm512_castsi256_si512(__A); 
+}
+
+__m512 test_mm512_castsi512_ps (__m512i __A)
+{
+  // CHECK-LABEL: @test_mm512_castsi512_ps 
+  // CHECK: bitcast <8 x i64> %{{.}} to <16 x float>
+  return _mm512_castsi512_ps (__A);
+}
+
+__m512d test_mm512_castsi512_pd (__m512i __A)
+{
+  // CHECK-LABEL: @test_mm512_castsi512_pd 
+  // CHECK: bitcast <8 x i64> %{{.}} to <8 x double>
+  return _mm512_castsi512_pd (__A);
+}
+
+__m128i test_mm512_castsi512_si128 (__m512i __A)
+{
+  // CHECK-LABEL: @test_mm512_castsi512_si128 
+  // CHECK: shufflevector <8 x i64> %{{.}}, <8 x i64> %{{.}}, <2 x i32> <i32 0, i32 1>
+  return _mm512_castsi512_si128 (__A);
+}
+
+__m256i test_mm512_castsi512_si256 (__m512i __A)
+{
+  // CHECK-LABEL: @test_mm512_castsi512_si256 
+  // CHECK: shufflevector <8 x i64> %{{.}}, <8 x i64> %{{.}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  return _mm512_castsi512_si256 (__A);
+}
+
+__m128 test_mm_cvt_roundsd_ss(__m128 __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_cvt_roundsd_ss
+  // CHECK: @llvm.x86.avx512.mask.cvtsd2ss.round
+  return _mm_cvt_roundsd_ss(__A, __B, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m128 test_mm_mask_cvt_roundsd_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_mask_cvt_roundsd_ss
+  // CHECK: @llvm.x86.avx512.mask.cvtsd2ss.round
+  return _mm_mask_cvt_roundsd_ss(__W, __U, __A, __B, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m128 test_mm_maskz_cvt_roundsd_ss(__mmask8 __U, __m128 __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_maskz_cvt_roundsd_ss
+  // CHECK: @llvm.x86.avx512.mask.cvtsd2ss.round
+  return _mm_maskz_cvt_roundsd_ss(__U, __A, __B, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m128d test_mm_cvt_roundi64_sd(__m128d __A, long long __B) {
+  // CHECK-LABEL: @test_mm_cvt_roundi64_sd
+  // CHECK: @llvm.x86.avx512.cvtsi2sd64
+  return _mm_cvt_roundi64_sd(__A, __B, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m128d test_mm_cvt_roundsi64_sd(__m128d __A, long long __B) {
+  // CHECK-LABEL: @test_mm_cvt_roundsi64_sd
+  // CHECK: @llvm.x86.avx512.cvtsi2sd64
+  return _mm_cvt_roundsi64_sd(__A, __B, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m128 test_mm_cvt_roundsi32_ss(__m128 __A, int __B) {
+  // CHECK-LABEL: @test_mm_cvt_roundsi32_ss
+  // CHECK: @llvm.x86.avx512.cvtsi2ss32
+  return _mm_cvt_roundsi32_ss(__A, __B, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m128 test_mm_cvt_roundi32_ss(__m128 __A, int __B) {
+  // CHECK-LABEL: @test_mm_cvt_roundi32_ss
+  // CHECK: @llvm.x86.avx512.cvtsi2ss32
+  return _mm_cvt_roundi32_ss(__A, __B, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m128 test_mm_cvt_roundsi64_ss(__m128 __A, long long __B) {
+  // CHECK-LABEL: @test_mm_cvt_roundsi64_ss
+  // CHECK: @llvm.x86.avx512.cvtsi2ss64
+  return _mm_cvt_roundsi64_ss(__A, __B, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m128 test_mm_cvt_roundi64_ss(__m128 __A, long long __B) {
+  // CHECK-LABEL: @test_mm_cvt_roundi64_ss
+  // CHECK: @llvm.x86.avx512.cvtsi2ss64
+  return _mm_cvt_roundi64_ss(__A, __B, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m128d test_mm_cvt_roundss_sd(__m128d __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_cvt_roundss_sd
+  // CHECK: @llvm.x86.avx512.mask.cvtss2sd.round
+  return _mm_cvt_roundss_sd(__A, __B, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m128d test_mm_mask_cvt_roundss_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_mask_cvt_roundss_sd
+  // CHECK: @llvm.x86.avx512.mask.cvtss2sd.round
+  return _mm_mask_cvt_roundss_sd(__W, __U, __A, __B, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m128d test_mm_maskz_cvt_roundss_sd( __mmask8 __U, __m128d __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_maskz_cvt_roundss_sd
+  // CHECK: @llvm.x86.avx512.mask.cvtss2sd.round
+  return _mm_maskz_cvt_roundss_sd( __U, __A, __B, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m128d test_mm_cvtu32_sd(__m128d __A, unsigned __B) {
+  // CHECK-LABEL: @test_mm_cvtu32_sd
+  // CHECK: @llvm.x86.avx512.cvtusi2sd
+  return _mm_cvtu32_sd(__A, __B); 
+}
+
+__m128d test_mm_cvt_roundu64_sd(__m128d __A, unsigned long long __B) {
+  // CHECK-LABEL: @test_mm_cvt_roundu64_sd
+  // CHECK: @llvm.x86.avx512.cvtusi642sd
+  return _mm_cvt_roundu64_sd(__A, __B, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m128d test_mm_cvtu64_sd(__m128d __A, unsigned long long __B) {
+  // CHECK-LABEL: @test_mm_cvtu64_sd
+  // CHECK: @llvm.x86.avx512.cvtusi642sd
+  return _mm_cvtu64_sd(__A, __B); 
+}
+
+__m128 test_mm_cvt_roundu32_ss(__m128 __A, unsigned __B) {
+  // CHECK-LABEL: @test_mm_cvt_roundu32_ss
+  // CHECK: @llvm.x86.avx512.cvtusi2ss
+  return _mm_cvt_roundu32_ss(__A, __B, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m128 test_mm_cvtu32_ss(__m128 __A, unsigned __B) {
+  // CHECK-LABEL: @test_mm_cvtu32_ss
+  // CHECK: @llvm.x86.avx512.cvtusi2ss
+  return _mm_cvtu32_ss(__A, __B); 
+}
+
+__m128 test_mm_cvt_roundu64_ss(__m128 __A, unsigned long long __B) {
+  // CHECK-LABEL: @test_mm_cvt_roundu64_ss
+  // CHECK: @llvm.x86.avx512.cvtusi642ss
+    return _mm_cvt_roundu64_ss(__A, __B, _MM_FROUND_CUR_DIRECTION); 
+}
+
+__m128 test_mm_cvtu64_ss(__m128 __A, unsigned long long __B) {
+  // CHECK-LABEL: @test_mm_cvtu64_ss
+  // CHECK: @llvm.x86.avx512.cvtusi642ss
+  return _mm_cvtu64_ss(__A, __B); 
+}
+
+__m512i test_mm512_mask_cvttps_epu32 (__m512i __W, __mmask16 __U, __m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_cvttps_epu32 
+  // CHECK: @llvm.x86.avx512.mask.cvttps2udq.512
+  return _mm512_mask_cvttps_epu32 (__W,__U,__A);
+}
+
+__m512i test_mm512_maskz_cvttps_epu32 (__mmask16 __U, __m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_cvttps_epu32 
+  // CHECK: @llvm.x86.avx512.mask.cvttps2udq.512
+  return _mm512_maskz_cvttps_epu32 (__U,__A);
+}
+
+__m512 test_mm512_cvtepu32_ps (__m512i __A)
+{
+  // CHECK-LABEL: @test_mm512_cvtepu32_ps 
+  // CHECK:  @llvm.x86.avx512.mask.cvtudq2ps.512
+  return _mm512_cvtepu32_ps (__A);
+}
+
+__m512 test_mm512_mask_cvtepu32_ps (__m512 __W, __mmask16 __U, __m512i __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_cvtepu32_ps 
+  // CHECK: @llvm.x86.avx512.mask.cvtudq2ps.512
+  return _mm512_mask_cvtepu32_ps (__W,__U,__A);
+}
+
+__m512 test_mm512_maskz_cvtepu32_ps (__mmask16 __U, __m512i __A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_cvtepu32_ps 
+  // CHECK: @llvm.x86.avx512.mask.cvtudq2ps.512
+  return _mm512_maskz_cvtepu32_ps (__U,__A);
+}
+
+__m512d test_mm512_mask_cvtepi32_pd (__m512d __W, __mmask8 __U, __m256i __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_cvtepi32_pd 
+  // CHECK: @llvm.x86.avx512.mask.cvtdq2pd.512
+  return _mm512_mask_cvtepi32_pd (__W,__U,__A);
+}
+
+__m512d test_mm512_maskz_cvtepi32_pd (__mmask8 __U, __m256i __A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_cvtepi32_pd 
+  // CHECK: @llvm.x86.avx512.mask.cvtdq2pd.512
+  return _mm512_maskz_cvtepi32_pd (__U,__A);
+}
+
+__m512 test_mm512_cvtepi32_ps (__m512i __A)
+{
+  // CHECK-LABEL: @test_mm512_cvtepi32_ps 
+  // CHECK:  @llvm.x86.avx512.mask.cvtdq2ps.512
+  return _mm512_cvtepi32_ps (__A);
+}
+
+__m512 test_mm512_mask_cvtepi32_ps (__m512 __W, __mmask16 __U, __m512i __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_cvtepi32_ps 
+  // CHECK: @llvm.x86.avx512.mask.cvtdq2ps.512
+  return _mm512_mask_cvtepi32_ps (__W,__U,__A);
+}
+
+__m512 test_mm512_maskz_cvtepi32_ps (__mmask16 __U, __m512i __A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_cvtepi32_ps 
+  // CHECK: @llvm.x86.avx512.mask.cvtdq2ps.512
+  return _mm512_maskz_cvtepi32_ps (__U,__A);
+}
+
+__m512d test_mm512_mask_cvtepu32_pd (__m512d __W, __mmask8 __U, __m256i __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_cvtepu32_pd 
+  // CHECK: @llvm.x86.avx512.mask.cvtudq2pd.512
+  return _mm512_mask_cvtepu32_pd (__W,__U,__A);
+}
+
+__m512d test_mm512_maskz_cvtepu32_pd (__mmask8 __U, __m256i __A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_cvtepu32_pd 
+  // CHECK: @llvm.x86.avx512.mask.cvtudq2pd.512
+  return _mm512_maskz_cvtepu32_pd (__U,__A);
+}
+
+__m256 test_mm512_cvtpd_ps (__m512d __A)
+{
+  // CHECK-LABEL: @test_mm512_cvtpd_ps 
+  // CHECK: @llvm.x86.avx512.mask.cvtpd2ps.512
+  return _mm512_cvtpd_ps (__A);
+}
+
+__m256 test_mm512_mask_cvtpd_ps (__m256 __W, __mmask8 __U, __m512d __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_cvtpd_ps 
+  // CHECK: @llvm.x86.avx512.mask.cvtpd2ps.512
+  return _mm512_mask_cvtpd_ps (__W,__U,__A);
+}
+
+__m256 test_mm512_maskz_cvtpd_ps (__mmask8 __U, __m512d __A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_cvtpd_ps 
+  // CHECK: @llvm.x86.avx512.mask.cvtpd2ps.512
+  return _mm512_maskz_cvtpd_ps (__U,__A);
+}
+
+__m512 test_mm512_mask_cvtph_ps (__m512 __W, __mmask16 __U, __m256i __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_cvtph_ps 
+  // CHECK: @llvm.x86.avx512.mask.vcvtph2ps.512
+  return _mm512_mask_cvtph_ps (__W,__U,__A);
+}
+
+__m512 test_mm512_maskz_cvtph_ps (__mmask16 __U, __m256i __A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_cvtph_ps 
+  // CHECK: @llvm.x86.avx512.mask.vcvtph2ps.512
+  return _mm512_maskz_cvtph_ps (__U,__A);
+}
+
+__m256i test_mm512_mask_cvttpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_cvttpd_epi32 
+  // CHECK: @llvm.x86.avx512.mask.cvttpd2dq.512
+  return _mm512_mask_cvttpd_epi32 (__W,__U,__A);
+}
+
+__m256i test_mm512_maskz_cvttpd_epi32 (__mmask8 __U, __m512d __A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_cvttpd_epi32 
+  // CHECK: @llvm.x86.avx512.mask.cvttpd2dq.512
+  return _mm512_maskz_cvttpd_epi32 (__U,__A);
+}
+
+__m512i test_mm512_mask_cvttps_epi32 (__m512i __W, __mmask16 __U, __m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_cvttps_epi32 
+  // CHECK: @llvm.x86.avx512.mask.cvttps2dq.512
+  return _mm512_mask_cvttps_epi32 (__W,__U,__A);
+}
+
+__m512i test_mm512_maskz_cvttps_epi32 (__mmask16 __U, __m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_cvttps_epi32 
+  // CHECK: @llvm.x86.avx512.mask.cvttps2dq.512
+  return _mm512_maskz_cvttps_epi32 (__U,__A);
+}
+
+__m512i test_mm512_cvtps_epi32 (__m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_cvtps_epi32 
+  // CHECK: @llvm.x86.avx512.mask.cvtps2dq.512
+  return _mm512_cvtps_epi32 (__A);
+}
+
+__m512i test_mm512_mask_cvtps_epi32 (__m512i __W, __mmask16 __U, __m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_cvtps_epi32 
+  // CHECK: @llvm.x86.avx512.mask.cvtps2dq.512
+  return _mm512_mask_cvtps_epi32 (__W,__U,__A);
+}
+
+__m512i test_mm512_maskz_cvtps_epi32 (__mmask16 __U, __m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_cvtps_epi32 
+  // CHECK: @llvm.x86.avx512.mask.cvtps2dq.512
+  return _mm512_maskz_cvtps_epi32 (__U,__A);
+}
+
+__m256i test_mm512_cvtpd_epi32 (__m512d __A)
+{
+  // CHECK-LABEL: @test_mm512_cvtpd_epi32 
+  // CHECK: @llvm.x86.avx512.mask.cvtpd2dq.512
+  return _mm512_cvtpd_epi32 (__A);
+}
+
+__m256i test_mm512_mask_cvtpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_cvtpd_epi32 
+  // CHECK: @llvm.x86.avx512.mask.cvtpd2dq.512
+  return _mm512_mask_cvtpd_epi32 (__W,__U,__A);
+}
+
+__m256i test_mm512_maskz_cvtpd_epi32 (__mmask8 __U, __m512d __A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_cvtpd_epi32 
+  // CHECK: @llvm.x86.avx512.mask.cvtpd2dq.512
+  return _mm512_maskz_cvtpd_epi32 (__U,__A);
+}
+
+__m256i test_mm512_cvtpd_epu32 (__m512d __A)
+{
+  // CHECK-LABEL: @test_mm512_cvtpd_epu32 
+  // CHECK: @llvm.x86.avx512.mask.cvtpd2udq.512
+  return _mm512_cvtpd_epu32 (__A);
+}
+
+__m256i test_mm512_mask_cvtpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_cvtpd_epu32 
+  // CHECK: @llvm.x86.avx512.mask.cvtpd2udq.512
+  return _mm512_mask_cvtpd_epu32 (__W,__U,__A);
+}
+
+__m256i test_mm512_maskz_cvtpd_epu32 (__mmask8 __U, __m512d __A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_cvtpd_epu32 
+  // CHECK: @llvm.x86.avx512.mask.cvtpd2udq.512
+  return _mm512_maskz_cvtpd_epu32 (__U,__A);
+}
+
+__m256i test_mm512_mask_cvtps_ph(__m256i src, __mmask16 k, __m512 a) 
+{
+  // CHECK-LABEL: @test_mm512_mask_cvtps_ph
+  // CHECK: @llvm.x86.avx512.mask.vcvtps2ph.512
+  return _mm512_mask_cvtps_ph(src, k, a,_MM_FROUND_CUR_DIRECTION);
+}
+
+__m256i test_mm512_maskz_cvtps_ph (__mmask16 k, __m512 a) 
+{
+  // CHECK-LABEL: @test_mm512_maskz_cvtps_ph
+  // CHECK: @llvm.x86.avx512.mask.vcvtps2ph.512
+  return _mm512_maskz_cvtps_ph( k, a,_MM_FROUND_CUR_DIRECTION);
+}
+
+__m512i test_mm512_cvtps_epu32 ( __m512 __A) 
+{
+  // CHECK-LABEL: @test_mm512_cvtps_epu32
+  // CHECK: @llvm.x86.avx512.mask.cvtps2udq.512
+  return _mm512_cvtps_epu32(__A);
+}
+
+__m512i test_mm512_mask_cvtps_epu32 (__m512i __W, __mmask16 __U, __m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_cvtps_epu32
+  // CHECK: @llvm.x86.avx512.mask.cvtps2udq.512
+  return _mm512_mask_cvtps_epu32( __W, __U, __A);
+}
+__m512i test_mm512_maskz_cvtps_epu32 (__mmask16 __U, __m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_cvtps_epu32
+  // CHECK: @llvm.x86.avx512.mask.cvtps2udq.512
+  return _mm512_maskz_cvtps_epu32( __U, __A);
+}
+
+__m512d test_mm512_mask_max_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
+{
+  // CHECK-LABEL: @test_mm512_mask_max_pd 
+  // CHECK: @llvm.x86.avx512.mask.max.pd.512
+  return _mm512_mask_max_pd (__W,__U,__A,__B);
+}
+
+__m512d test_mm512_maskz_max_pd (__mmask8 __U, __m512d __A, __m512d __B)
+{
+  // CHECK-LABEL: @test_mm512_maskz_max_pd 
+  // CHECK: @llvm.x86.avx512.mask.max.pd.512
+  return _mm512_maskz_max_pd (__U,__A,__B);
+}
+
+__m512 test_mm512_mask_max_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+  // CHECK-LABEL: @test_mm512_mask_max_ps 
+  // CHECK: @llvm.x86.avx512.mask.max.ps.512
+  return _mm512_mask_max_ps (__W,__U,__A,__B);
+}
+
+__m512d test_mm512_mask_max_round_pd(__m512d __W,__mmask8 __U,__m512d __A,__m512d __B)
+{
+  // CHECK-LABEL: @test_mm512_mask_max_round_pd
+  // CHECK: @llvm.x86.avx512.mask.max.pd.512
+  return _mm512_mask_max_round_pd(__W,__U,__A,__B,_MM_FROUND_CUR_DIRECTION);
+}
+
+__m512d test_mm512_maskz_max_round_pd(__mmask8 __U,__m512d __A,__m512d __B)
+{
+  // CHECK-LABEL: @test_mm512_maskz_max_round_pd
+  // CHECK: @llvm.x86.avx512.mask.max.pd.512
+  return _mm512_maskz_max_round_pd(__U,__A,__B,_MM_FROUND_CUR_DIRECTION);
+}
+
+__m512d test_mm512_max_round_pd(__m512d __A,__m512d __B)
+{
+  // CHECK-LABEL: @test_mm512_max_round_pd
+  // CHECK: @llvm.x86.avx512.mask.max.pd.512
+  return _mm512_max_round_pd(__A,__B,_MM_FROUND_CUR_DIRECTION);
+}
+
+__m512 test_mm512_maskz_max_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+  // CHECK-LABEL: @test_mm512_maskz_max_ps 
+  // CHECK: @llvm.x86.avx512.mask.max.ps.512
+  return _mm512_maskz_max_ps (__U,__A,__B);
+}
+
+__m512 test_mm512_mask_max_round_ps(__m512 __W,__mmask16 __U,__m512 __A,__m512 __B)
+{
+  // CHECK-LABEL: @test_mm512_mask_max_round_ps
+  // CHECK: @llvm.x86.avx512.mask.max.ps.512
+  return _mm512_mask_max_round_ps(__W,__U,__A,__B,_MM_FROUND_CUR_DIRECTION);
+}
+
+__m512 test_mm512_maskz_max_round_ps(__mmask16 __U,__m512 __A,__m512 __B)
+{
+  // CHECK-LABEL: @test_mm512_maskz_max_round_ps
+  // CHECK: @llvm.x86.avx512.mask.max.ps.512
+  return _mm512_maskz_max_round_ps(__U,__A,__B,_MM_FROUND_CUR_DIRECTION);
+}
+
+__m512 test_mm512_max_round_ps(__m512 __A,__m512 __B)
+{
+  // CHECK-LABEL: @test_mm512_max_round_ps
+  // CHECK: @llvm.x86.avx512.mask.max.ps.512
+  return _mm512_max_round_ps(__A,__B,_MM_FROUND_CUR_DIRECTION);
+}
+
+__m512d test_mm512_mask_min_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
+{
+  // CHECK-LABEL: @test_mm512_mask_min_pd 
+  // CHECK: @llvm.x86.avx512.mask.min.pd.512
+  return _mm512_mask_min_pd (__W,__U,__A,__B);
+}
+
+__m512d test_mm512_maskz_min_pd (__mmask8 __U, __m512d __A, __m512d __B)
+{
+  // CHECK-LABEL: @test_mm512_maskz_min_pd 
+  // CHECK: @llvm.x86.avx512.mask.min.pd.512
+  return _mm512_maskz_min_pd (__U,__A,__B);
+}
+
+__m512d test_mm512_mask_min_round_pd(__m512d __W,__mmask8 __U,__m512d __A,__m512d __B)
+{
+  // CHECK-LABEL: @test_mm512_mask_min_round_pd
+  // CHECK: @llvm.x86.avx512.mask.min.pd.512
+  return _mm512_mask_min_round_pd(__W,__U,__A,__B,_MM_FROUND_CUR_DIRECTION);
+}
+
+__m512d test_mm512_maskz_min_round_pd(__mmask8 __U,__m512d __A,__m512d __B)
+{
+  // CHECK-LABEL: @test_mm512_maskz_min_round_pd
+  // CHECK: @llvm.x86.avx512.mask.min.pd.512
+  return _mm512_maskz_min_round_pd(__U,__A,__B,_MM_FROUND_CUR_DIRECTION);
+}
+
+__m512d test_mm512_min_round_pd( __m512d __A,__m512d __B)
+{
+  // CHECK-LABEL: @test_mm512_min_round_pd
+  // CHECK: @llvm.x86.avx512.mask.min.pd.512
+  return _mm512_min_round_pd(__A,__B,_MM_FROUND_CUR_DIRECTION);
+}
+
+__m512 test_mm512_mask_min_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+  // CHECK-LABEL: @test_mm512_mask_min_ps 
+  // CHECK: @llvm.x86.avx512.mask.min.ps.512
+  return _mm512_mask_min_ps (__W,__U,__A,__B);
+}
+
+__m512 test_mm512_maskz_min_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+  // CHECK-LABEL: @test_mm512_maskz_min_ps 
+  // CHECK: @llvm.x86.avx512.mask.min.ps.512
+  return _mm512_maskz_min_ps (__U,__A,__B);
+}
+
+__m512 test_mm512_mask_min_round_ps(__m512 __W,__mmask16 __U,__m512 __A,__m512 __B)
+{
+  // CHECK-LABEL: @test_mm512_mask_min_round_ps
+  // CHECK: @llvm.x86.avx512.mask.min.ps.512
+  return _mm512_mask_min_round_ps(__W,__U,__A,__B,_MM_FROUND_CUR_DIRECTION);
+}
+
+__m512 test_mm512_maskz_min_round_ps(__mmask16 __U,__m512 __A,__m512 __B)
+{
+  // CHECK-LABEL: @test_mm512_maskz_min_round_ps
+  // CHECK: @llvm.x86.avx512.mask.min.ps.512
+  return _mm512_maskz_min_round_ps(__U,__A,__B,_MM_FROUND_CUR_DIRECTION);
+}
+
+__m512 test_mm512_min_round_ps(__m512 __A,__m512 __B)
+{
+  // CHECK-LABEL: @test_mm512_min_round_ps
+  // CHECK: @llvm.x86.avx512.mask.min.ps.512
+  return _mm512_min_round_ps(__A,__B,_MM_FROUND_CUR_DIRECTION);
+}
+
+__m512 test_mm512_mask_floor_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_floor_ps 
+  // CHECK: @llvm.x86.avx512.mask.rndscale.ps.512
+  return _mm512_mask_floor_ps (__W,__U,__A);
+}
+
+__m512d test_mm512_mask_floor_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_floor_pd 
+  // CHECK: @llvm.x86.avx512.mask.rndscale.pd.512
+  return _mm512_mask_floor_pd (__W,__U,__A);
+}
+
+__m512 test_mm512_mask_ceil_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_ceil_ps 
+  // CHECK: @llvm.x86.avx512.mask.rndscale.ps.512
+  return _mm512_mask_ceil_ps (__W,__U,__A);
+}
+
+__m512d test_mm512_mask_ceil_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_ceil_pd 
+  // CHECK: @llvm.x86.avx512.mask.rndscale.pd.512
+  return _mm512_mask_ceil_pd (__W,__U,__A);
+}
+
+__m512 test_mm512_mask_roundscale_ps(__m512 __W, __mmask16 __U, __m512 __A) 
+{
+  // CHECK-LABEL: @test_mm512_mask_roundscale_ps
+  // CHECK: @llvm.x86.avx512.mask.rndscale.ps.512
+  return _mm512_mask_roundscale_ps(__W,__U,__A, 1);
+}
+
+__m512 test_mm512_maskz_roundscale_ps(__mmask16 __U, __m512 __A) 
+{
+  // CHECK-LABEL: @test_mm512_maskz_roundscale_ps
+  // CHECK: @llvm.x86.avx512.mask.rndscale.ps.512
+  return _mm512_maskz_roundscale_ps(__U,__A, 1);
+}
+
+__m512 test_mm512_mask_roundscale_round_ps(__m512 __A,__mmask16 __U,__m512 __C)
+{
+  // CHECK-LABEL: @test_mm512_mask_roundscale_round_ps
+  // CHECK: @llvm.x86.avx512.mask.rndscale.ps.512
+  return _mm512_mask_roundscale_round_ps(__A,__U,__C,3,_MM_FROUND_CUR_DIRECTION);
+}
+
+__m512 test_mm512_maskz_roundscale_round_ps(__m512 __A,__mmask16 __U) 
+{
+  // CHECK-LABEL: @test_mm512_maskz_roundscale_round_ps
+  // CHECK: @llvm.x86.avx512.mask.rndscale.ps.512
+  return _mm512_maskz_roundscale_round_ps(__U,__A,3,_MM_FROUND_CUR_DIRECTION);
+}
+
+__m512 test_mm512_roundscale_round_ps(__m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_roundscale_round_ps
+  // CHECK: @llvm.x86.avx512.mask.rndscale.ps.512
+  return _mm512_roundscale_round_ps(__A,3,_MM_FROUND_CUR_DIRECTION);
+}
+
+__m512d test_mm512_mask_roundscale_pd(__m512d __W, __mmask8 __U, __m512d __A) 
+{
+  // CHECK-LABEL: @test_mm512_mask_roundscale_pd
+  // CHECK: @llvm.x86.avx512.mask.rndscale.pd.512
+  return _mm512_mask_roundscale_pd(__W,__U,__A, 1);
+}
+
+__m512d test_mm512_maskz_roundscale_pd(__mmask8 __U, __m512d __A) 
+{
+  // CHECK-LABEL: @test_mm512_maskz_roundscale_pd
+  // CHECK: @llvm.x86.avx512.mask.rndscale.pd.512
+  return _mm512_maskz_roundscale_pd(__U,__A, 1);
+}
+
+__m512d test_mm512_mask_roundscale_round_pd(__m512d __A,__mmask8 __U,__m512d __C)
+{
+  // CHECK-LABEL: @test_mm512_mask_roundscale_round_pd
+  // CHECK: @llvm.x86.avx512.mask.rndscale.pd.512
+  return _mm512_mask_roundscale_round_pd(__A,__U,__C,3,_MM_FROUND_CUR_DIRECTION);
+}
+
+__m512d test_mm512_maskz_roundscale_round_pd(__m512d __A,__mmask8 __U)
+{
+  // CHECK-LABEL: @test_mm512_maskz_roundscale_round_pd
+  // CHECK: @llvm.x86.avx512.mask.rndscale.pd.512
+  return _mm512_maskz_roundscale_round_pd(__U,__A,3,_MM_FROUND_CUR_DIRECTION);
+}
+
+__m512d test_mm512_roundscale_round_pd(__m512d __A)
+{
+  // CHECK-LABEL: @test_mm512_roundscale_round_pd
+  // CHECK: @llvm.x86.avx512.mask.rndscale.pd.512
+  return _mm512_roundscale_round_pd(__A,3,_MM_FROUND_CUR_DIRECTION);
+}
+
+__m512i test_mm512_mask_max_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
+{
+  // CHECK-LABEL: @test_mm512_mask_max_epi32 
+  // CHECK: @llvm.x86.avx512.mask.pmaxs.d.512
+  return _mm512_mask_max_epi32 (__W,__M,__A,__B);
+}
+
+__m512i test_mm512_maskz_max_epi32 (__mmask16 __M, __m512i __A, __m512i __B)
+{
+  // CHECK-LABEL: @test_mm512_maskz_max_epi32 
+  // CHECK: @llvm.x86.avx512.mask.pmaxs.d.512
+  return _mm512_maskz_max_epi32 (__M,__A,__B);
+}
+
+__m512i test_mm512_mask_max_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
+{
+  // CHECK-LABEL: @test_mm512_mask_max_epi64 
+  // CHECK: @llvm.x86.avx512.mask.pmaxs.q.512
+  return _mm512_mask_max_epi64 (__W,__M,__A,__B);
+}
+
+__m512i test_mm512_maskz_max_epi64 (__mmask8 __M, __m512i __A, __m512i __B)
+{
+  // CHECK-LABEL: @test_mm512_maskz_max_epi64 
+  // CHECK: @llvm.x86.avx512.mask.pmaxs.q.512
+  return _mm512_maskz_max_epi64 (__M,__A,__B);
+}
+
+__m512i test_mm512_mask_max_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
+{
+  // CHECK-LABEL: @test_mm512_mask_max_epu64 
+  // CHECK: @llvm.x86.avx512.mask.pmaxu.q.512
+  return _mm512_mask_max_epu64 (__W,__M,__A,__B);
+}
+
+__m512i test_mm512_maskz_max_epu64 (__mmask8 __M, __m512i __A, __m512i __B)
+{
+  // CHECK-LABEL: @test_mm512_maskz_max_epu64 
+  // CHECK: @llvm.x86.avx512.mask.pmaxu.q.512
+  return _mm512_maskz_max_epu64 (__M,__A,__B);
+}
+
+__m512i test_mm512_mask_max_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
+{
+  // CHECK-LABEL: @test_mm512_mask_max_epu32 
+  // CHECK: @llvm.x86.avx512.mask.pmaxu.d.512
+  return _mm512_mask_max_epu32 (__W,__M,__A,__B);
+}
+
+__m512i test_mm512_maskz_max_epu32 (__mmask16 __M, __m512i __A, __m512i __B)
+{
+  // CHECK-LABEL: @test_mm512_maskz_max_epu32 
+  // CHECK: @llvm.x86.avx512.mask.pmaxu.d.512
+  return _mm512_maskz_max_epu32 (__M,__A,__B);
+}
+
+__m512i test_mm512_mask_min_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
+{
+  // CHECK-LABEL: @test_mm512_mask_min_epi32 
+  // CHECK: @llvm.x86.avx512.mask.pmins.d.512
+  return _mm512_mask_min_epi32 (__W,__M,__A,__B);
+}
+
+__m512i test_mm512_maskz_min_epi32 (__mmask16 __M, __m512i __A, __m512i __B)
+{
+  // CHECK-LABEL: @test_mm512_maskz_min_epi32 
+  // CHECK: @llvm.x86.avx512.mask.pmins.d.512
+  return _mm512_maskz_min_epi32 (__M,__A,__B);
+}
+
+__m512i test_mm512_mask_min_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
+{
+  // CHECK-LABEL: @test_mm512_mask_min_epu32 
+  // CHECK: @llvm.x86.avx512.mask.pminu.d.512
+  return _mm512_mask_min_epu32 (__W,__M,__A,__B);
+}
+
+__m512i test_mm512_maskz_min_epu32 (__mmask16 __M, __m512i __A, __m512i __B)
+{
+  // CHECK-LABEL: @test_mm512_maskz_min_epu32 
+  // CHECK: @llvm.x86.avx512.mask.pminu.d.512
+  return _mm512_maskz_min_epu32 (__M,__A,__B);
+}
+
+__m512i test_mm512_mask_min_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
+{
+  // CHECK-LABEL: @test_mm512_mask_min_epi64 
+  // CHECK: @llvm.x86.avx512.mask.pmins.q.512
+  return _mm512_mask_min_epi64 (__W,__M,__A,__B);
+}
+
+__m512i test_mm512_maskz_min_epi64 (__mmask8 __M, __m512i __A, __m512i __B)
+{
+  // CHECK-LABEL: @test_mm512_maskz_min_epi64 
+  // CHECK: @llvm.x86.avx512.mask.pmins.q.512
+  return _mm512_maskz_min_epi64 (__M,__A,__B);
+}
+
+__m512i test_mm512_mask_min_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
+{
+  // CHECK-LABEL: @test_mm512_mask_min_epu64 
+  // CHECK: @llvm.x86.avx512.mask.pminu.q.512
+  return _mm512_mask_min_epu64 (__W,__M,__A,__B);
+}
+
+__m512i test_mm512_maskz_min_epu64 (__mmask8 __M, __m512i __A, __m512i __B)
+{
+  // CHECK-LABEL: @test_mm512_maskz_min_epu64 
+  // CHECK: @llvm.x86.avx512.mask.pminu.q.512
+  return _mm512_maskz_min_epu64 (__M,__A,__B);
+}
+
+__m512i test_mm512_mask_set1_epi32 (__m512i __O, __mmask16 __M, int __A)
+{
+    //CHECK-LABEL: @test_mm512_mask_set1_epi32
+    //CHECK: @llvm.x86.avx512.mask.pbroadcast.d.gpr.512
+  return _mm512_mask_set1_epi32 ( __O, __M, __A);
+}
+
+__m512i test_mm512_set_epi32 (int __A, int __B, int __C, int __D,
+               int __E, int __F, int __G, int __H,
+               int __I, int __J, int __K, int __L,
+               int __M, int __N, int __O, int __P)
+{
+ //CHECK-LABEL: @test_mm512_set_epi32
+ //CHECK: insertelement{{.*}}i32 0
+    //CHECK: insertelement{{.*}}i32 1
+    //CHECK: insertelement{{.*}}i32 2
+    //CHECK: insertelement{{.*}}i32 3
+    //CHECK: insertelement{{.*}}i32 4
+    //CHECK: insertelement{{.*}}i32 5
+    //CHECK: insertelement{{.*}}i32 6
+    //CHECK: insertelement{{.*}}i32 7
+    //CHECK: insertelement{{.*}}i32 8
+    //CHECK: insertelement{{.*}}i32 9
+    //CHECK: insertelement{{.*}}i32 10
+    //CHECK: insertelement{{.*}}i32 11
+    //CHECK: insertelement{{.*}}i32 12
+    //CHECK: insertelement{{.*}}i32 13
+    //CHECK: insertelement{{.*}}i32 14
+    //CHECK: insertelement{{.*}}i32 15
+ return _mm512_set_epi32( __A, __B, __C, __D,__E, __F, __G, __H,
+              __I, __J, __K, __L,__M, __N, __O, __P);
+}
+
+__m512i test_mm512_setr_epi32 (int __A, int __B, int __C, int __D,
+               int __E, int __F, int __G, int __H,
+               int __I, int __J, int __K, int __L,
+               int __M, int __N, int __O, int __P)
+{
+    //CHECK-LABEL: @test_mm512_setr_epi32
+    //CHECK: load{{.*}}%__P.addr, align 4
+    //CHECK: load{{.*}}%__O.addr, align 4
+    //CHECK: load{{.*}}%__N.addr, align 4
+    //CHECK: load{{.*}}%__M.addr, align 4
+    //CHECK: load{{.*}}%__L.addr, align 4
+    //CHECK: load{{.*}}%__K.addr, align 4
+    //CHECK: load{{.*}}%__J.addr, align 4
+    //CHECK: load{{.*}}%__I.addr, align 4
+    //CHECK: load{{.*}}%__H.addr, align 4
+    //CHECK: load{{.*}}%__G.addr, align 4
+    //CHECK: load{{.*}}%__F.addr, align 4
+    //CHECK: load{{.*}}%__E.addr, align 4
+    //CHECK: load{{.*}}%__D.addr, align 4
+    //CHECK: load{{.*}}%__C.addr, align 4
+    //CHECK: load{{.*}}%__B.addr, align 4
+    //CHECK: load{{.*}}%__A.addr, align 4
+    //CHECK: insertelement{{.*}}i32 0
+    //CHECK: insertelement{{.*}}i32 1
+    //CHECK: insertelement{{.*}}i32 2
+    //CHECK: insertelement{{.*}}i32 3
+    //CHECK: insertelement{{.*}}i32 4
+    //CHECK: insertelement{{.*}}i32 5
+    //CHECK: insertelement{{.*}}i32 6
+    //CHECK: insertelement{{.*}}i32 7
+    //CHECK: insertelement{{.*}}i32 8
+    //CHECK: insertelement{{.*}}i32 9
+    //CHECK: insertelement{{.*}}i32 10
+    //CHECK: insertelement{{.*}}i32 11
+    //CHECK: insertelement{{.*}}i32 12
+    //CHECK: insertelement{{.*}}i32 13
+    //CHECK: insertelement{{.*}}i32 14
+    //CHECK: insertelement{{.*}}i32 15
+ return _mm512_setr_epi32( __A, __B, __C, __D,__E, __F, __G, __H,
+              __I, __J, __K, __L,__M, __N, __O, __P);
+}
+
+__m512i test_mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A)
+{
+    //CHECK-LABEL: @test_mm512_mask_set1_epi64
+    //CHECK: @llvm.x86.avx512.mask.pbroadcast.q.gpr.512
+  return _mm512_mask_set1_epi64 (__O, __M, __A);
+}
+
+__m512i test_mm512_set_epi64 (long long __A, long long __B, long long __C,
+                              long long __D, long long __E, long long __F,
+                              long long __G, long long __H)
+{
+    //CHECK-LABEL: @test_mm512_set_epi64
+    //CHECK: insertelement{{.*}}i32 0
+    //CHECK: insertelement{{.*}}i32 1
+    //CHECK: insertelement{{.*}}i32 2
+    //CHECK: insertelement{{.*}}i32 3
+    //CHECK: insertelement{{.*}}i32 4
+    //CHECK: insertelement{{.*}}i32 5
+    //CHECK: insertelement{{.*}}i32 6
+    //CHECK: insertelement{{.*}}i32 7
+  return _mm512_set_epi64(__A, __B, __C, __D, __E, __F, __G, __H );
+}
+
+__m512i test_mm512_setr_epi64 (long long __A, long long __B, long long __C,
+                              long long __D, long long __E, long long __F,
+                              long long __G, long long __H)
+{
+    //CHECK-LABEL: @test_mm512_setr_epi64
+    //CHECK: load{{.*}}%__H.addr, align 8
+    //CHECK: load{{.*}}%__G.addr, align 8
+    //CHECK: load{{.*}}%__F.addr, align 8
+    //CHECK: load{{.*}}%__E.addr, align 8
+    //CHECK: load{{.*}}%__D.addr, align 8
+    //CHECK: load{{.*}}%__C.addr, align 8
+    //CHECK: load{{.*}}%__B.addr, align 8
+    //CHECK: load{{.*}}%__A.addr, align 8
+    //CHECK: insertelement{{.*}}i32 0
+    //CHECK: insertelement{{.*}}i32 1
+    //CHECK: insertelement{{.*}}i32 2
+    //CHECK: insertelement{{.*}}i32 3
+    //CHECK: insertelement{{.*}}i32 4
+    //CHECK: insertelement{{.*}}i32 5
+    //CHECK: insertelement{{.*}}i32 6
+    //CHECK: insertelement{{.*}}i32 7
+  return _mm512_setr_epi64(__A, __B, __C, __D, __E, __F, __G, __H );
+}
+
+__m512d test_mm512_set_pd (double __A, double __B, double __C, double __D,
+                           double __E, double __F, double __G, double __H)
+{
+    //CHECK-LABEL: @test_mm512_set_pd
+    //CHECK: insertelement{{.*}}i32 0
+    //CHECK: insertelement{{.*}}i32 1
+    //CHECK: insertelement{{.*}}i32 2
+    //CHECK: insertelement{{.*}}i32 3
+    //CHECK: insertelement{{.*}}i32 4
+    //CHECK: insertelement{{.*}}i32 5
+    //CHECK: insertelement{{.*}}i32 6
+    //CHECK: insertelement{{.*}}i32 7
+  return _mm512_set_pd( __A, __B, __C, __D, __E, __F, __G, __H);
+}
+
+__m512d test_mm512_setr_pd (double __A, double __B, double __C, double __D,
+                           double __E, double __F, double __G, double __H)
+{
+    //CHECK-LABEL: @test_mm512_setr_pd
+    //CHECK: load{{.*}}%__H.addr, align 8
+    //CHECK: load{{.*}}%__G.addr, align 8
+    //CHECK: load{{.*}}%__F.addr, align 8
+    //CHECK: load{{.*}}%__E.addr, align 8
+    //CHECK: load{{.*}}%__D.addr, align 8
+    //CHECK: load{{.*}}%__C.addr, align 8
+    //CHECK: load{{.*}}%__B.addr, align 8
+    //CHECK: load{{.*}}%__A.addr, align 8
+    //CHECK: insertelement{{.*}}i32 0
+    //CHECK: insertelement{{.*}}i32 1
+    //CHECK: insertelement{{.*}}i32 2
+    //CHECK: insertelement{{.*}}i32 3
+    //CHECK: insertelement{{.*}}i32 4
+    //CHECK: insertelement{{.*}}i32 5
+    //CHECK: insertelement{{.*}}i32 6
+    //CHECK: insertelement{{.*}}i32 7
+  return _mm512_setr_pd( __A, __B, __C, __D, __E, __F, __G, __H);
+}
+
+__m512 test_mm512_set_ps (float __A, float __B, float __C, float __D,
+                          float __E, float __F, float __G, float __H,
+                          float __I, float __J, float __K, float __L,
+                          float __M, float __N, float __O, float __P)
+{
+    //CHECK-LABEL: @test_mm512_set_ps
+    //CHECK: insertelement{{.*}}i32 0
+    //CHECK: insertelement{{.*}}i32 1
+    //CHECK: insertelement{{.*}}i32 2
+    //CHECK: insertelement{{.*}}i32 3
+    //CHECK: insertelement{{.*}}i32 4
+    //CHECK: insertelement{{.*}}i32 5
+    //CHECK: insertelement{{.*}}i32 6
+    //CHECK: insertelement{{.*}}i32 7
+    //CHECK: insertelement{{.*}}i32 8
+    //CHECK: insertelement{{.*}}i32 9
+    //CHECK: insertelement{{.*}}i32 10
+    //CHECK: insertelement{{.*}}i32 11
+    //CHECK: insertelement{{.*}}i32 12
+    //CHECK: insertelement{{.*}}i32 13
+    //CHECK: insertelement{{.*}}i32 14
+    //CHECK: insertelement{{.*}}i32 15
+    return _mm512_set_ps( __A, __B, __C, __D, __E, __F, __G, __H,
+                          __I, __J, __K, __L, __M, __N, __O, __P);
+}
+
+__m512i test_mm512_mask_abs_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_abs_epi64 
+  // CHECK: @llvm.x86.avx512.mask.pabs.q.512
+  return _mm512_mask_abs_epi64 (__W,__U,__A);
+}
+
+__m512i test_mm512_maskz_abs_epi64 (__mmask8 __U, __m512i __A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_abs_epi64 
+  // CHECK: @llvm.x86.avx512.mask.pabs.q.512
+  return _mm512_maskz_abs_epi64 (__U,__A);
+}
+
+__m512i test_mm512_mask_abs_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_abs_epi32 
+  // CHECK: @llvm.x86.avx512.mask.pabs.d.512
+  return _mm512_mask_abs_epi32 (__W,__U,__A);
+}
+
+__m512i test_mm512_maskz_abs_epi32 (__mmask16 __U, __m512i __A)
+{
+  // CHECK-LABEL: @test_mm512_maskz_abs_epi32 
+  // CHECK: @llvm.x86.avx512.mask.pabs.d.512
+  return _mm512_maskz_abs_epi32 (__U,__A);
+}
+
+__m512 test_mm512_setr_ps (float __A, float __B, float __C, float __D,
+                          float __E, float __F, float __G, float __H,
+                          float __I, float __J, float __K, float __L,
+                          float __M, float __N, float __O, float __P)
+{
+    //CHECK-LABEL: @test_mm512_setr_ps
+    //CHECK: load{{.*}}%__P.addr, align 4
+    //CHECK: load{{.*}}%__O.addr, align 4
+    //CHECK: load{{.*}}%__N.addr, align 4
+    //CHECK: load{{.*}}%__M.addr, align 4
+    //CHECK: load{{.*}}%__L.addr, align 4
+    //CHECK: load{{.*}}%__K.addr, align 4
+    //CHECK: load{{.*}}%__J.addr, align 4
+    //CHECK: load{{.*}}%__I.addr, align 4
+    //CHECK: load{{.*}}%__H.addr, align 4
+    //CHECK: load{{.*}}%__G.addr, align 4
+    //CHECK: load{{.*}}%__F.addr, align 4
+    //CHECK: load{{.*}}%__E.addr, align 4
+    //CHECK: load{{.*}}%__D.addr, align 4
+    //CHECK: load{{.*}}%__C.addr, align 4
+    //CHECK: load{{.*}}%__B.addr, align 4
+    //CHECK: load{{.*}}%__A.addr, align 4
+    //CHECK: insertelement{{.*}}i32 0
+    //CHECK: insertelement{{.*}}i32 1
+    //CHECK: insertelement{{.*}}i32 2
+    //CHECK: insertelement{{.*}}i32 3
+    //CHECK: insertelement{{.*}}i32 4
+    //CHECK: insertelement{{.*}}i32 5
+    //CHECK: insertelement{{.*}}i32 6
+    //CHECK: insertelement{{.*}}i32 7
+    //CHECK: insertelement{{.*}}i32 8
+    //CHECK: insertelement{{.*}}i32 9
+    //CHECK: insertelement{{.*}}i32 10
+    //CHECK: insertelement{{.*}}i32 11
+    //CHECK: insertelement{{.*}}i32 12
+    //CHECK: insertelement{{.*}}i32 13
+    //CHECK: insertelement{{.*}}i32 14
+    //CHECK: insertelement{{.*}}i32 15
+    return _mm512_setr_ps( __A, __B, __C, __D, __E, __F, __G, __H,
+                          __I, __J, __K, __L, __M, __N, __O, __P);
+}
+
+int test_mm_cvtss_i32(__m128 A) {
+  // CHECK-LABEL: test_mm_cvtss_i32
+  // CHECK: call i32 @llvm.x86.sse.cvtss2si(<4 x float> %{{.*}})
+  return _mm_cvtss_i32(A);
+}
+
+long long test_mm_cvtss_i64(__m128 A) {
+  // CHECK-LABEL: test_mm_cvtss_i64
+  // CHECK: call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %{{.*}})
+  return _mm_cvtss_i64(A);
+}
+
+__m128d test_mm_cvti32_sd(__m128d A, int B) {
+  // CHECK-LABEL: test_mm_cvti32_sd
+  // CHECK: sitofp i32 %{{.*}} to double
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
+  return _mm_cvti32_sd(A, B);
+}
+
+__m128d test_mm_cvti64_sd(__m128d A, long long B) {
+  // CHECK-LABEL: test_mm_cvti64_sd
+  // CHECK: sitofp i64 %{{.*}} to double
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
+  return _mm_cvti64_sd(A, B);
+}
+
+__m128 test_mm_cvti32_ss(__m128 A, int B) {
+  // CHECK-LABEL: test_mm_cvti32_ss
+  // CHECK: sitofp i32 %{{.*}} to float
+  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
+  return _mm_cvti32_ss(A, B);
+}
+
+__m128 test_mm_cvti64_ss(__m128 A, long long B) {
+  // CHECK-LABEL: test_mm_cvti64_ss
+  // CHECK: sitofp i64 %{{.*}} to float
+  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
+  return _mm_cvti64_ss(A, B);
+}
+
+int test_mm_cvtsd_i32(__m128d A) {
+  // CHECK-LABEL: test_mm_cvtsd_i32
+  // CHECK: call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %{{.*}})
+  return _mm_cvtsd_i32(A);
+}
+
+long long test_mm_cvtsd_i64(__m128d A) {
+  // CHECK-LABEL: test_mm_cvtsd_i64
+  // CHECK: call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %{{.*}})
+  return _mm_cvtsd_i64(A);
+}
+
+__m128d test_mm_mask_cvtss_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_mask_cvtss_sd
+  // CHECK: @llvm.x86.avx512.mask.cvtss2sd.round
+  return _mm_mask_cvtss_sd(__W, __U, __A, __B); 
+}
+
+__m128d test_mm_maskz_cvtss_sd( __mmask8 __U, __m128d __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_maskz_cvtss_sd
+  // CHECK: @llvm.x86.avx512.mask.cvtss2sd.round
+  return _mm_maskz_cvtss_sd( __U, __A, __B); 
+}
+
+__m128 test_mm_mask_cvtsd_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_mask_cvtsd_ss
+  // CHECK: @llvm.x86.avx512.mask.cvtsd2ss.round
+  return _mm_mask_cvtsd_ss(__W, __U, __A, __B); 
+}
+
+__m128 test_mm_maskz_cvtsd_ss(__mmask8 __U, __m128 __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_maskz_cvtsd_ss
+  // CHECK: @llvm.x86.avx512.mask.cvtsd2ss.round
+  return _mm_maskz_cvtsd_ss(__U, __A, __B); 
+}
+
+
+__m512i test_mm512_setzero_epi32()
+{
+  // CHECK-LABEL: @test_mm512_setzero_epi32
+  // CHECK: zeroinitializer
+  return _mm512_setzero_epi32();
+}
+
+__m512i test_mm512_setzero()
+{
+  // CHECK-LABEL: @test_mm512_setzero
+  // CHECK: zeroinitializer
+  return _mm512_setzero();
+}
+
+__m512i test_mm512_setzero_si512()
+{
+  // CHECK-LABEL: @test_mm512_setzero_si512
+  // CHECK: zeroinitializer
+  return _mm512_setzero_si512();
+}
+
+__m512i test_mm512_setzero_ps()
+{
+  // CHECK-LABEL: @test_mm512_setzero_ps
+  // CHECK: zeroinitializer
+  return _mm512_setzero_ps();
+}
+
+__m512d test_mm512_setzero_pd()
+{
+  // CHECK-LABEL: @test_mm512_setzero_pd
+  // CHECK: zeroinitializer
+  return _mm512_setzero_pd();
+}
+
+__m512d test_mm512_abs_pd(__m512d a){
+  // CHECK-LABEL: @test_mm512_abs_pd
+  // CHECK: and <8 x i64> 
+  return _mm512_abs_pd(a);
+}
+
+__m512d test_mm512_mask_abs_pd (__m512d __W, __mmask8 __U, __m512d __A){
+  // CHECK-LABEL: @test_mm512_mask_abs_pd 
+  // CHECK: %[[AND_RES:.*]] = and <8 x i64>
+  // CHECK: %[[MASK:.*]] = bitcast i8 %{{.*}} to <8 x i1>
+  // CHECK: select <8 x i1> %[[MASK]], <8 x i64> %[[AND_RES]], <8 x i64> %{{.*}}
+  return _mm512_mask_abs_pd (__W,__U,__A);
+}
+
+__m512 test_mm512_abs_ps(__m512 a){
+  // CHECK-LABEL: @test_mm512_abs_ps
+  // CHECK: and <16 x i32> 
+  return _mm512_abs_ps(a);
+}
+
+__m512 test_mm512_mask_abs_ps(__m512 __W, __mmask16 __U, __m512 __A){
+  // CHECK-LABEL: @test_mm512_mask_abs_ps
+  // CHECK: and <16 x i32> 
+  // CHECK: %[[MASK:.*]] = bitcast i16 %{{.*}} to <16 x i1>
+  // CHECK: select <16 x i1> %[[MASK]], <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+  return _mm512_mask_abs_ps( __W, __U, __A);
+}
+
diff --git a/test/CodeGen/avx512ifma-builtins.c b/test/CodeGen/avx512ifma-builtins.c
new file mode 100644
index 0000000000000..366d3bec319bd
--- /dev/null
+++ b/test/CodeGen/avx512ifma-builtins.c
@@ -0,0 +1,42 @@
+// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx512ifma -emit-llvm -o - -Werror | FileCheck %s
+
+// Don't include mm_malloc.h, it's system specific.
+#define __MM_MALLOC_H
+
+#include <immintrin.h>
+
+__m512i test_mm512_madd52hi_epu64(__m512i __X, __m512i __Y, __m512i __Z) {
+  // CHECK-LABEL: @test_mm512_madd52hi_epu64
+  // CHECK: @llvm.x86.avx512.mask.vpmadd52h.uq.512
+  return _mm512_madd52hi_epu64(__X, __Y, __Z); 
+}
+
+__m512i test_mm512_mask_madd52hi_epu64(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) {
+  // CHECK-LABEL: @test_mm512_mask_madd52hi_epu64
+  // CHECK: @llvm.x86.avx512.mask.vpmadd52h.uq.512
+  return _mm512_mask_madd52hi_epu64(__W, __M, __X, __Y); 
+}
+
+__m512i test_mm512_maskz_madd52hi_epu64(__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z) {
+  // CHECK-LABEL: @test_mm512_maskz_madd52hi_epu64
+  // CHECK: @llvm.x86.avx512.maskz.vpmadd52h.uq.512
+  return _mm512_maskz_madd52hi_epu64(__M, __X, __Y, __Z); 
+}
+
+__m512i test_mm512_madd52lo_epu64(__m512i __X, __m512i __Y, __m512i __Z) {
+  // CHECK-LABEL: @test_mm512_madd52lo_epu64
+  // CHECK: @llvm.x86.avx512.mask.vpmadd52l.uq.512
+  return _mm512_madd52lo_epu64(__X, __Y, __Z); 
+}
+
+__m512i test_mm512_mask_madd52lo_epu64(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) {
+  // CHECK-LABEL: @test_mm512_mask_madd52lo_epu64
+  // CHECK: @llvm.x86.avx512.mask.vpmadd52l.uq.512
+  return _mm512_mask_madd52lo_epu64(__W, __M, __X, __Y); 
+}
+
+__m512i test_mm512_maskz_madd52lo_epu64(__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z) {
+  // CHECK-LABEL: @test_mm512_maskz_madd52lo_epu64
+  // CHECK: @llvm.x86.avx512.mask.vpmadd52l.uq.512
+  return _mm512_maskz_madd52lo_epu64(__M, __X, __Y, __Z); 
+}
diff --git a/test/CodeGen/avx512ifmavl-builtins.c b/test/CodeGen/avx512ifmavl-builtins.c
new file mode 100644
index 0000000000000..b38c1a8953adf
--- /dev/null
+++ b/test/CodeGen/avx512ifmavl-builtins.c
@@ -0,0 +1,77 @@
+// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx512ifma -target-feature +avx512vl -emit-llvm -o - -Werror | FileCheck %s
+
+#define __MM_MALLOC_H
+
+#include <immintrin.h>
+
+__m128i test_mm_madd52hi_epu64(__m128i __X, __m128i __Y, __m128i __Z) {
+  // CHECK-LABEL: @test_mm_madd52hi_epu64
+  // CHECK: @llvm.x86.avx512.mask.vpmadd52h.uq.128
+  return _mm_madd52hi_epu64(__X, __Y, __Z); 
+}
+
+__m128i test_mm_mask_madd52hi_epu64(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) {
+  // CHECK-LABEL: @test_mm_mask_madd52hi_epu64
+  // CHECK: @llvm.x86.avx512.mask.vpmadd52h.uq.128
+  return _mm_mask_madd52hi_epu64(__W, __M, __X, __Y); 
+}
+
+__m128i test_mm_maskz_madd52hi_epu64(__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z) {
+  // CHECK-LABEL: @test_mm_maskz_madd52hi_epu64
+  // CHECK: @llvm.x86.avx512.maskz.vpmadd52h.uq.128
+  return _mm_maskz_madd52hi_epu64(__M, __X, __Y, __Z); 
+}
+
+__m256i test_mm256_madd52hi_epu64(__m256i __X, __m256i __Y, __m256i __Z) {
+  // CHECK-LABEL: @test_mm256_madd52hi_epu64
+  // CHECK: @llvm.x86.avx512.mask.vpmadd52h.uq.256
+  return _mm256_madd52hi_epu64(__X, __Y, __Z); 
+}
+
+__m256i test_mm256_mask_madd52hi_epu64(__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) {
+  // CHECK-LABEL: @test_mm256_mask_madd52hi_epu64
+  // CHECK: @llvm.x86.avx512.mask.vpmadd52h.uq.256
+  return _mm256_mask_madd52hi_epu64(__W, __M, __X, __Y); 
+}
+
+__m256i test_mm256_maskz_madd52hi_epu64(__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z) {
+  // CHECK-LABEL: @test_mm256_maskz_madd52hi_epu64
+  // CHECK: @llvm.x86.avx512.maskz.vpmadd52h.uq.256
+  return _mm256_maskz_madd52hi_epu64(__M, __X, __Y, __Z); 
+}
+
+__m128i test_mm_madd52lo_epu64(__m128i __X, __m128i __Y, __m128i __Z) {
+  // CHECK-LABEL: @test_mm_madd52lo_epu64
+  // CHECK: @llvm.x86.avx512.mask.vpmadd52l.uq.128
+  return _mm_madd52lo_epu64(__X, __Y, __Z); 
+}
+
+__m128i test_mm_mask_madd52lo_epu64(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) {
+  // CHECK-LABEL: @test_mm_mask_madd52lo_epu64
+  // CHECK: @llvm.x86.avx512.mask.vpmadd52l.uq.128
+  return _mm_mask_madd52lo_epu64(__W, __M, __X, __Y); 
+}
+
+__m128i test_mm_maskz_madd52lo_epu64(__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z) {
+  // CHECK-LABEL: @test_mm_maskz_madd52lo_epu64
+  // CHECK: @llvm.x86.avx512.maskz.vpmadd52l.uq.128
+  return _mm_maskz_madd52lo_epu64(__M, __X, __Y, __Z); 
+}
+
+__m256i test_mm256_madd52lo_epu64(__m256i __X, __m256i __Y, __m256i __Z) {
+  // CHECK-LABEL: @test_mm256_madd52lo_epu64
+  // CHECK: @llvm.x86.avx512.mask.vpmadd52l.uq.256
+  return _mm256_madd52lo_epu64(__X, __Y, __Z); 
+}
+
+__m256i test_mm256_mask_madd52lo_epu64(__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) {
+  // CHECK-LABEL: @test_mm256_mask_madd52lo_epu64
+  // CHECK: @llvm.x86.avx512.mask.vpmadd52l.uq.256
+  return _mm256_mask_madd52lo_epu64(__W, __M, __X, __Y); 
+}
+
+__m256i test_mm256_maskz_madd52lo_epu64(__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z) {
+  // CHECK-LABEL: @test_mm256_maskz_madd52lo_epu64
+  // CHECK: @llvm.x86.avx512.mask.vpmadd52l.uq.256
+  return _mm256_maskz_madd52lo_epu64(__M, __X, __Y, __Z); 
+}
diff --git a/test/CodeGen/avx512pf-builtins.c b/test/CodeGen/avx512pf-builtins.c
new file mode 100644
index 0000000000000..16b27e915136a
--- /dev/null
+++ b/test/CodeGen/avx512pf-builtins.c
@@ -0,0 +1,102 @@
+// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx512pf -emit-llvm -o - -Werror | FileCheck %s
+
+// Don't include mm_malloc.h, it's system specific.
+#define __MM_MALLOC_H
+
+#include <immintrin.h>
+
+void test_mm512_mask_prefetch_i32gather_pd(__m256i index, __mmask8 mask, void const *addr, int hint) {
+  // CHECK-LABEL: @test_mm512_mask_prefetch_i32gather_pd
+  // CHECK: @llvm.x86.avx512.gatherpf.dpd
+  return _mm512_mask_prefetch_i32gather_pd(index, mask, addr, 2, 1); 
+}
+
+void test_mm512_prefetch_i32gather_pd(__m256i index, void const *addr, int hint) {
+  // CHECK-LABEL: @test_mm512_prefetch_i32gather_pd
+  // CHECK: @llvm.x86.avx512.gatherpf.dpd
+  return _mm512_prefetch_i32gather_pd(index, addr, 2, 1); 
+}
+
+void test_mm512_mask_prefetch_i32gather_ps(__m512i index, __mmask16 mask, void const *addr, int hint) {
+  // CHECK-LABEL: @test_mm512_mask_prefetch_i32gather_ps
+  // CHECK: @llvm.x86.avx512.gatherpf.dps
+  return _mm512_mask_prefetch_i32gather_ps(index, mask, addr, 2, 1); 
+}
+
+void test_mm512_prefetch_i32gather_ps(__m512i index,  void const *addr, int hint) {
+  // CHECK-LABEL: @test_mm512_prefetch_i32gather_ps
+  // CHECK: @llvm.x86.avx512.gatherpf.dps
+  return _mm512_prefetch_i32gather_ps(index, addr, 2, 1); 
+}
+
+void test_mm512_mask_prefetch_i64gather_pd(__m512i index, __mmask8 mask, void const *addr, int hint) {
+  // CHECK-LABEL: @test_mm512_mask_prefetch_i64gather_pd
+  // CHECK: @llvm.x86.avx512.gatherpf.qpd
+  return _mm512_mask_prefetch_i64gather_pd(index, mask, addr, 2, 1); 
+}
+
+void test_mm512_prefetch_i64gather_pd(__m512i index, void const *addr, int hint) {
+  // CHECK-LABEL: @test_mm512_prefetch_i64gather_pd
+  // CHECK: @llvm.x86.avx512.gatherpf.qpd
+  return _mm512_prefetch_i64gather_pd(index, addr, 2, 1); 
+}
+
+void test_mm512_mask_prefetch_i64gather_ps(__m512i index, __mmask8 mask, void const *addr, int hint) {
+  // CHECK-LABEL: @test_mm512_mask_prefetch_i64gather_ps
+  // CHECK: @llvm.x86.avx512.gatherpf.qps
+  return _mm512_mask_prefetch_i64gather_ps(index, mask, addr, 2, 1); 
+}
+
+void test_mm512_prefetch_i64gather_ps(__m512i index, void const *addr, int hint) {
+  // CHECK-LABEL: @test_mm512_prefetch_i64gather_ps
+  // CHECK: @llvm.x86.avx512.gatherpf.qps
+  return _mm512_prefetch_i64gather_ps(index, addr, 2, 1); 
+}
+
+void test_mm512_prefetch_i32scatter_pd(void *addr, __m256i index) {
+  // CHECK-LABEL: @test_mm512_prefetch_i32scatter_pd
+  // CHECK: @llvm.x86.avx512.scatterpf.dpd.512
+  return _mm512_prefetch_i32scatter_pd(addr, index, 1, 2); 
+}
+
+void test_mm512_mask_prefetch_i32scatter_pd(void *addr, __mmask8 mask, __m256i index) {
+  // CHECK-LABEL: @test_mm512_mask_prefetch_i32scatter_pd
+  // CHECK: @llvm.x86.avx512.scatterpf.dpd.512
+  return _mm512_mask_prefetch_i32scatter_pd(addr, mask, index, 1, 2); 
+}
+
+void test_mm512_prefetch_i32scatter_ps(void *addr, __m512i index) {
+  // CHECK-LABEL: @test_mm512_prefetch_i32scatter_ps
+  // CHECK: @llvm.x86.avx512.scatterpf.dps.512
+  return _mm512_prefetch_i32scatter_ps(addr, index, 1, 2); 
+}
+
+void test_mm512_mask_prefetch_i32scatter_ps(void *addr, __mmask16 mask, __m512i index) {
+  // CHECK-LABEL: @test_mm512_mask_prefetch_i32scatter_ps
+  // CHECK: @llvm.x86.avx512.scatterpf.dps.512
+  return _mm512_mask_prefetch_i32scatter_ps(addr, mask, index, 1, 2); 
+}
+
+void test_mm512_prefetch_i64scatter_pd(void *addr, __m512i index) {
+  // CHECK-LABEL: @test_mm512_prefetch_i64scatter_pd
+  // CHECK: @llvm.x86.avx512.scatterpf.qpd.512
+  return _mm512_prefetch_i64scatter_pd(addr, index, 1, 2); 
+}
+
+void test_mm512_mask_prefetch_i64scatter_pd(void *addr, __mmask16 mask, __m512i index) {
+  // CHECK-LABEL: @test_mm512_mask_prefetch_i64scatter_pd
+  // CHECK: @llvm.x86.avx512.scatterpf.qpd.512
+  return _mm512_mask_prefetch_i64scatter_pd(addr, mask, index, 1, 2); 
+}
+
+void test_mm512_prefetch_i64scatter_ps(void *addr, __m512i index) {
+  // CHECK-LABEL: @test_mm512_prefetch_i64scatter_ps
+  // CHECK: @llvm.x86.avx512.scatterpf.qps.512
+  return _mm512_prefetch_i64scatter_ps(addr, index, 1, 2); 
+}
+
+void test_mm512_mask_prefetch_i64scatter_ps(void *addr, __mmask16 mask, __m512i index) {
+  // CHECK-LABEL: @test_mm512_mask_prefetch_i64scatter_ps
+  // CHECK: @llvm.x86.avx512.scatterpf.qps.512
+  return _mm512_mask_prefetch_i64scatter_ps(addr, mask, index, 1, 2); 
+}
diff --git a/test/CodeGen/avx512vbmi-builtins.c b/test/CodeGen/avx512vbmi-builtins.c
new file mode 100644
index 0000000000000..3fe97fd536b9d
--- /dev/null
+++ b/test/CodeGen/avx512vbmi-builtins.c
@@ -0,0 +1,66 @@
+// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx512vbmi -emit-llvm -o - -Werror | FileCheck %s
+
+// Don't include mm_malloc.h, it's system specific.
+#define __MM_MALLOC_H
+
+#include <immintrin.h>
+
+__m512i test_mm512_mask2_permutex2var_epi8(__m512i __A, __m512i __I, __mmask64 __U, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_mask2_permutex2var_epi8
+  // CHECK: @llvm.x86.avx512.mask.vpermi2var.qi.512
+  return _mm512_mask2_permutex2var_epi8(__A, __I, __U, __B); 
+}
+
+__m512i test_mm512_permutex2var_epi8(__m512i __A, __m512i __I, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_permutex2var_epi8
+  // CHECK: @llvm.x86.avx512.mask.vpermt2var.qi.512
+  return _mm512_permutex2var_epi8(__A, __I, __B); 
+}
+
+__m512i test_mm512_mask_permutex2var_epi8(__m512i __A, __mmask64 __U, __m512i __I, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_mask_permutex2var_epi8
+  // CHECK: @llvm.x86.avx512.mask.vpermt2var.qi.512
+  return _mm512_mask_permutex2var_epi8(__A, __U, __I, __B); 
+}
+
+__m512i test_mm512_maskz_permutex2var_epi8(__mmask64 __U, __m512i __A, __m512i __I, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_permutex2var_epi8
+  // CHECK: @llvm.x86.avx512.maskz.vpermt2var.qi.512
+  return _mm512_maskz_permutex2var_epi8(__U, __A, __I, __B); 
+}
+
+__m512i test_mm512_permutexvar_epi8(__m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_permutexvar_epi8
+  // CHECK: @llvm.x86.avx512.mask.permvar.qi.512
+  return _mm512_permutexvar_epi8(__A, __B); 
+}
+
+__m512i test_mm512_maskz_permutexvar_epi8(__mmask64 __M, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_maskz_permutexvar_epi8
+  // CHECK: @llvm.x86.avx512.mask.permvar.qi.512
+  return _mm512_maskz_permutexvar_epi8(__M, __A, __B); 
+}
+
+__m512i test_mm512_mask_permutexvar_epi8(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) {
+  // CHECK-LABEL: @test_mm512_mask_permutexvar_epi8
+  // CHECK: @llvm.x86.avx512.mask.permvar.qi.512
+  return _mm512_mask_permutexvar_epi8(__W, __M, __A, __B); 
+}
+
+__m512i test_mm512_mask_multishift_epi64_epi8(__m512i __W, __mmask64 __M, __m512i __X, __m512i __Y) {
+  // CHECK-LABEL: @test_mm512_mask_multishift_epi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmultishift.qb.512
+  return _mm512_mask_multishift_epi64_epi8(__W, __M, __X, __Y); 
+}
+
+__m512i test_mm512_maskz_multishift_epi64_epi8(__mmask64 __M, __m512i __X, __m512i __Y) {
+  // CHECK-LABEL: @test_mm512_maskz_multishift_epi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmultishift.qb.512
+  return _mm512_maskz_multishift_epi64_epi8(__M, __X, __Y); 
+}
+
+__m512i test_mm512_multishift_epi64_epi8(__m512i __X, __m512i __Y) {
+  // CHECK-LABEL: @test_mm512_multishift_epi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmultishift.qb.512
+  return _mm512_multishift_epi64_epi8(__X, __Y); 
+}
diff --git a/test/CodeGen/avx512vbmivl-builtin.c b/test/CodeGen/avx512vbmivl-builtin.c
new file mode 100644
index 0000000000000..549c9656c0de4
--- /dev/null
+++ b/test/CodeGen/avx512vbmivl-builtin.c
@@ -0,0 +1,127 @@
+// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx512vbmi -target-feature +avx512vl -target-feature +avx512bw -emit-llvm -o - -Werror | FileCheck %s
+
+// Don't include mm_malloc.h, it's system specific.
+#define __MM_MALLOC_H
+
+#include <immintrin.h>
+
+__m128i test_mm_permutexvar_epi8(__m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_permutexvar_epi8
+  // CHECK: @llvm.x86.avx512.mask.permvar.qi.128
+  return _mm_permutexvar_epi8(__A, __B); 
+}
+
+__m128i test_mm_maskz_permutexvar_epi8(__mmask16 __M, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_maskz_permutexvar_epi8
+  // CHECK: @llvm.x86.avx512.mask.permvar.qi.128
+  return _mm_maskz_permutexvar_epi8(__M, __A, __B); 
+}
+
+__m128i test_mm_mask_permutexvar_epi8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask_permutexvar_epi8
+  // CHECK: @llvm.x86.avx512.mask.permvar.qi.128
+  return _mm_mask_permutexvar_epi8(__W, __M, __A, __B); 
+}
+
+__m256i test_mm256_permutexvar_epi8(__m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_permutexvar_epi8
+  // CHECK: @llvm.x86.avx512.mask.permvar.qi.256
+  return _mm256_permutexvar_epi8(__A, __B); 
+}
+
+__m256i test_mm256_maskz_permutexvar_epi8(__mmask32 __M, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_maskz_permutexvar_epi8
+  // CHECK: @llvm.x86.avx512.mask.permvar.qi.256
+  return _mm256_maskz_permutexvar_epi8(__M, __A, __B); 
+}
+
+__m256i test_mm256_mask_permutexvar_epi8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_mask_permutexvar_epi8
+  // CHECK: @llvm.x86.avx512.mask.permvar.qi.256
+  return _mm256_mask_permutexvar_epi8(__W, __M, __A, __B); 
+}
+
+__m128i test_mm_mask2_permutex2var_epi8(__m128i __A, __m128i __I, __mmask16 __U, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask2_permutex2var_epi8
+  // CHECK: @llvm.x86.avx512.mask.vpermi2var.qi.128
+  return _mm_mask2_permutex2var_epi8(__A, __I, __U, __B); 
+}
+
+__m256i test_mm256_mask2_permutex2var_epi8(__m256i __A, __m256i __I, __mmask32 __U, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_mask2_permutex2var_epi8
+  // CHECK: @llvm.x86.avx512.mask.vpermi2var.qi.256
+  return _mm256_mask2_permutex2var_epi8(__A, __I, __U, __B); 
+}
+
+__m128i test_mm_permutex2var_epi8(__m128i __A, __m128i __I, __m128i __B) {
+  // CHECK-LABEL: @test_mm_permutex2var_epi8
+  // CHECK: @llvm.x86.avx512.mask.vpermt2var.qi.128
+  return _mm_permutex2var_epi8(__A, __I, __B); 
+}
+
+__m128i test_mm_mask_permutex2var_epi8(__m128i __A, __mmask16 __U, __m128i __I, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask_permutex2var_epi8
+  // CHECK: @llvm.x86.avx512.mask.vpermt2var.qi.128
+  return _mm_mask_permutex2var_epi8(__A, __U, __I, __B); 
+}
+
+__m128i test_mm_maskz_permutex2var_epi8(__mmask16 __U, __m128i __A, __m128i __I, __m128i __B) {
+  // CHECK-LABEL: @test_mm_maskz_permutex2var_epi8
+  // CHECK: @llvm.x86.avx512.maskz.vpermt2var.qi.128
+  return _mm_maskz_permutex2var_epi8(__U, __A, __I, __B); 
+}
+
+__m256i test_mm256_permutex2var_epi8(__m256i __A, __m256i __I, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_permutex2var_epi8
+  // CHECK: @llvm.x86.avx512.mask.vpermt2var.qi.256
+  return _mm256_permutex2var_epi8(__A, __I, __B); 
+}
+
+__m256i test_mm256_mask_permutex2var_epi8(__m256i __A, __mmask32 __U, __m256i __I, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_mask_permutex2var_epi8
+  // CHECK: @llvm.x86.avx512.mask.vpermt2var.qi.256
+  return _mm256_mask_permutex2var_epi8(__A, __U, __I, __B); 
+}
+
+__m256i test_mm256_maskz_permutex2var_epi8(__mmask32 __U, __m256i __A, __m256i __I, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_maskz_permutex2var_epi8
+  // CHECK: @llvm.x86.avx512.maskz.vpermt2var.qi.256
+  return _mm256_maskz_permutex2var_epi8(__U, __A, __I, __B); 
+}
+
+__m128i test_mm_mask_multishift_epi64_epi8(__m128i __W, __mmask16 __M, __m128i __X, __m128i __Y) {
+  // CHECK-LABEL: @test_mm_mask_multishift_epi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmultishift.qb.128
+  return _mm_mask_multishift_epi64_epi8(__W, __M, __X, __Y); 
+}
+
+__m128i test_mm_maskz_multishift_epi64_epi8(__mmask16 __M, __m128i __X, __m128i __Y) {
+  // CHECK-LABEL: @test_mm_maskz_multishift_epi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmultishift.qb.128
+  return _mm_maskz_multishift_epi64_epi8(__M, __X, __Y); 
+}
+
+__m128i test_mm_multishift_epi64_epi8(__m128i __X, __m128i __Y) {
+  // CHECK-LABEL: @test_mm_multishift_epi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmultishift.qb.128
+  return _mm_multishift_epi64_epi8(__X, __Y); 
+}
+
+__m256i test_mm256_mask_multishift_epi64_epi8(__m256i __W, __mmask32 __M, __m256i __X, __m256i __Y) {
+  // CHECK-LABEL: @test_mm256_mask_multishift_epi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmultishift.qb.256
+  return _mm256_mask_multishift_epi64_epi8(__W, __M, __X, __Y); 
+}
+
+__m256i test_mm256_maskz_multishift_epi64_epi8(__mmask32 __M, __m256i __X, __m256i __Y) {
+  // CHECK-LABEL: @test_mm256_maskz_multishift_epi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmultishift.qb.256
+  return _mm256_maskz_multishift_epi64_epi8(__M, __X, __Y); 
+}
+
+__m256i test_mm256_multishift_epi64_epi8(__m256i __X, __m256i __Y) {
+  // CHECK-LABEL: @test_mm256_multishift_epi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmultishift.qb.256
+  return _mm256_multishift_epi64_epi8(__X, __Y); 
+}
+
diff --git a/test/CodeGen/avx512vl-builtins.c b/test/CodeGen/avx512vl-builtins.c
index 445513ccfccb3..2b6ec3733830e 100644
--- a/test/CodeGen/avx512vl-builtins.c
+++ b/test/CodeGen/avx512vl-builtins.c
@@ -7,590 +7,602 @@
 
 __mmask8 test_mm_cmpeq_epu32_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmpeq_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> {{.*}}, <4 x i32> {{.*}}, i32 0, i8 -1)
+  // CHECK: icmp eq <4 x i32> %{{.*}}, %{{.*}}
+  // CHECK: shufflevector <4 x i1> %{{.*}}, <4 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   return (__mmask8)_mm_cmpeq_epu32_mask(__a, __b);
 }
 
 __mmask8 test_mm_mask_cmpeq_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmpeq_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> {{.*}}, <4 x i32> {{.*}}, i32 0, i8 {{.*}})
+  // CHECK: icmp eq <4 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <4 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_mask_cmpeq_epu32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmpeq_epu64_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmpeq_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> {{.*}}, <2 x i64> {{.*}}, i32 0, i8 -1)
+  // CHECK: icmp eq <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: shufflevector <2 x i1> %{{.*}}, <2 x i1> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 2, i32 3, i32 2, i32 3>
   return (__mmask8)_mm_cmpeq_epu64_mask(__a, __b);
 }
 
 __mmask8 test_mm_mask_cmpeq_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmpeq_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> {{.*}}, <2 x i64> {{.*}}, i32 0, i8 {{.*}})
+  // CHECK: icmp eq <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <2 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_mask_cmpeq_epu64_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmpge_epi32_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmpge_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> {{.*}}, <4 x i32> {{.*}}, i32 5, i8 -1)
+  // CHECK: icmp sge <4 x i32> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_cmpge_epi32_mask(__a, __b);
 }
 
 __mmask8 test_mm_mask_cmpge_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmpge_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> {{.*}}, <4 x i32> {{.*}}, i32 5, i8 {{.*}})
+  // CHECK: icmp sge <4 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <4 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_mask_cmpge_epi32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmpge_epi64_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmpge_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> {{.*}}, <2 x i64> {{.*}}, i32 5, i8 -1)
+  // CHECK: icmp sge <2 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_cmpge_epi64_mask(__a, __b);
 }
 
 __mmask8 test_mm_mask_cmpge_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmpge_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> {{.*}}, <2 x i64> {{.*}}, i32 5, i8 {{.*}})
+  // CHECK: icmp sge <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <2 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_mask_cmpge_epi64_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm256_cmpge_epi32_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmpge_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> {{.*}}, <8 x i32> {{.*}}, i32 5, i8 -1)
+  // CHECK: icmp sge <8 x i32> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_cmpge_epi32_mask(__a, __b);
 }
 
 __mmask8 test_mm256_mask_cmpge_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmpge_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> {{.*}}, <8 x i32> {{.*}}, i32 5, i8 {{.*}})
+  // CHECK: icmp sge <8 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_mask_cmpge_epi32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm256_cmpge_epi64_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmpge_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> {{.*}}, <4 x i64> {{.*}}, i32 5, i8 -1)
+  // CHECK: icmp sge <4 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_cmpge_epi64_mask(__a, __b);
 }
 
 __mmask8 test_mm256_mask_cmpge_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmpge_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> {{.*}}, <4 x i64> {{.*}}, i32 5, i8 {{.*}})
+  // CHECK: icmp sge <4 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <4 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_mask_cmpge_epi64_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmpge_epu32_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmpge_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> {{.*}}, <4 x i32> {{.*}}, i32 5, i8 -1)
+  // CHECK: icmp uge <4 x i32> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_cmpge_epu32_mask(__a, __b);
 }
 
 __mmask8 test_mm_mask_cmpge_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmpge_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> {{.*}}, <4 x i32> {{.*}}, i32 5, i8 {{.*}})
+  // CHECK: icmp uge <4 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <4 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_mask_cmpge_epu32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmpge_epu64_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmpge_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> {{.*}}, <2 x i64> {{.*}}, i32 5, i8 -1)
+  // CHECK: icmp uge <2 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_cmpge_epu64_mask(__a, __b);
 }
 
 __mmask8 test_mm_mask_cmpge_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmpge_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> {{.*}}, <2 x i64> {{.*}}, i32 5, i8 {{.*}})
+  // CHECK: icmp uge <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <2 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_mask_cmpge_epu64_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm256_cmpge_epu32_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmpge_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> {{.*}}, <8 x i32> {{.*}}, i32 5, i8 -1)
+  // CHECK: icmp uge <8 x i32> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_cmpge_epu32_mask(__a, __b);
 }
 
 __mmask8 test_mm256_mask_cmpge_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmpge_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> {{.*}}, <8 x i32> {{.*}}, i32 5, i8 {{.*}})
+  // CHECK: icmp uge <8 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_mask_cmpge_epu32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm256_cmpge_epu64_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmpge_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> {{.*}}, <4 x i64> {{.*}}, i32 5, i8 -1)
+  // CHECK: icmp uge <4 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_cmpge_epu64_mask(__a, __b);
 }
 
 __mmask8 test_mm256_mask_cmpge_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmpge_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> {{.*}}, <4 x i64> {{.*}}, i32 5, i8 {{.*}})
+  // CHECK: icmp uge <4 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <4 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_mask_cmpge_epu64_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmpgt_epu32_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmpgt_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> {{.*}}, <4 x i32> {{.*}}, i32 6, i8 -1)
+  // CHECK: icmp ugt <4 x i32> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_cmpgt_epu32_mask(__a, __b);
 }
 
 __mmask8 test_mm_mask_cmpgt_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmpgt_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> {{.*}}, <4 x i32> {{.*}}, i32 6, i8 {{.*}})
+  // CHECK: icmp ugt <4 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <4 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_mask_cmpgt_epu32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmpgt_epu64_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmpgt_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> {{.*}}, <2 x i64> {{.*}}, i32 6, i8 -1)
+  // CHECK: icmp ugt <2 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_cmpgt_epu64_mask(__a, __b);
 }
 
 __mmask8 test_mm_mask_cmpgt_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmpgt_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> {{.*}}, <2 x i64> {{.*}}, i32 6, i8 {{.*}})
+  // CHECK: icmp ugt <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <2 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_mask_cmpgt_epu64_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm256_cmpgt_epu32_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmpgt_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> {{.*}}, <8 x i32> {{.*}}, i32 6, i8 -1)
+  // CHECK: icmp ugt <8 x i32> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_cmpgt_epu32_mask(__a, __b);
 }
 
 __mmask8 test_mm256_mask_cmpgt_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmpgt_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> {{.*}}, <8 x i32> {{.*}}, i32 6, i8 {{.*}})
+  // CHECK: icmp ugt <8 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_mask_cmpgt_epu32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm256_cmpgt_epu64_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmpgt_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> {{.*}}, <4 x i64> {{.*}}, i32 6, i8 -1)
+  // CHECK: icmp ugt <4 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_cmpgt_epu64_mask(__a, __b);
 }
 
 __mmask8 test_mm256_mask_cmpgt_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmpgt_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> {{.*}}, <4 x i64> {{.*}}, i32 6, i8 {{.*}})
+  // CHECK: icmp ugt <4 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <4 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_mask_cmpgt_epu64_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmple_epi32_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmple_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> {{.*}}, <4 x i32> {{.*}}, i32 2, i8 -1)
+  // CHECK: icmp sle <4 x i32> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_cmple_epi32_mask(__a, __b);
 }
 
 __mmask8 test_mm_mask_cmple_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmple_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> {{.*}}, <4 x i32> {{.*}}, i32 2, i8 {{.*}})
+  // CHECK: icmp sle <4 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <4 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_mask_cmple_epi32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmple_epi64_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmple_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> {{.*}}, <2 x i64> {{.*}}, i32 2, i8 -1)
+  // CHECK: icmp sle <2 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_cmple_epi64_mask(__a, __b);
 }
 
 __mmask8 test_mm_mask_cmple_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmple_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> {{.*}}, <2 x i64> {{.*}}, i32 2, i8 {{.*}})
+  // CHECK: icmp sle <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <2 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_mask_cmple_epi64_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm256_cmple_epi32_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmple_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> {{.*}}, <8 x i32> {{.*}}, i32 2, i8 -1)
+  // CHECK: icmp sle <8 x i32> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_cmple_epi32_mask(__a, __b);
 }
 
 __mmask8 test_mm256_mask_cmple_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmple_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> {{.*}}, <8 x i32> {{.*}}, i32 2, i8 {{.*}})
+  // CHECK: icmp sle <8 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_mask_cmple_epi32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm256_cmple_epi64_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmple_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> {{.*}}, <4 x i64> {{.*}}, i32 2, i8 -1)
+  // CHECK: icmp sle <4 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_cmple_epi64_mask(__a, __b);
 }
 
 __mmask8 test_mm256_mask_cmple_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmple_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> {{.*}}, <4 x i64> {{.*}}, i32 2, i8 {{.*}})
+  // CHECK: icmp sle <4 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <4 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_mask_cmple_epi64_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmple_epu32_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmple_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> {{.*}}, <4 x i32> {{.*}}, i32 2, i8 -1)
+  // CHECK: icmp ule <4 x i32> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_cmple_epu32_mask(__a, __b);
 }
 
 __mmask8 test_mm_mask_cmple_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmple_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> {{.*}}, <4 x i32> {{.*}}, i32 2, i8 {{.*}})
+  // CHECK: icmp ule <4 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <4 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_mask_cmple_epu32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmple_epu64_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmple_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> {{.*}}, <2 x i64> {{.*}}, i32 2, i8 -1)
+  // CHECK: icmp ule <2 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_cmple_epu64_mask(__a, __b);
 }
 
 __mmask8 test_mm_mask_cmple_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmple_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> {{.*}}, <2 x i64> {{.*}}, i32 2, i8 {{.*}})
+  // CHECK: icmp ule <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <2 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_mask_cmple_epu64_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm256_cmple_epu32_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmple_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> {{.*}}, <8 x i32> {{.*}}, i32 2, i8 -1)
+  // CHECK: icmp ule <8 x i32> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_cmple_epu32_mask(__a, __b);
 }
 
 __mmask8 test_mm256_mask_cmple_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmple_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> {{.*}}, <8 x i32> {{.*}}, i32 2, i8 {{.*}})
+  // CHECK: icmp ule <8 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_mask_cmple_epu32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm256_cmple_epu64_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmple_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> {{.*}}, <4 x i64> {{.*}}, i32 2, i8 -1)
+  // CHECK: icmp ule <4 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_cmple_epu64_mask(__a, __b);
 }
 
 __mmask8 test_mm256_mask_cmple_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmple_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> {{.*}}, <4 x i64> {{.*}}, i32 2, i8 {{.*}})
+  // CHECK: icmp ule <4 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <4 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_mask_cmple_epu64_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmplt_epi32_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmplt_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> {{.*}}, <4 x i32> {{.*}}, i32 1, i8 -1)
+  // CHECK: icmp slt <4 x i32> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_cmplt_epi32_mask(__a, __b);
 }
 
 __mmask8 test_mm_mask_cmplt_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmplt_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> {{.*}}, <4 x i32> {{.*}}, i32 1, i8 {{.*}})
+  // CHECK: icmp slt <4 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <4 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_mask_cmplt_epi32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmplt_epi64_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmplt_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> {{.*}}, <2 x i64> {{.*}}, i32 1, i8 -1)
+  // CHECK: icmp slt <2 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_cmplt_epi64_mask(__a, __b);
 }
 
 __mmask8 test_mm_mask_cmplt_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmplt_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> {{.*}}, <2 x i64> {{.*}}, i32 1, i8 {{.*}})
+  // CHECK: icmp slt <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <2 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_mask_cmplt_epi64_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm256_cmplt_epi32_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmplt_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> {{.*}}, <8 x i32> {{.*}}, i32 1, i8 -1)
+  // CHECK: icmp slt <8 x i32> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_cmplt_epi32_mask(__a, __b);
 }
 
 __mmask8 test_mm256_mask_cmplt_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmplt_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> {{.*}}, <8 x i32> {{.*}}, i32 1, i8 {{.*}})
+  // CHECK: icmp slt <8 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_mask_cmplt_epi32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm256_cmplt_epi64_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmplt_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> {{.*}}, <4 x i64> {{.*}}, i32 1, i8 -1)
+  // CHECK: icmp slt <4 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_cmplt_epi64_mask(__a, __b);
 }
 
 __mmask8 test_mm256_mask_cmplt_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmplt_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> {{.*}}, <4 x i64> {{.*}}, i32 1, i8 {{.*}})
+  // CHECK: icmp slt <4 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <4 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_mask_cmplt_epi64_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmplt_epu32_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmplt_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> {{.*}}, <4 x i32> {{.*}}, i32 1, i8 -1)
+  // CHECK: icmp ult <4 x i32> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_cmplt_epu32_mask(__a, __b);
 }
 
 __mmask8 test_mm_mask_cmplt_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmplt_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> {{.*}}, <4 x i32> {{.*}}, i32 1, i8 {{.*}})
+  // CHECK: icmp ult <4 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <4 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_mask_cmplt_epu32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmplt_epu64_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmplt_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> {{.*}}, <2 x i64> {{.*}}, i32 1, i8 -1)
+  // CHECK: icmp ult <2 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_cmplt_epu64_mask(__a, __b);
 }
 
 __mmask8 test_mm_mask_cmplt_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmplt_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> {{.*}}, <2 x i64> {{.*}}, i32 1, i8 {{.*}})
+  // CHECK: icmp ult <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <2 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_mask_cmplt_epu64_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm256_cmplt_epu32_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmplt_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> {{.*}}, <8 x i32> {{.*}}, i32 1, i8 -1)
+  // CHECK: icmp ult <8 x i32> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_cmplt_epu32_mask(__a, __b);
 }
 
 __mmask8 test_mm256_mask_cmplt_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmplt_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> {{.*}}, <8 x i32> {{.*}}, i32 1, i8 {{.*}})
+  // CHECK: icmp ult <8 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_mask_cmplt_epu32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm256_cmplt_epu64_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmplt_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> {{.*}}, <4 x i64> {{.*}}, i32 1, i8 -1)
+  // CHECK: icmp ult <4 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_cmplt_epu64_mask(__a, __b);
 }
 
 __mmask8 test_mm256_mask_cmplt_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmplt_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> {{.*}}, <4 x i64> {{.*}}, i32 1, i8 {{.*}})
+  // CHECK: icmp ult <4 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <4 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_mask_cmplt_epu64_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmpneq_epi32_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmpneq_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> {{.*}}, <4 x i32> {{.*}}, i32 4, i8 -1)
+  // CHECK: icmp ne <4 x i32> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_cmpneq_epi32_mask(__a, __b);
 }
 
 __mmask8 test_mm_mask_cmpneq_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmpneq_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> {{.*}}, <4 x i32> {{.*}}, i32 4, i8 {{.*}})
+  // CHECK: icmp ne <4 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <4 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_mask_cmpneq_epi32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmpneq_epi64_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmpneq_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> {{.*}}, <2 x i64> {{.*}}, i32 4, i8 -1)
+  // CHECK: icmp ne <2 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_cmpneq_epi64_mask(__a, __b);
 }
 
 __mmask8 test_mm_mask_cmpneq_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmpneq_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> {{.*}}, <2 x i64> {{.*}}, i32 4, i8 {{.*}})
+  // CHECK: icmp ne <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <2 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_mask_cmpneq_epi64_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm256_cmpneq_epi32_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmpneq_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> {{.*}}, <8 x i32> {{.*}}, i32 4, i8 -1)
+  // CHECK: icmp ne <8 x i32> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_cmpneq_epi32_mask(__a, __b);
 }
 
 __mmask8 test_mm256_mask_cmpneq_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmpneq_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> {{.*}}, <8 x i32> {{.*}}, i32 4, i8 {{.*}})
+  // CHECK: icmp ne <8 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_mask_cmpneq_epi32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm256_cmpneq_epi64_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmpneq_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> {{.*}}, <4 x i64> {{.*}}, i32 4, i8 -1)
+  // CHECK: icmp ne <4 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_cmpneq_epi64_mask(__a, __b);
 }
 
 __mmask8 test_mm256_mask_cmpneq_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmpneq_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> {{.*}}, <4 x i64> {{.*}}, i32 4, i8 {{.*}})
+  // CHECK: icmp ne <4 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <4 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_mask_cmpneq_epi64_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmpneq_epu32_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmpneq_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> {{.*}}, <4 x i32> {{.*}}, i32 4, i8 -1)
+  // CHECK: icmp ne <4 x i32> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_cmpneq_epu32_mask(__a, __b);
 }
 
 __mmask8 test_mm_mask_cmpneq_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmpneq_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> {{.*}}, <4 x i32> {{.*}}, i32 4, i8 {{.*}})
+  // CHECK: icmp ne <4 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <4 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_mask_cmpneq_epu32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmpneq_epu64_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmpneq_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> {{.*}}, <2 x i64> {{.*}}, i32 4, i8 -1)
+  // CHECK: icmp ne <2 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_cmpneq_epu64_mask(__a, __b);
 }
 
 __mmask8 test_mm_mask_cmpneq_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmpneq_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> {{.*}}, <2 x i64> {{.*}}, i32 4, i8 {{.*}})
+  // CHECK: icmp ne <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <2 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_mask_cmpneq_epu64_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm256_cmpneq_epu32_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmpneq_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> {{.*}}, <8 x i32> {{.*}}, i32 4, i8 -1)
+  // CHECK: icmp ne <8 x i32> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_cmpneq_epu32_mask(__a, __b);
 }
 
 __mmask8 test_mm256_mask_cmpneq_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmpneq_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> {{.*}}, <8 x i32> {{.*}}, i32 4, i8 {{.*}})
+  // CHECK: icmp ne <8 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_mask_cmpneq_epu32_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm256_cmpneq_epu64_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmpneq_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> {{.*}}, <4 x i64> {{.*}}, i32 4, i8 -1)
+  // CHECK: icmp ne <4 x i64> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_cmpneq_epu64_mask(__a, __b);
 }
 
 __mmask8 test_mm256_mask_cmpneq_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmpneq_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> {{.*}}, <4 x i64> {{.*}}, i32 4, i8 {{.*}})
+  // CHECK: icmp ne <4 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <4 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm256_mask_cmpneq_epu64_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmp_epi32_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmp_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> {{.*}}, <4 x i32> {{.*}}, i32 7, i8 -1)
-  return (__mmask8)_mm_cmp_epi32_mask(__a, __b, 7);
+  // CHECK: icmp eq <4 x i32> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_cmp_epi32_mask(__a, __b, 0);
 }
 
 __mmask8 test_mm_mask_cmp_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmp_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> {{.*}}, <4 x i32> {{.*}}, i32 7, i8 {{.*}})
-  return (__mmask8)_mm_mask_cmp_epi32_mask(__u, __a, __b, 7);
+  // CHECK: icmp eq <4 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <4 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_mask_cmp_epi32_mask(__u, __a, __b, 0);
 }
 
 __mmask8 test_mm_cmp_epi64_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmp_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> {{.*}}, <2 x i64> {{.*}}, i32 7, i8 -1)
-  return (__mmask8)_mm_cmp_epi64_mask(__a, __b, 7);
+  // CHECK: icmp eq <2 x i64> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_cmp_epi64_mask(__a, __b, 0);
 }
 
 __mmask8 test_mm_mask_cmp_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmp_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> {{.*}}, <2 x i64> {{.*}}, i32 7, i8 {{.*}})
-  return (__mmask8)_mm_mask_cmp_epi64_mask(__u, __a, __b, 7);
+  // CHECK: icmp eq <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <2 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_mask_cmp_epi64_mask(__u, __a, __b, 0);
 }
 
 __mmask8 test_mm256_cmp_epi32_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmp_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> {{.*}}, <8 x i32> {{.*}}, i32 7, i8 -1)
-  return (__mmask8)_mm256_cmp_epi32_mask(__a, __b, 7);
+  // CHECK: icmp eq <8 x i32> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm256_cmp_epi32_mask(__a, __b, 0);
 }
 
 __mmask8 test_mm256_mask_cmp_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmp_epi32_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> {{.*}}, <8 x i32> {{.*}}, i32 7, i8 {{.*}})
-  return (__mmask8)_mm256_mask_cmp_epi32_mask(__u, __a, __b, 7);
+  // CHECK: icmp eq <8 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm256_mask_cmp_epi32_mask(__u, __a, __b, 0);
 }
 
 __mmask8 test_mm256_cmp_epi64_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmp_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> {{.*}}, <4 x i64> {{.*}}, i32 7, i8 -1)
-  return (__mmask8)_mm256_cmp_epi64_mask(__a, __b, 7);
+  // CHECK: icmp eq <4 x i64> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm256_cmp_epi64_mask(__a, __b, 0);
 }
 
 __mmask8 test_mm256_mask_cmp_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmp_epi64_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> {{.*}}, <4 x i64> {{.*}}, i32 7, i8 {{.*}})
-  return (__mmask8)_mm256_mask_cmp_epi64_mask(__u, __a, __b, 7);
+  // CHECK: icmp eq <4 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <4 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm256_mask_cmp_epi64_mask(__u, __a, __b, 0);
 }
 
 __mmask8 test_mm_cmp_epu32_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmp_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> {{.*}}, <4 x i32> {{.*}}, i32 7, i8 -1)
-  return (__mmask8)_mm_cmp_epu32_mask(__a, __b, 7);
+  // CHECK: icmp eq <4 x i32> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_cmp_epu32_mask(__a, __b, 0);
 }
 
 __mmask8 test_mm_mask_cmp_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmp_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> {{.*}}, <4 x i32> {{.*}}, i32 7, i8 {{.*}})
-  return (__mmask8)_mm_mask_cmp_epu32_mask(__u, __a, __b, 7);
+  // CHECK: icmp eq <4 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <4 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_mask_cmp_epu32_mask(__u, __a, __b, 0);
 }
 
 __mmask8 test_mm_cmp_epu64_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmp_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> {{.*}}, <2 x i64> {{.*}}, i32 7, i8 -1)
-  return (__mmask8)_mm_cmp_epu64_mask(__a, __b, 7);
+  // CHECK: icmp eq <2 x i64> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_cmp_epu64_mask(__a, __b, 0);
 }
 
 __mmask8 test_mm_mask_cmp_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmp_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> {{.*}}, <2 x i64> {{.*}}, i32 7, i8 {{.*}})
-  return (__mmask8)_mm_mask_cmp_epu64_mask(__u, __a, __b, 7);
+  // CHECK: icmp eq <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <2 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_mask_cmp_epu64_mask(__u, __a, __b, 0);
 }
 
 __mmask8 test_mm256_cmp_epu32_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmp_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> {{.*}}, <8 x i32> {{.*}}, i32 7, i8 -1)
-  return (__mmask8)_mm256_cmp_epu32_mask(__a, __b, 7);
+  // CHECK: icmp eq <8 x i32> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm256_cmp_epu32_mask(__a, __b, 0);
 }
 
 __mmask8 test_mm256_mask_cmp_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmp_epu32_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> {{.*}}, <8 x i32> {{.*}}, i32 7, i8 {{.*}})
-  return (__mmask8)_mm256_mask_cmp_epu32_mask(__u, __a, __b, 7);
+  // CHECK: icmp eq <8 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm256_mask_cmp_epu32_mask(__u, __a, __b, 0);
 }
 
 __mmask8 test_mm256_cmp_epu64_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmp_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> {{.*}}, <4 x i64> {{.*}}, i32 7, i8 -1)
-  return (__mmask8)_mm256_cmp_epu64_mask(__a, __b, 7);
+  // CHECK: icmp eq <4 x i64> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm256_cmp_epu64_mask(__a, __b, 0);
 }
 
 __mmask8 test_mm256_mask_cmp_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmp_epu64_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> {{.*}}, <4 x i64> {{.*}}, i32 7, i8 {{.*}})
-  return (__mmask8)_mm256_mask_cmp_epu64_mask(__u, __a, __b, 7);
-}
-
-__m512i test_mm512_maskz_andnot_epi32 (__mmask16 __k,__m512i __A, __m512i __B) {
-  //CHECK-LABEL: @test_mm512_maskz_andnot_epi32
-  //CHECK: @llvm.x86.avx512.mask.pandn.d.512
-  return _mm512_maskz_andnot_epi32(__k,__A,__B);
-}
-
-__m512i test_mm512_mask_andnot_epi32 (__mmask16 __k,__m512i __A, __m512i __B, __m512i __src) {
-  //CHECK-LABEL: @test_mm512_mask_andnot_epi32
-  //CHECK: @llvm.x86.avx512.mask.pandn.d.512
-  return _mm512_mask_andnot_epi32(__src,__k,__A,__B);
-}
-
-__m512i test_mm512_andnot_epi32(__m512i __A, __m512i __B) {
-  //CHECK-LABEL: @test_mm512_andnot_epi32
-  //CHECK: @llvm.x86.avx512.mask.pandn.d.512
-  return _mm512_andnot_epi32(__A,__B);
-}
-
-__m512i test_mm512_maskz_andnot_epi64 (__mmask8 __k,__m512i __A, __m512i __B) {
-  //CHECK-LABEL: @test_mm512_maskz_andnot_epi64
-  //CHECK: @llvm.x86.avx512.mask.pandn.q.512
-  return _mm512_maskz_andnot_epi64(__k,__A,__B);
-}
-
-__m512i test_mm512_mask_andnot_epi64 (__mmask8 __k,__m512i __A, __m512i __B, __m512i __src) {
-  //CHECK-LABEL: @test_mm512_mask_andnot_epi64
-  //CHECK: @llvm.x86.avx512.mask.pandn.q.512
-  return _mm512_mask_andnot_epi64(__src,__k,__A,__B);
-}
-
-__m512i test_mm512_andnot_epi64(__m512i __A, __m512i __B) {
-  //CHECK-LABEL: @test_mm512_andnot_epi64
-  //CHECK: @llvm.x86.avx512.mask.pandn.q.512
-  return _mm512_andnot_epi64(__A,__B);
+  // CHECK: icmp eq <4 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <4 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm256_mask_cmp_epu64_mask(__u, __a, __b, 0);
 }
 
 __m256i test_mm256_mask_add_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
@@ -780,204 +792,232 @@ __m256i test_mm256_mask_mullo_epi32 (__m256i __W, __mmask8 __M, __m256i __A,
 __m256i test_mm256_mask_and_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
            __m256i __B) {
   //CHECK-LABEL: @test_mm256_mask_and_epi32
-  //CHECK: @llvm.x86.avx512.mask.pand.d.256
+  //CHECK: and <4 x i64> %{{.*}}, %{{.*}}
   return _mm256_mask_and_epi32(__W, __U, __A, __B);
 }
 
 __m256i test_mm256_maskz_and_epi32 (__mmask8 __U, __m256i __A, __m256i __B) {
   //CHECK-LABEL: @test_mm256_maskz_and_epi32
-  //CHECK: @llvm.x86.avx512.mask.pand.d.256
+  //CHECK: and <4 x i64> %{{.*}}, %{{.*}}
+  //CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_maskz_and_epi32(__U, __A, __B);
 }
 
 __m128i test_mm_mask_and_epi32 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
   //CHECK-LABEL: @test_mm_mask_and_epi32
-  //CHECK: @llvm.x86.avx512.mask.pand.d.128
+  //CHECK: and <2 x i64> %{{.*}}, %{{.*}}
   return _mm_mask_and_epi32(__W, __U, __A, __B);
 }
 
 __m128i test_mm_maskz_and_epi32 (__mmask8 __U, __m128i __A, __m128i __B) {
   //CHECK-LABEL: @test_mm_maskz_and_epi32
-  //CHECK: @llvm.x86.avx512.mask.pand.d.128
+  //CHECK: and <2 x i64> %{{.*}}, %{{.*}}
+  //CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_maskz_and_epi32(__U, __A, __B);
 }
 
 __m256i test_mm256_mask_andnot_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
         __m256i __B) {
   //CHECK-LABEL: @test_mm256_mask_andnot_epi32
-  //CHECK: @llvm.x86.avx512.mask.pandn.d.256
+  //CHECK: xor <4 x i64> %{{.*}}, <i64 -1, i64 -1, i64 -1, i64 -1>
+  //CHECK: and <4 x i64> %{{.*}}, %{{.*}}
+  //CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_mask_andnot_epi32(__W, __U, __A, __B);
 }
 
 __m256i test_mm256_maskz_andnot_epi32 (__mmask8 __U, __m256i __A, __m256i __B) {
   //CHECK-LABEL: @test_mm256_maskz_andnot_epi32
-  //CHECK: @llvm.x86.avx512.mask.pandn.d.256
+  //CHECK: xor <4 x i64> %{{.*}}, <i64 -1, i64 -1, i64 -1, i64 -1>
+  //CHECK: and <4 x i64> %{{.*}}, %{{.*}}
+  //CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_maskz_andnot_epi32(__U, __A, __B);
 }
 
 __m128i test_mm_mask_andnot_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
            __m128i __B) {
   //CHECK-LABEL: @test_mm_mask_andnot_epi32
-  //CHECK: @llvm.x86.avx512.mask.pandn.d.128
+  //CHECK: xor <2 x i64> %{{.*}}, <i64 -1, i64 -1>
+  //CHECK: and <2 x i64> %{{.*}}, %{{.*}}
+  //CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_mask_andnot_epi32(__W, __U, __A, __B);
 }
 
 __m128i test_mm_maskz_andnot_epi32 (__mmask8 __U, __m128i __A, __m128i __B) {
   //CHECK-LABEL: @test_mm_maskz_andnot_epi32
-  //CHECK: @llvm.x86.avx512.mask.pandn.d.128
+  //CHECK: xor <2 x i64> %{{.*}}, <i64 -1, i64 -1>
+  //CHECK: and <2 x i64> %{{.*}}, %{{.*}}
+  //CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_maskz_andnot_epi32(__U, __A, __B);
 }
 
 __m256i test_mm256_mask_or_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
           __m256i __B) {
   //CHECK-LABEL: @test_mm256_mask_or_epi32
-  //CHECK: @llvm.x86.avx512.mask.por.d.256
+  //CHECK: or <4 x i64> %{{.*}}, %{{.*}}
   return _mm256_mask_or_epi32(__W, __U, __A, __B);
 }
 
  __m256i test_mm256_maskz_or_epi32 (__mmask8 __U, __m256i __A, __m256i __B) {
   //CHECK-LABEL: @test_mm256_maskz_or_epi32
-  //CHECK: @llvm.x86.avx512.mask.por.d.256
+  //CHECK: or <4 x i64> %{{.*}}, %{{.*}}
+  //CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_maskz_or_epi32(__U, __A, __B);
 }
 
  __m128i test_mm_mask_or_epi32 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
   //CHECK-LABEL: @test_mm_mask_or_epi32
-  //CHECK: @llvm.x86.avx512.mask.por.d.128
+  //CHECK: or <2 x i64> %{{.*}}, %{{.*}}
   return _mm_mask_or_epi32(__W, __U, __A, __B);
 }
 
 __m128i test_mm_maskz_or_epi32 (__mmask8 __U, __m128i __A, __m128i __B) {
   //CHECK-LABEL: @test_mm_maskz_or_epi32
-  //CHECK: @llvm.x86.avx512.mask.por.d.128
+  //CHECK: or <2 x i64> %{{.*}}, %{{.*}}
+  //CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_maskz_or_epi32(__U, __A, __B);
 }
 
 __m256i test_mm256_mask_xor_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
            __m256i __B) {
   //CHECK-LABEL: @test_mm256_mask_xor_epi32
-  //CHECK: @llvm.x86.avx512.mask.pxor.d.256
+  //CHECK: xor <4 x i64> %{{.*}}, %{{.*}}
   return _mm256_mask_xor_epi32(__W, __U, __A, __B);
 }
 
 __m256i test_mm256_maskz_xor_epi32 (__mmask8 __U, __m256i __A, __m256i __B) {
   //CHECK-LABEL: @test_mm256_maskz_xor_epi32
-  //CHECK: @llvm.x86.avx512.mask.pxor.d.256
+  //CHECK: xor <4 x i64> %{{.*}}, %{{.*}}
+  //CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_maskz_xor_epi32(__U, __A, __B);
 }
 
 __m128i test_mm_mask_xor_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
         __m128i __B) {
   //CHECK-LABEL: @test_mm_mask_xor_epi32
-  //CHECK: @llvm.x86.avx512.mask.pxor.d.128
+  //CHECK: xor <2 x i64> %{{.*}}, %{{.*}}
   return _mm_mask_xor_epi32(__W, __U, __A, __B);
 }
 
 __m128i test_mm_maskz_xor_epi32 (__mmask8 __U, __m128i __A, __m128i __B) {
   //CHECK-LABEL: @test_mm_maskz_xor_epi32
-  //CHECK: @llvm.x86.avx512.mask.pxor.d.128
+  //CHECK: xor <2 x i64> %{{.*}}, %{{.*}}
+  //CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_maskz_xor_epi32(__U, __A, __B);
 }
 
 __m256i test_mm256_mask_and_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
            __m256i __B) {
   //CHECK-LABEL: @test_mm256_mask_and_epi64
-  //CHECK: @llvm.x86.avx512.mask.pand.q.256
+  //CHECK: and <4 x i64> %{{.*}}, %{{.*}}
   return _mm256_mask_and_epi64(__W, __U, __A, __B);
 }
 
 __m256i test_mm256_maskz_and_epi64 (__mmask8 __U, __m256i __A, __m256i __B) {
   //CHECK-LABEL: @test_mm256_maskz_and_epi64
-  //CHECK: @llvm.x86.avx512.mask.pand.q.256
+  //CHECK: and <4 x i64> %{{.*}}, %{{.*}}
+  //CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
   return _mm256_maskz_and_epi64(__U, __A, __B);
 }
 
 __m128i test_mm_mask_and_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
         __m128i __B) {
   //CHECK-LABEL: @test_mm_mask_and_epi64
-  //CHECK: @llvm.x86.avx512.mask.pand.q.128
+  //CHECK: and <2 x i64> %{{.*}}, %{{.*}}
   return _mm_mask_and_epi64(__W,__U, __A, __B);
 }
 
 __m128i test_mm_maskz_and_epi64 (__mmask8 __U, __m128i __A, __m128i __B) {
   //CHECK-LABEL: @test_mm_maskz_and_epi64
-  //CHECK: @llvm.x86.avx512.mask.pand.q.128
+  //CHECK: and <2 x i64> %{{.*}}, %{{.*}}
+  //CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
   return _mm_maskz_and_epi64(__U, __A, __B);
 }
 
 __m256i test_mm256_mask_andnot_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
         __m256i __B) {
   //CHECK-LABEL: @test_mm256_mask_andnot_epi64
-  //CHECK: @llvm.x86.avx512.mask.pandn.q.256
+  //CHECK: xor <4 x i64> %{{.*}}, <i64 -1, i64 -1, i64 -1, i64 -1>
+  //CHECK: and <4 x i64> %{{.*}}, %{{.*}}
+  //CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
   return _mm256_mask_andnot_epi64(__W, __U, __A, __B);
 }
 
 __m256i test_mm256_maskz_andnot_epi64 (__mmask8 __U, __m256i __A, __m256i __B) {
   //CHECK-LABEL: @test_mm256_maskz_andnot_epi64
-  //CHECK: @llvm.x86.avx512.mask.pandn.q.256
+  //CHECK: xor <4 x i64> %{{.*}}, <i64 -1, i64 -1, i64 -1, i64 -1>
+  //CHECK: and <4 x i64> %{{.*}}, %{{.*}}
+  //CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
   return _mm256_maskz_andnot_epi64(__U, __A, __B);
 }
 
 __m128i test_mm_mask_andnot_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
            __m128i __B) {
   //CHECK-LABEL: @test_mm_mask_andnot_epi64
-  //CHECK: @llvm.x86.avx512.mask.pandn.q.128
+  //CHECK: xor <2 x i64> %{{.*}}, <i64 -1, i64 -1>
+  //CHECK: and <2 x i64> %{{.*}}, %{{.*}}
+  //CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
   return _mm_mask_andnot_epi64(__W,__U, __A, __B);
 }
 
 __m128i test_mm_maskz_andnot_epi64 (__mmask8 __U, __m128i __A, __m128i __B) {
   //CHECK-LABEL: @test_mm_maskz_andnot_epi64
-  //CHECK: @llvm.x86.avx512.mask.pandn.q.128
+  //CHECK: xor <2 x i64> %{{.*}}, <i64 -1, i64 -1>
+  //CHECK: and <2 x i64> %{{.*}}, %{{.*}}
+  //CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
   return _mm_maskz_andnot_epi64(__U, __A, __B);
 }
 
 __m256i test_mm256_mask_or_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
           __m256i __B) {
   //CHECK-LABEL: @test_mm256_mask_or_epi64
-  //CHECK: @llvm.x86.avx512.mask.por.q.256
+  //CHECK: or <4 x i64> %{{.*}}, %{{.*}}
   return _mm256_mask_or_epi64(__W,__U, __A, __B);
 }
 
 __m256i test_mm256_maskz_or_epi64 (__mmask8 __U, __m256i __A, __m256i __B) {
   //CHECK-LABEL: @test_mm256_maskz_or_epi64
-  //CHECK: @llvm.x86.avx512.mask.por.q.256
+  //CHECK: or <4 x i64> %{{.*}}, %{{.*}}
+  //CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
   return _mm256_maskz_or_epi64(__U, __A, __B);
 }
 
 __m128i test_mm_mask_or_epi64 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
   //CHECK-LABEL: @test_mm_mask_or_epi64
-  //CHECK: @llvm.x86.avx512.mask.por.q.128
+  //CHECK: or <2 x i64> %{{.*}}, %{{.*}}
   return _mm_mask_or_epi64(__W, __U, __A, __B);
 }
 
 __m128i test_mm_maskz_or_epi64 (__mmask8 __U, __m128i __A, __m128i __B) {
-//CHECK-LABEL: @test_mm_maskz_or_epi64
-  //CHECK: @llvm.x86.avx512.mask.por.q.128
+  //CHECK-LABEL: @test_mm_maskz_or_epi64
+  //CHECK: or <2 x i64> %{{.*}}, %{{.*}}
+  //CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
   return _mm_maskz_or_epi64( __U, __A, __B);
 }
 
 __m256i test_mm256_mask_xor_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
           __m256i __B) {
   //CHECK-LABEL: @test_mm256_mask_xor_epi64
-  //CHECK: @llvm.x86.avx512.mask.pxor.q.256
+  //CHECK: xor <4 x i64> %{{.*}}, %{{.*}}
   return _mm256_mask_xor_epi64(__W,__U, __A, __B);
 }
 
 __m256i test_mm256_maskz_xor_epi64 (__mmask8 __U, __m256i __A, __m256i __B) {
   //CHECK-LABEL: @test_mm256_maskz_xor_epi64
-  //CHECK: @llvm.x86.avx512.mask.pxor.q.256
+  //CHECK: xor <4 x i64> %{{.*}}, %{{.*}}
+  //CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
   return _mm256_maskz_xor_epi64(__U, __A, __B);
 }
 
 __m128i test_mm_mask_xor_epi64 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
   //CHECK-LABEL: @test_mm_mask_xor_epi64
-  //CHECK: @llvm.x86.avx512.mask.pxor.q.128
+  //CHECK: xor <2 x i64> %{{.*}}, %{{.*}}
   return _mm_mask_xor_epi64(__W, __U, __A, __B);
 }
 
 __m128i test_mm_maskz_xor_epi64 (__mmask8 __U, __m128i __A, __m128i __B) {
   //CHECK-LABEL: @test_mm_maskz_xor_epi64
-  //CHECK: @llvm.x86.avx512.mask.pxor.q.128
+  //CHECK: xor <2 x i64> %{{.*}}, %{{.*}}
+  //CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
   return _mm_maskz_xor_epi64( __U, __A, __B);
 }
 
@@ -993,16 +1033,16 @@ __mmask8 test_mm256_mask_cmp_ps_mask(__mmask8 m, __m256 __A, __m256 __B) {
   return _mm256_mask_cmp_ps_mask(m, __A, __B, 0);
 }
 
-__mmask8 test_mm128_cmp_ps_mask(__m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm128_cmp_ps_mask
+__mmask8 test_mm_cmp_ps_mask(__m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_cmp_ps_mask
   // CHECK: @llvm.x86.avx512.mask.cmp.ps.128
-  return (__mmask8)_mm128_cmp_ps_mask(__A, __B, 0);
+  return (__mmask8)_mm_cmp_ps_mask(__A, __B, 0);
 }
 
-__mmask8 test_mm128_mask_cmp_ps_mask(__mmask8 m, __m128 __A, __m128 __B) {
-  // CHECK-LABEL: @test_mm128_mask_cmp_ps_mask
+__mmask8 test_mm_mask_cmp_ps_mask(__mmask8 m, __m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_mask_cmp_ps_mask
   // CHECK: @llvm.x86.avx512.mask.cmp.ps.128
-  return _mm128_mask_cmp_ps_mask(m, __A, __B, 0);
+  return _mm_mask_cmp_ps_mask(m, __A, __B, 0);
 }
 
 __mmask8 test_mm256_cmp_pd_mask(__m256d __A, __m256d __B) {
@@ -1017,21 +1057,18 @@ __mmask8 test_mm256_mask_cmp_pd_mask(__mmask8 m, __m256d __A, __m256d __B) {
   return _mm256_mask_cmp_pd_mask(m, __A, __B, 0);
 }
 
-__mmask8 test_mm128_cmp_pd_mask(__m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm128_cmp_pd_mask
+__mmask8 test_mm_cmp_pd_mask(__m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_cmp_pd_mask
   // CHECK: @llvm.x86.avx512.mask.cmp.pd.128
-  return (__mmask8)_mm128_cmp_pd_mask(__A, __B, 0);
+  return (__mmask8)_mm_cmp_pd_mask(__A, __B, 0);
 }
 
-__mmask8 test_mm128_mask_cmp_pd_mask(__mmask8 m, __m128d __A, __m128d __B) {
-  // CHECK-LABEL: @test_mm128_mask_cmp_pd_mask
+__mmask8 test_mm_mask_cmp_pd_mask(__mmask8 m, __m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_mask_cmp_pd_mask
   // CHECK: @llvm.x86.avx512.mask.cmp.pd.128
-  return _mm128_mask_cmp_pd_mask(m, __A, __B, 0);
+  return _mm_mask_cmp_pd_mask(m, __A, __B, 0);
 }
 
-
-//igorb
-
 __m128d test_mm_mask_fmadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) {
   // CHECK-LABEL: @test_mm_mask_fmadd_pd
   // CHECK: @llvm.x86.avx512.mask.vfmadd.pd.128
@@ -1506,42 +1543,42 @@ __m256 test_mm256_maskz_add_ps(__mmask16 __U, __m256 __A, __m256 __B) {
 }
 __m128i test_mm_mask_blend_epi32(__mmask8 __U, __m128i __A, __m128i __W) {
   // CHECK-LABEL: @test_mm_mask_blend_epi32
-  // CHECK: @llvm.x86.avx512.mask.blend.d.128
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_mask_blend_epi32(__U,__A,__W); 
 }
 __m256i test_mm256_mask_blend_epi32(__mmask8 __U, __m256i __A, __m256i __W) {
   // CHECK-LABEL: @test_mm256_mask_blend_epi32
-  // CHECK: @llvm.x86.avx512.mask.blend.d.256
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_mask_blend_epi32(__U,__A,__W); 
 }
 __m128d test_mm_mask_blend_pd(__mmask8 __U, __m128d __A, __m128d __W) {
   // CHECK-LABEL: @test_mm_mask_blend_pd
-  // CHECK: @llvm.x86.avx512.mask.blend.pd.128
+  // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
   return _mm_mask_blend_pd(__U,__A,__W); 
 }
 __m256d test_mm256_mask_blend_pd(__mmask8 __U, __m256d __A, __m256d __W) {
   // CHECK-LABEL: @test_mm256_mask_blend_pd
-  // CHECK: @llvm.x86.avx512.mask.blend.pd.256
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_mask_blend_pd(__U,__A,__W); 
 }
 __m128 test_mm_mask_blend_ps(__mmask8 __U, __m128 __A, __m128 __W) {
   // CHECK-LABEL: @test_mm_mask_blend_ps
-  // CHECK: @llvm.x86.avx512.mask.blend.ps.128
+  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
   return _mm_mask_blend_ps(__U,__A,__W); 
 }
 __m256 test_mm256_mask_blend_ps(__mmask8 __U, __m256 __A, __m256 __W) {
   // CHECK-LABEL: @test_mm256_mask_blend_ps
-  // CHECK: @llvm.x86.avx512.mask.blend.ps.256
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
   return _mm256_mask_blend_ps(__U,__A,__W); 
 }
 __m128i test_mm_mask_blend_epi64(__mmask8 __U, __m128i __A, __m128i __W) {
   // CHECK-LABEL: @test_mm_mask_blend_epi64
-  // CHECK: @llvm.x86.avx512.mask.blend.q.128
+  // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
   return _mm_mask_blend_epi64(__U,__A,__W); 
 }
 __m256i test_mm256_mask_blend_epi64(__mmask8 __U, __m256i __A, __m256i __W) {
   // CHECK-LABEL: @test_mm256_mask_blend_epi64
-  // CHECK: @llvm.x86.avx512.mask.blend.q.256
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
   return _mm256_mask_blend_epi64(__U,__A,__W); 
 }
 __m128d test_mm_mask_compress_pd(__m128d __W, __mmask8 __U, __m128d __A) {
@@ -3154,3 +3191,3734 @@ __m256i test_mm256_maskz_permutex2var_epi64(__mmask8 __U, __m256i __A, __m256i _
   // CHECK: @llvm.x86.avx512.maskz.vpermt2var.q.256
   return _mm256_maskz_permutex2var_epi64(__U,__A,__I,__B); 
 }
+
+__m128i test_mm_mask_cvtepi8_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtepi8_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovsxb.d.128
+  return _mm_mask_cvtepi8_epi32(__W, __U, __A); 
+}
+
+__m128i test_mm_maskz_cvtepi8_epi32(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtepi8_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovsxb.d.128
+  return _mm_maskz_cvtepi8_epi32(__U, __A); 
+}
+
+__m256i test_mm256_mask_cvtepi8_epi32(__m256i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtepi8_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovsxb.d.256
+  return _mm256_mask_cvtepi8_epi32(__W, __U, __A); 
+}
+
+__m256i test_mm256_maskz_cvtepi8_epi32(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtepi8_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovsxb.d.256
+  return _mm256_maskz_cvtepi8_epi32(__U, __A); 
+}
+
+__m128i test_mm_mask_cvtepi8_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtepi8_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovsxb.q.128
+  return _mm_mask_cvtepi8_epi64(__W, __U, __A); 
+}
+
+__m128i test_mm_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtepi8_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovsxb.q.128
+  return _mm_maskz_cvtepi8_epi64(__U, __A); 
+}
+
+__m256i test_mm256_mask_cvtepi8_epi64(__m256i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtepi8_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovsxb.q.256
+  return _mm256_mask_cvtepi8_epi64(__W, __U, __A); 
+}
+
+__m256i test_mm256_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtepi8_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovsxb.q.256
+  return _mm256_maskz_cvtepi8_epi64(__U, __A); 
+}
+
+__m128i test_mm_mask_cvtepi32_epi64(__m128i __W, __mmask8 __U, __m128i __X) {
+  // CHECK-LABEL: @test_mm_mask_cvtepi32_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovsxd.q.128
+  return _mm_mask_cvtepi32_epi64(__W, __U, __X); 
+}
+
+__m128i test_mm_maskz_cvtepi32_epi64(__mmask8 __U, __m128i __X) {
+  // CHECK-LABEL: @test_mm_maskz_cvtepi32_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovsxd.q.128
+  return _mm_maskz_cvtepi32_epi64(__U, __X); 
+}
+
+__m256i test_mm256_mask_cvtepi32_epi64(__m256i __W, __mmask8 __U, __m128i __X) {
+  // CHECK-LABEL: @test_mm256_mask_cvtepi32_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovsxd.q.256
+  return _mm256_mask_cvtepi32_epi64(__W, __U, __X); 
+}
+
+__m256i test_mm256_maskz_cvtepi32_epi64(__mmask8 __U, __m128i __X) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtepi32_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovsxd.q.256
+  return _mm256_maskz_cvtepi32_epi64(__U, __X); 
+}
+
+__m128i test_mm_mask_cvtepi16_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtepi16_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovsxw.d.128
+  return _mm_mask_cvtepi16_epi32(__W, __U, __A); 
+}
+
+__m128i test_mm_maskz_cvtepi16_epi32(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtepi16_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovsxw.d.128
+  return _mm_maskz_cvtepi16_epi32(__U, __A); 
+}
+
+__m256i test_mm256_mask_cvtepi16_epi32(__m256i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtepi16_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovsxw.d.256
+  return _mm256_mask_cvtepi16_epi32(__W, __U, __A); 
+}
+
+__m256i test_mm256_maskz_cvtepi16_epi32(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtepi16_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovsxw.d.256
+  return _mm256_maskz_cvtepi16_epi32(__U, __A); 
+}
+
+__m128i test_mm_mask_cvtepi16_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtepi16_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovsxw.q.128
+  return _mm_mask_cvtepi16_epi64(__W, __U, __A); 
+}
+
+__m128i test_mm_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtepi16_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovsxw.q.128
+  return _mm_maskz_cvtepi16_epi64(__U, __A); 
+}
+
+__m256i test_mm256_mask_cvtepi16_epi64(__m256i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtepi16_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovsxw.q.256
+  return _mm256_mask_cvtepi16_epi64(__W, __U, __A); 
+}
+
+__m256i test_mm256_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtepi16_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovsxw.q.256
+  return _mm256_maskz_cvtepi16_epi64(__U, __A); 
+}
+
+__m128i test_mm_mask_cvtepu8_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtepu8_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovzxb.d.128
+  return _mm_mask_cvtepu8_epi32(__W, __U, __A); 
+}
+
+__m128i test_mm_maskz_cvtepu8_epi32(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtepu8_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovzxb.d.128
+  return _mm_maskz_cvtepu8_epi32(__U, __A);
+}
+
+__m256i test_mm256_mask_cvtepu8_epi32(__m256i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtepu8_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovzxb.d.256
+  return _mm256_mask_cvtepu8_epi32(__W, __U, __A); 
+}
+
+__m256i test_mm256_maskz_cvtepu8_epi32(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtepu8_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovzxb.d.256
+  return _mm256_maskz_cvtepu8_epi32(__U, __A); 
+}
+
+__m128i test_mm_mask_cvtepu8_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtepu8_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxb.q.128
+  return _mm_mask_cvtepu8_epi64(__W, __U, __A); 
+}
+
+__m128i test_mm_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtepu8_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxb.q.128
+  return _mm_maskz_cvtepu8_epi64(__U, __A); 
+}
+
+__m256i test_mm256_mask_cvtepu8_epi64(__m256i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtepu8_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxb.q.256
+  return _mm256_mask_cvtepu8_epi64(__W, __U, __A); 
+}
+
+__m256i test_mm256_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtepu8_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxb.q.256
+  return _mm256_maskz_cvtepu8_epi64(__U, __A); 
+}
+
+__m128i test_mm_mask_cvtepu32_epi64(__m128i __W, __mmask8 __U, __m128i __X) {
+  // CHECK-LABEL: @test_mm_mask_cvtepu32_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxd.q.128
+  return _mm_mask_cvtepu32_epi64(__W, __U, __X); 
+}
+
+__m128i test_mm_maskz_cvtepu32_epi64(__mmask8 __U, __m128i __X) {
+  // CHECK-LABEL: @test_mm_maskz_cvtepu32_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxd.q.128
+  return _mm_maskz_cvtepu32_epi64(__U, __X); 
+}
+
+__m256i test_mm256_mask_cvtepu32_epi64(__m256i __W, __mmask8 __U, __m128i __X) {
+  // CHECK-LABEL: @test_mm256_mask_cvtepu32_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxd.q.256
+  return _mm256_mask_cvtepu32_epi64(__W, __U, __X); 
+}
+
+__m256i test_mm256_maskz_cvtepu32_epi64(__mmask8 __U, __m128i __X) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtepu32_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxd.q.256
+  return _mm256_maskz_cvtepu32_epi64(__U, __X); 
+}
+
+__m128i test_mm_mask_cvtepu16_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtepu16_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovzxw.d.128
+  return _mm_mask_cvtepu16_epi32(__W, __U, __A); 
+}
+
+__m128i test_mm_maskz_cvtepu16_epi32(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtepu16_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovzxw.d.128
+  return _mm_maskz_cvtepu16_epi32(__U, __A); 
+}
+
+__m256i test_mm256_mask_cvtepu16_epi32(__m256i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtepu16_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovzxw.d.256
+  return _mm256_mask_cvtepu16_epi32(__W, __U, __A); 
+}
+
+__m256i test_mm256_maskz_cvtepu16_epi32(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtepu16_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovzxw.d.256
+  return _mm256_maskz_cvtepu16_epi32(__U, __A); 
+}
+
+__m128i test_mm_mask_cvtepu16_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtepu16_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxw.q.128
+  return _mm_mask_cvtepu16_epi64(__W, __U, __A); 
+}
+
+__m128i test_mm_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtepu16_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxw.q.128
+  return _mm_maskz_cvtepu16_epi64(__U, __A); 
+}
+
+__m256i test_mm256_mask_cvtepu16_epi64(__m256i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtepu16_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxw.q.256
+  return _mm256_mask_cvtepu16_epi64(__W, __U, __A); 
+}
+
+__m256i test_mm256_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtepu16_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxw.q.256
+  return _mm256_maskz_cvtepu16_epi64(__U, __A); 
+}
+
+__m128i test_mm_rol_epi32(__m128i __A) {
+  // CHECK-LABEL: @test_mm_rol_epi32
+  // CHECK: @llvm.x86.avx512.mask.prol.d.128
+  return _mm_rol_epi32(__A, 5); 
+}
+
+__m128i test_mm_mask_rol_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_rol_epi32
+  // CHECK: @llvm.x86.avx512.mask.prol.d.128
+  return _mm_mask_rol_epi32(__W, __U, __A, 5); 
+}
+
+__m128i test_mm_maskz_rol_epi32(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_rol_epi32
+  // CHECK: @llvm.x86.avx512.mask.prol.d.128
+  return _mm_maskz_rol_epi32(__U, __A, 5); 
+}
+
+__m256i test_mm256_rol_epi32(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_rol_epi32
+  // CHECK: @llvm.x86.avx512.mask.prol.d.256
+  return _mm256_rol_epi32(__A, 5); 
+}
+
+__m256i test_mm256_mask_rol_epi32(__m256i __W, __mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_rol_epi32
+  // CHECK: @llvm.x86.avx512.mask.prol.d.256
+  return _mm256_mask_rol_epi32(__W, __U, __A, 5); 
+}
+
+__m256i test_mm256_maskz_rol_epi32(__mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_rol_epi32
+  // CHECK: @llvm.x86.avx512.mask.prol.d.256
+  return _mm256_maskz_rol_epi32(__U, __A, 5); 
+}
+
+__m128i test_mm_rol_epi64(__m128i __A) {
+  // CHECK-LABEL: @test_mm_rol_epi64
+  // CHECK: @llvm.x86.avx512.mask.prol.q.128
+  return _mm_rol_epi64(__A, 5); 
+}
+
+__m128i test_mm_mask_rol_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_rol_epi64
+  // CHECK: @llvm.x86.avx512.mask.prol.q.128
+  return _mm_mask_rol_epi64(__W, __U, __A, 5); 
+}
+
+__m128i test_mm_maskz_rol_epi64(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_rol_epi64
+  // CHECK: @llvm.x86.avx512.mask.prol.q.128
+  return _mm_maskz_rol_epi64(__U, __A, 5); 
+}
+
+__m256i test_mm256_rol_epi64(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_rol_epi64
+  // CHECK: @llvm.x86.avx512.mask.prol.q.256
+  return _mm256_rol_epi64(__A, 5); 
+}
+
+__m256i test_mm256_mask_rol_epi64(__m256i __W, __mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_rol_epi64
+  // CHECK: @llvm.x86.avx512.mask.prol.q.256
+  return _mm256_mask_rol_epi64(__W, __U, __A, 5); 
+}
+
+__m256i test_mm256_maskz_rol_epi64(__mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_rol_epi64
+  // CHECK: @llvm.x86.avx512.mask.prol.q.256
+  return _mm256_maskz_rol_epi64(__U, __A, 5); 
+}
+
+__m128i test_mm_rolv_epi32(__m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_rolv_epi32
+  // CHECK: @llvm.x86.avx512.mask.prolv.d.128
+  return _mm_rolv_epi32(__A, __B); 
+}
+
+__m128i test_mm_mask_rolv_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask_rolv_epi32
+  // CHECK: @llvm.x86.avx512.mask.prolv.d.128
+  return _mm_mask_rolv_epi32(__W, __U, __A, __B); 
+}
+
+__m128i test_mm_maskz_rolv_epi32(__mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_maskz_rolv_epi32
+  // CHECK: @llvm.x86.avx512.mask.prolv.d.128
+  return _mm_maskz_rolv_epi32(__U, __A, __B); 
+}
+
+__m256i test_mm256_rolv_epi32(__m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_rolv_epi32
+  // CHECK: @llvm.x86.avx512.mask.prolv.d.256
+  return _mm256_rolv_epi32(__A, __B); 
+}
+
+__m256i test_mm256_mask_rolv_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_mask_rolv_epi32
+  // CHECK: @llvm.x86.avx512.mask.prolv.d.256
+  return _mm256_mask_rolv_epi32(__W, __U, __A, __B); 
+}
+
+__m256i test_mm256_maskz_rolv_epi32(__mmask8 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_maskz_rolv_epi32
+  // CHECK: @llvm.x86.avx512.mask.prolv.d.256
+  return _mm256_maskz_rolv_epi32(__U, __A, __B); 
+}
+
+__m128i test_mm_rolv_epi64(__m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_rolv_epi64
+  // CHECK: @llvm.x86.avx512.mask.prolv.q.128
+  return _mm_rolv_epi64(__A, __B); 
+}
+
+__m128i test_mm_mask_rolv_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask_rolv_epi64
+  // CHECK: @llvm.x86.avx512.mask.prolv.q.128
+  return _mm_mask_rolv_epi64(__W, __U, __A, __B); 
+}
+
+__m128i test_mm_maskz_rolv_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_maskz_rolv_epi64
+  // CHECK: @llvm.x86.avx512.mask.prolv.q.128
+  return _mm_maskz_rolv_epi64(__U, __A, __B); 
+}
+
+__m256i test_mm256_rolv_epi64(__m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_rolv_epi64
+  // CHECK: @llvm.x86.avx512.mask.prolv.q.256
+  return _mm256_rolv_epi64(__A, __B); 
+}
+
+__m256i test_mm256_mask_rolv_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_mask_rolv_epi64
+  // CHECK: @llvm.x86.avx512.mask.prolv.q.256
+  return _mm256_mask_rolv_epi64(__W, __U, __A, __B); 
+}
+
+__m256i test_mm256_maskz_rolv_epi64(__mmask8 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_maskz_rolv_epi64
+  // CHECK: @llvm.x86.avx512.mask.prolv.q.256
+  return _mm256_maskz_rolv_epi64(__U, __A, __B); 
+}
+
+__m128i test_mm_ror_epi32(__m128i __A) {
+  // CHECK-LABEL: @test_mm_ror_epi32
+  // CHECK: @llvm.x86.avx512.mask.pror.d.128
+  return _mm_ror_epi32(__A, 5); 
+}
+
+__m128i test_mm_mask_ror_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_ror_epi32
+  // CHECK: @llvm.x86.avx512.mask.pror.d.128
+  return _mm_mask_ror_epi32(__W, __U, __A, 5); 
+}
+
+__m128i test_mm_maskz_ror_epi32(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_ror_epi32
+  // CHECK: @llvm.x86.avx512.mask.pror.d.128
+  return _mm_maskz_ror_epi32(__U, __A, 5); 
+}
+
+__m256i test_mm256_ror_epi32(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_ror_epi32
+  // CHECK: @llvm.x86.avx512.mask.pror.d.256
+  return _mm256_ror_epi32(__A, 5); 
+}
+
+__m256i test_mm256_mask_ror_epi32(__m256i __W, __mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_ror_epi32
+  // CHECK: @llvm.x86.avx512.mask.pror.d.256
+  return _mm256_mask_ror_epi32(__W, __U, __A, 5); 
+}
+
+__m256i test_mm256_maskz_ror_epi32(__mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_ror_epi32
+  // CHECK: @llvm.x86.avx512.mask.pror.d.256
+  return _mm256_maskz_ror_epi32(__U, __A, 5); 
+}
+
+__m128i test_mm_ror_epi64(__m128i __A) {
+  // CHECK-LABEL: @test_mm_ror_epi64
+  // CHECK: @llvm.x86.avx512.mask.pror.q.128
+  return _mm_ror_epi64(__A, 5); 
+}
+
+__m128i test_mm_mask_ror_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_ror_epi64
+  // CHECK: @llvm.x86.avx512.mask.pror.q.128
+  return _mm_mask_ror_epi64(__W, __U, __A, 5); 
+}
+
+__m128i test_mm_maskz_ror_epi64(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_ror_epi64
+  // CHECK: @llvm.x86.avx512.mask.pror.q.128
+  return _mm_maskz_ror_epi64(__U, __A, 5); 
+}
+
+__m256i test_mm256_ror_epi64(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_ror_epi64
+  // CHECK: @llvm.x86.avx512.mask.pror.q.256
+  return _mm256_ror_epi64(__A, 5); 
+}
+
+__m256i test_mm256_mask_ror_epi64(__m256i __W, __mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_ror_epi64
+  // CHECK: @llvm.x86.avx512.mask.pror.q.256
+  return _mm256_mask_ror_epi64(__W, __U, __A,5); 
+}
+
+__m256i test_mm256_maskz_ror_epi64(__mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_ror_epi64
+  // CHECK: @llvm.x86.avx512.mask.pror.q.256
+  return _mm256_maskz_ror_epi64(__U, __A, 5); 
+}
+
+
+__m128i test_mm_rorv_epi32(__m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_rorv_epi32
+  // CHECK: @llvm.x86.avx512.mask.prorv.d.128
+  return _mm_rorv_epi32(__A, __B); 
+}
+
+__m128i test_mm_mask_rorv_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask_rorv_epi32
+  // CHECK: @llvm.x86.avx512.mask.prorv.d.128
+  return _mm_mask_rorv_epi32(__W, __U, __A, __B); 
+}
+
+__m128i test_mm_maskz_rorv_epi32(__mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_maskz_rorv_epi32
+  // CHECK: @llvm.x86.avx512.mask.prorv.d.128
+  return _mm_maskz_rorv_epi32(__U, __A, __B); 
+}
+
+__m256i test_mm256_rorv_epi32(__m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_rorv_epi32
+  // CHECK: @llvm.x86.avx512.mask.prorv.d.256
+  return _mm256_rorv_epi32(__A, __B); 
+}
+
+__m256i test_mm256_mask_rorv_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_mask_rorv_epi32
+  // CHECK: @llvm.x86.avx512.mask.prorv.d.256
+  return _mm256_mask_rorv_epi32(__W, __U, __A, __B); 
+}
+
+__m256i test_mm256_maskz_rorv_epi32(__mmask8 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_maskz_rorv_epi32
+  // CHECK: @llvm.x86.avx512.mask.prorv.d.256
+  return _mm256_maskz_rorv_epi32(__U, __A, __B); 
+}
+
+__m128i test_mm_rorv_epi64(__m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_rorv_epi64
+  // CHECK: @llvm.x86.avx512.mask.prorv.q.128
+  return _mm_rorv_epi64(__A, __B); 
+}
+
+__m128i test_mm_mask_rorv_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask_rorv_epi64
+  // CHECK: @llvm.x86.avx512.mask.prorv.q.128
+  return _mm_mask_rorv_epi64(__W, __U, __A, __B); 
+}
+
+__m128i test_mm_maskz_rorv_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_maskz_rorv_epi64
+  // CHECK: @llvm.x86.avx512.mask.prorv.q.128
+  return _mm_maskz_rorv_epi64(__U, __A, __B); 
+}
+
+__m256i test_mm256_rorv_epi64(__m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_rorv_epi64
+  // CHECK: @llvm.x86.avx512.mask.prorv.q.256
+  return _mm256_rorv_epi64(__A, __B); 
+}
+
+__m256i test_mm256_mask_rorv_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_mask_rorv_epi64
+  // CHECK: @llvm.x86.avx512.mask.prorv.q.256
+  return _mm256_mask_rorv_epi64(__W, __U, __A, __B); 
+}
+
+__m256i test_mm256_maskz_rorv_epi64(__mmask8 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_maskz_rorv_epi64
+  // CHECK: @llvm.x86.avx512.mask.prorv.q.256
+  return _mm256_maskz_rorv_epi64(__U, __A, __B); 
+}
+
+__m128i test_mm_mask_sllv_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
+  // CHECK-LABEL: @test_mm_mask_sllv_epi64
+  // CHECK: @llvm.x86.avx512.mask.psllv
+  return _mm_mask_sllv_epi64(__W, __U, __X, __Y); 
+}
+
+__m128i test_mm_maskz_sllv_epi64(__mmask8 __U, __m128i __X, __m128i __Y) {
+  // CHECK-LABEL: @test_mm_maskz_sllv_epi64
+  // CHECK: @llvm.x86.avx512.mask.psllv
+  return _mm_maskz_sllv_epi64(__U, __X, __Y); 
+}
+
+__m256i test_mm256_mask_sllv_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) {
+  // CHECK-LABEL: @test_mm256_mask_sllv_epi64
+  // CHECK: @llvm.x86.avx512.mask.psllv
+  return _mm256_mask_sllv_epi64(__W, __U, __X, __Y); 
+}
+
+__m256i test_mm256_maskz_sllv_epi64(__mmask8 __U, __m256i __X, __m256i __Y) {
+  // CHECK-LABEL: @test_mm256_maskz_sllv_epi64
+  // CHECK: @llvm.x86.avx512.mask.psllv
+  return _mm256_maskz_sllv_epi64(__U, __X, __Y); 
+}
+
+__m128i test_mm_mask_sllv_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
+  // CHECK-LABEL: @test_mm_mask_sllv_epi32
+  // CHECK: @llvm.x86.avx512.mask.psllv
+  return _mm_mask_sllv_epi32(__W, __U, __X, __Y); 
+}
+
+__m128i test_mm_maskz_sllv_epi32(__mmask8 __U, __m128i __X, __m128i __Y) {
+  // CHECK-LABEL: @test_mm_maskz_sllv_epi32
+  // CHECK: @llvm.x86.avx512.mask.psllv
+  return _mm_maskz_sllv_epi32(__U, __X, __Y); 
+}
+
+__m256i test_mm256_mask_sllv_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) {
+  // CHECK-LABEL: @test_mm256_mask_sllv_epi32
+  // CHECK: @llvm.x86.avx512.mask.psllv
+  return _mm256_mask_sllv_epi32(__W, __U, __X, __Y); 
+}
+
+__m256i test_mm256_maskz_sllv_epi32(__mmask8 __U, __m256i __X, __m256i __Y) {
+  // CHECK-LABEL: @test_mm256_maskz_sllv_epi32
+  // CHECK: @llvm.x86.avx512.mask.psllv
+  return _mm256_maskz_sllv_epi32(__U, __X, __Y); 
+}
+
+__m128i test_mm_mask_srlv_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
+  // CHECK-LABEL: @test_mm_mask_srlv_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrlv
+  return _mm_mask_srlv_epi64(__W, __U, __X, __Y); 
+}
+
+__m128i test_mm_maskz_srlv_epi64(__mmask8 __U, __m128i __X, __m128i __Y) {
+  // CHECK-LABEL: @test_mm_maskz_srlv_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrlv
+  return _mm_maskz_srlv_epi64(__U, __X, __Y); 
+}
+
+__m256i test_mm256_mask_srlv_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) {
+  // CHECK-LABEL: @test_mm256_mask_srlv_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrlv
+  return _mm256_mask_srlv_epi64(__W, __U, __X, __Y); 
+}
+
+__m256i test_mm256_maskz_srlv_epi64(__mmask8 __U, __m256i __X, __m256i __Y) {
+  // CHECK-LABEL: @test_mm256_maskz_srlv_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrlv
+  return _mm256_maskz_srlv_epi64(__U, __X, __Y); 
+}
+
+__m128i test_mm_mask_srlv_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
+  // CHECK-LABEL: @test_mm_mask_srlv_epi32
+  // CHECK: @llvm.x86.avx512.mask.psrlv
+  return _mm_mask_srlv_epi32(__W, __U, __X, __Y); 
+}
+
+__m128i test_mm_maskz_srlv_epi32(__mmask8 __U, __m128i __X, __m128i __Y) {
+  // CHECK-LABEL: @test_mm_maskz_srlv_epi32
+  // CHECK: @llvm.x86.avx512.mask.psrlv
+  return _mm_maskz_srlv_epi32(__U, __X, __Y); 
+}
+
+__m256i test_mm256_mask_srlv_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) {
+  // CHECK-LABEL: @test_mm256_mask_srlv_epi32
+  // CHECK: @llvm.x86.avx512.mask.psrlv
+  return _mm256_mask_srlv_epi32(__W, __U, __X, __Y); 
+}
+
+__m256i test_mm256_maskz_srlv_epi32(__mmask8 __U, __m256i __X, __m256i __Y) {
+  // CHECK-LABEL: @test_mm256_maskz_srlv_epi32
+  // CHECK: @llvm.x86.avx512.mask.psrlv
+  return _mm256_maskz_srlv_epi32(__U, __X, __Y); 
+}
+
+__m128i test_mm_mask_srl_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask_srl_epi32
+  // CHECK: @llvm.x86.avx512.mask.psrl.d.128
+  return _mm_mask_srl_epi32(__W, __U, __A, __B); 
+}
+
+__m128i test_mm_maskz_srl_epi32(__mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_maskz_srl_epi32
+  // CHECK: @llvm.x86.avx512.mask.psrl.d.128
+  return _mm_maskz_srl_epi32(__U, __A, __B); 
+}
+
+__m256i test_mm256_mask_srl_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm256_mask_srl_epi32
+  // CHECK: @llvm.x86.avx512.mask.psrl.d.256
+  return _mm256_mask_srl_epi32(__W, __U, __A, __B); 
+}
+
+__m256i test_mm256_maskz_srl_epi32(__mmask8 __U, __m256i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm256_maskz_srl_epi32
+  // CHECK: @llvm.x86.avx512.mask.psrl.d.256
+  return _mm256_maskz_srl_epi32(__U, __A, __B); 
+}
+
+__m128i test_mm_mask_srli_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_srli_epi32
+  // CHECK: @llvm.x86.avx512.mask.psrl.di.128
+  return _mm_mask_srli_epi32(__W, __U, __A, 5); 
+}
+
+__m128i test_mm_maskz_srli_epi32(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_srli_epi32
+  // CHECK: @llvm.x86.avx512.mask.psrl.di.128
+  return _mm_maskz_srli_epi32(__U, __A, 5); 
+}
+
+__m256i test_mm256_mask_srli_epi32(__m256i __W, __mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_srli_epi32
+  // CHECK: @llvm.x86.avx512.mask.psrl.di.256
+  return _mm256_mask_srli_epi32(__W, __U, __A, 5); 
+}
+
+__m256i test_mm256_maskz_srli_epi32(__mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_srli_epi32
+  // CHECK: @llvm.x86.avx512.mask.psrl.di.256
+  return _mm256_maskz_srli_epi32(__U, __A, 5); 
+}
+
+__m128i test_mm_mask_srl_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask_srl_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrl.q.128
+  return _mm_mask_srl_epi64(__W, __U, __A, __B); 
+}
+
+__m128i test_mm_maskz_srl_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_maskz_srl_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrl.q.128
+  return _mm_maskz_srl_epi64(__U, __A, __B); 
+}
+
+__m256i test_mm256_mask_srl_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm256_mask_srl_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrl.q.256
+  return _mm256_mask_srl_epi64(__W, __U, __A, __B); 
+}
+
+__m256i test_mm256_maskz_srl_epi64(__mmask8 __U, __m256i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm256_maskz_srl_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrl.q.256
+  return _mm256_maskz_srl_epi64(__U, __A, __B); 
+}
+
+__m128i test_mm_mask_srli_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_srli_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrl.qi.128
+  return _mm_mask_srli_epi64(__W, __U, __A, 5); 
+}
+
+__m128i test_mm_maskz_srli_epi64(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_srli_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrl.qi.128
+  return _mm_maskz_srli_epi64(__U, __A, 5); 
+}
+
+__m256i test_mm256_mask_srli_epi64(__m256i __W, __mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_srli_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrl.qi.256
+  return _mm256_mask_srli_epi64(__W, __U, __A, 5); 
+}
+
+__m256i test_mm256_maskz_srli_epi64(__mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_srli_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrl.qi.256
+  return _mm256_maskz_srli_epi64(__U, __A, 5); 
+}
+
+__m128i test_mm_mask_srav_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
+  // CHECK-LABEL: @test_mm_mask_srav_epi32
+  // CHECK: @llvm.x86.avx512.mask.psrav
+  return _mm_mask_srav_epi32(__W, __U, __X, __Y); 
+}
+
+__m128i test_mm_maskz_srav_epi32(__mmask8 __U, __m128i __X, __m128i __Y) {
+  // CHECK-LABEL: @test_mm_maskz_srav_epi32
+  // CHECK: @llvm.x86.avx512.mask.psrav
+  return _mm_maskz_srav_epi32(__U, __X, __Y); 
+}
+
+__m256i test_mm256_mask_srav_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) {
+  // CHECK-LABEL: @test_mm256_mask_srav_epi32
+  // CHECK: @llvm.x86.avx512.mask.psrav
+  return _mm256_mask_srav_epi32(__W, __U, __X, __Y); 
+}
+
+__m256i test_mm256_maskz_srav_epi32(__mmask8 __U, __m256i __X, __m256i __Y) {
+  // CHECK-LABEL: @test_mm256_maskz_srav_epi32
+  // CHECK: @llvm.x86.avx512.mask.psrav
+  return _mm256_maskz_srav_epi32(__U, __X, __Y); 
+}
+
+__m128i test_mm_srav_epi64(__m128i __X, __m128i __Y) {
+  // CHECK-LABEL: @test_mm_srav_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrav.q.128
+  return _mm_srav_epi64(__X, __Y); 
+}
+
+__m128i test_mm_mask_srav_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
+  // CHECK-LABEL: @test_mm_mask_srav_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrav.q.128
+  return _mm_mask_srav_epi64(__W, __U, __X, __Y); 
+}
+
+__m128i test_mm_maskz_srav_epi64(__mmask8 __U, __m128i __X, __m128i __Y) {
+  // CHECK-LABEL: @test_mm_maskz_srav_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrav.q.128
+  return _mm_maskz_srav_epi64(__U, __X, __Y); 
+}
+
+__m256i test_mm256_srav_epi64(__m256i __X, __m256i __Y) {
+  // CHECK-LABEL: @test_mm256_srav_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrav.q.256
+  return _mm256_srav_epi64(__X, __Y); 
+}
+
+__m256i test_mm256_mask_srav_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) {
+  // CHECK-LABEL: @test_mm256_mask_srav_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrav.q.256
+  return _mm256_mask_srav_epi64(__W, __U, __X, __Y); 
+}
+
+__m256i test_mm256_maskz_srav_epi64(__mmask8 __U, __m256i __X, __m256i __Y) {
+  // CHECK-LABEL: @test_mm256_maskz_srav_epi64
+  // CHECK: @llvm.x86.avx512.mask.psrav.q.256
+  return _mm256_maskz_srav_epi64(__U, __X, __Y); 
+}
+
+void test_mm_mask_store_epi32(void *__P, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_store_epi32
+  // CHECK: @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %{{.*}}, <4 x i32>* %{{.}}, i32 16, <4 x i1> %{{.*}})
+  return _mm_mask_store_epi32(__P, __U, __A); 
+}
+
+void test_mm256_mask_store_epi32(void *__P, __mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_store_epi32
+  // CHECK: @llvm.masked.store.v8i32.p0v8i32(<8 x i32> %{{.*}}, <8 x i32>* %{{.}}, i32 32, <8 x i1> %{{.*}})
+  return _mm256_mask_store_epi32(__P, __U, __A); 
+}
+
+__m128i test_mm_mask_mov_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_mov_epi32
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+  return _mm_mask_mov_epi32(__W, __U, __A); 
+}
+
+__m128i test_mm_maskz_mov_epi32(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_mov_epi32
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+  return _mm_maskz_mov_epi32(__U, __A); 
+}
+
+__m256i test_mm256_mask_mov_epi32(__m256i __W, __mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_mov_epi32
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+  return _mm256_mask_mov_epi32(__W, __U, __A); 
+}
+
+__m256i test_mm256_maskz_mov_epi32(__mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_mov_epi32
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+  return _mm256_maskz_mov_epi32(__U, __A); 
+}
+
+__m128i test_mm_mask_mov_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_mov_epi64
+  // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
+  return _mm_mask_mov_epi64(__W, __U, __A); 
+}
+
+__m128i test_mm_maskz_mov_epi64(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_mov_epi64
+  // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
+  return _mm_maskz_mov_epi64(__U, __A); 
+}
+
+__m256i test_mm256_mask_mov_epi64(__m256i __W, __mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_mov_epi64
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
+  return _mm256_mask_mov_epi64(__W, __U, __A); 
+}
+
+__m256i test_mm256_maskz_mov_epi64(__mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_mov_epi64
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
+  return _mm256_maskz_mov_epi64(__U, __A); 
+}
+
+__m128i test_mm_mask_load_epi32(__m128i __W, __mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm_mask_load_epi32
+  // CHECK: @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %{{.*}}, i32 16, <4 x i1> %{{.*}}, <4 x i32> %{{.*}})
+  return _mm_mask_load_epi32(__W, __U, __P); 
+}
+
+__m128i test_mm_maskz_load_epi32(__mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm_maskz_load_epi32
+  // CHECK: @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %{{.*}}, i32 16, <4 x i1> %{{.*}}, <4 x i32> %{{.*}})
+  return _mm_maskz_load_epi32(__U, __P); 
+}
+
+__m256i test_mm256_mask_load_epi32(__m256i __W, __mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm256_mask_load_epi32
+  // CHECK: @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %{{.*}}, i32 32, <8 x i1> %{{.*}}, <8 x i32> %{{.*}})
+  return _mm256_mask_load_epi32(__W, __U, __P); 
+}
+
+__m256i test_mm256_maskz_load_epi32(__mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm256_maskz_load_epi32
+  // CHECK: @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %{{.*}}, i32 32, <8 x i1> %{{.*}}, <8 x i32> %{{.*}})
+  return _mm256_maskz_load_epi32(__U, __P); 
+}
+
+__m128i test_mm_mask_load_epi64(__m128i __W, __mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm_mask_load_epi64
+  // CHECK: @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %{{.*}}, i32 16, <2 x i1> %{{.*}}, <2 x i64> %{{.*}})
+  return _mm_mask_load_epi64(__W, __U, __P); 
+}
+
+__m128i test_mm_maskz_load_epi64(__mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm_maskz_load_epi64
+  // CHECK: @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %{{.*}}, i32 16, <2 x i1> %{{.*}}, <2 x i64> %{{.*}})
+  return _mm_maskz_load_epi64(__U, __P); 
+}
+
+__m256i test_mm256_mask_load_epi64(__m256i __W, __mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm256_mask_load_epi64
+  // CHECK: @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %{{.*}}, i32 32, <4 x i1> %{{.*}}, <4 x i64> %{{.*}})
+  return _mm256_mask_load_epi64(__W, __U, __P); 
+}
+
+__m256i test_mm256_maskz_load_epi64(__mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm256_maskz_load_epi64
+  // CHECK: @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %{{.*}}, i32 32, <4 x i1> %{{.*}}, <4 x i64> %{{.*}})
+  return _mm256_maskz_load_epi64(__U, __P); 
+}
+
+void test_mm_mask_store_epi64(void *__P, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_store_epi64
+  // CHECK: @llvm.masked.store.v2i64.p0v2i64(<2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, i32 16, <2 x i1> %{{.*}})
+  return _mm_mask_store_epi64(__P, __U, __A); 
+}
+
+void test_mm256_mask_store_epi64(void *__P, __mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_store_epi64
+  // CHECK: @llvm.masked.store.v4i64.p0v4i64(<4 x i64> %{{.*}}, <4 x i64>* %{{.*}}, i32 32, <4 x i1> %{{.*}})
+  return _mm256_mask_store_epi64(__P, __U, __A); 
+}
+
+__m128d test_mm_mask_movedup_pd(__m128d __W, __mmask8 __U, __m128d __A) {
+  // CHECK-LABEL: @test_mm_mask_movedup_pd
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> zeroinitializer
+  // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
+  return _mm_mask_movedup_pd(__W, __U, __A); 
+}
+
+__m128d test_mm_maskz_movedup_pd(__mmask8 __U, __m128d __A) {
+  // CHECK-LABEL: @test_mm_maskz_movedup_pd
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> zeroinitializer
+  // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
+  return _mm_maskz_movedup_pd(__U, __A); 
+}
+
+__m256d test_mm256_mask_movedup_pd(__m256d __W, __mmask8 __U, __m256d __A) {
+  // CHECK-LABEL: @test_mm256_mask_movedup_pd
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
+  return _mm256_mask_movedup_pd(__W, __U, __A); 
+}
+
+__m256d test_mm256_maskz_movedup_pd(__mmask8 __U, __m256d __A) {
+  // CHECK-LABEL: @test_mm256_maskz_movedup_pd
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
+  return _mm256_maskz_movedup_pd(__U, __A); 
+}
+
+__m128i test_mm_mask_set1_epi32(__m128i __O, __mmask8 __M) {
+  // CHECK-LABEL: @test_mm_mask_set1_epi32
+  // CHECK: @llvm.x86.avx512.mask.pbroadcast.d.gpr.128
+  return _mm_mask_set1_epi32(__O, __M, 5); 
+}
+
+__m128i test_mm_maskz_set1_epi32(__mmask8 __M) {
+  // CHECK-LABEL: @test_mm_maskz_set1_epi32
+  // CHECK: @llvm.x86.avx512.mask.pbroadcast.d.gpr.128
+  return _mm_maskz_set1_epi32(__M, 5); 
+}
+
+__m256i test_mm256_mask_set1_epi32(__m256i __O, __mmask8 __M) {
+  // CHECK-LABEL: @test_mm256_mask_set1_epi32
+  // CHECK: @llvm.x86.avx512.mask.pbroadcast.d.gpr.256
+  return _mm256_mask_set1_epi32(__O, __M, 5); 
+}
+
+__m256i test_mm256_maskz_set1_epi32(__mmask8 __M) {
+  // CHECK-LABEL: @test_mm256_maskz_set1_epi32
+  // CHECK: @llvm.x86.avx512.mask.pbroadcast.d.gpr.256
+  return _mm256_maskz_set1_epi32(__M, 5); 
+}
+
+__m128i test_mm_mask_set1_epi64(__m128i __O, __mmask8 __M, long long __A) {
+  // CHECK-LABEL: @test_mm_mask_set1_epi64
+  // CHECK: @llvm.x86.avx512.mask.pbroadcast.q.gpr.128
+  return _mm_mask_set1_epi64(__O, __M, __A); 
+}
+
+__m128i test_mm_maskz_set1_epi64(__mmask8 __M, long long __A) {
+  // CHECK-LABEL: @test_mm_maskz_set1_epi64
+  // CHECK: @llvm.x86.avx512.mask.pbroadcast.q.gpr.128
+  return _mm_maskz_set1_epi64(__M, __A); 
+}
+
+__m256i test_mm256_mask_set1_epi64(__m256i __O, __mmask8 __M, long long __A) {
+  // CHECK-LABEL: @test_mm256_mask_set1_epi64
+  // CHECK: @llvm.x86.avx512.mask.pbroadcast.q.gpr.256
+  return _mm256_mask_set1_epi64(__O, __M, __A); 
+}
+
+__m256i test_mm256_maskz_set1_epi64(__mmask8 __M, long long __A) {
+  // CHECK-LABEL: @test_mm256_maskz_set1_epi64
+  // CHECK: @llvm.x86.avx512.mask.pbroadcast.q.gpr.256
+  return _mm256_maskz_set1_epi64(__M, __A); 
+}
+
+__m128d test_mm_fixupimm_pd(__m128d __A, __m128d __B, __m128i __C) {
+  // CHECK-LABEL: @test_mm_fixupimm_pd
+  // CHECK: @llvm.x86.avx512.mask.fixupimm.pd.128
+  return _mm_fixupimm_pd(__A, __B, __C, 5); 
+}
+
+__m128d test_mm_mask_fixupimm_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128i __C) {
+  // CHECK-LABEL: @test_mm_mask_fixupimm_pd
+  // CHECK: @llvm.x86.avx512.mask.fixupimm.pd.128
+  return _mm_mask_fixupimm_pd(__A, __U, __B, __C, 5); 
+}
+
+__m128d test_mm_maskz_fixupimm_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128i __C) {
+  // CHECK-LABEL: @test_mm_maskz_fixupimm_pd
+  // CHECK: @llvm.x86.avx512.maskz.fixupimm.pd.128
+  return _mm_maskz_fixupimm_pd(__U, __A, __B, __C, 5); 
+}
+
+__m256d test_mm256_fixupimm_pd(__m256d __A, __m256d __B, __m256i __C) {
+  // CHECK-LABEL: @test_mm256_fixupimm_pd
+  // CHECK: @llvm.x86.avx512.mask.fixupimm.pd.256
+  return _mm256_fixupimm_pd(__A, __B, __C, 5); 
+}
+
+__m256d test_mm256_mask_fixupimm_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256i __C) {
+  // CHECK-LABEL: @test_mm256_mask_fixupimm_pd
+  // CHECK: @llvm.x86.avx512.mask.fixupimm.pd.256
+  return _mm256_mask_fixupimm_pd(__A, __U, __B, __C, 5); 
+}
+
+__m256d test_mm256_maskz_fixupimm_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256i __C) {
+  // CHECK-LABEL: @test_mm256_maskz_fixupimm_pd
+  // CHECK: @llvm.x86.avx512.maskz.fixupimm.pd.256
+  return _mm256_maskz_fixupimm_pd(__U, __A, __B, __C, 5); 
+}
+
+__m128 test_mm_fixupimm_ps(__m128 __A, __m128 __B, __m128i __C) {
+  // CHECK-LABEL: @test_mm_fixupimm_ps
+  // CHECK: @llvm.x86.avx512.mask.fixupimm.ps.128
+  return _mm_fixupimm_ps(__A, __B, __C, 5); 
+}
+
+__m128 test_mm_mask_fixupimm_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128i __C) {
+  // CHECK-LABEL: @test_mm_mask_fixupimm_ps
+  // CHECK: @llvm.x86.avx512.mask.fixupimm.ps.128
+  return _mm_mask_fixupimm_ps(__A, __U, __B, __C, 5); 
+}
+
+__m128 test_mm_maskz_fixupimm_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128i __C) {
+  // CHECK-LABEL: @test_mm_maskz_fixupimm_ps
+  // CHECK: @llvm.x86.avx512.maskz.fixupimm.ps.128
+  return _mm_maskz_fixupimm_ps(__U, __A, __B, __C, 5); 
+}
+
+__m256 test_mm256_fixupimm_ps(__m256 __A, __m256 __B, __m256i __C) {
+  // CHECK-LABEL: @test_mm256_fixupimm_ps
+  // CHECK: @llvm.x86.avx512.mask.fixupimm.ps.256
+  return _mm256_fixupimm_ps(__A, __B, __C, 5); 
+}
+
+__m256 test_mm256_mask_fixupimm_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256i __C) {
+  // CHECK-LABEL: @test_mm256_mask_fixupimm_ps
+  // CHECK: @llvm.x86.avx512.mask.fixupimm.ps.256
+  return _mm256_mask_fixupimm_ps(__A, __U, __B, __C, 5); 
+}
+
+__m256 test_mm256_maskz_fixupimm_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256i __C) {
+  // CHECK-LABEL: @test_mm256_maskz_fixupimm_ps
+  // CHECK: @llvm.x86.avx512.maskz.fixupimm.ps.256
+  return _mm256_maskz_fixupimm_ps(__U, __A, __B, __C, 5); 
+}
+
+__m128d test_mm_mask_load_pd(__m128d __W, __mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm_mask_load_pd
+  // CHECK: @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %{{.*}}, i32 16, <2 x i1> %{{.*}}, <2 x double> %{{.*}})
+  return _mm_mask_load_pd(__W, __U, __P); 
+}
+
+__m128d test_mm_maskz_load_pd(__mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm_maskz_load_pd
+  // CHECK: @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %{{.*}}, i32 16, <2 x i1> %{{.*}}, <2 x double> %{{.*}})
+  return _mm_maskz_load_pd(__U, __P); 
+}
+
+__m256d test_mm256_mask_load_pd(__m256d __W, __mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm256_mask_load_pd
+  // CHECK: @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %{{.*}}, i32 32, <4 x i1> %{{.*}}, <4 x double> %{{.*}})
+  return _mm256_mask_load_pd(__W, __U, __P); 
+}
+
+__m256d test_mm256_maskz_load_pd(__mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm256_maskz_load_pd
+  // CHECK: @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %{{.*}}, i32 32, <4 x i1> %{{.*}}, <4 x double> %{{.*}})
+  return _mm256_maskz_load_pd(__U, __P); 
+}
+
+__m128 test_mm_mask_load_ps(__m128 __W, __mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm_mask_load_ps
+  // CHECK: @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %{{.*}}, i32 16, <4 x i1> %{{.*}}, <4 x float> %{{.*}})
+  return _mm_mask_load_ps(__W, __U, __P); 
+}
+
+__m128 test_mm_maskz_load_ps(__mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm_maskz_load_ps
+  // CHECK: @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %{{.*}}, i32 16, <4 x i1> %{{.*}}, <4 x float> %{{.*}})
+  return _mm_maskz_load_ps(__U, __P); 
+}
+
+__m256 test_mm256_mask_load_ps(__m256 __W, __mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm256_mask_load_ps
+  // CHECK: @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %{{.*}}, i32 32, <8 x i1> %{{.*}}, <8 x float> %{{.*}})
+  return _mm256_mask_load_ps(__W, __U, __P); 
+}
+
+__m256 test_mm256_maskz_load_ps(__mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm256_maskz_load_ps
+  // CHECK: @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %{{.*}}, i32 32, <8 x i1> %{{.*}}, <8 x float> %{{.*}})
+  return _mm256_maskz_load_ps(__U, __P); 
+}
+
+__m128i test_mm_mask_loadu_epi64(__m128i __W, __mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm_mask_loadu_epi64
+  // CHECK: @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %{{.*}}, i32 1, <2 x i1> %{{.*}}, <2 x i64> %{{.*}})
+  return _mm_mask_loadu_epi64(__W, __U, __P); 
+}
+
+__m128i test_mm_maskz_loadu_epi64(__mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm_maskz_loadu_epi64
+  // CHECK: @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %{{.*}}, i32 1, <2 x i1> %{{.*}}, <2 x i64> %{{.*}})
+  return _mm_maskz_loadu_epi64(__U, __P); 
+}
+
+__m256i test_mm256_mask_loadu_epi64(__m256i __W, __mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm256_mask_loadu_epi64
+  // CHECK: @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %{{.*}}, i32 1, <4 x i1> %{{.*}}, <4 x i64> %{{.*}})
+  return _mm256_mask_loadu_epi64(__W, __U, __P); 
+}
+
+__m256i test_mm256_maskz_loadu_epi64(__mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm256_maskz_loadu_epi64
+  // CHECK: @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %{{.*}}, i32 1, <4 x i1> %{{.*}}, <4 x i64> %{{.*}})
+  return _mm256_maskz_loadu_epi64(__U, __P); 
+}
+
+__m128i test_mm_mask_loadu_epi32(__m128i __W, __mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm_mask_loadu_epi32
+  // CHECK: @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %{{.*}}, i32 1, <4 x i1> %{{.*}}, <4 x i32> %{{.*}})
+  return _mm_mask_loadu_epi32(__W, __U, __P); 
+}
+
+__m128i test_mm_maskz_loadu_epi32(__mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm_maskz_loadu_epi32
+  // CHECK: @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %{{.*}}, i32 1, <4 x i1> %{{.*}}, <4 x i32> %{{.*}})
+  return _mm_maskz_loadu_epi32(__U, __P); 
+}
+
+__m256i test_mm256_mask_loadu_epi32(__m256i __W, __mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm256_mask_loadu_epi32
+  // CHECK: @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %{{.*}}, i32 1, <8 x i1> %{{.*}}, <8 x i32> %{{.*}})
+  return _mm256_mask_loadu_epi32(__W, __U, __P); 
+}
+
+__m256i test_mm256_maskz_loadu_epi32(__mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm256_maskz_loadu_epi32
+  // CHECK: @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %{{.*}}, i32 1, <8 x i1> %{{.*}}, <8 x i32> %{{.*}})
+  return _mm256_maskz_loadu_epi32(__U, __P); 
+}
+
+__m128d test_mm_mask_loadu_pd(__m128d __W, __mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm_mask_loadu_pd
+  // CHECK: @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %{{.*}}, i32 1, <2 x i1> %{{.*}}, <2 x double> %{{.*}})
+  return _mm_mask_loadu_pd(__W, __U, __P); 
+}
+
+__m128d test_mm_maskz_loadu_pd(__mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm_maskz_loadu_pd
+  // CHECK: @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %{{.*}}, i32 1, <2 x i1> %{{.*}}, <2 x double> %{{.*}})
+  return _mm_maskz_loadu_pd(__U, __P); 
+}
+
+__m256d test_mm256_mask_loadu_pd(__m256d __W, __mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm256_mask_loadu_pd
+  // CHECK: @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %{{.*}}, i32 1, <4 x i1> %{{.*}}, <4 x double> %{{.*}})
+  return _mm256_mask_loadu_pd(__W, __U, __P); 
+}
+
+__m256d test_mm256_maskz_loadu_pd(__mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm256_maskz_loadu_pd
+  // CHECK: @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %{{.*}}, i32 1, <4 x i1> %{{.*}}, <4 x double> %{{.*}})
+  return _mm256_maskz_loadu_pd(__U, __P); 
+}
+
+__m128 test_mm_mask_loadu_ps(__m128 __W, __mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm_mask_loadu_ps
+  // CHECK: @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %{{.*}}, i32 1, <4 x i1> %{{.*}}, <4 x float> %{{.*}})
+  return _mm_mask_loadu_ps(__W, __U, __P); 
+}
+
+__m128 test_mm_maskz_loadu_ps(__mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm_maskz_loadu_ps
+  // CHECK: @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %{{.*}}, i32 1, <4 x i1> %{{.*}}, <4 x float> %{{.*}})
+  return _mm_maskz_loadu_ps(__U, __P); 
+}
+
+__m256 test_mm256_mask_loadu_ps(__m256 __W, __mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm256_mask_loadu_ps
+  // CHECK: @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %{{.*}}, i32 1, <8 x i1> %{{.*}}, <8 x float> %{{.*}})
+  return _mm256_mask_loadu_ps(__W, __U, __P); 
+}
+
+__m256 test_mm256_maskz_loadu_ps(__mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm256_maskz_loadu_ps
+  // CHECK: @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %{{.*}}, i32 1, <8 x i1> %{{.*}}, <8 x float> %{{.*}})
+  return _mm256_maskz_loadu_ps(__U, __P); 
+}
+
+void test_mm_mask_store_pd(void *__P, __mmask8 __U, __m128d __A) {
+  // CHECK-LABEL: @test_mm_mask_store_pd
+  // CHECK: @llvm.masked.store.v2f64.p0v2f64(<2 x double> %{{.*}}, <2 x double>* %{{.*}}, i32 16, <2 x i1> %{{.*}})
+  return _mm_mask_store_pd(__P, __U, __A); 
+}
+
+void test_mm256_mask_store_pd(void *__P, __mmask8 __U, __m256d __A) {
+  // CHECK-LABEL: @test_mm256_mask_store_pd
+  // CHECK: @llvm.masked.store.v4f64.p0v4f64(<4 x double> %{{.*}}, <4 x double>* %{{.*}}, i32 32, <4 x i1> %{{.*}})
+  return _mm256_mask_store_pd(__P, __U, __A); 
+}
+
+void test_mm_mask_store_ps(void *__P, __mmask8 __U, __m128 __A) {
+  // CHECK-LABEL: @test_mm_mask_store_ps
+  // CHECK: @llvm.masked.store.v4f32.p0v4f32(<4 x float> %{{.*}}, <4 x float>* %{{.*}}, i32 16, <4 x i1> %{{.*}})
+  return _mm_mask_store_ps(__P, __U, __A); 
+}
+
+void test_mm256_mask_store_ps(void *__P, __mmask8 __U, __m256 __A) {
+  // CHECK-LABEL: @test_mm256_mask_store_ps
+  // CHECK: @llvm.masked.store.v8f32.p0v8f32(<8 x float> %{{.*}}, <8 x float>* %{{.*}}, i32 32, <8 x i1> %{{.*}})
+  return _mm256_mask_store_ps(__P, __U, __A); 
+}
+
+void test_mm_mask_storeu_epi64(void *__P, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_storeu_epi64
+  // CHECK: @llvm.masked.store.v2i64.p0v2i64(<2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, i32 1, <2 x i1> %{{.*}})
+  return _mm_mask_storeu_epi64(__P, __U, __A); 
+}
+
+void test_mm256_mask_storeu_epi64(void *__P, __mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_storeu_epi64
+  // CHECK: @llvm.masked.store.v4i64.p0v4i64(<4 x i64> %{{.*}}, <4 x i64>* %{{.*}}, i32 1, <4 x i1> %{{.*}})
+  return _mm256_mask_storeu_epi64(__P, __U, __A); 
+}
+
+void test_mm_mask_storeu_epi32(void *__P, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_storeu_epi32
+  // CHECK: @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %{{.*}}, <4 x i32>* %{{.*}}, i32 1, <4 x i1> %{{.*}})
+  return _mm_mask_storeu_epi32(__P, __U, __A); 
+}
+
+void test_mm256_mask_storeu_epi32(void *__P, __mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_storeu_epi32
+  // CHECK: @llvm.masked.store.v8i32.p0v8i32(<8 x i32> %{{.*}}, <8 x i32>* %{{.*}}, i32 1, <8 x i1> %{{.*}})
+  return _mm256_mask_storeu_epi32(__P, __U, __A); 
+}
+
+void test_mm_mask_storeu_pd(void *__P, __mmask8 __U, __m128d __A) {
+  // CHECK-LABEL: @test_mm_mask_storeu_pd
+  // CHECK: @llvm.masked.store.v2f64.p0v2f64(<2 x double> %{{.*}}, <2 x double>* %{{.*}}, i32 1, <2 x i1> %{{.*}})
+  return _mm_mask_storeu_pd(__P, __U, __A); 
+}
+
+void test_mm256_mask_storeu_pd(void *__P, __mmask8 __U, __m256d __A) {
+  // CHECK-LABEL: @test_mm256_mask_storeu_pd
+  // CHECK: @llvm.masked.store.v4f64.p0v4f64(<4 x double> %{{.*}}, <4 x double>* %{{.*}}, i32 1, <4 x i1> %{{.*}})
+  return _mm256_mask_storeu_pd(__P, __U, __A); 
+}
+
+void test_mm_mask_storeu_ps(void *__P, __mmask8 __U, __m128 __A) {
+  // CHECK-LABEL: @test_mm_mask_storeu_ps
+  // CHECK: @llvm.masked.store.v4f32.p0v4f32(<4 x float> %{{.*}}, <4 x float>* %{{.*}}, i32 1, <4 x i1> %{{.*}})
+  return _mm_mask_storeu_ps(__P, __U, __A); 
+}
+
+void test_mm256_mask_storeu_ps(void *__P, __mmask8 __U, __m256 __A) {
+  // CHECK-LABEL: @test_mm256_mask_storeu_ps
+  // CHECK: @llvm.masked.store.v8f32.p0v8f32(<8 x float> %{{.*}}, <8 x float>* %{{.*}}, i32 1, <8 x i1> %{{.*}})
+  return _mm256_mask_storeu_ps(__P, __U, __A); 
+}
+
+__m128d test_mm_mask_unpackhi_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_mask_unpackhi_pd
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 1, i32 3>
+  // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
+  return _mm_mask_unpackhi_pd(__W, __U, __A, __B); 
+}
+
+__m128d test_mm_maskz_unpackhi_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_maskz_unpackhi_pd
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 1, i32 3>
+  // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
+  return _mm_maskz_unpackhi_pd(__U, __A, __B); 
+}
+
+__m256d test_mm256_mask_unpackhi_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+  // CHECK-LABEL: @test_mm256_mask_unpackhi_pd
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  // CHECK: select <4 x i1> %{{.*}} <4 x double> %{{.*}}, <4 x double> %{{.*}}
+  return _mm256_mask_unpackhi_pd(__W, __U, __A, __B); 
+}
+
+__m256d test_mm256_maskz_unpackhi_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+  // CHECK-LABEL: @test_mm256_maskz_unpackhi_pd
+  // CHECK: shufflevector <8 x i1> %{{.*}}, <8 x i1> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  // CHECK: select <4 x i1> %{{.*}} <4 x double> %{{.*}}, <4 x double> %{{.*}}
+  return _mm256_maskz_unpackhi_pd(__U, __A, __B); 
+}
+
+__m128 test_mm_mask_unpackhi_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_mask_unpackhi_ps
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}} <4 x float> %{{.*}}
+  return _mm_mask_unpackhi_ps(__W, __U, __A, __B); 
+}
+
+__m128 test_mm_maskz_unpackhi_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_maskz_unpackhi_ps
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}} <4 x float> %{{.*}}
+  return _mm_maskz_unpackhi_ps(__U, __A, __B); 
+}
+
+__m256 test_mm256_mask_unpackhi_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  // CHECK-LABEL: @test_mm256_mask_unpackhi_ps
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
+  return _mm256_mask_unpackhi_ps(__W, __U, __A, __B); 
+}
+
+__m256 test_mm256_maskz_unpackhi_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+  // CHECK-LABEL: @test_mm256_maskz_unpackhi_ps
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
+  return _mm256_maskz_unpackhi_ps(__U, __A, __B); 
+}
+
+__m128d test_mm_mask_unpacklo_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_mask_unpacklo_pd
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 0, i32 2>
+  // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
+  return _mm_mask_unpacklo_pd(__W, __U, __A, __B); 
+}
+
+__m128d test_mm_maskz_unpacklo_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_maskz_unpacklo_pd
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 0, i32 2>
+  // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
+  return _mm_maskz_unpacklo_pd(__U, __A, __B); 
+}
+
+__m256d test_mm256_mask_unpacklo_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+  // CHECK-LABEL: @test_mm256_mask_unpacklo_pd
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  // CHECK: select <4 x i1> %{{.*}} <4 x double> %{{.*}}, <4 x double> %{{.*}}
+  return _mm256_mask_unpacklo_pd(__W, __U, __A, __B); 
+}
+
+__m256d test_mm256_maskz_unpacklo_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+  // CHECK-LABEL: @test_mm256_maskz_unpacklo_pd
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  // CHECK: select <4 x i1> %{{.*}} <4 x double> %{{.*}}, <4 x double> %{{.*}}
+  return _mm256_maskz_unpacklo_pd(__U, __A, __B); 
+}
+
+__m128 test_mm_mask_unpacklo_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_mask_unpacklo_ps
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  // CHECK: select <4 x i1> %{{.*}} <4 x float> %{{.*}}, <4 x float> %{{.*}}
+  return _mm_mask_unpacklo_ps(__W, __U, __A, __B); 
+}
+
+__m128 test_mm_maskz_unpacklo_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_maskz_unpacklo_ps
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  // CHECK: select <4 x i1> %{{.*}} <4 x float> %{{.*}}, <4 x float> %{{.*}}
+  return _mm_maskz_unpacklo_ps(__U, __A, __B); 
+}
+
+__m256 test_mm256_mask_unpacklo_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  // CHECK-LABEL: @test_mm256_mask_unpacklo_ps
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
+  return _mm256_mask_unpacklo_ps(__W, __U, __A, __B); 
+}
+
+__m256 test_mm256_maskz_unpacklo_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+  // CHECK-LABEL: @test_mm256_maskz_unpacklo_ps
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
+  return _mm256_maskz_unpacklo_ps(__U, __A, __B); 
+}
+
+__m128d test_mm_rcp14_pd(__m128d __A) {
+  // CHECK-LABEL: @test_mm_rcp14_pd
+  // CHECK: @llvm.x86.avx512.rcp14.pd.128
+  return _mm_rcp14_pd(__A); 
+}
+
+__m128d test_mm_mask_rcp14_pd(__m128d __W, __mmask8 __U, __m128d __A) {
+  // CHECK-LABEL: @test_mm_mask_rcp14_pd
+  // CHECK: @llvm.x86.avx512.rcp14.pd.128
+  return _mm_mask_rcp14_pd(__W, __U, __A); 
+}
+
+__m128d test_mm_maskz_rcp14_pd(__mmask8 __U, __m128d __A) {
+  // CHECK-LABEL: @test_mm_maskz_rcp14_pd
+  // CHECK: @llvm.x86.avx512.rcp14.pd.128
+  return _mm_maskz_rcp14_pd(__U, __A); 
+}
+
+__m256d test_mm256_rcp14_pd(__m256d __A) {
+  // CHECK-LABEL: @test_mm256_rcp14_pd
+  // CHECK: @llvm.x86.avx512.rcp14.pd.256
+  return _mm256_rcp14_pd(__A); 
+}
+
+__m256d test_mm256_mask_rcp14_pd(__m256d __W, __mmask8 __U, __m256d __A) {
+  // CHECK-LABEL: @test_mm256_mask_rcp14_pd
+  // CHECK: @llvm.x86.avx512.rcp14.pd.256
+  return _mm256_mask_rcp14_pd(__W, __U, __A); 
+}
+
+__m256d test_mm256_maskz_rcp14_pd(__mmask8 __U, __m256d __A) {
+  // CHECK-LABEL: @test_mm256_maskz_rcp14_pd
+  // CHECK: @llvm.x86.avx512.rcp14.pd.256
+  return _mm256_maskz_rcp14_pd(__U, __A); 
+}
+
+__m128 test_mm_rcp14_ps(__m128 __A) {
+  // CHECK-LABEL: @test_mm_rcp14_ps
+  // CHECK: @llvm.x86.avx512.rcp14.ps.128
+  return _mm_rcp14_ps(__A); 
+}
+
+__m128 test_mm_mask_rcp14_ps(__m128 __W, __mmask8 __U, __m128 __A) {
+  // CHECK-LABEL: @test_mm_mask_rcp14_ps
+  // CHECK: @llvm.x86.avx512.rcp14.ps.128
+  return _mm_mask_rcp14_ps(__W, __U, __A); 
+}
+
+__m128 test_mm_maskz_rcp14_ps(__mmask8 __U, __m128 __A) {
+  // CHECK-LABEL: @test_mm_maskz_rcp14_ps
+  // CHECK: @llvm.x86.avx512.rcp14.ps.128
+  return _mm_maskz_rcp14_ps(__U, __A); 
+}
+
+__m256 test_mm256_rcp14_ps(__m256 __A) {
+  // CHECK-LABEL: @test_mm256_rcp14_ps
+  // CHECK: @llvm.x86.avx512.rcp14.ps.256
+  return _mm256_rcp14_ps(__A); 
+}
+
+__m256 test_mm256_mask_rcp14_ps(__m256 __W, __mmask8 __U, __m256 __A) {
+  // CHECK-LABEL: @test_mm256_mask_rcp14_ps
+  // CHECK: @llvm.x86.avx512.rcp14.ps.256
+  return _mm256_mask_rcp14_ps(__W, __U, __A); 
+}
+
+__m256 test_mm256_maskz_rcp14_ps(__mmask8 __U, __m256 __A) {
+  // CHECK-LABEL: @test_mm256_maskz_rcp14_ps
+  // CHECK: @llvm.x86.avx512.rcp14.ps.256
+  return _mm256_maskz_rcp14_ps(__U, __A); 
+}
+
+__m128d test_mm_mask_permute_pd(__m128d __W, __mmask8 __U, __m128d __X) {
+  // CHECK-LABEL: @test_mm_mask_permute_pd
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+  // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
+  return _mm_mask_permute_pd(__W, __U, __X, 1); 
+}
+
+__m128d test_mm_maskz_permute_pd(__mmask8 __U, __m128d __X) {
+  // CHECK-LABEL: @test_mm_maskz_permute_pd
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+  // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
+  return _mm_maskz_permute_pd(__U, __X, 1); 
+}
+
+__m256d test_mm256_mask_permute_pd(__m256d __W, __mmask8 __U, __m256d __X) {
+  // CHECK-LABEL: @test_mm256_mask_permute_pd
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
+  return _mm256_mask_permute_pd(__W, __U, __X, 5); 
+}
+
+__m256d test_mm256_maskz_permute_pd(__mmask8 __U, __m256d __X) {
+  // CHECK-LABEL: @test_mm256_maskz_permute_pd
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
+  return _mm256_maskz_permute_pd(__U, __X, 5); 
+}
+
+__m128 test_mm_mask_permute_ps(__m128 __W, __mmask8 __U, __m128 __X) {
+  // CHECK-LABEL: @test_mm_mask_permute_ps
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
+  return _mm_mask_permute_ps(__W, __U, __X, 0x1b); 
+}
+
+__m128 test_mm_maskz_permute_ps(__mmask8 __U, __m128 __X) {
+  // CHECK-LABEL: @test_mm_maskz_permute_ps
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
+  return _mm_maskz_permute_ps(__U, __X, 0x1b); 
+}
+
+__m256 test_mm256_mask_permute_ps(__m256 __W, __mmask8 __U, __m256 __X) {
+  // CHECK-LABEL: @test_mm256_mask_permute_ps
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
+  return _mm256_mask_permute_ps(__W, __U, __X, 0x1b); 
+}
+
+__m256 test_mm256_maskz_permute_ps(__mmask8 __U, __m256 __X) {
+  // CHECK-LABEL: @test_mm256_maskz_permute_ps
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
+  return _mm256_maskz_permute_ps(__U, __X, 0x1b); 
+}
+
+__m128d test_mm_mask_permutevar_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128i __C) {
+  // CHECK-LABEL: @test_mm_mask_permutevar_pd
+  // CHECK: @llvm.x86.avx512.mask.vpermilvar.pd
+  return _mm_mask_permutevar_pd(__W, __U, __A, __C); 
+}
+
+__m128d test_mm_maskz_permutevar_pd(__mmask8 __U, __m128d __A, __m128i __C) {
+  // CHECK-LABEL: @test_mm_maskz_permutevar_pd
+  // CHECK: @llvm.x86.avx512.mask.vpermilvar.pd
+  return _mm_maskz_permutevar_pd(__U, __A, __C); 
+}
+
+__m256d test_mm256_mask_permutevar_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256i __C) {
+  // CHECK-LABEL: @test_mm256_mask_permutevar_pd
+  // CHECK: @llvm.x86.avx512.mask.vpermilvar.pd.256
+  return _mm256_mask_permutevar_pd(__W, __U, __A, __C); 
+}
+
+__m256d test_mm256_maskz_permutevar_pd(__mmask8 __U, __m256d __A, __m256i __C) {
+  // CHECK-LABEL: @test_mm256_maskz_permutevar_pd
+  // CHECK: @llvm.x86.avx512.mask.vpermilvar.pd.256
+  return _mm256_maskz_permutevar_pd(__U, __A, __C); 
+}
+
+__m128 test_mm_mask_permutevar_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128i __C) {
+  // CHECK-LABEL: @test_mm_mask_permutevar_ps
+  // CHECK: @llvm.x86.avx512.mask.vpermilvar.ps
+  return _mm_mask_permutevar_ps(__W, __U, __A, __C); 
+}
+
+__m128 test_mm_maskz_permutevar_ps(__mmask8 __U, __m128 __A, __m128i __C) {
+  // CHECK-LABEL: @test_mm_maskz_permutevar_ps
+  // CHECK: @llvm.x86.avx512.mask.vpermilvar.ps
+  return _mm_maskz_permutevar_ps(__U, __A, __C); 
+}
+
+__m256 test_mm256_mask_permutevar_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256i __C) {
+  // CHECK-LABEL: @test_mm256_mask_permutevar_ps
+  // CHECK: @llvm.x86.avx512.mask.vpermilvar.ps.256
+  return _mm256_mask_permutevar_ps(__W, __U, __A, __C); 
+}
+
+__m256 test_mm256_maskz_permutevar_ps(__mmask8 __U, __m256 __A, __m256i __C) {
+  // CHECK-LABEL: @test_mm256_maskz_permutevar_ps
+  // CHECK: @llvm.x86.avx512.mask.vpermilvar.ps.256
+  return _mm256_maskz_permutevar_ps(__U, __A, __C); 
+}
+
+__mmask8 test_mm_test_epi32_mask(__m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_test_epi32_mask
+  // CHECK: @llvm.x86.avx512.ptestm.d.128
+  return _mm_test_epi32_mask(__A, __B); 
+}
+
+__mmask8 test_mm_mask_test_epi32_mask(__mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask_test_epi32_mask
+  // CHECK: @llvm.x86.avx512.ptestm.d.128
+  return _mm_mask_test_epi32_mask(__U, __A, __B); 
+}
+
+__mmask8 test_mm256_test_epi32_mask(__m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_test_epi32_mask
+  // CHECK: @llvm.x86.avx512.ptestm.d.256
+  return _mm256_test_epi32_mask(__A, __B); 
+}
+
+__mmask8 test_mm256_mask_test_epi32_mask(__mmask8 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_mask_test_epi32_mask
+  // CHECK: @llvm.x86.avx512.ptestm.d.256
+  return _mm256_mask_test_epi32_mask(__U, __A, __B); 
+}
+
+__mmask8 test_mm_test_epi64_mask(__m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_test_epi64_mask
+  // CHECK: @llvm.x86.avx512.ptestm.q.128
+  return _mm_test_epi64_mask(__A, __B); 
+}
+
+__mmask8 test_mm_mask_test_epi64_mask(__mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask_test_epi64_mask
+  // CHECK: @llvm.x86.avx512.ptestm.q.128
+  return _mm_mask_test_epi64_mask(__U, __A, __B); 
+}
+
+__mmask8 test_mm256_test_epi64_mask(__m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_test_epi64_mask
+  // CHECK: @llvm.x86.avx512.ptestm.q.256
+  return _mm256_test_epi64_mask(__A, __B); 
+}
+
+__mmask8 test_mm256_mask_test_epi64_mask(__mmask8 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_mask_test_epi64_mask
+  // CHECK: @llvm.x86.avx512.ptestm.q.256
+  return _mm256_mask_test_epi64_mask(__U, __A, __B); 
+}
+
+__mmask8 test_mm_testn_epi32_mask(__m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_testn_epi32_mask
+  // CHECK: @llvm.x86.avx512.ptestnm.d.128
+  return _mm_testn_epi32_mask(__A, __B); 
+}
+
+__mmask8 test_mm_mask_testn_epi32_mask(__mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask_testn_epi32_mask
+  // CHECK: @llvm.x86.avx512.ptestnm.d.128
+  return _mm_mask_testn_epi32_mask(__U, __A, __B); 
+}
+
+__mmask8 test_mm256_testn_epi32_mask(__m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_testn_epi32_mask
+  // CHECK: @llvm.x86.avx512.ptestnm.d.256
+  return _mm256_testn_epi32_mask(__A, __B); 
+}
+
+__mmask8 test_mm256_mask_testn_epi32_mask(__mmask8 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_mask_testn_epi32_mask
+  // CHECK: @llvm.x86.avx512.ptestnm.d.256
+  return _mm256_mask_testn_epi32_mask(__U, __A, __B); 
+}
+
+__mmask8 test_mm_testn_epi64_mask(__m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_testn_epi64_mask
+  // CHECK: @llvm.x86.avx512.ptestnm.q.128
+  return _mm_testn_epi64_mask(__A, __B); 
+}
+
+__mmask8 test_mm_mask_testn_epi64_mask(__mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask_testn_epi64_mask
+  // CHECK: @llvm.x86.avx512.ptestnm.q.128
+  return _mm_mask_testn_epi64_mask(__U, __A, __B); 
+}
+
+__mmask8 test_mm256_testn_epi64_mask(__m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_testn_epi64_mask
+  // CHECK: @llvm.x86.avx512.ptestnm.q.256
+  return _mm256_testn_epi64_mask(__A, __B); 
+}
+
+__mmask8 test_mm256_mask_testn_epi64_mask(__mmask8 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_mask_testn_epi64_mask
+  // CHECK: @llvm.x86.avx512.ptestnm.q.256
+  return _mm256_mask_testn_epi64_mask(__U, __A, __B); 
+}
+__m128i test_mm_mask_unpackhi_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask_unpackhi_epi32
+  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+  return _mm_mask_unpackhi_epi32(__W, __U, __A, __B); 
+}
+
+__m128i test_mm_maskz_unpackhi_epi32(__mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_maskz_unpackhi_epi32
+  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+  return _mm_maskz_unpackhi_epi32(__U, __A, __B); 
+}
+
+__m256i test_mm256_mask_unpackhi_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_mask_unpackhi_epi32
+  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+  return _mm256_mask_unpackhi_epi32(__W, __U, __A, __B); 
+}
+
+__m256i test_mm256_maskz_unpackhi_epi32(__mmask8 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_maskz_unpackhi_epi32
+  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+  return _mm256_maskz_unpackhi_epi32(__U, __A, __B); 
+}
+
+__m128i test_mm_mask_unpackhi_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask_unpackhi_epi64
+  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i32> <i32 1, i32 3>
+  // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
+  return _mm_mask_unpackhi_epi64(__W, __U, __A, __B); 
+}
+
+__m128i test_mm_maskz_unpackhi_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_maskz_unpackhi_epi64
+  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i32> <i32 1, i32 3>
+  // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
+  return _mm_maskz_unpackhi_epi64(__U, __A, __B); 
+}
+
+__m256i test_mm256_mask_unpackhi_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_mask_unpackhi_epi64
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
+  return _mm256_mask_unpackhi_epi64(__W, __U, __A, __B); 
+}
+
+__m256i test_mm256_maskz_unpackhi_epi64(__mmask8 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_maskz_unpackhi_epi64
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
+  return _mm256_maskz_unpackhi_epi64(__U, __A, __B); 
+}
+
+__m128i test_mm_mask_unpacklo_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask_unpacklo_epi32
+  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+  return _mm_mask_unpacklo_epi32(__W, __U, __A, __B); 
+}
+
+__m128i test_mm_maskz_unpacklo_epi32(__mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_maskz_unpacklo_epi32
+  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+  return _mm_maskz_unpacklo_epi32(__U, __A, __B); 
+}
+
+__m256i test_mm256_mask_unpacklo_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_mask_unpacklo_epi32
+  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+  return _mm256_mask_unpacklo_epi32(__W, __U, __A, __B); 
+}
+
+__m256i test_mm256_maskz_unpacklo_epi32(__mmask8 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_maskz_unpacklo_epi32
+  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+  return _mm256_maskz_unpacklo_epi32(__U, __A, __B); 
+}
+
+__m128i test_mm_mask_unpacklo_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask_unpacklo_epi64
+  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i32> <i32 0, i32 2>
+  // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
+  return _mm_mask_unpacklo_epi64(__W, __U, __A, __B); 
+}
+
+__m128i test_mm_maskz_unpacklo_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_maskz_unpacklo_epi64
+  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i32> <i32 0, i32 2>
+  // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
+  return _mm_maskz_unpacklo_epi64(__U, __A, __B); 
+}
+
+__m256i test_mm256_mask_unpacklo_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_mask_unpacklo_epi64
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
+  return _mm256_mask_unpacklo_epi64(__W, __U, __A, __B); 
+}
+
+__m256i test_mm256_maskz_unpacklo_epi64(__mmask8 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_maskz_unpacklo_epi64
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
+  return _mm256_maskz_unpacklo_epi64(__U, __A, __B); 
+}
+
+__m128i test_mm_mask_sra_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask_sra_epi32
+  // CHECK: @llvm.x86.avx512.mask.psra.d.128
+  return _mm_mask_sra_epi32(__W, __U, __A, __B); 
+}
+
+__m128i test_mm_maskz_sra_epi32(__mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_maskz_sra_epi32
+  // CHECK: @llvm.x86.avx512.mask.psra.d.128
+  return _mm_maskz_sra_epi32(__U, __A, __B); 
+}
+
+__m256i test_mm256_mask_sra_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm256_mask_sra_epi32
+  // CHECK: @llvm.x86.avx512.mask.psra.d.256
+  return _mm256_mask_sra_epi32(__W, __U, __A, __B); 
+}
+
+__m256i test_mm256_maskz_sra_epi32(__mmask8 __U, __m256i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm256_maskz_sra_epi32
+  // CHECK: @llvm.x86.avx512.mask.psra.d.256
+  return _mm256_maskz_sra_epi32(__U, __A, __B); 
+}
+
+__m128i test_mm_mask_srai_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_srai_epi32
+  // CHECK: @llvm.x86.avx512.mask.psra.di.128
+  return _mm_mask_srai_epi32(__W, __U, __A, 5); 
+}
+
+__m128i test_mm_maskz_srai_epi32(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_srai_epi32
+  // CHECK: @llvm.x86.avx512.mask.psra.di.128
+  return _mm_maskz_srai_epi32(__U, __A, 5); 
+}
+
+__m256i test_mm256_mask_srai_epi32(__m256i __W, __mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_srai_epi32
+  // CHECK: @llvm.x86.avx512.mask.psra.di.256
+  return _mm256_mask_srai_epi32(__W, __U, __A, 5); 
+}
+
+__m256i test_mm256_maskz_srai_epi32(__mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_srai_epi32
+  // CHECK: @llvm.x86.avx512.mask.psra.di.256
+  return _mm256_maskz_srai_epi32(__U, __A, 5); 
+}
+
+__m128i test_mm_sra_epi64(__m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_sra_epi64
+  // CHECK: @llvm.x86.avx512.mask.psra.q.128
+  return _mm_sra_epi64(__A, __B); 
+}
+
+__m128i test_mm_mask_sra_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask_sra_epi64
+  // CHECK: @llvm.x86.avx512.mask.psra.q.128
+  return _mm_mask_sra_epi64(__W, __U, __A, __B); 
+}
+
+__m128i test_mm_maskz_sra_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_maskz_sra_epi64
+  // CHECK: @llvm.x86.avx512.mask.psra.q.128
+  return _mm_maskz_sra_epi64(__U, __A, __B); 
+}
+
+__m256i test_mm256_sra_epi64(__m256i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm256_sra_epi64
+  // CHECK: @llvm.x86.avx512.mask.psra.q.256
+  return _mm256_sra_epi64(__A, __B); 
+}
+
+__m256i test_mm256_mask_sra_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm256_mask_sra_epi64
+  // CHECK: @llvm.x86.avx512.mask.psra.q.256
+  return _mm256_mask_sra_epi64(__W, __U, __A, __B); 
+}
+
+__m256i test_mm256_maskz_sra_epi64(__mmask8 __U, __m256i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm256_maskz_sra_epi64
+  // CHECK: @llvm.x86.avx512.mask.psra.q.256
+  return _mm256_maskz_sra_epi64(__U, __A, __B); 
+}
+
+__m128i test_mm_srai_epi64(__m128i __A) {
+  // CHECK-LABEL: @test_mm_srai_epi64
+  // CHECK: @llvm.x86.avx512.mask.psra.qi.128
+  return _mm_srai_epi64(__A, 5); 
+}
+
+__m128i test_mm_mask_srai_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_srai_epi64
+  // CHECK: @llvm.x86.avx512.mask.psra.qi.128
+  return _mm_mask_srai_epi64(__W, __U, __A, 5); 
+}
+
+__m128i test_mm_maskz_srai_epi64(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_srai_epi64
+  // CHECK: @llvm.x86.avx512.mask.psra.qi.128
+  return _mm_maskz_srai_epi64(__U, __A, 5); 
+}
+
+__m256i test_mm256_srai_epi64(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_srai_epi64
+  // CHECK: @llvm.x86.avx512.mask.psra.qi.256
+  return _mm256_srai_epi64(__A, 5); 
+}
+
+__m256i test_mm256_mask_srai_epi64(__m256i __W, __mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_srai_epi64
+  // CHECK: @llvm.x86.avx512.mask.psra.qi.256
+  return _mm256_mask_srai_epi64(__W, __U, __A, 5); 
+}
+
+__m256i test_mm256_maskz_srai_epi64(__mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_srai_epi64
+  // CHECK: @llvm.x86.avx512.mask.psra.qi.256
+  return _mm256_maskz_srai_epi64(__U, __A, 5); 
+}
+
+__m128i test_mm_ternarylogic_epi32(__m128i __A, __m128i __B, __m128i __C) {
+  // CHECK-LABEL: @test_mm_ternarylogic_epi32
+  // CHECK: @llvm.x86.avx512.mask.pternlog.d.128
+  return _mm_ternarylogic_epi32(__A, __B, __C, 4); 
+}
+
+__m128i test_mm_mask_ternarylogic_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
+  // CHECK-LABEL: @test_mm_mask_ternarylogic_epi32
+  // CHECK: @llvm.x86.avx512.mask.pternlog.d.128
+  return _mm_mask_ternarylogic_epi32(__A, __U, __B, __C, 4); 
+}
+
+__m128i test_mm_maskz_ternarylogic_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) {
+  // CHECK-LABEL: @test_mm_maskz_ternarylogic_epi32
+  // CHECK: @llvm.x86.avx512.maskz.pternlog.d.128
+  return _mm_maskz_ternarylogic_epi32(__U, __A, __B, __C, 4); 
+}
+
+__m256i test_mm256_ternarylogic_epi32(__m256i __A, __m256i __B, __m256i __C) {
+  // CHECK-LABEL: @test_mm256_ternarylogic_epi32
+  // CHECK: @llvm.x86.avx512.mask.pternlog.d.256
+  return _mm256_ternarylogic_epi32(__A, __B, __C, 4); 
+}
+
+__m256i test_mm256_mask_ternarylogic_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
+  // CHECK-LABEL: @test_mm256_mask_ternarylogic_epi32
+  // CHECK: @llvm.x86.avx512.mask.pternlog.d.256
+  return _mm256_mask_ternarylogic_epi32(__A, __U, __B, __C, 4); 
+}
+
+__m256i test_mm256_maskz_ternarylogic_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) {
+  // CHECK-LABEL: @test_mm256_maskz_ternarylogic_epi32
+  // CHECK: @llvm.x86.avx512.maskz.pternlog.d.256
+  return _mm256_maskz_ternarylogic_epi32(__U, __A, __B, __C, 4); 
+}
+
+__m128i test_mm_ternarylogic_epi64(__m128i __A, __m128i __B, __m128i __C) {
+  // CHECK-LABEL: @test_mm_ternarylogic_epi64
+  // CHECK: @llvm.x86.avx512.mask.pternlog.q.128
+  return _mm_ternarylogic_epi64(__A, __B, __C, 4); 
+}
+
+__m128i test_mm_mask_ternarylogic_epi64(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
+  // CHECK-LABEL: @test_mm_mask_ternarylogic_epi64
+  // CHECK: @llvm.x86.avx512.mask.pternlog.q.128
+  return _mm_mask_ternarylogic_epi64(__A, __U, __B, __C, 4); 
+}
+
+__m128i test_mm_maskz_ternarylogic_epi64(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) {
+  // CHECK-LABEL: @test_mm_maskz_ternarylogic_epi64
+  // CHECK: @llvm.x86.avx512.maskz.pternlog.q.128
+  return _mm_maskz_ternarylogic_epi64(__U, __A, __B, __C, 4); 
+}
+
+__m256i test_mm256_ternarylogic_epi64(__m256i __A, __m256i __B, __m256i __C) {
+  // CHECK-LABEL: @test_mm256_ternarylogic_epi64
+  // CHECK: @llvm.x86.avx512.mask.pternlog.q.256
+  return _mm256_ternarylogic_epi64(__A, __B, __C, 4); 
+}
+
+__m256i test_mm256_mask_ternarylogic_epi64(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
+  // CHECK-LABEL: @test_mm256_mask_ternarylogic_epi64
+  // CHECK: @llvm.x86.avx512.mask.pternlog.q.256
+  return _mm256_mask_ternarylogic_epi64(__A, __U, __B, __C, 4); 
+}
+
+__m256i test_mm256_maskz_ternarylogic_epi64(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) {
+  // CHECK-LABEL: @test_mm256_maskz_ternarylogic_epi64
+  // CHECK: @llvm.x86.avx512.maskz.pternlog.q.256
+  return _mm256_maskz_ternarylogic_epi64(__U, __A, __B, __C, 4); 
+}
+__m256 test_mm256_shuffle_f32x4(__m256 __A, __m256 __B) {
+  // CHECK-LABEL: @test_mm256_shuffle_f32x4
+  // CHECK: @llvm.x86.avx512.mask.shuf.f32x4
+  return _mm256_shuffle_f32x4(__A, __B, 3); 
+}
+
+__m256 test_mm256_mask_shuffle_f32x4(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  // CHECK-LABEL: @test_mm256_mask_shuffle_f32x4
+  // CHECK: @llvm.x86.avx512.mask.shuf.f32x4
+  return _mm256_mask_shuffle_f32x4(__W, __U, __A, __B, 3); 
+}
+
+__m256 test_mm256_maskz_shuffle_f32x4(__mmask8 __U, __m256 __A, __m256 __B) {
+  // CHECK-LABEL: @test_mm256_maskz_shuffle_f32x4
+  // CHECK: @llvm.x86.avx512.mask.shuf.f32x4
+  return _mm256_maskz_shuffle_f32x4(__U, __A, __B, 3); 
+}
+
+__m256d test_mm256_shuffle_f64x2(__m256d __A, __m256d __B) {
+  // CHECK-LABEL: @test_mm256_shuffle_f64x2
+  // CHECK: @llvm.x86.avx512.mask.shuf.f64x2
+  return _mm256_shuffle_f64x2(__A, __B, 3); 
+}
+
+__m256d test_mm256_mask_shuffle_f64x2(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+  // CHECK-LABEL: @test_mm256_mask_shuffle_f64x2
+  // CHECK: @llvm.x86.avx512.mask.shuf.f64x2
+  return _mm256_mask_shuffle_f64x2(__W, __U, __A, __B, 3); 
+}
+
+__m256d test_mm256_maskz_shuffle_f64x2(__mmask8 __U, __m256d __A, __m256d __B) {
+  // CHECK-LABEL: @test_mm256_maskz_shuffle_f64x2
+  // CHECK: @llvm.x86.avx512.mask.shuf.f64x2
+  return _mm256_maskz_shuffle_f64x2(__U, __A, __B, 3); 
+}
+
+__m256i test_mm256_shuffle_i32x4(__m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_shuffle_i32x4
+  // CHECK: @llvm.x86.avx512.mask.shuf.i32x4
+  return _mm256_shuffle_i32x4(__A, __B, 3); 
+}
+
+__m256i test_mm256_mask_shuffle_i32x4(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_mask_shuffle_i32x4
+  // CHECK: @llvm.x86.avx512.mask.shuf.i32x4
+  return _mm256_mask_shuffle_i32x4(__W, __U, __A, __B, 3); 
+}
+
+__m256i test_mm256_maskz_shuffle_i32x4(__mmask8 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_maskz_shuffle_i32x4
+  // CHECK: @llvm.x86.avx512.mask.shuf.i32x4
+  return _mm256_maskz_shuffle_i32x4(__U, __A, __B, 3); 
+}
+
+__m256i test_mm256_shuffle_i64x2(__m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_shuffle_i64x2
+  // CHECK: @llvm.x86.avx512.mask.shuf.i64x2
+  return _mm256_shuffle_i64x2(__A, __B, 3); 
+}
+
+__m256i test_mm256_mask_shuffle_i64x2(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_mask_shuffle_i64x2
+  // CHECK: @llvm.x86.avx512.mask.shuf.i64x2
+  return _mm256_mask_shuffle_i64x2(__W, __U, __A, __B, 3); 
+}
+
+__m256i test_mm256_maskz_shuffle_i64x2(__mmask8 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_maskz_shuffle_i64x2
+  // CHECK: @llvm.x86.avx512.mask.shuf.i64x2
+  return _mm256_maskz_shuffle_i64x2(__U, __A, __B, 3); 
+}
+
+__m128d test_mm_mask_shuffle_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_mask_shuffle_pd
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 1, i32 3>
+  // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
+  return _mm_mask_shuffle_pd(__W, __U, __A, __B, 3); 
+}
+
+__m128d test_mm_maskz_shuffle_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm_maskz_shuffle_pd
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 1, i32 3>
+  // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
+  return _mm_maskz_shuffle_pd(__U, __A, __B, 3); 
+}
+
+__m256d test_mm256_mask_shuffle_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+  // CHECK-LABEL: @test_mm256_mask_shuffle_pd
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 1, i32 5, i32 2, i32 6>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
+  return _mm256_mask_shuffle_pd(__W, __U, __A, __B, 3); 
+}
+
+__m256d test_mm256_maskz_shuffle_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+  // CHECK-LABEL: @test_mm256_maskz_shuffle_pd
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 1, i32 5, i32 2, i32 6>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
+  return _mm256_maskz_shuffle_pd(__U, __A, __B, 3); 
+}
+
+__m128 test_mm_mask_shuffle_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_mask_shuffle_ps
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 4, i32 4>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
+  return _mm_mask_shuffle_ps(__W, __U, __A, __B, 4); 
+}
+
+__m128 test_mm_maskz_shuffle_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm_maskz_shuffle_ps
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 4, i32 4>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
+  return _mm_maskz_shuffle_ps(__U, __A, __B, 4); 
+}
+
+__m256 test_mm256_mask_shuffle_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+  // CHECK-LABEL: @test_mm256_mask_shuffle_ps
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 8, i32 8, i32 4, i32 5, i32 12, i32 12>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
+  return _mm256_mask_shuffle_ps(__W, __U, __A, __B, 4); 
+}
+
+__m256 test_mm256_maskz_shuffle_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+  // CHECK-LABEL: @test_mm256_maskz_shuffle_ps
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 8, i32 8, i32 4, i32 5, i32 12, i32 12>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
+  return _mm256_maskz_shuffle_ps(__U, __A, __B, 4); 
+}
+
+__m128d test_mm_rsqrt14_pd(__m128d __A) {
+  // CHECK-LABEL: @test_mm_rsqrt14_pd
+  // CHECK: @llvm.x86.avx512.rsqrt14.pd.128
+  return _mm_rsqrt14_pd(__A); 
+}
+
+__m128d test_mm_mask_rsqrt14_pd(__m128d __W, __mmask8 __U, __m128d __A) {
+  // CHECK-LABEL: @test_mm_mask_rsqrt14_pd
+  // CHECK: @llvm.x86.avx512.rsqrt14.pd.128
+  return _mm_mask_rsqrt14_pd(__W, __U, __A); 
+}
+
+__m128d test_mm_maskz_rsqrt14_pd(__mmask8 __U, __m128d __A) {
+  // CHECK-LABEL: @test_mm_maskz_rsqrt14_pd
+  // CHECK: @llvm.x86.avx512.rsqrt14.pd.128
+  return _mm_maskz_rsqrt14_pd(__U, __A); 
+}
+
+__m256d test_mm256_rsqrt14_pd(__m256d __A) {
+  // CHECK-LABEL: @test_mm256_rsqrt14_pd
+  // CHECK: @llvm.x86.avx512.rsqrt14.pd.256
+  return _mm256_rsqrt14_pd(__A); 
+}
+
+__m256d test_mm256_mask_rsqrt14_pd(__m256d __W, __mmask8 __U, __m256d __A) {
+  // CHECK-LABEL: @test_mm256_mask_rsqrt14_pd
+  // CHECK: @llvm.x86.avx512.rsqrt14.pd.256
+  return _mm256_mask_rsqrt14_pd(__W, __U, __A); 
+}
+
+__m256d test_mm256_maskz_rsqrt14_pd(__mmask8 __U, __m256d __A) {
+  // CHECK-LABEL: @test_mm256_maskz_rsqrt14_pd
+  // CHECK: @llvm.x86.avx512.rsqrt14.pd.256
+  return _mm256_maskz_rsqrt14_pd(__U, __A); 
+}
+
+__m128 test_mm_rsqrt14_ps(__m128 __A) {
+  // CHECK-LABEL: @test_mm_rsqrt14_ps
+  // CHECK: @llvm.x86.avx512.rsqrt14.ps.128
+  return _mm_rsqrt14_ps(__A); 
+}
+
+__m128 test_mm_mask_rsqrt14_ps(__m128 __W, __mmask8 __U, __m128 __A) {
+  // CHECK-LABEL: @test_mm_mask_rsqrt14_ps
+  // CHECK: @llvm.x86.avx512.rsqrt14.ps.128
+  return _mm_mask_rsqrt14_ps(__W, __U, __A); 
+}
+
+__m128 test_mm_maskz_rsqrt14_ps(__mmask8 __U, __m128 __A) {
+  // CHECK-LABEL: @test_mm_maskz_rsqrt14_ps
+  // CHECK: @llvm.x86.avx512.rsqrt14.ps.128
+  return _mm_maskz_rsqrt14_ps(__U, __A); 
+}
+
+__m256 test_mm256_rsqrt14_ps(__m256 __A) {
+  // CHECK-LABEL: @test_mm256_rsqrt14_ps
+  // CHECK: @llvm.x86.avx512.rsqrt14.ps.256
+  return _mm256_rsqrt14_ps(__A); 
+}
+
+__m256 test_mm256_mask_rsqrt14_ps(__m256 __W, __mmask8 __U, __m256 __A) {
+  // CHECK-LABEL: @test_mm256_mask_rsqrt14_ps
+  // CHECK: @llvm.x86.avx512.rsqrt14.ps.256
+  return _mm256_mask_rsqrt14_ps(__W, __U, __A); 
+}
+
+__m256 test_mm256_maskz_rsqrt14_ps(__mmask8 __U, __m256 __A) {
+  // CHECK-LABEL: @test_mm256_maskz_rsqrt14_ps
+  // CHECK: @llvm.x86.avx512.rsqrt14.ps.256
+  return _mm256_maskz_rsqrt14_ps(__U, __A); 
+}
+
+__m256 test_mm256_broadcast_f32x4(__m128 __A) {
+  // CHECK-LABEL: @test_mm256_broadcast_f32x4
+  // CHECK: @llvm.x86.avx512.mask.broadcastf32x4
+  return _mm256_broadcast_f32x4(__A); 
+}
+
+__m256 test_mm256_mask_broadcast_f32x4(__m256 __O, __mmask8 __M, __m128 __A) {
+  // CHECK-LABEL: @test_mm256_mask_broadcast_f32x4
+  // CHECK: @llvm.x86.avx512.mask.broadcastf32x4
+  return _mm256_mask_broadcast_f32x4(__O, __M, __A); 
+}
+
+__m256 test_mm256_maskz_broadcast_f32x4(__mmask8 __M, __m128 __A) {
+  // CHECK-LABEL: @test_mm256_maskz_broadcast_f32x4
+  // CHECK: @llvm.x86.avx512.mask.broadcastf32x4
+  return _mm256_maskz_broadcast_f32x4(__M, __A); 
+}
+
+__m256i test_mm256_broadcast_i32x4(__m128i __A) {
+  // CHECK-LABEL: @test_mm256_broadcast_i32x4
+  // CHECK: @llvm.x86.avx512.mask.broadcasti32x4
+  return _mm256_broadcast_i32x4(__A); 
+}
+
+__m256i test_mm256_mask_broadcast_i32x4(__m256i __O, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_mask_broadcast_i32x4
+  // CHECK: @llvm.x86.avx512.mask.broadcasti32x4
+  return _mm256_mask_broadcast_i32x4(__O, __M, __A); 
+}
+
+__m256i test_mm256_maskz_broadcast_i32x4(__mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_broadcast_i32x4
+  // CHECK: @llvm.x86.avx512.mask.broadcasti32x4
+  return _mm256_maskz_broadcast_i32x4(__M, __A); 
+}
+
+__m256d test_mm256_mask_broadcastsd_pd(__m256d __O, __mmask8 __M, __m128d __A) {
+  // CHECK-LABEL: @test_mm256_mask_broadcastsd_pd
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <4 x i32> zeroinitializer
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
+  return _mm256_mask_broadcastsd_pd(__O, __M, __A);
+}
+
+__m256d test_mm256_maskz_broadcastsd_pd(__mmask8 __M, __m128d __A) {
+  // CHECK-LABEL: @test_mm256_maskz_broadcastsd_pd
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <4 x i32> zeroinitializer
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
+  return _mm256_maskz_broadcastsd_pd(__M, __A);
+}
+
+__m128 test_mm_mask_broadcastss_ps(__m128 __O, __mmask8 __M, __m128 __A) {
+  // CHECK-LABEL: @test_mm_mask_broadcastss_ps
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> zeroinitializer
+  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
+  return _mm_mask_broadcastss_ps(__O, __M, __A);
+}
+
+__m128 test_mm_maskz_broadcastss_ps(__mmask8 __M, __m128 __A) {
+  // CHECK-LABEL: @test_mm_maskz_broadcastss_ps
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> zeroinitializer
+  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
+  return _mm_maskz_broadcastss_ps(__M, __A);
+}
+
+__m256 test_mm256_mask_broadcastss_ps(__m256 __O, __mmask8 __M, __m128 __A) {
+  // CHECK-LABEL: @test_mm256_mask_broadcastss_ps
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> zeroinitializer
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
+  return _mm256_mask_broadcastss_ps(__O, __M, __A);
+}
+
+__m256 test_mm256_maskz_broadcastss_ps(__mmask8 __M, __m128 __A) {
+  // CHECK-LABEL: @test_mm256_maskz_broadcastss_ps
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> zeroinitializer
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
+  return _mm256_maskz_broadcastss_ps(__M, __A);
+}
+
+__m128i test_mm_mask_broadcastd_epi32(__m128i __O, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_broadcastd_epi32
+  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> zeroinitializer
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+  return _mm_mask_broadcastd_epi32(__O, __M, __A);
+}
+
+__m128i test_mm_maskz_broadcastd_epi32(__mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_broadcastd_epi32
+  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> zeroinitializer
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+  return _mm_maskz_broadcastd_epi32(__M, __A);
+}
+
+__m256i test_mm256_mask_broadcastd_epi32(__m256i __O, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_mask_broadcastd_epi32
+  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <8 x i32> zeroinitializer
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+  return _mm256_mask_broadcastd_epi32(__O, __M, __A);
+}
+
+__m256i test_mm256_maskz_broadcastd_epi32(__mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_broadcastd_epi32
+  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <8 x i32> zeroinitializer
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+  return _mm256_maskz_broadcastd_epi32(__M, __A);
+}
+
+__m128i test_mm_mask_broadcastq_epi64(__m128i __O, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_broadcastq_epi64
+  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i32> zeroinitializer
+  // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
+  return _mm_mask_broadcastq_epi64(__O, __M, __A);
+}
+
+__m128i test_mm_maskz_broadcastq_epi64(__mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_broadcastq_epi64
+  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i32> zeroinitializer
+  // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
+  return _mm_maskz_broadcastq_epi64(__M, __A);
+}
+
+__m256i test_mm256_mask_broadcastq_epi64(__m256i __O, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_mask_broadcastq_epi64
+  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> zeroinitializer
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
+  return _mm256_mask_broadcastq_epi64(__O, __M, __A);
+}
+
+__m256i test_mm256_maskz_broadcastq_epi64(__mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_broadcastq_epi64
+  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> zeroinitializer
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
+  return _mm256_maskz_broadcastq_epi64(__M, __A);
+}
+
+__m128i test_mm_cvtsepi32_epi8(__m128i __A) {
+  // CHECK-LABEL: @test_mm_cvtsepi32_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovs.db.128
+  return _mm_cvtsepi32_epi8(__A); 
+}
+
+__m128i test_mm_mask_cvtsepi32_epi8(__m128i __O, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtsepi32_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovs.db.128
+  return _mm_mask_cvtsepi32_epi8(__O, __M, __A); 
+}
+
+__m128i test_mm_maskz_cvtsepi32_epi8(__mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtsepi32_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovs.db.128
+  return _mm_maskz_cvtsepi32_epi8(__M, __A); 
+}
+
+void test_mm_mask_cvtsepi32_storeu_epi8(void * __P, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtsepi32_storeu_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovs.db.mem.128
+  return _mm_mask_cvtsepi32_storeu_epi8(__P, __M, __A); 
+}
+
+__m128i test_mm256_cvtsepi32_epi8(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_cvtsepi32_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovs.db.256
+  return _mm256_cvtsepi32_epi8(__A); 
+}
+
+__m128i test_mm256_mask_cvtsepi32_epi8(__m128i __O, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtsepi32_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovs.db.256
+  return _mm256_mask_cvtsepi32_epi8(__O, __M, __A); 
+}
+
+__m128i test_mm256_maskz_cvtsepi32_epi8(__mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtsepi32_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovs.db.256
+  return _mm256_maskz_cvtsepi32_epi8(__M, __A); 
+}
+
+void test_mm256_mask_cvtsepi32_storeu_epi8(void * __P, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtsepi32_storeu_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovs.db.mem.256
+  return _mm256_mask_cvtsepi32_storeu_epi8(__P, __M, __A); 
+}
+
+__m128i test_mm_cvtsepi32_epi16(__m128i __A) {
+  // CHECK-LABEL: @test_mm_cvtsepi32_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovs.dw.128
+  return _mm_cvtsepi32_epi16(__A); 
+}
+
+__m128i test_mm_mask_cvtsepi32_epi16(__m128i __O, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtsepi32_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovs.dw.128
+  return _mm_mask_cvtsepi32_epi16(__O, __M, __A); 
+}
+
+__m128i test_mm_maskz_cvtsepi32_epi16(__mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtsepi32_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovs.dw.128
+  return _mm_maskz_cvtsepi32_epi16(__M, __A); 
+}
+
+void test_mm_mask_cvtsepi32_storeu_epi16(void * __P, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtsepi32_storeu_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovs.dw.mem.128
+  return _mm_mask_cvtsepi32_storeu_epi16(__P, __M, __A); 
+}
+
+__m128i test_mm256_cvtsepi32_epi16(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_cvtsepi32_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovs.dw.256
+  return _mm256_cvtsepi32_epi16(__A); 
+}
+
+__m128i test_mm256_mask_cvtsepi32_epi16(__m128i __O, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtsepi32_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovs.dw.256
+  return _mm256_mask_cvtsepi32_epi16(__O, __M, __A); 
+}
+
+__m128i test_mm256_maskz_cvtsepi32_epi16(__mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtsepi32_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovs.dw.256
+  return _mm256_maskz_cvtsepi32_epi16(__M, __A); 
+}
+
+void test_mm256_mask_cvtsepi32_storeu_epi16(void * __P, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtsepi32_storeu_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovs.dw.mem.256
+  return _mm256_mask_cvtsepi32_storeu_epi16(__P, __M, __A); 
+}
+
+__m128i test_mm_cvtsepi64_epi8(__m128i __A) {
+  // CHECK-LABEL: @test_mm_cvtsepi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qb.128
+  return _mm_cvtsepi64_epi8(__A); 
+}
+
+__m128i test_mm_mask_cvtsepi64_epi8(__m128i __O, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtsepi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qb.128
+  return _mm_mask_cvtsepi64_epi8(__O, __M, __A); 
+}
+
+__m128i test_mm_maskz_cvtsepi64_epi8(__mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtsepi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qb.128
+  return _mm_maskz_cvtsepi64_epi8(__M, __A); 
+}
+
+void test_mm_mask_cvtsepi64_storeu_epi8(void * __P, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtsepi64_storeu_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qb.mem.128
+  return _mm_mask_cvtsepi64_storeu_epi8(__P, __M, __A); 
+}
+
+__m128i test_mm256_cvtsepi64_epi8(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_cvtsepi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qb.256
+  return _mm256_cvtsepi64_epi8(__A); 
+}
+
+__m128i test_mm256_mask_cvtsepi64_epi8(__m128i __O, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtsepi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qb.256
+  return _mm256_mask_cvtsepi64_epi8(__O, __M, __A); 
+}
+
+__m128i test_mm256_maskz_cvtsepi64_epi8(__mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtsepi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qb.256
+  return _mm256_maskz_cvtsepi64_epi8(__M, __A); 
+}
+
+void test_mm256_mask_cvtsepi64_storeu_epi8(void * __P, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtsepi64_storeu_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qb.mem.256
+  return _mm256_mask_cvtsepi64_storeu_epi8(__P, __M, __A); 
+}
+
+__m128i test_mm_cvtsepi64_epi32(__m128i __A) {
+  // CHECK-LABEL: @test_mm_cvtsepi64_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qd.128
+  return _mm_cvtsepi64_epi32(__A); 
+}
+
+__m128i test_mm_mask_cvtsepi64_epi32(__m128i __O, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtsepi64_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qd.128
+  return _mm_mask_cvtsepi64_epi32(__O, __M, __A); 
+}
+
+__m128i test_mm_maskz_cvtsepi64_epi32(__mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtsepi64_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qd.128
+  return _mm_maskz_cvtsepi64_epi32(__M, __A); 
+}
+
+void test_mm_mask_cvtsepi64_storeu_epi32(void * __P, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtsepi64_storeu_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qd.mem.128
+  return _mm_mask_cvtsepi64_storeu_epi32(__P, __M, __A); 
+}
+
+__m128i test_mm256_cvtsepi64_epi32(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_cvtsepi64_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qd.256
+  return _mm256_cvtsepi64_epi32(__A); 
+}
+
+__m128i test_mm256_mask_cvtsepi64_epi32(__m128i __O, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtsepi64_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qd.256
+  return _mm256_mask_cvtsepi64_epi32(__O, __M, __A); 
+}
+
+__m128i test_mm256_maskz_cvtsepi64_epi32(__mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtsepi64_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qd.256
+  return _mm256_maskz_cvtsepi64_epi32(__M, __A); 
+}
+
+void test_mm256_mask_cvtsepi64_storeu_epi32(void * __P, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtsepi64_storeu_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qd.mem.256
+  return _mm256_mask_cvtsepi64_storeu_epi32(__P, __M, __A); 
+}
+
+__m128i test_mm_cvtsepi64_epi16(__m128i __A) {
+  // CHECK-LABEL: @test_mm_cvtsepi64_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qw.128
+  return _mm_cvtsepi64_epi16(__A); 
+}
+
+__m128i test_mm_mask_cvtsepi64_epi16(__m128i __O, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtsepi64_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qw.128
+  return _mm_mask_cvtsepi64_epi16(__O, __M, __A); 
+}
+
+__m128i test_mm_maskz_cvtsepi64_epi16(__mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtsepi64_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qw.128
+  return _mm_maskz_cvtsepi64_epi16(__M, __A); 
+}
+
+void test_mm_mask_cvtsepi64_storeu_epi16(void * __P, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtsepi64_storeu_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qw.mem.128
+  return _mm_mask_cvtsepi64_storeu_epi16(__P, __M, __A); 
+}
+
+__m128i test_mm256_cvtsepi64_epi16(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_cvtsepi64_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qw.256
+  return _mm256_cvtsepi64_epi16(__A); 
+}
+
+__m128i test_mm256_mask_cvtsepi64_epi16(__m128i __O, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtsepi64_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qw.256
+  return _mm256_mask_cvtsepi64_epi16(__O, __M, __A); 
+}
+
+__m128i test_mm256_maskz_cvtsepi64_epi16(__mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtsepi64_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qw.256
+  return _mm256_maskz_cvtsepi64_epi16(__M, __A); 
+}
+
+void test_mm256_mask_cvtsepi64_storeu_epi16(void * __P, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtsepi64_storeu_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovs.qw.mem.256
+  return _mm256_mask_cvtsepi64_storeu_epi16(__P, __M, __A); 
+}
+
+__m128i test_mm_cvtusepi32_epi8(__m128i __A) {
+  // CHECK-LABEL: @test_mm_cvtusepi32_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovus.db.128
+  return _mm_cvtusepi32_epi8(__A); 
+}
+
+__m128i test_mm_mask_cvtusepi32_epi8(__m128i __O, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtusepi32_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovus.db.128
+  return _mm_mask_cvtusepi32_epi8(__O, __M, __A); 
+}
+
+__m128i test_mm_maskz_cvtusepi32_epi8(__mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtusepi32_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovus.db.128
+  return _mm_maskz_cvtusepi32_epi8(__M, __A); 
+}
+
+void test_mm_mask_cvtusepi32_storeu_epi8(void * __P, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtusepi32_storeu_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovus.db.mem.128
+  return _mm_mask_cvtusepi32_storeu_epi8(__P, __M, __A); 
+}
+
+__m128i test_mm256_cvtusepi32_epi8(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_cvtusepi32_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovus.db.256
+  return _mm256_cvtusepi32_epi8(__A); 
+}
+
+__m128i test_mm256_mask_cvtusepi32_epi8(__m128i __O, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtusepi32_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovus.db.256
+  return _mm256_mask_cvtusepi32_epi8(__O, __M, __A); 
+}
+
+__m128i test_mm256_maskz_cvtusepi32_epi8(__mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtusepi32_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovus.db.256
+  return _mm256_maskz_cvtusepi32_epi8(__M, __A); 
+}
+
+void test_mm256_mask_cvtusepi32_storeu_epi8(void * __P, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtusepi32_storeu_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovus.db.mem.256
+  return _mm256_mask_cvtusepi32_storeu_epi8(__P, __M, __A); 
+}
+
+__m128i test_mm_cvtusepi32_epi16(__m128i __A) {
+  // CHECK-LABEL: @test_mm_cvtusepi32_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovus.dw.128
+  return _mm_cvtusepi32_epi16(__A); 
+}
+
+__m128i test_mm_mask_cvtusepi32_epi16(__m128i __O, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtusepi32_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovus.dw.128
+  return _mm_mask_cvtusepi32_epi16(__O, __M, __A); 
+}
+
+__m128i test_mm_maskz_cvtusepi32_epi16(__mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtusepi32_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovus.dw.128
+  return _mm_maskz_cvtusepi32_epi16(__M, __A); 
+}
+
+void test_mm_mask_cvtusepi32_storeu_epi16(void * __P, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtusepi32_storeu_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovus.dw.mem.128
+  return _mm_mask_cvtusepi32_storeu_epi16(__P, __M, __A); 
+}
+
+__m128i test_mm256_cvtusepi32_epi16(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_cvtusepi32_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovus.dw.256
+  return _mm256_cvtusepi32_epi16(__A); 
+}
+
+__m128i test_mm256_mask_cvtusepi32_epi16(__m128i __O, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtusepi32_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovus.dw.256
+  return _mm256_mask_cvtusepi32_epi16(__O, __M, __A); 
+}
+
+__m128i test_mm256_maskz_cvtusepi32_epi16(__mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtusepi32_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovus.dw.256
+  return _mm256_maskz_cvtusepi32_epi16(__M, __A); 
+}
+
+void test_mm256_mask_cvtusepi32_storeu_epi16(void * __P, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtusepi32_storeu_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovus.dw.mem.256
+  return _mm256_mask_cvtusepi32_storeu_epi16(__P, __M, __A); 
+}
+
+__m128i test_mm_cvtusepi64_epi8(__m128i __A) {
+  // CHECK-LABEL: @test_mm_cvtusepi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qb.128
+  return _mm_cvtusepi64_epi8(__A); 
+}
+
+__m128i test_mm_mask_cvtusepi64_epi8(__m128i __O, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtusepi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qb.128
+  return _mm_mask_cvtusepi64_epi8(__O, __M, __A); 
+}
+
+__m128i test_mm_maskz_cvtusepi64_epi8(__mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtusepi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qb.128
+  return _mm_maskz_cvtusepi64_epi8(__M, __A); 
+}
+
+void test_mm_mask_cvtusepi64_storeu_epi8(void * __P, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtusepi64_storeu_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qb.mem.128
+  return _mm_mask_cvtusepi64_storeu_epi8(__P, __M, __A); 
+}
+
+__m128i test_mm256_cvtusepi64_epi8(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_cvtusepi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qb.256
+  return _mm256_cvtusepi64_epi8(__A); 
+}
+
+__m128i test_mm256_mask_cvtusepi64_epi8(__m128i __O, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtusepi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qb.256
+  return _mm256_mask_cvtusepi64_epi8(__O, __M, __A); 
+}
+
+__m128i test_mm256_maskz_cvtusepi64_epi8(__mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtusepi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qb.256
+  return _mm256_maskz_cvtusepi64_epi8(__M, __A); 
+}
+
+void test_mm256_mask_cvtusepi64_storeu_epi8(void * __P, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtusepi64_storeu_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qb.mem.256
+  return _mm256_mask_cvtusepi64_storeu_epi8(__P, __M, __A); 
+}
+
+__m128i test_mm_cvtusepi64_epi32(__m128i __A) {
+  // CHECK-LABEL: @test_mm_cvtusepi64_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qd.128
+  return _mm_cvtusepi64_epi32(__A); 
+}
+
+__m128i test_mm_mask_cvtusepi64_epi32(__m128i __O, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtusepi64_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qd.128
+  return _mm_mask_cvtusepi64_epi32(__O, __M, __A); 
+}
+
+__m128i test_mm_maskz_cvtusepi64_epi32(__mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtusepi64_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qd.128
+  return _mm_maskz_cvtusepi64_epi32(__M, __A); 
+}
+
+void test_mm_mask_cvtusepi64_storeu_epi32(void * __P, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtusepi64_storeu_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qd.mem.128
+  return _mm_mask_cvtusepi64_storeu_epi32(__P, __M, __A); 
+}
+
+__m128i test_mm256_cvtusepi64_epi32(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_cvtusepi64_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qd.256
+  return _mm256_cvtusepi64_epi32(__A); 
+}
+
+__m128i test_mm256_mask_cvtusepi64_epi32(__m128i __O, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtusepi64_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qd.256
+  return _mm256_mask_cvtusepi64_epi32(__O, __M, __A); 
+}
+
+__m128i test_mm256_maskz_cvtusepi64_epi32(__mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtusepi64_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qd.256
+  return _mm256_maskz_cvtusepi64_epi32(__M, __A); 
+}
+
+void test_mm256_mask_cvtusepi64_storeu_epi32(void * __P, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtusepi64_storeu_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qd.mem.256
+  return _mm256_mask_cvtusepi64_storeu_epi32(__P, __M, __A); 
+}
+
+__m128i test_mm_cvtusepi64_epi16(__m128i __A) {
+  // CHECK-LABEL: @test_mm_cvtusepi64_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qw.128
+  return _mm_cvtusepi64_epi16(__A); 
+}
+
+__m128i test_mm_mask_cvtusepi64_epi16(__m128i __O, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtusepi64_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qw.128
+  return _mm_mask_cvtusepi64_epi16(__O, __M, __A); 
+}
+
+__m128i test_mm_maskz_cvtusepi64_epi16(__mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtusepi64_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qw.128
+  return _mm_maskz_cvtusepi64_epi16(__M, __A); 
+}
+
+void test_mm_mask_cvtusepi64_storeu_epi16(void * __P, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtusepi64_storeu_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qw.mem.128
+  return _mm_mask_cvtusepi64_storeu_epi16(__P, __M, __A); 
+}
+
+__m128i test_mm256_cvtusepi64_epi16(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_cvtusepi64_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qw.256
+  return _mm256_cvtusepi64_epi16(__A); 
+}
+
+__m128i test_mm256_mask_cvtusepi64_epi16(__m128i __O, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtusepi64_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qw.256
+  return _mm256_mask_cvtusepi64_epi16(__O, __M, __A); 
+}
+
+__m128i test_mm256_maskz_cvtusepi64_epi16(__mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtusepi64_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qw.256
+  return _mm256_maskz_cvtusepi64_epi16(__M, __A); 
+}
+
+void test_mm256_mask_cvtusepi64_storeu_epi16(void * __P, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtusepi64_storeu_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovus.qw.mem.256
+  return _mm256_mask_cvtusepi64_storeu_epi16(__P, __M, __A); 
+}
+
+__m128i test_mm_cvtepi32_epi8(__m128i __A) {
+  // CHECK-LABEL: @test_mm_cvtepi32_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmov.db.128
+  return _mm_cvtepi32_epi8(__A); 
+}
+
+__m128i test_mm_mask_cvtepi32_epi8(__m128i __O, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtepi32_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmov.db.128
+  return _mm_mask_cvtepi32_epi8(__O, __M, __A); 
+}
+
+__m128i test_mm_maskz_cvtepi32_epi8(__mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtepi32_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmov.db.128
+  return _mm_maskz_cvtepi32_epi8(__M, __A); 
+}
+
+void test_mm_mask_cvtepi32_storeu_epi8(void * __P, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtepi32_storeu_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmov.db.mem.128
+  return _mm_mask_cvtepi32_storeu_epi8(__P, __M, __A); 
+}
+
+__m128i test_mm256_cvtepi32_epi8(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_cvtepi32_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmov.db.256
+  return _mm256_cvtepi32_epi8(__A); 
+}
+
+__m128i test_mm256_mask_cvtepi32_epi8(__m128i __O, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtepi32_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmov.db.256
+  return _mm256_mask_cvtepi32_epi8(__O, __M, __A); 
+}
+
+__m128i test_mm256_maskz_cvtepi32_epi8(__mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtepi32_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmov.db.256
+  return _mm256_maskz_cvtepi32_epi8(__M, __A); 
+}
+
+void test_mm256_mask_cvtepi32_storeu_epi8(void * __P, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtepi32_storeu_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmov.db.mem.256
+  return _mm256_mask_cvtepi32_storeu_epi8(__P, __M, __A); 
+}
+
+__m128i test_mm_cvtepi32_epi16(__m128i __A) {
+  // CHECK-LABEL: @test_mm_cvtepi32_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmov.dw.128
+  return _mm_cvtepi32_epi16(__A); 
+}
+
+__m128i test_mm_mask_cvtepi32_epi16(__m128i __O, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtepi32_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmov.dw.128
+  return _mm_mask_cvtepi32_epi16(__O, __M, __A); 
+}
+
+__m128i test_mm_maskz_cvtepi32_epi16(__mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtepi32_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmov.dw.128
+  return _mm_maskz_cvtepi32_epi16(__M, __A); 
+}
+
+void test_mm_mask_cvtepi32_storeu_epi16(void * __P, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtepi32_storeu_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmov.dw.mem.128
+  return _mm_mask_cvtepi32_storeu_epi16(__P, __M, __A); 
+}
+
+__m128i test_mm256_cvtepi32_epi16(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_cvtepi32_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmov.dw.256
+  return _mm256_cvtepi32_epi16(__A); 
+}
+
+__m128i test_mm256_mask_cvtepi32_epi16(__m128i __O, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtepi32_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmov.dw.256
+  return _mm256_mask_cvtepi32_epi16(__O, __M, __A); 
+}
+
+__m128i test_mm256_maskz_cvtepi32_epi16(__mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtepi32_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmov.dw.256
+  return _mm256_maskz_cvtepi32_epi16(__M, __A); 
+}
+
+void test_mm256_mask_cvtepi32_storeu_epi16(void *  __P, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtepi32_storeu_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmov.dw.mem.256
+  return _mm256_mask_cvtepi32_storeu_epi16(__P, __M, __A); 
+}
+
+__m128i test_mm_cvtepi64_epi8(__m128i __A) {
+  // CHECK-LABEL: @test_mm_cvtepi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmov.qb.128
+  return _mm_cvtepi64_epi8(__A); 
+}
+
+__m128i test_mm_mask_cvtepi64_epi8(__m128i __O, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtepi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmov.qb.128
+  return _mm_mask_cvtepi64_epi8(__O, __M, __A); 
+}
+
+__m128i test_mm_maskz_cvtepi64_epi8(__mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtepi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmov.qb.128
+  return _mm_maskz_cvtepi64_epi8(__M, __A); 
+}
+
+void test_mm_mask_cvtepi64_storeu_epi8(void * __P, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtepi64_storeu_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmov.qb.mem.128
+  return _mm_mask_cvtepi64_storeu_epi8(__P, __M, __A); 
+}
+
+__m128i test_mm256_cvtepi64_epi8(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_cvtepi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmov.qb.256
+  return _mm256_cvtepi64_epi8(__A); 
+}
+
+__m128i test_mm256_mask_cvtepi64_epi8(__m128i __O, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtepi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmov.qb.256
+  return _mm256_mask_cvtepi64_epi8(__O, __M, __A); 
+}
+
+__m128i test_mm256_maskz_cvtepi64_epi8(__mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtepi64_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmov.qb.256
+  return _mm256_maskz_cvtepi64_epi8(__M, __A); 
+}
+
+void test_mm256_mask_cvtepi64_storeu_epi8(void * __P, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtepi64_storeu_epi8
+  // CHECK: @llvm.x86.avx512.mask.pmov.qb.mem.256
+  return _mm256_mask_cvtepi64_storeu_epi8(__P, __M, __A); 
+}
+
+__m128i test_mm_cvtepi64_epi32(__m128i __A) {
+  // CHECK-LABEL: @test_mm_cvtepi64_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmov.qd.128
+  return _mm_cvtepi64_epi32(__A); 
+}
+
+__m128i test_mm_mask_cvtepi64_epi32(__m128i __O, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtepi64_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmov.qd.128
+  return _mm_mask_cvtepi64_epi32(__O, __M, __A); 
+}
+
+__m128i test_mm_maskz_cvtepi64_epi32(__mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtepi64_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmov.qd.128
+  return _mm_maskz_cvtepi64_epi32(__M, __A); 
+}
+
+void test_mm_mask_cvtepi64_storeu_epi32(void * __P, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtepi64_storeu_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmov.qd.mem.128
+  return _mm_mask_cvtepi64_storeu_epi32(__P, __M, __A); 
+}
+
+__m128i test_mm256_cvtepi64_epi32(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_cvtepi64_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmov.qd.256
+  return _mm256_cvtepi64_epi32(__A); 
+}
+
+__m128i test_mm256_mask_cvtepi64_epi32(__m128i __O, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtepi64_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmov.qd.256
+  return _mm256_mask_cvtepi64_epi32(__O, __M, __A); 
+}
+
+__m128i test_mm256_maskz_cvtepi64_epi32(__mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtepi64_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmov.qd.256
+  return _mm256_maskz_cvtepi64_epi32(__M, __A); 
+}
+
+void test_mm256_mask_cvtepi64_storeu_epi32(void * __P, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtepi64_storeu_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmov.qd.mem.256
+  return _mm256_mask_cvtepi64_storeu_epi32(__P, __M, __A); 
+}
+
+__m128i test_mm_cvtepi64_epi16(__m128i __A) {
+  // CHECK-LABEL: @test_mm_cvtepi64_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmov.qw.128
+  return _mm_cvtepi64_epi16(__A); 
+}
+
+__m128i test_mm_mask_cvtepi64_epi16(__m128i __O, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtepi64_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmov.qw.128
+  return _mm_mask_cvtepi64_epi16(__O, __M, __A); 
+}
+
+__m128i test_mm_maskz_cvtepi64_epi16(__mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtepi64_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmov.qw.128
+  return _mm_maskz_cvtepi64_epi16(__M, __A); 
+}
+
+void test_mm_mask_cvtepi64_storeu_epi16(void * __P, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtepi64_storeu_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmov.qw.mem.128
+  return _mm_mask_cvtepi64_storeu_epi16(__P, __M, __A); 
+}
+
+__m128i test_mm256_cvtepi64_epi16(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_cvtepi64_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmov.qw.256
+  return _mm256_cvtepi64_epi16(__A); 
+}
+
+__m128i test_mm256_mask_cvtepi64_epi16(__m128i __O, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtepi64_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmov.qw.256
+  return _mm256_mask_cvtepi64_epi16(__O, __M, __A); 
+}
+
+__m128i test_mm256_maskz_cvtepi64_epi16(__mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtepi64_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmov.qw.256
+  return _mm256_maskz_cvtepi64_epi16(__M, __A); 
+}
+
+void test_mm256_mask_cvtepi64_storeu_epi16(void * __P, __mmask8 __M, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtepi64_storeu_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmov.qw.mem.256
+  return _mm256_mask_cvtepi64_storeu_epi16(__P, __M, __A); 
+}
+
+__m128 test_mm256_extractf32x4_ps(__m256 __A) {
+  // CHECK-LABEL: @test_mm256_extractf32x4_ps
+  // CHECK: @llvm.x86.avx512.mask.vextractf32x4
+  return _mm256_extractf32x4_ps(__A, 1); 
+}
+
+__m128 test_mm256_mask_extractf32x4_ps(__m128 __W, __mmask8 __U, __m256 __A) {
+  // CHECK-LABEL: @test_mm256_mask_extractf32x4_ps
+  // CHECK: @llvm.x86.avx512.mask.vextractf32x4
+  return _mm256_mask_extractf32x4_ps(__W, __U, __A, 1); 
+}
+
+__m128 test_mm256_maskz_extractf32x4_ps(__mmask8 __U, __m256 __A) {
+  // CHECK-LABEL: @test_mm256_maskz_extractf32x4_ps
+  // CHECK: @llvm.x86.avx512.mask.vextractf32x4
+  return _mm256_maskz_extractf32x4_ps(__U, __A, 1); 
+}
+
+__m128i test_mm256_extracti32x4_epi32(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_extracti32x4_epi32
+  // CHECK: @llvm.x86.avx512.mask.vextracti32x4
+  return _mm256_extracti32x4_epi32(__A, 1); 
+}
+
+__m128i test_mm256_mask_extracti32x4_epi32(__m128i __W, __mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_extracti32x4_epi32
+  // CHECK: @llvm.x86.avx512.mask.vextracti32x4
+  return _mm256_mask_extracti32x4_epi32(__W, __U, __A, 1); 
+}
+
+__m128i test_mm256_maskz_extracti32x4_epi32(__mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_extracti32x4_epi32
+  // CHECK: @llvm.x86.avx512.mask.vextracti32x4
+  return _mm256_maskz_extracti32x4_epi32(__U, __A, 1); 
+}
+
+__m256 test_mm256_insertf32x4(__m256 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm256_insertf32x4
+  // CHECK: @llvm.x86.avx512.mask.insertf32x4
+  return _mm256_insertf32x4(__A, __B, 1); 
+}
+
+__m256 test_mm256_mask_insertf32x4(__m256 __W, __mmask8 __U, __m256 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm256_mask_insertf32x4
+  // CHECK: @llvm.x86.avx512.mask.insertf32x4
+  return _mm256_mask_insertf32x4(__W, __U, __A, __B, 1); 
+}
+
+__m256 test_mm256_maskz_insertf32x4(__mmask8 __U, __m256 __A, __m128 __B) {
+  // CHECK-LABEL: @test_mm256_maskz_insertf32x4
+  // CHECK: @llvm.x86.avx512.mask.insertf32x4
+  return _mm256_maskz_insertf32x4(__U, __A, __B, 1); 
+}
+
+__m256i test_mm256_inserti32x4(__m256i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm256_inserti32x4
+  // CHECK: @llvm.x86.avx512.mask.inserti32x4
+  return _mm256_inserti32x4(__A, __B, 1); 
+}
+
+__m256i test_mm256_mask_inserti32x4(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm256_mask_inserti32x4
+  // CHECK: @llvm.x86.avx512.mask.inserti32x4
+  return _mm256_mask_inserti32x4(__W, __U, __A, __B, 1); 
+}
+
+__m256i test_mm256_maskz_inserti32x4(__mmask8 __U, __m256i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm256_maskz_inserti32x4
+  // CHECK: @llvm.x86.avx512.mask.inserti32x4
+  return _mm256_maskz_inserti32x4(__U, __A, __B, 1); 
+}
+
+__m128d test_mm_getmant_pd(__m128d __A) {
+  // CHECK-LABEL: @test_mm_getmant_pd
+  // CHECK: @llvm.x86.avx512.mask.getmant.pd.128
+  return _mm_getmant_pd(__A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); 
+}
+
+__m128d test_mm_mask_getmant_pd(__m128d __W, __mmask8 __U, __m128d __A) {
+  // CHECK-LABEL: @test_mm_mask_getmant_pd
+  // CHECK: @llvm.x86.avx512.mask.getmant.pd.128
+  return _mm_mask_getmant_pd(__W, __U, __A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); 
+}
+
+__m128d test_mm_maskz_getmant_pd(__mmask8 __U, __m128d __A) {
+  // CHECK-LABEL: @test_mm_maskz_getmant_pd
+  // CHECK: @llvm.x86.avx512.mask.getmant.pd.128
+  return _mm_maskz_getmant_pd(__U, __A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); 
+}
+
+__m256d test_mm256_getmant_pd(__m256d __A) {
+  // CHECK-LABEL: @test_mm256_getmant_pd
+  // CHECK: @llvm.x86.avx512.mask.getmant.pd.256
+  return _mm256_getmant_pd(__A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); 
+}
+
+__m256d test_mm256_mask_getmant_pd(__m256d __W, __mmask8 __U, __m256d __A) {
+  // CHECK-LABEL: @test_mm256_mask_getmant_pd
+  // CHECK: @llvm.x86.avx512.mask.getmant.pd.256
+  return _mm256_mask_getmant_pd(__W, __U, __A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); 
+}
+
+__m256d test_mm256_maskz_getmant_pd(__mmask8 __U, __m256d __A) {
+  // CHECK-LABEL: @test_mm256_maskz_getmant_pd
+  // CHECK: @llvm.x86.avx512.mask.getmant.pd.256
+  return _mm256_maskz_getmant_pd(__U, __A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); 
+}
+
+__m128 test_mm_getmant_ps(__m128 __A) {
+  // CHECK-LABEL: @test_mm_getmant_ps
+  // CHECK: @llvm.x86.avx512.mask.getmant.ps.128
+  return _mm_getmant_ps(__A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); 
+}
+
+__m128 test_mm_mask_getmant_ps(__m128 __W, __mmask8 __U, __m128 __A) {
+  // CHECK-LABEL: @test_mm_mask_getmant_ps
+  // CHECK: @llvm.x86.avx512.mask.getmant.ps.128
+  return _mm_mask_getmant_ps(__W, __U, __A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); 
+}
+
+__m128 test_mm_maskz_getmant_ps(__mmask8 __U, __m128 __A) {
+  // CHECK-LABEL: @test_mm_maskz_getmant_ps
+  // CHECK: @llvm.x86.avx512.mask.getmant.ps.128
+  return _mm_maskz_getmant_ps(__U, __A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); 
+}
+
+__m256 test_mm256_getmant_ps(__m256 __A) {
+  // CHECK-LABEL: @test_mm256_getmant_ps
+  // CHECK: @llvm.x86.avx512.mask.getmant.ps.256
+  return _mm256_getmant_ps(__A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); 
+}
+
+__m256 test_mm256_mask_getmant_ps(__m256 __W, __mmask8 __U, __m256 __A) {
+  // CHECK-LABEL: @test_mm256_mask_getmant_ps
+  // CHECK: @llvm.x86.avx512.mask.getmant.ps.256
+  return _mm256_mask_getmant_ps(__W, __U, __A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); 
+}
+
+__m256 test_mm256_maskz_getmant_ps(__mmask8 __U, __m256 __A) {
+  // CHECK-LABEL: @test_mm256_maskz_getmant_ps
+  // CHECK: @llvm.x86.avx512.mask.getmant.ps.256
+  return _mm256_maskz_getmant_ps(__U, __A,_MM_MANT_NORM_p5_2, _MM_MANT_SIGN_nan); 
+}
+
+__m128d test_mm_mmask_i64gather_pd(__m128d __v1_old, __mmask8 __mask, __m128i __index, void const *__addr) {
+  // CHECK-LABEL: @test_mm_mmask_i64gather_pd
+  // CHECK: @llvm.x86.avx512.gather3div2.df
+  return _mm_mmask_i64gather_pd(__v1_old, __mask, __index, __addr, 2); 
+}
+
+__m128i test_mm_mmask_i64gather_epi64(__m128i __v1_old, __mmask8 __mask, __m128i __index, void const *__addr) {
+  // CHECK-LABEL: @test_mm_mmask_i64gather_epi64
+  // CHECK: @llvm.x86.avx512.gather3div2.di
+  return _mm_mmask_i64gather_epi64(__v1_old, __mask, __index, __addr, 2); 
+}
+
+__m256d test_mm256_mmask_i64gather_pd(__m256d __v1_old, __mmask8 __mask, __m256i __index, void const *__addr) {
+  // CHECK-LABEL: @test_mm256_mmask_i64gather_pd
+  // CHECK: @llvm.x86.avx512.gather3div4.df
+  return _mm256_mmask_i64gather_pd(__v1_old, __mask, __index, __addr, 2); 
+}
+
+__m256i test_mm256_mmask_i64gather_epi64(__m256i __v1_old, __mmask8 __mask, __m256i __index, void const *__addr) {
+  // CHECK-LABEL: @test_mm256_mmask_i64gather_epi64
+  // CHECK: @llvm.x86.avx512.gather3div4.di
+  return _mm256_mmask_i64gather_epi64(__v1_old, __mask, __index, __addr, 2); 
+}
+
+__m128 test_mm_mmask_i64gather_ps(__m128 __v1_old, __mmask8 __mask, __m128i __index, void const *__addr) {
+  // CHECK-LABEL: @test_mm_mmask_i64gather_ps
+  // CHECK: @llvm.x86.avx512.gather3div4.sf
+  return _mm_mmask_i64gather_ps(__v1_old, __mask, __index, __addr, 2); 
+}
+
+__m128i test_mm_mmask_i64gather_epi32(__m128i __v1_old, __mmask8 __mask, __m128i __index, void const *__addr) {
+  // CHECK-LABEL: @test_mm_mmask_i64gather_epi32
+  // CHECK: @llvm.x86.avx512.gather3div4.si
+  return _mm_mmask_i64gather_epi32(__v1_old, __mask, __index, __addr, 2); 
+}
+
+__m128 test_mm256_mmask_i64gather_ps(__m128 __v1_old, __mmask8 __mask, __m256i __index, void const *__addr) {
+  // CHECK-LABEL: @test_mm256_mmask_i64gather_ps
+  // CHECK: @llvm.x86.avx512.gather3div8.sf
+  return _mm256_mmask_i64gather_ps(__v1_old, __mask, __index, __addr, 2); 
+}
+
+__m128i test_mm256_mmask_i64gather_epi32(__m128i __v1_old, __mmask8 __mask, __m256i __index, void const *__addr) {
+  // CHECK-LABEL: @test_mm256_mmask_i64gather_epi32
+  // CHECK: @llvm.x86.avx512.gather3div8.si
+  return _mm256_mmask_i64gather_epi32(__v1_old, __mask, __index, __addr, 2); 
+}
+
+__m128d test_mm_mask_i32gather_pd(__m128d __v1_old, __mmask8 __mask, __m128i __index, void const *__addr) {
+  // CHECK-LABEL: @test_mm_mask_i32gather_pd
+  // CHECK: @llvm.x86.avx512.gather3siv2.df
+  return _mm_mmask_i32gather_pd(__v1_old, __mask, __index, __addr, 2); 
+}
+
+__m128i test_mm_mask_i32gather_epi64(__m128i __v1_old, __mmask8 __mask, __m128i __index, void const *__addr) {
+  // CHECK-LABEL: @test_mm_mask_i32gather_epi64
+  // CHECK: @llvm.x86.avx512.gather3siv2.di
+  return _mm_mmask_i32gather_epi64(__v1_old, __mask, __index, __addr, 2); 
+}
+
+__m256d test_mm256_mask_i32gather_pd(__m256d __v1_old, __mmask8 __mask, __m128i __index, void const *__addr) {
+  // CHECK-LABEL: @test_mm256_mask_i32gather_pd
+  // CHECK: @llvm.x86.avx512.gather3siv4.df
+  return _mm256_mmask_i32gather_pd(__v1_old, __mask, __index, __addr, 2); 
+}
+
+__m256i test_mm256_mask_i32gather_epi64(__m256i __v1_old, __mmask8 __mask, __m128i __index, void const *__addr) {
+  // CHECK-LABEL: @test_mm256_mask_i32gather_epi64
+  // CHECK: @llvm.x86.avx512.gather3siv4.di
+  return _mm256_mmask_i32gather_epi64(__v1_old, __mask, __index, __addr, 2); 
+}
+
+__m128 test_mm_mask_i32gather_ps(__m128 __v1_old, __mmask8 __mask, __m128i __index, void const *__addr) {
+  // CHECK-LABEL: @test_mm_mask_i32gather_ps
+  // CHECK: @llvm.x86.avx512.gather3siv4.sf
+  return _mm_mmask_i32gather_ps(__v1_old, __mask, __index, __addr, 2); 
+}
+
+__m128i test_mm_mask_i32gather_epi32(__m128i __v1_old, __mmask8 __mask, __m128i __index, void const *__addr) {
+  // CHECK-LABEL: @test_mm_mask_i32gather_epi32
+  // CHECK: @llvm.x86.avx512.gather3siv4.si
+  return _mm_mmask_i32gather_epi32(__v1_old, __mask, __index, __addr, 2); 
+}
+
+__m256 test_mm256_mask_i32gather_ps(__m256 __v1_old, __mmask8 __mask, __m256i __index, void const *__addr) {
+  // CHECK-LABEL: @test_mm256_mask_i32gather_ps
+  // CHECK: @llvm.x86.avx512.gather3siv8.sf
+  return _mm256_mmask_i32gather_ps(__v1_old, __mask, __index, __addr, 2); 
+}
+
+__m256i test_mm256_mask_i32gather_epi32(__m256i __v1_old, __mmask8 __mask, __m256i __index, void const *__addr) {
+  // CHECK-LABEL: @test_mm256_mask_i32gather_epi32
+  // CHECK: @llvm.x86.avx512.gather3siv8.si
+  return _mm256_mmask_i32gather_epi32(__v1_old, __mask, __index, __addr, 2); 
+}
+
+__m256d test_mm256_permutex_pd(__m256d __X) {
+  // CHECK-LABEL: @test_mm256_permutex_pd
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
+  return _mm256_permutex_pd(__X, 3);
+}
+
+__m256d test_mm256_mask_permutex_pd(__m256d __W, __mmask8 __U, __m256d __X) {
+  // CHECK-LABEL: @test_mm256_mask_permutex_pd
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
+  return _mm256_mask_permutex_pd(__W, __U, __X, 1);
+}
+
+__m256d test_mm256_maskz_permutex_pd(__mmask8 __U, __m256d __X) {
+  // CHECK-LABEL: @test_mm256_maskz_permutex_pd
+  // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> undef, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
+  return _mm256_maskz_permutex_pd(__U, __X, 1);
+}
+
+__m256i test_mm256_permutex_epi64(__m256i __X) {
+  // CHECK-LABEL: @test_mm256_permutex_epi64
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
+  return _mm256_permutex_epi64(__X, 3);
+}
+
+__m256i test_mm256_mask_permutex_epi64(__m256i __W, __mmask8 __M, __m256i __X) {
+  // CHECK-LABEL: @test_mm256_mask_permutex_epi64
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
+  return _mm256_mask_permutex_epi64(__W, __M, __X, 3);
+}
+
+__m256i test_mm256_maskz_permutex_epi64(__mmask8 __M, __m256i __X) {
+  // CHECK-LABEL: @test_mm256_maskz_permutex_epi64
+  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
+  return _mm256_maskz_permutex_epi64(__M, __X, 3);
+}
+
+__m256d test_mm256_permutexvar_pd(__m256i __X, __m256d __Y) {
+  // CHECK-LABEL: @test_mm256_permutexvar_pd
+  // CHECK: @llvm.x86.avx512.mask.permvar.df.256
+  return _mm256_permutexvar_pd(__X, __Y);
+}
+
+__m256d test_mm256_mask_permutexvar_pd(__m256d __W, __mmask8 __U, __m256i __X, __m256d __Y) {
+  // CHECK-LABEL: @test_mm256_mask_permutexvar_pd
+  // CHECK: @llvm.x86.avx512.mask.permvar.df.256
+  return _mm256_mask_permutexvar_pd(__W, __U, __X, __Y);
+}
+
+__m256d test_mm256_maskz_permutexvar_pd(__mmask8 __U, __m256i __X, __m256d __Y) {
+  // CHECK-LABEL: @test_mm256_maskz_permutexvar_pd
+  // CHECK: @llvm.x86.avx512.mask.permvar.df.256
+  return _mm256_maskz_permutexvar_pd(__U, __X, __Y);
+}
+
+__m256i test_mm256_maskz_permutexvar_epi64(__mmask8 __M, __m256i __X, __m256i __Y) {
+  // CHECK-LABEL: @test_mm256_maskz_permutexvar_epi64
+  // CHECK: @llvm.x86.avx512.mask.permvar.di.256
+  return _mm256_maskz_permutexvar_epi64(__M, __X, __Y);
+}
+
+__m256i test_mm256_mask_permutexvar_epi64(__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) {
+  // CHECK-LABEL: @test_mm256_mask_permutexvar_epi64
+  // CHECK: @llvm.x86.avx512.mask.permvar.di.256
+  return _mm256_mask_permutexvar_epi64(__W, __M, __X, __Y);
+}
+
+__m256 test_mm256_mask_permutexvar_ps(__m256 __W, __mmask8 __U, __m256i __X, __m256 __Y) {
+  // CHECK-LABEL: @test_mm256_mask_permutexvar_ps
+  // CHECK: @llvm.x86.avx512.mask.permvar.sf.256
+  return _mm256_mask_permutexvar_ps(__W, __U, __X, __Y);
+}
+
+__m256 test_mm256_maskz_permutexvar_ps(__mmask8 __U, __m256i __X, __m256 __Y) {
+  // CHECK-LABEL: @test_mm256_maskz_permutexvar_ps
+  // CHECK: @llvm.x86.avx512.mask.permvar.sf.256
+  return _mm256_maskz_permutexvar_ps(__U, __X, __Y);
+}
+
+__m256 test_mm256_permutexvar_ps(__m256i __X, __m256 __Y) {
+  // CHECK-LABEL: @test_mm256_permutexvar_ps
+  // CHECK: @llvm.x86.avx512.mask.permvar.sf.256
+  return _mm256_permutexvar_ps( __X, __Y);
+}
+
+__m256i test_mm256_maskz_permutexvar_epi32(__mmask8 __M, __m256i __X, __m256i __Y) {
+  // CHECK-LABEL: @test_mm256_maskz_permutexvar_epi32
+  // CHECK: @llvm.x86.avx512.mask.permvar.si.256
+  return _mm256_maskz_permutexvar_epi32(__M, __X, __Y);
+}
+
+__m256i test_mm256_permutexvar_epi32(__m256i __X, __m256i __Y) {
+  // CHECK-LABEL: @test_mm256_permutexvar_epi32
+  // CHECK: @llvm.x86.avx512.mask.permvar.si.256
+  return _mm256_permutexvar_epi32(__X, __Y);
+}
+
+__m256i test_mm256_mask_permutexvar_epi32(__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) {
+  // CHECK-LABEL: @test_mm256_mask_permutexvar_epi32
+  // CHECK: @llvm.x86.avx512.mask.permvar.si.256
+  return _mm256_mask_permutexvar_epi32(__W, __M, __X, __Y);
+}
+
+__m128i test_mm_alignr_epi32(__m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_alignr_epi32
+  // CHECK: @llvm.x86.avx512.mask.valign.d.128
+  return _mm_alignr_epi32(__A, __B, 1);
+}
+
+__m128i test_mm_mask_alignr_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask_alignr_epi32
+  // CHECK: @llvm.x86.avx512.mask.valign.d.128
+  return _mm_mask_alignr_epi32(__W, __U, __A, __B, 1);
+}
+
+__m128i test_mm_maskz_alignr_epi32(__mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_maskz_alignr_epi32
+  // CHECK: @llvm.x86.avx512.mask.valign.d.128
+  return _mm_maskz_alignr_epi32(__U, __A, __B, 1);
+}
+
+__m256i test_mm256_alignr_epi32(__m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_alignr_epi32
+  // CHECK: @llvm.x86.avx512.mask.valign.d.256
+  return _mm256_alignr_epi32(__A, __B, 1);
+}
+
+__m256i test_mm256_mask_alignr_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_mask_alignr_epi32
+  // CHECK: @llvm.x86.avx512.mask.valign.d.256
+  return _mm256_mask_alignr_epi32(__W, __U, __A, __B, 1);
+}
+
+__m256i test_mm256_maskz_alignr_epi32(__mmask8 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_maskz_alignr_epi32
+  // CHECK: @llvm.x86.avx512.mask.valign.d.256
+  return _mm256_maskz_alignr_epi32(__U, __A, __B, 1);
+}
+
+__m128i test_mm_alignr_epi64(__m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_alignr_epi64
+  // CHECK: @llvm.x86.avx512.mask.valign.q.128
+  return _mm_alignr_epi64(__A, __B, 1);
+}
+
+__m128i test_mm_mask_alignr_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask_alignr_epi64
+  // CHECK: @llvm.x86.avx512.mask.valign.q.128
+  return _mm_mask_alignr_epi64(__W, __U, __A, __B, 1);
+}
+
+__m128i test_mm_maskz_alignr_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_maskz_alignr_epi64
+  // CHECK: @llvm.x86.avx512.mask.valign.q.128
+  return _mm_maskz_alignr_epi64(__U, __A, __B, 1);
+}
+
+__m256i test_mm256_alignr_epi64(__m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_alignr_epi64
+  // CHECK: @llvm.x86.avx512.mask.valign.q.256
+  return _mm256_alignr_epi64(__A, __B, 1);
+}
+
+__m256i test_mm256_mask_alignr_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_mask_alignr_epi64
+  // CHECK: @llvm.x86.avx512.mask.valign.q.256
+  return _mm256_mask_alignr_epi64(__W, __U, __A, __B, 1);
+}
+
+__m256i test_mm256_maskz_alignr_epi64(__mmask8 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_maskz_alignr_epi64
+  // CHECK: @llvm.x86.avx512.mask.valign.q.256
+  return _mm256_maskz_alignr_epi64(__U, __A, __B, 1);
+}
+
+__m128 test_mm_mask_movehdup_ps(__m128 __W, __mmask8 __U, __m128 __A) {
+  // CHECK-LABEL: @test_mm_mask_movehdup_ps
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+  // CHECK: select <4 x i1> %{{.*}} <4 x float> %{{.*}}, <4 x float> %{{.*}}
+  return _mm_mask_movehdup_ps(__W, __U, __A);
+}
+
+__m128 test_mm_maskz_movehdup_ps(__mmask8 __U, __m128 __A) {
+  // CHECK-LABEL: @test_mm_maskz_movehdup_ps
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
+  // CHECK: select <4 x i1> %{{.*}} <4 x float> %{{.*}}, <4 x float> %{{.*}}
+  return _mm_maskz_movehdup_ps(__U, __A);
+}
+
+__m256 test_mm256_mask_movehdup_ps(__m256 __W, __mmask8 __U, __m256 __A) {
+  // CHECK-LABEL: @test_mm256_mask_movehdup_ps
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+  // CHECK: select <8 x i1> %{{.*}} <8 x float> %{{.*}}, <8 x float> %{{.*}}
+  return _mm256_mask_movehdup_ps(__W, __U, __A);
+}
+
+__m256 test_mm256_maskz_movehdup_ps(__mmask8 __U, __m256 __A) {
+  // CHECK-LABEL: @test_mm256_maskz_movehdup_ps
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
+  // CHECK: select <8 x i1> %{{.*}} <8 x float> %{{.*}}, <8 x float> %{{.*}}
+  return _mm256_maskz_movehdup_ps(__U, __A);
+}
+
+__m128 test_mm_mask_moveldup_ps(__m128 __W, __mmask8 __U, __m128 __A) {
+  // CHECK-LABEL: @test_mm_mask_moveldup_ps
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  // CHECK: select <4 x i1> %{{.*}} <4 x float> %{{.*}}, <4 x float> %{{.*}}
+  return _mm_mask_moveldup_ps(__W, __U, __A);
+}
+
+__m128 test_mm_maskz_moveldup_ps(__mmask8 __U, __m128 __A) {
+  // CHECK-LABEL: @test_mm_maskz_moveldup_ps
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
+  // CHECK: select <4 x i1> %{{.*}} <4 x float> %{{.*}}, <4 x float> %{{.*}}
+  return _mm_maskz_moveldup_ps(__U, __A);
+}
+
+__m256 test_mm256_mask_moveldup_ps(__m256 __W, __mmask8 __U, __m256 __A) {
+  // CHECK-LABEL: @test_mm256_mask_moveldup_ps
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+  // CHECK: select <8 x i1> %{{.*}} <8 x float> %{{.*}}, <8 x float> %{{.*}}
+  return _mm256_mask_moveldup_ps(__W, __U, __A);
+}
+
+__m256 test_mm256_maskz_moveldup_ps(__mmask8 __U, __m256 __A) {
+  // CHECK-LABEL: @test_mm256_maskz_moveldup_ps
+  // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+  // CHECK: select <8 x i1> %{{.*}} <8 x float> %{{.*}}, <8 x float> %{{.*}}
+  return _mm256_maskz_moveldup_ps(__U, __A);
+}
+
+__m128i test_mm_mask_shuffle_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_shuffle_epi32
+  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+  return _mm_mask_shuffle_epi32(__W, __U, __A, 1);
+}
+
+__m128i test_mm_maskz_shuffle_epi32(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_shuffle_epi32
+  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 2, i32 0, i32 0, i32 0>
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+  return _mm_maskz_shuffle_epi32(__U, __A, 2);
+}
+
+__m256i test_mm256_mask_shuffle_epi32(__m256i __W, __mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_shuffle_epi32
+  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+  return _mm256_mask_shuffle_epi32(__W, __U, __A, 2);
+}
+
+__m256i test_mm256_maskz_shuffle_epi32(__mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_shuffle_epi32
+  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 2, i32 0, i32 0, i32 0, i32 6, i32 4, i32 4, i32 4>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+  return _mm256_maskz_shuffle_epi32(__U, __A, 2);
+}
+
+__m128d test_mm_mask_mov_pd(__m128d __W, __mmask8 __U, __m128d __A) {
+  // CHECK-LABEL: @test_mm_mask_mov_pd
+  // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
+  return _mm_mask_mov_pd(__W, __U, __A);
+}
+
+__m128d test_mm_maskz_mov_pd(__mmask8 __U, __m128d __A) {
+  // CHECK-LABEL: @test_mm_maskz_mov_pd
+  // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}
+  return _mm_maskz_mov_pd(__U, __A);
+}
+
+__m256d test_mm256_mask_mov_pd(__m256d __W, __mmask8 __U, __m256d __A) {
+  // CHECK-LABEL: @test_mm256_mask_mov_pd
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
+  return _mm256_mask_mov_pd(__W, __U, __A);
+}
+
+__m256d test_mm256_maskz_mov_pd(__mmask8 __U, __m256d __A) {
+  // CHECK-LABEL: @test_mm256_maskz_mov_pd
+  // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
+  return _mm256_maskz_mov_pd(__U, __A);
+}
+
+__m128 test_mm_mask_mov_ps(__m128 __W, __mmask8 __U, __m128 __A) {
+  // CHECK-LABEL: @test_mm_mask_mov_ps
+  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
+  return _mm_mask_mov_ps(__W, __U, __A);
+}
+
+__m128 test_mm_maskz_mov_ps(__mmask8 __U, __m128 __A) {
+  // CHECK-LABEL: @test_mm_maskz_mov_ps
+  // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}
+  return _mm_maskz_mov_ps(__U, __A);
+}
+
+__m256 test_mm256_mask_mov_ps(__m256 __W, __mmask8 __U, __m256 __A) {
+  // CHECK-LABEL: @test_mm256_mask_mov_ps
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
+  return _mm256_mask_mov_ps(__W, __U, __A);
+}
+
+__m256 test_mm256_maskz_mov_ps(__mmask8 __U, __m256 __A) {
+  // CHECK-LABEL: @test_mm256_maskz_mov_ps
+  // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
+  return _mm256_maskz_mov_ps(__U, __A);
+}
+
+__m128 test_mm_mask_cvtph_ps(__m128 __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtph_ps
+  // CHECK: @llvm.x86.avx512.mask.vcvtph2ps.128
+  return _mm_mask_cvtph_ps(__W, __U, __A);
+}
+
+__m128 test_mm_maskz_cvtph_ps(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtph_ps
+  // CHECK: @llvm.x86.avx512.mask.vcvtph2ps.128
+  return _mm_maskz_cvtph_ps(__U, __A);
+}
+
+__m256 test_mm256_mask_cvtph_ps(__m256 __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtph_ps
+  // CHECK: @llvm.x86.avx512.mask.vcvtph2ps.256
+  return _mm256_mask_cvtph_ps(__W, __U, __A);
+}
+
+__m256 test_mm256_maskz_cvtph_ps(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtph_ps
+  // CHECK: @llvm.x86.avx512.mask.vcvtph2ps.256
+  return _mm256_maskz_cvtph_ps(__U, __A);
+}
+
+__m128i test_mm_mask_cvtps_ph(__m128i __W, __mmask8 __U, __m128 __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtps_ph
+  // CHECK: @llvm.x86.avx512.mask.vcvtps2ph.128
+  return _mm_mask_cvtps_ph(__W, __U, __A);
+}
+
+__m128i test_mm_maskz_cvtps_ph(__mmask8 __U, __m128 __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtps_ph
+  // CHECK: @llvm.x86.avx512.mask.vcvtps2ph.128
+  return _mm_maskz_cvtps_ph(__U, __A);
+}
+
+__m128i test_mm256_mask_cvtps_ph(__m128i __W, __mmask8 __U, __m256 __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtps_ph
+  // CHECK: @llvm.x86.avx512.mask.vcvtps2ph.256
+  return _mm256_mask_cvtps_ph(__W, __U, __A);
+}
+
+__m128i test_mm256_maskz_cvtps_ph(__mmask8 __U, __m256 __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtps_ph
+  // CHECK: @llvm.x86.avx512.mask.vcvtps2ph.256
+  return _mm256_maskz_cvtps_ph(__U, __A);
+}
+
+__m128i test_mm_mask_cvt_roundps_ph(__m128i __W, __mmask8 __U, __m128 __A) {
+  // CHECK-LABEL: @test_mm_mask_cvt_roundps_ph
+  // CHECK: @llvm.x86.avx512.mask.vcvtps2ph.128
+  return _mm_mask_cvt_roundps_ph(__W, __U, __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128i test_mm_maskz_cvt_roundps_ph(__mmask8 __U, __m128 __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvt_roundps_ph
+  // CHECK: @llvm.x86.avx512.mask.vcvtps2ph.128
+  return _mm_maskz_cvt_roundps_ph(__U, __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128i test_mm256_mask_cvt_roundps_ph(__m128i __W, __mmask8 __U, __m256 __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvt_roundps_ph
+  // CHECK: @llvm.x86.avx512.mask.vcvtps2ph.256
+  return _mm256_mask_cvt_roundps_ph(__W, __U, __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+__m128i test_mm256_maskz_cvt_roundps_ph(__mmask8 __U, __m256 __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvt_roundps_ph
+  // CHECK: @llvm.x86.avx512.mask.vcvtps2ph.256
+  return _mm256_maskz_cvt_roundps_ph(__U, __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+__mmask8 test_mm_cmpeq_epi32_mask(__m128i __a, __m128i __b) {
+  // CHECK-LABEL: @test_mm_cmpeq_epi32_mask
+  // CHECK: icmp eq <4 x i32> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_cmpeq_epi32_mask(__a, __b);
+}
+
+__mmask8 test_mm_mask_cmpeq_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  // CHECK-LABEL: @test_mm_mask_cmpeq_epi32_mask
+  // CHECK: icmp eq <4 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <4 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_mask_cmpeq_epi32_mask(__u, __a, __b);
+}
+
+__mmask8 test_mm_mask_cmpeq_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  // CHECK-LABEL: @test_mm_mask_cmpeq_epi64_mask
+  // CHECK: icmp eq <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <2 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_mask_cmpeq_epi64_mask(__u, __a, __b);
+}
+
+__mmask8 test_mm_cmpeq_epi64_mask(__m128i __a, __m128i __b) {
+  // CHECK-LABEL: @test_mm_cmpeq_epi64_mask
+  // CHECK: icmp eq <2 x i64> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_cmpeq_epi64_mask(__a, __b);
+}
+
+__mmask8 test_mm_cmpgt_epi32_mask(__m128i __a, __m128i __b) {
+  // CHECK-LABEL: @test_mm_cmpgt_epi32_mask
+  // CHECK: icmp sgt <4 x i32> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_cmpgt_epi32_mask(__a, __b);
+}
+
+__mmask8 test_mm_mask_cmpgt_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  // CHECK-LABEL: @test_mm_mask_cmpgt_epi32_mask
+  // CHECK: icmp sgt <4 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <4 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_mask_cmpgt_epi32_mask(__u, __a, __b);
+}
+
+__mmask8 test_mm_mask_cmpgt_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) {
+  // CHECK-LABEL: @test_mm_mask_cmpgt_epi64_mask
+  // CHECK: icmp sgt <2 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <2 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_mask_cmpgt_epi64_mask(__u, __a, __b);
+}
+
+__mmask8 test_mm_cmpgt_epi64_mask(__m128i __a, __m128i __b) {
+  // CHECK-LABEL: @test_mm_cmpgt_epi64_mask
+  // CHECK: icmp sgt <2 x i64> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_cmpgt_epi64_mask(__a, __b);
+}
+
+__mmask8 test_mm256_cmpeq_epi32_mask(__m256i __a, __m256i __b) {
+  // CHECK-LABEL: @test_mm256_cmpeq_epi32_mask
+  // CHECK: icmp eq <8 x i32> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm256_cmpeq_epi32_mask(__a, __b);
+}
+
+__mmask8 test_mm256_mask_cmpeq_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  // CHECK-LABEL: @test_mm256_mask_cmpeq_epi32_mask
+  // CHECK: icmp eq <8 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm256_mask_cmpeq_epi32_mask(__u, __a, __b);
+}
+
+__mmask8 test_mm256_mask_cmpeq_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  // CHECK-LABEL: @test_mm256_mask_cmpeq_epi64_mask
+  // CHECK: icmp eq <4 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <4 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm256_mask_cmpeq_epi64_mask(__u, __a, __b);
+}
+
+__mmask8 test_mm256_cmpeq_epi64_mask(__m256i __a, __m256i __b) {
+  // CHECK-LABEL: @test_mm256_cmpeq_epi64_mask
+  // CHECK: icmp eq <4 x i64> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm256_cmpeq_epi64_mask(__a, __b);
+}
+
+__mmask8 test_mm256_cmpgt_epi32_mask(__m256i __a, __m256i __b) {
+  // CHECK-LABEL: @test_mm256_cmpgt_epi32_mask
+  // CHECK: icmp sgt <8 x i32> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm256_cmpgt_epi32_mask(__a, __b);
+}
+
+__mmask8 test_mm256_mask_cmpgt_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  // CHECK-LABEL: @test_mm256_mask_cmpgt_epi32_mask
+  // CHECK: icmp sgt <8 x i32> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm256_mask_cmpgt_epi32_mask(__u, __a, __b);
+}
+
+__mmask8 test_mm256_mask_cmpgt_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) {
+  // CHECK-LABEL: @test_mm256_mask_cmpgt_epi64_mask
+  // CHECK: icmp sgt <4 x i64> %{{.*}}, %{{.*}}
+  // CHECK: and <4 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm256_mask_cmpgt_epi64_mask(__u, __a, __b);
+}
+
+__mmask8 test_mm256_cmpgt_epi64_mask(__m256i __a, __m256i __b) {
+  // CHECK-LABEL: @test_mm256_cmpgt_epi64_mask
+  // CHECK: icmp sgt <4 x i64> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm256_cmpgt_epi64_mask(__a, __b);
+}
diff --git a/test/CodeGen/avx512vlbw-builtins.c b/test/CodeGen/avx512vlbw-builtins.c
index 11155f6cf818e..6bfa09f5d1c67 100644
--- a/test/CodeGen/avx512vlbw-builtins.c
+++ b/test/CodeGen/avx512vlbw-builtins.c
@@ -8,674 +8,730 @@
 
 __mmask32 test_mm256_cmpeq_epi8_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmpeq_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpeq.b.256
+  // CHECK: icmp eq <32 x i8> %{{.*}}, %{{.*}}
   return (__mmask32)_mm256_cmpeq_epi8_mask(__a, __b);
 }
 
 __mmask32 test_mm256_mask_cmpeq_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmpeq_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpeq.b.256
+  // CHECK: icmp eq <32 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
   return (__mmask32)_mm256_mask_cmpeq_epi8_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm_cmpeq_epi8_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmpeq_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpeq.b.128
+  // CHECK: icmp eq <16 x i8> %{{.*}}, %{{.*}}
   return (__mmask16)_mm_cmpeq_epi8_mask(__a, __b);
 }
 
 __mmask16 test_mm_mask_cmpeq_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmpeq_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpeq.b.128
+  // CHECK: icmp eq <16 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
   return (__mmask16)_mm_mask_cmpeq_epi8_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm256_cmpeq_epi16_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmpeq_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpeq.w.256
+  // CHECK: icmp eq <16 x i16> %{{.*}}, %{{.*}}
   return (__mmask16)_mm256_cmpeq_epi16_mask(__a, __b);
 }
 
 __mmask16 test_mm256_mask_cmpeq_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmpeq_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpeq.w.256
+  // CHECK: icmp eq <16 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
   return (__mmask16)_mm256_mask_cmpeq_epi16_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmpeq_epi16_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmpeq_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpeq.w.128
+  // CHECK: icmp eq <8 x i16> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_cmpeq_epi16_mask(__a, __b);
 }
 
 __mmask8 test_mm_mask_cmpeq_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmpeq_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpeq.w.128
+  // CHECK: icmp eq <8 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_mask_cmpeq_epi16_mask(__u, __a, __b);
 }
 
 __mmask32 test_mm256_cmpgt_epi8_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmpgt_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpgt.b.256
+  // CHECK: icmp sgt <32 x i8> %{{.*}}, %{{.*}}
   return (__mmask32)_mm256_cmpgt_epi8_mask(__a, __b);
 }
 
 __mmask32 test_mm256_mask_cmpgt_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmpgt_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpgt.b.256
+  // CHECK: icmp sgt <32 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
   return (__mmask32)_mm256_mask_cmpgt_epi8_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm_cmpgt_epi8_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmpgt_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpgt.b.128
+  // CHECK: icmp sgt <16 x i8> %{{.*}}, %{{.*}}
   return (__mmask16)_mm_cmpgt_epi8_mask(__a, __b);
 }
 
 __mmask16 test_mm_mask_cmpgt_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmpgt_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpgt.b.128
+  // CHECK: icmp sgt <16 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
   return (__mmask16)_mm_mask_cmpgt_epi8_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm256_cmpgt_epi16_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmpgt_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpgt.w.256
+  // CHECK: icmp sgt <16 x i16> %{{.*}}, %{{.*}}
   return (__mmask16)_mm256_cmpgt_epi16_mask(__a, __b);
 }
 
 __mmask16 test_mm256_mask_cmpgt_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmpgt_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpgt.w.256
+  // CHECK: icmp sgt <16 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
   return (__mmask16)_mm256_mask_cmpgt_epi16_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmpgt_epi16_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmpgt_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpgt.w.128
+  // CHECK: icmp sgt <8 x i16> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_cmpgt_epi16_mask(__a, __b);
 }
 
 __mmask8 test_mm_mask_cmpgt_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmpgt_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.pcmpgt.w.128
+  // CHECK: icmp sgt <8 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
   return (__mmask8)_mm_mask_cmpgt_epi16_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm_cmpeq_epu8_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmpeq_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> {{.*}}, <16 x i8> {{.*}}, i32 0, i16 -1)
-  return (__mmask64)_mm_cmpeq_epu8_mask(__a, __b);
+  // CHECK: icmp eq <16 x i8> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm_cmpeq_epu8_mask(__a, __b);
 }
 
-__mmask16 test_mm_mask_cmpeq_epu8_mask(__mmask64 __u, __m128i __a, __m128i __b) {
+__mmask16 test_mm_mask_cmpeq_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmpeq_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> {{.*}}, <16 x i8> {{.*}}, i32 0, i16 {{.*}})
-  return (__mmask64)_mm_mask_cmpeq_epu8_mask(__u, __a, __b);
+  // CHECK: icmp eq <16 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm_mask_cmpeq_epu8_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmpeq_epu16_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmpeq_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> {{.*}}, <8 x i16> {{.*}}, i32 0, i8 -1)
-  return (__mmask32)_mm_cmpeq_epu16_mask(__a, __b);
+  // CHECK: icmp eq <8 x i16> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_cmpeq_epu16_mask(__a, __b);
 }
 
-__mmask8 test_mm_mask_cmpeq_epu16_mask(__mmask32 __u, __m128i __a, __m128i __b) {
+__mmask8 test_mm_mask_cmpeq_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmpeq_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> {{.*}}, <8 x i16> {{.*}}, i32 0, i8 {{.*}})
-  return (__mmask32)_mm_mask_cmpeq_epu16_mask(__u, __a, __b);
+  // CHECK: icmp eq <8 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_mask_cmpeq_epu16_mask(__u, __a, __b);
 }
 
 __mmask32 test_mm256_cmpeq_epu8_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmpeq_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> {{.*}}, <32 x i8> {{.*}}, i32 0, i32 -1)
-  return (__mmask64)_mm256_cmpeq_epu8_mask(__a, __b);
+  // CHECK: icmp eq <32 x i8> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm256_cmpeq_epu8_mask(__a, __b);
 }
 
-__mmask32 test_mm256_mask_cmpeq_epu8_mask(__mmask64 __u, __m256i __a, __m256i __b) {
+__mmask32 test_mm256_mask_cmpeq_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmpeq_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> {{.*}}, <32 x i8> {{.*}}, i32 0, i32 {{.*}})
-  return (__mmask64)_mm256_mask_cmpeq_epu8_mask(__u, __a, __b);
+  // CHECK: icmp eq <32 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm256_mask_cmpeq_epu8_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm256_cmpeq_epu16_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmpeq_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> {{.*}}, <16 x i16> {{.*}}, i32 0, i16 -1)
-  return (__mmask32)_mm256_cmpeq_epu16_mask(__a, __b);
+  // CHECK: icmp eq <16 x i16> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm256_cmpeq_epu16_mask(__a, __b);
 }
 
-__mmask16 test_mm256_mask_cmpeq_epu16_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+__mmask16 test_mm256_mask_cmpeq_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmpeq_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> {{.*}}, <16 x i16> {{.*}}, i32 0, i16 {{.*}})
-  return (__mmask32)_mm256_mask_cmpeq_epu16_mask(__u, __a, __b);
+  // CHECK: icmp eq <16 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm256_mask_cmpeq_epu16_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm_cmpgt_epu8_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmpgt_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> {{.*}}, <16 x i8> {{.*}}, i32 6, i16 -1)
-  return (__mmask64)_mm_cmpgt_epu8_mask(__a, __b);
+  // CHECK: icmp ugt <16 x i8> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm_cmpgt_epu8_mask(__a, __b);
 }
 
-__mmask16 test_mm_mask_cmpgt_epu8_mask(__mmask64 __u, __m128i __a, __m128i __b) {
+__mmask16 test_mm_mask_cmpgt_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmpgt_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> {{.*}}, <16 x i8> {{.*}}, i32 6, i16 {{.*}})
-  return (__mmask64)_mm_mask_cmpgt_epu8_mask(__u, __a, __b);
+  // CHECK: icmp ugt <16 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm_mask_cmpgt_epu8_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmpgt_epu16_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmpgt_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> {{.*}}, <8 x i16> {{.*}}, i32 6, i8 -1)
-  return (__mmask32)_mm_cmpgt_epu16_mask(__a, __b);
+  // CHECK: icmp ugt <8 x i16> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_cmpgt_epu16_mask(__a, __b);
 }
 
-__mmask8 test_mm_mask_cmpgt_epu16_mask(__mmask32 __u, __m128i __a, __m128i __b) {
+__mmask8 test_mm_mask_cmpgt_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmpgt_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> {{.*}}, <8 x i16> {{.*}}, i32 6, i8 {{.*}})
-  return (__mmask32)_mm_mask_cmpgt_epu16_mask(__u, __a, __b);
+  // CHECK: icmp ugt <8 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_mask_cmpgt_epu16_mask(__u, __a, __b);
 }
 
 __mmask32 test_mm256_cmpgt_epu8_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmpgt_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> {{.*}}, <32 x i8> {{.*}}, i32 6, i32 -1)
-  return (__mmask64)_mm256_cmpgt_epu8_mask(__a, __b);
+  // CHECK: icmp ugt <32 x i8> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm256_cmpgt_epu8_mask(__a, __b);
 }
 
-__mmask32 test_mm256_mask_cmpgt_epu8_mask(__mmask64 __u, __m256i __a, __m256i __b) {
+__mmask32 test_mm256_mask_cmpgt_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmpgt_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> {{.*}}, <32 x i8> {{.*}}, i32 6, i32 {{.*}})
-  return (__mmask64)_mm256_mask_cmpgt_epu8_mask(__u, __a, __b);
+  // CHECK: icmp ugt <32 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm256_mask_cmpgt_epu8_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm256_cmpgt_epu16_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmpgt_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> {{.*}}, <16 x i16> {{.*}}, i32 6, i16 -1)
-  return (__mmask32)_mm256_cmpgt_epu16_mask(__a, __b);
+  // CHECK: icmp ugt <16 x i16> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm256_cmpgt_epu16_mask(__a, __b);
 }
 
-__mmask16 test_mm256_mask_cmpgt_epu16_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+__mmask16 test_mm256_mask_cmpgt_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmpgt_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> {{.*}}, <16 x i16> {{.*}}, i32 6, i16 {{.*}})
-  return (__mmask32)_mm256_mask_cmpgt_epu16_mask(__u, __a, __b);
+  // CHECK: icmp ugt <16 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm256_mask_cmpgt_epu16_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm_cmpge_epi8_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmpge_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> {{.*}}, <16 x i8> {{.*}}, i32 5, i16 -1)
-  return (__mmask64)_mm_cmpge_epi8_mask(__a, __b);
+  // CHECK: icmp sge <16 x i8> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm_cmpge_epi8_mask(__a, __b);
 }
 
-__mmask16 test_mm_mask_cmpge_epi8_mask(__mmask64 __u, __m128i __a, __m128i __b) {
+__mmask16 test_mm_mask_cmpge_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmpge_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> {{.*}}, <16 x i8> {{.*}}, i32 5, i16 {{.*}})
-  return (__mmask64)_mm_mask_cmpge_epi8_mask(__u, __a, __b);
+  // CHECK: icmp sge <16 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm_mask_cmpge_epi8_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm_cmpge_epu8_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmpge_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> {{.*}}, <16 x i8> {{.*}}, i32 5, i16 -1)
-  return (__mmask64)_mm_cmpge_epu8_mask(__a, __b);
+  // CHECK: icmp uge <16 x i8> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm_cmpge_epu8_mask(__a, __b);
 }
 
-__mmask16 test_mm_mask_cmpge_epu8_mask(__mmask64 __u, __m128i __a, __m128i __b) {
+__mmask16 test_mm_mask_cmpge_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmpge_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> {{.*}}, <16 x i8> {{.*}}, i32 5, i16 {{.*}})
-  return (__mmask64)_mm_mask_cmpge_epu8_mask(__u, __a, __b);
+  // CHECK: icmp uge <16 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm_mask_cmpge_epu8_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmpge_epi16_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmpge_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> {{.*}}, <8 x i16> {{.*}}, i32 5, i8 -1)
-  return (__mmask32)_mm_cmpge_epi16_mask(__a, __b);
+  // CHECK: icmp sge <8 x i16> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_cmpge_epi16_mask(__a, __b);
 }
 
-__mmask8 test_mm_mask_cmpge_epi16_mask(__mmask32 __u, __m128i __a, __m128i __b) {
+__mmask8 test_mm_mask_cmpge_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmpge_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> {{.*}}, <8 x i16> {{.*}}, i32 5, i8 {{.*}})
-  return (__mmask32)_mm_mask_cmpge_epi16_mask(__u, __a, __b);
+  // CHECK: icmp sge <8 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_mask_cmpge_epi16_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmpge_epu16_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmpge_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> {{.*}}, <8 x i16> {{.*}}, i32 5, i8 -1)
-  return (__mmask32)_mm_cmpge_epu16_mask(__a, __b);
+  // CHECK: icmp uge <8 x i16> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_cmpge_epu16_mask(__a, __b);
 }
 
-__mmask8 test_mm_mask_cmpge_epu16_mask(__mmask32 __u, __m128i __a, __m128i __b) {
+__mmask8 test_mm_mask_cmpge_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmpge_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> {{.*}}, <8 x i16> {{.*}}, i32 5, i8 {{.*}})
-  return (__mmask32)_mm_mask_cmpge_epu16_mask(__u, __a, __b);
+  // CHECK: icmp uge <8 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_mask_cmpge_epu16_mask(__u, __a, __b);
 }
 
 __mmask32 test_mm256_cmpge_epi8_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmpge_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> {{.*}}, <32 x i8> {{.*}}, i32 5, i32 -1)
-  return (__mmask64)_mm256_cmpge_epi8_mask(__a, __b);
+  // CHECK: icmp sge <32 x i8> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm256_cmpge_epi8_mask(__a, __b);
 }
 
-__mmask32 test_mm256_mask_cmpge_epi8_mask(__mmask64 __u, __m256i __a, __m256i __b) {
+__mmask32 test_mm256_mask_cmpge_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmpge_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> {{.*}}, <32 x i8> {{.*}}, i32 5, i32 {{.*}})
-  return (__mmask64)_mm256_mask_cmpge_epi8_mask(__u, __a, __b);
+  // CHECK: icmp sge <32 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm256_mask_cmpge_epi8_mask(__u, __a, __b);
 }
 
 __mmask32 test_mm256_cmpge_epu8_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmpge_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> {{.*}}, <32 x i8> {{.*}}, i32 5, i32 -1)
-  return (__mmask64)_mm256_cmpge_epu8_mask(__a, __b);
+  // CHECK: icmp uge <32 x i8> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm256_cmpge_epu8_mask(__a, __b);
 }
 
-__mmask32 test_mm256_mask_cmpge_epu8_mask(__mmask64 __u, __m256i __a, __m256i __b) {
+__mmask32 test_mm256_mask_cmpge_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmpge_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> {{.*}}, <32 x i8> {{.*}}, i32 5, i32 {{.*}})
-  return (__mmask64)_mm256_mask_cmpge_epu8_mask(__u, __a, __b);
+  // CHECK: icmp uge <32 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm256_mask_cmpge_epu8_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm256_cmpge_epi16_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmpge_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> {{.*}}, <16 x i16> {{.*}}, i32 5, i16 -1)
-  return (__mmask32)_mm256_cmpge_epi16_mask(__a, __b);
+  // CHECK: icmp sge <16 x i16> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm256_cmpge_epi16_mask(__a, __b);
 }
 
-__mmask16 test_mm256_mask_cmpge_epi16_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+__mmask16 test_mm256_mask_cmpge_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmpge_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> {{.*}}, <16 x i16> {{.*}}, i32 5, i16 {{.*}})
-  return (__mmask32)_mm256_mask_cmpge_epi16_mask(__u, __a, __b);
+  // CHECK: icmp sge <16 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm256_mask_cmpge_epi16_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm256_cmpge_epu16_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmpge_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> {{.*}}, <16 x i16> {{.*}}, i32 5, i16 -1)
-  return (__mmask32)_mm256_cmpge_epu16_mask(__a, __b);
+  // CHECK: icmp uge <16 x i16> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm256_cmpge_epu16_mask(__a, __b);
 }
 
-__mmask16 test_mm256_mask_cmpge_epu16_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+__mmask16 test_mm256_mask_cmpge_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmpge_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> {{.*}}, <16 x i16> {{.*}}, i32 5, i16 {{.*}})
-  return (__mmask32)_mm256_mask_cmpge_epu16_mask(__u, __a, __b);
+  // CHECK: icmp uge <16 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm256_mask_cmpge_epu16_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm_cmple_epi8_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmple_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> {{.*}}, <16 x i8> {{.*}}, i32 2, i16 -1)
-  return (__mmask64)_mm_cmple_epi8_mask(__a, __b);
+  // CHECK: icmp sle <16 x i8> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm_cmple_epi8_mask(__a, __b);
 }
 
-__mmask16 test_mm_mask_cmple_epi8_mask(__mmask64 __u, __m128i __a, __m128i __b) {
+__mmask16 test_mm_mask_cmple_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmple_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> {{.*}}, <16 x i8> {{.*}}, i32 2, i16 {{.*}})
-  return (__mmask64)_mm_mask_cmple_epi8_mask(__u, __a, __b);
+  // CHECK: icmp sle <16 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm_mask_cmple_epi8_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm_cmple_epu8_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmple_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> {{.*}}, <16 x i8> {{.*}}, i32 2, i16 -1)
-  return (__mmask64)_mm_cmple_epu8_mask(__a, __b);
+  // CHECK: icmp ule <16 x i8> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm_cmple_epu8_mask(__a, __b);
 }
 
-__mmask16 test_mm_mask_cmple_epu8_mask(__mmask64 __u, __m128i __a, __m128i __b) {
+__mmask16 test_mm_mask_cmple_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmple_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> {{.*}}, <16 x i8> {{.*}}, i32 2, i16 {{.*}})
-  return (__mmask64)_mm_mask_cmple_epu8_mask(__u, __a, __b);
+  // CHECK: icmp ule <16 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm_mask_cmple_epu8_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmple_epi16_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmple_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> {{.*}}, <8 x i16> {{.*}}, i32 2, i8 -1)
-  return (__mmask32)_mm_cmple_epi16_mask(__a, __b);
+  // CHECK: icmp sle <8 x i16> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_cmple_epi16_mask(__a, __b);
 }
 
-__mmask8 test_mm_mask_cmple_epi16_mask(__mmask32 __u, __m128i __a, __m128i __b) {
+__mmask8 test_mm_mask_cmple_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmple_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> {{.*}}, <8 x i16> {{.*}}, i32 2, i8 {{.*}})
-  return (__mmask32)_mm_mask_cmple_epi16_mask(__u, __a, __b);
+  // CHECK: icmp sle <8 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_mask_cmple_epi16_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmple_epu16_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmple_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> {{.*}}, <8 x i16> {{.*}}, i32 2, i8 -1)
-  return (__mmask32)_mm_cmple_epu16_mask(__a, __b);
+  // CHECK: icmp ule <8 x i16> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_cmple_epu16_mask(__a, __b);
 }
 
-__mmask8 test_mm_mask_cmple_epu16_mask(__mmask32 __u, __m128i __a, __m128i __b) {
+__mmask8 test_mm_mask_cmple_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmple_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> {{.*}}, <8 x i16> {{.*}}, i32 2, i8 {{.*}})
-  return (__mmask32)_mm_mask_cmple_epu16_mask(__u, __a, __b);
+  // CHECK: icmp ule <8 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_mask_cmple_epu16_mask(__u, __a, __b);
 }
 
 __mmask32 test_mm256_cmple_epi8_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmple_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> {{.*}}, <32 x i8> {{.*}}, i32 2, i32 -1)
-  return (__mmask64)_mm256_cmple_epi8_mask(__a, __b);
+  // CHECK: icmp sle <32 x i8> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm256_cmple_epi8_mask(__a, __b);
 }
 
-__mmask32 test_mm256_mask_cmple_epi8_mask(__mmask64 __u, __m256i __a, __m256i __b) {
+__mmask32 test_mm256_mask_cmple_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmple_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> {{.*}}, <32 x i8> {{.*}}, i32 2, i32 {{.*}})
-  return (__mmask64)_mm256_mask_cmple_epi8_mask(__u, __a, __b);
+  // CHECK: icmp sle <32 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm256_mask_cmple_epi8_mask(__u, __a, __b);
 }
 
 __mmask32 test_mm256_cmple_epu8_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmple_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> {{.*}}, <32 x i8> {{.*}}, i32 2, i32 -1)
-  return (__mmask64)_mm256_cmple_epu8_mask(__a, __b);
+  // CHECK: icmp ule <32 x i8> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm256_cmple_epu8_mask(__a, __b);
 }
 
-__mmask32 test_mm256_mask_cmple_epu8_mask(__mmask64 __u, __m256i __a, __m256i __b) {
+__mmask32 test_mm256_mask_cmple_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmple_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> {{.*}}, <32 x i8> {{.*}}, i32 2, i32 {{.*}})
-  return (__mmask64)_mm256_mask_cmple_epu8_mask(__u, __a, __b);
+  // CHECK: icmp ule <32 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm256_mask_cmple_epu8_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm256_cmple_epi16_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmple_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> {{.*}}, <16 x i16> {{.*}}, i32 2, i16 -1)
-  return (__mmask32)_mm256_cmple_epi16_mask(__a, __b);
+  // CHECK: icmp sle <16 x i16> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm256_cmple_epi16_mask(__a, __b);
 }
 
-__mmask16 test_mm256_mask_cmple_epi16_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+__mmask16 test_mm256_mask_cmple_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmple_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> {{.*}}, <16 x i16> {{.*}}, i32 2, i16 {{.*}})
-  return (__mmask32)_mm256_mask_cmple_epi16_mask(__u, __a, __b);
+  // CHECK: icmp sle <16 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm256_mask_cmple_epi16_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm256_cmple_epu16_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmple_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> {{.*}}, <16 x i16> {{.*}}, i32 2, i16 -1)
-  return (__mmask32)_mm256_cmple_epu16_mask(__a, __b);
+  // CHECK: icmp ule <16 x i16> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm256_cmple_epu16_mask(__a, __b);
 }
 
-__mmask16 test_mm256_mask_cmple_epu16_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+__mmask16 test_mm256_mask_cmple_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmple_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> {{.*}}, <16 x i16> {{.*}}, i32 2, i16 {{.*}})
-  return (__mmask32)_mm256_mask_cmple_epu16_mask(__u, __a, __b);
+  // CHECK: icmp ule <16 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm256_mask_cmple_epu16_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm_cmplt_epi8_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmplt_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> {{.*}}, <16 x i8> {{.*}}, i32 1, i16 -1)
-  return (__mmask64)_mm_cmplt_epi8_mask(__a, __b);
+  // CHECK: icmp slt <16 x i8> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm_cmplt_epi8_mask(__a, __b);
 }
 
-__mmask16 test_mm_mask_cmplt_epi8_mask(__mmask64 __u, __m128i __a, __m128i __b) {
+__mmask16 test_mm_mask_cmplt_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmplt_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> {{.*}}, <16 x i8> {{.*}}, i32 1, i16 {{.*}})
-  return (__mmask64)_mm_mask_cmplt_epi8_mask(__u, __a, __b);
+  // CHECK: icmp slt <16 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm_mask_cmplt_epi8_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm_cmplt_epu8_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmplt_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> {{.*}}, <16 x i8> {{.*}}, i32 1, i16 -1)
-  return (__mmask64)_mm_cmplt_epu8_mask(__a, __b);
+  // CHECK: icmp ult <16 x i8> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm_cmplt_epu8_mask(__a, __b);
 }
 
-__mmask16 test_mm_mask_cmplt_epu8_mask(__mmask64 __u, __m128i __a, __m128i __b) {
+__mmask16 test_mm_mask_cmplt_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmplt_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> {{.*}}, <16 x i8> {{.*}}, i32 1, i16 {{.*}})
-  return (__mmask64)_mm_mask_cmplt_epu8_mask(__u, __a, __b);
+  // CHECK: icmp ult <16 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm_mask_cmplt_epu8_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmplt_epi16_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmplt_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> {{.*}}, <8 x i16> {{.*}}, i32 1, i8 -1)
-  return (__mmask32)_mm_cmplt_epi16_mask(__a, __b);
+  // CHECK: icmp slt <8 x i16> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_cmplt_epi16_mask(__a, __b);
 }
 
-__mmask8 test_mm_mask_cmplt_epi16_mask(__mmask32 __u, __m128i __a, __m128i __b) {
+__mmask8 test_mm_mask_cmplt_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmplt_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> {{.*}}, <8 x i16> {{.*}}, i32 1, i8 {{.*}})
-  return (__mmask32)_mm_mask_cmplt_epi16_mask(__u, __a, __b);
+  // CHECK: icmp slt <8 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_mask_cmplt_epi16_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmplt_epu16_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmplt_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> {{.*}}, <8 x i16> {{.*}}, i32 1, i8 -1)
-  return (__mmask32)_mm_cmplt_epu16_mask(__a, __b);
+  // CHECK: icmp ult <8 x i16> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_cmplt_epu16_mask(__a, __b);
 }
 
-__mmask8 test_mm_mask_cmplt_epu16_mask(__mmask32 __u, __m128i __a, __m128i __b) {
+__mmask8 test_mm_mask_cmplt_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmplt_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> {{.*}}, <8 x i16> {{.*}}, i32 1, i8 {{.*}})
-  return (__mmask32)_mm_mask_cmplt_epu16_mask(__u, __a, __b);
+  // CHECK: icmp ult <8 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_mask_cmplt_epu16_mask(__u, __a, __b);
 }
 
 __mmask32 test_mm256_cmplt_epi8_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmplt_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> {{.*}}, <32 x i8> {{.*}}, i32 1, i32 -1)
-  return (__mmask64)_mm256_cmplt_epi8_mask(__a, __b);
+  // CHECK: icmp slt <32 x i8> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm256_cmplt_epi8_mask(__a, __b);
 }
 
-__mmask32 test_mm256_mask_cmplt_epi8_mask(__mmask64 __u, __m256i __a, __m256i __b) {
+__mmask32 test_mm256_mask_cmplt_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmplt_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> {{.*}}, <32 x i8> {{.*}}, i32 1, i32 {{.*}})
-  return (__mmask64)_mm256_mask_cmplt_epi8_mask(__u, __a, __b);
+  // CHECK: icmp slt <32 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm256_mask_cmplt_epi8_mask(__u, __a, __b);
 }
 
 __mmask32 test_mm256_cmplt_epu8_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmplt_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> {{.*}}, <32 x i8> {{.*}}, i32 1, i32 -1)
-  return (__mmask64)_mm256_cmplt_epu8_mask(__a, __b);
+  // CHECK: icmp ult <32 x i8> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm256_cmplt_epu8_mask(__a, __b);
 }
 
-__mmask32 test_mm256_mask_cmplt_epu8_mask(__mmask64 __u, __m256i __a, __m256i __b) {
+__mmask32 test_mm256_mask_cmplt_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmplt_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> {{.*}}, <32 x i8> {{.*}}, i32 1, i32 {{.*}})
-  return (__mmask64)_mm256_mask_cmplt_epu8_mask(__u, __a, __b);
+  // CHECK: icmp ult <32 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm256_mask_cmplt_epu8_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm256_cmplt_epi16_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmplt_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> {{.*}}, <16 x i16> {{.*}}, i32 1, i16 -1)
-  return (__mmask32)_mm256_cmplt_epi16_mask(__a, __b);
+  // CHECK: icmp slt <16 x i16> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm256_cmplt_epi16_mask(__a, __b);
 }
 
-__mmask16 test_mm256_mask_cmplt_epi16_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+__mmask16 test_mm256_mask_cmplt_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmplt_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> {{.*}}, <16 x i16> {{.*}}, i32 1, i16 {{.*}})
-  return (__mmask32)_mm256_mask_cmplt_epi16_mask(__u, __a, __b);
+  // CHECK: icmp slt <16 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm256_mask_cmplt_epi16_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm256_cmplt_epu16_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmplt_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> {{.*}}, <16 x i16> {{.*}}, i32 1, i16 -1)
-  return (__mmask32)_mm256_cmplt_epu16_mask(__a, __b);
+  // CHECK: icmp ult <16 x i16> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm256_cmplt_epu16_mask(__a, __b);
 }
 
-__mmask16 test_mm256_mask_cmplt_epu16_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+__mmask16 test_mm256_mask_cmplt_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmplt_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> {{.*}}, <16 x i16> {{.*}}, i32 1, i16 {{.*}})
-  return (__mmask32)_mm256_mask_cmplt_epu16_mask(__u, __a, __b);
+  // CHECK: icmp ult <16 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm256_mask_cmplt_epu16_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm_cmpneq_epi8_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmpneq_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> {{.*}}, <16 x i8> {{.*}}, i32 4, i16 -1)
-  return (__mmask64)_mm_cmpneq_epi8_mask(__a, __b);
+  // CHECK: icmp ne <16 x i8> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm_cmpneq_epi8_mask(__a, __b);
 }
 
-__mmask16 test_mm_mask_cmpneq_epi8_mask(__mmask64 __u, __m128i __a, __m128i __b) {
+__mmask16 test_mm_mask_cmpneq_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmpneq_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> {{.*}}, <16 x i8> {{.*}}, i32 4, i16 {{.*}})
-  return (__mmask64)_mm_mask_cmpneq_epi8_mask(__u, __a, __b);
+  // CHECK: icmp ne <16 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm_mask_cmpneq_epi8_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm_cmpneq_epu8_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmpneq_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> {{.*}}, <16 x i8> {{.*}}, i32 4, i16 -1)
-  return (__mmask64)_mm_cmpneq_epu8_mask(__a, __b);
+  // CHECK: icmp ne <16 x i8> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm_cmpneq_epu8_mask(__a, __b);
 }
 
-__mmask16 test_mm_mask_cmpneq_epu8_mask(__mmask64 __u, __m128i __a, __m128i __b) {
+__mmask16 test_mm_mask_cmpneq_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmpneq_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> {{.*}}, <16 x i8> {{.*}}, i32 4, i16 {{.*}})
-  return (__mmask64)_mm_mask_cmpneq_epu8_mask(__u, __a, __b);
+  // CHECK: icmp ne <16 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm_mask_cmpneq_epu8_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmpneq_epi16_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmpneq_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> {{.*}}, <8 x i16> {{.*}}, i32 4, i8 -1)
-  return (__mmask32)_mm_cmpneq_epi16_mask(__a, __b);
+  // CHECK: icmp ne <8 x i16> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_cmpneq_epi16_mask(__a, __b);
 }
 
-__mmask8 test_mm_mask_cmpneq_epi16_mask(__mmask32 __u, __m128i __a, __m128i __b) {
+__mmask8 test_mm_mask_cmpneq_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmpneq_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> {{.*}}, <8 x i16> {{.*}}, i32 4, i8 {{.*}})
-  return (__mmask32)_mm_mask_cmpneq_epi16_mask(__u, __a, __b);
+  // CHECK: icmp ne <8 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_mask_cmpneq_epi16_mask(__u, __a, __b);
 }
 
 __mmask8 test_mm_cmpneq_epu16_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmpneq_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> {{.*}}, <8 x i16> {{.*}}, i32 4, i8 -1)
-  return (__mmask32)_mm_cmpneq_epu16_mask(__a, __b);
+  // CHECK: icmp ne <8 x i16> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_cmpneq_epu16_mask(__a, __b);
 }
 
-__mmask8 test_mm_mask_cmpneq_epu16_mask(__mmask32 __u, __m128i __a, __m128i __b) {
+__mmask8 test_mm_mask_cmpneq_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmpneq_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> {{.*}}, <8 x i16> {{.*}}, i32 4, i8 {{.*}})
-  return (__mmask32)_mm_mask_cmpneq_epu16_mask(__u, __a, __b);
+  // CHECK: icmp ne <8 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_mask_cmpneq_epu16_mask(__u, __a, __b);
 }
 
 __mmask32 test_mm256_cmpneq_epi8_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmpneq_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> {{.*}}, <32 x i8> {{.*}}, i32 4, i32 -1)
-  return (__mmask64)_mm256_cmpneq_epi8_mask(__a, __b);
+  // CHECK: icmp ne <32 x i8> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm256_cmpneq_epi8_mask(__a, __b);
 }
 
-__mmask32 test_mm256_mask_cmpneq_epi8_mask(__mmask64 __u, __m256i __a, __m256i __b) {
+__mmask32 test_mm256_mask_cmpneq_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmpneq_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> {{.*}}, <32 x i8> {{.*}}, i32 4, i32 {{.*}})
-  return (__mmask64)_mm256_mask_cmpneq_epi8_mask(__u, __a, __b);
+  // CHECK: icmp ne <32 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm256_mask_cmpneq_epi8_mask(__u, __a, __b);
 }
 
 __mmask32 test_mm256_cmpneq_epu8_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmpneq_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> {{.*}}, <32 x i8> {{.*}}, i32 4, i32 -1)
-  return (__mmask64)_mm256_cmpneq_epu8_mask(__a, __b);
+  // CHECK: icmp ne <32 x i8> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm256_cmpneq_epu8_mask(__a, __b);
 }
 
-__mmask32 test_mm256_mask_cmpneq_epu8_mask(__mmask64 __u, __m256i __a, __m256i __b) {
+__mmask32 test_mm256_mask_cmpneq_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmpneq_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> {{.*}}, <32 x i8> {{.*}}, i32 4, i32 {{.*}})
-  return (__mmask64)_mm256_mask_cmpneq_epu8_mask(__u, __a, __b);
+  // CHECK: icmp ne <32 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm256_mask_cmpneq_epu8_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm256_cmpneq_epi16_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmpneq_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> {{.*}}, <16 x i16> {{.*}}, i32 4, i16 -1)
-  return (__mmask32)_mm256_cmpneq_epi16_mask(__a, __b);
+  // CHECK: icmp ne <16 x i16> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm256_cmpneq_epi16_mask(__a, __b);
 }
 
-__mmask16 test_mm256_mask_cmpneq_epi16_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+__mmask16 test_mm256_mask_cmpneq_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmpneq_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> {{.*}}, <16 x i16> {{.*}}, i32 4, i16 {{.*}})
-  return (__mmask32)_mm256_mask_cmpneq_epi16_mask(__u, __a, __b);
+  // CHECK: icmp ne <16 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm256_mask_cmpneq_epi16_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm256_cmpneq_epu16_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmpneq_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> {{.*}}, <16 x i16> {{.*}}, i32 4, i16 -1)
-  return (__mmask32)_mm256_cmpneq_epu16_mask(__a, __b);
+  // CHECK: icmp ne <16 x i16> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm256_cmpneq_epu16_mask(__a, __b);
 }
 
-__mmask16 test_mm256_mask_cmpneq_epu16_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+__mmask16 test_mm256_mask_cmpneq_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmpneq_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> {{.*}}, <16 x i16> {{.*}}, i32 4, i16 {{.*}})
-  return (__mmask32)_mm256_mask_cmpneq_epu16_mask(__u, __a, __b);
+  // CHECK: icmp ne <16 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm256_mask_cmpneq_epu16_mask(__u, __a, __b);
 }
 
 __mmask16 test_mm_cmp_epi8_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmp_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> {{.*}}, <16 x i8> {{.*}}, i32 7, i16 -1)
-  return (__mmask64)_mm_cmp_epi8_mask(__a, __b, 7);
+  // CHECK: icmp eq <16 x i8> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm_cmp_epi8_mask(__a, __b, 0);
 }
 
-__mmask16 test_mm_mask_cmp_epi8_mask(__mmask64 __u, __m128i __a, __m128i __b) {
+__mmask16 test_mm_mask_cmp_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmp_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> {{.*}}, <16 x i8> {{.*}}, i32 7, i16 {{.*}})
-  return (__mmask64)_mm_mask_cmp_epi8_mask(__u, __a, __b, 7);
+  // CHECK: icmp eq <16 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm_mask_cmp_epi8_mask(__u, __a, __b, 0);
 }
 
 __mmask16 test_mm_cmp_epu8_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmp_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> {{.*}}, <16 x i8> {{.*}}, i32 7, i16 -1)
-  return (__mmask64)_mm_cmp_epu8_mask(__a, __b, 7);
+  // CHECK: icmp eq <16 x i8> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm_cmp_epu8_mask(__a, __b, 0);
 }
 
-__mmask16 test_mm_mask_cmp_epu8_mask(__mmask64 __u, __m128i __a, __m128i __b) {
+__mmask16 test_mm_mask_cmp_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmp_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> {{.*}}, <16 x i8> {{.*}}, i32 7, i16 {{.*}})
-  return (__mmask64)_mm_mask_cmp_epu8_mask(__u, __a, __b, 7);
+  // CHECK: icmp eq <16 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm_mask_cmp_epu8_mask(__u, __a, __b, 0);
 }
 
 __mmask8 test_mm_cmp_epi16_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmp_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> {{.*}}, <8 x i16> {{.*}}, i32 7, i8 -1)
-  return (__mmask32)_mm_cmp_epi16_mask(__a, __b, 7);
+  // CHECK: icmp eq <8 x i16> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_cmp_epi16_mask(__a, __b, 0);
 }
 
-__mmask8 test_mm_mask_cmp_epi16_mask(__mmask32 __u, __m128i __a, __m128i __b) {
+__mmask8 test_mm_mask_cmp_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmp_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> {{.*}}, <8 x i16> {{.*}}, i32 7, i8 {{.*}})
-  return (__mmask32)_mm_mask_cmp_epi16_mask(__u, __a, __b, 7);
+  // CHECK: icmp eq <8 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_mask_cmp_epi16_mask(__u, __a, __b, 0);
 }
 
 __mmask8 test_mm_cmp_epu16_mask(__m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_cmp_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> {{.*}}, <8 x i16> {{.*}}, i32 7, i8 -1)
-  return (__mmask32)_mm_cmp_epu16_mask(__a, __b, 7);
+  // CHECK: icmp eq <8 x i16> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_cmp_epu16_mask(__a, __b, 0);
 }
 
-__mmask8 test_mm_mask_cmp_epu16_mask(__mmask32 __u, __m128i __a, __m128i __b) {
+__mmask8 test_mm_mask_cmp_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) {
   // CHECK-LABEL: @test_mm_mask_cmp_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> {{.*}}, <8 x i16> {{.*}}, i32 7, i8 {{.*}})
-  return (__mmask32)_mm_mask_cmp_epu16_mask(__u, __a, __b, 7);
+  // CHECK: icmp eq <8 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <8 x i1> %{{.*}}, %{{.*}}
+  return (__mmask8)_mm_mask_cmp_epu16_mask(__u, __a, __b, 0);
 }
 
 __mmask32 test_mm256_cmp_epi8_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmp_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> {{.*}}, <32 x i8> {{.*}}, i32 7, i32 -1)
-  return (__mmask64)_mm256_cmp_epi8_mask(__a, __b, 7);
+  // CHECK: icmp eq <32 x i8> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm256_cmp_epi8_mask(__a, __b, 0);
 }
 
-__mmask32 test_mm256_mask_cmp_epi8_mask(__mmask64 __u, __m256i __a, __m256i __b) {
+__mmask32 test_mm256_mask_cmp_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmp_epi8_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.b.256(<32 x i8> {{.*}}, <32 x i8> {{.*}}, i32 7, i32 {{.*}})
-  return (__mmask64)_mm256_mask_cmp_epi8_mask(__u, __a, __b, 7);
+  // CHECK: icmp eq <32 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm256_mask_cmp_epi8_mask(__u, __a, __b, 0);
 }
 
 __mmask32 test_mm256_cmp_epu8_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmp_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> {{.*}}, <32 x i8> {{.*}}, i32 7, i32 -1)
-  return (__mmask64)_mm256_cmp_epu8_mask(__a, __b, 7);
+  // CHECK: icmp eq <32 x i8> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm256_cmp_epu8_mask(__a, __b, 0);
 }
 
-__mmask32 test_mm256_mask_cmp_epu8_mask(__mmask64 __u, __m256i __a, __m256i __b) {
+__mmask32 test_mm256_mask_cmp_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmp_epu8_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8> {{.*}}, <32 x i8> {{.*}}, i32 7, i32 {{.*}})
-  return (__mmask64)_mm256_mask_cmp_epu8_mask(__u, __a, __b, 7);
+  // CHECK: icmp eq <32 x i8> %{{.*}}, %{{.*}}
+  // CHECK: and <32 x i1> %{{.*}}, %{{.*}}
+  return (__mmask32)_mm256_mask_cmp_epu8_mask(__u, __a, __b, 0);
 }
 
 __mmask16 test_mm256_cmp_epi16_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmp_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> {{.*}}, <16 x i16> {{.*}}, i32 7, i16 -1)
-  return (__mmask32)_mm256_cmp_epi16_mask(__a, __b, 7);
+  // CHECK: icmp eq <16 x i16> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm256_cmp_epi16_mask(__a, __b, 0);
 }
 
-__mmask16 test_mm256_mask_cmp_epi16_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+__mmask16 test_mm256_mask_cmp_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmp_epi16_mask
-  // CHECK: @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> {{.*}}, <16 x i16> {{.*}}, i32 7, i16 {{.*}})
-  return (__mmask32)_mm256_mask_cmp_epi16_mask(__u, __a, __b, 7);
+  // CHECK: icmp eq <16 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm256_mask_cmp_epi16_mask(__u, __a, __b, 0);
 }
 
 __mmask16 test_mm256_cmp_epu16_mask(__m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_cmp_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> {{.*}}, <16 x i16> {{.*}}, i32 7, i16 -1)
-  return (__mmask32)_mm256_cmp_epu16_mask(__a, __b, 7);
+  // CHECK: icmp eq <16 x i16> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm256_cmp_epu16_mask(__a, __b, 0);
 }
 
-__mmask16 test_mm256_mask_cmp_epu16_mask(__mmask32 __u, __m256i __a, __m256i __b) {
+__mmask16 test_mm256_mask_cmp_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) {
   // CHECK-LABEL: @test_mm256_mask_cmp_epu16_mask
-  // CHECK: @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> {{.*}}, <16 x i16> {{.*}}, i32 7, i16 {{.*}})
-  return (__mmask32)_mm256_mask_cmp_epu16_mask(__u, __a, __b, 7);
+  // CHECK: icmp eq <16 x i16> %{{.*}}, %{{.*}}
+  // CHECK: and <16 x i1> %{{.*}}, %{{.*}}
+  return (__mmask16)_mm256_mask_cmp_epu16_mask(__u, __a, __b, 0);
 }
 
 
@@ -800,24 +856,24 @@ __m128i test_mm_maskz_mullo_epi16 (__mmask8 __U, __m128i __A, __m128i __B) {
 
 __m128i test_mm_mask_blend_epi8(__mmask16 __U, __m128i __A, __m128i __W) {
   // CHECK-LABEL: @test_mm_mask_blend_epi8
-  // CHECK: @llvm.x86.avx512.mask.blend.b.128
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
   return _mm_mask_blend_epi8(__U,__A,__W); 
 }
 __m256i test_mm256_mask_blend_epi8(__mmask32 __U, __m256i __A, __m256i __W) {
   // CHECK-LABEL: @test_mm256_mask_blend_epi8
-  // CHECK: @llvm.x86.avx512.mask.blend.b.256
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
   return _mm256_mask_blend_epi8(__U,__A,__W); 
 }
 
 __m128i test_mm_mask_blend_epi16(__mmask8 __U, __m128i __A, __m128i __W) {
   // CHECK-LABEL: @test_mm_mask_blend_epi16
-  // CHECK: @llvm.x86.avx512.mask.blend.w.128
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
   return _mm_mask_blend_epi16(__U,__A,__W); 
 }
 
 __m256i test_mm256_mask_blend_epi16(__mmask16 __U, __m256i __A, __m256i __W) {
   // CHECK-LABEL: @test_mm256_mask_blend_epi16
-  // CHECK: @llvm.x86.avx512.mask.blend.w.256
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
   return _mm256_mask_blend_epi16(__U,__A,__W); 
 }
 
@@ -1611,97 +1667,902 @@ __m256i test_mm256_maskz_mulhi_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
 
 __m128i test_mm_mask_unpackhi_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_mask_unpackhi_epi8
-  // CHECK: @llvm.x86.avx512.mask.punpckhb.w.128
+  // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
   return _mm_mask_unpackhi_epi8(__W, __U, __A, __B); 
 }
 
 __m128i test_mm_maskz_unpackhi_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_maskz_unpackhi_epi8
-  // CHECK: @llvm.x86.avx512.mask.punpckhb.w.128
+  // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
   return _mm_maskz_unpackhi_epi8(__U, __A, __B); 
 }
 
 __m256i test_mm256_mask_unpackhi_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_mask_unpackhi_epi8
-  // CHECK: @llvm.x86.avx512.mask.punpckhb.w.256
+  // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
   return _mm256_mask_unpackhi_epi8(__W, __U, __A, __B); 
 }
 
 __m256i test_mm256_maskz_unpackhi_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_maskz_unpackhi_epi8
-  // CHECK: @llvm.x86.avx512.mask.punpckhb.w.256
+  // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
   return _mm256_maskz_unpackhi_epi8(__U, __A, __B); 
 }
 
 __m128i test_mm_mask_unpackhi_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_mask_unpackhi_epi16
-  // CHECK: @llvm.x86.avx512.mask.punpckhw.d.128
+  // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
   return _mm_mask_unpackhi_epi16(__W, __U, __A, __B); 
 }
 
 __m128i test_mm_maskz_unpackhi_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_maskz_unpackhi_epi16
-  // CHECK: @llvm.x86.avx512.mask.punpckhw.d.128
+  // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
   return _mm_maskz_unpackhi_epi16(__U, __A, __B); 
 }
 
 __m256i test_mm256_mask_unpackhi_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_mask_unpackhi_epi16
-  // CHECK: @llvm.x86.avx512.mask.punpckhw.d.256
+  // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
   return _mm256_mask_unpackhi_epi16(__W, __U, __A, __B); 
 }
 
 __m256i test_mm256_maskz_unpackhi_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_maskz_unpackhi_epi16
-  // CHECK: @llvm.x86.avx512.mask.punpckhw.d.256
+  // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
   return _mm256_maskz_unpackhi_epi16(__U, __A, __B); 
 }
 
 __m128i test_mm_mask_unpacklo_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_mask_unpacklo_epi8
-  // CHECK: @llvm.x86.avx512.mask.punpcklb.w.128
+  // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
   return _mm_mask_unpacklo_epi8(__W, __U, __A, __B); 
 }
 
 __m128i test_mm_maskz_unpacklo_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_maskz_unpacklo_epi8
-  // CHECK: @llvm.x86.avx512.mask.punpcklb.w.128
+  // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
   return _mm_maskz_unpacklo_epi8(__U, __A, __B); 
 }
 
 __m256i test_mm256_mask_unpacklo_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_mask_unpacklo_epi8
-  // CHECK: @llvm.x86.avx512.mask.punpcklb.w.256
+  // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
   return _mm256_mask_unpacklo_epi8(__W, __U, __A, __B); 
 }
 
 __m256i test_mm256_maskz_unpacklo_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_maskz_unpacklo_epi8
-  // CHECK: @llvm.x86.avx512.mask.punpcklb.w.256
+  // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
   return _mm256_maskz_unpacklo_epi8(__U, __A, __B); 
 }
 
 __m128i test_mm_mask_unpacklo_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_mask_unpacklo_epi16
-  // CHECK: @llvm.x86.avx512.mask.punpcklw.d.128
+  // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
   return _mm_mask_unpacklo_epi16(__W, __U, __A, __B); 
 }
 
 __m128i test_mm_maskz_unpacklo_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: @test_mm_maskz_unpacklo_epi16
-  // CHECK: @llvm.x86.avx512.mask.punpcklw.d.128
+  // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
   return _mm_maskz_unpacklo_epi16(__U, __A, __B); 
 }
 
 __m256i test_mm256_mask_unpacklo_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_mask_unpacklo_epi16
-  // CHECK: @llvm.x86.avx512.mask.punpcklw.d.256
+  // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
   return _mm256_mask_unpacklo_epi16(__W, __U, __A, __B); 
 }
 
 __m256i test_mm256_maskz_unpacklo_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_maskz_unpacklo_epi16
-  // CHECK: @llvm.x86.avx512.mask.punpcklw.d.256
+  // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
   return _mm256_maskz_unpacklo_epi16(__U, __A, __B); 
 }
 
+__m128i test_mm_mask_cvtepi8_epi16(__m128i __W, __mmask32 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtepi8_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovsxb.w.128
+  return _mm_mask_cvtepi8_epi16(__W, __U, __A); 
+}
+
+__m128i test_mm_maskz_cvtepi8_epi16(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtepi8_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovsxb.w.128
+  return _mm_maskz_cvtepi8_epi16(__U, __A); 
+}
+
+__m256i test_mm256_mask_cvtepi8_epi16(__m256i __W, __mmask32 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtepi8_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovsxb.w.256
+  return _mm256_mask_cvtepi8_epi16(__W, __U, __A); 
+}
+
+__m256i test_mm256_maskz_cvtepi8_epi16(__mmask16 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtepi8_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovsxb.w.256
+  return _mm256_maskz_cvtepi8_epi16(__U, __A); 
+}
+
+__m128i test_mm_mask_cvtepu8_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtepu8_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovzxb.d.128
+  return _mm_mask_cvtepu8_epi32(__W, __U, __A); 
+}
+
+__m128i test_mm_maskz_cvtepu8_epi32(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtepu8_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovzxb.d.128
+  return _mm_maskz_cvtepu8_epi32(__U, __A); 
+}
+
+__m256i test_mm256_mask_cvtepu8_epi32(__m256i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtepu8_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovzxb.d.256
+  return _mm256_mask_cvtepu8_epi32(__W, __U, __A); 
+}
+
+__m256i test_mm256_maskz_cvtepu8_epi32(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtepu8_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovzxb.d.256
+  return _mm256_maskz_cvtepu8_epi32(__U, __A); 
+}
+
+__m128i test_mm_mask_cvtepu8_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtepu8_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxb.q.128
+  return _mm_mask_cvtepu8_epi64(__W, __U, __A); 
+}
+
+__m128i test_mm_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtepu8_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxb.q.128
+  return _mm_maskz_cvtepu8_epi64(__U, __A); 
+}
+
+__m256i test_mm256_mask_cvtepu8_epi64(__m256i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtepu8_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxb.q.256
+  return _mm256_mask_cvtepu8_epi64(__W, __U, __A); 
+}
+
+__m256i test_mm256_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtepu8_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxb.q.256
+  return _mm256_maskz_cvtepu8_epi64(__U, __A); 
+}
+
+__m128i test_mm_mask_cvtepu32_epi64(__m128i __W, __mmask8 __U, __m128i __X) {
+  // CHECK-LABEL: @test_mm_mask_cvtepu32_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxd.q.128
+  return _mm_mask_cvtepu32_epi64(__W, __U, __X); 
+}
+
+__m128i test_mm_maskz_cvtepu32_epi64(__mmask8 __U, __m128i __X) {
+  // CHECK-LABEL: @test_mm_maskz_cvtepu32_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxd.q.128
+  return _mm_maskz_cvtepu32_epi64(__U, __X); 
+}
+
+__m256i test_mm256_mask_cvtepu32_epi64(__m256i __W, __mmask8 __U, __m128i __X) {
+  // CHECK-LABEL: @test_mm256_mask_cvtepu32_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxd.q.256
+  return _mm256_mask_cvtepu32_epi64(__W, __U, __X); 
+}
+
+__m256i test_mm256_maskz_cvtepu32_epi64(__mmask8 __U, __m128i __X) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtepu32_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxd.q.256
+  return _mm256_maskz_cvtepu32_epi64(__U, __X); 
+}
+
+__m128i test_mm_mask_cvtepu16_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtepu16_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovzxw.d.128
+  return _mm_mask_cvtepu16_epi32(__W, __U, __A); 
+}
+
+__m128i test_mm_maskz_cvtepu16_epi32(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtepu16_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovzxw.d.128
+  return _mm_maskz_cvtepu16_epi32(__U, __A); 
+}
+
+__m256i test_mm256_mask_cvtepu16_epi32(__m256i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtepu16_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovzxw.d.256
+  return _mm256_mask_cvtepu16_epi32(__W, __U, __A); 
+}
+
+__m256i test_mm256_maskz_cvtepu16_epi32(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtepu16_epi32
+  // CHECK: @llvm.x86.avx512.mask.pmovzxw.d.256
+  return _mm256_maskz_cvtepu16_epi32(__U, __A); 
+}
+
+__m128i test_mm_mask_cvtepu16_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtepu16_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxw.q.128
+  return _mm_mask_cvtepu16_epi64(__W, __U, __A); 
+}
+
+__m128i test_mm_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtepu16_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxw.q.128
+  return _mm_maskz_cvtepu16_epi64(__U, __A); 
+}
+
+__m256i test_mm256_mask_cvtepu16_epi64(__m256i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtepu16_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxw.q.256
+  return _mm256_mask_cvtepu16_epi64(__W, __U, __A); 
+}
+
+__m256i test_mm256_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtepu16_epi64
+  // CHECK: @llvm.x86.avx512.mask.pmovzxw.q.256
+  return _mm256_maskz_cvtepu16_epi64(__U, __A); 
+}
+
+__m128i test_mm_mask_cvtepu8_epi16(__m128i __W, __mmask32 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_cvtepu8_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovzxb.w.128
+  return _mm_mask_cvtepu8_epi16(__W, __U, __A); 
+}
+
+__m128i test_mm_maskz_cvtepu8_epi16(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_cvtepu8_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovzxb.w.128
+  return _mm_maskz_cvtepu8_epi16(__U, __A); 
+}
+
+__m256i test_mm256_mask_cvtepu8_epi16(__m256i __W, __mmask32 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_mask_cvtepu8_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovzxb.w.256
+  return _mm256_mask_cvtepu8_epi16(__W, __U, __A); 
+}
+
+__m256i test_mm256_maskz_cvtepu8_epi16(__mmask16 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_cvtepu8_epi16
+  // CHECK: @llvm.x86.avx512.mask.pmovzxb.w.256
+  return _mm256_maskz_cvtepu8_epi16(__U, __A); 
+}
+
+__m256i test_mm256_sllv_epi16(__m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_sllv_epi16
+  // CHECK: @llvm.x86.avx512.mask.psllv
+  return _mm256_sllv_epi16(__A, __B); 
+}
+
+__m256i test_mm256_mask_sllv_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_mask_sllv_epi16
+  // CHECK: @llvm.x86.avx512.mask.psllv
+  return _mm256_mask_sllv_epi16(__W, __U, __A, __B); 
+}
+
+__m256i test_mm256_maskz_sllv_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_maskz_sllv_epi16
+  // CHECK: @llvm.x86.avx512.mask.psllv
+  return _mm256_maskz_sllv_epi16(__U, __A, __B); 
+}
+
+__m128i test_mm_sllv_epi16(__m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_sllv_epi16
+  // CHECK: @llvm.x86.avx512.mask.psllv
+  return _mm_sllv_epi16(__A, __B); 
+}
+
+__m128i test_mm_mask_sllv_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask_sllv_epi16
+  // CHECK: @llvm.x86.avx512.mask.psllv
+  return _mm_mask_sllv_epi16(__W, __U, __A, __B); 
+}
+
+__m128i test_mm_maskz_sllv_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_maskz_sllv_epi16
+  // CHECK: @llvm.x86.avx512.mask.psllv
+  return _mm_maskz_sllv_epi16(__U, __A, __B); 
+}
+
+__m128i test_mm_mask_sll_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask_sll_epi16
+  // CHECK: @llvm.x86.avx512.mask.psll.w.128
+  return _mm_mask_sll_epi16(__W, __U, __A, __B); 
+}
+
+__m128i test_mm_maskz_sll_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_maskz_sll_epi16
+  // CHECK: @llvm.x86.avx512.mask.psll.w.128
+  return _mm_maskz_sll_epi16(__U, __A, __B); 
+}
+
+__m256i test_mm256_mask_sll_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm256_mask_sll_epi16
+  // CHECK: @llvm.x86.avx512.mask.psll.w.256
+  return _mm256_mask_sll_epi16(__W, __U, __A, __B); 
+}
+
+__m256i test_mm256_maskz_sll_epi16(__mmask16 __U, __m256i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm256_maskz_sll_epi16
+  // CHECK: @llvm.x86.avx512.mask.psll.w.256
+  return _mm256_maskz_sll_epi16(__U, __A, __B); 
+}
+
+__m128i test_mm_mask_slli_epi16(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_slli_epi16
+  // CHECK: @llvm.x86.avx512.mask.psll.wi.128
+  return _mm_mask_slli_epi16(__W, __U, __A, 5); 
+}
+
+__m128i test_mm_maskz_slli_epi16(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_slli_epi16
+  // CHECK: @llvm.x86.avx512.mask.psll.wi.128
+  return _mm_maskz_slli_epi16(__U, __A, 5); 
+}
+
+__m256i test_mm256_mask_slli_epi16(__m256i __W, __mmask16 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_slli_epi16
+  // CHECK: @llvm.x86.avx512.mask.psll.wi.256
+  return _mm256_mask_slli_epi16(__W, __U, __A, 5); 
+}
+
+__m256i test_mm256_maskz_slli_epi16(__mmask16 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_slli_epi16
+  // CHECK: @llvm.x86.avx512.mask.psll.wi.256
+  return _mm256_maskz_slli_epi16(__U, __A, 5); 
+}
+
+__m128i test_mm_mask_mov_epi16(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_mov_epi16
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
+  return _mm_mask_mov_epi16(__W, __U, __A); 
+}
+
+__m128i test_mm_maskz_mov_epi16(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_mov_epi16
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
+  return _mm_maskz_mov_epi16(__U, __A); 
+}
+
+__m256i test_mm256_mask_mov_epi16(__m256i __W, __mmask16 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_mov_epi16
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
+  return _mm256_mask_mov_epi16(__W, __U, __A); 
+}
+
+__m256i test_mm256_maskz_mov_epi16(__mmask16 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_mov_epi16
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
+  return _mm256_maskz_mov_epi16(__U, __A); 
+}
+
+__m128i test_mm_mask_mov_epi8(__m128i __W, __mmask16 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_mov_epi8
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
+  return _mm_mask_mov_epi8(__W, __U, __A); 
+}
+
+__m128i test_mm_maskz_mov_epi8(__mmask16 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_mov_epi8
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
+  return _mm_maskz_mov_epi8(__U, __A); 
+}
+
+__m256i test_mm256_mask_mov_epi8(__m256i __W, __mmask32 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_mov_epi8
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
+  return _mm256_mask_mov_epi8(__W, __U, __A); 
+}
+
+__m256i test_mm256_maskz_mov_epi8(__mmask32 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_mov_epi8
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
+  return _mm256_maskz_mov_epi8(__U, __A); 
+}
+
+__m128i test_mm_mask_loadu_epi16(__m128i __W, __mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm_mask_loadu_epi16
+  // CHECK: @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %{{.*}}, i32 1, <8 x i1> %{{.*}}, <8 x i16> %{{.*}})
+  return _mm_mask_loadu_epi16(__W, __U, __P); 
+}
+
+__m128i test_mm_maskz_loadu_epi16(__mmask8 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm_maskz_loadu_epi16
+  // CHECK: @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %{{.*}}, i32 1, <8 x i1> %{{.*}}, <8 x i16> %{{.*}})
+  return _mm_maskz_loadu_epi16(__U, __P); 
+}
+
+__m256i test_mm256_mask_loadu_epi16(__m256i __W, __mmask16 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm256_mask_loadu_epi16
+  // CHECK: @llvm.masked.load.v16i16.p0v16i16(<16 x i16>* %{{.*}}, i32 1, <16 x i1> %{{.*}}, <16 x i16> %{{.*}})
+  return _mm256_mask_loadu_epi16(__W, __U, __P); 
+}
+
+__m256i test_mm256_maskz_loadu_epi16(__mmask16 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm256_maskz_loadu_epi16
+  // CHECK: @llvm.masked.load.v16i16.p0v16i16(<16 x i16>* %{{.*}}, i32 1, <16 x i1> %{{.*}}, <16 x i16> %{{.*}})
+  return _mm256_maskz_loadu_epi16(__U, __P); 
+}
+
+__m128i test_mm_mask_loadu_epi8(__m128i __W, __mmask16 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm_mask_loadu_epi8
+  // CHECK: @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %{{.*}}, <16 x i8> %{{.*}})
+  return _mm_mask_loadu_epi8(__W, __U, __P); 
+}
+
+__m128i test_mm_maskz_loadu_epi8(__mmask16 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm_maskz_loadu_epi8
+  // CHECK: @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %{{.*}}, <16 x i8> %{{.*}})
+  return _mm_maskz_loadu_epi8(__U, __P); 
+}
+
+__m256i test_mm256_mask_loadu_epi8(__m256i __W, __mmask32 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm256_mask_loadu_epi8
+  // CHECK: @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* %{{.*}}, i32 1, <32 x i1> %{{.*}}, <32 x i8> %{{.*}})
+  return _mm256_mask_loadu_epi8(__W, __U, __P); 
+}
+
+__m256i test_mm256_maskz_loadu_epi8(__mmask32 __U, void const *__P) {
+  // CHECK-LABEL: @test_mm256_maskz_loadu_epi8
+  // CHECK: @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* %{{.*}}, i32 1, <32 x i1> %{{.*}}, <32 x i8> %{{.*}})
+  return _mm256_maskz_loadu_epi8(__U, __P); 
+}
+
+void test_mm_mask_storeu_epi16(void *__P, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_storeu_epi16
+  // CHECK: @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %{{.*}}, <8 x i16>* %{{.*}}, i32 1, <8 x i1> %{{.*}})
+  return _mm_mask_storeu_epi16(__P, __U, __A); 
+}
+
+void test_mm256_mask_storeu_epi16(void *__P, __mmask16 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_storeu_epi16
+  // CHECK: @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %{{.*}}, <16 x i16>* %{{.*}}, i32 1, <16 x i1> %{{.*}})
+  return _mm256_mask_storeu_epi16(__P, __U, __A); 
+}
+
+void test_mm_mask_storeu_epi8(void *__P, __mmask16 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_storeu_epi8
+  // CHECK: @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %{{.*}}, <16 x i8>* %{{.*}}, i32 1, <16 x i1> %{{.*}})
+  return _mm_mask_storeu_epi8(__P, __U, __A); 
+}
+
+void test_mm256_mask_storeu_epi8(void *__P, __mmask32 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_storeu_epi8
+  // CHECK: @llvm.masked.store.v32i8.p0v32i8(<32 x i8> %{{.*}}, <32 x i8>* %{{.*}}, i32 1, <32 x i1> %{{.*}})
+  return _mm256_mask_storeu_epi8(__P, __U, __A); 
+}
+__mmask16 test_mm_test_epi8_mask(__m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_test_epi8_mask
+  // CHECK: @llvm.x86.avx512.ptestm.b.128
+  return _mm_test_epi8_mask(__A, __B); 
+}
+
+__mmask16 test_mm_mask_test_epi8_mask(__mmask16 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask_test_epi8_mask
+  // CHECK: @llvm.x86.avx512.ptestm.b.128
+  return _mm_mask_test_epi8_mask(__U, __A, __B); 
+}
+
+__mmask32 test_mm256_test_epi8_mask(__m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_test_epi8_mask
+  // CHECK: @llvm.x86.avx512.ptestm.b.256
+  return _mm256_test_epi8_mask(__A, __B); 
+}
+
+__mmask32 test_mm256_mask_test_epi8_mask(__mmask32 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_mask_test_epi8_mask
+  // CHECK: @llvm.x86.avx512.ptestm.b.256
+  return _mm256_mask_test_epi8_mask(__U, __A, __B); 
+}
+
+__mmask8 test_mm_test_epi16_mask(__m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_test_epi16_mask
+  // CHECK: @llvm.x86.avx512.ptestm.w.128
+  return _mm_test_epi16_mask(__A, __B); 
+}
+
+__mmask8 test_mm_mask_test_epi16_mask(__mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask_test_epi16_mask
+  // CHECK: @llvm.x86.avx512.ptestm.w.128
+  return _mm_mask_test_epi16_mask(__U, __A, __B); 
+}
+
+__mmask16 test_mm256_test_epi16_mask(__m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_test_epi16_mask
+  // CHECK: @llvm.x86.avx512.ptestm.w.256
+  return _mm256_test_epi16_mask(__A, __B); 
+}
+
+__mmask16 test_mm256_mask_test_epi16_mask(__mmask16 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_mask_test_epi16_mask
+  // CHECK: @llvm.x86.avx512.ptestm.w.256
+  return _mm256_mask_test_epi16_mask(__U, __A, __B); 
+}
+
+__mmask16 test_mm_testn_epi8_mask(__m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_testn_epi8_mask
+  // CHECK: @llvm.x86.avx512.ptestnm.b.128
+  return _mm_testn_epi8_mask(__A, __B); 
+}
+
+__mmask16 test_mm_mask_testn_epi8_mask(__mmask16 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask_testn_epi8_mask
+  // CHECK: @llvm.x86.avx512.ptestnm.b.128
+  return _mm_mask_testn_epi8_mask(__U, __A, __B); 
+}
+
+__mmask32 test_mm256_testn_epi8_mask(__m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_testn_epi8_mask
+  // CHECK: @llvm.x86.avx512.ptestnm.b.256
+  return _mm256_testn_epi8_mask(__A, __B); 
+}
+
+__mmask32 test_mm256_mask_testn_epi8_mask(__mmask32 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_mask_testn_epi8_mask
+  // CHECK: @llvm.x86.avx512.ptestnm.b.256
+  return _mm256_mask_testn_epi8_mask(__U, __A, __B); 
+}
+
+__mmask8 test_mm_testn_epi16_mask(__m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_testn_epi16_mask
+  // CHECK: @llvm.x86.avx512.ptestnm.w.128
+  return _mm_testn_epi16_mask(__A, __B); 
+}
+
+__mmask8 test_mm_mask_testn_epi16_mask(__mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask_testn_epi16_mask
+  // CHECK: @llvm.x86.avx512.ptestnm.w.128
+  return _mm_mask_testn_epi16_mask(__U, __A, __B); 
+}
+
+__mmask16 test_mm256_testn_epi16_mask(__m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_testn_epi16_mask
+  // CHECK: @llvm.x86.avx512.ptestnm.w.256
+  return _mm256_testn_epi16_mask(__A, __B); 
+}
+
+__mmask16 test_mm256_mask_testn_epi16_mask(__mmask16 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_mask_testn_epi16_mask
+  // CHECK: @llvm.x86.avx512.ptestnm.w.256
+  return _mm256_mask_testn_epi16_mask(__U, __A, __B); 
+}
+
+__mmask16 test_mm_movepi8_mask(__m128i __A) {
+  // CHECK-LABEL: @test_mm_movepi8_mask
+  // CHECK: @llvm.x86.avx512.cvtb2mask.128
+  return _mm_movepi8_mask(__A); 
+}
+
+__mmask32 test_mm256_movepi8_mask(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_movepi8_mask
+  // CHECK: @llvm.x86.avx512.cvtb2mask.256
+  return _mm256_movepi8_mask(__A); 
+}
+
+__m128i test_mm_movm_epi8(__mmask16 __A) {
+  // CHECK-LABEL: @test_mm_movm_epi8
+  // CHECK: @llvm.x86.avx512.cvtmask2b.128
+  return _mm_movm_epi8(__A); 
+}
+
+__m256i test_mm256_movm_epi8(__mmask32 __A) {
+  // CHECK-LABEL: @test_mm256_movm_epi8
+  // CHECK: @llvm.x86.avx512.cvtmask2b.256
+  return _mm256_movm_epi8(__A); 
+}
+
+__m128i test_mm_movm_epi16(__mmask8 __A) {
+  // CHECK-LABEL: @test_mm_movm_epi16
+  // CHECK: @llvm.x86.avx512.cvtmask2w.128
+  return _mm_movm_epi16(__A); 
+}
+
+__m256i test_mm256_movm_epi16(__mmask16 __A) {
+  // CHECK-LABEL: @test_mm256_movm_epi16
+  // CHECK: @llvm.x86.avx512.cvtmask2w.256
+  return _mm256_movm_epi16(__A); 
+}
+
+__m128i test_mm_mask_broadcastb_epi8(__m128i __O, __mmask16 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_broadcastb_epi8
+  // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> zeroinitializer
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
+  return _mm_mask_broadcastb_epi8(__O, __M, __A);
+}
+
+__m128i test_mm_maskz_broadcastb_epi8(__mmask16 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_broadcastb_epi8
+  // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> zeroinitializer
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
+  return _mm_maskz_broadcastb_epi8(__M, __A);
+}
+
+__m256i test_mm256_mask_broadcastb_epi8(__m256i __O, __mmask32 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_mask_broadcastb_epi8
+  // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <32 x i32> zeroinitializer
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
+  return _mm256_mask_broadcastb_epi8(__O, __M, __A);
+}
+
+__m256i test_mm256_maskz_broadcastb_epi8(__mmask32 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_broadcastb_epi8
+  // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <32 x i32> zeroinitializer
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
+  return _mm256_maskz_broadcastb_epi8(__M, __A);
+}
+
+__m128i test_mm_mask_broadcastw_epi16(__m128i __O, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_broadcastw_epi16
+  // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> zeroinitializer
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
+  return _mm_mask_broadcastw_epi16(__O, __M, __A);
+}
+
+__m128i test_mm_maskz_broadcastw_epi16(__mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_broadcastw_epi16
+  // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> zeroinitializer
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
+  return _mm_maskz_broadcastw_epi16(__M, __A);
+}
+
+__m256i test_mm256_mask_broadcastw_epi16(__m256i __O, __mmask16 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_mask_broadcastw_epi16
+  // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <16 x i32> zeroinitializer
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
+  return _mm256_mask_broadcastw_epi16(__O, __M, __A);
+}
+
+__m256i test_mm256_maskz_broadcastw_epi16(__mmask16 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_broadcastw_epi16
+  // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <16 x i32> zeroinitializer
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
+  return _mm256_maskz_broadcastw_epi16(__M, __A);
+}
+
+__m256i test_mm256_mask_set1_epi16(__m256i __O, __mmask16 __M, short __A) {
+  // CHECK-LABEL: @test_mm256_mask_set1_epi16
+  // CHECK: @llvm.x86.avx512.mask.pbroadcast.w.gpr.256
+  return _mm256_mask_set1_epi16(__O, __M, __A); 
+}
+
+__m256i test_mm256_maskz_set1_epi16(__mmask16 __M, short __A) {
+  // CHECK-LABEL: @test_mm256_maskz_set1_epi16
+  // CHECK: @llvm.x86.avx512.mask.pbroadcast.w.gpr.256
+  return _mm256_maskz_set1_epi16(__M, __A); 
+}
+
+__m128i test_mm_mask_set1_epi16(__m128i __O, __mmask8 __M, short __A) {
+  // CHECK-LABEL: @test_mm_mask_set1_epi16
+  // CHECK: @llvm.x86.avx512.mask.pbroadcast.w.gpr.128
+  return _mm_mask_set1_epi16(__O, __M, __A); 
+}
+
+__m128i test_mm_maskz_set1_epi16(__mmask8 __M, short __A) {
+  // CHECK-LABEL: @test_mm_maskz_set1_epi16
+  // CHECK: @llvm.x86.avx512.mask.pbroadcast.w.gpr.128
+  return _mm_maskz_set1_epi16(__M, __A); 
+}
+__m128i test_mm_permutexvar_epi16(__m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_permutexvar_epi16
+  // CHECK: @llvm.x86.avx512.mask.permvar.hi.128
+  return _mm_permutexvar_epi16(__A, __B); 
+}
+
+__m128i test_mm_maskz_permutexvar_epi16(__mmask8 __M, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_maskz_permutexvar_epi16
+  // CHECK: @llvm.x86.avx512.mask.permvar.hi.128
+  return _mm_maskz_permutexvar_epi16(__M, __A, __B); 
+}
+
+__m128i test_mm_mask_permutexvar_epi16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask_permutexvar_epi16
+  // CHECK: @llvm.x86.avx512.mask.permvar.hi.128
+  return _mm_mask_permutexvar_epi16(__W, __M, __A, __B); 
+}
+
+__m256i test_mm256_permutexvar_epi16(__m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_permutexvar_epi16
+  // CHECK: @llvm.x86.avx512.mask.permvar.hi.256
+  return _mm256_permutexvar_epi16(__A, __B); 
+}
+
+__m256i test_mm256_maskz_permutexvar_epi16(__mmask16 __M, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_maskz_permutexvar_epi16
+  // CHECK: @llvm.x86.avx512.mask.permvar.hi.256
+  return _mm256_maskz_permutexvar_epi16(__M, __A, __B); 
+}
+
+__m256i test_mm256_mask_permutexvar_epi16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_mask_permutexvar_epi16
+  // CHECK: @llvm.x86.avx512.mask.permvar.hi.256
+  return _mm256_mask_permutexvar_epi16(__W, __M, __A, __B); 
+}
+__m128i test_mm_mask_alignr_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask_alignr_epi8
+  // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
+  return _mm_mask_alignr_epi8(__W, __U, __A, __B, 2); 
+}
+
+__m128i test_mm_maskz_alignr_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_maskz_alignr_epi8
+  // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
+  return _mm_maskz_alignr_epi8(__U, __A, __B, 2); 
+}
+
+__m256i test_mm256_mask_alignr_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_mask_alignr_epi8
+  // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49>
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
+  return _mm256_mask_alignr_epi8(__W, __U, __A, __B, 2); 
+}
+
+__m256i test_mm256_maskz_alignr_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_maskz_alignr_epi8
+  // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49>
+  // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
+  return _mm256_maskz_alignr_epi8(__U, __A, __B, 2); 
+}
+
+__m128i test_mm_dbsad_epu8(__m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_dbsad_epu8
+  // CHECK: @llvm.x86.avx512.mask.dbpsadbw.128
+  return _mm_dbsad_epu8(__A, __B, 170); 
+}
+
+__m128i test_mm_mask_dbsad_epu8(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_mask_dbsad_epu8
+  // CHECK: @llvm.x86.avx512.mask.dbpsadbw.128
+  return _mm_mask_dbsad_epu8(__W, __U, __A, __B, 170); 
+}
+
+__m128i test_mm_maskz_dbsad_epu8(__mmask8 __U, __m128i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm_maskz_dbsad_epu8
+  // CHECK: @llvm.x86.avx512.mask.dbpsadbw.128
+  return _mm_maskz_dbsad_epu8(__U, __A, __B, 170); 
+}
+
+__m256i test_mm256_dbsad_epu8(__m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_dbsad_epu8
+  // CHECK: @llvm.x86.avx512.mask.dbpsadbw.256
+  return _mm256_dbsad_epu8(__A, __B, 170); 
+}
+
+__m256i test_mm256_mask_dbsad_epu8(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_mask_dbsad_epu8
+  // CHECK: @llvm.x86.avx512.mask.dbpsadbw.256
+  return _mm256_mask_dbsad_epu8(__W, __U, __A, __B, 170); 
+}
+
+__m256i test_mm256_maskz_dbsad_epu8(__mmask16 __U, __m256i __A, __m256i __B) {
+  // CHECK-LABEL: @test_mm256_maskz_dbsad_epu8
+  // CHECK: @llvm.x86.avx512.mask.dbpsadbw.256
+  return _mm256_maskz_dbsad_epu8(__U, __A, __B, 170); 
+}
+__mmask8 test_mm_movepi16_mask(__m128i __A) {
+  // CHECK-LABEL: @test_mm_movepi16_mask
+  // CHECK: @llvm.x86.avx512.cvtw2mask.128
+  return _mm_movepi16_mask(__A); 
+}
+
+__mmask16 test_mm256_movepi16_mask(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_movepi16_mask
+  // CHECK: @llvm.x86.avx512.cvtw2mask.256
+  return _mm256_movepi16_mask(__A); 
+}
+
+__m128i test_mm_mask_shufflehi_epi16(__m128i __W, __mmask32 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_shufflehi_epi16
+  // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 4, i32 4>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
+  return _mm_mask_shufflehi_epi16(__W, __U, __A, 5); 
+}
+
+__m128i test_mm_maskz_shufflehi_epi16(__mmask32 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_shufflehi_epi16
+  // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 4, i32 4>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
+  return _mm_maskz_shufflehi_epi16(__U, __A, 5); 
+}
+
+__m128i test_mm_mask_shufflelo_epi16(__m128i __W, __mmask32 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_shufflelo_epi16
+  // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> <i32 1, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
+  return _mm_mask_shufflelo_epi16(__W, __U, __A, 5); 
+}
+
+__m128i test_mm_maskz_shufflelo_epi16(__mmask32 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_shufflelo_epi16
+  // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> <i32 1, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}
+  return _mm_maskz_shufflelo_epi16(__U, __A, 5); 
+}
+
+__m256i test_mm256_mask_shufflehi_epi16(__m256i __W, __mmask32 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_shufflehi_epi16
+  // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 4, i32 4, i32 8, i32 9, i32 10, i32 11, i32 13, i32 13, i32 12, i32 12>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
+  return _mm256_mask_shufflehi_epi16(__W, __U, __A, 5); 
+}
+
+__m256i test_mm256_maskz_shufflehi_epi16(__mmask32 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_shufflehi_epi16
+  // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 5, i32 4, i32 4, i32 8, i32 9, i32 10, i32 11, i32 13, i32 13, i32 12, i32 12>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
+  return _mm256_maskz_shufflehi_epi16(__U, __A, 5); 
+}
+
+__m256i test_mm256_mask_shufflelo_epi16(__m256i __W, __mmask32 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_shufflelo_epi16
+  // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i32> <i32 1, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 9, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
+  return _mm256_mask_shufflelo_epi16(__W, __U, __A, 5); 
+}
+
+__m256i test_mm256_maskz_shufflelo_epi16(__mmask32 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_shufflelo_epi16
+  // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i32> <i32 1, i32 1, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7, i32 9, i32 9, i32 8, i32 8, i32 12, i32 13, i32 14, i32 15>
+  // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}
+  return _mm256_maskz_shufflelo_epi16(__U, __A, 5); 
+}
+
+void test_mm_mask_cvtepi16_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
+{
+ // CHECK-LABEL:@test_mm_mask_cvtepi16_storeu_epi8
+ // CHECK: @llvm.x86.avx512.mask.pmov.wb.mem.128
+ _mm_mask_cvtepi16_storeu_epi8 (__P, __M, __A);
+}
+
+void test_mm_mask_cvtsepi16_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
+{
+ // CHECK-LABEL:@test_mm_mask_cvtsepi16_storeu_epi8
+ // CHECK: @llvm.x86.avx512.mask.pmovs.wb.mem.128
+  _mm_mask_cvtsepi16_storeu_epi8 ( __P,  __M, __A);
+}
+
+void test_mm_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
+{
+ // CHECK-LABEL:@test_mm_mask_cvtusepi16_storeu_epi8
+ // CHECK: @llvm.x86.avx512.mask.pmovus.wb.mem.128
+  _mm_mask_cvtusepi16_storeu_epi8 (__P, __M, __A);
+}
+
+void test_mm256_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask16 __M, __m256i __A)
+{
+ // CHECK-LABEL:@test_mm256_mask_cvtusepi16_storeu_epi8
+ // CHECK: @llvm.x86.avx512.mask.pmovus.wb.mem.256
+  _mm256_mask_cvtusepi16_storeu_epi8 ( __P, __M, __A);
+}
+
+void test_mm256_mask_cvtepi16_storeu_epi8 (void * __P, __mmask16 __M, __m256i __A)
+{
+ // CHECK-LABEL:@test_mm256_mask_cvtepi16_storeu_epi8
+ // CHECK: @llvm.x86.avx512.mask.pmov.wb.mem.256
+  _mm256_mask_cvtepi16_storeu_epi8 ( __P,  __M, __A);
+}
+
+void test_mm256_mask_cvtsepi16_storeu_epi8 (void * __P, __mmask16 __M, __m256i __A)
+{
+ // CHECK-LABEL:@test_mm256_mask_cvtsepi16_storeu_epi8
+ // CHECK: @llvm.x86.avx512.mask.pmovs.wb.mem.256
+ _mm256_mask_cvtsepi16_storeu_epi8 ( __P, __M, __A);
+}
diff --git a/test/CodeGen/avx512vlcd-builtins.c b/test/CodeGen/avx512vlcd-builtins.c
new file mode 100644
index 0000000000000..f69da039a4fb9
--- /dev/null
+++ b/test/CodeGen/avx512vlcd-builtins.c
@@ -0,0 +1,182 @@
+// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +avx512vl -target-feature +avx512cd -emit-llvm -o - -Werror | FileCheck %s
+
+// Don't include mm_malloc.h, it's system specific.
+#define __MM_MALLOC_H
+
+#include <immintrin.h>
+
+__m128i test_mm_broadcastmb_epi64(__mmask8 __A) {
+  // CHECK-LABEL: @test_mm_broadcastmb_epi64
+  // CHECK: @llvm.x86.avx512.broadcastmb.128
+  return _mm_broadcastmb_epi64(__A); 
+}
+
+__m256i test_mm256_broadcastmb_epi64(__mmask8 __A) {
+  // CHECK-LABEL: @test_mm256_broadcastmb_epi64
+  // CHECK: @llvm.x86.avx512.broadcastmb.256
+  return _mm256_broadcastmb_epi64(__A); 
+}
+
+__m128i test_mm_broadcastmw_epi32(__mmask16 __A) {
+  // CHECK-LABEL: @test_mm_broadcastmw_epi32
+  // CHECK: @llvm.x86.avx512.broadcastmw.128
+  return _mm_broadcastmw_epi32(__A); 
+}
+
+__m256i test_mm256_broadcastmw_epi32(__mmask16 __A) {
+  // CHECK-LABEL: @test_mm256_broadcastmw_epi32
+  // CHECK: @llvm.x86.avx512.broadcastmw.256
+  return _mm256_broadcastmw_epi32(__A); 
+}
+
+__m128i test_mm_conflict_epi64(__m128i __A) {
+  // CHECK-LABEL: @test_mm_conflict_epi64
+  // CHECK: @llvm.x86.avx512.mask.conflict.q.128
+  return _mm_conflict_epi64(__A); 
+}
+
+__m128i test_mm_mask_conflict_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_conflict_epi64
+  // CHECK: @llvm.x86.avx512.mask.conflict.q.128
+  return _mm_mask_conflict_epi64(__W, __U, __A); 
+}
+
+__m128i test_mm_maskz_conflict_epi64(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_conflict_epi64
+  // CHECK: @llvm.x86.avx512.mask.conflict.q.128
+  return _mm_maskz_conflict_epi64(__U, __A); 
+}
+
+__m256i test_mm256_conflict_epi64(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_conflict_epi64
+  // CHECK: @llvm.x86.avx512.mask.conflict.q.256
+  return _mm256_conflict_epi64(__A); 
+}
+
+__m256i test_mm256_mask_conflict_epi64(__m256i __W, __mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_conflict_epi64
+  // CHECK: @llvm.x86.avx512.mask.conflict.q.256
+  return _mm256_mask_conflict_epi64(__W, __U, __A); 
+}
+
+__m256i test_mm256_maskz_conflict_epi64(__mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_conflict_epi64
+  // CHECK: @llvm.x86.avx512.mask.conflict.q.256
+  return _mm256_maskz_conflict_epi64(__U, __A); 
+}
+
+__m128i test_mm_conflict_epi32(__m128i __A) {
+  // CHECK-LABEL: @test_mm_conflict_epi32
+  // CHECK: @llvm.x86.avx512.mask.conflict.d.128
+  return _mm_conflict_epi32(__A); 
+}
+
+__m128i test_mm_mask_conflict_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_conflict_epi32
+  // CHECK: @llvm.x86.avx512.mask.conflict.d.128
+  return _mm_mask_conflict_epi32(__W, __U, __A); 
+}
+
+__m128i test_mm_maskz_conflict_epi32(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_conflict_epi32
+  // CHECK: @llvm.x86.avx512.mask.conflict.d.128
+  return _mm_maskz_conflict_epi32(__U, __A); 
+}
+
+__m256i test_mm256_conflict_epi32(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_conflict_epi32
+  // CHECK: @llvm.x86.avx512.mask.conflict.d.256
+  return _mm256_conflict_epi32(__A); 
+}
+
+__m256i test_mm256_mask_conflict_epi32(__m256i __W, __mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_conflict_epi32
+  // CHECK: @llvm.x86.avx512.mask.conflict.d.256
+  return _mm256_mask_conflict_epi32(__W, __U, __A); 
+}
+
+__m256i test_mm256_maskz_conflict_epi32(__mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_conflict_epi32
+  // CHECK: @llvm.x86.avx512.mask.conflict.d.256
+  return _mm256_maskz_conflict_epi32(__U, __A); 
+}
+
+__m128i test_mm_lzcnt_epi32(__m128i __A) {
+  // CHECK-LABEL: @test_mm_lzcnt_epi32
+  // CHECK: call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %{{.*}}, i1 false)
+  return _mm_lzcnt_epi32(__A); 
+}
+
+__m128i test_mm_mask_lzcnt_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_lzcnt_epi32
+  // CHECK: call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %{{.*}}, i1 false)
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+  return _mm_mask_lzcnt_epi32(__W, __U, __A); 
+}
+
+__m128i test_mm_maskz_lzcnt_epi32(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_lzcnt_epi32
+  // CHECK: call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %{{.*}}, i1 false)
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
+  return _mm_maskz_lzcnt_epi32(__U, __A); 
+}
+
+__m256i test_mm256_lzcnt_epi32(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_lzcnt_epi32
+  // CHECK: call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %{{.*}}, i1 false)
+  return _mm256_lzcnt_epi32(__A); 
+}
+
+__m256i test_mm256_mask_lzcnt_epi32(__m256i __W, __mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_lzcnt_epi32
+  // CHECK: call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %{{.*}}, i1 false)
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+  return _mm256_mask_lzcnt_epi32(__W, __U, __A); 
+}
+
+__m256i test_mm256_maskz_lzcnt_epi32(__mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_lzcnt_epi32
+  // CHECK: call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %{{.*}}, i1 false)
+  // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
+  return _mm256_maskz_lzcnt_epi32(__U, __A); 
+}
+
+__m128i test_mm_lzcnt_epi64(__m128i __A) {
+  // CHECK-LABEL: @test_mm_lzcnt_epi64
+  // CHECK: call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %{{.*}}, i1 false)
+  return _mm_lzcnt_epi64(__A); 
+}
+
+__m128i test_mm_mask_lzcnt_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_lzcnt_epi64
+  // CHECK: call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %{{.*}}, i1 false)
+  // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
+  return _mm_mask_lzcnt_epi64(__W, __U, __A); 
+}
+
+__m128i test_mm_maskz_lzcnt_epi64(__mmask8 __U, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_lzcnt_epi64
+  // CHECK: call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %{{.*}}, i1 false)
+  // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}}
+  return _mm_maskz_lzcnt_epi64(__U, __A); 
+}
+
+__m256i test_mm256_lzcnt_epi64(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_lzcnt_epi64
+  // CHECK: call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %{{.*}}, i1 false)
+  return _mm256_lzcnt_epi64(__A); 
+}
+
+__m256i test_mm256_mask_lzcnt_epi64(__m256i __W, __mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_lzcnt_epi64
+  // CHECK: call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %{{.*}}, i1 false)
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
+  return _mm256_mask_lzcnt_epi64(__W, __U, __A); 
+}
+
+__m256i test_mm256_maskz_lzcnt_epi64(__mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_lzcnt_epi64
+  // CHECK: call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %{{.*}}, i1 false)
+  // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
+  return _mm256_maskz_lzcnt_epi64(__U, __A); 
+}
diff --git a/test/CodeGen/avx512vldq-builtins.c b/test/CodeGen/avx512vldq-builtins.c
index 69bdc7a110455..fa1e0623e1937 100644
--- a/test/CodeGen/avx512vldq-builtins.c
+++ b/test/CodeGen/avx512vldq-builtins.c
@@ -808,3 +808,262 @@ __m256 test_mm256_maskz_reduce_ps(__mmask8 __U, __m256 __A) {
   // CHECK: @llvm.x86.avx512.mask.reduce.ps.256
   return _mm256_maskz_reduce_ps(__U, __A, 4); 
 }
+
+__mmask8 test_mm_movepi32_mask(__m128i __A) {
+  // CHECK-LABEL: @test_mm_movepi32_mask
+  // CHECK: @llvm.x86.avx512.cvtd2mask.128
+  return _mm_movepi32_mask(__A); 
+}
+
+__mmask8 test_mm256_movepi32_mask(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_movepi32_mask
+  // CHECK: @llvm.x86.avx512.cvtd2mask.256
+  return _mm256_movepi32_mask(__A); 
+}
+
+__m128i test_mm_movm_epi32(__mmask8 __A) {
+  // CHECK-LABEL: @test_mm_movm_epi32
+  // CHECK: @llvm.x86.avx512.cvtmask2d.128
+  return _mm_movm_epi32(__A); 
+}
+
+__m256i test_mm256_movm_epi32(__mmask8 __A) {
+  // CHECK-LABEL: @test_mm256_movm_epi32
+  // CHECK: @llvm.x86.avx512.cvtmask2d.256
+  return _mm256_movm_epi32(__A); 
+}
+
+__m128i test_mm_movm_epi64(__mmask8 __A) {
+  // CHECK-LABEL: @test_mm_movm_epi64
+  // CHECK: @llvm.x86.avx512.cvtmask2q.128
+  return _mm_movm_epi64(__A); 
+}
+
+__m256i test_mm256_movm_epi64(__mmask8 __A) {
+  // CHECK-LABEL: @test_mm256_movm_epi64
+  // CHECK: @llvm.x86.avx512.cvtmask2q.256
+  return _mm256_movm_epi64(__A); 
+}
+
+__mmask8 test_mm_movepi64_mask(__m128i __A) {
+  // CHECK-LABEL: @test_mm_movepi64_mask
+  // CHECK: @llvm.x86.avx512.cvtq2mask.128
+  return _mm_movepi64_mask(__A); 
+}
+
+__mmask8 test_mm256_movepi64_mask(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_movepi64_mask
+  // CHECK: @llvm.x86.avx512.cvtq2mask.256
+  return _mm256_movepi64_mask(__A); 
+}
+
+
+__m256 test_mm256_broadcast_f32x2(__m128 __A) {
+  // CHECK-LABEL: @test_mm256_broadcast_f32x2
+  // CHECK: @llvm.x86.avx512.mask.broadcastf32x2
+  return _mm256_broadcast_f32x2(__A); 
+}
+
+__m256 test_mm256_mask_broadcast_f32x2(__m256 __O, __mmask8 __M, __m128 __A) {
+  // CHECK-LABEL: @test_mm256_mask_broadcast_f32x2
+  // CHECK: @llvm.x86.avx512.mask.broadcastf32x2
+  return _mm256_mask_broadcast_f32x2(__O, __M, __A); 
+}
+
+__m256 test_mm256_maskz_broadcast_f32x2(__mmask8 __M, __m128 __A) {
+  // CHECK-LABEL: @test_mm256_maskz_broadcast_f32x2
+  // CHECK: @llvm.x86.avx512.mask.broadcastf32x2
+  return _mm256_maskz_broadcast_f32x2(__M, __A); 
+}
+
+__m256d test_mm256_broadcast_f64x2(__m128d __A) {
+  // CHECK-LABEL: @test_mm256_broadcast_f64x2
+  // CHECK: @llvm.x86.avx512.mask.broadcastf64x2
+  return _mm256_broadcast_f64x2(__A); 
+}
+
+__m256d test_mm256_mask_broadcast_f64x2(__m256d __O, __mmask8 __M, __m128d __A) {
+  // CHECK-LABEL: @test_mm256_mask_broadcast_f64x2
+  // CHECK: @llvm.x86.avx512.mask.broadcastf64x2
+  return _mm256_mask_broadcast_f64x2(__O, __M, __A); 
+}
+
+__m256d test_mm256_maskz_broadcast_f64x2(__mmask8 __M, __m128d __A) {
+  // CHECK-LABEL: @test_mm256_maskz_broadcast_f64x2
+  // CHECK: @llvm.x86.avx512.mask.broadcastf64x2
+  return _mm256_maskz_broadcast_f64x2(__M, __A); 
+}
+
+__m128i test_mm_broadcast_i32x2(__m128i __A) {
+  // CHECK-LABEL: @test_mm_broadcast_i32x2
+  // CHECK: @llvm.x86.avx512.mask.broadcasti32x2
+  return _mm_broadcast_i32x2(__A); 
+}
+
+__m128i test_mm_mask_broadcast_i32x2(__m128i __O, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_mask_broadcast_i32x2
+  // CHECK: @llvm.x86.avx512.mask.broadcasti32x2
+  return _mm_mask_broadcast_i32x2(__O, __M, __A); 
+}
+
+__m128i test_mm_maskz_broadcast_i32x2(__mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm_maskz_broadcast_i32x2
+  // CHECK: @llvm.x86.avx512.mask.broadcasti32x2
+  return _mm_maskz_broadcast_i32x2(__M, __A); 
+}
+
+__m256i test_mm256_broadcast_i32x2(__m128i __A) {
+  // CHECK-LABEL: @test_mm256_broadcast_i32x2
+  // CHECK: @llvm.x86.avx512.mask.broadcasti32x2
+  return _mm256_broadcast_i32x2(__A); 
+}
+
+__m256i test_mm256_mask_broadcast_i32x2(__m256i __O, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_mask_broadcast_i32x2
+  // CHECK: @llvm.x86.avx512.mask.broadcasti32x2
+  return _mm256_mask_broadcast_i32x2(__O, __M, __A); 
+}
+
+__m256i test_mm256_maskz_broadcast_i32x2(__mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_broadcast_i32x2
+  // CHECK: @llvm.x86.avx512.mask.broadcasti32x2
+  return _mm256_maskz_broadcast_i32x2(__M, __A); 
+}
+
+__m256i test_mm256_broadcast_i64x2(__m128i __A) {
+  // CHECK-LABEL: @test_mm256_broadcast_i64x2
+  // CHECK: @llvm.x86.avx512.mask.broadcasti64x2
+  return _mm256_broadcast_i64x2(__A); 
+}
+
+__m256i test_mm256_mask_broadcast_i64x2(__m256i __O, __mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_mask_broadcast_i64x2
+  // CHECK: @llvm.x86.avx512.mask.broadcasti64x2
+  return _mm256_mask_broadcast_i64x2(__O, __M, __A); 
+}
+
+__m256i test_mm256_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_broadcast_i64x2
+  // CHECK: @llvm.x86.avx512.mask.broadcasti64x2
+  return _mm256_maskz_broadcast_i64x2(__M, __A); 
+}
+
+__m128d test_mm256_extractf64x2_pd(__m256d __A) {
+  // CHECK-LABEL: @test_mm256_extractf64x2_pd
+  // CHECK: @llvm.x86.avx512.mask.vextractf64x2
+  return _mm256_extractf64x2_pd(__A, 1); 
+}
+
+__m128d test_mm256_mask_extractf64x2_pd(__m128d __W, __mmask8 __U, __m256d __A) {
+  // CHECK-LABEL: @test_mm256_mask_extractf64x2_pd
+  // CHECK: @llvm.x86.avx512.mask.vextractf64x2
+  return _mm256_mask_extractf64x2_pd(__W, __U, __A, 1); 
+}
+
+__m128d test_mm256_maskz_extractf64x2_pd(__mmask8 __U, __m256d __A) {
+  // CHECK-LABEL: @test_mm256_maskz_extractf64x2_pd
+  // CHECK: @llvm.x86.avx512.mask.vextractf64x2
+  return _mm256_maskz_extractf64x2_pd(__U, __A, 1); 
+}
+
+__m128i test_mm256_extracti64x2_epi64(__m256i __A) {
+  // CHECK-LABEL: @test_mm256_extracti64x2_epi64
+  // CHECK: @llvm.x86.avx512.mask.vextracti64x2
+  return _mm256_extracti64x2_epi64(__A, 1); 
+}
+
+__m128i test_mm256_mask_extracti64x2_epi64(__m128i __W, __mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_mask_extracti64x2_epi64
+  // CHECK: @llvm.x86.avx512.mask.vextracti64x2
+  return _mm256_mask_extracti64x2_epi64(__W, __U, __A, 1); 
+}
+
+__m128i test_mm256_maskz_extracti64x2_epi64(__mmask8 __U, __m256i __A) {
+  // CHECK-LABEL: @test_mm256_maskz_extracti64x2_epi64
+  // CHECK: @llvm.x86.avx512.mask.vextracti64x2
+  return _mm256_maskz_extracti64x2_epi64(__U, __A, 1); 
+}
+
+__m256d test_mm256_insertf64x2(__m256d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm256_insertf64x2
+  // CHECK: @llvm.x86.avx512.mask.insertf64x2
+  return _mm256_insertf64x2(__A, __B, 1); 
+}
+
+__m256d test_mm256_mask_insertf64x2(__m256d __W, __mmask8 __U, __m256d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm256_mask_insertf64x2
+  // CHECK: @llvm.x86.avx512.mask.insertf64x2
+  return _mm256_mask_insertf64x2(__W, __U, __A, __B, 1); 
+}
+
+__m256d test_mm256_maskz_insertf64x2(__mmask8 __U, __m256d __A, __m128d __B) {
+  // CHECK-LABEL: @test_mm256_maskz_insertf64x2
+  // CHECK: @llvm.x86.avx512.mask.insertf64x2
+  return _mm256_maskz_insertf64x2(__U, __A, __B, 1); 
+}
+
+__m256i test_mm256_inserti64x2(__m256i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm256_inserti64x2
+  // CHECK: @llvm.x86.avx512.mask.inserti64x2
+  return _mm256_inserti64x2(__A, __B, 1); 
+}
+
+__m256i test_mm256_mask_inserti64x2(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm256_mask_inserti64x2
+  // CHECK: @llvm.x86.avx512.mask.inserti64x2
+  return _mm256_mask_inserti64x2(__W, __U, __A, __B, 1); 
+}
+
+__m256i test_mm256_maskz_inserti64x2(__mmask8 __U, __m256i __A, __m128i __B) {
+  // CHECK-LABEL: @test_mm256_maskz_inserti64x2
+  // CHECK: @llvm.x86.avx512.mask.inserti64x2
+  return _mm256_maskz_inserti64x2(__U, __A, __B, 1); 
+}
+
+__mmask8 test_mm_mask_fpclass_pd_mask(__mmask8 __U, __m128d __A) {
+  // CHECK-LABEL: @test_mm_mask_fpclass_pd_mask
+  // CHECK: @llvm.x86.avx512.mask.fpclass.pd.128
+  return _mm_mask_fpclass_pd_mask(__U, __A, 2); 
+}
+
+__mmask8 test_mm_fpclass_pd_mask(__m128d __A) {
+  // CHECK-LABEL: @test_mm_fpclass_pd_mask
+  // CHECK: @llvm.x86.avx512.mask.fpclass.pd.128
+  return _mm_fpclass_pd_mask(__A, 2); 
+}
+
+__mmask8 test_mm256_mask_fpclass_pd_mask(__mmask8 __U, __m256d __A) {
+  // CHECK-LABEL: @test_mm256_mask_fpclass_pd_mask
+  // CHECK: @llvm.x86.avx512.mask.fpclass.pd.256
+  return _mm256_mask_fpclass_pd_mask(__U, __A, 2); 
+}
+
+__mmask8 test_mm256_fpclass_pd_mask(__m256d __A) {
+  // CHECK-LABEL: @test_mm256_fpclass_pd_mask
+  // CHECK: @llvm.x86.avx512.mask.fpclass.pd.256
+  return _mm256_fpclass_pd_mask(__A, 2); 
+}
+
+__mmask8 test_mm_mask_fpclass_ps_mask(__mmask8 __U, __m128 __A) {
+  // CHECK-LABEL: @test_mm_mask_fpclass_ps_mask
+  // CHECK: @llvm.x86.avx512.mask.fpclass.ps.128
+  return _mm_mask_fpclass_ps_mask(__U, __A, 2); 
+}
+
+__mmask8 test_mm_fpclass_ps_mask(__m128 __A) {
+  // CHECK-LABEL: @test_mm_fpclass_ps_mask
+  // CHECK: @llvm.x86.avx512.mask.fpclass.ps.128
+  return _mm_fpclass_ps_mask(__A, 2); 
+}
+
+__mmask8 test_mm256_mask_fpclass_ps_mask(__mmask8 __U, __m256 __A) {
+  // CHECK-LABEL: @test_mm256_mask_fpclass_ps_mask
+  // CHECK: @llvm.x86.avx512.mask.fpclass.ps.256
+  return _mm256_mask_fpclass_ps_mask(__U, __A, 2); 
+}
+
+__mmask8 test_mm256_fpclass_ps_mask(__m256 __A) {
+  // CHECK-LABEL: @test_mm256_fpclass_ps_mask
+  // CHECK: @llvm.x86.avx512.mask.fpclass.ps.256
+  return _mm256_fpclass_ps_mask(__A, 2); 
+}
diff --git a/test/CodeGen/backend-unsupported-error.ll b/test/CodeGen/backend-unsupported-error.ll
new file mode 100644
index 0000000000000..1a15bfc74dfb7
--- /dev/null
+++ b/test/CodeGen/backend-unsupported-error.ll
@@ -0,0 +1,44 @@
+; RUN: not %clang_cc1 -triple r600-unknown-unknown -S -o - %s 2>&1 | FileCheck %s
+; REQUIRES: amdgpu-registered-target
+
+; This is to check that backend errors for unsupported features are formatted correctly
+
+; CHECK: error: test.c:2:20: in function bar i32 (): unsupported call to function foo.2
+
+target triple = "r600-unknown-unknown"
+
+; Function Attrs: nounwind uwtable
+define i32 @bar() #0 !dbg !4 {
+entry:
+  %call = call i32 @foo(), !dbg !12
+  ret i32 %call, !dbg !13
+}
+
+; Function Attrs: nounwind uwtable
+define i32 @foo() #0 !dbg !8 {
+entry:
+  %call = call i32 @bar(), !dbg !14
+  ret i32 %call, !dbg !15
+}
+
+attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!9, !10}
+!llvm.ident = !{!11}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.9.0", isOptimized: false, runtimeVersion: 0, emissionKind: 1, enums: !2)
+!1 = !DIFile(filename: "test.c", directory: "")
+!2 = !{}
+!4 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 2, type: !5, isLocal: false, isDefinition: true, scopeLine: 2, isOptimized: false, unit: !0, variables: !2)
+!5 = !DISubroutineType(types: !6)
+!6 = !{!7}
+!7 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+!8 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 3, type: !5, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
+!9 = !{i32 2, !"Dwarf Version", i32 4}
+!10 = !{i32 2, !"Debug Info Version", i32 3}
+!11 = !{!"clang version 3.9.0"}
+!12 = !DILocation(line: 2, column: 20, scope: !4)
+!13 = !DILocation(line: 2, column: 13, scope: !4)
+!14 = !DILocation(line: 3, column: 20, scope: !8)
+!15 = !DILocation(line: 3, column: 13, scope: !8)
diff --git a/test/CodeGen/bitscan-builtins.c b/test/CodeGen/bitscan-builtins.c
new file mode 100644
index 0000000000000..ae817e8157491
--- /dev/null
+++ b/test/CodeGen/bitscan-builtins.c
@@ -0,0 +1,19 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s | FileCheck %s
+
+// Don't include mm_malloc.h, it's system specific.
+#define __MM_MALLOC_H
+#include <immintrin.h>
+
+int test_bit_scan_forward(int a) {
+  return _bit_scan_forward(a);
+// CHECK: @test_bit_scan_forward
+// CHECK: %[[call:.*]] = call i32 @llvm.cttz.i32(
+// CHECK: ret i32 %[[call]]
+}
+
+int test_bit_scan_reverse(int a) {
+  return _bit_scan_reverse(a);
+// CHECK:  %[[call:.*]] = call i32 @llvm.ctlz.i32(
+// CHECK:  %[[sub:.*]] = sub nsw i32 31, %[[call]]
+// CHECK: ret i32 %[[sub]]
+}
diff --git a/test/CodeGen/blocks-opencl.cl b/test/CodeGen/blocks-opencl.cl
index d3562988b738a..61c479b7b98ec 100644
--- a/test/CodeGen/blocks-opencl.cl
+++ b/test/CodeGen/blocks-opencl.cl
@@ -2,15 +2,16 @@
 // This used to crash due to trying to generate a bitcase from a cstring
 // in the constant address space to i8* in AS0.
 
-void dummy(float (^op)(float))
-{
+void dummy(float (^const op)(float)) {
 }
 
 // CHECK: i8 addrspace(3)* getelementptr inbounds ([9 x i8], [9 x i8] addrspace(3)* @.str, i32 0, i32 0)
 
 kernel void test_block()
 {
-  float (^X)(float) = ^(float x) { return x + 42.0f; };
+  float (^const X)(float) = ^(float x) {
+    return x + 42.0f;
+  };
   dummy(X);
 }
 
diff --git a/test/CodeGen/blocks-windows.c b/test/CodeGen/blocks-windows.c
new file mode 100644
index 0000000000000..ced00ef015fce
--- /dev/null
+++ b/test/CodeGen/blocks-windows.c
@@ -0,0 +1,76 @@
+// RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -DBLOCKS_IN_BLOCKS_DECL -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-IN-BLOCKS-DECL
+// RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -DBLOCKS_IN_BLOCKS_DEFN -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-IN-BLOCKS-DEFN
+// RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS
+// RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -DBLOCKS_NOT_IN_BLOCKS_EXTERN -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN
+// RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -DBLCOKS_NOT_IN_BLOCKS_EXTERN_DLLIMPORT -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN-DLLIMPORT
+// RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -DBLCOKS_NOT_IN_BLOCKS_DLLIMPORT -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-DLLIMPORT
+
+// RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -DBLOCKS_IN_BLOCKS_DECL -Os -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-IN-BLOCKS-DECL
+// RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -DBLOCKS_IN_BLOCKS_DEFN -Os -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-IN-BLOCKS-DEFN
+// RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -Os -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS
+// RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -DBLOCKS_NOT_IN_BLOCKS_EXTERN -Os -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN
+// RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -DBLCOKS_NOT_IN_BLOCKS_EXTERN_DLLIMPORT -Os -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN-DLLIMPORT
+// RUN: %clang_cc1 -triple thumbv7-windows -fblocks -fdeclspec -DBLCOKS_NOT_IN_BLOCKS_DLLIMPORT -Os -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-DLLIMPORT
+
+// RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -DBLOCKS_IN_BLOCKS_DECL -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-IN-BLOCKS-DECL
+// RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -DBLOCKS_IN_BLOCKS_DEFN -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-IN-BLOCKS-DEFN
+// RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS
+// RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -DBLOCKS_NOT_IN_BLOCKS_EXTERN -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN
+// RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -DBLCOKS_NOT_IN_BLOCKS_EXTERN_DLLIMPORT -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN-DLLIMPORT
+// RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -DBLCOKS_NOT_IN_BLOCKS_DLLIMPORT -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-DLLIMPORT
+
+// RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -DBLOCKS_IN_BLOCKS_DECL -Os -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-IN-BLOCKS-DECL
+// RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -DBLOCKS_IN_BLOCKS_DEFN -Os -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-IN-BLOCKS-DEFN
+// RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -Os -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS
+// RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -DBLOCKS_NOT_IN_BLOCKS_EXTERN -Os -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN
+// RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -DBLCOKS_NOT_IN_BLOCKS_EXTERN_DLLIMPORT -Os -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN-DLLIMPORT
+// RUN: %clang_cc1 -triple i686-windows -fblocks -fdeclspec -DBLCOKS_NOT_IN_BLOCKS_DLLIMPORT -Os -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-DLLIMPORT
+
+// RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -DBLOCKS_IN_BLOCKS_DECL -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-IN-BLOCKS-DECL
+// RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -DBLOCKS_IN_BLOCKS_DEFN -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-IN-BLOCKS-DEFN
+// RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS
+// RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -DBLOCKS_NOT_IN_BLOCKS_EXTERN -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN
+// RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -DBLCOKS_NOT_IN_BLOCKS_EXTERN_DLLIMPORT -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN-DLLIMPORT
+// RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -DBLCOKS_NOT_IN_BLOCKS_DLLIMPORT -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-DLLIMPORT
+
+// RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -DBLOCKS_IN_BLOCKS_DECL -Os -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-IN-BLOCKS-DECL
+// RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -DBLOCKS_IN_BLOCKS_DEFN -Os -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-IN-BLOCKS-DEFN
+// RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -Os -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS
+// RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -DBLOCKS_NOT_IN_BLOCKS_EXTERN -Os -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN
+// RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -DBLCOKS_NOT_IN_BLOCKS_EXTERN_DLLIMPORT -Os -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN-DLLIMPORT
+// RUN: %clang_cc1 -triple x86_64-windows -fblocks -fdeclspec -DBLCOKS_NOT_IN_BLOCKS_DLLIMPORT -Os -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-BLOCKS-NOT-IN-BLOCKS-DLLIMPORT
+
+void *_Block_copy(void *);
+
+#if defined(BLOCKS_IN_BLOCKS_DECL)
+extern __declspec(dllexport) long _NSConcreteStackBlock[];
+#endif
+
+#if defined(BLOCKS_IN_BLOCKS_DEFN)
+__declspec(dllexport) long _NSConcreteStackBlock[5];
+#endif
+
+#if defined(BLOCKS_NOT_IN_BLOCKS_EXTERN)
+extern long _NSConcreteStackBlock[];
+#endif
+
+#if defined(BLOCKS_NOT_IN_BLOCKS_EXTERN_DLLIMPORT)
+extern __declspec(dllimport) long _NSConcreteStackBlock[];
+#endif
+
+#if defined(BLOCKS_NOT_IN_BLOCKS_DLLIMPORT)
+__declspec(dllimport) long _NSConcreteStackBlock[];
+#endif
+
+int (*g(void))(void) {
+  __block int i;
+  return _Block_copy(^{ ++i; return i; });
+}
+
+// CHECK-BLOCKS-IN-BLOCKS-DECL: @_NSConcreteStackBlock = external dllexport global i8*
+// CHECK-BLOCKS-IN-BLOCKS-DEFN: @_NSConcreteStackBlock = common dllexport global [5 x i32]
+// CHECK-BLOCKS-NOT-IN-BLOCKS: @_NSConcreteStackBlock = external dllimport global i8*
+// CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN: @_NSConcreteStackBlock = external dllimport global i8*
+// CHECK-BLOCKS-NOT-IN-BLOCKS-EXTERN-DLLIMPORT: @_NSConcreteStackBlock = external dllimport global i8*
+// CHECK-BLOCKS-NOT-IN-BLOCKS-DLLIMPORT: @_NSConcreteStackBlock = external dllimport global i8*
+
diff --git a/test/CodeGen/bmi-builtins.c b/test/CodeGen/bmi-builtins.c
index 92332e3a1265d..b9e22f9b20f50 100644
--- a/test/CodeGen/bmi-builtins.c
+++ b/test/CodeGen/bmi-builtins.c
@@ -1,164 +1,223 @@
-// RUN: %clang_cc1 %s -O3 -triple=x86_64-apple-darwin -target-feature +bmi -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +bmi -emit-llvm -o - -Werror | FileCheck %s
 
 // Don't include mm_malloc.h, it's system specific.
 #define __MM_MALLOC_H
 
 #include <x86intrin.h>
 
-// The double underscore intrinsics are for compatibility with 
+// NOTE: This should match the tests in llvm/test/CodeGen/X86/bmi-intrinsics-fast-isel.ll
+
+// The double underscore intrinsics are for compatibility with
 // AMD's BMI interface. The single underscore intrinsics
 // are for compatibility with Intel's BMI interface.
 // Apart from the underscores, the interfaces are identical
-// except in one case: although the 'bextr' register-form 
-// instruction is identical in hardware, the AMD and Intel 
-// intrinsics are different! 
+// except in one case: although the 'bextr' register-form
+// instruction is identical in hardware, the AMD and Intel
+// intrinsics are different!
 
 unsigned short test__tzcnt_u16(unsigned short __X) {
-  // CHECK: @llvm.cttz.i16
+  // CHECK-LABEL: test__tzcnt_u16
+  // CHECK: zext i16 %{{.*}} to i32
+  // CHECK: icmp ne i32 %{{.*}}, 0
+  // CHECK: i16 @llvm.cttz.i16(i16 %{{.*}}, i1 true)
   return __tzcnt_u16(__X);
 }
 
 unsigned int test__andn_u32(unsigned int __X, unsigned int __Y) {
-  // CHECK: [[DEST:%.*]] = xor i32 %{{.*}}, -1
-  // CHECK-NEXT: %{{.*}} = and i32 %{{.*}}, [[DEST]]
+  // CHECK-LABEL: test__andn_u32
+  // CHECK: xor i32 %{{.*}}, -1
+  // CHECK: and i32 %{{.*}}, %{{.*}}
   return __andn_u32(__X, __Y);
 }
 
 unsigned int test__bextr_u32(unsigned int __X, unsigned int __Y) {
-  // CHECK: @llvm.x86.bmi.bextr.32
+  // CHECK-LABEL: test__bextr_u32
+  // CHECK: i32 @llvm.x86.bmi.bextr.32(i32 %{{.*}}, i32 %{{.*}})
   return __bextr_u32(__X, __Y);
 }
 
 unsigned int test__blsi_u32(unsigned int __X) {
-  // CHECK: [[DEST:%.*]] = sub i32 0, [[SRC:%.*]]
-  // CHECK-NEXT: %{{.*}} = and i32 [[SRC]], [[DEST]]
+  // CHECK-LABEL: test__blsi_u32
+  // CHECK: sub i32 0, %{{.*}}
+  // CHECK: and i32 %{{.*}}, %{{.*}}
   return __blsi_u32(__X);
 }
 
 unsigned int test__blsmsk_u32(unsigned int __X) {
-  // CHECK: [[DEST:%.*]] = add i32 [[SRC:%.*]], -1
-  // CHECK-NEXT: %{{.*}} = xor i32 [[DEST]], [[SRC]]
+  // CHECK-LABEL: test__blsmsk_u32
+  // CHECK: sub i32 %{{.*}}, 1
+  // CHECK: xor i32 %{{.*}}, %{{.*}}
   return __blsmsk_u32(__X);
 }
 
 unsigned int test__blsr_u32(unsigned int __X) {
-  // CHECK: [[DEST:%.*]] = add i32 [[SRC:%.*]], -1
-  // CHECK-NEXT: %{{.*}} = and i32 [[DEST]], [[SRC]]
+  // CHECK-LABEL: test__blsr_u32
+  // CHECK: sub i32 %{{.*}}, 1
+  // CHECK: and i32 %{{.*}}, %{{.*}}
   return __blsr_u32(__X);
 }
 
 unsigned int test__tzcnt_u32(unsigned int __X) {
-  // CHECK: @llvm.cttz.i32
+  // CHECK-LABEL: test__tzcnt_u32
+  // CHECK: icmp ne i32 %{{.*}}, 0
+  // CHECK: i32 @llvm.cttz.i32(i32 %{{.*}}, i1 true)
   return __tzcnt_u32(__X);
 }
 
+int test_mm_tzcnt_32(unsigned int __X) {
+  // CHECK-LABEL: test_mm_tzcnt_32
+  // CHECK: icmp ne i32 %{{.*}}, 0
+  // CHECK: i32 @llvm.cttz.i32(i32 %{{.*}}, i1 true)
+  return _mm_tzcnt_32(__X);
+}
+
 unsigned long long test__andn_u64(unsigned long __X, unsigned long __Y) {
-  // CHECK: [[DEST:%.*]] = xor i64 %{{.*}}, -1
-  // CHECK-NEXT: %{{.*}} = and i64 %{{.*}}, [[DEST]]
+  // CHECK-LABEL: test__andn_u64
+  // CHECK: xor i64 %{{.*}}, -1
+  // CHECK: and i64 %{{.*}}, %{{.*}}
   return __andn_u64(__X, __Y);
 }
 
 unsigned long long test__bextr_u64(unsigned long __X, unsigned long __Y) {
-  // CHECK: @llvm.x86.bmi.bextr.64
+  // CHECK-LABEL: test__bextr_u64
+  // CHECK: i64 @llvm.x86.bmi.bextr.64(i64 %{{.*}}, i64 %{{.*}})
   return __bextr_u64(__X, __Y);
 }
 
 unsigned long long test__blsi_u64(unsigned long long __X) {
-  // CHECK: [[DEST:%.*]] = sub i64 0, [[SRC:%.*]]
-  // CHECK-NEXT: %{{.*}} = and i64 [[SRC]], [[DEST]]
+  // CHECK-LABEL: test__blsi_u64
+  // CHECK: sub i64 0, %{{.*}}
+  // CHECK: and i64 %{{.*}}, %{{.*}}
   return __blsi_u64(__X);
 }
 
 unsigned long long test__blsmsk_u64(unsigned long long __X) {
-  // CHECK: [[DEST:%.*]] = add i64 [[SRC:%.*]], -1
-  // CHECK-NEXT: %{{.*}} = xor i64 [[DEST]], [[SRC]]
+  // CHECK-LABEL: test__blsmsk_u64
+  // CHECK: sub i64 %{{.*}}, 1
+  // CHECK: xor i64 %{{.*}}, %{{.*}}
   return __blsmsk_u64(__X);
 }
 
 unsigned long long test__blsr_u64(unsigned long long __X) {
-  // CHECK: [[DEST:%.*]] = add i64 [[SRC:%.*]], -1
-  // CHECK-NEXT: %{{.*}} = and i64 [[DEST]], [[SRC]]
+  // CHECK-LABEL: test__blsr_u64
+  // CHECK: sub i64 %{{.*}}, 1
+  // CHECK: and i64 %{{.*}}, %{{.*}}
   return __blsr_u64(__X);
 }
 
 unsigned long long test__tzcnt_u64(unsigned long long __X) {
-  // CHECK: @llvm.cttz.i64
+  // CHECK-LABEL: test__tzcnt_u64
+  // CHECK: icmp ne i64 %{{.*}}, 0
+  // CHECK: i64 @llvm.cttz.i64(i64 %{{.*}}, i1 true)
   return __tzcnt_u64(__X);
 }
 
+long long test_mm_tzcnt_64(unsigned long long __X) {
+  // CHECK-LABEL: test_mm_tzcnt_64
+  // CHECK: icmp ne i64 %{{.*}}, 0
+  // CHECK: i64 @llvm.cttz.i64(i64 %{{.*}}, i1 true)
+  return _mm_tzcnt_64(__X);
+}
+
 // Intel intrinsics
 
 unsigned short test_tzcnt_u16(unsigned short __X) {
-  // CHECK: @llvm.cttz.i16
+  // CHECK-LABEL: test_tzcnt_u16
+  // CHECK: zext i16 %{{.*}} to i32
+  // CHECK: icmp ne i32 %{{.*}}, 0
+  // CHECK: i16 @llvm.cttz.i16(i16 %{{.*}}, i1 true)
   return _tzcnt_u16(__X);
 }
 
 unsigned int test_andn_u32(unsigned int __X, unsigned int __Y) {
-  // CHECK: [[DEST:%.*]] = xor i32 %{{.*}}, -1
-  // CHECK-NEXT: %{{.*}} = and i32 %{{.*}}, [[DEST]]
+  // CHECK-LABEL: test_andn_u32
+  // CHECK: xor i32 %{{.*}}, -1
+  // CHECK: and i32 %{{.*}}, %{{.*}}
   return _andn_u32(__X, __Y);
 }
 
-unsigned int test_bextr_u32(unsigned int __X, unsigned int __Y, 
+unsigned int test_bextr_u32(unsigned int __X, unsigned int __Y,
                             unsigned int __Z) {
-  // CHECK: @llvm.x86.bmi.bextr.32
+  // CHECK-LABEL: test_bextr_u32
+  // CHECK: and i32 %{{.*}}, 255
+  // CHECK: and i32 %{{.*}}, 255
+  // CHECK: shl i32 %{{.*}}, 8
+  // CHECK: or i32 %{{.*}}, %{{.*}}
+  // CHECK: i32 @llvm.x86.bmi.bextr.32(i32 %{{.*}}, i32 %{{.*}})
   return _bextr_u32(__X, __Y, __Z);
 }
 
 unsigned int test_blsi_u32(unsigned int __X) {
-  // CHECK: [[DEST:%.*]] = sub i32 0, [[SRC:%.*]]
-  // CHECK-NEXT: %{{.*}} = and i32 [[SRC]], [[DEST]]
+  // CHECK-LABEL: test_blsi_u32
+  // CHECK: sub i32 0, %{{.*}}
+  // CHECK: and i32 %{{.*}}, %{{.*}}
   return _blsi_u32(__X);
 }
 
 unsigned int test_blsmsk_u32(unsigned int __X) {
-  // CHECK: [[DEST:%.*]] = add i32 [[SRC:%.*]], -1
-  // CHECK-NEXT: %{{.*}} = xor i32 [[DEST]], [[SRC]]
+  // CHECK-LABEL: test_blsmsk_u32
+  // CHECK: sub i32 %{{.*}}, 1
+  // CHECK: xor i32 %{{.*}}, %{{.*}}
   return _blsmsk_u32(__X);
 }
 
 unsigned int test_blsr_u32(unsigned int __X) {
-  // CHECK: [[DEST:%.*]] = add i32 [[SRC:%.*]], -1
-  // CHECK-NEXT: %{{.*}} = and i32 [[DEST]], [[SRC]]
+  // CHECK-LABEL: test_blsr_u32
+  // CHECK: sub i32 %{{.*}}, 1
+  // CHECK: and i32 %{{.*}}, %{{.*}}
   return _blsr_u32(__X);
 }
 
 unsigned int test_tzcnt_u32(unsigned int __X) {
-  // CHECK: @llvm.cttz.i32
+  // CHECK-LABEL: test_tzcnt_u32
+  // CHECK: icmp ne i32 %{{.*}}, 0
+  // CHECK: i32 @llvm.cttz.i32(i32 %{{.*}}, i1 true)
   return _tzcnt_u32(__X);
 }
 
 unsigned long long test_andn_u64(unsigned long __X, unsigned long __Y) {
-  // CHECK: [[DEST:%.*]] = xor i64 %{{.*}}, -1
-  // CHECK-NEXT: %{{.*}} = and i64 %{{.*}}, [[DEST]]
+  // CHECK-LABEL: test_andn_u64
+  // CHECK: xor i64 %{{.*}}, -1
+  // CHECK: and i64 %{{.*}}, %{{.*}}
   return _andn_u64(__X, __Y);
 }
 
-unsigned long long test_bextr_u64(unsigned long __X, unsigned int __Y, 
+unsigned long long test_bextr_u64(unsigned long __X, unsigned int __Y,
                                   unsigned int __Z) {
-  // CHECK: @llvm.x86.bmi.bextr.64
+  // CHECK-LABEL: test_bextr_u64
+  // CHECK: and i32 %{{.*}}, 255
+  // CHECK: and i32 %{{.*}}, 255
+  // CHECK: shl i32 %{{.*}}, 8
+  // CHECK: or i32 %{{.*}}, %{{.*}}
+  // CHECK: zext i32 %{{.*}} to i64
+  // CHECK: i64 @llvm.x86.bmi.bextr.64(i64 %{{.*}}, i64 %{{.*}})
   return _bextr_u64(__X, __Y, __Z);
 }
 
 unsigned long long test_blsi_u64(unsigned long long __X) {
-  // CHECK: [[DEST:%.*]] = sub i64 0, [[SRC:%.*]]
-  // CHECK-NEXT: %{{.*}} = and i64 [[SRC]], [[DEST]]
+  // CHECK-LABEL: test_blsi_u64
+  // CHECK: sub i64 0, %{{.*}}
+  // CHECK: and i64 %{{.*}}, %{{.*}}
   return _blsi_u64(__X);
 }
 
 unsigned long long test_blsmsk_u64(unsigned long long __X) {
-  // CHECK: [[DEST:%.*]] = add i64 [[SRC:%.*]], -1
-  // CHECK-NEXT: %{{.*}} = xor i64 [[DEST]], [[SRC]]
+  // CHECK-LABEL: test_blsmsk_u64
+  // CHECK: sub i64 %{{.*}}, 1
+  // CHECK: xor i64 %{{.*}}, %{{.*}}
   return _blsmsk_u64(__X);
 }
 
 unsigned long long test_blsr_u64(unsigned long long __X) {
-  // CHECK: [[DEST:%.*]] = add i64 [[SRC:%.*]], -1
-  // CHECK-NEXT: %{{.*}} = and i64 [[DEST]], [[SRC]]
+  // CHECK-LABEL: test_blsr_u64
+  // CHECK: sub i64 %{{.*}}, 1
+  // CHECK: and i64 %{{.*}}, %{{.*}}
   return _blsr_u64(__X);
 }
 
 unsigned long long test_tzcnt_u64(unsigned long long __X) {
-  // CHECK: @llvm.cttz.i64
+  // CHECK-LABEL: test_tzcnt_u64
+  // CHECK: icmp ne i64 %{{.*}}, 0
+  // CHECK: i64 @llvm.cttz.i64(i64 %{{.*}}, i1 true)
   return _tzcnt_u64(__X);
 }
diff --git a/test/CodeGen/builtin-clflushopt.c b/test/CodeGen/builtin-clflushopt.c
new file mode 100644
index 0000000000000..7248d378a4e13
--- /dev/null
+++ b/test/CodeGen/builtin-clflushopt.c
@@ -0,0 +1,9 @@
+// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +clflushopt  -emit-llvm -o - -Werror | FileCheck %s
+#define __MM_MALLOC_H
+
+#include <immintrin.h>
+void test_mm_clflushopt(char * __m) {
+  //CHECK-LABLE: @test_mm_clflushopt
+  //CHECK: @llvm.x86.clflushopt
+  _mm_clflushopt(__m);
+}
diff --git a/test/CodeGen/builtin-expect.c b/test/CodeGen/builtin-expect.c
index 884110cbe905e..560625ed24392 100644
--- a/test/CodeGen/builtin-expect.c
+++ b/test/CodeGen/builtin-expect.c
@@ -1,45 +1,69 @@
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s -O0 | FileCheck %s --check-prefix=CHECK_O0
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s -O1 -disable-llvm-optzns | FileCheck %s --check-prefix=ALL --check-prefix=O1
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s -O0 | FileCheck %s --check-prefix=ALL --check-prefix=O0
+
+// In all tests, make sure that no expect is generated if optimizations are off.
+// If optimizations are on, generate the correct expect and preserve other necessary operations.
+
+int expect_taken(int x) {
+// ALL-LABEL: define i32 @expect_taken
+// O1:        call i64 @llvm.expect.i64(i64 {{%.*}}, i64 1)
+// O0-NOT:    @llvm.expect
+
+  if (__builtin_expect (x, 1))
+    return 0;
+  return x;
+}
+
+
+int expect_not_taken(int x) {
+// ALL-LABEL: define i32 @expect_not_taken
+// O1:        call i64 @llvm.expect.i64(i64 {{%.*}}, i64 0)
+// O0-NOT:    @llvm.expect
+
+  if (__builtin_expect (x, 0))
+    return 0;
+  return x;
+}
+
 
 int x;
 int y(void);
 void foo();
-void FUNC() {
-// CHECK-LABEL: define void @FUNC()
-// CHECK: [[call:%.*]] = call i32 @y
-// CHECK_O0: [[call:%.*]] = call i32 @y
-// CHECK_O0-NOT: call i64 @llvm.expect
+
+void expect_value_side_effects() {
+// ALL-LABEL: define void @expect_value_side_effects()
+// ALL:       [[CALL:%.*]] = call i32 @y
+// O1:        [[SEXT:%.*]] = sext i32 [[CALL]] to i64
+// O1:        call i64 @llvm.expect.i64(i64 {{%.*}}, i64 [[SEXT]])
+// O0-NOT:    @llvm.expect
+
   if (__builtin_expect (x, y()))
     foo ();
 }
 
+
+// Make sure that issigprocmask() is called before bar()?
+// There's no compare, so there's nothing to expect?
 // rdar://9330105
 void isigprocmask(void);
 long bar();
 
 int main() {
-    (void) __builtin_expect((isigprocmask(), 0), bar());
-}
+// ALL-LABEL: define i32 @main()
+// ALL:       call void @isigprocmask()
+// ALL:       [[CALL:%.*]] = call i64 (...) @bar()
+// O1:        call i64 @llvm.expect.i64(i64 0, i64 [[CALL]])
+// O0-NOT:    @llvm.expect
 
-// CHECK-LABEL: define i32 @main()
-// CHECK: call void @isigprocmask()
-// CHECK: [[C:%.*]] = call i64 (...) @bar()
-// CHECK_O0: call void @isigprocmask()
-// CHECK_O0: [[C:%.*]] = call i64 (...) @bar()
-// CHECK_O0-NOT: call i64 @llvm.expect
+  (void) __builtin_expect((isigprocmask(), 0), bar());
+}
 
 
-// CHECK-LABEL: define i32 @test1
-int test1(int x) {
-// CHECK_O0-NOT: call i64 @llvm.expect
-  if (__builtin_expect (x, 1))
-    return 0;
-  return x;
-}
+int switch_cond(int x) {
+// ALL-LABEL: define i32 @switch_cond
+// O1:        call i64 @llvm.expect.i64(i64 {{%.*}}, i64 5)
+// O0-NOT:    @llvm.expect
 
-// CHECK: define i32 @test2
-int test2(int x) {
-// CHECK_O0-NOT: call i64 @llvm.expect
   switch(__builtin_expect(x, 5)) {
   default:
     return 0;
@@ -53,3 +77,4 @@ int test2(int x) {
 
   return 0;
 }
+
diff --git a/test/CodeGen/builtins-arm-exclusive.c b/test/CodeGen/builtins-arm-exclusive.c
index 2b10238c0f448..b0bc2b82d316b 100644
--- a/test/CodeGen/builtins-arm-exclusive.c
+++ b/test/CodeGen/builtins-arm-exclusive.c
@@ -1,32 +1,6 @@
-// REQUIRES: arm-registered-target
-// RUN: %clang_cc1 -Wall -Werror -triple thumbv8-linux-gnueabi -fno-signed-char -O3 -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -Wall -Werror -triple arm64-apple-ios7.0 -O3 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-ARM64
-
-// Make sure the canonical use works before going into smaller details:
-int atomic_inc(int *addr) {
-  int Failure, OldVal;
-  do {
-    OldVal = __builtin_arm_ldrex(addr);
-    Failure = __builtin_arm_strex(OldVal + 1, addr);
-  } while (Failure);
-
-  return OldVal;
-}
+// RUN: %clang_cc1 -Wall -Werror -triple thumbv8-linux-gnueabi -fno-signed-char -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
+// RUN: %clang_cc1 -Wall -Werror -triple arm64-apple-ios7.0 -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s --check-prefix=CHECK-ARM64
 
-// CHECK-LABEL: @atomic_inc
-// CHECK:   [[OLDVAL:%.*]] = tail call i32 @llvm.arm.ldrex.p0i32(i32* %addr)
-// CHECK:   [[INC:%.*]] = add nsw i32 [[OLDVAL]], 1
-// CHECK:   [[FAILURE:%.*]] = tail call i32 @llvm.arm.strex.p0i32(i32 [[INC]], i32* %addr)
-// CHECK:   [[TST:%.*]] = icmp eq i32 [[FAILURE]], 0
-// CHECK:   br i1 [[TST]], label {{%[a-zA-Z0-9.]+}}, label {{%[a-zA-Z0-9.]+}}
-
-// CHECK-ARM64-LABEL: @atomic_inc
-// CHECK-ARM64:   [[OLDVAL:%.*]] = tail call i64 @llvm.aarch64.ldxr.p0i32(i32* %addr)
-// CHECK-ARM64:   [[INC:%.*]] = add i64 [[OLDVAL]], 1
-// CHECK-ARM64:   [[TRUNC:%.*]] = and i64 [[INC]], 4294967295
-// CHECK-ARM64:   [[FAILURE:%.*]] = tail call i32 @llvm.aarch64.stxr.p0i32(i64 [[TRUNC]], i32* %addr)
-// CHECK-ARM64:   [[TST:%.*]] = icmp eq i32 [[FAILURE]], 0
-// CHECK-ARM64:   br i1 [[TST]], label {{%[a-zA-Z0-9.]+}}, label {{%[a-zA-Z0-9.]+}}
 
 struct Simple {
   char a, b;
@@ -37,36 +11,33 @@ int test_ldrex(char *addr, long long *addr64, float *addrfloat) {
 // CHECK-ARM64-LABEL: @test_ldrex
   int sum = 0;
   sum += __builtin_arm_ldrex(addr);
-// CHECK: [[INTRES:%.*]] = tail call i32 @llvm.arm.ldrex.p0i8(i8* %addr)
-// CHECK: and i32 [[INTRES]], 255
+// CHECK: [[INTRES:%.*]] = call i32 @llvm.arm.ldrex.p0i8(i8* %addr)
+// CHECK: trunc i32 [[INTRES]] to i8
 
-// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldxr.p0i8(i8* %addr)
-// CHECK-ARM64: [[TRUNCRES:%.*]] = trunc i64 [[INTRES]] to i32
-// CHECK-ARM64: [[SEXTTMP:%.*]] = shl i32 [[TRUNCRES]], 24
-// CHECK-ARM64: ashr exact i32 [[SEXTTMP]], 24
+// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldxr.p0i8(i8* %addr)
+// CHECK-ARM64: trunc i64 [[INTRES]] to i8
 
   sum += __builtin_arm_ldrex((short *)addr);
 // CHECK: [[ADDR16:%.*]] = bitcast i8* %addr to i16*
-// CHECK: [[INTRES:%.*]] = tail call i32 @llvm.arm.ldrex.p0i16(i16* [[ADDR16]])
-// CHECK: [[TMPSEXT:%.*]] = shl i32 [[INTRES]], 16
-// CHECK: ashr exact i32 [[TMPSEXT]], 16
+// CHECK: [[INTRES:%.*]] = call i32 @llvm.arm.ldrex.p0i16(i16* [[ADDR16]])
+// CHECK: trunc i32 [[INTRES]] to i16
 
 // CHECK-ARM64: [[ADDR16:%.*]] = bitcast i8* %addr to i16*
-// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldxr.p0i16(i16* [[ADDR16]])
-// CHECK-ARM64: [[TRUNCRES:%.*]] = trunc i64 [[INTRES]] to i32
-// CHECK-ARM64: [[TMPSEXT:%.*]] = shl i32 [[TRUNCRES]], 16
-// CHECK-ARM64: ashr exact i32 [[TMPSEXT]], 16
+// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldxr.p0i16(i16* [[ADDR16]])
+// CHECK-ARM64: trunc i64 [[INTRES]] to i16
 
   sum += __builtin_arm_ldrex((int *)addr);
 // CHECK: [[ADDR32:%.*]] = bitcast i8* %addr to i32*
-// CHECK:  call i32 @llvm.arm.ldrex.p0i32(i32* [[ADDR32]])
+// CHECK: call i32 @llvm.arm.ldrex.p0i32(i32* [[ADDR32]])
 
 // CHECK-ARM64: [[ADDR32:%.*]] = bitcast i8* %addr to i32*
-// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldxr.p0i32(i32* [[ADDR32]])
+// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldxr.p0i32(i32* [[ADDR32]])
 // CHECK-ARM64: trunc i64 [[INTRES]] to i32
 
   sum += __builtin_arm_ldrex((long long *)addr);
-// CHECK: call { i32, i32 } @llvm.arm.ldrexd(i8* %addr)
+// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to i64*
+// CHECK: [[TMP5:%.*]] = bitcast i64* [[TMP4]] to i8*
+// CHECK: call { i32, i32 } @llvm.arm.ldrexd(i8* [[TMP5]])
 
 // CHECK-ARM64: [[ADDR64:%.*]] = bitcast i8* %addr to i64*
 // CHECK-ARM64: call i64 @llvm.aarch64.ldxr.p0i64(i64* [[ADDR64]])
@@ -79,16 +50,18 @@ int test_ldrex(char *addr, long long *addr64, float *addrfloat) {
 
   sum += __builtin_arm_ldrex(addrfloat);
 // CHECK: [[INTADDR:%.*]] = bitcast float* %addrfloat to i32*
-// CHECK: [[INTRES:%.*]] = tail call i32 @llvm.arm.ldrex.p0i32(i32* [[INTADDR]])
+// CHECK: [[INTRES:%.*]] = call i32 @llvm.arm.ldrex.p0i32(i32* [[INTADDR]])
 // CHECK: bitcast i32 [[INTRES]] to float
 
 // CHECK-ARM64: [[INTADDR:%.*]] = bitcast float* %addrfloat to i32*
-// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldxr.p0i32(i32* [[INTADDR]])
+// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldxr.p0i32(i32* [[INTADDR]])
 // CHECK-ARM64: [[TRUNCRES:%.*]] = trunc i64 [[INTRES]] to i32
 // CHECK-ARM64: bitcast i32 [[TRUNCRES]] to float
 
   sum += __builtin_arm_ldrex((double *)addr);
-// CHECK: [[STRUCTRES:%.*]] = tail call { i32, i32 } @llvm.arm.ldrexd(i8* %addr)
+// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to double*
+// CHECK: [[TMP5:%.*]] = bitcast double* [[TMP4]] to i8*
+// CHECK: [[STRUCTRES:%.*]] = call { i32, i32 } @llvm.arm.ldrexd(i8* [[TMP5]])
 // CHECK: [[RESHI:%.*]] = extractvalue { i32, i32 } [[STRUCTRES]], 1
 // CHECK: [[RESLO:%.*]] = extractvalue { i32, i32 } [[STRUCTRES]], 0
 // CHECK: [[RESHI64:%.*]] = zext i32 [[RESHI]] to i64
@@ -97,21 +70,31 @@ int test_ldrex(char *addr, long long *addr64, float *addrfloat) {
 // CHECK: [[INTRES:%.*]] = or i64 [[RESHIHI]], [[RESLO64]]
 // CHECK: bitcast i64 [[INTRES]] to double
 
-// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldxr.p0i64(i64* [[ADDR64]])
+// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to double*
+// CHECK-ARM64: [[TMP5:%.*]] = bitcast double* [[TMP4]] to i64*
+// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldxr.p0i64(i64* [[TMP5]])
 // CHECK-ARM64: bitcast i64 [[INTRES]] to double
 
   sum += *__builtin_arm_ldrex((int **)addr);
-// CHECK: [[INTRES:%.*]] = tail call i32 @llvm.arm.ldrex.p0i32(i32* [[ADDR32]])
+// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to i32**
+// CHECK: [[TMP5:%.*]] = bitcast i32** [[TMP4]] to i32*
+// CHECK: [[INTRES:%.*]] = call i32 @llvm.arm.ldrex.p0i32(i32* [[TMP5]])
 // CHECK: inttoptr i32 [[INTRES]] to i32*
 
-// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldxr.p0i64(i64* [[ADDR64]])
+// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to i32**
+// CHECK-ARM64: [[TMP5:%.*]] = bitcast i32** [[TMP4]] to i64*
+// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldxr.p0i64(i64* [[TMP5]])
 // CHECK-ARM64: inttoptr i64 [[INTRES]] to i32*
 
   sum += __builtin_arm_ldrex((struct Simple **)addr)->a;
-// CHECK: [[INTRES:%.*]] = tail call i32 @llvm.arm.ldrex.p0i32(i32* [[ADDR32]])
+// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to %struct.Simple**
+// CHECK: [[TMP5:%.*]] = bitcast %struct.Simple** [[TMP4]] to i32*
+// CHECK: [[INTRES:%.*]] = call i32 @llvm.arm.ldrex.p0i32(i32* [[TMP5]])
 // CHECK: inttoptr i32 [[INTRES]] to %struct.Simple*
 
-// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldxr.p0i64(i64* [[ADDR64]])
+// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to %struct.Simple**
+// CHECK-ARM64: [[TMP5:%.*]] = bitcast %struct.Simple** [[TMP4]] to i64*
+// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldxr.p0i64(i64* [[TMP5]])
 // CHECK-ARM64: inttoptr i64 [[INTRES]] to %struct.Simple*
   return sum;
 }
@@ -121,36 +104,33 @@ int test_ldaex(char *addr, long long *addr64, float *addrfloat) {
 // CHECK-ARM64-LABEL: @test_ldaex
   int sum = 0;
   sum += __builtin_arm_ldaex(addr);
-// CHECK: [[INTRES:%.*]] = tail call i32 @llvm.arm.ldaex.p0i8(i8* %addr)
-// CHECK: and i32 [[INTRES]], 255
+// CHECK: [[INTRES:%.*]] = call i32 @llvm.arm.ldaex.p0i8(i8* %addr)
+// CHECK: trunc i32 [[INTRES]] to i8
 
-// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldaxr.p0i8(i8* %addr)
-// CHECK-ARM64: [[TRUNCRES:%.*]] = trunc i64 [[INTRES]] to i32
-// CHECK-ARM64: [[SEXTTMP:%.*]] = shl i32 [[TRUNCRES]], 24
-// CHECK-ARM64: ashr exact i32 [[SEXTTMP]], 24
+// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldaxr.p0i8(i8* %addr)
+// CHECK-ARM64: trunc i64 [[INTRES]] to i8
 
   sum += __builtin_arm_ldaex((short *)addr);
 // CHECK: [[ADDR16:%.*]] = bitcast i8* %addr to i16*
-// CHECK: [[INTRES:%.*]] = tail call i32 @llvm.arm.ldaex.p0i16(i16* [[ADDR16]])
-// CHECK: [[TMPSEXT:%.*]] = shl i32 [[INTRES]], 16
-// CHECK: ashr exact i32 [[TMPSEXT]], 16
+// CHECK: [[INTRES:%.*]] = call i32 @llvm.arm.ldaex.p0i16(i16* [[ADDR16]])
+// CHECK: trunc i32 [[INTRES]] to i16
 
 // CHECK-ARM64: [[ADDR16:%.*]] = bitcast i8* %addr to i16*
-// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldaxr.p0i16(i16* [[ADDR16]])
-// CHECK-ARM64: [[TRUNCRES:%.*]] = trunc i64 [[INTRES]] to i32
-// CHECK-ARM64: [[TMPSEXT:%.*]] = shl i32 [[TRUNCRES]], 16
-// CHECK-ARM64: ashr exact i32 [[TMPSEXT]], 16
+// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldaxr.p0i16(i16* [[ADDR16]])
+// CHECK-ARM64: [[TRUNCRES:%.*]] = trunc i64 [[INTRES]] to i16
 
   sum += __builtin_arm_ldaex((int *)addr);
 // CHECK: [[ADDR32:%.*]] = bitcast i8* %addr to i32*
 // CHECK:  call i32 @llvm.arm.ldaex.p0i32(i32* [[ADDR32]])
 
 // CHECK-ARM64: [[ADDR32:%.*]] = bitcast i8* %addr to i32*
-// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldaxr.p0i32(i32* [[ADDR32]])
+// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldaxr.p0i32(i32* [[ADDR32]])
 // CHECK-ARM64: trunc i64 [[INTRES]] to i32
 
   sum += __builtin_arm_ldaex((long long *)addr);
-// CHECK: call { i32, i32 } @llvm.arm.ldaexd(i8* %addr)
+// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to i64*
+// CHECK: [[TMP5:%.*]] = bitcast i64* [[TMP4]] to i8*
+// CHECK: call { i32, i32 } @llvm.arm.ldaexd(i8* [[TMP5]])
 
 // CHECK-ARM64: [[ADDR64:%.*]] = bitcast i8* %addr to i64*
 // CHECK-ARM64: call i64 @llvm.aarch64.ldaxr.p0i64(i64* [[ADDR64]])
@@ -163,16 +143,18 @@ int test_ldaex(char *addr, long long *addr64, float *addrfloat) {
 
   sum += __builtin_arm_ldaex(addrfloat);
 // CHECK: [[INTADDR:%.*]] = bitcast float* %addrfloat to i32*
-// CHECK: [[INTRES:%.*]] = tail call i32 @llvm.arm.ldaex.p0i32(i32* [[INTADDR]])
+// CHECK: [[INTRES:%.*]] = call i32 @llvm.arm.ldaex.p0i32(i32* [[INTADDR]])
 // CHECK: bitcast i32 [[INTRES]] to float
 
 // CHECK-ARM64: [[INTADDR:%.*]] = bitcast float* %addrfloat to i32*
-// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldaxr.p0i32(i32* [[INTADDR]])
+// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldaxr.p0i32(i32* [[INTADDR]])
 // CHECK-ARM64: [[TRUNCRES:%.*]] = trunc i64 [[INTRES]] to i32
 // CHECK-ARM64: bitcast i32 [[TRUNCRES]] to float
 
   sum += __builtin_arm_ldaex((double *)addr);
-// CHECK: [[STRUCTRES:%.*]] = tail call { i32, i32 } @llvm.arm.ldaexd(i8* %addr)
+// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to double*
+// CHECK: [[TMP5:%.*]] = bitcast double* [[TMP4]] to i8*
+// CHECK: [[STRUCTRES:%.*]] = call { i32, i32 } @llvm.arm.ldaexd(i8* [[TMP5]])
 // CHECK: [[RESHI:%.*]] = extractvalue { i32, i32 } [[STRUCTRES]], 1
 // CHECK: [[RESLO:%.*]] = extractvalue { i32, i32 } [[STRUCTRES]], 0
 // CHECK: [[RESHI64:%.*]] = zext i32 [[RESHI]] to i64
@@ -181,21 +163,31 @@ int test_ldaex(char *addr, long long *addr64, float *addrfloat) {
 // CHECK: [[INTRES:%.*]] = or i64 [[RESHIHI]], [[RESLO64]]
 // CHECK: bitcast i64 [[INTRES]] to double
 
-// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldaxr.p0i64(i64* [[ADDR64]])
+// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to double*
+// CHECK-ARM64: [[TMP5:%.*]] = bitcast double* [[TMP4]] to i64*
+// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldaxr.p0i64(i64* [[TMP5]])
 // CHECK-ARM64: bitcast i64 [[INTRES]] to double
 
   sum += *__builtin_arm_ldaex((int **)addr);
-// CHECK: [[INTRES:%.*]] = tail call i32 @llvm.arm.ldaex.p0i32(i32* [[ADDR32]])
+// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to i32**
+// CHECK: [[TMP5:%.*]] = bitcast i32** [[TMP4]] to i32*
+// CHECK: [[INTRES:%.*]] = call i32 @llvm.arm.ldaex.p0i32(i32* [[TMP5]])
 // CHECK: inttoptr i32 [[INTRES]] to i32*
 
-// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldaxr.p0i64(i64* [[ADDR64]])
+// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to i32**
+// CHECK-ARM64: [[TMP5:%.*]] = bitcast i32** [[TMP4]] to i64*
+// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldaxr.p0i64(i64* [[TMP5]])
 // CHECK-ARM64: inttoptr i64 [[INTRES]] to i32*
 
   sum += __builtin_arm_ldaex((struct Simple **)addr)->a;
-// CHECK: [[INTRES:%.*]] = tail call i32 @llvm.arm.ldaex.p0i32(i32* [[ADDR32]])
+// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to %struct.Simple**
+// CHECK: [[TMP5:%.*]] = bitcast %struct.Simple** [[TMP4]] to i32*
+// CHECK: [[INTRES:%.*]] = call i32 @llvm.arm.ldaex.p0i32(i32* [[TMP5]])
 // CHECK: inttoptr i32 [[INTRES]] to %struct.Simple*
 
-// CHECK-ARM64: [[INTRES:%.*]] = tail call i64 @llvm.aarch64.ldaxr.p0i64(i64* [[ADDR64]])
+// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to %struct.Simple**
+// CHECK-ARM64: [[TMP5:%.*]] = bitcast %struct.Simple** [[TMP4]] to i64*
+// CHECK-ARM64: [[INTRES:%.*]] = call i64 @llvm.aarch64.ldaxr.p0i64(i64* [[TMP5]])
 // CHECK-ARM64: inttoptr i64 [[INTRES]] to %struct.Simple*
   return sum;
 }
@@ -225,27 +217,51 @@ int test_strex(char *addr) {
 // CHECK-ARM64: call i32 @llvm.aarch64.stxr.p0i32(i64 42, i32* [[ADDR32]])
 
   res |= __builtin_arm_strex(42, (long long *)addr);
-// CHECK: call i32 @llvm.arm.strexd(i32 42, i32 0, i8* %addr)
+// CHECK: store i64 42, i64* [[TMP:%.*]], align 8
+// CHECK: [[LOHI_ADDR:%.*]] = bitcast i64* [[TMP]] to { i32, i32 }*
+// CHECK: [[LOHI:%.*]] = load { i32, i32 }, { i32, i32 }* [[LOHI_ADDR]]
+// CHECK: [[LO:%.*]] = extractvalue { i32, i32 } [[LOHI]], 0
+// CHECK: [[HI:%.*]] = extractvalue { i32, i32 } [[LOHI]], 1
+// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to i64*
+// CHECK: [[TMP5:%.*]] = bitcast i64* [[TMP4]] to i8*
+// CHECK: call i32 @llvm.arm.strexd(i32 [[LO]], i32 [[HI]], i8* [[TMP5]])
 
 // CHECK-ARM64: [[ADDR64:%.*]] = bitcast i8* %addr to i64*
 // CHECK-ARM64: call i32 @llvm.aarch64.stxr.p0i64(i64 42, i64* [[ADDR64]])
 
   res |= __builtin_arm_strex(2.71828f, (float *)addr);
-// CHECK: call i32 @llvm.arm.strex.p0i32(i32 1076754509, i32* [[ADDR32]])
+// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to float*
+// CHECK: [[TMP5:%.*]] = bitcast float* [[TMP4]] to i32*
+// CHECK: call i32 @llvm.arm.strex.p0i32(i32 1076754509, i32* [[TMP5]])
 
-// CHECK-ARM64: call i32 @llvm.aarch64.stxr.p0i32(i64 1076754509, i32* [[ADDR32]])
+// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to float*
+// CHECK-ARM64: [[TMP5:%.*]] = bitcast float* [[TMP4]] to i32*
+// CHECK-ARM64: call i32 @llvm.aarch64.stxr.p0i32(i64 1076754509, i32* [[TMP5]])
 
   res |= __builtin_arm_strex(3.14159, (double *)addr);
-// CHECK: call i32 @llvm.arm.strexd(i32 -266631570, i32 1074340345, i8* %addr)
-
-// CHECK-ARM64: call i32 @llvm.aarch64.stxr.p0i64(i64 4614256650576692846, i64* [[ADDR64]])
+// CHECK: store double 3.141590e+00, double* [[TMP:%.*]], align 8
+// CHECK: [[LOHI_ADDR:%.*]] = bitcast double* [[TMP]] to { i32, i32 }*
+// CHECK: [[LOHI:%.*]] = load { i32, i32 }, { i32, i32 }* [[LOHI_ADDR]]
+// CHECK: [[LO:%.*]] = extractvalue { i32, i32 } [[LOHI]], 0
+// CHECK: [[HI:%.*]] = extractvalue { i32, i32 } [[LOHI]], 1
+// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to double*
+// CHECK: [[TMP5:%.*]] = bitcast double* [[TMP4]] to i8*
+// CHECK: call i32 @llvm.arm.strexd(i32 [[LO]], i32 [[HI]], i8* [[TMP5]])
+
+// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to double*
+// CHECK-ARM64: [[TMP5:%.*]] = bitcast double* [[TMP4]] to i64*
+// CHECK-ARM64: call i32 @llvm.aarch64.stxr.p0i64(i64 4614256650576692846, i64* [[TMP5]])
 
   res |= __builtin_arm_strex(&var, (struct Simple **)addr);
-// CHECK: [[INTVAL:%.*]] = ptrtoint i16* %var to i32
-// CHECK: call i32 @llvm.arm.strex.p0i32(i32 [[INTVAL]], i32* [[ADDR32]])
+// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to %struct.Simple**
+// CHECK: [[TMP5:%.*]] = bitcast %struct.Simple** [[TMP4]] to i32*
+// CHECK: [[INTVAL:%.*]] = ptrtoint %struct.Simple* %var to i32
+// CHECK: call i32 @llvm.arm.strex.p0i32(i32 [[INTVAL]], i32* [[TMP5]])
 
-// CHECK-ARM64: [[INTVAL:%.*]] = ptrtoint i16* %var to i64
-// CHECK-ARM64: call i32 @llvm.aarch64.stxr.p0i64(i64 [[INTVAL]], i64* [[ADDR64]])
+// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to %struct.Simple**
+// CHECK-ARM64: [[TMP5:%.*]] = bitcast %struct.Simple** [[TMP4]] to i64*
+// CHECK-ARM64: [[INTVAL:%.*]] = ptrtoint %struct.Simple* %var to i64
+// CHECK-ARM64: call i32 @llvm.aarch64.stxr.p0i64(i64 [[INTVAL]], i64* [[TMP5]])
 
   return res;
 }
@@ -275,27 +291,51 @@ int test_stlex(char *addr) {
 // CHECK-ARM64: call i32 @llvm.aarch64.stlxr.p0i32(i64 42, i32* [[ADDR32]])
 
   res |= __builtin_arm_stlex(42, (long long *)addr);
-// CHECK: call i32 @llvm.arm.stlexd(i32 42, i32 0, i8* %addr)
+// CHECK: store i64 42, i64* [[TMP:%.*]], align 8
+// CHECK: [[LOHI_ADDR:%.*]] = bitcast i64* [[TMP]] to { i32, i32 }*
+// CHECK: [[LOHI:%.*]] = load { i32, i32 }, { i32, i32 }* [[LOHI_ADDR]]
+// CHECK: [[LO:%.*]] = extractvalue { i32, i32 } [[LOHI]], 0
+// CHECK: [[HI:%.*]] = extractvalue { i32, i32 } [[LOHI]], 1
+// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to i64*
+// CHECK: [[TMP5:%.*]] = bitcast i64* [[TMP4]] to i8*
+// CHECK: call i32 @llvm.arm.stlexd(i32 [[LO]], i32 [[HI]], i8* [[TMP5]])
 
 // CHECK-ARM64: [[ADDR64:%.*]] = bitcast i8* %addr to i64*
 // CHECK-ARM64: call i32 @llvm.aarch64.stlxr.p0i64(i64 42, i64* [[ADDR64]])
 
   res |= __builtin_arm_stlex(2.71828f, (float *)addr);
-// CHECK: call i32 @llvm.arm.stlex.p0i32(i32 1076754509, i32* [[ADDR32]])
+// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to float*
+// CHECK: [[TMP5:%.*]] = bitcast float* [[TMP4]] to i32*
+// CHECK: call i32 @llvm.arm.stlex.p0i32(i32 1076754509, i32* [[TMP5]])
 
-// CHECK-ARM64: call i32 @llvm.aarch64.stlxr.p0i32(i64 1076754509, i32* [[ADDR32]])
+// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to float*
+// CHECK-ARM64: [[TMP5:%.*]] = bitcast float* [[TMP4]] to i32*
+// CHECK-ARM64: call i32 @llvm.aarch64.stlxr.p0i32(i64 1076754509, i32* [[TMP5]])
 
   res |= __builtin_arm_stlex(3.14159, (double *)addr);
-// CHECK: call i32 @llvm.arm.stlexd(i32 -266631570, i32 1074340345, i8* %addr)
-
-// CHECK-ARM64: call i32 @llvm.aarch64.stlxr.p0i64(i64 4614256650576692846, i64* [[ADDR64]])
+// CHECK: store double 3.141590e+00, double* [[TMP:%.*]], align 8
+// CHECK: [[LOHI_ADDR:%.*]] = bitcast double* [[TMP]] to { i32, i32 }*
+// CHECK: [[LOHI:%.*]] = load { i32, i32 }, { i32, i32 }* [[LOHI_ADDR]]
+// CHECK: [[LO:%.*]] = extractvalue { i32, i32 } [[LOHI]], 0
+// CHECK: [[HI:%.*]] = extractvalue { i32, i32 } [[LOHI]], 1
+// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to double*
+// CHECK: [[TMP5:%.*]] = bitcast double* [[TMP4]] to i8*
+// CHECK: call i32 @llvm.arm.stlexd(i32 [[LO]], i32 [[HI]], i8* [[TMP5]])
+
+// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to double*
+// CHECK-ARM64: [[TMP5:%.*]] = bitcast double* [[TMP4]] to i64*
+// CHECK-ARM64: call i32 @llvm.aarch64.stlxr.p0i64(i64 4614256650576692846, i64* [[TMP5]])
 
   res |= __builtin_arm_stlex(&var, (struct Simple **)addr);
-// CHECK: [[INTVAL:%.*]] = ptrtoint i16* %var to i32
-// CHECK: call i32 @llvm.arm.stlex.p0i32(i32 [[INTVAL]], i32* [[ADDR32]])
+// CHECK: [[TMP4:%.*]] = bitcast i8* %addr to %struct.Simple**
+// CHECK: [[TMP5:%.*]] = bitcast %struct.Simple** [[TMP4]] to i32*
+// CHECK: [[INTVAL:%.*]] = ptrtoint %struct.Simple* %var to i32
+// CHECK: call i32 @llvm.arm.stlex.p0i32(i32 [[INTVAL]], i32* [[TMP5]])
 
-// CHECK-ARM64: [[INTVAL:%.*]] = ptrtoint i16* %var to i64
-// CHECK-ARM64: call i32 @llvm.aarch64.stlxr.p0i64(i64 [[INTVAL]], i64* [[ADDR64]])
+// CHECK-ARM64: [[TMP4:%.*]] = bitcast i8* %addr to %struct.Simple**
+// CHECK-ARM64: [[TMP5:%.*]] = bitcast %struct.Simple** [[TMP4]] to i64*
+// CHECK-ARM64: [[INTVAL:%.*]] = ptrtoint %struct.Simple* %var to i64
+// CHECK-ARM64: call i32 @llvm.aarch64.stlxr.p0i64(i64 [[INTVAL]], i64* [[TMP5]])
 
   return res;
 }
@@ -317,7 +357,7 @@ __int128 test_ldrex_128(__int128 *addr) {
 
   return __builtin_arm_ldrex(addr);
 // CHECK-ARM64: [[ADDR8:%.*]] = bitcast i128* %addr to i8*
-// CHECK-ARM64: [[STRUCTRES:%.*]] = tail call { i64, i64 } @llvm.aarch64.ldxp(i8* [[ADDR8]])
+// CHECK-ARM64: [[STRUCTRES:%.*]] = call { i64, i64 } @llvm.aarch64.ldxp(i8* [[ADDR8]])
 // CHECK-ARM64: [[RESHI:%.*]] = extractvalue { i64, i64 } [[STRUCTRES]], 1
 // CHECK-ARM64: [[RESLO:%.*]] = extractvalue { i64, i64 } [[STRUCTRES]], 0
 // CHECK-ARM64: [[RESHI64:%.*]] = zext i64 [[RESHI]] to i128
@@ -331,11 +371,13 @@ int test_strex_128(__int128 *addr, __int128 val) {
 // CHECK-ARM64-LABEL: @test_strex_128
 
   return __builtin_arm_strex(val, addr);
-// CHECK-ARM64: [[VALLO:%.*]] = trunc i128 %val to i64
-// CHECK-ARM64: [[VALHI128:%.*]] = lshr i128 %val, 64
-// CHECK-ARM64: [[VALHI:%.*]] = trunc i128 [[VALHI128]] to i64
+// CHECK-ARM64: store i128 %val, i128* [[TMP:%.*]], align 16
+// CHECK-ARM64: [[LOHI_ADDR:%.*]] = bitcast i128* [[TMP]] to { i64, i64 }*
+// CHECK-ARM64: [[LOHI:%.*]] = load { i64, i64 }, { i64, i64 }* [[LOHI_ADDR]]
+// CHECK-ARM64: [[LO:%.*]] = extractvalue { i64, i64 } [[LOHI]], 0
+// CHECK-ARM64: [[HI:%.*]] = extractvalue { i64, i64 } [[LOHI]], 1
 // CHECK-ARM64: [[ADDR8:%.*]] = bitcast i128* %addr to i8*
-// CHECK-ARM64: [[RES:%.*]] = tail call i32 @llvm.aarch64.stxp(i64 [[VALLO]], i64 [[VALHI]], i8* [[ADDR8]])
+// CHECK-ARM64: call i32 @llvm.aarch64.stxp(i64 [[LO]], i64 [[HI]], i8* [[ADDR8]])
 }
 
 __int128 test_ldaex_128(__int128 *addr) {
@@ -343,7 +385,7 @@ __int128 test_ldaex_128(__int128 *addr) {
 
   return __builtin_arm_ldaex(addr);
 // CHECK-ARM64: [[ADDR8:%.*]] = bitcast i128* %addr to i8*
-// CHECK-ARM64: [[STRUCTRES:%.*]] = tail call { i64, i64 } @llvm.aarch64.ldaxp(i8* [[ADDR8]])
+// CHECK-ARM64: [[STRUCTRES:%.*]] = call { i64, i64 } @llvm.aarch64.ldaxp(i8* [[ADDR8]])
 // CHECK-ARM64: [[RESHI:%.*]] = extractvalue { i64, i64 } [[STRUCTRES]], 1
 // CHECK-ARM64: [[RESLO:%.*]] = extractvalue { i64, i64 } [[STRUCTRES]], 0
 // CHECK-ARM64: [[RESHI64:%.*]] = zext i64 [[RESHI]] to i128
@@ -357,11 +399,13 @@ int test_stlex_128(__int128 *addr, __int128 val) {
 // CHECK-ARM64-LABEL: @test_stlex_128
 
   return __builtin_arm_stlex(val, addr);
-// CHECK-ARM64: [[VALLO:%.*]] = trunc i128 %val to i64
-// CHECK-ARM64: [[VALHI128:%.*]] = lshr i128 %val, 64
-// CHECK-ARM64: [[VALHI:%.*]] = trunc i128 [[VALHI128]] to i64
+// CHECK-ARM64: store i128 %val, i128* [[TMP:%.*]], align 16
+// CHECK-ARM64: [[LOHI_ADDR:%.*]] = bitcast i128* [[TMP]] to { i64, i64 }*
+// CHECK-ARM64: [[LOHI:%.*]] = load { i64, i64 }, { i64, i64 }* [[LOHI_ADDR]]
+// CHECK-ARM64: [[LO:%.*]] = extractvalue { i64, i64 } [[LOHI]], 0
+// CHECK-ARM64: [[HI:%.*]] = extractvalue { i64, i64 } [[LOHI]], 1
 // CHECK-ARM64: [[ADDR8:%.*]] = bitcast i128* %addr to i8*
-// CHECK-ARM64: [[RES:%.*]] = tail call i32 @llvm.aarch64.stlxp(i64 [[VALLO]], i64 [[VALHI]], i8* [[ADDR8]])
+// CHECK-ARM64: [[RES:%.*]] = call i32 @llvm.aarch64.stlxp(i64 [[LO]], i64 [[HI]], i8* [[ADDR8]])
 }
 
 #endif
diff --git a/test/CodeGen/builtins-arm.c b/test/CodeGen/builtins-arm.c
index 4cec84c33728d..a385bd27546a4 100644
--- a/test/CodeGen/builtins-arm.c
+++ b/test/CodeGen/builtins-arm.c
@@ -1,5 +1,6 @@
-// REQUIRES: arm-registered-target
-// RUN: %clang_cc1 -Wall -Werror -triple thumbv7-eabi -target-cpu cortex-a8 -O3 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -Wall -Werror -triple thumbv7-eabi -target-cpu cortex-a8 -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
+
+#include <stdint.h>
 
 void *f0()
 {
@@ -85,16 +86,86 @@ void prefetch(int i) {
 // CHECK: call {{.*}} @llvm.prefetch(i8* %{{.*}}, i32 1, i32 3, i32 0)
 }
 
+void ldc(const void *i) {
+  // CHECK: define void @ldc(i8* %i)
+  // CHECK: call void @llvm.arm.ldc(i32 1, i32 2, i8* %i)
+  // CHECK-NEXT: ret void
+  __builtin_arm_ldc(1, 2, i);
+}
+
+void ldcl(const void *i) {
+  // CHECK: define void @ldcl(i8* %i)
+  // CHECK: call void @llvm.arm.ldcl(i32 1, i32 2, i8* %i)
+  // CHECK-NEXT: ret void
+  __builtin_arm_ldcl(1, 2, i);
+}
+
+void ldc2(const void *i) {
+  // CHECK: define void @ldc2(i8* %i)
+  // CHECK: call void @llvm.arm.ldc2(i32 1, i32 2, i8* %i)
+  // CHECK-NEXT: ret void
+  __builtin_arm_ldc2(1, 2, i);
+}
+
+void ldc2l(const void *i) {
+  // CHECK: define void @ldc2l(i8* %i)
+  // CHECK: call void @llvm.arm.ldc2l(i32 1, i32 2, i8* %i)
+  // CHECK-NEXT: ret void
+  __builtin_arm_ldc2l(1, 2, i);
+}
+
+void stc(void *i) {
+  // CHECK: define void @stc(i8* %i)
+  // CHECK: call void @llvm.arm.stc(i32 1, i32 2, i8* %i)
+  // CHECK-NEXT: ret void
+  __builtin_arm_stc(1, 2, i);
+}
+
+void stcl(void *i) {
+  // CHECK: define void @stcl(i8* %i)
+  // CHECK: call void @llvm.arm.stcl(i32 1, i32 2, i8* %i)
+  // CHECK-NEXT: ret void
+  __builtin_arm_stcl(1, 2, i);
+}
+
+void stc2(void *i) {
+  // CHECK: define void @stc2(i8* %i)
+  // CHECK: call void @llvm.arm.stc2(i32 1, i32 2, i8* %i)
+  // CHECK-NEXT: ret void
+  __builtin_arm_stc2(1, 2, i);
+}
+
+void stc2l(void *i) {
+  // CHECK: define void @stc2l(i8* %i)
+  // CHECK: call void @llvm.arm.stc2l(i32 1, i32 2, i8* %i)
+  // CHECK-NEXT: ret void
+  __builtin_arm_stc2l(1, 2, i);
+}
+
+void cdp() {
+  // CHECK: define void @cdp()
+  // CHECK: call void @llvm.arm.cdp(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6)
+  // CHECK-NEXT: ret void
+  __builtin_arm_cdp(1, 2, 3, 4, 5, 6);
+}
+
+void cdp2() {
+  // CHECK: define void @cdp2()
+  // CHECK: call void @llvm.arm.cdp2(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6)
+  // CHECK-NEXT: ret void
+  __builtin_arm_cdp2(1, 2, 3, 4, 5, 6);
+}
+
 unsigned mrc() {
   // CHECK: define i32 @mrc()
-  // CHECK: [[R:%.*]] = {{.*}} call i32 @llvm.arm.mrc(i32 15, i32 0, i32 13, i32 0, i32 3)
+  // CHECK: [[R:%.*]] = call i32 @llvm.arm.mrc(i32 15, i32 0, i32 13, i32 0, i32 3)
   // CHECK-NEXT: ret i32 [[R]]
   return __builtin_arm_mrc(15, 0, 13, 0, 3);
 }
 
 unsigned mrc2() {
   // CHECK: define i32 @mrc2()
-  // CHECK: [[R:%.*]] = {{.*}} call i32 @llvm.arm.mrc2(i32 15, i32 0, i32 13, i32 0, i32 3)
+  // CHECK: [[R:%.*]] = call i32 @llvm.arm.mrc2(i32 15, i32 0, i32 13, i32 0, i32 3)
   // CHECK-NEXT: ret i32 [[R]]
   return __builtin_arm_mrc2(15, 0, 13, 0, 3);
 }
@@ -111,53 +182,65 @@ void mcr2(unsigned a) {
   __builtin_arm_mcr2(15, 0, a, 13, 0, 3);
 }
 
-void mcrr(unsigned a, unsigned b) {
-  // CHECK: define void @mcrr(i32 [[A:%.*]], i32 [[B:%.*]])
-  // CHECK: call void @llvm.arm.mcrr(i32 15, i32 0, i32 [[A]], i32 [[B]], i32 0)
-  __builtin_arm_mcrr(15, 0, a, b, 0);
+void mcrr(uint64_t a) {
+  // CHECK: define void @mcrr(i64 %{{.*}})
+  // CHECK: call void @llvm.arm.mcrr(i32 15, i32 0, i32 %{{[0-9]+}}, i32 %{{[0-9]+}}, i32 0)
+  __builtin_arm_mcrr(15, 0, a, 0);
+}
+
+void mcrr2(uint64_t a) {
+  // CHECK: define void @mcrr2(i64 %{{.*}})
+  // CHECK: call void @llvm.arm.mcrr2(i32 15, i32 0, i32 %{{[0-9]+}}, i32 %{{[0-9]+}}, i32 0)
+  __builtin_arm_mcrr2(15, 0, a, 0);
+}
+
+uint64_t mrrc() {
+  // CHECK: define i64 @mrrc()
+  // CHECK: call { i32, i32 } @llvm.arm.mrrc(i32 15, i32 0, i32 0)
+  return __builtin_arm_mrrc(15, 0, 0);
 }
 
-void mcrr2(unsigned a, unsigned b) {
-  // CHECK: define void @mcrr2(i32 [[A:%.*]], i32 [[B:%.*]])
-  // CHECK: call void @llvm.arm.mcrr2(i32 15, i32 0, i32 [[A]], i32 [[B]], i32 0)
-  __builtin_arm_mcrr2(15, 0, a, b, 0);
+uint64_t mrrc2() {
+  // CHECK: define i64 @mrrc2()
+  // CHECK: call { i32, i32 } @llvm.arm.mrrc2(i32 15, i32 0, i32 0)
+  return __builtin_arm_mrrc2(15, 0, 0);
 }
 
 unsigned rsr() {
-  // CHECK: [[V0:[%A-Za-z0-9.]+]] = {{.*}} call i32 @llvm.read_register.i32(metadata !7)
+  // CHECK: [[V0:[%A-Za-z0-9.]+]] = call i32 @llvm.read_register.i32(metadata ![[M0:.*]])
   // CHECK-NEXT: ret i32 [[V0]]
   return __builtin_arm_rsr("cp1:2:c3:c4:5");
 }
 
 unsigned long long rsr64() {
-  // CHECK: [[V0:[%A-Za-z0-9.]+]] = {{.*}} call i64 @llvm.read_register.i64(metadata !8)
+  // CHECK: [[V0:[%A-Za-z0-9.]+]] = call i64 @llvm.read_register.i64(metadata ![[M1:.*]])
   // CHECK-NEXT: ret i64 [[V0]]
   return __builtin_arm_rsr64("cp1:2:c3");
 }
 
 void *rsrp() {
-  // CHECK: [[V0:[%A-Za-z0-9.]+]] = {{.*}} call i32 @llvm.read_register.i32(metadata !9)
+  // CHECK: [[V0:[%A-Za-z0-9.]+]] = call i32 @llvm.read_register.i32(metadata ![[M2:.*]])
   // CHECK-NEXT: [[V1:[%A-Za-z0-9.]+]] = inttoptr i32 [[V0]] to i8*
   // CHECK-NEXT: ret i8* [[V1]]
   return __builtin_arm_rsrp("sysreg");
 }
 
 void wsr(unsigned v) {
-  // CHECK: call void @llvm.write_register.i32(metadata !7, i32 %v)
+  // CHECK: call void @llvm.write_register.i32(metadata ![[M0]], i32 %v)
   __builtin_arm_wsr("cp1:2:c3:c4:5", v);
 }
 
 void wsr64(unsigned long long v) {
-  // CHECK: call void @llvm.write_register.i64(metadata !8, i64 %v)
+  // CHECK: call void @llvm.write_register.i64(metadata ![[M1]], i64 %v)
   __builtin_arm_wsr64("cp1:2:c3", v);
 }
 
 void wsrp(void *v) {
   // CHECK: [[V0:[%A-Za-z0-9.]+]] = ptrtoint i8* %v to i32
-  // CHECK-NEXT: call void @llvm.write_register.i32(metadata !9, i32 [[V0]])
+  // CHECK-NEXT: call void @llvm.write_register.i32(metadata ![[M2]], i32 [[V0]])
   __builtin_arm_wsrp("sysreg", v);
 }
 
-// CHECK: !7 = !{!"cp1:2:c3:c4:5"}
-// CHECK: !8 = !{!"cp1:2:c3"}
-// CHECK: !9 = !{!"sysreg"}
+// CHECK: ![[M0]] = !{!"cp1:2:c3:c4:5"}
+// CHECK: ![[M1]] = !{!"cp1:2:c3"}
+// CHECK: ![[M2]] = !{!"sysreg"}
diff --git a/test/CodeGen/builtins-arm64.c b/test/CodeGen/builtins-arm64.c
index 16e22d771fcac..20eb2abc94765 100644
--- a/test/CodeGen/builtins-arm64.c
+++ b/test/CodeGen/builtins-arm64.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple arm64-apple-ios -O3 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple arm64-unknown-linux -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
 
 void f0(void *a, void *b) {
 	__clear_cache(a,b);
@@ -7,7 +7,7 @@ void f0(void *a, void *b) {
 
 void *tp (void) {
   return __builtin_thread_pointer ();
-// CHECK: call {{.*}} @llvm.aarch64.thread.pointer()
+// CHECK: call {{.*}} @llvm.thread.pointer()
 }
 
 // CHECK: call {{.*}} @llvm.aarch64.rbit.i32(i32 %a)
@@ -50,7 +50,7 @@ void prefetch() {
 }
 
 unsigned rsr() {
-  // CHECK: [[V0:[%A-Za-z0-9.]+]] = {{.*}} call i64 @llvm.read_register.i64(metadata ![[M0:[0-9]]])
+  // CHECK: [[V0:[%A-Za-z0-9.]+]] = call i64 @llvm.read_register.i64(metadata ![[M0:[0-9]]])
   // CHECK-NEXT: trunc i64 [[V0]] to i32
   return __builtin_arm_rsr("1:2:3:4:5");
 }
@@ -61,7 +61,7 @@ unsigned long rsr64() {
 }
 
 void *rsrp() {
-  // CHECK: [[V0:[%A-Za-z0-9.]+]] = {{.*}} call i64 @llvm.read_register.i64(metadata ![[M0:[0-9]]])
+  // CHECK: [[V0:[%A-Za-z0-9.]+]] = call i64 @llvm.read_register.i64(metadata ![[M0:[0-9]]])
   // CHECK-NEXT: inttoptr i64 [[V0]] to i8*
   return __builtin_arm_rsrp("1:2:3:4:5");
 }
diff --git a/test/CodeGen/builtins-hexagon.c b/test/CodeGen/builtins-hexagon.c
new file mode 100644
index 0000000000000..e2eda2afafd5d
--- /dev/null
+++ b/test/CodeGen/builtins-hexagon.c
@@ -0,0 +1,2965 @@
+// REQUIRES: hexagon-registered-target
+// RUN: %clang_cc1 -triple hexagon-unknown-elf -emit-llvm %s -o - | FileCheck %s
+
+void foo() {
+  int v16 __attribute__((__vector_size__(64)));
+  int v32 __attribute__((__vector_size__(128)));
+  int v64 __attribute__((__vector_size__(256)));
+
+  // The circ/brev intrinsics do not have _HEXAGON_ in the name.
+  __builtin_brev_ldb(0, 0, 0);
+  // CHECK: @llvm.hexagon.brev.ldb
+  __builtin_brev_ldd(0, 0, 0);
+  // CHECK: @llvm.hexagon.brev.ldd
+  __builtin_brev_ldh(0, 0, 0);
+  // CHECK: @llvm.hexagon.brev.ldh
+  __builtin_brev_ldub(0, 0, 0);
+  // CHECK: @llvm.hexagon.brev.ldub
+  __builtin_brev_lduh(0, 0, 0);
+  // CHECK: @llvm.hexagon.brev.lduh
+  __builtin_brev_ldw(0, 0, 0);
+  // CHECK: @llvm.hexagon.brev.ldw
+  __builtin_brev_stb(0, 0, 0);
+  // CHECK: @llvm.hexagon.brev.stb
+  __builtin_brev_std(0, 0LL, 0);
+  // CHECK: @llvm.hexagon.brev.std
+  __builtin_brev_sth(0, 0, 0);
+  // CHECK: @llvm.hexagon.brev.sth
+  __builtin_brev_sthhi(0, 0, 0);
+  // CHECK: @llvm.hexagon.brev.sthhi
+  __builtin_brev_stw(0, 0, 0);
+  // CHECK: @llvm.hexagon.brev.stw
+  __builtin_circ_ldb(0, 0, 0, 0);
+  // CHECK: llvm.hexagon.circ.ldb
+  __builtin_circ_ldd(0, 0, 0, 0);
+  // CHECK: llvm.hexagon.circ.ldd
+  __builtin_circ_ldh(0, 0, 0, 0);
+  // CHECK: llvm.hexagon.circ.ldh
+  __builtin_circ_ldub(0, 0, 0, 0);
+  // CHECK: llvm.hexagon.circ.ldub
+  __builtin_circ_lduh(0, 0, 0, 0);
+  // CHECK: llvm.hexagon.circ.lduh
+  __builtin_circ_ldw(0, 0, 0, 0);
+  // CHECK: llvm.hexagon.circ.ldw
+  __builtin_circ_stb(0, 0, 0, 0);
+  // CHECK: llvm.hexagon.circ.stb
+  __builtin_circ_std(0, 0LL, 0, 0);
+  // CHECK: llvm.hexagon.circ.std
+  __builtin_circ_sth(0, 0, 0, 0);
+  // CHECK: llvm.hexagon.circ.sth
+  __builtin_circ_sthhi(0, 0, 0, 0);
+  // CHECK: llvm.hexagon.circ.sthhi
+  __builtin_circ_stw(0, 0, 0, 0);
+  // CHECK: llvm.hexagon.circ.stw
+
+  __builtin_HEXAGON_A2_abs(0);
+  // CHECK: @llvm.hexagon.A2.abs
+  __builtin_HEXAGON_A2_absp(0);
+  // CHECK: @llvm.hexagon.A2.absp
+  __builtin_HEXAGON_A2_abssat(0);
+  // CHECK: @llvm.hexagon.A2.abssat
+  __builtin_HEXAGON_A2_add(0, 0);
+  // CHECK: @llvm.hexagon.A2.add
+  __builtin_HEXAGON_A2_addh_h16_hh(0, 0);
+  // CHECK: @llvm.hexagon.A2.addh.h16.hh
+  __builtin_HEXAGON_A2_addh_h16_hl(0, 0);
+  // CHECK: @llvm.hexagon.A2.addh.h16.hl
+  __builtin_HEXAGON_A2_addh_h16_lh(0, 0);
+  // CHECK: @llvm.hexagon.A2.addh.h16.lh
+  __builtin_HEXAGON_A2_addh_h16_ll(0, 0);
+  // CHECK: @llvm.hexagon.A2.addh.h16.ll
+  __builtin_HEXAGON_A2_addh_h16_sat_hh(0, 0);
+  // CHECK: @llvm.hexagon.A2.addh.h16.sat.hh
+  __builtin_HEXAGON_A2_addh_h16_sat_hl(0, 0);
+  // CHECK: @llvm.hexagon.A2.addh.h16.sat.hl
+  __builtin_HEXAGON_A2_addh_h16_sat_lh(0, 0);
+  // CHECK: @llvm.hexagon.A2.addh.h16.sat.lh
+  __builtin_HEXAGON_A2_addh_h16_sat_ll(0, 0);
+  // CHECK: @llvm.hexagon.A2.addh.h16.sat.ll
+  __builtin_HEXAGON_A2_addh_l16_hl(0, 0);
+  // CHECK: @llvm.hexagon.A2.addh.l16.hl
+  __builtin_HEXAGON_A2_addh_l16_ll(0, 0);
+  // CHECK: @llvm.hexagon.A2.addh.l16.ll
+  __builtin_HEXAGON_A2_addh_l16_sat_hl(0, 0);
+  // CHECK: @llvm.hexagon.A2.addh.l16.sat.hl
+  __builtin_HEXAGON_A2_addh_l16_sat_ll(0, 0);
+  // CHECK: @llvm.hexagon.A2.addh.l16.sat.ll
+  __builtin_HEXAGON_A2_addi(0, 0);
+  // CHECK: @llvm.hexagon.A2.addi
+  __builtin_HEXAGON_A2_addp(0, 0);
+  // CHECK: @llvm.hexagon.A2.addp
+  __builtin_HEXAGON_A2_addpsat(0, 0);
+  // CHECK: @llvm.hexagon.A2.addpsat
+  __builtin_HEXAGON_A2_addsat(0, 0);
+  // CHECK: @llvm.hexagon.A2.addsat
+  __builtin_HEXAGON_A2_addsp(0, 0);
+  // CHECK: @llvm.hexagon.A2.addsp
+  __builtin_HEXAGON_A2_and(0, 0);
+  // CHECK: @llvm.hexagon.A2.and
+  __builtin_HEXAGON_A2_andir(0, 0);
+  // CHECK: @llvm.hexagon.A2.andir
+  __builtin_HEXAGON_A2_andp(0, 0);
+  // CHECK: @llvm.hexagon.A2.andp
+  __builtin_HEXAGON_A2_aslh(0);
+  // CHECK: @llvm.hexagon.A2.aslh
+  __builtin_HEXAGON_A2_asrh(0);
+  // CHECK: @llvm.hexagon.A2.asrh
+  __builtin_HEXAGON_A2_combine_hh(0, 0);
+  // CHECK: @llvm.hexagon.A2.combine.hh
+  __builtin_HEXAGON_A2_combine_hl(0, 0);
+  // CHECK: @llvm.hexagon.A2.combine.hl
+  __builtin_HEXAGON_A2_combineii(0, 0);
+  // CHECK: @llvm.hexagon.A2.combineii
+  __builtin_HEXAGON_A2_combine_lh(0, 0);
+  // CHECK: @llvm.hexagon.A2.combine.lh
+  __builtin_HEXAGON_A2_combine_ll(0, 0);
+  // CHECK: @llvm.hexagon.A2.combine.ll
+  __builtin_HEXAGON_A2_combinew(0, 0);
+  // CHECK: @llvm.hexagon.A2.combinew
+  __builtin_HEXAGON_A2_max(0, 0);
+  // CHECK: @llvm.hexagon.A2.max
+  __builtin_HEXAGON_A2_maxp(0, 0);
+  // CHECK: @llvm.hexagon.A2.maxp
+  __builtin_HEXAGON_A2_maxu(0, 0);
+  // CHECK: @llvm.hexagon.A2.maxu
+  __builtin_HEXAGON_A2_maxup(0, 0);
+  // CHECK: @llvm.hexagon.A2.maxup
+  __builtin_HEXAGON_A2_min(0, 0);
+  // CHECK: @llvm.hexagon.A2.min
+  __builtin_HEXAGON_A2_minp(0, 0);
+  // CHECK: @llvm.hexagon.A2.minp
+  __builtin_HEXAGON_A2_minu(0, 0);
+  // CHECK: @llvm.hexagon.A2.minu
+  __builtin_HEXAGON_A2_minup(0, 0);
+  // CHECK: @llvm.hexagon.A2.minup
+  __builtin_HEXAGON_A2_neg(0);
+  // CHECK: @llvm.hexagon.A2.neg
+  __builtin_HEXAGON_A2_negp(0);
+  // CHECK: @llvm.hexagon.A2.negp
+  __builtin_HEXAGON_A2_negsat(0);
+  // CHECK: @llvm.hexagon.A2.negsat
+  __builtin_HEXAGON_A2_not(0);
+  // CHECK: @llvm.hexagon.A2.not
+  __builtin_HEXAGON_A2_notp(0);
+  // CHECK: @llvm.hexagon.A2.notp
+  __builtin_HEXAGON_A2_or(0, 0);
+  // CHECK: @llvm.hexagon.A2.or
+  __builtin_HEXAGON_A2_orir(0, 0);
+  // CHECK: @llvm.hexagon.A2.orir
+  __builtin_HEXAGON_A2_orp(0, 0);
+  // CHECK: @llvm.hexagon.A2.orp
+  __builtin_HEXAGON_A2_roundsat(0);
+  // CHECK: @llvm.hexagon.A2.roundsat
+  __builtin_HEXAGON_A2_sat(0);
+  // CHECK: @llvm.hexagon.A2.sat
+  __builtin_HEXAGON_A2_satb(0);
+  // CHECK: @llvm.hexagon.A2.satb
+  __builtin_HEXAGON_A2_sath(0);
+  // CHECK: @llvm.hexagon.A2.sath
+  __builtin_HEXAGON_A2_satub(0);
+  // CHECK: @llvm.hexagon.A2.satub
+  __builtin_HEXAGON_A2_satuh(0);
+  // CHECK: @llvm.hexagon.A2.satuh
+  __builtin_HEXAGON_A2_sub(0, 0);
+  // CHECK: @llvm.hexagon.A2.sub
+  __builtin_HEXAGON_A2_subh_h16_hh(0, 0);
+  // CHECK: @llvm.hexagon.A2.subh.h16.hh
+  __builtin_HEXAGON_A2_subh_h16_hl(0, 0);
+  // CHECK: @llvm.hexagon.A2.subh.h16.hl
+  __builtin_HEXAGON_A2_subh_h16_lh(0, 0);
+  // CHECK: @llvm.hexagon.A2.subh.h16.lh
+  __builtin_HEXAGON_A2_subh_h16_ll(0, 0);
+  // CHECK: @llvm.hexagon.A2.subh.h16.ll
+  __builtin_HEXAGON_A2_subh_h16_sat_hh(0, 0);
+  // CHECK: @llvm.hexagon.A2.subh.h16.sat.hh
+  __builtin_HEXAGON_A2_subh_h16_sat_hl(0, 0);
+  // CHECK: @llvm.hexagon.A2.subh.h16.sat.hl
+  __builtin_HEXAGON_A2_subh_h16_sat_lh(0, 0);
+  // CHECK: @llvm.hexagon.A2.subh.h16.sat.lh
+  __builtin_HEXAGON_A2_subh_h16_sat_ll(0, 0);
+  // CHECK: @llvm.hexagon.A2.subh.h16.sat.ll
+  __builtin_HEXAGON_A2_subh_l16_hl(0, 0);
+  // CHECK: @llvm.hexagon.A2.subh.l16.hl
+  __builtin_HEXAGON_A2_subh_l16_ll(0, 0);
+  // CHECK: @llvm.hexagon.A2.subh.l16.ll
+  __builtin_HEXAGON_A2_subh_l16_sat_hl(0, 0);
+  // CHECK: @llvm.hexagon.A2.subh.l16.sat.hl
+  __builtin_HEXAGON_A2_subh_l16_sat_ll(0, 0);
+  // CHECK: @llvm.hexagon.A2.subh.l16.sat.ll
+  __builtin_HEXAGON_A2_subp(0, 0);
+  // CHECK: @llvm.hexagon.A2.subp
+  __builtin_HEXAGON_A2_subri(0, 0);
+  // CHECK: @llvm.hexagon.A2.subri
+  __builtin_HEXAGON_A2_subsat(0, 0);
+  // CHECK: @llvm.hexagon.A2.subsat
+  __builtin_HEXAGON_A2_svaddh(0, 0);
+  // CHECK: @llvm.hexagon.A2.svaddh
+  __builtin_HEXAGON_A2_svaddhs(0, 0);
+  // CHECK: @llvm.hexagon.A2.svaddhs
+  __builtin_HEXAGON_A2_svadduhs(0, 0);
+  // CHECK: @llvm.hexagon.A2.svadduhs
+  __builtin_HEXAGON_A2_svavgh(0, 0);
+  // CHECK: @llvm.hexagon.A2.svavgh
+  __builtin_HEXAGON_A2_svavghs(0, 0);
+  // CHECK: @llvm.hexagon.A2.svavghs
+  __builtin_HEXAGON_A2_svnavgh(0, 0);
+  // CHECK: @llvm.hexagon.A2.svnavgh
+  __builtin_HEXAGON_A2_svsubh(0, 0);
+  // CHECK: @llvm.hexagon.A2.svsubh
+  __builtin_HEXAGON_A2_svsubhs(0, 0);
+  // CHECK: @llvm.hexagon.A2.svsubhs
+  __builtin_HEXAGON_A2_svsubuhs(0, 0);
+  // CHECK: @llvm.hexagon.A2.svsubuhs
+  __builtin_HEXAGON_A2_swiz(0);
+  // CHECK: @llvm.hexagon.A2.swiz
+  __builtin_HEXAGON_A2_sxtb(0);
+  // CHECK: @llvm.hexagon.A2.sxtb
+  __builtin_HEXAGON_A2_sxth(0);
+  // CHECK: @llvm.hexagon.A2.sxth
+  __builtin_HEXAGON_A2_sxtw(0);
+  // CHECK: @llvm.hexagon.A2.sxtw
+  __builtin_HEXAGON_A2_tfr(0);
+  // CHECK: @llvm.hexagon.A2.tfr
+  __builtin_HEXAGON_A2_tfrih(0, 0);
+  // CHECK: @llvm.hexagon.A2.tfrih
+  __builtin_HEXAGON_A2_tfril(0, 0);
+  // CHECK: @llvm.hexagon.A2.tfril
+  __builtin_HEXAGON_A2_tfrp(0);
+  // CHECK: @llvm.hexagon.A2.tfrp
+  __builtin_HEXAGON_A2_tfrpi(0);
+  // CHECK: @llvm.hexagon.A2.tfrpi
+  __builtin_HEXAGON_A2_tfrsi(0);
+  // CHECK: @llvm.hexagon.A2.tfrsi
+  __builtin_HEXAGON_A2_vabsh(0);
+  // CHECK: @llvm.hexagon.A2.vabsh
+  __builtin_HEXAGON_A2_vabshsat(0);
+  // CHECK: @llvm.hexagon.A2.vabshsat
+  __builtin_HEXAGON_A2_vabsw(0);
+  // CHECK: @llvm.hexagon.A2.vabsw
+  __builtin_HEXAGON_A2_vabswsat(0);
+  // CHECK: @llvm.hexagon.A2.vabswsat
+  __builtin_HEXAGON_A2_vaddb_map(0, 0);
+  // CHECK: @llvm.hexagon.A2.vaddb.map
+  __builtin_HEXAGON_A2_vaddh(0, 0);
+  // CHECK: @llvm.hexagon.A2.vaddh
+  __builtin_HEXAGON_A2_vaddhs(0, 0);
+  // CHECK: @llvm.hexagon.A2.vaddhs
+  __builtin_HEXAGON_A2_vaddub(0, 0);
+  // CHECK: @llvm.hexagon.A2.vaddub
+  __builtin_HEXAGON_A2_vaddubs(0, 0);
+  // CHECK: @llvm.hexagon.A2.vaddubs
+  __builtin_HEXAGON_A2_vadduhs(0, 0);
+  // CHECK: @llvm.hexagon.A2.vadduhs
+  __builtin_HEXAGON_A2_vaddw(0, 0);
+  // CHECK: @llvm.hexagon.A2.vaddw
+  __builtin_HEXAGON_A2_vaddws(0, 0);
+  // CHECK: @llvm.hexagon.A2.vaddws
+  __builtin_HEXAGON_A2_vavgh(0, 0);
+  // CHECK: @llvm.hexagon.A2.vavgh
+  __builtin_HEXAGON_A2_vavghcr(0, 0);
+  // CHECK: @llvm.hexagon.A2.vavghcr
+  __builtin_HEXAGON_A2_vavghr(0, 0);
+  // CHECK: @llvm.hexagon.A2.vavghr
+  __builtin_HEXAGON_A2_vavgub(0, 0);
+  // CHECK: @llvm.hexagon.A2.vavgub
+  __builtin_HEXAGON_A2_vavgubr(0, 0);
+  // CHECK: @llvm.hexagon.A2.vavgubr
+  __builtin_HEXAGON_A2_vavguh(0, 0);
+  // CHECK: @llvm.hexagon.A2.vavguh
+  __builtin_HEXAGON_A2_vavguhr(0, 0);
+  // CHECK: @llvm.hexagon.A2.vavguhr
+  __builtin_HEXAGON_A2_vavguw(0, 0);
+  // CHECK: @llvm.hexagon.A2.vavguw
+  __builtin_HEXAGON_A2_vavguwr(0, 0);
+  // CHECK: @llvm.hexagon.A2.vavguwr
+  __builtin_HEXAGON_A2_vavgw(0, 0);
+  // CHECK: @llvm.hexagon.A2.vavgw
+  __builtin_HEXAGON_A2_vavgwcr(0, 0);
+  // CHECK: @llvm.hexagon.A2.vavgwcr
+  __builtin_HEXAGON_A2_vavgwr(0, 0);
+  // CHECK: @llvm.hexagon.A2.vavgwr
+  __builtin_HEXAGON_A2_vcmpbeq(0, 0);
+  // CHECK: @llvm.hexagon.A2.vcmpbeq
+  __builtin_HEXAGON_A2_vcmpbgtu(0, 0);
+  // CHECK: @llvm.hexagon.A2.vcmpbgtu
+  __builtin_HEXAGON_A2_vcmpheq(0, 0);
+  // CHECK: @llvm.hexagon.A2.vcmpheq
+  __builtin_HEXAGON_A2_vcmphgt(0, 0);
+  // CHECK: @llvm.hexagon.A2.vcmphgt
+  __builtin_HEXAGON_A2_vcmphgtu(0, 0);
+  // CHECK: @llvm.hexagon.A2.vcmphgtu
+  __builtin_HEXAGON_A2_vcmpweq(0, 0);
+  // CHECK: @llvm.hexagon.A2.vcmpweq
+  __builtin_HEXAGON_A2_vcmpwgt(0, 0);
+  // CHECK: @llvm.hexagon.A2.vcmpwgt
+  __builtin_HEXAGON_A2_vcmpwgtu(0, 0);
+  // CHECK: @llvm.hexagon.A2.vcmpwgtu
+  __builtin_HEXAGON_A2_vconj(0);
+  // CHECK: @llvm.hexagon.A2.vconj
+  __builtin_HEXAGON_A2_vmaxb(0, 0);
+  // CHECK: @llvm.hexagon.A2.vmaxb
+  __builtin_HEXAGON_A2_vmaxh(0, 0);
+  // CHECK: @llvm.hexagon.A2.vmaxh
+  __builtin_HEXAGON_A2_vmaxub(0, 0);
+  // CHECK: @llvm.hexagon.A2.vmaxub
+  __builtin_HEXAGON_A2_vmaxuh(0, 0);
+  // CHECK: @llvm.hexagon.A2.vmaxuh
+  __builtin_HEXAGON_A2_vmaxuw(0, 0);
+  // CHECK: @llvm.hexagon.A2.vmaxuw
+  __builtin_HEXAGON_A2_vmaxw(0, 0);
+  // CHECK: @llvm.hexagon.A2.vmaxw
+  __builtin_HEXAGON_A2_vminb(0, 0);
+  // CHECK: @llvm.hexagon.A2.vminb
+  __builtin_HEXAGON_A2_vminh(0, 0);
+  // CHECK: @llvm.hexagon.A2.vminh
+  __builtin_HEXAGON_A2_vminub(0, 0);
+  // CHECK: @llvm.hexagon.A2.vminub
+  __builtin_HEXAGON_A2_vminuh(0, 0);
+  // CHECK: @llvm.hexagon.A2.vminuh
+  __builtin_HEXAGON_A2_vminuw(0, 0);
+  // CHECK: @llvm.hexagon.A2.vminuw
+  __builtin_HEXAGON_A2_vminw(0, 0);
+  // CHECK: @llvm.hexagon.A2.vminw
+  __builtin_HEXAGON_A2_vnavgh(0, 0);
+  // CHECK: @llvm.hexagon.A2.vnavgh
+  __builtin_HEXAGON_A2_vnavghcr(0, 0);
+  // CHECK: @llvm.hexagon.A2.vnavghcr
+  __builtin_HEXAGON_A2_vnavghr(0, 0);
+  // CHECK: @llvm.hexagon.A2.vnavghr
+  __builtin_HEXAGON_A2_vnavgw(0, 0);
+  // CHECK: @llvm.hexagon.A2.vnavgw
+  __builtin_HEXAGON_A2_vnavgwcr(0, 0);
+  // CHECK: @llvm.hexagon.A2.vnavgwcr
+  __builtin_HEXAGON_A2_vnavgwr(0, 0);
+  // CHECK: @llvm.hexagon.A2.vnavgwr
+  __builtin_HEXAGON_A2_vraddub(0, 0);
+  // CHECK: @llvm.hexagon.A2.vraddub
+  __builtin_HEXAGON_A2_vraddub_acc(0, 0, 0);
+  // CHECK: @llvm.hexagon.A2.vraddub.acc
+  __builtin_HEXAGON_A2_vrsadub(0, 0);
+  // CHECK: @llvm.hexagon.A2.vrsadub
+  __builtin_HEXAGON_A2_vrsadub_acc(0, 0, 0);
+  // CHECK: @llvm.hexagon.A2.vrsadub.acc
+  __builtin_HEXAGON_A2_vsubb_map(0, 0);
+  // CHECK: @llvm.hexagon.A2.vsubb.map
+  __builtin_HEXAGON_A2_vsubh(0, 0);
+  // CHECK: @llvm.hexagon.A2.vsubh
+  __builtin_HEXAGON_A2_vsubhs(0, 0);
+  // CHECK: @llvm.hexagon.A2.vsubhs
+  __builtin_HEXAGON_A2_vsubub(0, 0);
+  // CHECK: @llvm.hexagon.A2.vsubub
+  __builtin_HEXAGON_A2_vsububs(0, 0);
+  // CHECK: @llvm.hexagon.A2.vsububs
+  __builtin_HEXAGON_A2_vsubuhs(0, 0);
+  // CHECK: @llvm.hexagon.A2.vsubuhs
+  __builtin_HEXAGON_A2_vsubw(0, 0);
+  // CHECK: @llvm.hexagon.A2.vsubw
+  __builtin_HEXAGON_A2_vsubws(0, 0);
+  // CHECK: @llvm.hexagon.A2.vsubws
+  __builtin_HEXAGON_A2_xor(0, 0);
+  // CHECK: @llvm.hexagon.A2.xor
+  __builtin_HEXAGON_A2_xorp(0, 0);
+  // CHECK: @llvm.hexagon.A2.xorp
+  __builtin_HEXAGON_A2_zxtb(0);
+  // CHECK: @llvm.hexagon.A2.zxtb
+  __builtin_HEXAGON_A2_zxth(0);
+  // CHECK: @llvm.hexagon.A2.zxth
+  __builtin_HEXAGON_A4_andn(0, 0);
+  // CHECK: @llvm.hexagon.A4.andn
+  __builtin_HEXAGON_A4_andnp(0, 0);
+  // CHECK: @llvm.hexagon.A4.andnp
+  __builtin_HEXAGON_A4_bitsplit(0, 0);
+  // CHECK: @llvm.hexagon.A4.bitsplit
+  __builtin_HEXAGON_A4_bitspliti(0, 0);
+  // CHECK: @llvm.hexagon.A4.bitspliti
+  __builtin_HEXAGON_A4_boundscheck(0, 0);
+  // CHECK: @llvm.hexagon.A4.boundscheck
+  __builtin_HEXAGON_A4_cmpbeq(0, 0);
+  // CHECK: @llvm.hexagon.A4.cmpbeq
+  __builtin_HEXAGON_A4_cmpbeqi(0, 0);
+  // CHECK: @llvm.hexagon.A4.cmpbeqi
+  __builtin_HEXAGON_A4_cmpbgt(0, 0);
+  // CHECK: @llvm.hexagon.A4.cmpbgt
+  __builtin_HEXAGON_A4_cmpbgti(0, 0);
+  // CHECK: @llvm.hexagon.A4.cmpbgti
+  __builtin_HEXAGON_A4_cmpbgtu(0, 0);
+  // CHECK: @llvm.hexagon.A4.cmpbgtu
+  __builtin_HEXAGON_A4_cmpbgtui(0, 0);
+  // CHECK: @llvm.hexagon.A4.cmpbgtui
+  __builtin_HEXAGON_A4_cmpheq(0, 0);
+  // CHECK: @llvm.hexagon.A4.cmpheq
+  __builtin_HEXAGON_A4_cmpheqi(0, 0);
+  // CHECK: @llvm.hexagon.A4.cmpheqi
+  __builtin_HEXAGON_A4_cmphgt(0, 0);
+  // CHECK: @llvm.hexagon.A4.cmphgt
+  __builtin_HEXAGON_A4_cmphgti(0, 0);
+  // CHECK: @llvm.hexagon.A4.cmphgti
+  __builtin_HEXAGON_A4_cmphgtu(0, 0);
+  // CHECK: @llvm.hexagon.A4.cmphgtu
+  __builtin_HEXAGON_A4_cmphgtui(0, 0);
+  // CHECK: @llvm.hexagon.A4.cmphgtui
+  __builtin_HEXAGON_A4_combineir(0, 0);
+  // CHECK: @llvm.hexagon.A4.combineir
+  __builtin_HEXAGON_A4_combineri(0, 0);
+  // CHECK: @llvm.hexagon.A4.combineri
+  __builtin_HEXAGON_A4_cround_ri(0, 0);
+  // CHECK: @llvm.hexagon.A4.cround.ri
+  __builtin_HEXAGON_A4_cround_rr(0, 0);
+  // CHECK: @llvm.hexagon.A4.cround.rr
+  __builtin_HEXAGON_A4_modwrapu(0, 0);
+  // CHECK: @llvm.hexagon.A4.modwrapu
+  __builtin_HEXAGON_A4_orn(0, 0);
+  // CHECK: @llvm.hexagon.A4.orn
+  __builtin_HEXAGON_A4_ornp(0, 0);
+  // CHECK: @llvm.hexagon.A4.ornp
+  __builtin_HEXAGON_A4_rcmpeq(0, 0);
+  // CHECK: @llvm.hexagon.A4.rcmpeq
+  __builtin_HEXAGON_A4_rcmpeqi(0, 0);
+  // CHECK: @llvm.hexagon.A4.rcmpeqi
+  __builtin_HEXAGON_A4_rcmpneq(0, 0);
+  // CHECK: @llvm.hexagon.A4.rcmpneq
+  __builtin_HEXAGON_A4_rcmpneqi(0, 0);
+  // CHECK: @llvm.hexagon.A4.rcmpneqi
+  __builtin_HEXAGON_A4_round_ri(0, 0);
+  // CHECK: @llvm.hexagon.A4.round.ri
+  __builtin_HEXAGON_A4_round_ri_sat(0, 0);
+  // CHECK: @llvm.hexagon.A4.round.ri.sat
+  __builtin_HEXAGON_A4_round_rr(0, 0);
+  // CHECK: @llvm.hexagon.A4.round.rr
+  __builtin_HEXAGON_A4_round_rr_sat(0, 0);
+  // CHECK: @llvm.hexagon.A4.round.rr.sat
+  __builtin_HEXAGON_A4_tlbmatch(0, 0);
+  // CHECK: @llvm.hexagon.A4.tlbmatch
+  __builtin_HEXAGON_A4_vcmpbeq_any(0, 0);
+  // CHECK: @llvm.hexagon.A4.vcmpbeq.any
+  __builtin_HEXAGON_A4_vcmpbeqi(0, 0);
+  // CHECK: @llvm.hexagon.A4.vcmpbeqi
+  __builtin_HEXAGON_A4_vcmpbgt(0, 0);
+  // CHECK: @llvm.hexagon.A4.vcmpbgt
+  __builtin_HEXAGON_A4_vcmpbgti(0, 0);
+  // CHECK: @llvm.hexagon.A4.vcmpbgti
+  __builtin_HEXAGON_A4_vcmpbgtui(0, 0);
+  // CHECK: @llvm.hexagon.A4.vcmpbgtui
+  __builtin_HEXAGON_A4_vcmpheqi(0, 0);
+  // CHECK: @llvm.hexagon.A4.vcmpheqi
+  __builtin_HEXAGON_A4_vcmphgti(0, 0);
+  // CHECK: @llvm.hexagon.A4.vcmphgti
+  __builtin_HEXAGON_A4_vcmphgtui(0, 0);
+  // CHECK: @llvm.hexagon.A4.vcmphgtui
+  __builtin_HEXAGON_A4_vcmpweqi(0, 0);
+  // CHECK: @llvm.hexagon.A4.vcmpweqi
+  __builtin_HEXAGON_A4_vcmpwgti(0, 0);
+  // CHECK: @llvm.hexagon.A4.vcmpwgti
+  __builtin_HEXAGON_A4_vcmpwgtui(0, 0);
+  // CHECK: @llvm.hexagon.A4.vcmpwgtui
+  __builtin_HEXAGON_A4_vrmaxh(0, 0, 0);
+  // CHECK: @llvm.hexagon.A4.vrmaxh
+  __builtin_HEXAGON_A4_vrmaxuh(0, 0, 0);
+  // CHECK: @llvm.hexagon.A4.vrmaxuh
+  __builtin_HEXAGON_A4_vrmaxuw(0, 0, 0);
+  // CHECK: @llvm.hexagon.A4.vrmaxuw
+  __builtin_HEXAGON_A4_vrmaxw(0, 0, 0);
+  // CHECK: @llvm.hexagon.A4.vrmaxw
+  __builtin_HEXAGON_A4_vrminh(0, 0, 0);
+  // CHECK: @llvm.hexagon.A4.vrminh
+  __builtin_HEXAGON_A4_vrminuh(0, 0, 0);
+  // CHECK: @llvm.hexagon.A4.vrminuh
+  __builtin_HEXAGON_A4_vrminuw(0, 0, 0);
+  // CHECK: @llvm.hexagon.A4.vrminuw
+  __builtin_HEXAGON_A4_vrminw(0, 0, 0);
+  // CHECK: @llvm.hexagon.A4.vrminw
+  __builtin_HEXAGON_A5_vaddhubs(0, 0);
+  // CHECK: @llvm.hexagon.A5.vaddhubs
+  __builtin_HEXAGON_C2_all8(0);
+  // CHECK: @llvm.hexagon.C2.all8
+  __builtin_HEXAGON_C2_and(0, 0);
+  // CHECK: @llvm.hexagon.C2.and
+  __builtin_HEXAGON_C2_andn(0, 0);
+  // CHECK: @llvm.hexagon.C2.andn
+  __builtin_HEXAGON_C2_any8(0);
+  // CHECK: @llvm.hexagon.C2.any8
+  __builtin_HEXAGON_C2_bitsclr(0, 0);
+  // CHECK: @llvm.hexagon.C2.bitsclr
+  __builtin_HEXAGON_C2_bitsclri(0, 0);
+  // CHECK: @llvm.hexagon.C2.bitsclri
+  __builtin_HEXAGON_C2_bitsset(0, 0);
+  // CHECK: @llvm.hexagon.C2.bitsset
+  __builtin_HEXAGON_C2_cmpeq(0, 0);
+  // CHECK: @llvm.hexagon.C2.cmpeq
+  __builtin_HEXAGON_C2_cmpeqi(0, 0);
+  // CHECK: @llvm.hexagon.C2.cmpeqi
+  __builtin_HEXAGON_C2_cmpeqp(0, 0);
+  // CHECK: @llvm.hexagon.C2.cmpeqp
+  __builtin_HEXAGON_C2_cmpgei(0, 0);
+  // CHECK: @llvm.hexagon.C2.cmpgei
+  __builtin_HEXAGON_C2_cmpgeui(0, 0);
+  // CHECK: @llvm.hexagon.C2.cmpgeui
+  __builtin_HEXAGON_C2_cmpgt(0, 0);
+  // CHECK: @llvm.hexagon.C2.cmpgt
+  __builtin_HEXAGON_C2_cmpgti(0, 0);
+  // CHECK: @llvm.hexagon.C2.cmpgti
+  __builtin_HEXAGON_C2_cmpgtp(0, 0);
+  // CHECK: @llvm.hexagon.C2.cmpgtp
+  __builtin_HEXAGON_C2_cmpgtu(0, 0);
+  // CHECK: @llvm.hexagon.C2.cmpgtu
+  __builtin_HEXAGON_C2_cmpgtui(0, 0);
+  // CHECK: @llvm.hexagon.C2.cmpgtui
+  __builtin_HEXAGON_C2_cmpgtup(0, 0);
+  // CHECK: @llvm.hexagon.C2.cmpgtup
+  __builtin_HEXAGON_C2_cmplt(0, 0);
+  // CHECK: @llvm.hexagon.C2.cmplt
+  __builtin_HEXAGON_C2_cmpltu(0, 0);
+  // CHECK: @llvm.hexagon.C2.cmpltu
+  __builtin_HEXAGON_C2_mask(0);
+  // CHECK: @llvm.hexagon.C2.mask
+  __builtin_HEXAGON_C2_mux(0, 0, 0);
+  // CHECK: @llvm.hexagon.C2.mux
+  __builtin_HEXAGON_C2_muxii(0, 0, 0);
+  // CHECK: @llvm.hexagon.C2.muxii
+  __builtin_HEXAGON_C2_muxir(0, 0, 0);
+  // CHECK: @llvm.hexagon.C2.muxir
+  __builtin_HEXAGON_C2_muxri(0, 0, 0);
+  // CHECK: @llvm.hexagon.C2.muxri
+  __builtin_HEXAGON_C2_not(0);
+  // CHECK: @llvm.hexagon.C2.not
+  __builtin_HEXAGON_C2_or (0, 0);
+  // CHECK: @llvm.hexagon.C2.or 
+  __builtin_HEXAGON_C2_orn(0, 0);
+  // CHECK: @llvm.hexagon.C2.orn
+  __builtin_HEXAGON_C2_pxfer_map(0);
+  // CHECK: @llvm.hexagon.C2.pxfer.map
+  __builtin_HEXAGON_C2_tfrpr(0);
+  // CHECK: @llvm.hexagon.C2.tfrpr
+  __builtin_HEXAGON_C2_tfrrp(0);
+  // CHECK: @llvm.hexagon.C2.tfrrp
+  __builtin_HEXAGON_C2_vitpack(0, 0);
+  // CHECK: @llvm.hexagon.C2.vitpack
+  __builtin_HEXAGON_C2_vmux(0, 0, 0);
+  // CHECK: @llvm.hexagon.C2.vmux
+  __builtin_HEXAGON_C2_xor(0, 0);
+  // CHECK: @llvm.hexagon.C2.xor
+  __builtin_HEXAGON_C4_and_and(0, 0, 0);
+  // CHECK: @llvm.hexagon.C4.and.and
+  __builtin_HEXAGON_C4_and_andn(0, 0, 0);
+  // CHECK: @llvm.hexagon.C4.and.andn
+  __builtin_HEXAGON_C4_and_or(0, 0, 0);
+  // CHECK: @llvm.hexagon.C4.and.or
+  __builtin_HEXAGON_C4_and_orn(0, 0, 0);
+  // CHECK: @llvm.hexagon.C4.and.orn
+  __builtin_HEXAGON_C4_cmplte(0, 0);
+  // CHECK: @llvm.hexagon.C4.cmplte
+  __builtin_HEXAGON_C4_cmpltei(0, 0);
+  // CHECK: @llvm.hexagon.C4.cmpltei
+  __builtin_HEXAGON_C4_cmplteu(0, 0);
+  // CHECK: @llvm.hexagon.C4.cmplteu
+  __builtin_HEXAGON_C4_cmplteui(0, 0);
+  // CHECK: @llvm.hexagon.C4.cmplteui
+  __builtin_HEXAGON_C4_cmpneq(0, 0);
+  // CHECK: @llvm.hexagon.C4.cmpneq
+  __builtin_HEXAGON_C4_cmpneqi(0, 0);
+  // CHECK: @llvm.hexagon.C4.cmpneqi
+  __builtin_HEXAGON_C4_fastcorner9(0, 0);
+  // CHECK: @llvm.hexagon.C4.fastcorner9
+  __builtin_HEXAGON_C4_fastcorner9_not(0, 0);
+  // CHECK: @llvm.hexagon.C4.fastcorner9.not
+  __builtin_HEXAGON_C4_nbitsclr(0, 0);
+  // CHECK: @llvm.hexagon.C4.nbitsclr
+  __builtin_HEXAGON_C4_nbitsclri(0, 0);
+  // CHECK: @llvm.hexagon.C4.nbitsclri
+  __builtin_HEXAGON_C4_nbitsset(0, 0);
+  // CHECK: @llvm.hexagon.C4.nbitsset
+  __builtin_HEXAGON_C4_or_and(0, 0, 0);
+  // CHECK: @llvm.hexagon.C4.or.and
+  __builtin_HEXAGON_C4_or_andn(0, 0, 0);
+  // CHECK: @llvm.hexagon.C4.or.andn
+  __builtin_HEXAGON_C4_or_or(0, 0, 0);
+  // CHECK: @llvm.hexagon.C4.or.or
+  __builtin_HEXAGON_C4_or_orn(0, 0, 0);
+  // CHECK: @llvm.hexagon.C4.or.orn
+  __builtin_HEXAGON_F2_conv_d2df(0);
+  // CHECK: @llvm.hexagon.F2.conv.d2df
+  __builtin_HEXAGON_F2_conv_d2sf(0);
+  // CHECK: @llvm.hexagon.F2.conv.d2sf
+  __builtin_HEXAGON_F2_conv_df2d(0.0);
+  // CHECK: @llvm.hexagon.F2.conv.df2d
+  __builtin_HEXAGON_F2_conv_df2d_chop(0.0);
+  // CHECK: @llvm.hexagon.F2.conv.df2d.chop
+  __builtin_HEXAGON_F2_conv_df2sf(0.0);
+  // CHECK: @llvm.hexagon.F2.conv.df2sf
+  __builtin_HEXAGON_F2_conv_df2ud(0.0);
+  // CHECK: @llvm.hexagon.F2.conv.df2ud
+  __builtin_HEXAGON_F2_conv_df2ud_chop(0.0);
+  // CHECK: @llvm.hexagon.F2.conv.df2ud.chop
+  __builtin_HEXAGON_F2_conv_df2uw(0.0);
+  // CHECK: @llvm.hexagon.F2.conv.df2uw
+  __builtin_HEXAGON_F2_conv_df2uw_chop(0.0);
+  // CHECK: @llvm.hexagon.F2.conv.df2uw.chop
+  __builtin_HEXAGON_F2_conv_df2w(0.0);
+  // CHECK: @llvm.hexagon.F2.conv.df2w
+  __builtin_HEXAGON_F2_conv_df2w_chop(0.0);
+  // CHECK: @llvm.hexagon.F2.conv.df2w.chop
+  __builtin_HEXAGON_F2_conv_sf2d(0.0f);
+  // CHECK: @llvm.hexagon.F2.conv.sf2d
+  __builtin_HEXAGON_F2_conv_sf2d_chop(0.0f);
+  // CHECK: @llvm.hexagon.F2.conv.sf2d.chop
+  __builtin_HEXAGON_F2_conv_sf2df(0.0f);
+  // CHECK: @llvm.hexagon.F2.conv.sf2df
+  __builtin_HEXAGON_F2_conv_sf2ud(0.0f);
+  // CHECK: @llvm.hexagon.F2.conv.sf2ud
+  __builtin_HEXAGON_F2_conv_sf2ud_chop(0.0f);
+  // CHECK: @llvm.hexagon.F2.conv.sf2ud.chop
+  __builtin_HEXAGON_F2_conv_sf2uw(0.0f);
+  // CHECK: @llvm.hexagon.F2.conv.sf2uw
+  __builtin_HEXAGON_F2_conv_sf2uw_chop(0.0f);
+  // CHECK: @llvm.hexagon.F2.conv.sf2uw.chop
+  __builtin_HEXAGON_F2_conv_sf2w(0.0f);
+  // CHECK: @llvm.hexagon.F2.conv.sf2w
+  __builtin_HEXAGON_F2_conv_sf2w_chop(0.0f);
+  // CHECK: @llvm.hexagon.F2.conv.sf2w.chop
+  __builtin_HEXAGON_F2_conv_ud2df(0);
+  // CHECK: @llvm.hexagon.F2.conv.ud2df
+  __builtin_HEXAGON_F2_conv_ud2sf(0);
+  // CHECK: @llvm.hexagon.F2.conv.ud2sf
+  __builtin_HEXAGON_F2_conv_uw2df(0);
+  // CHECK: @llvm.hexagon.F2.conv.uw2df
+  __builtin_HEXAGON_F2_conv_uw2sf(0);
+  // CHECK: @llvm.hexagon.F2.conv.uw2sf
+  __builtin_HEXAGON_F2_conv_w2df(0);
+  // CHECK: @llvm.hexagon.F2.conv.w2df
+  __builtin_HEXAGON_F2_conv_w2sf(0);
+  // CHECK: @llvm.hexagon.F2.conv.w2sf
+  __builtin_HEXAGON_F2_dfclass(0.0, 0);
+  // CHECK: @llvm.hexagon.F2.dfclass
+  __builtin_HEXAGON_F2_dfcmpeq(0.0, 0.0);
+  // CHECK: @llvm.hexagon.F2.dfcmpeq
+  __builtin_HEXAGON_F2_dfcmpge(0.0, 0.0);
+  // CHECK: @llvm.hexagon.F2.dfcmpge
+  __builtin_HEXAGON_F2_dfcmpgt(0.0, 0.0);
+  // CHECK: @llvm.hexagon.F2.dfcmpgt
+  __builtin_HEXAGON_F2_dfcmpuo(0.0, 0.0);
+  // CHECK: @llvm.hexagon.F2.dfcmpuo
+  __builtin_HEXAGON_F2_dfimm_n(0);
+  // CHECK: @llvm.hexagon.F2.dfimm.n
+  __builtin_HEXAGON_F2_dfimm_p(0);
+  // CHECK: @llvm.hexagon.F2.dfimm.p
+  __builtin_HEXAGON_F2_sfadd(0.0f, 0.0f);
+  // CHECK: @llvm.hexagon.F2.sfadd
+  __builtin_HEXAGON_F2_sfclass(0.0f, 0);
+  // CHECK: @llvm.hexagon.F2.sfclass
+  __builtin_HEXAGON_F2_sfcmpeq(0.0f, 0.0f);
+  // CHECK: @llvm.hexagon.F2.sfcmpeq
+  __builtin_HEXAGON_F2_sfcmpge(0.0f, 0.0f);
+  // CHECK: @llvm.hexagon.F2.sfcmpge
+  __builtin_HEXAGON_F2_sfcmpgt(0.0f, 0.0f);
+  // CHECK: @llvm.hexagon.F2.sfcmpgt
+  __builtin_HEXAGON_F2_sfcmpuo(0.0f, 0.0f);
+  // CHECK: @llvm.hexagon.F2.sfcmpuo
+  __builtin_HEXAGON_F2_sffixupd(0.0f, 0.0f);
+  // CHECK: @llvm.hexagon.F2.sffixupd
+  __builtin_HEXAGON_F2_sffixupn(0.0f, 0.0f);
+  // CHECK: @llvm.hexagon.F2.sffixupn
+  __builtin_HEXAGON_F2_sffixupr(0.0f);
+  // CHECK: @llvm.hexagon.F2.sffixupr
+  __builtin_HEXAGON_F2_sffma(0.0f, 0.0f, 0.0f);
+  // CHECK: @llvm.hexagon.F2.sffma
+  __builtin_HEXAGON_F2_sffma_lib(0.0f, 0.0f, 0.0f);
+  // CHECK: @llvm.hexagon.F2.sffma.lib
+  __builtin_HEXAGON_F2_sffma_sc(0.0f, 0.0f, 0.0f, 0);
+  // CHECK: @llvm.hexagon.F2.sffma.sc
+  __builtin_HEXAGON_F2_sffms(0.0f, 0.0f, 0.0f);
+  // CHECK: @llvm.hexagon.F2.sffms
+  __builtin_HEXAGON_F2_sffms_lib(0.0f, 0.0f, 0.0f);
+  // CHECK: @llvm.hexagon.F2.sffms.lib
+  __builtin_HEXAGON_F2_sfimm_n(0);
+  // CHECK: @llvm.hexagon.F2.sfimm.n
+  __builtin_HEXAGON_F2_sfimm_p(0);
+  // CHECK: @llvm.hexagon.F2.sfimm.p
+  __builtin_HEXAGON_F2_sfmax(0.0f, 0.0f);
+  // CHECK: @llvm.hexagon.F2.sfmax
+  __builtin_HEXAGON_F2_sfmin(0.0f, 0.0f);
+  // CHECK: @llvm.hexagon.F2.sfmin
+  __builtin_HEXAGON_F2_sfmpy(0.0f, 0.0f);
+  // CHECK: @llvm.hexagon.F2.sfmpy
+  __builtin_HEXAGON_F2_sfsub(0.0f, 0.0f);
+  // CHECK: @llvm.hexagon.F2.sfsub
+  __builtin_HEXAGON_M2_acci(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.acci
+  __builtin_HEXAGON_M2_accii(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.accii
+  __builtin_HEXAGON_M2_cmaci_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.cmaci.s0
+  __builtin_HEXAGON_M2_cmacr_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.cmacr.s0
+  __builtin_HEXAGON_M2_cmacsc_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.cmacsc.s0
+  __builtin_HEXAGON_M2_cmacsc_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.cmacsc.s1
+  __builtin_HEXAGON_M2_cmacs_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.cmacs.s0
+  __builtin_HEXAGON_M2_cmacs_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.cmacs.s1
+  __builtin_HEXAGON_M2_cmpyi_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.cmpyi.s0
+  __builtin_HEXAGON_M2_cmpyr_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.cmpyr.s0
+  __builtin_HEXAGON_M2_cmpyrsc_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.cmpyrsc.s0
+  __builtin_HEXAGON_M2_cmpyrsc_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.cmpyrsc.s1
+  __builtin_HEXAGON_M2_cmpyrs_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.cmpyrs.s0
+  __builtin_HEXAGON_M2_cmpyrs_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.cmpyrs.s1
+  __builtin_HEXAGON_M2_cmpysc_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.cmpysc.s0
+  __builtin_HEXAGON_M2_cmpysc_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.cmpysc.s1
+  __builtin_HEXAGON_M2_cmpys_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.cmpys.s0
+  __builtin_HEXAGON_M2_cmpys_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.cmpys.s1
+  __builtin_HEXAGON_M2_cnacsc_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.cnacsc.s0
+  __builtin_HEXAGON_M2_cnacsc_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.cnacsc.s1
+  __builtin_HEXAGON_M2_cnacs_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.cnacs.s0
+  __builtin_HEXAGON_M2_cnacs_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.cnacs.s1
+  __builtin_HEXAGON_M2_dpmpyss_acc_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.dpmpyss.acc.s0
+  __builtin_HEXAGON_M2_dpmpyss_nac_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.dpmpyss.nac.s0
+  __builtin_HEXAGON_M2_dpmpyss_rnd_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.dpmpyss.rnd.s0
+  __builtin_HEXAGON_M2_dpmpyss_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.dpmpyss.s0
+  __builtin_HEXAGON_M2_dpmpyuu_acc_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.dpmpyuu.acc.s0
+  __builtin_HEXAGON_M2_dpmpyuu_nac_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.dpmpyuu.nac.s0
+  __builtin_HEXAGON_M2_dpmpyuu_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.dpmpyuu.s0
+  __builtin_HEXAGON_M2_hmmpyh_rs1(0, 0);
+  // CHECK: @llvm.hexagon.M2.hmmpyh.rs1
+  __builtin_HEXAGON_M2_hmmpyh_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.hmmpyh.s1
+  __builtin_HEXAGON_M2_hmmpyl_rs1(0, 0);
+  // CHECK: @llvm.hexagon.M2.hmmpyl.rs1
+  __builtin_HEXAGON_M2_hmmpyl_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.hmmpyl.s1
+  __builtin_HEXAGON_M2_maci(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.maci
+  __builtin_HEXAGON_M2_macsin(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.macsin
+  __builtin_HEXAGON_M2_macsip(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.macsip
+  __builtin_HEXAGON_M2_mmachs_rs0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mmachs.rs0
+  __builtin_HEXAGON_M2_mmachs_rs1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mmachs.rs1
+  __builtin_HEXAGON_M2_mmachs_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mmachs.s0
+  __builtin_HEXAGON_M2_mmachs_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mmachs.s1
+  __builtin_HEXAGON_M2_mmacls_rs0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mmacls.rs0
+  __builtin_HEXAGON_M2_mmacls_rs1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mmacls.rs1
+  __builtin_HEXAGON_M2_mmacls_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mmacls.s0
+  __builtin_HEXAGON_M2_mmacls_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mmacls.s1
+  __builtin_HEXAGON_M2_mmacuhs_rs0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mmacuhs.rs0
+  __builtin_HEXAGON_M2_mmacuhs_rs1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mmacuhs.rs1
+  __builtin_HEXAGON_M2_mmacuhs_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mmacuhs.s0
+  __builtin_HEXAGON_M2_mmacuhs_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mmacuhs.s1
+  __builtin_HEXAGON_M2_mmaculs_rs0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mmaculs.rs0
+  __builtin_HEXAGON_M2_mmaculs_rs1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mmaculs.rs1
+  __builtin_HEXAGON_M2_mmaculs_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mmaculs.s0
+  __builtin_HEXAGON_M2_mmaculs_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mmaculs.s1
+  __builtin_HEXAGON_M2_mmpyh_rs0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mmpyh.rs0
+  __builtin_HEXAGON_M2_mmpyh_rs1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mmpyh.rs1
+  __builtin_HEXAGON_M2_mmpyh_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mmpyh.s0
+  __builtin_HEXAGON_M2_mmpyh_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mmpyh.s1
+  __builtin_HEXAGON_M2_mmpyl_rs0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mmpyl.rs0
+  __builtin_HEXAGON_M2_mmpyl_rs1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mmpyl.rs1
+  __builtin_HEXAGON_M2_mmpyl_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mmpyl.s0
+  __builtin_HEXAGON_M2_mmpyl_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mmpyl.s1
+  __builtin_HEXAGON_M2_mmpyuh_rs0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mmpyuh.rs0
+  __builtin_HEXAGON_M2_mmpyuh_rs1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mmpyuh.rs1
+  __builtin_HEXAGON_M2_mmpyuh_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mmpyuh.s0
+  __builtin_HEXAGON_M2_mmpyuh_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mmpyuh.s1
+  __builtin_HEXAGON_M2_mmpyul_rs0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mmpyul.rs0
+  __builtin_HEXAGON_M2_mmpyul_rs1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mmpyul.rs1
+  __builtin_HEXAGON_M2_mmpyul_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mmpyul.s0
+  __builtin_HEXAGON_M2_mmpyul_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mmpyul.s1
+  __builtin_HEXAGON_M2_mpy_acc_hh_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.acc.hh.s0
+  __builtin_HEXAGON_M2_mpy_acc_hh_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.acc.hh.s1
+  __builtin_HEXAGON_M2_mpy_acc_hl_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.acc.hl.s0
+  __builtin_HEXAGON_M2_mpy_acc_hl_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.acc.hl.s1
+  __builtin_HEXAGON_M2_mpy_acc_lh_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.acc.lh.s0
+  __builtin_HEXAGON_M2_mpy_acc_lh_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.acc.lh.s1
+  __builtin_HEXAGON_M2_mpy_acc_ll_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.acc.ll.s0
+  __builtin_HEXAGON_M2_mpy_acc_ll_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.acc.ll.s1
+  __builtin_HEXAGON_M2_mpy_acc_sat_hh_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.acc.sat.hh.s0
+  __builtin_HEXAGON_M2_mpy_acc_sat_hh_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.acc.sat.hh.s1
+  __builtin_HEXAGON_M2_mpy_acc_sat_hl_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.acc.sat.hl.s0
+  __builtin_HEXAGON_M2_mpy_acc_sat_hl_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.acc.sat.hl.s1
+  __builtin_HEXAGON_M2_mpy_acc_sat_lh_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.acc.sat.lh.s0
+  __builtin_HEXAGON_M2_mpy_acc_sat_lh_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.acc.sat.lh.s1
+  __builtin_HEXAGON_M2_mpy_acc_sat_ll_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.acc.sat.ll.s0
+  __builtin_HEXAGON_M2_mpy_acc_sat_ll_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.acc.sat.ll.s1
+  __builtin_HEXAGON_M2_mpyd_acc_hh_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.acc.hh.s0
+  __builtin_HEXAGON_M2_mpyd_acc_hh_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.acc.hh.s1
+  __builtin_HEXAGON_M2_mpyd_acc_hl_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.acc.hl.s0
+  __builtin_HEXAGON_M2_mpyd_acc_hl_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.acc.hl.s1
+  __builtin_HEXAGON_M2_mpyd_acc_lh_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.acc.lh.s0
+  __builtin_HEXAGON_M2_mpyd_acc_lh_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.acc.lh.s1
+  __builtin_HEXAGON_M2_mpyd_acc_ll_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.acc.ll.s0
+  __builtin_HEXAGON_M2_mpyd_acc_ll_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.acc.ll.s1
+  __builtin_HEXAGON_M2_mpyd_hh_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.hh.s0
+  __builtin_HEXAGON_M2_mpyd_hh_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.hh.s1
+  __builtin_HEXAGON_M2_mpyd_hl_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.hl.s0
+  __builtin_HEXAGON_M2_mpyd_hl_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.hl.s1
+  __builtin_HEXAGON_M2_mpyd_lh_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.lh.s0
+  __builtin_HEXAGON_M2_mpyd_lh_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.lh.s1
+  __builtin_HEXAGON_M2_mpyd_ll_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.ll.s0
+  __builtin_HEXAGON_M2_mpyd_ll_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.ll.s1
+  __builtin_HEXAGON_M2_mpyd_nac_hh_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.nac.hh.s0
+  __builtin_HEXAGON_M2_mpyd_nac_hh_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.nac.hh.s1
+  __builtin_HEXAGON_M2_mpyd_nac_hl_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.nac.hl.s0
+  __builtin_HEXAGON_M2_mpyd_nac_hl_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.nac.hl.s1
+  __builtin_HEXAGON_M2_mpyd_nac_lh_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.nac.lh.s0
+  __builtin_HEXAGON_M2_mpyd_nac_lh_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.nac.lh.s1
+  __builtin_HEXAGON_M2_mpyd_nac_ll_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.nac.ll.s0
+  __builtin_HEXAGON_M2_mpyd_nac_ll_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.nac.ll.s1
+  __builtin_HEXAGON_M2_mpyd_rnd_hh_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.rnd.hh.s0
+  __builtin_HEXAGON_M2_mpyd_rnd_hh_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.rnd.hh.s1
+  __builtin_HEXAGON_M2_mpyd_rnd_hl_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.rnd.hl.s0
+  __builtin_HEXAGON_M2_mpyd_rnd_hl_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.rnd.hl.s1
+  __builtin_HEXAGON_M2_mpyd_rnd_lh_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.rnd.lh.s0
+  __builtin_HEXAGON_M2_mpyd_rnd_lh_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.rnd.lh.s1
+  __builtin_HEXAGON_M2_mpyd_rnd_ll_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.rnd.ll.s0
+  __builtin_HEXAGON_M2_mpyd_rnd_ll_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyd.rnd.ll.s1
+  __builtin_HEXAGON_M2_mpy_hh_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.hh.s0
+  __builtin_HEXAGON_M2_mpy_hh_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.hh.s1
+  __builtin_HEXAGON_M2_mpy_hl_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.hl.s0
+  __builtin_HEXAGON_M2_mpy_hl_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.hl.s1
+  __builtin_HEXAGON_M2_mpyi(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyi
+  __builtin_HEXAGON_M2_mpy_lh_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.lh.s0
+  __builtin_HEXAGON_M2_mpy_lh_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.lh.s1
+  __builtin_HEXAGON_M2_mpy_ll_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.ll.s0
+  __builtin_HEXAGON_M2_mpy_ll_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.ll.s1
+  __builtin_HEXAGON_M2_mpy_nac_hh_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.nac.hh.s0
+  __builtin_HEXAGON_M2_mpy_nac_hh_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.nac.hh.s1
+  __builtin_HEXAGON_M2_mpy_nac_hl_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.nac.hl.s0
+  __builtin_HEXAGON_M2_mpy_nac_hl_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.nac.hl.s1
+  __builtin_HEXAGON_M2_mpy_nac_lh_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.nac.lh.s0
+  __builtin_HEXAGON_M2_mpy_nac_lh_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.nac.lh.s1
+  __builtin_HEXAGON_M2_mpy_nac_ll_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.nac.ll.s0
+  __builtin_HEXAGON_M2_mpy_nac_ll_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.nac.ll.s1
+  __builtin_HEXAGON_M2_mpy_nac_sat_hh_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.nac.sat.hh.s0
+  __builtin_HEXAGON_M2_mpy_nac_sat_hh_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.nac.sat.hh.s1
+  __builtin_HEXAGON_M2_mpy_nac_sat_hl_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.nac.sat.hl.s0
+  __builtin_HEXAGON_M2_mpy_nac_sat_hl_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.nac.sat.hl.s1
+  __builtin_HEXAGON_M2_mpy_nac_sat_lh_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.nac.sat.lh.s0
+  __builtin_HEXAGON_M2_mpy_nac_sat_lh_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.nac.sat.lh.s1
+  __builtin_HEXAGON_M2_mpy_nac_sat_ll_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.nac.sat.ll.s0
+  __builtin_HEXAGON_M2_mpy_nac_sat_ll_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.nac.sat.ll.s1
+  __builtin_HEXAGON_M2_mpy_rnd_hh_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.rnd.hh.s0
+  __builtin_HEXAGON_M2_mpy_rnd_hh_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.rnd.hh.s1
+  __builtin_HEXAGON_M2_mpy_rnd_hl_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.rnd.hl.s0
+  __builtin_HEXAGON_M2_mpy_rnd_hl_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.rnd.hl.s1
+  __builtin_HEXAGON_M2_mpy_rnd_lh_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.rnd.lh.s0
+  __builtin_HEXAGON_M2_mpy_rnd_lh_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.rnd.lh.s1
+  __builtin_HEXAGON_M2_mpy_rnd_ll_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.rnd.ll.s0
+  __builtin_HEXAGON_M2_mpy_rnd_ll_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.rnd.ll.s1
+  __builtin_HEXAGON_M2_mpy_sat_hh_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.sat.hh.s0
+  __builtin_HEXAGON_M2_mpy_sat_hh_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.sat.hh.s1
+  __builtin_HEXAGON_M2_mpy_sat_hl_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.sat.hl.s0
+  __builtin_HEXAGON_M2_mpy_sat_hl_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.sat.hl.s1
+  __builtin_HEXAGON_M2_mpy_sat_lh_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.sat.lh.s0
+  __builtin_HEXAGON_M2_mpy_sat_lh_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.sat.lh.s1
+  __builtin_HEXAGON_M2_mpy_sat_ll_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.sat.ll.s0
+  __builtin_HEXAGON_M2_mpy_sat_ll_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.sat.ll.s1
+  __builtin_HEXAGON_M2_mpy_sat_rnd_hh_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.sat.rnd.hh.s0
+  __builtin_HEXAGON_M2_mpy_sat_rnd_hh_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.sat.rnd.hh.s1
+  __builtin_HEXAGON_M2_mpy_sat_rnd_hl_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.sat.rnd.hl.s0
+  __builtin_HEXAGON_M2_mpy_sat_rnd_hl_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.sat.rnd.hl.s1
+  __builtin_HEXAGON_M2_mpy_sat_rnd_lh_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.sat.rnd.lh.s0
+  __builtin_HEXAGON_M2_mpy_sat_rnd_lh_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.sat.rnd.lh.s1
+  __builtin_HEXAGON_M2_mpy_sat_rnd_ll_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.sat.rnd.ll.s0
+  __builtin_HEXAGON_M2_mpy_sat_rnd_ll_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.sat.rnd.ll.s1
+  __builtin_HEXAGON_M2_mpysmi(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpysmi
+  __builtin_HEXAGON_M2_mpysu_up(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpysu.up
+  __builtin_HEXAGON_M2_mpyu_acc_hh_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyu.acc.hh.s0
+  __builtin_HEXAGON_M2_mpyu_acc_hh_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyu.acc.hh.s1
+  __builtin_HEXAGON_M2_mpyu_acc_hl_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyu.acc.hl.s0
+  __builtin_HEXAGON_M2_mpyu_acc_hl_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyu.acc.hl.s1
+  __builtin_HEXAGON_M2_mpyu_acc_lh_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyu.acc.lh.s0
+  __builtin_HEXAGON_M2_mpyu_acc_lh_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyu.acc.lh.s1
+  __builtin_HEXAGON_M2_mpyu_acc_ll_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyu.acc.ll.s0
+  __builtin_HEXAGON_M2_mpyu_acc_ll_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyu.acc.ll.s1
+  __builtin_HEXAGON_M2_mpyud_acc_hh_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyud.acc.hh.s0
+  __builtin_HEXAGON_M2_mpyud_acc_hh_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyud.acc.hh.s1
+  __builtin_HEXAGON_M2_mpyud_acc_hl_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyud.acc.hl.s0
+  __builtin_HEXAGON_M2_mpyud_acc_hl_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyud.acc.hl.s1
+  __builtin_HEXAGON_M2_mpyud_acc_lh_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyud.acc.lh.s0
+  __builtin_HEXAGON_M2_mpyud_acc_lh_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyud.acc.lh.s1
+  __builtin_HEXAGON_M2_mpyud_acc_ll_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyud.acc.ll.s0
+  __builtin_HEXAGON_M2_mpyud_acc_ll_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyud.acc.ll.s1
+  __builtin_HEXAGON_M2_mpyud_hh_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyud.hh.s0
+  __builtin_HEXAGON_M2_mpyud_hh_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyud.hh.s1
+  __builtin_HEXAGON_M2_mpyud_hl_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyud.hl.s0
+  __builtin_HEXAGON_M2_mpyud_hl_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyud.hl.s1
+  __builtin_HEXAGON_M2_mpyud_lh_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyud.lh.s0
+  __builtin_HEXAGON_M2_mpyud_lh_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyud.lh.s1
+  __builtin_HEXAGON_M2_mpyud_ll_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyud.ll.s0
+  __builtin_HEXAGON_M2_mpyud_ll_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyud.ll.s1
+  __builtin_HEXAGON_M2_mpyud_nac_hh_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyud.nac.hh.s0
+  __builtin_HEXAGON_M2_mpyud_nac_hh_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyud.nac.hh.s1
+  __builtin_HEXAGON_M2_mpyud_nac_hl_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyud.nac.hl.s0
+  __builtin_HEXAGON_M2_mpyud_nac_hl_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyud.nac.hl.s1
+  __builtin_HEXAGON_M2_mpyud_nac_lh_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyud.nac.lh.s0
+  __builtin_HEXAGON_M2_mpyud_nac_lh_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyud.nac.lh.s1
+  __builtin_HEXAGON_M2_mpyud_nac_ll_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyud.nac.ll.s0
+  __builtin_HEXAGON_M2_mpyud_nac_ll_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyud.nac.ll.s1
+  __builtin_HEXAGON_M2_mpyu_hh_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyu.hh.s0
+  __builtin_HEXAGON_M2_mpyu_hh_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyu.hh.s1
+  __builtin_HEXAGON_M2_mpyu_hl_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyu.hl.s0
+  __builtin_HEXAGON_M2_mpyu_hl_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyu.hl.s1
+  __builtin_HEXAGON_M2_mpyui(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyui
+  __builtin_HEXAGON_M2_mpyu_lh_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyu.lh.s0
+  __builtin_HEXAGON_M2_mpyu_lh_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyu.lh.s1
+  __builtin_HEXAGON_M2_mpyu_ll_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyu.ll.s0
+  __builtin_HEXAGON_M2_mpyu_ll_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyu.ll.s1
+  __builtin_HEXAGON_M2_mpyu_nac_hh_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyu.nac.hh.s0
+  __builtin_HEXAGON_M2_mpyu_nac_hh_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyu.nac.hh.s1
+  __builtin_HEXAGON_M2_mpyu_nac_hl_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyu.nac.hl.s0
+  __builtin_HEXAGON_M2_mpyu_nac_hl_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyu.nac.hl.s1
+  __builtin_HEXAGON_M2_mpyu_nac_lh_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyu.nac.lh.s0
+  __builtin_HEXAGON_M2_mpyu_nac_lh_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyu.nac.lh.s1
+  __builtin_HEXAGON_M2_mpyu_nac_ll_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyu.nac.ll.s0
+  __builtin_HEXAGON_M2_mpyu_nac_ll_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyu.nac.ll.s1
+  __builtin_HEXAGON_M2_mpy_up(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.up
+  __builtin_HEXAGON_M2_mpy_up_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.up.s1
+  __builtin_HEXAGON_M2_mpy_up_s1_sat(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpy.up.s1.sat
+  __builtin_HEXAGON_M2_mpyu_up(0, 0);
+  // CHECK: @llvm.hexagon.M2.mpyu.up
+  __builtin_HEXAGON_M2_nacci(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.nacci
+  __builtin_HEXAGON_M2_naccii(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.naccii
+  __builtin_HEXAGON_M2_subacc(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.subacc
+  __builtin_HEXAGON_M2_vabsdiffh(0, 0);
+  // CHECK: @llvm.hexagon.M2.vabsdiffh
+  __builtin_HEXAGON_M2_vabsdiffw(0, 0);
+  // CHECK: @llvm.hexagon.M2.vabsdiffw
+  __builtin_HEXAGON_M2_vcmac_s0_sat_i(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.vcmac.s0.sat.i
+  __builtin_HEXAGON_M2_vcmac_s0_sat_r(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.vcmac.s0.sat.r
+  __builtin_HEXAGON_M2_vcmpy_s0_sat_i(0, 0);
+  // CHECK: @llvm.hexagon.M2.vcmpy.s0.sat.i
+  __builtin_HEXAGON_M2_vcmpy_s0_sat_r(0, 0);
+  // CHECK: @llvm.hexagon.M2.vcmpy.s0.sat.r
+  __builtin_HEXAGON_M2_vcmpy_s1_sat_i(0, 0);
+  // CHECK: @llvm.hexagon.M2.vcmpy.s1.sat.i
+  __builtin_HEXAGON_M2_vcmpy_s1_sat_r(0, 0);
+  // CHECK: @llvm.hexagon.M2.vcmpy.s1.sat.r
+  __builtin_HEXAGON_M2_vdmacs_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.vdmacs.s0
+  __builtin_HEXAGON_M2_vdmacs_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.vdmacs.s1
+  __builtin_HEXAGON_M2_vdmpyrs_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.vdmpyrs.s0
+  __builtin_HEXAGON_M2_vdmpyrs_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.vdmpyrs.s1
+  __builtin_HEXAGON_M2_vdmpys_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.vdmpys.s0
+  __builtin_HEXAGON_M2_vdmpys_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.vdmpys.s1
+  __builtin_HEXAGON_M2_vmac2(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.vmac2
+  __builtin_HEXAGON_M2_vmac2es(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.vmac2es
+  __builtin_HEXAGON_M2_vmac2es_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.vmac2es.s0
+  __builtin_HEXAGON_M2_vmac2es_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.vmac2es.s1
+  __builtin_HEXAGON_M2_vmac2s_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.vmac2s.s0
+  __builtin_HEXAGON_M2_vmac2s_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.vmac2s.s1
+  __builtin_HEXAGON_M2_vmac2su_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.vmac2su.s0
+  __builtin_HEXAGON_M2_vmac2su_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.vmac2su.s1
+  __builtin_HEXAGON_M2_vmpy2es_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.vmpy2es.s0
+  __builtin_HEXAGON_M2_vmpy2es_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.vmpy2es.s1
+  __builtin_HEXAGON_M2_vmpy2s_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.vmpy2s.s0
+  __builtin_HEXAGON_M2_vmpy2s_s0pack(0, 0);
+  // CHECK: @llvm.hexagon.M2.vmpy2s.s0pack
+  __builtin_HEXAGON_M2_vmpy2s_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.vmpy2s.s1
+  __builtin_HEXAGON_M2_vmpy2s_s1pack(0, 0);
+  // CHECK: @llvm.hexagon.M2.vmpy2s.s1pack
+  __builtin_HEXAGON_M2_vmpy2su_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.vmpy2su.s0
+  __builtin_HEXAGON_M2_vmpy2su_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.vmpy2su.s1
+  __builtin_HEXAGON_M2_vraddh(0, 0);
+  // CHECK: @llvm.hexagon.M2.vraddh
+  __builtin_HEXAGON_M2_vradduh(0, 0);
+  // CHECK: @llvm.hexagon.M2.vradduh
+  __builtin_HEXAGON_M2_vrcmaci_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.vrcmaci.s0
+  __builtin_HEXAGON_M2_vrcmaci_s0c(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.vrcmaci.s0c
+  __builtin_HEXAGON_M2_vrcmacr_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.vrcmacr.s0
+  __builtin_HEXAGON_M2_vrcmacr_s0c(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.vrcmacr.s0c
+  __builtin_HEXAGON_M2_vrcmpyi_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.vrcmpyi.s0
+  __builtin_HEXAGON_M2_vrcmpyi_s0c(0, 0);
+  // CHECK: @llvm.hexagon.M2.vrcmpyi.s0c
+  __builtin_HEXAGON_M2_vrcmpyr_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.vrcmpyr.s0
+  __builtin_HEXAGON_M2_vrcmpyr_s0c(0, 0);
+  // CHECK: @llvm.hexagon.M2.vrcmpyr.s0c
+  __builtin_HEXAGON_M2_vrcmpys_acc_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.vrcmpys.acc.s1
+  __builtin_HEXAGON_M2_vrcmpys_s1(0, 0);
+  // CHECK: @llvm.hexagon.M2.vrcmpys.s1
+  __builtin_HEXAGON_M2_vrcmpys_s1rp(0, 0);
+  // CHECK: @llvm.hexagon.M2.vrcmpys.s1rp
+  __builtin_HEXAGON_M2_vrmac_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.vrmac.s0
+  __builtin_HEXAGON_M2_vrmpy_s0(0, 0);
+  // CHECK: @llvm.hexagon.M2.vrmpy.s0
+  __builtin_HEXAGON_M2_xor_xacc(0, 0, 0);
+  // CHECK: @llvm.hexagon.M2.xor.xacc
+  __builtin_HEXAGON_M4_and_and(0, 0, 0);
+  // CHECK: @llvm.hexagon.M4.and.and
+  __builtin_HEXAGON_M4_and_andn(0, 0, 0);
+  // CHECK: @llvm.hexagon.M4.and.andn
+  __builtin_HEXAGON_M4_and_or(0, 0, 0);
+  // CHECK: @llvm.hexagon.M4.and.or
+  __builtin_HEXAGON_M4_and_xor(0, 0, 0);
+  // CHECK: @llvm.hexagon.M4.and.xor
+  __builtin_HEXAGON_M4_cmpyi_wh(0, 0);
+  // CHECK: @llvm.hexagon.M4.cmpyi.wh
+  __builtin_HEXAGON_M4_cmpyi_whc(0, 0);
+  // CHECK: @llvm.hexagon.M4.cmpyi.whc
+  __builtin_HEXAGON_M4_cmpyr_wh(0, 0);
+  // CHECK: @llvm.hexagon.M4.cmpyr.wh
+  __builtin_HEXAGON_M4_cmpyr_whc(0, 0);
+  // CHECK: @llvm.hexagon.M4.cmpyr.whc
+  __builtin_HEXAGON_M4_mac_up_s1_sat(0, 0, 0);
+  // CHECK: @llvm.hexagon.M4.mac.up.s1.sat
+  __builtin_HEXAGON_M4_mpyri_addi(0, 0, 0);
+  // CHECK: @llvm.hexagon.M4.mpyri.addi
+  __builtin_HEXAGON_M4_mpyri_addr(0, 0, 0);
+  // CHECK: @llvm.hexagon.M4.mpyri.addr
+  __builtin_HEXAGON_M4_mpyri_addr_u2(0, 0, 0);
+  // CHECK: @llvm.hexagon.M4.mpyri.addr.u2
+  __builtin_HEXAGON_M4_mpyrr_addi(0, 0, 0);
+  // CHECK: @llvm.hexagon.M4.mpyrr.addi
+  __builtin_HEXAGON_M4_mpyrr_addr(0, 0, 0);
+  // CHECK: @llvm.hexagon.M4.mpyrr.addr
+  __builtin_HEXAGON_M4_nac_up_s1_sat(0, 0, 0);
+  // CHECK: @llvm.hexagon.M4.nac.up.s1.sat
+  __builtin_HEXAGON_M4_or_and(0, 0, 0);
+  // CHECK: @llvm.hexagon.M4.or.and
+  __builtin_HEXAGON_M4_or_andn(0, 0, 0);
+  // CHECK: @llvm.hexagon.M4.or.andn
+  __builtin_HEXAGON_M4_or_or(0, 0, 0);
+  // CHECK: @llvm.hexagon.M4.or.or
+  __builtin_HEXAGON_M4_or_xor(0, 0, 0);
+  // CHECK: @llvm.hexagon.M4.or.xor
+  __builtin_HEXAGON_M4_pmpyw(0, 0);
+  // CHECK: @llvm.hexagon.M4.pmpyw
+  __builtin_HEXAGON_M4_pmpyw_acc(0, 0, 0);
+  // CHECK: @llvm.hexagon.M4.pmpyw.acc
+  __builtin_HEXAGON_M4_vpmpyh(0, 0);
+  // CHECK: @llvm.hexagon.M4.vpmpyh
+  __builtin_HEXAGON_M4_vpmpyh_acc(0, 0, 0);
+  // CHECK: @llvm.hexagon.M4.vpmpyh.acc
+  __builtin_HEXAGON_M4_vrmpyeh_acc_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M4.vrmpyeh.acc.s0
+  __builtin_HEXAGON_M4_vrmpyeh_acc_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M4.vrmpyeh.acc.s1
+  __builtin_HEXAGON_M4_vrmpyeh_s0(0, 0);
+  // CHECK: @llvm.hexagon.M4.vrmpyeh.s0
+  __builtin_HEXAGON_M4_vrmpyeh_s1(0, 0);
+  // CHECK: @llvm.hexagon.M4.vrmpyeh.s1
+  __builtin_HEXAGON_M4_vrmpyoh_acc_s0(0, 0, 0);
+  // CHECK: @llvm.hexagon.M4.vrmpyoh.acc.s0
+  __builtin_HEXAGON_M4_vrmpyoh_acc_s1(0, 0, 0);
+  // CHECK: @llvm.hexagon.M4.vrmpyoh.acc.s1
+  __builtin_HEXAGON_M4_vrmpyoh_s0(0, 0);
+  // CHECK: @llvm.hexagon.M4.vrmpyoh.s0
+  __builtin_HEXAGON_M4_vrmpyoh_s1(0, 0);
+  // CHECK: @llvm.hexagon.M4.vrmpyoh.s1
+  __builtin_HEXAGON_M4_xor_and(0, 0, 0);
+  // CHECK: @llvm.hexagon.M4.xor.and
+  __builtin_HEXAGON_M4_xor_andn(0, 0, 0);
+  // CHECK: @llvm.hexagon.M4.xor.andn
+  __builtin_HEXAGON_M4_xor_or(0, 0, 0);
+  // CHECK: @llvm.hexagon.M4.xor.or
+  __builtin_HEXAGON_M4_xor_xacc(0, 0, 0);
+  // CHECK: @llvm.hexagon.M4.xor.xacc
+  __builtin_HEXAGON_M5_vdmacbsu(0, 0, 0);
+  // CHECK: @llvm.hexagon.M5.vdmacbsu
+  __builtin_HEXAGON_M5_vdmpybsu(0, 0);
+  // CHECK: @llvm.hexagon.M5.vdmpybsu
+  __builtin_HEXAGON_M5_vmacbsu(0, 0, 0);
+  // CHECK: @llvm.hexagon.M5.vmacbsu
+  __builtin_HEXAGON_M5_vmacbuu(0, 0, 0);
+  // CHECK: @llvm.hexagon.M5.vmacbuu
+  __builtin_HEXAGON_M5_vmpybsu(0, 0);
+  // CHECK: @llvm.hexagon.M5.vmpybsu
+  __builtin_HEXAGON_M5_vmpybuu(0, 0);
+  // CHECK: @llvm.hexagon.M5.vmpybuu
+  __builtin_HEXAGON_M5_vrmacbsu(0, 0, 0);
+  // CHECK: @llvm.hexagon.M5.vrmacbsu
+  __builtin_HEXAGON_M5_vrmacbuu(0, 0, 0);
+  // CHECK: @llvm.hexagon.M5.vrmacbuu
+  __builtin_HEXAGON_M5_vrmpybsu(0, 0);
+  // CHECK: @llvm.hexagon.M5.vrmpybsu
+  __builtin_HEXAGON_M5_vrmpybuu(0, 0);
+  // CHECK: @llvm.hexagon.M5.vrmpybuu
+  __builtin_HEXAGON_S2_addasl_rrri(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.addasl.rrri
+  __builtin_HEXAGON_S2_asl_i_p(0, 0);
+  // CHECK: @llvm.hexagon.S2.asl.i.p
+  __builtin_HEXAGON_S2_asl_i_p_acc(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asl.i.p.acc
+  __builtin_HEXAGON_S2_asl_i_p_and(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asl.i.p.and
+  __builtin_HEXAGON_S2_asl_i_p_nac(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asl.i.p.nac
+  __builtin_HEXAGON_S2_asl_i_p_or(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asl.i.p.or
+  __builtin_HEXAGON_S2_asl_i_p_xacc(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asl.i.p.xacc
+  __builtin_HEXAGON_S2_asl_i_r(0, 0);
+  // CHECK: @llvm.hexagon.S2.asl.i.r
+  __builtin_HEXAGON_S2_asl_i_r_acc(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asl.i.r.acc
+  __builtin_HEXAGON_S2_asl_i_r_and(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asl.i.r.and
+  __builtin_HEXAGON_S2_asl_i_r_nac(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asl.i.r.nac
+  __builtin_HEXAGON_S2_asl_i_r_or(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asl.i.r.or
+  __builtin_HEXAGON_S2_asl_i_r_sat(0, 0);
+  // CHECK: @llvm.hexagon.S2.asl.i.r.sat
+  __builtin_HEXAGON_S2_asl_i_r_xacc(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asl.i.r.xacc
+  __builtin_HEXAGON_S2_asl_i_vh(0, 0);
+  // CHECK: @llvm.hexagon.S2.asl.i.vh
+  __builtin_HEXAGON_S2_asl_i_vw(0, 0);
+  // CHECK: @llvm.hexagon.S2.asl.i.vw
+  __builtin_HEXAGON_S2_asl_r_p(0, 0);
+  // CHECK: @llvm.hexagon.S2.asl.r.p
+  __builtin_HEXAGON_S2_asl_r_p_acc(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asl.r.p.acc
+  __builtin_HEXAGON_S2_asl_r_p_and(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asl.r.p.and
+  __builtin_HEXAGON_S2_asl_r_p_nac(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asl.r.p.nac
+  __builtin_HEXAGON_S2_asl_r_p_or(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asl.r.p.or
+  __builtin_HEXAGON_S2_asl_r_p_xor(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asl.r.p.xor
+  __builtin_HEXAGON_S2_asl_r_r(0, 0);
+  // CHECK: @llvm.hexagon.S2.asl.r.r
+  __builtin_HEXAGON_S2_asl_r_r_acc(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asl.r.r.acc
+  __builtin_HEXAGON_S2_asl_r_r_and(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asl.r.r.and
+  __builtin_HEXAGON_S2_asl_r_r_nac(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asl.r.r.nac
+  __builtin_HEXAGON_S2_asl_r_r_or(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asl.r.r.or
+  __builtin_HEXAGON_S2_asl_r_r_sat(0, 0);
+  // CHECK: @llvm.hexagon.S2.asl.r.r.sat
+  __builtin_HEXAGON_S2_asl_r_vh(0, 0);
+  // CHECK: @llvm.hexagon.S2.asl.r.vh
+  __builtin_HEXAGON_S2_asl_r_vw(0, 0);
+  // CHECK: @llvm.hexagon.S2.asl.r.vw
+  __builtin_HEXAGON_S2_asr_i_p(0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.i.p
+  __builtin_HEXAGON_S2_asr_i_p_acc(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.i.p.acc
+  __builtin_HEXAGON_S2_asr_i_p_and(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.i.p.and
+  __builtin_HEXAGON_S2_asr_i_p_nac(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.i.p.nac
+  __builtin_HEXAGON_S2_asr_i_p_or(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.i.p.or
+  __builtin_HEXAGON_S2_asr_i_p_rnd(0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.i.p.rnd
+  __builtin_HEXAGON_S2_asr_i_p_rnd_goodsyntax(0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.i.p.rnd.goodsyntax
+  __builtin_HEXAGON_S2_asr_i_r(0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.i.r
+  __builtin_HEXAGON_S2_asr_i_r_acc(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.i.r.acc
+  __builtin_HEXAGON_S2_asr_i_r_and(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.i.r.and
+  __builtin_HEXAGON_S2_asr_i_r_nac(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.i.r.nac
+  __builtin_HEXAGON_S2_asr_i_r_or(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.i.r.or
+  __builtin_HEXAGON_S2_asr_i_r_rnd(0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.i.r.rnd
+  __builtin_HEXAGON_S2_asr_i_r_rnd_goodsyntax(0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.i.r.rnd.goodsyntax
+  __builtin_HEXAGON_S2_asr_i_svw_trun(0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.i.svw.trun
+  __builtin_HEXAGON_S2_asr_i_vh(0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.i.vh
+  __builtin_HEXAGON_S2_asr_i_vw(0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.i.vw
+  __builtin_HEXAGON_S2_asr_r_p(0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.r.p
+  __builtin_HEXAGON_S2_asr_r_p_acc(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.r.p.acc
+  __builtin_HEXAGON_S2_asr_r_p_and(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.r.p.and
+  __builtin_HEXAGON_S2_asr_r_p_nac(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.r.p.nac
+  __builtin_HEXAGON_S2_asr_r_p_or(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.r.p.or
+  __builtin_HEXAGON_S2_asr_r_p_xor(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.r.p.xor
+  __builtin_HEXAGON_S2_asr_r_r(0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.r.r
+  __builtin_HEXAGON_S2_asr_r_r_acc(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.r.r.acc
+  __builtin_HEXAGON_S2_asr_r_r_and(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.r.r.and
+  __builtin_HEXAGON_S2_asr_r_r_nac(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.r.r.nac
+  __builtin_HEXAGON_S2_asr_r_r_or(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.r.r.or
+  __builtin_HEXAGON_S2_asr_r_r_sat(0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.r.r.sat
+  __builtin_HEXAGON_S2_asr_r_svw_trun(0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.r.svw.trun
+  __builtin_HEXAGON_S2_asr_r_vh(0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.r.vh
+  __builtin_HEXAGON_S2_asr_r_vw(0, 0);
+  // CHECK: @llvm.hexagon.S2.asr.r.vw
+  __builtin_HEXAGON_S2_brev(0);
+  // CHECK: @llvm.hexagon.S2.brev
+  __builtin_HEXAGON_S2_brevp(0);
+  // CHECK: @llvm.hexagon.S2.brevp
+  __builtin_HEXAGON_S2_cabacencbin(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.cabacencbin
+  __builtin_HEXAGON_S2_cl0(0);
+  // CHECK: @llvm.hexagon.S2.cl0
+  __builtin_HEXAGON_S2_cl0p(0);
+  // CHECK: @llvm.hexagon.S2.cl0p
+  __builtin_HEXAGON_S2_cl1(0);
+  // CHECK: @llvm.hexagon.S2.cl1
+  __builtin_HEXAGON_S2_cl1p(0);
+  // CHECK: @llvm.hexagon.S2.cl1p
+  __builtin_HEXAGON_S2_clb(0);
+  // CHECK: @llvm.hexagon.S2.clb
+  __builtin_HEXAGON_S2_clbnorm(0);
+  // CHECK: @llvm.hexagon.S2.clbnorm
+  __builtin_HEXAGON_S2_clbp(0);
+  // CHECK: @llvm.hexagon.S2.clbp
+  __builtin_HEXAGON_S2_clrbit_i(0, 0);
+  // CHECK: @llvm.hexagon.S2.clrbit.i
+  __builtin_HEXAGON_S2_clrbit_r(0, 0);
+  // CHECK: @llvm.hexagon.S2.clrbit.r
+  __builtin_HEXAGON_S2_ct0(0);
+  // CHECK: @llvm.hexagon.S2.ct0
+  __builtin_HEXAGON_S2_ct0p(0);
+  // CHECK: @llvm.hexagon.S2.ct0p
+  __builtin_HEXAGON_S2_ct1(0);
+  // CHECK: @llvm.hexagon.S2.ct1
+  __builtin_HEXAGON_S2_ct1p(0);
+  // CHECK: @llvm.hexagon.S2.ct1p
+  __builtin_HEXAGON_S2_deinterleave(0);
+  // CHECK: @llvm.hexagon.S2.deinterleave
+  __builtin_HEXAGON_S2_extractu(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.extractu
+  __builtin_HEXAGON_S2_extractup(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.extractup
+  __builtin_HEXAGON_S2_extractup_rp(0, 0);
+  // CHECK: @llvm.hexagon.S2.extractup.rp
+  __builtin_HEXAGON_S2_extractu_rp(0, 0);
+  // CHECK: @llvm.hexagon.S2.extractu.rp
+  __builtin_HEXAGON_S2_insert(0, 0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.insert
+  __builtin_HEXAGON_S2_insertp(0, 0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.insertp
+  __builtin_HEXAGON_S2_insertp_rp(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.insertp.rp
+  __builtin_HEXAGON_S2_insert_rp(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.insert.rp
+  __builtin_HEXAGON_S2_interleave(0);
+  // CHECK: @llvm.hexagon.S2.interleave
+  __builtin_HEXAGON_S2_lfsp(0, 0);
+  // CHECK: @llvm.hexagon.S2.lfsp
+  __builtin_HEXAGON_S2_lsl_r_p(0, 0);
+  // CHECK: @llvm.hexagon.S2.lsl.r.p
+  __builtin_HEXAGON_S2_lsl_r_p_acc(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.lsl.r.p.acc
+  __builtin_HEXAGON_S2_lsl_r_p_and(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.lsl.r.p.and
+  __builtin_HEXAGON_S2_lsl_r_p_nac(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.lsl.r.p.nac
+  __builtin_HEXAGON_S2_lsl_r_p_or(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.lsl.r.p.or
+  __builtin_HEXAGON_S2_lsl_r_p_xor(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.lsl.r.p.xor
+  __builtin_HEXAGON_S2_lsl_r_r(0, 0);
+  // CHECK: @llvm.hexagon.S2.lsl.r.r
+  __builtin_HEXAGON_S2_lsl_r_r_acc(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.lsl.r.r.acc
+  __builtin_HEXAGON_S2_lsl_r_r_and(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.lsl.r.r.and
+  __builtin_HEXAGON_S2_lsl_r_r_nac(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.lsl.r.r.nac
+  __builtin_HEXAGON_S2_lsl_r_r_or(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.lsl.r.r.or
+  __builtin_HEXAGON_S2_lsl_r_vh(0, 0);
+  // CHECK: @llvm.hexagon.S2.lsl.r.vh
+  __builtin_HEXAGON_S2_lsl_r_vw(0, 0);
+  // CHECK: @llvm.hexagon.S2.lsl.r.vw
+  __builtin_HEXAGON_S2_lsr_i_p(0, 0);
+  // CHECK: @llvm.hexagon.S2.lsr.i.p
+  __builtin_HEXAGON_S2_lsr_i_p_acc(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.lsr.i.p.acc
+  __builtin_HEXAGON_S2_lsr_i_p_and(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.lsr.i.p.and
+  __builtin_HEXAGON_S2_lsr_i_p_nac(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.lsr.i.p.nac
+  __builtin_HEXAGON_S2_lsr_i_p_or(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.lsr.i.p.or
+  __builtin_HEXAGON_S2_lsr_i_p_xacc(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.lsr.i.p.xacc
+  __builtin_HEXAGON_S2_lsr_i_r(0, 0);
+  // CHECK: @llvm.hexagon.S2.lsr.i.r
+  __builtin_HEXAGON_S2_lsr_i_r_acc(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.lsr.i.r.acc
+  __builtin_HEXAGON_S2_lsr_i_r_and(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.lsr.i.r.and
+  __builtin_HEXAGON_S2_lsr_i_r_nac(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.lsr.i.r.nac
+  __builtin_HEXAGON_S2_lsr_i_r_or(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.lsr.i.r.or
+  __builtin_HEXAGON_S2_lsr_i_r_xacc(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.lsr.i.r.xacc
+  __builtin_HEXAGON_S2_lsr_i_vh(0, 0);
+  // CHECK: @llvm.hexagon.S2.lsr.i.vh
+  __builtin_HEXAGON_S2_lsr_i_vw(0, 0);
+  // CHECK: @llvm.hexagon.S2.lsr.i.vw
+  __builtin_HEXAGON_S2_lsr_r_p(0, 0);
+  // CHECK: @llvm.hexagon.S2.lsr.r.p
+  __builtin_HEXAGON_S2_lsr_r_p_acc(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.lsr.r.p.acc
+  __builtin_HEXAGON_S2_lsr_r_p_and(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.lsr.r.p.and
+  __builtin_HEXAGON_S2_lsr_r_p_nac(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.lsr.r.p.nac
+  __builtin_HEXAGON_S2_lsr_r_p_or(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.lsr.r.p.or
+  __builtin_HEXAGON_S2_lsr_r_p_xor(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.lsr.r.p.xor
+  __builtin_HEXAGON_S2_lsr_r_r(0, 0);
+  // CHECK: @llvm.hexagon.S2.lsr.r.r
+  __builtin_HEXAGON_S2_lsr_r_r_acc(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.lsr.r.r.acc
+  __builtin_HEXAGON_S2_lsr_r_r_and(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.lsr.r.r.and
+  __builtin_HEXAGON_S2_lsr_r_r_nac(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.lsr.r.r.nac
+  __builtin_HEXAGON_S2_lsr_r_r_or(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.lsr.r.r.or
+  __builtin_HEXAGON_S2_lsr_r_vh(0, 0);
+  // CHECK: @llvm.hexagon.S2.lsr.r.vh
+  __builtin_HEXAGON_S2_lsr_r_vw(0, 0);
+  // CHECK: @llvm.hexagon.S2.lsr.r.vw
+  __builtin_HEXAGON_S2_packhl(0, 0);
+  // CHECK: @llvm.hexagon.S2.packhl
+  __builtin_HEXAGON_S2_parityp(0, 0);
+  // CHECK: @llvm.hexagon.S2.parityp
+  __builtin_HEXAGON_S2_setbit_i(0, 0);
+  // CHECK: @llvm.hexagon.S2.setbit.i
+  __builtin_HEXAGON_S2_setbit_r(0, 0);
+  // CHECK: @llvm.hexagon.S2.setbit.r
+  __builtin_HEXAGON_S2_shuffeb(0, 0);
+  // CHECK: @llvm.hexagon.S2.shuffeb
+  __builtin_HEXAGON_S2_shuffeh(0, 0);
+  // CHECK: @llvm.hexagon.S2.shuffeh
+  __builtin_HEXAGON_S2_shuffob(0, 0);
+  // CHECK: @llvm.hexagon.S2.shuffob
+  __builtin_HEXAGON_S2_shuffoh(0, 0);
+  // CHECK: @llvm.hexagon.S2.shuffoh
+  __builtin_HEXAGON_S2_svsathb(0);
+  // CHECK: @llvm.hexagon.S2.svsathb
+  __builtin_HEXAGON_S2_svsathub(0);
+  // CHECK: @llvm.hexagon.S2.svsathub
+  __builtin_HEXAGON_S2_tableidxb_goodsyntax(0, 0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.tableidxb.goodsyntax
+  __builtin_HEXAGON_S2_tableidxd_goodsyntax(0, 0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.tableidxd.goodsyntax
+  __builtin_HEXAGON_S2_tableidxh_goodsyntax(0, 0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.tableidxh.goodsyntax
+  __builtin_HEXAGON_S2_tableidxw_goodsyntax(0, 0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.tableidxw.goodsyntax
+  __builtin_HEXAGON_S2_togglebit_i(0, 0);
+  // CHECK: @llvm.hexagon.S2.togglebit.i
+  __builtin_HEXAGON_S2_togglebit_r(0, 0);
+  // CHECK: @llvm.hexagon.S2.togglebit.r
+  __builtin_HEXAGON_S2_tstbit_i(0, 0);
+  // CHECK: @llvm.hexagon.S2.tstbit.i
+  __builtin_HEXAGON_S2_tstbit_r(0, 0);
+  // CHECK: @llvm.hexagon.S2.tstbit.r
+  __builtin_HEXAGON_S2_valignib(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.valignib
+  __builtin_HEXAGON_S2_valignrb(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.valignrb
+  __builtin_HEXAGON_S2_vcnegh(0, 0);
+  // CHECK: @llvm.hexagon.S2.vcnegh
+  __builtin_HEXAGON_S2_vcrotate(0, 0);
+  // CHECK: @llvm.hexagon.S2.vcrotate
+  __builtin_HEXAGON_S2_vrcnegh(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.vrcnegh
+  __builtin_HEXAGON_S2_vrndpackwh(0);
+  // CHECK: @llvm.hexagon.S2.vrndpackwh
+  __builtin_HEXAGON_S2_vrndpackwhs(0);
+  // CHECK: @llvm.hexagon.S2.vrndpackwhs
+  __builtin_HEXAGON_S2_vsathb(0);
+  // CHECK: @llvm.hexagon.S2.vsathb
+  __builtin_HEXAGON_S2_vsathb_nopack(0);
+  // CHECK: @llvm.hexagon.S2.vsathb.nopack
+  __builtin_HEXAGON_S2_vsathub(0);
+  // CHECK: @llvm.hexagon.S2.vsathub
+  __builtin_HEXAGON_S2_vsathub_nopack(0);
+  // CHECK: @llvm.hexagon.S2.vsathub.nopack
+  __builtin_HEXAGON_S2_vsatwh(0);
+  // CHECK: @llvm.hexagon.S2.vsatwh
+  __builtin_HEXAGON_S2_vsatwh_nopack(0);
+  // CHECK: @llvm.hexagon.S2.vsatwh.nopack
+  __builtin_HEXAGON_S2_vsatwuh(0);
+  // CHECK: @llvm.hexagon.S2.vsatwuh
+  __builtin_HEXAGON_S2_vsatwuh_nopack(0);
+  // CHECK: @llvm.hexagon.S2.vsatwuh.nopack
+  __builtin_HEXAGON_S2_vsplatrb(0);
+  // CHECK: @llvm.hexagon.S2.vsplatrb
+  __builtin_HEXAGON_S2_vsplatrh(0);
+  // CHECK: @llvm.hexagon.S2.vsplatrh
+  __builtin_HEXAGON_S2_vspliceib(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.vspliceib
+  __builtin_HEXAGON_S2_vsplicerb(0, 0, 0);
+  // CHECK: @llvm.hexagon.S2.vsplicerb
+  __builtin_HEXAGON_S2_vsxtbh(0);
+  // CHECK: @llvm.hexagon.S2.vsxtbh
+  __builtin_HEXAGON_S2_vsxthw(0);
+  // CHECK: @llvm.hexagon.S2.vsxthw
+  __builtin_HEXAGON_S2_vtrunehb(0);
+  // CHECK: @llvm.hexagon.S2.vtrunehb
+  __builtin_HEXAGON_S2_vtrunewh(0, 0);
+  // CHECK: @llvm.hexagon.S2.vtrunewh
+  __builtin_HEXAGON_S2_vtrunohb(0);
+  // CHECK: @llvm.hexagon.S2.vtrunohb
+  __builtin_HEXAGON_S2_vtrunowh(0, 0);
+  // CHECK: @llvm.hexagon.S2.vtrunowh
+  __builtin_HEXAGON_S2_vzxtbh(0);
+  // CHECK: @llvm.hexagon.S2.vzxtbh
+  __builtin_HEXAGON_S2_vzxthw(0);
+  // CHECK: @llvm.hexagon.S2.vzxthw
+  __builtin_HEXAGON_S4_addaddi(0, 0, 0);
+  // CHECK: @llvm.hexagon.S4.addaddi
+  __builtin_HEXAGON_S4_addi_asl_ri(0, 0, 0);
+  // CHECK: @llvm.hexagon.S4.addi.asl.ri
+  __builtin_HEXAGON_S4_addi_lsr_ri(0, 0, 0);
+  // CHECK: @llvm.hexagon.S4.addi.lsr.ri
+  __builtin_HEXAGON_S4_andi_asl_ri(0, 0, 0);
+  // CHECK: @llvm.hexagon.S4.andi.asl.ri
+  __builtin_HEXAGON_S4_andi_lsr_ri(0, 0, 0);
+  // CHECK: @llvm.hexagon.S4.andi.lsr.ri
+  __builtin_HEXAGON_S4_clbaddi(0, 0);
+  // CHECK: @llvm.hexagon.S4.clbaddi
+  __builtin_HEXAGON_S4_clbpaddi(0, 0);
+  // CHECK: @llvm.hexagon.S4.clbpaddi
+  __builtin_HEXAGON_S4_clbpnorm(0);
+  // CHECK: @llvm.hexagon.S4.clbpnorm
+  __builtin_HEXAGON_S4_extract(0, 0, 0);
+  // CHECK: @llvm.hexagon.S4.extract
+  __builtin_HEXAGON_S4_extractp(0, 0, 0);
+  // CHECK: @llvm.hexagon.S4.extractp
+  __builtin_HEXAGON_S4_extractp_rp(0, 0);
+  // CHECK: @llvm.hexagon.S4.extractp.rp
+  __builtin_HEXAGON_S4_extract_rp(0, 0);
+  // CHECK: @llvm.hexagon.S4.extract.rp
+  __builtin_HEXAGON_S4_lsli(0, 0);
+  // CHECK: @llvm.hexagon.S4.lsli
+  __builtin_HEXAGON_S4_ntstbit_i(0, 0);
+  // CHECK: @llvm.hexagon.S4.ntstbit.i
+  __builtin_HEXAGON_S4_ntstbit_r(0, 0);
+  // CHECK: @llvm.hexagon.S4.ntstbit.r
+  __builtin_HEXAGON_S4_or_andi(0, 0, 0);
+  // CHECK: @llvm.hexagon.S4.or.andi
+  __builtin_HEXAGON_S4_or_andix(0, 0, 0);
+  // CHECK: @llvm.hexagon.S4.or.andix
+  __builtin_HEXAGON_S4_ori_asl_ri(0, 0, 0);
+  // CHECK: @llvm.hexagon.S4.ori.asl.ri
+  __builtin_HEXAGON_S4_ori_lsr_ri(0, 0, 0);
+  // CHECK: @llvm.hexagon.S4.ori.lsr.ri
+  __builtin_HEXAGON_S4_or_ori(0, 0, 0);
+  // CHECK: @llvm.hexagon.S4.or.ori
+  __builtin_HEXAGON_S4_parity(0, 0);
+  // CHECK: @llvm.hexagon.S4.parity
+  __builtin_HEXAGON_S4_subaddi(0, 0, 0);
+  // CHECK: @llvm.hexagon.S4.subaddi
+  __builtin_HEXAGON_S4_subi_asl_ri(0, 0, 0);
+  // CHECK: @llvm.hexagon.S4.subi.asl.ri
+  __builtin_HEXAGON_S4_subi_lsr_ri(0, 0, 0);
+  // CHECK: @llvm.hexagon.S4.subi.lsr.ri
+  __builtin_HEXAGON_S4_vrcrotate(0, 0, 0);
+  // CHECK: @llvm.hexagon.S4.vrcrotate
+  __builtin_HEXAGON_S4_vrcrotate_acc(0, 0, 0, 0);
+  // CHECK: @llvm.hexagon.S4.vrcrotate.acc
+  __builtin_HEXAGON_S4_vxaddsubh(0, 0);
+  // CHECK: @llvm.hexagon.S4.vxaddsubh
+  __builtin_HEXAGON_S4_vxaddsubhr(0, 0);
+  // CHECK: @llvm.hexagon.S4.vxaddsubhr
+  __builtin_HEXAGON_S4_vxaddsubw(0, 0);
+  // CHECK: @llvm.hexagon.S4.vxaddsubw
+  __builtin_HEXAGON_S4_vxsubaddh(0, 0);
+  // CHECK: @llvm.hexagon.S4.vxsubaddh
+  __builtin_HEXAGON_S4_vxsubaddhr(0, 0);
+  // CHECK: @llvm.hexagon.S4.vxsubaddhr
+  __builtin_HEXAGON_S4_vxsubaddw(0, 0);
+  // CHECK: @llvm.hexagon.S4.vxsubaddw
+  __builtin_HEXAGON_S5_asrhub_rnd_sat_goodsyntax(0, 0);
+  // CHECK: @llvm.hexagon.S5.asrhub.rnd.sat.goodsyntax
+  __builtin_HEXAGON_S5_asrhub_sat(0, 0);
+  // CHECK: @llvm.hexagon.S5.asrhub.sat
+  __builtin_HEXAGON_S5_popcountp(0);
+  // CHECK: @llvm.hexagon.S5.popcountp
+  __builtin_HEXAGON_S5_vasrhrnd_goodsyntax(0, 0);
+  // CHECK: @llvm.hexagon.S5.vasrhrnd.goodsyntax
+  __builtin_HEXAGON_S6_rol_i_p(0, 0);
+  // CHECK: @llvm.hexagon.S6.rol.i.p
+  __builtin_HEXAGON_S6_rol_i_p_acc(0, 0, 0);
+  // CHECK: @llvm.hexagon.S6.rol.i.p.acc
+  __builtin_HEXAGON_S6_rol_i_p_and(0, 0, 0);
+  // CHECK: @llvm.hexagon.S6.rol.i.p.and
+  __builtin_HEXAGON_S6_rol_i_p_nac(0, 0, 0);
+  // CHECK: @llvm.hexagon.S6.rol.i.p.nac
+  __builtin_HEXAGON_S6_rol_i_p_or(0, 0, 0);
+  // CHECK: @llvm.hexagon.S6.rol.i.p.or
+  __builtin_HEXAGON_S6_rol_i_p_xacc(0, 0, 0);
+  // CHECK: @llvm.hexagon.S6.rol.i.p.xacc
+  __builtin_HEXAGON_S6_rol_i_r(0, 0);
+  // CHECK: @llvm.hexagon.S6.rol.i.r
+  __builtin_HEXAGON_S6_rol_i_r_acc(0, 0, 0);
+  // CHECK: @llvm.hexagon.S6.rol.i.r.acc
+  __builtin_HEXAGON_S6_rol_i_r_and(0, 0, 0);
+  // CHECK: @llvm.hexagon.S6.rol.i.r.and
+  __builtin_HEXAGON_S6_rol_i_r_nac(0, 0, 0);
+  // CHECK: @llvm.hexagon.S6.rol.i.r.nac
+  __builtin_HEXAGON_S6_rol_i_r_or(0, 0, 0);
+  // CHECK: @llvm.hexagon.S6.rol.i.r.or
+  __builtin_HEXAGON_S6_rol_i_r_xacc(0, 0, 0);
+  // CHECK: @llvm.hexagon.S6.rol.i.r.xacc
+  __builtin_HEXAGON_V6_extractw_128B(v32, 0);
+  // CHECK: @llvm.hexagon.V6.extractw.128B
+  __builtin_HEXAGON_V6_extractw(v16, 0);
+  // CHECK: @llvm.hexagon.V6.extractw
+  __builtin_HEXAGON_V6_hi_128B(v64);
+  // CHECK: @llvm.hexagon.V6.hi.128B
+  __builtin_HEXAGON_V6_hi(v32);
+  // CHECK: @llvm.hexagon.V6.hi
+  __builtin_HEXAGON_V6_lo_128B(v64);
+  // CHECK: @llvm.hexagon.V6.lo.128B
+  __builtin_HEXAGON_V6_lo(v32);
+  // CHECK: @llvm.hexagon.V6.lo
+  __builtin_HEXAGON_V6_lvsplatw(0);
+  // CHECK: @llvm.hexagon.V6.lvsplatw
+  __builtin_HEXAGON_V6_lvsplatw_128B(0);
+  // CHECK: @llvm.hexagon.V6.lvsplatw.128B
+  __builtin_HEXAGON_V6_pred_and_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.pred.and.128B
+  __builtin_HEXAGON_V6_pred_and_n_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.pred.and.n.128B
+  __builtin_HEXAGON_V6_pred_and_n(v16, v16);
+  // CHECK: @llvm.hexagon.V6.pred.and.n
+  __builtin_HEXAGON_V6_pred_and(v16, v16);
+  // CHECK: @llvm.hexagon.V6.pred.and
+  __builtin_HEXAGON_V6_pred_not_128B(v32);
+  // CHECK: @llvm.hexagon.V6.pred.not.128B
+  __builtin_HEXAGON_V6_pred_not(v16);
+  // CHECK: @llvm.hexagon.V6.pred.not
+  __builtin_HEXAGON_V6_pred_or_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.pred.or.128B
+  __builtin_HEXAGON_V6_pred_or_n_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.pred.or.n.128B
+  __builtin_HEXAGON_V6_pred_or_n(v16, v16);
+  // CHECK: @llvm.hexagon.V6.pred.or.n
+  __builtin_HEXAGON_V6_pred_or(v16, v16);
+  // CHECK: @llvm.hexagon.V6.pred.or
+  __builtin_HEXAGON_V6_pred_scalar2(0);
+  // CHECK: @llvm.hexagon.V6.pred.scalar2
+  __builtin_HEXAGON_V6_pred_scalar2_128B(0);
+  // CHECK: @llvm.hexagon.V6.pred.scalar2.128B
+  __builtin_HEXAGON_V6_pred_xor_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.pred.xor.128B
+  __builtin_HEXAGON_V6_pred_xor(v16, v16);
+  // CHECK: @llvm.hexagon.V6.pred.xor
+  __builtin_HEXAGON_V6_vabsdiffh_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vabsdiffh.128B
+  __builtin_HEXAGON_V6_vabsdiffh(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vabsdiffh
+  __builtin_HEXAGON_V6_vabsdiffub_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vabsdiffub.128B
+  __builtin_HEXAGON_V6_vabsdiffub(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vabsdiffub
+  __builtin_HEXAGON_V6_vabsdiffuh_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vabsdiffuh.128B
+  __builtin_HEXAGON_V6_vabsdiffuh(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vabsdiffuh
+  __builtin_HEXAGON_V6_vabsdiffw_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vabsdiffw.128B
+  __builtin_HEXAGON_V6_vabsdiffw(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vabsdiffw
+  __builtin_HEXAGON_V6_vabsh_128B(v32);
+  // CHECK: @llvm.hexagon.V6.vabsh.128B
+  __builtin_HEXAGON_V6_vabsh_sat_128B(v32);
+  // CHECK: @llvm.hexagon.V6.vabsh.sat.128B
+  __builtin_HEXAGON_V6_vabsh_sat(v16);
+  // CHECK: @llvm.hexagon.V6.vabsh.sat
+  __builtin_HEXAGON_V6_vabsh(v16);
+  // CHECK: @llvm.hexagon.V6.vabsh
+  __builtin_HEXAGON_V6_vabsw_128B(v32);
+  // CHECK: @llvm.hexagon.V6.vabsw.128B
+  __builtin_HEXAGON_V6_vabsw_sat_128B(v32);
+  // CHECK: @llvm.hexagon.V6.vabsw.sat.128B
+  __builtin_HEXAGON_V6_vabsw_sat(v16);
+  // CHECK: @llvm.hexagon.V6.vabsw.sat
+  __builtin_HEXAGON_V6_vabsw(v16);
+  // CHECK: @llvm.hexagon.V6.vabsw
+  __builtin_HEXAGON_V6_vaddb_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vaddb.128B
+  __builtin_HEXAGON_V6_vaddb_dv_128B(v64, v64);
+  // CHECK: @llvm.hexagon.V6.vaddb.dv.128B
+  __builtin_HEXAGON_V6_vaddb_dv(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vaddb.dv
+  __builtin_HEXAGON_V6_vaddbnq_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vaddbnq.128B
+  __builtin_HEXAGON_V6_vaddbnq(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vaddbnq
+  __builtin_HEXAGON_V6_vaddbq_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vaddbq.128B
+  __builtin_HEXAGON_V6_vaddbq(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vaddbq
+  __builtin_HEXAGON_V6_vaddb(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vaddb
+  __builtin_HEXAGON_V6_vaddh_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vaddh.128B
+  __builtin_HEXAGON_V6_vaddh_dv_128B(v64, v64);
+  // CHECK: @llvm.hexagon.V6.vaddh.dv.128B
+  __builtin_HEXAGON_V6_vaddh_dv(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vaddh.dv
+  __builtin_HEXAGON_V6_vaddhnq_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vaddhnq.128B
+  __builtin_HEXAGON_V6_vaddhnq(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vaddhnq
+  __builtin_HEXAGON_V6_vaddhq_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vaddhq.128B
+  __builtin_HEXAGON_V6_vaddhq(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vaddhq
+  __builtin_HEXAGON_V6_vaddhsat_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vaddhsat.128B
+  __builtin_HEXAGON_V6_vaddhsat_dv_128B(v64, v64);
+  // CHECK: @llvm.hexagon.V6.vaddhsat.dv.128B
+  __builtin_HEXAGON_V6_vaddhsat_dv(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vaddhsat.dv
+  __builtin_HEXAGON_V6_vaddhsat(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vaddhsat
+  __builtin_HEXAGON_V6_vaddh(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vaddh
+  __builtin_HEXAGON_V6_vaddhw_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vaddhw.128B
+  __builtin_HEXAGON_V6_vaddhw(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vaddhw
+  __builtin_HEXAGON_V6_vaddubh_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vaddubh.128B
+  __builtin_HEXAGON_V6_vaddubh(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vaddubh
+  __builtin_HEXAGON_V6_vaddubsat_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vaddubsat.128B
+  __builtin_HEXAGON_V6_vaddubsat_dv_128B(v64, v64);
+  // CHECK: @llvm.hexagon.V6.vaddubsat.dv.128B
+  __builtin_HEXAGON_V6_vaddubsat_dv(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vaddubsat.dv
+  __builtin_HEXAGON_V6_vaddubsat(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vaddubsat
+  __builtin_HEXAGON_V6_vadduhsat_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vadduhsat.128B
+  __builtin_HEXAGON_V6_vadduhsat_dv_128B(v64, v64);
+  // CHECK: @llvm.hexagon.V6.vadduhsat.dv.128B
+  __builtin_HEXAGON_V6_vadduhsat_dv(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vadduhsat.dv
+  __builtin_HEXAGON_V6_vadduhsat(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vadduhsat
+  __builtin_HEXAGON_V6_vadduhw_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vadduhw.128B
+  __builtin_HEXAGON_V6_vadduhw(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vadduhw
+  __builtin_HEXAGON_V6_vaddw_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vaddw.128B
+  __builtin_HEXAGON_V6_vaddw_dv_128B(v64, v64);
+  // CHECK: @llvm.hexagon.V6.vaddw.dv.128B
+  __builtin_HEXAGON_V6_vaddw_dv(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vaddw.dv
+  __builtin_HEXAGON_V6_vaddwnq_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vaddwnq.128B
+  __builtin_HEXAGON_V6_vaddwnq(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vaddwnq
+  __builtin_HEXAGON_V6_vaddwq_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vaddwq.128B
+  __builtin_HEXAGON_V6_vaddwq(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vaddwq
+  __builtin_HEXAGON_V6_vaddwsat_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vaddwsat.128B
+  __builtin_HEXAGON_V6_vaddwsat_dv_128B(v64, v64);
+  // CHECK: @llvm.hexagon.V6.vaddwsat.dv.128B
+  __builtin_HEXAGON_V6_vaddwsat_dv(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vaddwsat.dv
+  __builtin_HEXAGON_V6_vaddwsat(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vaddwsat
+  __builtin_HEXAGON_V6_vaddw(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vaddw
+  __builtin_HEXAGON_V6_valignb_128B(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.valignb.128B
+  __builtin_HEXAGON_V6_valignbi_128B(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.valignbi.128B
+  __builtin_HEXAGON_V6_valignbi(v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.valignbi
+  __builtin_HEXAGON_V6_valignb(v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.valignb
+  __builtin_HEXAGON_V6_vand_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vand.128B
+  __builtin_HEXAGON_V6_vandqrt_128B(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vandqrt.128B
+  __builtin_HEXAGON_V6_vandqrt_acc_128B(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vandqrt.acc.128B
+  __builtin_HEXAGON_V6_vandqrt_acc(v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vandqrt.acc
+  __builtin_HEXAGON_V6_vandqrt(v16, 0);
+  // CHECK: @llvm.hexagon.V6.vandqrt
+  __builtin_HEXAGON_V6_vand(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vand
+  __builtin_HEXAGON_V6_vandvrt_128B(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vandvrt.128B
+  __builtin_HEXAGON_V6_vandvrt_acc_128B(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vandvrt.acc.128B
+  __builtin_HEXAGON_V6_vandvrt_acc(v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vandvrt.acc
+  __builtin_HEXAGON_V6_vandvrt(v16, 0);
+  // CHECK: @llvm.hexagon.V6.vandvrt
+  __builtin_HEXAGON_V6_vaslh_128B(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vaslh.128B
+  __builtin_HEXAGON_V6_vaslhv_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vaslhv.128B
+  __builtin_HEXAGON_V6_vaslh(v16, 0);
+  // CHECK: @llvm.hexagon.V6.vaslh
+  __builtin_HEXAGON_V6_vaslhv(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vaslhv
+  __builtin_HEXAGON_V6_vaslw_128B(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vaslw.128B
+  __builtin_HEXAGON_V6_vaslw_acc_128B(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vaslw.acc.128B
+  __builtin_HEXAGON_V6_vaslw_acc(v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vaslw.acc
+  __builtin_HEXAGON_V6_vaslwv_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vaslwv.128B
+  __builtin_HEXAGON_V6_vaslw(v16, 0);
+  // CHECK: @llvm.hexagon.V6.vaslw
+  __builtin_HEXAGON_V6_vaslwv(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vaslwv
+  __builtin_HEXAGON_V6_vasrh_128B(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vasrh.128B
+  __builtin_HEXAGON_V6_vasrhbrndsat_128B(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vasrhbrndsat.128B
+  __builtin_HEXAGON_V6_vasrhbrndsat(v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vasrhbrndsat
+  __builtin_HEXAGON_V6_vasrhubrndsat_128B(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vasrhubrndsat.128B
+  __builtin_HEXAGON_V6_vasrhubrndsat(v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vasrhubrndsat
+  __builtin_HEXAGON_V6_vasrhubsat_128B(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vasrhubsat.128B
+  __builtin_HEXAGON_V6_vasrhubsat(v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vasrhubsat
+  __builtin_HEXAGON_V6_vasrhv_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vasrhv.128B
+  __builtin_HEXAGON_V6_vasrh(v16, 0);
+  // CHECK: @llvm.hexagon.V6.vasrh
+  __builtin_HEXAGON_V6_vasrhv(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vasrhv
+  __builtin_HEXAGON_V6_vasrw_128B(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vasrw.128B
+  __builtin_HEXAGON_V6_vasrw_acc_128B(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vasrw.acc.128B
+  __builtin_HEXAGON_V6_vasrw_acc(v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vasrw.acc
+  __builtin_HEXAGON_V6_vasrwh_128B(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vasrwh.128B
+  __builtin_HEXAGON_V6_vasrwhrndsat_128B(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vasrwhrndsat.128B
+  __builtin_HEXAGON_V6_vasrwhrndsat(v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vasrwhrndsat
+  __builtin_HEXAGON_V6_vasrwhsat_128B(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vasrwhsat.128B
+  __builtin_HEXAGON_V6_vasrwhsat(v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vasrwhsat
+  __builtin_HEXAGON_V6_vasrwh(v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vasrwh
+  __builtin_HEXAGON_V6_vasrwuhsat_128B(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vasrwuhsat.128B
+  __builtin_HEXAGON_V6_vasrwuhsat(v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vasrwuhsat
+  __builtin_HEXAGON_V6_vasrwv_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vasrwv.128B
+  __builtin_HEXAGON_V6_vasrw(v16, 0);
+  // CHECK: @llvm.hexagon.V6.vasrw
+  __builtin_HEXAGON_V6_vasrwv(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vasrwv
+  __builtin_HEXAGON_V6_vassign_128B(v32);
+  // CHECK: @llvm.hexagon.V6.vassign.128B
+  __builtin_HEXAGON_V6_vassignp_128B(v64);
+  // CHECK: @llvm.hexagon.V6.vassignp.128B
+  __builtin_HEXAGON_V6_vassignp(v32);
+  // CHECK: @llvm.hexagon.V6.vassignp
+  __builtin_HEXAGON_V6_vassign(v16);
+  // CHECK: @llvm.hexagon.V6.vassign
+  __builtin_HEXAGON_V6_vavgh_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vavgh.128B
+  __builtin_HEXAGON_V6_vavghrnd_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vavghrnd.128B
+  __builtin_HEXAGON_V6_vavghrnd(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vavghrnd
+  __builtin_HEXAGON_V6_vavgh(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vavgh
+  __builtin_HEXAGON_V6_vavgub_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vavgub.128B
+  __builtin_HEXAGON_V6_vavgubrnd_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vavgubrnd.128B
+  __builtin_HEXAGON_V6_vavgubrnd(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vavgubrnd
+  __builtin_HEXAGON_V6_vavgub(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vavgub
+  __builtin_HEXAGON_V6_vavguh_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vavguh.128B
+  __builtin_HEXAGON_V6_vavguhrnd_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vavguhrnd.128B
+  __builtin_HEXAGON_V6_vavguhrnd(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vavguhrnd
+  __builtin_HEXAGON_V6_vavguh(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vavguh
+  __builtin_HEXAGON_V6_vavgw_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vavgw.128B
+  __builtin_HEXAGON_V6_vavgwrnd_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vavgwrnd.128B
+  __builtin_HEXAGON_V6_vavgwrnd(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vavgwrnd
+  __builtin_HEXAGON_V6_vavgw(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vavgw
+  __builtin_HEXAGON_V6_vcl0h_128B(v32);
+  // CHECK: @llvm.hexagon.V6.vcl0h.128B
+  __builtin_HEXAGON_V6_vcl0h(v16);
+  // CHECK: @llvm.hexagon.V6.vcl0h
+  __builtin_HEXAGON_V6_vcl0w_128B(v32);
+  // CHECK: @llvm.hexagon.V6.vcl0w.128B
+  __builtin_HEXAGON_V6_vcl0w(v16);
+  // CHECK: @llvm.hexagon.V6.vcl0w
+  __builtin_HEXAGON_V6_vcombine_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vcombine.128B
+  __builtin_HEXAGON_V6_vcombine(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vcombine
+  __builtin_HEXAGON_V6_vd0_128B();
+  // CHECK: @llvm.hexagon.V6.vd0.128B
+  __builtin_HEXAGON_V6_vd0();
+  // CHECK: @llvm.hexagon.V6.vd0
+  __builtin_HEXAGON_V6_vdealb_128B(v32);
+  // CHECK: @llvm.hexagon.V6.vdealb.128B
+  __builtin_HEXAGON_V6_vdealb4w_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vdealb4w.128B
+  __builtin_HEXAGON_V6_vdealb4w(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vdealb4w
+  __builtin_HEXAGON_V6_vdealb(v16);
+  // CHECK: @llvm.hexagon.V6.vdealb
+  __builtin_HEXAGON_V6_vdealh_128B(v32);
+  // CHECK: @llvm.hexagon.V6.vdealh.128B
+  __builtin_HEXAGON_V6_vdealh(v16);
+  // CHECK: @llvm.hexagon.V6.vdealh
+  __builtin_HEXAGON_V6_vdealvdd_128B(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vdealvdd.128B
+  __builtin_HEXAGON_V6_vdealvdd(v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vdealvdd
+  __builtin_HEXAGON_V6_vdelta_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vdelta.128B
+  __builtin_HEXAGON_V6_vdelta(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vdelta
+  __builtin_HEXAGON_V6_vdmpybus_128B(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpybus.128B
+  __builtin_HEXAGON_V6_vdmpybus_acc_128B(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpybus.acc.128B
+  __builtin_HEXAGON_V6_vdmpybus_acc(v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpybus.acc
+  __builtin_HEXAGON_V6_vdmpybus_dv_128B(v64, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpybus.dv.128B
+  __builtin_HEXAGON_V6_vdmpybus_dv_acc_128B(v64, v64, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpybus.dv.acc.128B
+  __builtin_HEXAGON_V6_vdmpybus_dv_acc(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpybus.dv.acc
+  __builtin_HEXAGON_V6_vdmpybus_dv(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpybus.dv
+  __builtin_HEXAGON_V6_vdmpybus(v16, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpybus
+  __builtin_HEXAGON_V6_vdmpyhb_128B(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpyhb.128B
+  __builtin_HEXAGON_V6_vdmpyhb_acc_128B(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpyhb.acc.128B
+  __builtin_HEXAGON_V6_vdmpyhb_acc(v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpyhb.acc
+  __builtin_HEXAGON_V6_vdmpyhb_dv_128B(v64, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpyhb.dv.128B
+  __builtin_HEXAGON_V6_vdmpyhb_dv_acc_128B(v64, v64, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpyhb.dv.acc.128B
+  __builtin_HEXAGON_V6_vdmpyhb_dv_acc(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpyhb.dv.acc
+  __builtin_HEXAGON_V6_vdmpyhb_dv(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpyhb.dv
+  __builtin_HEXAGON_V6_vdmpyhb(v16, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpyhb
+  __builtin_HEXAGON_V6_vdmpyhisat_128B(v64, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpyhisat.128B
+  __builtin_HEXAGON_V6_vdmpyhisat_acc_128B(v32, v64, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpyhisat.acc.128B
+  __builtin_HEXAGON_V6_vdmpyhisat_acc(v16, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpyhisat.acc
+  __builtin_HEXAGON_V6_vdmpyhisat(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpyhisat
+  __builtin_HEXAGON_V6_vdmpyhsat_128B(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpyhsat.128B
+  __builtin_HEXAGON_V6_vdmpyhsat_acc_128B(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpyhsat.acc.128B
+  __builtin_HEXAGON_V6_vdmpyhsat_acc(v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpyhsat.acc
+  __builtin_HEXAGON_V6_vdmpyhsat(v16, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpyhsat
+  __builtin_HEXAGON_V6_vdmpyhsuisat_128B(v64, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpyhsuisat.128B
+  __builtin_HEXAGON_V6_vdmpyhsuisat_acc_128B(v32, v64, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpyhsuisat.acc.128B
+  __builtin_HEXAGON_V6_vdmpyhsuisat_acc(v16, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpyhsuisat.acc
+  __builtin_HEXAGON_V6_vdmpyhsuisat(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpyhsuisat
+  __builtin_HEXAGON_V6_vdmpyhsusat_128B(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpyhsusat.128B
+  __builtin_HEXAGON_V6_vdmpyhsusat_acc_128B(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpyhsusat.acc.128B
+  __builtin_HEXAGON_V6_vdmpyhsusat_acc(v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpyhsusat.acc
+  __builtin_HEXAGON_V6_vdmpyhsusat(v16, 0);
+  // CHECK: @llvm.hexagon.V6.vdmpyhsusat
+  __builtin_HEXAGON_V6_vdmpyhvsat_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vdmpyhvsat.128B
+  __builtin_HEXAGON_V6_vdmpyhvsat_acc_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vdmpyhvsat.acc.128B
+  __builtin_HEXAGON_V6_vdmpyhvsat_acc(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vdmpyhvsat.acc
+  __builtin_HEXAGON_V6_vdmpyhvsat(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vdmpyhvsat
+  __builtin_HEXAGON_V6_vdsaduh_128B(v64, 0);
+  // CHECK: @llvm.hexagon.V6.vdsaduh.128B
+  __builtin_HEXAGON_V6_vdsaduh_acc_128B(v64, v64, 0);
+  // CHECK: @llvm.hexagon.V6.vdsaduh.acc.128B
+  __builtin_HEXAGON_V6_vdsaduh_acc(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vdsaduh.acc
+  __builtin_HEXAGON_V6_vdsaduh(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vdsaduh
+  __builtin_HEXAGON_V6_veqb_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.veqb.128B
+  __builtin_HEXAGON_V6_veqb_and_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.veqb.and.128B
+  __builtin_HEXAGON_V6_veqb_and(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.veqb.and
+  __builtin_HEXAGON_V6_veqb_or_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.veqb.or.128B
+  __builtin_HEXAGON_V6_veqb_or(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.veqb.or
+  __builtin_HEXAGON_V6_veqb(v16, v16);
+  // CHECK: @llvm.hexagon.V6.veqb
+  __builtin_HEXAGON_V6_veqb_xor_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.veqb.xor.128B
+  __builtin_HEXAGON_V6_veqb_xor(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.veqb.xor
+  __builtin_HEXAGON_V6_veqh_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.veqh.128B
+  __builtin_HEXAGON_V6_veqh_and_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.veqh.and.128B
+  __builtin_HEXAGON_V6_veqh_and(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.veqh.and
+  __builtin_HEXAGON_V6_veqh_or_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.veqh.or.128B
+  __builtin_HEXAGON_V6_veqh_or(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.veqh.or
+  __builtin_HEXAGON_V6_veqh(v16, v16);
+  // CHECK: @llvm.hexagon.V6.veqh
+  __builtin_HEXAGON_V6_veqh_xor_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.veqh.xor.128B
+  __builtin_HEXAGON_V6_veqh_xor(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.veqh.xor
+  __builtin_HEXAGON_V6_veqw_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.veqw.128B
+  __builtin_HEXAGON_V6_veqw_and_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.veqw.and.128B
+  __builtin_HEXAGON_V6_veqw_and(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.veqw.and
+  __builtin_HEXAGON_V6_veqw_or_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.veqw.or.128B
+  __builtin_HEXAGON_V6_veqw_or(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.veqw.or
+  __builtin_HEXAGON_V6_veqw(v16, v16);
+  // CHECK: @llvm.hexagon.V6.veqw
+  __builtin_HEXAGON_V6_veqw_xor_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.veqw.xor.128B
+  __builtin_HEXAGON_V6_veqw_xor(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.veqw.xor
+  __builtin_HEXAGON_V6_vgtb_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vgtb.128B
+  __builtin_HEXAGON_V6_vgtb_and_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vgtb.and.128B
+  __builtin_HEXAGON_V6_vgtb_and(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vgtb.and
+  __builtin_HEXAGON_V6_vgtb_or_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vgtb.or.128B
+  __builtin_HEXAGON_V6_vgtb_or(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vgtb.or
+  __builtin_HEXAGON_V6_vgtb(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vgtb
+  __builtin_HEXAGON_V6_vgtb_xor_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vgtb.xor.128B
+  __builtin_HEXAGON_V6_vgtb_xor(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vgtb.xor
+  __builtin_HEXAGON_V6_vgth_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vgth.128B
+  __builtin_HEXAGON_V6_vgth_and_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vgth.and.128B
+  __builtin_HEXAGON_V6_vgth_and(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vgth.and
+  __builtin_HEXAGON_V6_vgth_or_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vgth.or.128B
+  __builtin_HEXAGON_V6_vgth_or(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vgth.or
+  __builtin_HEXAGON_V6_vgth(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vgth
+  __builtin_HEXAGON_V6_vgth_xor_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vgth.xor.128B
+  __builtin_HEXAGON_V6_vgth_xor(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vgth.xor
+  __builtin_HEXAGON_V6_vgtub_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vgtub.128B
+  __builtin_HEXAGON_V6_vgtub_and_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vgtub.and.128B
+  __builtin_HEXAGON_V6_vgtub_and(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vgtub.and
+  __builtin_HEXAGON_V6_vgtub_or_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vgtub.or.128B
+  __builtin_HEXAGON_V6_vgtub_or(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vgtub.or
+  __builtin_HEXAGON_V6_vgtub(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vgtub
+  __builtin_HEXAGON_V6_vgtub_xor_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vgtub.xor.128B
+  __builtin_HEXAGON_V6_vgtub_xor(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vgtub.xor
+  __builtin_HEXAGON_V6_vgtuh_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vgtuh.128B
+  __builtin_HEXAGON_V6_vgtuh_and_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vgtuh.and.128B
+  __builtin_HEXAGON_V6_vgtuh_and(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vgtuh.and
+  __builtin_HEXAGON_V6_vgtuh_or_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vgtuh.or.128B
+  __builtin_HEXAGON_V6_vgtuh_or(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vgtuh.or
+  __builtin_HEXAGON_V6_vgtuh(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vgtuh
+  __builtin_HEXAGON_V6_vgtuh_xor_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vgtuh.xor.128B
+  __builtin_HEXAGON_V6_vgtuh_xor(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vgtuh.xor
+  __builtin_HEXAGON_V6_vgtuw_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vgtuw.128B
+  __builtin_HEXAGON_V6_vgtuw_and_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vgtuw.and.128B
+  __builtin_HEXAGON_V6_vgtuw_and(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vgtuw.and
+  __builtin_HEXAGON_V6_vgtuw_or_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vgtuw.or.128B
+  __builtin_HEXAGON_V6_vgtuw_or(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vgtuw.or
+  __builtin_HEXAGON_V6_vgtuw(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vgtuw
+  __builtin_HEXAGON_V6_vgtuw_xor_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vgtuw.xor.128B
+  __builtin_HEXAGON_V6_vgtuw_xor(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vgtuw.xor
+  __builtin_HEXAGON_V6_vgtw_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vgtw.128B
+  __builtin_HEXAGON_V6_vgtw_and_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vgtw.and.128B
+  __builtin_HEXAGON_V6_vgtw_and(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vgtw.and
+  __builtin_HEXAGON_V6_vgtw_or_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vgtw.or.128B
+  __builtin_HEXAGON_V6_vgtw_or(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vgtw.or
+  __builtin_HEXAGON_V6_vgtw(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vgtw
+  __builtin_HEXAGON_V6_vgtw_xor_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vgtw.xor.128B
+  __builtin_HEXAGON_V6_vgtw_xor(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vgtw.xor
+  __builtin_HEXAGON_V6_vinsertwr_128B(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vinsertwr.128B
+  __builtin_HEXAGON_V6_vinsertwr(v16, 0);
+  // CHECK: @llvm.hexagon.V6.vinsertwr
+  __builtin_HEXAGON_V6_vlalignb_128B(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vlalignb.128B
+  __builtin_HEXAGON_V6_vlalignbi_128B(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vlalignbi.128B
+  __builtin_HEXAGON_V6_vlalignbi(v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vlalignbi
+  __builtin_HEXAGON_V6_vlalignb(v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vlalignb
+  __builtin_HEXAGON_V6_vlsrh_128B(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vlsrh.128B
+  __builtin_HEXAGON_V6_vlsrhv_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vlsrhv.128B
+  __builtin_HEXAGON_V6_vlsrh(v16, 0);
+  // CHECK: @llvm.hexagon.V6.vlsrh
+  __builtin_HEXAGON_V6_vlsrhv(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vlsrhv
+  __builtin_HEXAGON_V6_vlsrw_128B(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vlsrw.128B
+  __builtin_HEXAGON_V6_vlsrwv_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vlsrwv.128B
+  __builtin_HEXAGON_V6_vlsrw(v16, 0);
+  // CHECK: @llvm.hexagon.V6.vlsrw
+  __builtin_HEXAGON_V6_vlsrwv(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vlsrwv
+  __builtin_HEXAGON_V6_vlutb_128B(v32, 0, 0);
+  // CHECK: @llvm.hexagon.V6.vlutb.128B
+  __builtin_HEXAGON_V6_vlutb_acc_128B(v32, v32, 0, 0);
+  // CHECK: @llvm.hexagon.V6.vlutb.acc.128B
+  __builtin_HEXAGON_V6_vlutb_acc(v16, v16, 0, 0);
+  // CHECK: @llvm.hexagon.V6.vlutb.acc
+  __builtin_HEXAGON_V6_vlutb_dv_128B(v64, 0, 0);
+  // CHECK: @llvm.hexagon.V6.vlutb.dv.128B
+  __builtin_HEXAGON_V6_vlutb_dv_acc_128B(v64, v64, 0, 0);
+  // CHECK: @llvm.hexagon.V6.vlutb.dv.acc.128B
+  __builtin_HEXAGON_V6_vlutb_dv_acc(v32, v32, 0, 0);
+  // CHECK: @llvm.hexagon.V6.vlutb.dv.acc
+  __builtin_HEXAGON_V6_vlutb_dv(v32, 0, 0);
+  // CHECK: @llvm.hexagon.V6.vlutb.dv
+  __builtin_HEXAGON_V6_vlutb(v16, 0, 0);
+  // CHECK: @llvm.hexagon.V6.vlutb
+  __builtin_HEXAGON_V6_vlutvvb_128B(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vlutvvb.128B
+  __builtin_HEXAGON_V6_vlutvvb_oracc_128B(v32, v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vlutvvb.oracc.128B
+  __builtin_HEXAGON_V6_vlutvvb_oracc(v16, v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vlutvvb.oracc
+  __builtin_HEXAGON_V6_vlutvvb(v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vlutvvb
+  __builtin_HEXAGON_V6_vlutvwh_128B(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vlutvwh.128B
+  __builtin_HEXAGON_V6_vlutvwh_oracc_128B(v64, v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vlutvwh.oracc.128B
+  __builtin_HEXAGON_V6_vlutvwh_oracc(v32, v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vlutvwh.oracc
+  __builtin_HEXAGON_V6_vlutvwh(v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vlutvwh
+  __builtin_HEXAGON_V6_vmaxh_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmaxh.128B
+  __builtin_HEXAGON_V6_vmaxh(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmaxh
+  __builtin_HEXAGON_V6_vmaxub_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmaxub.128B
+  __builtin_HEXAGON_V6_vmaxub(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmaxub
+  __builtin_HEXAGON_V6_vmaxuh_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmaxuh.128B
+  __builtin_HEXAGON_V6_vmaxuh(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmaxuh
+  __builtin_HEXAGON_V6_vmaxw_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmaxw.128B
+  __builtin_HEXAGON_V6_vmaxw(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmaxw
+  __builtin_HEXAGON_V6_vminh_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vminh.128B
+  __builtin_HEXAGON_V6_vminh(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vminh
+  __builtin_HEXAGON_V6_vminub_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vminub.128B
+  __builtin_HEXAGON_V6_vminub(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vminub
+  __builtin_HEXAGON_V6_vminuh_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vminuh.128B
+  __builtin_HEXAGON_V6_vminuh(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vminuh
+  __builtin_HEXAGON_V6_vminw_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vminw.128B
+  __builtin_HEXAGON_V6_vminw(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vminw
+  __builtin_HEXAGON_V6_vmpabus_128B(v64, 0);
+  // CHECK: @llvm.hexagon.V6.vmpabus.128B
+  __builtin_HEXAGON_V6_vmpabus_acc_128B(v64, v64, 0);
+  // CHECK: @llvm.hexagon.V6.vmpabus.acc.128B
+  __builtin_HEXAGON_V6_vmpabus_acc(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vmpabus.acc
+  __builtin_HEXAGON_V6_vmpabusv_128B(v64, v64);
+  // CHECK: @llvm.hexagon.V6.vmpabusv.128B
+  __builtin_HEXAGON_V6_vmpabus(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vmpabus
+  __builtin_HEXAGON_V6_vmpabusv(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmpabusv
+  __builtin_HEXAGON_V6_vmpabuuv_128B(v64, v64);
+  // CHECK: @llvm.hexagon.V6.vmpabuuv.128B
+  __builtin_HEXAGON_V6_vmpabuuv(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmpabuuv
+  __builtin_HEXAGON_V6_vmpahb_128B(v64, 0);
+  // CHECK: @llvm.hexagon.V6.vmpahb.128B
+  __builtin_HEXAGON_V6_vmpahb_acc_128B(v64, v64, 0);
+  // CHECK: @llvm.hexagon.V6.vmpahb.acc.128B
+  __builtin_HEXAGON_V6_vmpahb_acc(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vmpahb.acc
+  __builtin_HEXAGON_V6_vmpahb(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vmpahb
+  __builtin_HEXAGON_V6_vmpybus_128B(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vmpybus.128B
+  __builtin_HEXAGON_V6_vmpybus_acc_128B(v64, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vmpybus.acc.128B
+  __builtin_HEXAGON_V6_vmpybus_acc(v32, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vmpybus.acc
+  __builtin_HEXAGON_V6_vmpybusv_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmpybusv.128B
+  __builtin_HEXAGON_V6_vmpybus(v16, 0);
+  // CHECK: @llvm.hexagon.V6.vmpybus
+  __builtin_HEXAGON_V6_vmpybusv_acc_128B(v64, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmpybusv.acc.128B
+  __builtin_HEXAGON_V6_vmpybusv_acc(v32, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmpybusv.acc
+  __builtin_HEXAGON_V6_vmpybusv(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmpybusv
+  __builtin_HEXAGON_V6_vmpybv_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmpybv.128B
+  __builtin_HEXAGON_V6_vmpybv_acc_128B(v64, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmpybv.acc.128B
+  __builtin_HEXAGON_V6_vmpybv_acc(v32, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmpybv.acc
+  __builtin_HEXAGON_V6_vmpybv(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmpybv
+  __builtin_HEXAGON_V6_vmpyewuh_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmpyewuh.128B
+  __builtin_HEXAGON_V6_vmpyewuh(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmpyewuh
+  __builtin_HEXAGON_V6_vmpyh_128B(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vmpyh.128B
+  __builtin_HEXAGON_V6_vmpyhsat_acc_128B(v64, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vmpyhsat.acc.128B
+  __builtin_HEXAGON_V6_vmpyhsat_acc(v32, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vmpyhsat.acc
+  __builtin_HEXAGON_V6_vmpyhsrs_128B(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vmpyhsrs.128B
+  __builtin_HEXAGON_V6_vmpyhsrs(v16, 0);
+  // CHECK: @llvm.hexagon.V6.vmpyhsrs
+  __builtin_HEXAGON_V6_vmpyhss_128B(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vmpyhss.128B
+  __builtin_HEXAGON_V6_vmpyhss(v16, 0);
+  // CHECK: @llvm.hexagon.V6.vmpyhss
+  __builtin_HEXAGON_V6_vmpyhus_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmpyhus.128B
+  __builtin_HEXAGON_V6_vmpyhus_acc_128B(v64, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmpyhus.acc.128B
+  __builtin_HEXAGON_V6_vmpyhus_acc(v32, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmpyhus.acc
+  __builtin_HEXAGON_V6_vmpyhus(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmpyhus
+  __builtin_HEXAGON_V6_vmpyhv_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmpyhv.128B
+  __builtin_HEXAGON_V6_vmpyh(v16, 0);
+  // CHECK: @llvm.hexagon.V6.vmpyh
+  __builtin_HEXAGON_V6_vmpyhv_acc_128B(v64, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmpyhv.acc.128B
+  __builtin_HEXAGON_V6_vmpyhv_acc(v32, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmpyhv.acc
+  __builtin_HEXAGON_V6_vmpyhvsrs_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmpyhvsrs.128B
+  __builtin_HEXAGON_V6_vmpyhvsrs(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmpyhvsrs
+  __builtin_HEXAGON_V6_vmpyhv(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmpyhv
+  __builtin_HEXAGON_V6_vmpyieoh_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmpyieoh.128B
+  __builtin_HEXAGON_V6_vmpyieoh(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmpyieoh
+  __builtin_HEXAGON_V6_vmpyiewh_acc_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmpyiewh.acc.128B
+  __builtin_HEXAGON_V6_vmpyiewh_acc(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmpyiewh.acc
+  __builtin_HEXAGON_V6_vmpyiewuh_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmpyiewuh.128B
+  __builtin_HEXAGON_V6_vmpyiewuh_acc_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmpyiewuh.acc.128B
+  __builtin_HEXAGON_V6_vmpyiewuh_acc(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmpyiewuh.acc
+  __builtin_HEXAGON_V6_vmpyiewuh(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmpyiewuh
+  __builtin_HEXAGON_V6_vmpyih_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmpyih.128B
+  __builtin_HEXAGON_V6_vmpyih_acc_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmpyih.acc.128B
+  __builtin_HEXAGON_V6_vmpyih_acc(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmpyih.acc
+  __builtin_HEXAGON_V6_vmpyihb_128B(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vmpyihb.128B
+  __builtin_HEXAGON_V6_vmpyihb_acc_128B(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vmpyihb.acc.128B
+  __builtin_HEXAGON_V6_vmpyihb_acc(v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vmpyihb.acc
+  __builtin_HEXAGON_V6_vmpyihb(v16, 0);
+  // CHECK: @llvm.hexagon.V6.vmpyihb
+  __builtin_HEXAGON_V6_vmpyih(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmpyih
+  __builtin_HEXAGON_V6_vmpyiowh_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmpyiowh.128B
+  __builtin_HEXAGON_V6_vmpyiowh(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmpyiowh
+  __builtin_HEXAGON_V6_vmpyiwb_128B(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vmpyiwb.128B
+  __builtin_HEXAGON_V6_vmpyiwb_acc_128B(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vmpyiwb.acc.128B
+  __builtin_HEXAGON_V6_vmpyiwb_acc(v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vmpyiwb.acc
+  __builtin_HEXAGON_V6_vmpyiwb(v16, 0);
+  // CHECK: @llvm.hexagon.V6.vmpyiwb
+  __builtin_HEXAGON_V6_vmpyiwh_128B(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vmpyiwh.128B
+  __builtin_HEXAGON_V6_vmpyiwh_acc_128B(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vmpyiwh.acc.128B
+  __builtin_HEXAGON_V6_vmpyiwh_acc(v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vmpyiwh.acc
+  __builtin_HEXAGON_V6_vmpyiwh(v16, 0);
+  // CHECK: @llvm.hexagon.V6.vmpyiwh
+  __builtin_HEXAGON_V6_vmpyowh_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmpyowh.128B
+  __builtin_HEXAGON_V6_vmpyowh_rnd_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmpyowh.rnd.128B
+  __builtin_HEXAGON_V6_vmpyowh_rnd_sacc_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmpyowh.rnd.sacc.128B
+  __builtin_HEXAGON_V6_vmpyowh_rnd_sacc(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmpyowh.rnd.sacc
+  __builtin_HEXAGON_V6_vmpyowh_rnd(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmpyowh.rnd
+  __builtin_HEXAGON_V6_vmpyowh_sacc_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmpyowh.sacc.128B
+  __builtin_HEXAGON_V6_vmpyowh_sacc(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmpyowh.sacc
+  __builtin_HEXAGON_V6_vmpyowh(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmpyowh
+  __builtin_HEXAGON_V6_vmpyub_128B(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vmpyub.128B
+  __builtin_HEXAGON_V6_vmpyub_acc_128B(v64, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vmpyub.acc.128B
+  __builtin_HEXAGON_V6_vmpyub_acc(v32, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vmpyub.acc
+  __builtin_HEXAGON_V6_vmpyubv_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmpyubv.128B
+  __builtin_HEXAGON_V6_vmpyub(v16, 0);
+  // CHECK: @llvm.hexagon.V6.vmpyub
+  __builtin_HEXAGON_V6_vmpyubv_acc_128B(v64, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmpyubv.acc.128B
+  __builtin_HEXAGON_V6_vmpyubv_acc(v32, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmpyubv.acc
+  __builtin_HEXAGON_V6_vmpyubv(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmpyubv
+  __builtin_HEXAGON_V6_vmpyuh_128B(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vmpyuh.128B
+  __builtin_HEXAGON_V6_vmpyuh_acc_128B(v64, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vmpyuh.acc.128B
+  __builtin_HEXAGON_V6_vmpyuh_acc(v32, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vmpyuh.acc
+  __builtin_HEXAGON_V6_vmpyuhv_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmpyuhv.128B
+  __builtin_HEXAGON_V6_vmpyuh(v16, 0);
+  // CHECK: @llvm.hexagon.V6.vmpyuh
+  __builtin_HEXAGON_V6_vmpyuhv_acc_128B(v64, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmpyuhv.acc.128B
+  __builtin_HEXAGON_V6_vmpyuhv_acc(v32, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmpyuhv.acc
+  __builtin_HEXAGON_V6_vmpyuhv(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmpyuhv
+  __builtin_HEXAGON_V6_vmux_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vmux.128B
+  __builtin_HEXAGON_V6_vmux(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vmux
+  __builtin_HEXAGON_V6_vnavgh_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vnavgh.128B
+  __builtin_HEXAGON_V6_vnavgh(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vnavgh
+  __builtin_HEXAGON_V6_vnavgub_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vnavgub.128B
+  __builtin_HEXAGON_V6_vnavgub(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vnavgub
+  __builtin_HEXAGON_V6_vnavgw_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vnavgw.128B
+  __builtin_HEXAGON_V6_vnavgw(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vnavgw
+  __builtin_HEXAGON_V6_vnormamth_128B(v32);
+  // CHECK: @llvm.hexagon.V6.vnormamth.128B
+  __builtin_HEXAGON_V6_vnormamth(v16);
+  // CHECK: @llvm.hexagon.V6.vnormamth
+  __builtin_HEXAGON_V6_vnormamtw_128B(v32);
+  // CHECK: @llvm.hexagon.V6.vnormamtw.128B
+  __builtin_HEXAGON_V6_vnormamtw(v16);
+  // CHECK: @llvm.hexagon.V6.vnormamtw
+  __builtin_HEXAGON_V6_vnot_128B(v32);
+  // CHECK: @llvm.hexagon.V6.vnot.128B
+  __builtin_HEXAGON_V6_vnot(v16);
+  // CHECK: @llvm.hexagon.V6.vnot
+  __builtin_HEXAGON_V6_vor_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vor.128B
+  __builtin_HEXAGON_V6_vor(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vor
+  __builtin_HEXAGON_V6_vpackeb_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vpackeb.128B
+  __builtin_HEXAGON_V6_vpackeb(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vpackeb
+  __builtin_HEXAGON_V6_vpackeh_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vpackeh.128B
+  __builtin_HEXAGON_V6_vpackeh(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vpackeh
+  __builtin_HEXAGON_V6_vpackhb_sat_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vpackhb.sat.128B
+  __builtin_HEXAGON_V6_vpackhb_sat(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vpackhb.sat
+  __builtin_HEXAGON_V6_vpackhub_sat_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vpackhub.sat.128B
+  __builtin_HEXAGON_V6_vpackhub_sat(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vpackhub.sat
+  __builtin_HEXAGON_V6_vpackob_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vpackob.128B
+  __builtin_HEXAGON_V6_vpackob(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vpackob
+  __builtin_HEXAGON_V6_vpackoh_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vpackoh.128B
+  __builtin_HEXAGON_V6_vpackoh(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vpackoh
+  __builtin_HEXAGON_V6_vpackwh_sat_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vpackwh.sat.128B
+  __builtin_HEXAGON_V6_vpackwh_sat(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vpackwh.sat
+  __builtin_HEXAGON_V6_vpackwuh_sat_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vpackwuh.sat.128B
+  __builtin_HEXAGON_V6_vpackwuh_sat(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vpackwuh.sat
+  __builtin_HEXAGON_V6_vpopcounth_128B(v32);
+  // CHECK: @llvm.hexagon.V6.vpopcounth.128B
+  __builtin_HEXAGON_V6_vpopcounth(v16);
+  // CHECK: @llvm.hexagon.V6.vpopcounth
+  __builtin_HEXAGON_V6_vrdelta_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vrdelta.128B
+  __builtin_HEXAGON_V6_vrdelta(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vrdelta
+  __builtin_HEXAGON_V6_vrmpybus_128B(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vrmpybus.128B
+  __builtin_HEXAGON_V6_vrmpybus_acc_128B(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vrmpybus.acc.128B
+  __builtin_HEXAGON_V6_vrmpybus_acc(v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vrmpybus.acc
+  __builtin_HEXAGON_V6_vrmpybusi_128B(v64, 0, 0);
+  // CHECK: @llvm.hexagon.V6.vrmpybusi.128B
+  __builtin_HEXAGON_V6_vrmpybusi_acc_128B(v64, v64, 0, 0);
+  // CHECK: @llvm.hexagon.V6.vrmpybusi.acc.128B
+  __builtin_HEXAGON_V6_vrmpybusi_acc(v32, v32, 0, 0);
+  // CHECK: @llvm.hexagon.V6.vrmpybusi.acc
+  __builtin_HEXAGON_V6_vrmpybusi(v32, 0, 0);
+  // CHECK: @llvm.hexagon.V6.vrmpybusi
+  __builtin_HEXAGON_V6_vrmpybusv_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vrmpybusv.128B
+  __builtin_HEXAGON_V6_vrmpybus(v16, 0);
+  // CHECK: @llvm.hexagon.V6.vrmpybus
+  __builtin_HEXAGON_V6_vrmpybusv_acc_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vrmpybusv.acc.128B
+  __builtin_HEXAGON_V6_vrmpybusv_acc(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vrmpybusv.acc
+  __builtin_HEXAGON_V6_vrmpybusv(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vrmpybusv
+  __builtin_HEXAGON_V6_vrmpybv_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vrmpybv.128B
+  __builtin_HEXAGON_V6_vrmpybv_acc_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vrmpybv.acc.128B
+  __builtin_HEXAGON_V6_vrmpybv_acc(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vrmpybv.acc
+  __builtin_HEXAGON_V6_vrmpybv(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vrmpybv
+  __builtin_HEXAGON_V6_vrmpyub_128B(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vrmpyub.128B
+  __builtin_HEXAGON_V6_vrmpyub_acc_128B(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vrmpyub.acc.128B
+  __builtin_HEXAGON_V6_vrmpyub_acc(v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vrmpyub.acc
+  __builtin_HEXAGON_V6_vrmpyubi_128B(v64, 0, 0);
+  // CHECK: @llvm.hexagon.V6.vrmpyubi.128B
+  __builtin_HEXAGON_V6_vrmpyubi_acc_128B(v64, v64, 0, 0);
+  // CHECK: @llvm.hexagon.V6.vrmpyubi.acc.128B
+  __builtin_HEXAGON_V6_vrmpyubi_acc(v32, v32, 0, 0);
+  // CHECK: @llvm.hexagon.V6.vrmpyubi.acc
+  __builtin_HEXAGON_V6_vrmpyubi(v32, 0, 0);
+  // CHECK: @llvm.hexagon.V6.vrmpyubi
+  __builtin_HEXAGON_V6_vrmpyubv_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vrmpyubv.128B
+  __builtin_HEXAGON_V6_vrmpyub(v16, 0);
+  // CHECK: @llvm.hexagon.V6.vrmpyub
+  __builtin_HEXAGON_V6_vrmpyubv_acc_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vrmpyubv.acc.128B
+  __builtin_HEXAGON_V6_vrmpyubv_acc(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vrmpyubv.acc
+  __builtin_HEXAGON_V6_vrmpyubv(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vrmpyubv
+  __builtin_HEXAGON_V6_vror_128B(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vror.128B
+  __builtin_HEXAGON_V6_vror(v16, 0);
+  // CHECK: @llvm.hexagon.V6.vror
+  __builtin_HEXAGON_V6_vroundhb_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vroundhb.128B
+  __builtin_HEXAGON_V6_vroundhb(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vroundhb
+  __builtin_HEXAGON_V6_vroundhub_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vroundhub.128B
+  __builtin_HEXAGON_V6_vroundhub(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vroundhub
+  __builtin_HEXAGON_V6_vroundwh_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vroundwh.128B
+  __builtin_HEXAGON_V6_vroundwh(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vroundwh
+  __builtin_HEXAGON_V6_vroundwuh_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vroundwuh.128B
+  __builtin_HEXAGON_V6_vroundwuh(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vroundwuh
+  __builtin_HEXAGON_V6_vrsadubi_128B(v64, 0, 0);
+  // CHECK: @llvm.hexagon.V6.vrsadubi.128B
+  __builtin_HEXAGON_V6_vrsadubi_acc_128B(v64, v64, 0, 0);
+  // CHECK: @llvm.hexagon.V6.vrsadubi.acc.128B
+  __builtin_HEXAGON_V6_vrsadubi_acc(v32, v32, 0, 0);
+  // CHECK: @llvm.hexagon.V6.vrsadubi.acc
+  __builtin_HEXAGON_V6_vrsadubi(v32, 0, 0);
+  // CHECK: @llvm.hexagon.V6.vrsadubi
+  __builtin_HEXAGON_V6_vsathub_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vsathub.128B
+  __builtin_HEXAGON_V6_vsathub(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vsathub
+  __builtin_HEXAGON_V6_vsatwh_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vsatwh.128B
+  __builtin_HEXAGON_V6_vsatwh(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vsatwh
+  __builtin_HEXAGON_V6_vsb_128B(v32);
+  // CHECK: @llvm.hexagon.V6.vsb.128B
+  __builtin_HEXAGON_V6_vsb(v16);
+  // CHECK: @llvm.hexagon.V6.vsb
+  __builtin_HEXAGON_V6_vsh_128B(v32);
+  // CHECK: @llvm.hexagon.V6.vsh.128B
+  __builtin_HEXAGON_V6_vshufeh_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vshufeh.128B
+  __builtin_HEXAGON_V6_vshufeh(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vshufeh
+  __builtin_HEXAGON_V6_vshuffb_128B(v32);
+  // CHECK: @llvm.hexagon.V6.vshuffb.128B
+  __builtin_HEXAGON_V6_vshuffb(v16);
+  // CHECK: @llvm.hexagon.V6.vshuffb
+  __builtin_HEXAGON_V6_vshuffeb_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vshuffeb.128B
+  __builtin_HEXAGON_V6_vshuffeb(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vshuffeb
+  __builtin_HEXAGON_V6_vshuffh_128B(v32);
+  // CHECK: @llvm.hexagon.V6.vshuffh.128B
+  __builtin_HEXAGON_V6_vshuffh(v16);
+  // CHECK: @llvm.hexagon.V6.vshuffh
+  __builtin_HEXAGON_V6_vshuffob_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vshuffob.128B
+  __builtin_HEXAGON_V6_vshuffob(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vshuffob
+  __builtin_HEXAGON_V6_vshuffvdd_128B(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vshuffvdd.128B
+  __builtin_HEXAGON_V6_vshuffvdd(v16, v16, 0);
+  // CHECK: @llvm.hexagon.V6.vshuffvdd
+  __builtin_HEXAGON_V6_vshufoeb_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vshufoeb.128B
+  __builtin_HEXAGON_V6_vshufoeb(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vshufoeb
+  __builtin_HEXAGON_V6_vshufoeh_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vshufoeh.128B
+  __builtin_HEXAGON_V6_vshufoeh(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vshufoeh
+  __builtin_HEXAGON_V6_vshufoh_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vshufoh.128B
+  __builtin_HEXAGON_V6_vshufoh(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vshufoh
+  __builtin_HEXAGON_V6_vsh(v16);
+  // CHECK: @llvm.hexagon.V6.vsh
+  __builtin_HEXAGON_V6_vsubb_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vsubb.128B
+  __builtin_HEXAGON_V6_vsubb_dv_128B(v64, v64);
+  // CHECK: @llvm.hexagon.V6.vsubb.dv.128B
+  __builtin_HEXAGON_V6_vsubb_dv(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vsubb.dv
+  __builtin_HEXAGON_V6_vsubbnq_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vsubbnq.128B
+  __builtin_HEXAGON_V6_vsubbnq(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vsubbnq
+  __builtin_HEXAGON_V6_vsubbq_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vsubbq.128B
+  __builtin_HEXAGON_V6_vsubbq(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vsubbq
+  __builtin_HEXAGON_V6_vsubb(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vsubb
+  __builtin_HEXAGON_V6_vsubh_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vsubh.128B
+  __builtin_HEXAGON_V6_vsubh_dv_128B(v64, v64);
+  // CHECK: @llvm.hexagon.V6.vsubh.dv.128B
+  __builtin_HEXAGON_V6_vsubh_dv(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vsubh.dv
+  __builtin_HEXAGON_V6_vsubhnq_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vsubhnq.128B
+  __builtin_HEXAGON_V6_vsubhnq(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vsubhnq
+  __builtin_HEXAGON_V6_vsubhq_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vsubhq.128B
+  __builtin_HEXAGON_V6_vsubhq(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vsubhq
+  __builtin_HEXAGON_V6_vsubhsat_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vsubhsat.128B
+  __builtin_HEXAGON_V6_vsubhsat_dv_128B(v64, v64);
+  // CHECK: @llvm.hexagon.V6.vsubhsat.dv.128B
+  __builtin_HEXAGON_V6_vsubhsat_dv(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vsubhsat.dv
+  __builtin_HEXAGON_V6_vsubhsat(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vsubhsat
+  __builtin_HEXAGON_V6_vsubh(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vsubh
+  __builtin_HEXAGON_V6_vsubhw_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vsubhw.128B
+  __builtin_HEXAGON_V6_vsubhw(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vsubhw
+  __builtin_HEXAGON_V6_vsububh_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vsububh.128B
+  __builtin_HEXAGON_V6_vsububh(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vsububh
+  __builtin_HEXAGON_V6_vsububsat_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vsububsat.128B
+  __builtin_HEXAGON_V6_vsububsat_dv_128B(v64, v64);
+  // CHECK: @llvm.hexagon.V6.vsububsat.dv.128B
+  __builtin_HEXAGON_V6_vsububsat_dv(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vsububsat.dv
+  __builtin_HEXAGON_V6_vsububsat(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vsububsat
+  __builtin_HEXAGON_V6_vsubuhsat_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vsubuhsat.128B
+  __builtin_HEXAGON_V6_vsubuhsat_dv_128B(v64, v64);
+  // CHECK: @llvm.hexagon.V6.vsubuhsat.dv.128B
+  __builtin_HEXAGON_V6_vsubuhsat_dv(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vsubuhsat.dv
+  __builtin_HEXAGON_V6_vsubuhsat(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vsubuhsat
+  __builtin_HEXAGON_V6_vsubuhw_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vsubuhw.128B
+  __builtin_HEXAGON_V6_vsubuhw(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vsubuhw
+  __builtin_HEXAGON_V6_vsubw_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vsubw.128B
+  __builtin_HEXAGON_V6_vsubw_dv_128B(v64, v64);
+  // CHECK: @llvm.hexagon.V6.vsubw.dv.128B
+  __builtin_HEXAGON_V6_vsubw_dv(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vsubw.dv
+  __builtin_HEXAGON_V6_vsubwnq_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vsubwnq.128B
+  __builtin_HEXAGON_V6_vsubwnq(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vsubwnq
+  __builtin_HEXAGON_V6_vsubwq_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vsubwq.128B
+  __builtin_HEXAGON_V6_vsubwq(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vsubwq
+  __builtin_HEXAGON_V6_vsubwsat_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vsubwsat.128B
+  __builtin_HEXAGON_V6_vsubwsat_dv_128B(v64, v64);
+  // CHECK: @llvm.hexagon.V6.vsubwsat.dv.128B
+  __builtin_HEXAGON_V6_vsubwsat_dv(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vsubwsat.dv
+  __builtin_HEXAGON_V6_vsubwsat(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vsubwsat
+  __builtin_HEXAGON_V6_vsubw(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vsubw
+  __builtin_HEXAGON_V6_vswap_128B(v32, v32, v32);
+  // CHECK: @llvm.hexagon.V6.vswap.128B
+  __builtin_HEXAGON_V6_vswap(v16, v16, v16);
+  // CHECK: @llvm.hexagon.V6.vswap
+  __builtin_HEXAGON_V6_vtmpyb_128B(v64, 0);
+  // CHECK: @llvm.hexagon.V6.vtmpyb.128B
+  __builtin_HEXAGON_V6_vtmpyb_acc_128B(v64, v64, 0);
+  // CHECK: @llvm.hexagon.V6.vtmpyb.acc.128B
+  __builtin_HEXAGON_V6_vtmpyb_acc(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vtmpyb.acc
+  __builtin_HEXAGON_V6_vtmpybus_128B(v64, 0);
+  // CHECK: @llvm.hexagon.V6.vtmpybus.128B
+  __builtin_HEXAGON_V6_vtmpybus_acc_128B(v64, v64, 0);
+  // CHECK: @llvm.hexagon.V6.vtmpybus.acc.128B
+  __builtin_HEXAGON_V6_vtmpybus_acc(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vtmpybus.acc
+  __builtin_HEXAGON_V6_vtmpybus(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vtmpybus
+  __builtin_HEXAGON_V6_vtmpyb(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vtmpyb
+  __builtin_HEXAGON_V6_vtmpyhb_128B(v64, 0);
+  // CHECK: @llvm.hexagon.V6.vtmpyhb.128B
+  __builtin_HEXAGON_V6_vtmpyhb_acc_128B(v64, v64, 0);
+  // CHECK: @llvm.hexagon.V6.vtmpyhb.acc.128B
+  __builtin_HEXAGON_V6_vtmpyhb_acc(v32, v32, 0);
+  // CHECK: @llvm.hexagon.V6.vtmpyhb.acc
+  __builtin_HEXAGON_V6_vtmpyhb(v32, 0);
+  // CHECK: @llvm.hexagon.V6.vtmpyhb
+  __builtin_HEXAGON_V6_vunpackb_128B(v32);
+  // CHECK: @llvm.hexagon.V6.vunpackb.128B
+  __builtin_HEXAGON_V6_vunpackb(v16);
+  // CHECK: @llvm.hexagon.V6.vunpackb
+  __builtin_HEXAGON_V6_vunpackh_128B(v32);
+  // CHECK: @llvm.hexagon.V6.vunpackh.128B
+  __builtin_HEXAGON_V6_vunpackh(v16);
+  // CHECK: @llvm.hexagon.V6.vunpackh
+  __builtin_HEXAGON_V6_vunpackob_128B(v64, v32);
+  // CHECK: @llvm.hexagon.V6.vunpackob.128B
+  __builtin_HEXAGON_V6_vunpackob(v32, v16);
+  // CHECK: @llvm.hexagon.V6.vunpackob
+  __builtin_HEXAGON_V6_vunpackoh_128B(v64, v32);
+  // CHECK: @llvm.hexagon.V6.vunpackoh.128B
+  __builtin_HEXAGON_V6_vunpackoh(v32, v16);
+  // CHECK: @llvm.hexagon.V6.vunpackoh
+  __builtin_HEXAGON_V6_vunpackub_128B(v32);
+  // CHECK: @llvm.hexagon.V6.vunpackub.128B
+  __builtin_HEXAGON_V6_vunpackub(v16);
+  // CHECK: @llvm.hexagon.V6.vunpackub
+  __builtin_HEXAGON_V6_vunpackuh_128B(v32);
+  // CHECK: @llvm.hexagon.V6.vunpackuh.128B
+  __builtin_HEXAGON_V6_vunpackuh(v16);
+  // CHECK: @llvm.hexagon.V6.vunpackuh
+  __builtin_HEXAGON_V6_vxor_128B(v32, v32);
+  // CHECK: @llvm.hexagon.V6.vxor.128B
+  __builtin_HEXAGON_V6_vxor(v16, v16);
+  // CHECK: @llvm.hexagon.V6.vxor
+  __builtin_HEXAGON_V6_vzb_128B(v32);
+  // CHECK: @llvm.hexagon.V6.vzb.128B
+  __builtin_HEXAGON_V6_vzb(v16);
+  // CHECK: @llvm.hexagon.V6.vzb
+  __builtin_HEXAGON_V6_vzh_128B(v32);
+  // CHECK: @llvm.hexagon.V6.vzh.128B
+  __builtin_HEXAGON_V6_vzh(v16);
+  // CHECK: @llvm.hexagon.V6.vzh
+}
diff --git a/test/CodeGen/builtins-nvptx.c b/test/CodeGen/builtins-nvptx.c
index 745e74f0ca64a..cd21361140bcd 100644
--- a/test/CodeGen/builtins-nvptx.c
+++ b/test/CodeGen/builtins-nvptx.c
@@ -1,6 +1,8 @@
 // REQUIRES: nvptx-registered-target
-// RUN: %clang_cc1 -triple nvptx-unknown-unknown -fcuda-is-device -S -emit-llvm -o - -x cuda %s | FileCheck %s
-// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -fcuda-is-device -S -emit-llvm -o - -x cuda %s | FileCheck %s
+// RUN: %clang_cc1 -triple nvptx-unknown-unknown -fcuda-is-device -S -emit-llvm -o - -x cuda %s | \
+// RUN:   FileCheck -check-prefix=CHECK -check-prefix=LP32 %s
+// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -fcuda-is-device -S -emit-llvm -o - -x cuda %s | \
+// RUN:   FileCheck -check-prefix=CHECK -check-prefix=LP64 %s
 
 #define __device__ __attribute__((device))
 #define __global__ __attribute__((global))
@@ -9,15 +11,15 @@
 
 __device__ int read_tid() {
 
-// CHECK: call i32 @llvm.ptx.read.tid.x()
-// CHECK: call i32 @llvm.ptx.read.tid.y()
-// CHECK: call i32 @llvm.ptx.read.tid.z()
-// CHECK: call i32 @llvm.ptx.read.tid.w()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.z()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.w()
 
-  int x = __builtin_ptx_read_tid_x();
-  int y = __builtin_ptx_read_tid_y();
-  int z = __builtin_ptx_read_tid_z();
-  int w = __builtin_ptx_read_tid_w();
+  int x = __nvvm_read_ptx_sreg_tid_x();
+  int y = __nvvm_read_ptx_sreg_tid_y();
+  int z = __nvvm_read_ptx_sreg_tid_z();
+  int w = __nvvm_read_ptx_sreg_tid_w();
 
   return x + y + z + w;
 
@@ -25,15 +27,15 @@ __device__ int read_tid() {
 
 __device__ int read_ntid() {
 
-// CHECK: call i32 @llvm.ptx.read.ntid.x()
-// CHECK: call i32 @llvm.ptx.read.ntid.y()
-// CHECK: call i32 @llvm.ptx.read.ntid.z()
-// CHECK: call i32 @llvm.ptx.read.ntid.w()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.w()
 
-  int x = __builtin_ptx_read_ntid_x();
-  int y = __builtin_ptx_read_ntid_y();
-  int z = __builtin_ptx_read_ntid_z();
-  int w = __builtin_ptx_read_ntid_w();
+  int x = __nvvm_read_ptx_sreg_ntid_x();
+  int y = __nvvm_read_ptx_sreg_ntid_y();
+  int z = __nvvm_read_ptx_sreg_ntid_z();
+  int w = __nvvm_read_ptx_sreg_ntid_w();
 
   return x + y + z + w;
 
@@ -41,15 +43,15 @@ __device__ int read_ntid() {
 
 __device__ int read_ctaid() {
 
-// CHECK: call i32 @llvm.ptx.read.ctaid.x()
-// CHECK: call i32 @llvm.ptx.read.ctaid.y()
-// CHECK: call i32 @llvm.ptx.read.ctaid.z()
-// CHECK: call i32 @llvm.ptx.read.ctaid.w()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.w()
 
-  int x = __builtin_ptx_read_ctaid_x();
-  int y = __builtin_ptx_read_ctaid_y();
-  int z = __builtin_ptx_read_ctaid_z();
-  int w = __builtin_ptx_read_ctaid_w();
+  int x = __nvvm_read_ptx_sreg_ctaid_x();
+  int y = __nvvm_read_ptx_sreg_ctaid_y();
+  int z = __nvvm_read_ptx_sreg_ctaid_z();
+  int w = __nvvm_read_ptx_sreg_ctaid_w();
 
   return x + y + z + w;
 
@@ -57,15 +59,15 @@ __device__ int read_ctaid() {
 
 __device__ int read_nctaid() {
 
-// CHECK: call i32 @llvm.ptx.read.nctaid.x()
-// CHECK: call i32 @llvm.ptx.read.nctaid.y()
-// CHECK: call i32 @llvm.ptx.read.nctaid.z()
-// CHECK: call i32 @llvm.ptx.read.nctaid.w()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.w()
 
-  int x = __builtin_ptx_read_nctaid_x();
-  int y = __builtin_ptx_read_nctaid_y();
-  int z = __builtin_ptx_read_nctaid_z();
-  int w = __builtin_ptx_read_nctaid_w();
+  int x = __nvvm_read_ptx_sreg_nctaid_x();
+  int y = __nvvm_read_ptx_sreg_nctaid_y();
+  int z = __nvvm_read_ptx_sreg_nctaid_z();
+  int w = __nvvm_read_ptx_sreg_nctaid_w();
 
   return x + y + z + w;
 
@@ -73,19 +75,19 @@ __device__ int read_nctaid() {
 
 __device__ int read_ids() {
 
-// CHECK: call i32 @llvm.ptx.read.laneid()
-// CHECK: call i32 @llvm.ptx.read.warpid()
-// CHECK: call i32 @llvm.ptx.read.nwarpid()
-// CHECK: call i32 @llvm.ptx.read.smid()
-// CHECK: call i32 @llvm.ptx.read.nsmid()
-// CHECK: call i32 @llvm.ptx.read.gridid()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.laneid()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.warpid()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nwarpid()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.smid()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nsmid()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.gridid()
 
-  int a = __builtin_ptx_read_laneid();
-  int b = __builtin_ptx_read_warpid();
-  int c = __builtin_ptx_read_nwarpid();
-  int d = __builtin_ptx_read_smid();
-  int e = __builtin_ptx_read_nsmid();
-  int f = __builtin_ptx_read_gridid();
+  int a = __nvvm_read_ptx_sreg_laneid();
+  int b = __nvvm_read_ptx_sreg_warpid();
+  int c = __nvvm_read_ptx_sreg_nwarpid();
+  int d = __nvvm_read_ptx_sreg_smid();
+  int e = __nvvm_read_ptx_sreg_nsmid();
+  int f = __nvvm_read_ptx_sreg_gridid();
 
   return a + b + c + d + e + f;
 
@@ -93,17 +95,17 @@ __device__ int read_ids() {
 
 __device__ int read_lanemasks() {
 
-// CHECK: call i32 @llvm.ptx.read.lanemask.eq()
-// CHECK: call i32 @llvm.ptx.read.lanemask.le()
-// CHECK: call i32 @llvm.ptx.read.lanemask.lt()
-// CHECK: call i32 @llvm.ptx.read.lanemask.ge()
-// CHECK: call i32 @llvm.ptx.read.lanemask.gt()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.eq()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.le()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.lt()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.ge()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.gt()
 
-  int a = __builtin_ptx_read_lanemask_eq();
-  int b = __builtin_ptx_read_lanemask_le();
-  int c = __builtin_ptx_read_lanemask_lt();
-  int d = __builtin_ptx_read_lanemask_ge();
-  int e = __builtin_ptx_read_lanemask_gt();
+  int a = __nvvm_read_ptx_sreg_lanemask_eq();
+  int b = __nvvm_read_ptx_sreg_lanemask_le();
+  int c = __nvvm_read_ptx_sreg_lanemask_lt();
+  int d = __nvvm_read_ptx_sreg_lanemask_ge();
+  int e = __nvvm_read_ptx_sreg_lanemask_gt();
 
   return a + b + c + d + e;
 
@@ -111,26 +113,26 @@ __device__ int read_lanemasks() {
 
 __device__ long long read_clocks() {
 
-// CHECK: call i32 @llvm.ptx.read.clock()
-// CHECK: call i64 @llvm.ptx.read.clock64()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.clock()
+// CHECK: call i64 @llvm.nvvm.read.ptx.sreg.clock64()
 
-  int a = __builtin_ptx_read_clock();
-  long long b = __builtin_ptx_read_clock64();
+  int a = __nvvm_read_ptx_sreg_clock();
+  long long b = __nvvm_read_ptx_sreg_clock64();
 
   return a + b;
 }
 
 __device__ int read_pms() {
 
-// CHECK: call i32 @llvm.ptx.read.pm0()
-// CHECK: call i32 @llvm.ptx.read.pm1()
-// CHECK: call i32 @llvm.ptx.read.pm2()
-// CHECK: call i32 @llvm.ptx.read.pm3()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.pm0()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.pm1()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.pm2()
+// CHECK: call i32 @llvm.nvvm.read.ptx.sreg.pm3()
 
-  int a = __builtin_ptx_read_pm0();
-  int b = __builtin_ptx_read_pm1();
-  int c = __builtin_ptx_read_pm2();
-  int d = __builtin_ptx_read_pm3();
+  int a = __nvvm_read_ptx_sreg_pm0();
+  int b = __nvvm_read_ptx_sreg_pm1();
+  int c = __nvvm_read_ptx_sreg_pm2();
+  int d = __nvvm_read_ptx_sreg_pm3();
 
   return a + b + c + d;
 
@@ -138,9 +140,9 @@ __device__ int read_pms() {
 
 __device__ void sync() {
 
-// CHECK: call void @llvm.ptx.bar.sync(i32 0)
+// CHECK: call void @llvm.nvvm.bar.sync(i32 0)
 
-  __builtin_ptx_bar_sync(0);
+  __nvvm_bar_sync(0);
 
 }
 
@@ -177,7 +179,7 @@ __device__ void nvvm_math(float f1, float f2, double d1, double d2) {
 // CHECK: call void @llvm.nvvm.membar.sys()
   __nvvm_membar_sys();
 // CHECK: call void @llvm.nvvm.barrier0()
-  __nvvm_bar0();
+  __syncthreads();
 }
 
 __device__ int di;
@@ -189,7 +191,7 @@ __shared__ long long sll;
 
 // Check for atomic intrinsics
 // CHECK-LABEL: nvvm_atom
-__device__ void nvvm_atom(float *fp, float f, int *ip, int i, long *lp, long l,
+__device__ void nvvm_atom(float *fp, float f, int *ip, int i, unsigned int *uip, unsigned ui, long *lp, long l,
                           long long *llp, long long ll) {
   // CHECK: atomicrmw add
   __nvvm_atom_add_gen_i(ip, i);
@@ -272,5 +274,111 @@ __device__ void nvvm_atom(float *fp, float f, int *ip, int i, long *lp, long l,
   // CHECK: call float @llvm.nvvm.atomic.load.add.f32.p0f32
   __nvvm_atom_add_gen_f(fp, f);
 
+  // CHECK: call i32 @llvm.nvvm.atomic.load.inc.32.p0i32
+  __nvvm_atom_inc_gen_ui(uip, ui);
+
+  // CHECK: call i32 @llvm.nvvm.atomic.load.dec.32.p0i32
+  __nvvm_atom_dec_gen_ui(uip, ui);
+
   // CHECK: ret
 }
+
+// CHECK-LABEL: nvvm_ldg
+__device__ void nvvm_ldg(const void *p) {
+  // CHECK: call i8 @llvm.nvvm.ldg.global.i.i8.p0i8(i8* {{%[0-9]+}}, i32 1)
+  // CHECK: call i8 @llvm.nvvm.ldg.global.i.i8.p0i8(i8* {{%[0-9]+}}, i32 1)
+  __nvvm_ldg_c((const char *)p);
+  __nvvm_ldg_uc((const unsigned char *)p);
+
+  // CHECK: call i16 @llvm.nvvm.ldg.global.i.i16.p0i16(i16* {{%[0-9]+}}, i32 2)
+  // CHECK: call i16 @llvm.nvvm.ldg.global.i.i16.p0i16(i16* {{%[0-9]+}}, i32 2)
+  __nvvm_ldg_s((const short *)p);
+  __nvvm_ldg_us((const unsigned short *)p);
+
+  // CHECK: call i32 @llvm.nvvm.ldg.global.i.i32.p0i32(i32* {{%[0-9]+}}, i32 4)
+  // CHECK: call i32 @llvm.nvvm.ldg.global.i.i32.p0i32(i32* {{%[0-9]+}}, i32 4)
+  __nvvm_ldg_i((const int *)p);
+  __nvvm_ldg_ui((const unsigned int *)p);
+
+  // LP32: call i32 @llvm.nvvm.ldg.global.i.i32.p0i32(i32* {{%[0-9]+}}, i32 4)
+  // LP32: call i32 @llvm.nvvm.ldg.global.i.i32.p0i32(i32* {{%[0-9]+}}, i32 4)
+  // LP64: call i64 @llvm.nvvm.ldg.global.i.i64.p0i64(i64* {{%[0-9]+}}, i32 8)
+  // LP64: call i64 @llvm.nvvm.ldg.global.i.i64.p0i64(i64* {{%[0-9]+}}, i32 8)
+  __nvvm_ldg_l((const long *)p);
+  __nvvm_ldg_ul((const unsigned long *)p);
+
+  // CHECK: call float @llvm.nvvm.ldg.global.f.f32.p0f32(float* {{%[0-9]+}}, i32 4)
+  __nvvm_ldg_f((const float *)p);
+  // CHECK: call double @llvm.nvvm.ldg.global.f.f64.p0f64(double* {{%[0-9]+}}, i32 8)
+  __nvvm_ldg_d((const double *)p);
+
+  // In practice, the pointers we pass to __ldg will be aligned as appropriate
+  // for the CUDA <type>N vector types (e.g. short4), which are not the same as
+  // the LLVM vector types.  However, each LLVM vector type has an alignment
+  // less than or equal to its corresponding CUDA type, so we're OK.
+  //
+  // PTX Interoperability section 2.2: "For a vector with an even number of
+  // elements, its alignment is set to number of elements times the alignment of
+  // its member: n*alignof(t)."
+
+  // CHECK: call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0v2i8(<2 x i8>* {{%[0-9]+}}, i32 2)
+  // CHECK: call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0v2i8(<2 x i8>* {{%[0-9]+}}, i32 2)
+  typedef char char2 __attribute__((ext_vector_type(2)));
+  typedef unsigned char uchar2 __attribute__((ext_vector_type(2)));
+  __nvvm_ldg_c2((const char2 *)p);
+  __nvvm_ldg_uc2((const uchar2 *)p);
+
+  // CHECK: call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0v4i8(<4 x i8>* {{%[0-9]+}}, i32 4)
+  // CHECK: call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0v4i8(<4 x i8>* {{%[0-9]+}}, i32 4)
+  typedef char char4 __attribute__((ext_vector_type(4)));
+  typedef unsigned char uchar4 __attribute__((ext_vector_type(4)));
+  __nvvm_ldg_c4((const char4 *)p);
+  __nvvm_ldg_uc4((const uchar4 *)p);
+
+  // CHECK: call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0v2i16(<2 x i16>* {{%[0-9]+}}, i32 4)
+  // CHECK: call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0v2i16(<2 x i16>* {{%[0-9]+}}, i32 4)
+  typedef short short2 __attribute__((ext_vector_type(2)));
+  typedef unsigned short ushort2 __attribute__((ext_vector_type(2)));
+  __nvvm_ldg_s2((const short2 *)p);
+  __nvvm_ldg_us2((const ushort2 *)p);
+
+  // CHECK: call <4 x i16> @llvm.nvvm.ldg.global.i.v4i16.p0v4i16(<4 x i16>* {{%[0-9]+}}, i32 8)
+  // CHECK: call <4 x i16> @llvm.nvvm.ldg.global.i.v4i16.p0v4i16(<4 x i16>* {{%[0-9]+}}, i32 8)
+  typedef short short4 __attribute__((ext_vector_type(4)));
+  typedef unsigned short ushort4 __attribute__((ext_vector_type(4)));
+  __nvvm_ldg_s4((const short4 *)p);
+  __nvvm_ldg_us4((const ushort4 *)p);
+
+  // CHECK: call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0v2i32(<2 x i32>* {{%[0-9]+}}, i32 8)
+  // CHECK: call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0v2i32(<2 x i32>* {{%[0-9]+}}, i32 8)
+  typedef int int2 __attribute__((ext_vector_type(2)));
+  typedef unsigned int uint2 __attribute__((ext_vector_type(2)));
+  __nvvm_ldg_i2((const int2 *)p);
+  __nvvm_ldg_ui2((const uint2 *)p);
+
+  // CHECK: call <4 x i32> @llvm.nvvm.ldg.global.i.v4i32.p0v4i32(<4 x i32>* {{%[0-9]+}}, i32 16)
+  // CHECK: call <4 x i32> @llvm.nvvm.ldg.global.i.v4i32.p0v4i32(<4 x i32>* {{%[0-9]+}}, i32 16)
+  typedef int int4 __attribute__((ext_vector_type(4)));
+  typedef unsigned int uint4 __attribute__((ext_vector_type(4)));
+  __nvvm_ldg_i4((const int4 *)p);
+  __nvvm_ldg_ui4((const uint4 *)p);
+
+  // CHECK: call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0v2i64(<2 x i64>* {{%[0-9]+}}, i32 16)
+  // CHECK: call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0v2i64(<2 x i64>* {{%[0-9]+}}, i32 16)
+  typedef long long longlong2 __attribute__((ext_vector_type(2)));
+  typedef unsigned long long ulonglong2 __attribute__((ext_vector_type(2)));
+  __nvvm_ldg_ll2((const longlong2 *)p);
+  __nvvm_ldg_ull2((const ulonglong2 *)p);
+
+  // CHECK: call <2 x float> @llvm.nvvm.ldg.global.f.v2f32.p0v2f32(<2 x float>* {{%[0-9]+}}, i32 8)
+  typedef float float2 __attribute__((ext_vector_type(2)));
+  __nvvm_ldg_f2((const float2 *)p);
+
+  // CHECK: call <4 x float> @llvm.nvvm.ldg.global.f.v4f32.p0v4f32(<4 x float>* {{%[0-9]+}}, i32 16)
+  typedef float float4 __attribute__((ext_vector_type(4)));
+  __nvvm_ldg_f4((const float4 *)p);
+
+  // CHECK: call <2 x double> @llvm.nvvm.ldg.global.f.v2f64.p0v2f64(<2 x double>* {{%[0-9]+}}, i32 16)
+  typedef double double2 __attribute__((ext_vector_type(2)));
+  __nvvm_ldg_d2((const double2 *)p);
+}
diff --git a/test/CodeGen/builtins-ppc-altivec.c b/test/CodeGen/builtins-ppc-altivec.c
index 9539d6ca7afcd..1edf99f8681cf 100644
--- a/test/CodeGen/builtins-ppc-altivec.c
+++ b/test/CodeGen/builtins-ppc-altivec.c
@@ -1,7 +1,16 @@
 // REQUIRES: powerpc-registered-target
-// RUN: %clang_cc1 -faltivec -triple powerpc-unknown-unknown -emit-llvm %s -o - | FileCheck %s
-// RUN: %clang_cc1 -faltivec -triple powerpc64-unknown-unknown -emit-llvm %s -o - | FileCheck %s
-// RUN: %clang_cc1 -faltivec -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s -check-prefix=CHECK-LE
+// RUN: %clang_cc1 -faltivec -triple powerpc-unknown-unknown -emit-llvm %s \
+// RUN:            -o - | FileCheck %s
+// RUN: %clang_cc1 -faltivec -triple powerpc64-unknown-unknown -emit-llvm %s \
+// RUN:            -o - | FileCheck %s
+// RUN: %clang_cc1 -faltivec -triple powerpc64le-unknown-unknown -emit-llvm %s \
+// RUN:            -o - | FileCheck %s -check-prefix=CHECK-LE
+// RUN: not %clang_cc1 -triple powerpc64le-unknown-unknown -emit-llvm %s \
+// RUN:            -ferror-limit 0 -DNO_ALTIVEC -o - 2>&1 \
+// RUN:            | FileCheck %s -check-prefix=CHECK-NOALTIVEC
+#ifndef NO_ALTIVEC
+#include <altivec.h>
+#endif
 
 vector bool char vbc = { 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0 };
 vector signed char vsc = { 1, -2, 3, -4, 5, -6, 7, -8, 9, -10, 11, -12, 13, -14, 15, -16 };
@@ -27,6 +36,8 @@ vector int res_vi;
 vector unsigned int res_vui;
 vector float res_vf;
 
+// CHECK-NOALTIVEC: error: unknown type name 'vector'
+
 signed char param_sc;
 unsigned char param_uc;
 short param_s;
@@ -66,8 +77,16 @@ void test1() {
 // CHECK-LE: @llvm.ppc.altivec.vmaxsw
 
   vf = vec_abs(vf);
-// CHECK: and <4 x i32>
-// CHECK-LE: and <4 x i32>
+// CHECK: bitcast <4 x float> %{{.*}} to <4 x i32>
+// CHECK: and <4 x i32> {{.*}}, <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
+// CHECK: bitcast <4 x i32> %{{.*}} to <4 x float>
+// CHECK: store <4 x float> %{{.*}}, <4 x float>* @vf
+// CHECK-LE: bitcast <4 x float> %{{.*}} to <4 x i32>
+// CHECK-LE: and <4 x i32> {{.*}}, <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
+// CHECK-LE: bitcast <4 x i32> %{{.*}} to <4 x float>
+// CHECK-LE: store <4 x float> %{{.*}}, <4 x float>* @vf
+// CHECK-NOALTIVEC: error: use of undeclared identifier 'vf'
+// CHECK-NOALTIVEC: vf = vec_abs(vf) 
 
   /* vec_abs */
   vsc = vec_abss(vsc);
diff --git a/test/CodeGen/builtins-ppc-p8vector.c b/test/CodeGen/builtins-ppc-p8vector.c
index 29503f0134be3..096e3e1bb6b54 100644
--- a/test/CodeGen/builtins-ppc-p8vector.c
+++ b/test/CodeGen/builtins-ppc-p8vector.c
@@ -6,6 +6,7 @@
 // generate the correct errors for functions that are only overloaded with VSX
 // (vec_cmpge, vec_cmple). Without this option, there is only one overload so
 // it is selected.
+#include <altivec.h>
 
 void dummy() { }
 signed int si;
@@ -73,10 +74,10 @@ void test1() {
 // CHECK-PPC: error: call to 'vec_abs' is ambiguous
 
   res_vd = vec_abs(vda);
-// CHECK: store <2 x i64> <i64 9223372036854775807, i64 9223372036854775807>, <2 x i64>*
-// CHECK: and <2 x i64>
-// CHECK-LE: store <2 x i64> <i64 9223372036854775807, i64 9223372036854775807>, <2 x i64>*
-// CHECK-LE: and <2 x i64>
+// CHECK: call <2 x double> @llvm.fabs.v2f64(<2 x double> %{{.*}})
+// CHECK: store <2 x double> %{{.*}}, <2 x double>* @res_vd
+// CHECK-LE: call <2 x double> @llvm.fabs.v2f64(<2 x double> %{{.*}})
+// CHECK-LE: store <2 x double> %{{.*}}, <2 x double>* @res_vd
 // CHECK-PPC: error: call to 'vec_abs' is ambiguous
 
   /* vec_add */
diff --git a/test/CodeGen/builtins-ppc-quadword.c b/test/CodeGen/builtins-ppc-quadword.c
index e17b6791d5f6d..f381642c422f6 100644
--- a/test/CodeGen/builtins-ppc-quadword.c
+++ b/test/CodeGen/builtins-ppc-quadword.c
@@ -8,6 +8,7 @@
 
 // RUN: not %clang_cc1 -faltivec -triple powerpc-unknown-unknown \
 // RUN: -emit-llvm %s -o - 2>&1 | FileCheck %s -check-prefix=CHECK-PPC
+#include <altivec.h>
 
 // CHECK-PPC: error: __int128 is not supported on this target
 vector signed __int128 vlll = { -1 };
diff --git a/test/CodeGen/builtins-ppc-vsx.c b/test/CodeGen/builtins-ppc-vsx.c
index 15f98b57a5139..e58afdd94daba 100644
--- a/test/CodeGen/builtins-ppc-vsx.c
+++ b/test/CodeGen/builtins-ppc-vsx.c
@@ -1,27 +1,63 @@
 // REQUIRES: powerpc-registered-target
 // RUN: %clang_cc1 -faltivec -target-feature +vsx -triple powerpc64-unknown-unknown -emit-llvm %s -o - | FileCheck %s
 // RUN: %clang_cc1 -faltivec -target-feature +vsx -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s -check-prefix=CHECK-LE
+#include <altivec.h>
 
+vector bool char vbc = { 0, 1, 0, 1, 0, 1, 0, 1,
+                         0, 1, 0, 1, 0, 1, 0, 1 };
+vector signed char vsc = { -8,  9, -10, 11, -12, 13, -14, 15,
+                           -0,  1,  -2,  3,  -4,  5,  -6,  7};
 vector unsigned char vuc = { 8,  9, 10, 11, 12, 13, 14, 15,
                              0,  1,  2,  3,  4,  5,  6,  7};
 vector float vf = { -1.5, 2.5, -3.5, 4.5 };
 vector double vd = { 3.5, -7.5 };
+vector bool short vbs = { 0, 1, 0, 1, 0, 1, 0, 1 };
+vector signed short vss = { -1, 2, -3, 4, -5, 6, -7, 8 };
+vector unsigned short vus = { 0, 1, 2, 3, 4, 5, 6, 7 };
+vector bool int vbi = { 0, 1, 0, 1 };
 vector signed int vsi = { -1, 2, -3, 4 };
 vector unsigned int vui = { 0, 1, 2, 3 };
 vector bool long long vbll = { 1, 0 };
 vector signed long long vsll = { 255LL, -937LL };
 vector unsigned long long vull = { 1447LL, 2894LL };
 double d = 23.4;
+float af[4] = {23.4f, 56.7f, 89.0f, 12.3f};
+double ad[2] = {23.4, 56.7};
+signed char asc[16] = { -8,  9, -10, 11, -12, 13, -14, 15,
+                        -0,  1,  -2,  3,  -4,  5,  -6,  7};
+unsigned char auc[16] = { 8,  9, 10, 11, 12, 13, 14, 15,
+                          0,  1,  2,  3,  4,  5,  6,  7};
+signed short ass[8] = { -1, 2, -3, 4, -5, 6, -7, 8 };
+unsigned short aus[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
+signed int asi[4] = { -1, 2, -3, 4 };
+unsigned int aui[4] = { 0, 1, 2, 3 };
+signed long asl[2] = { -1L, 2L };
+unsigned long aul[2] = { 1L, 2L };
 
 vector float res_vf;
 vector double res_vd;
+vector bool char res_vbc;
+vector signed char res_vsc;
+vector unsigned char res_vuc;
+vector bool short res_vbs;
+vector signed short res_vss;
+vector unsigned short res_vus;
+vector bool int res_vbi;
 vector signed int res_vsi;
 vector unsigned int res_vui;
-vector bool int res_vbi;
 vector bool long long res_vbll;
 vector signed long long res_vsll;
 vector unsigned long long res_vull;
+
 double res_d;
+float res_af[4];
+double res_ad[2];
+signed char res_asc[16];
+unsigned char res_auc[16];
+signed short res_ass[8];
+unsigned short res_aus[8];
+signed int res_asi[4];
+unsigned int res_aui[4];
 
 void dummy() { }
 
@@ -29,6 +65,14 @@ void test1() {
 // CHECK-LABEL: define void @test1
 // CHECK-LE-LABEL: define void @test1
 
+  res_vf = vec_abs(vf);
+// CHECK: call <4 x float> @llvm.fabs.v4f32(<4 x float> %{{[0-9]*}})
+// CHECK-LE: call <4 x float> @llvm.fabs.v4f32(<4 x float> %{{[0-9]*}})
+
+  dummy();
+// CHECK: call void @dummy()
+// CHECK-LE: call void @dummy()
+
   res_vd = vec_add(vd, vd);
 // CHECK: fadd <2 x double>
 // CHECK-LE: fadd <2 x double>
@@ -292,18 +336,34 @@ void test1() {
 
   /* vec_vsx_ld */
 
+  res_vbi = vec_vsx_ld(0, &vbi);
+// CHECK: @llvm.ppc.vsx.lxvw4x
+// CHECK-LE: @llvm.ppc.vsx.lxvw4x
+
   res_vsi = vec_vsx_ld(0, &vsi);
 // CHECK: @llvm.ppc.vsx.lxvw4x
 // CHECK-LE: @llvm.ppc.vsx.lxvw4x
 
+  res_vsi = vec_vsx_ld(0, asi);
+// CHECK: @llvm.ppc.vsx.lxvw4x
+// CHECK-LE: @llvm.ppc.vsx.lxvw4x
+
   res_vui = vec_vsx_ld(0, &vui);
 // CHECK: @llvm.ppc.vsx.lxvw4x
 // CHECK-LE: @llvm.ppc.vsx.lxvw4x
 
+  res_vui = vec_vsx_ld(0, aui);
+// CHECK: @llvm.ppc.vsx.lxvw4x
+// CHECK-LE: @llvm.ppc.vsx.lxvw4x
+
   res_vf = vec_vsx_ld (0, &vf);
 // CHECK: @llvm.ppc.vsx.lxvw4x
 // CHECK-LE: @llvm.ppc.vsx.lxvw4x
 
+  res_vf = vec_vsx_ld (0, af);
+// CHECK: @llvm.ppc.vsx.lxvw4x
+// CHECK-LE: @llvm.ppc.vsx.lxvw4x
+
   res_vsll = vec_vsx_ld(0, &vsll);
 // CHECK: @llvm.ppc.vsx.lxvd2x
 // CHECK-LE: @llvm.ppc.vsx.lxvd2x
@@ -316,20 +376,88 @@ void test1() {
 // CHECK: @llvm.ppc.vsx.lxvd2x
 // CHECK-LE: @llvm.ppc.vsx.lxvd2x
 
+  res_vd = vec_vsx_ld(0, ad);
+// CHECK: @llvm.ppc.vsx.lxvd2x
+// CHECK-LE: @llvm.ppc.vsx.lxvd2x
+
+  res_vbs = vec_vsx_ld(0, &vbs);
+// CHECK: @llvm.ppc.vsx.lxvw4x
+// CHECK-LE: @llvm.ppc.vsx.lxvw4x
+
+  res_vss = vec_vsx_ld(0, &vss);
+// CHECK: @llvm.ppc.vsx.lxvw4x
+// CHECK-LE: @llvm.ppc.vsx.lxvw4x
+
+  res_vss = vec_vsx_ld(0, ass);
+// CHECK: @llvm.ppc.vsx.lxvw4x
+// CHECK-LE: @llvm.ppc.vsx.lxvw4x
+
+  res_vus = vec_vsx_ld(0, &vus);
+// CHECK: @llvm.ppc.vsx.lxvw4x
+// CHECK-LE: @llvm.ppc.vsx.lxvw4x
+
+  res_vus = vec_vsx_ld(0, aus);
+// CHECK: @llvm.ppc.vsx.lxvw4x
+// CHECK-LE: @llvm.ppc.vsx.lxvw4x
+
+  res_vbc = vec_vsx_ld(0, &vbc);
+// CHECK: @llvm.ppc.vsx.lxvw4x
+// CHECK-LE: @llvm.ppc.vsx.lxvw4x
+
+  res_vsc = vec_vsx_ld(0, &vsc);
+// CHECK: @llvm.ppc.vsx.lxvw4x
+// CHECK-LE: @llvm.ppc.vsx.lxvw4x
+
+  res_vuc = vec_vsx_ld(0, &vuc);
+// CHECK: @llvm.ppc.vsx.lxvw4x
+// CHECK-LE: @llvm.ppc.vsx.lxvw4x
+
+  res_vsc = vec_vsx_ld(0, asc);
+// CHECK: @llvm.ppc.vsx.lxvw4x
+// CHECK-LE: @llvm.ppc.vsx.lxvw4x
+
+  res_vuc = vec_vsx_ld(0, auc);
+// CHECK: @llvm.ppc.vsx.lxvw4x
+// CHECK-LE: @llvm.ppc.vsx.lxvw4x
+
   /* vec_vsx_st */
 
+  vec_vsx_st(vbi, 0, &res_vbi);
+// CHECK: @llvm.ppc.vsx.stxvw4x
+// CHECK-LE: @llvm.ppc.vsx.stxvw4x
+
+  vec_vsx_st(vbi, 0, res_aui);
+// CHECK: @llvm.ppc.vsx.stxvw4x
+// CHECK-LE: @llvm.ppc.vsx.stxvw4x
+
+  vec_vsx_st(vbi, 0, res_asi);
+// CHECK: @llvm.ppc.vsx.stxvw4x
+// CHECK-LE: @llvm.ppc.vsx.stxvw4x
+
   vec_vsx_st(vsi, 0, &res_vsi);
 // CHECK: @llvm.ppc.vsx.stxvw4x
 // CHECK-LE: @llvm.ppc.vsx.stxvw4x
 
+  vec_vsx_st(vsi, 0, res_asi);
+// CHECK: @llvm.ppc.vsx.stxvw4x
+// CHECK-LE: @llvm.ppc.vsx.stxvw4x
+
   vec_vsx_st(vui, 0, &res_vui);
 // CHECK: @llvm.ppc.vsx.stxvw4x
 // CHECK-LE: @llvm.ppc.vsx.stxvw4x
 
+  vec_vsx_st(vui, 0, res_aui);
+// CHECK: @llvm.ppc.vsx.stxvw4x
+// CHECK-LE: @llvm.ppc.vsx.stxvw4x
+
   vec_vsx_st(vf, 0, &res_vf);
 // CHECK: @llvm.ppc.vsx.stxvw4x
 // CHECK-LE: @llvm.ppc.vsx.stxvw4x
 
+  vec_vsx_st(vf, 0, res_af);
+// CHECK: @llvm.ppc.vsx.stxvw4x
+// CHECK-LE: @llvm.ppc.vsx.stxvw4x
+
   vec_vsx_st(vsll, 0, &res_vsll);
 // CHECK: @llvm.ppc.vsx.stxvd2x
 // CHECK-LE: @llvm.ppc.vsx.stxvd2x
@@ -342,6 +470,66 @@ void test1() {
 // CHECK: @llvm.ppc.vsx.stxvd2x
 // CHECK-LE: @llvm.ppc.vsx.stxvd2x
 
+  vec_vsx_st(vd, 0, res_ad);
+// CHECK: @llvm.ppc.vsx.stxvd2x
+// CHECK-LE: @llvm.ppc.vsx.stxvd2x
+
+  vec_vsx_st(vbs, 0, &res_vbs);
+// CHECK: @llvm.ppc.vsx.stxvw4x
+// CHECK-LE: @llvm.ppc.vsx.stxvw4x
+
+  vec_vsx_st(vbs, 0, res_aus);
+// CHECK: @llvm.ppc.vsx.stxvw4x
+// CHECK-LE: @llvm.ppc.vsx.stxvw4x
+
+  vec_vsx_st(vbs, 0, res_ass);
+// CHECK: @llvm.ppc.vsx.stxvw4x
+// CHECK-LE: @llvm.ppc.vsx.stxvw4x
+
+  vec_vsx_st(vss, 0, &res_vss);
+// CHECK: @llvm.ppc.vsx.stxvw4x
+// CHECK-LE: @llvm.ppc.vsx.stxvw4x
+
+  vec_vsx_st(vss, 0, res_ass);
+// CHECK: @llvm.ppc.vsx.stxvw4x
+// CHECK-LE: @llvm.ppc.vsx.stxvw4x
+
+  vec_vsx_st(vus, 0, &res_vus);
+// CHECK: @llvm.ppc.vsx.stxvw4x
+// CHECK-LE: @llvm.ppc.vsx.stxvw4x
+
+  vec_vsx_st(vus, 0, res_aus);
+// CHECK: @llvm.ppc.vsx.stxvw4x
+// CHECK-LE: @llvm.ppc.vsx.stxvw4x
+
+  vec_vsx_st(vsc, 0, &res_vsc);
+// CHECK: @llvm.ppc.vsx.stxvw4x
+// CHECK-LE: @llvm.ppc.vsx.stxvw4x
+
+  vec_vsx_st(vsc, 0, res_asc);
+// CHECK: @llvm.ppc.vsx.stxvw4x
+// CHECK-LE: @llvm.ppc.vsx.stxvw4x
+
+  vec_vsx_st(vuc, 0, &res_vuc);
+// CHECK: @llvm.ppc.vsx.stxvw4x
+// CHECK-LE: @llvm.ppc.vsx.stxvw4x
+
+  vec_vsx_st(vuc, 0, res_auc);
+// CHECK: @llvm.ppc.vsx.stxvw4x
+// CHECK-LE: @llvm.ppc.vsx.stxvw4x
+
+  vec_vsx_st(vbc, 0, &res_vbc);
+// CHECK: @llvm.ppc.vsx.stxvw4x
+// CHECK-LE: @llvm.ppc.vsx.stxvw4x
+
+  vec_vsx_st(vbc, 0, res_asc);
+// CHECK: @llvm.ppc.vsx.stxvw4x
+// CHECK-LE: @llvm.ppc.vsx.stxvw4x
+
+  vec_vsx_st(vbc, 0, res_auc);
+// CHECK: @llvm.ppc.vsx.stxvw4x
+// CHECK-LE: @llvm.ppc.vsx.stxvw4x
+
   /* vec_and */
   res_vsll = vec_and(vsll, vsll);
 // CHECK: and <2 x i64>
diff --git a/test/CodeGen/builtins-sparc.c b/test/CodeGen/builtins-sparc.c
new file mode 100644
index 0000000000000..92cc7677e21f7
--- /dev/null
+++ b/test/CodeGen/builtins-sparc.c
@@ -0,0 +1,10 @@
+// REQUIRES: sparc-registered-target
+// RUN: %clang_cc1 -triple sparc-unknown-unknown -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple sparc64-unknown-unknown -emit-llvm %s -o - | FileCheck %s
+
+void test_eh_return_data_regno(void)
+{
+  volatile int res;
+  res = __builtin_eh_return_data_regno(0);  // CHECK: store volatile i32 24
+  res = __builtin_eh_return_data_regno(1);  // CHECK: store volatile i32 25
+}
diff --git a/test/CodeGen/builtins-systemz-error2.c b/test/CodeGen/builtins-systemz-error2.c
new file mode 100644
index 0000000000000..cf8ee6f7d002b
--- /dev/null
+++ b/test/CodeGen/builtins-systemz-error2.c
@@ -0,0 +1,11 @@
+// REQUIRES: systemz-registered-target
+// RUN: %clang_cc1 -triple s390x-ibm-linux -S -emit-llvm %s -verify -o -
+
+typedef __attribute__((vector_size(16))) char v16i8;
+
+v16i8 f0(v16i8 a, v16i8 b) {
+  __builtin_tbegin ((void *)0);         // expected-error {{'__builtin_tbegin' needs target feature transactional-execution}}
+  v16i8 tmp = __builtin_s390_vaq(a, b); // expected-error {{'__builtin_s390_vaq' needs target feature vector}}
+  return tmp;
+}
+
diff --git a/test/CodeGen/builtins-wasm.c b/test/CodeGen/builtins-wasm.c
index 15f2e9dbf3244..135e32976b7dc 100644
--- a/test/CodeGen/builtins-wasm.c
+++ b/test/CodeGen/builtins-wasm.c
@@ -4,9 +4,9 @@
 // RUN:   | FileCheck %s -check-prefix=WEBASSEMBLY64
 
 __SIZE_TYPE__ f1(void) {
-  return __builtin_wasm_memory_size();
-// WEBASSEMBLY32: call {{i.*}} @llvm.wasm.memory.size.i32()
-// WEBASSEMBLY64: call {{i.*}} @llvm.wasm.memory.size.i64()
+  return __builtin_wasm_current_memory();
+// WEBASSEMBLY32: call {{i.*}} @llvm.wasm.current.memory.i32()
+// WEBASSEMBLY64: call {{i.*}} @llvm.wasm.current.memory.i64()
 }
 
 void f2(long delta) {
diff --git a/test/CodeGen/builtins-x86.c b/test/CodeGen/builtins-x86.c
index 83b11a023a256..55e473fa4e4a2 100644
--- a/test/CodeGen/builtins-x86.c
+++ b/test/CodeGen/builtins-x86.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -DUSE_64 -triple x86_64-unknown-unknown -target-feature +fxsr -target-feature +avx -target-feature +xsaveopt -target-feature +xsaves -target-feature +xsavec -emit-llvm -o %t %s
-// RUN: %clang_cc1 -DUSE_ALL -triple x86_64-unknown-unknown -target-feature +fxsr -target-feature +avx -target-feature +xsaveopt -target-feature +xsaves -target-feature +xsavec -fsyntax-only -o %t %s
+// RUN: %clang_cc1 -DUSE_64 -triple x86_64-unknown-unknown -target-feature +fxsr -target-feature +avx -target-feature +xsaveopt -target-feature +xsaves -target-feature +xsavec -target-feature +mwaitx -emit-llvm -o %t %s
+// RUN: %clang_cc1 -DUSE_ALL -triple x86_64-unknown-unknown -target-feature +fxsr -target-feature +avx -target-feature +xsaveopt -target-feature +xsaves -target-feature +xsavec -target-feature +mwaitx -fsyntax-only -o %t %s
 
 #ifdef USE_ALL
 #define USE_3DNOW
@@ -281,6 +281,9 @@ void f0() {
   (void)__builtin_ia32_xsaves(tmp_vp, tmp_ULLi);
   (void)__builtin_ia32_xsaves64(tmp_vp, tmp_ULLi);
 
+  (void) __builtin_ia32_monitorx(tmp_vp, tmp_Ui, tmp_Ui);
+  (void) __builtin_ia32_mwaitx(tmp_Ui, tmp_Ui, tmp_Ui);
+
   tmp_V4f = __builtin_ia32_cvtpi2ps(tmp_V4f, tmp_V2i);
   tmp_V2i = __builtin_ia32_cvtps2pi(tmp_V4f);
   tmp_i = __builtin_ia32_cvtss2si(tmp_V4f);
@@ -293,12 +296,10 @@ void f0() {
 #endif
   tmp_V2i = __builtin_ia32_cvttps2pi(tmp_V4f);
   (void) __builtin_ia32_maskmovq(tmp_V8c, tmp_V8c, tmp_cp);
-  (void) __builtin_ia32_storeups(tmp_fp, tmp_V4f);
   (void) __builtin_ia32_storehps(tmp_V2ip, tmp_V4f);
   (void) __builtin_ia32_storelps(tmp_V2ip, tmp_V4f);
   tmp_i = __builtin_ia32_movmskps(tmp_V4f);
   tmp_i = __builtin_ia32_pmovmskb(tmp_V8c);
-  (void) __builtin_ia32_movntps(tmp_fp, tmp_V4f);
   (void) __builtin_ia32_movntq(tmp_V1LLip, tmp_V1LLi);
   (void) __builtin_ia32_sfence();
 
@@ -310,19 +311,15 @@ void f0() {
   tmp_V4f = __builtin_ia32_sqrtps(tmp_V4f);
   tmp_V4f = __builtin_ia32_sqrtss(tmp_V4f);
   (void) __builtin_ia32_maskmovdqu(tmp_V16c, tmp_V16c, tmp_cp);
-  (void) __builtin_ia32_storeupd(tmp_dp, tmp_V2d);
   tmp_i = __builtin_ia32_movmskpd(tmp_V2d);
   tmp_i = __builtin_ia32_pmovmskb128(tmp_V16c);
   (void) __builtin_ia32_movnti(tmp_ip, tmp_i);
 #ifdef USE_64
   (void) __builtin_ia32_movnti64(tmp_LLip, tmp_LLi);
 #endif
-  (void) __builtin_ia32_movntpd(tmp_dp, tmp_V2d);
-  (void) __builtin_ia32_movntdq(tmp_V2LLip, tmp_V2LLi);
   tmp_V2LLi = __builtin_ia32_psadbw128(tmp_V16c, tmp_V16c);
   tmp_V2d = __builtin_ia32_sqrtpd(tmp_V2d);
   tmp_V2d = __builtin_ia32_sqrtsd(tmp_V2d);
-  tmp_V2d = __builtin_ia32_cvtdq2pd(tmp_V4i);
   tmp_V4f = __builtin_ia32_cvtdq2ps(tmp_V4i);
   tmp_V2LLi = __builtin_ia32_cvtpd2dq(tmp_V2d);
   tmp_V2i = __builtin_ia32_cvtpd2pi(tmp_V2d);
@@ -335,12 +332,9 @@ void f0() {
   tmp_LLi = __builtin_ia32_cvtsd2si64(tmp_V2d);
 #endif
   tmp_V4i = __builtin_ia32_cvtps2dq(tmp_V4f);
-  tmp_V2d = __builtin_ia32_cvtps2pd(tmp_V4f);
-  tmp_V4i = __builtin_ia32_cvttps2dq(tmp_V4f);
   (void) __builtin_ia32_clflush(tmp_vCp);
   (void) __builtin_ia32_lfence();
   (void) __builtin_ia32_mfence();
-  (void) __builtin_ia32_storedqu(tmp_cp, tmp_V16c);
   tmp_V4s = __builtin_ia32_psllwi(tmp_V4s, tmp_i);
   tmp_V2i = __builtin_ia32_pslldi(tmp_V2i, tmp_i);
   tmp_V1LLi = __builtin_ia32_psllqi(tmp_V1LLi, tmp_i);
@@ -386,14 +380,7 @@ void f0() {
   tmp_V4i = __builtin_ia32_pminsd128(tmp_V4i, tmp_V4i);
   tmp_V4i = __builtin_ia32_pminud128(tmp_V4i, tmp_V4i);
   tmp_V8s = __builtin_ia32_pminuw128(tmp_V8s, tmp_V8s);
-  tmp_V4i = __builtin_ia32_pmovzxbd128(tmp_V16c);
-  tmp_V2LLi = __builtin_ia32_pmovzxbq128(tmp_V16c);
-  tmp_V8s = __builtin_ia32_pmovzxbw128(tmp_V16c);
-  tmp_V2LLi = __builtin_ia32_pmovzxdq128(tmp_V4i);
-  tmp_V4i = __builtin_ia32_pmovzxwd128(tmp_V8s);
-  tmp_V2LLi = __builtin_ia32_pmovzxwq128(tmp_V8s);
   tmp_V2LLi = __builtin_ia32_pmuldq128(tmp_V4i, tmp_V4i);
-  tmp_V4i = __builtin_ia32_pmulld128(tmp_V4i, tmp_V4i);
   tmp_V4f = __builtin_ia32_roundps(tmp_V4f, imm_i_0_16);
   tmp_V4f = __builtin_ia32_roundss(tmp_V4f, tmp_V4f, imm_i_0_16);
   tmp_V2d = __builtin_ia32_roundsd(tmp_V2d, tmp_V2d, imm_i_0_16);
@@ -420,14 +407,10 @@ void f0() {
   tmp_V8f = __builtin_ia32_dpps256(tmp_V8f, tmp_V8f, 0x7);
   tmp_V4d = __builtin_ia32_cmppd256(tmp_V4d, tmp_V4d, 0);
   tmp_V8f = __builtin_ia32_cmpps256(tmp_V8f, tmp_V8f, 0);
-  tmp_V4d = __builtin_ia32_cvtdq2pd256(tmp_V4i);
   tmp_V8f = __builtin_ia32_cvtdq2ps256(tmp_V8i);
   tmp_V4f = __builtin_ia32_cvtpd2ps256(tmp_V4d);
   tmp_V8i = __builtin_ia32_cvtps2dq256(tmp_V8f);
-  tmp_V4d = __builtin_ia32_cvtps2pd256(tmp_V4f);
-  tmp_V4i = __builtin_ia32_cvttpd2dq256(tmp_V4d);
   tmp_V4i = __builtin_ia32_cvtpd2dq256(tmp_V4d);
-  tmp_V8i = __builtin_ia32_cvttps2dq256(tmp_V8f);
   tmp_V4d = __builtin_ia32_vperm2f128_pd256(tmp_V4d, tmp_V4d, 0x7);
   tmp_V8f = __builtin_ia32_vperm2f128_ps256(tmp_V8f, tmp_V8f, 0x7);
   tmp_V8i = __builtin_ia32_vperm2f128_si256(tmp_V8i, tmp_V8i, 0x7);
@@ -458,13 +441,7 @@ void f0() {
   __builtin_ia32_vzeroupper();
   tmp_V4d = __builtin_ia32_vbroadcastf128_pd256(tmp_V2dCp);
   tmp_V8f = __builtin_ia32_vbroadcastf128_ps256(tmp_V4fCp);
-  __builtin_ia32_storeupd256(tmp_dp, tmp_V4d);
-  __builtin_ia32_storeups256(tmp_fp, tmp_V8f);
-  __builtin_ia32_storedqu256(tmp_cp, tmp_V32c);
   tmp_V32c = __builtin_ia32_lddqu256(tmp_cCp);
-  __builtin_ia32_movntdq256(tmp_V4LLip, tmp_V4LLi);
-  __builtin_ia32_movntpd256(tmp_dp, tmp_V4d);
-  __builtin_ia32_movntps256(tmp_fp, tmp_V8f);
   tmp_V2d = __builtin_ia32_maskloadpd(tmp_V2dCp, tmp_V2LLi);
   tmp_V4f = __builtin_ia32_maskloadps(tmp_V4fCp, tmp_V4i);
   tmp_V4d = __builtin_ia32_maskloadpd256(tmp_V4dCp, tmp_V4LLi);
diff --git a/test/CodeGen/builtins.c b/test/CodeGen/builtins.c
index 39b2c1209f124..405f2199af19a 100644
--- a/test/CodeGen/builtins.c
+++ b/test/CodeGen/builtins.c
@@ -116,6 +116,16 @@ int main() {
   P(bswap16, (N));
   P(bswap32, (N));
   P(bswap64, (N));
+
+  // CHECK: @llvm.bitreverse.i8
+  // CHECK: @llvm.bitreverse.i16
+  // CHECK: @llvm.bitreverse.i32
+  // CHECK: @llvm.bitreverse.i64
+  P(bitreverse8, (N));
+  P(bitreverse16, (N));
+  P(bitreverse32, (N));
+  P(bitreverse64, (N));
+
   // FIXME
   // V(clear_cache, (&N, &N+1));
   V(trap, ());
@@ -207,10 +217,8 @@ void test_float_builtins(float F, double D, long double LD) {
   // CHECK:  select i1 %[[ISINF]], i32 %[[SIGN]], i32 0
 
   res = __builtin_isfinite(F);
-  // CHECK: fcmp oeq float 
   // CHECK: call float @llvm.fabs.f32(float
-  // CHECK: fcmp une float {{.*}}, 0x7FF0000000000000
-  // CHECK: and i1 
+  // CHECK: fcmp one float {{.*}}, 0x7FF0000000000000
 
   res = __builtin_isnormal(F);
   // CHECK: fcmp oeq float
@@ -242,6 +250,105 @@ void test_float_builtin_ops(float F, double D, long double LD) {
   // CHECK: call float @llvm.fabs.f32(float
   // CHECK: call double @llvm.fabs.f64(double
   // CHECK: call x86_fp80 @llvm.fabs.f80(x86_fp80
+
+  resf = __builtin_canonicalizef(F);
+  resd = __builtin_canonicalize(D);
+  resld = __builtin_canonicalizel(LD);
+  // CHECK: call float @llvm.canonicalize.f32(float
+  // CHECK: call double @llvm.canonicalize.f64(double
+  // CHECK: call x86_fp80 @llvm.canonicalize.f80(x86_fp80
+
+  resf = __builtin_fminf(F, F);
+  // CHECK: call float @llvm.minnum.f32
+
+  resd = __builtin_fmin(D, D);
+  // CHECK: call double @llvm.minnum.f64
+
+  resld = __builtin_fminl(LD, LD);
+  // CHECK: call x86_fp80 @llvm.minnum.f80
+
+  resf = __builtin_fmaxf(F, F);
+  // CHECK: call float @llvm.maxnum.f32
+
+  resd = __builtin_fmax(D, D);
+  // CHECK: call double @llvm.maxnum.f64
+
+  resld = __builtin_fmaxl(LD, LD);
+  // CHECK: call x86_fp80 @llvm.maxnum.f80
+
+  resf = __builtin_fabsf(F);
+  // CHECK: call float @llvm.fabs.f32
+
+  resd = __builtin_fabs(D);
+  // CHECK: call double @llvm.fabs.f64
+
+  resld = __builtin_fabsl(LD);
+  // CHECK: call x86_fp80 @llvm.fabs.f80
+
+  resf = __builtin_copysignf(F, F);
+  // CHECK: call float @llvm.copysign.f32
+
+  resd = __builtin_copysign(D, D);
+  // CHECK: call double @llvm.copysign.f64
+
+  resld = __builtin_copysignl(LD, LD);
+  // CHECK: call x86_fp80 @llvm.copysign.f80
+
+
+  resf = __builtin_ceilf(F);
+  // CHECK: call float @llvm.ceil.f32
+
+  resd = __builtin_ceil(D);
+  // CHECK: call double @llvm.ceil.f64
+
+  resld = __builtin_ceill(LD);
+  // CHECK: call x86_fp80 @llvm.ceil.f80
+
+  resf = __builtin_floorf(F);
+  // CHECK: call float @llvm.floor.f32
+
+  resd = __builtin_floor(D);
+  // CHECK: call double @llvm.floor.f64
+
+  resld = __builtin_floorl(LD);
+  // CHECK: call x86_fp80 @llvm.floor.f80
+
+  resf = __builtin_truncf(F);
+  // CHECK: call float @llvm.trunc.f32
+
+  resd = __builtin_trunc(D);
+  // CHECK: call double @llvm.trunc.f64
+
+  resld = __builtin_truncl(LD);
+  // CHECK: call x86_fp80 @llvm.trunc.f80
+
+  resf = __builtin_rintf(F);
+  // CHECK: call float @llvm.rint.f32
+
+  resd = __builtin_rint(D);
+  // CHECK: call double @llvm.rint.f64
+
+  resld = __builtin_rintl(LD);
+  // CHECK: call x86_fp80 @llvm.rint.f80
+
+  resf = __builtin_nearbyintf(F);
+  // CHECK: call float @llvm.nearbyint.f32
+
+  resd = __builtin_nearbyint(D);
+  // CHECK: call double @llvm.nearbyint.f64
+
+  resld = __builtin_nearbyintl(LD);
+  // CHECK: call x86_fp80 @llvm.nearbyint.f80
+
+  resf = __builtin_roundf(F);
+  // CHECK: call float @llvm.round.f32
+
+  resd = __builtin_round(D);
+  // CHECK: call double @llvm.round.f64
+
+  resld = __builtin_roundl(LD);
+  // CHECK: call x86_fp80 @llvm.round.f80
+
 }
 
 // __builtin_longjmp isn't supported on all platforms, so only test it on X86.
diff --git a/test/CodeGen/cfi-check-fail.c b/test/CodeGen/cfi-check-fail.c
new file mode 100644
index 0000000000000..b850193b54acb
--- /dev/null
+++ b/test/CodeGen/cfi-check-fail.c
@@ -0,0 +1,74 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux -O0 -fsanitize-cfi-cross-dso \
+// RUN:     -fsanitize=cfi-icall,cfi-nvcall,cfi-vcall,cfi-unrelated-cast,cfi-derived-cast \
+// RUN:     -fsanitize-trap=cfi-icall,cfi-nvcall -fsanitize-recover=cfi-vcall,cfi-unrelated-cast \
+// RUN:     -emit-llvm -o - %s | FileCheck %s
+
+void caller(void (*f)()) {
+  f();
+}
+
+// CHECK: define weak_odr hidden void @__cfi_check_fail(i8*, i8*)
+// CHECK: store i8* %0, i8** %[[ALLOCA0:.*]], align 8
+// CHECK: store i8* %1, i8** %[[ALLOCA1:.*]], align 8
+// CHECK: %[[DATA:.*]] = load i8*, i8** %[[ALLOCA0]], align 8
+// CHECK: %[[ADDR:.*]] = load i8*, i8** %[[ALLOCA1]], align 8
+// CHECK: %[[ICMP_NOT_NULL:.*]] = icmp ne i8* %[[DATA]], null
+// CHECK: br i1 %[[ICMP_NOT_NULL]], label %[[CONT0:.*]], label %[[TRAP:.*]],
+
+// CHECK: [[TRAP]]:
+// CHECK-NEXT:   call void @llvm.trap()
+// CHECK-NEXT:   unreachable
+
+// CHECK: [[CONT0]]:
+// CHECK:   %[[A:.*]] = bitcast i8* %[[DATA]] to { i8, { i8*, i32, i32 }, i8* }*
+// CHECK:   %[[KINDPTR:.*]] = getelementptr {{.*}} %[[A]], i32 0, i32 0
+// CHECK:   %[[KIND:.*]] = load i8, i8* %[[KINDPTR]], align 4
+// CHECK:   %[[VTVALID0:.*]] = call i1 @llvm.type.test(i8* %[[ADDR]], metadata !"all-vtables")
+// CHECK:   %[[VTVALID:.*]] = zext i1 %[[VTVALID0]] to i64
+// CHECK:   %[[NOT_0:.*]] = icmp ne i8 %[[KIND]], 0
+// CHECK:   br i1 %[[NOT_0]], label %[[CONT1:.*]], label %[[HANDLE0:.*]], !prof
+
+// CHECK: [[HANDLE0]]:
+// CHECK:   %[[DATA0:.*]] = ptrtoint i8* %[[DATA]] to i64,
+// CHECK:   %[[ADDR0:.*]] = ptrtoint i8* %[[ADDR]] to i64,
+// CHECK:   call void @__ubsan_handle_cfi_check_fail(i64 %[[DATA0]], i64 %[[ADDR0]], i64 %[[VTVALID]])
+// CHECK:   br label %[[CONT1]]
+
+// CHECK: [[CONT1]]:
+// CHECK:   %[[NOT_1:.*]] = icmp ne i8 %[[KIND]], 1
+// CHECK:   br i1 %[[NOT_1]], label %[[CONT2:.*]], label %[[HANDLE1:.*]], !nosanitize
+
+// CHECK: [[HANDLE1]]:
+// CHECK-NEXT:   call void @llvm.trap()
+// CHECK-NEXT:   unreachable
+
+// CHECK: [[CONT2]]:
+// CHECK:   %[[NOT_2:.*]] = icmp ne i8 %[[KIND]], 2
+// CHECK:   br i1 %[[NOT_2]], label %[[CONT3:.*]], label %[[HANDLE2:.*]], !prof
+
+// CHECK: [[HANDLE2]]:
+// CHECK:   %[[DATA2:.*]] = ptrtoint i8* %[[DATA]] to i64,
+// CHECK:   %[[ADDR2:.*]] = ptrtoint i8* %[[ADDR]] to i64,
+// CHECK:   call void @__ubsan_handle_cfi_check_fail_abort(i64 %[[DATA2]], i64 %[[ADDR2]], i64 %[[VTVALID]])
+// CHECK:   unreachable
+
+// CHECK: [[CONT3]]:
+// CHECK:   %[[NOT_3:.*]] = icmp ne i8 %[[KIND]], 3
+// CHECK:   br i1 %[[NOT_3]], label %[[CONT4:.*]], label %[[HANDLE3:.*]], !prof
+
+// CHECK: [[HANDLE3]]:
+// CHECK:   %[[DATA3:.*]] = ptrtoint i8* %[[DATA]] to i64,
+// CHECK:   %[[ADDR3:.*]] = ptrtoint i8* %[[ADDR]] to i64,
+// CHECK:   call void @__ubsan_handle_cfi_check_fail(i64 %[[DATA3]], i64 %[[ADDR3]], i64 %[[VTVALID]])
+// CHECK:   br label %[[CONT4]]
+
+// CHECK: [[CONT4]]:
+// CHECK:   %[[NOT_4:.*]] = icmp ne i8 %[[KIND]], 4
+// CHECK:   br i1 %[[NOT_4]], label %[[CONT5:.*]], label %[[HANDLE4:.*]], !nosanitize
+
+// CHECK: [[HANDLE4]]:
+// CHECK-NEXT:   call void @llvm.trap()
+// CHECK-NEXT:   unreachable
+
+// CHECK: [[CONT5]]:
+// CHECK:   ret void
diff --git a/test/CodeGen/cfi-check-fail2.c b/test/CodeGen/cfi-check-fail2.c
new file mode 100644
index 0000000000000..5340871c2ebec
--- /dev/null
+++ b/test/CodeGen/cfi-check-fail2.c
@@ -0,0 +1,70 @@
+// __cfi_check_fail codegen when not all CFI checkers are enabled.
+// RUN: %clang_cc1 -triple x86_64-unknown-linux -O0 -fsanitize-cfi-cross-dso \
+// RUN:     -fsanitize=cfi-vcall \
+// RUN:     -emit-llvm -o - %s | FileCheck %s
+
+void caller(void (*f)()) {
+  f();
+}
+
+// CHECK: define weak_odr hidden void @__cfi_check_fail(i8*, i8*)
+// CHECK: store i8* %0, i8** %[[ALLOCA0:.*]], align 8
+// CHECK: store i8* %1, i8** %[[ALLOCA1:.*]], align 8
+// CHECK: %[[DATA:.*]] = load i8*, i8** %[[ALLOCA0]], align 8
+// CHECK: %[[ADDR:.*]] = load i8*, i8** %[[ALLOCA1]], align 8
+// CHECK: %[[ICMP_NOT_NULL:.*]] = icmp ne i8* %[[DATA]], null
+// CHECK: br i1 %[[ICMP_NOT_NULL]], label %[[CONT0:.*]], label %[[TRAP:.*]],
+
+// CHECK: [[TRAP]]:
+// CHECK-NEXT:   call void @llvm.trap()
+// CHECK-NEXT:   unreachable
+
+// CHECK: [[CONT0]]:
+// CHECK:   %[[A:.*]] = bitcast i8* %[[DATA]] to { i8, { i8*, i32, i32 }, i8* }*
+// CHECK:   %[[KINDPTR:.*]] = getelementptr {{.*}} %[[A]], i32 0, i32 0
+// CHECK:   %[[KIND:.*]] = load i8, i8* %[[KINDPTR]], align 4
+// CHECK:   %[[VTVALID0:.*]] = call i1 @llvm.type.test(i8* %[[ADDR]], metadata !"all-vtables")
+// CHECK:   %[[VTVALID:.*]] = zext i1 %[[VTVALID0]] to i64
+// CHECK:   %[[NOT_0:.*]] = icmp ne i8 %[[KIND]], 0
+// CHECK:   br i1 %[[NOT_0]], label %[[CONT1:.*]], label %[[HANDLE0:.*]], !prof
+
+// CHECK: [[HANDLE0]]:
+// CHECK:   %[[DATA0:.*]] = ptrtoint i8* %[[DATA]] to i64,
+// CHECK:   %[[ADDR0:.*]] = ptrtoint i8* %[[ADDR]] to i64,
+// CHECK:   call void @__ubsan_handle_cfi_check_fail_abort(i64 %[[DATA0]], i64 %[[ADDR0]], i64 %[[VTVALID]])
+// CHECK:   unreachable
+
+// CHECK: [[CONT1]]:
+// CHECK:   %[[NOT_1:.*]] = icmp ne i8 %[[KIND]], 1
+// CHECK:   br i1 %[[NOT_1]], label %[[CONT2:.*]], label %[[HANDLE1:.*]], !nosanitize
+
+// CHECK: [[HANDLE1]]:
+// CHECK-NEXT:   call void @llvm.trap()
+// CHECK-NEXT:   unreachable
+
+// CHECK: [[CONT2]]:
+// CHECK:   %[[NOT_2:.*]] = icmp ne i8 %[[KIND]], 2
+// CHECK:   br i1 %[[NOT_2]], label %[[CONT3:.*]], label %[[HANDLE2:.*]], !nosanitize
+
+// CHECK: [[HANDLE2]]:
+// CHECK-NEXT:   call void @llvm.trap()
+// CHECK-NEXT:   unreachable
+
+// CHECK: [[CONT3]]:
+// CHECK:   %[[NOT_3:.*]] = icmp ne i8 %[[KIND]], 3
+// CHECK:   br i1 %[[NOT_3]], label %[[CONT4:.*]], label %[[HANDLE3:.*]], !nosanitize
+
+// CHECK: [[HANDLE3]]:
+// CHECK-NEXT:   call void @llvm.trap()
+// CHECK-NEXT:   unreachable
+
+// CHECK: [[CONT4]]:
+// CHECK:   %[[NOT_4:.*]] = icmp ne i8 %[[KIND]], 4
+// CHECK:   br i1 %[[NOT_4]], label %[[CONT5:.*]], label %[[HANDLE4:.*]], !nosanitize
+
+// CHECK: [[HANDLE4]]:
+// CHECK-NEXT:   call void @llvm.trap()
+// CHECK-NEXT:   unreachable
+
+// CHECK: [[CONT5]]:
+// CHECK:   ret void
diff --git a/test/CodeGen/cfi-icall-cross-dso.c b/test/CodeGen/cfi-icall-cross-dso.c
index 9337b183c2e4e..636a9e4aedb4d 100644
--- a/test/CodeGen/cfi-icall-cross-dso.c
+++ b/test/CodeGen/cfi-icall-cross-dso.c
@@ -1,11 +1,55 @@
-// RUN: %clang_cc1 -triple x86_64-unknown-linux -O1 -fsanitize=cfi-icall -fsanitize-cfi-cross-dso -emit-llvm -o - %s | FileCheck --check-prefix=CHECK --check-prefix=ITANIUM %s
-// RUN: %clang_cc1 -triple x86_64-pc-windows-msvc -O1 -fsanitize=cfi-icall  -fsanitize-cfi-cross-dso -emit-llvm -o - %s | FileCheck --check-prefix=CHECK --check-prefix=MS %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux -O1 \
+// RUN:   -fsanitize=cfi-icall -fsanitize-cfi-cross-dso \
+// RUN:   -emit-llvm -o - %s | FileCheck \
+// RUN:       --check-prefix=CHECK --check-prefix=CHECK-DIAG \
+// RUN:       --check-prefix=ITANIUM --check-prefix=ITANIUM-DIAG \
+// RUN:       %s
+
+// RUN: %clang_cc1 -triple x86_64-unknown-linux -O1 \
+// RUN:   -fsanitize=cfi-icall -fsanitize-cfi-cross-dso -fsanitize-trap=cfi-icall \
+// RUN:   -emit-llvm -o - %s | FileCheck \
+// RUN:       --check-prefix=CHECK \
+// RUN:       --check-prefix=ITANIUM --check-prefix=ITANIUM-TRAP \
+// RUN:       %s
+
+// RUN: %clang_cc1 -triple x86_64-pc-windows-msvc -O1 \
+// RUN:   -fsanitize=cfi-icall -fsanitize-cfi-cross-dso \
+// RUN:   -emit-llvm -o - %s | FileCheck \
+// RUN:       --check-prefix=CHECK --check-prefix=CHECK-DIAG \
+// RUN:       --check-prefix=MS --check-prefix=MS-DIAG \
+// RUN:       %s
+
+// RUN: %clang_cc1 -triple x86_64-pc-windows-msvc -O1 \
+// RUN:   -fsanitize=cfi-icall -fsanitize-cfi-cross-dso -fsanitize-trap=cfi-icall \
+// RUN:   -emit-llvm -o - %s | FileCheck \
+// RUN:       --check-prefix=CHECK \
+// RUN:       --check-prefix=MS --check-prefix=MS-TRAP \
+// RUN:       %s
+
+// CHECK-DIAG: @[[SRC:.*]] = private unnamed_addr constant {{.*}}cfi-icall-cross-dso.c\00
+// CHECK-DIAG: @[[TYPE:.*]] = private unnamed_addr constant { i16, i16, [{{.*}} x i8] } { i16 -1, i16 0, [{{.*}} x i8] c"'void ()'\00"
+// CHECK-DIAG: @[[DATA:.*]] = private unnamed_addr global {{.*}}@[[SRC]]{{.*}}@[[TYPE]]
+
+
+// ITANIUM: call i1 @llvm.type.test(i8* %{{.*}}, metadata !"_ZTSFvE"), !nosanitize
+// ITANIUM-DIAG: call void @__cfi_slowpath_diag(i64 6588678392271548388, i8* %{{.*}}, {{.*}}@[[DATA]]{{.*}}) {{.*}}, !nosanitize
+// ITANIUM-TRAP: call void @__cfi_slowpath(i64 6588678392271548388, i8* %{{.*}}) {{.*}}, !nosanitize
+
+// MS: call i1 @llvm.type.test(i8* %{{.*}}, metadata !"?6AX@Z"), !nosanitize
+// MS-DIAG: call void @__cfi_slowpath_diag(i64 4195979634929632483, i8* %{{.*}}, {{.*}}@[[DATA]]{{.*}}) {{.*}}, !nosanitize
+// MS-TRAP: call void @__cfi_slowpath(i64 4195979634929632483, i8* %{{.*}}) {{.*}}, !nosanitize
 
 void caller(void (*f)()) {
   f();
 }
 
+// Check that we emit both string and hash based type entries for static void g(),
+// and don't emit them for the declaration of h().
+
+// CHECK: define internal void @g({{.*}} !type [[TVOID:![0-9]+]] !type [[TVOID_ID:![0-9]+]]
 static void g(void) {}
+
+// CHECK: declare void @h({{[^!]*$}}
 void h(void);
 
 typedef void (*Fn)(void);
@@ -16,34 +60,22 @@ Fn h1() {
   return &h;
 }
 
+// CHECK: define void @bar({{.*}} !type [[TNOPROTO:![0-9]+]] !type [[TNOPROTO_ID:![0-9]+]]
+// ITANIUM: define available_externally void @foo({{[^!]*$}}
+// MS: define linkonce_odr void @foo({{.*}} !type [[TNOPROTO]] !type [[TNOPROTO_ID]]
 inline void foo() {}
 void bar() { foo(); }
 
-// ITANIUM: call i1 @llvm.bitset.test(i8* %{{.*}}, metadata !"_ZTSFvE"), !nosanitize
-// ITANIUM: call void @__cfi_slowpath(i64 6588678392271548388, i8* %{{.*}}) {{.*}}, !nosanitize
-
-// MS: call i1 @llvm.bitset.test(i8* %{{.*}}, metadata !"?6AX@Z"), !nosanitize
-// MS: call void @__cfi_slowpath(i64 4195979634929632483, i8* %{{.*}}) {{.*}}, !nosanitize
-
-// ITANIUM: define available_externally void @foo()
-// MS: define linkonce_odr void @foo()
-
-// Check that we emit both string and hash based bit set entries for static void g(),
-// and don't emit them for the declaration of h().
-
-// CHECK-NOT: !{!"{{.*}}", void ()* @h, i64 0}
-// CHECK: !{!"{{.*}}", void ()* @g, i64 0}
-// CHECK-NOT: !{!"{{.*}}", void ()* @h, i64 0}
-// CHECK: !{i64 {{.*}}, void ()* @g, i64 0}
-// CHECK-NOT: !{!"{{.*}}", void ()* @h, i64 0}
+// CHECK: !{i32 4, !"Cross-DSO CFI", i32 1}
 
-// ITANIUM-NOT: !{!{{.*}}, void ()* @foo,
-// ITANIUM: !{!"_ZTSFvE", void ()* @bar, i64 0}
-// ITANIUM-NOT: !{!{{.*}}, void ()* @foo,
-// ITANIUM: !{i64 6588678392271548388, void ()* @bar, i64 0}
-// ITANIUM-NOT: !{!{{.*}}, void ()* @foo,
+// Check that the type entries are correct.
 
-// MS: !{!"?6AX@Z", void ()* @foo, i64 0}
-// MS: !{i64 4195979634929632483, void ()* @foo, i64 0}
+// ITANIUM: [[TVOID]] = !{i64 0, !"_ZTSFvvE"}
+// ITANIUM: [[TVOID_ID]] = !{i64 0, i64 9080559750644022485}
+// ITANIUM: [[TNOPROTO]] = !{i64 0, !"_ZTSFvE"}
+// ITANIUM: [[TNOPROTO_ID]] = !{i64 0, i64 6588678392271548388}
 
-// CHECK: !{i32 4, !"Cross-DSO CFI", i32 1}
+// MS: [[TVOID]] = !{i64 0, !"?6AXXZ"}
+// MS: [[TVOID_ID]] = !{i64 0, i64 5113650790573562461}
+// MS: [[TNOPROTO]] = !{i64 0, !"?6AX@Z"}
+// MS: [[TNOPROTO_ID]] = !{i64 0, i64 4195979634929632483}
diff --git a/test/CodeGen/cfi-icall.c b/test/CodeGen/cfi-icall.c
index d6cebef49a147..ed34f4f44beb6 100644
--- a/test/CodeGen/cfi-icall.c
+++ b/test/CodeGen/cfi-icall.c
@@ -1,20 +1,24 @@
-// RUN: %clang_cc1 -triple x86_64-unknown-linux -fsanitize=cfi-icall -fsanitize-trap=cfi-icall -emit-llvm -o - %s | FileCheck --check-prefix=ITANIUM %s
-// RUN: %clang_cc1 -triple x86_64-pc-windows-msvc -fsanitize=cfi-icall -fsanitize-trap=cfi-icall -emit-llvm -o - %s | FileCheck --check-prefix=MS %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux -fsanitize=cfi-icall -fsanitize-trap=cfi-icall -emit-llvm -o - %s | FileCheck --check-prefix=CHECK --check-prefix=ITANIUM %s
+// RUN: %clang_cc1 -triple x86_64-pc-windows-msvc -fsanitize=cfi-icall -fsanitize-trap=cfi-icall -emit-llvm -o - %s | FileCheck --check-prefix=CHECK --check-prefix=MS %s
 
 // Tests that we assign appropriate identifiers to unprototyped functions.
 
+// CHECK: define void @f({{.*}} !type [[TVOID:![0-9]+]]
 void f() {
 }
 
 void xf();
 
+// CHECK: define void @g({{.*}} !type [[TINT:![0-9]+]]
 void g(int b) {
   void (*fp)() = b ? f : xf;
-  // ITANIUM: call i1 @llvm.bitset.test(i8* {{.*}}, metadata !"_ZTSFvE")
+  // ITANIUM: call i1 @llvm.type.test(i8* {{.*}}, metadata !"_ZTSFvE")
   fp();
 }
 
-// ITANIUM-DAG: !{!"_ZTSFvE", void ()* @f, i64 0}
-// ITANIUM-DAG: !{!"_ZTSFvE", void (...)* @xf, i64 0}
-// MS-DAG: !{!"?6AX@Z", void ()* @f, i64 0}
-// MS-DAG: !{!"?6AX@Z", void (...)* @xf, i64 0}
+// CHECK: declare !type [[TVOID:![0-9]+]] void @xf({{.*}}
+
+// ITANIUM-DAG: [[TVOID]] = !{i64 0, !"_ZTSFvE"}
+// ITANIUM-DAG: [[TINT]] = !{i64 0, !"_ZTSFviE"}
+// MS-DAG: [[TVOID]] = !{i64 0, !"?6AX@Z"}
+// MS-DAG: [[TINT]] = !{i64 0, !"?6AXH@Z"}
diff --git a/test/CodeGen/cfstring-windows.c b/test/CodeGen/cfstring-windows.c
new file mode 100644
index 0000000000000..e54c86089078e
--- /dev/null
+++ b/test/CodeGen/cfstring-windows.c
@@ -0,0 +1,40 @@
+// RUN: %clang_cc1 -triple thumbv7-windows -fdeclspec -DCF_BUILDING_CF -DDECL -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF-IN-CF-DECL
+// RUN: %clang_cc1 -triple thumbv7-windows -fdeclspec -DCF_BUILDING_CF -DDEFN -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF-IN-CF-DEFN
+// RUN: %clang_cc1 -triple thumbv7-windows -fdeclspec -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF
+// RUN: %clang_cc1 -triple thumbv7-windows -fdeclspec -DEXTERN -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF-EXTERN
+// RUN: %clang_cc1 -triple thumbv7-windows -fdeclspec -DEXTERN_DLLIMPORT -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF-EXTERN-DLLIMPORT
+// RUN: %clang_cc1 -triple thumbv7-windows -fdeclspec -DDLLIMPORT -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF-DLLIMPORT
+
+// RUN: %clang_cc1 -Os -triple thumbv7-windows -fdeclspec -DCF_BUILDING_CF -DDECL -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF-IN-CF-DECL
+// RUN: %clang_cc1 -Os -triple thumbv7-windows -fdeclspec -DCF_BUILDING_CF -DDEFN -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF-IN-CF-DEFN
+// RUN: %clang_cc1 -Os -triple thumbv7-windows -fdeclspec -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF
+// RUN: %clang_cc1 -Os -triple thumbv7-windows -fdeclspec -DEXTERN -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF-EXTERN
+// RUN: %clang_cc1 -Os -triple thumbv7-windows -fdeclspec -DEXTERN_DLLIMPORT -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF-EXTERN-DLLIMPORT
+// RUN: %clang_cc1 -Os -triple thumbv7-windows -fdeclspec -DDLLIMPORT -S -emit-llvm %s -o - | FileCheck %s -check-prefix CHECK-CF-DLLIMPORT
+
+#if defined(CF_BUILDING_CF)
+#if defined(DECL)
+extern __declspec(dllexport) long __CFConstantStringClassReference[];
+#elif defined(DEFN)
+__declspec(dllexport) long __CFConstantStringClassReference[32];
+#endif
+#else
+#if defined(EXTERN)
+extern long __CFConstantStringClassReference[];
+#elif defined(EXTERN_DLLIMPORT)
+extern __declspec(dllimport) long __CFConstantStringClassReference[];
+#elif defined(DLLIMPORT)
+__declspec(dllimport) long __CFConstantStringClassReference[];
+#endif
+#endif
+
+typedef struct __CFString *CFStringRef;
+const CFStringRef string = (CFStringRef)__builtin___CFStringMakeConstantString("string");
+
+// CHECK-CF-IN-CF-DECL: @__CFConstantStringClassReference = external dllexport global [0 x i32]
+// CHECK-CF-IN-CF-DEFN: @__CFConstantStringClassReference = common dllexport global [32 x i32]
+// CHECK-CF: @__CFConstantStringClassReference = external dllimport global [0 x i32]
+// CHECK-CF-EXTERN: @__CFConstantStringClassReference = external dllimport global [0 x i32]
+// CHECK-CF-EXTERN-DLLIMPORT: @__CFConstantStringClassReference = external dllimport global [0 x i32]
+// CHECK-CF-DLLIMPORT: @__CFConstantStringClassReference = external dllimport global [0 x i32]
+
diff --git a/test/CodeGen/cfstring.c b/test/CodeGen/cfstring.c
index 97d39b6a65958..f0862b99b43d6 100644
--- a/test/CodeGen/cfstring.c
+++ b/test/CodeGen/cfstring.c
@@ -1,9 +1,11 @@
-// RUN: %clang_cc1 -emit-llvm %s -o %t
+// REQUIRES: x86-registered-target
+
+// RUN: %clang_cc1 -triple x86_64-macho -emit-llvm %s -o %t
 
 // <rdar://problem/10657500>: Check that the backing store of CFStrings are
 // constant with the -fwritable-strings flag.
 //
-// RUN: %clang_cc1 -fwritable-strings -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-macho -fwritable-strings -emit-llvm %s -o - | FileCheck %s
 //
 // CHECK: @.str = private unnamed_addr constant [14 x i8] c"Hello, World!\00", section "__TEXT,__cstring,cstring_literals", align 1
 // CHECK: @.str.1 = private unnamed_addr constant [7 x i8] c"yo joe\00", section "__TEXT,__cstring,cstring_literals", align 1
diff --git a/test/CodeGen/cleanup-destslot-simple.c b/test/CodeGen/cleanup-destslot-simple.c
index a1c5640fcd854..9b9f74eb21840 100644
--- a/test/CodeGen/cleanup-destslot-simple.c
+++ b/test/CodeGen/cleanup-destslot-simple.c
@@ -1,8 +1,8 @@
 // RUN: %clang_cc1 -O1 -triple x86_64-none-linux-gnu -emit-llvm -debug-info-kind=line-tables-only %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=LIFETIME
 
 // We shouldn't have markers at -O0 or with msan.
-// RUN: %clang_cc1 -O0 -triple x86_64-none-linux-gnu -emit-llvm -debug-info-kind=line-tables-only %s -o - | FileCheck %s --check-prefix=CHECK
-// RUN: %clang_cc1 -O1 -triple x86_64-none-linux-gnu -emit-llvm -debug-info-kind=line-tables-only %s -o - -fsanitize=memory | FileCheck %s --check-prefix=CHECK
+// RUN: %clang_cc1 -O0 -triple x86_64-none-linux-gnu -emit-llvm -debug-info-kind=line-tables-only %s -o - | FileCheck %s
+// RUN: %clang_cc1 -O1 -triple x86_64-none-linux-gnu -emit-llvm -debug-info-kind=line-tables-only %s -o - -fsanitize=memory | FileCheck %s
 
 // There is no exception to handle here, lifetime.end is not a destructor,
 // so there is no need have cleanup dest slot related code
diff --git a/test/CodeGen/const-init.c b/test/CodeGen/const-init.c
index 9434f1d1b24b3..3fd231b630ee7 100644
--- a/test/CodeGen/const-init.c
+++ b/test/CodeGen/const-init.c
@@ -84,7 +84,7 @@ struct g13_s0 g13[] = {
    { (long) &g12_tmp }
 };
 
-// CHECK: @g14 = global i8* inttoptr (i64 100 to i8*)
+// CHECK: @g14 = global i8* inttoptr (i32 100 to i8*)
 void *g14 = (void*) 100;
 
 // CHECK: @g15 = global i32 -1
diff --git a/test/CodeGen/convertvector.c b/test/CodeGen/convertvector.c
index 2b23dd96e1b82..a534b85a42394 100644
--- a/test/CodeGen/convertvector.c
+++ b/test/CodeGen/convertvector.c
@@ -9,14 +9,6 @@ typedef unsigned long   vector8ulong   __attribute__((__vector_size__(64)));
 typedef unsigned short  vector8ushort  __attribute__((__vector_size__(16)));
 
 #ifdef __cplusplus
-#define BOOL bool
-#else
-#define BOOL _Bool
-#endif
-
-typedef BOOL vector8bool __attribute__((__ext_vector_type__(8)));
-
-#ifdef __cplusplus
 extern "C" {
 #endif
 
@@ -32,13 +24,6 @@ vector8double flt_ext(vector8float x) {
   // CHECK: fpext <8 x float> %{{[^ ]}} to <8 x double>
 }
 
-vector8bool flt_tobool(vector8float x) {
-  return __builtin_convertvector(x, vector8bool);
-  // CHECK-LABEL: @flt_tobool
-  // CHECK-NOT: fptoui <8 x float> %{{[^ ]}} to <8 x i1>
-  // CHECK: fcmp une <8 x float> %{{[^ ]}}, zeroinitializer
-}
-
 vector8long flt_tosi(vector8float x) {
   return __builtin_convertvector(x, vector8long);
   // CHECK-LABEL: @flt_tosi
@@ -69,13 +54,6 @@ vector8long int_sext(vector8short x) {
   // CHECK: sext <8 x i16> %{{[^ ]}} to <8 x i64>
 }
 
-vector8bool int_tobool(vector8short x) {
-  return __builtin_convertvector(x, vector8bool);
-  // CHECK-LABEL: @int_tobool
-  // CHECK-NOT: trunc <8 x i16> %{{[^ ]}} to <8 x i1>
-  // CHECK: icmp ne <8 x i16> %{{[^ ]}}, zeroinitializer
-}
-
 vector8float int_tofp(vector8short x) {
   return __builtin_convertvector(x, vector8float);
   // CHECK-LABEL: @int_tofp
diff --git a/test/CodeGen/debug-info-imported-entity.cpp b/test/CodeGen/debug-info-imported-entity.cpp
new file mode 100644
index 0000000000000..105cc3dc5371a
--- /dev/null
+++ b/test/CodeGen/debug-info-imported-entity.cpp
@@ -0,0 +1,11 @@
+// RUN: %clang_cc1 -triple x86_64-unk-unk -o - -emit-llvm -debug-info-kind=limited %s | FileCheck %s
+
+namespace std { class A; }
+using std::A; using ::A;
+
+
+// CHECK: [[CompileUnit:![0-9]+]] = distinct !DICompileUnit({{.+}} imports: [[Imports:![0-9]+]])
+// CHECK: [[Imports]] = !{[[ImportedEntity:![0-9]+]]}
+// CHECK: [[ImportedEntity]] = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: [[CompileUnit]], entity: [[STDA:![0-9]+]], line: 4)
+// CHECK: [[STDA]] = !DICompositeType(tag: DW_TAG_class_type, name: "A",
+
diff --git a/test/CodeGen/debug-info-packed-struct.c b/test/CodeGen/debug-info-packed-struct.c
index 189bbe47e3702..8c1a0d4cfdb30 100644
--- a/test/CodeGen/debug-info-packed-struct.c
+++ b/test/CodeGen/debug-info-packed-struct.c
@@ -21,7 +21,7 @@ struct layout0 {
 // CHECK: !DIDerivedType(tag: DW_TAG_member, name: "l0_ofs8",
 // CHECK-SAME:     {{.*}}size: 64, align: 64, offset: 64)
 // CHECK: !DIDerivedType(tag: DW_TAG_member, name: "l0_ofs16",
-// CHECK-SAME:     {{.*}}size: 1, align: 32, offset: 128)
+// CHECK-SAME:     {{.*}}size: 1, align: 32, offset: 128, flags: DIFlagBitField, extraData: i64 128)
 
 
 // ---------------------------------------------------------------------
@@ -40,7 +40,7 @@ struct layout1 {
 // CHECK: !DIDerivedType(tag: DW_TAG_member, name: "l1_ofs1",
 // CHECK-SAME:     {{.*}}size: 64, align: 8, offset: 8)
 // CHECK: !DIDerivedType(tag: DW_TAG_member, name: "l1_ofs9",
-// CHECK-SAME:     {{.*}}size: 1, align: 32, offset: 72)
+// CHECK-SAME:     {{.*}}size: 1, align: 32, offset: 72, flags: DIFlagBitField, extraData: i64 72)
 
 
 // ---------------------------------------------------------------------
@@ -61,7 +61,7 @@ struct layout2 {
 // CHECK: !DIDerivedType(tag: DW_TAG_member, name: "l2_ofs1",
 // CHECK-SAME:     {{.*}}size: 64, align: 8, offset: 8)
 // CHECK: !DIDerivedType(tag: DW_TAG_member, name: "l2_ofs9",
-// CHECK-SAME:     {{.*}}size: 1, align: 32, offset: 72)
+// CHECK-SAME:     {{.*}}size: 1, align: 32, offset: 72, flags: DIFlagBitField, extraData: i64 72)
 
 
 
@@ -83,7 +83,7 @@ struct layout3 {
 // CHECK: !DIDerivedType(tag: DW_TAG_member, name: "l3_ofs4",
 // CHECK-SAME:     {{.*}}size: 64, align: 32, offset: 32)
 // CHECK: !DIDerivedType(tag: DW_TAG_member, name: "l3_ofs12",
-// CHECK-SAME:     {{.*}}size: 1, align: 32, offset: 96)
+// CHECK-SAME:     {{.*}}size: 1, align: 32, offset: 96, flags: DIFlagBitField, extraData: i64 96)
 
 struct layout0 l0;
 struct layout1 l1;
diff --git a/test/CodeGen/debug-info-renderscript-tag.rs b/test/CodeGen/debug-info-renderscript-tag.rs
new file mode 100644
index 0000000000000..ded650d9660b8
--- /dev/null
+++ b/test/CodeGen/debug-info-renderscript-tag.rs
@@ -0,0 +1,3 @@
+// RUN: %clang -emit-llvm -S -g %s -o - | FileCheck %s
+
+// CHECK: !DICompileUnit(language: DW_LANG_GOOGLE_RenderScript{{.*}})
diff --git a/test/CodeGen/debug-info-scope-file.c b/test/CodeGen/debug-info-scope-file.c
index 296ec05826f53..94123bbc4935e 100644
--- a/test/CodeGen/debug-info-scope-file.c
+++ b/test/CodeGen/debug-info-scope-file.c
@@ -6,8 +6,8 @@
 // CHECK: ret void, !dbg [[F1_LINE:![0-9]*]]
 // CHECK: ret void, !dbg [[F2_LINE:![0-9]*]]
 // CHECK: [[F1:![0-9]*]] = distinct !DISubprogram(name: "f1",{{.*}} isDefinition: true
-// CHECK: [[F2:![0-9]*]] = distinct !DISubprogram(name: "f2",{{.*}} isDefinition: true
 // CHECK: [[F1_LINE]] = !DILocation({{.*}}, scope: [[F1]])
+// CHECK: [[F2:![0-9]*]] = distinct !DISubprogram(name: "f2",{{.*}} isDefinition: true
 // CHECK: [[F2_LINE]] = !DILocation({{.*}}, scope: [[F2]])
 
 void f1() {
diff --git a/test/CodeGen/debug-info.c b/test/CodeGen/debug-info.c
index d122e7fe5cc76..f0215cc923293 100644
--- a/test/CodeGen/debug-info.c
+++ b/test/CodeGen/debug-info.c
@@ -7,7 +7,7 @@ void convert(void) {
 
 
 // PR2784
-struct OPAQUE; // CHECK: DW_TAG_structure_type
+struct OPAQUE; // CHECK-DAG: DW_TAG_structure_type, name: "OPAQUE"
 typedef struct OPAQUE *PTR;
 PTR p;
 
@@ -42,19 +42,19 @@ struct foo2 foo2;
 
 
 // Radar 7325611
-// CHECK: !DIDerivedType(tag: DW_TAG_typedef, name: "barfoo"
+// CHECK-DAG: !DIDerivedType(tag: DW_TAG_typedef, name: "barfoo"
 typedef int barfoo;
 barfoo foo() {
 }
 
-// CHECK: __uint128_t
+// CHECK-DAG: __uint128_t
 __uint128_t foo128 ()
 {
   __uint128_t int128 = 44;
   return int128;
 }
 
-// CHECK: uint64x2_t
+// CHECK-DAG: uint64x2_t
 typedef unsigned long long uint64_t;
 typedef uint64_t uint64x2_t __attribute__((ext_vector_type(2)));
 uint64x2_t extvectbar[4];
diff --git a/test/CodeGen/dependent-lib.c b/test/CodeGen/dependent-lib.c
index b3abc2f5bc412..9cf49c88d77bb 100644
--- a/test/CodeGen/dependent-lib.c
+++ b/test/CodeGen/dependent-lib.c
@@ -1,3 +1,4 @@
+// RUN: %clang_cc1 %s --dependent-lib=msvcrt -triple thumbv7-windows -emit-llvm -o - | FileCheck %s
 // RUN: %clang_cc1 %s --dependent-lib=msvcrt -triple i686-pc-win32 -emit-llvm -o - | FileCheck %s
 // RUN: %clang_cc1 %s --dependent-lib=msvcrt -triple x86_64-pc-win32 -emit-llvm -o - | FileCheck %s
 // RUN: %clang_cc1 %s --dependent-lib=msvcrt -triple i686-pc-linux -emit-llvm -o - | FileCheck -check-prefix LINUX %s
diff --git a/test/CodeGen/dllimport.c b/test/CodeGen/dllimport.c
index 0dfecea436484..f70048ebd1fbf 100644
--- a/test/CodeGen/dllimport.c
+++ b/test/CodeGen/dllimport.c
@@ -45,7 +45,8 @@ __declspec(dllimport) extern int GlobalRedecl3;
 USEVAR(GlobalRedecl3)
 
 // Make sure this works even if the decl has been used before it's defined (PR20792).
-// CHECK: @GlobalRedecl4 = common global i32
+// MS: @GlobalRedecl4 = common dllexport global i32
+// GNU: @GlobalRedecl4 = common global i32
 __declspec(dllimport) extern int GlobalRedecl4;
 USEVAR(GlobalRedecl4)
                       int GlobalRedecl4; // dllimport ignored
@@ -111,13 +112,15 @@ __declspec(dllimport) void redecl2(void);
                       void redecl2(void);
 USE(redecl2)
 
-// CHECK-DAG: define void @redecl3()
+// MS: define dllexport void @redecl3()
+// GNU: define void @redecl3()
 __declspec(dllimport) void redecl3(void);
                       void redecl3(void) {} // dllimport ignored
 USE(redecl3)
 
 // Make sure this works even if the decl is used before it's defined (PR20792).
-// CHECK-DAG: define void @redecl4()
+// MS: define dllexport void @redecl4()
+// GNU: define void @redecl4()
 __declspec(dllimport) void redecl4(void);
 USE(redecl4)
                       void redecl4(void) {} // dllimport ignored
diff --git a/test/CodeGen/enable_if.c b/test/CodeGen/enable_if.c
index f863d80c14abf..5e9f904fdd3f4 100644
--- a/test/CodeGen/enable_if.c
+++ b/test/CodeGen/enable_if.c
@@ -80,3 +80,16 @@ void test4() {
   // CHECK: store void (i32)* @_Z3quxUa9enable_ifIXLi1EEXL_Z9TRUEFACTSEEEi
   p = &qux;
 }
+
+// There was a bug where, when enable_if was present, overload resolution
+// wouldn't pay attention to lower-priority attributes.
+// (N.B. `foo` with pass_object_size should always be preferred)
+// CHECK-LABEL: define void @test5
+void test5() {
+  int foo(char *i) __attribute__((enable_if(1, ""), overloadable));
+  int foo(char *i __attribute__((pass_object_size(0))))
+      __attribute__((enable_if(1, ""), overloadable));
+
+  // CHECK: call i32 @_Z3fooUa9enable_ifIXLi1EEEPcU17pass_object_size0
+  foo((void*)0);
+}
diff --git a/test/CodeGen/exceptions-seh-finally.c b/test/CodeGen/exceptions-seh-finally.c
index f0ed223064012..0f2123ba32bb3 100644
--- a/test/CodeGen/exceptions-seh-finally.c
+++ b/test/CodeGen/exceptions-seh-finally.c
@@ -29,6 +29,7 @@ void basic_finally(void) {
 // CHECK-NEXT: cleanupret from %[[pad]] unwind to caller
 
 // CHECK: define internal void @"\01?fin$0@0@basic_finally@@"({{.*}})
+// CHECK-SAME: [[finally_attrs:#[0-9]+]]
 // CHECK: call void @cleanup()
 
 // Mostly check that we don't double emit 'r' which would crash.
@@ -62,6 +63,7 @@ l:
 // CHECK: ret void
 
 // CHECK: define internal void @"\01?fin$0@0@label_in_finally@@"({{.*}})
+// CHECK-SAME: [[finally_attrs]]
 // CHECK: br label %[[l:[^ ]*]]
 //
 // CHECK: [[l]]
@@ -95,6 +97,7 @@ void use_abnormal_termination(void) {
 // CHECK-NEXT: cleanupret from %[[pad]] unwind to caller
 
 // CHECK: define internal void @"\01?fin$0@0@use_abnormal_termination@@"({{i8( zeroext)?}} %[[abnormal:abnormal_termination]], i8* %frame_pointer)
+// CHECK-SAME: [[finally_attrs]]
 // CHECK: %[[abnormal_zext:[^ ]*]] = zext i8 %[[abnormal]] to i32
 // CHECK: store i32 %[[abnormal_zext]], i32* @crashed
 // CHECK-NEXT: ret void
@@ -112,6 +115,7 @@ void noreturn_noop_finally() {
 // CHECK: ret void
 
 // CHECK: define internal void @"\01?fin$0@0@noreturn_noop_finally@@"({{.*}})
+// CHECK-SAME: [[finally_attrs]]
 // CHECK: call void @abort()
 // CHECK: unreachable
 
@@ -137,6 +141,7 @@ void noreturn_finally() {
 // CHECK-NEXT: cleanupret from %[[pad]] unwind to caller
 
 // CHECK: define internal void @"\01?fin$0@0@noreturn_finally@@"({{.*}})
+// CHECK-SAME: [[finally_attrs]]
 // CHECK: call void @abort()
 // CHECK: unreachable
 
@@ -151,6 +156,7 @@ int finally_with_return() {
 // CHECK-NEXT: ret i32 42
 
 // CHECK: define internal void @"\01?fin$0@0@finally_with_return@@"({{.*}})
+// CHECK-SAME: [[finally_attrs]]
 // CHECK-NOT: br i1
 // CHECK-NOT: br label
 // CHECK: ret void
@@ -181,9 +187,11 @@ int nested___finally___finally() {
 // CHECK-NEXT: cleanupret from %[[pad]] unwind to caller
 
 // CHECK-LABEL: define internal void @"\01?fin$0@0@nested___finally___finally@@"({{.*}})
+// CHECK-SAME: [[finally_attrs]]
 // CHECK: ret void
 
 // CHECK-LABEL: define internal void @"\01?fin$1@0@nested___finally___finally@@"({{.*}})
+// CHECK-SAME: [[finally_attrs]]
 // CHECK: unreachable
 
 // FIXME: Our behavior seems suspiciously different.
@@ -226,7 +234,41 @@ int nested___finally___finally_with_eh_edge() {
 // CHECK-NEXT: cleanupret from %[[outerpad]] unwind to caller
 
 // CHECK-LABEL: define internal void @"\01?fin$0@0@nested___finally___finally_with_eh_edge@@"({{.*}})
+// CHECK-SAME: [[finally_attrs]]
 // CHECK: ret void
 
 // CHECK-LABEL: define internal void @"\01?fin$1@0@nested___finally___finally_with_eh_edge@@"({{.*}})
+// CHECK-SAME: [[finally_attrs]]
 // CHECK: unreachable
+
+void finally_within_finally() {
+  __try {
+    might_crash();
+  } __finally {
+    __try {
+      might_crash();
+    } __finally {
+    }
+  }
+}
+
+// CHECK-LABEL: define void @finally_within_finally(
+// CHECK: invoke void @might_crash(
+
+// CHECK: call void @"\01?fin$0@0@finally_within_finally@@"(
+// CHECK: call void @"\01?fin$0@0@finally_within_finally@@"({{.*}}) [ "funclet"(
+
+// CHECK-LABEL: define internal void @"\01?fin$0@0@finally_within_finally@@"({{[^)]*}})
+// CHECK-SAME: [[finally_attrs]]
+// CHECK: invoke void @might_crash(
+
+// CHECK: call void @"\01?fin$1@0@finally_within_finally@@"(
+// CHECK: call void @"\01?fin$1@0@finally_within_finally@@"({{.*}}) [ "funclet"(
+
+// CHECK-LABEL: define internal void @"\01?fin$1@0@finally_within_finally@@"({{[^)]*}})
+// CHECK-SAME: [[finally_attrs]]
+
+// Look for the absence of noinline. Enum attributes come first, so check that
+// a string attribute is the first to verify that no enum attributes are
+// present.
+// CHECK: attributes [[finally_attrs]] = { "{{.*}}" }
diff --git a/test/CodeGen/exceptions-seh-leave.c b/test/CodeGen/exceptions-seh-leave.c
index a0b1956d1423d..087fadbcd7ab3 100644
--- a/test/CodeGen/exceptions-seh-leave.c
+++ b/test/CodeGen/exceptions-seh-leave.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 %s -triple x86_64-pc-win32 -fms-extensions -fnew-ms-eh -emit-llvm -o - | opt -instnamer -S | FileCheck %s
+// RUN: %clang_cc1 %s -triple x86_64-pc-win32 -fms-extensions -emit-llvm -o - | opt -instnamer -S | FileCheck %s
 
 void g(void);
 
diff --git a/test/CodeGen/exceptions-seh.c b/test/CodeGen/exceptions-seh.c
index b027bd844b78a..a0a1dbccd1580 100644
--- a/test/CodeGen/exceptions-seh.c
+++ b/test/CodeGen/exceptions-seh.c
@@ -1,10 +1,10 @@
-// RUN: %clang_cc1 %s -triple x86_64-pc-win32 -fms-extensions -fnew-ms-eh -emit-llvm -o - \
+// RUN: %clang_cc1 %s -triple x86_64-pc-win32 -fms-extensions -emit-llvm -o - \
 // RUN:         | FileCheck %s --check-prefix=CHECK --check-prefix=X64
-// RUN: %clang_cc1 %s -triple i686-pc-win32 -fms-extensions -fnew-ms-eh -emit-llvm -o - \
+// RUN: %clang_cc1 %s -triple i686-pc-win32 -fms-extensions -emit-llvm -o - \
 // RUN:         | FileCheck %s --check-prefix=CHECK --check-prefix=X86
-// RUN: %clang_cc1 %s -triple i686-pc-windows-gnu -fms-extensions -fnew-ms-eh -emit-llvm -o - \
+// RUN: %clang_cc1 %s -triple i686-pc-windows-gnu -fms-extensions -emit-llvm -o - \
 // RUN:         | FileCheck %s --check-prefix=X86-GNU
-// RUN: %clang_cc1 %s -triple x86_64-pc-windows-gnu -fms-extensions -fnew-ms-eh -emit-llvm -o - \
+// RUN: %clang_cc1 %s -triple x86_64-pc-windows-gnu -fms-extensions -emit-llvm -o - \
 // RUN:         | FileCheck %s --check-prefix=X64-GNU
 
 void try_body(int numerator, int denominator, int *myres) {
diff --git a/test/CodeGen/f16c-builtins.c b/test/CodeGen/f16c-builtins.c
index f9cfa0d8fb3b8..e99c0d00d4b31 100644
--- a/test/CodeGen/f16c-builtins.c
+++ b/test/CodeGen/f16c-builtins.c
@@ -5,26 +5,52 @@
 
 #include <x86intrin.h>
 
+float test_cvtsh_ss(unsigned short a) {
+  // CHECK-LABEL: test_cvtsh_ss
+  // CHECK: insertelement <8 x i16> undef, i16 %{{.*}}, i32 0
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 1
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 2
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 3
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 4
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 5
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 6
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 7
+  // CHECK: call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %{{.*}})
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  return _cvtsh_ss(a);
+}
+
+unsigned short test_cvtss_sh(float a) {
+  // CHECK-LABEL: test_cvtss_sh
+  // CHECK: insertelement <4 x float> undef, float %{{.*}}, i32 0
+  // CHECK: insertelement <4 x float> %{{.*}}, float 0.000000e+00, i32 1
+  // CHECK: insertelement <4 x float> %{{.*}}, float 0.000000e+00, i32 2
+  // CHECK: insertelement <4 x float> %{{.*}}, float 0.000000e+00, i32 3
+  // CHECK: call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %{{.*}}, i32 0)
+  // CHECK: extractelement <8 x i16> %{{.*}}, i32 0
+  return _cvtss_sh(a, 0);
+}
+
 __m128 test_mm_cvtph_ps(__m128i a) {
   // CHECK-LABEL: test_mm_cvtph_ps
-  // CHECK: @llvm.x86.vcvtph2ps.128
+  // CHECK: call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %{{.*}})
   return _mm_cvtph_ps(a);
 }
 
 __m256 test_mm256_cvtph_ps(__m128i a) {
   // CHECK-LABEL: test_mm256_cvtph_ps
-  // CHECK: @llvm.x86.vcvtph2ps.256
+  // CHECK: call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %{{.*}})
   return _mm256_cvtph_ps(a);
 }
 
 __m128i test_mm_cvtps_ph(__m128 a) {
   // CHECK-LABEL: test_mm_cvtps_ph
-  // CHECK: @llvm.x86.vcvtps2ph.128
+  // CHECK: call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %{{.*}}, i32 0)
   return _mm_cvtps_ph(a, 0);
 }
 
 __m128i test_mm256_cvtps_ph(__m256 a) {
   // CHECK-LABEL: test_mm256_cvtps_ph
-  // CHECK: @llvm.x86.vcvtps2ph.256
+  // CHECK: call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %{{.*}}, i32 0)
   return _mm256_cvtps_ph(a, 0);
 }
diff --git a/test/CodeGen/fixup-depth-overflow.c b/test/CodeGen/fixup-depth-overflow.c
new file mode 100644
index 0000000000000..be8f54284ec70
--- /dev/null
+++ b/test/CodeGen/fixup-depth-overflow.c
@@ -0,0 +1,26 @@
+// RUN: %clang_cc1 -O1 -disable-llvm-optzns -emit-llvm -o - %s | FileCheck %s
+
+#define M if (x) goto L1;
+#define M10 M M M M M M M M M M
+#define M100 M10 M10 M10 M10 M10 M10 M10 M10 M10 M10
+#define M1000 M100 M100 M100 M100 M100 M100 M100 M100 M100 M100
+
+void f(int x) {
+  int h;
+
+  // Many gotos to not-yet-emitted labels would cause EHScope's FixupDepth
+  // to overflow (PR23490).
+  M1000 M1000 M1000
+
+  if (x == 5) {
+    // This will cause us to emit a clean-up of the stack variable. If the
+    // FixupDepths are broken, fixups will erroneously get threaded through it.
+    int i;
+  }
+
+L1:
+  return;
+}
+
+// CHECK-LABEL: define void @f
+// CHECK-NOT: cleanup
diff --git a/test/CodeGen/forwarding-blocks-if.c b/test/CodeGen/forwarding-blocks-if.c
new file mode 100644
index 0000000000000..51711bf0ef46e
--- /dev/null
+++ b/test/CodeGen/forwarding-blocks-if.c
@@ -0,0 +1,36 @@
+// RUN: %clang_cc1 %s -emit-llvm -o - | FileCheck %s
+// Check that no empty blocks are generated for nested ifs.
+
+extern void func();
+
+int f0(int val) {
+  if (val == 0) {
+    func();
+  } else if (val == 1) {
+    func();
+  }
+  return 0;
+}
+
+// CHECK-LABEL: define {{.*}}i32 @f0
+// CHECK: call void {{.*}} @func
+// CHECK: call void {{.*}} @func
+// CHECK: br label %[[RETBLOCK1:[^ ]*]]
+// CHECK: [[RETBLOCK1]]:
+// CHECK-NOT: br label
+// CHECK: ret i32
+
+int f1(int val, int g) {
+  if (val == 0)
+    if (g == 1) {
+      func();
+    }
+  return 0;
+}
+
+// CHECK-LABEL: define {{.*}}i32 @f1
+// CHECK: call void {{.*}} @func
+// CHECK: br label %[[RETBLOCK2:[^ ]*]]
+// CHECK: [[RETBLOCK2]]:
+// CHECK-NOT: br label
+// CHECK: ret i32
diff --git a/test/CodeGen/fp128_complex.c b/test/CodeGen/fp128_complex.c
index 8775999329518..48659d2241684 100644
--- a/test/CodeGen/fp128_complex.c
+++ b/test/CodeGen/fp128_complex.c
@@ -1,9 +1,9 @@
-// RUN: %clang -target aarch64-linux-gnuabi %s -O3 -S -emit-llvm -o - | FileCheck %s
+// RUN: %clang -target aarch64-linux-gnuabi %s -S -emit-llvm -o - | FileCheck %s
 
 _Complex long double a, b, c, d;
 void test_fp128_compound_assign(void) {
-  // CHECK: tail call { fp128, fp128 } @__multc3
+  // CHECK: call { fp128, fp128 } @__multc3
   a *= b;
-  // CHECK: tail call { fp128, fp128 } @__divtc3
+  // CHECK: call { fp128, fp128 } @__divtc3
   c /= d;
 }
diff --git a/test/CodeGen/fp16-ops.c b/test/CodeGen/fp16-ops.c
index 7cd08a03d6420..c96727f3d3fb8 100644
--- a/test/CodeGen/fp16-ops.c
+++ b/test/CodeGen/fp16-ops.c
@@ -7,6 +7,8 @@
 // RUN:   | FileCheck %s --check-prefix=NATIVE-HALF
 // RUN: %clang_cc1 -emit-llvm -o - -triple aarch64-none-linux-gnueabi -fnative-half-type %s \
 // RUN:   | FileCheck %s --check-prefix=NATIVE-HALF
+// RUN: %clang_cc1 -emit-llvm -o - -x renderscript %s \
+// RUN:   | FileCheck %s --check-prefix=NATIVE-HALF
 typedef unsigned cond_t;
 
 volatile cond_t test;
diff --git a/test/CodeGen/function-target-features.c b/test/CodeGen/function-target-features.c
index 351c7f102b12b..6b32d3ddbcda6 100644
--- a/test/CodeGen/function-target-features.c
+++ b/test/CodeGen/function-target-features.c
@@ -9,7 +9,7 @@
 // RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s -target-cpu corei7 -target-feature +avx | FileCheck %s -check-prefix=CORE-CPU-AND-FEATURES
 // RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s -target-cpu x86-64 | FileCheck %s -check-prefix=X86-64-CPU
 // RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s -target-cpu corei7-avx -target-feature -avx | FileCheck %s -check-prefix=AVX-MINUS-FEATURE
-// RUN: %clang_cc1 -triple sparc-unknown-unknown -emit-llvm -o - %s -target-feature +soft-float | FileCheck %s -check-prefix=NO-SOFT-FLOAT
+// RUN: %clang_cc1 -triple sparc-unknown-unknown -emit-llvm -o - %s -target-feature +soft-float | FileCheck %s -check-prefix=SOFT-FLOAT
 // RUN: %clang_cc1 -triple arm-unknown-unknown -emit-llvm -o - %s -target-feature +soft-float | FileCheck %s -check-prefix=SOFT-FLOAT
 // RUN: %clang_cc1 -triple mips-unknown-unknown -emit-llvm -o - %s -target-feature +soft-float | FileCheck %s -check-prefix=SOFT-FLOAT
 
diff --git a/test/CodeGen/hexagon-inline-asm.c b/test/CodeGen/hexagon-inline-asm.c
new file mode 100644
index 0000000000000..cda3d0dcb6bd3
--- /dev/null
+++ b/test/CodeGen/hexagon-inline-asm.c
@@ -0,0 +1,17 @@
+// RUN: %clang_cc1 -triple hexagon-unknown-elf -target-feature +hvx -emit-llvm -o - %s | FileCheck %s
+
+typedef int v64 __attribute__((__vector_size__(64)))
+    __attribute__((aligned(64)));
+
+int g;
+
+void foo(v64 v0, v64 v1, v64 *p) {
+  int r;
+  v64 q0;
+  asm ("%0 = vgtw(%1.w,%2.w)" : "=q"(q0) : "v"(v0), "v"(v1));
+// CHECK: call <16 x i32> asm "$0 = vgtw($1.w,$2.w)", "=q,v,v"(<16 x i32>{{.*}}, <16 x i32>{{.*}})
+  *p = q0;
+
+  asm ("%0 = memw(##%1)" : "=r"(r) : "s"(&g));
+// CHECK: call i32 asm "$0 = memw(##$1)", "=r,s"(i32* @g)
+}
diff --git a/test/CodeGen/iamcu-abi.c b/test/CodeGen/iamcu-abi.c
new file mode 100644
index 0000000000000..897d475d49723
--- /dev/null
+++ b/test/CodeGen/iamcu-abi.c
@@ -0,0 +1,38 @@
+// RUN: %clang_cc1 -triple i386-pc-elfiamcu -emit-llvm -o - %s | FileCheck %s
+
+// CHECK: target datalayout = "e-m:e-p:32:32-i64:32-f64:32-f128:32-n8:16:32-a:0:32-S32"
+// CHECK: target triple = "i386-pc-elfiamcu"
+
+
+void food(double *d);
+void fooll(long long *ll);
+void fooull(unsigned long long *ull);
+void foold(long double *ld);
+
+// CHECK-LABEL: define void @testdouble()
+// CHECK: alloca double, align 4
+void testdouble() {
+  double d = 2.0;
+  food(&d);
+}
+
+// CHECK-LABEL: define void @testlonglong()
+// CHECK: alloca i64, align 4
+void testlonglong() {
+  long long ll = 2;
+  fooll(&ll);
+}
+
+// CHECK-LABEL: define void @testunsignedlonglong()
+// CHECK: alloca i64, align 4
+void testunsignedlonglong() {
+  unsigned long long ull = 2;
+  fooull(&ull);	
+}
+
+// CHECK-LABEL: define void @testlongdouble()
+// CHECK: alloca double, align 4
+void testlongdouble() {
+  long double ld = 2.0;
+  foold(&ld);
+}
diff --git a/test/CodeGen/ifunc.c b/test/CodeGen/ifunc.c
new file mode 100644
index 0000000000000..a88bb1878f265
--- /dev/null
+++ b/test/CodeGen/ifunc.c
@@ -0,0 +1,41 @@
+// RUN: %clang_cc1 -triple i386-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple i386-unknown-linux-gnu -O2 -emit-llvm -o - %s | FileCheck %s
+
+int foo(int) __attribute__ ((ifunc("foo_ifunc")));
+
+static int f1(int i) {
+  return i + 1;
+}
+
+static int f2(int i) {
+  return i + 2;
+}
+
+typedef int (*foo_t)(int);
+
+int global;
+
+static foo_t foo_ifunc() {
+  return global ? f1 : f2;
+}
+
+int bar() {
+  return foo(1);
+}
+
+extern void goo(void);
+
+void bar2(void) {
+  goo();
+}
+
+extern void goo(void) __attribute__ ((ifunc("goo_ifunc")));
+
+void* goo_ifunc(void) {
+  return 0;
+}
+// CHECK: @foo = ifunc i32 (i32), bitcast (i32 (i32)* ()* @foo_ifunc to i32 (i32)*)
+// CHECK: @goo = ifunc void (), bitcast (i8* ()* @goo_ifunc to void ()*)
+
+// CHECK: call i32 @foo(i32
+// CHECK: call void @goo()
diff --git a/test/CodeGen/init.c b/test/CodeGen/init.c
index a2b492013d490..5d086723cc0e1 100644
--- a/test/CodeGen/init.c
+++ b/test/CodeGen/init.c
@@ -1,5 +1,16 @@
 // RUN: %clang_cc1 -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s
 
+struct I { int k[3]; };
+struct M { struct I o[2]; };
+struct M v1[1] = { [0].o[0 ... 1].k[0 ... 1] = 4, 5 };
+unsigned v2[2][3] = {[0 ... 1][0 ... 1] = 2222, 3333};
+
+// CHECK-DAG: %struct.M = type { [2 x %struct.I] }
+// CHECK-DAG: %struct.I = type { [3 x i32] }
+
+// CHECK: [1 x %struct.M] [%struct.M { [2 x %struct.I] [%struct.I { [3 x i32] [i32 4, i32 4, i32 0] }, %struct.I { [3 x i32] [i32 4, i32 4, i32 5] }] }],
+// CHECK: [2 x [3 x i32]] {{[[][[]}}3 x i32] [i32 2222, i32 2222, i32 0], [3 x i32] [i32 2222, i32 2222, i32 3333]],
+
 void f1() {
   // Scalars in braces.
   int a = { 1 };
diff --git a/test/CodeGen/inline-asm-immediate-ubsan.c b/test/CodeGen/inline-asm-immediate-ubsan.c
index 77d5e4f557c08..2b14e92fa9e37 100644
--- a/test/CodeGen/inline-asm-immediate-ubsan.c
+++ b/test/CodeGen/inline-asm-immediate-ubsan.c
@@ -1,6 +1,6 @@
 // RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s \
 // RUN:     -fsanitize=signed-integer-overflow \
-// RUN:   | FileCheck %s --check-prefix=CHECK
+// RUN:   | FileCheck %s
 
 // Verify we emit constants for "immediate" inline assembly arguments.
 // Emitting a scalar expression can make the immediate be generated as
diff --git a/test/CodeGen/inline-asm-mixed-style.c b/test/CodeGen/inline-asm-mixed-style.c
new file mode 100644
index 0000000000000..6b830d9fa7a92
--- /dev/null
+++ b/test/CodeGen/inline-asm-mixed-style.c
@@ -0,0 +1,29 @@
+// RUN: %clang_cc1 -triple i386-unknown-unknown -fasm-blocks -fsyntax-only -verify %s -DCHECK_ASM_GOTO
+// RUN: %clang_cc1 -triple i386-unknown-unknown -fasm-blocks -O0 -emit-llvm -S %s -o - | FileCheck %s
+// REQUIRES: x86-registered-target
+
+void f() {
+  __asm mov eax, ebx
+  __asm mov ebx, ecx
+  __asm__("movl %ecx, %edx");
+  // CHECK: movl    %ebx, %eax
+  // CHECK: movl    %ecx, %ebx
+  // CHECK: movl    %ecx, %edx
+
+  __asm mov eax, ebx
+  __asm volatile ("movl %ecx, %edx");
+  // CHECK: movl    %ebx, %eax
+  // CHECK: movl    %ecx, %edx
+
+  __asm mov eax, ebx
+  __asm const ("movl %ecx, %edx"); // expected-warning {{ignored const qualifier on asm}} 
+  // CHECK: movl    %ebx, %eax
+  // CHECK: movl    %ecx, %edx
+
+#ifdef CHECK_ASM_GOTO
+  __asm volatile goto ("movl %ecx, %edx"); // expected-error {{'asm goto' constructs are not supported yet}}
+
+  __asm mov eax, ebx
+  __asm goto ("movl %ecx, %edx"); // expected-error {{'asm goto' constructs are not supported yet}}
+#endif
+}
diff --git a/test/CodeGen/inline-optim.c b/test/CodeGen/inline-optim.c
new file mode 100644
index 0000000000000..f8b355afd9c13
--- /dev/null
+++ b/test/CodeGen/inline-optim.c
@@ -0,0 +1,31 @@
+// Make sure -finline-functions family flags are behaving correctly.
+
+// RUN: %clang_cc1 -triple i686-pc-win32 -emit-llvm %s -o - | FileCheck -check-prefix=NOINLINE %s
+// RUN: %clang_cc1 -triple i686-pc-win32 -O3 -fno-inline-functions -emit-llvm %s -o - | FileCheck -check-prefix=NOINLINE %s
+// RUN: %clang_cc1 -triple i686-pc-win32 -finline-hint-functions -emit-llvm %s -o - | FileCheck -check-prefix=HINT %s
+// RUN: %clang_cc1 -triple i686-pc-win32 -finline-functions -emit-llvm %s -o - | FileCheck -check-prefix=INLINE %s
+
+inline int inline_hint(int a, int b) { return(a+b); }
+
+int inline_no_hint(int a, int b) { return (a/b); }
+
+inline __attribute__ ((__always_inline__)) int inline_always(int a, int b) { return(a*b); }
+
+volatile int *pa = (int*) 0x1000;
+void foo() {
+// NOINLINE-LABEL: @foo
+// HINT-LABEL: @foo
+// INLINE-LABEL: @foo
+// NOINLINE: call i32 @inline_hint
+// HINT-NOT: call i32 @inline_hint
+// INLINE-NOT: call i32 @inline_hint
+    pa[0] = inline_hint(pa[1],pa[2]);
+// NOINLINE-NOT: call i32 @inline_always
+// HINT-NOT: call i32 @inline_always
+// INLINE-NOT: call i32 @inline_always
+    pa[3] = inline_always(pa[4],pa[5]);
+// NOINLINE: call i32 @inline_no_hint
+// HINT: call i32 @inline_no_hint
+// INLINE-NOT: call i32 @inline_no_hint
+    pa[6] = inline_no_hint(pa[7], pa[8]);
+}
diff --git a/test/CodeGen/instrument-functions.c b/test/CodeGen/instrument-functions.c
index d80385e2239ab..454dc4de5220d 100644
--- a/test/CodeGen/instrument-functions.c
+++ b/test/CodeGen/instrument-functions.c
@@ -1,9 +1,9 @@
-// RUN: %clang_cc1 -S -emit-llvm -o - %s -finstrument-functions | FileCheck %s
+// RUN: %clang_cc1 -S -debug-info-kind=standalone -emit-llvm -o - %s -finstrument-functions | FileCheck %s
 
 // CHECK: @test1
 int test1(int x) {
-// CHECK: __cyg_profile_func_enter
-// CHECK: __cyg_profile_func_exit
+// CHECK: call void @__cyg_profile_func_enter({{.*}}, !dbg
+// CHECK: call void @__cyg_profile_func_exit({{.*}}, !dbg
 // CHECK: ret
   return x;
 }
diff --git a/test/CodeGen/lanai-arguments.c b/test/CodeGen/lanai-arguments.c
new file mode 100644
index 0000000000000..9ce4ed98a78ce
--- /dev/null
+++ b/test/CodeGen/lanai-arguments.c
@@ -0,0 +1,75 @@
+// RUN: %clang_cc1 -triple lanai-unknown-unknown %s -emit-llvm -o - \
+// RUN:   | FileCheck %s
+
+// Basic argument/attribute tests for Lanai.
+
+// CHECK: define void @f0(i32 inreg %i, i32 inreg %j, i64 inreg %k)
+void f0(int i, long j, long long k) {}
+
+typedef struct {
+  int aa;
+  int bb;
+} s1;
+// CHECK: define void @f1(i32 inreg %i.coerce0, i32 inreg %i.coerce1)
+void f1(s1 i) {}
+
+typedef struct {
+  int cc;
+} s2;
+// CHECK: define void @f2(%struct.s2* noalias sret %agg.result)
+s2 f2() {
+  s2 foo;
+  return foo;
+}
+
+typedef struct {
+  int cc;
+  int dd;
+} s3;
+// CHECK: define void @f3(%struct.s3* noalias sret %agg.result)
+s3 f3() {
+  s3 foo;
+  return foo;
+}
+
+// CHECK: define void @f4(i64 inreg %i)
+void f4(long long i) {}
+
+// CHECK: define void @f5(i8 inreg %a, i16 inreg %b)
+void f5(char a, short b) {}
+
+// CHECK: define void @f6(i8 inreg %a, i16 inreg %b)
+void f6(unsigned char a, unsigned short b) {}
+
+enum my_enum {
+  ENUM1,
+  ENUM2,
+  ENUM3,
+};
+// Enums should be treated as the underlying i32.
+// CHECK: define void @f7(i32 inreg %a)
+void f7(enum my_enum a) {}
+
+enum my_big_enum {
+  ENUM4 = 0xFFFFFFFFFFFFFFFF,
+};
+// Big enums should be treated as the underlying i64.
+// CHECK: define void @f8(i64 inreg %a)
+void f8(enum my_big_enum a) {}
+
+union simple_union {
+  int a;
+  char b;
+};
+// Unions should be passed inreg.
+// CHECK: define void @f9(i32 inreg %s.coerce)
+void f9(union simple_union s) {}
+
+typedef struct {
+  int b4 : 4;
+  int b3 : 3;
+  int b8 : 8;
+} bitfield1;
+// Bitfields should be passed inreg.
+// CHECK: define void @f10(i32 inreg %bf1.coerce)
+void f10(bitfield1 bf1) {}
diff --git a/test/CodeGen/lanai-regparm.c b/test/CodeGen/lanai-regparm.c
new file mode 100644
index 0000000000000..c315f43bca736
--- /dev/null
+++ b/test/CodeGen/lanai-regparm.c
@@ -0,0 +1,18 @@
+// RUN: %clang_cc1 -triple lanai-unknown-unknown -mregparm 4 %s -emit-llvm -o - | FileCheck %s
+
+void f1(int a, int b, int c, int d,
+        int e, int f, int g, int h);
+
+void f2(int a, int b) __attribute((regparm(0)));
+
+void f0() {
+// CHECK: call void @f1(i32 inreg 1, i32 inreg 2, i32 inreg 3, i32 inreg 4,
+// CHECK: i32 5, i32 6, i32 7, i32 8)
+  f1(1, 2, 3, 4, 5, 6, 7, 8);
+// CHECK: call void @f2(i32 1, i32 2)
+  f2(1, 2);
+}
+
+// CHECK: declare void @f1(i32 inreg, i32 inreg, i32 inreg, i32 inreg,
+// CHECK: i32, i32, i32, i32)
+// CHECK: declare void @f2(i32, i32)
diff --git a/test/CodeGen/le32-vaarg.c b/test/CodeGen/le32-vaarg.c
index 51bbb0296846d..c02af27691f25 100644
--- a/test/CodeGen/le32-vaarg.c
+++ b/test/CodeGen/le32-vaarg.c
@@ -6,7 +6,9 @@ int get_int(va_list *args) {
 }
 // CHECK: define i32 @get_int
 // CHECK: [[RESULT:%[a-z_0-9]+]] = va_arg {{.*}}, i32{{$}}
-// CHECK: ret i32 [[RESULT]]
+// CHECK: store i32 [[RESULT]], i32* [[LOC:%[a-z_0-9]+]]
+// CHECK: [[RESULT2:%[a-z_0-9]+]] = load i32, i32* [[LOC]]
+// CHECK: ret i32 [[RESULT2]]
 
 struct Foo {
   int x;
@@ -19,7 +21,9 @@ void get_struct(va_list *args) {
 }
 // CHECK: define void @get_struct
 // CHECK: [[RESULT:%[a-z_0-9]+]] = va_arg {{.*}}, %struct.Foo{{$}}
-// CHECK: store %struct.Foo [[RESULT]], %struct.Foo* @dest
+// CHECK: store %struct.Foo [[RESULT]], %struct.Foo* [[LOC:%[a-z_0-9]+]]
+// CHECK: [[LOC2:%[a-z_0-9]+]] = bitcast {{.*}} [[LOC]] to i8*
+// CHECK: call void @llvm.memcpy{{.*}}@dest{{.*}}, i8* [[LOC2]]
 
 void skip_struct(va_list *args) {
   va_arg(*args, struct Foo);
diff --git a/test/CodeGen/lifetime-asan.c b/test/CodeGen/lifetime-asan.c
new file mode 100644
index 0000000000000..5f0c66d513de6
--- /dev/null
+++ b/test/CodeGen/lifetime-asan.c
@@ -0,0 +1,21 @@
+// RUN: %clang -target x86_64-linux-gnu -S -emit-llvm -o - -O0 %s | FileCheck %s -check-prefix=CHECK-O0
+// RUN: %clang -target x86_64-linux-gnu -S -emit-llvm -o - -O0 \
+// RUN:     -fsanitize=address -fsanitize-address-use-after-scope %s | \
+// RUN:     FileCheck %s -check-prefix=CHECK-ASAN-USE-AFTER-SCOPE
+
+extern int bar(char *A, int n);
+
+// CHECK-O0-NOT: @llvm.lifetime.start
+int foo(int n) {
+  if (n) {
+    // CHECK-ASAN-USE-AFTER-SCOPE: @llvm.lifetime.start(i64 10, i8* {{.*}})
+    char A[10];
+    return bar(A, 1);
+    // CHECK-ASAN-USE-AFTER-SCOPE: @llvm.lifetime.end(i64 10, i8* {{.*}})
+  } else {
+    // CHECK-ASAN-USE-AFTER-SCOPE: @llvm.lifetime.start(i64 20, i8* {{.*}})
+    char A[20];
+    return bar(A, 2);
+    // CHECK-ASAN-USE-AFTER-SCOPE: @llvm.lifetime.end(i64 20, i8* {{.*}})
+  }
+}
diff --git a/test/CodeGen/linker-option.c b/test/CodeGen/linker-option.c
new file mode 100644
index 0000000000000..b1b2ec461c89b
--- /dev/null
+++ b/test/CodeGen/linker-option.c
@@ -0,0 +1,8 @@
+// RUN: %clang_cc1 %s --linker-option=/include:foo -triple i686-pc-win32 -emit-llvm -o - | FileCheck %s
+
+// CHECK: !llvm.module.flags = !{{{.*}}}
+// CHECK: !{{[0-9]+}} = !{i32 6, !"Linker Options", ![[link_opts:[0-9]+]]}
+// CHECK: ![[link_opts]] = !{![[msvcrt:[0-9]+]]}
+// CHECK: ![[msvcrt]] = !{!"/include:foo"}
+
+int f();
diff --git a/test/CodeGen/malign-double.cpp b/test/CodeGen/malign-double.cpp
new file mode 100644
index 0000000000000..0cda4dc3687d3
--- /dev/null
+++ b/test/CodeGen/malign-double.cpp
@@ -0,0 +1,115 @@
+// RUN: %clang_cc1 -malign-double -triple i386-unknown-linux -emit-llvm %s -o - \
+// RUN:   | FileCheck --check-prefix=CHECK-ON --check-prefix=CHECK %s
+
+// RUN: %clang_cc1 -triple i386-unknown-linux -emit-llvm %s -o - \
+// RUN:   | FileCheck --check-prefix=CHECK-OFF --check-prefix=CHECK %s
+
+/* Structs S1, S2, S3, S4, and union U5 are taken from Intel, "IA-64
+   Software Conventions and Runtime Architecture Guide", version of
+   August 1999, Section 4.2, Figures 4-1 through 4-5.
+   A Union containing a double was also thrown in for good measure. */
+
+struct S1 {
+  char c;
+};
+
+unsigned S1_align = __alignof(struct S1);
+unsigned S1_size = sizeof(struct S1);
+// CHECK: @S1_align = global i32 1, align 4
+// CHECK: @S1_size = global i32 1, align 4
+
+unsigned S1_c_offset = (unsigned) &((struct S1*) 0)->c;
+// CHECK: @S1_c_offset = global i32 0, align 4
+
+struct S2{
+  char c;
+  char d;
+  short s;
+  int n;
+};
+
+unsigned S2_align = __alignof(struct S2);
+unsigned S2_size = sizeof(struct S2);
+// CHECK: @S2_align = global i32 4, align 4
+// CHECK: @S2_size = global i32 8, align 4
+
+unsigned S2_c_offset = (unsigned) &((struct S2*) 0)->c;
+unsigned S2_d_offset = (unsigned) &((struct S2*) 0)->d;
+unsigned S2_s_offset = (unsigned) &((struct S2*) 0)->s;
+unsigned S2_n_offset = (unsigned) &((struct S2*) 0)->n;
+// CHECK: @S2_c_offset = global i32 0, align 4
+// CHECK: @S2_d_offset = global i32 1, align 4
+// CHECK: @S2_s_offset = global i32 2, align 4
+// CHECK: @S2_n_offset = global i32 4, align 4
+
+struct S3 {
+  char c;
+  short s;
+};
+
+unsigned S3_align = __alignof(struct S3);
+unsigned S3_size = sizeof(struct S3);
+// CHECK: @S3_align = global i32 2, align 4
+// CHECK: @S3_size = global i32 4, align 4
+
+unsigned S3_c_offset = (unsigned) &((struct S3*) 0)->c;
+unsigned S3_s_offset = (unsigned) &((struct S3*) 0)->s;
+// CHECK: @S3_c_offset = global i32 0, align 4
+// CHECK: @S3_s_offset = global i32 2, align 4
+
+struct S4 {
+  char c;
+  double d;
+  short s;
+};
+
+unsigned S4_align = __alignof(struct S4);
+unsigned S4_size = sizeof(struct S4);
+// CHECK-ON: @S4_align = global i32 8, align 4
+// CHECK-ON: @S4_size = global i32 24, align 4
+// CHECK-OFF: @S4_align = global i32 4, align 4
+// CHECK-OFF: @S4_size = global i32 16, align 4
+
+unsigned S4_c_offset = (unsigned) &((struct S4*) 0)->c;
+unsigned S4_d_offset = (unsigned) &((struct S4*) 0)->d;
+unsigned S4_s_offset = (unsigned) &((struct S4*) 0)->s;
+// CHECK: @S4_c_offset = global i32 0, align 4
+// CHECK-ON: @S4_d_offset = global i32 8, align 4
+// CHECK-ON: @S4_s_offset = global i32 16, align 4
+// CHECK-OFF: @S4_d_offset = global i32 4, align 4
+// CHECK-OFF: @S4_s_offset = global i32 12, align 4
+
+union S5 {
+  char c;
+  short s;
+  int j;
+};
+
+unsigned S5_align = __alignof(union S5);
+unsigned S5_size = sizeof(union S5);
+// CHECK: @S5_align = global i32 4, align 4
+// CHECK: @S5_size = global i32 4, align 4
+
+unsigned S5_c_offset = (unsigned) &((union S5*) 0)->c;
+unsigned S5_s_offset = (unsigned) &((union S5*) 0)->s;
+unsigned S5_j_offset = (unsigned) &((union S5*) 0)->j;
+// CHECK: @S5_c_offset = global i32 0, align 4
+// CHECK: @S5_s_offset = global i32 0, align 4
+// CHECK: @S5_j_offset = global i32 0, align 4
+
+union S6 {
+  char c;
+  double d;
+};
+
+unsigned S6_align = __alignof(union S6);
+unsigned S6_size = sizeof(union S6);
+// CHECK-ON: @S6_align = global i32 8, align 4
+// CHECK-ON: @S6_size = global i32 8, align 4
+// CHECK-OFF: @S6_align = global i32 4, align 4
+// CHECK-OFF: @S6_size = global i32 8, align 4
+
+unsigned S6_c_offset = (unsigned) &((union S6*) 0)->c;
+unsigned S6_d_offset = (unsigned) &((union S6*) 0)->d;
+// CHECK: @S6_c_offset = global i32 0, align 4
+// CHECK: @S6_d_offset = global i32 0, align 4
diff --git a/test/CodeGen/mbackchain-2.c b/test/CodeGen/mbackchain-2.c
new file mode 100644
index 0000000000000..e76afaf7687a7
--- /dev/null
+++ b/test/CodeGen/mbackchain-2.c
@@ -0,0 +1,7 @@
+// RUN: %clang -mbackchain --target=s390x-linux -S -emit-llvm -o - %s | FileCheck %s
+
+// CHECK: define void @foo() [[NUW:#[0-9]+]]
+void foo(void) {
+}
+
+// CHECK: attributes [[NUW]] = { {{.*}} "backchain" {{.*}} }
diff --git a/test/CodeGen/mbackchain-3.c b/test/CodeGen/mbackchain-3.c
new file mode 100644
index 0000000000000..b115861f5e02a
--- /dev/null
+++ b/test/CodeGen/mbackchain-3.c
@@ -0,0 +1,7 @@
+// RUN: %clang -mno-backchain --target=s390x-linux -S -emit-llvm -o - %s | FileCheck %s
+
+// CHECK: define void @foo() [[NUW:#[0-9]+]]
+void foo(void) {
+}
+
+// CHECK-NOT: "backchain"
diff --git a/test/CodeGen/mbackchain.c b/test/CodeGen/mbackchain.c
new file mode 100644
index 0000000000000..e7cfc3aec52b5
--- /dev/null
+++ b/test/CodeGen/mbackchain.c
@@ -0,0 +1,7 @@
+// RUN: %clang_cc1 -mbackchain -triple s390x-linux -emit-llvm -o - %s | FileCheck %s
+
+// CHECK: define void @foo() [[NUW:#[0-9]+]]
+void foo(void) {
+}
+
+// CHECK: attributes [[NUW]] = { {{.*}} "backchain" {{.*}} }
diff --git a/test/CodeGen/mcu-struct-return.c b/test/CodeGen/mcu-struct-return.c
new file mode 100644
index 0000000000000..353c963dadb01
--- /dev/null
+++ b/test/CodeGen/mcu-struct-return.c
@@ -0,0 +1,70 @@
+// RUN: %clang_cc1 -triple i386-pc-elfiamcu -emit-llvm %s -o - | FileCheck %s
+
+// Structure that is more than 8 byte.
+struct Big {
+  double a[10];
+};
+
+// Empty union with zero size must be returned as void.
+union U1 {
+} u1;
+
+// Too large union (80 bytes) must be returned via memory.
+union U2 {
+  struct Big b;
+} u2;
+
+// Must be returned in register.
+union U3 {
+  int x;
+} u3;
+
+// Empty struct with zero size, must be returned as void.
+struct S1 {
+} s1;
+
+// Must be returend in register.
+struct S2 {
+  int x;
+} s2;
+
+// CHECK: [[UNION1_TYPE:%.+]] = type {}
+// CHECK: [[UNION2_TYPE:%.+]] = type { [[STRUCT_TYPE:%.+]] }
+// CHECK: [[STRUCT_TYPE]] = type { [10 x double] }
+// CHECK: [[UNION3_TYPE:%.+]] = type { i32 }
+// CHECK: [[STRUCT1_TYPE:%.+]] = type {}
+// CHECK: [[STRUCT2_TYPE:%.+]] = type { i32 }
+
+union U1 foo1() { return u1; }
+union U2 foo2() { return u2; }
+union U3 foo3() { return u3; }
+struct S1 bar1() { return s1; }
+struct S2 bar2() { return s2; }
+struct S1 bar3(union U1 u) { return s1; }
+// CHECK: define void @foo1()
+// CHECK: define void @foo2([[UNION2_TYPE]]* noalias sret %{{.+}})
+// CHECK: define i32 @foo3()
+// CHECK: define void @bar1()
+// CHECK: define i32 @bar2()
+// CHECK: define void @bar3()
+
+void run() {
+  union U1 x1 = foo1();
+  union U2 x2 = foo2();
+  union U3 x3 = foo3();
+  struct S1 y1 = bar1();
+  struct S2 y2 = bar2();
+  struct S1 y3 = bar3(x1);
+
+  // CHECK: [[X1:%.+]] = alloca [[UNION1_TYPE]]
+  // CHECK: [[X2:%.+]] = alloca [[UNION2_TYPE]]
+  // CHECK: [[X3:%.+]] = alloca [[UNION3_TYPE]]
+  // CHECK: [[Y1:%.+]] = alloca [[STRUCT1_TYPE]]
+  // CHECK: [[Y2:%.+]] = alloca [[STRUCT2_TYPE]]
+  // CHECK: call void @foo1()
+  // CHECK: call void @foo2([[UNION2_TYPE]]* sret [[X2]])
+  // CHECK: {{.+}} = call i32 @foo3()
+  // CHECK: call void @bar1()
+  // CHECK: {{.+}} = call i32 @bar2()
+  // CHECK: call void @bar3()
+}
diff --git a/test/CodeGen/mips-byval-arg.c b/test/CodeGen/mips-byval-arg.c
index 0e3d334b27453..1e7f38915ccb9 100644
--- a/test/CodeGen/mips-byval-arg.c
+++ b/test/CodeGen/mips-byval-arg.c
@@ -1,5 +1,5 @@
-// RUN: %clang -target mipsel-unknown-linux -O3 -S -o - -emit-llvm %s | FileCheck %s -check-prefix=O32
-// RUN: %clang -target mips64el-unknown-linux -O3 -S -mabi=n64 -o - -emit-llvm %s | FileCheck %s -check-prefix=N64
+// RUN: %clang_cc1 -triple mipsel-unknown-linux -O3 -S -o - -emit-llvm %s | FileCheck %s -check-prefix=O32
+// RUN: %clang_cc1 -triple mips64el-unknown-linux -O3 -S -target-abi n64 -o - -emit-llvm %s | FileCheck %s -check-prefix=N64
 
 typedef struct {
   float f[3];
diff --git a/test/CodeGen/mips-inline-asm.c b/test/CodeGen/mips-inline-asm.c
index 2cfa41c98de73..fa38663f387df 100644
--- a/test/CodeGen/mips-inline-asm.c
+++ b/test/CodeGen/mips-inline-asm.c
@@ -17,3 +17,15 @@ void R () {
   asm("lw $1, %0" :: "R"(data));
   // CHECK: call void asm sideeffect "lw $$1, $0", "*R,~{$1}"(i32* @data)
 }
+
+int additionalClobberedRegisters () {
+  int temp0;
+  asm volatile(
+                "mfhi %[temp0], $ac1 \n\t"
+                  : [temp0]"=&r"(temp0)
+                  :
+                  : "memory", "t0", "t1", "$ac1hi", "$ac1lo", "$ac2hi", "$ac2lo", "$ac3hi", "$ac3lo"
+  );
+  return 0;
+  // CHECK: call i32 asm sideeffect "mfhi $0, $$ac1 \0A\09", "=&r,~{memory},~{$8},~{$9},~{$ac1hi},~{$ac1lo},~{$ac2hi},~{$ac2lo},~{$ac3hi},~{$ac3lo},~{$1}"
+}
diff --git a/test/CodeGen/mips-interrupt-attr.c b/test/CodeGen/mips-interrupt-attr.c
index df70b12b58f13..0ef5dabd26572 100644
--- a/test/CodeGen/mips-interrupt-attr.c
+++ b/test/CodeGen/mips-interrupt-attr.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple mipsel-unknown-linux -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK
+// RUN: %clang_cc1 -triple mipsel-unknown-linux -emit-llvm -o - %s | FileCheck %s
 
 void __attribute__ ((interrupt("vector=sw0")))
 isr_sw0 (void)
diff --git a/test/CodeGen/mips-vector-arg.c b/test/CodeGen/mips-vector-arg.c
index f8c89dfff5483..1b9d7abe4d583 100644
--- a/test/CodeGen/mips-vector-arg.c
+++ b/test/CodeGen/mips-vector-arg.c
@@ -1,5 +1,5 @@
-// RUN: %clang -target mipsel-unknown-linux -O3 -S -o - -emit-llvm %s | FileCheck %s -check-prefix=O32
-// RUN: %clang -target mips64el-unknown-linux -O3 -S -mabi=n64 -o - -emit-llvm %s | FileCheck %s -check-prefix=N64
+// RUN: %clang_cc1 -triple mipsel-unknown-linux -O3 -S -o - -emit-llvm %s | FileCheck %s -check-prefix=O32
+// RUN: %clang_cc1 -triple mips64el-unknown-linux -O3 -S -target-abi n64 -o - -emit-llvm %s | FileCheck %s -check-prefix=N64
 
 // check that
 // 1. vector arguments are passed in integer registers
@@ -8,18 +8,18 @@
 typedef float  v4sf __attribute__ ((__vector_size__ (16)));
 typedef int v4i32 __attribute__ ((__vector_size__ (16)));
 
-// O32: define void @test_v4sf(i32 inreg %a1.coerce0, i32 inreg %a1.coerce1, i32 inreg %a1.coerce2, i32 inreg %a1.coerce3, i32 signext %a2, i32, i32 inreg %a3.coerce0, i32 inreg %a3.coerce1, i32 inreg %a3.coerce2, i32 inreg %a3.coerce3) [[NUW:#[0-9]+]]
+// O32: define void @test_v4sf(i32 inreg %a1.coerce0, i32 inreg %a1.coerce1, i32 inreg %a1.coerce2, i32 inreg %a1.coerce3, i32 signext %a2, i32, i32 inreg %a3.coerce0, i32 inreg %a3.coerce1, i32 inreg %a3.coerce2, i32 inreg %a3.coerce3) local_unnamed_addr [[NUW:#[0-9]+]]
 // O32: declare i32 @test_v4sf_2(i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 signext, i32, i32 inreg, i32 inreg, i32 inreg, i32 inreg)
-// N64: define void @test_v4sf(i64 inreg %a1.coerce0, i64 inreg %a1.coerce1, i32 signext %a2, i64, i64 inreg %a3.coerce0, i64 inreg %a3.coerce1) [[NUW:#[0-9]+]]
+// N64: define void @test_v4sf(i64 inreg %a1.coerce0, i64 inreg %a1.coerce1, i32 signext %a2, i64, i64 inreg %a3.coerce0, i64 inreg %a3.coerce1) local_unnamed_addr [[NUW:#[0-9]+]]
 // N64: declare i32 @test_v4sf_2(i64 inreg, i64 inreg, i32 signext, i64, i64 inreg, i64 inreg)
 extern test_v4sf_2(v4sf, int, v4sf);
 void test_v4sf(v4sf a1, int a2, v4sf a3) {
   test_v4sf_2(a3, a2, a1);
 }
 
-// O32: define void @test_v4i32(i32 inreg %a1.coerce0, i32 inreg %a1.coerce1, i32 inreg %a1.coerce2, i32 inreg %a1.coerce3, i32 signext %a2, i32, i32 inreg %a3.coerce0, i32 inreg %a3.coerce1, i32 inreg %a3.coerce2, i32 inreg %a3.coerce3) [[NUW]]
+// O32: define void @test_v4i32(i32 inreg %a1.coerce0, i32 inreg %a1.coerce1, i32 inreg %a1.coerce2, i32 inreg %a1.coerce3, i32 signext %a2, i32, i32 inreg %a3.coerce0, i32 inreg %a3.coerce1, i32 inreg %a3.coerce2, i32 inreg %a3.coerce3) local_unnamed_addr [[NUW]]
 // O32: declare i32 @test_v4i32_2(i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 signext, i32, i32 inreg, i32 inreg, i32 inreg, i32 inreg)
-// N64: define void @test_v4i32(i64 inreg %a1.coerce0, i64 inreg %a1.coerce1, i32 signext %a2, i64, i64 inreg %a3.coerce0, i64 inreg %a3.coerce1) [[NUW]]
+// N64: define void @test_v4i32(i64 inreg %a1.coerce0, i64 inreg %a1.coerce1, i32 signext %a2, i64, i64 inreg %a3.coerce0, i64 inreg %a3.coerce1) local_unnamed_addr [[NUW]]
 // N64: declare i32 @test_v4i32_2(i64 inreg, i64 inreg, i32 signext, i64, i64 inreg, i64 inreg)
 extern test_v4i32_2(v4i32, int, v4i32);
 void test_v4i32(v4i32 a1, int a2, v4i32 a3) {
diff --git a/test/CodeGen/mips-zero-sized-struct.c b/test/CodeGen/mips-zero-sized-struct.c
index afff3b41d833c..217bdb92c6f68 100644
--- a/test/CodeGen/mips-zero-sized-struct.c
+++ b/test/CodeGen/mips-zero-sized-struct.c
@@ -1,9 +1,9 @@
-// RUN: %clang -target mips-unknown-linux-gnu -S -emit-llvm -o - %s | FileCheck -check-prefix=O32 %s
-// RUN: %clang -target mipsel-unknown-linux-gnu -S -emit-llvm -o - %s | FileCheck -check-prefix=O32 %s
-// RUN: %clang -target mips64-unknown-linux-gnu -S -emit-llvm -o - %s -mabi=n32 | FileCheck -check-prefix=N32 %s
-// RUN: %clang -target mips64el-unknown-linux-gnu -S -emit-llvm -o - %s -mabi=n32 | FileCheck -check-prefix=N32 %s
-// RUN: %clang -target mips64-unknown-linux-gnu -S -emit-llvm -o - %s | FileCheck -check-prefix=N64 %s
-// RUN: %clang -target mips64el-unknown-linux-gnu -S -emit-llvm -o - %s | FileCheck -check-prefix=N64 %s
+// RUN: %clang_cc1 -triple mips-unknown-linux-gnu -S -emit-llvm -o - %s | FileCheck -check-prefix=O32 %s
+// RUN: %clang_cc1 -triple mipsel-unknown-linux-gnu -S -emit-llvm -o - %s | FileCheck -check-prefix=O32 %s
+// RUN: %clang_cc1 -triple mips64-unknown-linux-gnu -S -emit-llvm -o - %s  -target-abi n32 | FileCheck -check-prefix=N32 %s
+// RUN: %clang_cc1 -triple mips64el-unknown-linux-gnu -S -emit-llvm -o - %s  -target-abi n32 | FileCheck -check-prefix=N32 %s
+// RUN: %clang_cc1 -triple mips64-unknown-linux-gnu -S -emit-llvm -o - %s | FileCheck -check-prefix=N64 %s
+// RUN: %clang_cc1 -triple mips64el-unknown-linux-gnu -S -emit-llvm -o - %s | FileCheck -check-prefix=N64 %s
 
 // O32: define void @fn28(%struct.T2* noalias sret %agg.result, i8 signext %arg0)
 // N32: define void @fn28(i8 signext %arg0)
diff --git a/test/CodeGen/mips64-class-return.cpp b/test/CodeGen/mips64-class-return.cpp
index 57fa8ef5109b7..af2dd5cbec2fb 100644
--- a/test/CodeGen/mips64-class-return.cpp
+++ b/test/CodeGen/mips64-class-return.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang -target mips64el-unknown-linux -O3 -S -mabi=n64 -o - -emit-llvm %s | FileCheck %s
+// RUN: %clang_cc1 -triple mips64el-unknown-linux -O3 -S -target-abi n64 -o - -emit-llvm %s | FileCheck %s
 
 class B0 {
   double d;
diff --git a/test/CodeGen/mips64-padding-arg.c b/test/CodeGen/mips64-padding-arg.c
index b92098f45a4e6..7910734d6f16e 100644
--- a/test/CodeGen/mips64-padding-arg.c
+++ b/test/CodeGen/mips64-padding-arg.c
@@ -1,6 +1,6 @@
-// RUN: %clang -target mipsel-unknown-linux -O3 -S -o - -emit-llvm %s | FileCheck %s -check-prefix=O32
-// RUN: %clang -target mips64el-unknown-linux -O3 -S -mabi=n64 -o - -emit-llvm %s | FileCheck %s -check-prefix=N64
-// RUN: %clang -target mipsel-unknown-linux -mfp64 -O3 -S -o - -emit-llvm %s | FileCheck %s -check-prefix=O32
+// RUN: %clang_cc1 -triple mipsel-unknown-linux -O3 -S -o - -emit-llvm %s | FileCheck %s -check-prefix=O32
+// RUN: %clang_cc1 -triple mips64el-unknown-linux -O3 -S -target-abi n64 -o - -emit-llvm %s | FileCheck %s -check-prefix=N64
+// RUN: %clang_cc1 -triple mipsel-unknown-linux -target-feature "+fp64" -O3 -S -o - -emit-llvm %s | FileCheck %s -check-prefix=O32
 
 typedef struct {
   double d;
diff --git a/test/CodeGen/mmx-builtins.c b/test/CodeGen/mmx-builtins.c
index 44d1ea4d57e8b..2bf497d58aaf0 100644
--- a/test/CodeGen/mmx-builtins.c
+++ b/test/CodeGen/mmx-builtins.c
@@ -44,8 +44,8 @@ __m64 test_mm_add_pi32(__m64 a, __m64 b) {
 
 __m64 test_mm_add_si64(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_add_si64
-  // CHECK: call x86_mmx @llvm.x86.mmx.padd.q
-  return __builtin_ia32_paddq(a, b);
+  // CHECK: call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %{{.*}}, x86_mmx %{{.*}})
+  return _mm_add_si64(a, b);
 }
 
 __m64 test_mm_adds_pi8(__m64 a, __m64 b) {
@@ -217,6 +217,12 @@ __m64 test_mm_cvttps_pi32(__m128 a) {
   return _mm_cvttps_pi32(a);
 }
 
+int test_mm_extract_pi16(__m64 a) {
+  // CHECK-LABEL: test_mm_extract_pi16
+  // CHECK: call i32 @llvm.x86.mmx.pextr.w
+  return _mm_extract_pi16(a, 2);
+}
+
 __m64 test_m_from_int(int a) {
   // CHECK-LABEL: test_m_from_int
   // CHECK: insertelement <2 x i32>
@@ -265,6 +271,12 @@ __m64 test_mm_hsubs_pi16(__m64 a, __m64 b) {
   return _mm_hsubs_pi16(a, b);
 }
 
+__m64 test_mm_insert_pi16(__m64 a, int d) {
+  // CHECK-LABEL: test_mm_insert_pi16
+  // CHECK: call x86_mmx @llvm.x86.mmx.pinsr.w
+  return _mm_insert_pi16(a, d, 2);
+}
+
 __m64 test_mm_madd_pi16(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_madd_pi16
   // CHECK: call x86_mmx @llvm.x86.mmx.pmadd.wd
@@ -315,7 +327,7 @@ int test_mm_movemask_pi8(__m64 a) {
 
 __m64 test_mm_mul_su32(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_mul_su32
-  // CHECK: call x86_mmx @llvm.x86.mmx.pmulu.dq
+  // CHECK: call x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx %{{.*}}, x86_mmx %{{.*}})
   return _mm_mul_su32(a, b);
 }
 
@@ -525,8 +537,8 @@ __m64 test_mm_sub_pi32(__m64 a, __m64 b) {
 
 __m64 test_mm_sub_si64(__m64 a, __m64 b) {
   // CHECK-LABEL: test_mm_sub_si64
-  // CHECK: call x86_mmx @llvm.x86.mmx.psub.q
-  return __builtin_ia32_psubq(a, b);
+  // CHECK: call x86_mmx @llvm.x86.mmx.psub.q(x86_mmx %{{.*}}, x86_mmx %{{.*}})
+  return _mm_sub_si64(a, b);
 }
 
 __m64 test_mm_subs_pi8(__m64 a, __m64 b) {
diff --git a/test/CodeGen/ms-inline-asm-avx512.c b/test/CodeGen/ms-inline-asm-avx512.c
new file mode 100644
index 0000000000000..c1b783a2107cb
--- /dev/null
+++ b/test/CodeGen/ms-inline-asm-avx512.c
@@ -0,0 +1,21 @@
+// REQUIRES: x86-registered-target
+// RUN: %clang_cc1 %s -triple x86_64-pc-windows-msvc -target-cpu knl -fasm-blocks -emit-llvm -o - | FileCheck %s
+
+void t1() {
+// CHECK: @t1
+// CHECK: call void asm sideeffect inteldialect "vaddpd zmm8, zmm27, zmm6", "~{zmm8},~{dirflag},~{fpsr},~{flags}"()
+// CHECK: ret void
+  __asm {
+	  vaddpd zmm8, zmm27, zmm6
+  }
+}
+
+
+void t2() {
+// CHECK: @t2
+// CHECK: call void asm sideeffect inteldialect "vaddpd zmm8 {k1}, zmm27, zmm6", "~{zmm8},~{dirflag},~{fpsr},~{flags}"()
+// CHECK: ret void
+  __asm {
+	  vaddpd zmm8 {k1}, zmm27, zmm6
+  }
+}
diff --git a/test/CodeGen/ms-inline-asm-errors.cpp b/test/CodeGen/ms-inline-asm-errors.cpp
new file mode 100644
index 0000000000000..6484743e1f7ae
--- /dev/null
+++ b/test/CodeGen/ms-inline-asm-errors.cpp
@@ -0,0 +1,15 @@
+// REQUIRES: x86-registered-target
+// RUN: %clang_cc1 -x c++ %s -triple i386-apple-darwin10 -std=c++11 -fasm-blocks -verify
+
+class A {
+public:
+  void foo(int a)   {}
+  void foo(float a) {}
+};
+
+
+void t_fail() {
+	__asm {
+		mov ecx, [eax]A.foo // expected-error {{Unable to lookup field reference!}}
+	}
+}
diff --git a/test/CodeGen/ms-inline-asm.c b/test/CodeGen/ms-inline-asm.c
index 2f5de676c72fe..ad9e4f361b707 100644
--- a/test/CodeGen/ms-inline-asm.c
+++ b/test/CodeGen/ms-inline-asm.c
@@ -63,10 +63,19 @@ void t7() {
 int t8() {
   __asm int 4 ; } comments for single-line asm
   __asm {}
-  __asm int 4
+  __asm { int 5}
+  __asm int 6
+  __asm int 7
+  __asm { 
+    int 8
+  }
   return 10;
 // CHECK: t8
-// CHECK: call i32 asm sideeffect inteldialect "int $$4\0A\09int $$4", "={eax},~{dirflag},~{fpsr},~{flags}"()
+// CHECK: call i32 asm sideeffect inteldialect "int $$4", "={eax},~{dirflag},~{fpsr},~{flags}"()
+// CHECK: call i32 asm sideeffect inteldialect "", "={eax},~{dirflag},~{fpsr},~{flags}"()
+// CHECK: call i32 asm sideeffect inteldialect "int $$5", "={eax},~{dirflag},~{fpsr},~{flags}"()
+// CHECK: call i32 asm sideeffect inteldialect "int $$6\0A\09int $$7", "={eax},~{dirflag},~{fpsr},~{flags}"()
+// CHECK: call i32 asm sideeffect inteldialect "int $$8", "={eax},~{dirflag},~{fpsr},~{flags}"()
 // CHECK: ret i32 10
 }
 
@@ -77,7 +86,7 @@ void t9() {
     __asm { pop ebx }
   }
 // CHECK: t9
-// CHECK: call void asm sideeffect inteldialect "push ebx\0A\09mov ebx, $$0x07\0A\09pop ebx", "~{ebx},~{esp},~{dirflag},~{fpsr},~{flags}"()
+// CHECK: call void asm sideeffect inteldialect "push ebx\0A\09mov ebx, $$0x07\0A\09pop ebx\0A\09", "~{ebx},~{esp},~{dirflag},~{fpsr},~{flags}"()
 }
 
 unsigned t10(void) {
diff --git a/test/CodeGen/ms-intrinsics.c b/test/CodeGen/ms-intrinsics.c
index 9103622197a9e..ceaa847e9987c 100644
--- a/test/CodeGen/ms-intrinsics.c
+++ b/test/CodeGen/ms-intrinsics.c
@@ -8,17 +8,17 @@
 // RUN:         -triple x86_64--windows -Oz -emit-llvm %s -o - \
 // RUN:         | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-X64
 
-// Intrin.h needs size_t, but -ffreestanding prevents us from getting it from
+// intrin.h needs size_t, but -ffreestanding prevents us from getting it from
 // stddef.h.  Work around it with this typedef.
 typedef __SIZE_TYPE__ size_t;
 
-#include <Intrin.h>
+#include <intrin.h>
 
 void *test_InterlockedExchangePointer(void * volatile *Target, void *Value) {
   return _InterlockedExchangePointer(Target, Value);
 }
 
-// CHECK: define{{.*}}i8* @test_InterlockedExchangePointer(i8** %Target, i8* %Value){{.*}}{
+// CHECK: define{{.*}}i8* @test_InterlockedExchangePointer(i8** {{[a-z_ ]*}}%Target, i8* {{[a-z_ ]*}}%Value){{.*}}{
 // CHECK:   %[[TARGET:[0-9]+]] = bitcast i8** %Target to [[iPTR:i[0-9]+]]*
 // CHECK:   %[[VALUE:[0-9]+]] = ptrtoint i8* %Value to [[iPTR]]
 // CHECK:   %[[EXCHANGE:[0-9]+]] = atomicrmw xchg [[iPTR]]* %[[TARGET]], [[iPTR]] %[[VALUE]] seq_cst
@@ -31,7 +31,7 @@ void *test_InterlockedCompareExchangePointer(void * volatile *Destination,
   return _InterlockedCompareExchangePointer(Destination, Exchange, Comparand);
 }
 
-// CHECK: define{{.*}}i8* @test_InterlockedCompareExchangePointer(i8** %Destination, i8* %Exchange, i8* %Comparand){{.*}}{
+// CHECK: define{{.*}}i8* @test_InterlockedCompareExchangePointer(i8** {{[a-z_ ]*}}%Destination, i8* {{[a-z_ ]*}}%Exchange, i8* {{[a-z_ ]*}}%Comparand){{.*}}{
 // CHECK:   %[[DEST:[0-9]+]] = bitcast i8** %Destination to [[iPTR]]*
 // CHECK:   %[[EXCHANGE:[0-9]+]] = ptrtoint i8* %Exchange to [[iPTR]]
 // CHECK:   %[[COMPARAND:[0-9]+]] = ptrtoint i8* %Comparand to [[iPTR]]
@@ -45,7 +45,7 @@ long test_InterlockedExchange(long *Target, long Value) {
   return _InterlockedExchange(Target, Value);
 }
 
-// CHECK: define{{.*}}i32 @test_InterlockedExchange(i32* %Target, i32 %Value){{.*}}{
+// CHECK: define{{.*}}i32 @test_InterlockedExchange(i32* {{[a-z_ ]*}}%Target, i32 %Value){{.*}}{
 // CHECK:   %[[EXCHANGE:[0-9]+]] = atomicrmw xchg i32* %Target, i32 %Value seq_cst
 // CHECK:   ret i32 %[[EXCHANGE:[0-9]+]]
 // CHECK: }
diff --git a/test/CodeGen/ms-mm-align.c b/test/CodeGen/ms-mm-align.c
index ae8e98086a769..7130c74b29052 100644
--- a/test/CodeGen/ms-mm-align.c
+++ b/test/CodeGen/ms-mm-align.c
@@ -1,11 +1,11 @@
-// RUN: %clang_cc1 -ffreestanding -fms-extensions -fms-compatibility -fms-compatibility-version=17.00 \
+// RUN: %clang_cc1 -ffreestanding -fms-extensions -fms-compatibility -fms-compatibility-version=17.00 -target-feature +sse \
 // RUN:         -triple i686--windows -emit-llvm %s -o - \
 // RUN:         | FileCheck %s -check-prefix CHECK
 
-// Intrin.h needs size_t, but -ffreestanding prevents us from getting it from
+// intrin.h needs size_t, but -ffreestanding prevents us from getting it from
 // stddef.h.  Work around it with this typedef.
 typedef __SIZE_TYPE__ size_t;
-#include <Intrin.h>
+#include <intrin.h>
 
 void capture_ptr(int* i);
 void test_mm_align16(int p) {
diff --git a/test/CodeGen/ms-volatile.c b/test/CodeGen/ms-volatile.c
index 87393e794f829..a3ef35a3faadc 100644
--- a/test/CodeGen/ms-volatile.c
+++ b/test/CodeGen/ms-volatile.c
@@ -7,6 +7,13 @@ struct bar {
 };
 typedef _Complex float __declspec(align(8)) baz;
 
+#pragma pack(push)
+#pragma pack(1)
+struct qux {
+   volatile int f;
+};
+#pragma pack(pop)
+
 void test1(struct foo *p, struct foo *q) {
   *p = *q;
   // CHECK-LABEL: @test1
@@ -52,11 +59,29 @@ void test7(volatile struct bar *p, volatile struct bar *q) {
 void test8(volatile double *p, volatile double *q) {
   *p = *q;
   // CHECK-LABEL: @test8
-  // CHECK: load atomic volatile {{.*}} acquire
-  // CHECK: store atomic volatile {{.*}}, {{.*}} release
+  // CHECK: load volatile {{.*}}
+  // CHECK: store volatile {{.*}}, {{.*}}
 }
 void test9(volatile baz *p, baz *q) {
   *p = *q;
   // CHECK-LABEL: @test9
+  // CHECK: store volatile {{.*}}, {{.*}}
+  // CHECK: store volatile {{.*}}, {{.*}}
+}
+void test10(volatile long long *p, volatile long long *q) {
+  *p = *q;
+  // CHECK-LABEL: @test10
+  // CHECK: load volatile {{.*}}
+  // CHECK: store volatile {{.*}}, {{.*}}
+}
+void test11(volatile float *p, volatile float *q) {
+  *p = *q;
+  // CHECK-LABEL: @test11
+  // CHECK: load atomic volatile {{.*}} acquire
   // CHECK: store atomic volatile {{.*}}, {{.*}} release
 }
+int test12(struct qux *p) {
+  return p->f;
+  // CHECK-LABEL: @test12
+  // CHECK: load volatile {{.*}}
+}
diff --git a/test/CodeGen/neon-immediate-ubsan.c b/test/CodeGen/neon-immediate-ubsan.c
index 3fe4b003972b5..c3e1ce23301d3 100644
--- a/test/CodeGen/neon-immediate-ubsan.c
+++ b/test/CodeGen/neon-immediate-ubsan.c
@@ -1,9 +1,9 @@
-// RUN: %clang_cc1 -triple armv7s-linux-gnu -emit-llvm -O1 -o - %s \
+// RUN: %clang_cc1 -triple armv7s-linux-gnu -emit-llvm -o - %s \
 // RUN:     -target-feature +neon -target-cpu cortex-a8 \
 // RUN:     -fsanitize=signed-integer-overflow \
 // RUN:   | FileCheck %s --check-prefix=CHECK --check-prefix=ARMV7
 
-// RUN: %clang_cc1 -triple aarch64-unknown-unknown -emit-llvm -O1 -o - %s \
+// RUN: %clang_cc1 -triple aarch64-unknown-unknown -emit-llvm -o - %s \
 // RUN:     -target-feature +neon -target-cpu cortex-a53 \
 // RUN:     -fsanitize=signed-integer-overflow \
 // RUN:   | FileCheck %s --check-prefix=CHECK --check-prefix=AARCH64
diff --git a/test/CodeGen/nousejumptable.c b/test/CodeGen/nousejumptable.c
new file mode 100644
index 0000000000000..91ad5818570db
--- /dev/null
+++ b/test/CodeGen/nousejumptable.c
@@ -0,0 +1,8 @@
+// RUN: %clang_cc1 -S -fno-jump-tables %s -emit-llvm -o - | FileCheck %s
+
+// CHECK-LABEL: main
+// CHECK: attributes #0 = {{.*}}"no-jump-tables"="true"{{.*}}
+
+int main() {
+  return 0;
+}
diff --git a/test/CodeGen/nvptx-cpus.c b/test/CodeGen/nvptx-cpus.c
index 015f52946ef73..76c55c0edf63c 100644
--- a/test/CodeGen/nvptx-cpus.c
+++ b/test/CodeGen/nvptx-cpus.c
@@ -3,6 +3,9 @@
 // RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_30 -O3 -S -o %t %s -emit-llvm
 // RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_35 -O3 -S -o %t %s -emit-llvm
 // RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_37 -O3 -S -o %t %s -emit-llvm
+// RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_50 -O3 -S -o %t %s -emit-llvm
+// RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_52 -O3 -S -o %t %s -emit-llvm
+// RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_53 -O3 -S -o %t %s -emit-llvm
 
 // Make sure clang accepts all supported architectures.
 
diff --git a/test/CodeGen/object-size.c b/test/CodeGen/object-size.c
index 610e54150d808..6aee57375a48e 100644
--- a/test/CodeGen/object-size.c
+++ b/test/CodeGen/object-size.c
@@ -505,7 +505,7 @@ void test31() {
   // CHECK: call i64 @llvm.objectsize.i64.p0i8(i8* %{{.*}}, i1 false)
   gi = __builtin_object_size(ds1[9].snd, 1);
 
-  // CHECH: store i32 2
+  // CHECK: store i32 2
   gi = __builtin_object_size(&ss[9].snd[0], 1);
 
   // CHECK: call i64 @llvm.objectsize.i64.p0i8(i8* %{{.*}}, i1 false)
diff --git a/test/CodeGen/overloadable.c b/test/CodeGen/overloadable.c
index 4946c6d92866b..634820cfe743e 100644
--- a/test/CodeGen/overloadable.c
+++ b/test/CodeGen/overloadable.c
@@ -29,3 +29,33 @@ int main() {
   cdv = f(cdv);
   vv = f(vv);
 }
+
+// Ensuring that we pick the correct function for taking the address of an
+// overload when conversions are involved.
+
+void addrof_many(int *a) __attribute__((overloadable, enable_if(0, "")));
+void addrof_many(void *a) __attribute__((overloadable));
+void addrof_many(char *a) __attribute__((overloadable));
+
+void addrof_single(int *a) __attribute__((overloadable, enable_if(0, "")));
+void addrof_single(char *a) __attribute__((overloadable, enable_if(0, "")));
+void addrof_single(char *a) __attribute__((overloadable));
+
+// CHECK-LABEL: define void @foo
+void foo() {
+  // CHECK: store void (i8*)* @_Z11addrof_manyPc
+  void (*p1)(char *) = &addrof_many;
+  // CHECK: store void (i8*)* @_Z11addrof_manyPv
+  void (*p2)(void *) = &addrof_many;
+  // CHECK: void (i8*)* @_Z11addrof_manyPc
+  void *vp1 = (void (*)(char *)) & addrof_many;
+  // CHECK: void (i8*)* @_Z11addrof_manyPv
+  void *vp2 = (void (*)(void *)) & addrof_many;
+
+  // CHECK: store void (i8*)* @_Z13addrof_singlePc
+  void (*p3)(char *) = &addrof_single;
+  // CHECK: @_Z13addrof_singlePc
+  void (*p4)(int *) = &addrof_single;
+  // CHECK: @_Z13addrof_singlePc
+  void *vp3 = &addrof_single;
+}
diff --git a/test/CodeGen/packed-arrays.c b/test/CodeGen/packed-arrays.c
index bb742c6f311b4..a90766fae40e0 100644
--- a/test/CodeGen/packed-arrays.c
+++ b/test/CodeGen/packed-arrays.c
@@ -23,32 +23,32 @@ struct __attribute__((packed)) s3 {
   unsigned int z;
 };
 
-// CHECK: @align0 = global i32 1
+// CHECK: @align0 = local_unnamed_addr global i32 1
 int align0 = __alignof(struct s0);
-// CHECK: @align1 = global i32 4
+// CHECK: @align1 = local_unnamed_addr global i32 4
 int align1 = __alignof(struct s1);
-// CHECK: @align2 = global i32 1
+// CHECK: @align2 = local_unnamed_addr global i32 1
 int align2 = __alignof(struct s2);
-// CHECK: @align3 = global i32 1
+// CHECK: @align3 = local_unnamed_addr global i32 1
 int align3 = __alignof(struct s3);
 
-// CHECK: @align0_x = global i32 1
+// CHECK: @align0_x = local_unnamed_addr global i32 1
 int align0_x = __alignof(((struct s0*) 0)->x);
 //
-// CHECK: @align1_x = global i32 1
+// CHECK: @align1_x = local_unnamed_addr global i32 1
 int align1_x = __alignof(((struct s1*) 0)->x);
-// CHECK: @align2_x = global i32 1
+// CHECK: @align2_x = local_unnamed_addr global i32 1
 int align2_x = __alignof(((struct s2*) 0)->x);
-// CHECK: @align3_x = global i32 1
+// CHECK: @align3_x = local_unnamed_addr global i32 1
 int align3_x = __alignof(((struct s3*) 0)->x);
 
-// CHECK: @align0_x0 = global i32 4
+// CHECK: @align0_x0 = local_unnamed_addr global i32 4
 int align0_x0 = __alignof(((struct s0*) 0)->x[0]);
-// CHECK: @align1_x0 = global i32 4
+// CHECK: @align1_x0 = local_unnamed_addr global i32 4
 int align1_x0 = __alignof(((struct s1*) 0)->x[0]);
-// CHECK: @align2_x0 = global i32 4
+// CHECK: @align2_x0 = local_unnamed_addr global i32 4
 int align2_x0 = __alignof(((struct s2*) 0)->x[0]);
-// CHECK: @align3_x0 = global i32 4
+// CHECK: @align3_x0 = local_unnamed_addr global i32 4
 int align3_x0 = __alignof(((struct s3*) 0)->x[0]);
 
 // CHECK-LABEL: define i32 @f0_a
diff --git a/test/CodeGen/pass-object-size.c b/test/CodeGen/pass-object-size.c
index 1ad3f853ca6d5..6e2bc2090eda4 100644
--- a/test/CodeGen/pass-object-size.c
+++ b/test/CodeGen/pass-object-size.c
@@ -351,3 +351,18 @@ void test13() {
   ObjectSize0(++p);
   ObjectSize0(p++);
 }
+
+// There was a bug where variadic functions with pass_object_size would cause
+// problems in the form of failed assertions.
+void my_sprintf(char *const c __attribute__((pass_object_size(0))), ...) {}
+
+// CHECK-LABEL: define void @test14
+void test14(char *c) {
+  // CHECK: @llvm.objectsize
+  // CHECK: call void (i8*, i64, ...) @my_sprintf
+  my_sprintf(c);
+
+  // CHECK: @llvm.objectsize
+  // CHECK: call void (i8*, i64, ...) @my_sprintf
+  my_sprintf(c, 1, 2, 3);
+}
diff --git a/test/CodeGen/pgo-instrumentation.c b/test/CodeGen/pgo-instrumentation.c
new file mode 100644
index 0000000000000..1dac36fa589ec
--- /dev/null
+++ b/test/CodeGen/pgo-instrumentation.c
@@ -0,0 +1,20 @@
+// Test if PGO instrumentation and use pass are invoked.
+//
+// Ensure Pass PGOInstrumentationGenPass is invoked.
+// RUN: %clang_cc1 -O2 -fprofile-instrument=llvm %s -mllvm -debug-pass=Structure -emit-llvm -o - 2>&1 | FileCheck %s -check-prefix=CHECK-PGOGENPASS-INVOKED-INSTR-GEN
+// CHECK-PGOGENPASS-INVOKED-INSTR-GEN: PGOInstrumentationGenPass
+//
+// Ensure Pass PGOInstrumentationGenPass is not invoked.
+// RUN: %clang_cc1 -O2 -fprofile-instrument=clang %s -mllvm -debug-pass=Structure -emit-llvm -o - 2>&1 | FileCheck %s -check-prefix=CHECK-PGOGENPASS-INVOKED-INSTR-GEN-CLANG
+// CHECK-PGOGENPASS-INVOKED-INSTR-GEN-CLANG-NOT: PGOInstrumentationGenPass
+
+// Ensure Pass PGOInstrumentationUsePass is invoked.
+// RUN: llvm-profdata merge -o %t.profdata %S/Inputs/pgotestir.profraw
+// RUN: %clang_cc1 -O2 -fprofile-instrument-use-path=%t.profdata %s -mllvm -debug-pass=Structure -emit-llvm -o - 2>&1 | FileCheck %s -check-prefix=CHECK-PGOUSEPASS-INVOKED-INSTR-USE
+// CHECK-PGOUSEPASS-INVOKED-INSTR-USE: PGOInstrumentationUsePass
+//
+// Ensure Pass PGOInstrumentationUsePass is not invoked.
+// RUN: llvm-profdata merge -o %t.profdata %S/Inputs/pgotestclang.profraw
+// RUN: %clang_cc1 -O2 -fprofile-instrument-use-path=%t.profdata %s -mllvm -debug-pass=Structure -emit-llvm -o - 2>&1 | FileCheck %s -check-prefix=CHECK-PGOUSEPASS-INVOKED-USE-CLANG
+// CHECK-PGOUSEPASS-INVOKED-USE-CLANG-NOT: PGOInstrumentationUsePass
+
diff --git a/test/CodeGen/pgo-sample-preparation.c b/test/CodeGen/pgo-sample-preparation.c
new file mode 100644
index 0000000000000..c0a3cb4bc5944
--- /dev/null
+++ b/test/CodeGen/pgo-sample-preparation.c
@@ -0,0 +1,16 @@
+// Test if PGO sample use preparation passes are executed correctly.
+//
+// Ensure that instcombine is executed after simplifycfg and sroa so that
+// "a < 255" will not be converted to a * 256 < 255 * 256.
+// RUN: %clang_cc1 -O2 -fprofile-sample-use=%S/Inputs/pgo-sample.prof %s -emit-llvm -o - 2>&1 | FileCheck %s
+
+void bar(int);
+void foo(int x, int y, int z) {
+  int m;
+  for (m = 0; m < x ; m++) {
+    int a = (((y >> 8) & 0xff) * z) / 256;
+    bar(a < 255 ? a : 255);
+  }
+}
+
+// CHECK-NOT: icmp slt i32 %mul, 65280
diff --git a/test/CodeGen/pgo-sample.c b/test/CodeGen/pgo-sample.c
new file mode 100644
index 0000000000000..c955edfab7a42
--- /dev/null
+++ b/test/CodeGen/pgo-sample.c
@@ -0,0 +1,9 @@
+// Test if PGO sample use passes are invoked.
+//
+// Ensure Pass PGOInstrumentationGenPass is invoked.
+// RUN: %clang_cc1 -O2 -fprofile-sample-use=%S/Inputs/pgo-sample.prof %s -mllvm -debug-pass=Structure -emit-llvm -o - 2>&1 | FileCheck %s
+// CHECK: Simplify the CFG
+// CHECK: SROA
+// CHECK: Combine redundant instructions
+// CHECK: Remove unused exception handling info
+// CHECK: Sample profile pass
diff --git a/test/CodeGen/ppc64le-aggregates.c b/test/CodeGen/ppc64le-aggregates.c
index 3ad4b06c688a3..04d2fb4766ea2 100644
--- a/test/CodeGen/ppc64le-aggregates.c
+++ b/test/CodeGen/ppc64le-aggregates.c
@@ -255,84 +255,84 @@ struct v3f9 { float3 v[9]; };
 struct v3fab { float3 a; float3 b; };
 struct v3fabc { float3 a; float3 b; float3 c; };
 
-// CHECK: define [1 x <3 x float>] @func_v3f1(<3 x float> inreg %x.coerce)
+// CHECK: define [1 x <4 x float>] @func_v3f1(<3 x float> inreg %x.coerce)
 struct v3f1 func_v3f1(struct v3f1 x) { return x; }
 
-// CHECK: define [2 x <3 x float>] @func_v3f2([2 x <3 x float>] %x.coerce)
+// CHECK: define [2 x <4 x float>] @func_v3f2([2 x <4 x float>] %x.coerce)
 struct v3f2 func_v3f2(struct v3f2 x) { return x; }
 
-// CHECK: define [3 x <3 x float>] @func_v3f3([3 x <3 x float>] %x.coerce)
+// CHECK: define [3 x <4 x float>] @func_v3f3([3 x <4 x float>] %x.coerce)
 struct v3f3 func_v3f3(struct v3f3 x) { return x; }
 
-// CHECK: define [4 x <3 x float>] @func_v3f4([4 x <3 x float>] %x.coerce)
+// CHECK: define [4 x <4 x float>] @func_v3f4([4 x <4 x float>] %x.coerce)
 struct v3f4 func_v3f4(struct v3f4 x) { return x; }
 
-// CHECK: define [5 x <3 x float>] @func_v3f5([5 x <3 x float>] %x.coerce)
+// CHECK: define [5 x <4 x float>] @func_v3f5([5 x <4 x float>] %x.coerce)
 struct v3f5 func_v3f5(struct v3f5 x) { return x; }
 
-// CHECK: define [6 x <3 x float>] @func_v3f6([6 x <3 x float>] %x.coerce)
+// CHECK: define [6 x <4 x float>] @func_v3f6([6 x <4 x float>] %x.coerce)
 struct v3f6 func_v3f6(struct v3f6 x) { return x; }
 
-// CHECK: define [7 x <3 x float>] @func_v3f7([7 x <3 x float>] %x.coerce)
+// CHECK: define [7 x <4 x float>] @func_v3f7([7 x <4 x float>] %x.coerce)
 struct v3f7 func_v3f7(struct v3f7 x) { return x; }
 
-// CHECK: define [8 x <3 x float>] @func_v3f8([8 x <3 x float>] %x.coerce)
+// CHECK: define [8 x <4 x float>] @func_v3f8([8 x <4 x float>] %x.coerce)
 struct v3f8 func_v3f8(struct v3f8 x) { return x; }
 
 // CHECK: define void @func_v3f9(%struct.v3f9* noalias sret %agg.result, %struct.v3f9* byval align 16 %x)
 struct v3f9 func_v3f9(struct v3f9 x) { return x; }
 
-// CHECK: define [2 x <3 x float>] @func_v3fab([2 x <3 x float>] %x.coerce)
+// CHECK: define [2 x <4 x float>] @func_v3fab([2 x <4 x float>] %x.coerce)
 struct v3fab func_v3fab(struct v3fab x) { return x; }
 
-// CHECK: define [3 x <3 x float>] @func_v3fabc([3 x <3 x float>] %x.coerce)
+// CHECK: define [3 x <4 x float>] @func_v3fabc([3 x <4 x float>] %x.coerce)
 struct v3fabc func_v3fabc(struct v3fabc x) { return x; }
 
 // CHECK-LABEL: @call_v3f1
 // CHECK: %[[TMP:[^ ]+]] = load <3 x float>, <3 x float>* getelementptr inbounds (%struct.v3f1, %struct.v3f1* @global_v3f1, i32 0, i32 0, i32 0), align 1
-// CHECK: call [1 x <3 x float>] @func_v3f1(<3 x float> inreg %[[TMP]])
+// CHECK: call [1 x <4 x float>] @func_v3f1(<3 x float> inreg %[[TMP]])
 struct v3f1 global_v3f1;
 void call_v3f1(void) { global_v3f1 = func_v3f1(global_v3f1); }
 
 // CHECK-LABEL: @call_v3f2
-// CHECK: %[[TMP:[^ ]+]] = load [2 x <3 x float>], [2 x <3 x float>]* getelementptr inbounds (%struct.v3f2, %struct.v3f2* @global_v3f2, i32 0, i32 0), align 1
-// CHECK: call [2 x <3 x float>] @func_v3f2([2 x <3 x float>] %[[TMP]])
+// CHECK: %[[TMP:[^ ]+]] = load [2 x <4 x float>], [2 x <4 x float>]* bitcast (%struct.v3f2* @global_v3f2 to [2 x <4 x float>]*), align 16
+// CHECK: call [2 x <4 x float>] @func_v3f2([2 x <4 x float>] %[[TMP]])
 struct v3f2 global_v3f2;
 void call_v3f2(void) { global_v3f2 = func_v3f2(global_v3f2); }
 
 // CHECK-LABEL: @call_v3f3
-// CHECK: %[[TMP:[^ ]+]] = load [3 x <3 x float>], [3 x <3 x float>]* getelementptr inbounds (%struct.v3f3, %struct.v3f3* @global_v3f3, i32 0, i32 0), align 1
-// CHECK: call [3 x <3 x float>] @func_v3f3([3 x <3 x float>] %[[TMP]])
+// CHECK: %[[TMP:[^ ]+]] = load [3 x <4 x float>], [3 x <4 x float>]* bitcast (%struct.v3f3* @global_v3f3 to [3 x <4 x float>]*), align 16
+// CHECK: call [3 x <4 x float>] @func_v3f3([3 x <4 x float>] %[[TMP]])
 struct v3f3 global_v3f3;
 void call_v3f3(void) { global_v3f3 = func_v3f3(global_v3f3); }
 
 // CHECK-LABEL: @call_v3f4
-// CHECK: %[[TMP:[^ ]+]] = load [4 x <3 x float>], [4 x <3 x float>]* getelementptr inbounds (%struct.v3f4, %struct.v3f4* @global_v3f4, i32 0, i32 0), align 1
-// CHECK: call [4 x <3 x float>] @func_v3f4([4 x <3 x float>] %[[TMP]])
+// CHECK: %[[TMP:[^ ]+]] = load [4 x <4 x float>], [4 x <4 x float>]* bitcast (%struct.v3f4* @global_v3f4 to [4 x <4 x float>]*), align 16
+// CHECK: call [4 x <4 x float>] @func_v3f4([4 x <4 x float>] %[[TMP]])
 struct v3f4 global_v3f4;
 void call_v3f4(void) { global_v3f4 = func_v3f4(global_v3f4); }
 
 // CHECK-LABEL: @call_v3f5
-// CHECK: %[[TMP:[^ ]+]] = load [5 x <3 x float>], [5 x <3 x float>]* getelementptr inbounds (%struct.v3f5, %struct.v3f5* @global_v3f5, i32 0, i32 0), align 1
-// CHECK: call [5 x <3 x float>] @func_v3f5([5 x <3 x float>] %[[TMP]])
+// CHECK: %[[TMP:[^ ]+]] = load [5 x <4 x float>], [5 x <4 x float>]* bitcast (%struct.v3f5* @global_v3f5 to [5 x <4 x float>]*), align 16
+// CHECK: call [5 x <4 x float>] @func_v3f5([5 x <4 x float>] %[[TMP]])
 struct v3f5 global_v3f5;
 void call_v3f5(void) { global_v3f5 = func_v3f5(global_v3f5); }
 
 // CHECK-LABEL: @call_v3f6
-// CHECK: %[[TMP:[^ ]+]] = load [6 x <3 x float>], [6 x <3 x float>]* getelementptr inbounds (%struct.v3f6, %struct.v3f6* @global_v3f6, i32 0, i32 0), align 1
-// CHECK: call [6 x <3 x float>] @func_v3f6([6 x <3 x float>] %[[TMP]])
+// CHECK: %[[TMP:[^ ]+]] = load [6 x <4 x float>], [6 x <4 x float>]* bitcast (%struct.v3f6* @global_v3f6 to [6 x <4 x float>]*), align 16
+// CHECK: call [6 x <4 x float>] @func_v3f6([6 x <4 x float>] %[[TMP]])
 struct v3f6 global_v3f6;
 void call_v3f6(void) { global_v3f6 = func_v3f6(global_v3f6); }
 
 // CHECK-LABEL: @call_v3f7
-// CHECK: %[[TMP:[^ ]+]] = load [7 x <3 x float>], [7 x <3 x float>]* getelementptr inbounds (%struct.v3f7, %struct.v3f7* @global_v3f7, i32 0, i32 0), align 1
-// CHECK: call [7 x <3 x float>] @func_v3f7([7 x <3 x float>] %[[TMP]])
+// CHECK: %[[TMP:[^ ]+]] = load [7 x <4 x float>], [7 x <4 x float>]* bitcast (%struct.v3f7* @global_v3f7 to [7 x <4 x float>]*), align 16
+// CHECK: call [7 x <4 x float>] @func_v3f7([7 x <4 x float>] %[[TMP]])
 struct v3f7 global_v3f7;
 void call_v3f7(void) { global_v3f7 = func_v3f7(global_v3f7); }
 
 // CHECK-LABEL: @call_v3f8
-// CHECK: %[[TMP:[^ ]+]] = load [8 x <3 x float>], [8 x <3 x float>]* getelementptr inbounds (%struct.v3f8, %struct.v3f8* @global_v3f8, i32 0, i32 0), align 1
-// CHECK: call [8 x <3 x float>] @func_v3f8([8 x <3 x float>] %[[TMP]])
+// CHECK: %[[TMP:[^ ]+]] = load [8 x <4 x float>], [8 x <4 x float>]* bitcast (%struct.v3f8* @global_v3f8 to [8 x <4 x float>]*), align 16
+// CHECK: call [8 x <4 x float>] @func_v3f8([8 x <4 x float>] %[[TMP]])
 struct v3f8 global_v3f8;
 void call_v3f8(void) { global_v3f8 = func_v3f8(global_v3f8); }
 
@@ -342,14 +342,14 @@ struct v3f9 global_v3f9;
 void call_v3f9(void) { global_v3f9 = func_v3f9(global_v3f9); }
 
 // CHECK-LABEL: @call_v3fab
-// CHECK: %[[TMP:[^ ]+]] = load [2 x <3 x float>], [2 x <3 x float>]* bitcast (%struct.v3fab* @global_v3fab to [2 x <3 x float>]*)
-// CHECK: call [2 x <3 x float>] @func_v3fab([2 x <3 x float>] %[[TMP]])
+// CHECK: %[[TMP:[^ ]+]] = load [2 x <4 x float>], [2 x <4 x float>]* bitcast (%struct.v3fab* @global_v3fab to [2 x <4 x float>]*), align 16
+// CHECK: call [2 x <4 x float>] @func_v3fab([2 x <4 x float>] %[[TMP]])
 struct v3fab global_v3fab;
 void call_v3fab(void) { global_v3fab = func_v3fab(global_v3fab); }
 
 // CHECK-LABEL: @call_v3fabc
-// CHECK: %[[TMP:[^ ]+]] = load [3 x <3 x float>], [3 x <3 x float>]* bitcast (%struct.v3fabc* @global_v3fabc to [3 x <3 x float>]*)
-// CHECK: call [3 x <3 x float>] @func_v3fabc([3 x <3 x float>] %[[TMP]])
+// CHECK: %[[TMP:[^ ]+]] = load [3 x <4 x float>], [3 x <4 x float>]* bitcast (%struct.v3fabc* @global_v3fabc to [3 x <4 x float>]*), align 16
+// CHECK: call [3 x <4 x float>] @func_v3fabc([3 x <4 x float>] %[[TMP]])
 struct v3fabc global_v3fabc;
 void call_v3fabc(void) { global_v3fabc = func_v3fabc(global_v3fabc); }
 
diff --git a/test/CodeGen/pr18235.c b/test/CodeGen/pr18235.c
index d3f12ee563318..49241c8229238 100644
--- a/test/CodeGen/pr18235.c
+++ b/test/CodeGen/pr18235.c
@@ -1,3 +1,3 @@
 // RUN: not %clang_cc1 -triple le32-unknown-nacl %s -S -o - 2>&1 | FileCheck %s
 
-// CHECK: error: unable to create target: 'No available targets are compatible with this triple, see -version for the available targets.'
+// CHECK: error: unable to create target: 'No available targets are compatible with this triple.
diff --git a/test/CodeGen/pr25786.c b/test/CodeGen/pr25786.c
new file mode 100644
index 0000000000000..612da7e8a3690
--- /dev/null
+++ b/test/CodeGen/pr25786.c
@@ -0,0 +1,11 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-OK
+
+void (__attribute__((regparm(3), stdcall)) *pf) ();
+void (__attribute__((regparm(2), stdcall)) foo)(int a) {
+}
+// CHECK: @pf = common global void (...)* null
+// CHECK: define void @foo(i32 %a)
+
+// CHECK-OK: @pf = common global void (...)* null
+// CHECK-OK: define x86_stdcallcc void @foo(i32 inreg %a)
diff --git a/test/CodeGen/pr27892.c b/test/CodeGen/pr27892.c
new file mode 100644
index 0000000000000..694ce9eb0a529
--- /dev/null
+++ b/test/CodeGen/pr27892.c
@@ -0,0 +1,23 @@
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -fms-extensions %s -emit-llvm -o - | FileCheck %s
+
+long test1(long *p) {
+  return _InterlockedIncrement(p);
+}
+// CHECK-DAG: define i64 @test1(
+// CHECK:   %[[p_addr:.*]] = alloca i64*, align 8
+// CHECK:   store i64* %p, i64** %[[p_addr]], align 8
+// CHECK:   %[[p_load:.*]] = load i64*, i64** %[[p_addr]], align 8
+// CHECK:   %[[atomic_add:.*]] = atomicrmw volatile add i64* %[[p_load]], i64 1 seq_cst
+// CHECK:   %[[res:.*]] = add i64 %[[atomic_add]], 1
+// CHECK:   ret i64 %[[res]]
+
+long test2(long *p) {
+  return _InterlockedDecrement(p);
+}
+// CHECK-DAG: define i64 @test2(
+// CHECK:   %[[p_addr:.*]] = alloca i64*, align 8
+// CHECK:   store i64* %p, i64** %[[p_addr]], align 8
+// CHECK:   %[[p_load:.*]] = load i64*, i64** %[[p_addr]], align 8
+// CHECK:   %[[atomic_sub:.*]] = atomicrmw volatile sub i64* %[[p_load]], i64 1 seq_cst
+// CHECK:   %[[res:.*]] = sub i64 %[[atomic_sub]], 1
+// CHECK:   ret i64 %[[res]]
diff --git a/test/CodeGen/pragma-comment.c b/test/CodeGen/pragma-comment.c
index 6da20686e9571..71a7dfc0b0181 100644
--- a/test/CodeGen/pragma-comment.c
+++ b/test/CodeGen/pragma-comment.c
@@ -1,5 +1,7 @@
 // RUN: %clang_cc1 %s -triple i686-pc-win32 -fms-extensions -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 %s -triple thumbv7-windows -fms-extensions -emit-llvm -o - | FileCheck %s
 // RUN: %clang_cc1 %s -triple x86_64-pc-win32 -fms-extensions -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 %s -triple thumbv7-linux-gnueabihf -fms-extensions -emit-llvm -o - | FileCheck -check-prefix LINUX %s
 // RUN: %clang_cc1 %s -triple i686-pc-linux -fms-extensions -emit-llvm -o - | FileCheck -check-prefix LINUX %s
 // RUN: %clang_cc1 %s -triple x86_64-scei-ps4 -fms-extensions -emit-llvm -o - | FileCheck -check-prefix PS4 %s
 
diff --git a/test/CodeGen/pragma-detect_mismatch.c b/test/CodeGen/pragma-detect_mismatch.c
index c5f3af340aae3..08259fc6be715 100644
--- a/test/CodeGen/pragma-detect_mismatch.c
+++ b/test/CodeGen/pragma-detect_mismatch.c
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 %s -triple i686-pc-win32 -fms-extensions -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 %s -triple thumbv7-windows -fms-extensions -emit-llvm -o - | FileCheck %s
 
 #pragma detect_mismatch("test", "1")
 
diff --git a/test/CodeGen/preserve-call-conv.c b/test/CodeGen/preserve-call-conv.c
new file mode 100644
index 0000000000000..6e91a8489b405
--- /dev/null
+++ b/test/CodeGen/preserve-call-conv.c
@@ -0,0 +1,17 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm < %s | FileCheck %s
+// RUN: %clang_cc1 -triple arm64-unknown-unknown -emit-llvm < %s | FileCheck %s
+
+// Check that the preserve_most calling convention attribute at the source level
+// is lowered to the corresponding calling convention attrribute at the LLVM IR
+// level.
+void foo() __attribute__((preserve_most)) {
+  // CHECK-LABEL: define preserve_mostcc void @foo()
+}
+
+// Check that the preserve_most calling convention attribute at the source level
+// is lowered to the corresponding calling convention attrribute at the LLVM IR
+// level.
+void boo() __attribute__((preserve_all)) {
+  // CHECK-LABEL: define preserve_allcc void @boo()
+}
+
diff --git a/test/CodeGen/rd-builtins.c b/test/CodeGen/rd-builtins.c
new file mode 100644
index 0000000000000..5cad903909419
--- /dev/null
+++ b/test/CodeGen/rd-builtins.c
@@ -0,0 +1,18 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -o - %s | FileCheck %s
+
+// Don't include mm_malloc.h, it's system specific.
+#define __MM_MALLOC_H
+
+#include <x86intrin.h>
+
+unsigned long long test_rdpmc(int a) {
+  return _rdpmc(a);
+// CHECK: @test_rdpmc
+// CHECK: call i64 @llvm.x86.rdpmc
+}
+
+int test_rdtsc() {
+  return _rdtsc();
+// CHECK: @test_rdtsc
+// CHECK: call i64 @llvm.x86.rdtsc
+}
diff --git a/test/CodeGen/relax.c b/test/CodeGen/relax.c
new file mode 100644
index 0000000000000..07b7589be9753
--- /dev/null
+++ b/test/CodeGen/relax.c
@@ -0,0 +1,10 @@
+// REQUIRES: x86-registered-target
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -emit-obj --mrelax-relocations %s -mrelocation-model pic -o %t
+// RUN: llvm-readobj -r %t | FileCheck  %s
+
+// CHECK: R_X86_64_REX_GOTPCRELX foo
+
+extern int foo;
+int *f(void) {
+  return &foo;
+}
diff --git a/test/CodeGen/renderscript.c b/test/CodeGen/renderscript.c
new file mode 100644
index 0000000000000..d47750a7736e4
--- /dev/null
+++ b/test/CodeGen/renderscript.c
@@ -0,0 +1,25 @@
+// RUN: %clang_cc1 %s -triple=renderscript32-none-linux-gnueabi -emit-llvm -o - -Werror | FileCheck %s -check-prefix=CHECK-RS32
+// RUN: %clang_cc1 %s -triple=renderscript64-none-linux-android -emit-llvm -o - -Werror | FileCheck %s -check-prefix=CHECK-RS64
+// RUN: %clang_cc1 %s -triple=armv7-none-linux-gnueabi -emit-llvm -o - -Werror | FileCheck %s -check-prefix=CHECK-ARM
+
+// Ensure that the bitcode has the correct triple
+// CHECK-RS32: target triple = "armv7-none-linux-gnueabi"
+// CHECK-RS64: target triple = "aarch64-none-linux-android"
+// CHECK-ARM: target triple = "armv7-none-linux-gnueabi"
+
+// Ensure that long data type has 8-byte size and alignment in RenderScript
+#ifdef __RENDERSCRIPT__
+#define LONG_WIDTH_AND_ALIGN 8
+#else
+#define LONG_WIDTH_AND_ALIGN 4
+#endif
+
+_Static_assert(sizeof(long) == LONG_WIDTH_AND_ALIGN, "sizeof long is wrong");
+_Static_assert(_Alignof(long) == LONG_WIDTH_AND_ALIGN, "sizeof long is wrong");
+
+// CHECK-RS32: i64 @test_long(i64 %v)
+// CHECK-RS64: i64 @test_long(i64 %v)
+// CHECK-ARM: i32 @test_long(i32 %v)
+long test_long(long v) {
+  return v + 1;
+}
diff --git a/test/CodeGen/sparc-vaarg.c b/test/CodeGen/sparc-vaarg.c
new file mode 100644
index 0000000000000..3e4dd7c2c3f23
--- /dev/null
+++ b/test/CodeGen/sparc-vaarg.c
@@ -0,0 +1,35 @@
+// RUN: %clang_cc1 -triple sparc -emit-llvm -o - %s | FileCheck %s
+#include <stdarg.h>
+
+// CHECK-LABEL: define i32 @get_int
+// CHECK: [[RESULT:%[a-z_0-9]+]] = va_arg {{.*}}, i32{{$}}
+// CHECK: store i32 [[RESULT]], i32* [[LOC:%[a-z_0-9]+]]
+// CHECK: [[RESULT2:%[a-z_0-9]+]] = load i32, i32* [[LOC]]
+// CHECK: ret i32 [[RESULT2]]
+int get_int(va_list *args) {
+  return va_arg(*args, int);
+}
+
+struct Foo {
+  int x;
+};
+
+struct Foo dest;
+
+// CHECK-LABEL: define void @get_struct
+// CHECK: [[RESULT:%[a-z_0-9]+]] = va_arg {{.*}}, %struct.Foo*{{$}}
+// CHECK: [[RESULT2:%[a-z_0-9]+]] = bitcast {{.*}} [[RESULT]] to i8*
+// CHECK: call void @llvm.memcpy{{.*}}@dest{{.*}}, i8* [[RESULT2]]
+void get_struct(va_list *args) {
+ dest = va_arg(*args, struct Foo);
+}
+
+enum E { Foo_one = 1 };
+
+enum E enum_dest;
+
+// CHECK-LABEL: define void @get_enum
+// CHECK: va_arg i8** {{.*}}, i32
+void get_enum(va_list *args) {
+  enum_dest = va_arg(*args, enum E);
+}
diff --git a/test/CodeGen/sparcv8-abi.c b/test/CodeGen/sparcv8-abi.c
new file mode 100644
index 0000000000000..cd8832f6534b8
--- /dev/null
+++ b/test/CodeGen/sparcv8-abi.c
@@ -0,0 +1,19 @@
+// RUN: %clang_cc1 -triple sparc-unknown-unknown -emit-llvm %s -o - | FileCheck %s
+
+// CHECK-LABEL: define { float, float } @p({ float, float }* byval align 4 %a, { float, float }* byval align 4 %b) #0 {
+float __complex__
+p (float __complex__  a, float __complex__  b)
+{
+}
+
+// CHECK-LABEL: define { double, double } @q({ double, double }* byval align 8 %a, { double, double }* byval align 8 %b) #0 {
+double __complex__
+q (double __complex__  a, double __complex__  b)
+{
+}
+
+// CHECK-LABEL: define { i64, i64 } @r({ i64, i64 }* byval align 8 %a, { i64, i64 }* byval align 8 %b) #0 {
+long long __complex__
+r (long long __complex__  a, long long __complex__  b)
+{
+}
diff --git a/test/CodeGen/sret.c b/test/CodeGen/sret.c
index 5f0d07404a169..201e3b87f8b94 100644
--- a/test/CodeGen/sret.c
+++ b/test/CodeGen/sret.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 %s -emit-llvm -o - | grep sret | count 5
+// RUN: %clang_cc1 %s -emit-llvm -o - | grep sret | grep -v 'sret.c' | count 4
 
 struct abc {
  long a;
diff --git a/test/CodeGen/sret2.c b/test/CodeGen/sret2.c
index d103d87610fd8..0e254ed914a48 100644
--- a/test/CodeGen/sret2.c
+++ b/test/CodeGen/sret2.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 %s -emit-llvm -o - | grep sret | count 2
+// RUN: %clang_cc1 %s -emit-llvm -o - | grep sret | grep -v 'sret2.c' | count 1
 
 struct abc {
  long a;
diff --git a/test/CodeGen/sse-builtins.c b/test/CodeGen/sse-builtins.c
index 0f964e8055997..a6c5c1a0a166b 100644
--- a/test/CodeGen/sse-builtins.c
+++ b/test/CodeGen/sse-builtins.c
@@ -1,187 +1,216 @@
-// RUN: %clang_cc1 -ffreestanding -triple x86_64-apple-macosx10.8.0 -target-feature +sse4.1 -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +sse -emit-llvm -o - -Werror | FileCheck %s
 
-#include <xmmintrin.h>
-#include <emmintrin.h>
-#include <smmintrin.h>
+// Don't include mm_malloc.h, it's system specific.
+#define __MM_MALLOC_H
 
-__m128 test_rsqrt_ss(__m128 x) {
-  // CHECK: define {{.*}} @test_rsqrt_ss
-  // CHECK: call <4 x float> @llvm.x86.sse.rsqrt.ss
-  // CHECK: extractelement <4 x float> {{.*}}, i32 0
-  // CHECK: extractelement <4 x float> {{.*}}, i32 1
-  // CHECK: extractelement <4 x float> {{.*}}, i32 2
-  // CHECK: extractelement <4 x float> {{.*}}, i32 3
-  return _mm_rsqrt_ss(x);
+#include <x86intrin.h>
+
+// NOTE: This should match the tests in llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
+
+__m128 test_mm_add_ps(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_add_ps
+  // CHECK: fadd <4 x float>
+  return _mm_add_ps(A, B);
 }
 
-__m128 test_rcp_ss(__m128 x) {
-  // CHECK: define {{.*}} @test_rcp_ss
-  // CHECK: call <4 x float> @llvm.x86.sse.rcp.ss
-  // CHECK: extractelement <4 x float> {{.*}}, i32 0
-  // CHECK: extractelement <4 x float> {{.*}}, i32 1
-  // CHECK: extractelement <4 x float> {{.*}}, i32 2
-  // CHECK: extractelement <4 x float> {{.*}}, i32 3
-  return _mm_rcp_ss(x);
+__m128 test_mm_add_ss(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_add_ss
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: fadd float
+  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
+  return _mm_add_ss(A, B);
 }
 
-__m128 test_sqrt_ss(__m128 x) {
-  // CHECK: define {{.*}} @test_sqrt_ss
-  // CHECK: call <4 x float> @llvm.x86.sse.sqrt.ss
-  // CHECK: extractelement <4 x float> {{.*}}, i32 0
-  // CHECK: extractelement <4 x float> {{.*}}, i32 1
-  // CHECK: extractelement <4 x float> {{.*}}, i32 2
-  // CHECK: extractelement <4 x float> {{.*}}, i32 3
-  return _mm_sqrt_ss(x);
+__m128 test_mm_and_ps(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_and_ps
+  // CHECK: and <4 x i32>
+  return _mm_and_ps(A, B);
 }
 
-__m128 test_loadl_pi(__m128 x, void* y) {
-  // CHECK: define {{.*}} @test_loadl_pi
-  // CHECK: load <2 x float>, <2 x float>* {{.*}}, align 1{{$}}
-  // CHECK: shufflevector {{.*}} <4 x i32> <i32 0, i32 1
-  // CHECK: shufflevector {{.*}} <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-  return _mm_loadl_pi(x,y);
+__m128 test_mm_andnot_ps(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_andnot_ps
+  // CHECK: xor <4 x i32> %{{.*}}, <i32 -1, i32 -1, i32 -1, i32 -1>
+  // CHECK: and <4 x i32>
+  return _mm_andnot_ps(A, B);
 }
 
-__m128 test_loadh_pi(__m128 x, void* y) {
-  // CHECK: define {{.*}} @test_loadh_pi
-  // CHECK: load <2 x float>, <2 x float>* {{.*}}, align 1{{$}}
-  // CHECK: shufflevector {{.*}} <4 x i32> <i32 0, i32 1
-  // CHECK: shufflevector {{.*}} <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-  return _mm_loadh_pi(x,y);
+__m128 test_mm_cmpeq_ps(__m128 __a, __m128 __b) {
+  // CHECK-LABEL: @test_mm_cmpeq_ps
+  // CHECK:         [[CMP:%.*]] = fcmp oeq <4 x float>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
+  // CHECK-NEXT:    ret <4 x float> [[BC]]
+  return _mm_cmpeq_ps(__a, __b);
 }
 
-__m128 test_load_ss(void* y) {
-  // CHECK: define {{.*}} @test_load_ss
-  // CHECK: load float, float* {{.*}}, align 1{{$}}
-  return _mm_load_ss(y);
+__m128 test_mm_cmpeq_ss(__m128 __a, __m128 __b) {
+  // CHECK-LABEL: @test_mm_cmpeq_ss
+  // CHECK: @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 0)
+  return _mm_cmpeq_ss(__a, __b);
 }
 
-__m128 test_load1_ps(void* y) {
-  // CHECK: define {{.*}} @test_load1_ps
-  // CHECK: load float, float* {{.*}}, align 1{{$}}
-  return _mm_load1_ps(y);
+__m128 test_mm_cmpge_ps(__m128 __a, __m128 __b) {
+  // CHECK-LABEL: @test_mm_cmpge_ps
+  // CHECK:         [[CMP:%.*]] = fcmp ole <4 x float>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
+  // CHECK-NEXT:    ret <4 x float> [[BC]]
+  return _mm_cmpge_ps(__a, __b);
 }
 
-void test_store_ss(__m128 x, void* y) {
-  // CHECK-LABEL: define void @test_store_ss
-  // CHECK: store {{.*}} float* {{.*}}, align 1{{$}}
-  _mm_store_ss(y, x);
+__m128 test_mm_cmpge_ss(__m128 __a, __m128 __b) {
+  // CHECK-LABEL: @test_mm_cmpge_ss
+  // CHECK: @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 2)
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+  return _mm_cmpge_ss(__a, __b);
 }
 
-__m128d test_load1_pd(__m128 x, void* y) {
-  // CHECK: define {{.*}} @test_load1_pd
-  // CHECK: load double, double* {{.*}}, align 1{{$}}
-  return _mm_load1_pd(y);
+__m128 test_mm_cmpgt_ps(__m128 __a, __m128 __b) {
+  // CHECK-LABEL: @test_mm_cmpgt_ps
+  // CHECK:         [[CMP:%.*]] = fcmp olt <4 x float>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
+  // CHECK-NEXT:    ret <4 x float> [[BC]]
+  return _mm_cmpgt_ps(__a, __b);
 }
 
-__m128d test_loadr_pd(__m128 x, void* y) {
-  // CHECK: define {{.*}} @test_loadr_pd
-  // CHECK: load <2 x double>, <2 x double>* {{.*}}, align 16{{$}}
-  return _mm_loadr_pd(y);
+__m128 test_mm_cmpgt_ss(__m128 __a, __m128 __b) {
+  // CHECK-LABEL: @test_mm_cmpgt_ss
+  // CHECK: @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 1)
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+  return _mm_cmpgt_ss(__a, __b);
 }
 
-__m128d test_load_sd(void* y) {
-  // CHECK: define {{.*}} @test_load_sd
-  // CHECK: load double, double* {{.*}}, align 1{{$}}
-  return _mm_load_sd(y);
+__m128 test_mm_cmple_ps(__m128 __a, __m128 __b) {
+  // CHECK-LABEL: @test_mm_cmple_ps
+  // CHECK:         [[CMP:%.*]] = fcmp ole <4 x float>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
+  // CHECK-NEXT:    ret <4 x float> [[BC]]
+  return _mm_cmple_ps(__a, __b);
 }
 
-__m128d test_loadh_pd(__m128d x, void* y) {
-  // CHECK: define {{.*}} @test_loadh_pd
-  // CHECK: load double, double* {{.*}}, align 1{{$}}
-  return _mm_loadh_pd(x, y);
+__m128 test_mm_cmple_ss(__m128 __a, __m128 __b) {
+  // CHECK-LABEL: @test_mm_cmple_ss
+  // CHECK: @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 2)
+  return _mm_cmple_ss(__a, __b);
 }
 
-__m128d test_loadl_pd(__m128d x, void* y) {
-  // CHECK: define {{.*}} @test_loadl_pd
-  // CHECK: load double, double* {{.*}}, align 1{{$}}
-  return _mm_loadl_pd(x, y);
+__m128 test_mm_cmplt_ps(__m128 __a, __m128 __b) {
+  // CHECK-LABEL: @test_mm_cmplt_ps
+  // CHECK:         [[CMP:%.*]] = fcmp olt <4 x float>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
+  // CHECK-NEXT:    ret <4 x float> [[BC]]
+  return _mm_cmplt_ps(__a, __b);
 }
 
-void test_store_sd(__m128d x, void* y) {
-  // CHECK-LABEL: define void @test_store_sd
-  // CHECK: store {{.*}} double* {{.*}}, align 1{{$}}
-  _mm_store_sd(y, x);
+__m128 test_mm_cmplt_ss(__m128 __a, __m128 __b) {
+  // CHECK-LABEL: @test_mm_cmplt_ss
+  // CHECK: @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 1)
+  return _mm_cmplt_ss(__a, __b);
 }
 
-void test_store1_pd(__m128d x, void* y) {
-  // CHECK-LABEL: define void @test_store1_pd
-  // CHECK: store {{.*}} double* {{.*}}, align 1{{$}}
-  // CHECK: store {{.*}} double* {{.*}}, align 1{{$}}
-  _mm_store1_pd(y, x);
+__m128 test_mm_cmpneq_ps(__m128 __a, __m128 __b) {
+  // CHECK-LABEL: @test_mm_cmpneq_ps
+  // CHECK:         [[CMP:%.*]] = fcmp une <4 x float>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
+  // CHECK-NEXT:    ret <4 x float> [[BC]]
+  return _mm_cmpneq_ps(__a, __b);
 }
 
-void test_storer_pd(__m128d x, void* y) {
-  // CHECK-LABEL: define void @test_storer_pd
-  // CHECK: store {{.*}} <2 x double>* {{.*}}, align 16{{$}}
-  _mm_storer_pd(y, x);
+__m128 test_mm_cmpneq_ss(__m128 __a, __m128 __b) {
+  // CHECK-LABEL: @test_mm_cmpneq_ss
+  // CHECK: @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 4)
+  return _mm_cmpneq_ss(__a, __b);
 }
 
-void test_storeh_pd(__m128d x, void* y) {
-  // CHECK-LABEL: define void @test_storeh_pd
-  // CHECK: store {{.*}} double* {{.*}}, align 1{{$}}
-  _mm_storeh_pd(y, x);
+__m128 test_mm_cmpnge_ps(__m128 __a, __m128 __b) {
+  // CHECK-LABEL: @test_mm_cmpnge_ps
+  // CHECK:         [[CMP:%.*]] = fcmp ugt <4 x float>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
+  // CHECK-NEXT:    ret <4 x float> [[BC]]
+  return _mm_cmpnge_ps(__a, __b);
 }
 
-void test_storel_pd(__m128d x, void* y) {
-  // CHECK-LABEL: define void @test_storel_pd
-  // CHECK: store {{.*}} double* {{.*}}, align 1{{$}}
-  _mm_storel_pd(y, x);
+__m128 test_mm_cmpnge_ss(__m128 __a, __m128 __b) {
+  // CHECK-LABEL: @test_mm_cmpnge_ss
+  // CHECK: @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 6)
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+  return _mm_cmpnge_ss(__a, __b);
 }
 
-__m128i test_loadl_epi64(void* y) {
-  // CHECK: define {{.*}} @test_loadl_epi64
-  // CHECK: load i64, i64* {{.*}}, align 1{{$}}
-  return _mm_loadl_epi64(y);
+__m128 test_mm_cmpngt_ps(__m128 __a, __m128 __b) {
+  // CHECK-LABEL: @test_mm_cmpngt_ps
+  // CHECK:         [[CMP:%.*]] = fcmp uge <4 x float>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
+  // CHECK-NEXT:    ret <4 x float> [[BC]]
+  return _mm_cmpngt_ps(__a, __b);
 }
 
-void test_storel_epi64(__m128i x, void* y) {
-  // CHECK-LABEL: define void @test_storel_epi64
-  // CHECK: store {{.*}} i64* {{.*}}, align 1{{$}}
-  _mm_storel_epi64(y, x);
+__m128 test_mm_cmpngt_ss(__m128 __a, __m128 __b) {
+  // CHECK-LABEL: @test_mm_cmpngt_ss
+  // CHECK: @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 5)
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+  return _mm_cmpngt_ss(__a, __b);
 }
 
-void test_stream_si32(int x, void *y) {
-  // CHECK-LABEL: define void @test_stream_si32
-  // CHECK: store {{.*}} i32* {{.*}}, align 1, !nontemporal
-  _mm_stream_si32(y, x);
+__m128 test_mm_cmpnle_ps(__m128 __a, __m128 __b) {
+  // CHECK-LABEL: @test_mm_cmpnle_ps
+  // CHECK:         [[CMP:%.*]] = fcmp ugt <4 x float>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
+  // CHECK-NEXT:    ret <4 x float> [[BC]]
+  return _mm_cmpnle_ps(__a, __b);
 }
 
-void test_stream_si64(long long x, void *y) {
-  // CHECK-LABEL: define void @test_stream_si64
-  // CHECK: store {{.*}} i64* {{.*}}, align 1, !nontemporal
-  _mm_stream_si64(y, x);
+__m128 test_mm_cmpnle_ss(__m128 __a, __m128 __b) {
+  // CHECK-LABEL: @test_mm_cmpnle_ss
+  // CHECK: @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 6)
+  return _mm_cmpnle_ss(__a, __b);
 }
 
-void test_stream_si128(__m128i x, void *y) {
-  // CHECK-LABEL: define void @test_stream_si128
-  // CHECK: store {{.*}} <2 x i64>* {{.*}}, align 16, !nontemporal
-  _mm_stream_si128(y, x);
+__m128 test_mm_cmpnlt_ps(__m128 __a, __m128 __b) {
+  // CHECK-LABEL: @test_mm_cmpnlt_ps
+  // CHECK:         [[CMP:%.*]] = fcmp uge <4 x float>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
+  // CHECK-NEXT:    ret <4 x float> [[BC]]
+  return _mm_cmpnlt_ps(__a, __b);
 }
 
-void test_extract_epi16(__m128i __a) {
-  // CHECK-LABEL: define void @test_extract_epi16
-  // CHECK: [[x:%.*]] = and i32 %{{.*}}, 7
-  // CHECK: extractelement <8 x i16> %{{.*}}, i32 [[x]]
-  _mm_extract_epi16(__a, 8);
+__m128 test_mm_cmpnlt_ss(__m128 __a, __m128 __b) {
+  // CHECK-LABEL: @test_mm_cmpnlt_ss
+  // CHECK: @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 5)
+  return _mm_cmpnlt_ss(__a, __b);
 }
 
-__m128 test_mm_cmpeq_ss(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: @test_mm_cmpeq_ss
-  // CHECK: @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 0)
-  return _mm_cmpeq_ss(__a, __b);
+__m128 test_mm_cmpord_ps(__m128 __a, __m128 __b) {
+  // CHECK-LABEL: @test_mm_cmpord_ps
+  // CHECK:         [[CMP:%.*]] = fcmp ord <4 x float>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
+  // CHECK-NEXT:    ret <4 x float> [[BC]]
+  return _mm_cmpord_ps(__a, __b);
 }
 
-__m128 test_mm_cmplt_ss(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: @test_mm_cmplt_ss
-  // CHECK: @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 1)
-  return _mm_cmplt_ss(__a, __b);
+__m128 test_mm_cmpord_ss(__m128 __a, __m128 __b) {
+  // CHECK-LABEL: @test_mm_cmpord_ss
+  // CHECK: @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 7)
+  return _mm_cmpord_ss(__a, __b);
 }
 
-__m128 test_mm_cmple_ss(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: @test_mm_cmple_ss
-  // CHECK: @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 2)
-  return _mm_cmple_ss(__a, __b);
+__m128 test_mm_cmpunord_ps(__m128 __a, __m128 __b) {
+  // CHECK-LABEL: @test_mm_cmpunord_ps
+  // CHECK:         [[CMP:%.*]] = fcmp uno <4 x float>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <4 x i1> [[CMP]] to <4 x i32>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <4 x i32> [[SEXT]] to <4 x float>
+  // CHECK-NEXT:    ret <4 x float> [[BC]]
+  return _mm_cmpunord_ps(__a, __b);
 }
 
 __m128 test_mm_cmpunord_ss(__m128 __a, __m128 __b) {
@@ -190,332 +219,612 @@ __m128 test_mm_cmpunord_ss(__m128 __a, __m128 __b) {
   return _mm_cmpunord_ss(__a, __b);
 }
 
-__m128 test_mm_cmpneq_ss(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: @test_mm_cmpneq_ss
-  // CHECK: @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 4)
-  return _mm_cmpneq_ss(__a, __b);
+int test_mm_comieq_ss(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_comieq_ss
+  // CHECK: call i32 @llvm.x86.sse.comieq.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+  return _mm_comieq_ss(A, B);
 }
 
-__m128 test_mm_cmpnlt_ss(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: @test_mm_cmpnlt_ss
-  // CHECK: @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 5)
-  return _mm_cmpnlt_ss(__a, __b);
+int test_mm_comige_ss(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_comige_ss
+  // CHECK: call i32 @llvm.x86.sse.comige.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+  return _mm_comige_ss(A, B);
 }
 
-__m128 test_mm_cmpnle_ss(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: @test_mm_cmpnle_ss
-  // CHECK: @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 6)
-  return _mm_cmpnle_ss(__a, __b);
+int test_mm_comigt_ss(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_comigt_ss
+  // CHECK: call i32 @llvm.x86.sse.comigt.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+  return _mm_comigt_ss(A, B);
 }
 
-__m128 test_mm_cmpord_ss(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: @test_mm_cmpord_ss
-  // CHECK: @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 7)
-  return _mm_cmpord_ss(__a, __b);
+int test_mm_comile_ss(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_comile_ss
+  // CHECK: call i32 @llvm.x86.sse.comile.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+  return _mm_comile_ss(A, B);
 }
 
-__m128 test_mm_cmpgt_ss(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: @test_mm_cmpgt_ss
-  // CHECK: @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 1)
-  return _mm_cmpgt_ss(__a, __b);
+int test_mm_comilt_ss(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_comilt_ss
+  // CHECK: call i32 @llvm.x86.sse.comilt.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+  return _mm_comilt_ss(A, B);
 }
 
-__m128 test_mm_cmpge_ss(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: @test_mm_cmpge_ss
-  // CHECK: @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 2)
-  return _mm_cmpge_ss(__a, __b);
+int test_mm_comineq_ss(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_comineq_ss
+  // CHECK: call i32 @llvm.x86.sse.comineq.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+  return _mm_comineq_ss(A, B);
 }
 
-__m128 test_mm_cmpngt_ss(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: @test_mm_cmpngt_ss
-  // CHECK: @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 5)
-  return _mm_cmpngt_ss(__a, __b);
+int test_mm_cvt_ss2si(__m128 A) {
+  // CHECK-LABEL: test_mm_cvt_ss2si
+  // CHECK: call i32 @llvm.x86.sse.cvtss2si(<4 x float> %{{.*}})
+  return _mm_cvt_ss2si(A);
 }
 
-__m128 test_mm_cmpnge_ss(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: @test_mm_cmpnge_ss
-  // CHECK: @llvm.x86.sse.cmp.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 6)
-  return _mm_cmpnge_ss(__a, __b);
+__m128 test_mm_cvtsi32_ss(__m128 A, int B) {
+  // CHECK-LABEL: test_mm_cvtsi32_ss
+  // CHECK: sitofp i32 %{{.*}} to float
+  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
+  return _mm_cvtsi32_ss(A, B);
 }
 
-__m128 test_mm_cmpeq_ps(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: @test_mm_cmpeq_ps
-  // CHECK: @llvm.x86.sse.cmp.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 0)
-  return _mm_cmpeq_ps(__a, __b);
+__m128 test_mm_cvtsi64_ss(__m128 A, long long B) {
+  // CHECK-LABEL: test_mm_cvtsi64_ss
+  // CHECK: sitofp i64 %{{.*}} to float
+  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
+  return _mm_cvtsi64_ss(A, B);
 }
 
-__m128 test_mm_cmplt_ps(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: @test_mm_cmplt_ps
-  // CHECK: @llvm.x86.sse.cmp.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 1)
-  return _mm_cmplt_ps(__a, __b);
+float test_mm_cvtss_f32(__m128 A) {
+  // CHECK-LABEL: test_mm_cvtss_f32
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  return _mm_cvtss_f32(A);
 }
 
-__m128 test_mm_cmple_ps(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: @test_mm_cmple_ps
-  // CHECK: @llvm.x86.sse.cmp.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 2)
-  return _mm_cmple_ps(__a, __b);
+int test_mm_cvtss_si32(__m128 A) {
+  // CHECK-LABEL: test_mm_cvtss_si32
+  // CHECK: call i32 @llvm.x86.sse.cvtss2si(<4 x float> %{{.*}})
+  return _mm_cvtss_si32(A);
 }
 
-__m128 test_mm_cmpunord_ps(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: @test_mm_cmpunord_ps
-  // CHECK: @llvm.x86.sse.cmp.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 3)
-  return _mm_cmpunord_ps(__a, __b);
+long long test_mm_cvtss_si64(__m128 A) {
+  // CHECK-LABEL: test_mm_cvtss_si64
+  // CHECK: call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %{{.*}})
+  return _mm_cvtss_si64(A);
 }
 
-__m128 test_mm_cmpneq_ps(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: @test_mm_cmpneq_ps
-  // CHECK: @llvm.x86.sse.cmp.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 4)
-  return _mm_cmpneq_ps(__a, __b);
+int test_mm_cvtt_ss2si(__m128 A) {
+  // CHECK-LABEL: test_mm_cvtt_ss2si
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: fptosi float %{{.*}} to i32
+  return _mm_cvtt_ss2si(A);
 }
 
-__m128 test_mm_cmpnlt_ps(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: @test_mm_cmpnlt_ps
-  // CHECK: @llvm.x86.sse.cmp.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 5)
-  return _mm_cmpnlt_ps(__a, __b);
+int test_mm_cvttss_si32(__m128 A) {
+  // CHECK-LABEL: test_mm_cvttss_si32
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: fptosi float %{{.*}} to i32
+  return _mm_cvttss_si32(A);
 }
 
-__m128 test_mm_cmpnle_ps(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: @test_mm_cmpnle_ps
-  // CHECK: @llvm.x86.sse.cmp.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 6)
-  return _mm_cmpnle_ps(__a, __b);
+long long test_mm_cvttss_si64(__m128 A) {
+  // CHECK-LABEL: test_mm_cvttss_si64
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: fptosi float %{{.*}} to i64
+  return _mm_cvttss_si64(A);
 }
 
-__m128 test_mm_cmpord_ps(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: @test_mm_cmpord_ps
-  // CHECK: @llvm.x86.sse.cmp.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 7)
-  return _mm_cmpord_ps(__a, __b);
+__m128 test_mm_div_ps(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_div_ps
+  // CHECK: fdiv <4 x float>
+  return _mm_div_ps(A, B);
 }
 
-__m128 test_mm_cmpgt_ps(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: @test_mm_cmpgt_ps
-  // CHECK: @llvm.x86.sse.cmp.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 1)
-  return _mm_cmpgt_ps(__a, __b);
+__m128 test_mm_div_ss(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_div_ss
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: fdiv float
+  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
+  return _mm_div_ss(A, B);
 }
 
-__m128 test_mm_cmpge_ps(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: @test_mm_cmpge_ps
-  // CHECK: @llvm.x86.sse.cmp.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 2)
-  return _mm_cmpge_ps(__a, __b);
+unsigned int test_MM_GET_EXCEPTION_MASK() {
+  // CHECK-LABEL: test_MM_GET_EXCEPTION_MASK
+  // CHECK: call void @llvm.x86.sse.stmxcsr(i8* %{{.*}})
+  // CHECK: and i32 %{{.*}}, 8064
+  return _MM_GET_EXCEPTION_MASK();
 }
 
-__m128 test_mm_cmpngt_ps(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: @test_mm_cmpngt_ps
-  // CHECK: @llvm.x86.sse.cmp.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 5)
-  return _mm_cmpngt_ps(__a, __b);
+unsigned int test_MM_GET_EXCEPTION_STATE() {
+  // CHECK-LABEL: test_MM_GET_EXCEPTION_STATE
+  // CHECK: call void @llvm.x86.sse.stmxcsr(i8* %{{.*}})
+  // CHECK: and i32 %{{.*}}, 63
+  return _MM_GET_EXCEPTION_STATE();
 }
 
-__m128 test_mm_cmpnge_ps(__m128 __a, __m128 __b) {
-  // CHECK-LABEL: @test_mm_cmpnge_ps
-  // CHECK: @llvm.x86.sse.cmp.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 6)
-  return _mm_cmpnge_ps(__a, __b);
+unsigned int test_MM_GET_FLUSH_ZERO_MODE() {
+  // CHECK-LABEL: test_MM_GET_FLUSH_ZERO_MODE
+  // CHECK: call void @llvm.x86.sse.stmxcsr(i8* %{{.*}})
+  // CHECK: and i32 %{{.*}}, 32768
+  return _MM_GET_FLUSH_ZERO_MODE();
+}
+
+unsigned int test_MM_GET_ROUNDING_MODE() {
+  // CHECK-LABEL: test_MM_GET_ROUNDING_MODE
+  // CHECK: call void @llvm.x86.sse.stmxcsr(i8* %{{.*}})
+  // CHECK: and i32 %{{.*}}, 24576
+  return _MM_GET_ROUNDING_MODE();
+}
+
+unsigned int test_mm_getcsr() {
+  // CHECK-LABEL: test_mm_getcsr
+  // CHECK: call void @llvm.x86.sse.stmxcsr(i8* %{{.*}})
+  // CHECK: load i32
+  return _mm_getcsr();
+}
+
+__m128 test_mm_load_ps(float* y) {
+  // CHECK-LABEL: test_mm_load_ps
+  // CHECK: load <4 x float>, <4 x float>* {{.*}}, align 16
+  return _mm_load_ps(y);
+}
+
+__m128 test_mm_load_ps1(float* y) {
+  // CHECK-LABEL: test_mm_load_ps1
+  // CHECK: load float, float* %{{.*}}, align 4
+  // CHECK: insertelement <4 x float> undef, float %{{.*}}, i32 0
+  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 1
+  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 2
+  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 3
+  return _mm_load_ps1(y);
+}
+
+__m128 test_mm_load_ss(float* y) {
+  // CHECK-LABEL: test_mm_load_ss
+  // CHECK: load float, float* {{.*}}, align 1{{$}}
+  // CHECK: insertelement <4 x float> undef, float %{{.*}}, i32 0
+  // CHECK: insertelement <4 x float> %{{.*}}, float 0.000000e+00, i32 1
+  // CHECK: insertelement <4 x float> %{{.*}}, float 0.000000e+00, i32 2
+  // CHECK: insertelement <4 x float> %{{.*}}, float 0.000000e+00, i32 3
+  return _mm_load_ss(y);
+}
+
+__m128 test_mm_load1_ps(float* y) {
+  // CHECK-LABEL: test_mm_load1_ps
+  // CHECK: load float, float* %{{.*}}, align 4
+  // CHECK: insertelement <4 x float> undef, float %{{.*}}, i32 0
+  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 1
+  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 2
+  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 3
+  return _mm_load1_ps(y);
+}
+
+__m128 test_mm_loadh_pi(__m128 x, __m64* y) {
+  // CHECK-LABEL: test_mm_loadh_pi
+  // CHECK: load <2 x float>, <2 x float>* {{.*}}, align 1{{$}}
+  // CHECK: shufflevector {{.*}} <4 x i32> <i32 0, i32 1
+  // CHECK: shufflevector {{.*}} <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  return _mm_loadh_pi(x,y);
 }
 
-__m128d test_mm_cmpeq_sd(__m128d __a, __m128d __b) {
-  // CHECK-LABEL: @test_mm_cmpeq_sd
-  // CHECK: @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 0)
-  return _mm_cmpeq_sd(__a, __b);
+__m128 test_mm_loadl_pi(__m128 x, __m64* y) {
+  // CHECK-LABEL: test_mm_loadl_pi
+  // CHECK: load <2 x float>, <2 x float>* {{.*}}, align 1{{$}}
+  // CHECK: shufflevector {{.*}} <4 x i32> <i32 0, i32 1
+  // CHECK: shufflevector {{.*}} <4 x i32> <i32 4, i32 5, i32 2, i32 3>
+  return _mm_loadl_pi(x,y);
 }
 
-__m128d test_mm_cmplt_sd(__m128d __a, __m128d __b) {
-  // CHECK-LABEL: @test_mm_cmplt_sd
-  // CHECK: @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 1)
-  return _mm_cmplt_sd(__a, __b);
+__m128 test_mm_loadr_ps(float* A) {
+  // CHECK-LABEL: test_mm_loadr_ps
+  // CHECK: load <4 x float>, <4 x float>* %{{.*}}, align 16
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  return _mm_loadr_ps(A);
 }
 
-__m128d test_mm_cmple_sd(__m128d __a, __m128d __b) {
-  // CHECK-LABEL: @test_mm_cmple_sd
-  // CHECK: @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 2)
-  return _mm_cmple_sd(__a, __b);
+__m128 test_mm_loadu_ps(float* A) {
+  // CHECK-LABEL: test_mm_loadu_ps
+  // CHECK: load <4 x float>, <4 x float>* %{{.*}}, align 1{{$}}
+  return _mm_loadu_ps(A);
 }
 
-__m128d test_mm_cmpunord_sd(__m128d __a, __m128d __b) {
-  // CHECK-LABEL: @test_mm_cmpunord_sd
-  // CHECK: @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 3)
-  return _mm_cmpunord_sd(__a, __b);
+__m128 test_mm_max_ps(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_max_ps
+  // CHECK: @llvm.x86.sse.max.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+  return _mm_max_ps(A, B);
 }
 
-__m128d test_mm_cmpneq_sd(__m128d __a, __m128d __b) {
-  // CHECK-LABEL: @test_mm_cmpneq_sd
-  // CHECK: @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 4)
-  return _mm_cmpneq_sd(__a, __b);
+__m128 test_mm_max_ss(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_max_ss
+  // CHECK: @llvm.x86.sse.max.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+  return _mm_max_ss(A, B);
 }
 
-__m128d test_mm_cmpnlt_sd(__m128d __a, __m128d __b) {
-  // CHECK-LABEL: @test_mm_cmpnlt_sd
-  // CHECK: @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 5)
-  return _mm_cmpnlt_sd(__a, __b);
+__m128 test_mm_min_ps(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_min_ps
+  // CHECK: @llvm.x86.sse.min.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+  return _mm_min_ps(A, B);
 }
 
-__m128d test_mm_cmpnle_sd(__m128d __a, __m128d __b) {
-  // CHECK-LABEL: @test_mm_cmpnle_sd
-  // CHECK: @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 6)
-  return _mm_cmpnle_sd(__a, __b);
+__m128 test_mm_min_ss(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_min_ss
+  // CHECK: @llvm.x86.sse.min.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+  return _mm_min_ss(A, B);
 }
 
-__m128d test_mm_cmpord_sd(__m128d __a, __m128d __b) {
-  // CHECK-LABEL: @test_mm_cmpord_sd
-  // CHECK: @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 7)
-  return _mm_cmpord_sd(__a, __b);
+__m128 test_mm_move_ss(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_move_ss
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
+  return _mm_move_ss(A, B);
 }
 
-__m128d test_mm_cmpgt_sd(__m128d __a, __m128d __b) {
-  // CHECK-LABEL: @test_mm_cmpgt_sd
-  // CHECK: @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 1)
-  return _mm_cmpgt_sd(__a, __b);
+__m128 test_mm_movehl_ps(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_movehl_ps
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
+  return _mm_movehl_ps(A, B);
 }
 
-__m128d test_mm_cmpge_sd(__m128d __a, __m128d __b) {
-  // CHECK-LABEL: @test_mm_cmpge_sd
-  // CHECK: @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 2)
-  return _mm_cmpge_sd(__a, __b);
+__m128 test_mm_movelh_ps(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_movelh_ps
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  return _mm_movelh_ps(A, B);
 }
 
-__m128d test_mm_cmpngt_sd(__m128d __a, __m128d __b) {
-  // CHECK-LABEL: @test_mm_cmpngt_sd
-  // CHECK: @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 5)
-  return _mm_cmpngt_sd(__a, __b);
+int test_mm_movemask_ps(__m128 A) {
+  // CHECK-LABEL: test_mm_movemask_ps
+  // CHECK: call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %{{.*}})
+  return _mm_movemask_ps(A);
 }
 
-__m128d test_mm_cmpnge_sd(__m128d __a, __m128d __b) {
-  // CHECK-LABEL: @test_mm_cmpnge_sd
-  // CHECK: @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 6)
-  return _mm_cmpnge_sd(__a, __b);
+__m128 test_mm_mul_ps(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_mul_ps
+  // CHECK: fmul <4 x float>
+  return _mm_mul_ps(A, B);
 }
 
-__m128d test_mm_cmpeq_pd(__m128d __a, __m128d __b) {
-  // CHECK-LABEL: @test_mm_cmpeq_pd
-  // CHECK: @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 0)
-  return _mm_cmpeq_pd(__a, __b);
+__m128 test_mm_mul_ss(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_mul_ss
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: fmul float
+  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
+  return _mm_mul_ss(A, B);
 }
 
-__m128d test_mm_cmplt_pd(__m128d __a, __m128d __b) {
-  // CHECK-LABEL: @test_mm_cmplt_pd
-  // CHECK: @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 1)
-  return _mm_cmplt_pd(__a, __b);
+__m128 test_mm_or_ps(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_or_ps
+  // CHECK: or <4 x i32>
+  return _mm_or_ps(A, B);
 }
 
-__m128d test_mm_cmple_pd(__m128d __a, __m128d __b) {
-  // CHECK-LABEL: @test_mm_cmple_pd
-  // CHECK: @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 2)
-  return _mm_cmple_pd(__a, __b);
+void test_mm_prefetch(char const* p) {
+  // CHECK-LABEL: test_mm_prefetch
+  // CHECK: call void @llvm.prefetch(i8* {{.*}}, i32 0, i32 0, i32 1)
+  _mm_prefetch(p, 0);
 }
 
-__m128d test_mm_cmpunord_pd(__m128d __a, __m128d __b) {
-  // CHECK-LABEL: @test_mm_cmpunord_pd
-  // CHECK: @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 3)
-  return _mm_cmpunord_pd(__a, __b);
+__m128 test_mm_rcp_ps(__m128 x) {
+  // CHECK-LABEL: test_mm_rcp_ps
+  // CHECK: call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> {{.*}})
+  return _mm_rcp_ps(x);
 }
 
-__m128d test_mm_cmpneq_pd(__m128d __a, __m128d __b) {
-  // CHECK-LABEL: @test_mm_cmpneq_pd
-  // CHECK: @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 4)
-  return _mm_cmpneq_pd(__a, __b);
+__m128 test_mm_rcp_ss(__m128 x) {
+  // CHECK-LABEL: test_mm_rcp_ss
+  // CHECK: call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> {{.*}})
+  // CHECK: extractelement <4 x float> {{.*}}, i32 0
+  // CHECK: insertelement <4 x float> undef, float {{.*}}, i32 0
+  // CHECK: extractelement <4 x float> {{.*}}, i32 1
+  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 1
+  // CHECK: extractelement <4 x float> {{.*}}, i32 2
+  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 2
+  // CHECK: extractelement <4 x float> {{.*}}, i32 3
+  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 3
+  return _mm_rcp_ss(x);
 }
 
-__m128d test_mm_cmpnlt_pd(__m128d __a, __m128d __b) {
-  // CHECK-LABEL: @test_mm_cmpnlt_pd
-  // CHECK: @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 5)
-  return _mm_cmpnlt_pd(__a, __b);
+__m128 test_mm_rsqrt_ps(__m128 x) {
+  // CHECK-LABEL: test_mm_rsqrt_ps
+  // CHECK: call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> {{.*}})
+  return _mm_rsqrt_ps(x);
 }
 
-__m128d test_mm_cmpnle_pd(__m128d __a, __m128d __b) {
-  // CHECK-LABEL: @test_mm_cmpnle_pd
-  // CHECK: @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 6)
-  return _mm_cmpnle_pd(__a, __b);
+__m128 test_mm_rsqrt_ss(__m128 x) {
+  // CHECK-LABEL: test_mm_rsqrt_ss
+  // CHECK: call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> {{.*}})
+  // CHECK: extractelement <4 x float> {{.*}}, i32 0
+  // CHECK: insertelement <4 x float> undef, float {{.*}}, i32 0
+  // CHECK: extractelement <4 x float> {{.*}}, i32 1
+  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 1
+  // CHECK: extractelement <4 x float> {{.*}}, i32 2
+  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 2
+  // CHECK: extractelement <4 x float> {{.*}}, i32 3
+  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 3
+  return _mm_rsqrt_ss(x);
 }
 
-__m128d test_mm_cmpord_pd(__m128d __a, __m128d __b) {
-  // CHECK-LABEL: @test_mm_cmpord_pd
-  // CHECK: @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 7)
-  return _mm_cmpord_pd(__a, __b);
+void test_MM_SET_EXCEPTION_MASK(unsigned int A) {
+  // CHECK-LABEL: test_MM_SET_EXCEPTION_MASK
+  // CHECK: call void @llvm.x86.sse.stmxcsr(i8* {{.*}})
+  // CHECK: load i32
+  // CHECK: and i32 {{.*}}, -8065
+  // CHECK: or i32
+  // CHECK: store i32
+  // CHECK: call void @llvm.x86.sse.ldmxcsr(i8* {{.*}})
+  _MM_SET_EXCEPTION_MASK(A);
+}
+
+void test_MM_SET_EXCEPTION_STATE(unsigned int A) {
+  // CHECK-LABEL: test_MM_SET_EXCEPTION_STATE
+  // CHECK: call void @llvm.x86.sse.stmxcsr(i8* {{.*}})
+  // CHECK: load i32
+  // CHECK: and i32 {{.*}}, -64
+  // CHECK: or i32
+  // CHECK: store i32
+  // CHECK: call void @llvm.x86.sse.ldmxcsr(i8* {{.*}})
+  _MM_SET_EXCEPTION_STATE(A);
 }
 
-__m128d test_mm_cmpgt_pd(__m128d __a, __m128d __b) {
-  // CHECK-LABEL: @test_mm_cmpgt_pd
-  // CHECK: @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 1)
-  return _mm_cmpgt_pd(__a, __b);
+void test_MM_SET_FLUSH_ZERO_MODE(unsigned int A) {
+  // CHECK-LABEL: test_MM_SET_FLUSH_ZERO_MODE
+  // CHECK: call void @llvm.x86.sse.stmxcsr(i8* {{.*}})
+  // CHECK: load i32
+  // CHECK: and i32 {{.*}}, -32769
+  // CHECK: or i32
+  // CHECK: store i32
+  // CHECK: call void @llvm.x86.sse.ldmxcsr(i8* {{.*}})
+  _MM_SET_FLUSH_ZERO_MODE(A);
 }
 
-__m128d test_mm_cmpge_pd(__m128d __a, __m128d __b) {
-  // CHECK-LABEL: @test_mm_cmpge_pd
-  // CHECK: @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 2)
-  return _mm_cmpge_pd(__a, __b);
+__m128 test_mm_set_ps(float A, float B, float C, float D) {
+  // CHECK-LABEL: test_mm_set_ps
+  // CHECK: insertelement <4 x float> undef, float {{.*}}, i32 0
+  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 1
+  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 2
+  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 3
+  return _mm_set_ps(A, B, C, D);
 }
 
-__m128d test_mm_cmpngt_pd(__m128d __a, __m128d __b) {
-  // CHECK-LABEL: @test_mm_cmpngt_pd
-  // CHECK: @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 5)
-  return _mm_cmpngt_pd(__a, __b);
+__m128 test_mm_set_ps1(float A) {
+  // CHECK-LABEL: test_mm_set_ps1
+  // CHECK: insertelement <4 x float> undef, float {{.*}}, i32 0
+  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 1
+  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 2
+  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 3
+  return _mm_set_ps1(A);
 }
 
-__m128d test_mm_cmpnge_pd(__m128d __a, __m128d __b) {
-  // CHECK-LABEL: @test_mm_cmpnge_pd
-  // CHECK: @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 6)
-  return _mm_cmpnge_pd(__a, __b);
+void test_MM_SET_ROUNDING_MODE(unsigned int A) {
+  // CHECK-LABEL: test_MM_SET_ROUNDING_MODE
+  // CHECK: call void @llvm.x86.sse.stmxcsr(i8* {{.*}})
+  // CHECK: load i32
+  // CHECK: and i32 {{.*}}, -24577
+  // CHECK: or i32
+  // CHECK: store i32
+  // CHECK: call void @llvm.x86.sse.ldmxcsr(i8* {{.*}})
+  _MM_SET_ROUNDING_MODE(A);
 }
 
-__m128 test_mm_slli_si128(__m128 a) {
-  // CHECK-LABEL: @test_mm_slli_si128
-  // CHECK: shufflevector <16 x i8> {{.*}}, <16 x i8> {{.*}}, <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26>
-  return _mm_slli_si128(a, 5);
+__m128 test_mm_set_ss(float A) {
+  // CHECK-LABEL: test_mm_set_ss
+  // CHECK: insertelement <4 x float> undef, float {{.*}}, i32 0
+  // CHECK: insertelement <4 x float> {{.*}}, float 0.000000e+00, i32 1
+  // CHECK: insertelement <4 x float> {{.*}}, float 0.000000e+00, i32 2
+  // CHECK: insertelement <4 x float> {{.*}}, float 0.000000e+00, i32 3
+  return _mm_set_ss(A);
 }
 
-__m128 test_mm_bslli_si128(__m128 a) {
-  // CHECK-LABEL: @test_mm_bslli_si128
-  // CHECK: shufflevector <16 x i8> {{.*}}, <16 x i8> {{.*}}, <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26>
-  return _mm_bslli_si128(a, 5);
+__m128 test_mm_set1_ps(float A) {
+  // CHECK-LABEL: test_mm_set1_ps
+  // CHECK: insertelement <4 x float> undef, float {{.*}}, i32 0
+  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 1
+  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 2
+  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 3
+  return _mm_set1_ps(A);
 }
 
-__m128 test_mm_srli_si128(__m128 a) {
-  // CHECK-LABEL: @test_mm_srli_si128
-  // CHECK: shufflevector <16 x i8> {{.*}}, <16 x i8> {{.*}}, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
-  return _mm_srli_si128(a, 5);
+void test_mm_setcsr(unsigned int A) {
+  // CHECK-LABEL: test_mm_setcsr
+  // CHECK: store i32
+  // CHECK: call void @llvm.x86.sse.ldmxcsr(i8* {{.*}})
+  _mm_setcsr(A);
 }
 
-__m128 test_mm_bsrli_si128(__m128 a) {
-  // CHECK-LABEL: @test_mm_bsrli_si128
-  // CHECK: shufflevector <16 x i8> {{.*}}, <16 x i8> {{.*}}, <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
-  return _mm_bsrli_si128(a, 5);
+__m128 test_mm_setr_ps(float A, float B, float C, float D) {
+  // CHECK-LABEL: test_mm_setr_ps
+  // CHECK: insertelement <4 x float> undef, float {{.*}}, i32 0
+  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 1
+  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 2
+  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 3
+  return _mm_setr_ps(A, B, C, D);
 }
 
-__m128 test_mm_undefined_ps() {
-  // CHECK-LABEL: @test_mm_undefined_ps
-  // CHECK: ret <4 x float> undef
-  return _mm_undefined_ps();
+__m128 test_mm_setzero_ps() {
+  // CHECK-LABEL: test_mm_setzero_ps
+  // CHECK: store <4 x float> zeroinitializer
+  return _mm_setzero_ps();
+}
+
+void test_mm_sfence() {
+  // CHECK-LABEL: test_mm_sfence
+  // CHECK: call void @llvm.x86.sse.sfence()
+  _mm_sfence();
+}
+
+__m128 test_mm_shuffle_ps(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_shuffle_ps
+  // CHECK: shufflevector <4 x float> {{.*}}, <4 x float> {{.*}}, <4 x i32> <i32 0, i32 0, i32 4, i32 4>
+  return _mm_shuffle_ps(A, B, 0);
+}
+
+__m128 test_mm_sqrt_ps(__m128 x) {
+  // CHECK-LABEL: test_mm_sqrt_ps
+  // CHECK: call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> {{.*}})
+  return _mm_sqrt_ps(x);
+}
+
+__m128 test_sqrt_ss(__m128 x) {
+  // CHECK: define {{.*}} @test_sqrt_ss
+  // CHECK: call <4 x float> @llvm.x86.sse.sqrt.ss
+  // CHECK: extractelement <4 x float> {{.*}}, i32 0
+  // CHECK: insertelement <4 x float> undef, float {{.*}}, i32 0
+  // CHECK: extractelement <4 x float> {{.*}}, i32 1
+  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 1
+  // CHECK: extractelement <4 x float> {{.*}}, i32 2
+  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 2
+  // CHECK: extractelement <4 x float> {{.*}}, i32 3
+  // CHECK: insertelement <4 x float> {{.*}}, float {{.*}}, i32 3
+  return _mm_sqrt_ss(x);
+}
+
+void test_mm_store_ps(float* x, __m128 y) {
+  // CHECK-LABEL: test_mm_store_ps
+  // CHECK: store <4 x float> %{{.*}}, <4 x float>* {{.*}}, align 16
+  _mm_store_ps(x, y);
 }
 
-__m128d test_mm_undefined_pd() {
-  // CHECK-LABEL: @test_mm_undefined_pd
-  // CHECK: ret <2 x double> undef
-  return _mm_undefined_pd();
+void test_mm_store_ps1(float* x, __m128 y) {
+  // CHECK-LABEL: test_mm_store_ps1
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> zeroinitializer
+  // CHECK: store <4 x float> %{{.*}}, <4 x float>* %{{.*}}, align 16
+  _mm_store_ps1(x, y);
 }
 
-__m128i test_mm_undefined_si128() {
-  // CHECK-LABEL: @test_mm_undefined_si128
-  // CHECK: ret <2 x i64> undef
-  return _mm_undefined_si128();
+void test_mm_store_ss(float* x, __m128 y) {
+  // CHECK-LABEL: test_mm_store_ss
+  // CHECK: extractelement <4 x float> {{.*}}, i32 0
+  // CHECK: store float %{{.*}}, float* {{.*}}, align 1{{$}}
+  _mm_store_ss(x, y);
+}
+
+void test_mm_store1_ps(float* x, __m128 y) {
+  // CHECK-LABEL: test_mm_store1_ps
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> zeroinitializer
+  // CHECK: store <4 x float> %{{.*}}, <4 x float>* %{{.*}}, align 16
+  _mm_store1_ps(x, y);
+}
+
+void test_mm_storeh_pi(__m64* x,  __m128 y) {
+  // CHECK-LABEL: test_mm_storeh_pi
+  // CHECK: bitcast <4 x float> %{{.*}} to <2 x i64>
+  // CHECK: extractelement <2 x i64> %{{.*}}, i64 1
+  // CHECK: store i64 %{{.*}}, i64* {{.*}}
+  _mm_storeh_pi(x, y);
+}
+
+void test_mm_storel_pi(__m64* x,  __m128 y) {
+  // CHECK-LABEL: test_mm_storel_pi
+  // CHECK: bitcast <4 x float> %{{.*}} to <2 x i64>
+  // CHECK: extractelement <2 x i64> %{{.*}}, i64 0
+  // CHECK: store i64 %{{.*}}, i64* {{.*}}
+  _mm_storel_pi(x, y);
+}
+
+void test_mm_storer_ps(float* x,  __m128 y) {
+  // CHECK-LABEL: test_mm_storer_ps
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  // CHECK: store <4 x float> %{{.*}}, <4 x float>* {{.*}}, align 16
+  _mm_storer_ps(x, y);
 }
 
-__m64 test_mm_add_si64(__m64 __a, __m64 __b) {
-  // CHECK-LABEL: @test_mm_add_si64
-  // CHECK @llvm.x86.mmx.padd.q(x86_mmx %{{.*}}, x86_mmx %{{.*}})
-  return _mm_add_si64(__a, __b);
+void test_mm_storeu_ps(float* x,  __m128 y) {
+  // CHECK-LABEL: test_mm_storeu_ps
+  // CHECK: store <4 x float> %{{.*}}, <4 x float>* %{{.*}}, align 1{{$}}
+  // CHECK-NEXT: ret void
+  _mm_storeu_ps(x, y);
+}
+
+void test_mm_stream_ps(float*A, __m128d B) {
+  // CHECK-LABEL: test_mm_stream_ps
+  // CHECK: store <4 x float> %{{.*}}, <4 x float>* %{{.*}}, align 16, !nontemporal
+  _mm_stream_ps(A, B);
+}
+
+__m128 test_mm_sub_ps(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_sub_ps
+  // CHECK: fsub <4 x float>
+  return _mm_sub_ps(A, B);
+}
+
+__m128 test_mm_sub_ss(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_sub_ss
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
+  // CHECK: fsub float
+  // CHECK: insertelement <4 x float> %{{.*}}, float %{{.*}}, i32 0
+  return _mm_sub_ss(A, B);
+}
+
+void test_MM_TRANSPOSE4_PS(__m128 *A, __m128 *B, __m128 *C, __m128 *D) {
+  // CHECK-LABEL: test_MM_TRANSPOSE4_PS
+  // CHECK: shufflevector <4 x float> {{.*}}, <4 x float> {{.*}}, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  // CHECK: shufflevector <4 x float> {{.*}}, <4 x float> {{.*}}, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  // CHECK: shufflevector <4 x float> {{.*}}, <4 x float> {{.*}}, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+  // CHECK: shufflevector <4 x float> {{.*}}, <4 x float> {{.*}}, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+  // CHECK: shufflevector <4 x float> {{.*}}, <4 x float> {{.*}}, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  // CHECK: shufflevector <4 x float> {{.*}}, <4 x float> {{.*}}, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
+  // CHECK: shufflevector <4 x float> {{.*}}, <4 x float> {{.*}}, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+  // CHECK: shufflevector <4 x float> {{.*}}, <4 x float> {{.*}}, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
+  _MM_TRANSPOSE4_PS(*A, *B, *C, *D);
+}
+
+int test_mm_ucomieq_ss(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_ucomieq_ss
+  // CHECK: call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+  return _mm_ucomieq_ss(A, B);
+}
+
+int test_mm_ucomige_ss(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_ucomige_ss
+  // CHECK: call i32 @llvm.x86.sse.ucomige.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+  return _mm_ucomige_ss(A, B);
+}
+
+int test_mm_ucomigt_ss(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_ucomigt_ss
+  // CHECK: call i32 @llvm.x86.sse.ucomigt.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+  return _mm_ucomigt_ss(A, B);
+}
+
+int test_mm_ucomile_ss(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_ucomile_ss
+  // CHECK: call i32 @llvm.x86.sse.ucomile.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+  return _mm_ucomile_ss(A, B);
+}
+
+int test_mm_ucomilt_ss(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_ucomilt_ss
+  // CHECK: call i32 @llvm.x86.sse.ucomilt.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+  return _mm_ucomilt_ss(A, B);
+}
+
+int test_mm_ucomineq_ss(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_ucomineq_ss
+  // CHECK: call i32 @llvm.x86.sse.ucomineq.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}})
+  return _mm_ucomineq_ss(A, B);
+}
+
+__m128 test_mm_undefined_ps() {
+  // CHECK-LABEL: @test_mm_undefined_ps
+  // CHECK: ret <4 x float> undef
+  return _mm_undefined_ps();
 }
 
-__m64 test_mm_sub_si64(__m64 __a, __m64 __b) {
-  // CHECK-LABEL: @test_mm_sub_si64
-  // CHECK @llvm.x86.mmx.psub.q(x86_mmx %{{.*}}, x86_mmx %{{.*}})
-  return _mm_sub_si64(__a, __b);
+__m128 test_mm_unpackhi_ps(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_unpackhi_ps
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
+  return _mm_unpackhi_ps(A, B);
 }
 
-__m64 test_mm_mul_su32(__m64 __a, __m64 __b) {
-  // CHECK-LABEL: @test_mm_mul_su32
-  // CHECK @llvm.x86.mmx.pmulu.dq(x86_mmx %{{.*}}, x86_mmx %{{.*}})
-  return _mm_mul_su32(__a, __b);
+__m128 test_mm_unpacklo_ps(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_unpacklo_ps
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  return _mm_unpacklo_ps(A, B);
 }
 
-void test_mm_pause() {
-  // CHECK-LABEL: @test_mm_pause
-  // CHECK @llvm.x86.sse2.pause()
-  return _mm_pause();
+__m128 test_mm_xor_ps(__m128 A, __m128 B) {
+  // CHECK-LABEL: test_mm_xor_ps
+  // CHECK: xor <4 x i32>
+  return _mm_xor_ps(A, B);
 }
diff --git a/test/CodeGen/sse2-builtins.c b/test/CodeGen/sse2-builtins.c
index 4ceb93abfc2fa..b340d1a27553c 100644
--- a/test/CodeGen/sse2-builtins.c
+++ b/test/CodeGen/sse2-builtins.c
@@ -6,6 +6,8 @@
 
 #include <x86intrin.h>
 
+// NOTE: This should match the tests in llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
+
 __m128i test_mm_add_epi8(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_add_epi8
   // CHECK: add <16 x i8>
@@ -38,31 +40,34 @@ __m128d test_mm_add_pd(__m128d A, __m128d B) {
 
 __m128d test_mm_add_sd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_add_sd
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
   // CHECK: fadd double
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
   return _mm_add_sd(A, B);
 }
 
 __m128i test_mm_adds_epi8(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_adds_epi8
-  // CHECK: call <16 x i8> @llvm.x86.sse2.padds.b
+  // CHECK: call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   return _mm_adds_epi8(A, B);
 }
 
 __m128i test_mm_adds_epi16(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_adds_epi16
-  // CHECK: call <8 x i16> @llvm.x86.sse2.padds.w
+  // CHECK: call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_adds_epi16(A, B);
 }
 
 __m128i test_mm_adds_epu8(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_adds_epu8
-  // CHECK: call <16 x i8> @llvm.x86.sse2.paddus.b
+  // CHECK: call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   return _mm_adds_epu8(A, B);
 }
 
 __m128i test_mm_adds_epu16(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_adds_epu16
-  // CHECK: call <8 x i16> @llvm.x86.sse2.paddus.w
+  // CHECK: call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_adds_epu16(A, B);
 }
 
@@ -78,15 +83,29 @@ __m128i test_mm_and_si128(__m128i A, __m128i B) {
   return _mm_and_si128(A, B);
 }
 
+__m128d test_mm_andnot_pd(__m128d A, __m128d B) {
+  // CHECK-LABEL: test_mm_andnot_pd
+  // CHECK: xor <4 x i32> %{{.*}}, <i32 -1, i32 -1, i32 -1, i32 -1>
+  // CHECK: and <4 x i32>
+  return _mm_andnot_pd(A, B);
+}
+
+__m128i test_mm_andnot_si128(__m128i A, __m128i B) {
+  // CHECK-LABEL: test_mm_andnot_si128
+  // CHECK: xor <2 x i64> %{{.*}}, <i64 -1, i64 -1>
+  // CHECK: and <2 x i64>
+  return _mm_andnot_si128(A, B);
+}
+
 __m128i test_mm_avg_epu8(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_avg_epu8
-  // CHECK: call <16 x i8> @llvm.x86.sse2.pavg.b
+  // CHECK: call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   return _mm_avg_epu8(A, B);
 }
 
 __m128i test_mm_avg_epu16(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_avg_epu16
-  // CHECK: call <8 x i16> @llvm.x86.sse2.pavg.w
+  // CHECK: call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_avg_epu16(A, B);
 }
 
@@ -102,6 +121,42 @@ __m128i test_mm_bsrli_si128(__m128i A) {
   return _mm_bsrli_si128(A, 5);
 }
 
+__m128 test_mm_castpd_ps(__m128d A) {
+  // CHECK-LABEL: test_mm_castpd_ps
+  // CHECK: bitcast <2 x double> %{{.*}} to <4 x float>
+  return _mm_castpd_ps(A);
+}
+
+__m128i test_mm_castpd_si128(__m128d A) {
+  // CHECK-LABEL: test_mm_castpd_si128
+  // CHECK: bitcast <2 x double> %{{.*}} to <2 x i64>
+  return _mm_castpd_si128(A);
+}
+
+__m128d test_mm_castps_pd(__m128 A) {
+  // CHECK-LABEL: test_mm_castps_pd
+  // CHECK: bitcast <4 x float> %{{.*}} to <2 x double>
+  return _mm_castps_pd(A);
+}
+
+__m128i test_mm_castps_si128(__m128 A) {
+  // CHECK-LABEL: test_mm_castps_si128
+  // CHECK: bitcast <4 x float> %{{.*}} to <2 x i64>
+  return _mm_castps_si128(A);
+}
+
+__m128d test_mm_castsi128_pd(__m128i A) {
+  // CHECK-LABEL: test_mm_castsi128_pd
+  // CHECK: bitcast <2 x i64> %{{.*}} to <2 x double>
+  return _mm_castsi128_pd(A);
+}
+
+__m128 test_mm_castsi128_ps(__m128i A) {
+  // CHECK-LABEL: test_mm_castsi128_ps
+  // CHECK: bitcast <2 x i64> %{{.*}} to <4 x float>
+  return _mm_castsi128_ps(A);
+}
+
 void test_mm_clflush(void* A) {
   // CHECK-LABEL: test_mm_clflush
   // CHECK: call void @llvm.x86.sse2.clflush(i8* %{{.*}})
@@ -128,7 +183,10 @@ __m128i test_mm_cmpeq_epi32(__m128i A, __m128i B) {
 
 __m128d test_mm_cmpeq_pd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_cmpeq_pd
-  // CHECK: call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 0)
+  // CHECK:         [[CMP:%.*]] = fcmp oeq <2 x double>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
+  // CHECK-NEXT:    ret <2 x double> [[BC]]
   return _mm_cmpeq_pd(A, B);
 }
 
@@ -140,13 +198,20 @@ __m128d test_mm_cmpeq_sd(__m128d A, __m128d B) {
 
 __m128d test_mm_cmpge_pd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_cmpge_pd
-  // CHECK: call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 2)
+  // CHECK:         [[CMP:%.*]] = fcmp ole <2 x double>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
+  // CHECK-NEXT:    ret <2 x double> [[BC]]
   return _mm_cmpge_pd(A, B);
 }
 
 __m128d test_mm_cmpge_sd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_cmpge_sd
   // CHECK: call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 2)
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: insertelement <2 x double> undef, double %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 1
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 1
   return _mm_cmpge_sd(A, B);
 }
 
@@ -170,19 +235,29 @@ __m128i test_mm_cmpgt_epi32(__m128i A, __m128i B) {
 
 __m128d test_mm_cmpgt_pd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_cmpgt_pd
-  // CHECK: call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 1)
+  // CHECK:         [[CMP:%.*]] = fcmp olt <2 x double>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
+  // CHECK-NEXT:    ret <2 x double> [[BC]]
   return _mm_cmpgt_pd(A, B);
 }
 
 __m128d test_mm_cmpgt_sd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_cmpgt_sd
   // CHECK: call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 1)
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: insertelement <2 x double> undef, double %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 1
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 1
   return _mm_cmpgt_sd(A, B);
 }
 
 __m128d test_mm_cmple_pd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_cmple_pd
-  // CHECK: call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 2)
+  // CHECK:         [[CMP:%.*]] = fcmp ole <2 x double>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
+  // CHECK-NEXT:    ret <2 x double> [[BC]]
   return _mm_cmple_pd(A, B);
 }
 
@@ -212,7 +287,10 @@ __m128i test_mm_cmplt_epi32(__m128i A, __m128i B) {
 
 __m128d test_mm_cmplt_pd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_cmplt_pd
-  // CHECK: call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 1)
+  // CHECK:         [[CMP:%.*]] = fcmp olt <2 x double>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
+  // CHECK-NEXT:    ret <2 x double> [[BC]]
   return _mm_cmplt_pd(A, B);
 }
 
@@ -224,7 +302,10 @@ __m128d test_mm_cmplt_sd(__m128d A, __m128d B) {
 
 __m128d test_mm_cmpneq_pd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_cmpneq_pd
-  // CHECK: call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 4)
+  // CHECK:         [[CMP:%.*]] = fcmp une <2 x double>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
+  // CHECK-NEXT:    ret <2 x double> [[BC]]
   return _mm_cmpneq_pd(A, B);
 }
 
@@ -236,31 +317,48 @@ __m128d test_mm_cmpneq_sd(__m128d A, __m128d B) {
 
 __m128d test_mm_cmpnge_pd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_cmpnge_pd
-  // CHECK: call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 6)
+  // CHECK:         [[CMP:%.*]] = fcmp ugt <2 x double>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
+  // CHECK-NEXT:    ret <2 x double> [[BC]]
   return _mm_cmpnge_pd(A, B);
 }
 
 __m128d test_mm_cmpnge_sd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_cmpnge_sd
   // CHECK: call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 6)
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: insertelement <2 x double> undef, double %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 1
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 1
   return _mm_cmpnge_sd(A, B);
 }
 
 __m128d test_mm_cmpngt_pd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_cmpngt_pd
-  // CHECK: call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 5)
+  // CHECK:         [[CMP:%.*]] = fcmp uge <2 x double>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
+  // CHECK-NEXT:    ret <2 x double> [[BC]]
   return _mm_cmpngt_pd(A, B);
 }
 
 __m128d test_mm_cmpngt_sd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_cmpngt_sd
   // CHECK: call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 5)
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: insertelement <2 x double> undef, double %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 1
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 1
   return _mm_cmpngt_sd(A, B);
 }
 
 __m128d test_mm_cmpnle_pd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_cmpnle_pd
-  // CHECK: call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 6)
+  // CHECK:         [[CMP:%.*]] = fcmp ugt <2 x double>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
+  // CHECK-NEXT:    ret <2 x double> [[BC]]
   return _mm_cmpnle_pd(A, B);
 }
 
@@ -272,7 +370,10 @@ __m128d test_mm_cmpnle_sd(__m128d A, __m128d B) {
 
 __m128d test_mm_cmpnlt_pd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_cmpnlt_pd
-  // CHECK: call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 5)
+  // CHECK:         [[CMP:%.*]] = fcmp uge <2 x double>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
+  // CHECK-NEXT:    ret <2 x double> [[BC]]
   return _mm_cmpnlt_pd(A, B);
 }
 
@@ -284,7 +385,10 @@ __m128d test_mm_cmpnlt_sd(__m128d A, __m128d B) {
 
 __m128d test_mm_cmpord_pd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_cmpord_pd
-  // CHECK: call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 7)
+  // CHECK:         [[CMP:%.*]] = fcmp ord <2 x double>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
+  // CHECK-NEXT:    ret <2 x double> [[BC]]
   return _mm_cmpord_pd(A, B);
 }
 
@@ -296,7 +400,10 @@ __m128d test_mm_cmpord_sd(__m128d A, __m128d B) {
 
 __m128d test_mm_cmpunord_pd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_cmpunord_pd
-  // CHECK: call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 3)
+  // CHECK:         [[CMP:%.*]] = fcmp uno <2 x double>
+  // CHECK-NEXT:    [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i64>
+  // CHECK-NEXT:    [[BC:%.*]] = bitcast <2 x i64> [[SEXT]] to <2 x double>
+  // CHECK-NEXT:    ret <2 x double> [[BC]]
   return _mm_cmpunord_pd(A, B);
 }
 
@@ -308,73 +415,75 @@ __m128d test_mm_cmpunord_sd(__m128d A, __m128d B) {
 
 int test_mm_comieq_sd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_comieq_sd
-  // CHECK: call i32 @llvm.x86.sse2.comieq.sd
+  // CHECK: call i32 @llvm.x86.sse2.comieq.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_comieq_sd(A, B);
 }
 
 int test_mm_comige_sd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_comige_sd
-  // CHECK: call i32 @llvm.x86.sse2.comige.sd
+  // CHECK: call i32 @llvm.x86.sse2.comige.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_comige_sd(A, B);
 }
 
 int test_mm_comigt_sd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_comigt_sd
-  // CHECK: call i32 @llvm.x86.sse2.comigt.sd
+  // CHECK: call i32 @llvm.x86.sse2.comigt.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_comigt_sd(A, B);
 }
 
 int test_mm_comile_sd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_comile_sd
-  // CHECK: call i32 @llvm.x86.sse2.comile.sd
+  // CHECK: call i32 @llvm.x86.sse2.comile.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_comile_sd(A, B);
 }
 
 int test_mm_comilt_sd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_comilt_sd
-  // CHECK: call i32 @llvm.x86.sse2.comilt.sd
+  // CHECK: call i32 @llvm.x86.sse2.comilt.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_comilt_sd(A, B);
 }
 
 int test_mm_comineq_sd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_comineq_sd
-  // CHECK: call i32 @llvm.x86.sse2.comineq.sd
+  // CHECK: call i32 @llvm.x86.sse2.comineq.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_comineq_sd(A, B);
 }
 
 __m128d test_mm_cvtepi32_pd(__m128i A) {
   // CHECK-LABEL: test_mm_cvtepi32_pd
-  // CHECK: call <2 x double> @llvm.x86.sse2.cvtdq2pd
+  // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <2 x i32> <i32 0, i32 1>
+  // CHECK: sitofp <2 x i32> %{{.*}} to <2 x double>
   return _mm_cvtepi32_pd(A);
 }
 
 __m128 test_mm_cvtepi32_ps(__m128i A) {
   // CHECK-LABEL: test_mm_cvtepi32_ps
-  // CHECK: call <4 x float> @llvm.x86.sse2.cvtdq2ps
+  // CHECK: call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> %{{.*}})
   return _mm_cvtepi32_ps(A);
 }
 
 __m128i test_mm_cvtpd_epi32(__m128d A) {
   // CHECK-LABEL: test_mm_cvtpd_epi32
-  // CHECK: call <4 x i32> @llvm.x86.sse2.cvtpd2dq
+  // CHECK: call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %{{.*}})
   return _mm_cvtpd_epi32(A);
 }
 
 __m128 test_mm_cvtpd_ps(__m128d A) {
   // CHECK-LABEL: test_mm_cvtpd_ps
-  // CHECK: call <4 x float> @llvm.x86.sse2.cvtpd2ps
+  // CHECK: call <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double> %{{.*}})
   return _mm_cvtpd_ps(A);
 }
 
 __m128i test_mm_cvtps_epi32(__m128 A) {
   // CHECK-LABEL: test_mm_cvtps_epi32
-  // CHECK: call <4 x i32> @llvm.x86.sse2.cvtps2dq
+  // CHECK: call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %{{.*}})
   return _mm_cvtps_epi32(A);
 }
 
 __m128d test_mm_cvtps_pd(__m128 A) {
   // CHECK-LABEL: test_mm_cvtps_pd
-  // CHECK: call <2 x double> @llvm.x86.sse2.cvtps2pd
+  // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <2 x i32> <i32 0, i32 1>
+  // CHECK: fpext <2 x float> %{{.*}} to <2 x double>
   return _mm_cvtps_pd(A);
 }
 
@@ -386,13 +495,13 @@ double test_mm_cvtsd_f64(__m128d A) {
 
 int test_mm_cvtsd_si32(__m128d A) {
   // CHECK-LABEL: test_mm_cvtsd_si32
-  // CHECK: call i32 @llvm.x86.sse2.cvtsd2si
+  // CHECK: call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %{{.*}})
   return _mm_cvtsd_si32(A);
 }
 
 long long test_mm_cvtsd_si64(__m128d A) {
   // CHECK-LABEL: test_mm_cvtsd_si64
-  // CHECK: call i64 @llvm.x86.sse2.cvtsd2si64
+  // CHECK: call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %{{.*}})
   return _mm_cvtsd_si64(A);
 }
 
@@ -424,6 +533,9 @@ __m128d test_mm_cvtsi32_sd(__m128d A, int B) {
 __m128i test_mm_cvtsi32_si128(int A) {
   // CHECK-LABEL: test_mm_cvtsi32_si128
   // CHECK: insertelement <4 x i32> undef, i32 %{{.*}}, i32 0
+  // CHECK: insertelement <4 x i32> %{{.*}}, i32 0, i32 1
+  // CHECK: insertelement <4 x i32> %{{.*}}, i32 0, i32 2
+  // CHECK: insertelement <4 x i32> %{{.*}}, i32 0, i32 3
   return _mm_cvtsi32_si128(A);
 }
 
@@ -437,6 +549,7 @@ __m128d test_mm_cvtsi64_sd(__m128d A, long long B) {
 __m128i test_mm_cvtsi64_si128(long long A) {
   // CHECK-LABEL: test_mm_cvtsi64_si128
   // CHECK: insertelement <2 x i64> undef, i64 %{{.*}}, i32 0
+  // CHECK: insertelement <2 x i64> %{{.*}}, i64 0, i32 1
   return _mm_cvtsi64_si128(A);
 }
 
@@ -450,13 +563,13 @@ __m128d test_mm_cvtss_sd(__m128d A, __m128 B) {
 
 __m128i test_mm_cvttpd_epi32(__m128d A) {
   // CHECK-LABEL: test_mm_cvttpd_epi32
-  // CHECK: call <4 x i32> @llvm.x86.sse2.cvttpd2dq
+  // CHECK: call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %{{.*}})
   return _mm_cvttpd_epi32(A);
 }
 
 __m128i test_mm_cvttps_epi32(__m128 A) {
   // CHECK-LABEL: test_mm_cvttps_epi32
-  // CHECK: call <4 x i32> @llvm.x86.sse2.cvttps2dq
+  // CHECK: fptosi <4 x float> %{{.*}} to <4 x i32>
   return _mm_cvttps_epi32(A);
 }
 
@@ -482,7 +595,10 @@ __m128d test_mm_div_pd(__m128d A, __m128d B) {
 
 __m128d test_mm_div_sd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_div_sd
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
   // CHECK: fdiv double
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
   return _mm_div_sd(A, B);
 }
 
@@ -491,10 +607,11 @@ int test_mm_extract_epi16(__m128i A) {
   // CHECK-LABEL: test_mm_extract_epi16
   // CHECK: [[x:%.*]] = and i32 %{{.*}}, 7
   // CHECK: extractelement <8 x i16> %{{.*}}, i32 [[x]]
-  return _mm_extract_epi16(A, 8);
+  // CHECK: zext i16 %{{.*}} to i32
+  return _mm_extract_epi16(A, 9);
 }
 
-__m128i test_mm_insert_epi16(__m128i A, short B) {
+__m128i test_mm_insert_epi16(__m128i A, int B) {
   // CHECK-LABEL: test_mm_insert_epi16
   // CHECK: [[x:%.*]] = and i32 %{{.*}}, 7
   // CHECK: insertelement <8 x i16> %{{.*}}, i32 [[x]]
@@ -513,9 +630,17 @@ __m128d test_mm_load_pd(double const* A) {
   return _mm_load_pd(A);
 }
 
+__m128d test_mm_load_pd1(double const* A) {
+  // CHECK-LABEL: test_mm_load_pd1
+  // CHECK: load double, double* %{{.*}}, align 8
+  // CHECK: insertelement <2 x double> undef, double %{{.*}}, i32 0
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 1
+  return _mm_load_pd1(A);
+}
+
 __m128d test_mm_load_sd(double const* A) {
   // CHECK-LABEL: test_mm_load_sd
-  // CHECK: load double, double* %{{.*}}, align 1
+  // CHECK: load double, double* %{{.*}}, align 1{{$}}
   return _mm_load_sd(A);
 }
 
@@ -536,9 +661,27 @@ __m128d test_mm_load1_pd(double const* A) {
 __m128d test_mm_loadh_pd(__m128d x, void* y) {
   // CHECK-LABEL: test_mm_loadh_pd
   // CHECK: load double, double* %{{.*}}, align 1{{$}}
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 1
   return _mm_loadh_pd(x, y);
 }
 
+__m128i test_mm_loadl_epi64(__m128i* y) {
+  // CHECK: test_mm_loadl_epi64
+  // CHECK: load i64, i64* {{.*}}, align 1{{$}}
+  // CHECK: insertelement <2 x i64> undef, i64 {{.*}}, i32 0
+  // CHECK: insertelement <2 x i64> {{.*}}, i64 0, i32 1
+  return _mm_loadl_epi64(y);
+}
+
+__m128d test_mm_loadl_pd(__m128d x, void* y) {
+  // CHECK-LABEL: test_mm_loadl_pd
+  // CHECK: load double, double* %{{.*}}, align 1{{$}}
+  // CHECK: insertelement <2 x double> undef, double %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 1
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 1
+  return _mm_loadl_pd(x, y);
+}
+
 __m128d test_mm_loadr_pd(double const* A) {
   // CHECK-LABEL: test_mm_loadr_pd
   // CHECK: load <2 x double>, <2 x double>* %{{.*}}, align 16
@@ -548,16 +691,24 @@ __m128d test_mm_loadr_pd(double const* A) {
 
 __m128d test_mm_loadu_pd(double const* A) {
   // CHECK-LABEL: test_mm_loadu_pd
-  // CHECK: load <2 x double>, <2 x double>* %{{.*}}, align 1
+  // CHECK: load <2 x double>, <2 x double>* %{{.*}}, align 1{{$}}
   return _mm_loadu_pd(A);
 }
 
 __m128i test_mm_loadu_si128(__m128i const* A) {
   // CHECK-LABEL: test_mm_loadu_si128
-  // CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 1
+  // CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 1{{$}}
   return _mm_loadu_si128(A);
 }
 
+__m128i test_mm_loadu_si64(void const* A) {
+  // CHECK-LABEL: test_mm_loadu_si64
+  // CHECK: load i64, i64* %{{.*}}, align 1{{$}}
+  // CHECK: insertelement <2 x i64> undef, i64 %{{.*}}, i32 0
+  // CHECK: insertelement <2 x i64> %{{.*}}, i64 0, i32 1
+  return _mm_loadu_si64(A);
+}
+
 __m128i test_mm_madd_epi16(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_madd_epi16
   // CHECK: call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
@@ -572,13 +723,15 @@ void test_mm_maskmoveu_si128(__m128i A, __m128i B, char* C) {
 
 __m128i test_mm_max_epi16(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_max_epi16
-  // CHECK: call <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
+  // CHECK:       [[CMP:%.*]] = icmp sgt <8 x i16> [[X:%.*]], [[Y:%.*]]
+  // CHECK-NEXT:  select <8 x i1> [[CMP]], <8 x i16> [[X]], <8 x i16> [[Y]]
   return _mm_max_epi16(A, B);
 }
 
 __m128i test_mm_max_epu8(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_max_epu8
-  // CHECK: call <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
+  // CHECK:       [[CMP:%.*]] = icmp ugt <16 x i8> [[X:%.*]], [[Y:%.*]]
+  // CHECK-NEXT:  select <16 x i1> [[CMP]], <16 x i8> [[X]], <16 x i8> [[Y]]
   return _mm_max_epu8(A, B);
 }
 
@@ -602,13 +755,15 @@ void test_mm_mfence() {
 
 __m128i test_mm_min_epi16(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_min_epi16
-  // CHECK: call <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
+  // CHECK:       [[CMP:%.*]] = icmp slt <8 x i16> [[X:%.*]], [[Y:%.*]]
+  // CHECK-NEXT:  select <8 x i1> [[CMP]], <8 x i16> [[X]], <8 x i16> [[Y]]
   return _mm_min_epi16(A, B);
 }
 
 __m128i test_mm_min_epu8(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_min_epu8
-  // CHECK: call <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
+  // CHECK:       [[CMP:%.*]] = icmp ult <16 x i8> [[X:%.*]], [[Y:%.*]]
+  // CHECK-NEXT:  select <16 x i1> [[CMP]], <16 x i8> [[X]], <16 x i8> [[Y]]
   return _mm_min_epu8(A, B);
 }
 
@@ -624,6 +779,21 @@ __m128d test_mm_min_sd(__m128d A, __m128d B) {
   return _mm_min_sd(A, B);
 }
 
+__m128i test_mm_move_epi64(__m128i A) {
+  // CHECK-LABEL: test_mm_move_epi64
+  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i32> <i32 0, i32 2>
+  return _mm_move_epi64(A);
+}
+
+__m128d test_mm_move_sd(__m128d A, __m128d B) {
+  // CHECK-LABEL: test_mm_move_sd
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: insertelement <2 x double> undef, double %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 1
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 1
+  return _mm_move_sd(A, B);
+}
+
 int test_mm_movemask_epi8(__m128i A) {
   // CHECK-LABEL: test_mm_movemask_epi8
   // CHECK: call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %{{.*}})
@@ -650,7 +820,10 @@ __m128d test_mm_mul_pd(__m128d A, __m128d B) {
 
 __m128d test_mm_mul_sd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_mul_sd
-  // CHECK: fmul double %{{.*}}, %{{.*}}
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: fmul double
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
   return _mm_mul_sd(A, B);
 }
 
@@ -714,6 +887,206 @@ __m128i test_mm_sad_epu8(__m128i A, __m128i B) {
   return _mm_sad_epu8(A, B);
 }
 
+__m128i test_mm_set_epi8(char A, char B, char C, char D,
+                         char E, char F, char G, char H,
+                         char I, char J, char K, char L,
+                         char M, char N, char O, char P) {
+  // CHECK-LABEL: test_mm_set_epi8
+  // CHECK: insertelement <16 x i8> undef, i8 %{{.*}}, i32 0
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 1
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 2
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 3
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 4
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 5
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 6
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 7
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 8
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 9
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 10
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 11
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 12
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 13
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 14
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 15
+  return _mm_set_epi8(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P);
+}
+
+__m128i test_mm_set_epi16(short A, short B, short C, short D,
+                          short E, short F, short G, short H) {
+  // CHECK-LABEL: test_mm_set_epi16
+  // CHECK: insertelement <8 x i16> undef, i16 %{{.*}}, i32 0
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 1
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 2
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 3
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 4
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 5
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 6
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 7
+  return _mm_set_epi16(A, B, C, D, E, F, G, H);
+}
+
+__m128i test_mm_set_epi32(int A, int B, int C, int D) {
+  // CHECK-LABEL: test_mm_set_epi32
+  // CHECK: insertelement <4 x i32> undef, i32 %{{.*}}, i32 0
+  // CHECK: insertelement <4 x i32> %{{.*}}, i32 %{{.*}}, i32 1
+  // CHECK: insertelement <4 x i32> %{{.*}}, i32 %{{.*}}, i32 2
+  // CHECK: insertelement <4 x i32> %{{.*}}, i32 %{{.*}}, i32 3
+  return _mm_set_epi32(A, B, C, D);
+}
+
+__m128i test_mm_set_epi64(__m64 A, __m64 B) {
+  // CHECK-LABEL: test_mm_set_epi64
+  // CHECK: insertelement <2 x i64> undef, i64 %{{.*}}, i32 0
+  // CHECK: insertelement <2 x i64> %{{.*}}, i64 %{{.*}}, i32 1
+  return _mm_set_epi64(A, B);
+}
+
+__m128i test_mm_set_epi64x(long long A, long long B) {
+  // CHECK-LABEL: test_mm_set_epi64x
+  // CHECK: insertelement <2 x i64> undef, i64 %{{.*}}, i32 0
+  // CHECK: insertelement <2 x i64> %{{.*}}, i64 %{{.*}}, i32 1
+  return _mm_set_epi64x(A, B);
+}
+
+__m128d test_mm_set_pd(double A, double B) {
+  // CHECK-LABEL: test_mm_set_pd
+  // CHECK: insertelement <2 x double> undef, double %{{.*}}, i32 0
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 1
+  return _mm_set_pd(A, B);
+}
+
+__m128d test_mm_set_sd(double A) {
+  // CHECK-LABEL: test_mm_set_sd
+  // CHECK: insertelement <2 x double> undef, double %{{.*}}, i32 0
+  // CHECK: insertelement <2 x double> %{{.*}}, double 0.000000e+00, i32 1
+  return _mm_set_sd(A);
+}
+
+__m128i test_mm_set1_epi8(char A) {
+  // CHECK-LABEL: test_mm_set1_epi8
+  // CHECK: insertelement <16 x i8> undef, i8 %{{.*}}, i32 0
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 1
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 2
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 3
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 4
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 5
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 6
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 7
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 8
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 9
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 10
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 11
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 12
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 13
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 14
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 15
+  return _mm_set1_epi8(A);
+}
+
+__m128i test_mm_set1_epi16(short A) {
+  // CHECK-LABEL: test_mm_set1_epi16
+  // CHECK: insertelement <8 x i16> undef, i16 %{{.*}}, i32 0
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 1
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 2
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 3
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 4
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 5
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 6
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 7
+  return _mm_set1_epi16(A);
+}
+
+__m128i test_mm_set1_epi32(int A) {
+  // CHECK-LABEL: test_mm_set1_epi32
+  // CHECK: insertelement <4 x i32> undef, i32 %{{.*}}, i32 0
+  // CHECK: insertelement <4 x i32> %{{.*}}, i32 %{{.*}}, i32 1
+  // CHECK: insertelement <4 x i32> %{{.*}}, i32 %{{.*}}, i32 2
+  // CHECK: insertelement <4 x i32> %{{.*}}, i32 %{{.*}}, i32 3
+  return _mm_set1_epi32(A);
+}
+
+__m128i test_mm_set1_epi64(__m64 A) {
+  // CHECK-LABEL: test_mm_set1_epi64
+  // CHECK: insertelement <2 x i64> undef, i64 %{{.*}}, i32 0
+  // CHECK: insertelement <2 x i64> %{{.*}}, i64 %{{.*}}, i32 1
+  return _mm_set1_epi64(A);
+}
+
+__m128i test_mm_set1_epi64x(long long A) {
+  // CHECK-LABEL: test_mm_set1_epi64x
+  // CHECK: insertelement <2 x i64> undef, i64 %{{.*}}, i32 0
+  // CHECK: insertelement <2 x i64> %{{.*}}, i64 %{{.*}}, i32 1
+  return _mm_set1_epi64x(A);
+}
+
+__m128d test_mm_set1_pd(double A) {
+  // CHECK-LABEL: test_mm_set1_pd
+  // CHECK: insertelement <2 x double> undef, double %{{.*}}, i32 0
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 1
+  return _mm_set1_pd(A);
+}
+
+__m128i test_mm_setr_epi8(char A, char B, char C, char D,
+                          char E, char F, char G, char H,
+                          char I, char J, char K, char L,
+                          char M, char N, char O, char P) {
+  // CHECK-LABEL: test_mm_setr_epi8
+  // CHECK: insertelement <16 x i8> undef, i8 %{{.*}}, i32 0
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 1
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 2
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 3
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 4
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 5
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 6
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 7
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 8
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 9
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 10
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 11
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 12
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 13
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 14
+  // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 15
+  return _mm_setr_epi8(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P);
+}
+
+__m128i test_mm_setr_epi16(short A, short B, short C, short D,
+                           short E, short F, short G, short H) {
+  // CHECK-LABEL: test_mm_setr_epi16
+  // CHECK: insertelement <8 x i16> undef, i16 %{{.*}}, i32 0
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 1
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 2
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 3
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 4
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 5
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 6
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 7
+  return _mm_setr_epi16(A, B, C, D, E, F, G, H);
+}
+
+__m128i test_mm_setr_epi32(int A, int B, int C, int D) {
+  // CHECK-LABEL: test_mm_setr_epi32
+  // CHECK: insertelement <4 x i32> undef, i32 %{{.*}}, i32 0
+  // CHECK: insertelement <4 x i32> %{{.*}}, i32 %{{.*}}, i32 1
+  // CHECK: insertelement <4 x i32> %{{.*}}, i32 %{{.*}}, i32 2
+  // CHECK: insertelement <4 x i32> %{{.*}}, i32 %{{.*}}, i32 3
+  return _mm_setr_epi32(A, B, C, D);
+}
+
+__m128i test_mm_setr_epi64(__m64 A, __m64 B) {
+  // CHECK-LABEL: test_mm_setr_epi64
+  // CHECK: insertelement <2 x i64> undef, i64 %{{.*}}, i32 0
+  // CHECK: insertelement <2 x i64> %{{.*}}, i64 %{{.*}}, i32 1
+  return _mm_setr_epi64(A, B);
+}
+
+__m128d test_mm_setr_pd(double A, double B) {
+  // CHECK-LABEL: test_mm_setr_pd
+  // CHECK: insertelement <2 x double> undef, double %{{.*}}, i32 0
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 1
+  return _mm_setr_pd(A, B);
+}
+
 __m128d test_mm_setzero_pd() {
   // CHECK-LABEL: test_mm_setzero_pd
   // CHECK: store <2 x double> zeroinitializer
@@ -752,37 +1125,37 @@ __m128i test_mm_shufflelo_epi16(__m128i A) {
 
 __m128i test_mm_sll_epi16(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_sll_epi16
-  // CHECK: call <8 x i16> @llvm.x86.sse2.psll.w
+  // CHECK: call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_sll_epi16(A, B);
 }
 
 __m128i test_mm_sll_epi32(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_sll_epi32
-  // CHECK: call <4 x i32> @llvm.x86.sse2.psll.d
+  // CHECK: call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_sll_epi32(A, B);
 }
 
 __m128i test_mm_sll_epi64(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_sll_epi64
-  // CHECK: call <2 x i64> @llvm.x86.sse2.psll.q
+  // CHECK: call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
   return _mm_sll_epi64(A, B);
 }
 
 __m128i test_mm_slli_epi16(__m128i A) {
   // CHECK-LABEL: test_mm_slli_epi16
-  // CHECK: call <8 x i16> @llvm.x86.sse2.pslli.w
+  // CHECK: call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %{{.*}}, i32 %{{.*}})
   return _mm_slli_epi16(A, 1);
 }
 
 __m128i test_mm_slli_epi32(__m128i A) {
   // CHECK-LABEL: test_mm_slli_epi32
-  // CHECK: call <4 x i32> @llvm.x86.sse2.pslli.d
+  // CHECK: call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %{{.*}}, i32 %{{.*}})
   return _mm_slli_epi32(A, 1);
 }
 
 __m128i test_mm_slli_epi64(__m128i A) {
   // CHECK-LABEL: test_mm_slli_epi64
-  // CHECK: call <2 x i64> @llvm.x86.sse2.pslli.q
+  // CHECK: call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %{{.*}}, i32 %{{.*}})
   return _mm_slli_epi64(A, 1);
 }
 
@@ -792,6 +1165,12 @@ __m128i test_mm_slli_si128(__m128i A) {
   return _mm_slli_si128(A, 5);
 }
 
+__m128i test_mm_slli_si128_2(__m128i A) {
+  // CHECK-LABEL: test_mm_slli_si128_2
+  // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  return _mm_slli_si128(A, 17);
+}
+
 __m128d test_mm_sqrt_pd(__m128d A) {
   // CHECK-LABEL: test_mm_sqrt_pd
   // CHECK: call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %{{.*}})
@@ -801,66 +1180,70 @@ __m128d test_mm_sqrt_pd(__m128d A) {
 __m128d test_mm_sqrt_sd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_sqrt_sd
   // CHECK: call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %{{.*}})
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: insertelement <2 x double> undef, double %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 1
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 1
   return _mm_sqrt_sd(A, B);
 }
 
 __m128i test_mm_sra_epi16(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_sra_epi16
-  // CHECK: call <8 x i16> @llvm.x86.sse2.psra.w
+  // CHECK: call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_sra_epi16(A, B);
 }
 
 __m128i test_mm_sra_epi32(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_sra_epi32
-  // CHECK: call <4 x i32> @llvm.x86.sse2.psra.d
+  // CHECK: call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_sra_epi32(A, B);
 }
 
 __m128i test_mm_srai_epi16(__m128i A) {
   // CHECK-LABEL: test_mm_srai_epi16
-  // CHECK: call <8 x i16> @llvm.x86.sse2.psrai.w
+  // CHECK: call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %{{.*}}, i32 %{{.*}})
   return _mm_srai_epi16(A, 1);
 }
 
 __m128i test_mm_srai_epi32(__m128i A) {
   // CHECK-LABEL: test_mm_srai_epi32
-  // CHECK: call <4 x i32> @llvm.x86.sse2.psrai.d
+  // CHECK: call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %{{.*}}, i32 %{{.*}})
   return _mm_srai_epi32(A, 1);
 }
 
 __m128i test_mm_srl_epi16(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_srl_epi16
-  // CHECK: call <8 x i16> @llvm.x86.sse2.psrl.w
+  // CHECK: call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_srl_epi16(A, B);
 }
 
 __m128i test_mm_srl_epi32(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_srl_epi32
-  // CHECK: call <4 x i32> @llvm.x86.sse2.psrl.d
+  // CHECK: call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_srl_epi32(A, B);
 }
 
 __m128i test_mm_srl_epi64(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_srl_epi64
-  // CHECK: call <2 x i64> @llvm.x86.sse2.psrl.q
+  // CHECK: call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
   return _mm_srl_epi64(A, B);
 }
 
 __m128i test_mm_srli_epi16(__m128i A) {
   // CHECK-LABEL: test_mm_srli_epi16
-  // CHECK: call <8 x i16> @llvm.x86.sse2.psrli.w
+  // CHECK: call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %{{.*}}, i32 %{{.*}})
   return _mm_srli_epi16(A, 1);
 }
 
 __m128i test_mm_srli_epi32(__m128i A) {
   // CHECK-LABEL: test_mm_srli_epi32
-  // CHECK: call <4 x i32> @llvm.x86.sse2.psrli.d
+  // CHECK: call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %{{.*}}, i32 %{{.*}})
   return _mm_srli_epi32(A, 1);
 }
 
 __m128i test_mm_srli_epi64(__m128i A) {
   // CHECK-LABEL: test_mm_srli_epi64
-  // CHECK: call <2 x i64> @llvm.x86.sse2.psrli.q
+  // CHECK: call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %{{.*}}, i32 %{{.*}})
   return _mm_srli_epi64(A, 1);
 }
 
@@ -870,14 +1253,28 @@ __m128i test_mm_srli_si128(__m128i A) {
   return _mm_srli_si128(A, 5);
 }
 
+__m128i test_mm_srli_si128_2(__m128i A) {
+  // CHECK-LABEL: test_mm_srli_si128_2
+  // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  return _mm_srli_si128(A, 17);
+}
+
 void test_mm_store_pd(double* A, __m128d B) {
   // CHECK-LABEL: test_mm_store_pd
   // CHECK: store <2 x double> %{{.*}}, <2 x double>* %{{.*}}, align 16
   _mm_store_pd(A, B);
 }
 
+void test_mm_store_pd1(double* x, __m128d y) {
+  // CHECK-LABEL: test_mm_store_pd1
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> zeroinitializer
+  // CHECK: store <2 x double> %{{.*}}, <2 x double>* {{.*}}, align 16
+  _mm_store_pd1(x, y);
+}
+
 void test_mm_store_sd(double* A, __m128d B) {
   // CHECK-LABEL: test_mm_store_sd
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
   // CHECK: store double %{{.*}}, double* %{{.*}}, align 1{{$}}
   _mm_store_sd(A, B);
 }
@@ -888,27 +1285,52 @@ void test_mm_store_si128(__m128i* A, __m128i B) {
   _mm_store_si128(A, B);
 }
 
+void test_mm_store1_pd(double* x, __m128d y) {
+  // CHECK-LABEL: test_mm_store1_pd
+  // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> zeroinitializer
+  // CHECK: store <2 x double> %{{.*}}, <2 x double>* %{{.*}}, align 16
+  _mm_store1_pd(x, y);
+}
+
 void test_mm_storeh_pd(double* A, __m128d B) {
   // CHECK-LABEL: test_mm_storeh_pd
-  // CHECK: store double %{{.*}}, double* %{{.*}}, align 1
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 1
+  // CHECK: store double %{{.*}}, double* %{{.*}}, align 1{{$}}
   _mm_storeh_pd(A, B);
 }
 
+void test_mm_storel_epi64(__m128i x, void* y) {
+  // CHECK-LABEL: test_mm_storel_epi64
+  // CHECK: extractelement <2 x i64> %{{.*}}, i32 0
+  // CHECK: store {{.*}} i64* {{.*}}, align 1{{$}}
+  _mm_storel_epi64(y, x);
+}
+
 void test_mm_storel_pd(double* A, __m128d B) {
   // CHECK-LABEL: test_mm_storel_pd
-  // CHECK: store double %{{.*}}, double* %{{.*}}, align 1
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: store double %{{.*}}, double* %{{.*}}, align 1{{$}}
   _mm_storel_pd(A, B);
 }
 
+void test_mm_storer_pd(__m128d A, double* B) {
+  // CHECK-LABEL: test_mm_storer_pd
+  // CHECK: shufflevector <2 x double> {{.*}}, <2 x double> {{.*}}, <2 x i32> <i32 1, i32 0>
+  // CHECK: store {{.*}} <2 x double>* {{.*}}, align 16{{$}}
+  _mm_storer_pd(B, A);
+}
+
 void test_mm_storeu_pd(double* A, __m128d B) {
   // CHECK-LABEL: test_mm_storeu_pd
-  // CHECK: store <2 x double> %{{.*}}, <2 x double>* %{{.*}}, align 1
+  // CHECK: store {{.*}} <2 x double>* {{.*}}, align 1{{$}}
+  // CHECK-NEXT: ret void
   _mm_storeu_pd(A, B);
 }
 
 void test_mm_storeu_si128(__m128i* A, __m128i B) {
   // CHECK-LABEL: test_mm_storeu_si128
-  // CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 1
+  // CHECK: store <2 x i64> %{{.*}}, <2 x i64>* %{{.*}}, align 1{{$}}
+  // CHECK-NEXT: ret void
   _mm_storeu_si128(A, B);
 }
 
@@ -968,70 +1390,85 @@ __m128d test_mm_sub_pd(__m128d A, __m128d B) {
 
 __m128d test_mm_sub_sd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_sub_sd
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
+  // CHECK: extractelement <2 x double> %{{.*}}, i32 0
   // CHECK: fsub double
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 0
   return _mm_sub_sd(A, B);
 }
 
 __m128i test_mm_subs_epi8(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_subs_epi8
-  // CHECK: call <16 x i8> @llvm.x86.sse2.psubs.b
+  // CHECK: call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   return _mm_subs_epi8(A, B);
 }
 
 __m128i test_mm_subs_epi16(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_subs_epi16
-  // CHECK: call <8 x i16> @llvm.x86.sse2.psubs.w
+  // CHECK: call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_subs_epi16(A, B);
 }
 
 __m128i test_mm_subs_epu8(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_subs_epu8
-  // CHECK: call <16 x i8> @llvm.x86.sse2.psubus.b
+  // CHECK: call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   return _mm_subs_epu8(A, B);
 }
 
 __m128i test_mm_subs_epu16(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_subs_epu16
-  // CHECK: call <8 x i16> @llvm.x86.sse2.psubus.w
+  // CHECK: call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_subs_epu16(A, B);
 }
 
 int test_mm_ucomieq_sd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_ucomieq_sd
-  // CHECK: call i32 @llvm.x86.sse2.ucomieq.sd
+  // CHECK: call i32 @llvm.x86.sse2.ucomieq.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_ucomieq_sd(A, B);
 }
 
 int test_mm_ucomige_sd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_ucomige_sd
-  // CHECK: call i32 @llvm.x86.sse2.ucomige.sd
+  // CHECK: call i32 @llvm.x86.sse2.ucomige.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_ucomige_sd(A, B);
 }
 
 int test_mm_ucomigt_sd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_ucomigt_sd
-  // CHECK: call i32 @llvm.x86.sse2.ucomigt.sd
+  // CHECK: call i32 @llvm.x86.sse2.ucomigt.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_ucomigt_sd(A, B);
 }
 
 int test_mm_ucomile_sd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_ucomile_sd
-  // CHECK: call i32 @llvm.x86.sse2.ucomile.sd
+  // CHECK: call i32 @llvm.x86.sse2.ucomile.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_ucomile_sd(A, B);
 }
 
 int test_mm_ucomilt_sd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_ucomilt_sd
-  // CHECK: call i32 @llvm.x86.sse2.ucomilt.sd
+  // CHECK: call i32 @llvm.x86.sse2.ucomilt.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_ucomilt_sd(A, B);
 }
 
 int test_mm_ucomineq_sd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_ucomineq_sd
-  // CHECK: call i32 @llvm.x86.sse2.ucomineq.sd
+  // CHECK: call i32 @llvm.x86.sse2.ucomineq.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_ucomineq_sd(A, B);
 }
 
+__m128d test_mm_undefined_pd() {
+  // CHECK-LABEL: @test_mm_undefined_pd
+  // CHECK: ret <2 x double> undef
+  return _mm_undefined_pd();
+}
+
+__m128i test_mm_undefined_si128() {
+  // CHECK-LABEL: @test_mm_undefined_si128
+  // CHECK: ret <2 x i64> undef
+  return _mm_undefined_si128();
+}
+
 __m128i test_mm_unpackhi_epi8(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_unpackhi_epi8
   // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
diff --git a/test/CodeGen/sse3-builtins.c b/test/CodeGen/sse3-builtins.c
index 71a34e9372a25..b046c431fd50a 100644
--- a/test/CodeGen/sse3-builtins.c
+++ b/test/CodeGen/sse3-builtins.c
@@ -5,51 +5,55 @@
 
 #include <x86intrin.h>
 
+// NOTE: This should match the tests in llvm/test/CodeGen/X86/sse3-intrinsics-fast-isel.ll
+
 __m128d test_mm_addsub_pd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_addsub_pd
-  // CHECK: call <2 x double> @llvm.x86.sse3.addsub.pd
+  // CHECK: call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_addsub_pd(A, B);
 }
 
 __m128 test_mm_addsub_ps(__m128 A, __m128 B) {
   // CHECK-LABEL: test_mm_addsub_ps
-  // CHECK: call <4 x float> @llvm.x86.sse3.addsub.ps
+  // CHECK: call <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}})
   return _mm_addsub_ps(A, B);
 }
 
 __m128d test_mm_hadd_pd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_hadd_pd
-  // CHECK: call <2 x double> @llvm.x86.sse3.hadd.pd
+  // CHECK: call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_hadd_pd(A, B);
 }
 
 __m128 test_mm_hadd_ps(__m128 A, __m128 B) {
   // CHECK-LABEL: test_mm_hadd_ps
-  // CHECK: call <4 x float> @llvm.x86.sse3.hadd.ps
+  // CHECK: call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}})
   return _mm_hadd_ps(A, B);
 }
 
 __m128d test_mm_hsub_pd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_hsub_pd
-  // CHECK: call <2 x double> @llvm.x86.sse3.hsub.pd
+  // CHECK: call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_hsub_pd(A, B);
 }
 
 __m128 test_mm_hsub_ps(__m128 A, __m128 B) {
   // CHECK-LABEL: test_mm_hsub_ps
-  // CHECK: call <4 x float> @llvm.x86.sse3.hsub.ps
+  // CHECK: call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}})
   return _mm_hsub_ps(A, B);
 }
 
 __m128i test_mm_lddqu_si128(__m128i const* P) {
   // CHECK-LABEL: test_mm_lddqu_si128
-  // CHECK: call <16 x i8> @llvm.x86.sse3.ldu.dq
+  // CHECK: call <16 x i8> @llvm.x86.sse3.ldu.dq(i8* %{{.*}})
   return _mm_lddqu_si128(P);
 }
 
 __m128d test_mm_loaddup_pd(double const* P) {
   // CHECK-LABEL: test_mm_loaddup_pd
   // CHECK: load double*
+  // CHECK: insertelement <2 x double> undef, double %{{.*}}, i32 0
+  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 1
   return _mm_loaddup_pd(P);
 }
 
diff --git a/test/CodeGen/sse41-builtins.c b/test/CodeGen/sse41-builtins.c
index 9cd5c45659fc5..ad24ecd5ed1b1 100644
--- a/test/CodeGen/sse41-builtins.c
+++ b/test/CodeGen/sse41-builtins.c
@@ -6,6 +6,8 @@
 
 #include <x86intrin.h>
 
+// NOTE: This should match the tests in llvm/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll
+
 __m128i test_mm_blend_epi16(__m128i V1, __m128i V2) {
   // CHECK-LABEL: test_mm_blend_epi16
   // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 7>
@@ -26,140 +28,154 @@ __m128 test_mm_blend_ps(__m128 V1, __m128 V2) {
 
 __m128i test_mm_blendv_epi8(__m128i V1, __m128i V2, __m128i V3) {
   // CHECK-LABEL: test_mm_blendv_epi8
-  // CHECK: call <16 x i8> @llvm.x86.sse41.pblendvb
+  // CHECK: call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   return _mm_blendv_epi8(V1, V2, V3);
 }
 
 __m128d test_mm_blendv_pd(__m128d V1, __m128d V2, __m128d V3) {
   // CHECK-LABEL: test_mm_blendv_pd
-  // CHECK: call <2 x double> @llvm.x86.sse41.blendvpd
+  // CHECK: call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
   return _mm_blendv_pd(V1, V2, V3);
 }
 
 __m128 test_mm_blendv_ps(__m128 V1, __m128 V2, __m128 V3) {
   // CHECK-LABEL: test_mm_blendv_ps
-  // CHECK: call <4 x float> @llvm.x86.sse41.blendvps
+  // CHECK: call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
   return _mm_blendv_ps(V1, V2, V3);
 }
 
 __m128d test_mm_ceil_pd(__m128d x) {
   // CHECK-LABEL: test_mm_ceil_pd
-  // CHECK: call <2 x double> @llvm.x86.sse41.round.pd
+  // CHECK: call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %{{.*}}, i32 2)
   return _mm_ceil_pd(x);
 }
 
 __m128 test_mm_ceil_ps(__m128 x) {
   // CHECK-LABEL: test_mm_ceil_ps
-  // CHECK: call <4 x float> @llvm.x86.sse41.round.ps
+  // CHECK: call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %{{.*}}, i32 2)
   return _mm_ceil_ps(x);
 }
 
 __m128d test_mm_ceil_sd(__m128d x, __m128d y) {
   // CHECK-LABEL: test_mm_ceil_sd
-  // CHECK: call <2 x double> @llvm.x86.sse41.round.sd
+  // CHECK: call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i32 2)
   return _mm_ceil_sd(x, y);
 }
 
 __m128 test_mm_ceil_ss(__m128 x, __m128 y) {
   // CHECK-LABEL: test_mm_ceil_ss
-  // CHECK: call <4 x float> @llvm.x86.sse41.round.ss
+  // CHECK: call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i32 2)
   return _mm_ceil_ss(x, y);
 }
 
 __m128i test_mm_cmpeq_epi64(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_cmpeq_epi64
   // CHECK: icmp eq <2 x i64>
+  // CHECK: sext <2 x i1> %{{.*}} to <2 x i64>
   return _mm_cmpeq_epi64(A, B);
 }
 
 __m128i test_mm_cvtepi8_epi16(__m128i a) {
   // CHECK-LABEL: test_mm_cvtepi8_epi16
+  // CHECK: shufflevector <16 x i8> {{.*}}, <16 x i8> {{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   // CHECK: sext <8 x i8> {{.*}} to <8 x i16>
   return _mm_cvtepi8_epi16(a);
 }
 
 __m128i test_mm_cvtepi8_epi32(__m128i a) {
   // CHECK-LABEL: test_mm_cvtepi8_epi32
+  // CHECK: shufflevector <16 x i8> {{.*}}, <16 x i8> {{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   // CHECK: sext <4 x i8> {{.*}} to <4 x i32>
   return _mm_cvtepi8_epi32(a);
 }
 
 __m128i test_mm_cvtepi8_epi64(__m128i a) {
   // CHECK-LABEL: test_mm_cvtepi8_epi64
+  // CHECK: shufflevector <16 x i8> {{.*}}, <16 x i8> {{.*}}, <2 x i32> <i32 0, i32 1>
   // CHECK: sext <2 x i8> {{.*}} to <2 x i64>
   return _mm_cvtepi8_epi64(a);
 }
 
 __m128i test_mm_cvtepi16_epi32(__m128i a) {
   // CHECK-LABEL: test_mm_cvtepi16_epi32
+  // CHECK: shufflevector <8 x i16> {{.*}}, <8 x i16> {{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   // CHECK: sext <4 x i16> {{.*}} to <4 x i32>
   return _mm_cvtepi16_epi32(a);
 }
 
 __m128i test_mm_cvtepi16_epi64(__m128i a) {
   // CHECK-LABEL: test_mm_cvtepi16_epi64
+  // CHECK: shufflevector <8 x i16> {{.*}}, <8 x i16> {{.*}}, <2 x i32> <i32 0, i32 1>
   // CHECK: sext <2 x i16> {{.*}} to <2 x i64>
   return _mm_cvtepi16_epi64(a);
 }
 
 __m128i test_mm_cvtepi32_epi64(__m128i a) {
   // CHECK-LABEL: test_mm_cvtepi32_epi64
+  // CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> {{.*}}, <2 x i32> <i32 0, i32 1>
   // CHECK: sext <2 x i32> {{.*}} to <2 x i64>
   return _mm_cvtepi32_epi64(a);
 }
 
 __m128i test_mm_cvtepu8_epi16(__m128i a) {
   // CHECK-LABEL: test_mm_cvtepu8_epi16
-  // CHECK: call <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8> {{.*}})
+  // CHECK: shufflevector <16 x i8> {{.*}}, <16 x i8> {{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  // CHECK: zext <8 x i8> {{.*}} to <8 x i16>
   return _mm_cvtepu8_epi16(a);
 }
 
 __m128i test_mm_cvtepu8_epi32(__m128i a) {
   // CHECK-LABEL: test_mm_cvtepu8_epi32
-  // CHECK: call <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8> {{.*}})
+  // CHECK: shufflevector <16 x i8> {{.*}}, <16 x i8> {{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  // CHECK: zext <4 x i8> {{.*}} to <4 x i32>
   return _mm_cvtepu8_epi32(a);
 }
 
 __m128i test_mm_cvtepu8_epi64(__m128i a) {
   // CHECK-LABEL: test_mm_cvtepu8_epi64
-  // CHECK: call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> {{.*}})
+  // CHECK: shufflevector <16 x i8> {{.*}}, <16 x i8> {{.*}}, <2 x i32> <i32 0, i32 1>
+  // CHECK: zext <2 x i8> {{.*}} to <2 x i64>
   return _mm_cvtepu8_epi64(a);
 }
 
 __m128i test_mm_cvtepu16_epi32(__m128i a) {
   // CHECK-LABEL: test_mm_cvtepu16_epi32
-  // CHECK: call <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16> {{.*}})
+  // CHECK: shufflevector <8 x i16> {{.*}}, <8 x i16> {{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  // CHECK: zext <4 x i16> {{.*}} to <4 x i32>
   return _mm_cvtepu16_epi32(a);
 }
 
 __m128i test_mm_cvtepu16_epi64(__m128i a) {
   // CHECK-LABEL: test_mm_cvtepu16_epi64
-  // CHECK: call <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16> {{.*}})
+  // CHECK: shufflevector <8 x i16> {{.*}}, <8 x i16> {{.*}}, <2 x i32> <i32 0, i32 1>
+  // CHECK: zext <2 x i16> {{.*}} to <2 x i64>
   return _mm_cvtepu16_epi64(a);
 }
 
 __m128i test_mm_cvtepu32_epi64(__m128i a) {
   // CHECK-LABEL: test_mm_cvtepu32_epi64
-  // CHECK: call <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32> {{.*}})
+  // CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> {{.*}}, <2 x i32> <i32 0, i32 1>
+  // CHECK: zext <2 x i32> {{.*}} to <2 x i64>
   return _mm_cvtepu32_epi64(a);
 }
 
 __m128d test_mm_dp_pd(__m128d x, __m128d y) {
   // CHECK-LABEL: test_mm_dp_pd
-  // CHECK: call <2 x double> @llvm.x86.sse41.dppd
-  return _mm_dp_pd(x, y, 2);
+  // CHECK: call <2 x double> @llvm.x86.sse41.dppd(<2 x double> {{.*}}, <2 x double> {{.*}}, i8 7)
+  return _mm_dp_pd(x, y, 7);
 }
 
 __m128 test_mm_dp_ps(__m128 x, __m128 y) {
   // CHECK-LABEL: test_mm_dp_ps
-  // CHECK: call <4 x float> @llvm.x86.sse41.dpps
-  return _mm_dp_ps(x, y, 2);
+  // CHECK: call <4 x float> @llvm.x86.sse41.dpps(<4 x float> {{.*}}, <4 x float> {{.*}}, i8 7)
+  return _mm_dp_ps(x, y, 7);
 }
 
 int test_mm_extract_epi8(__m128i x) {
   // CHECK-LABEL: test_mm_extract_epi8
-  // CHECK: extractelement <16 x i8> %{{.*}}, i32 0
-  return _mm_extract_epi8(x, 16);
+  // CHECK: extractelement <16 x i8> %{{.*}}, i32 1
+  // CHECK: zext i8 %{{.*}} to i32
+  return _mm_extract_epi8(x, 1);
 }
 
 int test_mm_extract_epi32(__m128i x) {
@@ -174,32 +190,33 @@ long long test_mm_extract_epi64(__m128i x) {
   return _mm_extract_epi64(x, 1);
 }
 
-//TODO
-//int test_mm_extract_ps(__m128i x) {
-//  return _mm_extract_ps(_mm_add_ps(x,x), 1);
-//}
+int test_mm_extract_ps(__m128 x) {
+  // CHECK-LABEL: test_mm_extract_ps
+  // CHECK: extractelement <4 x float> %{{.*}}, i32 1
+  return _mm_extract_ps(x, 1);
+}
 
 __m128d test_mm_floor_pd(__m128d x) {
   // CHECK-LABEL: test_mm_floor_pd
-  // CHECK: call <2 x double> @llvm.x86.sse41.round.pd
+  // CHECK: call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %{{.*}}, i32 1)
   return _mm_floor_pd(x);
 }
 
 __m128 test_mm_floor_ps(__m128 x) {
   // CHECK-LABEL: test_mm_floor_ps
-  // CHECK: call <4 x float> @llvm.x86.sse41.round.ps
+  // CHECK: call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %{{.*}}, i32 1)
   return _mm_floor_ps(x);
 }
 
 __m128d test_mm_floor_sd(__m128d x, __m128d y) {
   // CHECK-LABEL: test_mm_floor_sd
-  // CHECK: call <2 x double> @llvm.x86.sse41.round.sd
+  // CHECK: call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i32 1)
   return _mm_floor_sd(x, y);
 }
 
 __m128 test_mm_floor_ss(__m128 x, __m128 y) {
   // CHECK-LABEL: test_mm_floor_ss
-  // CHECK: call <4 x float> @llvm.x86.sse41.round.ss
+  // CHECK: call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i32 1)
   return _mm_floor_ss(x, y);
 }
 
@@ -223,73 +240,81 @@ __m128i test_mm_insert_epi64(__m128i x, long long b) {
 
 __m128 test_mm_insert_ps(__m128 x, __m128 y) {
   // CHECK-LABEL: test_mm_insert_ps
-  // CHECK: call <4 x float> @llvm.x86.sse41.insertps
-  return _mm_insert_ps(x, y, 5);
+  // CHECK: call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 4)
+  return _mm_insert_ps(x, y, 4);
 }
 
 __m128i test_mm_max_epi8(__m128i x, __m128i y) {
   // CHECK-LABEL: test_mm_max_epi8
-  // CHECK: call <16 x i8> @llvm.x86.sse41.pmaxsb
+  // CHECK:       [[CMP:%.*]] = icmp sgt <16 x i8> [[X:%.*]], [[Y:%.*]]
+  // CHECK-NEXT:  select <16 x i1> [[CMP]], <16 x i8> [[X]], <16 x i8> [[Y]]
   return _mm_max_epi8(x, y);
 }
 
-__m128i test_mm_max_epu16(__m128i x, __m128i y) {
-  // CHECK-LABEL: test_mm_max_epu16
-  // CHECK: call <8 x i16> @llvm.x86.sse41.pmaxuw
-  return _mm_max_epu16(x, y);
-}
-
 __m128i test_mm_max_epi32(__m128i x, __m128i y) {
   // CHECK-LABEL: test_mm_max_epi32
-  // CHECK: call <4 x i32> @llvm.x86.sse41.pmaxsd
+  // CHECK:       [[CMP:%.*]] = icmp sgt <4 x i32> [[X:%.*]], [[Y:%.*]]
+  // CHECK-NEXT:  select <4 x i1> [[CMP]], <4 x i32> [[X]], <4 x i32> [[Y]]
   return _mm_max_epi32(x, y);
 }
 
+__m128i test_mm_max_epu16(__m128i x, __m128i y) {
+  // CHECK-LABEL: test_mm_max_epu16
+  // CHECK:       [[CMP:%.*]] = icmp ugt <8 x i16> [[X:%.*]], [[Y:%.*]]
+  // CHECK-NEXT:  select <8 x i1> [[CMP]], <8 x i16> [[X]], <8 x i16> [[Y]]
+  return _mm_max_epu16(x, y);
+}
+
 __m128i test_mm_max_epu32(__m128i x, __m128i y) {
   // CHECK-LABEL: test_mm_max_epu32
-  // CHECK: call <4 x i32> @llvm.x86.sse41.pmaxud
+  // CHECK:       [[CMP:%.*]] = icmp ugt <4 x i32> [[X:%.*]], [[Y:%.*]]
+  // CHECK-NEXT:  select <4 x i1> [[CMP]], <4 x i32> [[X]], <4 x i32> [[Y]]
   return _mm_max_epu32(x, y);
 }
 
 __m128i test_mm_min_epi8(__m128i x, __m128i y) {
   // CHECK-LABEL: test_mm_min_epi8
-  // CHECK: call <16 x i8> @llvm.x86.sse41.pminsb
+  // CHECK:       [[CMP:%.*]] = icmp slt <16 x i8> [[X:%.*]], [[Y:%.*]]
+  // CHECK-NEXT:  select <16 x i1> [[CMP]], <16 x i8> [[X]], <16 x i8> [[Y]]
   return _mm_min_epi8(x, y);
 }
 
-__m128i test_mm_min_epu16(__m128i x, __m128i y) {
-  // CHECK-LABEL: test_mm_min_epu16
-  // CHECK: call <8 x i16> @llvm.x86.sse41.pminuw
-  return _mm_min_epu16(x, y);
-}
-
 __m128i test_mm_min_epi32(__m128i x, __m128i y) {
   // CHECK-LABEL: test_mm_min_epi32
-  // CHECK: call <4 x i32> @llvm.x86.sse41.pminsd
+  // CHECK:       [[CMP:%.*]] = icmp slt <4 x i32> [[X:%.*]], [[Y:%.*]]
+  // CHECK-NEXT:  select <4 x i1> [[CMP]], <4 x i32> [[X]], <4 x i32> [[Y]]
   return _mm_min_epi32(x, y);
 }
 
+__m128i test_mm_min_epu16(__m128i x, __m128i y) {
+  // CHECK-LABEL: test_mm_min_epu16
+  // CHECK:       [[CMP:%.*]] = icmp ult <8 x i16> [[X:%.*]], [[Y:%.*]]
+  // CHECK-NEXT:  select <8 x i1> [[CMP]], <8 x i16> [[X]], <8 x i16> [[Y]]
+  return _mm_min_epu16(x, y);
+}
+
 __m128i test_mm_min_epu32(__m128i x, __m128i y) {
   // CHECK-LABEL: test_mm_min_epu32
-  // CHECK: call <4 x i32> @llvm.x86.sse41.pminud
+  // CHECK:       [[CMP:%.*]] = icmp ult <4 x i32> [[X:%.*]], [[Y:%.*]]
+  // CHECK-NEXT:  select <4 x i1> [[CMP]], <4 x i32> [[X]], <4 x i32> [[Y]]
   return _mm_min_epu32(x, y);
 }
 
 __m128i test_mm_minpos_epu16(__m128i x) {
   // CHECK-LABEL: test_mm_minpos_epu16
-  // CHECK: call <8 x i16> @llvm.x86.sse41.phminposuw
+  // CHECK: call <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16> %{{.*}})
   return _mm_minpos_epu16(x);
 }
 
 __m128i test_mm_mpsadbw_epu8(__m128i x, __m128i y) {
   // CHECK-LABEL: test_mm_mpsadbw_epu8
-  // CHECK: call <8 x i16> @llvm.x86.sse41.mpsadbw
+  // CHECK: call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i8 1)
   return _mm_mpsadbw_epu8(x, y, 1);
 }
 
 __m128i test_mm_mul_epi32(__m128i x, __m128i y) {
   // CHECK-LABEL: test_mm_mul_epi32
-  // CHECK: call <2 x i64> @llvm.x86.sse41.pmuldq
+  // CHECK: call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_mul_epi32(x, y);
 }
 
@@ -301,72 +326,72 @@ __m128i test_mm_mullo_epi32(__m128i x, __m128i y) {
 
 __m128i test_mm_packus_epi32(__m128i x, __m128i y) {
   // CHECK-LABEL: test_mm_packus_epi32
-  // CHECK: call <8 x i16> @llvm.x86.sse41.packusdw
+  // CHECK: call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_packus_epi32(x, y);
 }
 
 __m128d test_mm_round_pd(__m128d x) {
   // CHECK-LABEL: test_mm_round_pd
-  // CHECK: call <2 x double> @llvm.x86.sse41.round.pd
-  return _mm_round_pd(x, 2);
+  // CHECK: call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %{{.*}}, i32 4)
+  return _mm_round_pd(x, 4);
 }
 
 __m128 test_mm_round_ps(__m128 x) {
   // CHECK-LABEL: test_mm_round_ps
-  // CHECK: call <4 x float> @llvm.x86.sse41.round.ps
-  return _mm_round_ps(x, 2);
+  // CHECK: call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %{{.*}}, i32 4)
+  return _mm_round_ps(x, 4);
 }
 
 __m128d test_mm_round_sd(__m128d x, __m128d y) {
   // CHECK-LABEL: test_mm_round_sd
-  // CHECK: call <2 x double> @llvm.x86.sse41.round.sd
-  return _mm_round_sd(x, y, 2);
+  // CHECK: call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i32 4)
+  return _mm_round_sd(x, y, 4);
 }
 
 __m128 test_mm_round_ss(__m128 x, __m128 y) {
   // CHECK-LABEL: test_mm_round_ss
-  // CHECK: call <4 x float> @llvm.x86.sse41.round.ss
-  return _mm_round_ss(x, y, 2);
+  // CHECK: call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i32 4)
+  return _mm_round_ss(x, y, 4);
 }
 
 __m128i test_mm_stream_load_si128(__m128i const *a) {
   // CHECK-LABEL: test_mm_stream_load_si128
-  // CHECK: call <2 x i64> @llvm.x86.sse41.movntdqa
+  // CHECK: call <2 x i64> @llvm.x86.sse41.movntdqa(i8* %{{.*}})
   return _mm_stream_load_si128(a);
 }
 
 int test_mm_test_all_ones(__m128i x) {
   // CHECK-LABEL: test_mm_test_all_ones
-  // CHECK: call i32 @llvm.x86.sse41.ptestc
+  // CHECK: call i32 @llvm.x86.sse41.ptestc(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
   return _mm_test_all_ones(x);
 }
 
 int test_mm_test_all_zeros(__m128i x, __m128i y) {
   // CHECK-LABEL: test_mm_test_all_zeros
-  // CHECK: call i32 @llvm.x86.sse41.ptestz
+  // CHECK: call i32 @llvm.x86.sse41.ptestz(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
   return _mm_test_all_zeros(x, y);
 }
 
 int test_mm_test_mix_ones_zeros(__m128i x, __m128i y) {
   // CHECK-LABEL: test_mm_test_mix_ones_zeros
-  // CHECK: call i32 @llvm.x86.sse41.ptestnzc
+  // CHECK: call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
   return _mm_test_mix_ones_zeros(x, y);
 }
 
 int test_mm_testc_si128(__m128i x, __m128i y) {
   // CHECK-LABEL: test_mm_testc_si128
-  // CHECK: call i32 @llvm.x86.sse41.ptestc
+  // CHECK: call i32 @llvm.x86.sse41.ptestc(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
   return _mm_testc_si128(x, y);
 }
 
 int test_mm_testnzc_si128(__m128i x, __m128i y) {
   // CHECK-LABEL: test_mm_testnzc_si128
-  // CHECK: call i32 @llvm.x86.sse41.ptestnzc
+  // CHECK: call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
   return _mm_testnzc_si128(x, y);
 }
 
 int test_mm_testz_si128(__m128i x, __m128i y) {
   // CHECK-LABEL: test_mm_testz_si128
-  // CHECK: call i32 @llvm.x86.sse41.ptestz
+  // CHECK: call i32 @llvm.x86.sse41.ptestz(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
   return _mm_testz_si128(x, y);
 }
diff --git a/test/CodeGen/sse42-builtins.c b/test/CodeGen/sse42-builtins.c
index e3215ddaf72b4..00f7ff93afe2f 100644
--- a/test/CodeGen/sse42-builtins.c
+++ b/test/CodeGen/sse42-builtins.c
@@ -6,134 +6,118 @@
 
 #include <x86intrin.h>
 
-__m128i test_mm_cmpgt_epi8(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_cmpgt_epi8
-  // CHECK: icmp sgt <16 x i8>
-  return _mm_cmpgt_epi8(A, B);
-}
-
-__m128i test_mm_cmpgt_epi16(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_cmpgt_epi16
-  // CHECK: icmp sgt <8 x i16>
-  return _mm_cmpgt_epi16(A, B);
-}
-
-__m128i test_mm_cmpgt_epi32(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_cmpgt_epi32
-  // CHECK: icmp sgt <4 x i32>
-  return _mm_cmpgt_epi32(A, B);
-}
-
-__m128i test_mm_cmpgt_epi64(__m128i A, __m128i B) {
-  // CHECK-LABEL: test_mm_cmpgt_epi64
-  // CHECK: icmp sgt <2 x i64>
-  return _mm_cmpgt_epi64(A, B);
-}
+// NOTE: This should match the tests in llvm/test/CodeGen/X86/sse42-intrinsics-fast-isel.ll
 
 int test_mm_cmpestra(__m128i A, int LA, __m128i B, int LB) {
   // CHECK-LABEL: test_mm_cmpestra
-  // CHECK: @llvm.x86.sse42.pcmpestria128
+  // CHECK: call i32 @llvm.x86.sse42.pcmpestria128(<16 x i8> %{{.*}}, i32 %{{.*}}, <16 x i8> %{{.*}}, i32 %{{.*}}, i8 7)
   return _mm_cmpestra(A, LA, B, LB, 7);
 }
 
 int test_mm_cmpestrc(__m128i A, int LA, __m128i B, int LB) {
   // CHECK-LABEL: test_mm_cmpestrc
-  // CHECK: @llvm.x86.sse42.pcmpestric128
+  // CHECK: call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %{{.*}}, i32 %{{.*}}, <16 x i8> %{{.*}}, i32 %{{.*}}, i8 7)
   return _mm_cmpestrc(A, LA, B, LB, 7);
 }
 
 int test_mm_cmpestri(__m128i A, int LA, __m128i B, int LB) {
   // CHECK-LABEL: test_mm_cmpestri
-  // CHECK: @llvm.x86.sse42.pcmpestri128
+  // CHECK: call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %{{.*}}, i32 %{{.*}}, <16 x i8> %{{.*}}, i32 %{{.*}}, i8 7)
   return _mm_cmpestri(A, LA, B, LB, 7);
 }
 
 __m128i test_mm_cmpestrm(__m128i A, int LA, __m128i B, int LB) {
   // CHECK-LABEL: test_mm_cmpestrm
-  // CHECK: @llvm.x86.sse42.pcmpestrm128
+  // CHECK: call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %{{.*}}, i32 %{{.*}}, <16 x i8> %{{.*}}, i32 %{{.*}}, i8 7)
   return _mm_cmpestrm(A, LA, B, LB, 7);
 }
 
 int test_mm_cmpestro(__m128i A, int LA, __m128i B, int LB) {
   // CHECK-LABEL: test_mm_cmpestro
-  // CHECK: @llvm.x86.sse42.pcmpestrio128
+  // CHECK: call i32 @llvm.x86.sse42.pcmpestrio128(<16 x i8> %{{.*}}, i32 %{{.*}}, <16 x i8> %{{.*}}, i32 %{{.*}}, i8 7)
   return _mm_cmpestro(A, LA, B, LB, 7);
 }
 
 int test_mm_cmpestrs(__m128i A, int LA, __m128i B, int LB) {
   // CHECK-LABEL: test_mm_cmpestrs
-  // CHECK: @llvm.x86.sse42.pcmpestris128
+  // CHECK: call i32 @llvm.x86.sse42.pcmpestris128(<16 x i8> %{{.*}}, i32 %{{.*}}, <16 x i8> %{{.*}}, i32 %{{.*}}, i8 7)
   return _mm_cmpestrs(A, LA, B, LB, 7);
 }
 
 int test_mm_cmpestrz(__m128i A, int LA, __m128i B, int LB) {
   // CHECK-LABEL: test_mm_cmpestrz
-  // CHECK: @llvm.x86.sse42.pcmpestriz128
+  // CHECK: call i32 @llvm.x86.sse42.pcmpestriz128(<16 x i8> %{{.*}}, i32 %{{.*}}, <16 x i8> %{{.*}}, i32 %{{.*}}, i8 7)
   return _mm_cmpestrz(A, LA, B, LB, 7);
 }
 
+__m128i test_mm_cmpgt_epi64(__m128i A, __m128i B) {
+  // CHECK-LABEL: test_mm_cmpgt_epi64
+  // CHECK: icmp sgt <2 x i64>
+  return _mm_cmpgt_epi64(A, B);
+}
+
 int test_mm_cmpistra(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_cmpistra
-  // CHECK: @llvm.x86.sse42.pcmpistria128
+  // CHECK: call i32 @llvm.x86.sse42.pcmpistria128(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i8 7)
   return _mm_cmpistra(A, B, 7);
 }
 
 int test_mm_cmpistrc(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_cmpistrc
-  // CHECK: @llvm.x86.sse42.pcmpistric128
+  // CHECK: call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i8 7)
   return _mm_cmpistrc(A, B, 7);
 }
 
 int test_mm_cmpistri(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_cmpistri
-  // CHECK: @llvm.x86.sse42.pcmpistri128
+  // CHECK: call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i8 7)
   return _mm_cmpistri(A, B, 7);
 }
 
 __m128i test_mm_cmpistrm(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_cmpistrm
-  // CHECK: @llvm.x86.sse42.pcmpistrm128
+  // CHECK: call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i8 7)
   return _mm_cmpistrm(A, B, 7);
 }
 
 int test_mm_cmpistro(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_cmpistro
-  // CHECK: @llvm.x86.sse42.pcmpistrio128
+  // CHECK: call i32 @llvm.x86.sse42.pcmpistrio128(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i8 7)
   return _mm_cmpistro(A, B, 7);
 }
 
 int test_mm_cmpistrs(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_cmpistrs
-  // CHECK: @llvm.x86.sse42.pcmpistris128
+  // CHECK: call i32 @llvm.x86.sse42.pcmpistris128(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i8 7)
   return _mm_cmpistrs(A, B, 7);
 }
 
 int test_mm_cmpistrz(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_cmpistrz
-  // CHECK: @llvm.x86.sse42.pcmpistriz128
+  // CHECK: call i32 @llvm.x86.sse42.pcmpistriz128(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i8 7)
   return _mm_cmpistrz(A, B, 7);
 }
 
 unsigned int test_mm_crc32_u8(unsigned int CRC, unsigned char V) {
   // CHECK-LABEL: test_mm_crc32_u8
-  // CHECK: call i32 @llvm.x86.sse42.crc32.32.8
+  // CHECK: call i32 @llvm.x86.sse42.crc32.32.8(i32 %{{.*}}, i8 %{{.*}})
   return _mm_crc32_u8(CRC, V);
 }
 
 unsigned int test_mm_crc32_u16(unsigned int CRC, unsigned short V) {
   // CHECK-LABEL: test_mm_crc32_u16
-  // CHECK: call i32 @llvm.x86.sse42.crc32.32.16
+  // CHECK: call i32 @llvm.x86.sse42.crc32.32.16(i32 %{{.*}}, i16 %{{.*}})
   return _mm_crc32_u16(CRC, V);
 }
 
 unsigned int test_mm_crc32_u32(unsigned int CRC, unsigned int V) {
   // CHECK-LABEL: test_mm_crc32_u32
-  // CHECK: call i32 @llvm.x86.sse42.crc32.32.32
+  // CHECK: call i32 @llvm.x86.sse42.crc32.32.32(i32 %{{.*}}, i32 %{{.*}})
   return _mm_crc32_u32(CRC, V);
 }
 
-unsigned int test_mm_crc32_u64(unsigned long long CRC, unsigned long long V) {
+unsigned long long test_mm_crc32_u64(unsigned long long CRC, unsigned long long V) {
   // CHECK-LABEL: test_mm_crc32_u64
-  // CHECK: call i64 @llvm.x86.sse42.crc32.64.64
+  // CHECK: call i64 @llvm.x86.sse42.crc32.64.64(i64 %{{.*}}, i64 %{{.*}})
   return _mm_crc32_u64(CRC, V);
 }
diff --git a/test/CodeGen/sse4a-builtins.c b/test/CodeGen/sse4a-builtins.c
index 9a408b8bf4b56..0604423fe17e7 100644
--- a/test/CodeGen/sse4a-builtins.c
+++ b/test/CodeGen/sse4a-builtins.c
@@ -5,6 +5,8 @@
 
 #include <x86intrin.h>
 
+// NOTE: This should match the tests in llvm/test/CodeGen/X86/sse4a-intrinsics-fast-isel.ll
+
 __m128i test_mm_extracti_si64(__m128i x) {
   // CHECK-LABEL: test_mm_extracti_si64
   // CHECK: call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %{{[^,]+}}, i8 3, i8 2)
@@ -31,12 +33,14 @@ __m128i test_mm_insert_si64(__m128i x, __m128i y) {
 
 void test_mm_stream_sd(double *p, __m128d a) {
   // CHECK-LABEL: test_mm_stream_sd
-  // CHECK: call void @llvm.x86.sse4a.movnt.sd(i8* %{{[^,]+}}, <2 x double> %{{[^,]+}})
-  _mm_stream_sd(p, a);
+  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
+  // CHECK: store double %{{.*}}, double* %{{.*}}, align 1, !nontemporal
+   _mm_stream_sd(p, a);
 }
 
 void test_mm_stream_ss(float *p, __m128 a) {
   // CHECK-LABEL: test_mm_stream_ss
-  // CHECK: call void @llvm.x86.sse4a.movnt.ss(i8* %{{[^,]+}}, <4 x float> %{{[^,]+}})
+  // CHECK: extractelement <4 x float> %{{.*}}, i64 0
+  // CHECK: store float %{{.*}}, float* %{{.*}}, align 1, !nontemporal
   _mm_stream_ss(p, a);
 }
diff --git a/test/CodeGen/ssse3-builtins.c b/test/CodeGen/ssse3-builtins.c
index d4b27a1e855c0..673387c066b29 100644
--- a/test/CodeGen/ssse3-builtins.c
+++ b/test/CodeGen/ssse3-builtins.c
@@ -5,21 +5,23 @@
 
 #include <x86intrin.h>
 
+// NOTE: This should match the tests in llvm/test/CodeGen/X86/ssse3-intrinsics-fast-isel.ll
+
 __m128i test_mm_abs_epi8(__m128i a) {
   // CHECK-LABEL: test_mm_abs_epi8
-  // CHECK: call <16 x i8> @llvm.x86.ssse3.pabs.b.128
+  // CHECK: call <16 x i8> @llvm.x86.ssse3.pabs.b.128(<16 x i8> %{{.*}})
   return _mm_abs_epi8(a);
 }
 
 __m128i test_mm_abs_epi16(__m128i a) {
   // CHECK-LABEL: test_mm_abs_epi16
-  // CHECK: call <8 x i16> @llvm.x86.ssse3.pabs.w.128
+  // CHECK: call <8 x i16> @llvm.x86.ssse3.pabs.w.128(<8 x i16> %{{.*}})
   return _mm_abs_epi16(a);
 }
 
 __m128i test_mm_abs_epi32(__m128i a) {
   // CHECK-LABEL: test_mm_abs_epi32
-  // CHECK: call <4 x i32> @llvm.x86.ssse3.pabs.d.128
+  // CHECK: call <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32> %{{.*}})
   return _mm_abs_epi32(a);
 }
 
@@ -37,72 +39,72 @@ __m128i test2_mm_alignr_epi8(__m128i a, __m128i b) {
 
 __m128i test_mm_hadd_epi16(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_hadd_epi16
-  // CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.w.128
+  // CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_hadd_epi16(a, b);
 }
 
 __m128i test_mm_hadd_epi32(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_hadd_epi32
-  // CHECK: call <4 x i32> @llvm.x86.ssse3.phadd.d.128
+  // CHECK: call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_hadd_epi32(a, b);
 }
 
 __m128i test_mm_hadds_epi16(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_hadds_epi16
-  // CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.sw.128
+  // CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_hadds_epi16(a, b);
 }
 
 __m128i test_mm_hsub_epi16(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_hsub_epi16
-  // CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.w.128
+  // CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_hsub_epi16(a, b);
 }
 
 __m128i test_mm_hsub_epi32(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_hsub_epi32
-  // CHECK: call <4 x i32> @llvm.x86.ssse3.phsub.d.128
+  // CHECK: call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_hsub_epi32(a, b);
 }
 
 __m128i test_mm_hsubs_epi16(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_hsubs_epi16
-  // CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.sw.128
+  // CHECK: call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_hsubs_epi16(a, b);
 }
 
 __m128i test_mm_maddubs_epi16(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_maddubs_epi16
-  // CHECK: call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128
+  // CHECK: call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   return _mm_maddubs_epi16(a, b);
 }
 
 __m128i test_mm_mulhrs_epi16(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_mulhrs_epi16
-  // CHECK: call <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128
+  // CHECK: call <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_mulhrs_epi16(a, b);
 }
 
 __m128i test_mm_shuffle_epi8(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_shuffle_epi8
-  // CHECK: call <16 x i8> @llvm.x86.ssse3.pshuf.b.128
+  // CHECK: call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   return _mm_shuffle_epi8(a, b);
 }
 
 __m128i test_mm_sign_epi8(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_sign_epi8
-  // CHECK: call <16 x i8> @llvm.x86.ssse3.psign.b.128
+  // CHECK: call <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   return _mm_sign_epi8(a, b);
 }
 
 __m128i test_mm_sign_epi16(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_sign_epi16
-  // CHECK: call <8 x i16> @llvm.x86.ssse3.psign.w.128
+  // CHECK: call <8 x i16> @llvm.x86.ssse3.psign.w.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_sign_epi16(a, b);
 }
 
 __m128i test_mm_sign_epi32(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_sign_epi32
-  // CHECK: call <4 x i32> @llvm.x86.ssse3.psign.d.128
+  // CHECK: call <4 x i32> @llvm.x86.ssse3.psign.d.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_sign_epi32(a, b);
 }
diff --git a/test/CodeGen/stack-protector.c b/test/CodeGen/stack-protector.c
index ecfbc90faa288..7a45a2f4acf6a 100644
--- a/test/CodeGen/stack-protector.c
+++ b/test/CodeGen/stack-protector.c
@@ -1,13 +1,13 @@
-// RUN: %clang_cc1 -emit-llvm -o - %s -stack-protector 0 | FileCheck -check-prefix=NOSSP %s
-// NOSSP: define {{.*}}void @test1(i8* %msg) #0 {
-// RUN: %clang_cc1 -emit-llvm -o - %s -stack-protector 1 | FileCheck -check-prefix=WITHSSP %s
-// WITHSSP: define {{.*}}void @test1(i8* %msg) #0 {
-// RUN: %clang_cc1 -emit-llvm -o - %s -stack-protector 2 | FileCheck -check-prefix=SSPSTRONG %s
-// SSPSTRONG: define {{.*}}void @test1(i8* %msg) #0 {
-// RUN: %clang_cc1 -emit-llvm -o - %s -stack-protector 3 | FileCheck -check-prefix=SSPREQ %s
-// SSPREQ: define {{.*}}void @test1(i8* %msg) #0 {
-// RUN: %clang_cc1 -emit-llvm -o - %s -fsanitize=safe-stack | FileCheck -check-prefix=SAFESTACK %s
-// SAFESTACK: define {{.*}}void @test1(i8* %msg) #0 {
+// RUN: %clang_cc1 -emit-llvm -o - %s -stack-protector 0 | FileCheck -check-prefix=DEF -check-prefix=NOSSP %s
+// RUN: %clang_cc1 -emit-llvm -o - %s -stack-protector 1 | FileCheck -check-prefix=DEF -check-prefix=SSP %s
+// RUN: %clang_cc1 -emit-llvm -o - %s -stack-protector 2 | FileCheck -check-prefix=DEF -check-prefix=SSPSTRONG %s
+// RUN: %clang_cc1 -emit-llvm -o - %s -stack-protector 3 | FileCheck -check-prefix=DEF -check-prefix=SSPREQ %s
+
+// RUN: %clang_cc1 -emit-llvm -o - %s -fsanitize=safe-stack | FileCheck -check-prefix=DEF -check-prefix=SAFESTACK-NOSSP %s
+// RUN: %clang_cc1 -emit-llvm -o - %s -fsanitize=safe-stack -stack-protector 0 | FileCheck -check-prefix=DEF -check-prefix=SAFESTACK-NOSSP %s
+// RUN: %clang_cc1 -emit-llvm -o - %s -fsanitize=safe-stack -stack-protector 1 | FileCheck -check-prefix=DEF -check-prefix=SAFESTACK-SSP %s
+// RUN: %clang_cc1 -emit-llvm -o - %s -fsanitize=safe-stack -stack-protector 2 | FileCheck -check-prefix=DEF -check-prefix=SAFESTACK-SSPSTRONG %s
+// RUN: %clang_cc1 -emit-llvm -o - %s -fsanitize=safe-stack -stack-protector 3 | FileCheck -check-prefix=DEF -check-prefix=SAFESTACK-SSPREQ %s
 
 typedef __SIZE_TYPE__ size_t;
 
@@ -15,18 +15,21 @@ int printf(const char * _Format, ...);
 size_t strlen(const char *s);
 char *strcpy(char *s1, const char *s2);
 
+// DEF: define {{.*}}void @test1(i8* %msg) #[[A:.*]] {
 void test1(const char *msg) {
   char a[strlen(msg) + 1];
   strcpy(a, msg);
   printf("%s\n", a);
 }
 
-// NOSSP: attributes #{{.*}} = { nounwind{{.*}} }
-
-// WITHSSP: attributes #{{.*}} = { nounwind ssp{{.*}} }
-
-// SSPSTRONG: attributes #{{.*}} = { nounwind sspstrong{{.*}} }
+// NOSSP-NOT: attributes #[[A]] = {{.*}} ssp
+// SSP: attributes #[[A]] = {{.*}} ssp{{ }}
+// SSPSTRONG: attributes #[[A]] = {{.*}} sspstrong
+// SSPREQ: attributes #[[A]] = {{.*}} sspreq
 
-// SSPREQ: attributes #{{.*}} = { nounwind sspreq{{.*}} }
+// SAFESTACK-NOSSP: attributes #[[A]] = {{.*}} safestack
+// SAFESTACK-NOSSP-NOT: ssp
 
-// SAFESTACK: attributes #{{.*}} = { nounwind safestack{{.*}} }
+// SAFESTACK-SSP: attributes #[[A]] = {{.*}} safestack ssp{{ }}
+// SAFESTACK-SSPSTRONG: attributes #[[A]] = {{.*}} safestack sspstrong
+// SAFESTACK-SSPREQ: attributes #[[A]] = {{.*}} safestack sspreq
diff --git a/test/CodeGen/struct-union-BE.c b/test/CodeGen/struct-union-BE.c
new file mode 100644
index 0000000000000..69ab1e8dcc4d5
--- /dev/null
+++ b/test/CodeGen/struct-union-BE.c
@@ -0,0 +1,49 @@
+// RUN: %clang_cc1 -triple mips-linux-gnu  -S -emit-llvm %s -o - | FileCheck %s -check-prefix=MIPS
+// RUN: %clang_cc1 -triple mips64-linux-gnu  -S -emit-llvm %s -o - | FileCheck %s -check-prefix=MIPS64
+// RUN: %clang_cc1 -triple armebv7-linux-gnueabihf -S -emit-llvm %s -o - | FileCheck %s -check-prefix=ARM
+
+#include <stdarg.h>
+
+extern void abort() __attribute__((noreturn));
+
+struct tiny {
+  char c;
+};
+
+union data {
+  char c;
+};
+
+void fstr(int n, ...) {
+  struct tiny x;
+  va_list ap;
+  va_start (ap,n);
+  x = va_arg (ap, struct tiny);
+  if (x.c !=  10)
+    abort();
+  va_end (ap);
+// MIPS-NOT: %{{[0-9]+}} = getelementptr inbounds i8, i8* %argp.cur, i32 3
+// MIPS64-NOT: %{{[0-9]+}} = getelementptr inbounds i8, i8* %argp.cur, i64 7
+// ARM-NOT: %{{[0-9]+}} = getelementptr inbounds i8, i8* %argp.cur, i32 3
+}
+
+void funi(int n, ...) {
+  union data x;
+  va_list ap;
+  va_start (ap,n);
+  x = va_arg (ap, union data);
+  if (x.c !=  10)
+    abort();
+  va_end (ap);
+// MIPS-NOT: %{{[0-9]+}} = getelementptr inbounds i8, i8* %argp.cur, i32 3
+// MIPS64-NOT: %{{[0-9]+}} = getelementptr inbounds i8, i8* %argp.cur, i64 7
+// ARM-NOT: %{{[0-9]+}} = getelementptr inbounds i8, i8* %argp.cur, i32 3
+}
+
+void foo() {
+  struct tiny x[3];
+  union data y;
+  x[0].c = 10;
+  fstr(1, x[0]);
+  funi(1, y);
+}
diff --git a/test/CodeGen/target-builtin-error-2.c b/test/CodeGen/target-builtin-error-2.c
index 949f2cc78466a..2e2691a784e13 100644
--- a/test/CodeGen/target-builtin-error-2.c
+++ b/test/CodeGen/target-builtin-error-2.c
@@ -5,9 +5,9 @@
 
 // Since we do code generation on a function level this needs to error out since
 // the subtarget feature won't be available.
-__m256d wombat(__m128i a) {
+__m128 wombat(__m128i a) {
   if (__builtin_cpu_supports("avx"))
-    return __builtin_ia32_cvtdq2pd256((__v4si)a); // expected-error {{'__builtin_ia32_cvtdq2pd256' needs target feature avx}}
+    return __builtin_ia32_vpermilvarps((__v4sf) {0.0f, 1.0f, 2.0f, 3.0f}, (__v4si)a); // expected-error {{'__builtin_ia32_vpermilvarps' needs target feature avx}}
   else
-    return (__m256d){0, 0, 0, 0};
+    return (__m128){0, 0};
 }
diff --git a/test/CodeGen/target-builtin-error-3.c b/test/CodeGen/target-builtin-error-3.c
new file mode 100644
index 0000000000000..5beb474befe07
--- /dev/null
+++ b/test/CodeGen/target-builtin-error-3.c
@@ -0,0 +1,28 @@
+// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -S -verify -o - -target-feature +avx
+
+// RUN: not %clang_cc1 %s -triple=x86_64-apple-darwin -emit-obj -target-feature +avx 2> %t.err
+// RUN: FileCheck < %t.err %s
+// CHECK: 1 error generated
+
+typedef unsigned short uint16_t;
+typedef long long __m128i __attribute__((__vector_size__(16)));
+typedef float __v8sf __attribute__ ((__vector_size__ (32)));
+typedef float __m256 __attribute__ ((__vector_size__ (32)));
+typedef uint16_t half;
+typedef __attribute__ ((ext_vector_type( 8),__aligned__( 16))) half half8;
+typedef __attribute__ ((ext_vector_type(16),__aligned__( 32))) half half16;
+typedef __attribute__ ((ext_vector_type(16),__aligned__( 2))) half half16U;
+typedef __attribute__ ((ext_vector_type( 8),__aligned__( 32))) float float8;
+typedef __attribute__ ((ext_vector_type(16),__aligned__( 64))) float float16;
+static inline half8 __attribute__((__overloadable__)) convert_half( float8 a ) {
+  return __extension__ ({ __m256 __a = (a); (__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)__a, (0x00)); }); // expected-error {{'__builtin_ia32_vcvtps2ph256' needs target feature f16c}}
+}
+static inline half16 __attribute__((__overloadable__)) convert_half( float16 a ) {
+  half16 r; 
+  r.lo = convert_half( a.lo); 
+  return r;
+}
+void avx_test( uint16_t *destData, float16 argbF)
+{
+   ((half16U*)destData)[0] = convert_half(argbF);
+}
diff --git a/test/CodeGen/target-builtin-noerror.c b/test/CodeGen/target-builtin-noerror.c
index 7d86b9684624e..2a7d69f1089f9 100644
--- a/test/CodeGen/target-builtin-noerror.c
+++ b/test/CodeGen/target-builtin-noerror.c
@@ -42,3 +42,34 @@ __m128 __attribute__((target("fma4"))) fma_2(__m128 a, __m128 b, __m128 c) {
 __m128 __attribute__((target("fma,fma4"))) fma_3(__m128 a, __m128 b, __m128 c) {
   return __builtin_ia32_vfmaddps(a, b, c);
 }
+
+void verifyfeaturestrings() {
+  (void)__builtin_cpu_supports("cmov");
+  (void)__builtin_cpu_supports("mmx");
+  (void)__builtin_cpu_supports("popcnt");
+  (void)__builtin_cpu_supports("sse");
+  (void)__builtin_cpu_supports("sse2");
+  (void)__builtin_cpu_supports("sse3");
+  (void)__builtin_cpu_supports("ssse3");
+  (void)__builtin_cpu_supports("sse4.1");
+  (void)__builtin_cpu_supports("sse4.2");
+  (void)__builtin_cpu_supports("avx");
+  (void)__builtin_cpu_supports("avx2");
+  (void)__builtin_cpu_supports("sse4a");
+  (void)__builtin_cpu_supports("fma4");
+  (void)__builtin_cpu_supports("xop");
+  (void)__builtin_cpu_supports("fma");
+  (void)__builtin_cpu_supports("avx512f");
+  (void)__builtin_cpu_supports("bmi");
+  (void)__builtin_cpu_supports("bmi2");
+  (void)__builtin_cpu_supports("aes");
+  (void)__builtin_cpu_supports("pclmul");
+  (void)__builtin_cpu_supports("avx512vl");
+  (void)__builtin_cpu_supports("avx512bw");
+  (void)__builtin_cpu_supports("avx512dq");
+  (void)__builtin_cpu_supports("avx512cd");
+  (void)__builtin_cpu_supports("avx512er");
+  (void)__builtin_cpu_supports("avx512pf");
+  (void)__builtin_cpu_supports("avx512vbmi");
+  (void)__builtin_cpu_supports("avx512ifma");
+}
diff --git a/test/CodeGen/target-data.c b/test/CodeGen/target-data.c
index 2ed7f0916fcee..49406a30e5d93 100644
--- a/test/CodeGen/target-data.c
+++ b/test/CodeGen/target-data.c
@@ -86,6 +86,10 @@
 // RUN: FileCheck %s -check-prefix=WEBASSEMBLY64
 // WEBASSEMBLY64: target datalayout = "e-m:e-p:64:64-i64:64-n32:64-S128"
 
+// RUN: %clang_cc1 -triple lanai-unknown-unknown -o - -emit-llvm %s | \
+// RUN: FileCheck %s -check-prefix=LANAI
+// LANAI: target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64"
+
 // RUN: %clang_cc1 -triple powerpc-unknown -o - -emit-llvm %s | \
 // RUN: FileCheck %s -check-prefix=PPC
 // PPC: target datalayout = "E-m:e-p:32:32-i64:64-n32"
@@ -128,16 +132,16 @@
 
 // RUN: %clang_cc1 -triple amdgcn-unknown -target-cpu hawaii -o - -emit-llvm %s \
 // RUN: | FileCheck %s -check-prefix=R600SI
-// R600SI: target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
+// R600SI: target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
 
 // Test default -target-cpu
 // RUN: %clang_cc1 -triple amdgcn-unknown -o - -emit-llvm %s \
 // RUN: | FileCheck %s -check-prefix=R600SIDefault
-// R600SIDefault: target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
+// R600SIDefault: target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
 
 // RUN: %clang_cc1 -triple arm64-unknown -o - -emit-llvm %s | \
 // RUN: FileCheck %s -check-prefix=AARCH64
-// AARCH64: target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+// AARCH64: target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 
 // RUN: %clang_cc1 -triple thumb-unknown-gnueabi -o - -emit-llvm %s | \
 // RUN: FileCheck %s -check-prefix=THUMB
@@ -157,7 +161,7 @@
 
 // RUN: %clang_cc1 -triple hexagon-unknown -o - -emit-llvm %s | \
 // RUN: FileCheck %s -check-prefix=HEXAGON
-// HEXAGON: target datalayout = "e-m:e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a:0-n16:32"
+// HEXAGON: target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
 
 // RUN: %clang_cc1 -triple s390x-unknown -o - -emit-llvm %s | \
 // RUN: FileCheck %s -check-prefix=SYSTEMZ
diff --git a/test/CodeGen/target-features-error-2.c b/test/CodeGen/target-features-error-2.c
index c23d152dcfb3c..683d9ab99ef62 100644
--- a/test/CodeGen/target-features-error-2.c
+++ b/test/CodeGen/target-features-error-2.c
@@ -1,7 +1,38 @@
-// RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -S -verify -o -
+// RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -S -verify -o - -D NEED_SSE42
+// RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -S -verify -o - -D NEED_AVX_1
+// RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -S -verify -o - -D NEED_AVX_2
+// RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -S -verify -o - -D NEED_AVX_3
+// RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -S -verify -o - -D NEED_AVX_4
+
 #define __MM_MALLOC_H
 #include <x86intrin.h>
 
+#if NEED_SSE42
 int baz(__m256i a) {
   return _mm256_extract_epi32(a, 3); // expected-error {{always_inline function '_mm256_extract_epi32' requires target feature 'sse4.2', but would be inlined into function 'baz' that is compiled without support for 'sse4.2'}}
 }
+#endif
+
+#if NEED_AVX_1
+__m128 need_avx(__m128 a, __m128 b) {
+  return _mm_cmp_ps(a, b, 0); // expected-error {{'__builtin_ia32_cmpps' needs target feature avx}}
+}
+#endif
+
+#if NEED_AVX_2
+__m128 need_avx(__m128 a, __m128 b) {
+  return _mm_cmp_ss(a, b, 0); // expected-error {{'__builtin_ia32_cmpss' needs target feature avx}}
+}
+#endif
+
+#if NEED_AVX_3
+__m128d need_avx(__m128d a, __m128d b) {
+  return _mm_cmp_pd(a, b, 0); // expected-error {{'__builtin_ia32_cmppd' needs target feature avx}}
+}
+#endif
+
+#if NEED_AVX_4
+__m128d need_avx(__m128d a, __m128d b) {
+  return _mm_cmp_sd(a, b, 0); // expected-error {{'__builtin_ia32_cmpsd' needs target feature avx}}
+}
+#endif
diff --git a/test/CodeGen/tbaa-class.cpp b/test/CodeGen/tbaa-class.cpp
index f611ae5abb885..7172e05d9e34c 100644
--- a/test/CodeGen/tbaa-class.cpp
+++ b/test/CodeGen/tbaa-class.cpp
@@ -199,7 +199,7 @@ uint32_t g12(StructC *C, StructD *D, uint64_t count) {
 }
 
 // CHECK: [[TYPE_char:!.*]] = !{!"omnipotent char", [[TAG_cxx_tbaa:!.*]],
-// CHECK: [[TAG_cxx_tbaa]] = !{!"Simple C/C++ TBAA"}
+// CHECK: [[TAG_cxx_tbaa]] = !{!"Simple C++ TBAA"}
 // CHECK: [[TAG_i32]] = !{[[TYPE_i32:!.*]], [[TYPE_i32]], i64 0}
 // CHECK: [[TYPE_i32]] = !{!"int", [[TYPE_char]],
 // CHECK: [[TAG_i16]] = !{[[TYPE_i16:!.*]], [[TYPE_i16]], i64 0}
diff --git a/test/CodeGen/tbaa-for-vptr.cpp b/test/CodeGen/tbaa-for-vptr.cpp
index 35e95a54dab21..7b8ae2099e470 100644
--- a/test/CodeGen/tbaa-for-vptr.cpp
+++ b/test/CodeGen/tbaa-for-vptr.cpp
@@ -32,4 +32,4 @@ void CallFoo(A *a, int (A::*fp)() const) {
 //
 // CHECK: [[NUM]] = !{[[TYPE:!.*]], [[TYPE]], i64 0}
 // CHECK: [[TYPE]] = !{!"vtable pointer", !{{.*}}
-// NOTBAA-NOT: = !{!"Simple C/C++ TBAA"}
+// NOTBAA-NOT: = !{!"Simple C++ TBAA"}
diff --git a/test/CodeGen/tbaa.cpp b/test/CodeGen/tbaa.cpp
index c43ca58bc3f62..432c41e10793c 100644
--- a/test/CodeGen/tbaa.cpp
+++ b/test/CodeGen/tbaa.cpp
@@ -1,6 +1,10 @@
 // RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -no-struct-path-tbaa -disable-llvm-optzns %s -emit-llvm -o - | FileCheck %s
 // RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -disable-llvm-optzns %s -emit-llvm -o - | FileCheck %s -check-prefix=PATH
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -O0 -disable-llvm-optzns %s -emit-llvm -o - | FileCheck %s -check-prefix=NO-TBAA
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -O1 -relaxed-aliasing -disable-llvm-optzns %s -emit-llvm -o - | FileCheck %s -check-prefix=NO-TBAA
 // Test TBAA metadata generated by front-end.
+//
+// NO-TBAA-NOT: !tbaa
 
 typedef unsigned char uint8_t;
 typedef unsigned short uint16_t;
@@ -237,7 +241,7 @@ uint32_t g15(StructS *S, StructS3 *S3, uint64_t count) {
 }
 
 // CHECK: [[TYPE_char:!.*]] = !{!"omnipotent char", [[TAG_cxx_tbaa:!.*]],
-// CHECK: [[TAG_cxx_tbaa]] = !{!"Simple C/C++ TBAA"}
+// CHECK: [[TAG_cxx_tbaa]] = !{!"Simple C++ TBAA"}
 // CHECK: [[TAG_i32]] = !{[[TYPE_i32:!.*]], [[TYPE_i32]], i64 0}
 // CHECK: [[TYPE_i32]] = !{!"int", [[TYPE_char]],
 // CHECK: [[TAG_i16]] = !{[[TYPE_i16:!.*]], [[TYPE_i16]], i64 0}
diff --git a/test/CodeGen/tbm-builtins.c b/test/CodeGen/tbm-builtins.c
index 29e147a1aff74..c8a9382ed17de 100644
--- a/test/CodeGen/tbm-builtins.c
+++ b/test/CodeGen/tbm-builtins.c
@@ -8,46 +8,56 @@
 
 #include <x86intrin.h>
 
+// NOTE: This should match the tests in llvm/test/CodeGen/X86/tbm-intrinsics-fast-isel.ll
+
 unsigned int test__bextri_u32(unsigned int a) {
-  // CHECK: call i32 @llvm.x86.tbm.bextri.u32
+  // CHECK-LABEL: test__bextri_u32
+  // CHECK: call i32 @llvm.x86.tbm.bextri.u32(i32 %{{.*}}, i32 1)
   return __bextri_u32(a, 1);
 }
 
 unsigned long long test__bextri_u64(unsigned long long a) {
-  // CHECK: call i64 @llvm.x86.tbm.bextri.u64
+  // CHECK-LABEL: test__bextri_u64
+  // CHECK: call i64 @llvm.x86.tbm.bextri.u64(i64 %{{.*}}, i64 2)
   return __bextri_u64(a, 2);
 }
 
 unsigned long long test__bextri_u64_bigint(unsigned long long a) {
-  // CHECK: call i64 @llvm.x86.tbm.bextri.u64
+  // CHECK-LABEL: test__bextri_u64_bigint
+  // CHECK: call i64 @llvm.x86.tbm.bextri.u64(i64 %{{.*}}, i64 549755813887)
   return __bextri_u64(a, 0x7fffffffffLL);
 }
 
 unsigned int test__blcfill_u32(unsigned int a) {
+  // CHECK-LABEL: test__blcfill_u32
   // CHECK: [[TMP:%.*]] = add i32 [[SRC:%.*]], 1
   // CHECK-NEXT: %{{.*}} = and i32 [[TMP]], [[SRC]]
   return __blcfill_u32(a);
 }
 
 unsigned long long test__blcfill_u64(unsigned long long a) {
+  // CHECK-LABEL: test__blcfill_u64
   // CHECK: [[TMPT:%.*]] = add i64 [[SRC:%.*]], 1
   // CHECK-NEXT: %{{.*}} = and i64 [[TMP]], [[SRC]]
   return __blcfill_u64(a);
 }
 
 unsigned int test__blci_u32(unsigned int a) {
+  // CHECK-LABEL: test__blci_u32
   // CHECK: [[TMP:%.*]] = sub i32 -2, [[SRC:%.*]]
   // CHECK-NEXT: %{{.*}} = or i32 [[TMP]], [[SRC]]
   return __blci_u32(a);
 }
 
 unsigned long long test__blci_u64(unsigned long long a) {
+  // CHECK-LABEL: test__blci_u64
   // CHECK: [[TMP:%.*]] = sub i64 -2, [[SRC:%.*]]
   // CHECK-NEXT: %{{.*}} = or i64 [[TMP]], [[SRC]]
   return __blci_u64(a);
 }
 
 unsigned int test__blcic_u32(unsigned int a) {
+  // CHECK-LABEL: test__blcic_u32
   // CHECK: [[TMP1:%.*]] = xor i32 [[SRC:%.*]], -1
   // CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SRC]], 1
   // CHECK-NEXT: {{.*}} = and i32 [[TMP2]], [[TMP1]]
@@ -55,6 +65,7 @@ unsigned int test__blcic_u32(unsigned int a) {
 }
 
 unsigned long long test__blcic_u64(unsigned long long a) {
+  // CHECK-LABEL: test__blcic_u64
   // CHECK: [[TMP1:%.*]] = xor i64 [[SRC:%.*]], -1
   // CHECK-NEXT: [[TMP2:%.*]] = add i64 [[SRC]], 1
   // CHECK-NEXT: {{.*}} = and i64 [[TMP2]], [[TMP1]]
@@ -62,42 +73,49 @@ unsigned long long test__blcic_u64(unsigned long long a) {
 }
 
 unsigned int test__blcmsk_u32(unsigned int a) {
+  // CHECK-LABEL: test__blcmsk_u32
   // CHECK: [[TMP:%.*]] = add i32 [[SRC:%.*]], 1
   // CHECK-NEXT: {{.*}} = xor i32 [[TMP]], [[SRC]]
   return __blcmsk_u32(a);
 }
 
 unsigned long long test__blcmsk_u64(unsigned long long a) {
+  // CHECK-LABEL: test__blcmsk_u64
   // CHECK: [[TMP:%.*]] = add i64 [[SRC:%.*]], 1
   // CHECK-NEXT: {{.*}} = xor i64 [[TMP]], [[SRC]]
   return __blcmsk_u64(a);
 }
 
 unsigned int test__blcs_u32(unsigned int a) {
+  // CHECK-LABEL: test__blcs_u32
   // CHECK: [[TMP:%.*]] = add i32 [[SRC:%.*]], 1
   // CHECK-NEXT: {{.*}} = or i32 [[TMP]], [[SRC]]
   return __blcs_u32(a);
 }
 
 unsigned long long test__blcs_u64(unsigned long long a) {
+  // CHECK-LABEL: test__blcs_u64
   // CHECK: [[TMP:%.*]] = add i64 [[SRC:%.*]], 1
   // CHECK-NEXT: {{.*}} = or i64 [[TMP]], [[SRC]]
   return __blcs_u64(a);
 }
 
 unsigned int test__blsfill_u32(unsigned int a) {
+  // CHECK-LABEL: test__blsfill_u32
   // CHECK: [[TMP:%.*]] = add i32 [[SRC:%.*]], -1
   // CHECK-NEXT: {{.*}} = or i32 [[TMP]], [[SRC]]
   return __blsfill_u32(a);
 }
 
 unsigned long long test__blsfill_u64(unsigned long long a) {
+  // CHECK-LABEL: test__blsfill_u64
   // CHECK: [[TMP:%.*]] = add i64 [[SRC:%.*]], -1
   // CHECK-NEXT: {{.*}} = or i64 [[TMP]], [[SRC]]
   return __blsfill_u64(a);
 }
 
 unsigned int test__blsic_u32(unsigned int a) {
+  // CHECK-LABEL: test__blsic_u32
   // CHECK: [[TMP1:%.*]] = xor i32 [[SRC:%.*]], -1
   // CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SRC:%.*]], -1
   // CHECK-NEXT: {{.*}} = or i32 [[TMP2]], [[TMP1]]
@@ -105,6 +123,7 @@ unsigned int test__blsic_u32(unsigned int a) {
 }
 
 unsigned long long test__blsic_u64(unsigned long long a) {
+  // CHECK-LABEL: test__blsic_u64
   // CHECK: [[TMP1:%.*]] = xor i64 [[SRC:%.*]], -1
   // CHECK-NEXT: [[TMP2:%.*]] = add i64 [[SRC:%.*]], -1
   // CHECK-NEXT: {{.*}} = or i64 [[TMP2]], [[TMP1]]
@@ -112,6 +131,7 @@ unsigned long long test__blsic_u64(unsigned long long a) {
 }
 
 unsigned int test__t1mskc_u32(unsigned int a) {
+  // CHECK-LABEL: test__t1mskc_u32
   // CHECK: [[TMP1:%.*]] = xor i32 [[SRC:%.*]], -1
   // CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SRC:%.*]], 1
   // CHECK-NEXT: {{.*}} = or i32 [[TMP2]], [[TMP1]]
@@ -119,6 +139,7 @@ unsigned int test__t1mskc_u32(unsigned int a) {
 }
 
 unsigned long long test__t1mskc_u64(unsigned long long a) {
+  // CHECK-LABEL: test__t1mskc_u64
   // CHECK: [[TMP1:%.*]] = xor i64 [[SRC:%.*]], -1
   // CHECK-NEXT: [[TMP2:%.*]] = add i64 [[SRC:%.*]], 1
   // CHECK-NEXT: {{.*}} = or i64 [[TMP2]], [[TMP1]]
@@ -126,6 +147,7 @@ unsigned long long test__t1mskc_u64(unsigned long long a) {
 }
 
 unsigned int test__tzmsk_u32(unsigned int a) {
+  // CHECK-LABEL: test__tzmsk_u32
   // CHECK: [[TMP1:%.*]] = xor i32 [[SRC:%.*]], -1
   // CHECK-NEXT: [[TMP2:%.*]] = add i32 [[SRC:%.*]], -1
   // CHECK-NEXT: {{.*}} = and i32 [[TMP2]], [[TMP1]]
@@ -133,6 +155,7 @@ unsigned int test__tzmsk_u32(unsigned int a) {
 }
 
 unsigned long long test__tzmsk_u64(unsigned long long a) {
+  // CHECK-LABEL: test__tzmsk_u64
   // CHECK: [[TMP1:%.*]] = xor i64 [[SRC:%.*]], -1
   // CHECK-NEXT: [[TMP2:%.*]] = add i64 [[SRC:%.*]], -1
   // CHECK-NEXT: {{.*}} = and i64 [[TMP2]], [[TMP1]]
diff --git a/test/CodeGen/temporary-lifetime-exceptions.cpp b/test/CodeGen/temporary-lifetime-exceptions.cpp
new file mode 100644
index 0000000000000..17e21683f22c7
--- /dev/null
+++ b/test/CodeGen/temporary-lifetime-exceptions.cpp
@@ -0,0 +1,24 @@
+// RUN: %clang_cc1 %s -fexceptions -fcxx-exceptions -std=c++11 -O1 -triple x86_64 -emit-llvm -o - | FileCheck %s
+
+// lifetime.end should be invoked even if the destructor doesn't run due to an
+// exception thrown from previous ctor call.
+
+struct A { A(); ~A(); };
+A Baz(const A&);
+
+void Test1() {
+  // CHECK-LABEL: @_Z5Test1v(
+  // CHECK: getelementptr
+  // CHECK-NEXT: call void @llvm.lifetime.start(i64 1, i8* [[TMP:[^ ]+]])
+  // CHECK-NEXT: getelementptr
+  // CHECK-NEXT: call void @llvm.lifetime.start(i64 1, i8* [[TMP1:[^ ]+]])
+
+  // Normal exit
+  // CHECK: call void @llvm.lifetime.end(i64 1, i8* [[TMP1]])
+  // CHECK-NEXT: call void @llvm.lifetime.end(i64 1, i8* [[TMP]])
+
+  // Exception exit
+  // CHECK: call void @llvm.lifetime.end(i64 1, i8* [[TMP1]])
+  // CHECK-NEXT: call void @llvm.lifetime.end(i64 1, i8* [[TMP]])
+  Baz(Baz(A()));
+}
diff --git a/test/CodeGen/temporary-lifetime.cpp b/test/CodeGen/temporary-lifetime.cpp
new file mode 100644
index 0000000000000..f6dd3e0b2a53c
--- /dev/null
+++ b/test/CodeGen/temporary-lifetime.cpp
@@ -0,0 +1,168 @@
+// RUN: %clang_cc1 %s -std=c++11 -O1 -DWITH_DTOR -triple x86_64 -emit-llvm -o - | FileCheck -check-prefix=CHECK-DTOR %s
+// RUN: %clang_cc1 %s -std=c++11 -O1 -triple x86_64 -emit-llvm -o - | FileCheck -check-prefix=CHECK-NO-DTOR %s
+
+struct A {
+  A();
+#ifdef WITH_DTOR
+  ~A();
+#endif
+  char a[1024];
+  operator bool() const;
+};
+
+template <typename T>
+void Foo(T &&);
+
+template <typename T>
+void Bar(T &&);
+
+template <typename T>
+T Baz();
+
+void Test1() {
+  // CHECK-DTOR-LABEL: Test1
+  // CHECK-DTOR: call void @llvm.lifetime.start(i64 1024, i8* %[[ADDR:[0-9]+]])
+  // CHECK-DTOR: call void @_ZN1AC1Ev(%struct.A* nonnull %[[VAR:[^ ]+]])
+  // CHECK-DTOR: call void @_Z3FooIRK1AEvOT_
+  // CHECK-DTOR: call void @_ZN1AD1Ev(%struct.A* nonnull %[[VAR]])
+  // CHECK-DTOR: call void @llvm.lifetime.end(i64 1024, i8* %[[ADDR]])
+  // CHECK-DTOR: call void @llvm.lifetime.start(i64 1024, i8* %[[ADDR:[0-9]+]])
+  // CHECK-DTOR: call void @_ZN1AC1Ev(%struct.A* nonnull %[[VAR:[^ ]+]])
+  // CHECK-DTOR: call void @_Z3FooIRK1AEvOT_
+  // CHECK-DTOR: call void @_ZN1AD1Ev(%struct.A* nonnull %[[VAR]])
+  // CHECK-DTOR: call void @llvm.lifetime.end(i64 1024, i8* %[[ADDR]])
+  // CHECK-DTOR: }
+
+  // CHECK-NO-DTOR-LABEL: Test1
+  // CHECK-NO-DTOR: call void @llvm.lifetime.start(i64 1024, i8* %[[ADDR:[0-9]+]])
+  // CHECK-NO-DTOR: call void @_ZN1AC1Ev(%struct.A* nonnull %[[VAR:[^ ]+]])
+  // CHECK-NO-DTOR: call void @_Z3FooIRK1AEvOT_
+  // CHECK-NO-DTOR: call void @llvm.lifetime.end(i64 1024, i8* %[[ADDR]])
+  // CHECK-NO-DTOR: call void @llvm.lifetime.start(i64 1024, i8* %[[ADDR:[0-9]+]])
+  // CHECK-NO-DTOR: call void @_ZN1AC1Ev(%struct.A* nonnull %[[VAR:[^ ]+]])
+  // CHECK-NO-DTOR: call void @_Z3FooIRK1AEvOT_
+  // CHECK-NO-DTOR: call void @llvm.lifetime.end(i64 1024, i8* %[[ADDR]])
+  // CHECK-NO-DTOR: }
+  {
+    const A &a = A{};
+    Foo(a);
+  }
+  {
+    const A &a = A{};
+    Foo(a);
+  }
+}
+
+void Test2() {
+  // CHECK-DTOR-LABEL: Test2
+  // CHECK-DTOR: call void @llvm.lifetime.start(i64 1024, i8* %[[ADDR1:[0-9]+]])
+  // CHECK-DTOR: call void @_ZN1AC1Ev(%struct.A* nonnull %[[VAR1:[^ ]+]])
+  // CHECK-DTOR: call void @_Z3FooIRK1AEvOT_
+  // CHECK-DTOR: call void @llvm.lifetime.start(i64 1024, i8* %[[ADDR2:[0-9]+]])
+  // CHECK-DTOR: call void @_ZN1AC1Ev(%struct.A* nonnull %[[VAR2:[^ ]+]])
+  // CHECK-DTOR: call void @_Z3FooIRK1AEvOT_
+  // CHECK-DTOR: call void @_ZN1AD1Ev(%struct.A* nonnull %[[VAR2]])
+  // CHECK-DTOR: call void @llvm.lifetime.end(i64 1024, i8* %[[ADDR2]])
+  // CHECK-DTOR: call void @_ZN1AD1Ev(%struct.A* nonnull %[[VAR1]])
+  // CHECK-DTOR: call void @llvm.lifetime.end(i64 1024, i8* %[[ADDR1]])
+  // CHECK-DTOR: }
+
+  // CHECK-NO-DTOR-LABEL: Test2
+  // CHECK-NO-DTOR: call void @llvm.lifetime.start(i64 1024, i8* %[[ADDR1:[0-9]+]])
+  // CHECK-NO-DTOR: call void @_ZN1AC1Ev(%struct.A* nonnull %[[VAR1:[^ ]+]])
+  // CHECK-NO-DTOR: call void @_Z3FooIRK1AEvOT_
+  // CHECK-NO-DTOR: call void @llvm.lifetime.start(i64 1024, i8* %[[ADDR2:[0-9]+]])
+  // CHECK-NO-DTOR: call void @_ZN1AC1Ev(%struct.A* nonnull %[[VAR2:[^ ]+]])
+  // CHECK-NO-DTOR: call void @_Z3FooIRK1AEvOT_
+  // CHECK-NO-DTOR: call void @llvm.lifetime.end(i64 1024, i8* %[[ADDR2]])
+  // CHECK-NO-DTOR: call void @llvm.lifetime.end(i64 1024, i8* %[[ADDR1]])
+  // CHECK-NO-DTOR: }
+  const A &a = A{};
+  Foo(a);
+  const A &b = A{};
+  Foo(b);
+}
+
+void Test3() {
+  // CHECK-DTOR-LABEL: Test3
+  // CHECK-DTOR: call void @llvm.lifetime.start
+  // CHECK-DTOR: call void @llvm.lifetime.start
+
+  // if.then:
+  // CHECK-DTOR: call void @llvm.lifetime.end
+
+  // cleanup:
+  // CHECK-DTOR: call void @llvm.lifetime.end
+
+  // cleanup:
+  // CHECK-DTOR: call void @llvm.lifetime.end
+  // CHECK-DTOR: }
+  const A &a = A{};
+  if (const A &b = A(a)) {
+    Foo(b);
+    return;
+  }
+  Bar(a);
+}
+
+void Test4() {
+  // CHECK-DTOR-LABEL: Test4
+  // CHECK-DTOR: call void @llvm.lifetime.start
+
+  // for.cond.cleanup:
+  // CHECK-DTOR: call void @llvm.lifetime.end
+
+  // for.body:
+  // CHECK-DTOR: }
+  for (const A &a = A{}; a;) {
+    Foo(a);
+  }
+}
+
+int Test5() {
+  // CHECK-DTOR-LABEL: Test5
+  // CHECK-DTOR: call void @llvm.lifetime.start
+  // CHECK-DTOR: call i32 @_Z3BazIiET_v()
+  // CHECK-DTOR: store
+  // CHECK-DTOR: call void @_Z3FooIRKiEvOT_
+  // CHECK-DTOR: load
+  // CHECK-DTOR: call void @llvm.lifetime.end
+  // CHECK-DTOR: }
+  const int &a = Baz<int>();
+  Foo(a);
+  return a;
+}
+
+void Test6() {
+  // CHECK-DTOR-LABEL: Test6
+  // CHECK-DTOR: call void @llvm.lifetime.start(i64 {{[0-9]+}}, i8* %[[ADDR:[0-9]+]])
+  // CHECK-DTOR: call i32 @_Z3BazIiET_v()
+  // CHECK-DTOR: store
+  // CHECK-DTOR: call void @_Z3FooIiEvOT_
+  // CHECK-DTOR: call void @llvm.lifetime.end(i64 {{[0-9]+}}, i8* %[[ADDR]])
+  // CHECK-DTOR: call void @llvm.lifetime.start(i64 {{[0-9]+}}, i8* %[[ADDR:[0-9]+]])
+  // CHECK-DTOR: call i32 @_Z3BazIiET_v()
+  // CHECK-DTOR: store
+  // CHECK-DTOR: call void @_Z3FooIiEvOT_
+  // CHECK-DTOR: call void @llvm.lifetime.end(i64 {{[0-9]+}}, i8* %[[ADDR]])
+  // CHECK-DTOR: }
+  Foo(Baz<int>());
+  Foo(Baz<int>());
+}
+
+void Test7() {
+  // CHECK-DTOR-LABEL: Test7
+  // CHECK-DTOR: call void @llvm.lifetime.start(i64 1024, i8* %[[ADDR:[0-9]+]])
+  // CHECK-DTOR: call void @_Z3BazI1AET_v({{.*}} %[[SLOT:[^ ]+]])
+  // CHECK-DTOR: call void @_Z3FooI1AEvOT_({{.*}} %[[SLOT]])
+  // CHECK-DTOR: call void @_ZN1AD1Ev(%struct.A* nonnull %[[SLOT]])
+  // CHECK-DTOR: call void @llvm.lifetime.end(i64 1024, i8* %[[ADDR]])
+  // CHECK-DTOR: call void @llvm.lifetime.start(i64 1024, i8* %[[ADDR:[0-9]+]])
+  // CHECK-DTOR: call void @_Z3BazI1AET_v({{.*}} %[[SLOT:[^ ]+]])
+  // CHECK-DTOR: call void @_Z3FooI1AEvOT_({{.*}} %[[SLOT]])
+  // CHECK-DTOR: call void @_ZN1AD1Ev(%struct.A* nonnull %[[SLOT]])
+  // CHECK-DTOR: call void @llvm.lifetime.end(i64 1024, i8* %[[ADDR]])
+  // CHECK-DTOR: }
+  Foo(Baz<A>());
+  Foo(Baz<A>());
+}
diff --git a/test/CodeGen/thinlto_backend.c b/test/CodeGen/thinlto_backend.c
deleted file mode 100644
index a2737fb80c7d9..0000000000000
--- a/test/CodeGen/thinlto_backend.c
+++ /dev/null
@@ -1,14 +0,0 @@
-// RUN: %clang -O2 %s -flto=thin -c -o %t.o
-// RUN: llvm-lto -thinlto -o %t %t.o
-
-// Ensure clang -cc1 give expected error for incorrect input type
-// RUN: not %clang_cc1 -O2 -o %t1.o %s -c -fthinlto-index=%t.thinlto.bc 2>&1 | FileCheck %s -check-prefix=CHECK-WARNING
-// CHECK-WARNING: error: invalid argument '-fthinlto-index={{.*}}' only allowed with '-x ir'
-
-// Ensure we get expected error for missing index file
-// RUN: %clang -O2 -o %t1.o -x ir %t.o -c -fthinlto-index=bad.thinlto.bc 2>&1 | FileCheck %s -check-prefix=CHECK-ERROR
-// CHECK-ERROR: Error loading index file 'bad.thinlto.bc'
-
-// Ensure Function Importing pass added
-// RUN: %clang -O2 -o %t1.o -x ir %t.o -c -fthinlto-index=%t.thinlto.bc -mllvm -debug-pass=Structure 2>&1 | FileCheck %s -check-prefix=CHECK-PASS
-// CHECK-PASS: Function Importing
diff --git a/test/CodeGen/thinlto_backend.ll b/test/CodeGen/thinlto_backend.ll
new file mode 100644
index 0000000000000..0fb2643e037d4
--- /dev/null
+++ b/test/CodeGen/thinlto_backend.ll
@@ -0,0 +1,29 @@
+; REQUIRES: x86-registered-target
+
+; RUN: opt -module-summary -o %t1.o %s
+; RUN: opt -module-summary -o %t2.o %S/Inputs/thinlto_backend.ll
+; RUN: llvm-lto -thinlto -o %t %t1.o %t2.o
+
+; Ensure clang -cc1 give expected error for incorrect input type
+; RUN: not %clang_cc1 -O2 -o %t1.o -x c %s -c -fthinlto-index=%t.thinlto.bc 2>&1 | FileCheck %s -check-prefix=CHECK-WARNING
+; CHECK-WARNING: error: invalid argument '-fthinlto-index={{.*}}' only allowed with '-x ir'
+
+; Ensure we get expected error for missing index file
+; RUN: %clang -O2 -o %t3.o -x ir %t1.o -c -fthinlto-index=bad.thinlto.bc 2>&1 | FileCheck %s -check-prefix=CHECK-ERROR
+; CHECK-ERROR: Error loading index file 'bad.thinlto.bc'
+
+; Ensure f2 was imported
+; RUN: %clang -target x86_64-unknown-linux-gnu -O2 -o %t3.o -x ir %t1.o -c -fthinlto-index=%t.thinlto.bc
+; RUN: llvm-nm %t3.o | FileCheck --check-prefix=CHECK-OBJ %s
+; CHECK-OBJ: T f1
+; CHECK-OBJ-NOT: U f2
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare void @f2()
+
+define void @f1() {
+  call void @f2()
+  ret void
+}
diff --git a/test/CodeGen/ubsan-strip-path-components.cpp b/test/CodeGen/ubsan-strip-path-components.cpp
new file mode 100644
index 0000000000000..7a95324d128be
--- /dev/null
+++ b/test/CodeGen/ubsan-strip-path-components.cpp
@@ -0,0 +1,29 @@
+// RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -emit-llvm -fsanitize=unreachable -o - | FileCheck %s -check-prefix=REGULAR -check-prefix=CHECK
+// RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -emit-llvm -fsanitize=unreachable -o - -fsanitize-undefined-strip-path-components=0 | FileCheck %s -check-prefix=REGULAR -check-prefix=CHECK
+// RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -emit-llvm -fsanitize=unreachable -o - -fsanitize-undefined-strip-path-components=2 | FileCheck %s -check-prefix=REMOVE-FIRST-TWO -check-prefix=CHECK
+
+// Try to strip too much:
+// RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -emit-llvm -fsanitize=unreachable -o - -fsanitize-undefined-strip-path-components=-99999 | FileCheck %s -check-prefix=REGULAR
+// RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -emit-llvm -fsanitize=unreachable -o - -fsanitize-undefined-strip-path-components=99999 | FileCheck %s -check-prefix=LAST-ONLY
+
+// Check stripping from the file name
+// RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -emit-llvm -fsanitize=unreachable -o - -fsanitize-undefined-strip-path-components=-2 | FileCheck %s -check-prefix=LAST-TWO
+// RUN: %clang_cc1 %s -triple=x86_64-linux-gnu -emit-llvm -fsanitize=unreachable -o - -fsanitize-undefined-strip-path-components=-1 | FileCheck %s -check-prefix=LAST-ONLY
+
+// REGULAR: @[[SRC:[0-9.a-zA-Z_]+]] =      private unnamed_addr constant [{{.*}} x i8] c"{{.*test(.|\\5C)CodeGen(.|\\5C)ubsan-strip-path-components\.cpp}}\00", align 1
+
+// First path component: "/" or "$drive_letter:", then a name, or '\5C' on Windows
+// REMOVE-FIRST-TWO: @[[STR:[0-9.a-zA-Z_]+]] = private unnamed_addr constant [{{.*}} x i8] c"{{(.:|/)([^\\/]*(/|\\5C))}}[[REST:.*ubsan-strip-path-components\.cpp]]\00", align 1
+// REMOVE-FIRST-TWO: @[[SRC:[0-9.a-zA-Z_]+]] = private unnamed_addr constant [{{.*}} x i8] c"[[REST]]\00", align 1
+
+// LAST-TWO: @[[SRC:[0-9.a-zA-Z_]+]] =     private unnamed_addr constant [{{.*}} x i8] c"CodeGen{{/|\\5C}}ubsan-strip-path-components.cpp\00", align 1
+// LAST-ONLY: @[[SRC:[0-9.a-zA-Z_]+]] =    private unnamed_addr constant [{{.*}} x i8] c"ubsan-strip-path-components.cpp\00", align 1
+
+// CHECK: @[[STATIC_DATA:[0-9.a-zA-Z_]+]] = private unnamed_addr global { { [{{.*}} x i8]*, i32, i32 } } { { [{{.*}} x i8]*, i32, i32 } { [{{.*}} x i8]* @[[SRC]], i32 [[@LINE+6]], i32 3 } }
+void g(const char *);
+void f() {
+  // CHECK-LABEL: @_Z1fv(
+  g(__FILE__);
+  // CHECK: call void @__ubsan_handle_builtin_unreachable(i8* bitcast ({ { [{{.*}} x i8]*, i32, i32 } }* @[[STATIC_DATA]] to i8*)) {{.*}}, !nosanitize
+  __builtin_unreachable();
+}
diff --git a/test/CodeGen/vector.c b/test/CodeGen/vector.c
index 8e820f23fb77c..14f5079726398 100644
--- a/test/CodeGen/vector.c
+++ b/test/CodeGen/vector.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple i386-apple-darwin9 -O1 -target-cpu pentium4 -target-feature +sse4.1 -debug-info-kind=limited -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple i386-apple-darwin9 -O1 -target-cpu core2 -debug-info-kind=limited -emit-llvm %s -o - | FileCheck %s
 typedef short __v4hi __attribute__ ((__vector_size__ (8)));
 
 void test1() {
@@ -62,3 +62,23 @@ void extractinttypes() {
   extern __typeof(_mm_extract_epi16(_mm_setzero_si128(), 3)) check_result_int;
   extern __typeof(_mm_extract_epi32(_mm_setzero_si128(), 3)) check_result_int;
 }
+
+// Test some logic around our lax vector comparison rules with integers.
+
+typedef int vec_int1 __attribute__((vector_size(4)));
+vec_int1 lax_vector_compare1(int x, vec_int1 y) {
+  y = x == y;
+  return y;
+}
+
+// CHECK: define i32 @lax_vector_compare1(i32 {{.*}}, i32 {{.*}})
+// CHECK: icmp eq <1 x i32>
+
+typedef int vec_int2 __attribute__((vector_size(8)));
+vec_int2 lax_vector_compare2(long long x, vec_int2 y) {
+  y = x == y;
+  return y;
+}
+
+// CHECK: define void @lax_vector_compare2(<2 x i32>* {{.*sret.*}}, i64 {{.*}}, i64 {{.*}})
+// CHECK: icmp eq <2 x i32>
diff --git a/test/CodeGen/vectorcall.c b/test/CodeGen/vectorcall.c
index 9ee35b1a02b64..b38d5e5fbc5b8 100644
--- a/test/CodeGen/vectorcall.c
+++ b/test/CodeGen/vectorcall.c
@@ -9,9 +9,9 @@ void __vectorcall v2(char a, char b) {}
 // CHECK: define x86_vectorcallcc void @"\01v2@@8"(i8 inreg signext %a, i8 inreg signext %b)
 // X64: define x86_vectorcallcc void @"\01v2@@16"(i8 %a, i8 %b)
 
-struct Small { int a; };
+struct Small { int x; };
 void __vectorcall v3(int a, struct Small b, int c) {}
-// CHECK: define x86_vectorcallcc void @"\01v3@@12"(i32 inreg %a, %struct.Small* byval align 4 %b, i32 inreg %c)
+// CHECK: define x86_vectorcallcc void @"\01v3@@12"(i32 inreg %a, i32 %b.0, i32 inreg %c)
 // X64: define x86_vectorcallcc void @"\01v3@@24"(i32 %a, i32 %b.coerce, i32 %c)
 
 struct Large { int a[5]; };
diff --git a/test/CodeGen/wasm-varargs.c b/test/CodeGen/wasm-varargs.c
new file mode 100644
index 0000000000000..b8e488ec14404
--- /dev/null
+++ b/test/CodeGen/wasm-varargs.c
@@ -0,0 +1,103 @@
+// RUN: %clang_cc1 -triple wasm32-unknown-unknown -o - -emit-llvm %s | FileCheck %s
+
+#include <stdarg.h>
+
+int test_i32(char *fmt, ...) {
+  va_list va;
+
+  va_start(va, fmt);
+  int v = va_arg(va, int);
+  va_end(va);
+
+  return v;
+}
+
+// CHECK-LABEL: define i32 @test_i32(i8*{{.*}} %fmt, ...) {{.*}} {
+// CHECK:   [[FMT_ADDR:%[^,=]+]] = alloca i8*, align 4
+// CHECK:   [[VA:%[^,=]+]] = alloca i8*, align 4
+// CHECK:   [[V:%[^,=]+]] = alloca i32, align 4
+// CHECK:   store i8* %fmt, i8** [[FMT_ADDR]], align 4
+// CHECK:   [[VA1:%[^,=]+]] = bitcast i8** [[VA]] to i8*
+// CHECK:   call void @llvm.va_start(i8* [[VA1]])
+// CHECK:   [[ARGP_CUR:%[^,=]+]] = load i8*, i8** [[VA]], align 4
+// CHECK:   [[ARGP_NEXT:%[^,=]+]] = getelementptr inbounds i8, i8* [[ARGP_CUR]], i32 4
+// CHECK:   store i8* [[ARGP_NEXT]], i8** [[VA]], align 4
+// CHECK:   [[R3:%[^,=]+]] = bitcast i8* [[ARGP_CUR]] to i32*
+// CHECK:   [[R4:%[^,=]+]] = load i32, i32* [[R3]], align 4
+// CHECK:   store i32 [[R4]], i32* [[V]], align 4
+// CHECK:   [[VA2:%[^,=]+]] = bitcast i8** [[VA]] to i8*
+// CHECK:   call void @llvm.va_end(i8* [[VA2]])
+// CHECK:   [[R5:%[^,=]+]] = load i32, i32* [[V]], align 4
+// CHECK:   ret i32 [[R5]]
+// CHECK: }
+
+long long test_i64(char *fmt, ...) {
+  va_list va;
+
+  va_start(va, fmt);
+  long long v = va_arg(va, long long);
+  va_end(va);
+
+  return v;
+}
+
+// CHECK-LABEL: define i64 @test_i64(i8*{{.*}} %fmt, ...) {{.*}} {
+// CHECK:   [[FMT_ADDR:%[^,=]+]] = alloca i8*, align 4
+// CHECK:   [[VA:%[^,=]+]] = alloca i8*, align 4
+// CHECK:   [[V:%[^,=]+]] = alloca i64, align 8
+// CHECK:   store i8* %fmt, i8** [[FMT_ADDR]], align 4
+// CHECK:   [[VA1:%[^,=]+]] = bitcast i8** [[VA]] to i8*
+// CHECK:   call void @llvm.va_start(i8* [[VA1]])
+// CHECK:   [[ARGP_CUR:%[^,=]+]] = load i8*, i8** [[VA]], align 4
+// CHECK:   [[R0:%[^,=]+]] = ptrtoint i8* [[ARGP_CUR]] to i32
+// CHECK:   [[R1:%[^,=]+]] = add i32 [[R0]], 7
+// CHECK:   [[R2:%[^,=]+]] = and i32 [[R1]], -8
+// CHECK:   [[ARGP_CUR_ALIGNED:%[^,=]+]] = inttoptr i32 [[R2]] to i8*
+// CHECK:   [[ARGP_NEXT:%[^,=]+]] = getelementptr inbounds i8, i8* [[ARGP_CUR_ALIGNED]], i32 8
+// CHECK:   store i8* [[ARGP_NEXT]], i8** [[VA]], align 4
+// CHECK:   [[R3:%[^,=]+]] = bitcast i8* [[ARGP_CUR_ALIGNED]] to i64*
+// CHECK:   [[R4:%[^,=]+]] = load i64, i64* [[R3]], align 8
+// CHECK:   store i64 [[R4]], i64* [[V]], align 8
+// CHECK:   [[VA2:%[^,=]+]] = bitcast i8** [[VA]] to i8*
+// CHECK:   call void @llvm.va_end(i8* [[VA2]])
+// CHECK:   [[R5:%[^,=]+]] = load i64, i64* [[V]], align 8
+// CHECK:   ret i64 [[R5]]
+// CHECK: }
+
+struct S {
+    int x;
+    int y;
+    int z;
+};
+
+struct S test_struct(char *fmt, ...) {
+  va_list va;
+
+  va_start(va, fmt);
+  struct S v = va_arg(va, struct S);
+  va_end(va);
+
+  return v;
+}
+
+// CHECK: define void @test_struct([[STRUCT_S:%[^,=]+]]*{{.*}} noalias sret %agg.result, i8*{{.*}} %fmt, ...) {{.*}} {
+// CHECK:   [[FMT_ADDR:%[^,=]+]] = alloca i8*, align 4
+// CHECK:   [[VA:%[^,=]+]] = alloca i8*, align 4
+// CHECK:   [[V:%[^,=]+]] = alloca [[STRUCT_S]], align 4
+// CHECK:   store i8* %fmt, i8** [[FMT_ADDR]], align 4
+// CHECK:   [[VA1:%[^,=]+]] = bitcast i8** [[VA]] to i8*
+// CHECK:   call void @llvm.va_start(i8* [[VA1]])
+// CHECK:   [[ARGP_CUR:%[^,=]+]] = load i8*, i8** [[VA]], align 4
+// CHECK:   [[ARGP_NEXT:%[^,=]+]] = getelementptr inbounds i8, i8* [[ARGP_CUR]], i32 12
+// CHECK:   store i8* [[ARGP_NEXT]], i8** [[VA]], align 4
+// CHECK:   [[R3:%[^,=]+]] = bitcast i8* [[ARGP_CUR]] to [[STRUCT_S]]*
+// CHECK:   [[R4:%[^,=]+]] = bitcast [[STRUCT_S]]* [[V]] to i8*
+// CHECK:   [[R5:%[^,=]+]] = bitcast [[STRUCT_S]]* [[R3]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[R4]], i8* [[R5]], i32 12, i32 4, i1 false)
+// CHECK:   [[VA2:%[^,=]+]] = bitcast i8** [[VA]] to i8*
+// CHECK:   call void @llvm.va_end(i8* [[VA2]])
+// CHECK:   [[R6:%[^,=]+]] = bitcast [[STRUCT_S]]* %agg.result to i8*
+// CHECK:   [[R7:%[^,=]+]] = bitcast [[STRUCT_S]]* [[V]] to i8*
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[R6]], i8* [[R7]], i32 12, i32 4, i1 false)
+// CHECK:   ret void
+// CHECK: }
diff --git a/test/CodeGen/windows-on-arm-tls-support.c b/test/CodeGen/windows-on-arm-tls-support.c
new file mode 100644
index 0000000000000..dfb8b27fd7f17
--- /dev/null
+++ b/test/CodeGen/windows-on-arm-tls-support.c
@@ -0,0 +1,5 @@
+// RUN: %clang_cc1 -triple thumbv7--windows -fms-extensions -fsyntax-only -verify %s
+// expected-no-diagnostics
+
+__declspec(thread) int i;
+
diff --git a/test/CodeGen/windows-struct-abi.c b/test/CodeGen/windows-struct-abi.c
index 4b4a6f1b5db31..1631f61db90cd 100644
--- a/test/CodeGen/windows-struct-abi.c
+++ b/test/CodeGen/windows-struct-abi.c
@@ -10,7 +10,7 @@ struct f1 return_f1(void) { while (1); }
 
 void receive_f1(struct f1 a0) { }
 
-// CHECK: define void @receive_f1(%struct.f1* byval align 4 %a0)
+// CHECK: define void @receive_f1(float %a0.0)
 
 struct f2 {
   float f;
@@ -23,7 +23,7 @@ struct f2 return_f2(void) { while (1); }
 
 void receive_f2(struct f2 a0) { }
 
-// CHECK: define void @receive_f2(%struct.f2* byval align 4 %a0)
+// CHECK: define void @receive_f2(float %a0.0, float %a0.1)
 
 struct f4 {
   float f;
@@ -38,5 +38,5 @@ struct f4 return_f4(void) { while (1); }
 
 void receive_f4(struct f4 a0) { }
 
-// CHECK: define void @receive_f4(%struct.f4* byval align 4 %a0)
+// CHECK: define void @receive_f4(float %a0.0, float %a0.1, float %a0.2, float %a0.3)
 
diff --git a/test/CodeGen/x86_32-arguments-win32.c b/test/CodeGen/x86_32-arguments-win32.c
index f8b09957b36e7..7b27fc746477a 100644
--- a/test/CodeGen/x86_32-arguments-win32.c
+++ b/test/CodeGen/x86_32-arguments-win32.c
@@ -1,7 +1,7 @@
 // RUN: %clang_cc1 -w -triple i386-pc-win32 -emit-llvm -o - %s | FileCheck %s
 
 // CHECK-LABEL: define i64 @f1_1()
-// CHECK-LABEL: define void @f1_2(%struct.s1* byval align 4 %a0)
+// CHECK-LABEL: define void @f1_2(i32 %a0.0, i32 %a0.1)
 struct s1 {
   int a;
   int b;
@@ -31,7 +31,7 @@ struct s4 {
 struct s4 f4_1(void) { while (1) {} }
 
 // CHECK-LABEL: define i64 @f5_1()
-// CHECK-LABEL: define void @f5_2(%struct.s5* byval align 4)
+// CHECK-LABEL: define void @f5_2(double %a0.0)
 struct s5 {
   double a;
 };
@@ -39,7 +39,7 @@ struct s5 f5_1(void) { while (1) {} }
 void f5_2(struct s5 a0) {}
 
 // CHECK-LABEL: define i32 @f6_1()
-// CHECK-LABEL: define void @f6_2(%struct.s6* byval align 4 %a0)
+// CHECK-LABEL: define void @f6_2(float %a0.0)
 struct s6 {
   float a;
 };
diff --git a/test/CodeGen/x86_64-arguments.c b/test/CodeGen/x86_64-arguments.c
index e3b853dc37279..a2d60cc6b1abb 100644
--- a/test/CodeGen/x86_64-arguments.c
+++ b/test/CodeGen/x86_64-arguments.c
@@ -261,12 +261,12 @@ void f33(va_list X) {
 typedef unsigned long long v1i64 __attribute__((__vector_size__(8)));
 
 // rdar://8359248
-// CHECK-LABEL: define i64 @f34(i64 %arg.coerce)
+// CHECK-LABEL: define double @f34(double %arg.coerce)
 v1i64 f34(v1i64 arg) { return arg; }
 
 
 // rdar://8358475
-// CHECK-LABEL: define i64 @f35(i64 %arg.coerce)
+// CHECK-LABEL: define double @f35(double %arg.coerce)
 typedef unsigned long v1i64_2 __attribute__((__vector_size__(8)));
 v1i64_2 f35(v1i64_2 arg) { return arg+arg; }
 
diff --git a/test/CodeGen/x86_64-longdouble.c b/test/CodeGen/x86_64-longdouble.c
index 8baf4d1649126..8aeddb47878e0 100644
--- a/test/CodeGen/x86_64-longdouble.c
+++ b/test/CodeGen/x86_64-longdouble.c
@@ -11,12 +11,12 @@
 // Android uses fp128 for long double but other x86_64 targets use x86_fp80.
 
 long double dataLD = 1.0L;
-// ANDROID: @dataLD = global fp128 0xL00000000000000003FFF000000000000, align 16
-// GNU: @dataLD = global x86_fp80 0xK3FFF8000000000000000, align 16
+// ANDROID: @dataLD = local_unnamed_addr global fp128 0xL00000000000000003FFF000000000000, align 16
+// GNU: @dataLD = local_unnamed_addr global x86_fp80 0xK3FFF8000000000000000, align 16
 
 long double _Complex dataLDC = {1.0L, 1.0L};
-// ANDROID: @dataLDC = global { fp128, fp128 } { fp128 0xL00000000000000003FFF000000000000, fp128 0xL00000000000000003FFF000000000000 }, align 16
-// GNU: @dataLDC = global { x86_fp80, x86_fp80 } { x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK3FFF8000000000000000 }, align 16
+// ANDROID: @dataLDC = local_unnamed_addr global { fp128, fp128 } { fp128 0xL00000000000000003FFF000000000000, fp128 0xL00000000000000003FFF000000000000 }, align 16
+// GNU: @dataLDC = local_unnamed_addr global { x86_fp80, x86_fp80 } { x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK3FFF8000000000000000 }, align 16
 
 long double TestLD(long double x) {
   return x * x;
diff --git a/test/CodeGen/xop-builtins.c b/test/CodeGen/xop-builtins.c
index 5f0f20d07bbb6..3b1d1ef39cfc5 100644
--- a/test/CodeGen/xop-builtins.c
+++ b/test/CodeGen/xop-builtins.c
@@ -1,390 +1,393 @@
 // RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +xop -emit-llvm -o - -Werror | FileCheck %s
+// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin -target-feature +xop -fno-signed-char -emit-llvm -o - -Werror | FileCheck %s
 
 // Don't include mm_malloc.h, it's system specific.
 #define __MM_MALLOC_H
 
 #include <x86intrin.h>
 
+// NOTE: This should match the tests in llvm/test/CodeGen/X86/xop-intrinsics-fast-isel.ll
+
 __m128i test_mm_maccs_epi16(__m128i a, __m128i b, __m128i c) {
   // CHECK-LABEL: test_mm_maccs_epi16
-  // CHECK: @llvm.x86.xop.vpmacssww
+  // CHECK: call <8 x i16> @llvm.x86.xop.vpmacssww(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_maccs_epi16(a, b, c);
 }
 
 __m128i test_mm_macc_epi16(__m128i a, __m128i b, __m128i c) {
   // CHECK-LABEL: test_mm_macc_epi16
-  // CHECK: @llvm.x86.xop.vpmacsww
+  // CHECK: call <8 x i16> @llvm.x86.xop.vpmacsww(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_macc_epi16(a, b, c);
 }
 
 __m128i test_mm_maccsd_epi16(__m128i a, __m128i b, __m128i c) {
   // CHECK-LABEL: test_mm_maccsd_epi16
-  // CHECK: @llvm.x86.xop.vpmacsswd
+  // CHECK: call <4 x i32> @llvm.x86.xop.vpmacsswd(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_maccsd_epi16(a, b, c);
 }
 
 __m128i test_mm_maccd_epi16(__m128i a, __m128i b, __m128i c) {
   // CHECK-LABEL: test_mm_maccd_epi16
-  // CHECK: @llvm.x86.xop.vpmacswd
+  // CHECK: call <4 x i32> @llvm.x86.xop.vpmacswd(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_maccd_epi16(a, b, c);
 }
 
 __m128i test_mm_maccs_epi32(__m128i a, __m128i b, __m128i c) {
   // CHECK-LABEL: test_mm_maccs_epi32
-  // CHECK: @llvm.x86.xop.vpmacssdd
+  // CHECK: call <4 x i32> @llvm.x86.xop.vpmacssdd(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_maccs_epi32(a, b, c);
 }
 
 __m128i test_mm_macc_epi32(__m128i a, __m128i b, __m128i c) {
   // CHECK-LABEL: test_mm_macc_epi32
-  // CHECK: @llvm.x86.xop.vpmacsdd
+  // CHECK: call <4 x i32> @llvm.x86.xop.vpmacsdd(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_macc_epi32(a, b, c);
 }
 
 __m128i test_mm_maccslo_epi32(__m128i a, __m128i b, __m128i c) {
   // CHECK-LABEL: test_mm_maccslo_epi32
-  // CHECK: @llvm.x86.xop.vpmacssdql
+  // CHECK: call <2 x i64> @llvm.x86.xop.vpmacssdql(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <2 x i64> %{{.*}})
   return _mm_maccslo_epi32(a, b, c);
 }
 
 __m128i test_mm_macclo_epi32(__m128i a, __m128i b, __m128i c) {
   // CHECK-LABEL: test_mm_macclo_epi32
-  // CHECK: @llvm.x86.xop.vpmacsdql
+  // CHECK: call <2 x i64> @llvm.x86.xop.vpmacsdql(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <2 x i64> %{{.*}})
   return _mm_macclo_epi32(a, b, c);
 }
 
 __m128i test_mm_maccshi_epi32(__m128i a, __m128i b, __m128i c) {
   // CHECK-LABEL: test_mm_maccshi_epi32
-  // CHECK: @llvm.x86.xop.vpmacssdqh
+  // CHECK: call <2 x i64> @llvm.x86.xop.vpmacssdqh(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <2 x i64> %{{.*}})
   return _mm_maccshi_epi32(a, b, c);
 }
 
 __m128i test_mm_macchi_epi32(__m128i a, __m128i b, __m128i c) {
   // CHECK-LABEL: test_mm_macchi_epi32
-  // CHECK: @llvm.x86.xop.vpmacsdqh
+  // CHECK: call <2 x i64> @llvm.x86.xop.vpmacsdqh(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <2 x i64> %{{.*}})
   return _mm_macchi_epi32(a, b, c);
 }
 
 __m128i test_mm_maddsd_epi16(__m128i a, __m128i b, __m128i c) {
   // CHECK-LABEL: test_mm_maddsd_epi16
-  // CHECK: @llvm.x86.xop.vpmadcsswd
+  // CHECK: call <4 x i32> @llvm.x86.xop.vpmadcsswd(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_maddsd_epi16(a, b, c);
 }
 
 __m128i test_mm_maddd_epi16(__m128i a, __m128i b, __m128i c) {
   // CHECK-LABEL: test_mm_maddd_epi16
-  // CHECK: @llvm.x86.xop.vpmadcswd
+  // CHECK: call <4 x i32> @llvm.x86.xop.vpmadcswd(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_maddd_epi16(a, b, c);
 }
 
 __m128i test_mm_haddw_epi8(__m128i a) {
   // CHECK-LABEL: test_mm_haddw_epi8
-  // CHECK: @llvm.x86.xop.vphaddbw
+  // CHECK: call <8 x i16> @llvm.x86.xop.vphaddbw(<16 x i8> %{{.*}})
   return _mm_haddw_epi8(a);
 }
 
 __m128i test_mm_haddd_epi8(__m128i a) {
   // CHECK-LABEL: test_mm_haddd_epi8
-  // CHECK: @llvm.x86.xop.vphaddbd
+  // CHECK: call <4 x i32> @llvm.x86.xop.vphaddbd(<16 x i8> %{{.*}})
   return _mm_haddd_epi8(a);
 }
 
 __m128i test_mm_haddq_epi8(__m128i a) {
   // CHECK-LABEL: test_mm_haddq_epi8
-  // CHECK: @llvm.x86.xop.vphaddbq
+  // CHECK: call <2 x i64> @llvm.x86.xop.vphaddbq(<16 x i8> %{{.*}})
   return _mm_haddq_epi8(a);
 }
 
 __m128i test_mm_haddd_epi16(__m128i a) {
   // CHECK-LABEL: test_mm_haddd_epi16
-  // CHECK: @llvm.x86.xop.vphaddwd
+  // CHECK: call <4 x i32> @llvm.x86.xop.vphaddwd(<8 x i16> %{{.*}})
   return _mm_haddd_epi16(a);
 }
 
 __m128i test_mm_haddq_epi16(__m128i a) {
   // CHECK-LABEL: test_mm_haddq_epi16
-  // CHECK: @llvm.x86.xop.vphaddwq
+  // CHECK: call <2 x i64> @llvm.x86.xop.vphaddwq(<8 x i16> %{{.*}})
   return _mm_haddq_epi16(a);
 }
 
 __m128i test_mm_haddq_epi32(__m128i a) {
   // CHECK-LABEL: test_mm_haddq_epi32
-  // CHECK: @llvm.x86.xop.vphadddq
+  // CHECK: call <2 x i64> @llvm.x86.xop.vphadddq(<4 x i32> %{{.*}})
   return _mm_haddq_epi32(a);
 }
 
 __m128i test_mm_haddw_epu8(__m128i a) {
   // CHECK-LABEL: test_mm_haddw_epu8
-  // CHECK: @llvm.x86.xop.vphaddubw
+  // CHECK: call <8 x i16> @llvm.x86.xop.vphaddubw(<16 x i8> %{{.*}})
   return _mm_haddw_epu8(a);
 }
 
 __m128i test_mm_haddd_epu8(__m128i a) {
   // CHECK-LABEL: test_mm_haddd_epu8
-  // CHECK: @llvm.x86.xop.vphaddubd
+  // CHECK: call <4 x i32> @llvm.x86.xop.vphaddubd(<16 x i8> %{{.*}})
   return _mm_haddd_epu8(a);
 }
 
 __m128i test_mm_haddq_epu8(__m128i a) {
   // CHECK-LABEL: test_mm_haddq_epu8
-  // CHECK: @llvm.x86.xop.vphaddubq
+  // CHECK: call <2 x i64> @llvm.x86.xop.vphaddubq(<16 x i8> %{{.*}})
   return _mm_haddq_epu8(a);
 }
 
 __m128i test_mm_haddd_epu16(__m128i a) {
   // CHECK-LABEL: test_mm_haddd_epu16
-  // CHECK: @llvm.x86.xop.vphadduwd
+  // CHECK: call <4 x i32> @llvm.x86.xop.vphadduwd(<8 x i16> %{{.*}})
   return _mm_haddd_epu16(a);
 }
 
 __m128i test_mm_haddq_epu16(__m128i a) {
   // CHECK-LABEL: test_mm_haddq_epu16
-  // CHECK: @llvm.x86.xop.vphadduwq
+  // CHECK: call <2 x i64> @llvm.x86.xop.vphadduwq(<8 x i16> %{{.*}})
   return _mm_haddq_epu16(a);
 }
 
 __m128i test_mm_haddq_epu32(__m128i a) {
   // CHECK-LABEL: test_mm_haddq_epu32
-  // CHECK: @llvm.x86.xop.vphaddudq
+  // CHECK: call <2 x i64> @llvm.x86.xop.vphaddudq(<4 x i32> %{{.*}})
   return _mm_haddq_epu32(a);
 }
 
 __m128i test_mm_hsubw_epi8(__m128i a) {
   // CHECK-LABEL: test_mm_hsubw_epi8
-  // CHECK: @llvm.x86.xop.vphsubbw
+  // CHECK: call <8 x i16> @llvm.x86.xop.vphsubbw(<16 x i8> %{{.*}})
   return _mm_hsubw_epi8(a);
 }
 
 __m128i test_mm_hsubd_epi16(__m128i a) {
   // CHECK-LABEL: test_mm_hsubd_epi16
-  // CHECK: @llvm.x86.xop.vphsubwd
+  // CHECK: call <4 x i32> @llvm.x86.xop.vphsubwd(<8 x i16> %{{.*}})
   return _mm_hsubd_epi16(a);
 }
 
 __m128i test_mm_hsubq_epi32(__m128i a) {
   // CHECK-LABEL: test_mm_hsubq_epi32
-  // CHECK: @llvm.x86.xop.vphsubdq
+  // CHECK: call <2 x i64> @llvm.x86.xop.vphsubdq(<4 x i32> %{{.*}})
   return _mm_hsubq_epi32(a);
 }
 
 __m128i test_mm_cmov_si128(__m128i a, __m128i b, __m128i c) {
   // CHECK-LABEL: test_mm_cmov_si128
-  // CHECK: @llvm.x86.xop.vpcmov
+  // CHECK: call <2 x i64> @llvm.x86.xop.vpcmov(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}})
   return _mm_cmov_si128(a, b, c);
 }
 
 __m256i test_mm256_cmov_si256(__m256i a, __m256i b, __m256i c) {
   // CHECK-LABEL: test_mm256_cmov_si256
-  // CHECK: @llvm.x86.xop.vpcmov.256
+  // CHECK: call <4 x i64> @llvm.x86.xop.vpcmov.256(<4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}})
   return _mm256_cmov_si256(a, b, c);
 }
 
 __m128i test_mm_perm_epi8(__m128i a, __m128i b, __m128i c) {
   // CHECK-LABEL: test_mm_perm_epi8
-  // CHECK: @llvm.x86.xop.vpperm
+  // CHECK: call <16 x i8> @llvm.x86.xop.vpperm(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   return _mm_perm_epi8(a, b, c);
 }
 
 __m128i test_mm_rot_epi8(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_rot_epi8
-  // CHECK: @llvm.x86.xop.vprotb
+  // CHECK: call <16 x i8> @llvm.x86.xop.vprotb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   return _mm_rot_epi8(a, b);
 }
 
 __m128i test_mm_rot_epi16(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_rot_epi16
-  // CHECK: @llvm.x86.xop.vprotw
+  // CHECK: call <8 x i16> @llvm.x86.xop.vprotw(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_rot_epi16(a, b);
 }
 
 __m128i test_mm_rot_epi32(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_rot_epi32
-  // CHECK: @llvm.x86.xop.vprotd
+  // CHECK: call <4 x i32> @llvm.x86.xop.vprotd(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_rot_epi32(a, b);
 }
 
 __m128i test_mm_rot_epi64(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_rot_epi64
-  // CHECK: @llvm.x86.xop.vprotq
+  // CHECK: call <2 x i64> @llvm.x86.xop.vprotq(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
   return _mm_rot_epi64(a, b);
 }
 
 __m128i test_mm_roti_epi8(__m128i a) {
   // CHECK-LABEL: test_mm_roti_epi8
-  // CHECK: @llvm.x86.xop.vprotbi
+  // CHECK: call <16 x i8> @llvm.x86.xop.vprotbi(<16 x i8> %{{.*}}, i8 1)
   return _mm_roti_epi8(a, 1);
 }
 
 __m128i test_mm_roti_epi16(__m128i a) {
   // CHECK-LABEL: test_mm_roti_epi16
-  // CHECK: @llvm.x86.xop.vprotwi
+  // CHECK: call <8 x i16> @llvm.x86.xop.vprotwi(<8 x i16> %{{.*}}, i8 50)
   return _mm_roti_epi16(a, 50);
 }
 
 __m128i test_mm_roti_epi32(__m128i a) {
   // CHECK-LABEL: test_mm_roti_epi32
-  // CHECK: @llvm.x86.xop.vprotdi
+  // CHECK: call <4 x i32> @llvm.x86.xop.vprotdi(<4 x i32> %{{.*}}, i8 -30)
   return _mm_roti_epi32(a, -30);
 }
 
 __m128i test_mm_roti_epi64(__m128i a) {
   // CHECK-LABEL: test_mm_roti_epi64
-  // CHECK: @llvm.x86.xop.vprotqi
+  // CHECK: call <2 x i64> @llvm.x86.xop.vprotqi(<2 x i64> %{{.*}}, i8 100)
   return _mm_roti_epi64(a, 100);
 }
 
 __m128i test_mm_shl_epi8(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_shl_epi8
-  // CHECK: @llvm.x86.xop.vpshlb
+  // CHECK: call <16 x i8> @llvm.x86.xop.vpshlb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   return _mm_shl_epi8(a, b);
 }
 
 __m128i test_mm_shl_epi16(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_shl_epi16
-  // CHECK: @llvm.x86.xop.vpshlw
+  // CHECK: call <8 x i16> @llvm.x86.xop.vpshlw(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_shl_epi16(a, b);
 }
 
 __m128i test_mm_shl_epi32(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_shl_epi32
-  // CHECK: @llvm.x86.xop.vpshld
+  // CHECK: call <4 x i32> @llvm.x86.xop.vpshld(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_shl_epi32(a, b);
 }
 
 __m128i test_mm_shl_epi64(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_shl_epi64
-  // CHECK: @llvm.x86.xop.vpshlq
+  // CHECK: call <2 x i64> @llvm.x86.xop.vpshlq(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
   return _mm_shl_epi64(a, b);
 }
 
 __m128i test_mm_sha_epi8(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_sha_epi8
-  // CHECK: @llvm.x86.xop.vpshab
+  // CHECK: call <16 x i8> @llvm.x86.xop.vpshab(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
   return _mm_sha_epi8(a, b);
 }
 
 __m128i test_mm_sha_epi16(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_sha_epi16
-  // CHECK: @llvm.x86.xop.vpshaw
+  // CHECK: call <8 x i16> @llvm.x86.xop.vpshaw(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_sha_epi16(a, b);
 }
 
 __m128i test_mm_sha_epi32(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_sha_epi32
-  // CHECK: @llvm.x86.xop.vpshad
+  // CHECK: call <4 x i32> @llvm.x86.xop.vpshad(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
   return _mm_sha_epi32(a, b);
 }
 
 __m128i test_mm_sha_epi64(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_sha_epi64
-  // CHECK: @llvm.x86.xop.vpshaq
+  // CHECK: call <2 x i64> @llvm.x86.xop.vpshaq(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
   return _mm_sha_epi64(a, b);
 }
 
 __m128i test_mm_com_epu8(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_com_epu8
-  // CHECK: @llvm.x86.xop.vpcomub
+  // CHECK: call <16 x i8> @llvm.x86.xop.vpcomub(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i8 0)
   return _mm_com_epu8(a, b, 0);
 }
 
 __m128i test_mm_com_epu16(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_com_epu16
-  // CHECK: @llvm.x86.xop.vpcomuw
+  // CHECK: call <8 x i16> @llvm.x86.xop.vpcomuw(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}, i8 0)
   return _mm_com_epu16(a, b, 0);
 }
 
 __m128i test_mm_com_epu32(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_com_epu32
-  // CHECK: @llvm.x86.xop.vpcomud
+  // CHECK: call <4 x i32> @llvm.x86.xop.vpcomud(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, i8 0)
   return _mm_com_epu32(a, b, 0);
 }
 
 __m128i test_mm_com_epu64(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_com_epu64
-  // CHECK: @llvm.x86.xop.vpcomuq
+  // CHECK: call <2 x i64> @llvm.x86.xop.vpcomuq(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, i8 0)
   return _mm_com_epu64(a, b, 0);
 }
 
 __m128i test_mm_com_epi8(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_com_epi8
-  // CHECK: @llvm.x86.xop.vpcomb
+  // CHECK: call <16 x i8> @llvm.x86.xop.vpcomb(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, i8 0)
   return _mm_com_epi8(a, b, 0);
 }
 
 __m128i test_mm_com_epi16(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_com_epi16
-  // CHECK: @llvm.x86.xop.vpcomw
+  // CHECK: call <8 x i16> @llvm.x86.xop.vpcomw(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}, i8 0)
   return _mm_com_epi16(a, b, 0);
 }
 
 __m128i test_mm_com_epi32(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_com_epi32
-  // CHECK: @llvm.x86.xop.vpcomd
+  // CHECK: call <4 x i32> @llvm.x86.xop.vpcomd(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, i8 0)
   return _mm_com_epi32(a, b, 0);
 }
 
 __m128i test_mm_com_epi64(__m128i a, __m128i b) {
   // CHECK-LABEL: test_mm_com_epi64
-  // CHECK: @llvm.x86.xop.vpcomq
+  // CHECK: call <2 x i64> @llvm.x86.xop.vpcomq(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, i8 0)
   return _mm_com_epi64(a, b, 0);
 }
 
 __m128d test_mm_permute2_pd(__m128d a, __m128d b, __m128i c) {
   // CHECK-LABEL: test_mm_permute2_pd
-  // CHECK: @llvm.x86.xop.vpermil2pd
+  // CHECK: call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i64> %{{.*}}, i8 0)
   return _mm_permute2_pd(a, b, c, 0);
 }
 
 __m256d test_mm256_permute2_pd(__m256d a, __m256d b, __m256i c) {
   // CHECK-LABEL: test_mm256_permute2_pd
-  // CHECK: @llvm.x86.xop.vpermil2pd.256
+  // CHECK: call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i64> %{{.*}}, i8 0)
   return _mm256_permute2_pd(a, b, c, 0);
 }
 
 __m128 test_mm_permute2_ps(__m128 a, __m128 b, __m128i c) {
   // CHECK-LABEL: test_mm_permute2_ps
-  // CHECK: @llvm.x86.xop.vpermil2ps
+  // CHECK: call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> %{{.*}}, i8 0)
   return _mm_permute2_ps(a, b, c, 0);
 }
 
 __m256 test_mm256_permute2_ps(__m256 a, __m256 b, __m256i c) {
   // CHECK-LABEL: test_mm256_permute2_ps
-  // CHECK: @llvm.x86.xop.vpermil2ps.256
+  // CHECK: call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> %{{.*}}, i8 0)
   return _mm256_permute2_ps(a, b, c, 0);
 }
 
 __m128 test_mm_frcz_ss(__m128 a) {
   // CHECK-LABEL: test_mm_frcz_ss
-  // CHECK: @llvm.x86.xop.vfrcz.ss
+  // CHECK: call <4 x float> @llvm.x86.xop.vfrcz.ss(<4 x float> %{{.*}})
   return _mm_frcz_ss(a);
 }
 
 __m128d test_mm_frcz_sd(__m128d a) {
   // CHECK-LABEL: test_mm_frcz_sd
-  // CHECK: @llvm.x86.xop.vfrcz.sd
+  // CHECK: call <2 x double> @llvm.x86.xop.vfrcz.sd(<2 x double> %{{.*}})
   return _mm_frcz_sd(a);
 }
 
 __m128 test_mm_frcz_ps(__m128 a) {
   // CHECK-LABEL: test_mm_frcz_ps
-  // CHECK: @llvm.x86.xop.vfrcz.ps
+  // CHECK: call <4 x float> @llvm.x86.xop.vfrcz.ps(<4 x float> %{{.*}})
   return _mm_frcz_ps(a);
 }
 
 __m128d test_mm_frcz_pd(__m128d a) {
   // CHECK-LABEL: test_mm_frcz_pd
-  // CHECK: @llvm.x86.xop.vfrcz.pd
+  // CHECK: call <2 x double> @llvm.x86.xop.vfrcz.pd(<2 x double> %{{.*}})
   return _mm_frcz_pd(a);
 }
 
 __m256 test_mm256_frcz_ps(__m256 a) {
   // CHECK-LABEL: test_mm256_frcz_ps
-  // CHECK: @llvm.x86.xop.vfrcz.ps.256
+  // CHECK: call <8 x float> @llvm.x86.xop.vfrcz.ps.256(<8 x float> %{{.*}})
   return _mm256_frcz_ps(a);
 }
 
 __m256d test_mm256_frcz_pd(__m256d a) {
   // CHECK-LABEL: test_mm256_frcz_pd
-  // CHECK: @llvm.x86.xop.vfrcz.pd.256
+  // CHECK: call <4 x double> @llvm.x86.xop.vfrcz.pd.256(<4 x double> %{{.*}})
   return _mm256_frcz_pd(a);
 }
diff --git a/test/CodeGen/xray-attributes-supported.cpp b/test/CodeGen/xray-attributes-supported.cpp
new file mode 100644
index 0000000000000..d70b3aa260132
--- /dev/null
+++ b/test/CodeGen/xray-attributes-supported.cpp
@@ -0,0 +1,13 @@
+// RUN: %clang_cc1 %s -fxray-instrument -std=c++11 -x c++ -emit-llvm -o - -triple x86_64-unknown-linux-gnu | FileCheck %s
+
+// Make sure that the LLVM attribute for XRay-annotated functions do show up.
+[[clang::xray_always_instrument]] void foo() {
+// CHECK: define void @_Z3foov() #0
+};
+
+[[clang::xray_never_instrument]] void bar() {
+// CHECK: define void @_Z3barv() #1
+};
+
+// CHECK: #0 = {{.*}}"function-instrument"="xray-always"
+// CHECK: #1 = {{.*}}"function-instrument"="xray-never"
diff --git a/test/CodeGenCUDA/Inputs/cuda-initializers.h b/test/CodeGenCUDA/Inputs/cuda-initializers.h
new file mode 100644
index 0000000000000..186b160276512
--- /dev/null
+++ b/test/CodeGenCUDA/Inputs/cuda-initializers.h
@@ -0,0 +1,145 @@
+// CUDA struct types with interesting initialization properties.
+// Keep in sync with ../SemaCUDA/Inputs/cuda-initializers.h.
+
+// Base classes with different initializer variants.
+
+// trivial constructor -- allowed
+struct T {
+  int t;
+};
+
+// empty constructor
+struct EC {
+  int ec;
+  __device__ EC() {}     // -- allowed
+  __device__ EC(int) {}  // -- not allowed
+};
+
+// empty destructor
+struct ED {
+  __device__ ~ED() {}     // -- allowed
+};
+
+struct ECD {
+  __device__ ECD() {}     // -- allowed
+  __device__ ~ECD() {}    // -- allowed
+};
+
+// empty templated constructor -- allowed with no arguments
+struct ETC {
+  template <typename... T> __device__ ETC(T...) {}
+};
+
+// undefined constructor -- not allowed
+struct UC {
+  int uc;
+  __device__ UC();
+};
+
+// undefined destructor -- not allowed
+struct UD {
+  int ud;
+  __device__ ~UD();
+};
+
+// empty constructor w/ initializer list -- not allowed
+struct ECI {
+  int eci;
+  __device__ ECI() : eci(1) {}
+};
+
+// non-empty constructor -- not allowed
+struct NEC {
+  int nec;
+  __device__ NEC() { nec = 1; }
+};
+
+// non-empty destructor -- not allowed
+struct NED {
+  int ned;
+  __device__ ~NED() { ned = 1; }
+};
+
+// no-constructor,  virtual method -- not allowed
+struct NCV {
+  int ncv;
+  __device__ virtual void vm() {}
+};
+
+// virtual destructor -- not allowed.
+struct VD {
+  __device__ virtual ~VD() {}
+};
+
+// dynamic in-class field initializer -- not allowed
+__device__ int f();
+struct NCF {
+  int ncf = f();
+};
+
+// static in-class field initializer.  NVCC does not allow it, but
+// clang generates static initializer for this, so we'll accept it.
+// We still can't use it on __shared__ vars as they don't allow *any*
+// initializers.
+struct NCFS {
+  int ncfs = 3;
+};
+
+// undefined templated constructor -- not allowed
+struct UTC {
+  template <typename... T> __device__ UTC(T...);
+};
+
+// non-empty templated constructor -- not allowed
+struct NETC {
+  int netc;
+  template <typename... T> __device__ NETC(T...) { netc = 1; }
+};
+
+// Regular base class -- allowed
+struct T_B_T : T {};
+
+// Incapsulated object of allowed class -- allowed
+struct T_F_T {
+  T t;
+};
+
+// array of allowed objects -- allowed
+struct T_FA_T {
+  T t[2];
+};
+
+
+// Calling empty base class initializer is OK
+struct EC_I_EC : EC {
+  __device__ EC_I_EC() : EC() {}
+};
+
+// .. though passing arguments is not allowed.
+struct EC_I_EC1 : EC {
+  __device__ EC_I_EC1() : EC(1) {}
+};
+
+// Virtual base class -- not allowed
+struct T_V_T : virtual T {};
+
+// Inherited from or incapsulated class with non-empty constructor --
+// not allowed
+struct T_B_NEC : NEC {};
+struct T_F_NEC {
+  NEC nec;
+};
+struct T_FA_NEC {
+  NEC nec[2];
+};
+
+
+// Inherited from or incapsulated class with non-empty desstructor --
+// not allowed
+struct T_B_NED : NED {};
+struct T_F_NED {
+  NED ned;
+};
+struct T_FA_NED {
+  NED ned[2];
+};
diff --git a/test/CodeGenCUDA/Inputs/cuda.h b/test/CodeGenCUDA/Inputs/cuda.h
index a9a4595a14a96..9b9f43a1aaa9b 100644
--- a/test/CodeGenCUDA/Inputs/cuda.h
+++ b/test/CodeGenCUDA/Inputs/cuda.h
@@ -18,3 +18,5 @@ typedef struct cudaStream *cudaStream_t;
 
 int cudaConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0,
                       cudaStream_t stream = 0);
+
+extern "C" __device__ int printf(const char*, ...);
diff --git a/test/CodeGenCUDA/address-spaces.cu b/test/CodeGenCUDA/address-spaces.cu
index 31cba958e154a..449529bb24b45 100644
--- a/test/CodeGenCUDA/address-spaces.cu
+++ b/test/CodeGenCUDA/address-spaces.cu
@@ -25,8 +25,6 @@ struct MyStruct {
 // CHECK: @_ZZ5func3vE1a = internal addrspace(3) global float 0.000000e+00
 // CHECK: @_ZZ5func4vE1a = internal addrspace(3) global float 0.000000e+00
 // CHECK: @b = addrspace(3) global float undef
-// CHECK: @c = addrspace(3) global %struct.c undef
-// CHECK  @d = addrspace(3) global %struct.d undef
 
 __device__ void foo() {
   // CHECK: load i32, i32* addrspacecast (i32 addrspace(1)* @i to i32*)
@@ -38,14 +36,6 @@ __device__ void foo() {
   // CHECK: load i32, i32* addrspacecast (i32 addrspace(3)* @k to i32*)
   k++;
 
-  static int li;
-  // CHECK: load i32, i32* addrspacecast (i32 addrspace(1)* @_ZZ3foovE2li to i32*)
-  li++;
-
-  __constant__ int lj;
-  // CHECK: load i32, i32* addrspacecast (i32 addrspace(4)* @_ZZ3foovE2lj to i32*)
-  lj++;
-
   __shared__ int lk;
   // CHECK: load i32, i32* addrspacecast (i32 addrspace(3)* @_ZZ3foovE2lk to i32*)
   lk++;
@@ -102,32 +92,3 @@ __device__ float *func5() {
 }
 // CHECK: define float* @_Z5func5v()
 // CHECK: ret float* addrspacecast (float addrspace(3)* @b to float*)
-
-struct StructWithCtor {
-  __device__ StructWithCtor(): data(1) {}
-  __device__ StructWithCtor(const StructWithCtor &second): data(second.data) {}
-  __device__ int getData() { return data; }
-  int data;
-};
-
-__device__ int construct_shared_struct() {
-// CHECK-LABEL: define i32 @_Z23construct_shared_structv()
-  __shared__ StructWithCtor s;
-// CHECK: call void @_ZN14StructWithCtorC1Ev(%struct.StructWithCtor* addrspacecast (%struct.StructWithCtor addrspace(3)* @_ZZ23construct_shared_structvE1s to %struct.StructWithCtor*))
-  __shared__ StructWithCtor t(s);
-// CHECK: call void @_ZN14StructWithCtorC1ERKS_(%struct.StructWithCtor* addrspacecast (%struct.StructWithCtor addrspace(3)* @_ZZ23construct_shared_structvE1t to %struct.StructWithCtor*), %struct.StructWithCtor* dereferenceable(4) addrspacecast (%struct.StructWithCtor addrspace(3)* @_ZZ23construct_shared_structvE1s to %struct.StructWithCtor*))
-  return t.getData();
-// CHECK: call i32 @_ZN14StructWithCtor7getDataEv(%struct.StructWithCtor* addrspacecast (%struct.StructWithCtor addrspace(3)* @_ZZ23construct_shared_structvE1t to %struct.StructWithCtor*))
-}
-
-// Make sure we allow __shared__ structures with default or empty constructors.
-struct c {
-  int i;
-};
-__shared__ struct c c;
-
-struct d {
-  int i;
-  d() {}
-};
-__shared__ struct d d;
diff --git a/test/CodeGenCUDA/alias.cu b/test/CodeGenCUDA/alias.cu
new file mode 100644
index 0000000000000..6efff6b92aa88
--- /dev/null
+++ b/test/CodeGenCUDA/alias.cu
@@ -0,0 +1,17 @@
+// REQUIRES: x86-registered-target
+// REQUIRES: nvptx-registered-target
+
+// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -emit-llvm \
+// RUN:   -o - %s | FileCheck %s
+
+#include "Inputs/cuda.h"
+
+// Check that we don't generate an alias from "foo" to the mangled name for
+// ns::foo() -- nvptx doesn't support aliases.
+
+namespace ns {
+extern "C" {
+// CHECK-NOT: @foo = internal alias
+__device__ __attribute__((used)) static int foo() { return 0; }
+}
+}
diff --git a/test/CodeGenCUDA/convergent.cu b/test/CodeGenCUDA/convergent.cu
new file mode 100644
index 0000000000000..6827c57d29fbe
--- /dev/null
+++ b/test/CodeGenCUDA/convergent.cu
@@ -0,0 +1,45 @@
+// REQUIRES: x86-registered-target
+// REQUIRES: nvptx-registered-target
+
+// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -emit-llvm \
+// RUN:   -disable-llvm-passes -o - %s | FileCheck -check-prefix DEVICE %s
+
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm \
+// RUN:   -disable-llvm-passes -o - %s | \
+// RUN:  FileCheck -check-prefix HOST %s
+
+#include "Inputs/cuda.h"
+
+// DEVICE: Function Attrs:
+// DEVICE-SAME: convergent
+// DEVICE-NEXT: define void @_Z3foov
+__device__ void foo() {}
+
+// HOST: Function Attrs:
+// HOST-NOT: convergent
+// HOST-NEXT: define void @_Z3barv
+// DEVICE: Function Attrs:
+// DEVICE-SAME: convergent
+// DEVICE-NEXT: define void @_Z3barv
+__host__ __device__ void baz();
+__host__ __device__ void bar() {
+  // DEVICE: call void @_Z3bazv() [[CALL_ATTR:#[0-9]+]]
+  baz();
+  // DEVICE: call i32 asm "trap;", "=l"() [[ASM_ATTR:#[0-9]+]]
+  int x;
+  asm ("trap;" : "=l"(x));
+  // DEVICE: call void asm sideeffect "trap;", ""() [[ASM_ATTR:#[0-9]+]]
+  asm volatile ("trap;");
+}
+
+// DEVICE: declare void @_Z3bazv() [[BAZ_ATTR:#[0-9]+]]
+// DEVICE: attributes [[BAZ_ATTR]] = {
+// DEVICE-SAME: convergent
+// DEVICE-SAME: }
+// DEVICE: attributes [[CALL_ATTR]] = { convergent }
+// DEVICE: attributes [[ASM_ATTR]] = { convergent
+
+// HOST: declare void @_Z3bazv() [[BAZ_ATTR:#[0-9]+]]
+// HOST: attributes [[BAZ_ATTR]] = {
+// HOST-NOT: convergent
+// NOST-SAME: }
diff --git a/test/CodeGenCUDA/cuda-builtin-vars.cu b/test/CodeGenCUDA/cuda-builtin-vars.cu
index 834e16d04d67e..c2159f5af1419 100644
--- a/test/CodeGenCUDA/cuda-builtin-vars.cu
+++ b/test/CodeGenCUDA/cuda-builtin-vars.cu
@@ -6,21 +6,21 @@
 __attribute__((global))
 void kernel(int *out) {
   int i = 0;
-  out[i++] = threadIdx.x; // CHECK: call i32 @llvm.ptx.read.tid.x()
-  out[i++] = threadIdx.y; // CHECK: call i32 @llvm.ptx.read.tid.y()
-  out[i++] = threadIdx.z; // CHECK: call i32 @llvm.ptx.read.tid.z()
+  out[i++] = threadIdx.x; // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  out[i++] = threadIdx.y; // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
+  out[i++] = threadIdx.z; // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.tid.z()
 
-  out[i++] = blockIdx.x; // CHECK: call i32 @llvm.ptx.read.ctaid.x()
-  out[i++] = blockIdx.y; // CHECK: call i32 @llvm.ptx.read.ctaid.y()
-  out[i++] = blockIdx.z; // CHECK: call i32 @llvm.ptx.read.ctaid.z()
+  out[i++] = blockIdx.x; // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
+  out[i++] = blockIdx.y; // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
+  out[i++] = blockIdx.z; // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
 
-  out[i++] = blockDim.x; // CHECK: call i32 @llvm.ptx.read.ntid.x()
-  out[i++] = blockDim.y; // CHECK: call i32 @llvm.ptx.read.ntid.y()
-  out[i++] = blockDim.z; // CHECK: call i32 @llvm.ptx.read.ntid.z()
+  out[i++] = blockDim.x; // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  out[i++] = blockDim.y; // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
+  out[i++] = blockDim.z; // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
 
-  out[i++] = gridDim.x; // CHECK: call i32 @llvm.ptx.read.nctaid.x()
-  out[i++] = gridDim.y; // CHECK: call i32 @llvm.ptx.read.nctaid.y()
-  out[i++] = gridDim.z; // CHECK: call i32 @llvm.ptx.read.nctaid.z()
+  out[i++] = gridDim.x; // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
+  out[i++] = gridDim.y; // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
+  out[i++] = gridDim.z; // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
 
   out[i++] = warpSize; // CHECK: store i32 32,
 
diff --git a/test/CodeGenCUDA/device-stub.cu b/test/CodeGenCUDA/device-stub.cu
index 7f5e159151cfb..5979ba3fce60f 100644
--- a/test/CodeGenCUDA/device-stub.cu
+++ b/test/CodeGenCUDA/device-stub.cu
@@ -1,7 +1,46 @@
-// RUN: %clang_cc1 -emit-llvm %s -fcuda-include-gpubinary %s -o - | FileCheck %s
+// RUN: echo "GPU binary would be here" > %t
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -fcuda-include-gpubinary %t -o - | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -fcuda-include-gpubinary %t -o -  -DNOGLOBALS \
+// RUN:   | FileCheck %s -check-prefix=NOGLOBALS
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - | FileCheck %s -check-prefix=NOGPUBIN
 
 #include "Inputs/cuda.h"
 
+#ifndef NOGLOBALS
+// CHECK-DAG: @device_var = internal global i32
+__device__ int device_var;
+
+// CHECK-DAG: @constant_var = internal global i32
+__constant__ int constant_var;
+
+// CHECK-DAG: @shared_var = internal global i32
+__shared__ int shared_var;
+
+// Make sure host globals don't get internalized...
+// CHECK-DAG: @host_var = global i32
+int host_var;
+// ... and that extern vars remain external.
+// CHECK-DAG: @ext_host_var = external global i32
+extern int ext_host_var;
+
+// Shadows for external device-side variables are *definitions* of
+// those variables.
+// CHECK-DAG: @ext_device_var = internal global i32
+extern __device__ int ext_device_var;
+// CHECK-DAG: @ext_device_var = internal global i32
+extern __constant__ int ext_constant_var;
+
+void use_pointers() {
+  int *p;
+  p = &device_var;
+  p = &constant_var;
+  p = &shared_var;
+  p = &host_var;
+  p = &ext_device_var;
+  p = &ext_constant_var;
+  p = &ext_host_var;
+}
+
 // Make sure that all parts of GPU code init/cleanup are there:
 // * constant unnamed string with the kernel name
 // CHECK: private unnamed_addr constant{{.*}}kernelfunc{{.*}}\00"
@@ -31,10 +70,16 @@ __global__ void kernelfunc(int i, int j, int k) {}
 // CHECK: call{{.*}}cudaConfigureCall
 // CHECK: call{{.*}}kernelfunc
 void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); }
+#endif
 
-// Test that we've built a function to register kernels
-// CHECK: define internal void @__cuda_register_kernels
+// Test that we've built a function to register kernels and global vars.
+// CHECK: define internal void @__cuda_register_globals
 // CHECK: call{{.*}}cudaRegisterFunction(i8** %0, {{.*}}kernelfunc
+// CHECK-DAG: call{{.*}}cudaRegisterVar(i8** %0, {{.*}}device_var{{.*}}i32 0, i32 4, i32 0, i32 0
+// CHECK-DAG: call{{.*}}cudaRegisterVar(i8** %0, {{.*}}constant_var{{.*}}i32 0, i32 4, i32 1, i32 0
+// CHECK-DAG: call{{.*}}cudaRegisterVar(i8** %0, {{.*}}ext_device_var{{.*}}i32 1, i32 4, i32 0, i32 0
+// CHECK-DAG: call{{.*}}cudaRegisterVar(i8** %0, {{.*}}ext_constant_var{{.*}}i32 1, i32 4, i32 1, i32 0
+// CHECK: ret void
 
 // Test that we've built contructor..
 // CHECK: define internal void @__cuda_module_ctor
@@ -42,11 +87,26 @@ void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); }
 // CHECK: call{{.*}}cudaRegisterFatBinary{{.*}}__cuda_fatbin_wrapper
 //   .. stores return value in __cuda_gpubin_handle
 // CHECK-NEXT: store{{.*}}__cuda_gpubin_handle
-//   .. and then calls __cuda_register_kernels
-// CHECK-NEXT: call void @__cuda_register_kernels
+//   .. and then calls __cuda_register_globals
+// CHECK-NEXT: call void @__cuda_register_globals
 
 // Test that we've created destructor.
 // CHECK: define internal void @__cuda_module_dtor
 // CHECK: load{{.*}}__cuda_gpubin_handle
 // CHECK-NEXT: call void @__cudaUnregisterFatBinary
 
+// There should be no __cuda_register_globals if we have no
+// device-side globals, but we still need to register GPU binary.
+// Skip GPU binary string first.
+// NOGLOBALS: @0 = private unnamed_addr constant{{.*}}
+// NOGLOBALS-NOT: define internal void @__cuda_register_globals
+// NOGLOBALS: define internal void @__cuda_module_ctor
+// NOGLOBALS: call{{.*}}cudaRegisterFatBinary{{.*}}__cuda_fatbin_wrapper
+// NOGLOBALS-NOT: call void @__cuda_register_globals
+// NOGLOBALS: define internal void @__cuda_module_dtor
+// NOGLOBALS: call void @__cudaUnregisterFatBinary
+
+// There should be no constructors/destructors if we have no GPU binary.
+// NOGPUBIN-NOT: define internal void @__cuda_register_globals
+// NOGPUBIN-NOT: define internal void @__cuda_module_ctor
+// NOGPUBIN-NOT: define internal void @__cuda_module_dtor
diff --git a/test/CodeGenCUDA/device-var-init.cu b/test/CodeGenCUDA/device-var-init.cu
new file mode 100644
index 0000000000000..6f2d9294131fc
--- /dev/null
+++ b/test/CodeGenCUDA/device-var-init.cu
@@ -0,0 +1,198 @@
+// REQUIRES: nvptx-registered-target
+
+// Make sure we don't allow dynamic initialization for device
+// variables, but accept empty constructors allowed by CUDA.
+
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fcuda-is-device -std=c++11 \
+// RUN:     -fno-threadsafe-statics -emit-llvm -o - %s | FileCheck %s
+
+#ifdef __clang__
+#include "Inputs/cuda.h"
+#endif
+
+// Use the types we share with Sema tests.
+#include "Inputs/cuda-initializers.h"
+
+__device__ int d_v;
+// CHECK: @d_v = addrspace(1) externally_initialized global i32 0,
+__shared__ int s_v;
+// CHECK: @s_v = addrspace(3) global i32 undef,
+__constant__ int c_v;
+// CHECK: addrspace(4) externally_initialized global i32 0,
+
+__device__ int d_v_i = 1;
+// CHECK: @d_v_i = addrspace(1) externally_initialized global i32 1,
+
+// trivial constructor -- allowed
+__device__ T d_t;
+// CHECK: @d_t = addrspace(1) externally_initialized global %struct.T zeroinitializer
+__shared__ T s_t;
+// CHECK: @s_t = addrspace(3) global %struct.T undef,
+__constant__ T c_t;
+// CHECK: @c_t = addrspace(4) externally_initialized global %struct.T zeroinitializer,
+
+__device__ T d_t_i = {2};
+// CHECK: @d_t_i = addrspace(1) externally_initialized global %struct.T { i32 2 },
+__constant__ T c_t_i = {2};
+// CHECK: @c_t_i = addrspace(4) externally_initialized global %struct.T { i32 2 },
+
+// empty constructor
+__device__ EC d_ec;
+// CHECK: @d_ec = addrspace(1) externally_initialized global %struct.EC zeroinitializer,
+__shared__ EC s_ec;
+// CHECK: @s_ec = addrspace(3) global %struct.EC undef,
+__constant__ EC c_ec;
+// CHECK: @c_ec = addrspace(4) externally_initialized global %struct.EC zeroinitializer,
+
+// empty destructor
+__device__ ED d_ed;
+// CHECK: @d_ed = addrspace(1) externally_initialized global %struct.ED zeroinitializer,
+__shared__ ED s_ed;
+// CHECK: @s_ed = addrspace(3) global %struct.ED undef,
+__constant__ ED c_ed;
+// CHECK: @c_ed = addrspace(4) externally_initialized global %struct.ED zeroinitializer,
+
+__device__ ECD d_ecd;
+// CHECK: @d_ecd = addrspace(1) externally_initialized global %struct.ECD zeroinitializer,
+__shared__ ECD s_ecd;
+// CHECK: @s_ecd = addrspace(3) global %struct.ECD undef,
+__constant__ ECD c_ecd;
+// CHECK: @c_ecd = addrspace(4) externally_initialized global %struct.ECD zeroinitializer,
+
+// empty templated constructor -- allowed with no arguments
+__device__ ETC d_etc;
+// CHECK: @d_etc = addrspace(1) externally_initialized global %struct.ETC zeroinitializer,
+__shared__ ETC s_etc;
+// CHECK: @s_etc = addrspace(3) global %struct.ETC undef,
+__constant__ ETC c_etc;
+// CHECK: @c_etc = addrspace(4) externally_initialized global %struct.ETC zeroinitializer,
+
+__device__ NCFS d_ncfs;
+// CHECK: @d_ncfs = addrspace(1) externally_initialized global %struct.NCFS { i32 3 }
+__constant__ NCFS c_ncfs;
+// CHECK: @c_ncfs = addrspace(4) externally_initialized global %struct.NCFS { i32 3 }
+
+// Regular base class -- allowed
+__device__ T_B_T d_t_b_t;
+// CHECK: @d_t_b_t = addrspace(1) externally_initialized global %struct.T_B_T zeroinitializer,
+__shared__ T_B_T s_t_b_t;
+// CHECK: @s_t_b_t = addrspace(3) global %struct.T_B_T undef,
+__constant__ T_B_T c_t_b_t;
+// CHECK: @c_t_b_t = addrspace(4) externally_initialized global %struct.T_B_T zeroinitializer,
+
+// Incapsulated object of allowed class -- allowed
+__device__ T_F_T d_t_f_t;
+// CHECK: @d_t_f_t = addrspace(1) externally_initialized global %struct.T_F_T zeroinitializer,
+__shared__ T_F_T s_t_f_t;
+// CHECK: @s_t_f_t = addrspace(3) global %struct.T_F_T undef,
+__constant__ T_F_T c_t_f_t;
+// CHECK: @c_t_f_t = addrspace(4) externally_initialized global %struct.T_F_T zeroinitializer,
+
+// array of allowed objects -- allowed
+__device__ T_FA_T d_t_fa_t;
+// CHECK: @d_t_fa_t = addrspace(1) externally_initialized global %struct.T_FA_T zeroinitializer,
+__shared__ T_FA_T s_t_fa_t;
+// CHECK: @s_t_fa_t = addrspace(3) global %struct.T_FA_T undef,
+__constant__ T_FA_T c_t_fa_t;
+// CHECK: @c_t_fa_t = addrspace(4) externally_initialized global %struct.T_FA_T zeroinitializer,
+
+
+// Calling empty base class initializer is OK
+__device__ EC_I_EC d_ec_i_ec;
+// CHECK: @d_ec_i_ec = addrspace(1) externally_initialized global %struct.EC_I_EC zeroinitializer,
+__shared__ EC_I_EC s_ec_i_ec;
+// CHECK: @s_ec_i_ec = addrspace(3) global %struct.EC_I_EC undef,
+__constant__ EC_I_EC c_ec_i_ec;
+// CHECK: @c_ec_i_ec = addrspace(4) externally_initialized global %struct.EC_I_EC zeroinitializer,
+
+// We should not emit global initializers for device-side variables.
+// CHECK-NOT: @__cxx_global_var_init
+
+// Make sure that initialization restrictions do not apply to local
+// variables.
+__device__ void df() {
+  T t;
+  // CHECK-NOT: call
+  EC ec;
+  // CHECK:   call void @_ZN2ECC1Ev(%struct.EC* %ec)
+  ED ed;
+  // CHECK-NOT: call
+  ECD ecd;
+  // CHECK:   call void @_ZN3ECDC1Ev(%struct.ECD* %ecd)
+  ETC etc;
+  // CHECK:   call void @_ZN3ETCC1IJEEEDpT_(%struct.ETC* %etc)
+  UC uc;
+  // undefined constructor -- not allowed
+  // CHECK:   call void @_ZN2UCC1Ev(%struct.UC* %uc)
+  UD ud;
+  // undefined destructor -- not allowed
+  // CHECK-NOT: call
+  ECI eci;
+  // empty constructor w/ initializer list -- not allowed
+  // CHECK:   call void @_ZN3ECIC1Ev(%struct.ECI* %eci)
+  NEC nec;
+  // non-empty constructor -- not allowed
+  // CHECK:   call void @_ZN3NECC1Ev(%struct.NEC* %nec)
+  // non-empty destructor -- not allowed
+  NED ned;
+  // no-constructor,  virtual method -- not allowed
+  // CHECK:   call void @_ZN3NCVC1Ev(%struct.NCV* %ncv)
+  NCV ncv;
+  // CHECK-NOT: call
+  VD vd;
+  // CHECK:   call void @_ZN2VDC1Ev(%struct.VD* %vd)
+  NCF ncf;
+  // CHECK:   call void @_ZN3NCFC1Ev(%struct.NCF* %ncf)
+  NCFS ncfs;
+  // CHECK:   call void @_ZN4NCFSC1Ev(%struct.NCFS* %ncfs)
+  UTC utc;
+  // CHECK:   call void @_ZN3UTCC1IJEEEDpT_(%struct.UTC* %utc)
+  NETC netc;
+  // CHECK:   call void @_ZN4NETCC1IJEEEDpT_(%struct.NETC* %netc)
+  T_B_T t_b_t;
+  // CHECK-NOT: call
+  T_F_T t_f_t;
+  // CHECK-NOT: call
+  T_FA_T t_fa_t;
+  // CHECK-NOT: call
+  EC_I_EC ec_i_ec;
+  // CHECK:   call void @_ZN7EC_I_ECC1Ev(%struct.EC_I_EC* %ec_i_ec)
+  EC_I_EC1 ec_i_ec1;
+  // CHECK:   call void @_ZN8EC_I_EC1C1Ev(%struct.EC_I_EC1* %ec_i_ec1)
+  T_V_T t_v_t;
+  // CHECK:   call void @_ZN5T_V_TC1Ev(%struct.T_V_T* %t_v_t)
+  T_B_NEC t_b_nec;
+  // CHECK:   call void @_ZN7T_B_NECC1Ev(%struct.T_B_NEC* %t_b_nec)
+  T_F_NEC t_f_nec;
+  // CHECK:   call void @_ZN7T_F_NECC1Ev(%struct.T_F_NEC* %t_f_nec)
+  T_FA_NEC t_fa_nec;
+  // CHECK:   call void @_ZN8T_FA_NECC1Ev(%struct.T_FA_NEC* %t_fa_nec)
+  T_B_NED t_b_ned;
+  // CHECK-NOT: call
+  T_F_NED t_f_ned;
+  // CHECK-NOT: call
+  T_FA_NED t_fa_ned;
+  // CHECK-NOT: call
+  static __shared__ EC s_ec;
+  // CHECK-NOT: call void @_ZN2ECC1Ev(%struct.EC* addrspacecast (%struct.EC addrspace(3)* @_ZZ2dfvE4s_ec to %struct.EC*))
+  static __shared__ ETC s_etc;
+  // CHECK-NOT: call void @_ZN3ETCC1IJEEEDpT_(%struct.ETC* addrspacecast (%struct.ETC addrspace(3)* @_ZZ2dfvE5s_etc to %struct.ETC*))
+
+  // anchor point separating constructors and destructors
+  df(); // CHECK: call void @_Z2dfv()
+
+  // Verify that we only call non-empty destructors
+  // CHECK-NEXT: call void @_ZN8T_FA_NEDD1Ev(%struct.T_FA_NED* %t_fa_ned) #6
+  // CHECK-NEXT: call void @_ZN7T_F_NEDD1Ev(%struct.T_F_NED* %t_f_ned) #6
+  // CHECK-NEXT: call void @_ZN7T_B_NEDD1Ev(%struct.T_B_NED* %t_b_ned) #6
+  // CHECK-NEXT: call void @_ZN2VDD1Ev(%struct.VD* %vd)
+  // CHECK-NEXT: call void @_ZN3NEDD1Ev(%struct.NED* %ned)
+  // CHECK-NEXT: call void @_ZN2UDD1Ev(%struct.UD* %ud)
+  // CHECK-NEXT: call void @_ZN3ECDD1Ev(%struct.ECD* %ecd)
+  // CHECK-NEXT: call void @_ZN2EDD1Ev(%struct.ED* %ed)
+
+  // CHECK-NEXT: ret void
+}
+
+// We should not emit global init function.
+// CHECK-NOT: @_GLOBAL__sub_I
diff --git a/test/CodeGenCUDA/filter-decl.cu b/test/CodeGenCUDA/filter-decl.cu
index 023ae61f3af81..bc744a07a330d 100644
--- a/test/CodeGenCUDA/filter-decl.cu
+++ b/test/CodeGenCUDA/filter-decl.cu
@@ -9,15 +9,15 @@
 // CHECK-DEVICE-NOT: module asm "file scope asm is host only"
 __asm__("file scope asm is host only");
 
-// CHECK-HOST-NOT: constantdata = externally_initialized global
+// CHECK-HOST: constantdata = internal global
 // CHECK-DEVICE: constantdata = externally_initialized global
 __constant__ char constantdata[256];
 
-// CHECK-HOST-NOT: devicedata = externally_initialized global
+// CHECK-HOST: devicedata = internal global
 // CHECK-DEVICE: devicedata = externally_initialized global
 __device__ char devicedata[256];
 
-// CHECK-HOST-NOT: shareddata = global
+// CHECK-HOST: shareddata = internal global
 // CHECK-DEVICE: shareddata = global
 __shared__ char shareddata[256];
 
diff --git a/test/CodeGenCUDA/flush-denormals.cu b/test/CodeGenCUDA/flush-denormals.cu
new file mode 100644
index 0000000000000..e528d7b102d46
--- /dev/null
+++ b/test/CodeGenCUDA/flush-denormals.cu
@@ -0,0 +1,25 @@
+// RUN: %clang_cc1 -fcuda-is-device \
+// RUN:   -triple nvptx-nvidia-cuda -emit-llvm -o - %s | \
+// RUN:   FileCheck %s -check-prefix CHECK -check-prefix NOFTZ
+// RUN: %clang_cc1 -fcuda-is-device -fcuda-flush-denormals-to-zero \
+// RUN:   -triple nvptx-nvidia-cuda -emit-llvm -o - %s | \
+// RUN:   FileCheck %s -check-prefix CHECK -check-prefix FTZ
+
+#include "Inputs/cuda.h"
+
+// Checks that device function calls get emitted with the "ntpvx-f32ftz"
+// attribute set to "true" when we compile CUDA device code with
+// -fcuda-flush-denormals-to-zero.  Further, check that we reflect the presence
+// or absence of -fcuda-flush-denormals-to-zero in a module flag.
+
+// CHECK-LABEL: define void @foo() #0
+extern "C" __device__ void foo() {}
+
+// FTZ: attributes #0 = {{.*}} "nvptx-f32ftz"="true"
+// NOFTZ-NOT: attributes #0 = {{.*}} "nvptx-f32ftz"
+
+// FTZ:!llvm.module.flags = !{[[MODFLAG:![0-9]+]]}
+// FTZ:[[MODFLAG]] = !{i32 4, !"nvvm-reflect-ftz", i32 1}
+
+// NOFTZ:!llvm.module.flags = !{[[MODFLAG:![0-9]+]]}
+// NOFTZ:[[MODFLAG]] = !{i32 4, !"nvvm-reflect-ftz", i32 0}
diff --git a/test/CodeGenCUDA/fp-contract.cu b/test/CodeGenCUDA/fp-contract.cu
new file mode 100644
index 0000000000000..070ebaea44eed
--- /dev/null
+++ b/test/CodeGenCUDA/fp-contract.cu
@@ -0,0 +1,32 @@
+// REQUIRES: x86-registered-target
+// REQUIRES: nvptx-registered-target
+
+// By default we should fuse multiply/add into fma instruction.
+// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
+// RUN:   -disable-llvm-passes -o - %s | FileCheck -check-prefix ENABLED %s
+
+// Explicit -ffp-contract=fast
+// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
+// RUN:   -ffp-contract=fast -disable-llvm-passes -o - %s \
+// RUN:   | FileCheck -check-prefix ENABLED %s
+
+// Explicit -ffp-contract=on -- fusing by front-end (disabled).
+// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
+// RUN:   -ffp-contract=on -disable-llvm-passes -o - %s \
+// RUN:   | FileCheck -check-prefix DISABLED %s
+
+// Explicit -ffp-contract=off should disable instruction fusing.
+// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
+// RUN:   -ffp-contract=off -disable-llvm-passes -o - %s \
+// RUN:   | FileCheck -check-prefix DISABLED %s
+
+
+#include "Inputs/cuda.h"
+
+__host__ __device__ float func(float a, float b, float c) { return a + b * c; }
+// ENABLED:       fma.rn.f32
+// ENABLED-NEXT:  st.param.f32
+
+// DISABLED:      mul.rn.f32
+// DISABLED-NEXT: add.rn.f32
+// DISABLED-NEXT: st.param.f32
diff --git a/test/CodeGenCUDA/function-overload.cu b/test/CodeGenCUDA/function-overload.cu
index a12ef82773a24..380304af82221 100644
--- a/test/CodeGenCUDA/function-overload.cu
+++ b/test/CodeGenCUDA/function-overload.cu
@@ -1,168 +1,18 @@
 // REQUIRES: x86-registered-target
 // REQUIRES: nvptx-registered-target
 
-// Make sure we handle target overloads correctly.
-// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu \
-// RUN:     -fcuda-target-overloads -emit-llvm -o - %s \
+// Make sure we handle target overloads correctly.  Most of this is checked in
+// sema, but special functions like constructors and destructors are here.
+//
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -o - %s \
 // RUN:     | FileCheck -check-prefix=CHECK-BOTH -check-prefix=CHECK-HOST %s
-// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fcuda-is-device \
-// RUN:     -fcuda-target-overloads -emit-llvm -o - %s \
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fcuda-is-device -emit-llvm -o - %s \
 // RUN:     | FileCheck -check-prefix=CHECK-BOTH -check-prefix=CHECK-DEVICE %s
 
-// Check target overloads handling with disabled call target checks.
-// RUN: %clang_cc1 -DNOCHECKS -triple x86_64-unknown-linux-gnu -emit-llvm \
-// RUN:    -fcuda-disable-target-call-checks -fcuda-target-overloads -o - %s \
-// RUN:     | FileCheck -check-prefix=CHECK-BOTH -check-prefix=CHECK-HOST \
-// RUN:    -check-prefix=CHECK-BOTH-NC -check-prefix=CHECK-HOST-NC %s
-// RUN: %clang_cc1 -DNOCHECKS -triple nvptx64-nvidia-cuda -emit-llvm \
-// RUN:    -fcuda-disable-target-call-checks -fcuda-target-overloads \
-// RUN:    -fcuda-is-device -o - %s \
-// RUN:     | FileCheck -check-prefix=CHECK-BOTH -check-prefix=CHECK-DEVICE \
-// RUN:    -check-prefix=CHECK-BOTH-NC -check-prefix=CHECK-DEVICE-NC %s
-
 #include "Inputs/cuda.h"
 
-typedef int (*fp_t)(void);
-typedef void (*gp_t)(void);
-
-// CHECK-HOST: @hp = global i32 ()* @_Z1hv
-// CHECK-HOST: @chp = global i32 ()* @ch
-// CHECK-HOST: @dhp = global i32 ()* @_Z2dhv
-// CHECK-HOST: @cdhp = global i32 ()* @cdh
-// CHECK-HOST: @gp = global void ()* @_Z1gv
-
-// CHECK-BOTH-LABEL: define i32 @_Z2dhv()
-__device__ int dh(void) { return 1; }
-// CHECK-DEVICE: ret i32 1
-__host__ int dh(void) { return 2; }
-// CHECK-HOST:   ret i32 2
-
-// CHECK-BOTH-LABEL: define i32 @_Z2hdv()
-__host__ __device__ int hd(void) { return 3; }
-// CHECK-BOTH:   ret i32 3
-
-// CHECK-DEVICE-LABEL: define i32 @_Z1dv()
-__device__ int d(void) { return 8; }
-// CHECK-DEVICE:   ret i32 8
-
-// CHECK-HOST-LABEL: define i32 @_Z1hv()
-__host__ int h(void) { return 9; }
-// CHECK-HOST:   ret i32 9
-
-// CHECK-BOTH-LABEL: define void @_Z1gv()
-__global__ void g(void) {}
-// CHECK-BOTH:   ret void
-
-// mangled names of extern "C" __host__ __device__ functions clash
-// with those of their __host__/__device__ counterparts, so
-// overloading of extern "C" functions can only happen for __host__
-// and __device__ functions -- we never codegen them in the same
-// compilation and therefore mangled name conflict is not a problem.
-
-// CHECK-BOTH-LABEL: define i32 @cdh()
-extern "C" __device__ int cdh(void) {return 10;}
-// CHECK-DEVICE:   ret i32 10
-extern "C" __host__ int cdh(void) {return 11;}
-// CHECK-HOST:     ret i32 11
-
-// CHECK-DEVICE-LABEL: define i32 @cd()
-extern "C" __device__ int cd(void) {return 12;}
-// CHECK-DEVICE:   ret i32 12
-
-// CHECK-HOST-LABEL: define i32 @ch()
-extern "C" __host__ int ch(void) {return 13;}
-// CHECK-HOST:     ret i32 13
-
-// CHECK-BOTH-LABEL: define i32 @chd()
-extern "C" __host__ __device__ int chd(void) {return 14;}
-// CHECK-BOTH:     ret i32 14
-
-// CHECK-HOST-LABEL: define void @_Z5hostfv()
-__host__ void hostf(void) {
-#if defined (NOCHECKS)
-  fp_t dp = d;   // CHECK-HOST-NC: store {{.*}} @_Z1dv, {{.*}} %dp,
-  fp_t cdp = cd; // CHECK-HOST-NC: store {{.*}} @cd, {{.*}} %cdp,
-#endif
-  fp_t hp = h; // CHECK-HOST: store {{.*}} @_Z1hv, {{.*}} %hp,
-  fp_t chp = ch; // CHECK-HOST: store {{.*}} @ch, {{.*}} %chp,
-  fp_t dhp = dh; // CHECK-HOST: store {{.*}} @_Z2dhv, {{.*}} %dhp,
-  fp_t cdhp = cdh; // CHECK-HOST: store {{.*}} @cdh, {{.*}} %cdhp,
-  fp_t hdp = hd; // CHECK-HOST: store {{.*}} @_Z2hdv, {{.*}} %hdp,
-  fp_t chdp = chd; // CHECK-HOST: store {{.*}} @chd, {{.*}} %chdp,
-  gp_t gp = g; // CHECK-HOST: store {{.*}} @_Z1gv, {{.*}} %gp,
-
-#if defined (NOCHECKS)
-  d();     // CHECK-HOST-NC: call i32 @_Z1dv()
-  cd();    // CHECK-HOST-NC: call i32 @cd()
-#endif
-  h();     // CHECK-HOST: call i32 @_Z1hv()
-  ch();    // CHECK-HOST: call i32 @ch()
-  dh();    // CHECK-HOST: call i32 @_Z2dhv()
-  cdh();   // CHECK-HOST: call i32 @cdh()
-  g<<<0,0>>>();  // CHECK-HOST: call void @_Z1gv()
-}
-
-// CHECK-DEVICE-LABEL: define void @_Z7devicefv()
-__device__ void devicef(void) {
-  fp_t dp = d;   // CHECK-DEVICE: store {{.*}} @_Z1dv, {{.*}} %dp,
-  fp_t cdp = cd; // CHECK-DEVICE: store {{.*}} @cd, {{.*}} %cdp,
-#if defined (NOCHECKS)
-  fp_t hp = h; // CHECK-DEVICE-NC: store {{.*}} @_Z1hv, {{.*}} %hp,
-  fp_t chp = ch; // CHECK-DEVICE-NC: store {{.*}} @ch, {{.*}} %chp,
-#endif
-  fp_t dhp = dh; // CHECK-DEVICE: store {{.*}} @_Z2dhv, {{.*}} %dhp,
-  fp_t cdhp = cdh; // CHECK-DEVICE: store {{.*}} @cdh, {{.*}} %cdhp,
-  fp_t hdp = hd; // CHECK-DEVICE: store {{.*}} @_Z2hdv, {{.*}} %hdp,
-  fp_t chdp = chd; // CHECK-DEVICE: store {{.*}} @chd, {{.*}} %chdp,
-
-  d();     // CHECK-DEVICE: call i32 @_Z1dv()
-  cd();    // CHECK-DEVICE: call i32 @cd()
-#if defined (NOCHECKS)
-  h();     // CHECK-DEVICE-NC: call i32 @_Z1hv()
-  ch();    // CHECK-DEVICE-NC: call i32 @ch()
-#endif
-  dh();    // CHECK-DEVICE: call i32 @_Z2dhv()
-  cdh();   // CHECK-DEVICE: call i32 @cdh()
-}
-
-// CHECK-BOTH-LABEL: define void @_Z11hostdevicefv()
-__host__ __device__ void hostdevicef(void) {
-#if defined (NOCHECKS)
-  fp_t dp = d;   // CHECK-BOTH-NC: store {{.*}} @_Z1dv, {{.*}} %dp,
-  fp_t cdp = cd; // CHECK-BOTH-NC: store {{.*}} @cd, {{.*}} %cdp,
-  fp_t hp = h; // CHECK-BOTH-NC: store {{.*}} @_Z1hv, {{.*}} %hp,
-  fp_t chp = ch; // CHECK-BOTH-NC: store {{.*}} @ch, {{.*}} %chp,
-#endif
-  fp_t dhp = dh; // CHECK-BOTH: store {{.*}} @_Z2dhv, {{.*}} %dhp,
-  fp_t cdhp = cdh; // CHECK-BOTH: store {{.*}} @cdh, {{.*}} %cdhp,
-  fp_t hdp = hd; // CHECK-BOTH: store {{.*}} @_Z2hdv, {{.*}} %hdp,
-  fp_t chdp = chd; // CHECK-BOTH: store {{.*}} @chd, {{.*}} %chdp,
-#if defined (NOCHECKS) && !defined(__CUDA_ARCH__)
-  gp_t gp = g; // CHECK-HOST-NC: store {{.*}} @_Z1gv, {{.*}} %gp,
-#endif
-
-#if defined (NOCHECKS)
-  d();     // CHECK-BOTH-NC: call i32 @_Z1dv()
-  cd();    // CHECK-BOTH-NC: call i32 @cd()
-  h();     // CHECK-BOTH-NC: call i32 @_Z1hv()
-  ch();    // CHECK-BOTH-NC: call i32 @ch()
-#endif
-  dh();    // CHECK-BOTH: call i32 @_Z2dhv()
-  cdh();   // CHECK-BOTH: call i32 @cdh()
-#if defined (NOCHECKS) && !defined(__CUDA_ARCH__)
-  g<<<0,0>>>();  // CHECK-HOST-NC: call void @_Z1gv()
-#endif
-}
-
-// Test for address of overloaded function resolution in the global context.
-fp_t hp = h;
-fp_t chp = ch;
-fp_t dhp = dh;
-fp_t cdhp = cdh;
-gp_t gp = g;
-
-int x;
 // Check constructors/destructors for D/H functions
+int x;
 struct s_cd_dh {
   __host__ s_cd_dh() { x = 11; }
   __device__ s_cd_dh() { x = 12; }
@@ -211,4 +61,3 @@ void wrapper() {
 // CHECK-HOST:   store i32 21,
 // CHECK-DEVICE: store i32 22,
 // CHECK-BOTH: ret void
-
diff --git a/test/CodeGenCUDA/host-device-calls-host.cu b/test/CodeGenCUDA/host-device-calls-host.cu
index 8140f619361bc..94796a3c233cc 100644
--- a/test/CodeGenCUDA/host-device-calls-host.cu
+++ b/test/CodeGenCUDA/host-device-calls-host.cu
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 %s -triple nvptx-unknown-unknown -fcuda-allow-host-calls-from-host-device -fcuda-is-device -Wno-cuda-compat -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 %s -triple nvptx-unknown-unknown -fcuda-is-device -Wno-cuda-compat -emit-llvm -o - | FileCheck %s
 
 #include "Inputs/cuda.h"
 
diff --git a/test/CodeGenCUDA/launch-bounds.cu b/test/CodeGenCUDA/launch-bounds.cu
index ecbd0ad70580c..6c369c6f3f0da 100644
--- a/test/CodeGenCUDA/launch-bounds.cu
+++ b/test/CodeGenCUDA/launch-bounds.cu
@@ -79,3 +79,8 @@ Kernel7()
 }
 // CHECK:     !{{[0-9]+}} = !{void ()* @{{.*}}Kernel7{{.*}}, !"maxntidx",
 // CHECK-NOT: !{{[0-9]+}} = !{void ()* @{{.*}}Kernel7{{.*}}, !"minctasm",
+
+const char constchar = 12;
+__global__ void __launch_bounds__(constint, constchar) Kernel8() {}
+// CHECK:     !{{[0-9]+}} = !{void ()* @{{.*}}Kernel8{{.*}}, !"maxntidx", i32 100
+// CHECK:     !{{[0-9]+}} = !{void ()* @{{.*}}Kernel8{{.*}}, !"minctasm", i32 12
diff --git a/test/CodeGenCUDA/link-device-bitcode.cu b/test/CodeGenCUDA/link-device-bitcode.cu
index de3d39c20b499..869fcb1bc9380 100644
--- a/test/CodeGenCUDA/link-device-bitcode.cu
+++ b/test/CodeGenCUDA/link-device-bitcode.cu
@@ -4,10 +4,10 @@
 // REQUIRES: nvptx-registered-target
 //
 // Prepare bitcode file to link with
-// RUN: %clang_cc1 -triple nvptx-unknown-cuda -emit-llvm-bc -o %t.bc \
-// RUN:    %S/Inputs/device-code.ll
-// RUN: %clang_cc1 -triple nvptx-unknown-cuda -emit-llvm-bc -o %t-2.bc \
-// RUN:    %S/Inputs/device-code-2.ll
+// RUN: %clang_cc1 -triple nvptx-unknown-cuda -emit-llvm-bc \
+// RUN:    -disable-llvm-passes -o %t.bc %S/Inputs/device-code.ll
+// RUN: %clang_cc1 -triple nvptx-unknown-cuda -emit-llvm-bc \
+// RUN:    -disable-llvm-passes -o %t-2.bc %S/Inputs/device-code-2.ll
 //
 // Make sure function in device-code gets linked in and internalized.
 // RUN: %clang_cc1 -triple nvptx-unknown-cuda -fcuda-is-device \
diff --git a/test/CodeGenCUDA/printf-aggregate.cu b/test/CodeGenCUDA/printf-aggregate.cu
new file mode 100644
index 0000000000000..2e703b81d09b7
--- /dev/null
+++ b/test/CodeGenCUDA/printf-aggregate.cu
@@ -0,0 +1,17 @@
+// REQUIRES: x86-registered-target
+// REQUIRES: nvptx-registered-target
+
+// RUN: not %clang_cc1 -triple nvptx64-nvidia-cuda -fcuda-is-device -emit-llvm \
+// RUN:   -o - %s 2>&1 | FileCheck %s
+
+#include "Inputs/cuda.h"
+
+// Check that we don't crash when asked to printf a non-scalar arg.
+struct Struct {
+  int x;
+  int y;
+};
+__device__ void PrintfNonScalar() {
+  // CHECK: cannot compile this non-scalar arg to printf
+  printf("%d", Struct());
+}
diff --git a/test/CodeGenCUDA/printf.cu b/test/CodeGenCUDA/printf.cu
new file mode 100644
index 0000000000000..dc3f4ea788f28
--- /dev/null
+++ b/test/CodeGenCUDA/printf.cu
@@ -0,0 +1,43 @@
+// REQUIRES: x86-registered-target
+// REQUIRES: nvptx-registered-target
+
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fcuda-is-device -emit-llvm \
+// RUN:   -o - %s | FileCheck %s
+
+#include "Inputs/cuda.h"
+
+extern "C" __device__ int vprintf(const char*, const char*);
+
+// Check a simple call to printf end-to-end.
+// CHECK: [[SIMPLE_PRINTF_TY:%[a-zA-Z0-9_]+]] = type { i32, i64, double }
+__device__ int CheckSimple() {
+  // CHECK: [[BUF:%[a-zA-Z0-9_]+]] = alloca [[SIMPLE_PRINTF_TY]]
+  // CHECK: [[FMT:%[0-9]+]] = load{{.*}}%fmt
+  const char* fmt = "%d %lld %f";
+  // CHECK: [[PTR0:%[0-9]+]] = getelementptr inbounds [[SIMPLE_PRINTF_TY]], [[SIMPLE_PRINTF_TY]]* [[BUF]], i32 0, i32 0
+  // CHECK: store i32 1, i32* [[PTR0]], align 4
+  // CHECK: [[PTR1:%[0-9]+]] = getelementptr inbounds [[SIMPLE_PRINTF_TY]], [[SIMPLE_PRINTF_TY]]* [[BUF]], i32 0, i32 1
+  // CHECK: store i64 2, i64* [[PTR1]], align 8
+  // CHECK: [[PTR2:%[0-9]+]] = getelementptr inbounds [[SIMPLE_PRINTF_TY]], [[SIMPLE_PRINTF_TY]]* [[BUF]], i32 0, i32 2
+  // CHECK: store double 3.0{{[^,]*}}, double* [[PTR2]], align 8
+  // CHECK: [[BUF_CAST:%[0-9]+]] = bitcast [[SIMPLE_PRINTF_TY]]* [[BUF]] to i8*
+  // CHECK: [[RET:%[0-9]+]] = call i32 @vprintf(i8* [[FMT]], i8* [[BUF_CAST]])
+  // CHECK: ret i32 [[RET]]
+  return printf(fmt, 1, 2ll, 3.0);
+}
+
+__device__ void CheckNoArgs() {
+  // CHECK: call i32 @vprintf({{.*}}, i8* null){{$}}
+  printf("hello, world!");
+}
+
+// Check that printf's alloca happens in the entry block, not inside the if
+// statement.
+__device__ bool foo();
+__device__ void CheckAllocaIsInEntryBlock() {
+  // CHECK: alloca %printf_args
+  // CHECK: call {{.*}} @_Z3foov()
+  if (foo()) {
+    printf("%d", 42);
+  }
+}
diff --git a/test/CodeGenCUDA/ptx-kernels.cu b/test/CodeGenCUDA/ptx-kernels.cu
index 6280e604f2edb..1d330bdf6a49d 100644
--- a/test/CodeGenCUDA/ptx-kernels.cu
+++ b/test/CodeGenCUDA/ptx-kernels.cu
@@ -19,8 +19,17 @@ __global__ void global_function() {
 
 // Make sure host-instantiated kernels are preserved on device side.
 template <typename T> __global__ void templated_kernel(T param) {}
-// CHECK-LABEL: define weak_odr void @_Z16templated_kernelIiEvT_
-void host_function() { templated_kernel<<<0,0>>>(0); }
+// CHECK-DAG: define void @_Z16templated_kernelIiEvT_(
+
+namespace {
+__global__ void anonymous_ns_kernel() {}
+// CHECK-DAG: define void @_ZN12_GLOBAL__N_119anonymous_ns_kernelEv(
+}
+
+void host_function() {
+  templated_kernel<<<0, 0>>>(0);
+  anonymous_ns_kernel<<<0,0>>>();
+}
 
 // CHECK: !{{[0-9]+}} = !{void ()* @global_function, !"kernel", i32 1}
 // CHECK: !{{[0-9]+}} = !{void (i32)* @_Z16templated_kernelIiEvT_, !"kernel", i32 1}
diff --git a/test/CodeGenCXX/2009-05-04-PureConstNounwind.cpp b/test/CodeGenCXX/2009-05-04-PureConstNounwind.cpp
index 3828388d48eaa..dd5fa3e08fc2a 100644
--- a/test/CodeGenCXX/2009-05-04-PureConstNounwind.cpp
+++ b/test/CodeGenCXX/2009-05-04-PureConstNounwind.cpp
@@ -12,10 +12,11 @@ int f(void) {
 
 // CHECK: declare i32 @_Z1cv() [[NUW_RN:#[0-9]+]]
 // CHECK: declare i32 @_Z1pv() [[NUW_RO:#[0-9]+]]
-// CHECK: declare i32 @_Z1tv() [[TF]]
+// CHECK: declare i32 @_Z1tv() [[TF2:#[0-9]+]]
 
 // CHECK: attributes [[TF]] = { {{.*}} }
 // CHECK: attributes [[NUW_RN]] = { nounwind readnone{{.*}} }
 // CHECK: attributes [[NUW_RO]] = { nounwind readonly{{.*}} }
+// CHECK: attributes [[TF2]] = { {{.*}} }
 // CHECK: attributes [[NUW_RN_CALL]] = { nounwind readnone }
 // CHECK: attributes [[NUW_RO_CALL]] = { nounwind readonly }
diff --git a/test/CodeGenCXX/Inputs/debug-info-class-limited.cpp b/test/CodeGenCXX/Inputs/debug-info-class-limited.cpp
index 17fa456a45d91..34a1cfa00e2ce 100644
--- a/test/CodeGenCXX/Inputs/debug-info-class-limited.cpp
+++ b/test/CodeGenCXX/Inputs/debug-info-class-limited.cpp
@@ -1,5 +1,6 @@
 
-// CHECK-DAG: !DICompositeType(tag: DW_TAG_structure_type, name: "PR16214",{{.*}} line: [[@LINE+1]],{{.*}} isDefinition: true
+// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "PR16214",{{.*}} line: [[@LINE+2]],{{.*}}
+// CHECK-NOT: DIFlagFwdDecl
 struct PR16214 {
   int i;
 };
@@ -10,7 +11,8 @@ bar *a;
 bar b;
 
 namespace PR14467 {
-// CHECK-DAG: !DICompositeType(tag: DW_TAG_structure_type, name: "foo",{{.*}} line: [[@LINE+1]],{{.*}} isDefinition: true
+// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "foo",{{.*}} line: [[@LINE+2]],{{.*}}
+// CHECK-NOT: DIFlagFwdDecl
 struct foo {
 };
 
@@ -21,7 +23,7 @@ foo *bar(foo *a) {
 }
 
 namespace test1 {
-// CHECK-DAG: !DICompositeType(tag: DW_TAG_structure_type, name: "foo",{{.*}} line: [[@LINE+1]],{{.*}} isDefinition: true
+// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "foo",{{.*}} line: [[@LINE+1]],{{.*}} flags: DIFlagFwdDecl
 struct foo {
 };
 
@@ -35,7 +37,8 @@ namespace test2 {
 // FIXME: if we were a bit fancier, we could realize that the 'foo' type is only
 // required because of the 'bar' type which is not required at all (or might
 // only be required to be declared)
-// CHECK-DAG: !DICompositeType(tag: DW_TAG_structure_type, name: "foo",{{.*}} line: [[@LINE+1]],{{.*}} isDefinition: true
+// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "foo",{{.*}} line: [[@LINE+2]],{{.*}}
+// CHECK-NOT: DIFlagFwdDecl
 struct foo {
 };
 
diff --git a/test/CodeGenCXX/PR20038.cpp b/test/CodeGenCXX/PR20038.cpp
index 2d7043dcda331..095705f389bf6 100644
--- a/test/CodeGenCXX/PR20038.cpp
+++ b/test/CodeGenCXX/PR20038.cpp
@@ -7,8 +7,8 @@ extern bool b;
 // CHECK: call {{.*}}, !dbg [[DTOR_CALL1_LOC:![0-9]*]]
 // CHECK: call {{.*}}, !dbg [[DTOR_CALL2_LOC:![0-9]*]]
 // CHECK: [[FUN1:.*]] = distinct !DISubprogram(name: "fun1",{{.*}} isDefinition: true
-// CHECK: [[FUN2:.*]] = distinct !DISubprogram(name: "fun2",{{.*}} isDefinition: true
 // CHECK: [[DTOR_CALL1_LOC]] = !DILocation(line: [[@LINE+1]], scope: [[FUN1]])
 void fun1() { b && (C(), 1); }
+// CHECK: [[FUN2:.*]] = distinct !DISubprogram(name: "fun2",{{.*}} isDefinition: true
 // CHECK: [[DTOR_CALL2_LOC]] = !DILocation(line: [[@LINE+1]], scope: [[FUN2]])
 bool fun2() { return (C(), b) && 0; }
diff --git a/test/CodeGenCXX/PR26569.cpp b/test/CodeGenCXX/PR26569.cpp
new file mode 100644
index 0000000000000..3e2d2ffeba149
--- /dev/null
+++ b/test/CodeGenCXX/PR26569.cpp
@@ -0,0 +1,20 @@
+// RUN: %clang_cc1 -triple i686-pc-win32 -fms-extensions -emit-llvm -O1 -disable-llvm-optzns %s -o - | FileCheck %s
+
+class A {
+  virtual void m_fn1();
+};
+template <typename>
+class B : virtual A {};
+
+extern template class __declspec(dllimport) B<int>;
+class __declspec(dllexport) C : B<int> {};
+
+// CHECK-DAG: @[[VTABLE_C:.*]] = private unnamed_addr constant [2 x i8*] [i8* bitcast (%rtti.CompleteObjectLocator* @"\01??_R4C@@6B@" to i8*), i8* bitcast (void (%class.A*)* @"\01?m_fn1@A@@EAEXXZ" to i8*)]
+// CHECK-DAG: @[[VTABLE_B:.*]] = private unnamed_addr constant [2 x i8*] [i8* bitcast (%rtti.CompleteObjectLocator* @"\01??_R4?$B@H@@6B@" to i8*), i8* bitcast (void (%class.A*)* @"\01?m_fn1@A@@EAEXXZ" to i8*)], comdat($"\01??_S?$B@H@@6B@")
+// CHECK-DAG: @[[VTABLE_A:.*]] = private unnamed_addr constant [2 x i8*] [i8* bitcast (%rtti.CompleteObjectLocator* @"\01??_R4A@@6B@" to i8*), i8* bitcast (void (%class.A*)* @"\01?m_fn1@A@@EAEXXZ" to i8*)], comdat($"\01??_7A@@6B@")
+
+// CHECK-DAG: @"\01??_7C@@6B@" = dllexport unnamed_addr alias i8*, getelementptr inbounds ([2 x i8*], [2 x i8*]* @[[VTABLE_C]], i32 0, i32 1)
+// CHECK-DAG: @"\01??_S?$B@H@@6B@" = unnamed_addr alias i8*, getelementptr inbounds ([2 x i8*], [2 x i8*]* @[[VTABLE_B]], i32 0, i32 1)
+// CHECK-DAG: @"\01??_7A@@6B@" = unnamed_addr alias i8*, getelementptr inbounds ([2 x i8*], [2 x i8*]* @[[VTABLE_A]], i32 0, i32 1)
+
+// CHECK-DAG: @"\01??_8?$B@H@@7B@" = available_externally dllimport unnamed_addr constant [2 x i32] [i32 0, i32 4]
diff --git a/test/CodeGenCXX/PR28220.cpp b/test/CodeGenCXX/PR28220.cpp
new file mode 100644
index 0000000000000..6262c87de2062
--- /dev/null
+++ b/test/CodeGenCXX/PR28220.cpp
@@ -0,0 +1,19 @@
+// RUN: %clang_cc1 %s -triple i686-pc-win32 -fms-extensions -emit-llvm -o - | FileCheck %s
+
+template <typename>
+struct __declspec(dllimport) S {
+  S();
+};
+
+template <typename T>
+struct __declspec(dllimport) U {
+  static S<T> u;
+};
+
+template <typename T>
+S<T> U<T>::u;
+
+template S<int> U<int>::u;
+// CHECK-NOT: define internal void @"\01??__Eu@?$U@H@@2U?$S@H@@A@YAXXZ"(
+
+S<int> &i = U<int>::u;
diff --git a/test/CodeGenCXX/align-avx-complete-objects.cpp b/test/CodeGenCXX/align-avx-complete-objects.cpp
index 6ab17f5d656b3..ad4a91428d254 100644
--- a/test/CodeGenCXX/align-avx-complete-objects.cpp
+++ b/test/CodeGenCXX/align-avx-complete-objects.cpp
@@ -13,7 +13,7 @@ volatile float TestAlign(void)
 }
 
 // CHECK: [[R:%.*]] = alloca <8 x float>, align 32
-// CHECK-NEXT:  [[CALL:%.*]] = call noalias i8* @_Znwm(i64 32)
+// CHECK-NEXT:  [[CALL:%.*]] = call i8* @_Znwm(i64 32)
 // CHECK-NEXT:  [[ZERO:%.*]] = bitcast i8* [[CALL]] to <8 x float>*
 // CHECK-NEXT:  store <8 x float>* [[ZERO]], <8 x float>** [[P:%.*]], align 8
 // CHECK-NEXT:  [[ONE:%.*]] = load <8 x float>*, <8 x float>** [[P]], align 8
@@ -42,7 +42,7 @@ volatile float TestAlign2(void)
 }
 
 // CHECK: [[R:%.*]] = alloca <8 x float>, align 32
-// CHECK-NEXT:  [[CALL:%.*]] = call noalias i8* @_Znwm(i64 32)
+// CHECK-NEXT:  [[CALL:%.*]] = call i8* @_Znwm(i64 32)
 // CHECK-NEXT:  [[ZERO:%.*]] = bitcast i8* [[CALL]] to <8 x float>*
 // CHECK-NEXT:  store <8 x float>* [[ZERO]], <8 x float>** [[P:%.*]], align 8
 // CHECK-NEXT:  [[ONE:%.*]] = load <8 x float>*, <8 x float>** [[P]], align 8
diff --git a/test/CodeGenCXX/alignment.cpp b/test/CodeGenCXX/alignment.cpp
index 2a1fe71f961b7..4c44badd21cda 100644
--- a/test/CodeGenCXX/alignment.cpp
+++ b/test/CodeGenCXX/alignment.cpp
@@ -32,7 +32,7 @@ namespace test0 {
     // CHECK: [[T2:%.*]] = or i8 [[T1]], [[T0]]
     // CHECK: store i8 [[T2]], i8* [[FIELD_P]], align 4
     b.onebit = int_source();
-    
+
     // CHECK: [[B_P:%.*]] = load [[B]]*, [[B]]**
     // CHECK: [[FIELD_P:%.*]] = bitcast [[B]]* [[B_P]] to i8*
     // CHECK: [[VALUE:%.*]] = load i8, i8* [[FIELD_P]], align 4
@@ -60,7 +60,7 @@ namespace test0 {
     // CHECK: [[T2:%.*]] = or i8 [[T1]], [[T0]]
     // CHECK: store i8 [[T2]], i8* [[FIELD_P]], align 2
     c.onebit = int_source();
-    
+
     // CHECK: [[C_P:%.*]] = load [[C]]*, [[C]]**
     // CHECK: [[T0:%.*]] = bitcast [[C]]* [[C_P]] to i8*
     // CHECK: [[T1:%.*]] = getelementptr inbounds i8, i8* [[T0]], i64 8
diff --git a/test/CodeGenCXX/arm-swiftcall.cpp b/test/CodeGenCXX/arm-swiftcall.cpp
new file mode 100644
index 0000000000000..535350c808d3a
--- /dev/null
+++ b/test/CodeGenCXX/arm-swiftcall.cpp
@@ -0,0 +1,115 @@
+// RUN: %clang_cc1 -triple armv7-apple-darwin9 -emit-llvm -o - %s -Wno-return-type-c-linkage | FileCheck %s
+
+// This isn't really testing anything ARM-specific; it's just a convenient
+// 32-bit platform.
+
+#define SWIFTCALL __attribute__((swiftcall))
+#define OUT __attribute__((swift_indirect_result))
+#define ERROR __attribute__((swift_error_result))
+#define CONTEXT __attribute__((swift_context))
+
+/*****************************************************************************/
+/********************************** LOWERING *********************************/
+/*****************************************************************************/
+
+#define TEST(TYPE)                                  \
+  extern "C" SWIFTCALL TYPE return_##TYPE(void) {   \
+    TYPE result = {};                               \
+    return result;                                  \
+  }                                                 \
+  extern "C" SWIFTCALL void take_##TYPE(TYPE v) {   \
+  }                                                 \
+  extern "C" void test_##TYPE() {                   \
+    take_##TYPE(return_##TYPE());                   \
+  }
+
+/*****************************************************************************/
+/*********************************** STRUCTS *********************************/
+/*****************************************************************************/
+
+typedef struct {
+} struct_empty;
+TEST(struct_empty);
+// CHECK-LABEL: define {{.*}} @return_struct_empty()
+// CHECK:   ret void
+// CHECK-LABEL: define {{.*}} @take_struct_empty()
+// CHECK:   ret void
+
+// This is only properly testable in C++ because it relies on empty structs
+// actually taking up space in a structure without requiring any extra data
+// to be passed.
+typedef struct {
+  int x;
+  struct_empty padding[2];
+  char c1;
+  float f0;
+  float f1;
+} struct_1;
+TEST(struct_1);
+// CHECK-LABEL: define {{.*}} @return_struct_1()
+// CHECK:   [[RET:%.*]] = alloca [[REC:%.*]], align 4
+// CHECK:   @llvm.memset
+// CHECK:   [[CAST_TMP:%.*]] = bitcast [[REC]]* [[RET]] to [[AGG:{ i32, \[2 x i8\], i8, \[1 x i8\], float, float }]]*
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0
+// CHECK:   [[FIRST:%.*]] = load i32, i32* [[T0]], align 4
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 2
+// CHECK:   [[SECOND:%.*]] = load i8, i8* [[T0]], align 2
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 4
+// CHECK:   [[THIRD:%.*]] = load float, float* [[T0]], align 4
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 5
+// CHECK:   [[FOURTH:%.*]] = load float, float* [[T0]], align 4
+// CHECK:   [[T0:%.*]] = insertvalue [[UAGG:{ i32, i8, float, float }]] undef, i32 [[FIRST]], 0
+// CHECK:   [[T1:%.*]] = insertvalue [[UAGG]] [[T0]], i8 [[SECOND]], 1
+// CHECK:   [[T2:%.*]] = insertvalue [[UAGG]] [[T1]], float [[THIRD]], 2
+// CHECK:   [[T3:%.*]] = insertvalue [[UAGG]] [[T2]], float [[FOURTH]], 3
+// CHECK:   ret [[UAGG]] [[T3]]
+// CHECK-LABEL: define {{.*}} @take_struct_1(i32, i8, float, float)
+// CHECK:   [[V:%.*]] = alloca [[REC]], align 4
+// CHECK:   [[CAST_TMP:%.*]] = bitcast [[REC]]* [[V]] to [[AGG]]*
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0
+// CHECK:   store i32 %0, i32* [[T0]], align 4
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 2
+// CHECK:   store i8 %1, i8* [[T0]], align 2
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 4
+// CHECK:   store float %2, float* [[T0]], align 4
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 5
+// CHECK:   store float %3, float* [[T0]], align 4
+// CHECK:   ret void
+// CHECK-LABEL: define void @test_struct_1()
+// CHECK:   [[TMP:%.*]] = alloca [[REC]], align 4
+// CHECK:   [[CALL:%.*]] = call [[SWIFTCC:swiftcc]] [[UAGG]] @return_struct_1()
+// CHECK:   [[CAST_TMP:%.*]] = bitcast [[REC]]* [[TMP]] to [[AGG]]*
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0
+// CHECK:   [[T1:%.*]] = extractvalue [[UAGG]] [[CALL]], 0
+// CHECK:   store i32 [[T1]], i32* [[T0]], align 4
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 2
+// CHECK:   [[T1:%.*]] = extractvalue [[UAGG]] [[CALL]], 1
+// CHECK:   store i8 [[T1]], i8* [[T0]], align 2
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 4
+// CHECK:   [[T1:%.*]] = extractvalue [[UAGG]] [[CALL]], 2
+// CHECK:   store float [[T1]], float* [[T0]], align 4
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 5
+// CHECK:   [[T1:%.*]] = extractvalue [[UAGG]] [[CALL]], 3
+// CHECK:   store float [[T1]], float* [[T0]], align 4
+// CHECK:   [[CAST_TMP:%.*]] = bitcast [[REC]]* [[TMP]] to [[AGG]]*
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 0
+// CHECK:   [[FIRST:%.*]] = load i32, i32* [[T0]], align 4
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 2
+// CHECK:   [[SECOND:%.*]] = load i8, i8* [[T0]], align 2
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 4
+// CHECK:   [[THIRD:%.*]] = load float, float* [[T0]], align 4
+// CHECK:   [[T0:%.*]] = getelementptr inbounds [[AGG]], [[AGG]]* [[CAST_TMP]], i32 0, i32 5
+// CHECK:   [[FOURTH:%.*]] = load float, float* [[T0]], align 4
+// CHECK:   call [[SWIFTCC]] void @take_struct_1(i32 [[FIRST]], i8 [[SECOND]], float [[THIRD]], float [[FOURTH]])
+// CHECK:   ret void
+
+struct struct_indirect_1 {
+  int x;
+  ~struct_indirect_1();
+};
+TEST(struct_indirect_1)
+
+// CHECK-LABEL: define {{.*}} void @return_struct_indirect_1({{.*}} noalias sret
+
+// Should not be byval.
+// CHECK-LABEL: define {{.*}} void @take_struct_indirect_1({{.*}}*{{( %.*)?}})
diff --git a/test/CodeGenCXX/arm.cpp b/test/CodeGenCXX/arm.cpp
index 11ae6b24c26e2..d0b896d182da9 100644
--- a/test/CodeGenCXX/arm.cpp
+++ b/test/CodeGenCXX/arm.cpp
@@ -109,7 +109,7 @@ namespace test3 {
 
   void a() {
     // CHECK-LABEL: define void @_ZN5test31aEv()
-    // CHECK: call noalias i8* @_Znam(i32 48)
+    // CHECK: call i8* @_Znam(i32 48)
     // CHECK: store i32 4
     // CHECK: store i32 10
     A *x = new A[10];
@@ -122,7 +122,7 @@ namespace test3 {
     // CHECK: @llvm.uadd.with.overflow.i32(i32 {{.*}}, i32 8)
     // CHECK: [[OR:%.*]] = or i1
     // CHECK: [[SZ:%.*]] = select i1 [[OR]]
-    // CHECK: call noalias i8* @_Znam(i32 [[SZ]])
+    // CHECK: call i8* @_Znam(i32 [[SZ]])
     // CHECK: store i32 4
     // CHECK: store i32 [[N]]
     A *x = new A[n];
@@ -130,7 +130,7 @@ namespace test3 {
 
   void c() {
     // CHECK-LABEL: define void @_ZN5test31cEv()
-    // CHECK: call  noalias i8* @_Znam(i32 808)
+    // CHECK: call  i8* @_Znam(i32 808)
     // CHECK: store i32 4
     // CHECK: store i32 200
     A (*x)[20] = new A[10][20];
@@ -143,7 +143,7 @@ namespace test3 {
     // CHECK: [[NE:%.*]] = mul i32 [[N]], 20
     // CHECK: @llvm.uadd.with.overflow.i32(i32 {{.*}}, i32 8)
     // CHECK: [[SZ:%.*]] = select
-    // CHECK: call noalias i8* @_Znam(i32 [[SZ]])
+    // CHECK: call i8* @_Znam(i32 [[SZ]])
     // CHECK: store i32 4
     // CHECK: store i32 [[NE]]
     A (*x)[20] = new A[n][20];
@@ -182,7 +182,7 @@ namespace test4 {
 
   void a() {
     // CHECK-LABEL: define void @_ZN5test41aEv()
-    // CHECK: call noalias i8* @_Znam(i32 48)
+    // CHECK: call i8* @_Znam(i32 48)
     // CHECK: store i32 4
     // CHECK: store i32 10
     A *x = new A[10];
@@ -194,7 +194,7 @@ namespace test4 {
     // CHECK: @llvm.umul.with.overflow.i32(i32 [[N]], i32 4)
     // CHECK: @llvm.uadd.with.overflow.i32(i32 {{.*}}, i32 8)
     // CHECK: [[SZ:%.*]] = select
-    // CHECK: call noalias i8* @_Znam(i32 [[SZ]])
+    // CHECK: call i8* @_Znam(i32 [[SZ]])
     // CHECK: store i32 4
     // CHECK: store i32 [[N]]
     A *x = new A[n];
@@ -202,7 +202,7 @@ namespace test4 {
 
   void c() {
     // CHECK-LABEL: define void @_ZN5test41cEv()
-    // CHECK: call  noalias i8* @_Znam(i32 808)
+    // CHECK: call  i8* @_Znam(i32 808)
     // CHECK: store i32 4
     // CHECK: store i32 200
     A (*x)[20] = new A[10][20];
@@ -215,7 +215,7 @@ namespace test4 {
     // CHECK: [[NE:%.*]] = mul i32 [[N]], 20
     // CHECK: @llvm.uadd.with.overflow.i32(i32 {{.*}}, i32 8)
     // CHECK: [[SZ:%.*]] = select
-    // CHECK: call noalias i8* @_Znam(i32 [[SZ]])
+    // CHECK: call i8* @_Znam(i32 [[SZ]])
     // CHECK: store i32 4
     // CHECK: store i32 [[NE]]
     A (*x)[20] = new A[n][20];
@@ -383,7 +383,7 @@ namespace test9 {
 // CHECK-NEXT: [[OVERFLOW:%.*]] = or i1 [[O0]], [[O1]]
 // CHECK-NEXT: [[T3:%.*]] = extractvalue { i32, i1 } [[T2]], 0
 // CHECK-NEXT: [[T4:%.*]] = select i1 [[OVERFLOW]], i32 -1, i32 [[T3]]
-// CHECK-NEXT: [[ALLOC:%.*]] = call noalias i8* @_Znam(i32 [[T4]])
+// CHECK-NEXT: [[ALLOC:%.*]] = call i8* @_Znam(i32 [[T4]])
 // CHECK-NEXT: [[T0:%.*]] = bitcast i8* [[ALLOC]] to i32*
 // CHECK-NEXT: store i32 16, i32* [[T0]]
 // CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds i32, i32* [[T0]], i32 1
diff --git a/test/CodeGenCXX/atomicinit.cpp b/test/CodeGenCXX/atomicinit.cpp
index 5e5174bd06e4b..96f44d52f2861 100644
--- a/test/CodeGenCXX/atomicinit.cpp
+++ b/test/CodeGenCXX/atomicinit.cpp
@@ -1,13 +1,13 @@
 // RUN: %clang_cc1 %s -emit-llvm -O1 -o - -triple=i686-apple-darwin9 -std=c++11 | FileCheck %s
 
-// CHECK-DAG: @PR22043 = global i32 0, align 4
+// CHECK-DAG: @PR22043 = local_unnamed_addr global i32 0, align 4
 typedef _Atomic(int) AtomicInt;
 AtomicInt PR22043 = AtomicInt();
 
-// CHECK-DAG: @_ZN7PR180978constant1aE = global { i16, i8 } { i16 1, i8 6 }, align 4
-// CHECK-DAG: @_ZN7PR180978constant1bE = global { i16, i8 } { i16 2, i8 6 }, align 4
-// CHECK-DAG: @_ZN7PR180978constant1cE = global { i16, i8 } { i16 3, i8 6 }, align 4
-// CHECK-DAG: @_ZN7PR180978constant1yE = global { { i16, i8 }, i32 } { { i16, i8 } { i16 4, i8 6 }, i32 5 }, align 4
+// CHECK-DAG: @_ZN7PR180978constant1aE = local_unnamed_addr global { i16, i8 } { i16 1, i8 6 }, align 4
+// CHECK-DAG: @_ZN7PR180978constant1bE = local_unnamed_addr global { i16, i8 } { i16 2, i8 6 }, align 4
+// CHECK-DAG: @_ZN7PR180978constant1cE = local_unnamed_addr global { i16, i8 } { i16 3, i8 6 }, align 4
+// CHECK-DAG: @_ZN7PR180978constant1yE = local_unnamed_addr global { { i16, i8 }, i32 } { { i16, i8 } { i16 4, i8 6 }, i32 5 }, align 4
 
 struct A {
   _Atomic(int) i;
diff --git a/test/CodeGenCXX/attr-mode-vector-types-tmpl.cpp b/test/CodeGenCXX/attr-mode-vector-types-tmpl.cpp
new file mode 100644
index 0000000000000..6373cf0b2e0a6
--- /dev/null
+++ b/test/CodeGenCXX/attr-mode-vector-types-tmpl.cpp
@@ -0,0 +1,108 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu %s -emit-llvm -o - | FileCheck %s
+
+template <class T>
+void CheckIntScalarTypes() {
+  // T will be substituted with 'int' and 'enum' types.
+
+  typedef T __attribute__((mode(QI))) T1;
+  typedef T __attribute__((mode(HI))) T2;
+  typedef T __attribute__((mode(SI))) T3;
+  typedef T __attribute__((mode(DI))) T4;
+
+  T1 a1;
+  T2 a2;
+  T3 a3;
+  T4 a4;
+}
+
+template <class T>
+void CheckIntVectorTypes() {
+  // T will be substituted with 'int'.
+
+  typedef int __attribute__((mode(QI))) __attribute__((vector_size(8)))  VT_11;
+  typedef T   __attribute__((mode(V8QI)))                                VT_12;
+  typedef int __attribute__((mode(SI))) __attribute__((vector_size(16))) VT_21;
+  typedef T   __attribute__((mode(V4SI)))                                VT_22;
+  typedef int __attribute__((mode(DI))) __attribute__((vector_size(64))) VT_31;
+  typedef T   __attribute__((mode(V8DI)))                                VT_32;
+
+  VT_11 v11;
+  VT_12 v12;
+
+  VT_21 v21;
+  VT_22 v22;
+
+  VT_31 v31;
+  VT_32 v32;
+}
+
+template <class T>
+void CheckFloatVectorTypes() {
+  // T will be substituted with 'float'.
+
+  typedef float __attribute__((mode(SF))) __attribute__((vector_size(128))) VT_41;
+  typedef T     __attribute__((mode(V32SF)))                                VT_42;
+  typedef float __attribute__((mode(DF))) __attribute__((vector_size(256))) VT_51;
+  typedef T     __attribute__((mode(V32DF)))                                VT_52;
+
+  VT_41 v41;
+  VT_42 v42;
+
+  VT_51 v51;
+  VT_52 v52;
+}
+
+template <class T>
+void CheckInstantiationWithModedType() {
+  T x1;
+}
+
+typedef enum { A1, B1 }                       EnumTy;
+typedef int __attribute__((mode(DI)))         Int64Ty1;
+typedef enum __attribute__((mode(DI))) { A2 } Int64Ty2;
+typedef int __attribute__((mode(V8HI)))       IntVecTy1;
+
+void test() {
+
+  // CHECK: define {{.*}} void @_Z19CheckIntScalarTypesIiEvv()
+  // CHECK: %{{.+}} = alloca i8
+  // CHECK: %{{.+}} = alloca i16
+  // CHECK: %{{.+}} = alloca i32
+  // CHECK: %{{.+}} = alloca i64
+  CheckIntScalarTypes<int>();
+
+  // CHECK: define {{.*}} void @_Z19CheckIntScalarTypesI6EnumTyEvv()
+  // CHECK: %{{.+}} = alloca i8
+  // CHECK: %{{.+}} = alloca i16
+  // CHECK: %{{.+}} = alloca i32
+  // CHECK: %{{.+}} = alloca i64
+  CheckIntScalarTypes<EnumTy>();
+
+  // CHECK: define {{.*}} void @_Z19CheckIntVectorTypesIiEvv()
+  // CHECK: %{{.+}} = alloca <8 x i8>
+  // CHECK: %{{.+}} = alloca <8 x i8>
+  // CHECK: %{{.+}} = alloca <4 x i32>
+  // CHECK: %{{.+}} = alloca <4 x i32>
+  // CHECK: %{{.+}} = alloca <8 x i64>
+  // CHECK: %{{.+}} = alloca <8 x i64>
+  CheckIntVectorTypes<int>();
+
+  // CHECK: define {{.*}} void @_Z21CheckFloatVectorTypesIfEvv()
+  // CHECK: %{{.+}} = alloca <32 x float>
+  // CHECK: %{{.+}} = alloca <32 x float>
+  // CHECK: %{{.+}} = alloca <32 x double>
+  // CHECK: %{{.+}} = alloca <32 x double>
+  CheckFloatVectorTypes<float>();
+
+  // CHECK: define {{.*}} void @_Z31CheckInstantiationWithModedTypeIlEvv()
+  // CHECK: [[X1:%.+]] = alloca i64
+  CheckInstantiationWithModedType<Int64Ty1>();
+
+  // CHECK: define {{.*}} void @_Z31CheckInstantiationWithModedTypeI8Int64Ty2Evv()
+  // CHECK: [[X1]] = alloca i64
+  CheckInstantiationWithModedType<Int64Ty2>();
+
+  // CHECK: define {{.*}} void @_Z31CheckInstantiationWithModedTypeIDv8_sEvv()
+  // CHECK: [[X1]] = alloca <8 x i16>
+  CheckInstantiationWithModedType<IntVecTy1>();
+}
diff --git a/test/CodeGenCXX/attr-x86-interrupt.cpp b/test/CodeGenCXX/attr-x86-interrupt.cpp
new file mode 100644
index 0000000000000..5000104690350
--- /dev/null
+++ b/test/CodeGenCXX/attr-x86-interrupt.cpp
@@ -0,0 +1,35 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu %s -emit-llvm -o - | FileCheck %s --check-prefix=X86_64_LINUX
+// RUN: %clang_cc1 -triple i386-unknown-linux-gnu %s -emit-llvm -o - | FileCheck %s --check-prefix=X86_LINUX
+// RUN: %clang_cc1 -triple x86_64-pc-win32 %s -emit-llvm -o - | FileCheck %s --check-prefix=X86_64_WIN
+// RUN: %clang_cc1 -triple i386-pc-win32 %s -emit-llvm -o - | FileCheck %s --check-prefix=X86_WIN
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnux32 %s -emit-llvm -o - | FileCheck %s --check-prefix=X86_64_LINUX
+
+#ifdef __x86_64__
+typedef __UINT64_TYPE__ uword;
+#else
+typedef __UINT32_TYPE__ uword;
+#endif
+
+__attribute__((interrupt)) void foo7(int *a, uword b) {}
+namespace S {
+__attribute__((interrupt)) void foo8(int *a) {}
+}
+struct St {
+static void foo9(int *a) __attribute__((interrupt)) {}
+};
+// X86_64_LINUX: @llvm.used = appending global [3 x i8*] [i8* bitcast (void (i32*, i64)* @{{.*}}foo7{{.*}} to i8*), i8* bitcast (void (i32*)* @{{.*}}foo8{{.*}} to i8*), i8* bitcast (void (i32*)* @{{.*}}foo9{{.*}} to i8*)], section "llvm.metadata"
+// X86_64_LINUX: define x86_intrcc void @{{.*}}foo7{{.*}}(i32* %{{.+}}, i64 %{{.+}})
+// X86_64_LINUX: define x86_intrcc void @{{.*}}foo8{{.*}}(i32* %{{.+}})
+// X86_64_LINUX: define linkonce_odr x86_intrcc void @{{.*}}foo9{{.*}}(i32* %{{.+}})
+// X86_LINUX: @llvm.used = appending global [3 x i8*] [i8* bitcast (void (i32*, i32)* @{{.*}}foo7{{.*}} to i8*), i8* bitcast (void (i32*)* @{{.*}}foo8{{.*}} to i8*), i8* bitcast (void (i32*)* @{{.*}}foo9{{.*}} to i8*)], section "llvm.metadata"
+// X86_LINUX: define x86_intrcc void @{{.*}}foo7{{.*}}(i32* %{{.+}}, i32 %{{.+}})
+// X86_LINUX: define x86_intrcc void @{{.*}}foo8{{.*}}(i32* %{{.+}})
+// X86_LINUX: define linkonce_odr x86_intrcc void @{{.*}}foo9{{.*}}(i32* %{{.+}})
+// X86_64_WIN: @llvm.used = appending global [3 x i8*] [i8* bitcast (void (i32*, i64)* @{{.*}}foo7{{.*}} to i8*), i8* bitcast (void (i32*)* @{{.*}}foo8{{.*}} to i8*), i8* bitcast (void (i32*)* @{{.*}}foo9{{.*}} to i8*)], section "llvm.metadata"
+// X86_64_WIN: define x86_intrcc void @{{.*}}foo7{{.*}}(i32* %{{.+}}, i64 %{{.+}})
+// X86_64_WIN: define x86_intrcc void @{{.*}}foo8{{.*}}(i32* %{{.+}})
+// X86_64_WIN: define linkonce_odr x86_intrcc void @{{.*}}foo9{{.*}}(i32* %{{.+}})
+// X86_WIN: @llvm.used = appending global [3 x i8*] [i8* bitcast (void (i32*, i32)* @{{.*}}foo7{{.*}} to i8*), i8* bitcast (void (i32*)* @{{.*}}foo8{{.*}} to i8*), i8* bitcast (void (i32*)* @{{.*}}foo9{{.*}} to i8*)], section "llvm.metadata"
+// X86_WIN: define x86_intrcc void @{{.*}}foo7{{.*}}(i32* %{{.+}}, i32 %{{.+}})
+// X86_WIN: define x86_intrcc void @{{.*}}foo8{{.*}}(i32* %{{.+}})
+// X86_WIN: define linkonce_odr x86_intrcc void @{{.*}}foo9{{.*}}(i32* %{{.+}})
diff --git a/test/CodeGenCXX/c-linkage.cpp b/test/CodeGenCXX/c-linkage.cpp
index a70a22ef08c79..0f4c3277253f7 100644
--- a/test/CodeGenCXX/c-linkage.cpp
+++ b/test/CodeGenCXX/c-linkage.cpp
@@ -15,10 +15,10 @@ extern "C" {
 extern "C" {
   static void test2_f() {
   }
-  // CHECK-LABEL: define internal {{.*}}void @_Z7test2_fv
+  // CHECK-LABEL: define internal {{.*}}void @_ZL7test2_fv
   static void test2_f(int x) {
   }
-  // CHECK-LABEL: define internal {{.*}}void @_Z7test2_fi
+  // CHECK-LABEL: define internal {{.*}}void @_ZL7test2_fi
   void test2_use() {
     test2_f();
     test2_f(42);
diff --git a/test/CodeGenCXX/cfi-blacklist.cpp b/test/CodeGenCXX/cfi-blacklist.cpp
index 32ed05bcc520b..af8a10601d292 100644
--- a/test/CodeGenCXX/cfi-blacklist.cpp
+++ b/test/CodeGenCXX/cfi-blacklist.cpp
@@ -1,9 +1,8 @@
-// RUN: echo "type:attr:uuid" > %t.txt
-// RUN: %clang_cc1 -fms-extensions -fsanitize=cfi-vcall -fsanitize-blacklist=%t.txt -emit-llvm -o - %s | FileCheck --check-prefix=CHECK --check-prefix=NOUUID %s
+// RUN: %clang_cc1 -triple %itanium_abi_triple -fvisibility hidden -fms-extensions -fsanitize=cfi-vcall -emit-llvm -o - %s | FileCheck --check-prefix=CHECK --check-prefix=NOBL %s
 // RUN: echo "type:std::*" > %t.txt
-// RUN: %clang_cc1 -fms-extensions -fsanitize=cfi-vcall -fsanitize-blacklist=%t.txt -emit-llvm -o - %s | FileCheck --check-prefix=CHECK --check-prefix=NOSTD %s
+// RUN: %clang_cc1 -triple %itanium_abi_triple -fvisibility hidden -fms-extensions -fsanitize=cfi-vcall -fsanitize-blacklist=%t.txt -emit-llvm -o - %s | FileCheck --check-prefix=CHECK --check-prefix=NOSTD %s
 
-struct __declspec(uuid("00000000-0000-0000-0000-000000000000")) S1 {
+struct S1 {
   virtual void f();
 };
 
@@ -16,15 +15,15 @@ struct S2 {
 }
 
 // CHECK: define{{.*}}s1f
-// NOSTD: llvm.bitset.test
-// NOUUID-NOT: llvm.bitset.test
+// NOBL: llvm.type.test
+// NOSTD: llvm.type.test
 void s1f(S1 *s1) {
   s1->f();
 }
 
 // CHECK: define{{.*}}s2f
-// NOSTD-NOT: llvm.bitset.test
-// NOUUID: llvm.bitset.test
+// NOBL: llvm.type.test
+// NOSTD-NOT: llvm.type.test
 void s2f(std::S2 *s2) {
   s2->f();
 }
diff --git a/test/CodeGenCXX/cfi-cast.cpp b/test/CodeGenCXX/cfi-cast.cpp
index 0b96cb6506c07..54641b52332bb 100644
--- a/test/CodeGenCXX/cfi-cast.cpp
+++ b/test/CodeGenCXX/cfi-cast.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -triple x86_64-unknown-linux -fsanitize=cfi-derived-cast -fsanitize-trap=cfi-derived-cast -emit-llvm -o - %s | FileCheck -check-prefix=CHECK-DCAST %s
-// RUN: %clang_cc1 -triple x86_64-unknown-linux -fsanitize=cfi-unrelated-cast -fsanitize-trap=cfi-unrelated-cast -emit-llvm -o - %s | FileCheck -check-prefix=CHECK-UCAST %s
-// RUN: %clang_cc1 -triple x86_64-unknown-linux -fsanitize=cfi-unrelated-cast,cfi-cast-strict -fsanitize-trap=cfi-unrelated-cast,cfi-cast-strict -emit-llvm -o - %s | FileCheck -check-prefix=CHECK-UCAST-STRICT %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux -fvisibility hidden -std=c++11 -fsanitize=cfi-derived-cast -fsanitize-trap=cfi-derived-cast -emit-llvm -o - %s | FileCheck -check-prefix=CHECK-DCAST %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux -fvisibility hidden -std=c++11 -fsanitize=cfi-unrelated-cast -fsanitize-trap=cfi-unrelated-cast -emit-llvm -o - %s | FileCheck -check-prefix=CHECK-UCAST %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux -fvisibility hidden -std=c++11 -fsanitize=cfi-unrelated-cast,cfi-cast-strict -fsanitize-trap=cfi-unrelated-cast,cfi-cast-strict -emit-llvm -o - %s | FileCheck -check-prefix=CHECK-UCAST-STRICT %s
 
 // In this test the main thing we are searching for is something like
 // 'metadata !"1B"' where "1B" is the mangled name of the class we are
@@ -8,6 +8,7 @@
 
 struct A {
   virtual void f();
+  int i() const;
 };
 
 struct B : A {
@@ -16,9 +17,9 @@ struct B : A {
 
 struct C : A {};
 
-// CHECK-DCAST-LABEL: define void @_Z3abpP1A
+// CHECK-DCAST-LABEL: define hidden void @_Z3abpP1A
 void abp(A *a) {
-  // CHECK-DCAST: [[P:%[^ ]*]] = call i1 @llvm.bitset.test(i8* {{%[^ ]*}}, metadata !"_ZTS1B")
+  // CHECK-DCAST: [[P:%[^ ]*]] = call i1 @llvm.type.test(i8* {{%[^ ]*}}, metadata !"_ZTS1B")
   // CHECK-DCAST-NEXT: br i1 [[P]], label %[[CONTBB:[^ ]*]], label %[[TRAPBB:[^ ,]*]]
 
   // CHECK-DCAST: [[TRAPBB]]
@@ -27,12 +28,12 @@ void abp(A *a) {
 
   // CHECK-DCAST: [[CONTBB]]
   // CHECK-DCAST: ret
-  static_cast<B*>(a);
+  (void)static_cast<B*>(a);
 }
 
-// CHECK-DCAST-LABEL: define void @_Z3abrR1A
+// CHECK-DCAST-LABEL: define hidden void @_Z3abrR1A
 void abr(A &a) {
-  // CHECK-DCAST: [[P:%[^ ]*]] = call i1 @llvm.bitset.test(i8* {{%[^ ]*}}, metadata !"_ZTS1B")
+  // CHECK-DCAST: [[P:%[^ ]*]] = call i1 @llvm.type.test(i8* {{%[^ ]*}}, metadata !"_ZTS1B")
   // CHECK-DCAST-NEXT: br i1 [[P]], label %[[CONTBB:[^ ]*]], label %[[TRAPBB:[^ ,]*]]
 
   // CHECK-DCAST: [[TRAPBB]]
@@ -41,12 +42,12 @@ void abr(A &a) {
 
   // CHECK-DCAST: [[CONTBB]]
   // CHECK-DCAST: ret
-  static_cast<B&>(a);
+  (void)static_cast<B&>(a);
 }
 
-// CHECK-DCAST-LABEL: define void @_Z4abrrO1A
+// CHECK-DCAST-LABEL: define hidden void @_Z4abrrO1A
 void abrr(A &&a) {
-  // CHECK-DCAST: [[P:%[^ ]*]] = call i1 @llvm.bitset.test(i8* {{%[^ ]*}}, metadata !"_ZTS1B")
+  // CHECK-DCAST: [[P:%[^ ]*]] = call i1 @llvm.type.test(i8* {{%[^ ]*}}, metadata !"_ZTS1B")
   // CHECK-DCAST-NEXT: br i1 [[P]], label %[[CONTBB:[^ ]*]], label %[[TRAPBB:[^ ,]*]]
 
   // CHECK-DCAST: [[TRAPBB]]
@@ -55,12 +56,12 @@ void abrr(A &&a) {
 
   // CHECK-DCAST: [[CONTBB]]
   // CHECK-DCAST: ret
-  static_cast<B&&>(a);
+  (void)static_cast<B&&>(a);
 }
 
-// CHECK-UCAST-LABEL: define void @_Z3vbpPv
+// CHECK-UCAST-LABEL: define hidden void @_Z3vbpPv
 void vbp(void *p) {
-  // CHECK-UCAST: [[P:%[^ ]*]] = call i1 @llvm.bitset.test(i8* {{%[^ ]*}}, metadata !"_ZTS1B")
+  // CHECK-UCAST: [[P:%[^ ]*]] = call i1 @llvm.type.test(i8* {{%[^ ]*}}, metadata !"_ZTS1B")
   // CHECK-UCAST-NEXT: br i1 [[P]], label %[[CONTBB:[^ ]*]], label %[[TRAPBB:[^ ,]*]]
 
   // CHECK-UCAST: [[TRAPBB]]
@@ -69,12 +70,12 @@ void vbp(void *p) {
 
   // CHECK-UCAST: [[CONTBB]]
   // CHECK-UCAST: ret
-  static_cast<B*>(p);
+  (void)static_cast<B*>(p);
 }
 
-// CHECK-UCAST-LABEL: define void @_Z3vbrRc
+// CHECK-UCAST-LABEL: define hidden void @_Z3vbrRc
 void vbr(char &r) {
-  // CHECK-UCAST: [[P:%[^ ]*]] = call i1 @llvm.bitset.test(i8* {{%[^ ]*}}, metadata !"_ZTS1B")
+  // CHECK-UCAST: [[P:%[^ ]*]] = call i1 @llvm.type.test(i8* {{%[^ ]*}}, metadata !"_ZTS1B")
   // CHECK-UCAST-NEXT: br i1 [[P]], label %[[CONTBB:[^ ]*]], label %[[TRAPBB:[^ ,]*]]
 
   // CHECK-UCAST: [[TRAPBB]]
@@ -83,12 +84,12 @@ void vbr(char &r) {
 
   // CHECK-UCAST: [[CONTBB]]
   // CHECK-UCAST: ret
-  reinterpret_cast<B&>(r);
+  (void)reinterpret_cast<B&>(r);
 }
 
-// CHECK-UCAST-LABEL: define void @_Z4vbrrOc
+// CHECK-UCAST-LABEL: define hidden void @_Z4vbrrOc
 void vbrr(char &&r) {
-  // CHECK-UCAST: [[P:%[^ ]*]] = call i1 @llvm.bitset.test(i8* {{%[^ ]*}}, metadata !"_ZTS1B")
+  // CHECK-UCAST: [[P:%[^ ]*]] = call i1 @llvm.type.test(i8* {{%[^ ]*}}, metadata !"_ZTS1B")
   // CHECK-UCAST-NEXT: br i1 [[P]], label %[[CONTBB:[^ ]*]], label %[[TRAPBB:[^ ,]*]]
 
   // CHECK-UCAST: [[TRAPBB]]
@@ -97,29 +98,37 @@ void vbrr(char &&r) {
 
   // CHECK-UCAST: [[CONTBB]]
   // CHECK-UCAST: ret
-  reinterpret_cast<B&&>(r);
+  (void)reinterpret_cast<B&&>(r);
 }
 
-// CHECK-UCAST-LABEL: define void @_Z3vcpPv
-// CHECK-UCAST-STRICT-LABEL: define void @_Z3vcpPv
+// CHECK-UCAST-LABEL: define hidden void @_Z3vcpPv
+// CHECK-UCAST-STRICT-LABEL: define hidden void @_Z3vcpPv
 void vcp(void *p) {
-  // CHECK-UCAST: [[P:%[^ ]*]] = call i1 @llvm.bitset.test(i8* {{%[^ ]*}}, metadata !"_ZTS1A")
-  // CHECK-UCAST-STRICT: [[P:%[^ ]*]] = call i1 @llvm.bitset.test(i8* {{%[^ ]*}}, metadata !"_ZTS1C")
-  static_cast<C*>(p);
+  // CHECK-UCAST: call i1 @llvm.type.test(i8* {{%[^ ]*}}, metadata !"_ZTS1A")
+  // CHECK-UCAST-STRICT: call i1 @llvm.type.test(i8* {{%[^ ]*}}, metadata !"_ZTS1C")
+  (void)static_cast<C*>(p);
 }
 
-// CHECK-UCAST-LABEL: define void @_Z3bcpP1B
-// CHECK-UCAST-STRICT-LABEL: define void @_Z3bcpP1B
+// CHECK-UCAST-LABEL: define hidden void @_Z3bcpP1B
+// CHECK-UCAST-STRICT-LABEL: define hidden void @_Z3bcpP1B
 void bcp(B *p) {
-  // CHECK-UCAST: [[P:%[^ ]*]] = call i1 @llvm.bitset.test(i8* {{%[^ ]*}}, metadata !"_ZTS1A")
-  // CHECK-UCAST-STRICT: [[P:%[^ ]*]] = call i1 @llvm.bitset.test(i8* {{%[^ ]*}}, metadata !"_ZTS1C")
-  (C *)p;
+  // CHECK-UCAST: call i1 @llvm.type.test(i8* {{%[^ ]*}}, metadata !"_ZTS1A")
+  // CHECK-UCAST-STRICT: call i1 @llvm.type.test(i8* {{%[^ ]*}}, metadata !"_ZTS1C")
+  (void)(C *)p;
 }
 
-// CHECK-UCAST-LABEL: define void @_Z8bcp_callP1B
-// CHECK-UCAST-STRICT-LABEL: define void @_Z8bcp_callP1B
+// CHECK-UCAST-LABEL: define hidden void @_Z8bcp_callP1B
+// CHECK-UCAST-STRICT-LABEL: define hidden void @_Z8bcp_callP1B
 void bcp_call(B *p) {
-  // CHECK-UCAST: [[P:%[^ ]*]] = call i1 @llvm.bitset.test(i8* {{%[^ ]*}}, metadata !"_ZTS1A")
-  // CHECK-UCAST-STRICT: [[P:%[^ ]*]] = call i1 @llvm.bitset.test(i8* {{%[^ ]*}}, metadata !"_ZTS1C")
+  // CHECK-UCAST: call i1 @llvm.type.test(i8* {{%[^ ]*}}, metadata !"_ZTS1A")
+  // CHECK-UCAST-STRICT: call i1 @llvm.type.test(i8* {{%[^ ]*}}, metadata !"_ZTS1C")
   ((C *)p)->f();
 }
+
+// CHECK-UCAST-LABEL: define hidden i32 @_Z6a_callP1A
+// CHECK-UCAST-STRICT-LABEL: define hidden i32 @_Z6a_callP1A
+int a_call(A *a) {
+  // CHECK-UCAST-NOT: @llvm.type.test
+  // CHECK-UCAST-STRICT-NOT: @llvm.type.test
+  return a->i();
+}
diff --git a/test/CodeGenCXX/cfi-cross-dso.cpp b/test/CodeGenCXX/cfi-cross-dso.cpp
index fbe6fc83a5c3a..d67927d4d2ed0 100644
--- a/test/CodeGenCXX/cfi-cross-dso.cpp
+++ b/test/CodeGenCXX/cfi-cross-dso.cpp
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -triple x86_64-unknown-linux -fsanitize=cfi-vcall -fsanitize-cfi-cross-dso -emit-llvm -o - %s | FileCheck --check-prefix=CHECK --check-prefix=ITANIUM %s
-// RUN: %clang_cc1 -triple x86_64-pc-windows-msvc -fsanitize=cfi-vcall  -fsanitize-cfi-cross-dso -emit-llvm -o - %s | FileCheck --check-prefix=CHECK --check-prefix=MS %s
+// RUN: %clang_cc1 -flto -triple x86_64-unknown-linux -fsanitize=cfi-vcall -fsanitize-cfi-cross-dso -emit-llvm -o - %s | FileCheck --check-prefix=CHECK --check-prefix=ITANIUM %s
+// RUN: %clang_cc1 -flto -triple x86_64-pc-windows-msvc -fsanitize=cfi-vcall  -fsanitize-cfi-cross-dso -emit-llvm -o - %s | FileCheck --check-prefix=CHECK --check-prefix=MS %s
 
 struct A {
   A();
@@ -30,12 +30,12 @@ void g() {
 
 // CHECK:   %[[VT:.*]] = load void (%struct.A*)**, void (%struct.A*)***
 // CHECK:   %[[VT2:.*]] = bitcast {{.*}}%[[VT]] to i8*, !nosanitize
-// ITANIUM:   %[[TEST:.*]] = call i1 @llvm.bitset.test(i8* %[[VT2]], metadata !"_ZTS1A"), !nosanitize
-// MS:   %[[TEST:.*]] = call i1 @llvm.bitset.test(i8* %[[VT2]], metadata !"?AUA@@"), !nosanitize
+// ITANIUM:   %[[TEST:.*]] = call i1 @llvm.type.test(i8* %[[VT2]], metadata !"_ZTS1A"), !nosanitize
+// MS:   %[[TEST:.*]] = call i1 @llvm.type.test(i8* %[[VT2]], metadata !"?AUA@@"), !nosanitize
 // CHECK:   br i1 %[[TEST]], label %[[CONT:.*]], label %[[SLOW:.*]], {{.*}} !nosanitize
 // CHECK: [[SLOW]]
-// ITANIUM:   call void @__cfi_slowpath(i64 7004155349499253778, i8* %[[VT2]]) {{.*}} !nosanitize
-// MS:   call void @__cfi_slowpath(i64 -8005289897957287421, i8* %[[VT2]]) {{.*}} !nosanitize
+// ITANIUM:   call void @__cfi_slowpath_diag(i64 7004155349499253778, i8* %[[VT2]], {{.*}}) {{.*}} !nosanitize
+// MS:   call void @__cfi_slowpath_diag(i64 -8005289897957287421, i8* %[[VT2]], {{.*}}) {{.*}} !nosanitize
 // CHECK:   br label %[[CONT]], !nosanitize
 // CHECK: [[CONT]]
 // CHECK:   call void %{{.*}}(%struct.A* %{{.*}})
diff --git a/test/CodeGenCXX/cfi-icall.cpp b/test/CodeGenCXX/cfi-icall.cpp
index eceb92a4421cb..c3c6ed309cc6f 100644
--- a/test/CodeGenCXX/cfi-icall.cpp
+++ b/test/CodeGenCXX/cfi-icall.cpp
@@ -15,9 +15,12 @@ void f(S *s) {
 
 void g() {
   void (*fp)(S *) = f;
-  // CHECK: call i1 @llvm.bitset.test(i8* {{.*}}, metadata ![[VOIDS:[0-9]+]])
+  // CHECK: call i1 @llvm.type.test(i8* {{.*}}, metadata [[VOIDS:![0-9]+]])
   fp(0);
 }
 
-// ITANIUM: !{![[VOIDS]], void (%"struct.(anonymous namespace)::S"*)* @_ZN12_GLOBAL__N_11fEPNS_1SE, i64 0}
-// MS: !{![[VOIDS]], void (%"struct.(anonymous namespace)::S"*)* @"\01?f@?A@@YAXPEAUS@?A@@@Z", i64 0}
+// ITANIUM: define internal void @_ZN12_GLOBAL__N_11fEPNS_1SE({{.*}} !type [[TS:![0-9]+]]
+// MS: define internal void @"\01?f@?A@@YAXPEAUS@?A@@@Z"({{.*}} !type [[TS:![0-9]+]]
+
+// CHECK: [[VOIDS]] = distinct !{}
+// CHECK: [[TS]] = !{i64 0, [[VOIDS]]}
diff --git a/test/CodeGenCXX/cfi-ms-rtti.cpp b/test/CodeGenCXX/cfi-ms-rtti.cpp
index b6e9175c865c6..fbebad4b1b8d4 100644
--- a/test/CodeGenCXX/cfi-ms-rtti.cpp
+++ b/test/CodeGenCXX/cfi-ms-rtti.cpp
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -emit-llvm -o - -triple=x86_64-pc-win32 %s -fsanitize=cfi-vcall | FileCheck --check-prefix=RTTI %s
-// RUN: %clang_cc1 -emit-llvm -o - -triple=x86_64-pc-win32 %s -fsanitize=cfi-vcall -fno-rtti-data | FileCheck --check-prefix=NO-RTTI %s
+// RUN: %clang_cc1 -flto -emit-llvm -o - -triple=x86_64-pc-win32 %s -fsanitize=cfi-vcall | FileCheck --check-prefix=RTTI %s
+// RUN: %clang_cc1 -flto -emit-llvm -o - -triple=x86_64-pc-win32 %s -fsanitize=cfi-vcall -fno-rtti-data | FileCheck --check-prefix=NO-RTTI %s
 
 struct A {
   A();
@@ -8,5 +8,5 @@ struct A {
 
 A::A() {}
 
-// RTTI: !{!"?AUA@@", [2 x i8*]* {{.*}}, i64 8}
-// NO-RTTI: !{!"?AUA@@", [1 x i8*]* {{.*}}, i64 0}
+// RTTI: !{i64 8, !"?AUA@@"}
+// NO-RTTI: !{i64 0, !"?AUA@@"}
diff --git a/test/CodeGenCXX/cfi-nvcall.cpp b/test/CodeGenCXX/cfi-nvcall.cpp
index be4d8448a2e0c..e968f05600c29 100644
--- a/test/CodeGenCXX/cfi-nvcall.cpp
+++ b/test/CodeGenCXX/cfi-nvcall.cpp
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -triple x86_64-unknown-linux -fsanitize=cfi-nvcall -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple x86_64-unknown-linux -fsanitize=cfi-nvcall,cfi-cast-strict -emit-llvm -o - %s | FileCheck --check-prefix=CHECK-STRICT %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux -fvisibility hidden -fsanitize=cfi-nvcall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux -fvisibility hidden -fsanitize=cfi-nvcall,cfi-cast-strict -emit-llvm -o - %s | FileCheck --check-prefix=CHECK-STRICT %s
 
 struct A {
   virtual void f();
@@ -17,8 +17,8 @@ struct C : A {
 // CHECK-LABEL: @bg
 // CHECK-STRICT-LABEL: @bg
 extern "C" void bg(B *b) {
-  // CHECK: call i1 @llvm.bitset.test(i8* {{%[^ ]*}}, metadata !"_ZTS1B")
-  // CHECK-STRICT: call i1 @llvm.bitset.test(i8* {{%[^ ]*}}, metadata !"_ZTS1B")
+  // CHECK: call i1 @llvm.type.test(i8* {{%[^ ]*}}, metadata !"_ZTS1B")
+  // CHECK-STRICT: call i1 @llvm.type.test(i8* {{%[^ ]*}}, metadata !"_ZTS1B")
   b->g();
 }
 
@@ -29,7 +29,7 @@ extern "C" void cg(C *c) {
   // In this case C's layout is the same as its base class, so we allow
   // c to be of type A in non-strict mode.
 
-  // CHECK: call i1 @llvm.bitset.test(i8* {{%[^ ]*}}, metadata !"_ZTS1A")
-  // CHECK-STRICT: call i1 @llvm.bitset.test(i8* {{%[^ ]*}}, metadata !"_ZTS1C")
+  // CHECK: call i1 @llvm.type.test(i8* {{%[^ ]*}}, metadata !"_ZTS1A")
+  // CHECK-STRICT: call i1 @llvm.type.test(i8* {{%[^ ]*}}, metadata !"_ZTS1C")
   c->g();
 }
diff --git a/test/CodeGenCXX/cfi-speculative-vtable.cpp b/test/CodeGenCXX/cfi-speculative-vtable.cpp
new file mode 100644
index 0000000000000..490190c4afd75
--- /dev/null
+++ b/test/CodeGenCXX/cfi-speculative-vtable.cpp
@@ -0,0 +1,14 @@
+// Test that we don't emit a bit set entry for a speculative (available_externally) vtable.
+// This does not happen in the Microsoft ABI.
+// RUN: %clang_cc1 -triple x86_64-unknown-linux -O1 -fsanitize=cfi-vcall -fsanitize-trap=cfi-vcall -emit-llvm -o - %s | FileCheck  %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux -O1 -fsanitize=cfi-vcall -fsanitize-trap=cfi-vcall -fsanitize-cfi-cross-dso -emit-llvm -o - %s | FileCheck  %s
+
+class A {
+ public:
+  virtual ~A();
+};
+
+A a;
+
+// CHECK: @_ZTV1A ={{.*}} available_externally
+// CHECK-NOT: !{{.*}} = !{!{{.*}}, [4 x i8*]* @_ZTV1A, i64 16}
diff --git a/test/CodeGenCXX/cfi-stats.cpp b/test/CodeGenCXX/cfi-stats.cpp
new file mode 100644
index 0000000000000..6d0dd5b27091a
--- /dev/null
+++ b/test/CodeGenCXX/cfi-stats.cpp
@@ -0,0 +1,51 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux -fvisibility hidden -fsanitize=cfi-vcall,cfi-nvcall,cfi-derived-cast,cfi-unrelated-cast,cfi-icall -fsanitize-stats -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux -fvisibility hidden -fsanitize=cfi-vcall,cfi-nvcall,cfi-derived-cast,cfi-unrelated-cast,cfi-icall -fsanitize-trap=cfi-vcall -fwhole-program-vtables -fsanitize-stats -emit-llvm -o - %s | FileCheck %s
+
+// CHECK: [[STATS:@[^ ]*]] = internal global { i8*, i32, [5 x [2 x i8*]] } { i8* null, i32 5, [5 x [2 x i8*]]
+// CHECK: {{\[\[}}2 x i8*] zeroinitializer,
+// CHECK: [2 x i8*] [i8* null, i8* inttoptr (i64 2305843009213693952 to i8*)],
+// CHECK: [2 x i8*] [i8* null, i8* inttoptr (i64 4611686018427387904 to i8*)],
+// CHECK: [2 x i8*] [i8* null, i8* inttoptr (i64 6917529027641081856 to i8*)],
+// CHECK: [2 x i8*] [i8* null, i8* inttoptr (i64 -9223372036854775808 to i8*)]] }
+
+// CHECK: @llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 0, void ()* [[CTOR:@[^ ]*]], i8* null }]
+
+struct A {
+  virtual void vf();
+  void nvf();
+};
+struct B : A {};
+
+// CHECK: @vcall
+extern "C" void vcall(A *a) {
+  // CHECK: call void @__sanitizer_stat_report({{.*}}[[STATS]]{{.*}}i64 0, i32 2, i64 0
+  a->vf();
+}
+
+// CHECK: @nvcall
+extern "C" void nvcall(A *a) {
+  // CHECK: call void @__sanitizer_stat_report({{.*}}[[STATS]]{{.*}}i64 0, i32 2, i64 1
+  a->nvf();
+}
+
+// CHECK: @dcast
+extern "C" void dcast(A *a) {
+  // CHECK: call void @__sanitizer_stat_report({{.*}}[[STATS]]{{.*}}i64 0, i32 2, i64 2
+  static_cast<B *>(a);
+}
+
+// CHECK: @ucast
+extern "C" void ucast(void *a) {
+  // CHECK: call void @__sanitizer_stat_report({{.*}}[[STATS]]{{.*}}i64 0, i32 2, i64 3
+  reinterpret_cast<A *>(a);
+}
+
+// CHECK: @icall
+extern "C" void icall(void (*p)()) {
+  // CHECK: call void @__sanitizer_stat_report({{.*}}[[STATS]]{{.*}}i64 0, i32 2, i64 4
+  p();
+}
+
+// CHECK: define internal void [[CTOR]]()
+// CHECK-NEXT: call void @__sanitizer_stat_init(i8* bitcast ({ i8*, i32, [5 x [2 x i8*]] }* [[STATS]] to i8*))
+// CHECK-NEXT: ret void
diff --git a/test/CodeGenCXX/cfi-vcall.cpp b/test/CodeGenCXX/cfi-vcall.cpp
deleted file mode 100644
index daa0531e85d6d..0000000000000
--- a/test/CodeGenCXX/cfi-vcall.cpp
+++ /dev/null
@@ -1,188 +0,0 @@
-// RUN: %clang_cc1 -triple x86_64-unknown-linux -fsanitize=cfi-vcall -fsanitize-trap=cfi-vcall -emit-llvm -o - %s | FileCheck --check-prefix=CHECK --check-prefix=ITANIUM --check-prefix=NDIAG %s
-// RUN: %clang_cc1 -triple x86_64-unknown-linux -fsanitize=cfi-vcall -emit-llvm -o - %s | FileCheck --check-prefix=CHECK --check-prefix=ITANIUM --check-prefix=DIAG --check-prefix=DIAG-ABORT %s
-// RUN: %clang_cc1 -triple x86_64-unknown-linux -fsanitize=cfi-vcall -fsanitize-recover=cfi-vcall -emit-llvm -o - %s | FileCheck --check-prefix=CHECK --check-prefix=ITANIUM --check-prefix=DIAG --check-prefix=DIAG-RECOVER %s
-// RUN: %clang_cc1 -triple x86_64-pc-windows-msvc -fsanitize=cfi-vcall -fsanitize-trap=cfi-vcall -emit-llvm -o - %s | FileCheck --check-prefix=CHECK --check-prefix=MS --check-prefix=NDIAG %s
-
-// MS: @[[VTA:[0-9]*]] {{.*}} comdat($"\01??_7A@@6B@")
-// MS: @[[VTB:[0-9]*]] {{.*}} comdat($"\01??_7B@@6B0@@")
-// MS: @[[VTAinB:[0-9]*]] {{.*}} comdat($"\01??_7B@@6BA@@@")
-// MS: @[[VTAinC:[0-9]*]] {{.*}} comdat($"\01??_7C@@6B@")
-// MS: @[[VTBinD:[0-9]*]] {{.*}} comdat($"\01??_7D@?A@@6BB@@@")
-// MS: @[[VTAinBinD:[0-9]*]] {{.*}} comdat($"\01??_7D@?A@@6BA@@@")
-// MS: @[[VTFA:[0-9]*]] {{.*}} comdat($"\01??_7FA@?1??foo@@YAXXZ@6B@")
-
-struct A {
-  A();
-  virtual void f();
-};
-
-struct B : virtual A {
-  B();
-  virtual void g();
-  virtual void h();
-};
-
-struct C : virtual A {
-  C();
-};
-
-namespace {
-
-struct D : B, C {
-  D();
-  virtual void f();
-  virtual void h();
-};
-
-}
-
-A::A() {}
-B::B() {}
-C::C() {}
-D::D() {}
-
-void A::f() {
-}
-
-void B::g() {
-}
-
-void D::f() {
-}
-
-void D::h() {
-}
-
-// DIAG: @[[SRC:.*]] = private unnamed_addr constant [{{.*}} x i8] c"{{.*}}cfi-vcall.cpp\00", align 1
-// DIAG: @[[TYPE:.*]] = private unnamed_addr constant { i16, i16, [4 x i8] } { i16 -1, i16 0, [4 x i8] c"'A'\00" }
-// DIAG: @[[BADTYPESTATIC:.*]] = private unnamed_addr global { { [{{.*}} x i8]*, i32, i32 }, { i16, i16, [4 x i8] }*, i8 } { { [{{.*}} x i8]*, i32, i32 } { [{{.*}} x i8]* @[[SRC]], i32 [[@LINE+21]], i32 3 }, { i16, i16, [4 x i8] }* @[[TYPE]], i8 0 }
-
-// ITANIUM: define void @_Z2afP1A
-// MS: define void @"\01?af@@YAXPEAUA@@@Z"
-void af(A *a) {
-  // ITANIUM: [[P:%[^ ]*]] = call i1 @llvm.bitset.test(i8* [[VT:%[^ ]*]], metadata !"_ZTS1A")
-  // MS: [[P:%[^ ]*]] = call i1 @llvm.bitset.test(i8* [[VT:%[^ ]*]], metadata !"?AUA@@")
-  // CHECK-NEXT: br i1 [[P]], label %[[CONTBB:[^ ,]*]], label %[[TRAPBB:[^ ,]*]]
-  // CHECK-NEXT: {{^$}}
-
-  // CHECK: [[TRAPBB]]
-  // NDIAG-NEXT: call void @llvm.trap()
-  // NDIAG-NEXT: unreachable
-  // DIAG-NEXT: [[VTINT:%[^ ]*]] = ptrtoint i8* [[VT]] to i64
-  // DIAG-ABORT-NEXT: call void @__ubsan_handle_cfi_bad_type_abort(i8* bitcast ({{.*}} @[[BADTYPESTATIC]] to i8*), i64 [[VTINT]])
-  // DIAG-ABORT-NEXT: unreachable
-  // DIAG-RECOVER-NEXT: call void @__ubsan_handle_cfi_bad_type(i8* bitcast ({{.*}} @[[BADTYPESTATIC]] to i8*), i64 [[VTINT]])
-  // DIAG-RECOVER-NEXT: br label %[[CONTBB]]
-
-  // CHECK: [[CONTBB]]
-  // CHECK: call void %
-  a->f();
-}
-
-// ITANIUM: define internal void @_Z3df1PN12_GLOBAL__N_11DE
-// MS: define internal void @"\01?df1@@YAXPEAUD@?A@@@Z"
-void df1(D *d) {
-  // ITANIUM: {{%[^ ]*}} = call i1 @llvm.bitset.test(i8* {{%[^ ]*}}, metadata ![[DTYPE:[0-9]+]])
-  // MS: {{%[^ ]*}} = call i1 @llvm.bitset.test(i8* {{%[^ ]*}}, metadata !"?AUA@@")
-  d->f();
-}
-
-// ITANIUM: define internal void @_Z3dg1PN12_GLOBAL__N_11DE
-// MS: define internal void @"\01?dg1@@YAXPEAUD@?A@@@Z"
-void dg1(D *d) {
-  // ITANIUM: {{%[^ ]*}} = call i1 @llvm.bitset.test(i8* {{%[^ ]*}}, metadata !"_ZTS1B")
-  // MS: {{%[^ ]*}} = call i1 @llvm.bitset.test(i8* {{%[^ ]*}}, metadata !"?AUB@@")
-  d->g();
-}
-
-// ITANIUM: define internal void @_Z3dh1PN12_GLOBAL__N_11DE
-// MS: define internal void @"\01?dh1@@YAXPEAUD@?A@@@Z"
-void dh1(D *d) {
-  // ITANIUM: {{%[^ ]*}} = call i1 @llvm.bitset.test(i8* {{%[^ ]*}}, metadata ![[DTYPE]])
-  // MS: {{%[^ ]*}} = call i1 @llvm.bitset.test(i8* {{%[^ ]*}}, metadata ![[DTYPE:[0-9]+]])
-  d->h();
-}
-
-// ITANIUM: define internal void @_Z3df2PN12_GLOBAL__N_11DE
-// MS: define internal void @"\01?df2@@YAXPEAUD@?A@@@Z"
-__attribute__((no_sanitize("cfi")))
-void df2(D *d) {
-  // CHECK-NOT: call i1 @llvm.bitset.test
-  d->f();
-}
-
-// ITANIUM: define internal void @_Z3df3PN12_GLOBAL__N_11DE
-// MS: define internal void @"\01?df3@@YAXPEAUD@?A@@@Z"
-__attribute__((no_sanitize("address"))) __attribute__((no_sanitize("cfi-vcall")))
-void df3(D *d) {
-  // CHECK-NOT: call i1 @llvm.bitset.test
-  d->f();
-}
-
-D d;
-
-void foo() {
-  df1(&d);
-  dg1(&d);
-  dh1(&d);
-  df2(&d);
-  df3(&d);
-
-  struct FA : A {
-    void f() {}
-  } fa;
-  af(&fa);
-}
-
-namespace test2 {
-
-struct A {
-  virtual void m_fn1();
-};
-struct B {
-  virtual void m_fn2();
-};
-struct C : B, A {};
-struct D : C {
-  void m_fn1();
-};
-
-// ITANIUM: define void @_ZN5test21fEPNS_1DE
-// MS: define void @"\01?f@test2@@YAXPEAUD@1@@Z"
-void f(D *d) {
-  // ITANIUM: {{%[^ ]*}} = call i1 @llvm.bitset.test(i8* {{%[^ ]*}}, metadata !"_ZTSN5test21DE")
-  // MS: {{%[^ ]*}} = call i1 @llvm.bitset.test(i8* {{%[^ ]*}}, metadata !"?AUA@test2@@")
-  d->m_fn1();
-}
-
-}
-
-// Check for the expected number of elements (9 or 15 respectively).
-// MS: !llvm.bitsets = !{[[X:[^,]*(,[^,]*){8}]]}
-// ITANIUM: !llvm.bitsets = !{[[X:[^,]*(,[^,]*){14}]]}
-
-// ITANIUM-DAG: !{!"_ZTS1A", [3 x i8*]* @_ZTV1A, i64 16}
-// ITANIUM-DAG: !{!"_ZTS1A", [7 x i8*]* @_ZTCN12_GLOBAL__N_11DE0_1B, i64 32}
-// ITANIUM-DAG: !{!"_ZTS1B", [7 x i8*]* @_ZTCN12_GLOBAL__N_11DE0_1B, i64 32}
-// ITANIUM-DAG: !{!"_ZTS1A", [9 x i8*]* @_ZTCN12_GLOBAL__N_11DE8_1C, i64 64}
-// ITANIUM-DAG: !{!"_ZTS1C", [9 x i8*]* @_ZTCN12_GLOBAL__N_11DE8_1C, i64 32}
-// ITANIUM-DAG: !{!"_ZTS1A", [12 x i8*]* @_ZTVN12_GLOBAL__N_11DE, i64 32}
-// ITANIUM-DAG: !{!"_ZTS1B", [12 x i8*]* @_ZTVN12_GLOBAL__N_11DE, i64 32}
-// ITANIUM-DAG: !{!"_ZTS1C", [12 x i8*]* @_ZTVN12_GLOBAL__N_11DE, i64 88}
-// ITANIUM-DAG: !{![[DTYPE]], [12 x i8*]* @_ZTVN12_GLOBAL__N_11DE, i64 32}
-// ITANIUM-DAG: !{!"_ZTS1A", [7 x i8*]* @_ZTV1B, i64 32}
-// ITANIUM-DAG: !{!"_ZTS1B", [7 x i8*]* @_ZTV1B, i64 32}
-// ITANIUM-DAG: !{!"_ZTS1A", [5 x i8*]* @_ZTV1C, i64 32}
-// ITANIUM-DAG: !{!"_ZTS1C", [5 x i8*]* @_ZTV1C, i64 32}
-// ITANIUM-DAG: !{!"_ZTS1A", [3 x i8*]* @_ZTVZ3foovE2FA, i64 16}
-// ITANIUM-DAG: !{!{{[0-9]+}}, [3 x i8*]* @_ZTVZ3foovE2FA, i64 16}
-
-// MS-DAG: !{!"?AUA@@", [2 x i8*]* @[[VTA]], i64 8}
-// MS-DAG: !{!"?AUB@@", [3 x i8*]* @[[VTB]], i64 8}
-// MS-DAG: !{!"?AUA@@", [2 x i8*]* @[[VTAinB]], i64 8}
-// MS-DAG: !{!"?AUA@@", [2 x i8*]* @[[VTAinC]], i64 8}
-// MS-DAG: !{!"?AUB@@", [3 x i8*]* @[[VTBinD]], i64 8}
-// MS-DAG: !{![[DTYPE]], [3 x i8*]* @[[VTBinD]], i64 8}
-// MS-DAG: !{!"?AUA@@", [2 x i8*]* @[[VTAinBinD]], i64 8}
-// MS-DAG: !{!"?AUA@@", [2 x i8*]* @[[VTFA]], i64 8}
-// MS-DAG: !{!{{[0-9]+}}, [2 x i8*]* @[[VTFA]], i64 8}
diff --git a/test/CodeGenCXX/const-init-cxx11.cpp b/test/CodeGenCXX/const-init-cxx11.cpp
index 99be265e21267..0c2193fb07f21 100644
--- a/test/CodeGenCXX/const-init-cxx11.cpp
+++ b/test/CodeGenCXX/const-init-cxx11.cpp
@@ -343,13 +343,13 @@ namespace VirtualMembers {
     constexpr E() : B(3), c{'b','y','e'} {}
     char c[3];
   };
-  // CHECK: @_ZN14VirtualMembers1eE = global { i8**, double, i32, i8**, double, [5 x i8], i16, i8**, double, [5 x i8], [3 x i8] } { i8** getelementptr inbounds ([11 x i8*], [11 x i8*]* @_ZTVN14VirtualMembers1EE, i64 0, i64 2), double 1.000000e+00, i32 64, i8** getelementptr inbounds ([11 x i8*], [11 x i8*]* @_ZTVN14VirtualMembers1EE, i64 0, i64 5), double 2.000000e+00, [5 x i8] c"hello", i16 5, i8** getelementptr inbounds ([11 x i8*], [11 x i8*]* @_ZTVN14VirtualMembers1EE, i64 0, i64 9), double 3.000000e+00, [5 x i8] c"world", [3 x i8] c"bye" }
+  // CHECK: @_ZN14VirtualMembers1eE = global { i8**, double, i32, i8**, double, [5 x i8], i16, i8**, double, [5 x i8], [3 x i8] } { i8** getelementptr inbounds ([11 x i8*], [11 x i8*]* @_ZTVN14VirtualMembers1EE, i32 0, i32 2), double 1.000000e+00, i32 64, i8** getelementptr inbounds ([11 x i8*], [11 x i8*]* @_ZTVN14VirtualMembers1EE, i32 0, i32 5), double 2.000000e+00, [5 x i8] c"hello", i16 5, i8** getelementptr inbounds ([11 x i8*], [11 x i8*]* @_ZTVN14VirtualMembers1EE, i32 0, i32 9), double 3.000000e+00, [5 x i8] c"world", [3 x i8] c"bye" }
   E e;
 
   struct nsMemoryImpl {
     virtual void f();
   };
-  // CHECK: @_ZN14VirtualMembersL13sGlobalMemoryE = internal global { i8** } { i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTVN14VirtualMembers12nsMemoryImplE, i64 0, i64 2) }
+  // CHECK: @_ZN14VirtualMembersL13sGlobalMemoryE = internal global { i8** } { i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTVN14VirtualMembers12nsMemoryImplE, i32 0, i32 2) }
   __attribute__((used))
   static nsMemoryImpl sGlobalMemory;
 
@@ -360,7 +360,7 @@ namespace VirtualMembers {
 
     T t;
   };
-  // CHECK: @_ZN14VirtualMembers1tE = global { i8**, i32 } { i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTVN14VirtualMembers13TemplateClassIiEE, i64 0, i64 2), i32 42 }
+  // CHECK: @_ZN14VirtualMembers1tE = global { i8**, i32 } { i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTVN14VirtualMembers13TemplateClassIiEE, i32 0, i32 2), i32 42 }
   TemplateClass<int> t;
 }
 
diff --git a/test/CodeGenCXX/const-init.cpp b/test/CodeGenCXX/const-init.cpp
index deb923a708905..f5c9dae7ba4b6 100644
--- a/test/CodeGenCXX/const-init.cpp
+++ b/test/CodeGenCXX/const-init.cpp
@@ -1,4 +1,6 @@
 // RUN: %clang_cc1 -triple x86_64-apple-darwin -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -emit-llvm -std=c++98 -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -emit-llvm -std=c++11 -o - %s | FileCheck %s
 
 // CHECK: @a = global i32 10
 int a = 10;
@@ -27,8 +29,13 @@ C g0 = { C::e1 };
 
 namespace test2 {
   struct A {
+#if __cplusplus <= 199711L
     static const double d = 1.0;
     static const float f = d / 2;
+#else
+    static constexpr double d = 1.0;
+    static constexpr float f = d / 2;
+#endif
     static int g();
   } a;
 
diff --git a/test/CodeGenCXX/constructor-init.cpp b/test/CodeGenCXX/constructor-init.cpp
index d7ae220712f8a..c78534a218de9 100644
--- a/test/CodeGenCXX/constructor-init.cpp
+++ b/test/CodeGenCXX/constructor-init.cpp
@@ -95,14 +95,14 @@ namespace InitVTable {
 
   // CHECK-LABEL: define void @_ZN10InitVTable1BC2Ev(%"struct.InitVTable::B"* %this) unnamed_addr
   // CHECK:      [[T0:%.*]] = bitcast [[B:%.*]]* [[THIS:%.*]] to i32 (...)***
-  // CHECK-NEXT: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTVN10InitVTable1BE, i64 0, i64 2) to i32 (...)**), i32 (...)*** [[T0]]
+  // CHECK-NEXT: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTVN10InitVTable1BE, i32 0, i32 2) to i32 (...)**), i32 (...)*** [[T0]]
   // CHECK:      [[VTBL:%.*]] = load i32 ([[B]]*)**, i32 ([[B]]*)*** {{%.*}}
   // CHECK-NEXT: [[FNP:%.*]] = getelementptr inbounds i32 ([[B]]*)*, i32 ([[B]]*)** [[VTBL]], i64 0
   // CHECK-NEXT: [[FN:%.*]] = load i32 ([[B]]*)*, i32 ([[B]]*)** [[FNP]]
   // CHECK-NEXT: [[ARG:%.*]] = call i32 [[FN]]([[B]]* [[THIS]])
   // CHECK-NEXT: call void @_ZN10InitVTable1AC2Ei({{.*}}* {{%.*}}, i32 [[ARG]])
   // CHECK-NEXT: [[T0:%.*]] = bitcast [[B]]* [[THIS]] to i32 (...)***
-  // CHECK-NEXT: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTVN10InitVTable1BE, i64 0, i64 2) to i32 (...)**), i32 (...)*** [[T0]]
+  // CHECK-NEXT: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTVN10InitVTable1BE, i32 0, i32 2) to i32 (...)**), i32 (...)*** [[T0]]
   // CHECK-NEXT: ret void
   B::B() : A(foo()) {}
 
@@ -110,7 +110,7 @@ namespace InitVTable {
   // CHECK:      [[ARG:%.*]] = add nsw i32 {{%.*}}, 5
   // CHECK-NEXT: call void @_ZN10InitVTable1AC2Ei({{.*}}* {{%.*}}, i32 [[ARG]])
   // CHECK-NEXT: [[T0:%.*]] = bitcast [[B]]* {{%.*}} to i32 (...)***
-  // CHECK-NEXT: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTVN10InitVTable1BE, i64 0, i64 2) to i32 (...)**), i32 (...)*** [[T0]]
+  // CHECK-NEXT: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTVN10InitVTable1BE, i32 0, i32 2) to i32 (...)**), i32 (...)*** [[T0]]
   // CHECK-NEXT: ret void
   B::B(int x) : A(x + 5) {}
 }
diff --git a/test/CodeGenCXX/copy-constructor-elim.cpp b/test/CodeGenCXX/copy-constructor-elim.cpp
index d9b28ce30e917..4abe456e4b29d 100644
--- a/test/CodeGenCXX/copy-constructor-elim.cpp
+++ b/test/CodeGenCXX/copy-constructor-elim.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple %itanium_abi_triple -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK
+// RUN: %clang_cc1 -triple %itanium_abi_triple -emit-llvm -o - %s | FileCheck %s
 // RUN: %clang_cc1 -triple %ms_abi_triple -emit-llvm -o - %s | FileCheck %s -check-prefix MS
 // CHECK-NOT: _ZN1CC1ERK1C
 // CHECK-NOT: _ZN1SC1ERK1S
diff --git a/test/CodeGenCXX/copy-constructor-synthesis-2.cpp b/test/CodeGenCXX/copy-constructor-synthesis-2.cpp
index 02feed3208f6e..9790ca882c6b0 100644
--- a/test/CodeGenCXX/copy-constructor-synthesis-2.cpp
+++ b/test/CodeGenCXX/copy-constructor-synthesis-2.cpp
@@ -24,4 +24,4 @@ struct A { virtual void a(); };
 A x(A& y) { return y; }
 
 // CHECK: define linkonce_odr {{.*}} @_ZN1AC1ERKS_(%struct.A* {{.*}}%this, %struct.A* dereferenceable({{[0-9]+}})) unnamed_addr
-// CHECK: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTV1A, i64 0, i64 2) to i32 (...)**)
+// CHECK: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTV1A, i32 0, i32 2) to i32 (...)**)
diff --git a/test/CodeGenCXX/copy-constructor-synthesis.cpp b/test/CodeGenCXX/copy-constructor-synthesis.cpp
index 2f0aa3b3a648b..4928c61488e9d 100644
--- a/test/CodeGenCXX/copy-constructor-synthesis.cpp
+++ b/test/CodeGenCXX/copy-constructor-synthesis.cpp
@@ -166,7 +166,7 @@ void f(B b1) {
 // CHECK-LABEL:    define linkonce_odr void @_ZN12rdar138169401AC2ERKS0_(
 // CHECK:      [[THIS:%.*]] = load [[A]]*, [[A]]**
 // CHECK-NEXT: [[T0:%.*]] = bitcast [[A]]* [[THIS]] to i32 (...)***
-// CHECK-NEXT: store i32 (...)** bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTVN12rdar138169401AE, i64 0, i64 2) to i32 (...)**), i32 (...)*** [[T0]]
+// CHECK-NEXT: store i32 (...)** bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTVN12rdar138169401AE, i32 0, i32 2) to i32 (...)**), i32 (...)*** [[T0]]
 // CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds [[A]], [[A]]* [[THIS]], i32 0, i32 1
 // CHECK-NEXT: [[OTHER:%.*]] = load [[A]]*, [[A]]**
 // CHECK-NEXT: [[T2:%.*]] = getelementptr inbounds [[A]], [[A]]* [[OTHER]], i32 0, i32 1
diff --git a/test/CodeGenCXX/cxx0x-initializer-stdinitializerlist.cpp b/test/CodeGenCXX/cxx0x-initializer-stdinitializerlist.cpp
index 311edaabb5db9..7bab11488ad42 100644
--- a/test/CodeGenCXX/cxx0x-initializer-stdinitializerlist.cpp
+++ b/test/CodeGenCXX/cxx0x-initializer-stdinitializerlist.cpp
@@ -249,7 +249,7 @@ haslist2::haslist2()
 void fn10(int i) {
   // CHECK-LABEL: define void @_Z4fn10i
   // CHECK: alloca [3 x i32]
-  // CHECK: call noalias i8* @_Znw{{[jm]}}
+  // CHECK: call i8* @_Znw{{[jm]}}
   // CHECK: store i32 %
   // CHECK: store i32 2
   // CHECK: store i32 3
diff --git a/test/CodeGenCXX/cxx11-exception-spec.cpp b/test/CodeGenCXX/cxx11-exception-spec.cpp
index a3dff79fc4946..6a3a394e047a0 100644
--- a/test/CodeGenCXX/cxx11-exception-spec.cpp
+++ b/test/CodeGenCXX/cxx11-exception-spec.cpp
@@ -70,37 +70,37 @@ void h() {
 
 // CHECK: define {{.*}} @_Z1iv
 void i() {
-  // CHECK: declare {{.*}} @_Z1gIiEvv() [[NUW]]
+  // CHECK: declare {{.*}} @_Z1gIiEvv() [[NUW2:#[0-9]+]]
   g<int>();
   // CHECK: declare {{.*}} @_Z1gIA2_iEvv()
   // CHECK-NOT: [[NUW]]
   g<int[2]>();
 
-  // CHECK: declare {{.*}} @_ZN1SIiE1gEv() [[NUW]]
+  // CHECK: declare {{.*}} @_ZN1SIiE1gEv() [[NUW2]]
   S<int>::g();
   // CHECK: declare {{.*}} @_ZN1SIA2_iE1gEv()
   // CHECK-NOT: [[NUW]]
   S<int[2]>::g();
 
-  // CHECK: declare {{.*}} @_Z1gIfEvv() [[NUW]]
+  // CHECK: declare {{.*}} @_Z1gIfEvv() [[NUW2]]
   void (*g1)() = &g<float>;
   // CHECK: declare {{.*}} @_Z1gIdEvv()
   // CHECK-NOT: [[NUW]]
   void (*g2)() = &g<double>;
 
-  // CHECK: declare {{.*}} @_ZN1SIfE1gEv() [[NUW]]
+  // CHECK: declare {{.*}} @_ZN1SIfE1gEv() [[NUW2]]
   void (*g3)() = &S<float>::g;
   // CHECK: declare {{.*}} @_ZN1SIdE1gEv()
   // CHECK-NOT: [[NUW]]
   void (*g4)() = &S<double>::g;
 
-  // CHECK: declare {{.*}} @_Z1gIA4_cEvv() [[NUW]]
+  // CHECK: declare {{.*}} @_Z1gIA4_cEvv() [[NUW2]]
   (void)&g<char[4]>;
   // CHECK: declare {{.*}} @_Z1gIcEvv()
   // CHECK-NOT: [[NUW]]
   (void)&g<char>;
 
-  // CHECK: declare {{.*}} @_ZN1SIA4_cE1gEv() [[NUW]]
+  // CHECK: declare {{.*}} @_ZN1SIA4_cE1gEv() [[NUW2]]
   (void)&S<char[4]>::g;
   // CHECK: declare {{.*}} @_ZN1SIcE1gEv()
   // CHECK-NOT: [[NUW]]
@@ -116,12 +116,15 @@ void j() {
   // CHECK: declare {{.*}} @_ZN6NestedIiE1fILb1EcEEvv(
   // CHECK-NOT: [[NUW]]
   Nested<int>().f<true, char>();
-  // CHECK: declare {{.*}} @_ZN6NestedIlE1fILb0ElEEvv({{.*}}) [[NUW]]
+  // CHECK: declare {{.*}} @_ZN6NestedIlE1fILb0ElEEvv({{.*}}) [[NUW2]]
   Nested<long>().f<false, long>();
 }
 
 // CHECK: attributes [[NONE]] = { {{.*}} }
 // CHECK: attributes [[NUW]] = { nounwind{{.*}} }
+// CHECK: attributes [[NUW2]] = { nounwind{{.*}} }
+
+
 
 namespace PR19190 {
 template <class T> struct DWFIterator { virtual void get() throw(int) = 0; };
diff --git a/test/CodeGenCXX/cxx11-initializer-array-new.cpp b/test/CodeGenCXX/cxx11-initializer-array-new.cpp
index c662190ff386b..59f96031fc40a 100644
--- a/test/CodeGenCXX/cxx11-initializer-array-new.cpp
+++ b/test/CodeGenCXX/cxx11-initializer-array-new.cpp
@@ -7,7 +7,7 @@ struct S { S(); S(int); ~S(); int n; };
 void *p = new S[2][3]{ { 1, 2, 3 }, { 4, 5, 6 } };
 
 // CHECK-LABEL: define
-// CHECK: %[[ALLOC:.*]] = call noalias i8* @_Znam(i64 32)
+// CHECK: %[[ALLOC:.*]] = call i8* @_Znam(i64 32)
 // CHECK: %[[COOKIE:.*]] = bitcast i8* %[[ALLOC]] to i64*
 // CHECK: store i64 6, i64* %[[COOKIE]]
 // CHECK: %[[START_AS_i8:.*]] = getelementptr inbounds i8, i8* %[[ALLOC]], i64 8
@@ -50,7 +50,7 @@ void *q = new S[n][3]{ { 1, 2, 3 }, { 4, 5, 6 } };
 // CHECK: call {{.*}} @llvm.umul.with.overflow.i64(i64 %[[N:.*]], i64 12)
 // CHECK: %[[ELTS:.*]] = mul i64 %[[N]], 3
 // CHECK: call {{.*}} @llvm.uadd.with.overflow.i64(i64 %{{.*}}, i64 8)
-// CHECK: %[[ALLOC:.*]] = call noalias i8* @_Znam(i64 %{{.*}})
+// CHECK: %[[ALLOC:.*]] = call i8* @_Znam(i64 %{{.*}})
 //
 // CHECK: %[[COOKIE:.*]] = bitcast i8* %[[ALLOC]] to i64*
 // CHECK: store i64 %[[ELTS]], i64* %[[COOKIE]]
@@ -113,7 +113,7 @@ void *r = new T[n][3]{ { 1, 2, 3 }, { 4, 5, 6 } };
 // No cookie.
 // CHECK-NOT: @llvm.uadd.with.overflow
 //
-// CHECK: %[[ALLOC:.*]] = call noalias i8* @_Znam(i64 %{{.*}})
+// CHECK: %[[ALLOC:.*]] = call i8* @_Znam(i64 %{{.*}})
 //
 // CHECK: %[[START_AS_T:.*]] = bitcast i8* %[[ALLOC]] to %[[T:.*]]*
 //
diff --git a/test/CodeGenCXX/cxx11-thread-local-reference.cpp b/test/CodeGenCXX/cxx11-thread-local-reference.cpp
index 8b2ac5eed800b..4c1e5a70cbaff 100644
--- a/test/CodeGenCXX/cxx11-thread-local-reference.cpp
+++ b/test/CodeGenCXX/cxx11-thread-local-reference.cpp
@@ -21,13 +21,16 @@ int &g() { return r; }
 // DARWIN: call cxx_fast_tlscc i32* @_ZTW1r()
 // CHECK: ret i32* %{{.*}}
 
-// LINUX: define weak_odr hidden i32* @_ZTW1r() {
-// DARWIN: define cxx_fast_tlscc i32* @_ZTW1r() [[ATTR:#[0-9]+]] {
-// CHECK: call void @_ZTH1r()
+// LINUX: define weak_odr hidden i32* @_ZTW1r() [[ATTR0:#[0-9]+]] {
+// DARWIN: define cxx_fast_tlscc i32* @_ZTW1r() [[ATTR1:#[0-9]+]] {
+// LINUX: call void @_ZTH1r()
+// DARWIN: call cxx_fast_tlscc void @_ZTH1r()
 // CHECK: load i32*, i32** @r, align 8
 // CHECK: ret i32* %{{.*}}
 
-// CHECK-LABEL: define internal void @__tls_init()
+// LINUX-LABEL: define internal void @__tls_init()
+// DARWIN-LABEL: define internal cxx_fast_tlscc void @__tls_init()
 // CHECK: call void @[[R_INIT]]()
 
-// DARWIN: attributes [[ATTR]] = { nounwind }
+// LINUX: attributes [[ATTR0]] = { {{.*}}"target-features"{{.*}} }
+// DARWIN: attributes [[ATTR1]] = { {{.*}}nounwind{{.*}}"target-features"{{.*}}  }
diff --git a/test/CodeGenCXX/cxx11-thread-local.cpp b/test/CodeGenCXX/cxx11-thread-local.cpp
index b5bcc5e23ecd0..f465cbdeea847 100644
--- a/test/CodeGenCXX/cxx11-thread-local.cpp
+++ b/test/CodeGenCXX/cxx11-thread-local.cpp
@@ -122,7 +122,8 @@ int f() {
 
 // LINUX-LABEL: define weak_odr hidden i32* @_ZTWN1VIiE1mE()
 // DARWIN-LABEL: define weak_odr hidden cxx_fast_tlscc i32* @_ZTWN1VIiE1mE()
-// CHECK: call void @_ZTHN1VIiE1mE()
+// LINUX: call void @_ZTHN1VIiE1mE()
+// DARWIN: call cxx_fast_tlscc void @_ZTHN1VIiE1mE()
 // CHECK: ret i32* @_ZN1VIiE1mE
 
 
@@ -212,20 +213,25 @@ void set_anon_i() {
 
 // LIUNX: define weak_odr hidden i32* @_ZTW1a() {
 // DARWIN: define cxx_fast_tlscc i32* @_ZTW1a()
-// CHECK:   call void @_ZTH1a()
+// LINUX:   call void @_ZTH1a()
+// DARWIN: call cxx_fast_tlscc void @_ZTH1a()
 // CHECK:   ret i32* @a
 // CHECK: }
 
 
-// LINUX: declare extern_weak void @_ZTH1b()
+// LINUX: declare extern_weak void @_ZTH1b() [[ATTR:#[0-9]+]]
 
 
 // LINUX-LABEL: define internal i32* @_ZTWL1d()
 // DARWIN-LABEL: define internal cxx_fast_tlscc i32* @_ZTWL1d()
-// CHECK: call void @_ZTHL1d()
+// LINUX: call void @_ZTHL1d()
+// DARWIN: call cxx_fast_tlscc void @_ZTHL1d()
 // CHECK: ret i32* @_ZL1d
 
 // LINUX-LABEL: define weak_odr hidden i32* @_ZTWN1U1mE()
 // DARWIN-LABEL: define cxx_fast_tlscc i32* @_ZTWN1U1mE()
-// CHECK: call void @_ZTHN1U1mE()
+// LINUX: call void @_ZTHN1U1mE()
+// DARWIN: call cxx_fast_tlscc void @_ZTHN1U1mE()
 // CHECK: ret i32* @_ZN1U1mE
+
+// LINUX: attributes [[ATTR]] = { {{.+}} }
diff --git a/test/CodeGenCXX/cxx1z-constexpr-if.cpp b/test/CodeGenCXX/cxx1z-constexpr-if.cpp
new file mode 100644
index 0000000000000..80a397f51e9a9
--- /dev/null
+++ b/test/CodeGenCXX/cxx1z-constexpr-if.cpp
@@ -0,0 +1,21 @@
+// RUN: %clang_cc1 -std=c++1z %s -emit-llvm -o - | FileCheck %s --implicit-check-not=should_not_be_used
+
+void should_be_used_1();
+void should_be_used_2();
+void should_not_be_used();
+void f() {
+  if constexpr (false)
+    should_not_be_used();
+  else
+    should_be_used_1();
+
+  if constexpr (true || ({ label: false; }))
+    should_be_used_2();
+  else {
+    goto foo;
+foo: should_not_be_used();
+  }
+}
+
+// CHECK: should_be_used_1
+// CHECK: should_be_used_2
diff --git a/test/CodeGenCXX/cxx1z-init-statement.cpp b/test/CodeGenCXX/cxx1z-init-statement.cpp
new file mode 100644
index 0000000000000..5c05212c72470
--- /dev/null
+++ b/test/CodeGenCXX/cxx1z-init-statement.cpp
@@ -0,0 +1,70 @@
+// RUN: %clang_cc1 -std=c++1z -triple x86_64-apple-macosx10.7.0 -emit-llvm -o - %s -w | FileCheck %s
+
+typedef int T;
+void f() {
+  // CHECK:      %[[A:.*]] = alloca i32, align 4
+  // CHECK-NEXT: store i32 5, i32* %[[A]], align 4
+  // CHECK-NEXT: %[[B:.*]] = load i32, i32* %[[A]], align 4
+  // CHECK-NEXT  %[[C:.*]] = icmp slt i32 %[[B]], 8
+  if (int a = 5; a < 8)
+    ;
+}
+
+void f1() {
+  // CHECK:      %[[A:.*]] = alloca i32, align 4
+  // CHECK-NEXT: %[[B:.*]] = alloca i32, align 4
+  // CHECK-NEXT: %[[C:.*]] = alloca i32, align 4
+  // CHECK-NEXT: store i32 5, i32* %[[B]], align 4
+  // CHECK-NEXT: store i32 7, i32* %[[C]], align 4
+  if (int a, b = 5; int c = 7)
+    ;
+}
+
+int f2() {
+  // CHECK:      %[[A:.*]] = alloca i32, align 4
+  // CHECK-NEXT: %[[B:.*]] = call i32 @_Z2f2v()
+  // CHECK-NEXT: store i32 7, i32* %[[A]], align 4
+  // CHECK-NEXT: %[[C:.*]] = load i32, i32* %[[A]], align 4
+  // CHECK-NEXT: %[[D:.*]] = icmp ne i32 %[[C]], 0
+  if (T{f2()}; int c = 7)
+    ;
+  return 2;
+}
+
+void g() {
+  // CHECK:      %[[A:.*]] = alloca i32, align 4
+  // CHECK-NEXT: store i32 5, i32* %[[A]], align 4
+  // CHECK-NEXT: %[[B:.*]] = load i32, i32* %[[A]], align 4
+  // CHECK-NEXT: switch i32 %[[B]], label %[[C:.*]] [
+  switch (int a = 5; a) {
+    case 0:
+      break;
+  }
+}
+
+void g1() {
+  // CHECK:      %[[A:.*]] = alloca i32, align 4
+  // CHECK-NEXT: %[[B:.*]] = alloca i32, align 4
+  // CHECK-NEXT: %[[C:.*]] = alloca i32, align 4
+  // CHECK-NEXT: store i32 5, i32* %[[B]], align 4
+  // CHECK-NEXT: store i32 7, i32* %[[C]], align 4
+  // CHECK-NEXT: %[[D:.*]] = load i32, i32* %[[C]], align 4
+  // CHECK-NEXT: switch i32 %[[D]], label %[[E:.*]] [
+  switch (int a, b = 5; int c = 7) {
+    case 0:
+      break;
+  }
+}
+
+int g2() {
+  // CHECK:      %[[A:.*]] = alloca i32, align 4
+  // CHECK-NEXT: %[[B:.*]] = call i32 @_Z2f2v()
+  // CHECK-NEXT: store i32 7, i32* %[[A]], align 4
+  // CHECK-NEXT: %[[C:.*]] = load i32, i32* %[[A]], align 4
+  // CHECK-NEXT: switch i32 %[[C]], label %[[E:.*]] [
+  switch (T{f2()}; int c = 7) {
+    case 0:
+      break;
+  }
+  return 2;
+}
diff --git a/test/CodeGenCXX/cxx1z-initializer-aggregate.cpp b/test/CodeGenCXX/cxx1z-initializer-aggregate.cpp
new file mode 100644
index 0000000000000..9110e49f93a1a
--- /dev/null
+++ b/test/CodeGenCXX/cxx1z-initializer-aggregate.cpp
@@ -0,0 +1,114 @@
+// RUN: %clang_cc1 -std=c++1z %s -triple x86_64-linux-gnu -fexceptions -fcxx-exceptions -emit-llvm -o - | FileCheck %s
+
+namespace Constant {
+  struct A {
+    int n;
+    char k;
+    ~A();
+  };
+
+  struct B {
+    char k2;
+  };
+
+  struct C : B {};
+
+  struct D : A, C {};
+
+  C c1 = {};
+  C c2 = {1};
+  // CHECK: @_ZN8Constant2c1E = global { i8 } zeroinitializer, align 1
+  // CHECK: @_ZN8Constant2c2E = global { i8 } { i8 1 }, align 1
+
+  // Test packing bases into tail padding.
+  D d1 = {};
+  D d2 = {1, 2, 3};
+  D d3 = {1};
+  // CHECK: @_ZN8Constant2d1E = global { i32, i8, i8 } zeroinitializer, align 4
+  // CHECK: @_ZN8Constant2d2E = global { i32, i8, i8 } { i32 1, i8 2, i8 3 }, align 4
+  // CHECK: @_ZN8Constant2d3E = global { i32, i8, i8 } { i32 1, i8 0, i8 0 }, align 4
+
+  // CHECK-LABEL: define {{.*}}global_var_init
+  // CHECK: call {{.*}} @__cxa_atexit({{.*}} @_ZN8Constant1DD1Ev {{.*}} @_ZN8Constant2d1E
+
+  // CHECK-LABEL: define {{.*}}global_var_init
+  // CHECK: call {{.*}} @__cxa_atexit({{.*}} @_ZN8Constant1DD1Ev {{.*}} @_ZN8Constant2d2E
+
+  // CHECK-LABEL: define {{.*}}global_var_init
+  // CHECK: call {{.*}} @__cxa_atexit({{.*}} @_ZN8Constant1DD1Ev {{.*}} @_ZN8Constant2d3E
+}
+
+namespace Dynamic {
+  struct A {
+    A();
+    A(int);
+    A(const char*, unsigned);
+    ~A();
+    void *p;
+  };
+
+  struct B {
+    ~B();
+    int n = 5;
+  };
+
+  struct C {
+    C(bool = true);
+  };
+
+  int f(), g(), h(), i();
+  struct D : A, B, C {
+    int n = f();
+  };
+
+  D d1 = {};
+  // CHECK-LABEL: define {{.*}}global_var_init
+  // CHECK: call void @_ZN7Dynamic1AC2Ev({{.*}} @_ZN7Dynamic2d1E
+  // CHECK: store i32 5, {{.*}}i8* getelementptr inbounds {{.*}} @_ZN7Dynamic2d1E{{.*}}, i64 8
+  // CHECK: invoke void @_ZN7Dynamic1CC2Eb({{.*}} @_ZN7Dynamic2d1E{{.*}}, i1 zeroext true)
+  // CHECK:   unwind label %[[UNWIND:.*]]
+  // CHECK: invoke i32 @_ZN7Dynamic1fEv()
+  // CHECK:   unwind label %[[UNWIND:.*]]
+  // CHECK: store i32 {{.*}}, i32* getelementptr {{.*}} @_ZN7Dynamic2d1E, i32 0, i32 2
+  // CHECK: call {{.*}} @__cxa_atexit({{.*}} @_ZN7Dynamic1DD1Ev {{.*}} @_ZN7Dynamic2d1E
+  // CHECK: ret
+  //
+  //   UNWIND:
+  // CHECK: call void @_ZN7Dynamic1BD1Ev({{.*}}i8* getelementptr inbounds {{.*}} @_ZN7Dynamic2d1E{{.*}}, i64 8
+  // CHECK: call void @_ZN7Dynamic1AD1Ev({{.*}} @_ZN7Dynamic2d1E
+
+  D d2 = {1, 2, false};
+  // CHECK-LABEL: define {{.*}}global_var_init
+  // CHECK: call void @_ZN7Dynamic1AC1Ei({{.*}} @_ZN7Dynamic2d2E{{.*}}, i32 1)
+  // CHECK: store i32 2, {{.*}}i8* getelementptr inbounds {{.*}}@_ZN7Dynamic2d2E{{.*}}, i64 8
+  // CHECK: invoke void @_ZN7Dynamic1CC1Eb({{.*}} @_ZN7Dynamic2d2E{{.*}}, i1 zeroext false)
+  // CHECK: invoke i32 @_ZN7Dynamic1fEv()
+  // CHECK: store i32 {{.*}}, i32* getelementptr {{.*}} @_ZN7Dynamic2d2E, i32 0, i32 2
+  // CHECK: call {{.*}} @__cxa_atexit({{.*}} @_ZN7Dynamic1DD1Ev {{.*}} @_ZN7Dynamic2d2E
+  // CHECK: ret void
+
+  D d3 = {g(), h(), {}, i()};
+  // CHECK-LABEL: define {{.*}}global_var_init
+  // CHECK: %[[G_CALL:.*]] = call i32 @_ZN7Dynamic1gEv()
+  // CHECK: call void @_ZN7Dynamic1AC1Ei({{.*}} @_ZN7Dynamic2d3E{{.*}}, i32 %[[G_CALL]])
+  // CHECK: %[[H_CALL:.*]] = invoke i32 @_ZN7Dynamic1hEv()
+  // CHECK:   unwind label %[[DESTROY_A_LPAD:.*]]
+  // CHECK: store i32 %[[H_CALL]], {{.*}}i8* getelementptr inbounds {{.*}} @_ZN7Dynamic2d3E{{.*}}, i64 8
+  // CHECK: invoke void @_ZN7Dynamic1CC2Eb({{.*}} @_ZN7Dynamic2d3E{{.*}}, i1 zeroext true)
+  // CHECK:   unwind label %[[DESTROY_AB_LPAD:.*]]
+  // CHECK: %[[I_CALL:.*]] = invoke i32 @_ZN7Dynamic1iEv()
+  // CHECK:   unwind label %[[DESTROY_AB_LPAD:.*]]
+  // CHECK: store i32 %[[I_CALL]], i32* getelementptr {{.*}} @_ZN7Dynamic2d3E, i32 0, i32 2
+  // CHECK: call {{.*}} @__cxa_atexit({{.*}} @_ZN7Dynamic1DD1Ev {{.*}} @_ZN7Dynamic2d3E to i8*
+  // CHECK: ret
+  //
+  //   DESTROY_A_LPAD:
+  // CHECK: br label %[[A_CLEANUP:.*]]
+  //
+  //   DESTROY_B_LPAD:
+  // CHECK: call void @_ZN7Dynamic1BD1Ev({{.*}}i8* getelementptr inbounds {{.*}} @_ZN7Dynamic2d3E{{.*}}, i64 8
+  // CHECK: br label %[[A_CLEANUP:.*]]
+  //
+  //   A_CLEANUP:
+  // CHECK: call void @_ZN7Dynamic1AD1Ev({{.*}} @_ZN7Dynamic2d3E
+}
diff --git a/test/CodeGenCXX/cxx1z-inline-variables.cpp b/test/CodeGenCXX/cxx1z-inline-variables.cpp
new file mode 100644
index 0000000000000..183709373d12a
--- /dev/null
+++ b/test/CodeGenCXX/cxx1z-inline-variables.cpp
@@ -0,0 +1,83 @@
+// RUN: %clang_cc1 -std=c++1z %s -emit-llvm -o - -triple x86_64-linux-gnu | FileCheck %s
+
+struct Q {
+  // CHECK: @_ZN1Q1kE = linkonce_odr constant i32 5, comdat
+  static constexpr int k = 5;
+};
+const int &r = Q::k;
+
+int f();
+
+// const does not imply internal linkage.
+// CHECK: @external_inline = linkonce_odr constant i32 5, comdat
+inline const int external_inline = 5;
+const int &use1 = external_inline;
+
+// static still does, though.
+// CHECK: @_ZL15internal_inline = internal constant i32 5
+static inline const int internal_inline = 5;
+const int &use2 = internal_inline;
+
+int a = f();
+// CHECK: @b = linkonce_odr global i32 0, comdat
+// CHECK: @_ZGV1b = linkonce_odr global i64 0, comdat($b)
+inline int b = f();
+int c = f();
+
+// For compatibility with C++11 and C++14, an out-of-line declaration of a
+// static constexpr local variable promotes the variable to weak_odr.
+struct compat {
+  static constexpr int a = 1;
+  static constexpr int b = 2;
+  static constexpr int c = 3;
+  static inline constexpr int d = 4;
+};
+const int &compat_use_before_redecl = compat::b;
+const int compat::a;
+const int compat::b;
+const int compat::c;
+const int compat::d;
+const int &compat_use_after_redecl1 = compat::c;
+const int &compat_use_after_redecl2 = compat::d;
+// CHECK: @_ZN6compat1bE = weak_odr constant i32 2
+// CHECK: @_ZN6compat1aE = weak_odr constant i32 1
+// CHECK: @_ZN6compat1cE = weak_odr constant i32 3
+// CHECK: @_ZN6compat1dE = linkonce_odr constant i32 4
+
+template<typename T> struct X {
+  static int a;
+  static inline int b;
+  static int c;
+};
+// CHECK: @_ZN1XIiE1aE = linkonce_odr global i32 10
+// CHECK: @_ZN1XIiE1bE = global i32 20
+// CHECK-NOT: @_ZN1XIiE1cE
+template<> inline int X<int>::a = 10;
+int &use3 = X<int>::a;
+template<> int X<int>::b = 20;
+template<> inline int X<int>::c = 30;
+
+// CHECK-LABEL: define {{.*}}global_var_init
+// CHECK: call i32 @_Z1fv
+
+// CHECK-LABEL: define {{.*}}global_var_init
+// CHECK-NOT: comdat
+// CHECK-SAME: {{$}}
+// CHECK: load atomic {{.*}} acquire
+// CHECK: br
+// CHECK: __cxa_guard_acquire(i64* @_ZGV1b)
+// CHECK: br
+// CHECK: call i32 @_Z1fv
+// CHECK: __cxa_guard_release(i64* @_ZGV1b)
+
+// CHECK-LABEL: define {{.*}}global_var_init
+// CHECK: call i32 @_Z1fv
+
+template<typename T> inline int d = f();
+int e = d<int>;
+
+// CHECK-LABEL: define {{.*}}global_var_init{{.*}}comdat
+// CHECK: _ZGV1dIiE
+// CHECK-NOT: __cxa_guard_acquire(i64* @_ZGV1b)
+// CHECK: call i32 @_Z1fv
+// CHECK-NOT: __cxa_guard_release(i64* @_ZGV1b)
diff --git a/test/CodeGenCXX/cxx1z-lambda-star-this.cpp b/test/CodeGenCXX/cxx1z-lambda-star-this.cpp
new file mode 100644
index 0000000000000..a7e4aadbd573f
--- /dev/null
+++ b/test/CodeGenCXX/cxx1z-lambda-star-this.cpp
@@ -0,0 +1,31 @@
+// RUN: %clang_cc1 -std=c++1y -triple i686-pc-windows-msvc -emit-llvm %s -o - | FileCheck %s
+//CHECK: %[[A_LAMBDA:.*]] = type { %struct.A }
+//CHECK: %[[B_LAMBDA:.*]] = type { %struct.B* }
+struct A {
+  double a = 111;
+  auto foo() { return [*this] { return a; }; }
+};
+
+namespace ns1 {
+int X = A{}.foo()();
+} //end ns1
+
+//CHECK: @"\01?foo@A@@QAE?A?<auto>@@XZ"(%struct.A* %this, %class.anon* noalias sret %[[A_LAMBDA_RETVAL:.*]])
+// get the first object with the closure type, which is of type 'struct.A'
+//CHECK: %[[I0:.+]] = getelementptr inbounds %[[A_LAMBDA]], %[[A_LAMBDA]]* %[[A_LAMBDA_RETVAL]], i32 0, i32 0
+//CHECK: %[[I1:.+]] = bitcast %struct.A* %[[I0]] to i8*
+//CHECK: %[[I2:.+]] = bitcast %struct.A* %this1 to i8*
+// copy the contents ...
+//CHECK: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[I1]], i8* %[[I2]], i32 8, i32 8, i1 false)
+
+struct B {
+  double b = 222;
+  auto bar() { return [this] { return b; }; };
+};
+
+namespace ns2 {
+int X = B{}.bar()();
+}
+//CHECK: @"\01?bar@B@@QAE?A?<auto>@@XZ"(%struct.B* %this, %class.anon.0* noalias sret %agg.result)
+//CHECK: %[[I20:.+]] = getelementptr inbounds %class.anon.0, %class.anon.0* %agg.result, i32 0, i32 0
+//CHECK: store %struct.B* %this1, %struct.B** %[[I20]], align 4
diff --git a/test/CodeGenCXX/debug-info-access.cpp b/test/CodeGenCXX/debug-info-access.cpp
index 1699bab961047..41b7f71fcecbb 100644
--- a/test/CodeGenCXX/debug-info-access.cpp
+++ b/test/CodeGenCXX/debug-info-access.cpp
@@ -1,13 +1,16 @@
 // RUN: %clang_cc1 -emit-llvm -debug-info-kind=limited -triple %itanium_abi_triple %s -o - | FileCheck %s
 // Test the various accessibility flags in the debug info.
 struct A {
+  // CHECK: ![[A:[0-9]+]] = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "A",
+
   // CHECK-DAG: !DISubprogram(name: "pub_default",{{.*}} line: [[@LINE+1]],{{.*}} flags: DIFlagPrototyped,
   void pub_default();
   // CHECK-DAG: !DIDerivedType(tag: DW_TAG_member, name: "pub_default_static",{{.*}} line: [[@LINE+1]],{{.*}} flags: DIFlagStaticMember)
   static int pub_default_static;
 };
 
-// CHECK: !DIDerivedType(tag: DW_TAG_inheritance,{{.*}} baseType: !"_ZTS1A",{{.*}} flags: DIFlagPublic)
+
+// CHECK: !DIDerivedType(tag: DW_TAG_inheritance,{{.*}} baseType: ![[A]],{{.*}} flags: DIFlagPublic)
 class B : public A {
 public:
   // CHECK-DAG: !DISubprogram(name: "pub",{{.*}} line: [[@LINE+1]],{{.*}} flags: DIFlagPublic | DIFlagPrototyped,
diff --git a/test/CodeGenCXX/debug-info-anon-union-vars.cpp b/test/CodeGenCXX/debug-info-anon-union-vars.cpp
index 5b0370eb749d8..b844d429447b7 100644
--- a/test/CodeGenCXX/debug-info-anon-union-vars.cpp
+++ b/test/CodeGenCXX/debug-info-anon-union-vars.cpp
@@ -44,8 +44,8 @@ void instantiate(int x) {
   buildBytes(x);
 }
 
-// CHECK: [[FILE:.*]] = !DIFile(filename: "{{.*}}debug-info-anon-union-vars.cpp",
-// CHECK: !DIGlobalVariable(name: "c",{{.*}} file: [[FILE]], line: 6,{{.*}} isLocal: true, isDefinition: true
+// CHECK: !DIGlobalVariable(name: "c",{{.*}} file: [[FILE:.*]], line: 6,{{.*}} isLocal: true, isDefinition: true
+// CHECK: [[FILE]] = !DIFile(filename: "{{.*}}debug-info-anon-union-vars.cpp",
 // CHECK: !DIGlobalVariable(name: "d",{{.*}} file: [[FILE]], line: 6,{{.*}} isLocal: true, isDefinition: true
 // CHECK: !DIGlobalVariable(name: "a",{{.*}} file: [[FILE]], line: 6,{{.*}} isLocal: true, isDefinition: true
 // CHECK: !DIGlobalVariable(name: "b",{{.*}} file: [[FILE]], line: 6,{{.*}} isLocal: true, isDefinition: true
@@ -56,7 +56,7 @@ void instantiate(int x) {
 // CHECK: !DILocalVariable(
 // CHECK-NOT: name:
 // CHECK: type: ![[UNION:[0-9]+]]
-// CHECK: ![[UNION]] = !DICompositeType(tag: DW_TAG_union_type,
+// CHECK: ![[UNION]] = distinct !DICompositeType(tag: DW_TAG_union_type,
 // CHECK-NOT: name:
 // CHECK: elements
 // CHECK: !DIDerivedType(tag: DW_TAG_member, name: "i", scope: ![[UNION]],
diff --git a/test/CodeGenCXX/debug-info-artificial-arg.cpp b/test/CodeGenCXX/debug-info-artificial-arg.cpp
index c840df672aa02..a0cf131f83e15 100644
--- a/test/CodeGenCXX/debug-info-artificial-arg.cpp
+++ b/test/CodeGenCXX/debug-info-artificial-arg.cpp
@@ -22,11 +22,10 @@ int main(int argc, char **argv) {
   A reallyA (500);
 }
 
-// CHECK: ![[CLASSTYPE:.*]] = !DICompositeType(tag: DW_TAG_class_type, name: "A",
+// CHECK: ![[CLASSTYPE:.*]] = distinct !DICompositeType(tag: DW_TAG_class_type, name: "A",
 // CHECK-SAME:                                 identifier: "_ZTS1A"
-// CHECK: ![[ARTARG:.*]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !"_ZTS1A",
-// CHECK-SAME:                            DIFlagArtificial
-// CHECK: !DISubprogram(name: "A", scope: !"_ZTS1A"
+// CHECK: ![[ARTARG:.*]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: ![[CLASSTYPE]],{{.*}} DIFlagArtificial
+// CHECK: !DISubprogram(name: "A", scope: ![[CLASSTYPE]]
 // CHECK-SAME:          line: 12
 // CHECK-SAME:          DIFlagPublic
 // CHECK: !DISubroutineType(types: [[FUNCTYPE:![0-9]*]])
diff --git a/test/CodeGenCXX/debug-info-calling-conventions.cpp b/test/CodeGenCXX/debug-info-calling-conventions.cpp
new file mode 100644
index 0000000000000..51d801e35ef68
--- /dev/null
+++ b/test/CodeGenCXX/debug-info-calling-conventions.cpp
@@ -0,0 +1,26 @@
+// RUN: %clang_cc1 %s -triple=i686-pc-windows-msvc -debug-info-kind=limited -emit-llvm -o - | FileCheck %s
+
+struct A {
+  void thiscallcc();
+};
+void A::thiscallcc() {}
+
+// CHECK: !DISubprogram(name: "thiscallcc", {{.*}} type: ![[thiscallty:[^,]*]], {{.*}})
+// CHECK: ![[thiscallty]] = !DISubroutineType(cc: DW_CC_BORLAND_thiscall, types: ![[thisargs:[^,)]*]])
+// CHECK: ![[thisargs]] = !{null, ![[thisptrty:[^,}]*]]}
+// CHECK: ![[thisptrty]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !{{.*}}, size: 32, align: 32, flags: DIFlagArtificial | DIFlagObjectPointer)
+
+void cdeclcc() {}
+void __fastcall fastcallcc() {}
+void __stdcall stdcallcc() {}
+void __vectorcall vectorcallcc() {}
+
+// CHECK: !DISubprogram(name: "cdeclcc", {{.*}} type: ![[cdeclty:[^,]*]], {{.*}})
+// CHECK: ![[cdeclty]] = !DISubroutineType(types: ![[noargs:[^,)]*]])
+// CHECK: ![[noargs]] = !{null}
+// CHECK: !DISubprogram(name: "fastcallcc", {{.*}} type: ![[fastcallty:[^,]*]], {{.*}})
+// CHECK: ![[fastcallty]] = !DISubroutineType(cc: DW_CC_BORLAND_msfastcall, types: ![[noargs]])
+// CHECK: !DISubprogram(name: "stdcallcc", {{.*}} type: ![[stdcallty:[^,]*]], {{.*}})
+// CHECK: ![[stdcallty]] = !DISubroutineType(cc: DW_CC_BORLAND_stdcall, types: ![[noargs]])
+// CHECK: !DISubprogram(name: "vectorcallcc", {{.*}} type: ![[vectorcallty:[^,]*]], {{.*}})
+// CHECK: ![[vectorcallty]] = !DISubroutineType(cc: DW_CC_LLVM_vectorcall, types: ![[noargs]])
diff --git a/test/CodeGenCXX/debug-info-class-limited-plugin.test b/test/CodeGenCXX/debug-info-class-limited-plugin.test
index 61d258d9ffc49..533c2f6b16c54 100644
--- a/test/CodeGenCXX/debug-info-class-limited-plugin.test
+++ b/test/CodeGenCXX/debug-info-class-limited-plugin.test
@@ -1,2 +1,2 @@
-RUN: %clang_cc1 -emit-llvm -fno-standalone-debug -g -o - -load %llvmshlibdir/PrintFunctionNames%pluginext -add-plugin print-function-names %S/Inputs/debug-info-class-limited.cpp 2>&1 | FileCheck %S/Inputs/debug-info-class-limited.cpp
+RUN: %clang_cc1 -emit-llvm -debug-info-kind=limited -o - -load %llvmshlibdir/PrintFunctionNames%pluginext -add-plugin print-function-names %S/Inputs/debug-info-class-limited.cpp 2>&1 | FileCheck %S/Inputs/debug-info-class-limited.cpp
 REQUIRES: plugins, examples
diff --git a/test/CodeGenCXX/debug-info-class-limited.test b/test/CodeGenCXX/debug-info-class-limited.test
index 0b10728f3c979..c2e332866f9fa 100644
--- a/test/CodeGenCXX/debug-info-class-limited.test
+++ b/test/CodeGenCXX/debug-info-class-limited.test
@@ -1 +1 @@
-RUN: %clang_cc1 -emit-llvm -fno-standalone-debug -g %S/Inputs/debug-info-class-limited.cpp -o - | FileCheck %S/Inputs/debug-info-class-limited.cpp
+RUN: %clang_cc1 -emit-llvm -debug-info-kind=limited %S/Inputs/debug-info-class-limited.cpp -o - | FileCheck %S/Inputs/debug-info-class-limited.cpp
diff --git a/test/CodeGenCXX/debug-info-class.cpp b/test/CodeGenCXX/debug-info-class.cpp
index a63efe5d780a3..d03c0845dd6b5 100644
--- a/test/CodeGenCXX/debug-info-class.cpp
+++ b/test/CodeGenCXX/debug-info-class.cpp
@@ -83,13 +83,22 @@ int main(int argc, char **argv) {
   return 0;
 }
 
-// RUN: %clang -target x86_64-unknown_unknown -emit-llvm -g -S %s -o - | FileCheck %s
-// RUN: %clang -target i686-cygwin -emit-llvm -g -S %s -o - | FileCheck %s
-// RUN: %clang -target armv7l-unknown-linux-gnueabihf -emit-llvm -g -S %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-unknown_unknown -emit-llvm -debug-info-kind=limited -fexceptions %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple i686-cygwin -emit-llvm -debug-info-kind=limited -fexceptions %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple armv7l-unknown-linux-gnueabihf -emit-llvm -debug-info-kind=limited -fexceptions %s -o - | FileCheck %s
 
 // CHECK: invoke {{.+}} @_ZN1BD1Ev(%class.B* %b)
 // CHECK-NEXT: unwind label %{{.+}}, !dbg ![[EXCEPTLOC:.*]]
 // CHECK: store i32 0, i32* %{{.+}}, !dbg ![[RETLOC:.*]]
+
+// CHECK: [[F:![0-9]*]] = !DICompositeType(tag: DW_TAG_structure_type, name: "F"
+// CHECK-SAME:                             DIFlagFwdDecl
+// CHECK-SAME:                             identifier: "_ZTS1F"
+// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "I"
+// CHECK-NOT:              DIFlagFwdDecl
+// CHECK-SAME:             ){{$}}
+
+// CHECK: ![[INT:[0-9]+]] = !DIBasicType(name: "int"
 // CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "foo"
 // CHECK: !DICompositeType(tag: DW_TAG_class_type, name: "bar"
 // CHECK: !DICompositeType(tag: DW_TAG_union_type, name: "baz"
@@ -99,12 +108,10 @@ int main(int argc, char **argv) {
 // CHECK: !DIDerivedType(tag: DW_TAG_member, name: "_vptr$B",
 // CHECK-SAME:           DIFlagArtificial
 
-// CHECK: ![[INT:[0-9]+]] = !DIBasicType(name: "int"
-
-// CHECK: [[C:![0-9]*]] = !DICompositeType(tag: DW_TAG_structure_type, name: "C",
+// CHECK: [[C:![0-9]*]] = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "C",
 // CHECK-NOT:                              DIFlagFwdDecl
 // CHECK-SAME:                             elements: [[C_MEM:![0-9]*]]
-// CHECK-SAME:                             vtableHolder: !"_ZTS1C"
+// CHECK-SAME:                             vtableHolder: [[C]]
 // CHECK-SAME:                             identifier: "_ZTS1C"
 // CHECK: [[C_MEM]] = !{[[C_VPTR:![0-9]*]], [[C_S:![0-9]*]], [[C_DTOR:![0-9]*]]}
 // CHECK: [[C_VPTR]] = !DIDerivedType(tag: DW_TAG_member, name: "_vptr$C"
@@ -114,39 +121,33 @@ int main(int argc, char **argv) {
 // CHECK-SAME:                     DIFlagStaticMember
 // CHECK: [[C_DTOR]] = !DISubprogram(name: "~C"
 
-// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "D"
+// CHECK: [[D:![0-9]+]] = !DICompositeType(tag: DW_TAG_structure_type, name: "D"
 // CHECK-SAME:             DIFlagFwdDecl
 // CHECK-SAME:             identifier: "_ZTS1D"
 // CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "E"
 // CHECK-SAME:             DIFlagFwdDecl
 // CHECK-SAME:             identifier: "_ZTS1E"
-// CHECK: [[F:![0-9]*]] = !DICompositeType(tag: DW_TAG_structure_type, name: "F"
-// CHECK-SAME:                             DIFlagFwdDecl
-// CHECK-SAME:                             identifier: "_ZTS1F"
 
-// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "G"
-// CHECK-SAME:             DIFlagFwdDecl
-// CHECK-SAME:             identifier: "_ZTS1G"
-// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "inner"
-// CHECK: line: 50
+// CHECK: !DISubprogram(name: "func",{{.*}} scope: [[D]]
+// CHECK-SAME:          isDefinition: true
+// CHECK-SAME:          declaration: [[D_FUNC_DECL:![0-9]*]]
+// CHECK: [[D_FUNC_DECL]] = !DISubprogram(name: "func",{{.*}} scope: [[D]]
+// CHECK-SAME:                            isDefinition: false
+
+// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "inner",{{.*}} line: 50
 // CHECK-NOT: DIFlagFwdDecl
 // CHECK-SAME: elements: [[G_INNER_MEM:![0-9]*]]
 // CHECK-SAME: identifier: "_ZTSN1G5innerE"
+
+// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "G"
+// CHECK-SAME:             DIFlagFwdDecl
+// CHECK-SAME:             identifier: "_ZTS1G"
 // CHECK: [[G_INNER_MEM]] = !{[[G_INNER_I:![0-9]*]]}
 // CHECK: [[G_INNER_I]] = !DIDerivedType(tag: DW_TAG_member, name: "j"
 // CHECK-SAME:                           baseType: ![[INT]]
 
 // CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "A"
 // CHECK: !DIDerivedType(tag: DW_TAG_member, name: "HdrSize"
-// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "I"
-// CHECK-NOT:              DIFlagFwdDecl
-// CHECK-SAME:             ){{$}}
 //
-// CHECK: !DISubprogram(name: "func",{{.*}} scope: !"_ZTS1D"
-// CHECK-SAME:          isDefinition: true
-// CHECK-SAME:          declaration: [[D_FUNC_DECL:![0-9]*]]
-// CHECK: [[D_FUNC_DECL]] = !DISubprogram(name: "func",{{.*}} scope: !"_ZTS1D"
-// CHECK-SAME:                            isDefinition: false
-
 // CHECK: ![[EXCEPTLOC]] = !DILocation(line: 84,
 // CHECK: ![[RETLOC]] = !DILocation(line: 83,
diff --git a/test/CodeGenCXX/debug-info-codeview-display-name.cpp b/test/CodeGenCXX/debug-info-codeview-display-name.cpp
index 1d0300c76c01b..b1b5a1e9acb8e 100644
--- a/test/CodeGenCXX/debug-info-codeview-display-name.cpp
+++ b/test/CodeGenCXX/debug-info-codeview-display-name.cpp
@@ -1,14 +1,22 @@
-// RUN: %clang_cc1 -fblocks -debug-info-kind=limited -gcodeview -emit-llvm %s -o - -triple=x86_64-pc-win32 -std=c++98 | \
-// RUN:  grep 'DISubprogram' | sed -e 's/.*name: "\([^"]*\)".*/"\1"/' | FileCheck %s
+// RUN: %clang_cc1 -fblocks -debug-info-kind=limited -gcodeview -emit-llvm %s \
+// RUN:       -o - -triple=x86_64-pc-win32 -std=c++98 | \
+// RUN:    grep 'DISubprogram' | sed -e 's/.*name: "\([^"]*\)".*/"\1"/' | \
+// RUN:    FileCheck %s --check-prefix=CHECK --check-prefix=UNQUAL
+// RUN: %clang_cc1 -fblocks -debug-info-kind=line-tables-only -gcodeview -emit-llvm %s \
+// RUN:       -o - -triple=x86_64-pc-win32 -std=c++98 | \
+// RUN:    grep 'DISubprogram' | sed -e 's/.*name: "\([^"]*\)".*/"\1"/' | \
+// RUN:    FileCheck %s --check-prefix=CHECK --check-prefix=QUAL
 
 void freefunc() { }
 // CHECK-DAG: "freefunc"
 
 namespace N {
   int b() { return 0; }
-// CHECK-DAG: "N::b"
+// UNQUAL-DAG: "b"
+// QUAL-DAG: "N::b"
   namespace { void func() { } }
-// CHECK-DAG: "N::`anonymous namespace'::func
+// UNQUAL-DAG: "func"
+// QUAL-DAG: "N::`anonymous namespace'::func"
 }
 
 void _c(void) {
@@ -19,19 +27,24 @@ void _c(void) {
 struct foo {
   int operator+(int);
   foo(){}
-// CHECK-DAG: "foo::foo"
+// UNQUAL-DAG: "foo"
+// QUAL-DAG: "foo::foo"
 
   ~foo(){}
-// CHECK-DAG: "foo::~foo"
+// UNQUAL-DAG: "~foo"
+// QUAL-DAG: "foo::~foo"
 
   foo(int i){}
-// CHECK-DAG: "foo::foo"
+// UNQUAL-DAG: "foo"
+// QUAL-DAG: "foo::foo"
 
   foo(char *q){}
-// CHECK-DAG: "foo::foo"
+// UNQUAL-DAG: "foo"
+// QUAL-DAG: "foo::foo"
 
   static foo* static_method() { return 0; }
-// CHECK-DAG: "foo::static_method"
+// UNQUAL-DAG: "static_method"
+// QUAL-DAG: "foo::static_method"
 
 };
 
@@ -40,7 +53,8 @@ void use_foo() {
   foo::static_method();
 }
 
-// CHECK-DAG: "foo::operator+"
+// UNQUAL-DAG: "operator+"
+// QUAL-DAG: "foo::operator+"
 int foo::operator+(int a) { return a; }
 
 // PR17371
@@ -60,14 +74,20 @@ void OverloadedNewDelete::operator delete(void *) { }
 void OverloadedNewDelete::operator delete[](void *) { }
 int OverloadedNewDelete::operator+(int x) { return x; };
 
-// CHECK-DAG: "OverloadedNewDelete::operator new"
-// CHECK-DAG: "OverloadedNewDelete::operator new[]"
-// CHECK-DAG: "OverloadedNewDelete::operator delete"
-// CHECK-DAG: "OverloadedNewDelete::operator delete[]"
-// CHECK-DAG: "OverloadedNewDelete::operator+"
+// UNQUAL-DAG: "operator new"
+// UNQUAL-DAG: "operator new[]"
+// UNQUAL-DAG: "operator delete"
+// UNQUAL-DAG: "operator delete[]"
+// UNQUAL-DAG: "operator+"
+// QUAL-DAG: "OverloadedNewDelete::operator new"
+// QUAL-DAG: "OverloadedNewDelete::operator new[]"
+// QUAL-DAG: "OverloadedNewDelete::operator delete"
+// QUAL-DAG: "OverloadedNewDelete::operator delete[]"
+// QUAL-DAG: "OverloadedNewDelete::operator+"
 
-template <void (*)(void)>
+
+template <typename T, void (*)(void)>
 void fn_tmpl() {}
 
-template void fn_tmpl<freefunc>();
-// CHECK-DAG: "fn_tmpl"
+template void fn_tmpl<int, freefunc>();
+// CHECK-DAG: "fn_tmpl<int,&freefunc>"
diff --git a/test/CodeGenCXX/debug-info-cxx1y.cpp b/test/CodeGenCXX/debug-info-cxx1y.cpp
index 37f95959911cd..faf29d3ed11b4 100644
--- a/test/CodeGenCXX/debug-info-cxx1y.cpp
+++ b/test/CodeGenCXX/debug-info-cxx1y.cpp
@@ -1,10 +1,10 @@
 // RUN: %clang_cc1 -triple %itanium_abi_triple -emit-llvm-only -std=c++14 -emit-llvm -debug-info-kind=limited %s -o - | FileCheck %s
 
 // CHECK: [[EMPTY:![0-9]*]] = !{}
-// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "foo",
+// CHECK: [[FOO:![0-9]+]] = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "foo",
 // CHECK-SAME:             elements: [[EMPTY]]
 // FIXME: The context of this definition should be the CU/file scope, not the class.
-// CHECK: !DISubprogram(name: "func", {{.*}} scope: !"_ZTS3foo"
+// CHECK: !DISubprogram(name: "func", {{.*}} scope: [[FOO]]
 // CHECK-SAME:          type: [[SUBROUTINE_TYPE:![0-9]*]]
 // CHECK-SAME:          isDefinition: true
 // CHECK-SAME:          declaration: [[FUNC_DECL:![0-9]*]]
@@ -12,7 +12,7 @@
 // CHECK: [[TYPE_LIST]] = !{[[INT:![0-9]*]]}
 // CHECK: [[INT]] = !DIBasicType(name: "int"
 // CHECK: [[FUNC_DECL]] = !DISubprogram(name: "func",
-// CHECK-SAME:                          scope: !"_ZTS3foo"
+// CHECK-SAME:                          scope: [[FOO]]
 // CHECK-SAME:                          type: [[SUBROUTINE_TYPE]]
 // CHECK-SAME:                          isDefinition: false
 
diff --git a/test/CodeGenCXX/debug-info-enum-class.cpp b/test/CodeGenCXX/debug-info-enum-class.cpp
index 71e6e2b2574e2..44daf412d3ac6 100644
--- a/test/CodeGenCXX/debug-info-enum-class.cpp
+++ b/test/CodeGenCXX/debug-info-enum-class.cpp
@@ -4,6 +4,7 @@ enum class A { A1=1 };                 // underlying type is int by default
 enum class B: unsigned long { B1=1 };  // underlying type is unsigned long
 enum C { C1 = 1 };
 enum D : short; // enum forward declaration
+enum Z : int;
 A a;
 B b;
 C c;
@@ -94,12 +95,17 @@ void f2(E) {
 // CHECK-NOT:              offset:
 // CHECK-SAME:             flags: DIFlagFwdDecl
 
+// CHECK: !DICompositeType(tag: DW_TAG_enumeration_type, name: "Z"
+// CHECK-NOT:              scope:
+// CHECK-SAME:             flags: DIFlagFwdDecl
+void fz() { Z z; }
+
 namespace test5 {
+// CHECK: [[TEST5:![0-9]+]] = !DINamespace(name: "test5"
 // CHECK: !DICompositeType(tag: DW_TAG_enumeration_type, name: "E"
-// CHECK-SAME:             scope: [[TEST5:![0-9]+]]
+// CHECK-SAME:             scope: [[TEST5]]
 // CHECK-SAME:             flags: DIFlagFwdDecl
 // CHECK-SAME:             identifier: "_ZTSN5test51EE"
-// CHECK: [[TEST5]] = !DINamespace(name: "test5"
 enum E : int;
 void f1(E *) {
 }
diff --git a/test/CodeGenCXX/debug-info-function-context.cpp b/test/CodeGenCXX/debug-info-function-context.cpp
index 24f9f1bda639e..1db62d96d5b6c 100644
--- a/test/CodeGenCXX/debug-info-function-context.cpp
+++ b/test/CodeGenCXX/debug-info-function-context.cpp
@@ -26,11 +26,12 @@ int global_namespace_variable = 1;
 // function has the file as a context.
 
 // CHECK: ![[FILE:[0-9]+]] = !DIFile(filename: "{{.*}}context.cpp",
-// CHECK: !DISubprogram(name: "member_function",{{.*}} scope: !"_ZTS1C",{{.*}} isDefinition: true
+// CHECK: ![[C:[0-9]+]] = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "C",
+// CHECK: ![[NS:.*]] = !DINamespace(name: "ns"
+// CHECK: !DISubprogram(name: "member_function",{{.*}} scope: ![[C]],{{.*}} isDefinition: true
 
-// CHECK: !DISubprogram(name: "static_member_function",{{.*}} scope: !"_ZTS1C",{{.*}} isDefinition: true
+// CHECK: !DISubprogram(name: "static_member_function",{{.*}} scope: ![[C]],{{.*}} isDefinition: true
 
 // CHECK: !DISubprogram(name: "global_function",{{.*}} scope: ![[FILE]],{{.*}} isDefinition: true
 
-// CHECK: !DISubprogram(name: "global_namespace_function",{{.*}} scope: ![[NS:[0-9]+]],{{.*}} isDefinition: true
-// CHECK: ![[NS]] = !DINamespace(name: "ns"
+// CHECK: !DISubprogram(name: "global_namespace_function",{{.*}} scope: ![[NS]],{{.*}} isDefinition: true
diff --git a/test/CodeGenCXX/debug-info-global.cpp b/test/CodeGenCXX/debug-info-global.cpp
index 920db82409bc2..795602380d20f 100644
--- a/test/CodeGenCXX/debug-info-global.cpp
+++ b/test/CodeGenCXX/debug-info-global.cpp
@@ -15,7 +15,7 @@ int f1() {
 
 // CHECK: [[GLOBALS]] = !{[[CNST:![0-9]*]]}
 
-// CHECK: [[CNST]] = !DIGlobalVariable(name: "cnst",
-// CHECK-SAME:                         scope: [[NS:![0-9]*]]
+// CHECK: [[CNST]] = distinct !DIGlobalVariable(name: "cnst",
+// CHECK-SAME:                                  scope: [[NS:![0-9]*]]
 // CHECK: [[NS]] = !DINamespace(name: "ns"
 
diff --git a/test/CodeGenCXX/debug-info-indirect-field-decl.cpp b/test/CodeGenCXX/debug-info-indirect-field-decl.cpp
index 19f8d01e1ab6c..70b233cac4dcb 100644
--- a/test/CodeGenCXX/debug-info-indirect-field-decl.cpp
+++ b/test/CodeGenCXX/debug-info-indirect-field-decl.cpp
@@ -9,9 +9,10 @@ struct Bar {
   int i1;
   // CHECK: ![[INT:[0-9]+]] = !DIBasicType(name: "int"
   // CHECK: !DIDerivedType(tag: DW_TAG_member, scope:
-  // CHECK-SAME:           line: [[@LINE+3]]
-  // CHECK-SAME:           baseType: !"_ZTSN3BarUt_E"
+  // CHECK-SAME:           line: [[@LINE+4]]
+  // CHECK-SAME:           baseType: ![[UNION:[0-9]+]]
   // CHECK-SAME:           size: 32, align: 32, offset: 32
+  // CHECK: ![[UNION]] = distinct !DICompositeType(tag: DW_TAG_union_type,{{.*}} identifier: "_ZTSN3BarUt_E")
   union {
     // CHECK: !DIDerivedType(tag: DW_TAG_member, name: "i2",
     // CHECK-SAME:           line: [[@LINE+5]]
diff --git a/test/CodeGenCXX/debug-info-limited.cpp b/test/CodeGenCXX/debug-info-limited.cpp
index b209e3a850de5..4467d20f3de4b 100644
--- a/test/CodeGenCXX/debug-info-limited.cpp
+++ b/test/CodeGenCXX/debug-info-limited.cpp
@@ -1,4 +1,5 @@
 // RUN: %clang -flimit-debug-info -emit-llvm -g -S %s -o - | FileCheck %s
+// RUN: %clang -flimit-debug-info -emit-llvm -g -S %s -o - | FileCheck --check-prefix=CHECK-C %s
 
 // CHECK: !DICompositeType(tag: DW_TAG_class_type, name: "A"
 // CHECK-NOT:              DIFlagFwdDecl
@@ -27,8 +28,8 @@ int baz(B *b) {
 }
 
 
-// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "C"
-// CHECK-SAME:             flags: DIFlagFwdDecl
+// CHECK-C: !DICompositeType(tag: DW_TAG_structure_type, name: "C"
+// CHECK-C-SAME:             flags: DIFlagFwdDecl
 
 struct C {
 };
diff --git a/test/CodeGenCXX/debug-info-line-if.cpp b/test/CodeGenCXX/debug-info-line-if.cpp
index 29806351c94c5..b3f9c32e09116 100644
--- a/test/CodeGenCXX/debug-info-line-if.cpp
+++ b/test/CodeGenCXX/debug-info-line-if.cpp
@@ -15,7 +15,7 @@ int main() {
 
   // CHECK: br label
   // CHECK: br label
-  // CHECK: br label {{.*}}, !dbg [[DBG1:!.*]]
+  // CHECK: br label {{.*}}, !dbg [[DBG1:![0-9]*]], !llvm.loop [[L1:![0-9]*]]
 
 #line 200
   while (a)
@@ -25,7 +25,7 @@ int main() {
       ++a; // CHECK: add nsw{{.*}}, 1
 
   // CHECK: br label
-  // CHECK: br label {{.*}}, !dbg [[DBG2:!.*]]
+  // CHECK: br label {{.*}}, !dbg [[DBG2:![0-9]*]], !llvm.loop [[L2:![0-9]*]]
 
 #line 300
   for (; a; )
@@ -35,7 +35,7 @@ int main() {
       ++a; // CHECK: add nsw{{.*}}, 1
 
   // CHECK: br label
-  // CHECK: br label {{.*}}, !dbg [[DBG3:!.*]]
+  // CHECK: br label {{.*}}, !dbg [[DBG3:![0-9]*]], !llvm.loop [[L3:![0-9]*]]
 
 #line 400
   int x[] = {1, 2};
@@ -46,10 +46,22 @@ int main() {
       ++a; // CHECK: add nsw{{.*}}, 1
 
   // CHECK: br label
-  // CHECK: br label {{.*}}, !dbg [[DBG4:!.*]]
+  // CHECK: br label {{.*}}, !dbg [[DBG4:![0-9]*]], !llvm.loop [[L4:![0-9]*]]
 
-  // CHECK: [[DBG1]] = !DILocation(line: 100, scope: !{{.*}})
-  // CHECK: [[DBG2]] = !DILocation(line: 200, scope: !{{.*}})
-  // CHECK: [[DBG3]] = !DILocation(line: 300, scope: !{{.*}})
-  // CHECK: [[DBG4]] = !DILocation(line: 401, scope: !{{.*}})
+  // CHECK-DAG: [[DBG1]] = !DILocation(line: 100, scope: !{{.*}})
+  // CHECK-DAG: [[DBG2]] = !DILocation(line: 200, scope: !{{.*}})
+  // CHECK-DAG: [[DBG3]] = !DILocation(line: 300, scope: !{{.*}})
+  // CHECK-DAG: [[DBG4]] = !DILocation(line: 401, scope: !{{.*}})
+
+  // CHECK-DAG: [[L1]] = distinct !{[[L1]], [[LDBG1:![0-9]*]]}
+  // CHECK-DAG: [[LDBG1]] = !DILocation(line: 100, scope: !{{.*}})
+
+  // CHECK-DAG: [[L2]] = distinct !{[[L2]], [[LDBG2:![0-9]*]]}
+  // CHECK-DAG: [[LDBG2]] = !DILocation(line: 200, scope: !{{.*}})
+
+  // CHECK-DAG: [[L3]] = distinct !{[[L3]], [[LDBG3:![0-9]*]]}
+  // CHECK-DAG: [[LDBG3]] = !DILocation(line: 300, scope: !{{.*}})
+
+  // CHECK-DAG: [[L4]] = distinct !{[[L4]], [[LDBG4:![0-9]*]]}
+  // CHECK-DAG: [[LDBG4]] = !DILocation(line: 401, scope: !{{.*}})
 }
diff --git a/test/CodeGenCXX/debug-info-line.cpp b/test/CodeGenCXX/debug-info-line.cpp
index 9fb6ba8ac70ef..11653040109cd 100644
--- a/test/CodeGenCXX/debug-info-line.cpp
+++ b/test/CodeGenCXX/debug-info-line.cpp
@@ -1,8 +1,6 @@
-// RUN: %clang_cc1 -w -debug-info-kind=line-tables-only -std=c++11 -fexceptions -fcxx-exceptions -S -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -w -debug-info-kind=line-tables-only -std=c++11 -fexceptions -fcxx-exceptions -S -emit-llvm %s -o - -triple %itanium_abi_triple | FileCheck %s
 // RUN: %clang_cc1 -w -debug-info-kind=line-tables-only -std=c++11 -fexceptions -fcxx-exceptions -S -emit-llvm %s -o - -triple i686-linux-gnu | FileCheck %s
 
-// XFAIL: win32
-
 int &src();
 int *sink();
 extern "C" __complex float complex_src();
diff --git a/test/CodeGenCXX/debug-info-member-call.cpp b/test/CodeGenCXX/debug-info-member-call.cpp
new file mode 100644
index 0000000000000..3b5adb8e4b826
--- /dev/null
+++ b/test/CodeGenCXX/debug-info-member-call.cpp
@@ -0,0 +1,24 @@
+// RUN: %clang_cc1 -triple x86_64-unknown_unknown -emit-llvm -debug-info-kind=standalone -dwarf-column-info %s -o - | FileCheck %s
+void ext();
+
+struct Bar {
+  void bar() { ext(); }
+};
+
+struct Foo {
+  Bar *b;
+
+  Bar *foo() { return b; }
+};
+
+void test(Foo *f) {
+  f->foo()->bar();
+}
+
+// CHECK-LABEL: @_Z4testP3Foo
+// CHECK: call {{.*}} @_ZN3Foo3fooEv{{.*}}, !dbg ![[CALL1LOC:.*]]
+// CHECK: call void @_ZN3Bar3barEv{{.*}}, !dbg ![[CALL2LOC:.*]]
+
+// CHECK: ![[CALL1LOC]] = !DILocation(line: [[LINE:[0-9]+]], column: 6,
+// CHECK: ![[CALL2LOC]] = !DILocation(line: [[LINE]], column: 13,
+
diff --git a/test/CodeGenCXX/debug-info-method.cpp b/test/CodeGenCXX/debug-info-method.cpp
index bdd14e0b735df..73d8b928269ae 100644
--- a/test/CodeGenCXX/debug-info-method.cpp
+++ b/test/CodeGenCXX/debug-info-method.cpp
@@ -1,12 +1,12 @@
 // RUN: %clang_cc1 -emit-llvm -triple %itanium_abi_triple -std=c++11 -debug-info-kind=limited %s -o - | FileCheck %s
-// CHECK: !DICompositeType(tag: DW_TAG_class_type, name: "A",{{.*}} identifier: "_ZTS1A")
+// CHECK: ![[A:[0-9]+]] = distinct !DICompositeType(tag: DW_TAG_class_type, name: "A",{{.*}} identifier: "_ZTS1A")
 // CHECK: !DISubprogram(name: "foo", linkageName: "_ZN1A3fooEiS_3$_0"
 // CHECK-SAME:          DIFlagProtected
-// CHECK: ![[THISTYPE:[0-9]+]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !"_ZTS1A"
+// CHECK: ![[THISTYPE:[0-9]+]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: ![[A]]
 // CHECK-SAME:                                  DIFlagArtificial
 // CHECK: !DIDerivedType(tag: DW_TAG_ptr_to_member_type
 // CHECK: !DIDerivedType(tag: DW_TAG_ptr_to_member_type, baseType: ![[MEMFUNTYPE:[0-9]+]]
-// CHECK: ![[MEMFUNTYPE]] = !DISubroutineType(types: ![[MEMFUNARGS:[0-9]+]])
+// CHECK: ![[MEMFUNTYPE]] = !DISubroutineType({{(cc: DW_CC_BORLAND_thiscall, )?}}types: ![[MEMFUNARGS:[0-9]+]])
 // CHECK: ![[MEMFUNARGS]] = {{.*}}, ![[THISTYPE]],
 // CHECK: !DILocalVariable(name: "this", arg: 1
 // CHECK: !DILocalVariable(arg: 2
diff --git a/test/CodeGenCXX/debug-info-ms-abi.cpp b/test/CodeGenCXX/debug-info-ms-abi.cpp
new file mode 100644
index 0000000000000..a146ce94176e9
--- /dev/null
+++ b/test/CodeGenCXX/debug-info-ms-abi.cpp
@@ -0,0 +1,30 @@
+// RUN: %clang_cc1 %s -triple=i686-pc-windows-msvc -debug-info-kind=limited -emit-llvm -o - | FileCheck %s
+
+// Tests that certain miscellaneous features work in the MS ABI.
+
+struct Foo {
+  virtual void f();
+  virtual void g();
+  virtual void h();
+  struct Nested {};
+};
+Foo f;
+Foo::Nested n;
+
+// CHECK: ![[Foo:[^ ]*]] = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Foo",
+// CHECK-SAME: identifier: ".?AUFoo@@"
+
+// CHECK: !DISubprogram(name: "f",
+// CHECK-SAME: containingType: ![[Foo]], virtuality: DW_VIRTUALITY_virtual, virtualIndex: 0,
+// CHECK-SAME: flags: DIFlagPrototyped | DIFlagIntroducedVirtual,
+
+// CHECK: !DISubprogram(name: "g",
+// CHECK-SAME: containingType: ![[Foo]], virtuality: DW_VIRTUALITY_virtual, virtualIndex: 1,
+// CHECK-SAME: flags: DIFlagPrototyped | DIFlagIntroducedVirtual,
+
+// CHECK: !DISubprogram(name: "h",
+// CHECK-SAME: containingType: ![[Foo]], virtuality: DW_VIRTUALITY_virtual, virtualIndex: 2,
+// CHECK-SAME: flags: DIFlagPrototyped | DIFlagIntroducedVirtual,
+
+// CHECK: distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Nested",
+// CHECK-SAME: identifier: ".?AUNested@Foo@@"
diff --git a/test/CodeGenCXX/debug-info-ms-anonymous-tag.cpp b/test/CodeGenCXX/debug-info-ms-anonymous-tag.cpp
new file mode 100644
index 0000000000000..cef1eb8c5a09a
--- /dev/null
+++ b/test/CodeGenCXX/debug-info-ms-anonymous-tag.cpp
@@ -0,0 +1,20 @@
+// RUN: %clang_cc1 -triple x86_64-pc-win32 -debug-info-kind=limited -gcodeview %s -emit-llvm -o - | FileCheck %s
+
+typedef struct {
+} test1;
+
+test1 gv1;
+// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "test1"
+
+struct {
+} test2;
+void *use_test2 = &test2;
+
+// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "<unnamed-type-test2>"
+
+typedef struct {
+} *test3;
+test3 gv3;
+void *use_test3 = &gv3;
+
+// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "<unnamed-type-test3>"
diff --git a/test/CodeGenCXX/debug-info-ms-bitfields.cpp b/test/CodeGenCXX/debug-info-ms-bitfields.cpp
new file mode 100644
index 0000000000000..07d4c0c6c78a0
--- /dev/null
+++ b/test/CodeGenCXX/debug-info-ms-bitfields.cpp
@@ -0,0 +1,10 @@
+// RUN: %clang_cc1 -triple x86_64-pc-win32 -debug-info-kind=limited -gcodeview %s -emit-llvm -o - | FileCheck %s
+
+#pragma pack(1)
+struct S {
+  char : 8;
+  short   : 8;
+  short x : 8;
+} s;
+
+// CHECK: !DIDerivedType(tag: DW_TAG_member, name: "x", {{.*}}, size: 8, align: 16, offset: 16, flags: DIFlagBitField, extraData: i64 8)
diff --git a/test/CodeGenCXX/debug-info-ms-ptr-to-member.cpp b/test/CodeGenCXX/debug-info-ms-ptr-to-member.cpp
new file mode 100644
index 0000000000000..4b9f2a146896b
--- /dev/null
+++ b/test/CodeGenCXX/debug-info-ms-ptr-to-member.cpp
@@ -0,0 +1,52 @@
+// RUN: %clang_cc1 -triple x86_64-windows -debug-info-kind=limited -gcodeview %s -emit-llvm -o - | FileCheck %s
+
+// Test member pointer inheritance models.
+
+struct A { int a; };
+struct B { int b; };
+struct C : A, B { int c; };
+struct D : virtual C { int d; };
+struct E;
+int A::*pmd_a;
+int C::*pmd_b;
+int D::*pmd_c;
+int E::*pmd_d;
+void (A::*pmf_a)();
+void (C::*pmf_b)();
+void (D::*pmf_c)();
+void (E::*pmf_d)();
+
+// Test incomplete MPTs, which don't have inheritance models.
+
+struct Incomplete;
+int Incomplete::**ppmd;
+void (Incomplete::**ppmf)();
+
+// CHECK: distinct !DIGlobalVariable(name: "pmd_a", {{.*}} type: ![[pmd_a:[^, ]*]], {{.*}})
+// CHECK: ![[pmd_a]] = !DIDerivedType(tag: DW_TAG_ptr_to_member_type, baseType: !{{.*}}, size: 32, flags: DIFlagSingleInheritance, {{.*}})
+// CHECK: distinct !DIGlobalVariable(name: "pmd_b", {{.*}} type: ![[pmd_b:[^, ]*]], {{.*}})
+// CHECK: ![[pmd_b]] = !DIDerivedType(tag: DW_TAG_ptr_to_member_type, baseType: !{{.*}}, size: 32, flags: DIFlagMultipleInheritance, {{.*}})
+// CHECK: distinct !DIGlobalVariable(name: "pmd_c", {{.*}} type: ![[pmd_c:[^, ]*]], {{.*}})
+// CHECK: ![[pmd_c]] = !DIDerivedType(tag: DW_TAG_ptr_to_member_type, baseType: !{{.*}}, size: 64, flags: DIFlagVirtualInheritance, {{.*}})
+// CHECK: distinct !DIGlobalVariable(name: "pmd_d", {{.*}} type: ![[pmd_d:[^, ]*]], {{.*}})
+// CHECK: ![[pmd_d]] = !DIDerivedType(tag: DW_TAG_ptr_to_member_type, baseType: !{{.*}}, size: 96,
+// CHECK-NOT: flags:
+// CHECK-SAME: ){{$}}
+
+// CHECK: distinct !DIGlobalVariable(name: "pmf_a", {{.*}} type: ![[pmf_a:[^, ]*]], {{.*}})
+// CHECK: ![[pmf_a]] = !DIDerivedType(tag: DW_TAG_ptr_to_member_type, baseType: !{{.*}}, size: 64, flags: DIFlagSingleInheritance, {{.*}})
+// CHECK: distinct !DIGlobalVariable(name: "pmf_b", {{.*}} type: ![[pmf_b:[^, ]*]], {{.*}})
+// CHECK: ![[pmf_b]] = !DIDerivedType(tag: DW_TAG_ptr_to_member_type, baseType: !{{.*}}, size: 128, flags: DIFlagMultipleInheritance, {{.*}})
+// CHECK: distinct !DIGlobalVariable(name: "pmf_c", {{.*}} type: ![[pmf_c:[^, ]*]], {{.*}})
+// CHECK: ![[pmf_c]] = !DIDerivedType(tag: DW_TAG_ptr_to_member_type, baseType: !{{.*}}, size: 128, flags: DIFlagVirtualInheritance, {{.*}})
+// CHECK: distinct !DIGlobalVariable(name: "pmf_d", {{.*}} type: ![[pmf_d:[^, ]*]], {{.*}})
+// CHECK: ![[pmf_d]] = !DIDerivedType(tag: DW_TAG_ptr_to_member_type, baseType: !{{.*}}, size: 192,
+// CHECK-NOT: flags:
+// CHECK-SAME: ){{$}}
+
+// CHECK: distinct !DIGlobalVariable(name: "ppmd", {{.*}} type: ![[ppmd:[^, ]*]], {{.*}})
+// CHECK: ![[ppmd]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: ![[ppmd2:[^ ]*]], size: 64, align: 64)
+// CHECK: ![[ppmd2]] = !DIDerivedType(tag: DW_TAG_ptr_to_member_type, baseType: !{{[0-9]*}}, extraData: !{{[0-9]*}}){{$}}
+// CHECK: distinct !DIGlobalVariable(name: "ppmf", {{.*}} type: ![[ppmf:[^, ]*]], {{.*}})
+// CHECK: ![[ppmf]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: ![[ppmf2:[^ ]*]], size: 64, align: 64)
+// CHECK: ![[ppmf2]] = !DIDerivedType(tag: DW_TAG_ptr_to_member_type, baseType: !{{[0-9]*}}, extraData: !{{[0-9]*}}){{$}}
diff --git a/test/CodeGenCXX/debug-info-namespace.cpp b/test/CodeGenCXX/debug-info-namespace.cpp
index 4933ae967452a..060a5cea636c4 100644
--- a/test/CodeGenCXX/debug-info-namespace.cpp
+++ b/test/CodeGenCXX/debug-info-namespace.cpp
@@ -57,57 +57,57 @@ void B::func_fwd() {}
 
 // CHECK: [[CU:![0-9]+]] = distinct !DICompileUnit(
 // CHECK-SAME:                            imports: [[MODULES:![0-9]*]]
-// CHECK: [[FOO:![0-9]+]] = !DICompositeType(tag: DW_TAG_structure_type, name: "foo",
-// CHECK-SAME:                               line: 5
-// CHECK-SAME:                               DIFlagFwdDecl
-// CHECK: [[FOOCPP:![0-9]+]] = !DIFile(filename: "foo.cpp"
-// CHECK: [[NS:![0-9]+]] = !DINamespace(name: "B", scope: [[CTXT:![0-9]+]], file: [[FOOCPP]], line: 1)
+// CHECK: [[I:![0-9]+]] = distinct !DIGlobalVariable(name: "i",{{.*}} scope: [[NS:![0-9]+]],
+// CHECK: [[NS]] = !DINamespace(name: "B", scope: [[CTXT:![0-9]+]], file: [[FOOCPP:![0-9]+]], line: 1)
+// CHECK: [[FOOCPP]] = !DIFile(filename: "foo.cpp"
 // CHECK: [[CTXT]] = !DINamespace(name: "A", scope: null, file: [[FILE:![0-9]+]], line: 5)
 // CHECK: [[FILE]] = !DIFile(filename: "{{.*}}debug-info-namespace.cpp",
-// CHECK: [[BAR:![0-9]+]] = !DICompositeType(tag: DW_TAG_structure_type, name: "bar",
-// CHECK-SAME:                               line: 6
-// CHECK-SAME:                               DIFlagFwdDecl
-// CHECK: [[F1:![0-9]+]] = distinct !DISubprogram(name: "f1",{{.*}} line: 4
-// CHECK-SAME:                           isDefinition: true
-// CHECK: [[FUNC:![0-9]+]] = distinct !DISubprogram(name: "func",{{.*}} isDefinition: true
-// CHECK: [[FUNC_FWD:![0-9]+]] = distinct !DISubprogram(name: "func_fwd",{{.*}} line: 47,{{.*}} isDefinition: true
-// CHECK: [[I:![0-9]+]] = !DIGlobalVariable(name: "i",{{.*}} scope: [[NS]],
-// CHECK: [[VAR_FWD:![0-9]+]] = !DIGlobalVariable(name: "var_fwd",{{.*}} scope: [[NS]],
-// CHECK-SAME:                                    line: 44
-// CHECK-SAME:                                    isDefinition: true
-
+// CHECK: [[VAR_FWD:![0-9]+]] = distinct !DIGlobalVariable(name: "var_fwd",{{.*}} scope: [[NS]],
+// CHECK-SAME:                                             line: 44
+// CHECK-SAME:                                             isDefinition: true
 // CHECK: [[MODULES]] = !{[[M1:![0-9]+]], [[M2:![0-9]+]], [[M3:![0-9]+]], [[M4:![0-9]+]], [[M5:![0-9]+]], [[M6:![0-9]+]], [[M7:![0-9]+]], [[M8:![0-9]+]], [[M9:![0-9]+]], [[M10:![0-9]+]], [[M11:![0-9]+]], [[M12:![0-9]+]], [[M13:![0-9]+]], [[M14:![0-9]+]], [[M15:![0-9]+]], [[M16:![0-9]+]], [[M17:![0-9]+]]}
 // CHECK: [[M1]] = !DIImportedEntity(tag: DW_TAG_imported_module, scope: [[CTXT]], entity: [[NS]], line: 15)
+
 // CHECK: [[M2]] = !DIImportedEntity(tag: DW_TAG_imported_module, scope: [[CU]], entity: [[CTXT]],
 // CHECK: [[M3]] = !DIImportedEntity(tag: DW_TAG_imported_declaration, name: "E", scope: [[CU]], entity: [[CTXT]], line: 19)
 // CHECK: [[M4]] = !DIImportedEntity(tag: DW_TAG_imported_module, scope: [[LEX2:![0-9]+]], entity: [[NS]], line: 23)
 // CHECK: [[LEX2]] = distinct !DILexicalBlock(scope: [[LEX1:![0-9]+]], file: [[FOOCPP]],
-// CHECK: [[LEX1]] = distinct !DILexicalBlock(scope: [[FUNC]], file: [[FOOCPP]],
-// CHECK: [[M5]] = !DIImportedEntity(tag: DW_TAG_imported_module, scope: [[FUNC]], entity: [[CTXT]],
-// CHECK: [[M6]] = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: [[FUNC]], entity: [[FOO:!"_ZTSN1A1B3fooE"]], line: 27)
-// CHECK: [[M7]] = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: [[FUNC]], entity: [[BAR:!"_ZTSN1A1B3barE"]]
-// CHECK: [[M8]] = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: [[FUNC]], entity: [[F1]]
+// CHECK: [[LEX1]] = distinct !DILexicalBlock(scope: [[FUNC:![0-9]+]], file: [[FOOCPP]],
+
+// CHECK: [[FUNC:![0-9]+]] = distinct !DISubprogram(name: "func",{{.*}} isDefinition: true
+// CHECK: [[M5]] = !DIImportedEntity(tag: DW_TAG_imported_module, scope: [[FUNC]], entity: [[CTXT:![0-9]+]],
+// CHECK: [[M6]] = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: [[FUNC]], entity: [[FOO:![0-9]+]], line: 27)
+// CHECK: [[FOO]] = !DICompositeType(tag: DW_TAG_structure_type, name: "foo",
+// CHECK-SAME:                               line: 5
+// CHECK-SAME:                               DIFlagFwdDecl
+// CHECK: [[M7]] = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: [[FUNC]], entity: [[BAR:![0-9]+]]
+// CHECK: [[BAR]] = !DICompositeType(tag: DW_TAG_structure_type, name: "bar",
+// CHECK-SAME:                               line: 6
+// CHECK-SAME:                               DIFlagFwdDecl
+
+// CHECK: [[M8]] = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: [[FUNC]], entity: [[F1:![0-9]+]]
+// CHECK: [[F1:![0-9]+]] = distinct !DISubprogram(name: "f1",{{.*}} line: 4
+// CHECK-SAME:                           isDefinition: true
 // CHECK: [[M9]] = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: [[FUNC]], entity: [[I]]
 // CHECK: [[M10]] = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: [[FUNC]], entity: [[BAZ:![0-9]+]]
 // CHECK: [[BAZ]] = !DIDerivedType(tag: DW_TAG_typedef, name: "baz", scope: [[NS]], file: [[FOOCPP]],
-// CHECK-SAME:                     baseType: !"_ZTSN1A1B3barE"
+// CHECK-SAME:                     baseType: [[BAR]]
 // CHECK: [[M11]] = !DIImportedEntity(tag: DW_TAG_imported_declaration, name: "X", scope: [[FUNC]], entity: [[CTXT]]
 // CHECK: [[M12]] = !DIImportedEntity(tag: DW_TAG_imported_declaration, name: "Y", scope: [[FUNC]], entity: [[M11]]
 // CHECK: [[M13]] = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: [[FUNC]], entity: [[VAR_DECL:![0-9]+]]
-// CHECK: [[VAR_DECL]] = !DIGlobalVariable(name: "var_decl", linkageName: "_ZN1A1B8var_declE", scope: [[NS]],{{.*}} line: 8,
+// CHECK: [[VAR_DECL]] = !DIGlobalVariable(name: "var_decl", linkageName: "{{[^"]*var_decl[^"]*}}", scope: [[NS]],{{.*}} line: 8,
 // CHECK: [[M14]] = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: [[FUNC]], entity: [[FUNC_DECL:![0-9]+]]
 // CHECK: [[FUNC_DECL]] = !DISubprogram(name: "func_decl",
 // CHECK-SAME:                          scope: [[NS]], file: [[FOOCPP]], line: 9
 // CHECK: [[M15]] = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: [[FUNC]], entity: [[VAR_FWD:![0-9]+]]
 // CHECK: [[M16]] = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: [[FUNC]], entity: [[FUNC_FWD:![0-9]+]]
+// CHECK: [[FUNC_FWD]] = distinct !DISubprogram(name: "func_fwd",{{.*}} line: 47,{{.*}} isDefinition: true
 // CHECK: [[M17]] = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: [[CTXT]], entity: [[I]]
 
 // CHECK-GMLT: [[CU:![0-9]+]] = distinct !DICompileUnit(
-// CHECK-GMLT-SAME:                            emissionKind: 2,
+// CHECK-GMLT-SAME:                            emissionKind: LineTablesOnly,
 // CHECK-GMLT-NOT:                             imports:
 
 // CHECK-NOLIMIT: !DICompositeType(tag: DW_TAG_structure_type, name: "bar",{{.*}} line: 6,
 // CHECK-NOLIMIT-NOT:              DIFlagFwdDecl
 // CHECK-NOLIMIT-SAME:             ){{$}}
-
-// REQUIRES: dw2
diff --git a/test/CodeGenCXX/debug-info-nodebug.cpp b/test/CodeGenCXX/debug-info-nodebug.cpp
new file mode 100644
index 0000000000000..9f140efaed6fb
--- /dev/null
+++ b/test/CodeGenCXX/debug-info-nodebug.cpp
@@ -0,0 +1,55 @@
+// RUN: %clang_cc1 -DSETNODEBUG=0 -emit-llvm -debug-info-kind=limited %s -o - | FileCheck %s --check-prefix=YESINFO
+// RUN: %clang_cc1 -DSETNODEBUG=1 -emit-llvm -debug-info-kind=limited %s -o - | FileCheck %s --check-prefix=NOINFO
+
+#if SETNODEBUG
+#define NODEBUG __attribute__((nodebug))
+#else
+#define NODEBUG
+#endif
+
+// Const global variable. Use it so it gets emitted.
+NODEBUG static const int const_global_int_def = 1;
+void func1(int);
+void func2() { func1(const_global_int_def); }
+// YESINFO-DAG: !DIGlobalVariable(name: "const_global_int_def"
+// NOINFO-NOT:  !DIGlobalVariable(name: "const_global_int_def"
+
+// Global variable with a more involved type.
+// If the variable has no debug info, the type should not appear either.
+struct S1 {
+  int a;
+  int b;
+};
+NODEBUG S1 global_struct = { 2, 3 };
+// YESINFO-DAG: !DICompositeType({{.*}} name: "S1"
+// NOINFO-NOT:  !DICompositeType({{.*}} name: "S1"
+// YESINFO-DAG: !DIGlobalVariable(name: "global_struct"
+// NOINFO-NOT:  !DIGlobalVariable(name: "global_struct"
+
+// Static data members. Const member needs a use.
+// Also the class as a whole needs a use, so that we produce debug info for
+// the entire class (iterating over the members, demonstrably skipping those
+// with 'nodebug').
+struct S2 {
+  NODEBUG static int static_member;
+  NODEBUG static const int static_const_member = 4;
+};
+int S2::static_member = 5;
+void func3() {
+  S2 junk;
+  func1(S2::static_const_member);
+}
+// YESINFO-DAG: !DIGlobalVariable(name: "static_member"
+// NOINFO-NOT:  !DIGlobalVariable(name: "static_member"
+// YESINFO-DAG: !DIDerivedType({{.*}} name: "static_const_member"
+// NOINFO-NOT:  !DIDerivedType({{.*}} name: "static_const_member"
+
+// Function-local static and auto variables.
+void func4() {
+  NODEBUG static int static_local = 6;
+  NODEBUG        int normal_local = 7;
+}
+// YESINFO-DAG: !DIGlobalVariable(name: "static_local"
+// NOINFO-NOT:  !DIGlobalVariable(name: "static_local"
+// YESINFO-DAG: !DILocalVariable(name: "normal_local"
+// NOINFO-NOT:  !DILocalVariable(name: "normal_local"
diff --git a/test/CodeGenCXX/debug-info-ptr-to-member-function.cpp b/test/CodeGenCXX/debug-info-ptr-to-member-function.cpp
index cac16b6008907..a7e02e481320f 100644
--- a/test/CodeGenCXX/debug-info-ptr-to-member-function.cpp
+++ b/test/CodeGenCXX/debug-info-ptr-to-member-function.cpp
@@ -7,15 +7,15 @@ struct T {
 
 void foo(int (T::*method)()) {}
 
-// A pointer to a member function is a pair of function- and this-pointer.
-// CHECK: !DIDerivedType(tag: DW_TAG_ptr_to_member_type,
-// DARWIN-X64-SAME:      size: 128
-// WIN32-X64-SAME:       size: 64
-
 struct Incomplete;
 
 int (Incomplete::**bar)();
+// A pointer to a member function is a pair of function- and this-pointer.
 // CHECK: !DIDerivedType(tag: DW_TAG_ptr_to_member_type,
 // DARWIN-X64-SAME:           size: 128
 // WIN32-X64-NOT:             size:
 // CHECK-SAME:                extraData: {{.*}})
+
+// CHECK: !DIDerivedType(tag: DW_TAG_ptr_to_member_type,
+// DARWIN-X64-SAME:      size: 128
+// WIN32-X64-SAME:       size: 64
diff --git a/test/CodeGenCXX/debug-info-scoped-class.cpp b/test/CodeGenCXX/debug-info-scoped-class.cpp
new file mode 100644
index 0000000000000..de4aee9a1b448
--- /dev/null
+++ b/test/CodeGenCXX/debug-info-scoped-class.cpp
@@ -0,0 +1,15 @@
+// RUN: %clang_cc1 -emit-llvm -debug-info-kind=standalone -std=c++11 \
+// RUN:   -triple thumbv7-apple-ios %s -o - | FileCheck %s
+
+// This forward-declared scoped enum will be created while building its own
+// declcontext. Make sure it is only emitted once.
+
+struct A {
+  enum class Return;
+  Return f1();
+};
+A::Return* f2() {}
+
+// CHECK: !DICompositeType(tag: DW_TAG_enumeration_type, name: "Return",
+// CHECK-SAME:             flags: DIFlagFwdDecl,
+// CHECK-NOT:              tag: DW_TAG_enumeration_type, name: "Return"
diff --git a/test/CodeGenCXX/debug-info-static-member.cpp b/test/CodeGenCXX/debug-info-static-member.cpp
index 8e5207d5c30b6..ed8ae015cfc3e 100644
--- a/test/CodeGenCXX/debug-info-static-member.cpp
+++ b/test/CodeGenCXX/debug-info-static-member.cpp
@@ -1,4 +1,6 @@
 // RUN: %clangxx -target x86_64-unknown-unknown -g %s -emit-llvm -S -o - | FileCheck %s
+// RUN: %clangxx -target x86_64-unknown-unknown -g -std=c++98 %s -emit-llvm -S -o - | FileCheck %s
+// RUN: %clangxx -target x86_64-unknown-unknown -g -std=c++11 %s -emit-llvm -S -o - | FileCheck %s
 // PR14471
 
 enum X {
@@ -10,7 +12,11 @@ class C
   const static bool const_a = true;
 protected:
   static int b;
+#if __cplusplus >= 201103L
+  constexpr static float const_b = 3.14;
+#else
   const static float const_b = 3.14;
+#endif
 public:
   static int c;
   const static int const_c = 18;
@@ -18,37 +24,35 @@ public:
   static X x_a;
 };
 
-int C::a = 4;
-int C::b = 2;
-int C::c = 1;
-
-int main()
-{
-        C instance_C;
-        instance_C.d = 8;
-        return C::c;
-}
-
 // The definition of C::a drives the emission of class C, which is
 // why the definition of "a" comes before the declarations while
 // "b" and "c" come after.
 
 // CHECK: !DICompositeType(tag: DW_TAG_enumeration_type, name: "X"{{.*}}, identifier: "_ZTS1X")
-// CHECK: !DICompositeType(tag: DW_TAG_class_type, name: "C"{{.*}}, identifier: "_ZTS1C")
-//
-// CHECK: ![[DECL_A:[0-9]+]] = !DIDerivedType(tag: DW_TAG_member, name: "a"
+// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "anon_static_decl_struct"
+// CHECK: !DIDerivedType(tag: DW_TAG_member, name: "anon_static_decl_var"
+// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "static_decl_templ<int>"
+// CHECK-NOT:              DIFlagFwdDecl
+// CHECK-SAME:             ){{$}}
+// CHECK: !DIDerivedType(tag: DW_TAG_member, name: "static_decl_templ_var"
+
+// CHECK: !DIGlobalVariable(name: "a", {{.*}}variable: i32* @_ZN1C1aE, declaration: ![[DECL_A:[0-9]+]])
+int C::a = 4;
+// CHECK: ![[DECL_A]] = !DIDerivedType(tag: DW_TAG_member, name: "a"
 // CHECK-NOT:                                 size:
 // CHECK-NOT:                                 align:
 // CHECK-NOT:                                 offset:
 // CHECK-SAME:                                flags: DIFlagStaticMember)
 //
+// CHECK: !DICompositeType(tag: DW_TAG_class_type, name: "C"{{.*}}, identifier: "_ZTS1C")
+//
 // CHECK: !DIDerivedType(tag: DW_TAG_member, name: "const_a"
 // CHECK-NOT:            size:
 // CHECK-NOT:            align:
 // CHECK-NOT:            offset:
 // CHECK-SAME:           flags: DIFlagStaticMember,
 // CHECK-SAME:           extraData: i1 true)
-//
+
 // CHECK: ![[DECL_B:[0-9]+]] = !DIDerivedType(tag: DW_TAG_member, name: "b"
 // CHECK-NOT:                                 size:
 // CHECK-NOT:                                 align:
@@ -61,7 +65,7 @@ int main()
 // CHECK-NOT:            offset:
 // CHECK-SAME:           flags: DIFlagProtected | DIFlagStaticMember,
 // CHECK-SAME:           extraData: float 0x{{.*}})
-//
+
 // CHECK: ![[DECL_C:[0-9]+]] = !DIDerivedType(tag: DW_TAG_member, name: "c"
 // CHECK-NOT:                                 size:
 // CHECK-NOT:                                 align:
@@ -78,12 +82,19 @@ int main()
 // CHECK: !DIDerivedType(tag: DW_TAG_member, name: "x_a"
 // CHECK-SAME:           flags: DIFlagPublic | DIFlagStaticMember)
 
-// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "static_decl_templ<int>"
-// CHECK-NOT:              DIFlagFwdDecl
-// CHECK-SAME:             ){{$}}
-// CHECK: !DIDerivedType(tag: DW_TAG_member, name: "static_decl_templ_var"
+// CHECK: !DIGlobalVariable(name: "b", {{.*}}variable: i32* @_ZN1C1bE, declaration: ![[DECL_B]])
+int C::b = 2;
+// CHECK: !DIGlobalVariable(name: "c", {{.*}}variable: i32* @_ZN1C1cE, declaration: ![[DECL_C]])
+int C::c = 1;
 
-// CHECK: [[NS_X:![0-9]+]] = !DINamespace(name: "x"
+int main()
+{
+        C instance_C;
+        instance_C.d = 8;
+        return C::c;
+}
+
+// CHECK-NOT: !DIGlobalVariable(name: "anon_static_decl_var"
 
 // Test this in an anonymous namespace to ensure the type is retained even when
 // it doesn't get automatically retained by the string type reference machinery.
@@ -94,9 +105,6 @@ struct anon_static_decl_struct {
 }
 
 
-// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "anon_static_decl_struct"
-// CHECK: !DIDerivedType(tag: DW_TAG_member, name: "anon_static_decl_var"
-
 int ref() {
   return anon_static_decl_struct::anon_static_decl_var;
 }
@@ -113,12 +121,6 @@ int static_decl_templ_ref() {
   return static_decl_templ<int>::static_decl_templ_var;
 }
 
-// CHECK: !DIGlobalVariable(name: "a", {{.*}}variable: i32* @_ZN1C1aE, declaration: ![[DECL_A]])
-// CHECK: !DIGlobalVariable(name: "b", {{.*}}variable: i32* @_ZN1C1bE, declaration: ![[DECL_B]])
-// CHECK: !DIGlobalVariable(name: "c", {{.*}}variable: i32* @_ZN1C1cE, declaration: ![[DECL_C]])
-
-// CHECK-NOT: !DIGlobalVariable(name: "anon_static_decl_var"
-
 // Verify that even when a static member declaration is created lazily when
 // creating the definition, the declaration line is that of the canonical
 // declaration, not the definition. Also, since we look at the canonical
@@ -135,10 +137,10 @@ const int V::const_va;
 
 namespace x {
 struct y {
+// CHECK: !DIGlobalVariable(name: "z",
+// CHECK-SAME:              scope: [[NS_X:![0-9]+]]
+// CHECK: [[NS_X]] = !DINamespace(name: "x"
   static int z;
 };
 int y::z;
 }
-
-// CHECK: !DIGlobalVariable(name: "z",
-// CHECK-SAME:              scope: [[NS_X]]
diff --git a/test/CodeGenCXX/debug-info-template-explicit-specialization.cpp b/test/CodeGenCXX/debug-info-template-explicit-specialization.cpp
index 04c63ae2ff242..08146c25263b9 100644
--- a/test/CodeGenCXX/debug-info-template-explicit-specialization.cpp
+++ b/test/CodeGenCXX/debug-info-template-explicit-specialization.cpp
@@ -6,6 +6,11 @@
 
 // LINES-ONLY-NOT: !DICompositeType(tag: DW_TAG_structure_type
 
+// "h" is at the top because it's in the compile unit's retainedTypes: list.
+// CHECK: DICompositeType(tag: DW_TAG_structure_type, name: "h<int>"
+// CHECK-NOT: DIFlagFwdDecl
+// CHECK-SAME: ){{$}}
+
 template <typename T>
 struct a {
 };
@@ -85,9 +90,6 @@ template <typename T>
 struct h {
 };
 template class h<int>;
-// CHECK: DICompositeType(tag: DW_TAG_structure_type, name: "h<int>"
-// CHECK-NOT: DIFlagFwdDecl
-// CHECK-SAME: ){{$}}
 
 template <typename T>
 struct i {
diff --git a/test/CodeGenCXX/debug-info-template-limit.cpp b/test/CodeGenCXX/debug-info-template-limit.cpp
index 5c4ac0cc3e153..172ab94dce624 100644
--- a/test/CodeGenCXX/debug-info-template-limit.cpp
+++ b/test/CodeGenCXX/debug-info-template-limit.cpp
@@ -1,8 +1,8 @@
 // RUN: %clang_cc1 -emit-llvm -debug-info-kind=limited -triple %itanium_abi_triple %s -o - | FileCheck %s
 
 // Check that this pointer type is TC<int>
-// CHECK: ![[LINE:[0-9]+]] = !DICompositeType(tag: DW_TAG_class_type, name: "TC<int>"{{.*}}, identifier: "_ZTS2TCIiE")
-// CHECK: !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !"_ZTS2TCIiE"
+// CHECK: ![[LINE:[0-9]+]] = distinct !DICompositeType(tag: DW_TAG_class_type, name: "TC<int>"{{.*}}, identifier: "_ZTS2TCIiE")
+// CHECK: !DIDerivedType(tag: DW_TAG_pointer_type, baseType: ![[LINE]]
 
 template<typename T>
 class TC {
diff --git a/test/CodeGenCXX/debug-info-template-member.cpp b/test/CodeGenCXX/debug-info-template-member.cpp
index b94ff05df44e6..88f024b59abf7 100644
--- a/test/CodeGenCXX/debug-info-template-member.cpp
+++ b/test/CodeGenCXX/debug-info-template-member.cpp
@@ -16,6 +16,12 @@ inline int add3(int x) {
   return MyClass().add<3>(x); // even though add<3> is ODR used, don't emit it since we don't codegen it
 }
 
+// The compile unit pulls in the global variables first.
+// CHECK: !DIGlobalVariable(name: "x",
+// CHECK-SAME:              type: ![[OUTER_FOO_INNER_ID:[0-9]+]]
+// CHECK-SAME:              variable: %"struct.outer<foo>::inner"* @x
+
+// CHECK: ![[OUTER_FOO_INNER_ID:[0-9]*]] = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "inner"{{.*}}, identifier:
 // CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "foo"
 // CHECK-SAME:             elements: [[FOO_MEM:![0-9]*]]
 // CHECK-SAME:             identifier: "_ZTS3foo"
@@ -23,42 +29,35 @@ inline int add3(int x) {
 // CHECK: [[FOO_FUNC]] = !DISubprogram(name: "func", linkageName: "_ZN3foo4funcEN5outerIS_E5innerE",
 // CHECK-SAME:                         type: [[FOO_FUNC_TYPE:![0-9]*]]
 // CHECK: [[FOO_FUNC_TYPE]] = !DISubroutineType(types: [[FOO_FUNC_PARAMS:![0-9]*]])
-// CHECK: [[FOO_FUNC_PARAMS]] = !{null, !{{[0-9]*}}, !"[[OUTER_FOO_INNER_ID:.*]]"}
-// CHECK: !{{[0-9]*}} = !DICompositeType(tag: DW_TAG_structure_type, name: "inner"{{.*}}, identifier: "[[OUTER_FOO_INNER_ID]]")
-
-// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "virt<elem>"
-// CHECK-SAME:             elements: [[VIRT_MEM:![0-9]*]]
-// CHECK-SAME:             vtableHolder: !"_ZTS4virtI4elemE"
-// CHECK-SAME:             templateParams: [[VIRT_TEMP_PARAM:![0-9]*]]
-// CHECK-SAME:             identifier: "_ZTS4virtI4elemE"
-// CHECK: [[VIRT_TEMP_PARAM]] = !{[[VIRT_T:![0-9]*]]}
-// CHECK: [[VIRT_T]] = !DITemplateTypeParameter(name: "T", type: !"_ZTS4elem")
+// CHECK: [[FOO_FUNC_PARAMS]] = !{null, !{{[0-9]*}}, ![[OUTER_FOO_INNER_ID]]}
 
-// CHECK: [[C:![0-9]*]] = !DICompositeType(tag: DW_TAG_structure_type, name: "MyClass"
+// CHECK: [[C:![0-9]*]] = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "MyClass"
 // CHECK-SAME:                             elements: [[C_MEM:![0-9]*]]
-// CHECK-SAME:                             vtableHolder: !"_ZTS7MyClass"
+// CHECK-SAME:                             vtableHolder: [[C]]
 // CHECK-SAME:                             identifier: "_ZTS7MyClass")
 // CHECK: [[C_MEM]] = !{[[C_VPTR:![0-9]*]], [[C_FUNC:![0-9]*]]}
 // CHECK: [[C_VPTR]] = !DIDerivedType(tag: DW_TAG_member, name: "_vptr$MyClass"
 
 // CHECK: [[C_FUNC]] = !DISubprogram(name: "func",{{.*}} line: 7,
 
-// CHECK: [[ELEM:![0-9]*]] = !DICompositeType(tag: DW_TAG_structure_type, name: "elem"
+// CHECK: !DISubprogram(name: "add<2>"
+// CHECK-SAME:          scope: [[C]]
+//
+// CHECK: [[VIRT_TEMP:![0-9]+]] = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "virt<elem>"
+// CHECK-SAME:             elements: [[VIRT_MEM:![0-9]*]]
+// CHECK-SAME:             vtableHolder: [[VIRT_TEMP]]
+// CHECK-SAME:             templateParams: [[VIRT_TEMP_PARAM:![0-9]*]]
+// CHECK-SAME:             identifier: "_ZTS4virtI4elemE"
+
+// CHECK: [[ELEM:![0-9]+]] = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "elem"
 // CHECK-SAME:                                elements: [[ELEM_MEM:![0-9]*]]
 // CHECK-SAME:                                identifier: "_ZTS4elem"
 // CHECK: [[ELEM_MEM]] = !{[[ELEM_X:![0-9]*]]}
-// CHECK: [[ELEM_X]] = !DIDerivedType(tag: DW_TAG_member, name: "x", scope: !"_ZTS4elem"
-// CHECK-SAME:                        baseType: !"_ZTS4virtI4elemE"
+// CHECK: [[ELEM_X]] = !DIDerivedType(tag: DW_TAG_member, name: "x", scope: [[ELEM]]
+// CHECK-SAME:                        baseType: [[VIRT_TEMP:![0-9]+]]
 
-// Check that the member function template specialization and implicit special
-// members (the default ctor) refer to their class by scope, even though they
-// didn't appear in the class's member list (C_MEM). This prevents the functions
-// from being added to type units, while still appearing in the type
-// declaration/reference in the compile unit.
-// CHECK: !DISubprogram(name: "MyClass"
-// CHECK-SAME:          scope: !"_ZTS7MyClass"
-// CHECK: !DISubprogram(name: "add<2>"
-// CHECK-SAME:          scope: !"_ZTS7MyClass"
+// CHECK: [[VIRT_TEMP_PARAM]] = !{[[VIRT_T:![0-9]*]]}
+// CHECK: [[VIRT_T]] = !DITemplateTypeParameter(name: "T", type: [[ELEM]])
 
 template<typename T>
 struct outer {
@@ -80,10 +79,6 @@ inline void func() {
 
 outer<foo>::inner x;
 
-// CHECK: !DIGlobalVariable(name: "x",
-// CHECK-SAME:              type: !"[[OUTER_FOO_INNER_ID]]"
-// CHECK-SAME:              variable: %"struct.outer<foo>::inner"* @x
-
 template <typename T>
 struct virt {
   T* values;
@@ -98,3 +93,11 @@ inline void f1() {
 void f2() {
   virt<elem> d; // emit 'virt<elem>'
 }
+
+// Check that the member function template specialization and implicit special
+// members (the default ctor) refer to their class by scope, even though they
+// didn't appear in the class's member list (C_MEM). This prevents the functions
+// from being added to type units, while still appearing in the type
+// declaration/reference in the compile unit.
+// CHECK: !DISubprogram(name: "MyClass"
+// CHECK-SAME:          scope: [[C]]
diff --git a/test/CodeGenCXX/debug-info-template-quals.cpp b/test/CodeGenCXX/debug-info-template-quals.cpp
index 1e8bdb1ad7145..7a0d0d4a27552 100644
--- a/test/CodeGenCXX/debug-info-template-quals.cpp
+++ b/test/CodeGenCXX/debug-info-template-quals.cpp
@@ -15,17 +15,17 @@ void foo (const char *c) {
   str.assign(c, str);
 }
 
-// CHECK: [[BS:.*]] = !DICompositeType(tag: DW_TAG_structure_type, name: "basic_string<char>"
+// CHECK: [[P:![0-9]*]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: [[CON:![0-9]*]]
+// CHECK: [[CON]] = !DIDerivedType(tag: DW_TAG_const_type, baseType: [[CH:![0-9]*]]
+// CHECK: [[CH]] = !DIBasicType(name: "char", size: 8, align: 8, encoding: DW_ATE_signed_char)
+// CHECK: [[BS:.*]] = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "basic_string<char>"
 // CHECK-SAME:                         line: 4
 // CHECK-SAME:                         size: 8, align: 8
 // CHECK: [[TYPE:![0-9]*]] = !DISubroutineType(types: [[ARGS:.*]])
-// CHECK: [[ARGS]] = !{!{{.*}}, !{{.*}}, [[P:![0-9]*]], [[R:.*]]}
-// CHECK: [[P]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: [[CON:![0-9]*]]
-// CHECK: [[CON]] = !DIDerivedType(tag: DW_TAG_const_type, baseType: [[CH:![0-9]*]]
-// CHECK: [[CH]] = !DIBasicType(name: "char", size: 8, align: 8, encoding: DW_ATE_signed_char)
+// CHECK: [[ARGS]] = !{!{{.*}}, !{{.*}}, [[P]], [[R:.*]]}
 
 // CHECK: [[R]] = !DIDerivedType(tag: DW_TAG_reference_type, baseType: [[CON2:![0-9]*]]
-// CHECK: [[CON2]] = !DIDerivedType(tag: DW_TAG_const_type, baseType: !"_ZTS12basic_stringIcE"
+// CHECK: [[CON2]] = !DIDerivedType(tag: DW_TAG_const_type, baseType: [[BS]]
 // CHECK: !DISubprogram(name: "assign"
 // CHECK-SAME:          line: 7
 // CHECK-SAME:          scopeLine: 8
diff --git a/test/CodeGenCXX/debug-info-template.cpp b/test/CodeGenCXX/debug-info-template.cpp
index 74adef9a5f7aa..d35bb70fa713e 100644
--- a/test/CodeGenCXX/debug-info-template.cpp
+++ b/test/CodeGenCXX/debug-info-template.cpp
@@ -1,73 +1,105 @@
 // RUN: %clang -S -emit-llvm -target x86_64-unknown_unknown -g %s -o - -std=c++11 | FileCheck %s
 
 // CHECK: !DICompileUnit(
-// CHECK-SAME:           retainedTypes: [[RETAIN:![0-9]*]]
 // CHECK: [[EMPTY:![0-9]*]] = !{}
-// CHECK: [[RETAIN]] = !{!{{[0-9]]*}}, [[FOO:![0-9]*]],
 
+struct foo {
+  char pad[8]; // make the member pointer to 'e' a bit more interesting (nonzero)
+  int e;
+  void f();
+  static void g();
+};
+
+typedef int foo::*foo_mem;
+
+template<typename T, T, const int *x, foo_mem a, void (foo::*b)(), void (*f)(), int ...Is>
+struct TC {
+  struct nested {
+  };
+};
+
+// CHECK: [[INT:![0-9]+]] = !DIBasicType(name: "int"
+int glb;
+void func();
+
+// CHECK: !DIGlobalVariable(name: "tci",
+// CHECK-SAME:              type: ![[TCNESTED:[0-9]+]]
+// CHECK-SAME:              variable: %"struct.TC<unsigned int, 2, &glb, &foo::e, &foo::f, &foo::g, 1, 2, 3>::nested"* @tci
+// CHECK: ![[TCNESTED]] ={{.*}}!DICompositeType(tag: DW_TAG_structure_type, name: "nested",
+// CHECK-SAME:             scope: ![[TC:[0-9]+]],
 
-// CHECK: [[TC:![0-9]*]] = !DICompositeType(tag: DW_TAG_structure_type, name: "TC<unsigned int, 2, &glb, &foo::e, &foo::f, &foo::g, 1, 2, 3>"
+// CHECK: ![[TC]] = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "TC<unsigned int, 2, &glb, &foo::e, &foo::f, &foo::g, 1, 2, 3>"
 // CHECK-SAME:                              templateParams: [[TCARGS:![0-9]*]]
+TC
 // CHECK: [[TCARGS]] = !{[[TCARG1:![0-9]*]], [[TCARG2:![0-9]*]], [[TCARG3:![0-9]*]], [[TCARG4:![0-9]*]], [[TCARG5:![0-9]*]], [[TCARG6:![0-9]*]], [[TCARG7:![0-9]*]]}
-//
 // CHECK: [[TCARG1]] = !DITemplateTypeParameter(name: "T", type: [[UINT:![0-9]*]])
 // CHECK: [[UINT:![0-9]*]] = !DIBasicType(name: "unsigned int"
+< unsigned,
 // CHECK: [[TCARG2]] = !DITemplateValueParameter(type: [[UINT]], value: i32 2)
+  2,
 // CHECK: [[TCARG3]] = !DITemplateValueParameter(name: "x", type: [[CINTPTR:![0-9]*]], value: i32* @glb)
 // CHECK: [[CINTPTR]] = !DIDerivedType(tag: DW_TAG_pointer_type, {{.*}}baseType: [[CINT:![0-9]+]]
-// CHECK: [[CINT]] = !DIDerivedType(tag: DW_TAG_const_type, {{.*}}baseType: [[INT:![0-9]+]]
-// CHECK: [[INT]] = !DIBasicType(name: "int"
+// CHECK: [[CINT]] = !DIDerivedType(tag: DW_TAG_const_type, {{.*}}baseType: [[INT]]
+  &glb,
 // CHECK: [[TCARG4]] = !DITemplateValueParameter(name: "a", type: [[MEMINTPTR:![0-9]*]], value: i64 8)
-// CHECK: [[MEMINTPTR]] = !DIDerivedType(tag: DW_TAG_ptr_to_member_type, {{.*}}baseType: [[INT]], {{.*}}extraData: !"_ZTS3foo")
+// CHECK: [[MEMINTPTR]] = !DIDerivedType(tag: DW_TAG_ptr_to_member_type, {{.*}}baseType: [[INT]], {{.*}}extraData: ![[FOO:[0-9]+]])
+//
+// We could just emit a declaration of 'foo' here, rather than the entire
+// definition (same goes for any time we emit a member (function or data)
+// pointer type)
+// CHECK: [[FOO]] = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "foo", {{.*}}identifier: "_ZTS3foo")
+// CHECK: !DISubprogram(name: "f", linkageName: "_ZN3foo1fEv", {{.*}}type: [[FTYPE:![0-9]*]]
 //
 // Currently Clang emits the pointer-to-member-function value, but LLVM doesn't
 // use it (GCC doesn't emit a value for pointers to member functions either - so
 // it's not clear what, if any, format would be acceptable to GDB)
 //
-// CHECK: [[TCARG5]] = !DITemplateValueParameter(name: "b", type: [[MEMFUNPTR:![0-9]*]], value: { i64, i64 } { i64 ptrtoint (void (%struct.foo*)* @_ZN3foo1fEv to i64), i64 0 })
-// CHECK: [[MEMFUNPTR]] = !DIDerivedType(tag: DW_TAG_ptr_to_member_type, {{.*}}baseType: [[FTYPE:![0-9]*]], {{.*}}extraData: !"_ZTS3foo")
-// CHECK: [[FTYPE]] = !DISubroutineType(types: [[FARGS:![0-9]*]])
+// CHECK: [[FTYPE:![0-9]*]] = !DISubroutineType(types: [[FARGS:![0-9]*]])
 // CHECK: [[FARGS]] = !{null, [[FARG1:![0-9]*]]}
 // CHECK: [[FARG1]] = !DIDerivedType(tag: DW_TAG_pointer_type,
-// CHECK-SAME:                       baseType: !"_ZTS3foo"
+// CHECK-SAME:                       baseType: ![[FOO]]
 // CHECK-NOT:                        line:
 // CHECK-SAME:                       size: 64, align: 64
 // CHECK-NOT:                        offset: 0
 // CHECK-SAME:                       DIFlagArtificial
-//
-// CHECK: [[TCARG6]] = !DITemplateValueParameter(name: "f", type: [[FUNPTR:![0-9]*]], value: void ()* @_ZN3foo1gEv)
-// CHECK: [[FUNPTR]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: [[FUNTYPE:![0-9]*]]
-// CHECK: [[FUNTYPE]] = !DISubroutineType(types: [[FUNARGS:![0-9]*]])
+// CHECK: [[FUNTYPE:![0-9]*]] = !DISubroutineType(types: [[FUNARGS:![0-9]*]])
 // CHECK: [[FUNARGS]] = !{null}
+  &foo::e,
+// CHECK: [[TCARG5]] = !DITemplateValueParameter(name: "b", type: [[MEMFUNPTR:![0-9]*]], value: { i64, i64 } { i64 ptrtoint (void (%struct.foo*)* @_ZN3foo1fEv to i64), i64 0 })
+// CHECK: [[MEMFUNPTR]] = !DIDerivedType(tag: DW_TAG_ptr_to_member_type, {{.*}}baseType: [[FTYPE]], {{.*}}extraData: ![[FOO]])
+  &foo::f,
+// CHECK: [[TCARG6]] = !DITemplateValueParameter(name: "f", type: [[FUNPTR:![0-9]*]], value: void ()* @_ZN3foo1gEv)
+// CHECK: [[FUNPTR]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: [[FUNTYPE]]
+  &foo::g,
 // CHECK: [[TCARG7]] = !DITemplateValueParameter(tag: DW_TAG_GNU_template_parameter_pack, name: "Is", value: [[TCARG7_VALS:![0-9]*]])
 // CHECK: [[TCARG7_VALS]] = !{[[TCARG7_1:![0-9]*]], [[TCARG7_2:![0-9]*]], [[TCARG7_3:![0-9]*]]}
 // CHECK: [[TCARG7_1]] = !DITemplateValueParameter(type: [[INT]], value: i32 1)
+  1,
 // CHECK: [[TCARG7_2]] = !DITemplateValueParameter(type: [[INT]], value: i32 2)
+  2,
 // CHECK: [[TCARG7_3]] = !DITemplateValueParameter(type: [[INT]], value: i32 3)
-//
-// We could just emit a declaration of 'foo' here, rather than the entire
-// definition (same goes for any time we emit a member (function or data)
-// pointer type)
-// CHECK: [[FOO]] = !DICompositeType(tag: DW_TAG_structure_type, name: "foo", {{.*}}identifier: "_ZTS3foo")
-// CHECK: !DISubprogram(name: "f", linkageName: "_ZN3foo1fEv", {{.*}}type: [[FTYPE:![0-9]*]]
-//
+  3>::nested tci;
 
-// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "nested",
-// CHECK-SAME:             scope: !"_ZTS2TCIjLj2EXadL_Z3glbEEXadL_ZN3foo1eEEEXadL_ZNS0_1fEvEEXadL_ZNS0_1gEvEEJLi1ELi2ELi3EEE"
-// CHECK-SAME:             identifier: "[[TCNESTED:.*]]")
-// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "TC<int, -3, nullptr, nullptr, nullptr, nullptr>"
+// CHECK: !DIGlobalVariable(name: "tcn"
+// CHECK-SAME:              type: ![[TCNT:[0-9]+]]
+// CHECK-SAME:              variable: %struct.TC* @tcn
+TC
+// CHECK: ![[TCNT]] ={{.*}}!DICompositeType(tag: DW_TAG_structure_type, name: "TC<int, -3, nullptr, nullptr, nullptr, nullptr>"
 // CHECK-SAME:             templateParams: [[TCNARGS:![0-9]*]]
-// CHECK-SAME:             identifier: "[[TCNT:.*]]")
 // CHECK: [[TCNARGS]] = !{[[TCNARG1:![0-9]*]], [[TCNARG2:![0-9]*]], [[TCNARG3:![0-9]*]], [[TCNARG4:![0-9]*]], [[TCNARG5:![0-9]*]], [[TCNARG6:![0-9]*]], [[TCNARG7:![0-9]*]]}
 // CHECK: [[TCNARG1]] = !DITemplateTypeParameter(name: "T", type: [[INT]])
+<int,
 // CHECK: [[TCNARG2]] = !DITemplateValueParameter(type: [[INT]], value: i32 -3)
+  -3,
 // CHECK: [[TCNARG3]] = !DITemplateValueParameter(name: "x", type: [[CINTPTR]], value: i8 0)
+  nullptr,
 
 // The interesting null pointer: -1 for member data pointers (since they are
 // just an offset in an object, they can be zero and non-null for the first
 // member)
 
 // CHECK: [[TCNARG4]] = !DITemplateValueParameter(name: "a", type: [[MEMINTPTR]], value: i64 -1)
+  nullptr,
 //
 // In some future iteration we could possibly emit the value of a null member
 // function pointer as '{ i64, i64 } zeroinitializer' as it may be handled
@@ -75,69 +107,38 @@
 // member function pointers. For now, it's simpler just to emit the 'i8 0'.
 //
 // CHECK: [[TCNARG5]] = !DITemplateValueParameter(name: "b", type: [[MEMFUNPTR]], value: i8 0)
+  nullptr,
 // CHECK: [[TCNARG6]] = !DITemplateValueParameter(name: "f", type: [[FUNPTR]], value: i8 0)
+  nullptr
 // CHECK: [[TCNARG7]] = !DITemplateValueParameter(tag: DW_TAG_GNU_template_parameter_pack, name: "Is", value: [[EMPTY]])
+  > tcn;
+
+template<typename>
+struct tmpl_impl {
+};
+
+template <template <typename> class tmpl, int &lvr, int &&rvr>
+struct NN {
+};
+
+// CHECK: !DIGlobalVariable(name: "nn"
+// CHECK-SAME:              type: ![[NNT:[0-9]+]]
+// CHECK-SAME:              variable: %struct.NN* @nn
 
 // FIXME: these parameters should probably be rendered as 'glb' rather than
 // '&glb', since they're references, not pointers.
-// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "NN<tmpl_impl, &glb, &glb>",
+// CHECK: ![[NNT]] ={{.*}}!DICompositeType(tag: DW_TAG_structure_type, name: "NN<tmpl_impl, &glb, &glb>",
 // CHECK-SAME:             templateParams: [[NNARGS:![0-9]*]]
-// CHECK-SAME:             identifier: "[[NNT:.*]]")
+// CHECK-SAME:             identifier:
 // CHECK: [[NNARGS]] = !{[[NNARG1:![0-9]*]], [[NNARG2:![0-9]*]], [[NNARG3:![0-9]*]]}
 // CHECK: [[NNARG1]] = !DITemplateValueParameter(tag: DW_TAG_GNU_template_template_param, name: "tmpl", value: !"tmpl_impl")
 // CHECK: [[NNARG2]] = !DITemplateValueParameter(name: "lvr", type: [[INTLVR:![0-9]*]], value: i32* @glb)
 // CHECK: [[INTLVR]] = !DIDerivedType(tag: DW_TAG_reference_type, baseType: [[INT]]
 // CHECK: [[NNARG3]] = !DITemplateValueParameter(name: "rvr", type: [[INTRVR:![0-9]*]], value: i32* @glb)
 // CHECK: [[INTRVR]] = !DIDerivedType(tag: DW_TAG_rvalue_reference_type, baseType: [[INT]]
-
-// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "PaddingAtEndTemplate<&PaddedObj>"
-// CHECK-SAME:             templateParams: [[PTOARGS:![0-9]*]]
-// CHECK: [[PTOARGS]] = !{[[PTOARG1:![0-9]*]]}
-// CHECK: [[PTOARG1]] = !DITemplateValueParameter(type: [[CONST_PADDINGATEND_PTR:![0-9]*]], value: %struct.PaddingAtEnd* @PaddedObj)
-// CHECK: [[CONST_PADDINGATEND_PTR]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !"_ZTS12PaddingAtEnd", size: 64, align: 64)
-
-// CHECK: !DIGlobalVariable(name: "tci",
-// CHECK-SAME:              type: !"[[TCNESTED]]"
-// CHECK-SAME:              variable: %"struct.TC<unsigned int, 2, &glb, &foo::e, &foo::f, &foo::g, 1, 2, 3>::nested"* @tci
-
-// CHECK: !DIGlobalVariable(name: "tcn"
-// CHECK-SAME:              type: !"[[TCNT]]"
-// CHECK-SAME:              variable: %struct.TC* @tcn
-
-// CHECK: !DIGlobalVariable(name: "nn"
-// CHECK-SAME:              type: !"[[NNT]]"
-// CHECK-SAME:              variable: %struct.NN* @nn
-struct foo {
-  char pad[8]; // make the member pointer to 'e' a bit more interesting (nonzero)
-  int e;
-  void f();
-  static void g();
-};
-
-typedef int foo::*foo_mem;
-
-template<typename T, T, const int *x, foo_mem a, void (foo::*b)(), void (*f)(), int ...Is>
-struct TC {
-  struct nested {
-  };
-};
-
-int glb;
-void func();
-
-TC<unsigned, 2, &glb, &foo::e, &foo::f, &foo::g, 1, 2, 3>::nested tci;
-TC<int, -3, nullptr, nullptr, nullptr, nullptr> tcn;
-
-template<typename>
-struct tmpl_impl {
-};
-
-template <template <typename> class tmpl, int &lvr, int &&rvr>
-struct NN {
-};
-
 NN<tmpl_impl, glb, glb> nn;
 
+// CHECK: ![[PADDINGATEND:[0-9]+]] = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "PaddingAtEnd",
 struct PaddingAtEnd {
   int i;
   char c;
@@ -145,6 +146,11 @@ struct PaddingAtEnd {
 
 PaddingAtEnd PaddedObj = {};
 
+// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "PaddingAtEndTemplate<&PaddedObj>"
+// CHECK-SAME:             templateParams: [[PTOARGS:![0-9]*]]
+// CHECK: [[PTOARGS]] = !{[[PTOARG1:![0-9]*]]}
+// CHECK: [[PTOARG1]] = !DITemplateValueParameter(type: [[CONST_PADDINGATEND_PTR:![0-9]*]], value: %struct.PaddingAtEnd* @PaddedObj)
+// CHECK: [[CONST_PADDINGATEND_PTR]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: ![[PADDINGATEND]], size: 64, align: 64)
 template <PaddingAtEnd *>
 struct PaddingAtEndTemplate {
 };
diff --git a/test/CodeGenCXX/debug-info-varargs.cpp b/test/CodeGenCXX/debug-info-varargs.cpp
index 52bffe6c92858..7afbcd255842e 100644
--- a/test/CodeGenCXX/debug-info-varargs.cpp
+++ b/test/CodeGenCXX/debug-info-varargs.cpp
@@ -2,13 +2,7 @@
 
 struct A
 {
-  // CHECK: !DISubprogram(name: "a", linkageName: "_ZN1A1aEiz"
-  // CHECK-SAME:          line: [[@LINE+2]]
-  // CHECK-SAME:          type: ![[ATY:[0-9]+]]
   void a(int c, ...) {}
-  // CHECK: ![[ATY]] = !DISubroutineType(types: ![[AARGS:[0-9]+]])
-  // We no longer use an explicit unspecified parameter. Instead we use a trailing null to mean the function is variadic.
-  // CHECK: ![[AARGS]] = !{null, !{{[0-9]+}}, !{{[0-9]+}}, null}
 };
 
   // CHECK: !DISubprogram(name: "b", linkageName: "_Z1biz"
@@ -18,6 +12,14 @@ void b(int c, ...) {
   // CHECK: ![[BTY]] = !DISubroutineType(types: ![[BARGS:[0-9]+]])
   // CHECK: ![[BARGS]] = !{null, !{{[0-9]+}}, null}
 
+  // The subprogram "a" comes after "b" because the function comes later.
+  // CHECK: !DISubprogram(name: "a", linkageName: "_ZN1A1aEiz"
+  // CHECK-SAME:          line: 5,
+  // CHECK-SAME:          type: ![[ATY:[0-9]+]]
+  // CHECK: ![[ATY]] = !DISubroutineType(types: ![[AARGS:[0-9]+]])
+  // We no longer use an explicit unspecified parameter. Instead we use a trailing null to mean the function is variadic.
+  // CHECK: ![[AARGS]] = !{null, !{{[0-9]+}}, !{{[0-9]+}}, null}
+
   A a;
 
   // CHECK: !DILocalVariable(name: "fptr"
diff --git a/test/CodeGenCXX/debug-info.cpp b/test/CodeGenCXX/debug-info.cpp
index 29ed9e93156d6..2e8d5fd5f1379 100644
--- a/test/CodeGenCXX/debug-info.cpp
+++ b/test/CodeGenCXX/debug-info.cpp
@@ -1,5 +1,24 @@
-// RUN: %clang_cc1 -triple x86_64-none-linux-gnu -emit-llvm -debug-info-kind=limited %s -o - | FileCheck %s
-// RUN: %clang_cc1 -triple i686-pc-windows-msvc -emit-llvm -debug-info-kind=limited %s -o - | FileCheck %s --check-prefix=MSVC
+// RUN: %clang_cc1 -triple x86_64-none-linux-gnu -emit-llvm -debug-info-kind=limited %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=BOTH
+// RUN: %clang_cc1 -triple i686-pc-windows-msvc -emit-llvm -debug-info-kind=limited %s -o - | FileCheck %s --check-prefix=MSVC --check-prefix=BOTH
+
+// CHECK: define void @_ZN7pr147634funcENS_3fooE
+// CHECK: call void @llvm.dbg.declare({{.*}}, metadata ![[F:[0-9]+]], metadata ![[EXPR:[0-9]+]])
+
+// !llvm.dbg.cu pulls in globals and their types first.
+// CHECK-NOT: !DIGlobalVariable(name: "c"
+// CHECK: !DIGlobalVariable(name: "x", linkageName: "_ZN6pr96081xE"
+// CHECK-SAME:              type: [[INCARRAYPTR:![0-9]*]]
+// CHECK-SAME:              variable: [3 x i8]** @_ZN6pr96081xE
+// CHECK: [[INCARRAYPTR]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: [[INCARRAY:![0-9]+]]
+// CHECK: [[INCARRAY]] = !DICompositeType(tag: DW_TAG_array_type
+// CHECK-NOT:                             line:
+// CHECK-NOT:                             size:
+// CHECK-NOT:                             align:
+// CHECK-NOT:                             offset:
+// CHECK-SAME:                            baseType: ![[INCTYPE:[0-9]+]]
+
+// CHECK: ![[INCTYPE]] = !DICompositeType(tag: DW_TAG_structure_type, name: "incomplete"
+// CHECK-SAME:                                   DIFlagFwdDecl
 
 template<typename T> struct Identity {
   typedef T Type;
@@ -47,38 +66,27 @@ namespace VirtualDtor {
 namespace VirtualBase {
   struct A { int a; };
   struct B : virtual A { int b; };
+// BOTH: ![[VBASE_B:[0-9]+]] ={{.*}}!DICompositeType(tag: DW_TAG_structure_type, name: "B",{{.*}} line: [[@LINE-1]],
+// MSVC-SAME:                                        size: 96, align: 32
+// CHECK-SAME:                                       size: 128, align: 64,
+// BOTH-NOT:                                         offset:
+// BOTH-NOT:                                         DIFlagFwdDecl
+// BOTH-SAME:                                        elements: [[VBASE_B_DEF:![0-9]+]]
+// BOTH: [[VBASE_B_DEF]] = !{[[VBASE_A_IN_B:![0-9]+]],
+//
+// Look for the vbtable offset of A, which should be 4 for MSVC, 24 otherwise.
+// BOTH: [[VBASE_A_IN_B]] = !DIDerivedType(tag: DW_TAG_inheritance, scope: ![[VBASE_B]],
+// BOTH-SAME:                              baseType: ![[VBASE_A:[0-9]+]],
+// MSVC-SAME:                              offset: 4,
+// CHECK-SAME:                             offset: 24,
+//
+// BOTH: ![[VBASE_A]] ={{.*}}!DICompositeType(tag: DW_TAG_structure_type, name: "A",
 
   void f() {
     B b;
   }
 }
 
-// CHECK: define void @_ZN7pr147634funcENS_3fooE
-// CHECK: call void @llvm.dbg.declare({{.*}}, metadata ![[F:.*]], metadata ![[EXPR:.*]])
-
-// MSVC: [[VBASE_B:![0-9]+]] = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "B",{{.*}} line: 49
-// MSVC-SAME:                                            size: 96, align: 32
-// MSVC-NOT:                                             offset:
-// MSVC-NOT:                                             DIFlagFwdDecl
-// MSVC-SAME:                                            elements: [[VBASE_B_DEF:![0-9]+]]
-// MSVC: [[VBASE_B_DEF]] = !{[[VBASE_A_IN_B:![0-9]+]],
-//
-// Look for the vbtable offset of A, which should be 4.
-// MSVC: [[VBASE_A_IN_B]] = !DIDerivedType(tag: DW_TAG_inheritance, scope: [[VBASE_B]],
-// MSVC-SAME:                              baseType: !{{[0-9]*}}
-
-// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "B",{{.*}} line: 49,
-// CHECK-SAME:             size: 128, align: 64,
-// CHECK-NOT:              offset:
-// CHECK-NOT:              DIFlagFwdDecl
-// CHECK-SAME:             elements: [[VBASE_B_DEF:![^,)]+]]
-// CHECK: [[VBASE_B_DEF]] = !{[[VBASE_A_IN_B:![0-9]+]],
-//
-// Look for the vtable offset offset, which should be -24.
-// CHECK: [[VBASE_A_IN_B]] = !DIDerivedType(tag: DW_TAG_inheritance
-// CHECK-SAME:                              scope: !"_ZTSN11VirtualBase1BE"
-// CHECK-SAME:                              baseType: !"_ZTSN11VirtualBase1AE"
-// CHECK-SAME:                              offset: 24,
 namespace b5249287 {
 template <typename T> class A {
   struct B;
@@ -91,72 +99,56 @@ class Cls {
 Cls obj;
 }
 
+// CHECK: [[FUNC:[0-9]+]] = distinct !DISubprogram(name: "func", linkageName: "_ZN7pr147634funcENS_3fooE"
+// CHECK-SAME:                                      type: {{![0-9]+}}
+// CHECK-SAME:                                      isDefinition: true
+
+// CHECK: [[PR14763:![0-9]+]] = !DINamespace(name: "pr14763"
 namespace pr14763 {
 struct foo {
+// CHECK: ![[FOO:[0-9]+]] ={{.*}}!DICompositeType(tag: DW_TAG_structure_type, name: "foo"
+// CHECK-SAME:             scope: [[PR14763]]
+// CHECK-SAME:             identifier:
   foo(const foo&);
 };
 
+// For some reason function arguments ended up down here
+// CHECK: ![[F]] = !DILocalVariable(name: "f", arg: 1, scope: ![[FUNC]]
+// CHECK-SAME:                      type: ![[FOO]]
+// CHECK: ![[EXPR]] = !DIExpression(DW_OP_deref)
 foo func(foo f) {
   return f; // reference 'f' for now because otherwise we hit another bug
 }
 
-// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "foo"
-// CHECK-SAME:             scope: [[PR14763:![0-9]+]]
-// CHECK-SAME:             identifier: "[[FOO:.*]]"
-// CHECK: [[PR14763]] = !DINamespace(name: "pr14763"
-// CHECK: [[INCTYPE:![0-9]*]] = !DICompositeType(tag: DW_TAG_structure_type, name: "incomplete"
-// CHECK-SAME:                                   DIFlagFwdDecl
-// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "a"
-// CHECK-SAME:             elements: [[A_MEM:![0-9]+]]
-// CHECK-SAME:             identifier: "_ZTSN7pr162141aE"
-// CHECK: [[A_MEM]] = !{[[A_I:![0-9]*]]}
-// CHECK: [[A_I]] = !DIDerivedType(tag: DW_TAG_member, name: "i"
-// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "b"
-// CHECK-SAME:             DIFlagFwdDecl
-
-// CHECK: [[FUNC:![0-9]+]] = distinct !DISubprogram(name: "func", linkageName: "_ZN7pr147634funcENS_3fooE"
-// CHECK-SAME:                                      type: [[FUNC_TYPE:![0-9]*]]
-// CHECK-SAME:                                      isDefinition: true
 }
 
 void foo() {
+// CHECK: !DILocalVariable(name: "c"
+// CHECK-NOT:              arg:
+// CHECK-SAME:            )
   const wchar_t c = L'x';
   wchar_t d = c;
 }
 
-// CHECK-NOT: !DIGlobalVariable(name: "c"
-
 namespace pr9608 { // also pr9600
 struct incomplete;
 incomplete (*x)[3];
-// CHECK: !DIGlobalVariable(name: "x", linkageName: "_ZN6pr96081xE"
-// CHECK-SAME:              type: [[INCARRAYPTR:![0-9]*]]
-// CHECK-SAME:              variable: [3 x i8]** @_ZN6pr96081xE
-// CHECK: [[INCARRAYPTR]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: [[INCARRAY:![0-9]+]]
-// CHECK: [[INCARRAY]] = !DICompositeType(tag: DW_TAG_array_type
-// CHECK-NOT:                             line:
-// CHECK-NOT:                             size:
-// CHECK-NOT:                             align:
-// CHECK-NOT:                             offset:
-// CHECK-SAME:                            baseType: !"_ZTSN6pr960810incompleteE"
 }
 
-// For some reason function arguments ended up down here
-// CHECK: ![[F]] = !DILocalVariable(name: "f", arg: 1, scope: [[FUNC]]
-// CHECK-SAME:                      type: !"[[FOO]]"
-// CHECK: ![[EXPR]] = !DIExpression(DW_OP_deref)
-
-// CHECK: !DILocalVariable(name: "c"
-// CHECK-NOT:              arg:
-// CHECK-SAME:            )
-
 namespace pr16214 {
+// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "a"
+// CHECK-SAME:             elements: [[A_MEM:![0-9]+]]
+// CHECK-SAME:             identifier: "_ZTSN7pr162141aE"
+// CHECK: [[A_MEM]] = !{[[A_I:![0-9]*]]}
 struct a {
+// CHECK: [[A_I]] = !DIDerivedType(tag: DW_TAG_member, name: "i"
   int i;
 };
 
 typedef a at;
 
+// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "b"
+// CHECK-SAME:             DIFlagFwdDecl
 struct b {
 };
 
diff --git a/test/CodeGenCXX/debug-lambda-expressions.cpp b/test/CodeGenCXX/debug-lambda-expressions.cpp
index a022fad1b6b92..b01f770bdd970 100644
--- a/test/CodeGenCXX/debug-lambda-expressions.cpp
+++ b/test/CodeGenCXX/debug-lambda-expressions.cpp
@@ -17,23 +17,31 @@ int d(int x) { D y[10]; return [x,y] { return y[x].x; }(); }
 // Randomness for file. -- 6
 // CHECK: [[FILE:.*]] = !DIFile(filename: "{{.*}}debug-lambda-expressions.cpp",
 
+// CVAR:
+// CHECK: !DIGlobalVariable(name: "cvar"
+// CHECK-SAME:              line: [[CVAR_LINE:[0-9]+]]
+// CHECK-SAME:              type: ![[CVAR_T:[0-9]+]]
+// CHECK: ![[CVAR_T]] = distinct !DICompositeType(tag: DW_TAG_class_type
+// CHECK-SAME:                           line: [[CVAR_LINE]],
+// CHECK-SAME:                           elements: ![[CVAR_ARGS:[0-9]+]]
+// CHECK: ![[CVAR_ARGS]] = !{!{{[0-9]+}}}
+
+// VAR:
+// CHECK: !DIGlobalVariable(name: "var"
+// CHECK-SAME:              line: [[VAR_LINE:[0-9]+]]
+// CHECK-SAME:              type: ![[VAR_T:[0-9]+]]
+// CHECK: ![[VAR_T]] = distinct !DICompositeType(tag: DW_TAG_class_type
+// CHECK-SAME:                          line: [[VAR_LINE]],
+// CHECK-SAME:                          elements: ![[VAR_ARGS:[0-9]+]]
+// CHECK: ![[VAR_ARGS]] = !{!{{[0-9]+}}}
+
 // CHECK: ![[INT:[0-9]+]] = !DIBasicType(name: "int"
 
 // A: 10
 // CHECK: ![[A_FUNC:.*]] = distinct !DISubprogram(name: "a"{{.*}}, line: [[A_LINE:[0-9]+]]{{.*}}, isDefinition: true
 
-// B: 14
-// CHECK: ![[B_FUNC:.*]] = distinct !DISubprogram(name: "b"{{.*}}, line: [[B_LINE:[0-9]+]]{{.*}}, isDefinition: true
-
-// C: 17
-// CHECK: ![[C_FUNC:.*]] = distinct !DISubprogram(name: "c"{{.*}}, line: [[C_LINE:[0-9]+]]{{.*}}, isDefinition: true
-
-// D: 18
-// CHECK: ![[D_FUNC:.*]] = distinct !DISubprogram(name: "d"{{.*}}, line: [[D_LINE:[0-9]+]]{{.*}}, isDefinition: true
-
-
 // Back to A. -- 78
-// CHECK: ![[LAM_A:.*]] = !DICompositeType(tag: DW_TAG_class_type{{.*}}, scope: ![[A_FUNC]]{{.*}}, line: [[A_LINE]],
+// CHECK: ![[LAM_A:.*]] = distinct !DICompositeType(tag: DW_TAG_class_type{{.*}}, scope: ![[A_FUNC]]{{.*}}, line: [[A_LINE]],
 // CHECK-SAME:                             elements: ![[LAM_A_ARGS:[0-9]+]]
 // CHECK: ![[LAM_A_ARGS]] = !{![[CON_LAM_A:[0-9]+]]}
 // CHECK: ![[CON_LAM_A]] = !DISubprogram(name: "operator()"
@@ -41,8 +49,11 @@ int d(int x) { D y[10]; return [x,y] { return y[x].x; }(); }
 // CHECK-SAME:                           line: [[A_LINE]]
 // CHECK-SAME:                           DIFlagPublic
 
+// B: 14
+// CHECK: ![[B_FUNC:.*]] = distinct !DISubprogram(name: "b"{{.*}}, line: [[B_LINE:[0-9]+]]{{.*}}, isDefinition: true
+
 // Back to B. -- 67
-// CHECK: ![[LAM_B:.*]] = !DICompositeType(tag: DW_TAG_class_type{{.*}}, scope: ![[B_FUNC]]{{.*}}, line: [[B_LINE]],
+// CHECK: ![[LAM_B:.*]] = distinct !DICompositeType(tag: DW_TAG_class_type{{.*}}, scope: ![[B_FUNC]]{{.*}}, line: [[B_LINE]],
 // CHECK-SAME:                             elements: ![[LAM_B_ARGS:[0-9]+]]
 // CHECK: ![[LAM_B_ARGS]] = !{![[CAP_B:[0-9]+]], ![[CON_LAM_B:[0-9]+]]}
 // CHECK: ![[CAP_B]] = !DIDerivedType(tag: DW_TAG_member, name: "x"
@@ -54,8 +65,11 @@ int d(int x) { D y[10]; return [x,y] { return y[x].x; }(); }
 // CHECK-SAME:                           line: [[B_LINE]]
 // CHECK-SAME:                           DIFlagPublic
 
+// C: 17
+// CHECK: ![[C_FUNC:.*]] = distinct !DISubprogram(name: "c"{{.*}}, line: [[C_LINE:[0-9]+]]{{.*}}, isDefinition: true
+
 // Back to C. -- 55
-// CHECK: ![[LAM_C:.*]] = !DICompositeType(tag: DW_TAG_class_type{{.*}}, scope: ![[C_FUNC]]{{.*}}, line: [[C_LINE]],
+// CHECK: ![[LAM_C:.*]] = distinct !DICompositeType(tag: DW_TAG_class_type{{.*}}, scope: ![[C_FUNC]]{{.*}}, line: [[C_LINE]],
 // CHECK-SAME:                             elements: ![[LAM_C_ARGS:[0-9]+]]
 // CHECK: ![[LAM_C_ARGS]] = !{![[CAP_C:[0-9]+]], ![[CON_LAM_C:[0-9]+]]}
 // CHECK: ![[CAP_C]] = !DIDerivedType(tag: DW_TAG_member, name: "x"
@@ -68,8 +82,11 @@ int d(int x) { D y[10]; return [x,y] { return y[x].x; }(); }
 // CHECK-SAME:                           line: [[C_LINE]]
 // CHECK-SAME:                           DIFlagPublic
 
+// D: 18
+// CHECK: ![[D_FUNC:.*]] = distinct !DISubprogram(name: "d"{{.*}}, line: [[D_LINE:[0-9]+]]{{.*}}, isDefinition: true
+
 // Back to D. -- 24
-// CHECK: ![[LAM_D:.*]] = !DICompositeType(tag: DW_TAG_class_type{{.*}}, scope: ![[D_FUNC]]{{.*}}, line: [[D_LINE]],
+// CHECK: ![[LAM_D:.*]] = distinct !DICompositeType(tag: DW_TAG_class_type{{.*}}, scope: ![[D_FUNC]]{{.*}}, line: [[D_LINE]],
 // CHECK-SAME:                             elements: ![[LAM_D_ARGS:[0-9]+]]
 // CHECK: ![[LAM_D_ARGS]] = !{![[CAP_D_X:[0-9]+]], ![[CAP_D_Y:[0-9]+]], ![[CON_LAM_D:[0-9]+]]}
 // CHECK: ![[CAP_D_X]] = !DIDerivedType(tag: DW_TAG_member, name: "x"
@@ -82,21 +99,3 @@ int d(int x) { D y[10]; return [x,y] { return y[x].x; }(); }
 // CHECK-SAME:                           scope: ![[LAM_D]]
 // CHECK-SAME:                           line: [[D_LINE]]
 // CHECK-SAME:                           DIFlagPublic
-
-// CVAR:
-// CHECK: !DIGlobalVariable(name: "cvar"
-// CHECK-SAME:              line: [[CVAR_LINE:[0-9]+]]
-// CHECK-SAME:              type: ![[CVAR_T:[0-9]+]]
-// CHECK: ![[CVAR_T]] = !DICompositeType(tag: DW_TAG_class_type
-// CHECK-SAME:                           line: [[CVAR_LINE]],
-// CHECK-SAME:                           elements: ![[CVAR_ARGS:[0-9]+]]
-// CHECK: ![[CVAR_ARGS]] = !{!{{[0-9]+}}}
-
-// VAR:
-// CHECK: !DIGlobalVariable(name: "var"
-// CHECK-SAME:              line: [[VAR_LINE:[0-9]+]]
-// CHECK-SAME:              type: ![[VAR_T:[0-9]+]]
-// CHECK: ![[VAR_T]] = !DICompositeType(tag: DW_TAG_class_type
-// CHECK-SAME:                          line: [[VAR_LINE]],
-// CHECK-SAME:                          elements: ![[VAR_ARGS:[0-9]+]]
-// CHECK: ![[VAR_ARGS]] = !{!{{[0-9]+}}}
diff --git a/test/CodeGenCXX/debug-lambda-this.cpp b/test/CodeGenCXX/debug-lambda-this.cpp
index 0c413449a3ec9..4af3d81983b23 100644
--- a/test/CodeGenCXX/debug-lambda-this.cpp
+++ b/test/CodeGenCXX/debug-lambda-this.cpp
@@ -12,10 +12,11 @@ int D::d(int x) {
   }();
 }
 
+// CHECK: ![[D:[0-9]+]] = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "D",
+// CHECK: ![[POINTER:.*]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: ![[D]], size: 64, align: 64)
 // CHECK: !DIDerivedType(tag: DW_TAG_member, name: "this",
 // CHECK-SAME:           line: 11
-// CHECK-SAME:           baseType: ![[POINTER:[0-9]+]]
+// CHECK-SAME:           baseType: ![[POINTER]]
 // CHECK-SAME:           size: 64, align: 64
 // CHECK-NOT:            offset: 0
 // CHECK-SAME:           ){{$}}
-// CHECK: ![[POINTER]] = !DIDerivedType(tag: DW_TAG_pointer_type
diff --git a/test/CodeGenCXX/default_calling_conv.cpp b/test/CodeGenCXX/default_calling_conv.cpp
new file mode 100644
index 0000000000000..95c214a223d43
--- /dev/null
+++ b/test/CodeGenCXX/default_calling_conv.cpp
@@ -0,0 +1,34 @@
+// RUN: %clang_cc1 -triple i386-unknown-linux-gnu -fdefault-calling-conv=cdecl -emit-llvm -o - %s | FileCheck %s --check-prefix=CDECL --check-prefix=ALL
+// RUN: %clang_cc1 -triple i786-unknown-linux-gnu -target-feature +sse4.2 -fdefault-calling-conv=fastcall -emit-llvm -o - %s | FileCheck %s --check-prefix=FASTCALL --check-prefix=ALL
+// RUN: %clang_cc1 -triple i486-unknown-linux-gnu -fdefault-calling-conv=stdcall -emit-llvm -o - %s | FileCheck %s --check-prefix=STDCALL --check-prefix=ALL
+// RUN: %clang_cc1 -triple i486-unknown-linux-gnu -mrtd -emit-llvm -o - %s | FileCheck %s --check-prefix=STDCALL --check-prefix=ALL
+// RUN: %clang_cc1 -triple i986-unknown-linux-gnu -fdefault-calling-conv=vectorcall -emit-llvm -o - %s | FileCheck %s --check-prefix=VECTORCALL --check-prefix=ALL
+
+// CDECL: define void @_Z5test1v
+// FASTCALL: define x86_fastcallcc void @_Z5test1v
+// STDCALL: define x86_stdcallcc void @_Z5test1v
+// VECTORCALL: define x86_vectorcallcc void @_Z5test1v
+void test1() {}
+
+// ALL: define void @_Z5test2v
+void __attribute__((cdecl)) test2() {}
+
+// ALL: define x86_fastcallcc void @_Z5test3v
+void __attribute__((fastcall)) test3() {}
+
+// ALL: define x86_stdcallcc void @_Z5test4v
+void __attribute__((stdcall)) test4() {}
+
+// ALL: define  x86_vectorcallcc void @_Z5test5v
+void __attribute__((vectorcall)) test5() {}
+
+// ALL: define linkonce_odr void @_ZN1A11test_memberEv
+class A {
+public:
+  void test_member() {}
+};
+
+void test() {
+  A a;
+  a.test_member();
+}
diff --git a/test/CodeGenCXX/delete-two-arg.cpp b/test/CodeGenCXX/delete-two-arg.cpp
index 85275b3eb1764..68a6fa6736eb1 100644
--- a/test/CodeGenCXX/delete-two-arg.cpp
+++ b/test/CodeGenCXX/delete-two-arg.cpp
@@ -27,7 +27,7 @@ namespace test2 {
 
   // CHECK: define [[A:%.*]]* @_ZN5test24testEv()
   A *test() {
-    // CHECK:      [[NEW:%.*]] = call noalias i8* @_Znaj(i32 44)
+    // CHECK:      [[NEW:%.*]] = call i8* @_Znaj(i32 44)
     // CHECK-NEXT: [[T0:%.*]] = bitcast i8* [[NEW]] to i32*
     // CHECK-NEXT: store i32 10, i32* [[T0]]
     // CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds i8, i8* [[NEW]], i32 4
@@ -63,7 +63,7 @@ namespace test3 {
 
   // CHECK-LABEL: define void @_ZN5test34testEv()
   void test() {
-    // CHECK:      call noalias i8* @_Znaj(i32 24)
+    // CHECK:      call i8* @_Znaj(i32 24)
     // CHECK-NEXT: bitcast
     // CHECK-NEXT: store i32 5
     (void) new B[5];
diff --git a/test/CodeGenCXX/destructors.cpp b/test/CodeGenCXX/destructors.cpp
index 529603142d314..d6aabee58b9e6 100644
--- a/test/CodeGenCXX/destructors.cpp
+++ b/test/CodeGenCXX/destructors.cpp
@@ -4,6 +4,9 @@
 // RUN: FileCheck --check-prefix=CHECK3 --input-file=%t %s
 // RUN: FileCheck --check-prefix=CHECK4 --input-file=%t %s
 // RUN: FileCheck --check-prefix=CHECK5 --input-file=%t %s
+// RUN: %clang_cc1 %s -triple x86_64-apple-darwin10 -emit-llvm -o - -fcxx-exceptions -fexceptions -O1 -disable-llvm-optzns -std=c++11 > %t2
+// RUN: FileCheck --check-prefix=CHECK6 --input-file=%t2 %s
+// REQUIRES: asserts
 
 struct A {
   int a;
@@ -428,3 +431,64 @@ namespace test10 {
     return true;
   }
 }
+
+#if __cplusplus >= 201103L
+namespace test11 {
+
+// Check that lifetime.end is emitted in the landing pad.
+
+// CHECK6-LABEL: define void @_ZN6test1115testLifetimeEndEi(
+// CHECK6: entry:
+// CHECK6: [[T1:%[a-z0-9]+]] = alloca %"struct.test11::S1"
+// CHECK6: [[T2:%[a-z0-9]+]] = alloca %"struct.test11::S1"
+// CHECK6: [[T3:%[a-z0-9]+]] = alloca %"struct.test11::S1"
+
+// CHECK6: {{^}}invoke.cont
+// CHECK6: call void @_ZN6test112S1D1Ev(%"struct.test11::S1"* [[T1]])
+// CHECK6: [[BC1:%[a-z0-9]+]] = bitcast %"struct.test11::S1"* [[T1]] to i8*
+// CHECK6: call void @llvm.lifetime.end(i64 32, i8* [[BC1]])
+// CHECK6: {{^}}lpad
+// CHECK6: call void @_ZN6test112S1D1Ev(%"struct.test11::S1"* [[T1]])
+// CHECK6: [[BC2:%[a-z0-9]+]] = bitcast %"struct.test11::S1"* [[T1]] to i8*
+// CHECK6: call void @llvm.lifetime.end(i64 32, i8* [[BC2]])
+
+// CHECK6: {{^}}invoke.cont
+// CHECK6: call void @_ZN6test112S1D1Ev(%"struct.test11::S1"* [[T2]])
+// CHECK6: [[BC3:%[a-z0-9]+]] = bitcast %"struct.test11::S1"* [[T2]] to i8*
+// CHECK6: call void @llvm.lifetime.end(i64 32, i8* [[BC3]])
+// CHECK6: {{^}}lpad
+// CHECK6: call void @_ZN6test112S1D1Ev(%"struct.test11::S1"* [[T2]])
+// CHECK6: [[BC4:%[a-z0-9]+]] = bitcast %"struct.test11::S1"* [[T2]] to i8*
+// CHECK6: call void @llvm.lifetime.end(i64 32, i8* [[BC4]])
+
+// CHECK6: {{^}}invoke.cont
+// CHECK6: call void @_ZN6test112S1D1Ev(%"struct.test11::S1"* [[T3]])
+// CHECK6: [[BC5:%[a-z0-9]+]] = bitcast %"struct.test11::S1"* [[T3]] to i8*
+// CHECK6: call void @llvm.lifetime.end(i64 32, i8* [[BC5]])
+// CHECK6: {{^}}lpad
+// CHECK6: call void @_ZN6test112S1D1Ev(%"struct.test11::S1"* [[T3]])
+// CHECK6: [[BC6:%[a-z0-9]+]] = bitcast %"struct.test11::S1"* [[T3]] to i8*
+// CHECK6: call void @llvm.lifetime.end(i64 32, i8* [[BC6]])
+
+  struct S1 {
+    ~S1();
+    int a[8];
+  };
+
+  void func1(S1 &) noexcept(false);
+
+  void testLifetimeEnd(int n) {
+    if (n < 10) {
+      S1 t1;
+      func1(t1);
+    } else if (n < 100) {
+      S1 t2;
+      func1(t2);
+    } else if (n < 1000) {
+      S1 t3;
+      func1(t3);
+    }
+  }
+
+}
+#endif
diff --git a/test/CodeGenCXX/discard-name-values.cpp b/test/CodeGenCXX/discard-name-values.cpp
new file mode 100644
index 0000000000000..49cb7d2fc0587
--- /dev/null
+++ b/test/CodeGenCXX/discard-name-values.cpp
@@ -0,0 +1,10 @@
+// RUN: %clang_cc1 -emit-llvm  -triple=armv7-apple-darwin -emit-llvm -std=c++11 %s -o - -O1 | FileCheck %s
+// RUN: %clang_cc1 -emit-llvm  -triple=armv7-apple-darwin -emit-llvm -std=c++11 %s -o - -O1 -discard-value-names | FileCheck %s --check-prefix=DISCARDVALUE
+
+int foo(int bar) {
+  return bar;
+}
+
+// CHECK: ret i32 %bar
+// DISCARDVALUE: ret i32 %0
+
diff --git a/test/CodeGenCXX/dllexport-members.cpp b/test/CodeGenCXX/dllexport-members.cpp
index 76f692dbd3017..1c56251328d64 100644
--- a/test/CodeGenCXX/dllexport-members.cpp
+++ b/test/CodeGenCXX/dllexport-members.cpp
@@ -1,5 +1,7 @@
-// RUN: %clang_cc1 -triple i686-windows-msvc -fms-compatibility   -emit-llvm -std=c++1y -O0 -o - %s | FileCheck --check-prefix=MSC --check-prefix=M32 %s
-// RUN: %clang_cc1 -triple x86_64-windows-msvc -fms-compatibility -emit-llvm -std=c++1y -O0 -o - %s | FileCheck --check-prefix=MSC --check-prefix=M64 %s
+// RUN: %clang_cc1 -triple i686-windows-msvc   -fms-compatibility -fms-compatibility-version=18 -emit-llvm -std=c++1y -O0 -o - %s | FileCheck --check-prefix=MSC --check-prefix=M32 %s
+// RUN: %clang_cc1 -triple x86_64-windows-msvc -fms-compatibility -fms-compatibility-version=18 -emit-llvm -std=c++1y -O0 -o - %s | FileCheck --check-prefix=MSC --check-prefix=M64 %s
+// RUN: %clang_cc1 -triple i686-windows-msvc   -fms-compatibility -fms-compatibility-version=19 -emit-llvm -std=c++1y -O0 -o - %s | FileCheck --check-prefix=M32VS2015 %s
+// RUN: %clang_cc1 -triple x86_64-windows-msvc -fms-compatibility -fms-compatibility-version=19 -emit-llvm -std=c++1y -O0 -o - %s | FileCheck --check-prefix=M64VS2015 %s
 // RUN: %clang_cc1 -triple i686-windows-gnu                       -emit-llvm -std=c++1y -O0 -o - %s | FileCheck --check-prefix=GNU --check-prefix=G32 %s
 // RUN: %clang_cc1 -triple x86_64-windows-gnu                     -emit-llvm -std=c++1y -O0 -o - %s | FileCheck --check-prefix=GNU --check-prefix=G64 %s
 
@@ -427,6 +429,32 @@ __declspec(dllexport) ExportDefaultedDefs::ExportDefaultedDefs(ExportDefaultedDe
 ExportDefaultedDefs& ExportDefaultedDefs::operator=(ExportDefaultedDefs&&) = default;
 
 
+// Export defaulted member function definitions declared inside class.
+struct ExportDefaultedInclassDefs {
+  __declspec(dllexport) ExportDefaultedInclassDefs() = default;
+  // M32VS2013-DAG: define weak_odr dllexport x86_thiscallcc %struct.ExportDefaultedInclassDefs* @"\01??0ExportDefaultedInclassDefs@@QAE@XZ"(%struct.ExportDefaultedInclassDefs* returned %this)
+  // M64VS2013-DAG: define weak_odr dllexport                %struct.ExportDefaultedInclassDefs* @"\01??0ExportDefaultedInclassDefs@@QEAA@XZ"(%struct.ExportDefaultedInclassDefs* returned %this)
+  // M32VS2015-NOT: define weak_odr dllexport x86_thiscallcc %struct.ExportDefaultedInclassDefs* @"\01??0ExportDefaultedInclassDefs@@QAE@XZ"(%struct.ExportDefaultedInclassDefs* returned %this)
+  // M64VS2015-NOT: define weak_odr dllexport                %struct.ExportDefaultedInclassDefs* @"\01??0ExportDefaultedInclassDefs@@QEAA@XZ"(%struct.ExportDefaultedInclassDefs* returned %this)
+
+  __declspec(dllexport) ~ExportDefaultedInclassDefs() = default;
+  // M32VS2013-DAG: define weak_odr dllexport x86_thiscallcc void @"\01??1ExportDefaultedInclassDefs@@QAE@XZ"(%struct.ExportDefaultedInclassDefs* %this)
+  // M64VS2013-DAG: define weak_odr dllexport                void @"\01??1ExportDefaultedInclassDefs@@QEAA@XZ"(%struct.ExportDefaultedInclassDefs* %this)
+  // M32VS2015-NOT: define weak_odr dllexport x86_thiscallcc void @"\01??1ExportDefaultedInclassDefs@@QAE@XZ"(%struct.ExportDefaultedInclassDefs* %this)
+  // M64VS2015-NOT: define weak_odr dllexport                void @"\01??1ExportDefaultedInclassDefs@@QEAA@XZ"(%struct.ExportDefaultedInclassDefs* %this)
+
+  __declspec(dllexport) ExportDefaultedInclassDefs(const ExportDefaultedInclassDefs&) = default;
+  // M32VS2013-DAG: define weak_odr dllexport x86_thiscallcc %struct.ExportDefaultedInclassDefs* @"\01??0ExportDefaultedInclassDefs@@QAE@ABU0@@Z"(%struct.ExportDefaultedInclassDefs* returned %this, %struct.ExportDefaultedInclassDefs* dereferenceable({{[0-9]+}}))
+  // M64VS2013-DAG: define weak_odr dllexport                %struct.ExportDefaultedInclassDefs* @"\01??0ExportDefaultedInclassDefs@@QEAA@AEBU0@@Z"(%struct.ExportDefaultedInclassDefs* returned %this, %struct.ExportDefaultedInclassDefs* dereferenceable({{[0-9]+}}))
+  // M32VS2015-NOT: define weak_odr dllexport x86_thiscallcc %struct.ExportDefaultedInclassDefs* @"\01??0ExportDefaultedInclassDefs@@QAE@ABU0@@Z"(%struct.ExportDefaultedInclassDefs* returned %this, %struct.ExportDefaultedInclassDefs* dereferenceable({{[0-9]+}}))
+  // M64VS2015-NOT: define weak_odr dllexport                %struct.ExportDefaultedInclassDefs* @"\01??0ExportDefaultedInclassDefs@@QEAA@AEBU0@@Z"(%struct.ExportDefaultedInclassDefs* returned %this, %struct.ExportDefaultedInclassDefs* dereferenceable({{[0-9]+}}))
+
+  __declspec(dllexport) ExportDefaultedInclassDefs& operator=(const ExportDefaultedInclassDefs&) = default;
+  // M32-DAG: define weak_odr dllexport x86_thiscallcc dereferenceable({{[0-9]+}}) %struct.ExportDefaultedInclassDefs* @"\01??4ExportDefaultedInclassDefs@@QAEAAU0@ABU0@@Z"(%struct.ExportDefaultedInclassDefs* %this, %struct.ExportDefaultedInclassDefs* dereferenceable({{[0-9]+}}))
+  // M64-DAG: define weak_odr dllexport                dereferenceable({{[0-9]+}}) %struct.ExportDefaultedInclassDefs* @"\01??4ExportDefaultedInclassDefs@@QEAAAEAU0@AEBU0@@Z"(%struct.ExportDefaultedInclassDefs* %this, %struct.ExportDefaultedInclassDefs* dereferenceable({{[0-9]+}}))
+};
+
+
 // Export allocation functions.
 struct ExportAlloc {
   __declspec(dllexport) void* operator new(__SIZE_TYPE__);
diff --git a/test/CodeGenCXX/dllexport-ms-friend.cpp b/test/CodeGenCXX/dllexport-ms-friend.cpp
new file mode 100644
index 0000000000000..7bcf5905e5e58
--- /dev/null
+++ b/test/CodeGenCXX/dllexport-ms-friend.cpp
@@ -0,0 +1,22 @@
+// RUN: %clang_cc1 -triple %ms_abi_triple -fms-extensions -emit-llvm -O0 -o - %s | FileCheck %s
+
+// Friend functions defined in classes are emitted.
+// CHECK: define weak_odr dllexport void @"\01?friend1@@YAXXZ"()
+struct FuncFriend1 {
+  friend __declspec(dllexport) void friend1() {}
+};
+
+// But function templates and functions defined in class templates are not
+// emitted.
+// CHECK-NOT: friend2
+// CHECK-NOT: friend3
+// CHECK-NOT: friend4
+struct FuncFriend2 {
+  template<typename> friend __declspec(dllexport) void friend2() {}
+};
+template<typename> struct FuncFriend3 {
+  friend __declspec(dllexport) void friend3() {}
+  struct Inner {
+    friend __declspec(dllexport) void friend4() {}
+  };
+};
diff --git a/test/CodeGenCXX/dllexport-pr26549.cpp b/test/CodeGenCXX/dllexport-pr26549.cpp
new file mode 100644
index 0000000000000..ceb2e0685ef18
--- /dev/null
+++ b/test/CodeGenCXX/dllexport-pr26549.cpp
@@ -0,0 +1,9 @@
+// RUN: %clang_cc1 %s -fms-extensions -triple x86_64-windows-msvc -emit-llvm -o - | FileCheck %s
+
+template <typename> struct MessageT { };
+extern template struct MessageT<int>;
+
+// CHECK: define weak_odr dllexport {{.*}} %struct.MessageT* @"\01??4?$MessageT@H@@QEAAAEAU0@AEBU0@@Z"(
+template struct __declspec(dllexport) MessageT<int>;
+// Previously we crashed when this dllexport was the last thing in the file.
+// DO NOT ADD MORE TESTS AFTER THIS LINE!
diff --git a/test/CodeGenCXX/dllexport.cpp b/test/CodeGenCXX/dllexport.cpp
index 1412ad866bd39..7cef7c2d127ef 100644
--- a/test/CodeGenCXX/dllexport.cpp
+++ b/test/CodeGenCXX/dllexport.cpp
@@ -1,5 +1,9 @@
-// RUN: %clang_cc1 -triple i686-windows-msvc   -emit-llvm -std=c++1y -fno-threadsafe-statics -fms-extensions -O1 -mconstructor-aliases -disable-llvm-optzns -o - %s -w | FileCheck --check-prefix=MSC --check-prefix=M32 %s
-// RUN: %clang_cc1 -triple x86_64-windows-msvc -emit-llvm -std=c++1y -fno-threadsafe-statics -fms-extensions -O0 -o - %s -w | FileCheck --check-prefix=MSC --check-prefix=M64 %s
+// RUN: %clang_cc1 -triple i686-windows-msvc   -emit-llvm -std=c++1y -fno-threadsafe-statics -fms-extensions -O1 -mconstructor-aliases -disable-llvm-optzns -o - %s -w -fms-compatibility-version=19.00 | FileCheck --check-prefix=MSC --check-prefix=M32 -check-prefix=MSVC2015 -check-prefix=M32MSVC2015 %s
+// RUN: %clang_cc1 -triple i686-windows-msvc   -emit-llvm -std=c++1y -fno-threadsafe-statics -fms-extensions -O1 -mconstructor-aliases -disable-llvm-optzns -o - %s -w -fms-compatibility-version=18.00 | FileCheck --check-prefix=MSC --check-prefix=M32 -check-prefix=MSVC2013 -check-prefix=M32MSVC2013 %s
+
+// RUN: %clang_cc1 -triple x86_64-windows-msvc -emit-llvm -std=c++1y -fno-threadsafe-statics -fms-extensions -O0 -o - %s -w -fms-compatibility-version=19.00 | FileCheck --check-prefix=MSC --check-prefix=M64 -check-prefix=MSVC2015 -check-prefix=M64MSVC2015 %s
+// RUN: %clang_cc1 -triple x86_64-windows-msvc -emit-llvm -std=c++1y -fno-threadsafe-statics -fms-extensions -O0 -o - %s -w -fms-compatibility-version=18.00 | FileCheck --check-prefix=MSC --check-prefix=M64 -check-prefix=MSVC2013 -check-prefix=M64MSVC2013 %s
+
 // RUN: %clang_cc1 -triple i686-windows-gnu    -emit-llvm -std=c++1y -fno-threadsafe-statics -fms-extensions -O0 -o - %s -w | FileCheck --check-prefix=GNU --check-prefix=G32 %s
 // RUN: %clang_cc1 -triple x86_64-windows-gnu  -emit-llvm -std=c++1y -fno-threadsafe-statics -fms-extensions -O0 -o - %s -w | FileCheck --check-prefix=GNU --check-prefix=G64 %s
 
@@ -486,7 +490,7 @@ struct S {
 
 struct CtorWithClosure {
   __declspec(dllexport) CtorWithClosure(...) {}
-// M32-DAG: define weak_odr dllexport x86_thiscallcc void @"\01??_FCtorWithClosure@@QAEXXZ"({{.*}}) comdat
+// M32-DAG: define weak_odr dllexport x86_thiscallcc void @"\01??_FCtorWithClosure@@QAEXXZ"({{.*}}) {{#[0-9]+}} comdat
 // M32-DAG:   %[[this_addr:.*]] = alloca %struct.CtorWithClosure*, align 4
 // M32-DAG:   store %struct.CtorWithClosure* %this, %struct.CtorWithClosure** %[[this_addr]], align 4
 // M32-DAG:   %[[this:.*]] = load %struct.CtorWithClosure*, %struct.CtorWithClosure** %[[this_addr]]
@@ -503,7 +507,7 @@ struct CtorWithClosure {
 struct __declspec(dllexport) ClassWithClosure {
   DELETE_IMPLICIT_MEMBERS(ClassWithClosure);
   ClassWithClosure(...) {}
-// M32-DAG: define weak_odr dllexport x86_thiscallcc void @"\01??_FClassWithClosure@@QAEXXZ"({{.*}}) comdat
+// M32-DAG: define weak_odr dllexport x86_thiscallcc void @"\01??_FClassWithClosure@@QAEXXZ"({{.*}}) {{#[0-9]+}} comdat
 // M32-DAG:   %[[this_addr:.*]] = alloca %struct.ClassWithClosure*, align 4
 // M32-DAG:   store %struct.ClassWithClosure* %this, %struct.ClassWithClosure** %[[this_addr]], align 4
 // M32-DAG:   %[[this:.*]] = load %struct.ClassWithClosure*, %struct.ClassWithClosure** %[[this_addr]]
@@ -520,17 +524,19 @@ struct __declspec(dllexport) NestedOuter {
   };
 };
 
-// M32-DAG: define weak_odr dllexport x86_thiscallcc void @"\01??_FNestedOuter@@QAEXXZ"({{.*}}) comdat
-// M32-DAG: define weak_odr dllexport x86_thiscallcc void @"\01??_FNestedInner@NestedOuter@@QAEXXZ"({{.*}}) comdat
+// M32-DAG: define weak_odr dllexport x86_thiscallcc void @"\01??_FNestedOuter@@QAEXXZ"({{.*}}) {{#[0-9]+}} comdat
+// M32-DAG: define weak_odr dllexport x86_thiscallcc void @"\01??_FNestedInner@NestedOuter@@QAEXXZ"({{.*}}) {{#[0-9]+}} comdat
 
 template <typename T>
 struct SomeTemplate {
   SomeTemplate(T o = T()) : o(o) {}
   T o;
 };
+// MSVC2015-DAG: define weak_odr dllexport {{.+}} @"\01??4?$SomeTemplate@H@@Q{{.+}}@$$Q{{.+}}@@Z"
+// MSVC2013-DAG: define weak_odr dllexport {{.+}} @"\01??4?$SomeTemplate@H@@Q{{.+}}0@A{{.+}}0@@Z"
 struct __declspec(dllexport) InheritFromTemplate : SomeTemplate<int> {};
 
-// M32-DAG: define weak_odr dllexport x86_thiscallcc void @"\01??_F?$SomeTemplate@H@@QAEXXZ"({{.*}}) comdat
+// M32-DAG: define weak_odr dllexport x86_thiscallcc void @"\01??_F?$SomeTemplate@H@@QAEXXZ"({{.*}}) {{#[0-9]+}} comdat
 
 namespace PR23801 {
 template <typename>
@@ -547,7 +553,7 @@ struct __declspec(dllexport) B {
 
 }
 //
-// M32-DAG: define weak_odr dllexport x86_thiscallcc void @"\01??_FB@PR23801@@QAEXXZ"({{.*}}) comdat
+// M32-DAG: define weak_odr dllexport x86_thiscallcc void @"\01??_FB@PR23801@@QAEXXZ"({{.*}}) {{#[0-9]+}} comdat
 
 struct __declspec(dllexport) T {
   // Copy assignment operator:
@@ -555,7 +561,7 @@ struct __declspec(dllexport) T {
 
   // Explicitly defaulted copy constructur:
   T(const T&) = default;
-  // M32-DAG: define weak_odr dllexport x86_thiscallcc %struct.T* @"\01??0T@@QAE@ABU0@@Z"
+  // M32MSVC2013-DAG: define weak_odr dllexport x86_thiscallcc %struct.T* @"\01??0T@@QAE@ABU0@@Z"
 
   void a() {}
   // M32-DAG: define weak_odr dllexport x86_thiscallcc void @"\01?a@T@@QAEXXZ"
@@ -570,6 +576,30 @@ struct __declspec(dllexport) T {
 USEVAR(T::b)
 int T::c;
 
+// Export template class with static member variable
+// MSC-DAG: @"\01?StaticClassVarExpTmplClass@?$TmplClass@H@@2HA" = weak_odr dllexport global i32 0, comdat, align 4
+// GNU-DAG: @_ZN9TmplClassIiE26StaticClassVarExpTmplClassE = weak_odr dllexport global i32 0, comdat, align 4
+template<typename T>
+struct __declspec(dllexport) TmplClass
+{
+  static T StaticClassVarExpTmplClass;
+};
+
+template<typename T>
+T TmplClass<T>::StaticClassVarExpTmplClass;
+
+// Export a definition of a template function.
+// MSC-DAG: define weak_odr dllexport i32 @"\01??$TypeFunTmpl@H@@YAHH@Z"
+// GNU-DAG: define weak_odr dllexport i32 @_Z11TypeFunTmplIiET_S0_
+template<typename T>
+T __declspec(dllexport) TypeFunTmpl(T t) { return t + t; }
+
+// Instantiate the exported template class and the exported template function.
+int useExportedTmplStaticAndFun()
+{
+  return TmplClass<int>::StaticClassVarExpTmplClass + TypeFunTmpl<int>(10);
+}
+
 template <typename T> struct __declspec(dllexport) U { void foo() {} };
 struct __declspec(dllexport) V : public U<int> { };
 // U<int>'s assignment operator is emitted.
@@ -592,7 +622,8 @@ struct __declspec(dllexport) X : public virtual W {};
 
 struct __declspec(dllexport) Y {
   // Move assignment operator:
-  // M32-DAG: define weak_odr dllexport x86_thiscallcc dereferenceable({{[0-9]+}}) %struct.Y* @"\01??4Y@@QAEAAU0@$$QAU0@@Z"
+  // MSVC2015-DAG: define weak_odr dllexport {{.+}} @"\01??4Y@@Q{{.+}}@$$Q{{.+}}@@Z"
+  // MSVC2013-DAG: define weak_odr dllexport {{.+}} @"\01??4Y@@Q{{.+}}0@A{{.+}}0@@Z"
 
   int x;
 };
@@ -616,9 +647,34 @@ namespace UseDtorAlias {
 
 struct __declspec(dllexport) DefaultedCtorsDtors {
   DefaultedCtorsDtors() = default;
-  // M32-DAG: define weak_odr dllexport x86_thiscallcc %struct.DefaultedCtorsDtors* @"\01??0DefaultedCtorsDtors@@QAE@XZ"
+  // M32MSVC2013-DAG: define weak_odr dllexport x86_thiscallcc %struct.DefaultedCtorsDtors* @"\01??0DefaultedCtorsDtors@@QAE@XZ"
   ~DefaultedCtorsDtors() = default;
-  // M32-DAG: define weak_odr dllexport x86_thiscallcc void @"\01??1DefaultedCtorsDtors@@QAE@XZ"
+  // M32MSVC2013-DAG: define weak_odr dllexport x86_thiscallcc void @"\01??1DefaultedCtorsDtors@@QAE@XZ"
+};
+
+// Export defaulted member function definitions declared inside class.
+struct __declspec(dllexport) ExportDefaultedInclassDefs {
+  ExportDefaultedInclassDefs() = default;
+  // M32VS2013-DAG: define weak_odr dllexport x86_thiscallcc %struct.ExportDefaultedInclassDefs* @"\01??0ExportDefaultedInclassDefs@@QAE@XZ"(%struct.ExportDefaultedInclassDefs* returned %this)
+  // M64VS2013-DAG: define weak_odr dllexport                %struct.ExportDefaultedInclassDefs* @"\01??0ExportDefaultedInclassDefs@@QEAA@XZ"(%struct.ExportDefaultedInclassDefs* returned %this)
+  // M32VS2015-NOT: define weak_odr dllexport x86_thiscallcc %struct.ExportDefaultedInclassDefs* @"\01??0ExportDefaultedInclassDefs@@QAE@XZ"(%struct.ExportDefaultedInclassDefs* returned %this)
+  // M64VS2015-NOT: define weak_odr dllexport                %struct.ExportDefaultedInclassDefs* @"\01??0ExportDefaultedInclassDefs@@QEAA@XZ"(%struct.ExportDefaultedInclassDefs* returned %this)
+
+  ~ExportDefaultedInclassDefs() = default;
+  // M32VS2013-DAG: define weak_odr dllexport x86_thiscallcc void @"\01??1ExportDefaultedInclassDefs@@QAE@XZ"(%struct.ExportDefaultedInclassDefs* %this)
+  // M64VS2013-DAG: define weak_odr dllexport                void @"\01??1ExportDefaultedInclassDefs@@QEAA@XZ"(%struct.ExportDefaultedInclassDefs* %this)
+  // M32VS2015-NOT: define weak_odr dllexport x86_thiscallcc void @"\01??1ExportDefaultedInclassDefs@@QAE@XZ"(%struct.ExportDefaultedInclassDefs* %this)
+  // M64VS2015-NOT: define weak_odr dllexport                void @"\01??1ExportDefaultedInclassDefs@@QEAA@XZ"(%struct.ExportDefaultedInclassDefs* %this)
+
+  ExportDefaultedInclassDefs(const ExportDefaultedInclassDefs&) = default;
+  // M32VS2013-DAG: define weak_odr dllexport x86_thiscallcc %struct.ExportDefaultedInclassDefs* @"\01??0ExportDefaultedInclassDefs@@QAE@ABU0@@Z"(%struct.ExportDefaultedInclassDefs* returned %this, %struct.ExportDefaultedInclassDefs* dereferenceable({{[0-9]+}}))
+  // M64VS2013-DAG: define weak_odr dllexport                %struct.ExportDefaultedInclassDefs* @"\01??0ExportDefaultedInclassDefs@@QEAA@AEBU0@@Z"(%struct.ExportDefaultedInclassDefs* returned %this, %struct.ExportDefaultedInclassDefs* dereferenceable({{[0-9]+}}))
+  // M32VS2015-NOT: define weak_odr dllexport x86_thiscallcc %struct.ExportDefaultedInclassDefs* @"\01??0ExportDefaultedInclassDefs@@QAE@ABU0@@Z"(%struct.ExportDefaultedInclassDefs* returned %this, %struct.ExportDefaultedInclassDefs* dereferenceable({{[0-9]+}}))
+  // M64VS2015-NOT: define weak_odr dllexport                %struct.ExportDefaultedInclassDefs* @"\01??0ExportDefaultedInclassDefs@@QEAA@AEBU0@@Z"(%struct.ExportDefaultedInclassDefs* returned %this, %struct.ExportDefaultedInclassDefs* dereferenceable({{[0-9]+}}))
+
+  ExportDefaultedInclassDefs& operator=(const ExportDefaultedInclassDefs&) = default;
+  // M32-DAG: define weak_odr dllexport x86_thiscallcc dereferenceable({{[0-9]+}}) %struct.ExportDefaultedInclassDefs* @"\01??4ExportDefaultedInclassDefs@@QAEAAU0@ABU0@@Z"(%struct.ExportDefaultedInclassDefs* %this, %struct.ExportDefaultedInclassDefs* dereferenceable({{[0-9]+}}))
+  // M64-DAG: define weak_odr dllexport                dereferenceable({{[0-9]+}}) %struct.ExportDefaultedInclassDefs* @"\01??4ExportDefaultedInclassDefs@@QEAAAEAU0@AEBU0@@Z"(%struct.ExportDefaultedInclassDefs* %this, %struct.ExportDefaultedInclassDefs* dereferenceable({{[0-9]+}}))
 };
 
 namespace ReferencedInlineMethodInNestedClass {
@@ -690,7 +746,7 @@ template <typename T> struct ExplicitInstConstexprMembers {
   // M32-DAG: define weak_odr dllexport x86_thiscallcc %struct.ExplicitInstConstexprMembers* @"\01??0?$ExplicitInstConstexprMembers@X@@QAE@XZ"
 
   ExplicitInstConstexprMembers(const ExplicitInstConstexprMembers&) = default;
-  // M32-DAG: define weak_odr dllexport x86_thiscallcc %struct.ExplicitInstConstexprMembers* @"\01??0?$ExplicitInstConstexprMembers@X@@QAE@ABU0@@Z"
+  // M32MSVC2013-DAG: define weak_odr dllexport x86_thiscallcc %struct.ExplicitInstConstexprMembers* @"\01??0?$ExplicitInstConstexprMembers@X@@QAE@ABU0@@Z"
 
   constexpr int f() const { return 42; }
   // M32-DAG: define weak_odr dllexport x86_thiscallcc i32 @"\01?f@?$ExplicitInstConstexprMembers@X@@QBEHXZ"
@@ -777,6 +833,22 @@ struct __declspec(dllexport) Baz {
 // M32-DAG: define weak_odr dllexport x86_thiscallcc dereferenceable(1) %"struct.InClassInits::Baz"* @"\01??4Baz@InClassInits@@QAEAAU01@ABU01@@Z"
 }
 
+// We had an issue where instantiating A would force emission of B's delayed
+// exported methods.
+namespace pr26490 {
+template <typename T> struct A { };
+struct __declspec(dllexport) B {
+  B(int = 0) {}
+  A<int> m_fn1() {}
+};
+// M32-DAG: define weak_odr dllexport x86_thiscallcc void @"\01??_FB@pr26490@@QAEXXZ"
+}
+
+// dllexport trumps dllexport on an explicit instantiation.
+template <typename T> struct ExplicitInstantiationTwoAttributes { void f() {} };
+template struct __declspec(dllexport) __declspec(dllimport) ExplicitInstantiationTwoAttributes<int>;
+// M32-DAG: define weak_odr dllexport x86_thiscallcc void @"\01?f@?$ExplicitInstantiationTwoAttributes@H@@QAEXXZ"
+
 
 //===----------------------------------------------------------------------===//
 // Classes with template base classes
@@ -891,10 +963,26 @@ template struct ExplicitInstantiationDeclTemplateBase<int>;
 // M32-DAG: define weak_odr dllexport x86_thiscallcc void @"\01?func@?$ExplicitInstantiationDeclTemplateBase@H@@QAEXXZ"
 // G32-DAG: define weak_odr x86_thiscallcc void @_ZN37ExplicitInstantiationDeclTemplateBaseIiE4funcEv
 
-template <typename T> struct ExplicitInstantiationDeclTemplateBase2 { void func() {} };
-extern template struct ExplicitInstantiationDeclTemplateBase2<int>;
-struct __declspec(dllexport) DerivedFromExplicitInstantiationDeclTemplateBase2 : public ExplicitInstantiationDeclTemplateBase2<int> {};
-template struct __declspec(dllimport) ExplicitInstantiationDeclTemplateBase2<int>;
-USEMEMFUNC(ExplicitInstantiationDeclTemplateBase2<int>, func)
-// M32-DAG: define weak_odr dllexport x86_thiscallcc void @"\01?func@?$ExplicitInstantiationDeclTemplateBase2@H@@QAEXXZ"
-// G32-DAG: define weak_odr x86_thiscallcc void @_ZN38ExplicitInstantiationDeclTemplateBase2IiE4funcEv
+// PR26076
+struct LayerSelectionBound;
+template <typename> struct Selection {};
+typedef Selection<LayerSelectionBound> LayerSelection;
+struct LayerImpl;
+struct __declspec(dllexport) LayerTreeImpl {
+  struct __declspec(dllexport) ElementLayers {
+    LayerImpl *main = nullptr;
+  };
+  LayerSelection foo;
+};
+// M32-DAG: define weak_odr dllexport x86_thiscallcc %"struct.LayerTreeImpl::ElementLayers"* @"\01??0ElementLayers@LayerTreeImpl@@QAE@XZ"
+// M64-DAG: define weak_odr dllexport %"struct.LayerTreeImpl::ElementLayers"* @"\01??0ElementLayers@LayerTreeImpl@@QEAA@XZ"
+
+class __declspec(dllexport) ACE_Shared_Object {
+public:
+  virtual ~ACE_Shared_Object();
+};
+class __declspec(dllexport) ACE_Service_Object : public ACE_Shared_Object {};
+// Implicit move constructor declaration.
+// MSVC2015-DAG: define weak_odr dllexport {{.+}}ACE_Service_Object@@Q{{.+}}@$$Q
+// The declarations should not be exported.
+// MSVC2013-NOT: define weak_odr dllexport {{.+}}ACE_Service_Object@@Q{{.+}}@$$Q
diff --git a/test/CodeGenCXX/dllimport-members.cpp b/test/CodeGenCXX/dllimport-members.cpp
index e88b7e97c3e7d..1fed1bf0acf1b 100644
--- a/test/CodeGenCXX/dllimport-members.cpp
+++ b/test/CodeGenCXX/dllimport-members.cpp
@@ -63,8 +63,8 @@ struct ForceNonTrivial {
 struct ImportMembers {
   struct Nested;
 
-  // M32-DAG: define              x86_thiscallcc void @"\01?normalDef@ImportMembers@@QAEXXZ"(%struct.ImportMembers* %this)
-  // M64-DAG: define                             void @"\01?normalDef@ImportMembers@@QEAAXXZ"(%struct.ImportMembers* %this)
+  // M32-DAG: define  dllexport   x86_thiscallcc void @"\01?normalDef@ImportMembers@@QAEXXZ"(%struct.ImportMembers* %this)
+  // M64-DAG: define  dllexport                  void @"\01?normalDef@ImportMembers@@QEAAXXZ"(%struct.ImportMembers* %this)
   // M32-DAG: declare dllimport   x86_thiscallcc void @"\01?normalDecl@ImportMembers@@QAEXXZ"(%struct.ImportMembers*)
   // M64-DAG: declare dllimport                  void @"\01?normalDecl@ImportMembers@@QEAAXXZ"(%struct.ImportMembers*)
   // M32-DAG: declare dllimport   x86_thiscallcc void @"\01?normalInclass@ImportMembers@@QAEXXZ"(%struct.ImportMembers*)
@@ -95,8 +95,8 @@ struct ImportMembers {
   __declspec(dllimport)                void normalInlineDef();
   __declspec(dllimport)         inline void normalInlineDecl();
 
-  // M32-DAG: define              x86_thiscallcc void @"\01?virtualDef@ImportMembers@@UAEXXZ"(%struct.ImportMembers* %this)
-  // M64-DAG: define                             void @"\01?virtualDef@ImportMembers@@UEAAXXZ"(%struct.ImportMembers* %this)
+  // M32-DAG: define  dllexport   x86_thiscallcc void @"\01?virtualDef@ImportMembers@@UAEXXZ"(%struct.ImportMembers* %this)
+  // M64-DAG: define  dllexport                  void @"\01?virtualDef@ImportMembers@@UEAAXXZ"(%struct.ImportMembers* %this)
   // M32-DAG: declare dllimport   x86_thiscallcc void @"\01?virtualDecl@ImportMembers@@UAEXXZ"(%struct.ImportMembers*)
   // M64-DAG: declare dllimport                  void @"\01?virtualDecl@ImportMembers@@UEAAXXZ"(%struct.ImportMembers*)
   // M32-DAG: declare dllimport   x86_thiscallcc void @"\01?virtualInclass@ImportMembers@@UAEXXZ"(%struct.ImportMembers*)
@@ -127,7 +127,7 @@ struct ImportMembers {
   __declspec(dllimport) virtual        void virtualInlineDef();
   __declspec(dllimport) virtual inline void virtualInlineDecl();
 
-  // MSC-DAG: define                           void @"\01?staticDef@ImportMembers@@SAXXZ"()
+  // MSC-DAG: define  dllexport                void @"\01?staticDef@ImportMembers@@SAXXZ"()
   // MSC-DAG: declare dllimport                void @"\01?staticDecl@ImportMembers@@SAXXZ"()
   // MSC-DAG: declare dllimport                void @"\01?staticInclass@ImportMembers@@SAXXZ"()
   // MSC-DAG: declare dllimport                void @"\01?staticInlineDef@ImportMembers@@SAXXZ"()
@@ -235,8 +235,8 @@ USEMV(ImportMembers, ConstexprField)
 
 // Import individual members of a nested class.
 struct ImportMembers::Nested {
-  // M32-DAG: define              x86_thiscallcc void @"\01?normalDef@Nested@ImportMembers@@QAEXXZ"(%"struct.ImportMembers::Nested"* %this)
-  // M64-DAG: define                             void @"\01?normalDef@Nested@ImportMembers@@QEAAXXZ"(%"struct.ImportMembers::Nested"* %this)
+  // M32-DAG: define  dllexport   x86_thiscallcc void @"\01?normalDef@Nested@ImportMembers@@QAEXXZ"(%"struct.ImportMembers::Nested"* %this)
+  // M64-DAG: define  dllexport                  void @"\01?normalDef@Nested@ImportMembers@@QEAAXXZ"(%"struct.ImportMembers::Nested"* %this)
   // M32-DAG: declare dllimport   x86_thiscallcc void @"\01?normalDecl@Nested@ImportMembers@@QAEXXZ"(%"struct.ImportMembers::Nested"*)
   // M64-DAG: declare dllimport                  void @"\01?normalDecl@Nested@ImportMembers@@QEAAXXZ"(%"struct.ImportMembers::Nested"*)
   // M32-DAG: declare dllimport   x86_thiscallcc void @"\01?normalInclass@Nested@ImportMembers@@QAEXXZ"(%"struct.ImportMembers::Nested"*)
@@ -267,8 +267,8 @@ struct ImportMembers::Nested {
   __declspec(dllimport)                void normalInlineDef();
   __declspec(dllimport)         inline void normalInlineDecl();
 
-  // M32-DAG: define              x86_thiscallcc void @"\01?virtualDef@Nested@ImportMembers@@UAEXXZ"(%"struct.ImportMembers::Nested"* %this)
-  // M64-DAG: define                             void @"\01?virtualDef@Nested@ImportMembers@@UEAAXXZ"(%"struct.ImportMembers::Nested"* %this)
+  // M32-DAG: define  dllexport   x86_thiscallcc void @"\01?virtualDef@Nested@ImportMembers@@UAEXXZ"(%"struct.ImportMembers::Nested"* %this)
+  // M64-DAG: define  dllexport                  void @"\01?virtualDef@Nested@ImportMembers@@UEAAXXZ"(%"struct.ImportMembers::Nested"* %this)
   // M32-DAG: declare dllimport   x86_thiscallcc void @"\01?virtualDecl@Nested@ImportMembers@@UAEXXZ"(%"struct.ImportMembers::Nested"*)
   // M64-DAG: declare dllimport                  void @"\01?virtualDecl@Nested@ImportMembers@@UEAAXXZ"(%"struct.ImportMembers::Nested"*)
   // M32-DAG: declare dllimport   x86_thiscallcc void @"\01?virtualInclass@Nested@ImportMembers@@UAEXXZ"(%"struct.ImportMembers::Nested"*)
@@ -300,7 +300,7 @@ struct ImportMembers::Nested {
   __declspec(dllimport) virtual        void virtualInlineDef();
   __declspec(dllimport) virtual inline void virtualInlineDecl();
 
-  // MSC-DAG: define                           void @"\01?staticDef@Nested@ImportMembers@@SAXXZ"()
+  // MSC-DAG: define  dllexport                void @"\01?staticDef@Nested@ImportMembers@@SAXXZ"()
   // MSC-DAG: declare dllimport                void @"\01?staticDecl@Nested@ImportMembers@@SAXXZ"()
   // MSC-DAG: declare dllimport                void @"\01?staticInclass@Nested@ImportMembers@@SAXXZ"()
   // MSC-DAG: declare dllimport                void @"\01?staticInlineDef@Nested@ImportMembers@@SAXXZ"()
@@ -595,16 +595,16 @@ inline ImportDefaultedDefs::ImportDefaultedDefs(const ImportDefaultedDefs&) = de
 // G64-DAG: define linkonce_odr                dereferenceable({{[0-9]+}}) %struct.ImportDefaultedDefs* @_ZN19ImportDefaultedDefsaSERKS_(%struct.ImportDefaultedDefs* %this, %struct.ImportDefaultedDefs* dereferenceable({{[0-9]+}}))
 inline ImportDefaultedDefs& ImportDefaultedDefs::operator=(const ImportDefaultedDefs&) = default;
 
-// M32-DAG: define x86_thiscallcc %struct.ImportDefaultedDefs* @"\01??0ImportDefaultedDefs@@QAE@$$QAU0@@Z"(%struct.ImportDefaultedDefs* returned %this, %struct.ImportDefaultedDefs* dereferenceable({{[0-9]+}}))
-// M64-DAG: define                %struct.ImportDefaultedDefs* @"\01??0ImportDefaultedDefs@@QEAA@$$QEAU0@@Z"(%struct.ImportDefaultedDefs* returned %this, %struct.ImportDefaultedDefs* dereferenceable({{[0-9]+}}))
+// M32-DAG: define dllexport x86_thiscallcc %struct.ImportDefaultedDefs* @"\01??0ImportDefaultedDefs@@QAE@$$QAU0@@Z"(%struct.ImportDefaultedDefs* returned %this, %struct.ImportDefaultedDefs* dereferenceable({{[0-9]+}}))
+// M64-DAG: define dllexport                %struct.ImportDefaultedDefs* @"\01??0ImportDefaultedDefs@@QEAA@$$QEAU0@@Z"(%struct.ImportDefaultedDefs* returned %this, %struct.ImportDefaultedDefs* dereferenceable({{[0-9]+}}))
 // G32-DAG: define x86_thiscallcc void @_ZN19ImportDefaultedDefsC1EOS_(%struct.ImportDefaultedDefs* %this, %struct.ImportDefaultedDefs* dereferenceable({{[0-9]+}}))
 // G64-DAG: define                void @_ZN19ImportDefaultedDefsC1EOS_(%struct.ImportDefaultedDefs* %this, %struct.ImportDefaultedDefs* dereferenceable({{[0-9]+}}))
 // G32-DAG: define x86_thiscallcc void @_ZN19ImportDefaultedDefsC2EOS_(%struct.ImportDefaultedDefs* %this, %struct.ImportDefaultedDefs* dereferenceable({{[0-9]+}}))
 // G64-DAG: define                void @_ZN19ImportDefaultedDefsC2EOS_(%struct.ImportDefaultedDefs* %this, %struct.ImportDefaultedDefs* dereferenceable({{[0-9]+}}))
 ImportDefaultedDefs::ImportDefaultedDefs(ImportDefaultedDefs&&) = default; // dllimport ignored
 
-// M32-DAG: define x86_thiscallcc dereferenceable({{[0-9]+}}) %struct.ImportDefaultedDefs* @"\01??4ImportDefaultedDefs@@QAEAAU0@$$QAU0@@Z"(%struct.ImportDefaultedDefs* %this, %struct.ImportDefaultedDefs* dereferenceable({{[0-9]+}}))
-// M64-DAG: define                dereferenceable({{[0-9]+}}) %struct.ImportDefaultedDefs* @"\01??4ImportDefaultedDefs@@QEAAAEAU0@$$QEAU0@@Z"(%struct.ImportDefaultedDefs* %this, %struct.ImportDefaultedDefs* dereferenceable({{[0-9]+}}))
+// M32-DAG: define dllexport x86_thiscallcc dereferenceable({{[0-9]+}}) %struct.ImportDefaultedDefs* @"\01??4ImportDefaultedDefs@@QAEAAU0@$$QAU0@@Z"(%struct.ImportDefaultedDefs* %this, %struct.ImportDefaultedDefs* dereferenceable({{[0-9]+}}))
+// M64-DAG: define dllexport                dereferenceable({{[0-9]+}}) %struct.ImportDefaultedDefs* @"\01??4ImportDefaultedDefs@@QEAAAEAU0@$$QEAU0@@Z"(%struct.ImportDefaultedDefs* %this, %struct.ImportDefaultedDefs* dereferenceable({{[0-9]+}}))
 // G32-DAG: define x86_thiscallcc dereferenceable({{[0-9]+}}) %struct.ImportDefaultedDefs* @_ZN19ImportDefaultedDefsaSEOS_(%struct.ImportDefaultedDefs* %this, %struct.ImportDefaultedDefs* dereferenceable({{[0-9]+}}))
 // G64-DAG: define                dereferenceable({{[0-9]+}}) %struct.ImportDefaultedDefs* @_ZN19ImportDefaultedDefsaSEOS_(%struct.ImportDefaultedDefs* %this, %struct.ImportDefaultedDefs* dereferenceable({{[0-9]+}}))
 ImportDefaultedDefs& ImportDefaultedDefs::operator=(ImportDefaultedDefs&&) = default; // dllimport ignored
diff --git a/test/CodeGenCXX/dllimport-rtti.cpp b/test/CodeGenCXX/dllimport-rtti.cpp
index 071ce278a5bbd..4baee50fcff3a 100644
--- a/test/CodeGenCXX/dllimport-rtti.cpp
+++ b/test/CodeGenCXX/dllimport-rtti.cpp
@@ -4,7 +4,8 @@
 struct __declspec(dllimport) S {
   virtual void f() {}
 } s;
-// MSVC-DAG: @"\01??_7S@@6B@" = available_externally dllimport
+// MSVC: [[VF_S:.*]] = private unnamed_addr constant [2 x i8*]
+// MSVC-DAG: @"\01??_SS@@6B@" = unnamed_addr alias i8*, getelementptr inbounds ([2 x i8*], [2 x i8*]* [[VF_S]], i32 0, i32 1)
 // MSVC-DAG: @"\01??_R0?AUS@@@8" = linkonce_odr
 // MSVC-DAG: @"\01??_R1A@?0A@EA@S@@8" = linkonce_odr
 // MSVC-DAG: @"\01??_R2S@@8" = linkonce_odr
diff --git a/test/CodeGenCXX/dllimport.cpp b/test/CodeGenCXX/dllimport.cpp
index b9c850b8b87b1..aff240f28769a 100644
--- a/test/CodeGenCXX/dllimport.cpp
+++ b/test/CodeGenCXX/dllimport.cpp
@@ -27,6 +27,7 @@ struct ExplicitSpec_NotImported {};
 #define USEVAR(var) USEVARTYPE(int, var)
 #define USE(func) void UNIQ(use)() { func(); }
 #define USEMEMFUNC(class, func) void (class::*UNIQ(use)())() { return &class::func; }
+#define USESTATICMEMFUNC(class, func) void (*UNIQ(use)())() { return &class::func; }
 #define USECLASS(class) void UNIQ(USE)() { class x; }
 #define USECOPYASSIGN(class) class& (class::*UNIQ(use)())(class&) { return &class::operator=; }
 #define USEMOVEASSIGN(class) class& (class::*UNIQ(use)())(class&&) { return &class::operator=; }
@@ -263,7 +264,7 @@ __declspec(dllimport) void redecl2();
                       void redecl2();
 USE(redecl2)
 
-// MSC-DAG: define void @"\01?redecl3@@YAXXZ"()
+// MSC-DAG: define dllexport void @"\01?redecl3@@YAXXZ"()
 // GNU-DAG: define void @_Z7redecl3v()
 __declspec(dllimport) void redecl3();
                       void redecl3() {} // dllimport ignored
@@ -275,7 +276,7 @@ USE(redecl3)
 // GNU-DAG: declare dllimport void @_Z7friend1v()
 // MSC-DAG: declare           void @"\01?friend2@@YAXXZ"()
 // GNU-DAG: declare           void @_Z7friend2v()
-// MSC-DAG: define            void @"\01?friend3@@YAXXZ"()
+// MSC-DAG: define  dllexport void @"\01?friend3@@YAXXZ"()
 // GNU-DAG: define            void @_Z7friend3v()
 // MSC-DAG: declare           void @"\01?friend4@@YAXXZ"()
 // GNU-DAG: declare           void @_Z7friend4v()
@@ -590,6 +591,10 @@ struct __declspec(dllimport) T {
   void a() {}
   // MO1-DAG: define available_externally dllimport x86_thiscallcc void @"\01?a@T@@QAEXXZ"
 
+  static void StaticMethod();
+  // MSC-DAG: declare dllimport void @"\01?StaticMethod@T@@SAXXZ"()
+  // GNU-DAG: declare dllimport void @_ZN1T12StaticMethodEv()
+
   static int b;
   // MO1-DAG: @"\01?b@T@@2HA" = external dllimport global i32
 
@@ -602,6 +607,7 @@ struct __declspec(dllimport) T {
   // M19-DAG: define available_externally dllimport x86_thiscallcc dereferenceable({{[0-9]+}}) %struct.T* @"\01??4T@@QAEAAU0@$$QAU0@@Z"
 };
 USEMEMFUNC(T, a)
+USESTATICMEMFUNC(T, StaticMethod)
 USEVAR(T::b)
 USECOPYASSIGN(T)
 USEMOVEASSIGN(T)
@@ -614,7 +620,7 @@ USEMEMFUNC(V, foo)
 struct __declspec(dllimport) W { virtual void foo() {} };
 USECLASS(W)
 // vftable:
-// MO1-DAG: @"\01??_7W@@6B@" = available_externally dllimport unnamed_addr constant [1 x i8*] [i8* bitcast (void (%struct.W*)* @"\01?foo@W@@UAEXXZ" to i8*)]
+// MO1-DAG: @"\01??_SW@@6B@" = linkonce_odr unnamed_addr constant [1 x i8*] [i8* bitcast (void (%struct.W*)* @"\01?foo@W@@UAEXXZ" to i8*)]
 // GO1-DAG: @_ZTV1W = available_externally dllimport unnamed_addr constant [3 x i8*] [i8* null, i8* null, i8* bitcast (void (%struct.W*)* @_ZN1W3fooEv to i8*)]
 
 struct __declspec(dllimport) KeyFuncClass {
@@ -650,7 +656,7 @@ namespace DontUseDtorAlias {
 
 namespace Vtordisp {
   // Don't dllimport the vtordisp.
-  // MO1-DAG: define linkonce_odr x86_thiscallcc void @"\01?f@?$C@D@Vtordisp@@$4PPPPPPPM@A@AEXXZ"
+  // MO1-DAG: define linkonce_odr x86_thiscallcc void @"\01?f@?$C@H@Vtordisp@@$4PPPPPPPM@A@AEXXZ"
 
   class Base {
     virtual void f() {}
@@ -661,7 +667,7 @@ namespace Vtordisp {
     C() {}
     virtual void f() {}
   };
-  template class C<char>;
+  USECLASS(C<int>);
 }
 
 namespace ClassTemplateStaticDef {
@@ -670,7 +676,7 @@ namespace ClassTemplateStaticDef {
     static int x;
   };
   template <typename T> int S<T>::x;
-  // MSC-DAG: @"\01?x@?$S@H@ClassTemplateStaticDef@@2HA" = available_externally dllimport global i32 0
+  // MSC-DAG: @"\01?x@?$S@H@ClassTemplateStaticDef@@2HA" = external dllimport global i32
   int f() { return S<int>::x; }
 
   // Partial class template specialization static field:
@@ -679,7 +685,7 @@ namespace ClassTemplateStaticDef {
     static int x;
   };
   template <typename A> int T<A*>::x;
-  // GNU-DAG: @_ZN22ClassTemplateStaticDef1TIPvE1xE = available_externally dllimport global i32 0
+  // GNU-DAG: @_ZN22ClassTemplateStaticDef1TIPvE1xE = external dllimport global i32
   int g() { return T<void*>::x; }
 }
 
@@ -692,26 +698,31 @@ namespace PR19933 {
   template <typename T> struct A { static NonPOD x; };
   template <typename T> NonPOD A<T>::x;
   template struct __declspec(dllimport) A<int>;
-  // MSC-DAG: @"\01?x@?$A@H@PR19933@@2UNonPOD@2@A" = available_externally dllimport global %"struct.PR19933::NonPOD" zeroinitializer
+  USEVARTYPE(NonPOD, A<int>::x);
+  // MSC-DAG: @"\01?x@?$A@H@PR19933@@2UNonPOD@2@A" = external dllimport global %"struct.PR19933::NonPOD"
 
   int f();
   template <typename T> struct B { static int x; };
   template <typename T> int B<T>::x = f();
   template struct __declspec(dllimport) B<int>;
-  // MSC-DAG: @"\01?x@?$B@H@PR19933@@2HA" = available_externally dllimport global i32 0
+  USEVAR(B<int>::x);
+  // MSC-DAG: @"\01?x@?$B@H@PR19933@@2HA" = external dllimport global i32
 
   constexpr int g() { return 42; }
   template <typename T> struct C { static int x; };
   template <typename T> int C<T>::x = g();
   template struct __declspec(dllimport) C<int>;
-  // MSC-DAG: @"\01?x@?$C@H@PR19933@@2HA" = available_externally dllimport global i32 42
+  USEVAR(C<int>::x);
+  // MSC-DAG: @"\01?x@?$C@H@PR19933@@2HA" = external dllimport global i32
 
   template <int I> struct D { static int x, y; };
   template <int I> int D<I>::x = I + 1;
   template <int I> int D<I>::y = I + f();
   template struct __declspec(dllimport) D<42>;
-  // MSC-DAG: @"\01?x@?$D@$0CK@@PR19933@@2HA" = available_externally dllimport global i32 43
-  // MSC-DAG: @"\01?y@?$D@$0CK@@PR19933@@2HA" = available_externally dllimport global i32 0
+  USEVAR(D<42>::x);
+  USEVAR(D<42>::y);
+  // MSC-DAG: @"\01?x@?$D@$0CK@@PR19933@@2HA" = external dllimport global i32
+  // MSC-DAG: @"\01?y@?$D@$0CK@@PR19933@@2HA" = external dllimport global i32
 }
 
 namespace PR21355 {
@@ -737,6 +748,17 @@ namespace PR21366 {
   inline void S::outOfClassInlineMethod() {}
 }
 
+namespace PR27319 {
+  // Make sure we don't assert due to not having checked for operator delete on
+  // the destructor.
+  template <typename> struct A {
+    virtual ~A() = default;
+  };
+  extern template struct __declspec(dllimport) A<int>;
+  void f() { new A<int>(); }
+  // MO1-DAG: @"\01??_S?$A@H@PR27319@@6B@" = linkonce_odr unnamed_addr constant [1 x i8*]
+}
+
 // MS ignores DLL attributes on partial specializations.
 template <typename T> struct PartiallySpecializedClassTemplate {};
 template <typename T> struct __declspec(dllimport) PartiallySpecializedClassTemplate<T*> { void f(); };
@@ -788,6 +810,36 @@ template struct __declspec(dllimport) PR23770DerivedTemplate<int>;
 USEMEMFUNC(PR23770BaseTemplate<int>, f);
 // M32-DAG: declare dllimport x86_thiscallcc void @"\01?f@?$PR23770BaseTemplate@H@@QAEXXZ"
 
+namespace PR27810 {
+  template <class T>
+  struct basic_ostream {
+    struct sentry {
+      sentry() { }
+      void foo() { }
+    };
+  };
+  template class __declspec(dllimport) basic_ostream<char>;
+  // The explicit instantiation definition acts as an explicit instantiation
+  // *declaration*, dllimport is not inherited by the inner class, and no
+  // functions are emitted unless they are used.
+
+  USEMEMFUNC(basic_ostream<char>::sentry, foo);
+  // M32-DAG: define linkonce_odr x86_thiscallcc void @"\01?foo@sentry@?$basic_ostream@D@PR27810@@QAEXXZ"
+  // M32-NOT: ??0sentry@?$basic_ostream@D@PR27810@@QAE@XZ
+}
+
+namespace PR27811 {
+  template <class T> struct codecvt {
+    virtual ~codecvt() { }
+  };
+  template class __declspec(dllimport) codecvt<char>;
+
+  // dllimport means this explicit instantiation definition gets treated as a
+  // declaration. Thus, the vtable should not be marked used, and in fact
+  // nothing for this class should be emitted at all since it's not used.
+  // M32-NOT: codecvt
+}
+
 //===----------------------------------------------------------------------===//
 // Classes with template base classes
 //===----------------------------------------------------------------------===//
diff --git a/test/CodeGenCXX/duplicate-mangled-name.cpp b/test/CodeGenCXX/duplicate-mangled-name.cpp
index 104bb6eb4d62e..8c8f6e0311c3b 100644
--- a/test/CodeGenCXX/duplicate-mangled-name.cpp
+++ b/test/CodeGenCXX/duplicate-mangled-name.cpp
@@ -1,5 +1,7 @@
 // RUN: %clang_cc1 -triple %itanium_abi_triple -emit-llvm-only %s -verify -DTEST1
-// RUN: %clang_cc1 -triple %itanium_abi_triple -emit-llvm-only %s -verify -DTEST2
+// RUN: %clang_cc1 -triple %itanium_abi_triple -emit-llvm-only %s -verify -DTEST2 -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -triple %itanium_abi_triple -emit-llvm-only %s -verify -DTEST3
+// RUN: %clang_cc1 -triple %itanium_abi_triple -emit-llvm-only %s -verify -DTEST4
 
 #ifdef TEST1
 
@@ -14,28 +16,61 @@ extern "C" {
 
 #elif TEST2
 
-// We expect no warnings here, as there is only declaration of _ZN1TD1Ev function, no definitions.
+// expected-no-diagnostics
+
+// We expect no warnings here, as there is only declaration of _ZN1TD1Ev
+// function, no definitions.
 extern "C" void _ZN1TD1Ev();
 struct T {
   ~T() {}
 };
 
-void foo() {
+// We expect no warnings here, as there is only declaration of _ZN2nm3abcE
+// global, no definitions.
+extern "C" {
+  int _ZN2nm3abcE;
+}
+
+namespace nm {
+  float abc = 2;
+}
+// CHECK: @_ZN2nm3abcE = global float
+
+float foo() {
   _ZN1TD1Ev();
+// CHECK: call void bitcast ({{.*}} (%struct.T*)* @_ZN1TD1Ev to void ()*)()
   T t;
+// CHECK: call {{.*}} @_ZN1TD1Ev(%struct.T* %t)
+  return _ZN2nm3abcE + nm::abc;
 }
 
+#elif TEST3
+
 extern "C" void _ZN2T2D2Ev() {}; // expected-note {{previous definition is here}}
 
 struct T2 {
   ~T2() {} // expected-error {{definition with same mangled name as another definition}}
 };
 
-void bar() {
+void foo() {
   _ZN2T2D2Ev();
   T2 t;
 }
 
+#elif TEST4
+
+extern "C" {
+  int _ZN2nm3abcE = 1; // expected-note {{previous definition is here}}
+}
+
+namespace nm {
+  float abc = 2; // expected-error {{definition with same mangled name as another definition}}
+}
+
+float foo() {
+  return _ZN2nm3abcE + nm::abc;
+}
+
 #else
 
 #error Unknwon test
diff --git a/test/CodeGenCXX/eh.cpp b/test/CodeGenCXX/eh.cpp
index b44e8144bb27f..db0576a1baeb5 100644
--- a/test/CodeGenCXX/eh.cpp
+++ b/test/CodeGenCXX/eh.cpp
@@ -448,5 +448,27 @@ namespace test16 {
   }
 }
 
+namespace test17 {
+class BaseException {
+private:
+  int a[4];
+public:
+  BaseException() {};
+};
+
+class DerivedException: public BaseException {
+};
+
+int foo() {
+  throw DerivedException();
+  // The alignment passed to memset is 8, not 16, on Darwin.
+
+  // CHECK: [[T0:%.*]] = call i8* @__cxa_allocate_exception(i64 16)
+  // CHECK-NEXT: [[T1:%.*]] = bitcast i8* [[T0]] to %"class.test17::DerivedException"*
+  // CHECK-NEXT: [[T2:%.*]] = bitcast %"class.test17::DerivedException"* [[T1]] to i8*
+  // CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* [[T2]], i8 0, i64 16, i32 8, i1 false)
+}
+}
+
 // CHECK: attributes [[NUW]] = { nounwind }
 // CHECK: attributes [[NR]] = { noreturn }
diff --git a/test/CodeGenCXX/exceptions-cxx-ehsc.cpp b/test/CodeGenCXX/exceptions-cxx-ehsc.cpp
new file mode 100644
index 0000000000000..c660d145393a1
--- /dev/null
+++ b/test/CodeGenCXX/exceptions-cxx-ehsc.cpp
@@ -0,0 +1,31 @@
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=i386-pc-win32 -fexceptions -fcxx-exceptions -fexternc-nounwind | FileCheck %s
+
+namespace test1 {
+struct Cleanup { ~Cleanup(); };
+extern "C" void never_throws();
+void may_throw();
+
+void caller() {
+  Cleanup x;
+  never_throws();
+  may_throw();
+}
+}
+// CHECK-LABEL: define void @"\01?caller@test1@@YAXXZ"(
+// CHECK: call void @never_throws(
+// CHECK: invoke void @"\01?may_throw@test1@@YAXXZ"(
+
+namespace test2 {
+struct Cleanup { ~Cleanup(); };
+extern "C" void throws_int() throw(int);
+void may_throw();
+
+void caller() {
+  Cleanup x;
+  throws_int();
+  may_throw();
+}
+}
+// CHECK-LABEL: define void @"\01?caller@test2@@YAXXZ"(
+// CHECK: invoke void @throws_int(
+// CHECK: invoke void @"\01?may_throw@test2@@YAXXZ"(
diff --git a/test/CodeGenCXX/exceptions-cxx-new.cpp b/test/CodeGenCXX/exceptions-cxx-new.cpp
index 3767f3321c3e9..3329aea32ef20 100644
--- a/test/CodeGenCXX/exceptions-cxx-new.cpp
+++ b/test/CodeGenCXX/exceptions-cxx-new.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 %s -triple i686-pc-win32 -fms-extensions -fexceptions -fcxx-exceptions -fnew-ms-eh -emit-llvm -o - -std=c++11 | FileCheck %s
+// RUN: %clang_cc1 %s -triple i686-pc-win32 -fms-extensions -fexceptions -fcxx-exceptions -emit-llvm -o - -std=c++11 | FileCheck %s
 
 int f(int);
 
@@ -72,6 +72,6 @@ void test_cleanup() {
 // CHECK:   ret void
 
 // CHECK: [[TERMINATE]]
-// CHECK:   cleanuppad within none []
-// CHECK-NEXT:   call void @"\01?terminate@@YAXXZ"()
+// CHECK:   %[[CLEANUPPAD:.*]] = cleanuppad within none []
+// CHECK-NEXT:   call void @"\01?terminate@@YAXXZ"() {{.*}} [ "funclet"(token %[[CLEANUPPAD]]) ]
 
diff --git a/test/CodeGenCXX/exceptions-seh-filter-captures.cpp b/test/CodeGenCXX/exceptions-seh-filter-captures.cpp
index 4e8be72089d5e..ab75a87698ab0 100644
--- a/test/CodeGenCXX/exceptions-seh-filter-captures.cpp
+++ b/test/CodeGenCXX/exceptions-seh-filter-captures.cpp
@@ -70,14 +70,14 @@ void test_lambda() {
   lambda();
 }
 
-// CHECK-LABEL: define internal void @"\01??R<lambda_0>@?test_lambda@@YAXXZ@QEBAXXZ"(%class.anon* %this)
+// CHECK-LABEL: define internal void @"\01??R<lambda_0>@?0??test_lambda@@YAXXZ@QEBA@XZ"(%class.anon* %this)
 // CHECK: @llvm.localescape(i32* %[[l2_addr:[^, ]*]])
 // CHECK: store i32 42, i32* %[[l2_addr]], align 4
 // CHECK: invoke void @might_crash()
 
-// CHECK-LABEL: define internal i32 @"\01?filt$0@0@?R<lambda_0>@?test_lambda@@YAXXZ@"(i8* %exception_pointers, i8* %frame_pointer)
-// CHECK: %[[fp:[^ ]*]] = call i8* @llvm.x86.seh.recoverfp(i8* bitcast (void (%class.anon*)* @"\01??R<lambda_0>@?test_lambda@@YAXXZ@QEBAXXZ" to i8*), i8* %frame_pointer)
-// CHECK: %[[l2_i8:[^ ]*]] = call i8* @llvm.localrecover(i8* bitcast (void (%class.anon*)* @"\01??R<lambda_0>@?test_lambda@@YAXXZ@QEBAXXZ" to i8*), i8* %[[fp]], i32 0)
+// CHECK-LABEL: define internal i32 @"\01?filt$0@0@?R<lambda_0>@?0??test_lambda@@YAXXZ@"(i8* %exception_pointers, i8* %frame_pointer)
+// CHECK: %[[fp:[^ ]*]] = call i8* @llvm.x86.seh.recoverfp(i8* bitcast (void (%class.anon*)* @"\01??R<lambda_0>@?0??test_lambda@@YAXXZ@QEBA@XZ" to i8*), i8* %frame_pointer)
+// CHECK: %[[l2_i8:[^ ]*]] = call i8* @llvm.localrecover(i8* bitcast (void (%class.anon*)* @"\01??R<lambda_0>@?0??test_lambda@@YAXXZ@QEBA@XZ" to i8*), i8* %[[fp]], i32 0)
 // CHECK: %[[l2_ptr:[^ ]*]] = bitcast i8* %[[l2_i8]] to i32*
 // CHECK: %[[l2:[^ ]*]] = load i32, i32* %[[l2_ptr]]
 // CHECK: call i32 (i32, ...) @basic_filter(i32 %[[l2]])
diff --git a/test/CodeGenCXX/exceptions-seh.cpp b/test/CodeGenCXX/exceptions-seh.cpp
index abbe95be3404e..589bc22657604 100644
--- a/test/CodeGenCXX/exceptions-seh.cpp
+++ b/test/CodeGenCXX/exceptions-seh.cpp
@@ -95,7 +95,7 @@ void use_seh_in_lambda() {
 // NOCXX-NOT: invoke
 // NOCXX: ret void
 
-// CHECK-LABEL: define internal void @"\01??R<lambda_0>@?use_seh_in_lambda@@YAXXZ@QEBAXXZ"(%class.anon* %this)
+// CHECK-LABEL: define internal void @"\01??R<lambda_0>@?0??use_seh_in_lambda@@YAXXZ@QEBA@XZ"(%class.anon* %this)
 // CXXEH-SAME:  personality i8* bitcast (i32 (...)* @__C_specific_handler to i8*)
 // CHECK: invoke void @might_throw() #[[NOINLINE]]
 // CHECK: catchpad
diff --git a/test/CodeGenCXX/exceptions.cpp b/test/CodeGenCXX/exceptions.cpp
index ff76b11350db7..86616d1e2c624 100644
--- a/test/CodeGenCXX/exceptions.cpp
+++ b/test/CodeGenCXX/exceptions.cpp
@@ -30,7 +30,7 @@ namespace test1 {
 
   A *a() {
     // CHECK:    define [[A:%.*]]* @_ZN5test11aEv()
-    // CHECK:      [[NEW:%.*]] = call noalias i8* @_Znwm(i64 8)
+    // CHECK:      [[NEW:%.*]] = call i8* @_Znwm(i64 8)
     // CHECK-NEXT: [[CAST:%.*]] = bitcast i8* [[NEW]] to [[A]]*
     // CHECK-NEXT: invoke void @_ZN5test11AC1Ei([[A]]* [[CAST]], i32 5)
     // CHECK:      ret [[A]]* [[CAST]]
@@ -40,7 +40,7 @@ namespace test1 {
 
   A *b() {
     // CHECK:    define [[A:%.*]]* @_ZN5test11bEv()
-    // CHECK:      [[NEW:%.*]] = call noalias i8* @_Znwm(i64 8)
+    // CHECK:      [[NEW:%.*]] = call i8* @_Znwm(i64 8)
     // CHECK-NEXT: [[CAST:%.*]] = bitcast i8* [[NEW]] to [[A]]*
     // CHECK-NEXT: [[FOO:%.*]] = invoke i32 @_ZN5test13fooEv()
     // CHECK:      invoke void @_ZN5test11AC1Ei([[A]]* [[CAST]], i32 [[FOO]])
@@ -56,7 +56,7 @@ namespace test1 {
   A *c() {
     // CHECK:    define [[A:%.*]]* @_ZN5test11cEv()
     // CHECK:      [[ACTIVE:%.*]] = alloca i1
-    // CHECK-NEXT: [[NEW:%.*]] = call noalias i8* @_Znwm(i64 8)
+    // CHECK-NEXT: [[NEW:%.*]] = call i8* @_Znwm(i64 8)
     // CHECK-NEXT: store i1 true, i1* [[ACTIVE]] 
     // CHECK-NEXT: [[CAST:%.*]] = bitcast i8* [[NEW]] to [[A]]*
     // CHECK-NEXT: invoke void @_ZN5test11BC1Ev([[B:%.*]]* [[T0:%.*]])
@@ -82,7 +82,7 @@ namespace test1 {
   A *d() {
     // CHECK:    define [[A:%.*]]* @_ZN5test11dEv()
     // CHECK:      [[ACTIVE:%.*]] = alloca i1
-    // CHECK-NEXT: [[NEW:%.*]] = call noalias i8* @_Znwm(i64 8)
+    // CHECK-NEXT: [[NEW:%.*]] = call i8* @_Znwm(i64 8)
     // CHECK-NEXT: store i1 true, i1* [[ACTIVE]] 
     // CHECK-NEXT: [[CAST:%.*]] = bitcast i8* [[NEW]] to [[A]]*
     // CHECK-NEXT: invoke void @_ZN5test11BC1Ev([[B:%.*]]* [[T0:%.*]])
@@ -100,7 +100,7 @@ namespace test1 {
   A *e() {
     // CHECK:    define [[A:%.*]]* @_ZN5test11eEv()
     // CHECK:      [[ACTIVE:%.*]] = alloca i1
-    // CHECK-NEXT: [[NEW:%.*]] = call noalias i8* @_Znwm(i64 8)
+    // CHECK-NEXT: [[NEW:%.*]] = call i8* @_Znwm(i64 8)
     // CHECK-NEXT: store i1 true, i1* [[ACTIVE]] 
     // CHECK-NEXT: [[CAST:%.*]] = bitcast i8* [[NEW]] to [[A]]*
     // CHECK-NEXT: invoke void @_ZN5test11BC1Ev([[B:%.*]]* [[T0:%.*]])
@@ -131,7 +131,7 @@ namespace test1 {
     // CHECK:    define [[A:%.*]]* @_ZN5test11iEv()
     // CHECK:      [[X:%.*]] = alloca [[A]]*, align 8
     // CHECK:      [[ACTIVE:%.*]] = alloca i1
-    // CHECK:      [[NEW:%.*]] = call noalias i8* @_Znwm(i64 8)
+    // CHECK:      [[NEW:%.*]] = call i8* @_Znwm(i64 8)
     // CHECK-NEXT: store i1 true, i1* [[ACTIVE]] 
     // CHECK-NEXT: [[CAST:%.*]] = bitcast i8* [[NEW]] to [[A]]*
     // CHECK-NEXT: invoke void @_ZN5test15makeBEv([[B:%.*]]* sret [[T0:%.*]])
@@ -422,7 +422,7 @@ namespace test9 {
     return new A[10];
   }
   // CHECK: define {{%.*}}* @_ZN5test94testEv
-  // CHECK: [[TEST9_NEW:%.*]] = call noalias i8* @_Znam
+  // CHECK: [[TEST9_NEW:%.*]] = call i8* @_Znam
   // CHECK: call void @_ZdaPv(i8* [[TEST9_NEW]])
 }
 
diff --git a/test/CodeGenCXX/explicit-instantiation.cpp b/test/CodeGenCXX/explicit-instantiation.cpp
index 6076444c25b02..7e00d78e4834d 100644
--- a/test/CodeGenCXX/explicit-instantiation.cpp
+++ b/test/CodeGenCXX/explicit-instantiation.cpp
@@ -1,5 +1,6 @@
 // RUN: %clang_cc1 -emit-llvm -triple i686-pc-linux-gnu -std=c++1y -o - %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-NO-OPT
 // RUN: %clang_cc1 -emit-llvm -triple i686-pc-linux-gnu -std=c++1y -O3 -disable-llvm-optzns -o - %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-OPT
+// RUN: %clang_cc1 -emit-llvm -triple i686-pc-win32 -std=c++1y -o - %s | FileCheck %s --check-prefix=CHECK-MS
 
 // This check logically is attached to 'template int S<int>::i;' below.
 // CHECK: @_ZN1SIiE1iE = weak_odr global i32
@@ -103,6 +104,28 @@ int g() { return S<int>().f(); }
 template struct S<int>;
 }
 
+namespace NestedClasses {
+  // Check how explicit instantiation of an outer class affects the inner class.
+  template <typename T> struct Outer {
+    struct Inner {
+      void f() {}
+    };
+  };
+
+  // Explicit instantiation definition of Outer causes explicit instantiation
+  // definition of Inner.
+  template struct Outer<int>;
+  // CHECK: define weak_odr void @_ZN13NestedClasses5OuterIiE5Inner1fEv
+  // CHECK-MS: define weak_odr x86_thiscallcc void @"\01?f@Inner@?$Outer@H@NestedClasses@@QAEXXZ"
+
+  // Explicit instantiation declaration of Outer causes explicit instantiation
+  // declaration of Inner, but not in MSVC mode.
+  extern template struct Outer<char>;
+  auto use = &Outer<char>::Inner::f;
+  // CHECK: {{declare|define available_externally}} void @_ZN13NestedClasses5OuterIcE5Inner1fEv
+  // CHECK-MS: define linkonce_odr x86_thiscallcc void @"\01?f@Inner@?$Outer@D@NestedClasses@@QAEXXZ"
+}
+
 // Check that we emit definitions from explicit instantiations even when they
 // occur prior to the definition itself.
 template <typename T> struct S {
diff --git a/test/CodeGenCXX/extern-c.cpp b/test/CodeGenCXX/extern-c.cpp
index 5b59a38ba0d79..1046915fcaa2c 100644
--- a/test/CodeGenCXX/extern-c.cpp
+++ b/test/CodeGenCXX/extern-c.cpp
@@ -16,8 +16,23 @@ extern "C" struct d;
 // CHECK-NOT: should_not_appear
 extern "C++" int should_not_appear;
 
+// CHECK: @_ZN3foo10extern_cxxE = global
+extern "C++" int extern_cxx = 0;
+
 }
 
+// CHECK-NOT: @global_a = global
+extern "C" int global_a;
+
+// CHECK: @global_b = global
+extern "C" int global_b = 0;
+
+// CHECK-NOT: should_not_appear
+extern "C++" int should_not_appear;
+
+// CHECK: @extern_cxx = global
+extern "C++" int extern_cxx = 0;
+
 namespace test1 {
   namespace {
     struct X {};
@@ -59,10 +74,10 @@ extern "C" {
 
   // CHECK-NOT: @unused
   // CHECK-NOT: @duplicate_internal
-  // CHECK: @internal_var = internal alias i32, i32* @_Z12internal_var
+  // CHECK: @internal_var = internal alias i32, i32* @_ZL12internal_var
   // CHECK-NOT: @unused
   // CHECK-NOT: @duplicate_internal
-  // CHECK: @internal_fn = internal alias i32 (), i32 ()* @_Z11internal_fnv
+  // CHECK: @internal_fn = internal alias i32 (), i32 ()* @_ZL11internal_fnv
   // CHECK-NOT: @unused
   // CHECK-NOT: @duplicate_internal
 }
diff --git a/test/CodeGenCXX/float128-declarations.cpp b/test/CodeGenCXX/float128-declarations.cpp
new file mode 100644
index 0000000000000..e1604a61cac7c
--- /dev/null
+++ b/test/CodeGenCXX/float128-declarations.cpp
@@ -0,0 +1,138 @@
+// RUN: %clang_cc1 -emit-llvm -triple powerpc64-unknown-unknown \
+// RUN:   -target-feature +float128 -std=c++11 %s -o - | FileCheck %s
+// RUN: %clang_cc1 -emit-llvm -triple powerpc64le-unknown-unknown \
+// RUN:   -target-feature +float128 -std=c++11 %s -o - | FileCheck %s
+// RUN: %clang_cc1 -emit-llvm -triple i386-unknown-linux-gnu -std=c++11 \
+// RUN:   %s -o - | FileCheck %s -check-prefix=CHECK-X86
+// RUN: %clang_cc1 -emit-llvm -triple x86_64-unknown-linux-gnu -std=c++11 \
+// RUN:   %s -o - | FileCheck %s -check-prefix=CHECK-X86
+// RUN: %clang_cc1 -emit-llvm -triple systemz-unknown-linux-gnu -std=c++11 \
+// RUN:   %s -o - | FileCheck %s -check-prefix=CHECK-SYSZ
+//
+/*  Various contexts where type __float128 can appear. The different check
+    prefixes are due to different mangling on X86 and different calling
+    convention on SystemZ. */
+
+/*  Namespace */
+namespace {
+  __float128 f1n;
+  __float128 f2n = 33.q;
+  __float128 arr1n[10];
+  __float128 arr2n[] = { 1.2q, 3.0q, 3.e11q };
+  const volatile __float128 func1n(const __float128 &arg) {
+    return arg + f2n + arr1n[4] - arr2n[1];
+  }
+}
+
+/* File */
+__float128 f1f;
+__float128 f2f = 32.4q;
+static __float128 f3f = f2f;
+__float128 arr1f[10];
+__float128 arr2f[] = { -1.2q, -3.0q, -3.e11q };
+__float128 func1f(__float128 arg);
+
+/* Class */
+class C1 {
+  __float128 f1c;
+  static const __float128 f2c;
+  volatile __float128 f3c;
+public:
+  C1(__float128 arg) : f1c(arg), f3c(arg) { }
+  __float128 func1c(__float128 arg ) {
+    return f1c + arg;
+  }
+  static __float128 func2c(__float128 arg) {
+    return arg * C1::f2c;
+  }
+};
+
+/*  Template */
+template <class C> C func1t(C arg) { return arg * 2.q; }
+template <class C> struct S1 {
+  C mem1;
+};
+template <> struct S1<__float128> {
+  __float128 mem2;
+};
+
+/* Local */
+int main(void) {
+  __float128 f1l = 123e220q;
+  __float128 f2l = -0.q;
+  __float128 f3l = 1.189731495357231765085759326628007e4932q;
+  C1 c1(f1l);
+  S1<__float128> s1 = { 132.q };
+  __float128 f4l = func1n(f1l) + func1f(f2l) + c1.func1c(f3l) + c1.func2c(f1l) +
+    func1t(f1l) + s1.mem2 - f1n + f2n;
+#if (__cplusplus >= 201103L)
+  auto f5l = -1.q, *f6l = &f2l, f7l = func1t(f3l);
+#endif
+  __float128 f8l = f4l++;
+  __float128 arr1l[] = { -1.q, -0.q, -11.q };
+}
+// CHECK-DAG: @_ZN12_GLOBAL__N_13f1nE = internal global fp128 0xL00000000000000000000000000000000
+// CHECK-DAG: @_ZN12_GLOBAL__N_13f2nE = internal global fp128 0xL00000000000000004004080000000000
+// CHECK-DAG: @_ZN12_GLOBAL__N_15arr1nE = internal global [10 x fp128]
+// CHECK-DAG: @_ZN12_GLOBAL__N_15arr2nE = internal global [3 x fp128] [fp128 0xL33333333333333333FFF333333333333, fp128 0xL00000000000000004000800000000000, fp128 0xL00000000000000004025176592E00000]
+// CHECK-DAG: define internal fp128 @_ZN12_GLOBAL__N_16func1nERKU10__float128(fp128*
+// CHECK-DAG: @f1f = global fp128 0xL00000000000000000000000000000000
+// CHECK-DAG: @f2f = global fp128 0xL33333333333333334004033333333333
+// CHECK-DAG: @arr1f = global [10 x fp128]
+// CHECK-DAG: @arr2f = global [3 x fp128] [fp128 0xL3333333333333333BFFF333333333333, fp128 0xL0000000000000000C000800000000000, fp128 0xL0000000000000000C025176592E00000]
+// CHECK-DAG: declare fp128 @_Z6func1fU10__float128(fp128)
+// CHECK-DAG: define linkonce_odr void @_ZN2C1C2EU10__float128(%class.C1* %this, fp128 %arg)
+// CHECK-DAG: define linkonce_odr fp128 @_ZN2C16func2cEU10__float128(fp128 %arg)
+// CHECK-DAG: define linkonce_odr fp128 @_Z6func1tIU10__float128ET_S0_(fp128 %arg)
+// CHECK-DAG: @_ZZ4mainE2s1 = private unnamed_addr constant %struct.S1 { fp128 0xL00000000000000004006080000000000 }
+// CHECK-DAG: store fp128 0xLF0AFD0EBFF292DCE42E0B38CDD83F26F, fp128* %f1l, align 16
+// CHECK-DAG: store fp128 0xL00000000000000008000000000000000, fp128* %f2l, align 16
+// CHECK-DAG: store fp128 0xLFFFFFFFFFFFFFFFF7FFEFFFFFFFFFFFF, fp128* %f3l, align 16
+// CHECK-DAG: store fp128 0xL0000000000000000BFFF000000000000, fp128* %f5l, align 16
+// CHECK-DAG: [[F4L:%[a-z0-9]+]] = load fp128, fp128* %f4l
+// CHECK-DAG: [[INC:%[a-z0-9]+]] = fadd fp128 [[F4L]], 0xL00000000000000003FFF000000000000
+// CHECK-DAG: store fp128 [[INC]], fp128* %f4l
+
+// CHECK-X86-DAG: @_ZN12_GLOBAL__N_13f1nE = internal global fp128 0xL00000000000000000000000000000000
+// CHECK-X86-DAG: @_ZN12_GLOBAL__N_13f2nE = internal global fp128 0xL00000000000000004004080000000000
+// CHECK-X86-DAG: @_ZN12_GLOBAL__N_15arr1nE = internal global [10 x fp128]
+// CHECK-X86-DAG: @_ZN12_GLOBAL__N_15arr2nE = internal global [3 x fp128] [fp128 0xL33333333333333333FFF333333333333, fp128 0xL00000000000000004000800000000000, fp128 0xL00000000000000004025176592E00000]
+// CHECK-X86-DAG: define internal fp128 @_ZN12_GLOBAL__N_16func1nERKg(fp128*
+// CHECK-X86-DAG: @f1f = global fp128 0xL00000000000000000000000000000000
+// CHECK-X86-DAG: @f2f = global fp128 0xL33333333333333334004033333333333
+// CHECK-X86-DAG: @arr1f = global [10 x fp128]
+// CHECK-X86-DAG: @arr2f = global [3 x fp128] [fp128 0xL3333333333333333BFFF333333333333, fp128 0xL0000000000000000C000800000000000, fp128 0xL0000000000000000C025176592E00000]
+// CHECK-X86-DAG: declare fp128 @_Z6func1fg(fp128)
+// CHECK-X86-DAG: define linkonce_odr void @_ZN2C1C2Eg(%class.C1* %this, fp128 %arg)
+// CHECK-X86-DAG: define linkonce_odr fp128 @_ZN2C16func2cEg(fp128 %arg)
+// CHECK-X86-DAG: define linkonce_odr fp128 @_Z6func1tIgET_S0_(fp128 %arg)
+// CHECK-X86-DAG: @_ZZ4mainE2s1 = private unnamed_addr constant %struct.S1 { fp128 0xL00000000000000004006080000000000 }
+// CHECK-X86-DAG: store fp128 0xLF0AFD0EBFF292DCE42E0B38CDD83F26F, fp128* %f1l, align 16
+// CHECK-X86-DAG: store fp128 0xL00000000000000008000000000000000, fp128* %f2l, align 16
+// CHECK-X86-DAG: store fp128 0xLFFFFFFFFFFFFFFFF7FFEFFFFFFFFFFFF, fp128* %f3l, align 16
+// CHECK-X86-DAG: store fp128 0xL0000000000000000BFFF000000000000, fp128* %f5l, align 16
+// CHECK-X86-DAG: [[F4L:%[a-z0-9]+]] = load fp128, fp128* %f4l
+// CHECK-X86-DAG: [[INC:%[a-z0-9]+]] = fadd fp128 [[F4L]], 0xL00000000000000003FFF000000000000
+// CHECK-X86-DAG: store fp128 [[INC]], fp128* %f4l
+
+// CHECK-SYSZ-DAG: @_ZN12_GLOBAL__N_13f1nE = internal global fp128 0xL00000000000000000000000000000000
+// CHECK-SYSZ-DAG: @_ZN12_GLOBAL__N_13f2nE = internal global fp128 0xL00000000000000004004080000000000
+// CHECK-SYSZ-DAG: @_ZN12_GLOBAL__N_15arr1nE = internal global [10 x fp128]
+// CHECK-SYSZ-DAG: @_ZN12_GLOBAL__N_15arr2nE = internal global [3 x fp128] [fp128 0xL33333333333333333FFF333333333333, fp128 0xL00000000000000004000800000000000, fp128 0xL00000000000000004025176592E00000]
+// CHECK-SYSZ-DAG: define internal void @_ZN12_GLOBAL__N_16func1nERKU10__float128(fp128*
+// CHECK-SYSZ-DAG: @f1f = global fp128 0xL00000000000000000000000000000000
+// CHECK-SYSZ-DAG: @f2f = global fp128 0xL33333333333333334004033333333333
+// CHECK-SYSZ-DAG: @arr1f = global [10 x fp128]
+// CHECK-SYSZ-DAG: @arr2f = global [3 x fp128] [fp128 0xL3333333333333333BFFF333333333333, fp128 0xL0000000000000000C000800000000000, fp128 0xL0000000000000000C025176592E00000]
+// CHECK-SYSZ-DAG: declare void @_Z6func1fU10__float128(fp128*
+// CHECK-SYSZ-DAG: define linkonce_odr void @_ZN2C1C2EU10__float128(%class.C1* %this, fp128*
+// CHECK-SYSZ-DAG: define linkonce_odr void @_ZN2C16func2cEU10__float128(fp128*
+// CHECK-SYSZ-DAG: define linkonce_odr void @_Z6func1tIU10__float128ET_S0_(fp128*
+// CHECK-SYSZ-DAG: @_ZZ4mainE2s1 = private unnamed_addr constant %struct.S1 { fp128 0xL00000000000000004006080000000000 }
+// CHECK-SYSZ-DAG: store fp128 0xLF0AFD0EBFF292DCE42E0B38CDD83F26F, fp128* %f1l, align 16
+// CHECK-SYSZ-DAG: store fp128 0xL00000000000000008000000000000000, fp128* %f2l, align 16
+// CHECK-SYSZ-DAG: store fp128 0xLFFFFFFFFFFFFFFFF7FFEFFFFFFFFFFFF, fp128* %f3l, align 16
+// CHECK-SYSZ-DAG: store fp128 0xL0000000000000000BFFF000000000000, fp128* %f5l, align 16
+// CHECK-SYSZ-DAG: [[F4L:%[a-z0-9]+]] = load fp128, fp128* %f4l
+// CHECK-SYSZ-DAG: [[INC:%[a-z0-9]+]] = fadd fp128 [[F4L]], 0xL00000000000000003FFF000000000000
+// CHECK-SYSZ-DAG: store fp128 [[INC]], fp128* %f4l
diff --git a/test/CodeGenCXX/goto.cpp b/test/CodeGenCXX/goto.cpp
index c1a0eeccf8113..27bd7affbac9b 100644
--- a/test/CodeGenCXX/goto.cpp
+++ b/test/CodeGenCXX/goto.cpp
@@ -18,7 +18,7 @@ namespace test0 {
     // CHECK-NEXT: [[CLEANUPACTIVE:%.*]] = alloca i1
     // CHECK:      call void @_ZN5test01AC1Ev([[A]]* [[Y]])
     // CHECK-NEXT: invoke void @_ZN5test01AC1Ev([[A]]* [[Z]])
-    // CHECK:      [[NEW:%.*]] = invoke noalias i8* @_Znwm(i64 1)
+    // CHECK:      [[NEW:%.*]] = invoke i8* @_Znwm(i64 1)
     // CHECK:      store i1 true, i1* [[CLEANUPACTIVE]]
     // CHECK:      [[NEWCAST:%.*]] = bitcast i8* [[NEW]] to [[V]]*
     // CHECK-NEXT: invoke void @_ZN5test01AC1Ev([[A]]* [[TMP]])
diff --git a/test/CodeGenCXX/inheriting-constructor.cpp b/test/CodeGenCXX/inheriting-constructor.cpp
index 42080a2dafa94..a3adf70ebb188 100644
--- a/test/CodeGenCXX/inheriting-constructor.cpp
+++ b/test/CodeGenCXX/inheriting-constructor.cpp
@@ -1,4 +1,8 @@
-// RUN: %clang_cc1 -std=c++11 -triple x86_64-apple-darwin10 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -std=c++11 -triple i386-linux -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK --check-prefix=ITANIUM
+// RUN: %clang_cc1 -std=c++11 -triple x86_64-darwin -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK --check-prefix=ITANIUM
+// RUN: %clang_cc1 -std=c++11 -triple arm64-ehabi -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK --check-prefix=ITANIUM
+// RUN: %clang_cc1 -std=c++11 -triple i386-windows -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK --check-prefix=MSABI --check-prefix=WIN32
+// RUN: %clang_cc1 -std=c++11 -triple x86_64-windows -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK --check-prefix=MSABI --check-prefix=WIN64
 
 // PR12219
 struct A { A(int); virtual ~A(); };
@@ -11,18 +15,396 @@ struct C { template<typename T> C(T); };
 struct D : C { using C::C; };
 D d(123);
 
-// CHECK-LABEL: define void @_ZN1BD2Ev
-// CHECK-LABEL: define void @_ZN1BD1Ev
-// CHECK-LABEL: define void @_ZN1BD0Ev
+// ITANIUM-LABEL: define void @_ZN1BD2Ev
+// ITANIUM-LABEL: define void @_ZN1BD1Ev
+// ITANIUM-LABEL: define void @_ZN1BD0Ev
+// WIN32-LABEL: define {{.*}}void @"\01??1B@@UAE@XZ"
+// WIN64-LABEL: define {{.*}}void @"\01??1B@@UEAA@XZ"
 
-// CHECK-LABEL: define linkonce_odr void @_ZN1BC1Ei(
-// CHECK: call void @_ZN1BC2Ei(
+// ITANIUM-LABEL: define linkonce_odr void @_ZN1BCI11AEi(
+// ITANIUM: call void @_ZN1BCI21AEi(
 
-// CHECK-LABEL: define linkonce_odr void @_ZN1DC1IiEET_(
-// CHECK: call void @_ZN1DC2IiEET_(
+// ITANIUM-LABEL: define linkonce_odr void @_ZN1DCI11CIiEET_(
+// ITANIUM: call void @_ZN1DCI21CIiEET_(
 
-// CHECK-LABEL: define linkonce_odr void @_ZN1BC2Ei(
-// CHECK: call void @_ZN1AC2Ei(
+// WIN32-LABEL: define internal {{.*}} @"\01??0B@@QAE@H@Z"(
+// WIN32: call {{.*}} @"\01??0A@@QAE@H@Z"(
+// WIN64-LABEL: define internal {{.*}} @"\01??0B@@QEAA@H@Z"(
+// WIN64: call {{.*}} @"\01??0A@@QEAA@H@Z"(
 
-// CHECK-LABEL: define linkonce_odr void @_ZN1DC2IiEET_(
-// CHECK: call void @_ZN1CC2IiEET_(
+// WIN32-LABEL: define internal {{.*}} @"\01??0D@@QAE@H@Z"(
+// WIN32: call {{.*}} @"\01??$?0H@C@@QAE@H@Z"
+// WIN64-LABEL: define internal {{.*}} @"\01??0D@@QEAA@H@Z"(
+// WIN64: call {{.*}} @"\01??$?0H@C@@QEAA@H@Z"
+
+struct Q { Q(int); Q(const Q&); ~Q(); };
+struct Z { Z(); Z(int); ~Z(); int n; };
+
+namespace noninline_nonvirt {
+  struct A { A(int, Q&&, void *__attribute__((pass_object_size(0)))); int n; };
+  struct B : Z, A { Z z; using A::A; };
+  B b(1, 2, &b);
+  // ITANIUM-LABEL: define {{.*}} @__cxx_global_var_init
+  // ITANIUM: call void @_ZN1QC1Ei({{.*}} %[[TMP:.*]], i32 2)
+  // ITANIUM: call void @_ZN17noninline_nonvirt1BCI1NS_1AEEiO1QPvU17pass_object_size0({{.*}} @_ZN17noninline_nonvirt1bE, i32 1, {{.*}} %[[TMP]], i8* {{.*}} @_ZN17noninline_nonvirt1bE{{.*}}, i{{32|64}} 12)
+  // ITANIUM: call void @_ZN1QD1Ev({{.*}} %[[TMP]])
+  // ITANIUM: call i32 @__cxa_atexit(
+
+  // Complete object ctor for B delegates to base object ctor.
+  // ITANIUM-LABEL: define linkonce_odr void @_ZN17noninline_nonvirt1BCI1NS_1AEEiO1QPvU17pass_object_size0(
+  // ITANIUM: call void @_ZN17noninline_nonvirt1BCI2NS_1AEEiO1QPvU17pass_object_size0({{.*}}, i32 {{.*}}, %{{.*}}* {{.*}}, i8* {{.*}}, i{{32|64}} {{.*}})
+
+  // In MSABI, we don't have ctor variants. B ctor forwards to A ctor.
+  // MSABI-LABEL: define internal {{.*}} @"\01??0B@noninline_nonvirt@@Q{{AE|EAA}}@H$$Q{{E?}}AUQ@@P{{E?}}AXW4__pass_object_size0@__clang@@@Z"(%{{.*}}, i32{{.*}}, %{{.*}}, i8*{{.*}}, i{{32|64}}{{.*}})
+  // MSABI: call {{.*}} @"\01??0Z@@Q{{AE|EAA}}@XZ"(
+  // MSABI: call {{.*}} @"\01??0A@noninline_nonvirt@@Q{{AE|EAA}}@H$$Q{{E?}}AUQ@@P{{E?}}AXW4__pass_object_size0@__clang@@@Z"(%{{.*}}, i32{{.*}}, %{{.*}}, i8*{{.*}}, i{{32|64}}{{.*}})
+  // MSABI: call {{.*}} @"\01??0Z@@Q{{AE|EAA}}@XZ"(
+
+  struct C : B { using B::B; };
+  C c(1, 2, &c);
+  // Complete object ctor for C delegates.
+  // ITANIUM-LABEL: define linkonce_odr void @_ZN17noninline_nonvirt1CCI1NS_1AEEiO1QPvU17pass_object_size0(
+  // ITANIUM: call void @_ZN17noninline_nonvirt1CCI2NS_1AEEiO1QPvU17pass_object_size0({{.*}}, i32 {{.*}}, %{{.*}}* {{.*}}, i8* {{.*}}, i{{32|64}} {{.*}})
+
+  // MSABI-LABEL: define internal {{.*}} @"\01??0C@noninline_nonvirt@@Q{{AE|EAA}}@H$$Q{{E?}}AUQ@@P{{E?}}AXW4__pass_object_size0@__clang@@@Z"(%{{.*}}, i32{{.*}}, %{{.*}}, i8*{{.*}}, i{{32|64}}{{.*}})
+  // MSABI: call {{.*}} @"\01??0B@noninline_nonvirt@@Q{{AE|EAA}}@H$$Q{{E?}}AUQ@@P{{E?}}AXW4__pass_object_size0@__clang@@@Z"(%{{.*}}, i32{{.*}}, %{{.*}}, i8*{{.*}}, i{{32|64}}{{.*}})
+}
+
+namespace noninline_virt {
+  struct A { A(int, Q&&, void *__attribute__((pass_object_size(0)))); int n; };
+  struct B : Z, virtual A { Z z; using A::A; };
+  B b(1, 2, &b);
+  // Complete object ctor forwards to A ctor then constructs Zs.
+  // ITANIUM-LABEL: define linkonce_odr void @_ZN14noninline_virt1BCI1NS_1AEEiO1QPvU17pass_object_size0(
+  // ITANIUM: call void @_ZN14noninline_virt1AC2EiO1QPvU17pass_object_size0({{.*}} %{{.*}}, i32 %{{.*}}, %{{.*}}* {{.*}}, i8* {{.*}}, i{{32|64}} %{{.*}}
+  // ITANIUM: call void @_ZN1ZC2Ev(
+  // ITANIUM: store {{.*}} @_ZTVN14noninline_virt1BE
+  // ITANIUM: call void @_ZN1ZC1Ev(
+
+  // MSABI-LABEL: define internal {{.*}} @"\01??0B@noninline_virt@@Q{{AE|EAA}}@H$$Q{{E?}}AUQ@@P{{E?}}AXW4__pass_object_size0@__clang@@@Z"(%{{.*}}, i32{{.*}}, %{{.*}}, i8*{{.*}}, i{{32|64}}{{.*}}, i32 %{{.*}})
+  // MSABI: %[[COMPLETE:.*]] = icmp ne
+  // MSABI: br i1 %[[COMPLETE]],
+  // MSABI: call {{.*}} @"\01??0A@noninline_virt@@Q{{AE|EAA}}@H$$Q{{E?}}AUQ@@P{{E?}}AXW4__pass_object_size0@__clang@@@Z"(%{{.*}}, i32{{.*}}, %{{.*}}, i8*{{.*}}, i{{32|64}}{{.*}})
+  // MSABI: br
+  // MSABI: call {{.*}} @"\01??0Z@@Q{{AE|EAA}}@XZ"(
+  // MSABI: call {{.*}} @"\01??0Z@@Q{{AE|EAA}}@XZ"(
+
+  struct C : B { using B::B; };
+  C c(1, 2, &c);
+  // Complete object ctor forwards to A ctor, then calls B's base inheriting
+  // constructor, which takes no arguments other than the this pointer and VTT.
+  // ITANIUM_LABEL: define linkonce_odr void @_ZN14noninline_virt1CCI1NS_1AEEiO1QPvU17pass_object_size0(
+  // ITANIUM: call void @_ZN14noninline_virt1AC2EiO1QPvU17pass_object_size0({{.*}} %{{.*}}, i32 %{{.*}}, %{{.*}}* {{.*}}, i8* %{{.*}}, i{{32|64}} %{{.*}})
+  // ITANIUM: call void @_ZN14noninline_virt1BCI2NS_1AEEiO1QPvU17pass_object_size0(%{{.*}}* %{{.*}}, i8** getelementptr inbounds ([2 x i8*], [2 x i8*]* @_ZTTN14noninline_virt1CE, i64 0, i64 1))
+  // ITANIUM: store {{.*}} @_ZTVN14noninline_virt1CE
+
+  // C constructor forwards to B constructor and A constructor. We pass the args
+  // to both. FIXME: Can we pass undef here instead, for the base object
+  // constructor call?
+  // MSABI-LABEL: define internal {{.*}} @"\01??0C@noninline_virt@@Q{{AE|EAA}}@H$$Q{{E?}}AUQ@@P{{E?}}AXW4__pass_object_size0@__clang@@@Z"(%{{.*}}, i32{{.*}}, %{{.*}}, i8*{{.*}}, i{{32|64}}{{.*}}, i32 %{{.*}})
+  // MSABI: %[[COMPLETE:.*]] = icmp ne
+  // MSABI: br i1 %[[COMPLETE]],
+  // MSABI: call {{.*}} @"\01??0A@noninline_virt@@Q{{AE|EAA}}@H$$Q{{E?}}AUQ@@P{{E?}}AXW4__pass_object_size0@__clang@@@Z"(%{{.*}}, i32{{.*}}, %{{.*}}, i8*{{.*}}, i{{32|64}}{{.*}})
+  // MSABI: br
+  // MSABI: call {{.*}} @"\01??0B@noninline_virt@@Q{{AE|EAA}}@H$$Q{{E?}}AUQ@@P{{E?}}AXW4__pass_object_size0@__clang@@@Z"(%{{.*}}, i32{{.*}}, %{{.*}}, i8*{{.*}}, i{{32|64}}{{.*}}, i32 0)
+}
+
+// For MSABI only, check that inalloca arguments result in inlining.
+namespace inalloca_nonvirt {
+  struct A { A(Q, int, Q, Q&&); int n; };
+  struct B : Z, A { Z z; using A::A; };
+  B b(1, 2, 3, 4);
+  // No inlining implied for Itanium.
+  // ITANIUM-LABEL: define linkonce_odr void @_ZN16inalloca_nonvirt1BCI1NS_1AEE1QiS1_OS1_(
+  // ITANIUM: call void @_ZN16inalloca_nonvirt1BCI2NS_1AEE1QiS1_OS1_(
+
+  // MSABI-LABEL: define internal void @"\01??__Eb@inalloca_nonvirt@@YAXXZ"(
+
+  // On Win32, the inalloca call can't be forwarded so we force inlining.
+  // WIN32: %[[TMP:.*]] = alloca
+  // WIN32: call i8* @llvm.stacksave()
+  // WIN32: %[[ARGMEM:.*]] = alloca inalloca
+  // WIN32: call {{.*}} @"\01??0Q@@QAE@H@Z"(%{{.*}}* %[[TMP]], i32 4)
+  // WIN32: %[[ARG3:.*]] = getelementptr {{.*}} %[[ARGMEM]]
+  // WIN32: call {{.*}} @"\01??0Q@@QAE@H@Z"({{.*}}* %[[ARG3]], i32 3)
+  // WIN32: %[[ARG1:.*]] = getelementptr {{.*}} %[[ARGMEM]]
+  // WIN32: call {{.*}} @"\01??0Q@@QAE@H@Z"({{.*}}* %[[ARG1]], i32 1)
+  // WIN32: call {{.*}} @"\01??0Z@@QAE@XZ"(
+  // WIN32: %[[ARG2:.*]] = getelementptr {{.*}} %[[ARGMEM]]
+  // WIN32: store i32 2, i32* %[[ARG2]]
+  // WIN32: %[[ARG4:.*]] = getelementptr {{.*}} %[[ARGMEM]]
+  // WIN32: store {{.*}}* %[[TMP]], {{.*}}** %[[ARG4]]
+  // WIN32: call {{.*}} @"\01??0A@inalloca_nonvirt@@QAE@UQ@@H0$$QAU2@@Z"(%{{[^,]*}}, <{{.*}}>* inalloca %[[ARGMEM]])
+  // WIN32: call void @llvm.stackrestore(
+  // WIN32: call {{.*}} @"\01??0Z@@QAE@XZ"(
+  // WIN32: call {{.*}} @"\01??_DQ@@QAE@XZ"(
+
+  // On Win64, the Q arguments would be destroyed in the callee. We don't yet
+  // support that in the non-inlined case, so we force inlining.
+  // WIN64: %[[TMP:.*]] = alloca
+  // WIN64: %[[ARG3:.*]] = alloca
+  // WIN64: %[[ARG1:.*]] = alloca
+  // WIN64: call {{.*}} @"\01??0Q@@QEAA@H@Z"({{.*}}* %[[TMP]], i32 4)
+  // WIN64: call {{.*}} @"\01??0Q@@QEAA@H@Z"({{.*}}* %[[ARG3]], i32 3)
+  // WIN64: call {{.*}} @"\01??0Q@@QEAA@H@Z"({{.*}}* %[[ARG1]], i32 1)
+  // WIN64: call {{.*}} @"\01??0Z@@QEAA@XZ"(
+  // WIN64: call {{.*}} @"\01??0A@inalloca_nonvirt@@QEAA@UQ@@H0$$QEAU2@@Z"(%{{.*}}, %{{.*}}* %[[ARG1]], i32 2, %{{.*}}* %[[ARG3]], %{{.*}} %[[TMP]])
+  // WIN64: call {{.*}} @"\01??0Z@@QEAA@XZ"(
+  // WIN64: call void @"\01??_DQ@@QEAA@XZ"({{.*}}* %[[TMP]])
+
+  struct C : B { using B::B; };
+  C c(1, 2, 3, 4);
+  // MSABI-LABEL: define internal void @"\01??__Ec@inalloca_nonvirt@@YAXXZ"(
+
+  // On Win32, the inalloca call can't be forwarded so we force inlining.
+  // WIN32: %[[TMP:.*]] = alloca
+  // WIN32: call i8* @llvm.stacksave()
+  // WIN32: %[[ARGMEM:.*]] = alloca inalloca
+  // WIN32: call {{.*}} @"\01??0Q@@QAE@H@Z"(%{{.*}}* %[[TMP]], i32 4)
+  // WIN32: %[[ARG3:.*]] = getelementptr {{.*}} %[[ARGMEM]]
+  // WIN32: call {{.*}} @"\01??0Q@@QAE@H@Z"({{.*}}* %[[ARG3]], i32 3)
+  // WIN32: %[[ARG1:.*]] = getelementptr {{.*}} %[[ARGMEM]]
+  // WIN32: call {{.*}} @"\01??0Q@@QAE@H@Z"({{.*}}* %[[ARG1]], i32 1)
+  // WIN32: call {{.*}} @"\01??0Z@@QAE@XZ"(
+  // WIN32: %[[ARG2:.*]] = getelementptr {{.*}} %[[ARGMEM]]
+  // WIN32: store i32 2, i32* %[[ARG2]]
+  // WIN32: %[[ARG4:.*]] = getelementptr {{.*}} %[[ARGMEM]]
+  // WIN32: store {{.*}}* %[[TMP]], {{.*}}** %[[ARG4]]
+  // WIN32: call {{.*}} @"\01??0A@inalloca_nonvirt@@QAE@UQ@@H0$$QAU2@@Z"(%{{[^,]*}}, <{{.*}}>* inalloca %[[ARGMEM]])
+  // WIN32: call void @llvm.stackrestore(
+  // WIN32: call {{.*}} @"\01??0Z@@QAE@XZ"(
+  // WIN32: call {{.*}} @"\01??_DQ@@QAE@XZ"(
+
+  // On Win64, the Q arguments would be destroyed in the callee. We don't yet
+  // support that in the non-inlined case, so we force inlining.
+  // WIN64: %[[TMP:.*]] = alloca
+  // WIN64: %[[ARG3:.*]] = alloca
+  // WIN64: %[[ARG1:.*]] = alloca
+  // WIN64: call {{.*}} @"\01??0Q@@QEAA@H@Z"({{.*}}* %[[TMP]], i32 4)
+  // WIN64: call {{.*}} @"\01??0Q@@QEAA@H@Z"({{.*}}* %[[ARG3]], i32 3)
+  // WIN64: call {{.*}} @"\01??0Q@@QEAA@H@Z"({{.*}}* %[[ARG1]], i32 1)
+  // WIN64: call {{.*}} @"\01??0Z@@QEAA@XZ"(
+  // WIN64: call {{.*}} @"\01??0A@inalloca_nonvirt@@QEAA@UQ@@H0$$QEAU2@@Z"(%{{.*}}, %{{.*}}* %[[ARG1]], i32 2, %{{.*}}* %[[ARG3]], %{{.*}} %[[TMP]])
+  // WIN64: call {{.*}} @"\01??0Z@@QEAA@XZ"(
+  // WIN64: call void @"\01??_DQ@@QEAA@XZ"({{.*}}* %[[TMP]])
+}
+
+namespace inalloca_virt {
+  struct A { A(Q, int, Q, Q&&); int n; };
+  struct B : Z, virtual A { Z z; using A::A; };
+  B b(1, 2, 3, 4);
+
+  // MSABI-LABEL: define internal void @"\01??__Eb@inalloca_virt@@YAXXZ"(
+
+  // On Win32, the inalloca call can't be forwarded so we force inlining.
+  // WIN32: %[[TMP:.*]] = alloca
+  // WIN32: call i8* @llvm.stacksave()
+  // WIN32: %[[ARGMEM:.*]] = alloca inalloca
+  // WIN32: call {{.*}} @"\01??0Q@@QAE@H@Z"(%{{.*}}* %[[TMP]], i32 4)
+  // WIN32: %[[ARG3:.*]] = getelementptr {{.*}} %[[ARGMEM]]
+  // WIN32: call {{.*}} @"\01??0Q@@QAE@H@Z"({{.*}}* %[[ARG3]], i32 3)
+  // WIN32: %[[ARG1:.*]] = getelementptr {{.*}} %[[ARGMEM]]
+  // WIN32: call {{.*}} @"\01??0Q@@QAE@H@Z"({{.*}}* %[[ARG1]], i32 1)
+  // FIXME: It's dumb to round-trip this though memory and generate a branch.
+  // WIN32: store i32 1, i32* %[[IS_MOST_DERIVED_ADDR:.*]]
+  // WIN32: %[[IS_MOST_DERIVED:.*]] = load i32, i32* %[[IS_MOST_DERIVED_ADDR]]
+  // WIN32: %[[IS_MOST_DERIVED_i1:.*]] = icmp ne i32 %[[IS_MOST_DERIVED]], 0
+  // WIN32: br i1 %[[IS_MOST_DERIVED_i1]]
+  //
+  // WIN32: store {{.*}} @"\01??_8B@inalloca_virt@@7B@"
+  // WIN32: %[[ARG2:.*]] = getelementptr {{.*}} %[[ARGMEM]]
+  // WIN32: store i32 2, i32* %[[ARG2]]
+  // WIN32: %[[ARG4:.*]] = getelementptr {{.*}} %[[ARGMEM]]
+  // WIN32: store {{.*}}* %[[TMP]], {{.*}}** %[[ARG4]]
+  // WIN32: call {{.*}} @"\01??0A@inalloca_virt@@QAE@UQ@@H0$$QAU2@@Z"(%{{[^,]*}}, <{{.*}}>* inalloca %[[ARGMEM]])
+  // WIN32: call void @llvm.stackrestore(
+  // WIN32: br
+  //
+  // Note that if we jumped directly to here we would fail to stackrestore and
+  // destroy the parameters, but that's not actually possible.
+  // WIN32: call {{.*}} @"\01??0Z@@QAE@XZ"(
+  // WIN32: call {{.*}} @"\01??0Z@@QAE@XZ"(
+  // WIN32: call {{.*}} @"\01??_DQ@@QAE@XZ"(
+
+  // On Win64, the Q arguments would be destroyed in the callee. We don't yet
+  // support that in the non-inlined case, so we force inlining.
+  // WIN64: %[[TMP:.*]] = alloca
+  // WIN64: %[[ARG3:.*]] = alloca
+  // WIN64: %[[ARG1:.*]] = alloca
+  // WIN64: call {{.*}} @"\01??0Q@@QEAA@H@Z"({{.*}}* %[[TMP]], i32 4)
+  // WIN64: call {{.*}} @"\01??0Q@@QEAA@H@Z"({{.*}}* %[[ARG3]], i32 3)
+  // WIN64: call {{.*}} @"\01??0Q@@QEAA@H@Z"({{.*}}* %[[ARG1]], i32 1)
+  // WIN64: br i1
+  // WIN64: call {{.*}} @"\01??0A@inalloca_virt@@QEAA@UQ@@H0$$QEAU2@@Z"(%{{.*}}, %{{.*}}* %[[ARG1]], i32 2, %{{.*}}* %[[ARG3]], %{{.*}} %[[TMP]])
+  // WIN64: br
+  // WIN64: call {{.*}} @"\01??0Z@@QEAA@XZ"(
+  // WIN64: call {{.*}} @"\01??0Z@@QEAA@XZ"(
+  // WIN64: call void @"\01??_DQ@@QEAA@XZ"({{.*}}* %[[TMP]])
+
+  struct C : B { using B::B; };
+  C c(1, 2, 3, 4);
+  // ITANIUM-LABEL: define linkonce_odr void @_ZN13inalloca_virt1CD1Ev(
+
+  // MSABI-LABEL: define internal void @"\01??__Ec@inalloca_virt@@YAXXZ"(
+
+  // On Win32, the inalloca call can't be forwarded so we force inlining.
+  // WIN32: %[[TMP:.*]] = alloca
+  // WIN32: call i8* @llvm.stacksave()
+  // WIN32: %[[ARGMEM:.*]] = alloca inalloca
+  // WIN32: call {{.*}} @"\01??0Q@@QAE@H@Z"(%{{.*}}* %[[TMP]], i32 4)
+  // WIN32: %[[ARG3:.*]] = getelementptr {{.*}} %[[ARGMEM]]
+  // WIN32: call {{.*}} @"\01??0Q@@QAE@H@Z"({{.*}}* %[[ARG3]], i32 3)
+  // WIN32: %[[ARG1:.*]] = getelementptr {{.*}} %[[ARGMEM]]
+  // WIN32: call {{.*}} @"\01??0Q@@QAE@H@Z"({{.*}}* %[[ARG1]], i32 1)
+  // WIN32: store i32 1, i32* %[[IS_MOST_DERIVED_ADDR:.*]]
+  // WIN32: %[[IS_MOST_DERIVED:.*]] = load i32, i32* %[[IS_MOST_DERIVED_ADDR]]
+  // WIN32: %[[IS_MOST_DERIVED_i1:.*]] = icmp ne i32 %[[IS_MOST_DERIVED]], 0
+  // WIN32: br i1 %[[IS_MOST_DERIVED_i1]]
+  //
+  // WIN32: store {{.*}} @"\01??_8C@inalloca_virt@@7B@"
+  // WIN32: %[[ARG2:.*]] = getelementptr {{.*}} %[[ARGMEM]]
+  // WIN32: store i32 2, i32* %[[ARG2]]
+  // WIN32: %[[ARG4:.*]] = getelementptr {{.*}} %[[ARGMEM]]
+  // WIN32: store {{.*}}* %[[TMP]], {{.*}}** %[[ARG4]]
+  // WIN32: call {{.*}} @"\01??0A@inalloca_virt@@QAE@UQ@@H0$$QAU2@@Z"(%{{[^,]*}}, <{{.*}}>* inalloca %[[ARGMEM]])
+  // WIN32: call void @llvm.stackrestore(
+  // WIN32: br
+  //
+  // WIN32: store i32 0, i32* %[[IS_MOST_DERIVED_ADDR:.*]]
+  // WIN32: %[[IS_MOST_DERIVED:.*]] = load i32, i32* %[[IS_MOST_DERIVED_ADDR]]
+  // WIN32: %[[IS_MOST_DERIVED_i1:.*]] = icmp ne i32 %[[IS_MOST_DERIVED]], 0
+  // WIN32: br i1 %[[IS_MOST_DERIVED_i1]]
+  //
+  // Note: this block is unreachable.
+  // WIN32: store {{.*}} @"\01??_8B@inalloca_virt@@7B@"
+  // WIN32: br
+  //
+  // WIN32: call {{.*}} @"\01??0Z@@QAE@XZ"(
+  // WIN32: call {{.*}} @"\01??0Z@@QAE@XZ"(
+  // WIN32: call {{.*}} @"\01??_DQ@@QAE@XZ"(
+
+  // On Win64, the Q arguments would be destroyed in the callee. We don't yet
+  // support that in the non-inlined case, so we force inlining.
+  // WIN64: %[[TMP:.*]] = alloca
+  // WIN64: %[[ARG3:.*]] = alloca
+  // WIN64: %[[ARG1:.*]] = alloca
+  // WIN64: call {{.*}} @"\01??0Q@@QEAA@H@Z"({{.*}}* %[[TMP]], i32 4)
+  // WIN64: call {{.*}} @"\01??0Q@@QEAA@H@Z"({{.*}}* %[[ARG3]], i32 3)
+  // WIN64: call {{.*}} @"\01??0Q@@QEAA@H@Z"({{.*}}* %[[ARG1]], i32 1)
+  // WIN64: br i1
+  // WIN64: store {{.*}} @"\01??_8C@inalloca_virt@@7B@"
+  // WIN64: call {{.*}} @"\01??0A@inalloca_virt@@QEAA@UQ@@H0$$QEAU2@@Z"(%{{.*}}, %{{.*}}* %[[ARG1]], i32 2, %{{.*}}* %[[ARG3]], %{{.*}} %[[TMP]])
+  // WIN64: br
+  // WIN64: br i1
+  // (Unreachable block)
+  // WIN64: store {{.*}} @"\01??_8B@inalloca_virt@@7B@"
+  // WIN64: br
+  // WIN64: call {{.*}} @"\01??0Z@@QEAA@XZ"(
+  // WIN64: call {{.*}} @"\01??0Z@@QEAA@XZ"(
+  // WIN64: call void @"\01??_DQ@@QEAA@XZ"({{.*}}* %[[TMP]])
+}
+
+namespace inline_nonvirt {
+  struct A { A(Q, int, Q, Q&&, ...); int n; };
+  struct B : Z, A { Z z; using A::A; };
+  B b(1, 2, 3, 4, 5, 6);
+  // Inlined all the way down to the A ctor.
+  // ITANIUM-LABEL: define {{.*}} @__cxx_global_var_init
+  // ITANIUM: call void @_ZN1QC1Ei({{.*}}, i32 1)
+  // ITANIUM: call void @_ZN1QC1Ei({{.*}}, i32 3)
+  // ITANIUM: call void @_ZN1QC1Ei({{.*}}, i32 4)
+  // ITANIUM: %[[Z_BASE:.*]] = bitcast %{{.*}}* %[[THIS:.*]] to
+  // ITANIUM: call void @_ZN1ZC2Ev(
+  // ITANIUM: %[[B_CAST:.*]] = bitcast {{.*}} %[[THIS]]
+  // ITANIUM: %[[A_CAST:.*]] = getelementptr {{.*}} %[[B_CAST]], i{{32|64}} 4
+  // ITANIUM: %[[A:.*]] = bitcast {{.*}} %[[A_CAST]]
+  // ITANIUM: call void ({{.*}}, ...) @_ZN14inline_nonvirt1AC2E1QiS1_OS1_z(%{{.*}}* %[[A]], {{.*}}, i32 2, {{.*}}, {{.*}}, i32 5, i32 6)
+  // ITANIUM: %[[Z_MEMBER:.*]] = getelementptr {{.*}} %[[THIS]], i32 0, i32 2
+  // ITANIUM: call void @_ZN1ZC1Ev({{.*}} %[[Z_MEMBER]])
+  // ITANIUM: call void @_ZN1QD1Ev(
+  // ITANIUM: call void @_ZN1QD1Ev(
+  // ITANIUM: call void @_ZN1QD1Ev(
+
+  struct C : B { using B::B; };
+  C c(1, 2, 3, 4, 5, 6);
+  // Inlined all the way down to the A ctor.
+  // ITANIUM-LABEL: define {{.*}} @__cxx_global_var_init
+  // ITANIUM: call void @_ZN1QC1Ei({{.*}}, i32 1)
+  // ITANIUM: call void @_ZN1QC1Ei({{.*}}, i32 3)
+  // ITANIUM: call void @_ZN1QC1Ei({{.*}}, i32 4)
+  // ITANIUM: %[[Z_BASE:.*]] = bitcast %{{.*}}* %[[THIS:.*]] to
+  // ITANIUM: call void @_ZN1ZC2Ev(
+  // ITANIUM: %[[B_CAST:.*]] = bitcast {{.*}} %[[THIS]]
+  // ITANIUM: %[[A_CAST:.*]] = getelementptr {{.*}} %[[B_CAST]], i{{32|64}} 4
+  // ITANIUM: %[[A:.*]] = bitcast {{.*}} %[[A_CAST]]
+  // ITANIUM: call void ({{.*}}, ...) @_ZN14inline_nonvirt1AC2E1QiS1_OS1_z(%{{.*}}* %[[A]], {{.*}}, i32 2, {{.*}}, {{.*}}, i32 5, i32 6)
+  // ITANIUM: %[[Z_MEMBER:.*]] = getelementptr {{.*}} %{{.*}}, i32 0, i32 2
+  // ITANIUM: call void @_ZN1ZC1Ev({{.*}} %[[Z_MEMBER]])
+  // ITANIUM: call void @_ZN1QD1Ev(
+  // ITANIUM: call void @_ZN1QD1Ev(
+  // ITANIUM: call void @_ZN1QD1Ev(
+}
+
+namespace inline_virt {
+  struct A { A(Q, int, Q, Q&&, ...); int n; };
+  struct B : Z, virtual A { Z z; using A::A; };
+  B b(1, 2, 3, 4, 5, 6);
+  // Inlined all the way down to the A ctor.
+  // ITANIUM-LABEL: define {{.*}} @__cxx_global_var_init
+  // ITANIUM: call void @_ZN1QC1Ei({{.*}}, i32 1)
+  // ITANIUM: call void @_ZN1QC1Ei({{.*}}, i32 3)
+  // ITANIUM: call void @_ZN1QC1Ei({{.*}}, i32 4)
+  // ITANIUM: %[[B_CAST:.*]] = bitcast {{.*}} %[[THIS:.*]]
+  // ITANIUM: %[[A_CAST:.*]] = getelementptr {{.*}} %[[B_CAST]], i{{32|64}} {{12|16}}
+  // ITANIUM: %[[A:.*]] = bitcast {{.*}} %[[A_CAST]]
+  // ITANIUM: call void ({{.*}}, ...) @_ZN11inline_virt1AC2E1QiS1_OS1_z(%{{.*}}* %[[A]], {{.*}}, i32 2, {{.*}}, {{.*}}, i32 5, i32 6)
+  // ITANIUM: call void @_ZN1ZC2Ev(
+  // ITANIUM: call void @_ZN1ZC1Ev(
+  // ITANIUM: call void @_ZN1QD1Ev(
+  // ITANIUM: call void @_ZN1QD1Ev(
+  // ITANIUM: call void @_ZN1QD1Ev(
+
+  struct C : B { using B::B; };
+  C c(1, 2, 3, 4, 5, 6);
+  // Inlined all the way down to the A ctor, except that we can just call the
+  // B base inheriting constructor to construct that portion (it doesn't need
+  // the forwarded arguments).
+  // ITANIUM-LABEL: define {{.*}} @__cxx_global_var_init
+  // ITANIUM: call void @_ZN1QC1Ei({{.*}}, i32 1)
+  // ITANIUM: call void @_ZN1QC1Ei({{.*}}, i32 3)
+  // ITANIUM: call void @_ZN1QC1Ei({{.*}}, i32 4)
+  // ITANIUM: %[[B_CAST:.*]] = bitcast {{.*}} %[[THIS:.*]]
+  // ITANIUM: %[[A_CAST:.*]] = getelementptr {{.*}} %[[B_CAST]], i{{32|64}} {{12|16}}
+  // ITANIUM: %[[A:.*]] = bitcast {{.*}} %[[A_CAST]]
+  // ITANIUM: call void ({{.*}}, ...) @_ZN11inline_virt1AC2E1QiS1_OS1_z(%{{.*}}* %[[A]], {{.*}}, i32 2, {{.*}}, {{.*}}, i32 5, i32 6)
+  // ITANIUM: call void @_ZN11inline_virt1BCI2NS_1AEE1QiS1_OS1_z({{[^,]*}}, i8** getelementptr inbounds ([2 x i8*], [2 x i8*]* @_ZTTN11inline_virt1CE, i64 0, i64 1))
+  // ITANIUM: store {{.*}} @_ZTVN11inline_virt1CE
+  // ITANIUM: call void @_ZN1QD1Ev(
+  // ITANIUM: call void @_ZN1QD1Ev(
+  // ITANIUM: call void @_ZN1QD1Ev(
+
+  // B base object inheriting constructor does not get passed arguments.
+  // ITANIUM-LABEL: define linkonce_odr void @_ZN11inline_virt1BCI2NS_1AEE1QiS1_OS1_z(
+  // ITANIUM-NOT: call
+  // ITANIUM: call void @_ZN1ZC2Ev(
+  // ITANIUM-NOT: call
+  // VTT -> vtable
+  // ITANIUM: store
+  // ITANIUM-NOT: call
+  // ITANIUM: call void @_ZN1ZC1Ev(
+  // ITANIUM-NOT: call
+  // ITANIUM: }
+}
+
+// ITANIUM-LABEL: define linkonce_odr void @_ZN1BCI21AEi(
+// ITANIUM: call void @_ZN1AC2Ei(
+
+// ITANIUM-LABEL: define linkonce_odr void @_ZN1DCI21CIiEET_(
+// ITANIUM: call void @_ZN1CC2IiEET_(
+
+// ITANIUM-LABEL: define linkonce_odr void @_ZN17noninline_nonvirt1BCI2NS_1AEEiO1QPvU17pass_object_size0(
+// ITANIUM: call void @_ZN1ZC2Ev(
+// ITANIUM: call void @_ZN17noninline_nonvirt1AC2EiO1QPvU17pass_object_size0(
+
+// ITANIUM-LABEL: define linkonce_odr void @_ZN17noninline_nonvirt1CCI2NS_1AEEiO1QPvU17pass_object_size0(
+// ITANIUM: call void @_ZN17noninline_nonvirt1BCI2NS_1AEEiO1QPvU17pass_object_size0(
diff --git a/test/CodeGenCXX/init-invariant.cpp b/test/CodeGenCXX/init-invariant.cpp
index 7f348257f2630..71eb7dd73492e 100644
--- a/test/CodeGenCXX/init-invariant.cpp
+++ b/test/CodeGenCXX/init-invariant.cpp
@@ -56,5 +56,5 @@ void e() {
 
 // CHECK-LABEL: define void @_Z1ev(
 // CHECK: call void @_ZN1AC1Ev(%struct.A* nonnull @_ZZ1evE1a)
-// CHECK: call {{.*}}@llvm.invariant.start(i64 4, i8* nonnull bitcast ({{.*}} @_ZZ1evE1a to i8*))
+// CHECK: call {{.*}}@llvm.invariant.start(i64 4, i8* {{.*}}bitcast ({{.*}} @_ZZ1evE1a to i8*))
 // CHECK-NOT: llvm.invariant.end
diff --git a/test/CodeGenCXX/inline-hint.cpp b/test/CodeGenCXX/inline-hint.cpp
new file mode 100644
index 0000000000000..9c14032f9f698
--- /dev/null
+++ b/test/CodeGenCXX/inline-hint.cpp
@@ -0,0 +1,96 @@
+// RUN: %clang_cc1 %s -std=c++11 -triple=x86_64-linux -finline-functions -emit-llvm -disable-llvm-optzns -o - | FileCheck %s --check-prefix=CHECK --check-prefix=SUITABLE
+// RUN: %clang_cc1 %s -std=c++11 -triple=x86_64-linux -finline-hint-functions -emit-llvm -disable-llvm-optzns -o - | FileCheck %s --check-prefix=CHECK --check-prefix=HINTED
+// RUN: %clang_cc1 %s -std=c++11 -triple=x86_64-linux -fno-inline -emit-llvm -disable-llvm-optzns -o - | FileCheck %s --check-prefix=CHECK --check-prefix=NOINLINE
+
+// Force non-trivial implicit constructors/destructors/operators for B by having explicit ones for A
+struct A {
+  A() {}
+  A(const A&) {}
+  A& operator=(const A&) { return *this; }
+  ~A() {}
+};
+
+struct B {
+  A member;
+  int implicitFunction(int a) { return a + a; }
+  inline int explicitFunction(int a);
+  int noHintFunction(int a);
+  __attribute__((optnone)) int optNoneFunction(int a) { return a + a; }
+  template<int N> int implicitTplFunction(int a) { return N + a; }
+  template<int N> inline int explicitTplFunction(int a) { return N + a; }
+  template<int N> int noHintTplFunction(int a);
+  template<int N> int explicitRedeclTplFunction(int a);
+};
+
+int B::explicitFunction(int a) { return a + a; }
+// CHECK: @_ZN1B14noHintFunctionEi({{.*}}) [[NOHINT_ATTR:#[0-9]+]]
+int B::noHintFunction(int a) { return a + a; }
+
+// CHECK: @_ZN1B19implicitTplFunctionILi0EEEii({{.*}}) [[NOHINT_ATTR]]
+template<> int B::implicitTplFunction<0>(int a) { return a + a; }
+// CHECK: @_ZN1B19explicitTplFunctionILi0EEEii({{.*}}) [[NOHINT_ATTR]]
+template<> int B::explicitTplFunction<0>(int a) { return a + a; }
+// CHECK: @_ZN1B17noHintTplFunctionILi0EEEii({{.*}}) [[NOHINT_ATTR]]
+template<> int B::noHintTplFunction<0>(int a) { return a + a; }
+template<> inline int B::implicitTplFunction<1>(int a) { return a; }
+template<> inline int B::explicitTplFunction<1>(int a) { return a; }
+template<> inline int B::noHintTplFunction<1>(int a) { return a; }
+template<int N> int B::noHintTplFunction(int a) { return N + a; }
+template<int N> inline int B::explicitRedeclTplFunction(int a) { return N + a; }
+
+constexpr int constexprFunction(int a) { return a + a; }
+
+void foo()
+{
+// CHECK: @_ZN1BC1Ev({{.*}}) unnamed_addr [[IMPLICIT_CONSTR_ATTR:#[0-9]+]]
+  B b1;
+// CHECK: @_ZN1BC1ERKS_({{.*}}) unnamed_addr [[IMPLICIT_CONSTR_ATTR]]
+  B b2(b1);
+// CHECK: @_ZN1BaSERKS_({{.*}}) [[IMPLICIT_CONSTR_ATTR]]
+  b2 = b1;
+// CHECK: @_ZN1B16implicitFunctionEi({{.*}}) [[IMPLICIT_ATTR:#[0-9]+]]
+  b1.implicitFunction(1);
+// CHECK: @_ZN1B16explicitFunctionEi({{.*}}) [[EXPLICIT_ATTR:#[0-9]+]]
+  b1.explicitFunction(2);
+  b1.noHintFunction(3);
+// CHECK: @_ZN1B15optNoneFunctionEi({{.*}}) [[OPTNONE_ATTR:#[0-9]+]]
+  b1.optNoneFunction(4);
+// CHECK: @_Z17constexprFunctioni({{.*}}) [[IMPLICIT_ATTR]]
+  constexprFunction(5);
+  b1.implicitTplFunction<0>(6);
+// CHECK: @_ZN1B19implicitTplFunctionILi1EEEii({{.*}}) [[EXPLICIT_ATTR]]
+  b1.implicitTplFunction<1>(7);
+// CHECK: @_ZN1B19implicitTplFunctionILi2EEEii({{.*}}) [[IMPLICIT_ATTR]]
+  b1.implicitTplFunction<2>(8);
+  b1.explicitTplFunction<0>(9);
+// CHECK: @_ZN1B19explicitTplFunctionILi1EEEii({{.*}}) [[EXPLICIT_ATTR]]
+  b1.explicitTplFunction<1>(10);
+// CHECK: @_ZN1B19explicitTplFunctionILi2EEEii({{.*}}) [[EXPLICIT_ATTR]]
+  b1.explicitTplFunction<2>(11);
+  b1.noHintTplFunction<0>(12);
+// CHECK: @_ZN1B17noHintTplFunctionILi1EEEii({{.*}}) [[EXPLICIT_ATTR]]
+  b1.noHintTplFunction<1>(13);
+// CHECK: @_ZN1B17noHintTplFunctionILi2EEEii({{.*}}) [[NOHINT_ATTR]]
+  b1.noHintTplFunction<2>(14);
+// CHECK: @_ZN1B25explicitRedeclTplFunctionILi2EEEii({{.*}}) [[EXPLICIT_ATTR]]
+  b1.explicitRedeclTplFunction<2>(15);
+// CHECK: @_ZN1BD2Ev({{.*}}) unnamed_addr [[IMPLICIT_CONSTR_ATTR]]
+}
+
+// SUITABLE-NOT: attributes [[NOHINT_ATTR]] = { {{.*}}noinline{{.*}} }
+//   HINTED-DAG: attributes [[NOHINT_ATTR]] = { noinline{{.*}} }
+// NOINLINE-DAG: attributes [[NOHINT_ATTR]] = { noinline{{.*}} }
+
+// SUITABLE-NOT: attributes [[IMPLICIT_ATTR]] = { {{.*}}noinline{{.*}} }
+//   HINTED-NOT: attributes [[IMPLICIT_ATTR]] = { {{.*}}noinline{{.*}} }
+// NOINLINE-DAG: attributes [[IMPLICIT_ATTR]] = { noinline{{.*}} }
+
+// SUITABLE-NOT: attributes [[IMPLICIT_CONSTR_ATTR]] = { {{.*}}noinline{{.*}} }
+//   HINTED-NOT: attributes [[IMPLICIT_ATTR]] = { {{.*}}noinline{{.*}} }
+// NOINLINE-DAG: attributes [[IMPLICIT_CONSTR_ATTR]] = { noinline{{.*}} }
+
+// SUITABLE-NOT: attributes [[EXPLICIT_ATTR]] = { {{.*}}noinline{{.*}} }
+//   HINTED-NOT: attributes [[IMPLICIT_ATTR]] = { {{.*}}noinline{{.*}} }
+// NOINLINE-DAG: attributes [[EXPLICIT_ATTR]] = { noinline{{.*}} }
+
+// CHECK-DAG: attributes [[OPTNONE_ATTR]] = { noinline{{.*}} }
diff --git a/test/CodeGenCXX/lambda-expressions.cpp b/test/CodeGenCXX/lambda-expressions.cpp
index 4df44f4c5f7f8..f59d360314e4b 100644
--- a/test/CodeGenCXX/lambda-expressions.cpp
+++ b/test/CodeGenCXX/lambda-expressions.cpp
@@ -12,12 +12,16 @@ extern "C" auto cvar = []{};
 
 // CHECK-LABEL: define i32 @_Z9ARBSizeOfi(i32
 int ARBSizeOf(int n) {
-  typedef double (T)[8][n];
-  using TT = double [8][n];
+  typedef double(T)[8][n];
+  using TT = double[8][n];
   return [&]() -> int {
     typedef double(T1)[8][n];
     using TT1 = double[8][n];
-    return sizeof(T) + sizeof(T1) + sizeof(TT) + sizeof(TT1);
+    return [&n]() -> int {
+      typedef double(T2)[8][n];
+      using TT2 = double[8][n];
+      return sizeof(T) + sizeof(T1) + sizeof(T2) + sizeof(TT) + sizeof(TT1) + sizeof(TT2);
+    }();
   }();
 }
 
diff --git a/test/CodeGenCXX/linetable-virtual-variadic.cpp b/test/CodeGenCXX/linetable-virtual-variadic.cpp
index 8d1bf47814f86..6f966416867a7 100644
--- a/test/CodeGenCXX/linetable-virtual-variadic.cpp
+++ b/test/CodeGenCXX/linetable-virtual-variadic.cpp
@@ -15,9 +15,5 @@ void Derived::VariadicFunction(...) { }
 // CHECK-LABEL: define void @_ZT{{.+}}N7Derived16VariadicFunctionEz(
 // CHECK: ret void, !dbg ![[LOC:[0-9]+]]
 //
-// CHECK: !llvm.dbg.cu = !{![[CU:[0-9]+]]}
-//
-// CHECK: ![[CU]] = distinct !DICompileUnit({{.*}} subprograms: ![[SPs:[0-9]+]]
-// CHECK: ![[SPs]] = !{![[SP]]}
 // CHECK: ![[SP]] = distinct !DISubprogram(name: "VariadicFunction"
 // CHECK: ![[LOC]] = !DILocation({{.*}}scope: ![[SP]])
diff --git a/test/CodeGenCXX/lto-visibility-inference.cpp b/test/CodeGenCXX/lto-visibility-inference.cpp
new file mode 100644
index 0000000000000..8e57ef5e0b892
--- /dev/null
+++ b/test/CodeGenCXX/lto-visibility-inference.cpp
@@ -0,0 +1,107 @@
+// RUN: %clang_cc1 -flto -triple x86_64-unknown-linux -std=c++11 -fms-extensions -fvisibility hidden -fwhole-program-vtables -emit-llvm -o - %s | FileCheck --check-prefix=ITANIUM %s
+// RUN: %clang_cc1 -flto -triple x86_64-pc-windows-msvc -std=c++11 -fms-extensions -fwhole-program-vtables -emit-llvm -o - %s | FileCheck --check-prefix=MS --check-prefix=MS-STD %s
+// RUN: %clang_cc1 -flto -triple x86_64-pc-windows-msvc -std=c++11 -fms-extensions -fwhole-program-vtables -flto-visibility-public-std -emit-llvm -o - %s | FileCheck --check-prefix=MS --check-prefix=MS-NOSTD %s
+
+struct C1 {
+  virtual void f();
+};
+
+struct __attribute__((visibility("default"))) C2 {
+  virtual void f();
+};
+
+struct __declspec(dllexport) C3 {
+  virtual void f();
+};
+
+struct __declspec(dllimport) C4 {
+  virtual void f();
+};
+
+struct [[clang::lto_visibility_public]] C5 {
+  virtual void f();
+};
+
+struct __declspec(uuid("00000000-0000-0000-0000-000000000000")) C6 {
+  virtual void f();
+};
+
+namespace std {
+
+struct C7 {
+  virtual void f();
+  struct C8 {
+    virtual void f();
+  };
+};
+
+}
+
+extern "C++" {
+
+namespace stdext {
+
+struct C9 {
+  virtual void f();
+};
+
+}
+
+}
+
+namespace other {
+
+struct C10 {
+  virtual void f();
+};
+
+}
+
+namespace {
+
+struct C11 {
+  virtual void f();
+};
+
+}
+
+void f(C1 *c1, C2 *c2, C3 *c3, C4 *c4, C5 *c5, C6 *c6, std::C7 *c7,
+       std::C7::C8 *c8, stdext::C9 *c9, other::C10 *c10) {
+  // ITANIUM: type.test{{.*}}!"_ZTS2C1"
+  // MS: type.test{{.*}}!"?AUC1@@"
+  c1->f();
+  // ITANIUM-NOT: type.test{{.*}}!"_ZTS2C2"
+  // MS: type.test{{.*}}!"?AUC2@@"
+  c2->f();
+  // ITANIUM: type.test{{.*}}!"_ZTS2C3"
+  // MS-NOT: type.test{{.*}}!"?AUC3@@"
+  c3->f();
+  // ITANIUM: type.test{{.*}}!"_ZTS2C4"
+  // MS-NOT: type.test{{.*}}!"?AUC4@@"
+  c4->f();
+  // ITANIUM-NOT: type.test{{.*}}!"_ZTS2C5"
+  // MS-NOT: type.test{{.*}}!"?AUC5@@"
+  c5->f();
+  // ITANIUM-NOT: type.test{{.*}}!"_ZTS2C6"
+  // MS-NOT: type.test{{.*}}!"?AUC6@@"
+  c6->f();
+  // ITANIUM: type.test{{.*}}!"_ZTSSt2C7"
+  // MS-STD: type.test{{.*}}!"?AUC7@std@@"
+  // MS-NOSTD-NOT: type.test{{.*}}!"?AUC7@std@@"
+  c7->f();
+  // ITANIUM: type.test{{.*}}!"_ZTSNSt2C72C8E"
+  // MS-STD: type.test{{.*}}!"?AUC8@C7@std@@"
+  // MS-NOSTD-NOT: type.test{{.*}}!"?AUC8@C7@std@@"
+  c8->f();
+  // ITANIUM: type.test{{.*}}!"_ZTSN6stdext2C9E"
+  // MS-STD: type.test{{.*}}!"?AUC9@stdext@@"
+  // MS-NOSTD-NOT: type.test{{.*}}!"?AUC9@stdext@@"
+  c9->f();
+  // ITANIUM: type.test{{.*}}!"_ZTSN5other3C10E"
+  // MS: type.test{{.*}}!"?AUC10@other@@"
+  c10->f();
+  // ITANIUM: type.test{{.*}}!{{[0-9]}}
+  // MS: type.test{{.*}}!{{[0-9]}}
+  C11 *c11;
+  c11->f();
+}
diff --git a/test/CodeGenCXX/mangle-abi-tag.cpp b/test/CodeGenCXX/mangle-abi-tag.cpp
new file mode 100644
index 0000000000000..385a16f26a514
--- /dev/null
+++ b/test/CodeGenCXX/mangle-abi-tag.cpp
@@ -0,0 +1,205 @@
+// RUN: %clang_cc1 %s -emit-llvm -triple %itanium_abi_triple -std=c++11 -o - | FileCheck %s
+// RUN: %clang_cc1 %s -emit-llvm -triple i686-linux-gnu -std=c++11 -o - | FileCheck %s
+// RUN: %clang_cc1 %s -emit-llvm -triple x86_64-linux-gnu -std=c++11 -o - | FileCheck %s
+// RUN: %clang_cc1 %s -emit-llvm -triple powerpc64le-unknown-linux-gnu -std=c++11 -o - | FileCheck %s
+
+struct __attribute__((abi_tag("A", "B"))) A { };
+
+struct B: A { };
+
+template<class T>
+
+struct C {
+};
+
+struct D { A* p; };
+
+template<class T>
+struct __attribute__((abi_tag("C", "D"))) E {
+};
+
+struct __attribute__((abi_tag("A", "B"))) F { };
+
+A a1;
+// CHECK-DAG: @_Z2a1B1AB1B =
+
+__attribute__((abi_tag("C", "D")))
+A a2;
+// CHECK-DAG: @_Z2a2B1AB1BB1CB1D =
+
+B a3;
+// CHECK-DAG: @a3 =
+
+C<A> a4;
+// CHECK-DAG: @_Z2a4B1AB1B =
+
+D a5;
+// CHECK-DAG: @a5 =
+
+E<int> a6;
+// CHECK-DAG: @_Z2a6B1CB1D =
+
+E<A> a7;
+// CHECK-DAG: @_Z2a7B1AB1BB1CB1D =
+
+template<>
+struct E<float> {
+  static float a8;
+};
+float E<float>::a8;
+// CHECK-DAG: @_ZN1EB1CB1DIfE2a8E =
+
+template<>
+struct E<F> {
+  static bool a9;
+};
+bool E<F>::a9;
+// CHECK-DAG: @_ZN1EB1CB1DI1FB1AB1BE2a9E =
+
+struct __attribute__((abi_tag("A", "B"))) A10 {
+  virtual ~A10() {}
+} a10;
+// vtable
+// CHECK-DAG: @_ZTV3A10B1AB1B =
+// typeinfo
+// CHECK-DAG: @_ZTI3A10B1AB1B =
+
+struct __attribute__((abi_tag("A"))) B11 {
+  static A10 b;
+};
+A10 B11::b;
+// B11[abi:A]::b[abi:B]
+// CHECK-DAG: @_ZN3B11B1A1bB1BE =
+
+__attribute__ ((abi_tag("C", "D")))
+void* f1() {
+  return 0;
+}
+// CHECK-DAG: define {{.*}} @_Z2f1B1CB1Dv(
+
+__attribute__ ((abi_tag("C", "D")))
+A* f2() {
+  return 0;
+}
+// CHECK-DAG: define {{.*}} @_Z2f2B1AB1BB1CB1Dv(
+
+B* f3() {
+  return 0;
+}
+// CHECK-DAG: define {{.*}} @_Z2f3v(
+
+C<A>* f4() {
+  return 0;
+}
+// CHECK-DAG: define {{.*}} @_Z2f4B1AB1Bv(
+
+D* f5() {
+  return 0;
+}
+// CHECK-DAG: define {{.*}} @_Z2f5v(
+
+E<char>* f6() {
+  return 0;
+}
+// CHECK-DAG: define {{.*}} @_Z2f6B1CB1Dv(
+
+E<A>* f7() {
+  return 0;
+}
+// CHECK-DAG: define {{.*}} @_Z2f7B1AB1BB1CB1Dv(
+
+void f8(E<A>*) {
+}
+// CHECK-DAG: define {{.*}} @_Z2f8P1EB1CB1DI1AB1AB1BE(
+
+inline namespace Names1 __attribute__((__abi_tag__)) {
+    class C1 {};
+}
+C1 f9() { return C1(); }
+// CHECK-DAG: @_Z2f9B6Names1v(
+
+inline namespace Names2 __attribute__((__abi_tag__("Tag1", "Tag2"))) {
+    class C2 {};
+}
+C2 f10() { return C2(); }
+// CHECK-DAG: @_Z3f10B4Tag1B4Tag2v(
+
+void __attribute__((abi_tag("A"))) f11(A) {}
+// f11[abi:A](A[abi:A][abi:B])
+// CHECK-DAG: define {{.*}} @_Z3f11B1A1AB1AB1B(
+
+A f12(A) { return A(); }
+// f12(A[abi:A][abi:B])
+// CHECK-DAG: define {{.*}} @_Z3f121AB1AB1B(
+
+inline void f13() {
+  struct L {
+    static E<int>* foo() {
+      static A10 a;
+      return 0;
+    }
+  };
+  L::foo();
+}
+void f13_test() {
+  f13();
+}
+// f13()::L::foo[abi:C][abi:D]()
+// CHECK-DAG: define linkonce_odr %struct.E* @_ZZ3f13vEN1L3fooB1CB1DEv(
+
+// f13()::L::foo[abi:C][abi:D]()::a[abi:A][abi:B]
+// CHECK-DAG: @_ZZZ3f13vEN1L3fooB1CB1DEvE1aB1AB1B =
+
+// guard variable for f13()::L::foo[abi:C][abi:D]()::a[abi:A][abi:B]
+// CHECK-DAG: @_ZGVZZ3f13vEN1L3fooB1CB1DEvE1aB1AB1B =
+
+struct __attribute__((abi_tag("TAG"))) A14 {
+  A14 f14();
+};
+A14 A14::f14() {
+  return A14();
+}
+// A14[abi:TAG]::f14()
+// CHECK-DAG: define {{.+}} @_ZN3A14B3TAG3f14Ev(
+
+template<class T>
+T f15() {
+  return T();
+}
+void f15_test() {
+  f15<A14>();
+}
+// A14[abi:TAG] f15<A14[abi:TAG]>()
+// CHECK-DAG: define linkonce_odr {{.+}} @_Z3f15I3A14B3TAGET_v(
+
+template<class T>
+A14 f16() {
+  return A14();
+}
+void f16_test() {
+  f16<int>();
+}
+// A14[abi:TAG] f16<int>()
+// CHECK-DAG: define linkonce_odr {{.+}} @_Z3f16IiE3A14B3TAGv(
+
+template<class T>
+struct __attribute__((abi_tag("TAG"))) A17 {
+  A17 operator+(const A17& a) {
+    return a;
+  }
+};
+void f17_test() {
+  A17<int> a, b;
+  a + b;
+}
+// A17[abi:TAG]<int>::operator+(A17[abi:TAG]<int> const&)
+// CHECK-DAG: define linkonce_odr {{.+}} @_ZN3A17B3TAGIiEplERKS0_(
+
+struct A18 {
+  operator A() { return A(); }
+};
+void f18_test() {
+  A a = A18();
+}
+// A18::operator A[abi:A][abi:B]() but GCC adds the same tags twice!
+// CHECK-DAG: define linkonce_odr {{.+}} @_ZN3A18cv1AB1AB1BEv(
diff --git a/test/CodeGenCXX/mangle-address-space.cpp b/test/CodeGenCXX/mangle-address-space.cpp
index f18480de83dee..cd10384594ea9 100644
--- a/test/CodeGenCXX/mangle-address-space.cpp
+++ b/test/CodeGenCXX/mangle-address-space.cpp
@@ -10,3 +10,6 @@ typedef OpaqueType __attribute__((address_space(100))) * OpaqueTypePtr;
 
 // CHECK-LABEL: define {{.*}}void @_Z2f0PU5AS10010OpaqueType
 void f0(OpaqueTypePtr) { }
+
+// CHECK-LABEL: define {{.*}}void @_Z2f1PU3AS1Kc
+void f1(char __attribute__((address_space(1))) const *p) {}
+\ No newline at end of file
diff --git a/test/CodeGenCXX/mangle-ms-cxx11.cpp b/test/CodeGenCXX/mangle-ms-cxx11.cpp
index 999def87fc5ea..8e2577b03e267 100644
--- a/test/CodeGenCXX/mangle-ms-cxx11.cpp
+++ b/test/CodeGenCXX/mangle-ms-cxx11.cpp
@@ -166,14 +166,14 @@ inline int define_lambda() {
   static auto lambda = [] { static int local; ++local; return local; };
 // First, we have the static local variable of type "<lambda_1>" inside of
 // "define_lambda".
-// CHECK-DAG: @"\01?lambda@?1??define_lambda@@YAHXZ@4V<lambda_1>@?1@YAHXZ@A"
+// CHECK-DAG: @"\01?lambda@?1??define_lambda@@YAHXZ@4V<lambda_1>@?0??1@YAHXZ@A"
 // Next, we have the "operator()" for "<lambda_1>" which is inside of
 // "define_lambda".
-// CHECK-DAG: @"\01??R<lambda_1>@?define_lambda@@YAHXZ@QBEHXZ"
+// CHECK-DAG: @"\01??R<lambda_1>@?0??define_lambda@@YAHXZ@QBE@XZ"
 // Finally, we have the local which is inside of "<lambda_1>" which is inside of
 // "define_lambda". Hooray.
-// MSVC2013-DAG: @"\01?local@?2???R<lambda_1>@?define_lambda@@YAHXZ@QBEHXZ@4HA"
-// MSVC2015-DAG: @"\01?local@?1???R<lambda_1>@?define_lambda@@YAHXZ@QBEHXZ@4HA"
+// MSVC2013-DAG: @"\01?local@?2???R<lambda_1>@?0??define_lambda@@YAHXZ@QBE@XZ@4HA"
+// MSVC2015-DAG: @"\01?local@?1???R<lambda_1>@?0??define_lambda@@YAHXZ@QBE@XZ@4HA"
   return lambda();
 }
 
@@ -182,12 +182,12 @@ void use_lambda_arg(T) {}
 
 inline void call_with_lambda_arg1() {
   use_lambda_arg([]{});
-  // CHECK-DAG: @"\01??$use_lambda_arg@V<lambda_1>@?call_with_lambda_arg1@@YAXXZ@@@YAXV<lambda_1>@?call_with_lambda_arg1@@YAXXZ@@Z"
+  // CHECK-DAG: @"\01??$use_lambda_arg@V<lambda_1>@?0??call_with_lambda_arg1@@YAXXZ@@@YAXV<lambda_1>@?0??call_with_lambda_arg1@@YAXXZ@@Z"
 }
 
 inline void call_with_lambda_arg2() {
   use_lambda_arg([]{});
-  // CHECK-DAG: @"\01??$use_lambda_arg@V<lambda_1>@?call_with_lambda_arg2@@YAXXZ@@@YAXV<lambda_1>@?call_with_lambda_arg2@@YAXXZ@@Z"
+  // CHECK-DAG: @"\01??$use_lambda_arg@V<lambda_1>@?0??call_with_lambda_arg2@@YAXXZ@@@YAXV<lambda_1>@?0??call_with_lambda_arg2@@YAXXZ@@Z"
 }
 
 int call_lambda() {
@@ -286,3 +286,35 @@ static union {
 };
 // CHECK-DAG: @"\01??$f@T<unnamed-type-$S1>@PR18204@@@PR18204@@YAHPAT<unnamed-type-$S1>@0@@Z"
 }
+
+int PR26105() {
+  auto add = [](int x) { return ([x](int y) { return x + y; }); };
+  return add(3)(4);
+}
+// CHECK-DAG: @"\01??R<lambda_0>@?0??PR26105@@YAHXZ@QBE@H@Z"
+// CHECK-DAG: @"\01??R<lambda_1>@?0???R<lambda_0>@?0??PR26105@@YAHXZ@QBE@H@Z@QBE@H@Z"
+
+int __unaligned * unaligned_foo1() { return 0; }
+int __unaligned * __unaligned * unaligned_foo2() { return 0; }
+__unaligned int unaligned_foo3() { return 0; }
+void unaligned_foo4(int __unaligned *p1) {}
+void unaligned_foo5(int __unaligned * __restrict p1) {}
+template <typename T> T unaligned_foo6(T t) { return t; }
+void unaligned_foo7() { unaligned_foo6<int *>(0); unaligned_foo6<int __unaligned *>(0); }
+
+// CHECK-DAG: @"\01?unaligned_foo1@@YAPFAHXZ"
+// CHECK-DAG: @"\01?unaligned_foo2@@YAPFAPFAHXZ"
+// CHECK-DAG: @"\01?unaligned_foo3@@YAHXZ"
+// CHECK-DAG: @"\01?unaligned_foo4@@YAXPFAH@Z"
+// CHECK-DAG: @"\01?unaligned_foo5@@YAXPIFAH@Z"
+// CHECK-DAG: @"\01??$unaligned_foo6@PAH@@YAPAHPAH@Z"
+// CHECK-DAG: @"\01??$unaligned_foo6@PFAH@@YAPFAHPFAH@Z"
+
+// __unaligned qualifier for function types
+struct unaligned_foo8_S {
+    void unaligned_foo8() volatile __unaligned;
+};
+void unaligned_foo8_S::unaligned_foo8() volatile __unaligned {}
+
+// CHECK-DAG: @"\01?unaligned_foo8@unaligned_foo8_S@@QFCEXXZ"
+
diff --git a/test/CodeGenCXX/mangle-ms-cxx14.cpp b/test/CodeGenCXX/mangle-ms-cxx14.cpp
index 9d30c406c8c72..798a390aeaaa1 100644
--- a/test/CodeGenCXX/mangle-ms-cxx14.cpp
+++ b/test/CodeGenCXX/mangle-ms-cxx14.cpp
@@ -35,12 +35,12 @@ auto TemplateFuncionWithLocalLambda(T) {
   return LocalLambdaWithLocalType();
 }
 
-// MSVC2013-DAG: "\01?ValueFromTemplateFuncionWithLocalLambda@@3ULocalType@?2???R<lambda_1>@??$TemplateFuncionWithLocalLambda@H@@YA?A?<auto>@@H@Z@QBE?A?3@XZ@A"
-// MSVC2013-DAG: "\01?ValueFromTemplateFuncionWithLocalLambda@@3ULocalType@?2???R<lambda_1>@??$TemplateFuncionWithLocalLambda@H@@YA?A?<auto>@@H@Z@QBE?A?3@XZ@A"
-// MSVC2015-DAG: "\01?ValueFromTemplateFuncionWithLocalLambda@@3ULocalType@?1???R<lambda_1>@??$TemplateFuncionWithLocalLambda@H@@YA?A?<auto>@@H@Z@QBE?A?3@XZ@A"
-// MSVC2015-DAG: "\01?ValueFromTemplateFuncionWithLocalLambda@@3ULocalType@?1???R<lambda_1>@??$TemplateFuncionWithLocalLambda@H@@YA?A?<auto>@@H@Z@QBE?A?3@XZ@A"
+// MSVC2013-DAG: "\01?ValueFromTemplateFuncionWithLocalLambda@@3ULocalType@?2???R<lambda_1>@?0???$TemplateFuncionWithLocalLambda@H@@YA?A?<auto>@@H@Z@QBE?A?3@XZ@A"
+// MSVC2013-DAG: "\01?ValueFromTemplateFuncionWithLocalLambda@@3ULocalType@?2???R<lambda_1>@?0???$TemplateFuncionWithLocalLambda@H@@YA?A?<auto>@@H@Z@QBE?A?3@XZ@A"
+// MSVC2015-DAG: "\01?ValueFromTemplateFuncionWithLocalLambda@@3ULocalType@?1???R<lambda_1>@?0???$TemplateFuncionWithLocalLambda@H@@YA?A?<auto>@@H@Z@QBE?A?3@XZ@A"
+// MSVC2015-DAG: "\01?ValueFromTemplateFuncionWithLocalLambda@@3ULocalType@?1???R<lambda_1>@?0???$TemplateFuncionWithLocalLambda@H@@YA?A?<auto>@@H@Z@QBE?A?3@XZ@A"
 // CHECK-DAG: "\01??$TemplateFuncionWithLocalLambda@H@@YA?A?<auto>@@H@Z"
-// CHECK-DAG: "\01??R<lambda_1>@??$TemplateFuncionWithLocalLambda@H@@YA?A?<auto>@@H@Z@QBE?A?1@XZ"
+// CHECK-DAG: "\01??R<lambda_1>@?0???$TemplateFuncionWithLocalLambda@H@@YA?A?<auto>@@H@Z@QBE?A?1@XZ"
 auto ValueFromTemplateFuncionWithLocalLambda = TemplateFuncionWithLocalLambda(0);
 
 struct S;
@@ -55,3 +55,8 @@ struct Foo {};
 
 Foo<&x<int>, &x<int>> Zoo;
 // CHECK-DAG: "\01?Zoo@@3U?$Foo@$1??$x@H@@3HA$1?1@3HA@@A"
+
+template <typename T> T unaligned_x;
+extern auto test_unaligned() { return unaligned_x<int __unaligned *>; }
+// CHECK-DAG: "\01??$unaligned_x@PFAH@@3PFAHA"
+
diff --git a/test/CodeGenCXX/mangle-ms-md5.cpp b/test/CodeGenCXX/mangle-ms-md5.cpp
new file mode 100644
index 0000000000000..aef2683048328
--- /dev/null
+++ b/test/CodeGenCXX/mangle-ms-md5.cpp
@@ -0,0 +1,11 @@
+// RUN: %clang_cc1 -emit-llvm -o - -triple i686-pc-win32 %s | FileCheck %s
+int xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx;
+// CHECK-DAG: @"\01??@bf7ea7b95f260b0b24e7f1e8fc8370ab@" = global i32 0, align 4
+
+struct yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy {
+  yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy();
+  virtual void f();
+};
+yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy::yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy() {}
+// CHECK-DAG: @"\01??@a6a285da2eea70dba6b578022be61d81@??_R4@" = linkonce_odr constant %rtti.CompleteObjectLocator
+// CHECK-DAG: @"\01??@a6a285da2eea70dba6b578022be61d81@" = unnamed_addr alias
diff --git a/test/CodeGenCXX/mangle-ms.cpp b/test/CodeGenCXX/mangle-ms.cpp
index c82fca49f6136..ee0f50e5e2d32 100644
--- a/test/CodeGenCXX/mangle-ms.cpp
+++ b/test/CodeGenCXX/mangle-ms.cpp
@@ -4,6 +4,11 @@
 int a;
 // CHECK-DAG: @"\01?a@@3HA"
 
+extern "C++" {
+static int __attribute__((used)) ignore_transparent_context;
+// CHECK-DAG: @ignore_transparent_context
+}
+
 namespace N {
   int b;
 // CHECK-DAG: @"\01?b@N@@3HA"
diff --git a/test/CodeGenCXX/mangle-template.cpp b/test/CodeGenCXX/mangle-template.cpp
index 7fa300ae237b5..23134693de5c0 100644
--- a/test/CodeGenCXX/mangle-template.cpp
+++ b/test/CodeGenCXX/mangle-template.cpp
@@ -201,3 +201,14 @@ namespace test14 {
 
   int call(bool b) { return inl<void>(b); }
 }
+
+namespace std {
+template <class _Tp, _Tp...> struct integer_sequence {};
+}
+
+namespace test15 {
+template <int N>
+__make_integer_seq<std::integer_sequence, int, N> make() {}
+template __make_integer_seq<std::integer_sequence, int, 5> make<5>();
+// CHECK: define weak_odr {{.*}} @_ZN6test154makeILi5EEE18__make_integer_seqISt16integer_sequenceiXT_EEv(
+}
diff --git a/test/CodeGenCXX/mangle.cpp b/test/CodeGenCXX/mangle.cpp
index 5012c3b37981e..5d757102ed25f 100644
--- a/test/CodeGenCXX/mangle.cpp
+++ b/test/CodeGenCXX/mangle.cpp
@@ -1101,3 +1101,13 @@ struct c {
   // CHECK-LABEL: @_ZN6test541cC2EPNS0_Ut0_E
 };
 }
+
+namespace test55 {
+enum E { R };
+
+template <typename T>
+void fn(T, __underlying_type(T)) {}
+
+template void fn<E>(E, __underlying_type(E));
+// CHECK-LABEL: @_ZN6test552fnINS_1EEEEvT_U3eutS2_
+}
diff --git a/test/CodeGenCXX/microsoft-abi-array-cookies.cpp b/test/CodeGenCXX/microsoft-abi-array-cookies.cpp
index 75c0621347a5f..9ef1879ce5f26 100644
--- a/test/CodeGenCXX/microsoft-abi-array-cookies.cpp
+++ b/test/CodeGenCXX/microsoft-abi-array-cookies.cpp
@@ -7,7 +7,7 @@ struct ClassWithoutDtor {
 void check_array_no_cookies() {
 // CHECK: define void @"\01?check_array_no_cookies@@YAXXZ"() [[NUW:#[0-9]+]]
 
-// CHECK: call noalias i8* @"\01??_U@YAPAXI@Z"(i32 42)
+// CHECK: call i8* @"\01??_U@YAPAXI@Z"(i32 42)
   ClassWithoutDtor *array = new ClassWithoutDtor[42];
 
 // CHECK: call void @"\01??_V@YAXPAX@Z"(
@@ -24,7 +24,7 @@ void check_array_cookies_simple() {
 // CHECK: define {{.*}} @"\01?check_array_cookies_simple@@YAXXZ"()
 
   ClassWithDtor *array = new ClassWithDtor[42];
-// CHECK: [[ALLOCATED:%.*]] = call noalias i8* @"\01??_U@YAPAXI@Z"(i32 46)
+// CHECK: [[ALLOCATED:%.*]] = call i8* @"\01??_U@YAPAXI@Z"(i32 46)
 // 46 = 42 + size of cookie (4)
 // CHECK: [[COOKIE:%.*]] = bitcast i8* [[ALLOCATED]] to i32*
 // CHECK: store i32 42, i32* [[COOKIE]]
@@ -46,7 +46,7 @@ struct __attribute__((aligned(8))) ClassWithAlignment {
 void check_array_cookies_aligned() {
 // CHECK: define {{.*}} @"\01?check_array_cookies_aligned@@YAXXZ"()
   ClassWithAlignment *array = new ClassWithAlignment[42];
-// CHECK: [[ALLOCATED:%.*]] = call noalias i8* @"\01??_U@YAPAXI@Z"(i32 344)
+// CHECK: [[ALLOCATED:%.*]] = call i8* @"\01??_U@YAPAXI@Z"(i32 344)
 //   344 = 42*8 + size of cookie (8, due to alignment)
 // CHECK: [[COOKIE:%.*]] = bitcast i8* [[ALLOCATED]] to i32*
 // CHECK: store i32 42, i32* [[COOKIE]]
diff --git a/test/CodeGenCXX/microsoft-abi-cdecl-method-sret.cpp b/test/CodeGenCXX/microsoft-abi-cdecl-method-sret.cpp
index da58c461dcc09..6da7a50b617ea 100644
--- a/test/CodeGenCXX/microsoft-abi-cdecl-method-sret.cpp
+++ b/test/CodeGenCXX/microsoft-abi-cdecl-method-sret.cpp
@@ -2,10 +2,10 @@
 
 // PR15768
 
-// A trivial 12 byte struct is returned indirectly.
+// A trivial 20 byte struct is returned indirectly and taken as byval.
 struct S {
   S();
-  int a, b, c;
+  int a, b, c, d, e;
 };
 
 struct C {
diff --git a/test/CodeGenCXX/microsoft-abi-dynamic-cast.cpp b/test/CodeGenCXX/microsoft-abi-dynamic-cast.cpp
index e9eba6ed0b209..f03cd6c5632b3 100644
--- a/test/CodeGenCXX/microsoft-abi-dynamic-cast.cpp
+++ b/test/CodeGenCXX/microsoft-abi-dynamic-cast.cpp
@@ -60,7 +60,7 @@ T* test5(A* x) { return dynamic_cast<T*>(x); }
 // CHECK-NEXT:   [[VBOFFP:%.*]] = getelementptr inbounds i32, i32* [[VBTBL]], i32 1
 // CHECK-NEXT:   [[VBOFFS:%.*]] = load i32, i32* [[VBOFFP]], align 4
 // CHECK-NEXT:   [[ADJ:%.*]] = getelementptr inbounds i8, i8* [[VOIDP]], i32 [[VBOFFS]]
-// CHECK-NEXT:   [[CALL:%.*]] = tail call i8* @__RTDynamicCast(i8* [[ADJ]], i32 [[VBOFFS]], i8* nonnull bitcast (%rtti.TypeDescriptor7* @"\01??_R0?AUA@@@8" to i8*), i8* nonnull bitcast (%rtti.TypeDescriptor7* @"\01??_R0?AUT@@@8" to i8*), i32 0)
+// CHECK-NEXT:   [[CALL:%.*]] = tail call i8* @__RTDynamicCast(i8* [[ADJ]], i32 [[VBOFFS]], i8* {{.*}}bitcast (%rtti.TypeDescriptor7* @"\01??_R0?AUA@@@8" to i8*), i8* {{.*}}bitcast (%rtti.TypeDescriptor7* @"\01??_R0?AUT@@@8" to i8*), i32 0)
 // CHECK-NEXT:   [[RES:%.*]] = bitcast i8* [[CALL]] to %struct.T*
 // CHECK-NEXT:   br label
 // CHECK:        [[RET:%.*]] = phi %struct.T*
@@ -78,7 +78,7 @@ T* test6(B* x) { return dynamic_cast<T*>(x); }
 // CHECK-NEXT:   [[VBOFFS:%.*]] = load i32, i32* [[VBOFFP]], align 4
 // CHECK-NEXT:   [[DELTA:%.*]] = add nsw i32 [[VBOFFS]], 4
 // CHECK-NEXT:   [[ADJ:%.*]] = getelementptr inbounds i8, i8* [[CAST]], i32 [[DELTA]]
-// CHECK-NEXT:   [[CALL:%.*]] = tail call i8* @__RTDynamicCast(i8* [[ADJ]], i32 [[DELTA]], i8* nonnull bitcast (%rtti.TypeDescriptor7* @"\01??_R0?AUB@@@8" to i8*), i8* nonnull bitcast (%rtti.TypeDescriptor7* @"\01??_R0?AUT@@@8" to i8*), i32 0)
+// CHECK-NEXT:   [[CALL:%.*]] = tail call i8* @__RTDynamicCast(i8* [[ADJ]], i32 [[DELTA]], i8* {{.*}}bitcast (%rtti.TypeDescriptor7* @"\01??_R0?AUB@@@8" to i8*), i8* {{.*}}bitcast (%rtti.TypeDescriptor7* @"\01??_R0?AUT@@@8" to i8*), i32 0)
 // CHECK-NEXT:   [[RES:%.*]] = bitcast i8* [[CALL]] to %struct.T*
 // CHECK-NEXT:   br label
 // CHECK:        [[RET:%.*]] = phi %struct.T*
diff --git a/test/CodeGenCXX/microsoft-abi-eh-catch.cpp b/test/CodeGenCXX/microsoft-abi-eh-catch.cpp
index 69ec34754813e..ac1321efeae2a 100644
--- a/test/CodeGenCXX/microsoft-abi-eh-catch.cpp
+++ b/test/CodeGenCXX/microsoft-abi-eh-catch.cpp
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -std=c++11 -emit-llvm %s -o - -triple=x86_64-pc-windows-msvc \
-// RUN:     -mconstructor-aliases -fexceptions -fcxx-exceptions -fnew-ms-eh \
+// RUN:     -mconstructor-aliases -fexceptions -fcxx-exceptions \
 // RUN:     -O1 -disable-llvm-optzns \
 // RUN:     | FileCheck -check-prefix WIN64 %s
 
diff --git a/test/CodeGenCXX/microsoft-abi-eh-cleanups.cpp b/test/CodeGenCXX/microsoft-abi-eh-cleanups.cpp
index bf05c693ec0dd..004dc45652e4c 100644
--- a/test/CodeGenCXX/microsoft-abi-eh-cleanups.cpp
+++ b/test/CodeGenCXX/microsoft-abi-eh-cleanups.cpp
@@ -1,4 +1,5 @@
-// RUN: %clang_cc1 -std=c++11 -emit-llvm %s -o - -triple=i386-pc-win32 -mconstructor-aliases -fexceptions -fcxx-exceptions -fno-rtti | FileCheck -check-prefix WIN32 %s
+// RUN: %clang_cc1 -std=c++11 -emit-llvm %s -o - -triple=i386-pc-win32 -mconstructor-aliases -fexceptions -fcxx-exceptions -fno-rtti | FileCheck -check-prefix WIN32 -check-prefix WIN32-O0 %s
+// RUN: %clang_cc1 -std=c++11 -emit-llvm -O3 -disable-llvm-optzns %s -o - -triple=i386-pc-win32 -mconstructor-aliases -fexceptions -fcxx-exceptions -fno-rtti | FileCheck -check-prefix WIN32 -check-prefix WIN32-O3 -check-prefix WIN32-LIFETIME %s
 
 struct A {
   A();
@@ -94,40 +95,78 @@ int HasConditionalDeactivatedCleanups(bool cond) {
   return (cond ? TakesTwo((TakeRef(A()), A()), (TakeRef(A()), A())) : CouldThrow());
 }
 
-// WIN32-LABEL: define i32 @"\01?HasConditionalDeactivatedCleanups@@YAH_N@Z"{{.*}} {
-// WIN32:   alloca i1
-// WIN32:   %[[arg1_cond:.*]] = alloca i1
+// WIN32-O0-LABEL: define i32 @"\01?HasConditionalDeactivatedCleanups@@YAH_N@Z"{{.*}} {
+// WIN32-O0:   alloca i1
+// WIN32-O0:   %[[arg1_cond:.*]] = alloca i1
 //        Start all four cleanups as deactivated.
-// WIN32:   store i1 false
-// WIN32:   store i1 false
-// WIN32:   store i1 false
-// WIN32:   store i1 false
-// WIN32:   br i1
+// WIN32-O0:   store i1 false
+// WIN32-O0:   store i1 false
+// WIN32-O0:   store i1 false
+// WIN32-O0:   store i1 false
+// WIN32-O0:   br i1
 //        True condition.
-// WIN32:   call x86_thiscallcc %struct.A* @"\01??0A@@QAE@XZ"
-// WIN32:   store i1 true
-// WIN32:   invoke void @"\01?TakeRef@@YAXABUA@@@Z"
-// WIN32:   invoke x86_thiscallcc %struct.A* @"\01??0A@@QAE@XZ"
-// WIN32:   store i1 true, i1* %[[arg1_cond]]
-// WIN32:   invoke x86_thiscallcc %struct.A* @"\01??0A@@QAE@XZ"
-// WIN32:   store i1 true
-// WIN32:   invoke void @"\01?TakeRef@@YAXABUA@@@Z"
-// WIN32:   invoke x86_thiscallcc %struct.A* @"\01??0A@@QAE@XZ"
-// WIN32:   store i1 true
-// WIN32:   store i1 false, i1* %[[arg1_cond]]
-// WIN32:   invoke i32 @"\01?TakesTwo@@YAHUA@@0@Z"
+// WIN32-O0:   call x86_thiscallcc %struct.A* @"\01??0A@@QAE@XZ"
+// WIN32-O0:   store i1 true
+// WIN32-O0:   invoke void @"\01?TakeRef@@YAXABUA@@@Z"
+// WIN32-O0:   invoke x86_thiscallcc %struct.A* @"\01??0A@@QAE@XZ"
+// WIN32-O0:   store i1 true, i1* %[[arg1_cond]]
+// WIN32-O0:   invoke x86_thiscallcc %struct.A* @"\01??0A@@QAE@XZ"
+// WIN32-O0:   store i1 true
+// WIN32-O0:   invoke void @"\01?TakeRef@@YAXABUA@@@Z"
+// WIN32-O0:   invoke x86_thiscallcc %struct.A* @"\01??0A@@QAE@XZ"
+// WIN32-O0:   store i1 true
+// WIN32-O0:   store i1 false, i1* %[[arg1_cond]]
+// WIN32-O0:   invoke i32 @"\01?TakesTwo@@YAHUA@@0@Z"
 //        False condition.
-// WIN32:   invoke i32 @"\01?CouldThrow@@YAHXZ"()
+// WIN32-O0:   invoke i32 @"\01?CouldThrow@@YAHXZ"()
 //        Two normal cleanups for TakeRef args.
-// WIN32:   call x86_thiscallcc void @"\01??1A@@QAE@XZ"({{.*}})
-// WIN32-NOT:   invoke x86_thiscallcc void @"\01??1A@@QAE@XZ"
-// WIN32:   ret i32
+// WIN32-O0:   call x86_thiscallcc void @"\01??1A@@QAE@XZ"({{.*}})
+// WIN32-O0-NOT:   invoke x86_thiscallcc void @"\01??1A@@QAE@XZ"
+// WIN32-O0:   ret i32
 //
 //        Somewhere in the landing pad soup, we conditionally destroy arg1.
-// WIN32:   %[[isactive:.*]] = load i1, i1* %[[arg1_cond]]
-// WIN32:   br i1 %[[isactive]]
-// WIN32:   call x86_thiscallcc void @"\01??1A@@QAE@XZ"({{.*}})
-// WIN32: }
+// WIN32-O0:   %[[isactive:.*]] = load i1, i1* %[[arg1_cond]]
+// WIN32-O0:   br i1 %[[isactive]]
+// WIN32-O0:   call x86_thiscallcc void @"\01??1A@@QAE@XZ"({{.*}})
+// WIN32-O0: }
+
+// WIN32-O3-LABEL: define i32 @"\01?HasConditionalDeactivatedCleanups@@YAH_N@Z"{{.*}} {
+// WIN32-O3:   alloca i1
+// WIN32-O3:   alloca i1
+// WIN32-O3:   %[[arg1_cond:.*]] = alloca i1
+//        Start all four cleanups as deactivated.
+// WIN32-O3:   store i1 false
+// WIN32-O3:   store i1 false
+// WIN32-O3:   store i1 false
+// WIN32-O3:   store i1 false
+// WIN32-O3:   store i1 false
+// WIN32-O3:   store i1 false
+// WIN32-O3:   br i1
+//        True condition.
+// WIN32-O3:   call x86_thiscallcc %struct.A* @"\01??0A@@QAE@XZ"
+// WIN32-O3:   store i1 true
+// WIN32-O3:   invoke void @"\01?TakeRef@@YAXABUA@@@Z"
+// WIN32-O3:   invoke x86_thiscallcc %struct.A* @"\01??0A@@QAE@XZ"
+// WIN32-O3:   store i1 true, i1* %[[arg1_cond]]
+// WIN32-O3:   invoke x86_thiscallcc %struct.A* @"\01??0A@@QAE@XZ"
+// WIN32-O3:   store i1 true
+// WIN32-O3:   invoke void @"\01?TakeRef@@YAXABUA@@@Z"
+// WIN32-O3:   invoke x86_thiscallcc %struct.A* @"\01??0A@@QAE@XZ"
+// WIN32-O3:   store i1 true
+// WIN32-O3:   store i1 false, i1* %[[arg1_cond]]
+// WIN32-O3:   invoke i32 @"\01?TakesTwo@@YAHUA@@0@Z"
+//        False condition.
+// WIN32-O3:   invoke i32 @"\01?CouldThrow@@YAHXZ"()
+//        Two normal cleanups for TakeRef args.
+// WIN32-O3:   call x86_thiscallcc void @"\01??1A@@QAE@XZ"({{.*}})
+// WIN32-O3-NOT:   invoke x86_thiscallcc void @"\01??1A@@QAE@XZ"
+// WIN32-O3:   ret i32
+//
+//        Somewhere in the landing pad soup, we conditionally destroy arg1.
+// WIN32-O3:   %[[isactive:.*]] = load i1, i1* %[[arg1_cond]]
+// WIN32-O3:   br i1 %[[isactive]]
+// WIN32-O3:   call x86_thiscallcc void @"\01??1A@@QAE@XZ"({{.*}})
+// WIN32-O3: }
 
 namespace crash_on_partial_destroy {
 struct A {
@@ -206,3 +245,36 @@ void f() {
 // WIN32: cleanuppad
 // WIN32: call x86_thiscallcc void @"\01??1D@noexcept_false_dtor@@QAE@XZ"(%"struct.noexcept_false_dtor::D"* %{{.*}})
 // WIN32: cleanupret
+
+namespace lifetime_marker {
+struct C {
+  ~C();
+};
+void g();
+void f() {
+  C c;
+  g();
+}
+
+// WIN32-LIFETIME-LABEL: define void @"\01?f@lifetime_marker@@YAXXZ"()
+// WIN32-LIFETIME: %[[c:.*]] = alloca %"struct.lifetime_marker::C"
+// WIN32-LIFETIME: %[[bc0:.*]] = bitcast %"struct.lifetime_marker::C"* %c to i8*
+// WIN32-LIFETIME: call void @llvm.lifetime.start(i64 1, i8* %[[bc0]])
+// WIN32-LIFETIME: invoke void @"\01?g@lifetime_marker@@YAXXZ"()
+// WIN32-LIFETIME-NEXT: to label %[[cont:[^ ]*]] unwind label %[[lpad0:[^ ]*]]
+//
+// WIN32-LIFETIME: [[cont]]
+// WIN32-LIFETIME: call x86_thiscallcc void @"\01??1C@lifetime_marker@@QAE@XZ"({{.*}})
+// WIN32-LIFETIME: %[[bc1:.*]] = bitcast %"struct.lifetime_marker::C"* %[[c]] to i8*
+// WIN32-LIFETIME: call void @llvm.lifetime.end(i64 1, i8* %[[bc1]])
+//
+// WIN32-LIFETIME: [[lpad0]]
+// WIN32-LIFETIME-NEXT: cleanuppad
+// WIN32-LIFETIME: call x86_thiscallcc void @"\01??1C@lifetime_marker@@QAE@XZ"({{.*}})
+// WIN32-LIFETIME: cleanupret {{.*}} unwind label %[[lpad1:[^ ]*]]
+//
+// WIN32-LIFETIME: [[lpad1]]
+// WIN32-LIFETIME-NEXT: cleanuppad
+// WIN32-LIFETIME: %[[bc2:.*]] = bitcast %"struct.lifetime_marker::C"* %[[c]] to i8*
+// WIN32-LIFETIME: call void @llvm.lifetime.end(i64 1, i8* %[[bc2]])
+}
diff --git a/test/CodeGenCXX/microsoft-abi-eh-terminate.cpp b/test/CodeGenCXX/microsoft-abi-eh-terminate.cpp
index 0b8d270e13792..7836dcf32b118 100644
--- a/test/CodeGenCXX/microsoft-abi-eh-terminate.cpp
+++ b/test/CodeGenCXX/microsoft-abi-eh-terminate.cpp
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -std=c++11 -emit-llvm %s -o - -triple=x86_64-pc-windows-msvc -mconstructor-aliases -fexceptions -fcxx-exceptions -fms-compatibility-version=18.00 | FileCheck -check-prefix=MSVC2013 %s
-// RUN: %clang_cc1 -std=c++11 -emit-llvm %s -o - -triple=x86_64-pc-windows-msvc -mconstructor-aliases -fexceptions -fcxx-exceptions -fms-compatibility-version=19.00 | FileCheck -check-prefix=MSVC2015 %s
+// RUN: %clang_cc1 -std=c++11 -emit-llvm %s -o - -triple=x86_64-pc-windows-msvc -mconstructor-aliases -fexceptions -fcxx-exceptions -fms-compatibility-version=18.00 | FileCheck -check-prefix=MSVC2013 -check-prefix=CHECK %s
+// RUN: %clang_cc1 -std=c++11 -emit-llvm %s -o - -triple=x86_64-pc-windows-msvc -mconstructor-aliases -fexceptions -fcxx-exceptions -fms-compatibility-version=19.00 | FileCheck -check-prefix=MSVC2015 -check-prefix=CHECK %s
 
 void may_throw();
 void never_throws() noexcept(true) {
@@ -9,7 +9,8 @@ void never_throws() noexcept(true) {
 // CHECK-LABEL: define void @"\01?never_throws@@YAXXZ"()
 // CHECK-SAME:          personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*)
 // CHECK:      invoke void @"\01?may_throw@@YAXXZ"()
-// CHECK:      cleanuppad within none []
+// CHECK:      %[[cp:.*]] = cleanuppad within none []
 // MSVC2013:      call void @"\01?terminate@@YAXXZ"()
 // MSVC2015:      call void @__std_terminate()
+// CHECK-SAME:  [ "funclet"(token %[[cp]]) ]
 // CHECK-NEXT: unreachable
diff --git a/test/CodeGenCXX/microsoft-abi-extern-template.cpp b/test/CodeGenCXX/microsoft-abi-extern-template.cpp
new file mode 100644
index 0000000000000..de46d5b5c37c4
--- /dev/null
+++ b/test/CodeGenCXX/microsoft-abi-extern-template.cpp
@@ -0,0 +1,25 @@
+// RUN: %clang_cc1 -fno-rtti-data -O1 -disable-llvm-optzns %s -emit-llvm -o - -triple x86_64-windows-msvc | FileCheck %s
+
+// Even though Foo<int> has an extern template declaration, we have to emit our
+// own copy the vftable when emitting the available externally constructor.
+
+// CHECK: @"\01??_7?$Foo@H@@6B@" = linkonce_odr unnamed_addr constant [1 x i8*] [
+// CHECK-SAME:   i8* bitcast (i8* (%struct.Foo*, i32)* @"\01??_G?$Foo@H@@UEAAPEAXI@Z" to i8*)
+// CHECK-SAME: ], comdat
+
+// CHECK-LABEL: define %struct.Foo* @"\01?f@@YAPEAU?$Foo@H@@XZ"()
+// CHECK: call %struct.Foo* @"\01??0?$Foo@H@@QEAA@XZ"(%struct.Foo* %{{.*}})
+
+// CHECK: define available_externally %struct.Foo* @"\01??0?$Foo@H@@QEAA@XZ"(%struct.Foo* returned %this)
+// CHECK:   store {{.*}} @"\01??_7?$Foo@H@@6B@"
+
+// CHECK: define linkonce_odr i8* @"\01??_G?$Foo@H@@UEAAPEAXI@Z"(%struct.Foo* %this, i32 %should_call_delete)
+
+struct Base {
+  virtual ~Base();
+};
+template <typename T> struct Foo : Base {
+  Foo() {}
+};
+extern template class Foo<int>;
+Foo<int> *f() { return new Foo<int>(); }
diff --git a/test/CodeGenCXX/microsoft-abi-member-pointers.cpp b/test/CodeGenCXX/microsoft-abi-member-pointers.cpp
index fd22c0034203c..a3985ba09c028 100755
--- a/test/CodeGenCXX/microsoft-abi-member-pointers.cpp
+++ b/test/CodeGenCXX/microsoft-abi-member-pointers.cpp
@@ -3,16 +3,29 @@
 // RUN: %clang_cc1 -std=c++11 -Wno-uninitialized -fno-rtti -emit-llvm %s -o - -triple=i386-pc-win32 -DINCOMPLETE_VIRTUAL -fms-extensions -verify
 // RUN: %clang_cc1 -std=c++11 -Wno-uninitialized -fno-rtti -emit-llvm %s -o - -triple=i386-pc-win32 -DINCOMPLETE_VIRTUAL -DMEMFUN -fms-extensions -verify
 
+struct PR26313_Y;
+typedef void (PR26313_Y::*PR26313_FUNC)();
+struct PR26313_X {
+  PR26313_FUNC *ptr;
+  PR26313_X();
+};
+PR26313_X::PR26313_X() {}
+void PR26313_f(PR26313_FUNC *p) { delete p; }
+
+struct PR26313_Z;
+int PR26313_Z::**a = nullptr;
+int PR26313_Z::*b = *a;
+// CHECK-DAG: @"\01?a@@3PAPQPR26313_Z@@HA" = global %0* null, align 4
+// CHECK-DAG: @"\01?b@@3PQPR26313_Z@@HQ1@" = global { i32, i32, i32 } { i32 0, i32 0, i32 -1 }, align 4
+
 namespace PR20947 {
 struct A;
 int A::**a = nullptr;
-// CHECK: %[[opaque0:.*]] = type opaque
-// CHECK: %[[opaque1:.*]] = type opaque
-// CHECK: @"\01?a@PR20947@@3PAPQA@1@HA" = global %[[opaque0]]* null, align 4
+// CHECK-DAG: @"\01?a@PR20947@@3PAPQA@1@HA" = global %{{.*}}* null, align 4
 
 struct B;
 int B::*&b = b;
-// CHECK: @"\01?b@PR20947@@3AAPQB@1@HA" = global %[[opaque1]]* null, align 4
+// CHECK-DAG: @"\01?b@PR20947@@3AAPQB@1@HA" = global %{{.*}}* null, align 4
 }
 
 namespace PR20017 {
diff --git a/test/CodeGenCXX/microsoft-abi-sret-and-byval.cpp b/test/CodeGenCXX/microsoft-abi-sret-and-byval.cpp
index 4c2d8506af708..f7dc524067406 100644
--- a/test/CodeGenCXX/microsoft-abi-sret-and-byval.cpp
+++ b/test/CodeGenCXX/microsoft-abi-sret-and-byval.cpp
@@ -22,6 +22,16 @@ struct SmallWithCtor {
   int x;
 };
 
+struct Multibyte {
+  char a, b, c, d;
+};
+
+struct Packed {
+  short a;
+  int b;
+  short c;
+};
+
 struct SmallWithDtor {
   SmallWithDtor();
   ~SmallWithDtor();
@@ -102,19 +112,30 @@ Big big_return() { return Big(); }
 
 void small_arg(Small s) {}
 // LINUX-LABEL: define void @_Z9small_arg5Small(i32 %s.0)
-// WIN32: define void @"\01?small_arg@@YAXUSmall@@@Z"(%struct.Small* byval align 4 %s)
+// WIN32: define void @"\01?small_arg@@YAXUSmall@@@Z"(i32 %s.0)
 // WIN64: define void @"\01?small_arg@@YAXUSmall@@@Z"(i32 %s.coerce)
 
 void medium_arg(Medium s) {}
 // LINUX-LABEL: define void @_Z10medium_arg6Medium(i32 %s.0, i32 %s.1)
-// WIN32: define void @"\01?medium_arg@@YAXUMedium@@@Z"(%struct.Medium* byval align 4 %s)
+// WIN32: define void @"\01?medium_arg@@YAXUMedium@@@Z"(i32 %s.0, i32 %s.1)
 // WIN64: define void @"\01?medium_arg@@YAXUMedium@@@Z"(i64 %s.coerce)
 
 void small_arg_with_ctor(SmallWithCtor s) {}
 // LINUX-LABEL: define void @_Z19small_arg_with_ctor13SmallWithCtor(%struct.SmallWithCtor* byval align 4 %s)
-// WIN32: define void @"\01?small_arg_with_ctor@@YAXUSmallWithCtor@@@Z"(%struct.SmallWithCtor* byval align 4 %s)
+// WIN32: define void @"\01?small_arg_with_ctor@@YAXUSmallWithCtor@@@Z"(i32 %s.0)
 // WIN64: define void @"\01?small_arg_with_ctor@@YAXUSmallWithCtor@@@Z"(i32 %s.coerce)
 
+// FIXME: We could coerce to a series of i32s here if we wanted to.
+void multibyte_arg(Multibyte s) {}
+// LINUX-LABEL: define void @_Z13multibyte_arg9Multibyte(%struct.Multibyte* byval align 4 %s)
+// WIN32: define void @"\01?multibyte_arg@@YAXUMultibyte@@@Z"(%struct.Multibyte* byval align 4 %s)
+// WIN64: define void @"\01?multibyte_arg@@YAXUMultibyte@@@Z"(i32 %s.coerce)
+
+void packed_arg(Packed s) {}
+// LINUX-LABEL: define void @_Z10packed_arg6Packed(%struct.Packed* byval align 4 %s)
+// WIN32: define void @"\01?packed_arg@@YAXUPacked@@@Z"(%struct.Packed* byval align 4 %s)
+// WIN64: define void @"\01?packed_arg@@YAXUPacked@@@Z"(%struct.Packed* %s)
+
 // Test that dtors are invoked in the callee.
 void small_arg_with_dtor(SmallWithDtor s) {}
 // WIN32: define void @"\01?small_arg_with_dtor@@YAXUSmallWithDtor@@@Z"(<{ %struct.SmallWithDtor }>* inalloca) {{.*}} {
@@ -196,6 +217,28 @@ void big_arg(Big s) {}
 // WIN32: define void @"\01?big_arg@@YAXUBig@@@Z"(%struct.Big* byval align 4 %s)
 // WIN64: define void @"\01?big_arg@@YAXUBig@@@Z"(%struct.Big* %s)
 
+// PR27607: We would attempt to load i32 value out of the reference instead of
+// just loading the pointer from the struct during argument expansion.
+struct RefField {
+  RefField(int &x);
+  int &x;
+};
+void takes_ref_field(RefField s) {}
+// LINUX-LABEL: define void @_Z15takes_ref_field8RefField(%struct.RefField* byval align 4 %s)
+// WIN32: define void @"\01?takes_ref_field@@YAXURefField@@@Z"(i32* %s.0)
+// WIN64: define void @"\01?takes_ref_field@@YAXURefField@@@Z"(i64 %s.coerce)
+
+void pass_ref_field() {
+  int x;
+  takes_ref_field(RefField(x));
+}
+// LINUX-LABEL: define void @_Z14pass_ref_fieldv()
+// LINUX: call void @_Z15takes_ref_field8RefField(%struct.RefField* byval align 4 %{{.*}})
+// WIN32-LABEL: define void @"\01?pass_ref_field@@YAXXZ"()
+// WIN32: call void @"\01?takes_ref_field@@YAXURefField@@@Z"(i32* %{{.*}})
+// WIN64-LABEL: define void @"\01?pass_ref_field@@YAXXZ"()
+// WIN64: call void @"\01?takes_ref_field@@YAXURefField@@@Z"(i64 %{{.*}})
+
 class Class {
  public:
   Small thiscall_method_small() { return Small(); }
@@ -230,12 +273,12 @@ class Class {
 
   void thiscall_method_arg(Small s) {}
   // LINUX: define {{.*}} void @_ZN5Class19thiscall_method_argE5Small(%class.Class* %this, i32 %s.0)
-  // WIN32: define {{.*}} void @"\01?thiscall_method_arg@Class@@QAEXUSmall@@@Z"(%class.Class* %this, %struct.Small* byval align 4 %s)
+  // WIN32: define {{.*}} void @"\01?thiscall_method_arg@Class@@QAEXUSmall@@@Z"(%class.Class* %this, i32 %s.0)
   // WIN64: define linkonce_odr void @"\01?thiscall_method_arg@Class@@QEAAXUSmall@@@Z"(%class.Class* %this, i32 %s.coerce)
 
   void thiscall_method_arg(SmallWithCtor s) {}
   // LINUX: define {{.*}} void @_ZN5Class19thiscall_method_argE13SmallWithCtor(%class.Class* %this, %struct.SmallWithCtor* byval align 4 %s)
-  // WIN32: define {{.*}} void @"\01?thiscall_method_arg@Class@@QAEXUSmallWithCtor@@@Z"(%class.Class* %this, %struct.SmallWithCtor* byval align 4 %s)
+  // WIN32: define {{.*}} void @"\01?thiscall_method_arg@Class@@QAEXUSmallWithCtor@@@Z"(%class.Class* %this, i32 %s.0)
   // WIN64: define linkonce_odr void @"\01?thiscall_method_arg@Class@@QEAAXUSmallWithCtor@@@Z"(%class.Class* %this, i32 %s.coerce)
 
   void thiscall_method_arg(Big s) {}
diff --git a/test/CodeGenCXX/microsoft-abi-structors.cpp b/test/CodeGenCXX/microsoft-abi-structors.cpp
index 3fb97b9a36322..a576f0c3d7e37 100644
--- a/test/CodeGenCXX/microsoft-abi-structors.cpp
+++ b/test/CodeGenCXX/microsoft-abi-structors.cpp
@@ -7,7 +7,7 @@
 // RUN: FileCheck --check-prefix DTORS3 %s < %t
 // RUN: FileCheck --check-prefix DTORS4 %s < %t
 //
-// RUN: %clang_cc1 -emit-llvm %s -o - -mconstructor-aliases -triple=x86_64-pc-win32 -fno-rtti | FileCheck --check-prefix DTORS-X64 %s
+// RUN: %clang_cc1 -emit-llvm %s -o - -mconstructor-aliases -triple=x86_64-pc-win32 -fno-rtti -std=c++11 | FileCheck --check-prefix DTORS-X64 %s
 
 namespace basic {
 
@@ -443,6 +443,20 @@ void g() { new MoveOnly(f()); }
 // CHECK: store {{.*}} @"\01??_7MoveOnly@implicit_copy_vtable@@6B@"
 }
 
+namespace delegating_ctor {
+struct Y {};
+struct X : virtual Y {
+  X(int);
+  X();
+};
+X::X(int) : X() {}
+}
+// CHECK: define x86_thiscallcc %"struct.delegating_ctor::X"* @"\01??0X@delegating_ctor@@QAE@H@Z"(
+// CHECK:  %[[is_most_derived_addr:.*]] = alloca i32, align 4
+// CHECK:  store i32 %is_most_derived, i32* %[[is_most_derived_addr]]
+// CHECK:  %[[is_most_derived:.*]] = load i32, i32* %[[is_most_derived_addr]]
+// CHECK:  call x86_thiscallcc {{.*}}* @"\01??0X@delegating_ctor@@QAE@XZ"({{.*}} i32 %[[is_most_derived]])
+
 // Dtor thunks for classes in anonymous namespaces should be internal, not
 // linkonce_odr.
 namespace {
@@ -471,4 +485,3 @@ class G {
 extern void testG() {
   G g;
 }
-
diff --git a/test/CodeGenCXX/microsoft-abi-thread-safe-statics.cpp b/test/CodeGenCXX/microsoft-abi-thread-safe-statics.cpp
index 29b434eaf2c57..0202586c8a620 100644
--- a/test/CodeGenCXX/microsoft-abi-thread-safe-statics.cpp
+++ b/test/CodeGenCXX/microsoft-abi-thread-safe-statics.cpp
@@ -9,12 +9,14 @@ struct S {
 // CHECK-DAG: @"\01?s@?1??f@@YAAAUS@@XZ@4U2@A" = linkonce_odr thread_local global %struct.S zeroinitializer
 // CHECK-DAG: @"\01??__J?1??f@@YAAAUS@@XZ@51" = linkonce_odr thread_local global i32 0
 // CHECK-DAG: @"\01?s@?1??g@@YAAAUS@@XZ@4U2@A" = linkonce_odr global %struct.S zeroinitializer
-// CHECK-DAG: @"\01?$TSS0@?1??g@@YAAAUS@@XZ" = linkonce_odr global i32 0
+// CHECK-DAG: @"\01?$TSS0@?1??g@@YAAAUS@@XZ@4HA" = linkonce_odr global i32 0
 // CHECK-DAG: @_Init_thread_epoch = external thread_local global i32, align 4
 // CHECK-DAG: @"\01?j@?1??h@@YAAAUS@@_N@Z@4U2@A" = linkonce_odr thread_local global %struct.S zeroinitializer
 // CHECK-DAG: @"\01??__J?1??h@@YAAAUS@@_N@Z@51" = linkonce_odr thread_local global i32 0
 // CHECK-DAG: @"\01?i@?1??h@@YAAAUS@@_N@Z@4U2@A" = linkonce_odr global %struct.S zeroinitializer
-// CHECK-DAG: @"\01?$TSS0@?1??h@@YAAAUS@@_N@Z" = linkonce_odr global i32 0
+// CHECK-DAG: @"\01?$TSS0@?1??h@@YAAAUS@@_N@Z@4HA" = linkonce_odr global i32 0
+// CHECK-DAG: @"\01?i@?1??g1@@YAHXZ@4HA" = internal global i32 0, align 4
+// CHECK-DAG: @"\01?$TSS0@?1??g1@@YAHXZ@4HA" = internal global i32 0, align 4
 
 // CHECK-LABEL: define {{.*}} @"\01?f@@YAAAUS@@XZ"()
 // CHECK-SAME:  personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*)
@@ -51,14 +53,14 @@ extern inline S &f() {
 // CHECK-LABEL: define {{.*}} @"\01?g@@YAAAUS@@XZ"()
 extern inline S &g() {
   static S s;
-// CHECK:  %[[guard:.*]] = load atomic i32, i32* @"\01?$TSS0@?1??g@@YAAAUS@@XZ" unordered, align 4
+// CHECK:  %[[guard:.*]] = load atomic i32, i32* @"\01?$TSS0@?1??g@@YAAAUS@@XZ@4HA" unordered, align 4
 // CHECK-NEXT:  %[[epoch:.*]] = load i32, i32* @_Init_thread_epoch
 // CHECK-NEXT:  %[[cmp:.*]] = icmp sgt i32 %[[guard]], %[[epoch]]
 // CHECK-NEXT:  br i1 %[[cmp]], label %[[init_attempt:.*]], label %[[init_end:.*]]
 //
 // CHECK:     [[init_attempt]]:
-// CHECK-NEXT:  call void @_Init_thread_header(i32* @"\01?$TSS0@?1??g@@YAAAUS@@XZ")
-// CHECK-NEXT:  %[[guard2:.*]] = load atomic i32, i32* @"\01?$TSS0@?1??g@@YAAAUS@@XZ" unordered, align 4
+// CHECK-NEXT:  call void @_Init_thread_header(i32* @"\01?$TSS0@?1??g@@YAAAUS@@XZ@4HA")
+// CHECK-NEXT:  %[[guard2:.*]] = load atomic i32, i32* @"\01?$TSS0@?1??g@@YAAAUS@@XZ@4HA" unordered, align 4
 // CHECK-NEXT:  %[[cmp2:.*]] = icmp eq i32 %[[guard2]], -1
 // CHECK-NEXT:  br i1 %[[cmp2]], label %[[init:.*]], label %[[init_end:.*]]
 //
@@ -68,7 +70,7 @@ extern inline S &g() {
 //
 // CHECK:     [[invoke_cont]]:
 // CHECK-NEXT:  call i32 @atexit(void ()* @"\01??__Fs@?1??g@@YAAAUS@@XZ@YAXXZ")
-// CHECK-NEXT:  call void @_Init_thread_footer(i32* @"\01?$TSS0@?1??g@@YAAAUS@@XZ")
+// CHECK-NEXT:  call void @_Init_thread_footer(i32* @"\01?$TSS0@?1??g@@YAAAUS@@XZ@4HA")
 // CHECK-NEXT:  br label %init.end
 //
 // CHECK:     [[init_end]]:
@@ -76,7 +78,7 @@ extern inline S &g() {
 //
 // CHECK:     [[lpad]]:
 // CHECK-NEXT: cleanuppad within none []
-// CHECK:       call void @_Init_thread_abort(i32* @"\01?$TSS0@?1??g@@YAAAUS@@XZ")
+// CHECK:       call void @_Init_thread_abort(i32* @"\01?$TSS0@?1??g@@YAAAUS@@XZ@4HA")
 // CHECK-NEXT:  cleanupret {{.*}} unwind to caller
   return s;
 }
@@ -86,3 +88,10 @@ extern inline S&h(bool b) {
   static S i;
   return b ? j : i;
 }
+
+// CHECK-LABEL: define i32 @"\01?g1@@YAHXZ"()
+int f1();
+int g1() {
+  static int i = f1();
+  return i;
+}
diff --git a/test/CodeGenCXX/microsoft-abi-throw.cpp b/test/CodeGenCXX/microsoft-abi-throw.cpp
index 080f1a025cd8b..7c2e2a8f901e1 100644
--- a/test/CodeGenCXX/microsoft-abi-throw.cpp
+++ b/test/CodeGenCXX/microsoft-abi-throw.cpp
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -emit-llvm -o - -triple=i386-pc-win32 -std=c++11 %s -fcxx-exceptions -fms-extensions | FileCheck %s
+// RUN: %clang_cc1 -emit-llvm -o - -triple=i386-pc-win32 -std=c++11 %s -fcxx-exceptions -fms-extensions -DSTD | FileCheck %s
 
 // CHECK-DAG: @"\01??_R0?AUY@@@8" = linkonce_odr global %rtti.TypeDescriptor7 { i8** @"\01??_7type_info@@6B@", i8* null, [8 x i8] c".?AUY@@\00" }, comdat
 // CHECK-DAG: @"_CT??_R0?AUY@@@8??0Y@@QAE@ABU0@@Z8" = linkonce_odr unnamed_addr constant %eh.CatchableType { i32 4, i8* bitcast (%rtti.TypeDescriptor7* @"\01??_R0?AUY@@@8" to i8*), i32 0, i32 -1, i32 0, i32 8, i8* bitcast (%struct.Y* (%struct.Y*, %struct.Y*, i32)* @"\01??0Y@@QAE@ABU0@@Z" to i8*) }, section ".xdata", comdat
@@ -19,6 +20,8 @@
 // CHECK-DAG: @"_CT??_R0P6AXXZ@84" = linkonce_odr unnamed_addr constant %eh.CatchableType { i32 1, i8* bitcast (%rtti.TypeDescriptor7* @"\01??_R0P6AXXZ@8" to i8*), i32 0, i32 -1, i32 0, i32 4, i8* null }, section ".xdata", comdat
 // CHECK-DAG: @_CTA1P6AXXZ = linkonce_odr unnamed_addr constant %eh.CatchableTypeArray.1 { i32 1, [1 x %eh.CatchableType*] [%eh.CatchableType* @"_CT??_R0P6AXXZ@84"] }, section ".xdata", comdat
 // CHECK-DAG: @_TI1P6AXXZ = linkonce_odr unnamed_addr constant %eh.ThrowInfo { i32 0, i8* null, i8* null, i8* bitcast (%eh.CatchableTypeArray.1* @_CTA1P6AXXZ to i8*) }, section ".xdata", comdat
+// CHECK-DAG: @_TIU2PAPFAH = linkonce_odr unnamed_addr constant %eh.ThrowInfo { i32 4, i8* null, i8* null, i8* bitcast (%eh.CatchableTypeArray.2* @_CTA2PAPFAH to i8*) }, section ".xdata", comdat
+// CHECK-DAG: @_CTA2PAPFAH = linkonce_odr unnamed_addr constant %eh.CatchableTypeArray.2 { i32 2, [2 x %eh.CatchableType*] [%eh.CatchableType* @"_CT??_R0PAPFAH@84", %eh.CatchableType* @"_CT??_R0PAX@84"] }, section ".xdata", comdat
 
 
 struct N { ~N(); };
@@ -43,6 +46,12 @@ void g(const int *const *y) {
   throw y;
 }
 
+void h(__unaligned int * __unaligned *y) {
+  // CHECK-LABEL: @"\01?h@@YAXPFAPFAH@Z"
+  // CHECK: call void @_CxxThrowException(i8* %{{.*}}, %eh.ThrowInfo* @_TIU2PAPFAH)
+  throw y;
+}
+
 struct Default {
   Default(Default &, int = 42);
 };
@@ -97,19 +106,25 @@ void h() {
   throw nullptr;
 }
 
+#ifdef STD
 namespace std {
 template <typename T>
 void *__GetExceptionInfo(T);
 }
+#else
+template <typename T>
+void *__GetExceptionInfo(T);
+#endif
+using namespace std;
 
 void *GetExceptionInfo_test0() {
 // CHECK-LABEL: @"\01?GetExceptionInfo_test0@@YAPAXXZ"
 // CHECK:  ret i8* bitcast (%eh.ThrowInfo* @_TI1H to i8*)
-  return std::__GetExceptionInfo(0);
+  return __GetExceptionInfo(0);
 }
 
 void *GetExceptionInfo_test1() {
 // CHECK-LABEL: @"\01?GetExceptionInfo_test1@@YAPAXXZ"
 // CHECK:  ret i8* bitcast (%eh.ThrowInfo* @_TI1P6AXXZ to i8*)
-  return std::__GetExceptionInfo<void (*)()>(&h);
+  return __GetExceptionInfo<void (*)()>(&h);
 }
diff --git a/test/CodeGenCXX/microsoft-abi-try-throw.cpp b/test/CodeGenCXX/microsoft-abi-try-throw.cpp
index 6b1d2bf2a513a..bf1834e8594ee 100644
--- a/test/CodeGenCXX/microsoft-abi-try-throw.cpp
+++ b/test/CodeGenCXX/microsoft-abi-try-throw.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -emit-llvm %s -o - -triple=i386-pc-win32 -mconstructor-aliases -fcxx-exceptions -fexceptions -fno-rtti -DTRY -fnew-ms-eh   | FileCheck %s -check-prefix=TRY
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=i386-pc-win32 -mconstructor-aliases -fcxx-exceptions -fexceptions -fno-rtti -DTRY   | FileCheck %s -check-prefix=TRY
 // RUN: %clang_cc1 -emit-llvm %s -o - -triple=i386-pc-win32 -mconstructor-aliases -fcxx-exceptions -fexceptions -fno-rtti -DTHROW | FileCheck %s -check-prefix=THROW
 
 // THROW-DAG: @"\01??_R0H@8" = linkonce_odr global %rtti.TypeDescriptor2 { i8** @"\01??_7type_info@@6B@", i8* null, [3 x i8] c".H\00" }, comdat
diff --git a/test/CodeGenCXX/microsoft-abi-typeid.cpp b/test/CodeGenCXX/microsoft-abi-typeid.cpp
index 60c31ab4706c1..d73f8483a7139 100644
--- a/test/CodeGenCXX/microsoft-abi-typeid.cpp
+++ b/test/CodeGenCXX/microsoft-abi-typeid.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -emit-llvm -O1 -o - -triple=i386-pc-win32 %s | FileCheck %s
+// RUN: %clang_cc1 -emit-llvm -O1 -o - -triple=i386-pc-win32 %s -fexceptions -fcxx-exceptions | FileCheck %s
 
 struct type_info;
 namespace std { using ::type_info; }
@@ -49,3 +49,22 @@ const std::type_info* test5_typeid() { return &typeid(v); }
 // CHECK:        [[RT:%.*]] = tail call i8* @__RTtypeid(i8* bitcast (%struct.V* @"\01?v@@3UV@@A" to i8*))
 // CHECK-NEXT:   [[RET:%.*]] = bitcast i8* [[RT]] to %struct.type_info*
 // CHECK-NEXT:   ret %struct.type_info* [[RET]]
+
+namespace PR26329 {
+struct Polymorphic {
+  virtual ~Polymorphic();
+};
+
+void f(const Polymorphic &poly) {
+  try {
+    throw;
+  } catch (...) {
+    Polymorphic cleanup;
+    typeid(poly);
+  }
+}
+// CHECK-LABEL: define void @"\01?f@PR26329@@YAXABUPolymorphic@1@@Z"(
+// CHECK: %[[cs:.*]] = catchswitch within none [label %{{.*}}] unwind to caller
+// CHECK: %[[cp:.*]] = catchpad within %[[cs]] [i8* null, i32 64, i8* null]
+// CHECK: invoke i8* @__RTtypeid(i8* {{.*}}) [ "funclet"(token %[[cp]]) ]
+}
diff --git a/test/CodeGenCXX/microsoft-abi-vbtables.cpp b/test/CodeGenCXX/microsoft-abi-vbtables.cpp
index 9cce6f8698c8f..df0689423872c 100644
--- a/test/CodeGenCXX/microsoft-abi-vbtables.cpp
+++ b/test/CodeGenCXX/microsoft-abi-vbtables.cpp
@@ -537,5 +537,5 @@ template <class> struct B : virtual A {
 
 extern template class B<int>;
 template B<int>::B();
-// CHECK-DAG: @"\01??_8?$B@H@Test30@@7B@" = external unnamed_addr constant [2 x i32]{{$}}
+// CHECK-DAG: @"\01??_8?$B@H@Test30@@7B@" = linkonce_odr unnamed_addr constant [2 x i32] [i32 0, i32 4], comdat
 }
diff --git a/test/CodeGenCXX/microsoft-abi-vftables.cpp b/test/CodeGenCXX/microsoft-abi-vftables.cpp
index 340675b188d50..0c9b58bbb4d40 100644
--- a/test/CodeGenCXX/microsoft-abi-vftables.cpp
+++ b/test/CodeGenCXX/microsoft-abi-vftables.cpp
@@ -17,9 +17,10 @@ struct __declspec(dllimport) U {
   virtual ~U();
 } u;
 
-// RTTI-DAG: @"\01??_7U@@6B@" = available_externally dllimport unnamed_addr constant [1 x i8*] [i8* bitcast ({{.*}} @"\01??_GU@@UAEPAXI@Z" to i8*)]
+// RTTI-DAG: [[VTABLE_U:@.*]] = private unnamed_addr constant [2 x i8*] [i8* bitcast ({{.*}} @"\01??_R4U@@6B@" to i8*), i8* bitcast ({{.*}} @"\01??_GU@@UAEPAXI@Z" to i8*)]
+// RTTI-DAG: @"\01??_SU@@6B@" = unnamed_addr alias i8*, getelementptr inbounds ([2 x i8*], [2 x i8*]* [[VTABLE_U]], i32 0, i32 1)
 
-// NO-RTTI-DAG: @"\01??_7U@@6B@" = available_externally dllimport unnamed_addr constant [1 x i8*] [i8* bitcast ({{.*}} @"\01??_GU@@UAEPAXI@Z" to i8*)]
+// NO-RTTI-DAG: @"\01??_SU@@6B@" = linkonce_odr unnamed_addr constant [1 x i8*] [i8* bitcast ({{.*}} @"\01??_GU@@UAEPAXI@Z" to i8*)]
 
 struct __declspec(dllexport) V {
   virtual ~V();
@@ -32,7 +33,7 @@ struct __declspec(dllexport) V {
 
 namespace {
 struct W {
-  virtual ~W();
+  virtual ~W() {}
 } w;
 }
 // RTTI-DAG: [[VTABLE_W:@.*]] = private unnamed_addr constant [2 x i8*] [i8* bitcast ({{.*}} @"\01??_R4W@?A@@6B@" to i8*), i8* bitcast ({{.*}} @"\01??_GW@?A@@UAEPAXI@Z" to i8*)]
@@ -48,5 +49,7 @@ template <class> struct Y : virtual X {
 
 extern template class Y<int>;
 template Y<int>::Y();
-// RTTI-DAG: @"\01??_7?$Y@H@@6B@" = external unnamed_addr constant [1 x i8*]
-// NO-RTTI-DAG: @"\01??_7?$Y@H@@6B@" = external unnamed_addr constant [1 x i8*]
+// RTTI-DAG: [[VTABLE_Y:@.*]] = private unnamed_addr constant [2 x i8*] [i8* bitcast (%rtti.CompleteObjectLocator* @"\01??_R4?$Y@H@@6B@" to i8*), i8* bitcast (i8* (%struct.Y*, i32)* @"\01??_G?$Y@H@@UAEPAXI@Z" to i8*)], comdat($"\01??_7?$Y@H@@6B@")
+// RTTI-DAG: @"\01??_7?$Y@H@@6B@" = unnamed_addr alias i8*, getelementptr inbounds ([2 x i8*], [2 x i8*]* [[VTABLE_Y]], i32 0, i32 1)
+
+// NO-RTTI-DAG: @"\01??_7?$Y@H@@6B@" = linkonce_odr unnamed_addr constant [1 x i8*] [i8* bitcast (i8* (%struct.Y*, i32)* @"\01??_G?$Y@H@@UAEPAXI@Z" to i8*)], comdat
diff --git a/test/CodeGenCXX/microsoft-abi-virtual-inheritance.cpp b/test/CodeGenCXX/microsoft-abi-virtual-inheritance.cpp
index 8897a384430c8..480ae8cfbbec0 100644
--- a/test/CodeGenCXX/microsoft-abi-virtual-inheritance.cpp
+++ b/test/CodeGenCXX/microsoft-abi-virtual-inheritance.cpp
@@ -481,3 +481,43 @@ C::C() : B() {}
 // CHECK:   %[[FIELD:.*]] = getelementptr inbounds i8, i8* %[[B_i8]], i32 4
 // CHECK:   call void @llvm.memset.p0i8.i32(i8* %[[FIELD]], i8 0, i32 4, i32 4, i1 false)
 }
+
+namespace pr27621 {
+// Devirtualization through a static_cast used to make us compute the 'this'
+// adjustment for B::g instead of C::g. When we directly call C::g, 'this' is a
+// B*, and the prologue of C::g will adjust it to a C*.
+struct A { virtual void f(); };
+struct B { virtual void g(); };
+struct C final : A, B {
+  virtual void h();
+  void g() override;
+};
+void callit(C *p) {
+  static_cast<B*>(p)->g();
+}
+// CHECK-LABEL: define void @"\01?callit@pr27621@@YAXPAUC@1@@Z"(%"struct.pr27621::C"* %{{.*}})
+// CHECK: %[[B_i8:.*]] = getelementptr i8, i8* %{{.*}}, i32 4
+// CHECK: call x86_thiscallcc void @"\01?g@C@pr27621@@UAEXXZ"(i8* %[[B_i8]])
+}
+
+namespace test6 {
+class A {};
+class B : virtual A {};
+class C : virtual B {
+  virtual void m_fn1();
+  float field;
+};
+class D : C {
+  D();
+};
+D::D() : C() {}
+// CHECK-LABEL: define x86_thiscallcc %"class.test6::D"* @"\01??0D@test6@@AAE@XZ"(
+// CHECK:   %[[THIS:.*]] = load %"class.test6::D"*, %"class.test6::D"**
+// CHECK:   br i1 %{{.*}}, label %[[INIT_VBASES:.*]], label %[[SKIP_VBASES:.*]]
+
+// CHECK: %[[SKIP_VBASES]]
+// CHECK:   %[[C:.*]] = bitcast %"class.test6::D"* %[[THIS]] to %"class.test6::C"*
+// CHECK:   %[[C_i8:.*]] = bitcast %"class.test6::C"* %[[C]] to i8*
+// CHECK:   %[[FIELD:.*]] = getelementptr inbounds i8, i8* %[[C_i8]], i32 8
+// CHECK:   call void @llvm.memset.p0i8.i32(i8* %[[FIELD]], i8 0, i32 4, i32 4, i1 false)
+}
diff --git a/test/CodeGenCXX/microsoft-interface.cpp b/test/CodeGenCXX/microsoft-interface.cpp
index a2dfb6926d81e..8f4670a6946a9 100644
--- a/test/CodeGenCXX/microsoft-interface.cpp
+++ b/test/CodeGenCXX/microsoft-interface.cpp
@@ -31,10 +31,10 @@ int fn() {
 
 // CHECK-LABEL: define linkonce_odr x86_thiscallcc void @_ZN1SC2Ev(%struct.S* %this)
 // CHECK:   call x86_thiscallcc void @_ZN1IC2Ev(%__interface.I* %{{[.0-9A-Z_a-z]+}})
-// CHECK:   store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTV1S, i64 0, i64 2) to i32 (...)**), i32 (...)*** %{{[.0-9A-Z_a-z]+}}
+// CHECK:   store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTV1S, i32 0, i32 2) to i32 (...)**), i32 (...)*** %{{[.0-9A-Z_a-z]+}}
 
 // CHECK-LABEL: define linkonce_odr x86_thiscallcc void @_ZN1IC2Ev(%__interface.I* %this)
-// CHECK:   store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTV1I, i64 0, i64 2) to i32 (...)**), i32 (...)*** %{{[.0-9A-Z_a-z]+}}
+// CHECK:   store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTV1I, i32 0, i32 2) to i32 (...)**), i32 (...)*** %{{[.0-9A-Z_a-z]+}}
 
 // CHECK-LABEL: define linkonce_odr x86_thiscallcc i32 @_ZN1I4testEv(%__interface.I* %this)
 // CHECK:   ret i32 1
diff --git a/test/CodeGenCXX/microsoft-templ-uuidof.cpp b/test/CodeGenCXX/microsoft-templ-uuidof.cpp
index 0ee3908638378..74d6069bbacf3 100644
--- a/test/CodeGenCXX/microsoft-templ-uuidof.cpp
+++ b/test/CodeGenCXX/microsoft-templ-uuidof.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -emit-llvm %s -o - -DDEFINE_GUID -triple=i386-pc-win32 -fms-extensions | FileCheck %s --check-prefix=CHECK
+// RUN: %clang_cc1 -emit-llvm %s -o - -DDEFINE_GUID -triple=i386-pc-win32 -fms-extensions | FileCheck %s
 
 struct _GUID;
 
diff --git a/test/CodeGenCXX/microsoft-uuidof.cpp b/test/CodeGenCXX/microsoft-uuidof.cpp
index 2ac5f1b079990..62e4b880ad46e 100644
--- a/test/CodeGenCXX/microsoft-uuidof.cpp
+++ b/test/CodeGenCXX/microsoft-uuidof.cpp
@@ -1,5 +1,6 @@
 // RUN: %clang_cc1 -emit-llvm %s -o - -DDEFINE_GUID -triple=i386-pc-linux -fms-extensions | FileCheck %s --check-prefix=CHECK-DEFINE-GUID
 // RUN: %clang_cc1 -emit-llvm %s -o - -triple=i386-pc-linux -fms-extensions | FileCheck %s
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=x86_64-pc-linux -fms-extensions | FileCheck %s --check-prefix=CHECK-64
 // RUN: %clang_cc1 -emit-llvm %s -o - -DDEFINE_GUID -DWRONG_GUID -triple=i386-pc-linux -fms-extensions | FileCheck %s --check-prefix=CHECK-DEFINE-WRONG-GUID
 
 #ifdef DEFINE_GUID
@@ -36,6 +37,7 @@ GUID g = __uuidof(S1);
 // First global use of __uuidof(S1) forces the creation of the global.
 // CHECK: @_GUID_12345678_1234_1234_1234_1234567890ab = linkonce_odr constant { i32, i16, i16, [8 x i8] } { i32 305419896, i16 4660, i16 4660, [8 x i8] c"\124\124Vx\90\AB" }, comdat
 // CHECK: @gr = constant %struct._GUID* bitcast ({ i32, i16, i16, [8 x i8] }* @_GUID_12345678_1234_1234_1234_1234567890ab to %struct._GUID*), align 4
+// CHECK-64: @gr = constant %struct._GUID* bitcast ({ i32, i16, i16, [8 x i8] }* @_GUID_12345678_1234_1234_1234_1234567890ab to %struct._GUID*), align 8
 const GUID& gr = __uuidof(S1);
 
 // CHECK: @gp = global %struct._GUID* bitcast ({ i32, i16, i16, [8 x i8] }* @_GUID_12345678_1234_1234_1234_1234567890ab to %struct._GUID*), align 4
diff --git a/test/CodeGenCXX/mips-size_t-ptrdiff_t.cpp b/test/CodeGenCXX/mips-size_t-ptrdiff_t.cpp
index 1ff01820bb1b3..5b245a4aaccda 100644
--- a/test/CodeGenCXX/mips-size_t-ptrdiff_t.cpp
+++ b/test/CodeGenCXX/mips-size_t-ptrdiff_t.cpp
@@ -10,13 +10,13 @@ long *alloc_long() {
   return rv;
 }
 // O32-LABEL: define i32* @_Z10alloc_longv()
-// O32: call noalias i8* @_Znwj(i32 signext 4)
+// O32: call i8* @_Znwj(i32 signext 4)
 
 // N32-LABEL: define i32* @_Z10alloc_longv()
-// N32: call noalias i8* @_Znwj(i32 signext 4)
+// N32: call i8* @_Znwj(i32 signext 4)
 
 // N64-LABEL: define i64* @_Z10alloc_longv()
-// N64: call noalias i8* @_Znwm(i64 zeroext 8)
+// N64: call i8* @_Znwm(i64 zeroext 8)
 
 long *alloc_long_array() {
   long *rv = new long[2];
@@ -24,13 +24,13 @@ long *alloc_long_array() {
 }
 
 // O32-LABEL: define i32* @_Z16alloc_long_arrayv()
-// O32: call noalias i8* @_Znaj(i32 signext 8)
+// O32: call i8* @_Znaj(i32 signext 8)
 
 // N32-LABEL: define i32* @_Z16alloc_long_arrayv()
-// N32: call noalias i8* @_Znaj(i32 signext 8)
+// N32: call i8* @_Znaj(i32 signext 8)
 
 // N64-LABEL: define i64* @_Z16alloc_long_arrayv()
-// N64: call noalias i8* @_Znam(i64 zeroext 16)
+// N64: call i8* @_Znam(i64 zeroext 16)
 
 #include <stddef.h>
 
diff --git a/test/CodeGenCXX/multi-dim-operator-new.cpp b/test/CodeGenCXX/multi-dim-operator-new.cpp
index 7a235e83a78da..0dfcffb6aa40d 100644
--- a/test/CodeGenCXX/multi-dim-operator-new.cpp
+++ b/test/CodeGenCXX/multi-dim-operator-new.cpp
@@ -43,7 +43,7 @@ int main() {
  return 0;
 }
 
-// CHECK: call noalias i8* @_Znam
-// CHECK: call noalias i8* @_Znam
-// CHECK: call noalias i8* @_Znam
+// CHECK: call i8* @_Znam
+// CHECK: call i8* @_Znam
+// CHECK: call i8* @_Znam
 
diff --git a/test/CodeGenCXX/new-alias.cpp b/test/CodeGenCXX/new-alias.cpp
index 4afd942e3cdbf..b21638abbfb6c 100644
--- a/test/CodeGenCXX/new-alias.cpp
+++ b/test/CodeGenCXX/new-alias.cpp
@@ -9,5 +9,5 @@ extern "C" char *something(long long x) {
 void *operator new(size_t) __attribute__((alias("something")));
 
 // PR16715: don't assert here.
-// CHECK: call noalias i8* @_Znwm(i64 4){{$}}
+// CHECK: call i8* @_Znwm(i64 4){{$}}
 int *pr16715 = new int;
diff --git a/test/CodeGenCXX/new-array-init.cpp b/test/CodeGenCXX/new-array-init.cpp
index 6b76f471a22b4..602f93c34f012 100644
--- a/test/CodeGenCXX/new-array-init.cpp
+++ b/test/CodeGenCXX/new-array-init.cpp
@@ -14,7 +14,7 @@ void fn(int n) {
 // CHECK-LABEL: define void @_Z15const_underflowv
 void const_underflow() {
   // CHECK-NOT: icmp ult i{{32|64}} %{{[^ ]+}}, 3
-  // CHECK: call noalias i8* @_Zna{{.}}(i{{32|64}} -1)
+  // CHECK: call i8* @_Zna{{.}}(i{{32|64}} -1)
   new int[2] { 1, 2, 3 };
 }
 
@@ -37,7 +37,7 @@ void check_array_value_init() {
   struct S;
   new (int S::*[3][4][5]) ();
 
-  // CHECK: call noalias i8* @_Zna{{.}}(i{{32 240|64 480}})
+  // CHECK: call i8* @_Zna{{.}}(i{{32 240|64 480}})
   // CHECK: getelementptr inbounds i{{32|64}}, i{{32|64}}* {{.*}}, i{{32|64}} 60
 
   // CHECK: phi
diff --git a/test/CodeGenCXX/new-overflow.cpp b/test/CodeGenCXX/new-overflow.cpp
index 9057e049b700e..0c4c3c823d196 100644
--- a/test/CodeGenCXX/new-overflow.cpp
+++ b/test/CodeGenCXX/new-overflow.cpp
@@ -17,7 +17,7 @@ namespace test0 {
   // CHECK-NEXT: [[T1:%.*]] = extractvalue { i32, i1 } [[T0]], 1
   // CHECK-NEXT: [[T2:%.*]] = extractvalue { i32, i1 } [[T0]], 0
   // CHECK-NEXT: [[T3:%.*]] = select i1 [[T1]], i32 -1, i32 [[T2]]
-  // CHECK-NEXT: call noalias i8* @_Znaj(i32 [[T3]])
+  // CHECK-NEXT: call i8* @_Znaj(i32 [[T3]])
   // CHECK:      getelementptr inbounds {{.*}}, i32 [[N]]
   elt *test(short s) {
     return new elt[s];
@@ -40,7 +40,7 @@ namespace test1 {
   // CHECK-NEXT: [[T2:%.*]] = extractvalue { i32, i1 } [[T0]], 0
   // CHECK-NEXT: [[T3:%.*]] = mul i32 [[N]], 100
   // CHECK-NEXT: [[T4:%.*]] = select i1 [[T1]], i32 -1, i32 [[T2]]
-  // CHECK-NEXT: call noalias i8* @_Znaj(i32 [[T4]])
+  // CHECK-NEXT: call i8* @_Znaj(i32 [[T4]])
   // CHECK:      getelementptr inbounds {{.*}}, i32 [[T3]]
   elt *test(short s) {
     return new elt[s];
@@ -68,7 +68,7 @@ namespace test2 {
   // CHECK-NEXT: [[T6:%.*]] = or i1 [[T1]], [[T5]]
   // CHECK-NEXT: [[T7:%.*]] = extractvalue { i32, i1 } [[T4]], 0
   // CHECK-NEXT: [[T8:%.*]] = select i1 [[T6]], i32 -1, i32 [[T7]]
-  // CHECK-NEXT: call noalias i8* @_Znaj(i32 [[T8]])
+  // CHECK-NEXT: call i8* @_Znaj(i32 [[T8]])
   // CHECK:      getelementptr inbounds {{.*}}, i32 [[T3]]
   elt *test(short s) {
     return new elt[s];
@@ -87,7 +87,7 @@ namespace test4 {
   // CHECK:      [[N:%.*]] = sext i16 {{%.*}} to i32
   // CHECK-NEXT: [[T0:%.*]] = icmp slt i32 [[N]], 0
   // CHECK-NEXT: [[T1:%.*]] = select i1 [[T0]], i32 -1, i32 [[N]]
-  // CHECK-NEXT: call noalias i8* @_Znaj(i32 [[T1]])
+  // CHECK-NEXT: call i8* @_Znaj(i32 [[T1]])
   // CHECK:      getelementptr inbounds {{.*}}, i32 [[N]]
   elt *test(short s) {
     return new elt[s];
@@ -106,7 +106,7 @@ namespace test5 {
   // CHECK:      [[N:%.*]] = load i32, i32*
   // CHECK-NEXT: [[T0:%.*]] = icmp slt i32 [[N]], 0
   // CHECK-NEXT: [[T1:%.*]] = select i1 [[T0]], i32 -1, i32 [[N]]
-  // CHECK-NEXT: call noalias i8* @_Znaj(i32 [[T1]])
+  // CHECK-NEXT: call i8* @_Znaj(i32 [[T1]])
   // CHECK:      getelementptr inbounds {{.*}}, i32 [[N]]
   elt *test(int s) {
     return new elt[s];
@@ -128,7 +128,7 @@ namespace test6 {
   // CHECK-NEXT: [[T1:%.*]] = extractvalue { i32, i1 } [[T0]], 1
   // CHECK-NEXT: [[T2:%.*]] = extractvalue { i32, i1 } [[T0]], 0
   // CHECK-NEXT: [[T3:%.*]] = select i1 [[T1]], i32 -1, i32 [[T2]]
-  // CHECK-NEXT: call noalias i8* @_Znaj(i32 [[T3]])
+  // CHECK-NEXT: call i8* @_Znaj(i32 [[T3]])
   // CHECK:      getelementptr inbounds {{.*}}, i32 [[N]]
   elt *test(unsigned short s) {
     return new elt[s];
@@ -151,7 +151,7 @@ namespace test7 {
   // CHECK-NEXT: [[T2:%.*]] = extractvalue { i32, i1 } [[T0]], 0
   // CHECK-NEXT: [[T3:%.*]] = mul i32 [[N]], 100
   // CHECK-NEXT: [[T4:%.*]] = select i1 [[T1]], i32 -1, i32 [[T2]]
-  // CHECK-NEXT: call noalias i8* @_Znaj(i32 [[T4]])
+  // CHECK-NEXT: call i8* @_Znaj(i32 [[T4]])
   // CHECK:      getelementptr inbounds {{.*}}, i32 [[T3]]
   elt *test(unsigned short s) {
     return new elt[s];
@@ -176,7 +176,7 @@ namespace test8 {
   // CHECK-NEXT: [[T4:%.*]] = or i1 [[T0]], [[T3]]
   // CHECK-NEXT: [[T5:%.*]] = extractvalue { i32, i1 } [[T2]], 0
   // CHECK-NEXT: [[T6:%.*]] = select i1 [[T4]], i32 -1, i32 [[T5]]
-  // CHECK-NEXT: call noalias i8* @_Znaj(i32 [[T6]])
+  // CHECK-NEXT: call i8* @_Znaj(i32 [[T6]])
   // CHECK:      getelementptr inbounds {{.*}}, i32 [[T1]]
   elt *test(long long s) {
     return new elt[s];
@@ -201,7 +201,7 @@ namespace test9 {
   // CHECK-NEXT: [[T4:%.*]] = or i1 [[T0]], [[T3]]
   // CHECK-NEXT: [[T5:%.*]] = extractvalue { i32, i1 } [[T2]], 0
   // CHECK-NEXT: [[T6:%.*]] = select i1 [[T4]], i32 -1, i32 [[T5]]
-  // CHECK-NEXT: call noalias i8* @_Znaj(i32 [[T6]])
+  // CHECK-NEXT: call i8* @_Znaj(i32 [[T6]])
   // CHECK:      getelementptr inbounds {{.*}}, i32 [[T1]]
   elt *test(unsigned long long s) {
     return new elt[s];
diff --git a/test/CodeGenCXX/new.cpp b/test/CodeGenCXX/new.cpp
index 6d6f70138616e..ae2ec1505c5da 100644
--- a/test/CodeGenCXX/new.cpp
+++ b/test/CodeGenCXX/new.cpp
@@ -127,15 +127,15 @@ struct B { int a; };
 struct Bmemptr { int Bmemptr::* memptr; int a; };
 
 void t11(int n) {
-  // CHECK: call noalias i8* @_Znwm
+  // CHECK: call i8* @_Znwm
   // CHECK: call void @llvm.memset.p0i8.i64(
   B* b = new B();
 
-  // CHECK: call noalias i8* @_Znam
+  // CHECK: call i8* @_Znam
   // CHECK: {{call void.*llvm.memset.p0i8.i64.*i8 0, i64 %}}
   B *b2 = new B[n]();
 
-  // CHECK: call noalias i8* @_Znam
+  // CHECK: call i8* @_Znam
   // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64
   // CHECK: br
   Bmemptr *b_memptr = new Bmemptr[n]();
@@ -148,11 +148,11 @@ struct Empty { };
 // We don't need to initialize an empty class.
 // CHECK-LABEL: define void @_Z3t12v
 void t12() {
-  // CHECK: call noalias i8* @_Znam
+  // CHECK: call i8* @_Znam
   // CHECK-NOT: br
   (void)new Empty[10];
 
-  // CHECK: call noalias i8* @_Znam
+  // CHECK: call i8* @_Znam
   // CHECK-NOT: br
   (void)new Empty[10]();
 
@@ -162,11 +162,11 @@ void t12() {
 // Zero-initialization
 // CHECK-LABEL: define void @_Z3t13i
 void t13(int n) {
-  // CHECK: call noalias i8* @_Znwm
+  // CHECK: call i8* @_Znwm
   // CHECK: store i32 0, i32*
   (void)new int();
 
-  // CHECK: call noalias i8* @_Znam
+  // CHECK: call i8* @_Znam
   // CHECK: {{call void.*llvm.memset.p0i8.i64.*i8 0, i64 %}}
   (void)new int[n]();
 
@@ -186,7 +186,7 @@ void f() {
   // CHECK: call void @_ZN5AllocD1Ev(
   // CHECK: call void @_ZN5AllocdaEPv(i8*
   delete[] new Alloc[10][20];
-  // CHECK: call noalias i8* @_Znwm
+  // CHECK: call i8* @_Znwm
   // CHECK: call void @_ZdlPv(i8*
   delete new bool;
   // CHECK: ret void
@@ -274,7 +274,7 @@ namespace PR10197 {
   // CHECK-LABEL: define weak_odr void @_ZN7PR101971fIiEEvv()
   template<typename T>
   void f() {
-    // CHECK: [[CALL:%.*]] = call noalias i8* @_Znwm
+    // CHECK: [[CALL:%.*]] = call i8* @_Znwm
     // CHECK-NEXT: [[CASTED:%.*]] = bitcast i8* [[CALL]] to 
     new T;
     // CHECK-NEXT: ret void
@@ -296,7 +296,7 @@ namespace PR11757 {
   struct X { X(); X(const X&); };
   X* a(X* x) { return new X(X()); }
   // CHECK: define {{.*}} @_ZN7PR117571aEPNS_1XE
-  // CHECK: [[CALL:%.*]] = call noalias i8* @_Znwm
+  // CHECK: [[CALL:%.*]] = call i8* @_Znwm
   // CHECK-NEXT: [[CASTED:%.*]] = bitcast i8* [[CALL]] to
   // CHECK-NEXT: call void @_ZN7PR117571XC1Ev({{.*}}* [[CASTED]])
   // CHECK-NEXT: ret {{.*}} [[CASTED]]
@@ -306,7 +306,7 @@ namespace PR13380 {
   struct A { A() {} };
   struct B : public A { int x; };
   // CHECK-LABEL: define i8* @_ZN7PR133801fEv
-  // CHECK: call noalias i8* @_Znam(
+  // CHECK: call i8* @_Znam(
   // CHECK: call void @llvm.memset.p0i8
   // CHECK-NEXT: call void @_ZN7PR133801BC1Ev
   void* f() { return new B[2](); }
@@ -320,12 +320,12 @@ namespace N3664 {
 
   // CHECK-LABEL: define void @_ZN5N36641fEv
   void f() {
-    // CHECK: call noalias i8* @_Znwm(i64 4) [[ATTR_BUILTIN_NEW:#[^ ]*]]
+    // CHECK: call i8* @_Znwm(i64 4) [[ATTR_BUILTIN_NEW:#[^ ]*]]
     int *p = new int; // expected-note {{allocated with 'new' here}}
     // CHECK: call void @_ZdlPv({{.*}}) [[ATTR_BUILTIN_DELETE:#[^ ]*]]
     delete p;
 
-    // CHECK: call noalias i8* @_Znam(i64 12) [[ATTR_BUILTIN_NEW]]
+    // CHECK: call i8* @_Znam(i64 12) [[ATTR_BUILTIN_NEW]]
     int *q = new int[3];
     // CHECK: call void @_ZdaPv({{.*}}) [[ATTR_BUILTIN_DELETE]]
     delete[] p; // expected-warning {{'delete[]' applied to a pointer that was allocated with 'new'; did you mean 'delete'?}}
@@ -337,19 +337,18 @@ namespace N3664 {
     (void) new (mpt) int;
   }
 
-  // FIXME: Can we mark this noalias?
-  // CHECK: declare i8* @_ZnamRKSt9nothrow_t(i64, {{.*}}) [[ATTR_NOBUILTIN_NOUNWIND]]
+  // CHECK: declare noalias i8* @_ZnamRKSt9nothrow_t(i64, {{.*}}) [[ATTR_NOBUILTIN_NOUNWIND]]
 
   // CHECK-LABEL: define void @_ZN5N36641gEv
   void g() {
     // It's OK for there to be attributes here, so long as we don't have a
     // 'builtin' attribute.
-    // CHECK: call noalias i8* @_Znwm(i64 4){{$}}
+    // CHECK: call i8* @_Znwm(i64 4){{$}}
     int *p = (int*)operator new(4);
     // CHECK: call void @_ZdlPv({{.*}}) [[ATTR_NOUNWIND:#[^ ]*]]
     operator delete(p);
 
-    // CHECK: call noalias i8* @_Znam(i64 12){{$}}
+    // CHECK: call i8* @_Znam(i64 12){{$}}
     int *q = (int*)operator new[](12);
     // CHECK: call void @_ZdaPv({{.*}}) [[ATTR_NOUNWIND]]
     operator delete [](p);
@@ -362,7 +361,7 @@ namespace N3664 {
 namespace builtins {
   // CHECK-LABEL: define void @_ZN8builtins1fEv
   void f() {
-    // CHECK: call noalias i8* @_Znwm(i64 4) [[ATTR_BUILTIN_NEW]]
+    // CHECK: call i8* @_Znwm(i64 4) [[ATTR_BUILTIN_NEW]]
     // CHECK: call void @_ZdlPv({{.*}}) [[ATTR_BUILTIN_DELETE]]
     __builtin_operator_delete(__builtin_operator_new(4));
   }
diff --git a/test/CodeGenCXX/operator-new.cpp b/test/CodeGenCXX/operator-new.cpp
index db56cda6cdd16..dc1c36d8eadb9 100644
--- a/test/CodeGenCXX/operator-new.cpp
+++ b/test/CodeGenCXX/operator-new.cpp
@@ -16,7 +16,6 @@ void f1() {
   new teste();
 }
 
-
 // rdar://5739832 - operator new should check for overflow in multiply.
 void *f2(long N) {
   return new int[N];
@@ -25,5 +24,8 @@ void *f2(long N) {
 // SANE-NEXT: [[OVER:%.*]] = extractvalue {{.*}} [[UWO]], 1
 // SANE-NEXT: [[SUM:%.*]] = extractvalue {{.*}} [[UWO]], 0
 // SANE-NEXT: [[RESULT:%.*]] = select i1 [[OVER]], i32 -1, i32 [[SUM]]
-// SANE-NEXT: call noalias i8* @_Znaj(i32 [[RESULT]])
+// SANE-NEXT: call i8* @_Znaj(i32 [[RESULT]])
 }
+
+// SANE: declare noalias i8* @_Znaj(
+// SANENOT: declare i8* @_Znaj(
diff --git a/test/CodeGenCXX/optnone-and-attributes.cpp b/test/CodeGenCXX/optnone-and-attributes.cpp
index 56173b5a36018..870d5e9496f04 100644
--- a/test/CodeGenCXX/optnone-and-attributes.cpp
+++ b/test/CodeGenCXX/optnone-and-attributes.cpp
@@ -79,4 +79,4 @@ int exported_optnone_func(int a) {
 // CHECK: attributes [[NORETURN]] = { noinline noreturn {{.*}} optnone
 
 // CHECK: attributes [[DLLIMPORT]] =
-// CHECK-SAME-NOT: optnone
+// CHECK-NOT: optnone
diff --git a/test/CodeGenCXX/optnone-class-members.cpp b/test/CodeGenCXX/optnone-class-members.cpp
index 147b821967669..751f3dd2bf08c 100644
--- a/test/CodeGenCXX/optnone-class-members.cpp
+++ b/test/CodeGenCXX/optnone-class-members.cpp
@@ -159,6 +159,6 @@ int bar() {
 
 
 // CHECK: attributes [[NORMAL]] =
-// CHECK-SAME-NOT: noinline
-// CHECK-SAME-NOT: optnone
+// CHECK-NOT: noinline
+// CHECK-NOT: optnone
 // CHECK: attributes [[OPTNONE]] = {{.*}} noinline {{.*}} optnone
diff --git a/test/CodeGenCXX/optnone-def-decl.cpp b/test/CodeGenCXX/optnone-def-decl.cpp
index cb3a67793647d..0240189c50891 100644
--- a/test/CodeGenCXX/optnone-def-decl.cpp
+++ b/test/CodeGenCXX/optnone-def-decl.cpp
@@ -91,5 +91,5 @@ int user_of_forceinline_optnone_function() {
 
 // CHECK: attributes [[OPTNONE]] = { noinline nounwind optnone {{.*}} }
 // CHECK: attributes [[NORMAL]] =
-// CHECK-SAME-NOT: noinline
-// CHECK-SAME-NOT: optnone
+// CHECK-NOT: noinline
+// CHECK-NOT: optnone
diff --git a/test/CodeGenCXX/optnone-templates.cpp b/test/CodeGenCXX/optnone-templates.cpp
index 45a72b3b1766a..9f97d832c1e25 100644
--- a/test/CodeGenCXX/optnone-templates.cpp
+++ b/test/CodeGenCXX/optnone-templates.cpp
@@ -100,5 +100,5 @@ void container3()
 
 
 // CHECK: attributes [[NORMAL]] =
-// CHECK-SAME-NOT: optnone
+// CHECK-NOT: optnone
 // CHECK: attributes [[OPTNONE]] = {{.*}} optnone
diff --git a/test/CodeGenCXX/pass-object-size.cpp b/test/CodeGenCXX/pass-object-size.cpp
index 2c7f9742a8e67..7fd8b599aaa35 100644
--- a/test/CodeGenCXX/pass-object-size.cpp
+++ b/test/CodeGenCXX/pass-object-size.cpp
@@ -43,3 +43,40 @@ void Test() {
   (&OvlFoo)(nullptr);
 }
 }
+
+namespace delegate {
+  struct A {
+    A(void *const p __attribute__((pass_object_size(0))));
+  };
+  A::A(void *const p __attribute__((pass_object_size(0)))) {}
+  // Ensure that we forward the size through a delegating constructor call.
+  // CHECK: define void @_ZN8delegate1AC1EPvU17pass_object_size0({{[^,]*}}, i8*{{[^,]*}}, i64{{[^,]*}})
+  // CHECK: call void @_ZN8delegate1AC2EPvU17pass_object_size0({{[^,]*}}, i8*{{[^,]*}}, i64{{[^,]*}})
+}
+
+namespace variadic {
+// We had an issue where variadic member/operator calls with pass_object_size
+// would cause crashes.
+
+struct AsCtor {
+  AsCtor(const char *const c __attribute__((pass_object_size(0))), double a,
+         ...) {}
+};
+
+struct AsMember {
+  void bar(const char *const c __attribute__((pass_object_size(0))), double a,
+           ...) {}
+  void operator()(const char *const c __attribute__((pass_object_size(0))),
+                  double a, ...) {}
+};
+
+// CHECK-LABEL: define void @_ZN8variadic4testEv()
+void test() {
+  // CHECK-RE: call{{[^@]+}}@_ZN8variadic6AsCtorC1EPKcU17pass_object_size0dz
+  AsCtor("a", 1.0);
+  // CHECK-RE: call{{[^@]+}}@_ZN8variadic8AsMember3barEPKcU17pass_object_size0dz
+  AsMember{}.bar("a", 1.0);
+  // CHECK-RE: call{{[^@]+}}@_ZN8variadic8AsMemberclEPKcU17pass_object_size0dz
+  AsMember{}("a", 1.0);
+}
+}
diff --git a/test/CodeGenCXX/pr20719.cpp b/test/CodeGenCXX/pr20719.cpp
index 208d111358471..1c3b21b0b84e3 100644
--- a/test/CodeGenCXX/pr20719.cpp
+++ b/test/CodeGenCXX/pr20719.cpp
@@ -2,8 +2,8 @@
 
 // Make sure that we emit H's constructor twice: once with the first lambda
 // inside of 'lep' and again with the second lambda inside of 'lep'.
-// CHECK-DAG: @"\01??0?$H@V<lambda_1>@??$lep@X@@YAXXZ@@@QAE@XZ"
-// CHECK-DAG: @"\01??0?$H@V<lambda_2>@??$lep@X@@YAXXZ@@@QAE@XZ"
+// CHECK-DAG: @"\01??0?$H@V<lambda_1>@?0???$lep@X@@YAXXZ@@@QAE@XZ"
+// CHECK-DAG: @"\01??0?$H@V<lambda_2>@?0???$lep@X@@YAXXZ@@@QAE@XZ"
 
 template <typename>
 struct H {
diff --git a/test/CodeGenCXX/pr27030.cpp b/test/CodeGenCXX/pr27030.cpp
new file mode 100644
index 0000000000000..5c24051d7a2b0
--- /dev/null
+++ b/test/CodeGenCXX/pr27030.cpp
@@ -0,0 +1,16 @@
+// RUN: %clang_cc1 -emit-llvm -triple=i386-pc-win32 %s -o - | FileCheck %s
+struct A {};
+struct B : A {};
+extern "C" {
+extern int B::*a;
+void test1() { (int A::*)(a); }
+}
+// CHECK-LABEL: define void @test1(
+// CHECK: %[[load:.*]]       = load i32, i32* @a
+// CHECK: %[[memptr_cmp:.*]] = icmp ne i32 %[[load]], -1
+// CHECK: br i1 %[[memptr_cmp]]
+
+// CHECK: %[[adj:.*]] = sub nsw i32 %[[load]], 0
+// CHECK: %[[nv_adj:.*]] = select i1 true, i32 %[[adj]], i32 0
+
+// CHECK: %[[memptr_converted:.*]] = phi i32 [ -1, {{.*}} ], [ %[[nv_adj]], {{.*}} ]
diff --git a/test/CodeGenCXX/pr28360.cpp b/test/CodeGenCXX/pr28360.cpp
new file mode 100644
index 0000000000000..5d7e1ae0c1fde
--- /dev/null
+++ b/test/CodeGenCXX/pr28360.cpp
@@ -0,0 +1,16 @@
+// RUN: %clang_cc1 %s -emit-llvm -o - -triple i686-pc-win32 | FileCheck %s
+struct A {
+  void Foo();
+  void Foo(int);
+};
+
+using MpTy = void (A::*)();
+
+void Bar(const MpTy &);
+
+void Baz() { Bar(&A::Foo); }
+
+// CHECK-LABEL: define void @"\01?Baz@@YAXXZ"(
+// CHECK:  %[[ref_tmp:.*]] = alloca i8*, align 4
+// CHECK: store i8* bitcast (void (%struct.A*)* @"\01?Foo@A@@QAEXXZ" to i8*), i8** %[[ref_tmp]], align 4
+// CHECK: call void @"\01?Bar@@YAXABQ8A@@AEXXZ@Z"(i8** dereferenceable(4) %[[ref_tmp]])
diff --git a/test/CodeGenCXX/pragma-loop.cpp b/test/CodeGenCXX/pragma-loop.cpp
index b85e0b49b9df4..e337913646df3 100644
--- a/test/CodeGenCXX/pragma-loop.cpp
+++ b/test/CodeGenCXX/pragma-loop.cpp
@@ -9,6 +9,7 @@ void while_test(int *List, int Length) {
 #pragma clang loop interleave_count(4)
 #pragma clang loop vectorize_width(4)
 #pragma clang loop unroll(full)
+#pragma clang loop distribute(enable)
   while (i < Length) {
     // CHECK: br label {{.*}}, !llvm.loop ![[LOOP_1:.*]]
     List[i] = i * 2;
@@ -20,7 +21,7 @@ void while_test(int *List, int Length) {
 void do_test(int *List, int Length) {
   int i = 0;
 
-#pragma clang loop vectorize_width(8) interleave_count(4) unroll(disable)
+#pragma clang loop vectorize_width(8) interleave_count(4) unroll(disable) distribute(disable)
   do {
     // CHECK: br i1 {{.*}}, label {{.*}}, label {{.*}}, !llvm.loop ![[LOOP_2:.*]]
     List[i] = i * 2;
@@ -55,7 +56,7 @@ void for_range_test() {
 
 // Verify disable pragma clang loop directive generates correct metadata
 void disable_test(int *List, int Length) {
-#pragma clang loop vectorize(disable) unroll(disable)
+#pragma clang loop vectorize(disable) unroll(disable) distribute(disable)
   for (int i = 0; i < Length; i++) {
     // CHECK: br label {{.*}}, !llvm.loop ![[LOOP_5:.*]]
     List[i] = i * 2;
@@ -157,20 +158,22 @@ void template_test(double *List, int Length) {
   for_template_constant_expression_test<double, 2, 4, 8>(List, Length);
 }
 
-// CHECK: ![[LOOP_1]] = distinct !{![[LOOP_1]], ![[WIDTH_4:.*]], ![[INTERLEAVE_4:.*]], ![[INTENABLE_1:.*]], ![[UNROLL_FULL:.*]]}
+// CHECK: ![[LOOP_1]] = distinct !{![[LOOP_1]], ![[WIDTH_4:.*]], ![[INTERLEAVE_4:.*]], ![[INTENABLE_1:.*]], ![[UNROLL_FULL:.*]], ![[DISTRIBUTE_ENABLE:.*]]}
 // CHECK: ![[WIDTH_4]] = !{!"llvm.loop.vectorize.width", i32 4}
 // CHECK: ![[INTERLEAVE_4]] = !{!"llvm.loop.interleave.count", i32 4}
 // CHECK: ![[INTENABLE_1]] = !{!"llvm.loop.vectorize.enable", i1 true}
 // CHECK: ![[UNROLL_FULL]] = !{!"llvm.loop.unroll.full"}
-// CHECK: ![[LOOP_2]] = distinct !{![[LOOP_2:.*]], ![[WIDTH_8:.*]], ![[INTERLEAVE_4:.*]], ![[UNROLL_DISABLE:.*]]}
+// CHECK: ![[DISTRIBUTE_ENABLE]] = !{!"llvm.loop.distribute.enable", i1 true}
+// CHECK: ![[LOOP_2]] = distinct !{![[LOOP_2:.*]], ![[WIDTH_8:.*]], ![[INTERLEAVE_4:.*]], ![[UNROLL_DISABLE:.*]], ![[DISTRIBUTE_DISABLE:.*]]}
 // CHECK: ![[WIDTH_8]] = !{!"llvm.loop.vectorize.width", i32 8}
 // CHECK: ![[UNROLL_DISABLE]] = !{!"llvm.loop.unroll.disable"}
+// CHECK: ![[DISTRIBUTE_DISABLE]] = !{!"llvm.loop.distribute.enable", i1 false}
 // CHECK: ![[LOOP_3]] = distinct !{![[LOOP_3]], ![[INTERLEAVE_4:.*]], ![[UNROLL_8:.*]], ![[INTENABLE_1:.*]]}
 // CHECK: ![[UNROLL_8]] = !{!"llvm.loop.unroll.count", i32 8}
 // CHECK: ![[LOOP_4]] = distinct !{![[LOOP_4]], ![[WIDTH_2:.*]], ![[INTERLEAVE_2:.*]]}
 // CHECK: ![[WIDTH_2]] = !{!"llvm.loop.vectorize.width", i32 2}
 // CHECK: ![[INTERLEAVE_2]] = !{!"llvm.loop.interleave.count", i32 2}
-// CHECK: ![[LOOP_5]] = distinct !{![[LOOP_5]], ![[WIDTH_1:.*]], ![[UNROLL_DISABLE:.*]]}
+// CHECK: ![[LOOP_5]] = distinct !{![[LOOP_5]], ![[WIDTH_1:.*]], ![[UNROLL_DISABLE:.*]], ![[DISTRIBUTE_DISABLE:.*]]}
 // CHECK: ![[WIDTH_1]] = !{!"llvm.loop.vectorize.width", i32 1}
 // CHECK: ![[LOOP_6]] = distinct !{![[LOOP_6]], ![[WIDTH_2:.*]], ![[INTERLEAVE_2:.*]], ![[UNROLL_8:.*]]}
 // CHECK: ![[LOOP_7]] = distinct !{![[LOOP_7]], ![[WIDTH_5:.*]]}
diff --git a/test/CodeGenCXX/rtti-fundamental.cpp b/test/CodeGenCXX/rtti-fundamental.cpp
index e70c3aa365976..a0ad80d7c98a6 100644
--- a/test/CodeGenCXX/rtti-fundamental.cpp
+++ b/test/CodeGenCXX/rtti-fundamental.cpp
@@ -89,6 +89,16 @@ namespace __cxxabiv1 {
 // CHECK: @_ZTIPy = constant
 // CHECK: @_ZTIPKy = constant
 
+// __int128
+// CHECK: @_ZTIn = constant
+// CHECK: @_ZTIPn = constant
+// CHECK: @_ZTIPKn = constant
+
+// unsigned __int128
+// CHECK: @_ZTIo = constant
+// CHECK: @_ZTIPo = constant
+// CHECK: @_ZTIPKo = constant
+
 // half
 // CHECK: @_ZTIDh = constant
 // CHECK: @_ZTIPDh = constant
diff --git a/test/CodeGenCXX/sections.cpp b/test/CodeGenCXX/sections.cpp
index bec2e2d3d7069..c33871a97f567 100644
--- a/test/CodeGenCXX/sections.cpp
+++ b/test/CodeGenCXX/sections.cpp
@@ -31,6 +31,31 @@ int TEST1;
 #pragma bss_seg(pop)
 int TEST2;
 
+
+// Check "save-restore" of pragma stacks.
+struct Outer {
+  void f() {
+    #pragma bss_seg(push, ".bss3")
+    #pragma code_seg(push, ".my_code1")
+    #pragma const_seg(push, ".my_const1")
+    #pragma data_seg(push, ".data3")
+    struct Inner {
+      void g() {
+        #pragma bss_seg(push, ".bss4")
+        #pragma code_seg(push, ".my_code2")
+        #pragma const_seg(push, ".my_const2")
+        #pragma data_seg(push, ".data4")
+      }
+    };
+  }
+};
+
+void h2(void) {} // should be in ".my_code"
+int TEST3; // should be in ".bss1"
+int d2 = 1; // should be in ".data"
+extern const int b2; // should be in ".my_const"
+const int b2 = 1;
+
 #pragma section("read_flag_section", read)
 // Even though they are not declared const, these become constant since they are
 // in a read-only section.
@@ -63,6 +88,9 @@ __declspec(allocate("short_section")) short short_var = 42;
 //CHECK: @i = global i32 0
 //CHECK: @TEST1 = global i32 0
 //CHECK: @TEST2 = global i32 0, section ".bss1"
+//CHECK: @TEST3 = global i32 0, section ".bss1"
+//CHECK: @d2 = global i32 1, section ".data"
+//CHECK: @b2 = constant i32 1, section ".my_const"
 //CHECK: @unreferenced = constant i32 0, section "read_flag_section"
 //CHECK: @referenced = constant i32 42, section "read_flag_section"
 //CHECK: @implicitly_read_write = global i32 42, section "no_section_attributes"
@@ -70,3 +98,4 @@ __declspec(allocate("short_section")) short short_var = 42;
 //CHECK: @short_var = global i16 42, section "short_section"
 //CHECK: define void @g()
 //CHECK: define void @h() {{.*}} section ".my_code"
+//CHECK: define void @h2() {{.*}} section ".my_code"
diff --git a/test/CodeGenCXX/skip-vtable-pointer-initialization.cpp b/test/CodeGenCXX/skip-vtable-pointer-initialization.cpp
index f6f90986e1499..4734c02bc52e4 100644
--- a/test/CodeGenCXX/skip-vtable-pointer-initialization.cpp
+++ b/test/CodeGenCXX/skip-vtable-pointer-initialization.cpp
@@ -27,7 +27,7 @@ struct A {
 };
 
 // CHECK-LABEL: define void @_ZN5Test21AD2Ev
-// CHECK: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTVN5Test21AE, i64 0, i64 2) to i32 (...)**), i32 (...)***
+// CHECK: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTVN5Test21AE, i32 0, i32 2) to i32 (...)**), i32 (...)***
 A::~A() {
   f();
 }
@@ -50,7 +50,7 @@ struct A {
 };
 
 // CHECK-LABEL: define void @_ZN5Test31AD2Ev
-// CHECK-NOT: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTVN5Test31AE, i64 0, i64 2) to i32 (...)**), i32 (...)***
+// CHECK-NOT: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTVN5Test31AE, i32 0, i32 2) to i32 (...)**), i32 (...)***
 A::~A() {
   
 }
@@ -76,7 +76,7 @@ struct A {
 };
 
 // CHECK-LABEL: define void @_ZN5Test41AD2Ev
-// CHECK: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTVN5Test41AE, i64 0, i64 2) to i32 (...)**), i32 (...)***
+// CHECK: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTVN5Test41AE, i32 0, i32 2) to i32 (...)**), i32 (...)***
 A::~A()
 {
 }
@@ -100,7 +100,7 @@ struct A {
 };
 
 // CHECK-LABEL: define void @_ZN5Test51AD2Ev
-// CHECK: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTVN5Test51AE, i64 0, i64 2) to i32 (...)**), i32 (...)***
+// CHECK: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTVN5Test51AE, i32 0, i32 2) to i32 (...)**), i32 (...)***
 A::~A()
 {
 }
@@ -128,7 +128,7 @@ struct A {
 };
 
 // CHECK-LABEL: define void @_ZN5Test61AD2Ev
-// CHECK: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTVN5Test61AE, i64 0, i64 2) to i32 (...)**), i32 (...)***
+// CHECK: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTVN5Test61AE, i32 0, i32 2) to i32 (...)**), i32 (...)***
 A::~A()
 {
 }
@@ -154,7 +154,7 @@ struct A {
 };
 
 // CHECK-LABEL: define void @_ZN5Test71AD2Ev
-// CHECK: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTVN5Test71AE, i64 0, i64 2) to i32 (...)**), i32 (...)***
+// CHECK: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTVN5Test71AE, i32 0, i32 2) to i32 (...)**), i32 (...)***
 A::~A()
 {
 }
@@ -180,7 +180,7 @@ struct A {
 };
 
 // CHECK-LABEL: define void @_ZN5Test81AD2Ev
-// CHECK: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTVN5Test81AE, i64 0, i64 2) to i32 (...)**), i32 (...)***
+// CHECK: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTVN5Test81AE, i32 0, i32 2) to i32 (...)**), i32 (...)***
 A::~A()
 {
 }
diff --git a/test/CodeGenCXX/stack-reuse.cpp b/test/CodeGenCXX/stack-reuse.cpp
index 473a57cda27d2..d6340ef2c10b6 100644
--- a/test/CodeGenCXX/stack-reuse.cpp
+++ b/test/CodeGenCXX/stack-reuse.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang -target armv7l-unknown-linux-gnueabihf -S %s -o - -emit-llvm -O1 -disable-llvm-optzns | FileCheck %s
+// RUN: %clang_cc1 -triple armv7-unknown-linux-gnueabihf %s -o - -emit-llvm -O1 | FileCheck %s
 
 // Stack should be reused when possible, no need to allocate two separate slots
 // if they have disjoint lifetime.
@@ -21,7 +21,7 @@ struct Combiner {
   S_large a, b;
 
   Combiner(S_large);
-  Combiner f();  
+  Combiner f();
 };
 
 extern S_small foo_small();
diff --git a/test/CodeGenCXX/static-destructor.cpp b/test/CodeGenCXX/static-destructor.cpp
new file mode 100644
index 0000000000000..0ea84f8da6fa6
--- /dev/null
+++ b/test/CodeGenCXX/static-destructor.cpp
@@ -0,0 +1,33 @@
+// RUN: %clang_cc1 %s -triple=x86_64-pc-linux -emit-llvm -o - | FileCheck --check-prefix=X86 %s
+// RUN: %clang_cc1 %s -triple=wasm32 -emit-llvm -o - | FileCheck --check-prefix=WASM %s
+// RUN: %clang_cc1 %s -triple=armv7-apple-darwin9 -emit-llvm -o - | FileCheck --check-prefix=ARM %s
+
+// Test that destructors are not passed directly to __cxa_atexit when their
+// signatures do not match the type of its first argument.
+// e.g. ARM and WebAssembly have destructors that return this instead of void.
+
+
+class Foo {
+ public:
+  ~Foo() {
+  }
+};
+
+Foo global;
+
+// X86 destructors have void return, and are registered directly with __cxa_atexit.
+// X86: define internal void @__cxx_global_var_init()
+// X86:   call i32 @__cxa_atexit(void (i8*)* bitcast (void (%class.Foo*)* @_ZN3FooD1Ev to void (i8*)*), i8* getelementptr inbounds (%class.Foo, %class.Foo* @global, i32 0, i32 0), i8* @__dso_handle)
+
+// ARM destructors return this, but can be registered directly with __cxa_atexit
+// because the calling conventions tolerate the mismatch.
+// ARM: define internal void @__cxx_global_var_init()
+// ARM:   call i32 @__cxa_atexit(void (i8*)* bitcast (%class.Foo* (%class.Foo*)* @_ZN3FooD1Ev to void (i8*)*), i8* getelementptr inbounds (%class.Foo, %class.Foo* @global, i32 0, i32 0), i8* @__dso_handle)
+
+// Wasm destructors return this, and use a wrapper function, which is registered
+// with __cxa_atexit.
+// WASM: define internal void @__cxx_global_var_init()
+// WASM: call i32 @__cxa_atexit(void (i8*)* @__cxx_global_array_dtor, i8* null, i8* @__dso_handle)
+
+// WASM: define internal void @__cxx_global_array_dtor(i8*)
+// WASM: %call = call %class.Foo* @_ZN3FooD1Ev(%class.Foo* @global)
diff --git a/test/CodeGenCXX/static-init.cpp b/test/CodeGenCXX/static-init.cpp
index 541f6416efd0a..bb974948a0595 100644
--- a/test/CodeGenCXX/static-init.cpp
+++ b/test/CodeGenCXX/static-init.cpp
@@ -26,7 +26,7 @@ void f() {
 }
 
 void g() {
-  // CHECK: call noalias i8* @_Znwm(i64 1)
+  // CHECK: call i8* @_Znwm(i64 1)
   // CHECK: call void @_ZN1AC1Ev(
   static A& a = *new A;
 }
diff --git a/test/CodeGenCXX/strict-vtable-pointers.cpp b/test/CodeGenCXX/strict-vtable-pointers.cpp
index ee3919149cabe..33f63994d7c03 100644
--- a/test/CodeGenCXX/strict-vtable-pointers.cpp
+++ b/test/CodeGenCXX/strict-vtable-pointers.cpp
@@ -166,13 +166,13 @@ struct DynamicDerivedMultiple;
 
 
 // CHECK-CTORS: %[[THIS10:.*]] = bitcast %struct.DynamicDerivedMultiple* %[[THIS0]] to i32 (...)***
-// CHECK-CTORS: store {{.*}} @_ZTV22DynamicDerivedMultiple, i64 0, i64 2) {{.*}} %[[THIS10]]
+// CHECK-CTORS: store {{.*}} @_ZTV22DynamicDerivedMultiple, i32 0, i32 2) {{.*}} %[[THIS10]]
 // CHECK-CTORS: %[[THIS11:.*]] = bitcast %struct.DynamicDerivedMultiple* %[[THIS0]] to i8*
 // CHECK-CTORS: %[[THIS_ADD:.*]] = getelementptr inbounds i8, i8* %[[THIS11]], i64 16
 // CHECK-CTORS: %[[THIS12:.*]]  = bitcast i8* %[[THIS_ADD]] to i32 (...)***
 
 
-// CHECK-CTORS: store {{.*}} @_ZTV22DynamicDerivedMultiple, i64 0, i64 6) {{.*}} %[[THIS12]]
+// CHECK-CTORS: store {{.*}} @_ZTV22DynamicDerivedMultiple, i32 0, i32 6) {{.*}} %[[THIS12]]
 // CHECK-CTORS-LABEL: }
 
 struct DynamicFromStatic;
diff --git a/test/CodeGenCXX/type-metadata.cpp b/test/CodeGenCXX/type-metadata.cpp
new file mode 100644
index 0000000000000..076b1fdf1808f
--- /dev/null
+++ b/test/CodeGenCXX/type-metadata.cpp
@@ -0,0 +1,248 @@
+// Tests for the cfi-vcall feature:
+// RUN: %clang_cc1 -flto -triple x86_64-unknown-linux -fvisibility hidden -fsanitize=cfi-vcall -fsanitize-trap=cfi-vcall -emit-llvm -o - %s | FileCheck --check-prefix=CFI --check-prefix=CFI-NVT --check-prefix=ITANIUM --check-prefix=TT-ITANIUM --check-prefix=NDIAG %s
+// RUN: %clang_cc1 -flto -triple x86_64-unknown-linux -fvisibility hidden -fsanitize=cfi-vcall -emit-llvm -o - %s | FileCheck --check-prefix=CFI --check-prefix=CFI-NVT --check-prefix=ITANIUM --check-prefix=TT-ITANIUM --check-prefix=ITANIUM-DIAG --check-prefix=DIAG --check-prefix=DIAG-ABORT %s
+// RUN: %clang_cc1 -flto -triple x86_64-unknown-linux -fvisibility hidden -fsanitize=cfi-vcall -fsanitize-recover=cfi-vcall -emit-llvm -o - %s | FileCheck --check-prefix=CFI --check-prefix=CFI-NVT --check-prefix=ITANIUM --check-prefix=TT-ITANIUM --check-prefix=ITANIUM-DIAG --check-prefix=DIAG --check-prefix=DIAG-RECOVER %s
+// RUN: %clang_cc1 -flto -triple x86_64-pc-windows-msvc -fsanitize=cfi-vcall -fsanitize-trap=cfi-vcall -emit-llvm -o - %s | FileCheck --check-prefix=CFI --check-prefix=CFI-NVT --check-prefix=MS --check-prefix=TT-MS --check-prefix=NDIAG %s
+
+// Tests for the whole-program-vtables feature:
+// RUN: %clang_cc1 -flto -triple x86_64-unknown-linux -fvisibility hidden -fwhole-program-vtables -emit-llvm -o - %s | FileCheck --check-prefix=VTABLE-OPT --check-prefix=ITANIUM --check-prefix=TT-ITANIUM %s
+// RUN: %clang_cc1 -flto -triple x86_64-pc-windows-msvc -fwhole-program-vtables -emit-llvm -o - %s | FileCheck --check-prefix=VTABLE-OPT --check-prefix=MS --check-prefix=TT-MS %s
+
+// Tests for cfi + whole-program-vtables:
+// RUN: %clang_cc1 -flto -triple x86_64-unknown-linux -fvisibility hidden -fsanitize=cfi-vcall -fsanitize-trap=cfi-vcall -fwhole-program-vtables -emit-llvm -o - %s | FileCheck --check-prefix=CFI --check-prefix=CFI-VT --check-prefix=ITANIUM --check-prefix=TC-ITANIUM %s
+// RUN: %clang_cc1 -flto -triple x86_64-pc-windows-msvc -fsanitize=cfi-vcall -fsanitize-trap=cfi-vcall -fwhole-program-vtables -emit-llvm -o - %s | FileCheck --check-prefix=CFI --check-prefix=CFI-VT --check-prefix=MS --check-prefix=TC-MS %s
+
+// ITANIUM: @_ZTV1A = {{[^!]*}}, !type [[A16:![0-9]+]]
+// ITANIUM-DIAG-SAME: !type [[ALL16:![0-9]+]]
+
+// ITANIUM: @_ZTV1B = {{[^!]*}}, !type [[A32:![0-9]+]]
+// ITANIUM-DIAG-SAME: !type [[ALL32:![0-9]+]]
+// ITANIUM-SAME: !type [[B32:![0-9]+]]
+// ITANIUM-DIAG-SAME: !type [[ALL32]]
+
+// ITANIUM: @_ZTV1C = {{[^!]*}}, !type [[A32]]
+// ITANIUM-DIAG-SAME: !type [[ALL32]]
+// ITANIUM-SAME: !type [[C32:![0-9]+]]
+// ITANIUM-DIAG-SAME: !type [[ALL32]]
+
+// DIAG: @[[SRC:.*]] = private unnamed_addr constant [{{.*}} x i8] c"{{.*}}type-metadata.cpp\00", align 1
+// DIAG: @[[TYPE:.*]] = private unnamed_addr constant { i16, i16, [4 x i8] } { i16 -1, i16 0, [4 x i8] c"'A'\00" }
+// DIAG: @[[BADTYPESTATIC:.*]] = private unnamed_addr global { i8, { [{{.*}} x i8]*, i32, i32 }, { i16, i16, [4 x i8] }* } { i8 0, { [{{.*}} x i8]*, i32, i32 } { [{{.*}} x i8]* @[[SRC]], i32 123, i32 3 }, { i16, i16, [4 x i8] }* @[[TYPE]] }
+
+// ITANIUM: @_ZTVN12_GLOBAL__N_11DE = {{[^!]*}}, !type [[A32]]
+// ITANIUM-DIAG-SAME: !type [[ALL32]]
+// ITANIUM-SAME: !type [[B32]]
+// ITANIUM-DIAG-SAME: !type [[ALL32]]
+// ITANIUM-SAME: !type [[C88:![0-9]+]]
+// ITANIUM-DIAG-SAME: !type [[ALL88:![0-9]+]]
+// ITANIUM-SAME: !type [[D32:![0-9]+]]
+// ITANIUM-DIAG-SAME: !type [[ALL32]]
+
+// ITANIUM: @_ZTCN12_GLOBAL__N_11DE0_1B = {{[^!]*}}, !type [[A32]]
+// ITANIUM-DIAG-SAME: !type [[ALL32]]
+// ITANIUM-SAME: !type [[B32]]
+// ITANIUM-DIAG-SAME: !type [[ALL32]]
+
+// ITANIUM: @_ZTCN12_GLOBAL__N_11DE8_1C = {{[^!]*}}, !type [[A64:![0-9]+]]
+// ITANIUM-DIAG-SAME: !type [[ALL64:![0-9]+]]
+// ITANIUM-SAME: !type [[C32]]
+// ITANIUM-DIAG-SAME: !type [[ALL32]]
+
+// ITANIUM: @_ZTVZ3foovE2FA = {{[^!]*}}, !type [[A16]]
+// ITANIUM-DIAG-SAME: !type [[ALL16]]
+// ITANIUM-SAME: !type [[FA16:![0-9]+]]
+// ITANIUM-DIAG-SAME: !type [[ALL16]]
+
+// MS: comdat($"\01??_7A@@6B@"), !type [[A8:![0-9]+]]
+// MS: comdat($"\01??_7B@@6B0@@"), !type [[B8:![0-9]+]]
+// MS: comdat($"\01??_7B@@6BA@@@"), !type [[A8]]
+// MS: comdat($"\01??_7C@@6B@"), !type [[A8]]
+// MS: comdat($"\01??_7D@?A@@6BB@@@"), !type [[B8]], !type [[D8:![0-9]+]]
+// MS: comdat($"\01??_7D@?A@@6BA@@@"), !type [[A8]]
+// MS: comdat($"\01??_7FA@?1??foo@@YAXXZ@6B@"), !type [[A8]], !type [[FA8:![0-9]+]]
+
+struct A {
+  A();
+  virtual void f();
+};
+
+struct B : virtual A {
+  B();
+  virtual void g();
+  virtual void h();
+};
+
+struct C : virtual A {
+  C();
+};
+
+namespace {
+
+struct D : B, C {
+  D();
+  virtual void f();
+  virtual void h();
+};
+
+}
+
+A::A() {}
+B::B() {}
+C::C() {}
+D::D() {}
+
+void A::f() {
+}
+
+void B::g() {
+}
+
+void D::f() {
+}
+
+void D::h() {
+}
+
+// ITANIUM: define hidden void @_Z2afP1A
+// MS: define void @"\01?af@@YAXPEAUA@@@Z"
+void af(A *a) {
+  // TT-ITANIUM: [[P:%[^ ]*]] = call i1 @llvm.type.test(i8* [[VT:%[^ ]*]], metadata !"_ZTS1A")
+  // TT-MS: [[P:%[^ ]*]] = call i1 @llvm.type.test(i8* [[VT:%[^ ]*]], metadata !"?AUA@@")
+  // TC-ITANIUM: [[PAIR:%[^ ]*]] = call { i8*, i1 } @llvm.type.checked.load(i8* {{%[^ ]*}}, i32 0, metadata !"_ZTS1A")
+  // TC-MS: [[PAIR:%[^ ]*]] = call { i8*, i1 } @llvm.type.checked.load(i8* {{%[^ ]*}}, i32 0, metadata !"?AUA@@")
+  // CFI-VT: [[P:%[^ ]*]] = extractvalue { i8*, i1 } [[PAIR]], 1
+  // DIAG-NEXT: [[VTVALID0:%[^ ]*]] = call i1 @llvm.type.test(i8* [[VT]], metadata !"all-vtables")
+  // VTABLE-OPT: call void @llvm.assume(i1 [[P]])
+  // CFI-NEXT: br i1 [[P]], label %[[CONTBB:[^ ,]*]], label %[[TRAPBB:[^ ,]*]]
+  // CFI-NEXT: {{^$}}
+
+  // CFI: [[TRAPBB]]
+  // NDIAG-NEXT: call void @llvm.trap()
+  // NDIAG-NEXT: unreachable
+  // DIAG-NEXT: [[VTINT:%[^ ]*]] = ptrtoint i8* [[VT]] to i64
+  // DIAG-NEXT: [[VTVALID:%[^ ]*]] = zext i1 [[VTVALID0]] to i64
+  // DIAG-ABORT-NEXT: call void @__ubsan_handle_cfi_check_fail_abort(i8* getelementptr inbounds ({{.*}} @[[BADTYPESTATIC]], i32 0, i32 0), i64 [[VTINT]], i64 [[VTVALID]])
+  // DIAG-ABORT-NEXT: unreachable
+  // DIAG-RECOVER-NEXT: call void @__ubsan_handle_cfi_check_fail(i8* getelementptr inbounds ({{.*}} @[[BADTYPESTATIC]], i32 0, i32 0), i64 [[VTINT]], i64 [[VTVALID]])
+  // DIAG-RECOVER-NEXT: br label %[[CONTBB]]
+
+  // CFI: [[CONTBB]]
+  // CFI-NVT: [[PTR:%[^ ]*]] = load
+  // CFI-VT: [[PTRI8:%[^ ]*]] = extractvalue { i8*, i1 } [[PAIR]], 0
+  // CFI-VT: [[PTR:%[^ ]*]] = bitcast i8* [[PTRI8]] to
+  // CFI: call void [[PTR]]
+#line 123
+  a->f();
+}
+
+// ITANIUM: define internal void @_Z3df1PN12_GLOBAL__N_11DE
+// MS: define internal void @"\01?df1@@YAXPEAUD@?A@@@Z"
+void df1(D *d) {
+  // TT-ITANIUM: {{%[^ ]*}} = call i1 @llvm.type.test(i8* {{%[^ ]*}}, metadata ![[DTYPE:[0-9]+]])
+  // TT-MS: {{%[^ ]*}} = call i1 @llvm.type.test(i8* {{%[^ ]*}}, metadata !"?AUA@@")
+  // TC-ITANIUM: {{%[^ ]*}} = call { i8*, i1 } @llvm.type.checked.load(i8* {{%[^ ]*}}, i32 0, metadata ![[DTYPE:[0-9]+]])
+  // TC-MS: {{%[^ ]*}} = call { i8*, i1 } @llvm.type.checked.load(i8* {{%[^ ]*}}, i32 0, metadata !"?AUA@@")
+  d->f();
+}
+
+// ITANIUM: define internal void @_Z3dg1PN12_GLOBAL__N_11DE
+// MS: define internal void @"\01?dg1@@YAXPEAUD@?A@@@Z"
+void dg1(D *d) {
+  // TT-ITANIUM: {{%[^ ]*}} = call i1 @llvm.type.test(i8* {{%[^ ]*}}, metadata !"_ZTS1B")
+  // TT-MS: {{%[^ ]*}} = call i1 @llvm.type.test(i8* {{%[^ ]*}}, metadata !"?AUB@@")
+  // TC-ITANIUM: {{%[^ ]*}} = call { i8*, i1 } @llvm.type.checked.load(i8* {{%[^ ]*}}, i32 8, metadata !"_ZTS1B")
+  // TC-MS: {{%[^ ]*}} = call { i8*, i1 } @llvm.type.checked.load(i8* {{%[^ ]*}}, i32 0, metadata !"?AUB@@")
+  d->g();
+}
+
+// ITANIUM: define internal void @_Z3dh1PN12_GLOBAL__N_11DE
+// MS: define internal void @"\01?dh1@@YAXPEAUD@?A@@@Z"
+void dh1(D *d) {
+  // TT-ITANIUM: {{%[^ ]*}} = call i1 @llvm.type.test(i8* {{%[^ ]*}}, metadata ![[DTYPE]])
+  // TT-MS: {{%[^ ]*}} = call i1 @llvm.type.test(i8* {{%[^ ]*}}, metadata ![[DTYPE:[0-9]+]])
+  // TC-ITANIUM: {{%[^ ]*}} = call { i8*, i1 } @llvm.type.checked.load(i8* {{%[^ ]*}}, i32 16, metadata ![[DTYPE]])
+  // TC-MS: {{%[^ ]*}} = call { i8*, i1 } @llvm.type.checked.load(i8* {{%[^ ]*}}, i32 8, metadata ![[DTYPE:[0-9]+]])
+  d->h();
+}
+
+// ITANIUM: define internal void @_Z3df2PN12_GLOBAL__N_11DE
+// MS: define internal void @"\01?df2@@YAXPEAUD@?A@@@Z"
+__attribute__((no_sanitize("cfi")))
+void df2(D *d) {
+  // CFI-NVT-NOT: call i1 @llvm.type.test
+  // CFI-VT: [[P:%[^ ]*]] = call i1 @llvm.type.test
+  // CFI-VT: call void @llvm.assume(i1 [[P]])
+  d->f();
+}
+
+// ITANIUM: define internal void @_Z3df3PN12_GLOBAL__N_11DE
+// MS: define internal void @"\01?df3@@YAXPEAUD@?A@@@Z"
+__attribute__((no_sanitize("address"))) __attribute__((no_sanitize("cfi-vcall")))
+void df3(D *d) {
+  // CFI-NVT-NOT: call i1 @llvm.type.test
+  // CFI-VT: [[P:%[^ ]*]] = call i1 @llvm.type.test
+  // CFI-VT: call void @llvm.assume(i1 [[P]])
+  d->f();
+}
+
+D d;
+
+void foo() {
+  df1(&d);
+  dg1(&d);
+  dh1(&d);
+  df2(&d);
+  df3(&d);
+
+  struct FA : A {
+    void f() {}
+  } fa;
+  af(&fa);
+}
+
+namespace test2 {
+
+struct A {
+  virtual void m_fn1();
+};
+struct B {
+  virtual void m_fn2();
+};
+struct C : B, A {};
+struct D : C {
+  void m_fn1();
+};
+
+// ITANIUM: define hidden void @_ZN5test21fEPNS_1DE
+// MS: define void @"\01?f@test2@@YAXPEAUD@1@@Z"
+void f(D *d) {
+  // TT-ITANIUM: {{%[^ ]*}} = call i1 @llvm.type.test(i8* {{%[^ ]*}}, metadata !"_ZTSN5test21DE")
+  // TT-MS: {{%[^ ]*}} = call i1 @llvm.type.test(i8* {{%[^ ]*}}, metadata !"?AUA@test2@@")
+  // TC-ITANIUM: {{%[^ ]*}} = call { i8*, i1 } @llvm.type.checked.load(i8* {{%[^ ]*}}, i32 8, metadata !"_ZTSN5test21DE")
+  // TC-MS: {{%[^ ]*}} = call { i8*, i1 } @llvm.type.checked.load(i8* {{%[^ ]*}}, i32 0, metadata !"?AUA@test2@@")
+  d->m_fn1();
+}
+
+}
+
+// ITANIUM: [[A16]] = !{i64 16, !"_ZTS1A"}
+// ITANIUM-DIAG: [[ALL16]] = !{i64 16, !"all-vtables"}
+// ITANIUM: [[A32]] = !{i64 32, !"_ZTS1A"}
+// ITANIUM-DIAG: [[ALL32]] = !{i64 32, !"all-vtables"}
+// ITANIUM: [[B32]] = !{i64 32, !"_ZTS1B"}
+// ITANIUM: [[C32]] = !{i64 32, !"_ZTS1C"}
+// ITANIUM: [[C88]] = !{i64 88, !"_ZTS1C"}
+// ITANIUM-DIAG: [[ALL88]] = !{i64 88, !"all-vtables"}
+// ITANIUM: [[D32]] = !{i64 32, [[D_ID:![0-9]+]]}
+// ITANIUM: [[D_ID]] = distinct !{}
+// ITANIUM: [[A64]] = !{i64 64, !"_ZTS1A"}
+// ITANIUM-DIAG: [[ALL64]] = !{i64 64, !"all-vtables"}
+// ITANIUM: [[FA16]] = !{i64 16, [[FA_ID:![0-9]+]]}
+// ITANIUM: [[FA_ID]] = distinct !{}
+
+// MS: [[A8]] = !{i64 8, !"?AUA@@"}
+// MS: [[B8]] = !{i64 8, !"?AUB@@"}
+// MS: [[D8]] = !{i64 8, [[D_ID:![0-9]+]]}
+// MS: [[D_ID]] = distinct !{}
+// MS: [[FA8]] = !{i64 8, [[FA_ID:![0-9]+]]}
+// MS: [[FA_ID]] = distinct !{}
diff --git a/test/CodeGenCXX/virtual-base-ctor.cpp b/test/CodeGenCXX/virtual-base-ctor.cpp
index 20a88cd371050..8c8c310421dc5 100644
--- a/test/CodeGenCXX/virtual-base-ctor.cpp
+++ b/test/CodeGenCXX/virtual-base-ctor.cpp
@@ -8,4 +8,4 @@ struct A { int a; A() { y = ((size_t)this - (size_t)&x) / sizeof(void*); } };
 struct B : virtual A { void* x; };    
 B x;
 
-// CHECK: @y = global i8 2
+// CHECK: @y = local_unnamed_addr global i8 2
diff --git a/test/CodeGenCXX/virtual-function-attrs.cpp b/test/CodeGenCXX/virtual-function-attrs.cpp
new file mode 100644
index 0000000000000..3a9a1a28ddf44
--- /dev/null
+++ b/test/CodeGenCXX/virtual-function-attrs.cpp
@@ -0,0 +1,14 @@
+// RUN: %clang_cc1 %s -triple %itanium_abi_triple -std=c++11 -emit-llvm -o - | FileCheck %s
+
+class A {
+  virtual void f();
+  virtual void g();
+  virtual ~A();
+};
+
+void A::f() {}
+
+// CHECK: define [[CC:(x86_thiscallcc )?]]void @_ZN1A1fEv({{.*}}) unnamed_addr
+// CHECK: declare [[CC]]void @_ZN1A1gEv({{.*}}) unnamed_addr
+// CHECK: declare {{.*}} @_ZN1AD1Ev({{.*}}) unnamed_addr
+// CHECK: declare [[CC]]void @_ZN1AD0Ev({{.*}}) unnamed_addr
diff --git a/test/CodeGenCXX/vla-consruct.cpp b/test/CodeGenCXX/vla-consruct.cpp
new file mode 100644
index 0000000000000..fd8314a5d7160
--- /dev/null
+++ b/test/CodeGenCXX/vla-consruct.cpp
@@ -0,0 +1,139 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fcxx-exceptions -fexceptions -O0 %s -emit-llvm -o - | FileCheck %s
+
+extern "C" int printf(const char *, ...);
+
+static int N;
+struct S {
+  S()
+  __attribute__((nothrow)) { printf("%d: S()\n", ++N); }
+  ~S() __attribute__((nothrow)) { printf("%d: ~S()\n", N--); }
+  int n[17];
+};
+// CHECK: [[struct_S:%.+]] = type { [17 x i32] }
+void print(int n, int a, int b, int c, int d) {
+  printf("n=%d\n,sizeof(S)=%d\nsizeof(array_t[0][0])=%d\nsizeof(array_t[0])=%d\nsizeof(array_t)=%d\n",
+         n, a, b, c, d);
+  if (n == 2)
+    throw(n);
+}
+
+void test(int n) {
+  // CHECK: define void {{.*test.*}}(i32 [[n:%.+]]) #
+  // CHECK: [[n_addr:%.+]] = alloca
+  // CHECK-NEXT: [[saved_stack:%.+]] = alloca
+  // CHECK-NEXT: [[sizeof_S:%.+]] = alloca
+  // CHECK-NEXT: [[sizeof_array_t_0_0:%.+]] = alloca
+  // CHECK-NEXT: [[sizeof_array_t_0:%.+]] = alloca
+  // CHECK-NEXT: [[sizeof_array_t:%.+]] = alloca
+  // CHECK-NEXT: [[exn_slot:%.+]] = alloca i8*
+  // CHECK-NEXT: [[ehselector_slot:%.+]] = alloca i32
+  // CHECK-NEXT: store i32 [[n]], i32* [[n_addr]]
+  // CHECK-NEXT: [[t0:%.+]] = load i32, i32* [[n_addr]]
+  // CHECK-NEXT: [[t1:%.+]] = zext i32 [[t0]] to i64
+  // CHECK-NEXT: [[t2:%.+]] = load i32, i32* [[n_addr]]
+  // CHECK-NEXT: [[add:%.+]] = add nsw i32 [[t2]], 1
+  // CHECK-NEXT: [[t3:%.+]] = zext i32 [[add]] to i64
+  // CHECK-NEXT: [[t4:%.+]] = call i8* @llvm.stacksave()
+  // CHECK-NEXT: store i8* [[t4]], i8** [[saved_stack]]
+  // CHECK-NEXT: [[t5:%.+]] = mul nuw i64 [[t1]], [[t3]]
+  // CHECK-NEXT: [[vla:%.+]] = alloca [[struct_S]], i64 [[t5]]
+  // CHECK-NEXT: [[t6:%.+]] = mul nuw i64 [[t1]], [[t3]]
+  // CHECK-NEXT: [[isempty:%.+]] = icmp eq i64 [[t6]], 0
+  // CHECK-NEXT: br i1 [[isempty]], label %[[arrayctor_cont:.+]], label %[[new_ctorloop:.+]]
+
+  S array_t[n][n + 1];
+
+  // CHECK: [[new_ctorloop]]
+  // CHECK-NEXT: [[arrayctor_end:%.+]] = getelementptr inbounds [[struct_S]], [[struct_S]]* [[vla]], i64 [[t6]]
+  // CHECK-NEXT: br label %[[arrayctor_loop:.+]]
+
+  // CHECK: [[arrayctor_loop]]
+  // CHECK-NEXT: [[arrayctor_cur:%.+]] = phi [[struct_S]]* [ [[vla]], %[[new_ctorloop]] ], [ [[arrayctor_next:%.+]], %[[arrayctor_loop]] ]
+  // CHECK-NEXT: call void [[ctor:@.+]]([[struct_S]]* [[arrayctor_cur]])
+  // CHECK-NEXT: [[arrayctor_next]] = getelementptr inbounds [[struct_S]], [[struct_S]]* [[arrayctor_cur]], i64 1
+  // CHECK-NEXT: [[arrayctor_done:%.+]] = icmp eq [[struct_S]]* [[arrayctor_next]], [[arrayctor_end]]
+  // CHECK-NEXT: br i1 [[arrayctor_done]], label %[[arrayctor_cont]], label %[[arrayctor_loop]]
+
+  int sizeof_S = sizeof(S);
+  int sizeof_array_t_0_0 = sizeof(array_t[0][0]);
+  int sizeof_array_t_0 = sizeof(array_t[0]);
+  int sizeof_array_t = sizeof(array_t);
+  print(n, sizeof_S, sizeof_array_t_0_0, sizeof_array_t_0, sizeof_array_t);
+
+  //  CHECK: [[arrayctor_cont]]
+  //  CHECK-NEXT: store i32 68, i32* [[sizeof_S]]
+  //  CHECK-NEXT: store i32 68, i32* [[sizeof_array_t_0_0]]
+  //  CHECK: [[t8:%.+]] = mul nuw i64 68, [[t3]]
+  //  CHECK-NEXT: [[conv:%.+]] = trunc i64 [[t8]] to i32
+  //  CHECK-NEXT: store i32 [[conv]], i32* [[sizeof_array_t_0]]
+  //  CHECK-NEXT: [[t9:%.+]] = mul nuw i64 [[t1]], [[t3]]
+  //  CHECK-NEXT: [[t10:%.+]] = mul nuw i64 68, [[t9]]
+  //  CHECK-NEXT: [[conv1:%.+]] = trunc i64 [[t10]] to i32
+  //  CHECK-NEXT: store i32 [[conv1]], i32* [[sizeof_array_t]]
+  //  CHECK-NEXT: [[t11:%.+]] = load i32, i32* [[n_addr:%.+]]
+  //  CHECK-NEXT: [[t12:%.+]] = load i32, i32* [[sizeof_S]]
+  //  CHECK-NEXT: [[t13:%.+]] = load i32, i32* [[sizeof_array_t_0_0]]
+  //  CHECK-NEXT: [[t14:%.+]] = load i32, i32* [[sizeof_array_t_0]]
+  //  CHECK-NEXT: [[t15:%.+]] = load i32, i32* [[sizeof_array_t]]
+  //  CHECK-NEXT: invoke void @{{.*print.*}}(i32 [[t11]], i32 [[t12]], i32 [[t13]], i32 [[t14]], i32 [[t15]])
+  //  CHECK-NEXT: to label %[[invoke_cont:.+]] unwind label %[[lpad:.+]]
+
+  //  CHECK: [[invoke_cont]]
+  //  CHECK-NEXT: [[t16:%.+]] = mul nuw i64 [[t1]], [[t3]]
+  //  CHECK-NEXT: [[t17:%.+]] = getelementptr inbounds [[struct_S]], [[struct_S]]* [[vla]], i64 [[t16]]
+  //  CHECK-NEXT: [[arraydestroy_isempty:%.+]] = icmp eq [[struct_S]]* [[vla]], [[t17]]
+  //  CHECK-NEXT: br i1 [[arraydestroy_isempty]], label %[[arraydestroy_done2:.+]], label %[[arraydestroy_body:.+]]
+
+  //  CHECK: [[arraydestroy_body]]
+  //  CHECK-NEXT: [[arraydestroy_elementPast:%.+]] = phi [[struct_S]]* [ [[t17]], %[[invoke_cont]] ], [ [[arraydestroy_element:%.+]], %[[arraydestroy_body]] ]
+  //  CHECK-NEXT: [[arraydestroy_element]] = getelementptr inbounds [[struct_S]], [[struct_S]]* [[arraydestroy_elementPast]]
+  //  CHECK-NEXT: call void @[[dtor:.+]]([[struct_S]]* [[arraydestroy_element]])
+  //  CHECK-NEXT: [[arraydestroy_done:%.+]] = icmp eq [[struct_S]]* [[arraydestroy_element]], [[vla]]
+  //  CHECK-NEXT: br i1 [[arraydestroy_done]], label %[[arraydestroy_done2]], label %[[arraydestroy_body]]
+
+  //  CHECK: [[arraydestroy_done2]]
+  //  CHECK-NEXT: [[t17:%.+]] = load i8*, i8** [[saved_stack]]
+  //  CHECK-NEXT: call void @llvm.stackrestore(i8* [[t17]])
+  //  CHECK: ret void
+
+  //  CHECK: [[lpad]]
+  //  CHECK-NEXT: [[t19:%.+]] = landingpad { i8*, i32 }
+  //  CHECK: [[t20:%.+]] = extractvalue { i8*, i32 } [[t19]], 0
+  //  CHECK-NEXT: store i8* [[t20]], i8** [[exn_slot]]
+  //  CHECK-NEXT: [[t21:%.+]] = extractvalue { i8*, i32 } [[t19]], 1
+  //  CHECK-NEXT: store i32 [[t21]], i32* [[ehselector_slot]]
+  //  CHECK-NEXT: [[t22:%.+]] = mul nuw i64 [[t1]], [[t3]]
+  //  CHECK-NEXT: [[t23:%.+]] = getelementptr inbounds [[struct_S]], [[struct_S]]* [[vla]], i64 [[t22]]
+  //  CHECK-NEXT: [[arraydestroy_isempty3:%.+]] = icmp eq [[struct_S]]* [[vla]], [[t23]]
+  //  CHECK-NEXT: br i1 [[arraydestroy_isempty3]], label %[[arraydestroy_done8:.+]], label %[[arraydestroy_body4:.+]]
+
+  //  CHECK: [[arraydestroy_body4]]
+  //  CHECK: [[arraydestroy_elementPast5:%.+]] = phi [[struct_S]]* [ [[t23]], %[[lpad]] ], [ [[arraydestroy_element6:.+]], %[[arraydestroy_body4]] ]
+  //  CHECK-NEXT: [[arraydestroy_element6]] = getelementptr inbounds [[struct_S]], [[struct_S]]* [[arraydestroy_elementPast5]], i64 -1
+  //  CHECK-NEXT: call void @[[dtor]]([[struct_S]]* [[arraydestroy_element6]])
+  //  CHECK-NEXT: [[arraydestroy_done7:%.+]] = icmp eq [[struct_S]]* [[arraydestroy_element6]], [[vla]]
+  //  CHECK-NEXT: br i1 [[arraydestroy_done7]], label %[[arraydestroy_done8]], label %[[arraydestroy_body4]]
+
+  //  CHECK: [[arraydestroy_done8]]
+  //  CHECK-NEXT: br label %[[eh_resume:.+]]
+
+  //  CHECK: [[eh_resume]]
+  //  CHECK-NEXT: [[exn:%.+]] = load i8*, i8** [[exn_slot]]
+  //  CHECK-NEXT: [[sel:%.+]] = load i32, i32* [[ehselector_slot]]
+  //  CHECK-NEXT: [[lpad_val:%.+]] = insertvalue { i8*, i32 } undef, i8* [[exn]], 0
+  //  CHECK-NEXT: [[lpad_val9:%.+]] = insertvalue { i8*, i32 } [[lpad_val]], i32 [[sel]], 1
+  //  CHECK-NEXT: resume { i8*, i32 } [[lpad_val9]]
+}
+
+int main() {
+  try {
+    test(2);
+  } catch (int e) {
+    printf("expeption %d\n", e);
+  }
+  try {
+    test(3);
+  } catch (int e) {
+    printf("expeption %d", e);
+  }
+}
diff --git a/test/CodeGenCXX/vtable-assume-load.cpp b/test/CodeGenCXX/vtable-assume-load.cpp
index 30cfc00ec7ba7..819b09d70e3df 100644
--- a/test/CodeGenCXX/vtable-assume-load.cpp
+++ b/test/CodeGenCXX/vtable-assume-load.cpp
@@ -27,7 +27,7 @@ void g(A *a) { a->foo(); }
 // CHECK1-LABEL: define void @_ZN5test14fooAEv()
 // CHECK1: call void @_ZN5test11AC1Ev(%"struct.test1::A"*
 // CHECK1: %[[VTABLE:.*]] = load i8**, i8*** %{{.*}}
-// CHECK1: %[[CMP:.*]] = icmp eq i8** %[[VTABLE]], getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTVN5test11AE, i64 0, i64 2)
+// CHECK1: %[[CMP:.*]] = icmp eq i8** %[[VTABLE]], getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTVN5test11AE, i32 0, i32 2)
 // CHECK1: call void @llvm.assume(i1 %[[CMP]])
 // CHECK1-LABEL: }
 
@@ -39,7 +39,7 @@ void fooA() {
 // CHECK1-LABEL: define void @_ZN5test14fooBEv()
 // CHECK1: call void @_ZN5test11BC1Ev(%"struct.test1::B"* %{{.*}})
 // CHECK1: %[[VTABLE:.*]] = load i8**, i8*** %{{.*}}
-// CHECK1: %[[CMP:.*]] = icmp eq i8** %[[VTABLE]], getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTVN5test11BE, i64 0, i64 2)
+// CHECK1: %[[CMP:.*]] = icmp eq i8** %[[VTABLE]], getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTVN5test11BE, i32 0, i32 2)
 // CHECK1: call void @llvm.assume(i1 %[[CMP]])
 // CHECK1-LABEL: }
 
@@ -73,14 +73,14 @@ void h(B *b) { b->bar(); }
 // CHECK2-LABEL: define void @_ZN5test24testEv()
 // CHECK2: call void @_ZN5test21CC1Ev(%"struct.test2::C"*
 // CHECK2: %[[VTABLE:.*]] = load i8**, i8*** {{.*}}
-// CHECK2: %[[CMP:.*]] = icmp eq i8** %[[VTABLE]], getelementptr inbounds ([6 x i8*], [6 x i8*]* @_ZTVN5test21CE, i64 0, i64 2)
+// CHECK2: %[[CMP:.*]] = icmp eq i8** %[[VTABLE]], getelementptr inbounds ([6 x i8*], [6 x i8*]* @_ZTVN5test21CE, i32 0, i32 2)
 // CHECK2: call void @llvm.assume(i1 %[[CMP]])
 
 // CHECK2: %[[V2:.*]] = bitcast %"struct.test2::C"* %{{.*}} to i8*
 // CHECK2: %[[ADD_PTR:.*]] = getelementptr inbounds i8, i8* %[[V2]], i64 8
 // CHECK2: %[[V3:.*]] = bitcast i8* %[[ADD_PTR]] to i8***
 // CHECK2: %[[VTABLE2:.*]] = load i8**, i8*** %[[V3]]
-// CHECK2: %[[CMP2:.*]] = icmp eq i8** %[[VTABLE2]], getelementptr inbounds ([6 x i8*], [6 x i8*]* @_ZTVN5test21CE, i64 0, i64 5)
+// CHECK2: %[[CMP2:.*]] = icmp eq i8** %[[VTABLE2]], getelementptr inbounds ([6 x i8*], [6 x i8*]* @_ZTVN5test21CE, i32 0, i32 5)
 // CHECK2: call void @llvm.assume(i1 %[[CMP2]])
 
 // CHECK2: call void @_ZN5test21gEPNS_1AE(
@@ -111,7 +111,7 @@ void g(B *a) { a->foo(); }
 
 // CHECK3-LABEL: define void @_ZN5test34testEv()
 // CHECK3: call void @_ZN5test31CC1Ev(%"struct.test3::C"*
-// CHECK3: %[[CMP:.*]] = icmp eq i8** %{{.*}}, getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTVN5test31CE, i64 0, i64 3)
+// CHECK3: %[[CMP:.*]] = icmp eq i8** %{{.*}}, getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTVN5test31CE, i32 0, i32 3)
 // CHECK3: call void @llvm.assume(i1 %[[CMP]])
 // CHECK3-LABLEL: }
 void test() {
@@ -140,11 +140,11 @@ void g(C *c) { c->foo(); }
 // CHECK4-LABEL: define void @_ZN5test44testEv()
 // CHECK4: call void @_ZN5test41CC1Ev(%"struct.test4::C"*
 // CHECK4: %[[VTABLE:.*]] = load i8**, i8*** %{{.*}}
-// CHECK4: %[[CMP:.*]] = icmp eq i8** %[[VTABLE]], getelementptr inbounds ([5 x i8*], [5 x i8*]* @_ZTVN5test41CE, i64 0, i64 4)
+// CHECK4: %[[CMP:.*]] = icmp eq i8** %[[VTABLE]], getelementptr inbounds ([5 x i8*], [5 x i8*]* @_ZTVN5test41CE, i32 0, i32 4)
 // CHECK4: call void @llvm.assume(i1 %[[CMP]]
 
 // CHECK4: %[[VTABLE2:.*]] = load i8**, i8*** %{{.*}}
-// CHECK4: %[[CMP2:.*]] = icmp eq i8** %[[VTABLE2]], getelementptr inbounds ([5 x i8*], [5 x i8*]* @_ZTVN5test41CE, i64 0, i64 4)
+// CHECK4: %[[CMP2:.*]] = icmp eq i8** %[[VTABLE2]], getelementptr inbounds ([5 x i8*], [5 x i8*]* @_ZTVN5test41CE, i32 0, i32 4)
 // CHECK4: call void @llvm.assume(i1 %[[CMP2]])
 // CHECK4-LABEL: }
 
diff --git a/test/CodeGenCXX/vtable-key-function-arm.cpp b/test/CodeGenCXX/vtable-key-function-arm.cpp
index 6f1265b6277d4..3d5c3c3865ead 100644
--- a/test/CodeGenCXX/vtable-key-function-arm.cpp
+++ b/test/CodeGenCXX/vtable-key-function-arm.cpp
@@ -1,9 +1,9 @@
 // RUN: %clang_cc1 %s -triple=armv7-unknown-unknown -emit-llvm -o - | FileCheck %s
 // RUN: %clang_cc1 %s -triple=armv7-unknown-unknown -emit-llvm -o - | FileCheck -check-prefix=CHECK-LATE %s
 
-// The 'a' variants ask for the v-table first.
-// The 'b' variants ask for the v-table second.
-// The 'c' variants ask for the v-table third.
+// The 'a' variants ask for the vtable first.
+// The 'b' variants ask for the vtable second.
+// The 'c' variants ask for the vtable third.
 // We do a separate CHECK-LATE pass because the RTTI definition gets
 // changed after the fact, which causes reordering of the globals.
 
diff --git a/test/CodeGenCXX/vtable-key-function-ios.cpp b/test/CodeGenCXX/vtable-key-function-ios.cpp
index d17aa695d2b5f..8a3466beda442 100644
--- a/test/CodeGenCXX/vtable-key-function-ios.cpp
+++ b/test/CodeGenCXX/vtable-key-function-ios.cpp
@@ -4,9 +4,9 @@
 // RUN: %clang_cc1 %s -triple=x86_64-pc-windows-gnu -emit-llvm -o - | FileCheck %s
 // RUN: %clang_cc1 %s -triple=x86_64-pc-windows-gnu -emit-llvm -o - | FileCheck -check-prefix=CHECK-LATE %s
 
-// The 'a' variants ask for the v-table first.
-// The 'b' variants ask for the v-table second.
-// The 'c' variants ask for the v-table third.
+// The 'a' variants ask for the vtable first.
+// The 'b' variants ask for the vtable second.
+// The 'c' variants ask for the vtable third.
 // We do a separate CHECK-LATE pass because the RTTI definition gets
 // changed after the fact, which causes reordering of the globals.
 
diff --git a/test/CodeGenCXX/vtable-linkage.cpp b/test/CodeGenCXX/vtable-linkage.cpp
index ff398ffa61dd1..0b556d10c3e8f 100644
--- a/test/CodeGenCXX/vtable-linkage.cpp
+++ b/test/CodeGenCXX/vtable-linkage.cpp
@@ -1,6 +1,6 @@
 // RUN: %clang_cc1 %s -triple=x86_64-pc-linux -emit-llvm -o %t
 // RUN: %clang_cc1 %s -triple=x86_64-apple-darwin10 -disable-llvm-optzns -O3 -emit-llvm -o %t.opt
-// RUN: FileCheck --check-prefix=CHECK %s < %t
+// RUN: FileCheck %s < %t
 // RUN: FileCheck --check-prefix=CHECK-OPT %s < %t.opt
 
 namespace {
diff --git a/test/CodeGenCXX/vtable-pointer-initialization.cpp b/test/CodeGenCXX/vtable-pointer-initialization.cpp
index 2854291e2965c..130a55cf62182 100644
--- a/test/CodeGenCXX/vtable-pointer-initialization.cpp
+++ b/test/CodeGenCXX/vtable-pointer-initialization.cpp
@@ -21,13 +21,13 @@ struct A : Base {
 
 // CHECK-LABEL: define void @_ZN1AC2Ev(%struct.A* %this) unnamed_addr
 // CHECK: call void @_ZN4BaseC2Ev(
-// CHECK: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTV1A, i64 0, i64 2) to i32 (...)**)
+// CHECK: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTV1A, i32 0, i32 2) to i32 (...)**)
 // CHECK: call void @_ZN5FieldC1Ev(
 // CHECK: ret void
 A::A() { }
 
 // CHECK-LABEL: define void @_ZN1AD2Ev(%struct.A* %this) unnamed_addr
-// CHECK: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTV1A, i64 0, i64 2) to i32 (...)**)
+// CHECK: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTV1A, i32 0, i32 2) to i32 (...)**)
 // CHECK: call void @_ZN5FieldD1Ev(
 // CHECK: call void @_ZN4BaseD2Ev(
 // CHECK: ret void
@@ -49,12 +49,12 @@ void f() { B b; }
 
 // CHECK-LABEL: define linkonce_odr void @_ZN1BC2Ev(%struct.B* %this) unnamed_addr
 // CHECK: call void @_ZN4BaseC2Ev(
-// CHECK: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTV1B, i64 0, i64 2) to i32 (...)**)
+// CHECK: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTV1B, i32 0, i32 2) to i32 (...)**)
 // CHECK: call void @_ZN5FieldC1Ev
 // CHECK: ret void
 
 // CHECK-LABEL: define linkonce_odr void @_ZN1BD2Ev(%struct.B* %this) unnamed_addr
-// CHECK: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTV1B, i64 0, i64 2) to i32 (...)**)
+// CHECK: store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*], [3 x i8*]* @_ZTV1B, i32 0, i32 2) to i32 (...)**)
 // CHECK: call void @_ZN5FieldD1Ev(
 // CHECK: call void @_ZN4BaseD2Ev(
 // CHECK: ret void
diff --git a/test/CodeGenCXX/vtt-layout.cpp b/test/CodeGenCXX/vtt-layout.cpp
index 2f441ff7e18a5..7ff93dd451c9d 100644
--- a/test/CodeGenCXX/vtt-layout.cpp
+++ b/test/CodeGenCXX/vtt-layout.cpp
@@ -78,9 +78,11 @@ namespace Test6 {
   }
 }
 
-// CHECK: @_ZTTN5Test11BE = unnamed_addr constant [1 x i8*] [i8* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTVN5Test11BE, i64 0, i64 3) to i8*)]
+// CHECK: @_ZTTN5Test11BE = unnamed_addr constant [1 x i8*] [i8* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTVN5Test11BE, i32 0, i32 3) to i8*)]
 // CHECK: @_ZTVN5Test51AE = unnamed_addr constant [4 x i8*] [i8* null, i8* bitcast ({ i8*, i8* }* @_ZTIN5Test51AE to i8*), i8* bitcast (void ()* @__cxa_pure_virtual to i8*), i8* bitcast (void (%"struct.Test5::A"*)* @_ZN5Test51A6anchorEv to i8*)]
 // CHECK: @_ZTVN5Test61AE = unnamed_addr constant [4 x i8*] [i8* null, i8* bitcast ({ i8*, i8* }* @_ZTIN5Test61AE to i8*), i8* bitcast (void ()* @__cxa_deleted_virtual to i8*), i8* bitcast (void (%"struct.Test6::A"*)* @_ZN5Test61A6anchorEv to i8*)]
-// CHECK: @_ZTTN5Test21CE = linkonce_odr unnamed_addr constant [2 x i8*] [i8* bitcast (i8** getelementptr inbounds ([5 x i8*], [5 x i8*]* @_ZTVN5Test21CE, i64 0, i64 4) to i8*), i8* bitcast (i8** getelementptr inbounds ([5 x i8*], [5 x i8*]* @_ZTVN5Test21CE, i64 0, i64 4) to i8*)] 
-// CHECK: @_ZTTN5Test31DE = linkonce_odr unnamed_addr constant [13 x i8*] [i8* bitcast (i8** getelementptr inbounds ([19 x i8*], [19 x i8*]* @_ZTVN5Test31DE, i64 0, i64 5) to i8*), i8* bitcast (i8** getelementptr inbounds ([7 x i8*], [7 x i8*]* @_ZTCN5Test31DE0_NS_2C1E, i64 0, i64 3) to i8*), i8* bitcast (i8** getelementptr inbounds ([7 x i8*], [7 x i8*]* @_ZTCN5Test31DE0_NS_2C1E, i64 0, i64 6) to i8*), i8* bitcast (i8** getelementptr inbounds ([14 x i8*], [14 x i8*]* @_ZTCN5Test31DE16_NS_2C2E, i64 0, i64 6) to i8*), i8* bitcast (i8** getelementptr inbounds ([14 x i8*], [14 x i8*]* @_ZTCN5Test31DE16_NS_2C2E, i64 0, i64 6) to i8*), i8* bitcast (i8** getelementptr inbounds ([14 x i8*], [14 x i8*]* @_ZTCN5Test31DE16_NS_2C2E, i64 0, i64 10) to i8*), i8* bitcast (i8** getelementptr inbounds ([14 x i8*], [14 x i8*]* @_ZTCN5Test31DE16_NS_2C2E, i64 0, i64 13) to i8*), i8* bitcast (i8** getelementptr inbounds ([19 x i8*], [19 x i8*]* @_ZTVN5Test31DE, i64 0, i64 15) to i8*), i8* bitcast (i8** getelementptr inbounds ([19 x i8*], [19 x i8*]* @_ZTVN5Test31DE, i64 0, i64 11) to i8*), i8* bitcast (i8** getelementptr inbounds ([19 x i8*], [19 x i8*]* @_ZTVN5Test31DE, i64 0, i64 11) to i8*), i8* bitcast (i8** getelementptr inbounds ([19 x i8*], [19 x i8*]* @_ZTVN5Test31DE, i64 1, i64 0) to i8*), i8* bitcast (i8** getelementptr inbounds ([7 x i8*], [7 x i8*]* @_ZTCN5Test31DE64_NS_2V2E, i64 0, i64 3) to i8*), i8* bitcast (i8** getelementptr inbounds ([7 x i8*], [7 x i8*]* @_ZTCN5Test31DE64_NS_2V2E, i64 0, i64 6) to i8*)] 
-// CHECK: @_ZTTN5Test41DE = linkonce_odr unnamed_addr constant [19 x i8*] [i8* bitcast (i8** getelementptr inbounds ([25 x i8*], [25 x i8*]* @_ZTVN5Test41DE, i64 0, i64 6) to i8*), i8* bitcast (i8** getelementptr inbounds ([11 x i8*], [11 x i8*]* @_ZTCN5Test41DE0_NS_2C1E, i64 0, i64 4) to i8*), i8* bitcast (i8** getelementptr inbounds ([11 x i8*], [11 x i8*]* @_ZTCN5Test41DE0_NS_2C1E, i64 0, i64 7) to i8*), i8* bitcast (i8** getelementptr inbounds ([11 x i8*], [11 x i8*]* @_ZTCN5Test41DE0_NS_2C1E, i64 0, i64 10) to i8*), i8* bitcast (i8** getelementptr inbounds ([19 x i8*], [19 x i8*]* @_ZTCN5Test41DE16_NS_2C2E, i64 0, i64 7) to i8*), i8* bitcast (i8** getelementptr inbounds ([19 x i8*], [19 x i8*]* @_ZTCN5Test41DE16_NS_2C2E, i64 0, i64 7) to i8*), i8* bitcast (i8** getelementptr inbounds ([19 x i8*], [19 x i8*]* @_ZTCN5Test41DE16_NS_2C2E, i64 0, i64 12) to i8*), i8* bitcast (i8** getelementptr inbounds ([19 x i8*], [19 x i8*]* @_ZTCN5Test41DE16_NS_2C2E, i64 0, i64 15) to i8*), i8* bitcast (i8** getelementptr inbounds ([19 x i8*], [19 x i8*]* @_ZTCN5Test41DE16_NS_2C2E, i64 0, i64 18) to i8*), i8* bitcast (i8** getelementptr inbounds ([25 x i8*], [25 x i8*]* @_ZTVN5Test41DE, i64 0, i64 17) to i8*), i8* bitcast (i8** getelementptr inbounds ([25 x i8*], [25 x i8*]* @_ZTVN5Test41DE, i64 0, i64 20) to i8*), i8* bitcast (i8** getelementptr inbounds ([25 x i8*], [25 x i8*]* @_ZTVN5Test41DE, i64 0, i64 13) to i8*), i8* bitcast (i8** getelementptr inbounds ([25 x i8*], [25 x i8*]* @_ZTVN5Test41DE, i64 0, i64 13) to i8*), i8* bitcast (i8** getelementptr inbounds ([25 x i8*], [25 x i8*]* @_ZTVN5Test41DE, i64 1, i64 0) to i8*), i8* bitcast (i8** getelementptr inbounds ([7 x i8*], [7 x i8*]* @_ZTCN5Test41DE40_NS_2V1E, i64 0, i64 3) to i8*), i8* bitcast (i8** getelementptr inbounds ([7 x i8*], [7 x i8*]* @_ZTCN5Test41DE40_NS_2V1E, i64 0, i64 6) to i8*), i8* bitcast (i8** getelementptr inbounds ([11 x i8*], [11 x i8*]* @_ZTCN5Test41DE72_NS_2V2E, i64 0, i64 4) to i8*), i8* bitcast (i8** getelementptr inbounds ([11 x i8*], [11 x i8*]* @_ZTCN5Test41DE72_NS_2V2E, i64 0, i64 7) to i8*), i8* bitcast (i8** getelementptr inbounds ([11 x i8*], [11 x i8*]* @_ZTCN5Test41DE72_NS_2V2E, i64 0, i64 10) to i8*)] 
+// CHECK: @_ZTTN5Test21CE = linkonce_odr unnamed_addr constant [2 x i8*] [i8* bitcast (i8** getelementptr inbounds ([5 x i8*], [5 x i8*]* @_ZTVN5Test21CE, i32 0, i32 4) to i8*), i8* bitcast (i8** getelementptr inbounds ([5 x i8*], [5 x i8*]* @_ZTVN5Test21CE, i32 0, i32 4) to i8*)] 
+// CHECK: @_ZTTN5Test31DE = linkonce_odr unnamed_addr constant [13 x i8*] [i8* bitcast (i8** getelementptr inbounds ([19 x i8*], [19 x i8*]* @_ZTVN5Test31DE, i32 0, i32 5) to i8*), i8* bitcast (i8** getelementptr inbounds ([7 x i8*], [7 x i8*]* @_ZTCN5Test31DE0_NS_2C1E, i32 0, i32 3) to i8*), i8* bitcast (i8** getelementptr inbounds ([7 x i8*], [7 x i8*]* @_ZTCN5Test31DE0_NS_2C1E, i32 0, i32 6) to i8*), i8* bitcast (i8** getelementptr inbounds ([14 x i8*], [14 x i8*]* @_ZTCN5Test31DE16_NS_2C2E, i32 0, i32 6) to i8*), i8* bitcast (i8** getelementptr inbounds ([14 x i8*], [14 x i8*]* @_ZTCN5Test31DE16_NS_2C2E, i32 0, i32 6) to i8*), i8* bitcast (i8** getelementptr inbounds ([14 x i8*], [14 x i8*]* @_ZTCN5Test31DE16_NS_2C2E, i32 0, i32 10) to i8*), i8* bitcast (i8** getelementptr inbounds ([14 x i8*], [14 x i8*]* @_ZTCN5Test31DE16_NS_2C2E, i32 0, i32 13) to i8*), i8* bitcast (i8** getelementptr inbounds ([19 x i8*], [19 x i8*]* @_ZTVN5Test31DE, i32 0, i32 15) to i8*), i8* bitcast (i8** getelementptr inbounds ([19 x i8*], [19 x i8*]* @_ZTVN5Test31DE, i32 0, i32 11) to i8*), i8* bitcast (i8** getelementptr inbounds ([19 x i8*], [19 x i8*]* @_ZTVN5Test31DE, i32 0, i32 11) to i8*), i8* bitcast (i8** getelementptr inbounds ([19 x i8*], [19 x i8*]* @_ZTVN5Test31DE, i64 1, i32 0) to i8*), i8* bitcast (i8** getelementptr inbounds ([7 x i8*], [7 x i8*]* @_ZTCN5Test31DE64_NS_2V2E, i32 0, i32 3) to i8*), i8* bitcast (i8** getelementptr inbounds ([7 x i8*], [7 x i8*]* @_ZTCN5Test31DE64_NS_2V2E, i32 0, i32 6) to i8*)] 
+// CHECK: @_ZTTN5Test41DE = linkonce_odr unnamed_addr constant [19 x i8*] [i8* bitcast (i8** getelementptr inbounds ([25 x i8*], [25 x i8*]* @_ZTVN5Test41DE, i32 0, i32 6) to i8*), i8* bitcast (i8** getelementptr inbounds ([11 x i8*], [11 x i8*]* @_ZTCN5Test41DE0_NS_2C1E, i32 0, i32 4) to i8*), i8* bitcast (i8** getelementptr inbounds ([11 x i8*], [11 x i8*]* @_ZTCN5Test41DE0_NS_2C1E, i32 0, i32 7) to i8*), i8* bitcast (i8** getelementptr inbounds ([11 x i8*], [11 x i8*]* @_ZTCN5Test41DE0_NS_2C1E, i32 0, i32 10) to i8*), i8* bitcast (i8** getelementptr inbounds ([19 x i8*], [19 x i8*]* @_ZTCN5Test41DE16_NS_2C2E, i32 0, i32 7) to i8*), i8* bitcast (i8** getelementptr inbounds ([19 x i8*], [19 x i8*]* @_ZTCN5Test41DE16_NS_2C2E, i32 0, i32 7) to i8*), i8* bitcast (i8** getelementptr inbounds ([19 x i8*], [19 x i8*]* @_ZTCN5Test41DE16_NS_2C2E, i32 0, i32 12) to i8*), i8* bitcast (i8** getelementptr inbounds ([19 x i8*], [19 x i8*]* @_ZTCN5Test41DE16_NS_2C2E, i32 0, i32 15) to i8*), i8* bitcast (i8** getelementptr inbounds ([19 x i8*], [19 x i8*]* @_ZTCN5Test41DE16_NS_2C2E, i32 0, i32 18) to i8*), i8* bitcast (i8** getelementptr inbounds ([25 x i8*], [25 x i8*]* @_ZTVN5Test41DE, i32 0, i32 17) to i8*), i8* bitcast (i8** getelementptr inbounds ([25 x i8*], [25 x i8*]* @_ZTVN5Test41DE, i32 0, i32 20) to i8*), i8* bitcast (i8** getelementptr inbounds ([25 x i8*], [25 x i8*]* @_ZTVN5Test41DE, i32 0, i32 13) to i8*), i8* bitcast (i8** getelementptr inbounds ([25 x i8*], [25 x i8*]* @_ZTVN5Test41DE, i32 0, i32 13) to i8*), i8* bitcast (i8** getelementptr inbounds ([25 x i8*], [25 x i8*]* @_ZTVN5Test41DE, i64 1, i32 0) to i8*), i8* bitcast (i8** getelementptr inbounds ([7 x i8*], [7 x i8*]* @_ZTCN5Test41DE40_NS_2V1E, i32 0, i32 3) to i8*), i8* bitcast (i8** getelementptr inbounds ([7 x i8*], [7 x i8*]* @_ZTCN5Test41DE40_NS_2V1E, i32 0, i32 6) to i8*), i8* bitcast (i8** getelementptr inbounds ([11 x i8*], [11 x i8*]* @_ZTCN5Test41DE72_NS_2V2E, i32 0, i32 4) to i8*), i8* bitcast (i8** getelementptr inbounds ([11 x i8*], [11 x i8*]* @_ZTCN5Test41DE72_NS_2V2E, i32 0, i32 7) to i8*), i8* bitcast (i8** getelementptr inbounds ([11 x i8*], [11 x i8*]* @_ZTCN5Test41DE72_NS_2V2E, i32 0, i32 10) to i8*)] 
+// CHECK: declare void @__cxa_pure_virtual() unnamed_addr
+// CHECK: declare void @__cxa_deleted_virtual() unnamed_addr
diff --git a/test/CodeGenObjC/2009-08-05-utf16.m b/test/CodeGenObjC/2009-08-05-utf16.m
index 18ac1db79e24d..92394d9383eb5 100644
--- a/test/CodeGenObjC/2009-08-05-utf16.m
+++ b/test/CodeGenObjC/2009-08-05-utf16.m
@@ -1,4 +1,6 @@
-// RUN: %clang_cc1 -emit-llvm -w -x objective-c %s -o - | FileCheck %s
+// REQUIRES: x86-registered-target
+
+// RUN: %clang_cc1 -triple x86_64-macho -emit-llvm -w -x objective-c %s -o - | FileCheck %s
 // rdar://7095855 rdar://7115749
 
 // CHECK: private unnamed_addr constant [6 x i16] [i16 105, i16 80, i16 111, i16 100, i16 8482, i16 0], section "__TEXT,__ustring", align 2
diff --git a/test/CodeGenObjC/2010-02-01-utf16-with-null.m b/test/CodeGenObjC/2010-02-01-utf16-with-null.m
index 46ce3b289fc1a..7c103f2ba6d45 100644
--- a/test/CodeGenObjC/2010-02-01-utf16-with-null.m
+++ b/test/CodeGenObjC/2010-02-01-utf16-with-null.m
@@ -2,6 +2,6 @@
 // rdar://7589850
 
 // CHECK: @.str = private unnamed_addr constant [9 x i16] [i16 103, i16 111, i16 111, i16 100, i16 0, i16 98, i16 121, i16 101, i16 0], section "__TEXT,__ustring", align 2
-// CHECK: @_unnamed_cfstring_ = private constant %struct.NSConstantString { i32* getelementptr inbounds ([0 x i32], [0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 2000, i8* bitcast ([9 x i16]* @.str to i8*), i32 8 }, section "__DATA,__cfstring"
-// CHECK: @P = global i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring_ to i8*), align 4
+// CHECK: @_unnamed_cfstring_ = private constant %struct.__NSConstantString_tag { i32* getelementptr inbounds ([0 x i32], [0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 2000, i8* bitcast ([9 x i16]* @.str to i8*), i32 8 }, section "__DATA,__cfstring"
+// CHECK: @P = global i8* bitcast (%struct.__NSConstantString_tag* @_unnamed_cfstring_ to i8*), align 4
 void *P = @"good\0bye";
diff --git a/test/CodeGenObjC/arc-foreach.m b/test/CodeGenObjC/arc-foreach.m
index 90d9c1f126173..db150e88a59f5 100644
--- a/test/CodeGenObjC/arc-foreach.m
+++ b/test/CodeGenObjC/arc-foreach.m
@@ -170,4 +170,55 @@ void test3(NSArray *array) {
   // CHECK-LP64-NEXT: br label [[L]]
 }
 
+@interface NSObject @end
+
+@interface I1 : NSObject
+- (NSArray *) foo1:(void (^)(void))block;
+- (void) foo2;
+@end
+
+NSArray *array4;
+
+@implementation I1 : NSObject
+- (NSArray *) foo1:(void (^)(void))block {
+  block();
+  return array4;
+}
+
+- (void) foo2 {
+  for (id x in [self foo1:^{ use(self); }]) {
+    use(x);
+    break;
+  }
+}
+@end
+
+// CHECK-LP64-LABEL: define internal void @"\01-[I1 foo2]"(
+// CHECK-LP64:         [[SELF_ADDR:%.*]] = alloca [[TY:%.*]]*,
+// CHECK-LP64:         [[BLOCK:%.*]] = alloca <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, [[TY]]* }>,
+// CHECK-LP64:         store [[TY]]* %self, [[TY]]** [[SELF_ADDR]]
+// CHECK-LP64:         [[T0:%.*]] = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, [[TY]]* }>, <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, [[TY]]* }>* [[BLOCK]], i32 0, i32 5
+// CHECK-LP64:         [[BC:%.*]] = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, [[TY]]* }>, <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, [[TY]]* }>* [[BLOCK]], i32 0, i32 5
+// CHECK-LP64:         [[T1:%.*]] = load [[TY]]*, [[TY]]** [[SELF_ADDR]]
+// CHECK-LP64:         [[T2:%.*]] = bitcast [[TY]]* [[T1]] to i8*
+// CHECK-LP64:         [[T3:%.*]] = call i8* @objc_retain(i8* [[T2]])
+// CHECK-LP64:         [[T4:%.*]] = bitcast i8* [[T3]] to [[TY]]*
+// CHECK-LP64:         store [[TY]]* [[T4]], [[TY]]** [[BC]]
+
+// CHECK-LP64:         [[T5:%.*]] = bitcast [[TY]]** [[T0]] to i8**
+// CHECK-LP64:         call void @objc_storeStrong(i8** [[T5]], i8* null)
+// CHECK-LP64:         switch i32 {{%.*}}, label %[[UNREACHABLE:.*]] [
+// CHECK-LP64-NEXT:      i32 0, label %[[CLEANUP_CONT:.*]]
+// CHECK-LP64-NEXT:      i32 2, label %[[FORCOLL_END:.*]]
+// CHECK-LP64-NEXT:    ]
+
+// CHECK-LP64:       {{^|:}}[[CLEANUP_CONT]]
+// CHECK-LP64-NEXT:    br label %[[FORCOLL_END]]
+
+// CHECK-LP64:       {{^|:}}[[FORCOLL_END]]
+// CHECK-LP64-NEXT:    ret void
+
+// CHECK-LP64:       {{^|:}}[[UNREACHABLE]]
+// CHECK-LP64-NEXT:    unreachable
+
 // CHECK-LP64: attributes [[NUW]] = { nounwind }
diff --git a/test/CodeGenObjC/arc-i386.m b/test/CodeGenObjC/arc-i386.m
new file mode 100644
index 0000000000000..7693a8f2b6ff3
--- /dev/null
+++ b/test/CodeGenObjC/arc-i386.m
@@ -0,0 +1,43 @@
+// RUN: %clang_cc1 -triple i386-apple-darwin10 -emit-llvm -fblocks -fobjc-arc -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple i386-apple-iossimulator6.0 -emit-llvm -fblocks -fobjc-arc -o - %s | FileCheck %s
+
+// <rdar://24531556>: implement objc_retainAutoreleasedReturnValue on i386
+
+// CHECK-LABEL: define i8* @test0()
+id test0(void) {
+  extern id test0_helper(void);
+  // CHECK:      [[T0:%.*]] = call i8* @test0_helper()
+  // CHECK-NEXT: ret i8* [[T0]]
+  return test0_helper();
+}
+
+// CHECK-LABEL: define void @test1()
+void test1(void) {
+  extern id test1_helper(void);
+  // CHECK:      [[T0:%.*]] = call i8* @test1_helper()
+  // CHECK-NEXT: call void asm sideeffect "mov
+  // CHECK-NEXT: [[T1:%.*]] = call i8* @objc_retainAutoreleasedReturnValue(i8* [[T0]])
+  // CHECK-NEXT: store i8* [[T1]],
+  // CHECK-NEXT: call void @objc_storeStrong(
+  // CHECK-NEXT: ret void
+  id x = test1_helper();
+}
+
+// rdar://problem/12133032
+// CHECK-LABEL: define {{.*}} @test2()
+@class A;
+A *test2(void) {
+  extern A *test2_helper(void);
+  // CHECK:      [[T0:%.*]] = call [[A:%.*]]* @test2_helper()
+  // CHECK-NEXT: ret [[A]]* [[T0]]
+  return test2_helper();
+}
+
+// CHECK-LABEL: define i8* @test3()
+id test3(void) {
+  extern A *test3_helper(void);
+  // CHECK:      [[T0:%.*]] = call [[A]]* @test3_helper()
+  // CHECK-NEXT: [[T1:%.*]] = bitcast [[A]]* [[T0]] to i8*
+  // CHECK-NEXT: ret i8* [[T1]]
+  return test3_helper();
+}
diff --git a/test/CodeGenObjC/arc-no-arc-exceptions.m b/test/CodeGenObjC/arc-no-arc-exceptions.m
index 82977b0a17513..f147b64c4841c 100644
--- a/test/CodeGenObjC/arc-no-arc-exceptions.m
+++ b/test/CodeGenObjC/arc-no-arc-exceptions.m
@@ -34,7 +34,7 @@ void test1(id x) {
 void NSLog(id, ...);
 
 // CHECK-LABEL: define void @test2(
-// CHECK: invoke void (i8*, ...) @NSLog(i8* bitcast (%struct.NSConstantString* @_unnamed_cfstring_ to i8*), i32* %{{.*}})
+// CHECK: invoke void (i8*, ...) @NSLog(i8* bitcast (%struct.__NSConstantString_tag* @_unnamed_cfstring_ to i8*), i32* %{{.*}})
 // CHECK:   to label %{{.*}} unwind label %{{.*}}, !clang.arc.no_objc_arc_exceptions !
 // NO-METADATA-LABEL: define void @test2(
 // NO-METADATA-NOT: !clang.arc.no_objc_arc_exceptions
diff --git a/test/CodeGenObjC/arc-unsafeclaim.m b/test/CodeGenObjC/arc-unsafeclaim.m
new file mode 100644
index 0000000000000..cda00b0a2f5e0
--- /dev/null
+++ b/test/CodeGenObjC/arc-unsafeclaim.m
@@ -0,0 +1,231 @@
+//   Make sure it works on x86-64.
+// RUN: %clang_cc1 -triple x86_64-apple-darwin11 -fobjc-runtime=macosx-10.11 -fobjc-arc -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-UNOPTIMIZED
+
+//   Make sure it works on ARM.
+// RUN: %clang_cc1 -triple arm64-apple-ios9 -fobjc-runtime=ios-9.0 -fobjc-arc -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-UNOPTIMIZED -check-prefix=CHECK-MARKED
+// RUN: %clang_cc1 -triple arm64-apple-ios9 -fobjc-runtime=ios-9.0 -fobjc-arc -O -disable-llvm-optzns -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-OPTIMIZED
+
+//   Make sure it works on ARM64.
+// RUN: %clang_cc1 -triple armv7-apple-ios9 -fobjc-runtime=ios-9.0 -fobjc-arc -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-UNOPTIMIZED -check-prefix=CHECK-MARKED
+// RUN: %clang_cc1 -triple armv7-apple-ios9 -fobjc-runtime=ios-9.0 -fobjc-arc -O -disable-llvm-optzns -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-OPTIMIZED
+
+//   Make sure that it's implicitly disabled if the runtime version isn't high enough.
+// RUN: %clang_cc1 -triple x86_64-apple-darwin10 -fobjc-runtime=macosx-10.10 -fobjc-arc -emit-llvm -o - %s | FileCheck %s -check-prefix=DISABLED
+// RUN: %clang_cc1 -triple arm64-apple-ios8 -fobjc-runtime=ios-8 -fobjc-arc -emit-llvm -o - %s | FileCheck %s -check-prefix=DISABLED -check-prefix=DISABLED-MARKED
+
+@class A;
+
+A *makeA(void);
+
+void test_assign() {
+  __unsafe_unretained id x;
+  x = makeA();
+}
+// CHECK-LABEL:        define void @test_assign()
+// CHECK:                [[X:%.*]] = alloca i8*
+// CHECK:                [[T0:%.*]] = call [[A:.*]]* @makeA()
+// CHECK-MARKED-NEXT:    call void asm sideeffect
+// CHECK-NEXT:           [[T1:%.*]] = bitcast [[A]]* [[T0]] to i8*
+// CHECK-NEXT:           [[T2:%.*]] = call i8* @objc_unsafeClaimAutoreleasedReturnValue(i8* [[T1]])
+// CHECK-NEXT:           [[T3:%.*]] = bitcast i8* [[T2]] to [[A]]*
+// CHECK-NEXT:           [[T4:%.*]] = bitcast [[A]]* [[T3]] to i8*
+// CHECK-NEXT:           store i8* [[T4]], i8** [[X]]
+// CHECK-OPTIMIZED-NEXT: bitcast
+// CHECK-OPTIMIZED-NEXT: lifetime.end
+// CHECK-NEXT:           ret void
+
+// DISABLED-LABEL:     define void @test_assign()
+// DISABLED:             [[T0:%.*]] = call [[A:.*]]* @makeA()
+// DISABLED-MARKED-NEXT: call void asm sideeffect
+// DISABLED-NEXT:        [[T1:%.*]] = bitcast [[A]]* [[T0]] to i8*
+// DISABLED-NEXT:        [[T2:%.*]] = call i8* @objc_retainAutoreleasedReturnValue(i8* [[T1]])
+
+void test_assign_assign() {
+  __unsafe_unretained id x, y;
+  x = y = makeA();
+}
+// CHECK-LABEL:        define void @test_assign_assign()
+// CHECK:                [[X:%.*]] = alloca i8*
+// CHECK:                [[Y:%.*]] = alloca i8*
+// CHECK:                [[T0:%.*]] = call [[A]]* @makeA()
+// CHECK-MARKED-NEXT:    call void asm sideeffect
+// CHECK-NEXT:           [[T1:%.*]] = bitcast [[A]]* [[T0]] to i8*
+// CHECK-NEXT:           [[T2:%.*]] = call i8* @objc_unsafeClaimAutoreleasedReturnValue(i8* [[T1]])
+// CHECK-NEXT:           [[T3:%.*]] = bitcast i8* [[T2]] to [[A]]*
+// CHECK-NEXT:           [[T4:%.*]] = bitcast [[A]]* [[T3]] to i8*
+// CHECK-NEXT:           store i8* [[T4]], i8** [[Y]]
+// CHECK-NEXT:           store i8* [[T4]], i8** [[X]]
+// CHECK-OPTIMIZED-NEXT: bitcast
+// CHECK-OPTIMIZED-NEXT: lifetime.end
+// CHECK-OPTIMIZED-NEXT: bitcast
+// CHECK-OPTIMIZED-NEXT: lifetime.end
+// CHECK-NEXT:           ret void
+
+void test_strong_assign_assign() {
+  __strong id x;
+  __unsafe_unretained id y;
+  x = y = makeA();
+}
+// CHECK-LABEL:        define void @test_strong_assign_assign()
+// CHECK:                [[X:%.*]] = alloca i8*
+// CHECK:                [[Y:%.*]] = alloca i8*
+// CHECK:                [[T0:%.*]] = call [[A]]* @makeA()
+// CHECK-MARKED-NEXT:    call void asm sideeffect
+// CHECK-NEXT:           [[T1:%.*]] = bitcast [[A]]* [[T0]] to i8*
+// CHECK-NEXT:           [[T2:%.*]] = call i8* @objc_retainAutoreleasedReturnValue(i8* [[T1]])
+// CHECK-NEXT:           [[T3:%.*]] = bitcast i8* [[T2]] to [[A]]*
+// CHECK-NEXT:           [[T4:%.*]] = bitcast [[A]]* [[T3]] to i8*
+// CHECK-NEXT:           store i8* [[T4]], i8** [[Y]]
+// CHECK-NEXT:           [[OLD:%.*]] = load i8*, i8** [[X]]
+// CHECK-NEXT:           store i8* [[T4]], i8** [[X]]
+// CHECK-NEXT:           call void @objc_release(i8* [[OLD]]
+// CHECK-OPTIMIZED-NEXT: bitcast
+// CHECK-OPTIMIZED-NEXT: lifetime.end
+// CHECK-UNOPTIMIZED-NEXT: call void @objc_storeStrong(i8** [[X]], i8* null)
+// CHECK-OPTIMIZED-NEXT: [[T0:%.*]] = load i8*, i8** [[X]]
+// CHECK-OPTIMIZED-NEXT: call void @objc_release(i8* [[T0]])
+// CHECK-OPTIMIZED-NEXT: bitcast
+// CHECK-OPTIMIZED-NEXT: lifetime.end
+// CHECK-NEXT:           ret void
+
+void test_assign_strong_assign() {
+  __unsafe_unretained id x;
+  __strong id y;
+  x = y = makeA();
+}
+// CHECK-LABEL:        define void @test_assign_strong_assign()
+// CHECK:                [[X:%.*]] = alloca i8*
+// CHECK:                [[Y:%.*]] = alloca i8*
+// CHECK:                [[T0:%.*]] = call [[A]]* @makeA()
+// CHECK-MARKED-NEXT:    call void asm sideeffect
+// CHECK-NEXT:           [[T1:%.*]] = bitcast [[A]]* [[T0]] to i8*
+// CHECK-NEXT:           [[T2:%.*]] = call i8* @objc_retainAutoreleasedReturnValue(i8* [[T1]])
+// CHECK-NEXT:           [[T3:%.*]] = bitcast i8* [[T2]] to [[A]]*
+// CHECK-NEXT:           [[T4:%.*]] = bitcast [[A]]* [[T3]] to i8*
+// CHECK-NEXT:           [[OLD:%.*]] = load i8*, i8** [[Y]]
+// CHECK-NEXT:           store i8* [[T4]], i8** [[Y]]
+// CHECK-NEXT:           call void @objc_release(i8* [[OLD]]
+// CHECK-NEXT:           store i8* [[T4]], i8** [[X]]
+// CHECK-UNOPTIMIZED-NEXT: call void @objc_storeStrong(i8** [[Y]], i8* null)
+// CHECK-OPTIMIZED-NEXT: [[T0:%.*]] = load i8*, i8** [[Y]]
+// CHECK-OPTIMIZED-NEXT: call void @objc_release(i8* [[T0]])
+// CHECK-OPTIMIZED-NEXT: bitcast
+// CHECK-OPTIMIZED-NEXT: lifetime.end
+// CHECK-OPTIMIZED-NEXT: bitcast
+// CHECK-OPTIMIZED-NEXT: lifetime.end
+// CHECK-NEXT:           ret void
+
+void test_init() {
+  __unsafe_unretained id x = makeA();
+}
+// CHECK-LABEL:        define void @test_init()
+// CHECK:                [[X:%.*]] = alloca i8*
+// CHECK:                [[T0:%.*]] = call [[A]]* @makeA()
+// CHECK-MARKED-NEXT:    call void asm sideeffect
+// CHECK-NEXT:           [[T1:%.*]] = bitcast [[A]]* [[T0]] to i8*
+// CHECK-NEXT:           [[T2:%.*]] = call i8* @objc_unsafeClaimAutoreleasedReturnValue(i8* [[T1]])
+// CHECK-NEXT:           [[T3:%.*]] = bitcast i8* [[T2]] to [[A]]*
+// CHECK-NEXT:           [[T4:%.*]] = bitcast [[A]]* [[T3]] to i8*
+// CHECK-NEXT:           store i8* [[T4]], i8** [[X]]
+// CHECK-OPTIMIZED-NEXT: bitcast
+// CHECK-OPTIMIZED-NEXT: lifetime.end
+// CHECK-NEXT:           ret void
+
+void test_init_assignment() {
+  __unsafe_unretained id x;
+  __unsafe_unretained id y = x = makeA();
+}
+// CHECK-LABEL:        define void @test_init_assignment()
+// CHECK:                [[X:%.*]] = alloca i8*
+// CHECK:                [[Y:%.*]] = alloca i8*
+// CHECK:                [[T0:%.*]] = call [[A]]* @makeA()
+// CHECK-MARKED-NEXT:    call void asm sideeffect
+// CHECK-NEXT:           [[T1:%.*]] = bitcast [[A]]* [[T0]] to i8*
+// CHECK-NEXT:           [[T2:%.*]] = call i8* @objc_unsafeClaimAutoreleasedReturnValue(i8* [[T1]])
+// CHECK-NEXT:           [[T3:%.*]] = bitcast i8* [[T2]] to [[A]]*
+// CHECK-NEXT:           [[T4:%.*]] = bitcast [[A]]* [[T3]] to i8*
+// CHECK-NEXT:           store i8* [[T4]], i8** [[X]]
+// CHECK-NEXT:           store i8* [[T4]], i8** [[Y]]
+// CHECK-OPTIMIZED-NEXT: bitcast
+// CHECK-OPTIMIZED-NEXT: lifetime.end
+// CHECK-OPTIMIZED-NEXT: bitcast
+// CHECK-OPTIMIZED-NEXT: lifetime.end
+// CHECK-NEXT: ret void
+
+void test_strong_init_assignment() {
+  __unsafe_unretained id x;
+  __strong id y = x = makeA();
+}
+// CHECK-LABEL:        define void @test_strong_init_assignment()
+// CHECK:                [[X:%.*]] = alloca i8*
+// CHECK:                [[Y:%.*]] = alloca i8*
+// CHECK:                [[T0:%.*]] = call [[A]]* @makeA()
+// CHECK-MARKED-NEXT:    call void asm sideeffect
+// CHECK-NEXT:           [[T1:%.*]] = bitcast [[A]]* [[T0]] to i8*
+// CHECK-NEXT:           [[T2:%.*]] = call i8* @objc_retainAutoreleasedReturnValue(i8* [[T1]])
+// CHECK-NEXT:           [[T3:%.*]] = bitcast i8* [[T2]] to [[A]]*
+// CHECK-NEXT:           [[T4:%.*]] = bitcast [[A]]* [[T3]] to i8*
+// CHECK-NEXT:           store i8* [[T4]], i8** [[X]]
+// CHECK-NEXT:           store i8* [[T4]], i8** [[Y]]
+// CHECK-UNOPTIMIZED-NEXT: call void @objc_storeStrong(i8** [[Y]], i8* null)
+// CHECK-OPTIMIZED-NEXT: [[T0:%.*]] = load i8*, i8** [[Y]]
+// CHECK-OPTIMIZED-NEXT: call void @objc_release(i8* [[T0]])
+// CHECK-OPTIMIZED-NEXT: bitcast
+// CHECK-OPTIMIZED-NEXT: lifetime.end
+// CHECK-OPTIMIZED-NEXT: bitcast
+// CHECK-OPTIMIZED-NEXT: lifetime.end
+// CHECK-NEXT: ret void
+
+void test_init_strong_assignment() {
+  __strong id x;
+  __unsafe_unretained id y = x = makeA();
+}
+// CHECK-LABEL:        define void @test_init_strong_assignment()
+// CHECK:                [[X:%.*]] = alloca i8*
+// CHECK:                [[Y:%.*]] = alloca i8*
+// CHECK:                [[T0:%.*]] = call [[A]]* @makeA()
+// CHECK-MARKED-NEXT:    call void asm sideeffect
+// CHECK-NEXT:           [[T1:%.*]] = bitcast [[A]]* [[T0]] to i8*
+// CHECK-NEXT:           [[T2:%.*]] = call i8* @objc_retainAutoreleasedReturnValue(i8* [[T1]])
+// CHECK-NEXT:           [[T3:%.*]] = bitcast i8* [[T2]] to [[A]]*
+// CHECK-NEXT:           [[T4:%.*]] = bitcast [[A]]* [[T3]] to i8*
+// CHECK-NEXT:           [[OLD:%.*]] = load i8*, i8** [[X]]
+// CHECK-NEXT:           store i8* [[T4]], i8** [[X]]
+// CHECK-NEXT:           call void @objc_release(i8* [[OLD]])
+// CHECK-NEXT:           store i8* [[T4]], i8** [[Y]]
+// CHECK-OPTIMIZED-NEXT: bitcast
+// CHECK-OPTIMIZED-NEXT: lifetime.end
+// CHECK-UNOPTIMIZED-NEXT: call void @objc_storeStrong(i8** [[X]], i8* null)
+// CHECK-OPTIMIZED-NEXT: [[T0:%.*]] = load i8*, i8** [[X]]
+// CHECK-OPTIMIZED-NEXT: call void @objc_release(i8* [[T0]])
+// CHECK-OPTIMIZED-NEXT: bitcast
+// CHECK-OPTIMIZED-NEXT: lifetime.end
+// CHECK-NEXT: ret void
+
+void test_ignored() {
+  makeA();
+}
+// CHECK-LABEL:     define void @test_ignored()
+// CHECK:             [[T0:%.*]] = call [[A]]* @makeA()
+// CHECK-MARKED-NEXT: call void asm sideeffect
+// CHECK-NEXT:        [[T1:%.*]] = bitcast [[A]]* [[T0]] to i8*
+// CHECK-NEXT:        [[T2:%.*]] = call i8* @objc_unsafeClaimAutoreleasedReturnValue(i8* [[T1]])
+// CHECK-NEXT:        bitcast i8* [[T2]] to [[A]]*
+// CHECK-NEXT:        ret void
+
+void test_cast_to_void() {
+  (void) makeA();
+}
+// CHECK-LABEL:     define void @test_cast_to_void()
+// CHECK:             [[T0:%.*]] = call [[A]]* @makeA()
+// CHECK-MARKED-NEXT: call void asm sideeffect
+// CHECK-NEXT:        [[T1:%.*]] = bitcast [[A]]* [[T0]] to i8*
+// CHECK-NEXT:        [[T2:%.*]] = call i8* @objc_unsafeClaimAutoreleasedReturnValue(i8* [[T1]])
+// CHECK-NEXT:        bitcast i8* [[T2]] to [[A]]*
+// CHECK-NEXT:        ret void
+
+
+
+// This is always at the end of the module.
+
+// CHECK-OPTIMIZED: !clang.arc.retainAutoreleasedReturnValueMarker = !{!0}
diff --git a/test/CodeGenObjC/attr-objc-runtime-visible.m b/test/CodeGenObjC/attr-objc-runtime-visible.m
new file mode 100644
index 0000000000000..6e224e7189037
--- /dev/null
+++ b/test/CodeGenObjC/attr-objc-runtime-visible.m
@@ -0,0 +1,19 @@
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -fobjc-runtime=macosx-10.9.0 -emit-llvm %s -o - | FileCheck %s
+
+// RUN: %clang_cc1 -triple i386-apple-darwin -fobjc-runtime=macosx-fragile-10.9.0 -emit-llvm %s -o - | FileCheck %s
+
+@interface Root
++(Class)class;
+@end
+
+__attribute__((objc_runtime_visible))
+__attribute__((objc_runtime_name("MyRuntimeVisibleClass")))
+@interface A : Root
+@end
+
+// CHECK: [[CLASSNAME:@.*]] = private unnamed_addr constant [22 x i8] c"MyRuntimeVisibleClass
+// CHECK: define i8* @getClass() #0 {
+Class getClass(void) {
+  // CHECK: call i8* @objc_lookUpClass(i8* getelementptr inbounds ([22 x i8], [22 x i8]* [[CLASSNAME]], i32 0, i32 0)) #2
+  return [A class];
+}
diff --git a/test/CodeGenObjC/constant-strings.m b/test/CodeGenObjC/constant-strings.m
index 0a65496373580..a1daa9284b81c 100644
--- a/test/CodeGenObjC/constant-strings.m
+++ b/test/CodeGenObjC/constant-strings.m
@@ -1,15 +1,17 @@
-// RUN: %clang_cc1 -emit-llvm -o %t %s
+// REQUIRES: x86-registered-target
+
+// RUN: %clang_cc1 -triple x86_64-macho -emit-llvm -o %t %s
 // RUN: FileCheck --check-prefix=CHECK-NEXT < %t %s
 
 // Check that we set alignment 1 on the string.
 //
 // CHECK-NEXT: @.str = {{.*}}constant [13 x i8] c"Hello World!\00", section "__TEXT,__cstring,cstring_literals", align 1
 
-// RUN: %clang_cc1 -fobjc-runtime=gcc -emit-llvm -o %t %s
+// RUN: %clang_cc1 -triple x86_64-macho -fobjc-runtime=gcc -emit-llvm -o %t %s
 // RUN: FileCheck --check-prefix=CHECK-GNU < %t %s
 // CHECK-GNU: NXConstantString
 
-// RUN: %clang_cc1 -fobjc-runtime=gcc -fconstant-string-class NSConstantString -emit-llvm -o %t %s
+// RUN: %clang_cc1 -triple x86_64-macho -fobjc-runtime=gcc -fconstant-string-class NSConstantString -emit-llvm -o %t %s
 // RUN: FileCheck --check-prefix=CHECK-GNU-WITH-CLASS < %t %s
 // CHECK-GNU-WITH-CLASS: NSConstantString
 id a = @"Hello World!";
diff --git a/test/CodeGenObjC/debug-info-block-type.m b/test/CodeGenObjC/debug-info-block-type.m
index 1f137ed9dfc22..565bc86e8b8ac 100644
--- a/test/CodeGenObjC/debug-info-block-type.m
+++ b/test/CodeGenObjC/debug-info-block-type.m
@@ -1,18 +1,17 @@
 // RUN: %clang_cc1 -emit-llvm -fblocks -debug-info-kind=limited  -triple x86_64-apple-darwin14 -x objective-c < %s -o - | FileCheck %s
-#define nil ((void*) 0)
-typedef signed char BOOL;
-// CHECK: ![[BOOL:[0-9]+]] = !DIDerivedType(tag: DW_TAG_typedef, name: "BOOL"
-// CHECK-SAME:                              line: [[@LINE-2]]
-// CHECK: ![[ID:[0-9]+]] = !DIDerivedType(tag: DW_TAG_typedef, name: "id"
-
-typedef BOOL (^SomeKindOfPredicate)(id obj);
 // CHECK: !DIDerivedType(tag: DW_TAG_member, name: "__FuncPtr"
 // CHECK-SAME:           baseType: ![[PTR:[0-9]+]]
 // CHECK: ![[PTR]] = !DIDerivedType(tag: DW_TAG_pointer_type,
 // CHECK-SAME:                      baseType: ![[FNTYPE:[0-9]+]]
 // CHECK: ![[FNTYPE]] = !DISubroutineType(types: ![[ARGS:[0-9]+]])
-// CHECK: ![[ARGS]] = !{![[BOOL]], ![[ID]]}
+// CHECK: ![[ARGS]] = !{![[BOOL:.*]], ![[ID:.*]]}
+#define nil ((void*) 0)
+typedef signed char BOOL;
+// CHECK: ![[BOOL]] = !DIDerivedType(tag: DW_TAG_typedef, name: "BOOL"
+// CHECK-SAME:                              line: [[@LINE-2]]
+// CHECK: ![[ID]] = !DIDerivedType(tag: DW_TAG_typedef, name: "id"
 
+typedef BOOL (^SomeKindOfPredicate)(id obj);
 int main()
 {
   SomeKindOfPredicate p = ^BOOL(id obj) { return obj != nil; };
diff --git a/test/CodeGenObjC/debug-info-nodebug.m b/test/CodeGenObjC/debug-info-nodebug.m
new file mode 100644
index 0000000000000..42d630b4ac1d8
--- /dev/null
+++ b/test/CodeGenObjC/debug-info-nodebug.m
@@ -0,0 +1,26 @@
+// RUN: %clang_cc1 -triple arm-apple-ios -emit-llvm -debug-info-kind=limited -fblocks  %s -o - | FileCheck %s
+// Objective-C code cargo-culted from debug-info-lifetime-crash.m.
+@protocol NSObject
+- (id)copy;
+@end
+@class W;
+@interface View1
+@end
+@implementation Controller {
+    void (^Block)(void);
+}
+- (void)View:(View1 *)View foo:(W *)W
+{
+  // The reference from inside the block implicitly creates another
+  // local variable for the referenced member. That is what gets
+  // suppressed by the attribute.  It still gets debug info as a
+  // member, though.
+  // CHECK-NOT: !DILocalVariable(name: "weakSelf"
+  // CHECK:     !DIDerivedType({{.*}} name: "weakSelf"
+  // CHECK-NOT: !DILocalVariable(name: "weakSelf"
+  __attribute__((nodebug)) __typeof(self) weakSelf = self;
+  Block = [^{
+    __typeof(self) strongSelf = weakSelf;
+    } copy];
+}
+@end
diff --git a/test/CodeGenObjC/debug-property-synth.m b/test/CodeGenObjC/debug-property-synth.m
index 74ee775f751a4..45bf77067c869 100644
--- a/test/CodeGenObjC/debug-property-synth.m
+++ b/test/CodeGenObjC/debug-property-synth.m
@@ -19,8 +19,8 @@
 // CHECK: load {{.*}}, !dbg ![[DBG2:[0-9]+]]
 //
 // CHECK: !DISubprogram(name: "-[I p1]",{{.*}} line: [[@LINE+4]],{{.*}} isLocal: true, isDefinition: true
-// CHECK: !DISubprogram(name: "-[I setP1:]",{{.*}} line: [[@LINE+3]],{{.*}} isLocal: true, isDefinition: true
-// CHECK: ![[DBG1]] = !DILocation(line: [[@LINE+2]],
+// CHECK: ![[DBG1]] = !DILocation(line: [[@LINE+3]],
+// CHECK: !DISubprogram(name: "-[I setP1:]",{{.*}} line: [[@LINE+2]],{{.*}} isLocal: true, isDefinition: true
 // CHECK: ![[DBG2]] = !DILocation(line: [[@LINE+1]],
 @property int p1;
 @end
diff --git a/test/CodeGenObjC/dllstorage.m b/test/CodeGenObjC/dllstorage.m
new file mode 100644
index 0000000000000..4bdbd509151b6
--- /dev/null
+++ b/test/CodeGenObjC/dllstorage.m
@@ -0,0 +1,151 @@
+// RUN: %clang_cc1 -triple i686-windows-itanium -fms-extensions -fobjc-runtime=macosx -fdeclspec -fobjc-exceptions -S -emit-llvm -o - %s | FileCheck -check-prefix CHECK-IR %s
+// RUN: %clang_cc1 -triple i686-windows-itanium -fms-extensions -fobjc-runtime=objfw -fdeclspec -fobjc-exceptions -S -emit-llvm -o - %s | FileCheck -check-prefix CHECK-FW %s
+
+// CHECK-IR-DAG: @_objc_empty_cache = external dllimport global %struct._objc_cache
+
+__declspec(dllimport)
+@interface I
++ (instancetype) new;
+@end
+
+// CHECK-IR-DAG: @"OBJC_METACLASS_$_I" = external dllimport global %struct._class_t
+// CHECK-IR-DAG: @"OBJC_CLASS_$_I" = external dllimport global %struct._class_t
+
+__declspec(dllexport)
+@interface J : I
+@end
+
+// CHECK-IR-DAG: @"OBJC_METACLASS_$_J" = dllexport global %struct._class_t
+// CHECK-IR-DAG: @"OBJC_CLASS_$_J" = dllexport global %struct._class_t
+
+// CHECK-FW-DAG: @_OBJC_METACLASS_J = dllexport global
+// CHECK-FW-DAG: @_OBJC_CLASS_J = dllexport global
+
+@implementation J {
+  id _ivar;
+}
+@end
+
+// CHECK-IR-DAG: @"OBJC_IVAR_$_J._ivar" = global i32
+
+@interface K : J
+@end
+
+// CHECK-IR-DAG: @"OBJC_METACLASS_$_K" = global %struct._class_t
+// CHECK-IR-DAG: @"OBJC_CLASS_$_K" = global %struct._class_t
+
+// CHECK-FW-DAG: @_OBJC_METACLASS_K = global
+// CHECK-FW-DAG: @_OBJC_CLASS_K = global
+
+@implementation K {
+  id _ivar;
+}
+@end
+
+// CHECK-IR-DAG: @"OBJC_IVAR_$_K._ivar" = global i32
+
+__declspec(dllexport)
+@interface L : K
+@end
+
+// CHECK-IR-DAG: @"OBJC_METACLASS_$_L" = dllexport global %struct._class_t
+// CHECK-IR-DAG: @"OBJC_CLASS_$_L" = dllexport global %struct._class_t
+
+// CHECK-FW-DAG: @_OBJC_METACLASS_L = dllexport global
+// CHECK-FW-DAG: @_OBJC_CLASS_L = dllexport global
+
+@implementation L {
+  id _none;
+
+  @public
+  id _public;
+
+  @protected
+  id _protected;
+
+  @package
+  id _package;
+
+  @private
+  id _private;
+}
+@end
+
+// CHECK-IR-DAG: @"OBJC_IVAR_$_L._none" = global i32
+// CHECK-IR-DAG: @"OBJC_IVAR_$_L._public" = dllexport global i32
+// CHECK-IR-DAG: @"OBJC_IVAR_$_L._protected" = dllexport global i32
+// CHECK-IR-DAG: @"OBJC_IVAR_$_L._package" = global i32
+// CHECK-IR-DAG: @"OBJC_IVAR_$_L._private" = global i32
+
+__declspec(dllimport)
+@interface M : I {
+  @public
+  id _ivar;
+}
+@end
+
+// CHEKC-FW-DAG: @_OBJC_CLASS_M = external dllimport global i32
+
+// CHECK-IR-DAG: @"OBJC_IVAR_$_M._ivar" = external dllimport global i32
+
+__declspec(dllexport)
+__attribute__((__objc_exception__))
+@interface N : I
+@end
+
+// CHECK-FW-DAG: @_OBJC_METACLASS_N = dllexport global
+// CHECK-FW-DAG: @_OBJC_CLASS_N = dllexport global
+
+@implementation N : I
+@end
+
+// CHECK-IR-DAG: @"OBJC_EHTYPE_$_N" = dllexport global %struct._objc_typeinfo
+
+__declspec(dllimport)
+__attribute__((__objc_exception__))
+@interface O : I
+@end
+
+// CHECK-IR-DAG: @"OBJC_EHTYPE_$_O" = external dllimport global %struct._objc_typeinfo
+
+__attribute__((__objc_exception__))
+@interface P : I
+@end
+
+// CHECK-IR-DAG: @"OBJC_EHTYPE_$_P" = external global %struct._objc_typeinfo
+
+int g() {
+  @autoreleasepool {
+    M *mi = [M new];
+    @try {
+      mi->_ivar = (void *)0;
+      @throw(@"CFConstantString");
+    } @catch (id) {
+      return 1;
+    } @catch (I *) {
+      return 2;
+    } @catch (J *) {
+      return 3;
+    } @catch (K *) {
+      return 4;
+    } @catch (L *) {
+      return 5;
+    } @catch (M *) {
+      return 6;
+    } @catch (N *) {
+      return 7;
+    } @catch (O *) {
+      return 8;
+    } @catch (P *) {
+      return 9;
+    }
+  }
+  return 0;
+}
+
+// CHECK-IR-DAG: @OBJC_EHTYPE_id = external dllimport global %struct._objc_typeinfo
+// CHECK-IR-DAG: @"OBJC_EHTYPE_$_I" = weak global %struct._objc_typeinfo
+// CHECK-IR-DAG: @"OBJC_EHTYPE_$_K" = weak global %struct._objc_typeinfo
+// CHECK-IR-DAG: @"OBJC_EHTYPE_$_L" = weak global %struct._objc_typeinfo
+// CHECK-IR-DAG: @"OBJC_EHTYPE_$_M" = weak global %struct._objc_typeinfo
+
diff --git a/test/CodeGenObjC/exceptions-asm-attribute.m b/test/CodeGenObjC/exceptions-asm-attribute.m
index c5ef46724b5c5..5719198f7962f 100644
--- a/test/CodeGenObjC/exceptions-asm-attribute.m
+++ b/test/CodeGenObjC/exceptions-asm-attribute.m
@@ -12,7 +12,7 @@
 // CHECK-X86_64: @"OBJC_CLASS_$_MySecretNamespace.A" = global {{.*}}, section "__DATA, __objc_data", align 8
 // CHECK-X86_64: @"OBJC_METACLASS_$_MySecretNamespace.A" = global {{.*}}, section "__DATA, __objc_data", align 8
 // CHECK-X86_64: @OBJC_CLASS_NAME_ = {{.*}}, section "__TEXT,__objc_classname,cstring_literals", align 1
-// CHECK-X86_64: @"OBJC_EHTYPE_$_MySecretNamespace.EH1" = weak global {{.*}}, section "__DATA,__datacoal_nt,coalesced", align 8
+// CHECK-X86_64: @"OBJC_EHTYPE_$_MySecretNamespace.EH1" = weak global {{.*}}, align 8
 // CHECK-X86_64: @"OBJC_EHTYPE_$_MySecretNamespace.EH2" = external global
 // CHECK-X86_64: @"OBJC_EHTYPE_$_MySecretNamespace.EH3" = global {{.*}}, section "__DATA,__objc_const", align 8
 // CHECK-X86_64: @"OBJC_LABEL_CLASS_$" = private global {{.*}}, section "__DATA, __objc_classlist, regular, no_dead_strip", align 8
@@ -24,7 +24,7 @@
 
 // CHECK-X86_64-HIDDEN: @"OBJC_CLASS_$_MySecretNamespace.A" = hidden global {{.*}}, section "__DATA, __objc_data", align 8
 // CHECK-X86_64-HIDDEN: @"OBJC_METACLASS_$_MySecretNamespace.A" = hidden global {{.*}}, section "__DATA, __objc_data", align 8
-// CHECK-X86_64-HIDDEN: @"OBJC_EHTYPE_$_MySecretNamespace.EH1" = weak hidden global {{.*}}, section "__DATA,__datacoal_nt,coalesced"
+// CHECK-X86_64-HIDDEN: @"OBJC_EHTYPE_$_MySecretNamespace.EH1" = weak hidden global
 // CHECK-X86_64-HIDDEN: @"OBJC_EHTYPE_$_MySecretNamespace.EH2" = external global
 // CHECK-X86_64-HIDDEN: @"OBJC_EHTYPE_$_MySecretNamespace.EH3" = hidden global {{.*}}, section "__DATA,__objc_const", align 8
 // CHECK-X86_64-HIDDEN: define internal void @"\01-[A im0]"
@@ -36,7 +36,7 @@
 // CHECK-ARMV6: @"OBJC_CLASS_$_MySecretNamespace.A" = global {{.*}}, section "__DATA, __objc_data", align 4
 // CHECK-ARMV6: @"OBJC_METACLASS_$_MySecretNamespace.A" = global {{.*}}, section "__DATA, __objc_data", align 4
 // CHECK-ARMV6: @OBJC_CLASS_NAME_ = {{.*}}, section "__TEXT,__objc_classname,cstring_literals", align 1
-// CHECK-ARMV6: @"OBJC_EHTYPE_$_MySecretNamespace.EH1" = weak global {{.*}}, section "__DATA,__datacoal_nt,coalesced", align 4
+// CHECK-ARMV6: @"OBJC_EHTYPE_$_MySecretNamespace.EH1" = weak global {{.*}}, align 4
 // CHECK-ARMV6: @"OBJC_EHTYPE_$_MySecretNamespace.EH2" = external global
 // CHECK-ARMV6: @"OBJC_EHTYPE_$_MySecretNamespace.EH3" = global {{.*}}, section "__DATA,__objc_const", align 4
 // CHECK-ARMV6: @"OBJC_LABEL_CLASS_$" = private global {{.*}}, section "__DATA, __objc_classlist, regular, no_dead_strip", align 4
diff --git a/test/CodeGenObjC/hidden-visibility.m b/test/CodeGenObjC/hidden-visibility.m
index 9f5071d5ffaa9..cb23ca18f81fc 100644
--- a/test/CodeGenObjC/hidden-visibility.m
+++ b/test/CodeGenObjC/hidden-visibility.m
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fvisibility hidden -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple i386-apple-macosx -fvisibility hidden -emit-llvm -o - %s | FileCheck %s
 // CHECK: @"OBJC_IVAR_$_I.P" = hidden
 // CHECK: @"OBJC_CLASS_$_I" = hidden
 // CHECK: @"OBJC_METACLASS_$_I" = hidden
diff --git a/test/CodeGenObjC/messages-2.m b/test/CodeGenObjC/messages-2.m
index 4f98fc7287df9..be66f71f66f71 100644
--- a/test/CodeGenObjC/messages-2.m
+++ b/test/CodeGenObjC/messages-2.m
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple x86_64-apple-darwin10 -fobjc-runtime=macosx-fragile-10.5 -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK
+// RUN: %clang_cc1 -triple x86_64-apple-darwin10 -fobjc-runtime=macosx-fragile-10.5 -emit-llvm -o - %s | FileCheck %s
 // RUN: %clang_cc1 -triple x86_64-apple-darwin10 -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK-NF
 
 // Most of this test is apparently just verifying that we don't crash.
diff --git a/test/CodeGenObjC/metadata-class-properties.m b/test/CodeGenObjC/metadata-class-properties.m
new file mode 100644
index 0000000000000..58841bc1230f0
--- /dev/null
+++ b/test/CodeGenObjC/metadata-class-properties.m
@@ -0,0 +1,53 @@
+// RUN: %clang_cc1 -triple x86_64-apple-macosx10.11  -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-apple-macosx10.10  -emit-llvm -o - %s | FileCheck -check-prefix=CHECK-NULL %s
+// RUN: %clang_cc1 -triple x86_64-apple-macosx10.11  -emit-llvm -o - -fobjc-runtime=macosx-fragile-10.5 %s | FileCheck -check-prefix=CHECK-FRAGILE %s
+
+// CHECK: @"\01l_OBJC_$_CLASS_PROP_LIST_Proto" = private global {{.*}} section "__DATA, __objc_const", align 8
+// CHECK: @"\01l_OBJC_PROTOCOL_$_Proto" = {{.*}} global %struct._protocol_t { {{.*}} i32 96, i32 {{.*}} @"\01l_OBJC_$_CLASS_PROP_LIST_Proto" {{.*}} }
+// CHECK: @"\01l_OBJC_$_CLASS_PROP_LIST_Foo_$_Category" = private global {{.*}} section "__DATA, __objc_const", align 8
+// CHECK: @"\01l_OBJC_$_CATEGORY_Foo_$_Category" = private global %struct._category_t { {{.*}} @"\01l_OBJC_$_CLASS_PROP_LIST_Foo_$_Category" {{.*}}, i32 64 }, section "__DATA, __objc_const", align 8
+
+// CHECK: @"\01l_OBJC_$_CLASS_PROP_LIST_C" = private global {{.*}} section "__DATA, __objc_const", align 8
+// CHECK: @"\01l_OBJC_METACLASS_RO_$_C" = private global %struct._class_ro_t { {{.*}} @"\01l_OBJC_$_CLASS_PROP_LIST_C" {{.*}} }, section "__DATA, __objc_const", align 8
+
+// CHECK: !{i32 1, !"Objective-C Class Properties", i32 64}
+
+// CHECK-NULL-NOT: @"\01l_OBJC_$_CLASS_PROP_LIST_Proto"
+// CHECK-NULL: @"\01l_OBJC_PROTOCOL_$_Proto" = {{.*}} global %struct._protocol_t { {{.*}} %struct._prop_list_t* null, i32 96, i32 {{.*}} %struct._prop_list_t* null }
+// CHECK-NULL-NOT: @"\01l_OBJC_$_CLASS_PROP_LIST_Foo_$_Category" = private global {{.*}} section "__DATA, __objc_const", align 8
+// CHECK-NULL: @"\01l_OBJC_$_CATEGORY_Foo_$_Category" = private global %struct._category_t { {{.*}} %struct._prop_list_t* null, %struct._prop_list_t* null, {{.*}} }, section "__DATA, __objc_const", align 8
+
+// CHECK-NULL-NOT: @"\01l_OBJC_$_CLASS_PROP_LIST_C" = private global {{.*}} section "__DATA, __objc_const", align 8
+// CHECK-NULL: @"\01l_OBJC_METACLASS_RO_$_C" = private global %struct._class_ro_t { {{.*}} %struct._prop_list_t* null }, section "__DATA, __objc_const", align 8
+
+// CHECK-NULL: !{i32 1, !"Objective-C Class Properties", i32 64}
+
+// CHECK-FRAGILE: @"OBJC_$_CLASS_PROP_PROTO_LIST_Proto" = private global {{.*}} section "__OBJC,__property,regular,no_dead_strip", align 8
+// CHECK-FRAGILE: @"\01l_OBJC_PROTOCOLEXT_Proto" = private global %struct._objc_protocol_extension { i32 48, {{.*}} @"OBJC_$_CLASS_PROP_PROTO_LIST_Proto" {{.*}} }, align 8
+// CHECK-FRAGILE: @"\01l_OBJC_$_CLASS_PROP_LIST_Foo_Category" = private global {{.*}} section "__OBJC,__property,regular,no_dead_strip", align 8
+// CHECK-FRAGILE: @OBJC_CATEGORY_Foo_Category = private global %struct._objc_category { {{.*}}, i32 64, {{.*}} @"\01l_OBJC_$_CLASS_PROP_LIST_Foo_Category" {{.*}} }, section "__OBJC,__category,regular,no_dead_strip", align 8
+
+// CHECK-FRAGILE: @"\01l_OBJC_$_CLASS_PROP_LIST_C" = private global {{.*}} section "__OBJC,__property,regular,no_dead_strip", align 8
+// CHECK-FRAGILE: @OBJC_CLASSEXT_C = private global %struct._objc_class_extension { {{.*}} @"\01l_OBJC_$_CLASS_PROP_LIST_C" {{.*}} }, section "__OBJC,__class_ext,regular,no_dead_strip", align 8
+
+// CHECK-FRAGILE: !{i32 1, !"Objective-C Class Properties", i32 64}
+
+@interface Foo @end
+
+@protocol Proto
+@property (class, readonly) int proto_property;
+@end
+
+@interface Foo (Category) <Proto> @end
+
+@implementation Foo (Category)
++(int)proto_property { return 0; }
+@end
+
+__attribute__((objc_root_class))
+@interface C
+@property(class, readonly) int p;
+@end
+@implementation C
++(int)p { return 1; }
+@end
diff --git a/test/CodeGenObjC/metadata-symbols-64.m b/test/CodeGenObjC/metadata-symbols-64.m
index 5b9591ff973f1..9b7423746388c 100644
--- a/test/CodeGenObjC/metadata-symbols-64.m
+++ b/test/CodeGenObjC/metadata-symbols-64.m
@@ -11,7 +11,7 @@
 // CHECK: @"\01l_OBJC_$_CLASS_METHODS_A" = private global {{.*}} section "__DATA, __objc_const", align 8
 // CHECK: @"\01l_OBJC_$_PROTOCOL_INSTANCE_METHODS_P" = private global {{.*}} section "__DATA, __objc_const", align 8
 // CHECK: @"\01l_OBJC_$_PROTOCOL_CLASS_METHODS_P" = private global {{.*}} section "__DATA, __objc_const", align 8
-// CHECK: @"\01l_OBJC_PROTOCOL_$_P" = weak hidden global {{.*}} section "__DATA,__datacoal_nt,coalesced", align 8
+// CHECK: @"\01l_OBJC_PROTOCOL_$_P" = weak hidden global {{.*}}, align 8
 // CHECK: @"\01l_OBJC_LABEL_PROTOCOL_$_P" = weak hidden global {{.*}} section "__DATA, __objc_protolist, coalesced, no_dead_strip", align 8
 // CHECK: @"\01l_OBJC_CLASS_PROTOCOLS_$_A" = private global {{.*}} section "__DATA, __objc_const", align 8
 // CHECK: @"\01l_OBJC_METACLASS_RO_$_A" = private global {{.*}} section "__DATA, __objc_const", align 8
diff --git a/test/CodeGenObjC/metadata_symbols.m b/test/CodeGenObjC/metadata_symbols.m
index 2c44fb59bd2b6..4c365a14f9826 100644
--- a/test/CodeGenObjC/metadata_symbols.m
+++ b/test/CodeGenObjC/metadata_symbols.m
@@ -11,7 +11,7 @@
 // CHECK-X86_64: @"OBJC_CLASS_$_A" = global {{.*}}, section "__DATA, __objc_data", align 8
 // CHECK-X86_64: @"OBJC_METACLASS_$_A" = global {{.*}}, section "__DATA, __objc_data", align 8
 // CHECK-X86_64: @OBJC_CLASS_NAME_ = {{.*}}, section "__TEXT,__objc_classname,cstring_literals", align 1
-// CHECK-X86_64: @"OBJC_EHTYPE_$_EH1" = weak global {{.*}}, section "__DATA,__datacoal_nt,coalesced", align 8
+// CHECK-X86_64: @"OBJC_EHTYPE_$_EH1" = weak global {{.*}}, align 8
 // CHECK-X86_64: @"OBJC_EHTYPE_$_EH2" = external global
 // CHECK-X86_64: @"OBJC_EHTYPE_$_EH3" = global {{.*}}, section "__DATA,__objc_const", align 8
 // CHECK-X86_64: @"OBJC_LABEL_CLASS_$" = private global {{.*}}, section "__DATA, __objc_classlist, regular, no_dead_strip", align 8
@@ -23,7 +23,7 @@
 
 // CHECK-X86_64-HIDDEN: @"OBJC_CLASS_$_A" = hidden global {{.*}}, section "__DATA, __objc_data", align 8
 // CHECK-X86_64-HIDDEN: @"OBJC_METACLASS_$_A" = hidden global {{.*}}, section "__DATA, __objc_data", align 8
-// CHECK-X86_64-HIDDEN: @"OBJC_EHTYPE_$_EH1" = weak hidden global {{.*}}, section "__DATA,__datacoal_nt,coalesced"
+// CHECK-X86_64-HIDDEN: @"OBJC_EHTYPE_$_EH1" = weak hidden global
 // CHECK-X86_64-HIDDEN: @"OBJC_EHTYPE_$_EH2" = external global
 // CHECK-X86_64-HIDDEN: @"OBJC_EHTYPE_$_EH3" = hidden global {{.*}}, section "__DATA,__objc_const", align 8
 // CHECK-X86_64-HIDDEN: define internal void @"\01-[A im0]"
@@ -35,7 +35,7 @@
 // CHECK-ARMV6: @"OBJC_CLASS_$_A" = global {{.*}}, section "__DATA, __objc_data", align 4
 // CHECK-ARMV6: @"OBJC_METACLASS_$_A" = global {{.*}}, section "__DATA, __objc_data", align 4
 // CHECK-ARMV6: @OBJC_CLASS_NAME_ = {{.*}}, section "__TEXT,__objc_classname,cstring_literals", align 1
-// CHECK-ARMV6: @"OBJC_EHTYPE_$_EH1" = weak global {{.*}}, section "__DATA,__datacoal_nt,coalesced", align 4
+// CHECK-ARMV6: @"OBJC_EHTYPE_$_EH1" = weak global {{.*}}, align 4
 // CHECK-ARMV6: @"OBJC_EHTYPE_$_EH2" = external global
 // CHECK-ARMV6: @"OBJC_EHTYPE_$_EH3" = global {{.*}}, section "__DATA,__objc_const", align 4
 // CHECK-ARMV6: @"OBJC_LABEL_CLASS_$" = private global {{.*}}, section "__DATA, __objc_classlist, regular, no_dead_strip", align 4
diff --git a/test/CodeGenObjC/objc2-protocol-metadata.m b/test/CodeGenObjC/objc2-protocol-metadata.m
index 191016be85176..8e186fcee43d8 100644
--- a/test/CodeGenObjC/objc2-protocol-metadata.m
+++ b/test/CodeGenObjC/objc2-protocol-metadata.m
@@ -14,4 +14,4 @@
 + ClsP  { return 0; }
 @end
 
-// CHECK: %struct._protocol_t = type { i8*, i8*, %struct._objc_protocol_list*, %struct.__method_list_t*, %struct.__method_list_t*, %struct.__method_list_t*, %struct.__method_list_t*, %struct._prop_list_t*, i32, i32, i8**, i8* }
+// CHECK: %struct._protocol_t = type { i8*, i8*, %struct._objc_protocol_list*, %struct.__method_list_t*, %struct.__method_list_t*, %struct.__method_list_t*, %struct.__method_list_t*, %struct._prop_list_t*, i32, i32, i8**, i8*, %struct._prop_list_t* }
diff --git a/test/CodeGenObjC/property-atomic-bool.m b/test/CodeGenObjC/property-atomic-bool.m
new file mode 100644
index 0000000000000..77da129f6c04c
--- /dev/null
+++ b/test/CodeGenObjC/property-atomic-bool.m
@@ -0,0 +1,34 @@
+// RUN: %clang_cc1 -triple x86_64-apple-macosx10 -emit-llvm -x objective-c %s -o - | FileCheck %s
+
+// CHECK: define internal zeroext i1 @"\01-[A0 p]"(
+// CHECK:   %[[ATOMIC_LOAD:.*]] = load atomic i8, i8* %{{.*}} seq_cst
+// CHECK:   %[[TOBOOL:.*]] = trunc i8 %[[ATOMIC_LOAD]] to i1
+// CHECK:   ret i1 %[[TOBOOL]]
+
+// CHECK: define internal void @"\01-[A0 setP:]"({{.*}} i1 zeroext {{.*}})
+// CHECK:   store atomic i8 %{{.*}}, i8* %{{.*}} seq_cst
+// CHECK:   ret void
+
+// CHECK: define internal zeroext i1 @"\01-[A1 p]"(
+// CHECK:   %[[ATOMIC_LOAD:.*]] = load atomic i8, i8* %{{.*}} unordered
+// CHECK:   %[[TOBOOL:.*]] = trunc i8 %load to i1
+// CHECK:   ret i1 %[[TOBOOL]]
+
+// CHECK: define internal void @"\01-[A1 setP:]"({{.*}} i1 zeroext %p)
+// CHECK:   store atomic i8 %{{.*}}, i8* %{{.*}} unordered
+// CHECK:   ret void
+
+@interface A0
+@property(nonatomic) _Atomic(_Bool) p;
+@end
+@implementation A0
+@end
+
+@interface A1 {
+  _Atomic(_Bool) p;
+}
+@property _Atomic(_Bool) p;
+@end
+@implementation A1
+@synthesize p;
+@end
diff --git a/test/CodeGenObjC/tentative-cfconstantstring.m b/test/CodeGenObjC/tentative-cfconstantstring.m
index 5b3c3bd924942..9ff1a0a2fce2e 100644
--- a/test/CodeGenObjC/tentative-cfconstantstring.m
+++ b/test/CodeGenObjC/tentative-cfconstantstring.m
@@ -32,12 +32,11 @@ static inline void _inlineFunction() {
 @end
 
 // CHECK: @__CFConstantStringClassReference = common global [24 x i32] zeroinitializer, align 16
-// CHECK: @_unnamed_cfstring_{{.*}} = private constant %struct.NSConstantString { i32* getelementptr inbounds ([24 x i32], [24 x i32]* @__CFConstantStringClassReference, i32 0, i32 0)
+// CHECK: @_unnamed_cfstring_{{.*}} = private constant %struct.__NSConstantString_tag { i32* getelementptr inbounds ([24 x i32], [24 x i32]* @__CFConstantStringClassReference, i32 0, i32 0)
 
 // CHECK-LABEL: define internal void @_inlineFunction()
 // CHECK:  [[ZERO:%.*]] = load %struct._class_t*, %struct._class_t** @"OBJC_CLASSLIST_REFERENCES_
 // CHECK-NEXT:   [[ONE:%.*]] = load i8*, i8** @OBJC_SELECTOR_REFERENCES_
 // CHECK-NEXT:   [[TWO:%.*]] = bitcast %struct._class_t* [[ZERO]] to i8*
-// CHECK-NEXT:   call void (i8*, i8*, [[T:%.*]]*, ...) bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*, i8*, [[T:%.*]]*, ...)*)(i8* [[TWO]], i8* [[ONE]], [[T:%.*]]* bitcast (%struct.NSConstantString* @_unnamed_cfstring_{{.*}} to [[T:%.*]]*))
+// CHECK-NEXT:   call void (i8*, i8*, [[T:%.*]]*, ...) bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (i8*, i8*, [[T:%.*]]*, ...)*)(i8* [[TWO]], i8* [[ONE]], [[T:%.*]]* bitcast (%struct.__NSConstantString_tag* @_unnamed_cfstring_{{.*}} to [[T:%.*]]*))
 // CHECK-NEXT:   ret void
-
diff --git a/test/CodeGenObjCXX/arc-cxx11-init-list.mm b/test/CodeGenObjCXX/arc-cxx11-init-list.mm
index 230d0f197bfb2..27442f8130448 100644
--- a/test/CodeGenObjCXX/arc-cxx11-init-list.mm
+++ b/test/CodeGenObjCXX/arc-cxx11-init-list.mm
@@ -1,5 +1,11 @@
 // RUN: %clang_cc1 -triple armv7-ios5.0 -std=c++11 -fobjc-arc -Os -emit-llvm -o - %s | FileCheck %s
 
+// CHECK: @[[STR0:.*]] = private unnamed_addr constant [5 x i8] c"str0\00", section "__TEXT,__cstring,cstring_literals"
+// CHECK: @[[UNNAMED_CFSTRING0:.*]] = private constant %struct.__NSConstantString_tag { i32* getelementptr inbounds ([0 x i32], [0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 1992, i8* getelementptr inbounds ([5 x i8], [5 x i8]* @[[STR0]], i32 0, i32 0), i32 4 }, section "__DATA,__cfstring"
+// CHECK: @[[STR1:.*]] = private unnamed_addr constant [5 x i8] c"str1\00", section "__TEXT,__cstring,cstring_literals"
+// CHECK: @[[UNNAMED_CFSTRING1:.*]] = private constant %struct.__NSConstantString_tag { i32* getelementptr inbounds ([0 x i32], [0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 1992, i8* getelementptr inbounds ([5 x i8], [5 x i8]* @[[STR1]], i32 0, i32 0), i32 4 }, section "__DATA,__cfstring"
+// CHECK: @[[REFTMP:.*]] = private constant [2 x i8*] [i8* bitcast (%struct.__NSConstantString_tag* @[[UNNAMED_CFSTRING0]] to i8*), i8* bitcast (%struct.__NSConstantString_tag* @[[UNNAMED_CFSTRING1]] to i8*)]
+
 typedef __SIZE_TYPE__ size_t;
 
 namespace std {
@@ -31,7 +37,17 @@ extern "C" void multiple() { function({ [I new], [I new] }); }
 // CHECK-NEXT: [[CAST:%.*]] = bitcast [{{[0-9]+}} x %0*]* %{{.*}} to i8**
 // CHECK-NEXT: store i8* [[INSTANCE]], i8** [[CAST]],
 // CHECK: call void @objc_release(i8* {{.*}})
-// CHECK-NEXT: icmp eq
+
+std::initializer_list<id> foo1() {
+  return {@"str0", @"str1"};
+}
+
+// CHECK: define void @_Z4foo1v(%"class.std::initializer_list.0"* {{.*}} %[[AGG_RESULT:.*]])
+// CHECK: %[[BEGIN:.*]] = getelementptr inbounds %"class.std::initializer_list.0", %"class.std::initializer_list.0"* %[[AGG_RESULT]], i32 0, i32 0
+// CHECK: store i8** getelementptr inbounds ([2 x i8*], [2 x i8*]* @[[REFTMP]], i32 0, i32 0), i8*** %[[BEGIN]]
+// CHECK: %[[SIZE:.*]] = getelementptr inbounds %"class.std::initializer_list.0", %"class.std::initializer_list.0"* %[[AGG_RESULT]], i32 0, i32 1
+// CHECK: store i32 2, i32* %[[SIZE]]
+// CHECK: ret void
 
 void external();
 
@@ -50,4 +66,3 @@ std::initializer_list<I *> il = { [I new] };
 // CHECK: [[INSTANCE:%.*]] = {{.*}} call i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* {{.*}}, i8* {{.*}})
 // CHECK-NEXT: store i8* [[INSTANCE]], i8** bitcast ([1 x %0*]* @_ZGR2il_ to i8**)
 // CHECK: {{.*}} call void @objc_autoreleasePoolPop(i8* [[POOL]])
-
diff --git a/test/CodeGenObjCXX/arc-mangle.mm b/test/CodeGenObjCXX/arc-mangle.mm
index a168d41b336c8..82e37556ec309 100644
--- a/test/CodeGenObjCXX/arc-mangle.mm
+++ b/test/CodeGenObjCXX/arc-mangle.mm
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fobjc-arc -fobjc-runtime-has-weak -triple %itanium_abi_triple -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fobjc-arc -fobjc-runtime-has-weak -triple %itanium_abi_triple -emit-llvm -fblocks -o - %s | FileCheck %s
 
 // CHECK-LABEL: define {{.*}}void @_Z1fPU8__strongP11objc_object(i8**)
 void f(__strong id *) {}
@@ -8,15 +8,24 @@ void f(__weak id *) {}
 void f(__autoreleasing id *) {}
 // CHECK-LABEL: define {{.*}}void @_Z1fPP11objc_object(i8**)
 void f(__unsafe_unretained id *) {}
-// CHECK-LABEL: define {{.*}}void @_Z1fPKU8__strongP11objc_object(i8**)
+// CHECK-LABEL: define {{.*}}void @_Z1fPU8__strongKP11objc_object(i8**)
 void f(const __strong id *) {}
-// CHECK-LABEL: define {{.*}}void @_Z1fPKU6__weakP11objc_object(i8**)
+// CHECK-LABEL: define {{.*}}void @_Z1fPU6__weakKP11objc_object(i8**)
 void f(const __weak id *) {}
-// CHECK-LABEL: define {{.*}}void @_Z1fPKU15__autoreleasingP11objc_object(i8**)
+// CHECK-LABEL: define {{.*}}void @_Z1fPU15__autoreleasingKP11objc_object(i8**)
 void f(const __autoreleasing id *) {}
 // CHECK-LABEL: define {{.*}}void @_Z1fPKP11objc_object(i8**)
 void f(const __unsafe_unretained id *) {}
-
+// CHECK-LABEL: define {{.*}}void @_Z1fPFU19ns_returns_retainedP11objc_objectvE
+void f(__attribute__((ns_returns_retained)) id (*fn)()) {}
+// CHECK-LABEL: define {{.*}}void @_Z1fP11objc_object
+void f(__attribute__((ns_consumed)) id) {}
+// CHECK-LABEL: define {{.*}}void @_Z1fPFP11objc_objectU11ns_consumedS0_S0_E
+void f(id (*fn)(__attribute__((ns_consumed)) id, id)) {}
+// CHECK-LABEL: define {{.*}}void @_Z1fPFP11objc_objectS0_U11ns_consumedS0_E
+void f(__strong id (*fn)(id, __attribute__((ns_consumed)) id)) {}
+// CHECK-LABEL: define {{.*}}void @_Z1fU13block_pointerFvU11ns_consumedP11objc_objectE
+void f(void (^)(__attribute__((ns_consumed)) id)) {}
 
 template<unsigned N> struct unsigned_c { };
 
diff --git a/test/CodeGenObjCXX/arc-move.mm b/test/CodeGenObjCXX/arc-move.mm
index 76fb15b290d59..d1710e291b090 100644
--- a/test/CodeGenObjCXX/arc-move.mm
+++ b/test/CodeGenObjCXX/arc-move.mm
@@ -72,10 +72,10 @@ void library_move(__strong id &y) {
   // CHECK-NEXT: ret void
 }
 
-// CHECK-LABEL: define void @_Z10const_moveRKU8__strongP11objc_object(
+// CHECK-LABEL: define void @_Z10const_moveRU8__strongKP11objc_object(
 void const_move(const __strong id &x) {
   // CHECK:      [[Y:%.*]] = alloca i8*,
-  // CHECK:      [[X:%.*]] = call dereferenceable({{[0-9]+}}) i8** @_Z4moveIRKU8__strongP11objc_objectEON16remove_referenceIT_E4typeEOS5_(
+  // CHECK:      [[X:%.*]] = call dereferenceable({{[0-9]+}}) i8** @_Z4moveIRU8__strongKP11objc_objectEON16remove_referenceIT_E4typeEOS5_(
   // CHECK-NEXT: [[T0:%.*]] = load i8*, i8** [[X]]
   // CHECK-NEXT: [[T1:%.*]] = call i8* @objc_retain(i8* [[T0]])
   // CHECK-NEXT: store i8* [[T1]], i8** [[Y]]
diff --git a/test/CodeGenObjCXX/arc-new-delete.mm b/test/CodeGenObjCXX/arc-new-delete.mm
index f853ea4366a08..141f401957824 100644
--- a/test/CodeGenObjCXX/arc-new-delete.mm
+++ b/test/CodeGenObjCXX/arc-new-delete.mm
@@ -12,32 +12,32 @@ void test_new(id invalue) {
   // OPT-NEXT: [[T0:%.*]] = call i8* @objc_retain(i8* [[INVALUE:%.*]])
   // OPT-NEXT: store i8* [[T0]], i8** [[INVALUEADDR]]
 
-  // CHECK: call noalias i8* @_Znwm
+  // CHECK: call i8* @_Znwm
   // CHECK-NEXT: {{bitcast i8\*.*to i8\*\*}}
   // CHECK-NEXT: store i8* null, i8**
   new strong_id;
-  // CHECK: call noalias i8* @_Znwm
+  // CHECK: call i8* @_Znwm
   // CHECK-NEXT: {{bitcast i8\*.*to i8\*\*}}
   // UNOPT-NEXT: store i8* null, i8**
   // OPT-NEXT: call i8* @objc_initWeak(i8** {{.*}}, i8* null)
   new weak_id;
 
-  // CHECK: call noalias i8* @_Znwm
+  // CHECK: call i8* @_Znwm
   // CHECK-NEXT: {{bitcast i8\*.*to i8\*\*}}
   // CHECK-NEXT: store i8* null, i8**
   new __strong id;
-  // CHECK: call noalias i8* @_Znwm
+  // CHECK: call i8* @_Znwm
   // CHECK-NEXT: {{bitcast i8\*.*to i8\*\*}}
   // UNOPT-NEXT: store i8* null, i8**
   // OPT-NEXT: call i8* @objc_initWeak(i8** {{.*}}, i8* null)
   new __weak id;
 
-  // CHECK: call noalias i8* @_Znwm
+  // CHECK: call i8* @_Znwm
   // CHECK: call i8* @objc_retain
   // CHECK: store i8*
   new __strong id(invalue);
 
-  // CHECK: call noalias i8* @_Znwm
+  // CHECK: call i8* @_Znwm
   // CHECK: call i8* @objc_initWeak
   new __weak id(invalue);
 
@@ -48,12 +48,12 @@ void test_new(id invalue) {
 
 // CHECK-LABEL: define void @_Z14test_array_new
 void test_array_new() {
-  // CHECK: call noalias i8* @_Znam
+  // CHECK: call i8* @_Znam
   // CHECK: store i64 17, i64*
   // CHECK: call void @llvm.memset.p0i8.i64
   new strong_id[17];
 
-  // CHECK: call noalias i8* @_Znam
+  // CHECK: call i8* @_Znam
   // CHECK: store i64 17, i64*
   // CHECK: call void @llvm.memset.p0i8.i64
   new weak_id[17];
diff --git a/test/CodeGenObjCXX/auto-release-result-assert.mm b/test/CodeGenObjCXX/auto-release-result-assert.mm
new file mode 100644
index 0000000000000..044dc9d7d99c2
--- /dev/null
+++ b/test/CodeGenObjCXX/auto-release-result-assert.mm
@@ -0,0 +1,35 @@
+// RUN: %clang_cc1 -triple x86_64-apple-darwin10 -emit-llvm -fblocks -fobjc-arc -o - %s | FileCheck %s
+
+// CHECK-LABEL: define %struct.S1* @_Z4foo1i(
+// CHECK: %[[CALL:[a-z0-9]+]] = call %struct.S1* @_Z4foo0i
+// CHECK: ret %struct.S1* %[[CALL]]
+
+// CHECK-LABEL: define %struct.S1* @_ZN2S22m1Ev(
+// CHECK: %[[CALL:[a-z0-9]+]] = call %struct.S1* @_Z4foo0i
+// CHECK: ret %struct.S1* %[[CALL]]
+
+// CHECK-LABEL: define internal %struct.S1* @Block1_block_invoke(
+// CHECK: %[[CALL:[a-z0-9]+]] = call %struct.S1* @_Z4foo0i
+// CHECK: ret %struct.S1* %[[CALL]]
+
+struct S1;
+
+typedef __attribute__((NSObject)) struct __attribute__((objc_bridge(id))) S1 * S1Ref;
+
+S1Ref foo0(int);
+
+struct S2 {
+  S1Ref m1();
+};
+
+S1Ref foo1(int a) {
+  return foo0(a);
+}
+
+S1Ref S2::m1() {
+  return foo0(0);
+}
+
+S1Ref (^Block1)(void) = ^{
+  return foo0(0);
+};
diff --git a/test/CodeGenObjCXX/block-default-arg.mm b/test/CodeGenObjCXX/block-default-arg.mm
new file mode 100644
index 0000000000000..167b31d405494
--- /dev/null
+++ b/test/CodeGenObjCXX/block-default-arg.mm
@@ -0,0 +1,16 @@
+// RUN: %clang_cc1 -triple x86_64-apple-darwin10.0.0 -emit-llvm -o - %s -std=c++11 -fblocks -fobjc-arc | FileCheck  %s
+
+// CHECK: define internal void @___Z16test_default_argi_block_invoke(i8* %[[BLOCK_DESCRIPTOR:.*]])
+// CHECK: %[[BLOCK:.*]] = bitcast i8* %[[BLOCK_DESCRIPTOR]] to <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i32 }>*
+// CHECK: %[[BLOCK_CAPTURE_ADDR:.*]] = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i32 }>, <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i32 }>* %[[BLOCK]], i32 0, i32 5
+// CHECK: %[[V0:.*]] = load i32, i32* %[[BLOCK_CAPTURE_ADDR]]
+// CHECK: call void @_Z4foo1i(i32 %[[V0]])
+
+void foo1(int);
+
+void test_default_arg(const int a = 42) {
+  auto block = ^{
+    foo1(a);
+  };
+  block();
+}
diff --git a/test/CodeGenObjCXX/block-nested-in-lambda.cpp b/test/CodeGenObjCXX/block-nested-in-lambda.cpp
new file mode 100644
index 0000000000000..51b7abf034e5b
--- /dev/null
+++ b/test/CodeGenObjCXX/block-nested-in-lambda.cpp
@@ -0,0 +1,23 @@
+// RUN: %clang_cc1 -triple=x86_64-apple-darwin10 -emit-llvm -std=c++11 -fblocks -o - %s | FileCheck %s
+
+// CHECK: %[[BLOCK_CAPTURED0:.*]] = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i32*, i32* }>, <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i32*, i32* }>* %[[BLOCK:.*]], i32 0, i32 5
+// CHECK: %[[V0:.*]] = getelementptr inbounds %[[LAMBDA_CLASS:.*]], %[[LAMBDA_CLASS]]* %[[THIS:.*]], i32 0, i32 0
+// CHECK: %[[V1:.*]] = load i32*, i32** %[[V0]], align 8
+// CHECK: store i32* %[[V1]], i32** %[[BLOCK_CAPTURED0]], align 8
+// CHECK: %[[BLOCK_CAPTURED1:.*]] = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i32*, i32* }>, <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i32*, i32* }>* %[[BLOCK]], i32 0, i32 6
+// CHECK: %[[V2:.*]] = getelementptr inbounds %[[LAMBDA_CLASS]], %[[LAMBDA_CLASS]]* %[[THIS]], i32 0, i32 1
+// CHECK: %[[V3:.*]] = load i32*, i32** %[[V2]], align 8
+// CHECK: store i32* %[[V3]], i32** %[[BLOCK_CAPTURED1]], align 8
+
+void foo1(int &, int &);
+
+void block_in_lambda(int &s1, int &s2) {
+  auto lambda = [&s1, &s2]() {
+    auto block = ^{
+      foo1(s1, s2);
+    };
+    block();
+  };
+
+  lambda();
+}
diff --git a/test/CodeGenObjCXX/copy.mm b/test/CodeGenObjCXX/copy.mm
index 9e41bf026a767..7783137dc0912 100644
--- a/test/CodeGenObjCXX/copy.mm
+++ b/test/CodeGenObjCXX/copy.mm
@@ -11,7 +11,7 @@ namespace test0 {
   // CHECK:      alloca
   // CHECK-NEXT: getelementptr
   // CHECK-NEXT: store
-  // CHECK-NEXT: call noalias i8* @_Znwm(
+  // CHECK-NEXT: call i8* @_Znwm(
   // CHECK-NEXT: bitcast
   // CHECK-NEXT: bitcast
   // CHECK-NEXT: bitcast
diff --git a/test/CodeGenObjCXX/debug-info-block-capture-this.mm b/test/CodeGenObjCXX/debug-info-block-capture-this.mm
new file mode 100644
index 0000000000000..9464222b543b5
--- /dev/null
+++ b/test/CodeGenObjCXX/debug-info-block-capture-this.mm
@@ -0,0 +1,20 @@
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -std=c++14 -fblocks -debug-info-kind=standalone -emit-llvm %s -o - | FileCheck %s
+struct test
+{
+    int func() { return 1; }
+    int (^block)() = ^{ return func(); };
+};
+
+int main(int argc, const char * argv[]) {
+    test t;
+    return t.block();
+}
+
+// CHECK: ![[TESTCT:[0-9]+]] = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "test"
+// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "__block_literal_1",
+// CHECK-SAME:             elements: ![[ELEMS:.*]])
+// CHECK: ![[ELEMS]] = !{{{.*}}, ![[THIS:[0-9]+]]}
+// CHECK: ![[THIS]] = !DIDerivedType(tag: DW_TAG_member, name: "this",
+// CHECK-SAME:                       baseType: ![[TESTCT]],
+
+
diff --git a/test/CodeGenObjCXX/debug-info-cyclic.mm b/test/CodeGenObjCXX/debug-info-cyclic.mm
index 582ca445f9938..13dd5ab6e1007 100644
--- a/test/CodeGenObjCXX/debug-info-cyclic.mm
+++ b/test/CodeGenObjCXX/debug-info-cyclic.mm
@@ -1,13 +1,13 @@
 // RUN: %clang_cc1 -triple x86_64-apple-darwin -debug-info-kind=standalone -emit-llvm %s -o - | FileCheck %s
 
 struct B {
-// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "B"
+// CHECK: ![[B:[0-9]+]] ={{.*}}!DICompositeType(tag: DW_TAG_structure_type, name: "B"
 // CHECK-SAME:                             line: [[@LINE-2]],
 // CHECK-SAME:                             size: 8, align: 8,
 // CHECK-NOT:                              offset:
 // CHECK-NOT:                              DIFlagFwdDecl
 // CHECK-SAME:                             elements: ![[BMEMBERS:[0-9]+]]
-// CHECK-SAME:                             identifier: [[B:.*]])
+// CHECK-SAME:                             identifier:
 // CHECK: ![[BMEMBERS]] = !{![[BB:[0-9]+]]}
   B(struct A *);
 // CHECK: ![[BB]] = !DISubprogram(name: "B", scope: ![[B]]
diff --git a/test/CodeGenObjCXX/mangle.mm b/test/CodeGenObjCXX/mangle.mm
index bcb920ba1e918..2854cffc946bc 100644
--- a/test/CodeGenObjCXX/mangle.mm
+++ b/test/CodeGenObjCXX/mangle.mm
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin10 -std=c++11 -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 %s -triple=x86_64-apple-darwin10 -std=c++11 -emit-llvm -fblocks -o - | FileCheck %s
 
 // CHECK: @"_ZZ11+[A shared]E1a" = internal global
 // CHECK: @"_ZZ11-[A(Foo) f]E1a" = internal global
@@ -113,3 +113,10 @@ void parameterized_test2(__kindof Parameterized<A *, Test *> *p) {}
 
 // CHECK-LABEL: define void @_Z19parameterized_test3P13Parameterized
 void parameterized_test3(Parameterized *p) {}
+
+// CHECK-LABEL: define {{.*}}void @_Z1fP11objc_object
+void f(__attribute__((ns_consumed)) id) {}
+// CHECK-LABEL: define {{.*}}void @_Z1fPFP11objc_objectS0_S0_E
+void f(id (*fn)(__attribute__((ns_consumed)) id, id)) {}
+// CHECK-LABEL: define {{.*}}void @_Z1fU13block_pointerFvP11objc_objectE
+void f(void (^)(__attribute__((ns_consumed)) id)) {}
diff --git a/test/CodeGenObjCXX/personality-abuse.mm b/test/CodeGenObjCXX/personality-abuse.mm
index f5170bf75c090..2a3620d5994f9 100644
--- a/test/CodeGenObjCXX/personality-abuse.mm
+++ b/test/CodeGenObjCXX/personality-abuse.mm
@@ -16,4 +16,4 @@ void foo() {
   }
 }
 
-// CHECK: define void @_Z3foov() #1 personality i8* bitcast (i32 ()* @__objc_personality_v0 to i8*)
+// CHECK: define void @_Z3foov() {{#[0-9]+}} personality i8* bitcast (i32 ()* @__objc_personality_v0 to i8*)
diff --git a/test/CodeGenOpenCL/address-spaces-conversions.cl b/test/CodeGenOpenCL/address-spaces-conversions.cl
index bc80f47f5f42c..c947db41e070e 100644
--- a/test/CodeGenOpenCL/address-spaces-conversions.cl
+++ b/test/CodeGenOpenCL/address-spaces-conversions.cl
@@ -1,22 +1,89 @@
 // RUN: %clang_cc1 %s -triple x86_64-unknown-linux-gnu -O0 -ffake-address-space-map -cl-std=CL2.0 -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 %s -triple x86_64-unknown-linux-gnu -O0 -cl-std=CL2.0 -emit-llvm -o - | FileCheck --check-prefix=CHECK-NOFAKE %s
+// When -ffake-address-space-map is not used, all addr space mapped to 0 for x86_64.
 
 // test that we generate address space casts everywhere we need conversions of
 // pointers to different address spaces
 
+// CHECK: define void @test
 void test(global int *arg_glob, generic int *arg_gen) {
   int var_priv;
   arg_gen = arg_glob; // implicit cast global -> generic
   // CHECK: %{{[0-9]+}} = addrspacecast i32 addrspace(1)* %{{[0-9]+}} to i32 addrspace(4)*
+  // CHECK-NOFAKE-NOT: addrspacecast
+
   arg_gen = &var_priv; // implicit cast with obtaining adr, private -> generic
   // CHECK: %{{[0-9]+}} = addrspacecast i32* %var_priv to i32 addrspace(4)*
+  // CHECK-NOFAKE-NOT: addrspacecast
+
   arg_glob = (global int *)arg_gen; // explicit cast
   // CHECK: %{{[0-9]+}} = addrspacecast i32 addrspace(4)* %{{[0-9]+}} to i32 addrspace(1)*
+  // CHECK-NOFAKE-NOT: addrspacecast
+
   global int *var_glob =
       (global int *)arg_glob; // explicit cast in the same address space
   // CHECK-NOT: %{{[0-9]+}} = addrspacecast i32 addrspace(1)* %{{[0-9]+}} to i32 addrspace(1)*
+  // CHECK-NOFAKE-NOT: addrspacecast
+
   var_priv = arg_gen - arg_glob; // arithmetic operation
   // CHECK: %{{.*}} = ptrtoint i32 addrspace(4)* %{{.*}} to i64
   // CHECK: %{{.*}} = ptrtoint i32 addrspace(1)* %{{.*}} to i64
+  // CHECK-NOFAKE: %{{.*}} = ptrtoint i32* %{{.*}} to i64
+  // CHECK-NOFAKE: %{{.*}} = ptrtoint i32* %{{.*}} to i64
+
   var_priv = arg_gen > arg_glob; // comparison
   // CHECK: %{{[0-9]+}} = addrspacecast i32 addrspace(1)* %{{[0-9]+}} to i32 addrspace(4)*
+
+  generic void *var_gen_v = arg_glob;
+  // CHECK: addrspacecast
+  // CHECK-NOT: bitcast
+  // CHECK-NOFAKE: bitcast
+  // CHECK-NOFAKE-NOT: addrspacecast
+}
+
+// Test ternary operator.
+// CHECK: define void @test_ternary
+void test_ternary(void) {
+  global int *var_glob;
+  generic int *var_gen;
+  generic int *var_gen2;
+  generic float *var_gen_f;
+  generic void *var_gen_v;
+
+  var_gen = var_gen ? var_gen : var_gen2; // operands of the same addr spaces and the same type
+  // CHECK: icmp
+  // CHECK-NOT: addrspacecast
+  // CHECK-NOT: bitcast
+  // CHECK: phi
+  // CHECK: store i32 addrspace(4)* %{{.+}}, i32 addrspace(4)** %{{.+}}
+
+  var_gen = var_gen ? var_gen : var_glob; // operands of overlapping addr spaces and the same type
+  // CHECK: icmp
+  // CHECK-NOT: bitcast
+  // CHECK: %{{.+}} = addrspacecast i32 addrspace(1)* %{{.+}} to i32 addrspace(4)*
+  // CHECK: phi
+  // CHECK: store
+
+  typedef int int_t;
+  global int_t *var_glob_typedef;
+  var_gen = var_gen ? var_gen : var_glob_typedef; // operands of overlapping addr spaces and equivalent types
+  // CHECK: icmp
+  // CHECK-NOT: bitcast
+  // CHECK: %{{.+}} = addrspacecast i32 addrspace(1)* %{{.+}} to i32 addrspace(4)*
+  // CHECK: phi
+  // CHECK: store
+ 
+  var_gen_v = var_gen ? var_gen : var_gen_f; // operands of the same addr space and different types
+  // CHECK: icmp
+  // CHECK: %{{.+}} = bitcast i32 addrspace(4)* %{{.+}} to i8 addrspace(4)*
+  // CHECK: %{{.+}} = bitcast float addrspace(4)* %{{.+}} to i8 addrspace(4)*
+  // CHECK: phi
+  // CHECK: store
+
+  var_gen_v = var_gen ? var_glob : var_gen_f; // operands of overlapping addr spaces and different types
+  // CHECK: icmp
+  // CHECK: %{{.+}} = addrspacecast i32 addrspace(1)* %{{.+}} to i8 addrspace(4)*
+  // CHECK: %{{.+}} = bitcast float addrspace(4)* %{{.+}} to i8 addrspace(4)*
+  // CHECK: phi
+  // CHECK: store
 }
diff --git a/test/CodeGenOpenCL/amdgcn-flat-scratch-name.cl b/test/CodeGenOpenCL/amdgcn-flat-scratch-name.cl
new file mode 100644
index 0000000000000..3a98e9099f522
--- /dev/null
+++ b/test/CodeGenOpenCL/amdgcn-flat-scratch-name.cl
@@ -0,0 +1,15 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -S -emit-llvm -o - %s | FileCheck %s
+
+// CHECK-LABEL: @use_flat_scratch_name
+kernel void use_flat_scratch_name()
+{
+// CHECK: tail call void asm sideeffect "s_mov_b64 flat_scratch, 0", "~{flat_scratch}"()
+  __asm__ volatile("s_mov_b64 flat_scratch, 0" : : : "flat_scratch");
+
+// CHECK: tail call void asm sideeffect "s_mov_b32 flat_scratch_lo, 0", "~{flat_scratch_lo}"()
+  __asm__ volatile("s_mov_b32 flat_scratch_lo, 0" : : : "flat_scratch_lo");
+
+// CHECK: tail call void asm sideeffect "s_mov_b32 flat_scratch_hi, 0", "~{flat_scratch_hi}"()
+  __asm__ volatile("s_mov_b32 flat_scratch_hi, 0" : : : "flat_scratch_hi");
+}
diff --git a/test/CodeGenOpenCL/amdgpu-call-kernel.cl b/test/CodeGenOpenCL/amdgpu-call-kernel.cl
new file mode 100755
index 0000000000000..005793916c68a
--- /dev/null
+++ b/test/CodeGenOpenCL/amdgpu-call-kernel.cl
@@ -0,0 +1,14 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -S -emit-llvm -o - %s | FileCheck %s
+// CHECK: define amdgpu_kernel void @test_call_kernel(i32 addrspace(1)* nocapture %out)
+// CHECK: store i32 4, i32 addrspace(1)* %out, align 4
+
+kernel void test_kernel(global int *out)
+{
+  out[0] = 4;
+}
+
+__kernel void test_call_kernel(__global int *out)
+{
+  test_kernel(out);
+}
diff --git a/test/CodeGenOpenCL/amdgpu-calling-conv.cl b/test/CodeGenOpenCL/amdgpu-calling-conv.cl
new file mode 100644
index 0000000000000..7da9d7f4d49a2
--- /dev/null
+++ b/test/CodeGenOpenCL/amdgpu-calling-conv.cl
@@ -0,0 +1,12 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -S -emit-llvm -o - %s | FileCheck %s
+
+// CHECK: define amdgpu_kernel void @calling_conv_amdgpu_kernel()
+kernel void calling_conv_amdgpu_kernel()
+{
+}
+
+// CHECK: define void @calling_conv_none()
+void calling_conv_none()
+{
+}
diff --git a/test/CodeGenOpenCL/amdgpu-num-gpr-attr.cl b/test/CodeGenOpenCL/amdgpu-num-gpr-attr.cl
index 35bdcead31280..589d00d1eaa49 100644
--- a/test/CodeGenOpenCL/amdgpu-num-gpr-attr.cl
+++ b/test/CodeGenOpenCL/amdgpu-num-gpr-attr.cl
@@ -5,23 +5,23 @@
 
 __attribute__((amdgpu_num_vgpr(64))) // expected-no-diagnostics
 kernel void test_num_vgpr64() {
-// CHECK: define void @test_num_vgpr64() [[ATTR_VGPR64:#[0-9]+]]
+// CHECK: define amdgpu_kernel void @test_num_vgpr64() [[ATTR_VGPR64:#[0-9]+]]
 }
 
 __attribute__((amdgpu_num_sgpr(32))) // expected-no-diagnostics
 kernel void test_num_sgpr32() {
-// CHECK: define void @test_num_sgpr32() [[ATTR_SGPR32:#[0-9]+]]
+// CHECK: define amdgpu_kernel void @test_num_sgpr32() [[ATTR_SGPR32:#[0-9]+]]
 }
 
 __attribute__((amdgpu_num_vgpr(64), amdgpu_num_sgpr(32))) // expected-no-diagnostics
 kernel void test_num_vgpr64_sgpr32() {
-// CHECK: define void @test_num_vgpr64_sgpr32() [[ATTR_VGPR64_SGPR32:#[0-9]+]]
+// CHECK: define amdgpu_kernel void @test_num_vgpr64_sgpr32() [[ATTR_VGPR64_SGPR32:#[0-9]+]]
 
 }
 
 __attribute__((amdgpu_num_sgpr(20), amdgpu_num_vgpr(40))) // expected-no-diagnostics
 kernel void test_num_sgpr20_vgpr40() {
-// CHECK: define void @test_num_sgpr20_vgpr40() [[ATTR_SGPR20_VGPR40:#[0-9]+]]
+// CHECK: define amdgpu_kernel void @test_num_sgpr20_vgpr40() [[ATTR_SGPR20_VGPR40:#[0-9]+]]
 }
 
 __attribute__((amdgpu_num_vgpr(0))) // expected-no-diagnostics
@@ -40,8 +40,8 @@ kernel void test_num_vgpr0_sgpr0() {
 // X86-NOT: "amdgpu_num_vgpr"
 // X86-NOT: "amdgpu_num_sgpr"
 
-// CHECK-DAG-NOT: "amdgpu_num_vgpr"="0"
-// CHECK-DAG-NOT: "amdgpu_num_sgpr"="0"
+// CHECK-NOT: "amdgpu_num_vgpr"="0"
+// CHECK-NOT: "amdgpu_num_sgpr"="0"
 // CHECK-DAG: attributes [[ATTR_VGPR64]] = { nounwind "amdgpu_num_vgpr"="64"
 // CHECK-DAG: attributes [[ATTR_SGPR32]] = { nounwind "amdgpu_num_sgpr"="32"
 // CHECK-DAG: attributes [[ATTR_VGPR64_SGPR32]] = { nounwind "amdgpu_num_sgpr"="32" "amdgpu_num_vgpr"="64"
diff --git a/test/CodeGenOpenCL/as_type.cl b/test/CodeGenOpenCL/as_type.cl
new file mode 100644
index 0000000000000..7fc3b02bdc757
--- /dev/null
+++ b/test/CodeGenOpenCL/as_type.cl
@@ -0,0 +1,68 @@
+// RUN: %clang_cc1 %s -emit-llvm -triple spir-unknown-unknown -o - | FileCheck %s
+
+typedef __attribute__(( ext_vector_type(3) )) char char3;
+typedef __attribute__(( ext_vector_type(4) )) char char4;
+typedef __attribute__(( ext_vector_type(16) )) char char16;
+typedef __attribute__(( ext_vector_type(3) )) int int3;
+
+//CHECK: define spir_func <3 x i8> @f1(<4 x i8> %[[x:.*]])
+//CHECK: %[[astype:.*]] = shufflevector <4 x i8> %[[x]], <4 x i8> undef, <3 x i32> <i32 0, i32 1, i32 2>
+//CHECK: ret <3 x i8> %[[astype]]
+char3 f1(char4 x) {
+  return  __builtin_astype(x, char3);
+}
+
+//CHECK: define spir_func <4 x i8> @f2(<3 x i8> %[[x:.*]])
+//CHECK: %[[astype:.*]] = shufflevector <3 x i8> %[[x]], <3 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
+//CHECK: ret <4 x i8> %[[astype]]
+char4 f2(char3 x) {
+  return __builtin_astype(x, char4);
+}
+
+//CHECK: define spir_func <3 x i8> @f3(i32 %[[x:.*]])
+//CHECK: %[[cast:.*]] = bitcast i32 %[[x]] to <4 x i8>
+//CHECK: %[[astype:.*]] = shufflevector <4 x i8> %[[cast]], <4 x i8> undef, <3 x i32> <i32 0, i32 1, i32 2>
+//CHECK: ret <3 x i8> %[[astype]]
+char3 f3(int x) {
+  return __builtin_astype(x, char3);
+}
+
+//CHECK: define spir_func <4 x i8> @f4(i32 %[[x:.*]])
+//CHECK: %[[astype:.*]] = bitcast i32 %[[x]] to <4 x i8>
+//CHECK-NOT: shufflevector
+//CHECK: ret <4 x i8> %[[astype]]
+char4 f4(int x) {
+  return __builtin_astype(x, char4);
+}
+
+//CHECK: define spir_func i32 @f5(<3 x i8> %[[x:.*]])
+//CHECK: %[[shuffle:.*]] = shufflevector <3 x i8> %[[x]], <3 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
+//CHECK: %[[astype:.*]] = bitcast <4 x i8> %[[shuffle]] to i32
+//CHECK: ret i32 %[[astype]]
+int f5(char3 x) {
+  return __builtin_astype(x, int);
+}
+
+//CHECK: define spir_func i32 @f6(<4 x i8> %[[x:.*]])
+//CHECK: %[[astype:.*]] = bitcast <4 x i8> %[[x]] to i32
+//CHECK-NOT: shufflevector
+//CHECK: ret i32 %[[astype]]
+int f6(char4 x) {
+  return __builtin_astype(x, int);
+}
+
+//CHECK: define spir_func <3 x i8> @f7(<3 x i8> %[[x:.*]])
+//CHECK-NOT: bitcast
+//CHECK-NOT: shufflevector
+//CHECK: ret <3 x i8> %[[x]]
+char3 f7(char3 x) {
+  return __builtin_astype(x, char3);
+}
+
+//CHECK: define spir_func <3 x i32> @f8(<16 x i8> %[[x:.*]])
+//CHECK: %[[cast:.*]] = bitcast <16 x i8> %[[x]] to <4 x i32>
+//CHECK: %[[astype:.*]] = shufflevector <4 x i32> %[[cast]], <4 x i32> undef, <3 x i32> <i32 0, i32 1, i32 2>
+//CHECK: ret <3 x i32> %[[astype]]
+int3 f8(char16 x) {
+  return __builtin_astype(x, int3);
+}
diff --git a/test/CodeGenOpenCL/builtins-amdgcn-error.cl b/test/CodeGenOpenCL/builtins-amdgcn-error.cl
new file mode 100644
index 0000000000000..89c3e490ecdd8
--- /dev/null
+++ b/test/CodeGenOpenCL/builtins-amdgcn-error.cl
@@ -0,0 +1,18 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-unknown-amdhsa -target-cpu tahiti -verify -S -o - %s
+
+// FIXME: We only get one error if the functions are the other order in the
+// file.
+
+typedef unsigned long ulong;
+
+ulong test_s_memrealtime()
+{
+  return __builtin_amdgcn_s_memrealtime(); // expected-error {{'__builtin_amdgcn_s_memrealtime' needs target feature s-memrealtime}}
+}
+
+void test_s_sleep(int x)
+{
+  __builtin_amdgcn_s_sleep(x); // expected-error {{argument to '__builtin_amdgcn_s_sleep' must be a constant integer}}
+}
+
diff --git a/test/CodeGenOpenCL/builtins-amdgcn-vi.cl b/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
new file mode 100644
index 0000000000000..cda87a8e1ed17
--- /dev/null
+++ b/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
@@ -0,0 +1,12 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu tonga -S -emit-llvm -o - %s | FileCheck %s
+
+typedef unsigned long ulong;
+
+
+// CHECK-LABEL: @test_s_memrealtime
+// CHECK: call i64 @llvm.amdgcn.s.memrealtime()
+void test_s_memrealtime(global ulong* out)
+{
+  *out = __builtin_amdgcn_s_memrealtime();
+}
diff --git a/test/CodeGenOpenCL/builtins-amdgcn.cl b/test/CodeGenOpenCL/builtins-amdgcn.cl
new file mode 100644
index 0000000000000..6cac2a44bc79a
--- /dev/null
+++ b/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -0,0 +1,316 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -S -emit-llvm -o - %s | FileCheck %s
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+typedef unsigned long ulong;
+
+// CHECK-LABEL: @test_div_scale_f64
+// CHECK: call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true)
+// CHECK-DAG: [[FLAG:%.+]] = extractvalue { double, i1 } %{{.+}}, 1
+// CHECK-DAG: [[VAL:%.+]] = extractvalue { double, i1 } %{{.+}}, 0
+// CHECK: [[FLAGEXT:%.+]] = zext i1 [[FLAG]] to i32
+// CHECK: store i32 [[FLAGEXT]]
+void test_div_scale_f64(global double* out, global int* flagout, double a, double b)
+{
+  bool flag;
+  *out = __builtin_amdgcn_div_scale(a, b, true, &flag);
+  *flagout = flag;
+}
+
+// CHECK-LABEL: @test_div_scale_f32
+// CHECK: call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true)
+// CHECK-DAG: [[FLAG:%.+]] = extractvalue { float, i1 } %{{.+}}, 1
+// CHECK-DAG: [[VAL:%.+]] = extractvalue { float, i1 } %{{.+}}, 0
+// CHECK: [[FLAGEXT:%.+]] = zext i1 [[FLAG]] to i32
+// CHECK: store i32 [[FLAGEXT]]
+void test_div_scale_f32(global float* out, global int* flagout, float a, float b)
+{
+  bool flag;
+  *out = __builtin_amdgcn_div_scalef(a, b, true, &flag);
+  *flagout = flag;
+}
+
+// CHECK-LABEL: @test_div_fmas_f32
+// CHECK: call float @llvm.amdgcn.div.fmas.f32
+void test_div_fmas_f32(global float* out, float a, float b, float c, int d)
+{
+  *out = __builtin_amdgcn_div_fmasf(a, b, c, d);
+}
+
+// CHECK-LABEL: @test_div_fmas_f64
+// CHECK: call double @llvm.amdgcn.div.fmas.f64
+void test_div_fmas_f64(global double* out, double a, double b, double c, int d)
+{
+  *out = __builtin_amdgcn_div_fmas(a, b, c, d);
+}
+
+// CHECK-LABEL: @test_div_fixup_f32
+// CHECK: call float @llvm.amdgcn.div.fixup.f32
+void test_div_fixup_f32(global float* out, float a, float b, float c)
+{
+  *out = __builtin_amdgcn_div_fixupf(a, b, c);
+}
+
+// CHECK-LABEL: @test_div_fixup_f64
+// CHECK: call double @llvm.amdgcn.div.fixup.f64
+void test_div_fixup_f64(global double* out, double a, double b, double c)
+{
+  *out = __builtin_amdgcn_div_fixup(a, b, c);
+}
+
+// CHECK-LABEL: @test_trig_preop_f32
+// CHECK: call float @llvm.amdgcn.trig.preop.f32
+void test_trig_preop_f32(global float* out, float a, int b)
+{
+  *out = __builtin_amdgcn_trig_preopf(a, b);
+}
+
+// CHECK-LABEL: @test_trig_preop_f64
+// CHECK: call double @llvm.amdgcn.trig.preop.f64
+void test_trig_preop_f64(global double* out, double a, int b)
+{
+  *out = __builtin_amdgcn_trig_preop(a, b);
+}
+
+// CHECK-LABEL: @test_rcp_f32
+// CHECK: call float @llvm.amdgcn.rcp.f32
+void test_rcp_f32(global float* out, float a)
+{
+  *out = __builtin_amdgcn_rcpf(a);
+}
+
+// CHECK-LABEL: @test_rcp_f64
+// CHECK: call double @llvm.amdgcn.rcp.f64
+void test_rcp_f64(global double* out, double a)
+{
+  *out = __builtin_amdgcn_rcp(a);
+}
+
+// CHECK-LABEL: @test_rsq_f32
+// CHECK: call float @llvm.amdgcn.rsq.f32
+void test_rsq_f32(global float* out, float a)
+{
+  *out = __builtin_amdgcn_rsqf(a);
+}
+
+// CHECK-LABEL: @test_rsq_f64
+// CHECK: call double @llvm.amdgcn.rsq.f64
+void test_rsq_f64(global double* out, double a)
+{
+  *out = __builtin_amdgcn_rsq(a);
+}
+
+// CHECK-LABEL: @test_rsq_clamp_f32
+// CHECK: call float @llvm.amdgcn.rsq.clamp.f32
+void test_rsq_clamp_f32(global float* out, float a)
+{
+  *out = __builtin_amdgcn_rsq_clampf(a);
+}
+
+// CHECK-LABEL: @test_rsq_clamp_f64
+// CHECK: call double @llvm.amdgcn.rsq.clamp.f64
+void test_rsq_clamp_f64(global double* out, double a)
+{
+  *out = __builtin_amdgcn_rsq_clamp(a);
+}
+
+// CHECK-LABEL: @test_sin_f32
+// CHECK: call float @llvm.amdgcn.sin.f32
+void test_sin_f32(global float* out, float a)
+{
+  *out = __builtin_amdgcn_sinf(a);
+}
+
+// CHECK-LABEL: @test_cos_f32
+// CHECK: call float @llvm.amdgcn.cos.f32
+void test_cos_f32(global float* out, float a)
+{
+  *out = __builtin_amdgcn_cosf(a);
+}
+
+// CHECK-LABEL: @test_log_clamp_f32
+// CHECK: call float @llvm.amdgcn.log.clamp.f32
+void test_log_clamp_f32(global float* out, float a)
+{
+  *out = __builtin_amdgcn_log_clampf(a);
+}
+
+// CHECK-LABEL: @test_ldexp_f32
+// CHECK: call float @llvm.amdgcn.ldexp.f32
+void test_ldexp_f32(global float* out, float a, int b)
+{
+  *out = __builtin_amdgcn_ldexpf(a, b);
+}
+
+// CHECK-LABEL: @test_ldexp_f64
+// CHECK: call double @llvm.amdgcn.ldexp.f64
+void test_ldexp_f64(global double* out, double a, int b)
+{
+  *out = __builtin_amdgcn_ldexp(a, b);
+}
+
+// CHECK-LABEL: @test_frexp_mant_f32
+// CHECK: call float @llvm.amdgcn.frexp.mant.f32
+void test_frexp_mant_f32(global float* out, float a)
+{
+  *out = __builtin_amdgcn_frexp_mantf(a);
+}
+
+// CHECK-LABEL: @test_frexp_mant_f64
+// CHECK: call double @llvm.amdgcn.frexp.mant.f64
+void test_frexp_mant_f64(global double* out, double a)
+{
+  *out = __builtin_amdgcn_frexp_mant(a);
+}
+
+// CHECK-LABEL: @test_frexp_exp_f32
+// CHECK: call i32 @llvm.amdgcn.frexp.exp.f32
+void test_frexp_exp_f32(global int* out, float a)
+{
+  *out = __builtin_amdgcn_frexp_expf(a);
+}
+
+// CHECK-LABEL: @test_frexp_exp_f64
+// CHECK: call i32 @llvm.amdgcn.frexp.exp.f64
+void test_frexp_exp_f64(global int* out, double a)
+{
+  *out = __builtin_amdgcn_frexp_exp(a);
+}
+
+// CHECK-LABEL: @test_fract_f32
+// CHECK: call float @llvm.amdgcn.fract.f32
+void test_fract_f32(global int* out, float a)
+{
+  *out = __builtin_amdgcn_fractf(a);
+}
+
+// CHECK-LABEL: @test_fract_f64
+// CHECK: call double @llvm.amdgcn.fract.f64
+void test_fract_f64(global int* out, double a)
+{
+  *out = __builtin_amdgcn_fract(a);
+}
+
+// CHECK-LABEL: @test_lerp
+// CHECK: call i32 @llvm.amdgcn.lerp
+void test_lerp(global int* out, int a, int b, int c)
+{
+  *out = __builtin_amdgcn_lerp(a, b, c);
+}
+
+// CHECK-LABEL: @test_class_f32
+// CHECK: call i1 @llvm.amdgcn.class.f32
+void test_class_f32(global float* out, float a, int b)
+{
+  *out = __builtin_amdgcn_classf(a, b);
+}
+
+// CHECK-LABEL: @test_class_f64
+// CHECK: call i1 @llvm.amdgcn.class.f64
+void test_class_f64(global double* out, double a, int b)
+{
+  *out = __builtin_amdgcn_class(a, b);
+}
+
+// CHECK-LABEL: @test_s_barrier
+// CHECK: call void @llvm.amdgcn.s.barrier(
+void test_s_barrier()
+{
+  __builtin_amdgcn_s_barrier();
+}
+
+// CHECK-LABEL: @test_s_memtime
+// CHECK: call i64 @llvm.amdgcn.s.memtime()
+void test_s_memtime(global ulong* out)
+{
+  *out = __builtin_amdgcn_s_memtime();
+}
+
+// CHECK-LABEL: @test_s_sleep
+// CHECK: call void @llvm.amdgcn.s.sleep(i32 1)
+// CHECK: call void @llvm.amdgcn.s.sleep(i32 15)
+void test_s_sleep()
+{
+  __builtin_amdgcn_s_sleep(1);
+  __builtin_amdgcn_s_sleep(15);
+}
+
+// CHECK-LABEL: @test_cubeid(
+// CHECK: call float @llvm.amdgcn.cubeid(float %a, float %b, float %c)
+void test_cubeid(global float* out, float a, float b, float c) {
+  *out = __builtin_amdgcn_cubeid(a, b, c);
+}
+
+// CHECK-LABEL: @test_cubesc(
+// CHECK: call float @llvm.amdgcn.cubesc(float %a, float %b, float %c)
+void test_cubesc(global float* out, float a, float b, float c) {
+  *out = __builtin_amdgcn_cubesc(a, b, c);
+}
+
+// CHECK-LABEL: @test_cubetc(
+// CHECK: call float @llvm.amdgcn.cubetc(float %a, float %b, float %c)
+void test_cubetc(global float* out, float a, float b, float c) {
+  *out = __builtin_amdgcn_cubetc(a, b, c);
+}
+
+// CHECK-LABEL: @test_cubema(
+// CHECK: call float @llvm.amdgcn.cubema(float %a, float %b, float %c)
+void test_cubema(global float* out, float a, float b, float c) {
+  *out = __builtin_amdgcn_cubema(a, b, c);
+}
+
+// CHECK-LABEL: @test_read_exec(
+// CHECK: call i64 @llvm.read_register.i64(metadata ![[EXEC:[0-9]+]]) #[[READ_EXEC_ATTRS:[0-9]+]]
+void test_read_exec(global ulong* out) {
+  *out = __builtin_amdgcn_read_exec();
+}
+
+// CHECK: declare i64 @llvm.read_register.i64(metadata) #[[NOUNWIND_READONLY:[0-9]+]]
+
+// CHECK-LABEL: @test_kernarg_segment_ptr
+// CHECK: call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
+void test_kernarg_segment_ptr(__attribute__((address_space(2))) unsigned char ** out)
+{
+  *out = __builtin_amdgcn_kernarg_segment_ptr();
+}
+
+// CHECK-LABEL: @test_implicitarg_ptr
+// CHECK: call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr()
+void test_implicitarg_ptr(__attribute__((address_space(2))) unsigned char ** out)
+{
+  *out = __builtin_amdgcn_implicitarg_ptr();
+}
+
+// CHECK-LABEL: @test_get_group_id(
+// CHECK: tail call i32 @llvm.amdgcn.workgroup.id.x()
+// CHECK: tail call i32 @llvm.amdgcn.workgroup.id.y()
+// CHECK: tail call i32 @llvm.amdgcn.workgroup.id.z()
+void test_get_group_id(int d, global int *out)
+{
+	switch (d) {
+	case 0: *out = __builtin_amdgcn_workgroup_id_x(); break;
+	case 1: *out = __builtin_amdgcn_workgroup_id_y(); break;
+	case 2: *out = __builtin_amdgcn_workgroup_id_z(); break;
+	default: *out = 0;
+	}
+}
+
+// CHECK-LABEL: @test_get_local_id(
+// CHECK: tail call i32 @llvm.amdgcn.workitem.id.x(), !range [[WI_RANGE:![0-9]*]]
+// CHECK: tail call i32 @llvm.amdgcn.workitem.id.y(), !range [[WI_RANGE]]
+// CHECK: tail call i32 @llvm.amdgcn.workitem.id.z(), !range [[WI_RANGE]]
+void test_get_local_id(int d, global int *out)
+{
+	switch (d) {
+	case 0: *out = __builtin_amdgcn_workitem_id_x(); break;
+	case 1: *out = __builtin_amdgcn_workitem_id_y(); break;
+	case 2: *out = __builtin_amdgcn_workitem_id_z(); break;
+	default: *out = 0;
+	}
+}
+
+// CHECK-DAG: [[WI_RANGE]] = !{i32 0, i32 1024}
+// CHECK-DAG: attributes #[[NOUNWIND_READONLY:[0-9]+]] = { nounwind readonly }
+// CHECK-DAG: attributes #[[READ_EXEC_ATTRS]] = { convergent }
+// CHECK-DAG: ![[EXEC]] = !{!"exec"}
diff --git a/test/CodeGenOpenCL/builtins-generic-amdgcn.cl b/test/CodeGenOpenCL/builtins-generic-amdgcn.cl
new file mode 100644
index 0000000000000..5a4756bacb858
--- /dev/null
+++ b/test/CodeGenOpenCL/builtins-generic-amdgcn.cl
@@ -0,0 +1,16 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -S -emit-llvm -o - %s | FileCheck %s
+
+// CHECK-LABEL: @test_builtin_clz(
+// CHECK: tail call i32 @llvm.ctlz.i32(i32 %a, i1 true)
+void test_builtin_clz(global int* out, int a)
+{
+  *out = __builtin_clz(a);
+}
+
+// CHECK-LABEL: @test_builtin_clzl(
+// CHECK: tail call i64 @llvm.ctlz.i64(i64 %a, i1 true)
+void test_builtin_clzl(global long* out, long a)
+{
+  *out = __builtin_clzl(a);
+}
diff --git a/test/CodeGenOpenCL/builtins-r600.cl b/test/CodeGenOpenCL/builtins-r600.cl
index 3e416b0323c20..027a54a6bce29 100644
--- a/test/CodeGenOpenCL/builtins-r600.cl
+++ b/test/CodeGenOpenCL/builtins-r600.cl
@@ -1,143 +1,55 @@
-// REQUIRES: r600-registered-target
-// RUN: %clang_cc1 -triple r600-unknown-unknown -S -emit-llvm -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -S -emit-llvm -o - %s | FileCheck %s
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple r600-unknown-unknown -target-cpu cypress -S -emit-llvm -o - %s | FileCheck %s
 
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-
-// CHECK-LABEL: @test_div_scale_f64
-// CHECK: call { double, i1 } @llvm.AMDGPU.div.scale.f64(double %a, double %b, i1 true)
-// CHECK-DAG: [[FLAG:%.+]] = extractvalue { double, i1 } %{{.+}}, 1
-// CHECK-DAG: [[VAL:%.+]] = extractvalue { double, i1 } %{{.+}}, 0
-// CHECK: [[FLAGEXT:%.+]] = zext i1 [[FLAG]] to i32
-// CHECK: store i32 [[FLAGEXT]]
-void test_div_scale_f64(global double* out, global int* flagout, double a, double b)
-{
-  bool flag;
-  *out = __builtin_amdgpu_div_scale(a, b, true, &flag);
-  *flagout = flag;
-}
-
-// CHECK-LABEL: @test_div_scale_f32
-// CHECK: call { float, i1 } @llvm.AMDGPU.div.scale.f32(float %a, float %b, i1 true)
-// CHECK-DAG: [[FLAG:%.+]] = extractvalue { float, i1 } %{{.+}}, 1
-// CHECK-DAG: [[VAL:%.+]] = extractvalue { float, i1 } %{{.+}}, 0
-// CHECK: [[FLAGEXT:%.+]] = zext i1 [[FLAG]] to i32
-// CHECK: store i32 [[FLAGEXT]]
-void test_div_scale_f32(global float* out, global int* flagout, float a, float b)
-{
-  bool flag;
-  *out = __builtin_amdgpu_div_scalef(a, b, true, &flag);
-  *flagout = flag;
-}
-
-// CHECK-LABEL: @test_div_fmas_f32
-// CHECK: call float @llvm.AMDGPU.div.fmas.f32
-void test_div_fmas_f32(global float* out, float a, float b, float c, int d)
-{
-  *out = __builtin_amdgpu_div_fmasf(a, b, c, d);
-}
-
-// CHECK-LABEL: @test_div_fmas_f64
-// CHECK: call double @llvm.AMDGPU.div.fmas.f64
-void test_div_fmas_f64(global double* out, double a, double b, double c, int d)
+// CHECK-LABEL: @test_recipsqrt_ieee_f32
+// CHECK: call float @llvm.r600.recipsqrt.ieee.f32
+void test_recipsqrt_ieee_f32(global float* out, float a)
 {
-  *out = __builtin_amdgpu_div_fmas(a, b, c, d);
+  *out = __builtin_r600_recipsqrt_ieeef(a);
 }
 
-// CHECK-LABEL: @test_div_fixup_f32
-// CHECK: call float @llvm.AMDGPU.div.fixup.f32
-void test_div_fixup_f32(global float* out, float a, float b, float c)
+#if cl_khr_fp64
+// XCHECK-LABEL: @test_recipsqrt_ieee_f64
+// XCHECK: call double @llvm.r600.recipsqrt.ieee.f64
+void test_recipsqrt_ieee_f64(global double* out, double a)
 {
-  *out = __builtin_amdgpu_div_fixupf(a, b, c);
+  *out = __builtin_r600_recipsqrt_ieee(a);
 }
+#endif
 
-// CHECK-LABEL: @test_div_fixup_f64
-// CHECK: call double @llvm.AMDGPU.div.fixup.f64
-void test_div_fixup_f64(global double* out, double a, double b, double c)
+// CHECK-LABEL: @test_implicitarg_ptr
+// CHECK: call i8 addrspace(7)* @llvm.r600.implicitarg.ptr()
+void test_implicitarg_ptr(__attribute__((address_space(7))) unsigned char ** out)
 {
-  *out = __builtin_amdgpu_div_fixup(a, b, c);
+  *out = __builtin_r600_implicitarg_ptr();
 }
 
-// CHECK-LABEL: @test_trig_preop_f32
-// CHECK: call float @llvm.AMDGPU.trig.preop.f32
-void test_trig_preop_f32(global float* out, float a, int b)
+// CHECK-LABEL: @test_get_group_id(
+// CHECK: tail call i32 @llvm.r600.read.tgid.x()
+// CHECK: tail call i32 @llvm.r600.read.tgid.y()
+// CHECK: tail call i32 @llvm.r600.read.tgid.z()
+void test_get_group_id(int d, global int *out)
 {
-  *out = __builtin_amdgpu_trig_preopf(a, b);
+	switch (d) {
+	case 0: *out = __builtin_r600_read_tgid_x(); break;
+	case 1: *out = __builtin_r600_read_tgid_y(); break;
+	case 2: *out = __builtin_r600_read_tgid_z(); break;
+	default: *out = 0;
+	}
 }
 
-// CHECK-LABEL: @test_trig_preop_f64
-// CHECK: call double @llvm.AMDGPU.trig.preop.f64
-void test_trig_preop_f64(global double* out, double a, int b)
+// CHECK-LABEL: @test_get_local_id(
+// CHECK: tail call i32 @llvm.r600.read.tidig.x(), !range [[WI_RANGE:![0-9]*]]
+// CHECK: tail call i32 @llvm.r600.read.tidig.y(), !range [[WI_RANGE]]
+// CHECK: tail call i32 @llvm.r600.read.tidig.z(), !range [[WI_RANGE]]
+void test_get_local_id(int d, global int *out)
 {
-  *out = __builtin_amdgpu_trig_preop(a, b);
+	switch (d) {
+	case 0: *out = __builtin_r600_read_tidig_x(); break;
+	case 1: *out = __builtin_r600_read_tidig_y(); break;
+	case 2: *out = __builtin_r600_read_tidig_z(); break;
+	default: *out = 0;
+	}
 }
 
-// CHECK-LABEL: @test_rcp_f32
-// CHECK: call float @llvm.AMDGPU.rcp.f32
-void test_rcp_f32(global float* out, float a)
-{
-  *out = __builtin_amdgpu_rcpf(a);
-}
-
-// CHECK-LABEL: @test_rcp_f64
-// CHECK: call double @llvm.AMDGPU.rcp.f64
-void test_rcp_f64(global double* out, double a)
-{
-  *out = __builtin_amdgpu_rcp(a);
-}
-
-// CHECK-LABEL: @test_rsq_f32
-// CHECK: call float @llvm.AMDGPU.rsq.f32
-void test_rsq_f32(global float* out, float a)
-{
-  *out = __builtin_amdgpu_rsqf(a);
-}
-
-// CHECK-LABEL: @test_rsq_f64
-// CHECK: call double @llvm.AMDGPU.rsq.f64
-void test_rsq_f64(global double* out, double a)
-{
-  *out = __builtin_amdgpu_rsq(a);
-}
-
-// CHECK-LABEL: @test_rsq_clamped_f32
-// CHECK: call float @llvm.AMDGPU.rsq.clamped.f32
-void test_rsq_clamped_f32(global float* out, float a)
-{
-  *out = __builtin_amdgpu_rsq_clampedf(a);
-}
-
-// CHECK-LABEL: @test_rsq_clamped_f64
-// CHECK: call double @llvm.AMDGPU.rsq.clamped.f64
-void test_rsq_clamped_f64(global double* out, double a)
-{
-  *out = __builtin_amdgpu_rsq_clamped(a);
-}
-
-// CHECK-LABEL: @test_ldexp_f32
-// CHECK: call float @llvm.AMDGPU.ldexp.f32
-void test_ldexp_f32(global float* out, float a, int b)
-{
-  *out = __builtin_amdgpu_ldexpf(a, b);
-}
-
-// CHECK-LABEL: @test_ldexp_f64
-// CHECK: call double @llvm.AMDGPU.ldexp.f64
-void test_ldexp_f64(global double* out, double a, int b)
-{
-  *out = __builtin_amdgpu_ldexp(a, b);
-}
-
-// CHECK-LABEL: @test_class_f32
-// CHECK: call i1 @llvm.AMDGPU.class.f32
-void test_class_f32(global float* out, float a, int b)
-{
-  *out = __builtin_amdgpu_classf(a, b);
-}
-
-// CHECK-LABEL: @test_class_f64
-// CHECK: call i1 @llvm.AMDGPU.class.f64
-void test_class_f64(global double* out, double a, int b)
-{
-  *out = __builtin_amdgpu_class(a, b);
-}
+// CHECK-DAG: [[WI_RANGE]] = !{i32 0, i32 1024}
diff --git a/test/CodeGenOpenCL/cl20-device-side-enqueue.cl b/test/CodeGenOpenCL/cl20-device-side-enqueue.cl
new file mode 100644
index 0000000000000..08b24bdc5bec6
--- /dev/null
+++ b/test/CodeGenOpenCL/cl20-device-side-enqueue.cl
@@ -0,0 +1,110 @@
+// RUN: %clang_cc1 %s -cl-std=CL2.0 -ffake-address-space-map -O0 -emit-llvm -o - | FileCheck %s
+
+typedef void (^bl_t)(local void *);
+
+const bl_t block_G = (bl_t) ^ (local void *a) {};
+
+kernel void device_side_enqueue(global int *a, global int *b, int i) {
+  // CHECK: %default_queue = alloca %opencl.queue_t*
+  queue_t default_queue;
+  // CHECK: %flags = alloca i32
+  unsigned flags = 0;
+  // CHECK: %ndrange = alloca %opencl.ndrange_t*
+  ndrange_t ndrange;
+  // CHECK: %clk_event = alloca %opencl.clk_event_t*
+  clk_event_t clk_event;
+  // CHECK: %event_wait_list = alloca %opencl.clk_event_t*
+  clk_event_t event_wait_list;
+  // CHECK: %event_wait_list2 = alloca [1 x %opencl.clk_event_t*]
+  clk_event_t event_wait_list2[] = {clk_event};
+
+  // CHECK: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t*, %opencl.queue_t** %default_queue
+  // CHECK: [[FLAGS:%[0-9]+]] = load i32, i32* %flags
+  // CHECK: [[NDR:%[0-9]+]] = load %opencl.ndrange_t*, %opencl.ndrange_t** %ndrange
+  // CHECK: [[BL:%[0-9]+]] = bitcast <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i32{{.*}}, i32{{.*}}, i32{{.*}} }>* %block to void ()*
+  // CHECK: [[BL_I8:%[0-9]+]] = bitcast void ()* [[BL]] to i8*
+  // CHECK: call i32 @__enqueue_kernel_basic(%opencl.queue_t* [[DEF_Q]], i32 [[FLAGS]], %opencl.ndrange_t* [[NDR]], i8* [[BL_I8]])
+  enqueue_kernel(default_queue, flags, ndrange,
+                 ^(void) {
+                   a[i] = b[i];
+                 });
+
+  // CHECK: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t*, %opencl.queue_t** %default_queue
+  // CHECK: [[FLAGS:%[0-9]+]] = load i32, i32* %flags
+  // CHECK: [[NDR:%[0-9]+]] = load %opencl.ndrange_t*, %opencl.ndrange_t** %ndrange
+  // CHECK: [[BL:%[0-9]+]] = bitcast <{ i8*, i32, i32, i8*, %struct.__block_descriptor*, i32{{.*}}, i32{{.*}}, i32{{.*}} }>* %block3 to void ()*
+  // CHECK: [[BL_I8:%[0-9]+]] = bitcast void ()* [[BL]] to i8*
+  // CHECK: call i32 @__enqueue_kernel_basic_events(%opencl.queue_t* [[DEF_Q]], i32 [[FLAGS]], %opencl.ndrange_t* [[NDR]], i32 2, %opencl.clk_event_t** %event_wait_list, %opencl.clk_event_t** %clk_event, i8* [[BL_I8]])
+  enqueue_kernel(default_queue, flags, ndrange, 2, &event_wait_list, &clk_event,
+                 ^(void) {
+                   a[i] = b[i];
+                 });
+
+  // CHECK: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t*, %opencl.queue_t** %default_queue
+  // CHECK: [[FLAGS:%[0-9]+]] = load i32, i32* %flags
+  // CHECK: [[NDR:%[0-9]+]] = load %opencl.ndrange_t*, %opencl.ndrange_t** %ndrange
+  // CHECK: call i32 (%opencl.queue_t*, i32, %opencl.ndrange_t*, i8*, i32, ...) @__enqueue_kernel_vaargs(%opencl.queue_t* [[DEF_Q]], i32 [[FLAGS]], %opencl.ndrange_t* [[NDR]], i8* bitcast ({ i8**, i32, i32, i8*, %struct.__block_descriptor* }* @__block_literal_global{{(.[0-9]+)?}} to i8*), i32 1, i32 256)
+  enqueue_kernel(default_queue, flags, ndrange,
+                 ^(local void *p) {
+                   return;
+                 },
+                 256);
+  char c;
+  // CHECK: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t*, %opencl.queue_t** %default_queue
+  // CHECK: [[FLAGS:%[0-9]+]] = load i32, i32* %flags
+  // CHECK: [[NDR:%[0-9]+]] = load %opencl.ndrange_t*, %opencl.ndrange_t** %ndrange
+  // CHECK: [[SIZE:%[0-9]+]] = zext i8 {{%[0-9]+}} to i32
+  // CHECK: call i32 (%opencl.queue_t*, i32, %opencl.ndrange_t*, i8*, i32, ...) @__enqueue_kernel_vaargs(%opencl.queue_t* [[DEF_Q]], i32 [[FLAGS]], %opencl.ndrange_t* [[NDR]], i8* bitcast ({ i8**, i32, i32, i8*, %struct.__block_descriptor* }* @__block_literal_global{{(.[0-9]+)?}} to i8*), i32 1, i32 [[SIZE]])
+  enqueue_kernel(default_queue, flags, ndrange,
+                 ^(local void *p) {
+                   return;
+                 },
+                 c);
+
+  // CHECK: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t*, %opencl.queue_t** %default_queue
+  // CHECK: [[FLAGS:%[0-9]+]] = load i32, i32* %flags
+  // CHECK: [[NDR:%[0-9]+]] = load %opencl.ndrange_t*, %opencl.ndrange_t** %ndrange
+  // CHECK: [[AD:%arraydecay[0-9]*]] = getelementptr inbounds [1 x %opencl.clk_event_t*], [1 x %opencl.clk_event_t*]* %event_wait_list2, i32 0, i32 0
+  // CHECK: call i32 (%opencl.queue_t*, i32, %opencl.ndrange_t*, i32, %opencl.clk_event_t**, %opencl.clk_event_t**, i8*, i32, ...) @__enqueue_kernel_events_vaargs(%opencl.queue_t* [[DEF_Q]], i32 [[FLAGS]], %opencl.ndrange_t* [[NDR]], i32 2, %opencl.clk_event_t** [[AD]], %opencl.clk_event_t** %clk_event, i8* bitcast ({ i8**, i32, i32, i8*, %struct.__block_descriptor* }* @__block_literal_global{{(.[0-9]+)?}} to i8*), i32 1, i32 256)
+  enqueue_kernel(default_queue, flags, ndrange, 2, event_wait_list2, &clk_event,
+                 ^(local void *p) {
+                   return;
+                 },
+                 256);
+
+  // CHECK: [[DEF_Q:%[0-9]+]] = load %opencl.queue_t*, %opencl.queue_t** %default_queue
+  // CHECK: [[FLAGS:%[0-9]+]] = load i32, i32* %flags
+  // CHECK: [[NDR:%[0-9]+]] = load %opencl.ndrange_t*, %opencl.ndrange_t** %ndrange
+  // CHECK: [[AD:%arraydecay[0-9]*]] = getelementptr inbounds [1 x %opencl.clk_event_t*], [1 x %opencl.clk_event_t*]* %event_wait_list2, i32 0, i32 0
+  // CHECK: [[SIZE:%[0-9]+]] = zext i8 {{%[0-9]+}} to i32
+  // CHECK: call i32 (%opencl.queue_t*, i32, %opencl.ndrange_t*, i32, %opencl.clk_event_t**, %opencl.clk_event_t**, i8*, i32, ...) @__enqueue_kernel_events_vaargs(%opencl.queue_t* [[DEF_Q]], i32 [[FLAGS]], %opencl.ndrange_t* [[NDR]], i32 2, %opencl.clk_event_t** [[AD]], %opencl.clk_event_t** %clk_event, i8* bitcast ({ i8**, i32, i32, i8*, %struct.__block_descriptor* }* @__block_literal_global{{(.[0-9]+)?}} to i8*), i32 1, i32 [[SIZE]])
+  enqueue_kernel(default_queue, flags, ndrange, 2, event_wait_list2, &clk_event,
+                 ^(local void *p) {
+                   return;
+                 },
+                 c);
+
+  void (^const block_A)(void) = ^{
+    return;
+  };
+  void (^const block_B)(local void *) = ^(local void *a) {
+    return;
+  };
+
+  // CHECK: [[BL:%[0-9]+]] = load void ()*, void ()** %block_A
+  // CHECK: [[BL_I8:%[0-9]+]] = bitcast void ()* [[BL]] to i8*
+  // CHECK: call i32 @__get_kernel_work_group_size_impl(i8* [[BL_I8]])
+  unsigned size = get_kernel_work_group_size(block_A);
+  // CHECK: [[BL:%[0-9]+]] = load void (i8 addrspace(2)*)*, void (i8 addrspace(2)*)** %block_B
+  // CHECK: [[BL_I8:%[0-9]+]] = bitcast void (i8 addrspace(2)*)* [[BL]] to i8*
+  // CHECK: call i32 @__get_kernel_work_group_size_impl(i8* [[BL_I8]])
+  size = get_kernel_work_group_size(block_B);
+  // CHECK: [[BL:%[0-9]+]] = load void ()*, void ()** %block_A
+  // CHECK: [[BL_I8:%[0-9]+]] = bitcast void ()* [[BL]] to i8*
+  // CHECK: call i32 @__get_kernel_preferred_work_group_multiple_impl(i8* [[BL_I8]])
+  size = get_kernel_preferred_work_group_size_multiple(block_A);
+  // CHECK: [[BL:%[0-9]+]] = load void (i8 addrspace(2)*)*, void (i8 addrspace(2)*)* addrspace(1)* @block_G
+  // CHECK: [[BL_I8:%[0-9]+]] = bitcast void (i8 addrspace(2)*)* [[BL]] to i8*
+  // CHECK: call i32 @__get_kernel_preferred_work_group_multiple_impl(i8* [[BL_I8]])
+  size = get_kernel_preferred_work_group_size_multiple(block_G);
+}
diff --git a/test/CodeGenOpenCL/constant-addr-space-globals.cl b/test/CodeGenOpenCL/constant-addr-space-globals.cl
index 92fb9790b5fb2..f81a51458c428 100644
--- a/test/CodeGenOpenCL/constant-addr-space-globals.cl
+++ b/test/CodeGenOpenCL/constant-addr-space-globals.cl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 %s -ffake-address-space-map -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 %s -cl-opt-disable -ffake-address-space-map -emit-llvm -o - | FileCheck %s
 
 // CHECK: @array = addrspace({{[0-9]+}}) constant
 __constant float array[2] = {0.0f, 1.0f};
diff --git a/test/CodeGenOpenCL/event_t.cl b/test/CodeGenOpenCL/event_t.cl
index a84d8bb610c07..aad441f35fd7a 100644
--- a/test/CodeGenOpenCL/event_t.cl
+++ b/test/CodeGenOpenCL/event_t.cl
@@ -9,4 +9,6 @@ void kernel ker() {
 // CHECK: call {{.*}}void @foo(%opencl.event_t* %
   foo(0);
 // CHECK: call {{.*}}void @foo(%opencl.event_t* null)
+  foo((event_t)0);
+// CHECK: call {{.*}}void @foo(%opencl.event_t* null)
 }
diff --git a/test/CodeGenOpenCL/fpmath.cl b/test/CodeGenOpenCL/fpmath.cl
index ef4da845529c6..88df3bf166abb 100644
--- a/test/CodeGenOpenCL/fpmath.cl
+++ b/test/CodeGenOpenCL/fpmath.cl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 %s -emit-llvm -o - -triple spir-unknown-unknown | FileCheck %s
 
 typedef __attribute__(( ext_vector_type(4) )) float float4;
 
diff --git a/test/CodeGenOpenCL/half.cl b/test/CodeGenOpenCL/half.cl
index bd5ae7f649900..9acabf0a2a83c 100644
--- a/test/CodeGenOpenCL/half.cl
+++ b/test/CodeGenOpenCL/half.cl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 %s -emit-llvm -o - -triple spir-unknown-unknown | FileCheck %s
 
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 
diff --git a/test/CodeGenOpenCL/images.cl b/test/CodeGenOpenCL/images.cl
new file mode 100644
index 0000000000000..eb054eceb5df0
--- /dev/null
+++ b/test/CodeGenOpenCL/images.cl
@@ -0,0 +1,11 @@
+// RUN: %clang_cc1 %s -triple x86_64-unknown-linux-gnu -O0 -emit-llvm -o - | FileCheck %s
+
+__attribute__((overloadable)) void read_image(read_only image1d_t img_ro);
+__attribute__((overloadable)) void read_image(write_only image1d_t img_wo);
+
+kernel void test_read_image(read_only image1d_t img_ro, write_only image1d_t img_wo) {
+  // CHECK: call void @_Z10read_image14ocl_image1d_ro(%opencl.image1d_ro_t* %{{[0-9]+}})
+  read_image(img_ro);
+  // CHECK: call void @_Z10read_image14ocl_image1d_wo(%opencl.image1d_wo_t* %{{[0-9]+}})
+  read_image(img_wo);
+}
diff --git a/test/CodeGenOpenCL/kernel-arg-info.cl b/test/CodeGenOpenCL/kernel-arg-info.cl
index 4bc191e1d75fe..5a5c8f9d6c5f6 100644
--- a/test/CodeGenOpenCL/kernel-arg-info.cl
+++ b/test/CodeGenOpenCL/kernel-arg-info.cl
@@ -1,55 +1,88 @@
-// RUN: %clang_cc1 %s -cl-kernel-arg-info -emit-llvm -o - -triple spir-unknown-unknown | FileCheck %s -check-prefix ARGINFO
-// RUN: %clang_cc1 %s -emit-llvm -o - -triple spir-unknown-unknown | FileCheck %s -check-prefix NO-ARGINFO
+// RUN: %clang_cc1 %s -emit-llvm -o - -triple spir-unknown-unknown | FileCheck %s
+// RUN: %clang_cc1 %s -emit-llvm -o - -triple spir-unknown-unknown -cl-kernel-arg-info | FileCheck %s -check-prefix ARGINFO
 
 kernel void foo(__global int * restrict X, const int Y, 
                 volatile int anotherArg, __constant float * restrict Z) {
   *X = Y + anotherArg;
 }
-
-// CHECK:  !{!"kernel_arg_addr_space", i32 1, i32 0, i32 0, i32 2}
-// CHECK:  !{!"kernel_arg_access_qual", !"none", !"none", !"none", !"none"}
-// CHECK:  !{!"kernel_arg_type", !"int*", !"int", !"int", !"float*"}
-// CHECK:  !{!"kernel_arg_base_type", !"int*", !"int", !"int", !"float*"}
-// CHECK:  !{!"kernel_arg_type_qual", !"restrict", !"const", !"volatile", !"restrict const"}
-// ARGINFO: !{!"kernel_arg_name", !"X", !"Y", !"anotherArg", !"Z"}
-// NO-ARGINFO-NOT: !{!"kernel_arg_name", !"X", !"Y", !"anotherArg", !"Z"}
+// CHECK: define spir_kernel void @foo{{[^!]+}}
+// CHECK: !kernel_arg_addr_space ![[MD11:[0-9]+]]
+// CHECK: !kernel_arg_access_qual ![[MD12:[0-9]+]]
+// CHECK: !kernel_arg_type ![[MD13:[0-9]+]]
+// CHECK: !kernel_arg_base_type ![[MD13]]
+// CHECK: !kernel_arg_type_qual ![[MD14:[0-9]+]]
+// CHECK-NOT: !kernel_arg_name
+// ARGINFO: !kernel_arg_name ![[MD15:[0-9]+]]
 
 kernel void foo2(read_only image1d_t img1, image2d_t img2, write_only image2d_array_t img3) {
 }
-// CHECK:  !{!"kernel_arg_addr_space", i32 1, i32 1, i32 1}
-// CHECK:  !{!"kernel_arg_access_qual", !"read_only", !"read_only", !"write_only"}
-// CHECK:  !{!"kernel_arg_type", !"image1d_t", !"image2d_t", !"image2d_array_t"}
-// CHECK:  !{!"kernel_arg_base_type", !"image1d_t", !"image2d_t", !"image2d_array_t"}
-// CHECK:  !{!"kernel_arg_type_qual", !"", !"", !""}
-// ARGINFO: !{!"kernel_arg_name", !"img1", !"img2", !"img3"}
-// NO-ARGINFO-NOT: !{!"kernel_arg_name", !"img1", !"img2", !"img3"}
+// CHECK: define spir_kernel void @foo2{{[^!]+}}
+// CHECK: !kernel_arg_addr_space ![[MD21:[0-9]+]]
+// CHECK: !kernel_arg_access_qual ![[MD22:[0-9]+]]
+// CHECK: !kernel_arg_type ![[MD23:[0-9]+]]
+// CHECK: !kernel_arg_base_type ![[MD23]]
+// CHECK: !kernel_arg_type_qual ![[MD24:[0-9]+]]
+// CHECK-NOT: !kernel_arg_name
+// ARGINFO: !kernel_arg_name ![[MD25:[0-9]+]]
 
 kernel void foo3(__global half * X) {
 }
-// CHECK:  !{!"kernel_arg_addr_space", i32 1}
-// CHECK:  !{!"kernel_arg_access_qual", !"none"}
-// CHECK:  !{!"kernel_arg_type", !"half*"}
-// CHECK:  !{!"kernel_arg_base_type", !"half*"}
-// CHECK:  !{!"kernel_arg_type_qual", !""}
-// ARGINFO: !{!"kernel_arg_name", !"X"}
-// NO-ARGINFO-NOT: !{!"kernel_arg_name", !"X"}
+// CHECK: define spir_kernel void @foo3{{[^!]+}}
+// CHECK: !kernel_arg_addr_space ![[MD31:[0-9]+]]
+// CHECK: !kernel_arg_access_qual ![[MD32:[0-9]+]]
+// CHECK: !kernel_arg_type ![[MD33:[0-9]+]]
+// CHECK: !kernel_arg_base_type ![[MD33]]
+// CHECK: !kernel_arg_type_qual ![[MD34:[0-9]+]]
+// CHECK-NOT: !kernel_arg_name
+// ARGINFO: !kernel_arg_name ![[MD35:[0-9]+]]
 
 typedef unsigned int myunsignedint;
 kernel void foo4(__global unsigned int * X, __global myunsignedint * Y) {
 }
-// CHECK:  !{!"kernel_arg_addr_space", i32 1, i32 1}
-// CHECK:  !{!"kernel_arg_access_qual", !"none", !"none"}
-// CHECK:  !{!"kernel_arg_type", !"uint*", !"myunsignedint*"}
-// CHECK:  !{!"kernel_arg_base_type", !"uint*", !"uint*"}
-// CHECK:  !{!"kernel_arg_type_qual", !"", !""}
-// ARGINFO: !{!"kernel_arg_name", !"X", !"Y"}
-// NO-ARGINFO-NOT: !{!"kernel_arg_name", !"X", !"Y"}
+// CHECK: define spir_kernel void @foo4{{[^!]+}}
+// CHECK: !kernel_arg_addr_space ![[MD41:[0-9]+]]
+// CHECK: !kernel_arg_access_qual ![[MD42:[0-9]+]]
+// CHECK: !kernel_arg_type ![[MD43:[0-9]+]]
+// CHECK: !kernel_arg_base_type ![[MD44:[0-9]+]]
+// CHECK: !kernel_arg_type_qual ![[MD45:[0-9]+]]
+// CHECK-NOT: !kernel_arg_name
+// ARGINFO: !kernel_arg_name ![[MD46:[0-9]+]]
 
 typedef image1d_t myImage;
-kernel void foo5(read_only myImage img1, write_only image1d_t img2) {
+kernel void foo5(myImage img1, write_only image1d_t img2) {
 }
-// CHECK:  !{!"kernel_arg_access_qual", !"read_only", !"write_only"}
-// CHECK:  !{!"kernel_arg_type", !"myImage", !"image1d_t"}
-// CHECK:  !{!"kernel_arg_base_type", !"image1d_t", !"image1d_t"}
-// ARGINFO: !{!"kernel_arg_name", !"img1", !"img2"}
-// NO-ARGINFO-NOT: !{!"kernel_arg_name", !"img1", !"img2"}
+// CHECK: define spir_kernel void @foo5{{[^!]+}}
+// CHECK: !kernel_arg_addr_space ![[MD41:[0-9]+]]
+// CHECK: !kernel_arg_access_qual ![[MD51:[0-9]+]]
+// CHECK: !kernel_arg_type ![[MD52:[0-9]+]]
+// CHECK: !kernel_arg_base_type ![[MD53:[0-9]+]]
+// CHECK: !kernel_arg_type_qual ![[MD45]]
+// CHECK-NOT: !kernel_arg_name
+// ARGINFO: !kernel_arg_name ![[MD54:[0-9]+]]
+
+// CHECK: ![[MD11]] = !{i32 1, i32 0, i32 0, i32 2}
+// CHECK: ![[MD12]] = !{!"none", !"none", !"none", !"none"}
+// CHECK: ![[MD13]] = !{!"int*", !"int", !"int", !"float*"}
+// CHECK: ![[MD14]] = !{!"restrict", !"const", !"volatile", !"restrict const"}
+// ARGINFO: ![[MD15]] = !{!"X", !"Y", !"anotherArg", !"Z"}
+// CHECK: ![[MD21]] = !{i32 1, i32 1, i32 1}
+// CHECK: ![[MD22]] = !{!"read_only", !"read_only", !"write_only"}
+// CHECK: ![[MD23]] = !{!"__read_only image1d_t", !"__read_only image2d_t", !"__write_only image2d_array_t"}
+// CHECK: ![[MD24]] = !{!"", !"", !""}
+// ARGINFO: ![[MD25]] = !{!"img1", !"img2", !"img3"}
+// CHECK: ![[MD31]] = !{i32 1}
+// CHECK: ![[MD32]] = !{!"none"}
+// CHECK: ![[MD33]] = !{!"half*"}
+// CHECK: ![[MD34]] = !{!""}
+// ARGINFO: ![[MD35]] = !{!"X"}
+// CHECK: ![[MD41]] = !{i32 1, i32 1}
+// CHECK: ![[MD42]] = !{!"none", !"none"}
+// CHECK: ![[MD43]] = !{!"uint*", !"myunsignedint*"}
+// CHECK: ![[MD44]] = !{!"uint*", !"uint*"}
+// CHECK: ![[MD45]] = !{!"", !""}
+// ARGINFO: ![[MD46]] = !{!"X", !"Y"}
+// CHECK: ![[MD51]] = !{!"read_only", !"write_only"}
+// CHECK: ![[MD52]] = !{!"myImage", !"__write_only image1d_t"}
+// CHECK: ![[MD53]] = !{!"__read_only image1d_t", !"__write_only image1d_t"}
+// ARGINFO: ![[MD54]] = !{!"img1", !"img2"}
+
diff --git a/test/CodeGenOpenCL/kernel-attributes.cl b/test/CodeGenOpenCL/kernel-attributes.cl
index 8f22d611b820b..4a116dd085511 100644
--- a/test/CodeGenOpenCL/kernel-attributes.cl
+++ b/test/CodeGenOpenCL/kernel-attributes.cl
@@ -3,14 +3,12 @@
 typedef unsigned int uint4 __attribute__((ext_vector_type(4)));
 
 kernel  __attribute__((vec_type_hint(int))) __attribute__((reqd_work_group_size(1,2,4))) void kernel1(int a) {}
+// CHECK: define void @kernel1(i32 {{[^%]*}}%a) {{[^{]+}} !vec_type_hint ![[MD1:[0-9]+]] !reqd_work_group_size ![[MD2:[0-9]+]]
 
 kernel __attribute__((vec_type_hint(uint4))) __attribute__((work_group_size_hint(8,16,32))) void kernel2(int a) {}
+// CHECK: define void @kernel2(i32 {{[^%]*}}%a) {{[^{]+}} !vec_type_hint ![[MD3:[0-9]+]] !work_group_size_hint ![[MD4:[0-9]+]]
 
-// CHECK: opencl.kernels = !{[[MDNODE0:![0-9]+]], [[MDNODE3:![0-9]+]]}
-
-// CHECK: [[MDNODE0]] = !{void (i32)* @kernel1, {{.*}} [[MDNODE1:![0-9]+]], [[MDNODE2:![0-9]+]]}
-// CHECK: [[MDNODE1]] = !{!"vec_type_hint", i32 undef, i32 1}
-// CHECK: [[MDNODE2]] = !{!"reqd_work_group_size", i32 1, i32 2, i32 4}
-// CHECK: [[MDNODE3]] = !{void (i32)* @kernel2, {{.*}} [[MDNODE4:![0-9]+]], [[MDNODE5:![0-9]+]]}
-// CHECK: [[MDNODE4]] = !{!"vec_type_hint", <4 x i32> undef, i32 0}
-// CHECK: [[MDNODE5]] = !{!"work_group_size_hint", i32 8, i32 16, i32 32}
+// CHECK: [[MD1]] = !{i32 undef, i32 1}
+// CHECK: [[MD2]] = !{i32 1, i32 2, i32 4}
+// CHECK: [[MD3]] = !{<4 x i32> undef, i32 0}
+// CHECK: [[MD4]] = !{i32 8, i32 16, i32 32}
diff --git a/test/CodeGenOpenCL/kernel-metadata.cl b/test/CodeGenOpenCL/kernel-metadata.cl
index ef3758fccaa07..4165f1fa0ce58 100644
--- a/test/CodeGenOpenCL/kernel-metadata.cl
+++ b/test/CodeGenOpenCL/kernel-metadata.cl
@@ -6,10 +6,5 @@ void normal_function() {
 __kernel void kernel_function() {
 }
 
-// CHECK: !opencl.kernels = !{!0}
-// CHECK: !0 = !{void ()* @kernel_function, !1, !2, !3, !4, !5}
-// CHECK: !1 = !{!"kernel_arg_addr_space"}
-// CHECK: !2 = !{!"kernel_arg_access_qual"}
-// CHECK: !3 = !{!"kernel_arg_type"}
-// CHECK: !4 = !{!"kernel_arg_base_type"}
-// CHECK: !5 = !{!"kernel_arg_type_qual"}
+// CHECK: define void @kernel_function() {{[^{]+}} !kernel_arg_addr_space ![[MD:[0-9]+]] !kernel_arg_access_qual ![[MD]] !kernel_arg_type ![[MD]] !kernel_arg_base_type ![[MD]] !kernel_arg_type_qual ![[MD]] {
+// CHECK: ![[MD]] = !{}
diff --git a/test/CodeGenOpenCL/no-signed-zeros.cl b/test/CodeGenOpenCL/no-signed-zeros.cl
new file mode 100644
index 0000000000000..14f6411b35619
--- /dev/null
+++ b/test/CodeGenOpenCL/no-signed-zeros.cl
@@ -0,0 +1,10 @@
+// RUN: %clang_cc1 %s -emit-llvm -o - | FileCheck %s -check-prefix=NORMAL
+// RUN: %clang_cc1 %s -emit-llvm -cl-no-signed-zeros -o - | FileCheck %s -check-prefix=NO-SIGNED-ZEROS
+
+float signedzeros(float a) {
+  return a;
+}
+
+// CHECK: attributes
+// NORMAL: "no-signed-zeros-fp-math"="false"
+// NO-SIGNED-ZEROS: "no-signed-zeros-fp-math"="true"
diff --git a/test/CodeGenOpenCL/opencl_types.cl b/test/CodeGenOpenCL/opencl_types.cl
index 5f4ebb8a283e5..5691b27fbe526 100644
--- a/test/CodeGenOpenCL/opencl_types.cl
+++ b/test/CodeGenOpenCL/opencl_types.cl
@@ -4,37 +4,37 @@ constant sampler_t glb_smp = 7;
 // CHECK: constant i32 7
 
 void fnc1(image1d_t img) {}
-// CHECK: @fnc1(%opencl.image1d_t*
+// CHECK: @fnc1(%opencl.image1d_ro_t*
 
 void fnc1arr(image1d_array_t img) {}
-// CHECK: @fnc1arr(%opencl.image1d_array_t*
+// CHECK: @fnc1arr(%opencl.image1d_array_ro_t*
 
 void fnc1buff(image1d_buffer_t img) {}
-// CHECK: @fnc1buff(%opencl.image1d_buffer_t*
+// CHECK: @fnc1buff(%opencl.image1d_buffer_ro_t*
 
 void fnc2(image2d_t img) {}
-// CHECK: @fnc2(%opencl.image2d_t*
+// CHECK: @fnc2(%opencl.image2d_ro_t*
 
 void fnc2arr(image2d_array_t img) {}
-// CHECK: @fnc2arr(%opencl.image2d_array_t*
+// CHECK: @fnc2arr(%opencl.image2d_array_ro_t*
 
 void fnc3(image3d_t img) {}
-// CHECK: @fnc3(%opencl.image3d_t*
+// CHECK: @fnc3(%opencl.image3d_ro_t*
 
 void fnc4smp(sampler_t s) {}
 // CHECK-LABEL: define {{.*}}void @fnc4smp(i32
 
 kernel void foo(image1d_t img) {
-	sampler_t smp = 5;
-// CHECK: alloca i32
-	event_t evt;
-// CHECK: alloca %opencl.event_t*
-// CHECK: store i32 5,
+  sampler_t smp = 5;
+  // CHECK: alloca i32
+  event_t evt;
+  // CHECK: alloca %opencl.event_t*
+  // CHECK: store i32 5,
   fnc4smp(smp);
-// CHECK: call {{.*}}void @fnc4smp(i32
+  // CHECK: call {{.*}}void @fnc4smp(i32
   fnc4smp(glb_smp);
-// CHECK: call {{.*}}void @fnc4smp(i32
+  // CHECK: call {{.*}}void @fnc4smp(i32
 }
 
-void __attribute__((overloadable)) bad1(image1d_t *b, image2d_t *c, image2d_t *d) {}
-// CHECK-LABEL: @{{_Z4bad1P11ocl_image1dP11ocl_image2dS2_|"\\01\?bad1@@\$\$J0YAXPE?APAUocl_image1d@@PE?APAUocl_image2d@@1@Z"}}
+void __attribute__((overloadable)) bad1(image1d_t b, image2d_t c, image2d_t d) {}
+// CHECK-LABEL: @{{_Z4bad114ocl_image1d_ro14ocl_image2d_roS0_|"\\01\?bad1@@\$\$J0YAXPAUocl_image1d_ro@@PAUocl_image2d_ro@@1@Z"}}
diff --git a/test/CodeGenOpenCL/pipe_builtin.cl b/test/CodeGenOpenCL/pipe_builtin.cl
new file mode 100644
index 0000000000000..db6893ebf55d6
--- /dev/null
+++ b/test/CodeGenOpenCL/pipe_builtin.cl
@@ -0,0 +1,61 @@
+// RUN: %clang_cc1 -emit-llvm -O0 -cl-std=CL2.0 -o - %s | FileCheck %s
+
+// CHECK: %opencl.pipe_t = type opaque
+// CHECK: %opencl.reserve_id_t = type opaque
+
+void test1(read_only pipe int p, global int *ptr) {
+  // CHECK: call i32 @__read_pipe_2(%opencl.pipe_t* %{{.*}}, i8* %{{.*}})
+  read_pipe(p, ptr);
+  // CHECK: call %opencl.reserve_id_t* @__reserve_read_pipe(%opencl.pipe_t* %{{.*}}, i32 {{.*}})
+  reserve_id_t rid = reserve_read_pipe(p, 2);
+  // CHECK: call i32 @__read_pipe_4(%opencl.pipe_t* %{{.*}}, %opencl.reserve_id_t* %{{.*}}, i32 {{.*}}, i8* %{{.*}})
+  read_pipe(p, rid, 2, ptr);
+  // CHECK: call void @__commit_read_pipe(%opencl.pipe_t* %{{.*}}, %opencl.reserve_id_t* %{{.*}})
+  commit_read_pipe(p, rid);
+}
+
+void test2(write_only pipe int p, global int *ptr) {
+  // CHECK: call i32 @__write_pipe_2(%opencl.pipe_t* %{{.*}}, i8* %{{.*}})
+  write_pipe(p, ptr);
+  // CHECK: call %opencl.reserve_id_t* @__reserve_write_pipe(%opencl.pipe_t* %{{.*}}, i32 {{.*}})
+  reserve_id_t rid = reserve_write_pipe(p, 2);
+  // CHECK: call i32 @__write_pipe_4(%opencl.pipe_t* %{{.*}}, %opencl.reserve_id_t* %{{.*}}, i32 {{.*}}, i8* %{{.*}})
+  write_pipe(p, rid, 2, ptr);
+  // CHECK: call void @__commit_write_pipe(%opencl.pipe_t* %{{.*}}, %opencl.reserve_id_t* %{{.*}})
+  commit_write_pipe(p, rid);
+}
+
+void test3(read_only pipe int p, global int *ptr) {
+  // CHECK: call %opencl.reserve_id_t* @__work_group_reserve_read_pipe(%opencl.pipe_t* %{{.*}}, i32 {{.*}})
+  reserve_id_t rid = work_group_reserve_read_pipe(p, 2);
+  // CHECK: call void @__work_group_commit_read_pipe(%opencl.pipe_t* %{{.*}}, %opencl.reserve_id_t* %{{.*}})
+  work_group_commit_read_pipe(p, rid);
+}
+
+void test4(write_only pipe int p, global int *ptr) {
+  // CHECK: call %opencl.reserve_id_t* @__work_group_reserve_write_pipe(%opencl.pipe_t* %{{.*}}, i32 {{.*}})
+  reserve_id_t rid = work_group_reserve_write_pipe(p, 2);
+  // CHECK: call void @__work_group_commit_write_pipe(%opencl.pipe_t* %{{.*}}, %opencl.reserve_id_t* %{{.*}})
+  work_group_commit_write_pipe(p, rid);
+}
+
+void test5(read_only pipe int p, global int *ptr) {
+  // CHECK: call %opencl.reserve_id_t* @__sub_group_reserve_read_pipe(%opencl.pipe_t* %{{.*}}, i32 {{.*}})
+  reserve_id_t rid = sub_group_reserve_read_pipe(p, 2);
+  // CHECK: call void @__sub_group_commit_read_pipe(%opencl.pipe_t* %{{.*}}, %opencl.reserve_id_t* %{{.*}})
+  sub_group_commit_read_pipe(p, rid);
+}
+
+void test6(write_only pipe int p, global int *ptr) {
+  // CHECK: call %opencl.reserve_id_t* @__sub_group_reserve_write_pipe(%opencl.pipe_t* %{{.*}}, i32 {{.*}})
+  reserve_id_t rid = sub_group_reserve_write_pipe(p, 2);
+  // CHECK: call void @__sub_group_commit_write_pipe(%opencl.pipe_t* %{{.*}}, %opencl.reserve_id_t* %{{.*}})
+  sub_group_commit_write_pipe(p, rid);
+}
+
+void test7(write_only pipe int p, global int *ptr) {
+  // CHECK: call i32 @__get_pipe_num_packets(%opencl.pipe_t* %{{.*}})
+  *ptr = get_pipe_num_packets(p);
+  // CHECK: call i32 @__get_pipe_max_packets(%opencl.pipe_t* %{{.*}})
+  *ptr = get_pipe_max_packets(p);
+}
diff --git a/test/CodeGenOpenCL/pipe_types.cl b/test/CodeGenOpenCL/pipe_types.cl
index 9d8d5274a3ba5..b9c411b5fcb6a 100644
--- a/test/CodeGenOpenCL/pipe_types.cl
+++ b/test/CodeGenOpenCL/pipe_types.cl
@@ -25,3 +25,23 @@ void test4(read_only pipe uchar3 p) {
 void test5(read_only pipe int4 p) {
 // CHECK: define void @test5(%opencl.pipe_t* %p)
 }
+
+typedef read_only pipe int MyPipe;
+kernel void test6(MyPipe p) {
+// CHECK: define void @test6(%opencl.pipe_t* %p)
+}
+
+struct Person {
+  const char *Name;
+  bool isFemale;
+  int ID;
+};
+
+void test_reserved_read_pipe(global struct Person *SDst,
+                             read_only pipe struct Person SPipe) {
+// CHECK: define void @test_reserved_read_pipe
+  read_pipe (SPipe, SDst);
+  // CHECK: call i32 @__read_pipe_2(%opencl.pipe_t* %{{.*}}, i8* %{{.*}})
+  read_pipe (SPipe, SDst);
+  // CHECK: call i32 @__read_pipe_2(%opencl.pipe_t* %{{.*}}, i8* %{{.*}})
+}
diff --git a/test/CodeGenOpenCL/shifts.cl b/test/CodeGenOpenCL/shifts.cl
index ab64051a01ce8..14cd7aff65bec 100644
--- a/test/CodeGenOpenCL/shifts.cl
+++ b/test/CodeGenOpenCL/shifts.cl
@@ -5,7 +5,7 @@
 // bits before evaluating. Test this both for variables and constants
 // evaluated in the front-end.
 
-// OPT: @gtest1 = constant i64 2147483648
+// OPT: @gtest1 = local_unnamed_addr constant i64 2147483648
 __constant const unsigned long gtest1 = 1UL << 31;
 
 // NOOPT: @negativeShift32
diff --git a/test/CodeGenOpenCL/spir_version.cl b/test/CodeGenOpenCL/spir_version.cl
new file mode 100644
index 0000000000000..215b4d7a40238
--- /dev/null
+++ b/test/CodeGenOpenCL/spir_version.cl
@@ -0,0 +1,18 @@
+// RUN: %clang_cc1 %s -triple "spir-unknown-unknown" -emit-llvm -o - | FileCheck %s --check-prefix=CL10
+// RUN: %clang_cc1 %s -triple "spir-unknown-unknown" -emit-llvm -o - -cl-std=CL1.2 | FileCheck %s --check-prefix=CL12
+// RUN: %clang_cc1 %s -triple "spir-unknown-unknown" -emit-llvm -o - -cl-std=CL2.0 | FileCheck %s --check-prefix=CL20
+// RUN: %clang_cc1 %s -triple "spir64-unknown-unknown" -emit-llvm -o - | FileCheck %s --check-prefix=CL10
+// RUN: %clang_cc1 %s -triple "spir64-unknown-unknown" -emit-llvm -o - -cl-std=CL1.2 | FileCheck %s --check-prefix=CL12
+// RUN: %clang_cc1 %s -triple "spir64-unknown-unknown" -emit-llvm -o - -cl-std=CL2.0 | FileCheck %s --check-prefix=CL20
+kernel void foo() {}
+// CL10: !opencl.spir.version = !{[[SPIR:![0-9]+]]}
+// CL10: !opencl.ocl.version = !{[[OCL:![0-9]+]]}
+// CL10: [[SPIR]] = !{i32 2, i32 0}
+// CL10: [[OCL]] = !{i32 1, i32 0}
+// CL12: !opencl.spir.version = !{[[SPIR:![0-9]+]]}
+// CL12: !opencl.ocl.version = !{[[OCL:![0-9]+]]}
+// CL12: [[SPIR]] = !{i32 2, i32 0}
+// CL12: [[OCL]] = !{i32 1, i32 2}
+// CL20: !opencl.spir.version = !{[[SPIR:![0-9]+]]}
+// CL20: !opencl.ocl.version = !{[[SPIR:![0-9]+]]}
+// CL20: [[SPIR]] = !{i32 2, i32 0}
diff --git a/test/CodeGenOpenCL/str_literals.cl b/test/CodeGenOpenCL/str_literals.cl
index 092b6372a414d..1c0acd155dac3 100644
--- a/test/CodeGenOpenCL/str_literals.cl
+++ b/test/CodeGenOpenCL/str_literals.cl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 %s -emit-llvm -o - -ffake-address-space-map | FileCheck %s
+// RUN: %clang_cc1 %s -cl-opt-disable -emit-llvm -o - -ffake-address-space-map | FileCheck %s
 
 __constant char * __constant x = "hello world";
 __constant char * __constant y = "hello world";
diff --git a/test/CodeGenOpenCL/to_addr_builtin.cl b/test/CodeGenOpenCL/to_addr_builtin.cl
new file mode 100644
index 0000000000000..67475d565e3a1
--- /dev/null
+++ b/test/CodeGenOpenCL/to_addr_builtin.cl
@@ -0,0 +1,89 @@
+// RUN: %clang_cc1 -triple spir-unknown-unknown -emit-llvm -O0 -cl-std=CL2.0 -o - %s | FileCheck %s
+
+// CHECK: %[[A:.*]] = type { float, float, float }
+typedef struct {
+  float x,y,z;
+} A;
+typedef private A *PA;
+typedef global A *GA;
+
+void test(void) {
+  global int *glob;
+  local int *loc;
+  private int *priv;
+  generic int *gen;
+
+  //CHECK: %[[ARG:.*]] = addrspacecast i32 addrspace(1)* %{{.*}} to i8 addrspace(4)*
+  //CHECK: %[[RET:.*]] = call i8 addrspace(1)* @to_global(i8 addrspace(4)* %[[ARG]])
+  //CHECK: %{{.*}} = bitcast i8 addrspace(1)* %[[RET]] to i32 addrspace(1)*
+  glob = to_global(glob);
+  
+  //CHECK: %[[ARG:.*]] = addrspacecast i32 addrspace(3)* %{{.*}} to i8 addrspace(4)*
+  //CHECK: %[[RET:.*]] = call i8 addrspace(1)* @to_global(i8 addrspace(4)* %[[ARG]])
+  //CHECK: %{{.*}} = bitcast i8 addrspace(1)* %[[RET]] to i32 addrspace(1)*
+  glob = to_global(loc);
+ 
+  //CHECK: %[[ARG:.*]] = addrspacecast i32* %{{.*}} to i8 addrspace(4)*
+  //CHECK: %[[RET:.*]] = call i8 addrspace(1)* @to_global(i8 addrspace(4)* %[[ARG]])
+  //CHECK: %{{.*}} = bitcast i8 addrspace(1)* %[[RET]] to i32 addrspace(1)*
+  glob = to_global(priv);
+ 
+  //CHECK: %[[ARG:.*]] = bitcast i32 addrspace(4)* %{{.*}} to i8 addrspace(4)*
+  //CHECK: %[[RET:.*]] = call i8 addrspace(1)* @to_global(i8 addrspace(4)* %[[ARG]])
+  //CHECK: %{{.*}} = bitcast i8 addrspace(1)* %[[RET]] to i32 addrspace(1)*
+  glob = to_global(gen);
+  
+  //CHECK: %[[ARG:.*]] = addrspacecast i32 addrspace(1)* %{{.*}} to i8 addrspace(4)*
+  //CHECK: %[[RET:.*]] = call i8 addrspace(3)* @to_local(i8 addrspace(4)* %[[ARG]])
+  //CHECK: %{{.*}} = bitcast i8 addrspace(3)* %[[RET]] to i32 addrspace(3)*
+  loc = to_local(glob);
+
+  //CHECK: %[[ARG:.*]] = addrspacecast i32 addrspace(3)* %{{.*}} to i8 addrspace(4)*
+  //CHECK: %[[RET:.*]] = call i8 addrspace(3)* @to_local(i8 addrspace(4)* %[[ARG]])
+  //CHECK: %{{.*}} = bitcast i8 addrspace(3)* %[[RET]] to i32 addrspace(3)*
+  loc = to_local(loc);
+
+  //CHECK: %[[ARG:.*]] = addrspacecast i32* %{{.*}} to i8 addrspace(4)*
+  //CHECK: %[[RET:.*]] = call i8 addrspace(3)* @to_local(i8 addrspace(4)* %[[ARG]])
+  //CHECK: %{{.*}} = bitcast i8 addrspace(3)* %[[RET]] to i32 addrspace(3)*
+  loc = to_local(priv);
+
+  //CHECK: %[[ARG:.*]] = bitcast i32 addrspace(4)* %{{.*}} to i8 addrspace(4)*
+  //CHECK: %[[RET:.*]] = call i8 addrspace(3)* @to_local(i8 addrspace(4)* %[[ARG]])
+  //CHECK: %{{.*}} = bitcast i8 addrspace(3)* %[[RET]] to i32 addrspace(3)*
+  loc = to_local(gen);
+
+  //CHECK: %[[ARG:.*]] = addrspacecast i32 addrspace(1)* %{{.*}} to i8 addrspace(4)*
+  //CHECK: %[[RET:.*]] = call i8* @to_private(i8 addrspace(4)* %[[ARG]])
+  //CHECK: %{{.*}} = bitcast i8* %[[RET]] to i32*
+  priv = to_private(glob);
+
+  //CHECK: %[[ARG:.*]] = addrspacecast i32 addrspace(3)* %{{.*}} to i8 addrspace(4)*
+  //CHECK: %[[RET:.*]] = call i8* @to_private(i8 addrspace(4)* %[[ARG]])
+  //CHECK: %{{.*}} = bitcast i8* %[[RET]] to i32*
+  priv = to_private(loc);
+
+  //CHECK: %[[ARG:.*]] = addrspacecast i32* %{{.*}} to i8 addrspace(4)*
+  //CHECK: %[[RET:.*]] = call i8* @to_private(i8 addrspace(4)* %[[ARG]])
+  //CHECK: %{{.*}} = bitcast i8* %[[RET]] to i32*
+  priv = to_private(priv);
+
+  //CHECK: %[[ARG:.*]] = bitcast i32 addrspace(4)* %{{.*}} to i8 addrspace(4)*
+  //CHECK: %[[RET:.*]] = call i8* @to_private(i8 addrspace(4)* %[[ARG]])
+  //CHECK: %{{.*}} = bitcast i8* %[[RET]] to i32*
+  priv = to_private(gen);
+
+  //CHECK: %[[ARG:.*]] = addrspacecast %[[A]]* %{{.*}} to i8 addrspace(4)*
+  //CHECK: %[[RET:.*]] = call i8 addrspace(1)* @to_global(i8 addrspace(4)* %[[ARG]])
+  //CHECK: %{{.*}} = bitcast i8 addrspace(1)* %[[RET]] to %[[A]] addrspace(1)*
+  PA pA;
+  GA gA = to_global(pA);
+
+  //CHECK-NOT: addrspacecast
+  //CHECK-NOT: bitcast
+  //CHECK: call i8 addrspace(1)* @to_global(i8 addrspace(4)* %{{.*}})
+  //CHECK-NOT: addrspacecast
+  //CHECK-NOT: bitcast
+  generic void *gen_v;
+  global void *glob_v = to_global(gen_v);
+}
diff --git a/test/CodeGenOpenCL/unroll-hint.cl b/test/CodeGenOpenCL/unroll-hint.cl
new file mode 100644
index 0000000000000..a86762e02b548
--- /dev/null
+++ b/test/CodeGenOpenCL/unroll-hint.cl
@@ -0,0 +1,96 @@
+// RUN: %clang_cc1 -emit-llvm -O0 -cl-std=CL2.0 -o - %s | FileCheck %s
+
+/*** for ***/
+void for_count()
+{
+// CHECK-LABEL: for_count
+    __attribute__((opencl_unroll_hint(8)))
+    for( int i = 0; i < 1000; ++i);
+// CHECK: br label %{{.*}}, !llvm.loop ![[FOR_COUNT:.*]]
+}
+
+void for_disable()
+{
+// CHECK-LABEL: for_disable
+    __attribute__((opencl_unroll_hint(1)))
+    for( int i = 0; i < 1000; ++i);
+// CHECK: br label %{{.*}}, !llvm.loop ![[FOR_DISABLE:.*]]
+}
+
+void for_full()
+{
+// CHECK-LABEL: for_full
+    __attribute__((opencl_unroll_hint))
+    for( int i = 0; i < 1000; ++i);
+// CHECK: br label %{{.*}}, !llvm.loop ![[FOR_FULL:.*]]
+}
+
+/*** while ***/
+void while_count()
+{
+// CHECK-LABEL: while_count
+    int i = 1000;
+    __attribute__((opencl_unroll_hint(8)))
+    while(i-->0);
+// CHECK: br label %{{.*}}, !llvm.loop ![[WHILE_COUNT:.*]]
+}
+
+void while_disable()
+{
+// CHECK-LABEL: while_disable
+    int i = 1000;
+    __attribute__((opencl_unroll_hint(1)))
+    while(i-->0);
+// CHECK: br label %{{.*}}, !llvm.loop ![[WHILE_DISABLE:.*]]
+}
+
+void while_full()
+{
+// CHECK-LABEL: while_full
+    int i = 1000;
+    __attribute__((opencl_unroll_hint))
+    while(i-->0);
+// CHECK: br label %{{.*}}, !llvm.loop ![[WHILE_FULL:.*]]
+}
+
+/*** do ***/
+void do_count()
+{
+// CHECK-LABEL: do_count
+    int i = 1000;
+    __attribute__((opencl_unroll_hint(8)))
+    do {} while(i--> 0);
+// CHECK: br i1 %{{.*}}, label %{{.*}}, label %{{.*}}, !llvm.loop ![[DO_COUNT:.*]]
+}
+
+void do_disable()
+{
+// CHECK-LABEL: do_disable
+    int i = 1000;
+    __attribute__((opencl_unroll_hint(1)))
+    do {} while(i--> 0);
+// CHECK: br i1 %{{.*}}, label %{{.*}}, label %{{.*}}, !llvm.loop ![[DO_DISABLE:.*]]
+}
+
+void do_full()
+{
+// CHECK-LABEL: do_full
+    int i = 1000;
+    __attribute__((opencl_unroll_hint))
+    do {} while(i--> 0);
+// CHECK: br i1 %{{.*}}, label %{{.*}}, label %{{.*}}, !llvm.loop ![[DO_FULL:.*]]
+}
+
+
+// CHECK: ![[FOR_COUNT]]     =  distinct !{![[FOR_COUNT]],  ![[COUNT:.*]]}
+// CHECK: ![[COUNT]]         =  !{!"llvm.loop.unroll.count", i32 8}
+// CHECK: ![[FOR_DISABLE]]   =  distinct !{![[FOR_DISABLE]],  ![[DISABLE:.*]]}
+// CHECK: ![[DISABLE]]       =  !{!"llvm.loop.unroll.disable"}
+// CHECK: ![[FOR_FULL]]      =  distinct !{![[FOR_FULL]],  ![[FULL:.*]]}
+// CHECK: ![[FULL]]          =  !{!"llvm.loop.unroll.full"}
+// CHECK: ![[WHILE_COUNT]]   =  distinct !{![[WHILE_COUNT]],    ![[COUNT]]}
+// CHECK: ![[WHILE_DISABLE]] =  distinct !{![[WHILE_DISABLE]],  ![[DISABLE]]}
+// CHECK: ![[WHILE_FULL]]    =  distinct !{![[WHILE_FULL]],     ![[FULL]]}
+// CHECK: ![[DO_COUNT]]      =  distinct !{![[DO_COUNT]],       ![[COUNT]]}
+// CHECK: ![[DO_DISABLE]]    =  distinct !{![[DO_DISABLE]],     ![[DISABLE]]}
+// CHECK: ![[DO_FULL]]       =  distinct !{![[DO_FULL]],        ![[FULL]]}
diff --git a/test/CodeGenOpenCL/vla.cl b/test/CodeGenOpenCL/vla.cl
new file mode 100644
index 0000000000000..cbf98445ce000
--- /dev/null
+++ b/test/CodeGenOpenCL/vla.cl
@@ -0,0 +1,18 @@
+// RUN: %clang_cc1 -emit-llvm -triple "spir-unknown-unknown" -O0 -cl-std=CL2.0 -o - %s | FileCheck %s
+
+constant int sz0 = 5;
+// CHECK: @sz0 = addrspace(2) constant i32 5
+const global int sz1 = 16;
+// CHECK: @sz1 = addrspace(1) constant i32 16
+const constant int sz2 = 8;
+// CHECK: @sz2 = addrspace(2) constant i32 8
+// CHECK: @testvla.vla2 = internal addrspace(3) global [8 x i16] undef
+
+kernel void testvla()
+{
+  int vla0[sz0];
+// CHECK: %vla0 = alloca [5 x i32]
+  char vla1[sz1];
+// CHECK: %vla1 = alloca [16 x i8]
+  local short vla2[sz2];
+}
diff --git a/test/CoverageMapping/Inputs/ends_a_scope_only b/test/CoverageMapping/Inputs/ends_a_scope_only
new file mode 100644
index 0000000000000..5c34318c2147f
--- /dev/null
+++ b/test/CoverageMapping/Inputs/ends_a_scope_only
@@ -0,0 +1 @@
+}
diff --git a/test/CoverageMapping/Inputs/macros.h b/test/CoverageMapping/Inputs/macros.h
new file mode 100644
index 0000000000000..0a2dadd1b81a3
--- /dev/null
+++ b/test/CoverageMapping/Inputs/macros.h
@@ -0,0 +1,13 @@
+// Assorted macros to help test #include behavior across file boundaries.
+
+#define helper1 0
+
+void helper2(const char *, ...);
+
+#define M1(a, ...) helper2(a, ##__VA_ARGS__);
+
+// Note: M2 stresses vararg macro functions with macro arguments. The spelling
+// locations of the args used to be set to the expansion site, leading to
+// crashes (region LineEnd < LineStart). The regression test requires M2's line
+// number to be greater than the line number containing the expansion.
+#define M2(a, ...) M1(a, helper1, ##__VA_ARGS__);
diff --git a/test/CoverageMapping/Inputs/starts_a_scope_only b/test/CoverageMapping/Inputs/starts_a_scope_only
new file mode 100644
index 0000000000000..98232c64fce93
--- /dev/null
+++ b/test/CoverageMapping/Inputs/starts_a_scope_only
@@ -0,0 +1 @@
+{
diff --git a/test/CoverageMapping/block-storage-starts-region.m b/test/CoverageMapping/block-storage-starts-region.m
index 7997c8d4d2972..7e25438fc0e09 100644
--- a/test/CoverageMapping/block-storage-starts-region.m
+++ b/test/CoverageMapping/block-storage-starts-region.m
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -triple x86_64-apple-darwin -fobjc-runtime=macosx-10.10.0 -fblocks -fobjc-arc %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -triple x86_64-apple-darwin -fobjc-runtime=macosx-10.10.0 -fblocks -fobjc-arc %s | FileCheck %s
 
 @interface Foo
 @end
diff --git a/test/CoverageMapping/break.c b/test/CoverageMapping/break.c
index 99439c8bc69d7..ee41271b53fbd 100644
--- a/test/CoverageMapping/break.c
+++ b/test/CoverageMapping/break.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name break.c %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name break.c %s | FileCheck %s
 
 int main() {         // CHECK: File 0, [[@LINE]]:12 -> {{[0-9]+}}:2 = #0
   int cnt = 0;       // CHECK-NEXT: File 0, [[@LINE+1]]:9 -> [[@LINE+1]]:18 = #0
diff --git a/test/CoverageMapping/builtinmacro.c b/test/CoverageMapping/builtinmacro.c
index 80b2672a67365..63f5584d40c2c 100644
--- a/test/CoverageMapping/builtinmacro.c
+++ b/test/CoverageMapping/builtinmacro.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name builtinmacro.c %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name builtinmacro.c %s | FileCheck %s
 
 // Test the coverage mapping generation for built-in macroes.
 
diff --git a/test/CoverageMapping/casts.c b/test/CoverageMapping/casts.c
index 95289f6a184b5..d295f3159875a 100644
--- a/test/CoverageMapping/casts.c
+++ b/test/CoverageMapping/casts.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name casts.c %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name casts.c %s | FileCheck %s
 
 int main() {                                                   // CHECK: File 0, [[@LINE]]:12 -> [[@LINE+4]]:2 = #0
                                                                // CHECK-NEXT: File 0, [[@LINE+1]]:41 -> [[@LINE+1]]:54 = #1
diff --git a/test/CoverageMapping/classtemplate.cpp b/test/CoverageMapping/classtemplate.cpp
index 2e0b50772da7c..0ccdcb2431b5d 100644
--- a/test/CoverageMapping/classtemplate.cpp
+++ b/test/CoverageMapping/classtemplate.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple %itanium_abi_triple -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name classtemplate.cpp %s > %tmapping
+// RUN: %clang_cc1 -triple %itanium_abi_triple -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name classtemplate.cpp %s > %tmapping
 // RUN: FileCheck -input-file %tmapping %s --check-prefix=CHECK-CONSTRUCTOR
 // RUN: FileCheck -input-file %tmapping %s --check-prefix=CHECK-GETTER
 // RUN: FileCheck -input-file %tmapping %s --check-prefix=CHECK-SETTER
diff --git a/test/CoverageMapping/comment-in-macro.c b/test/CoverageMapping/comment-in-macro.c
index ecc883f68ec67..06e8adbc41ee2 100644
--- a/test/CoverageMapping/comment-in-macro.c
+++ b/test/CoverageMapping/comment-in-macro.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only %s | FileCheck %s
 
 #define x1 "" // ...
 #define x2 return 0
diff --git a/test/CoverageMapping/continue.c b/test/CoverageMapping/continue.c
index c86651e221d36..7ea03fb68624d 100644
--- a/test/CoverageMapping/continue.c
+++ b/test/CoverageMapping/continue.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name continue.c %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name continue.c %s | FileCheck %s
 
 int main() {                    // CHECK: File 0, [[@LINE]]:12 -> [[@LINE+21]]:2 = #0
   int j = 0;                    // CHECK-NEXT: File 0, [[@LINE+2]]:18 -> [[@LINE+2]]:24 = (#0 + #1)
diff --git a/test/CoverageMapping/control-flow-macro.c b/test/CoverageMapping/control-flow-macro.c
index 149cb5572cb08..8508e53f333f8 100644
--- a/test/CoverageMapping/control-flow-macro.c
+++ b/test/CoverageMapping/control-flow-macro.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only %s | FileCheck %s
 
 #define ifc if
 
diff --git a/test/CoverageMapping/decl.c b/test/CoverageMapping/decl.c
index 96ee30357a843..e4770288d987c 100644
--- a/test/CoverageMapping/decl.c
+++ b/test/CoverageMapping/decl.c
@@ -1,6 +1,6 @@
 // Ensure that declarations without definitions don't have maps emitted for them
 
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only %s > %t
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only %s > %t
 // FileCheck -input-file %t %s
 // RUN: FileCheck -check-prefix BAR -input-file %t %s
 
diff --git a/test/CoverageMapping/header.cpp b/test/CoverageMapping/header.cpp
index e495d5a6285cb..5e0b3111c1d37 100644
--- a/test/CoverageMapping/header.cpp
+++ b/test/CoverageMapping/header.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name header.cpp %s > %tmapping
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name header.cpp %s > %tmapping
 // RUN: FileCheck -input-file %tmapping %s --check-prefix=CHECK-FUNC
 // RUN: FileCheck -input-file %tmapping %s --check-prefix=CHECK-STATIC-FUNC
 // RUN: FileCheck -input-file %tmapping %s --check-prefix=CHECK-STATIC-FUNC2
diff --git a/test/CoverageMapping/if.c b/test/CoverageMapping/if.c
index 73b230871675c..69544f68cfe23 100644
--- a/test/CoverageMapping/if.c
+++ b/test/CoverageMapping/if.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name if.c %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name if.c %s | FileCheck %s
 
 int main() {                    // CHECK: File 0, [[@LINE]]:12 -> {{[0-9]+}}:2 = #0
   int i = 0;
diff --git a/test/CoverageMapping/implicit-def-in-macro.m b/test/CoverageMapping/implicit-def-in-macro.m
index 7e563acc54d13..902fc8bd6a774 100644
--- a/test/CoverageMapping/implicit-def-in-macro.m
+++ b/test/CoverageMapping/implicit-def-in-macro.m
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -triple x86_64-apple-darwin -fobjc-runtime=macosx-10.10.0 -fblocks -fobjc-arc %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -triple x86_64-apple-darwin -fobjc-runtime=macosx-10.10.0 -fblocks -fobjc-arc %s | FileCheck %s
 
 @interface Foo
 @end
diff --git a/test/CoverageMapping/include-macros.c b/test/CoverageMapping/include-macros.c
new file mode 100644
index 0000000000000..113721c21a5bd
--- /dev/null
+++ b/test/CoverageMapping/include-macros.c
@@ -0,0 +1,18 @@
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name include-macros.c %s | FileCheck %s
+
+#include "Inputs/macros.h"
+
+void f1() {
+  M2("a", "b");
+}
+
+// CHECK-LABEL: f1:
+// CHECK-NEXT:   File 0, 5:11 -> 7:2 = #0
+// CHECK-NEXT:   Expansion,File 0, 6:3 -> 6:5 = #0 (Expanded file = 1)
+// CHECK-NEXT:   File 1, 13:20 -> 13:50 = #0
+// CHECK-NEXT:   Expansion,File 1, 13:20 -> 13:22 = #0 (Expanded file = 2)
+// CHECK-NEXT:   File 2, 7:20 -> 7:46 = #0
+// CHECK-NEXT:   Expansion,File 2, 7:33 -> 7:44 = #0 (Expanded file = 3)
+// CHECK-NEXT:   File 3, 13:26 -> 13:34 = #0
+// CHECK-NEXT:   Expansion,File 3, 13:26 -> 13:33 = #0 (Expanded file = 4)
+// CHECK-NEXT:   File 4, 3:17 -> 3:18 = #0
diff --git a/test/CoverageMapping/includehell.cpp b/test/CoverageMapping/includehell.cpp
index 5a9ff78386d69..9ad3683abe112 100644
--- a/test/CoverageMapping/includehell.cpp
+++ b/test/CoverageMapping/includehell.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name includehell.cpp %s > %tmapping
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name includehell.cpp %s > %tmapping
 
 int main() {
   int x = 0;
diff --git a/test/CoverageMapping/ir.c b/test/CoverageMapping/ir.c
index f94d34c0b98c8..469b2992ecd7b 100644
--- a/test/CoverageMapping/ir.c
+++ b/test/CoverageMapping/ir.c
@@ -1,5 +1,5 @@
 // Check the data structures emitted by coverage mapping
-// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name ir.c %s -o - -emit-llvm -fprofile-instr-generate -fcoverage-mapping | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name ir.c %s -o - -emit-llvm -fprofile-instrument=clang -fcoverage-mapping | FileCheck %s
 
 
 void foo(void) { }
@@ -9,4 +9,4 @@ int main(void) {
   return 0;
 }
 
-// CHECK: @__llvm_coverage_mapping = internal constant { { i32, i32, i32, i32 }, [2 x <{ i8*, i32, i32, i64 }>], [{{[0-9]+}} x i8] } { { i32, i32, i32, i32 } { i32 2, i32 {{[0-9]+}}, i32 {{[0-9]+}}, i32 0 }, [2 x <{ i8*, i32, i32, i64 }>] [<{ i8*, i32, i32, i64 }> <{ i8* getelementptr inbounds ([3 x i8], [3 x i8]* @__profn_foo, i32 0, i32 0), i32 3, i32 9, i64 {{[0-9]+}} }>, <{ i8*, i32, i32, i64 }> <{ i8* getelementptr inbounds ([4 x i8], [4 x i8]* @__profn_main, i32 0, i32 0), i32 4, i32 9, i64 {{[0-9]+}} }>]
+// CHECK: @__llvm_coverage_mapping = internal constant { { i32, i32, i32, i32 }, [2 x <{ i64, i32, i64 }>], [{{[0-9]+}} x i8] } { { i32, i32, i32, i32 } { i32 2, i32 {{[0-9]+}}, i32 {{[0-9]+}}, i32 {{[0-9]+}} }, [2 x <{ i64, i32, i64 }>] [<{{.*}}> <{{.*}}>, <{{.*}}> <{{.*}}>]
diff --git a/test/CoverageMapping/label.cpp b/test/CoverageMapping/label.cpp
index 52618f7c2e9ef..1c5111a675a18 100644
--- a/test/CoverageMapping/label.cpp
+++ b/test/CoverageMapping/label.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name label.cpp %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name label.cpp %s | FileCheck %s
 
                              // CHECK: func
 void func() {                // CHECK-NEXT: File 0, [[@LINE]]:13 -> {{[0-9]+}}:2 = #0
diff --git a/test/CoverageMapping/lambda.cpp b/test/CoverageMapping/lambda.cpp
index fb018e6b477c5..4f23c157e3859 100644
--- a/test/CoverageMapping/lambda.cpp
+++ b/test/CoverageMapping/lambda.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -x c++ -std=c++11 -triple %itanium_abi_triple -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only %s -main-file-name lambda.cpp | FileCheck %s
+// RUN: %clang_cc1 -x c++ -std=c++11 -triple %itanium_abi_triple -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only %s -main-file-name lambda.cpp | FileCheck %s
 
 // CHECK-LABEL: _Z3fooi:
 void foo(int i) { // CHECK: File 0, [[@LINE]]:17 -> {{[0-9]+}}:2 = #0
diff --git a/test/CoverageMapping/logical.cpp b/test/CoverageMapping/logical.cpp
index ece3102fc22e0..198cc60d99b6f 100644
--- a/test/CoverageMapping/logical.cpp
+++ b/test/CoverageMapping/logical.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name logical.cpp %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name logical.cpp %s | FileCheck %s
 
 int main() {                        // CHECK: File 0, [[@LINE]]:12 -> [[@LINE+10]]:2 = #0
   bool bt = true;
diff --git a/test/CoverageMapping/loopmacro.c b/test/CoverageMapping/loopmacro.c
index bbd0c45c11f14..cffeca087c536 100644
--- a/test/CoverageMapping/loopmacro.c
+++ b/test/CoverageMapping/loopmacro.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name loopmacro.c %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name loopmacro.c %s | FileCheck %s
 
 // CHECK: main
 // CHECK-NEXT: File 0, {{[0-9]+}}:12 -> {{[0-9]+}}:2 = #0
diff --git a/test/CoverageMapping/loops.cpp b/test/CoverageMapping/loops.cpp
index 84a9892526ce4..cb7d777f86b69 100644
--- a/test/CoverageMapping/loops.cpp
+++ b/test/CoverageMapping/loops.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -std=c++11 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name loops.cpp %s | FileCheck %s
+// RUN: %clang_cc1 -std=c++11 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name loops.cpp %s | FileCheck %s
 
                                     // CHECK: rangedFor
 void rangedFor() {                  // CHECK-NEXT: File 0, [[@LINE]]:18 -> {{[0-9]+}}:2 = #0
diff --git a/test/CoverageMapping/macro-expansion.c b/test/CoverageMapping/macro-expansion.c
index e87f444832948..3fca97584aa89 100644
--- a/test/CoverageMapping/macro-expansion.c
+++ b/test/CoverageMapping/macro-expansion.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name macro-expansion.c %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name macro-expansion.c %s | FileCheck %s
 
 // CHECK: func
 // CHECK:      File 1, [[@LINE+5]]:12 -> [[@LINE+5]]:38 = #0
diff --git a/test/CoverageMapping/macro-expressions.cpp b/test/CoverageMapping/macro-expressions.cpp
index 1085ab02492fb..3852fc6a23b7d 100644
--- a/test/CoverageMapping/macro-expressions.cpp
+++ b/test/CoverageMapping/macro-expressions.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -std=c++11 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name macro-expressions.cpp %s | FileCheck %s
+// RUN: %clang_cc1 -std=c++11 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name macro-expressions.cpp %s | FileCheck %s
 
 #define EXPR(x) (x)
 #define NEXPR(x) (!x)
@@ -12,6 +12,44 @@
 #define PRIo64 PRI_64_LENGTH_MODIFIER "o"
 #define PRIu64 PRI_64_LENGTH_MODIFIER "u"
 
+#define STMT(s) s
+
+void fn1() {
+  STMT(if (1));
+  STMT(while (1));
+  STMT(for (;;));
+  STMT(if) (1);
+  STMT(while) (1);
+  STMT(for) (;;);
+  if (1)
+    STMT(if (1)
+        STMT(if (1)));
+  if (1)
+    STMT(if (1)) 0;
+  if (1)
+    STMT(while (1)) 0;
+  if (1)
+    STMT(for (;;)) 0;
+  while (1)
+    STMT(if (1)) 0;
+  while (1)
+    STMT(while (1)) 0;
+  while (1)
+    STMT(for (;;)) 0;
+  for (;;)
+    STMT(if (1)) 0;
+  for (;;)
+    STMT(while (1)) 0;
+  for (;;)
+    STMT(for (;;)) 0;
+}
+
+void STMT(fn2()) {
+}
+
+void STMT(fn3)() {
+}
+
 // CHECK: foo
 // CHECK-NEXT: File 0, [[@LINE+1]]:17 -> {{[0-9]+}}:2 = #0
 void foo(int i) {
diff --git a/test/CoverageMapping/macroception.c b/test/CoverageMapping/macroception.c
index bde38ff773217..7848741641873 100644
--- a/test/CoverageMapping/macroception.c
+++ b/test/CoverageMapping/macroception.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name macroception.c %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name macroception.c %s | FileCheck %s
 
 #define M2 {
 #define M1 M2
diff --git a/test/CoverageMapping/macroparams.c b/test/CoverageMapping/macroparams.c
index d2c8e55e6a7f7..efffc77a07bfb 100644
--- a/test/CoverageMapping/macroparams.c
+++ b/test/CoverageMapping/macroparams.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name macroparams.c %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name macroparams.c %s | FileCheck %s
 
 // CHECK: main
 // CHECK-NEXT: File 0, {{[0-9]+}}:12 -> {{[0-9]+}}:2 = #0
diff --git a/test/CoverageMapping/macroparams2.c b/test/CoverageMapping/macroparams2.c
index fc156de75582d..4e04581b725e8 100644
--- a/test/CoverageMapping/macroparams2.c
+++ b/test/CoverageMapping/macroparams2.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name macroparams2.c %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name macroparams2.c %s | FileCheck %s
 
 #define MACRO(REFS, CALLS)  (4 * (CALLS) < (REFS))
 
diff --git a/test/CoverageMapping/macros.c b/test/CoverageMapping/macros.c
index 02ecceba63ff5..f6339614397b6 100644
--- a/test/CoverageMapping/macros.c
+++ b/test/CoverageMapping/macros.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name macros.c %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name macros.c %s | FileCheck %s
 
 #define MACRO return; bar()
 #define MACRO_2 bar()
diff --git a/test/CoverageMapping/macroscopes.cpp b/test/CoverageMapping/macroscopes.cpp
index a3a9297968788..f5fd55c731779 100644
--- a/test/CoverageMapping/macroscopes.cpp
+++ b/test/CoverageMapping/macroscopes.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name macroscopes.cpp %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name macroscopes.cpp %s | FileCheck %s
 
 #define starts_a_scope for (int i = 0; i < 2; ++i) {
 
@@ -22,6 +22,17 @@
 #define starts_a_while while (x < 5)
 #define simple_stmt ++x
 
+#define macro_with_for          \
+  x = 3;                        \
+  for (int i = 0; i < x; ++i) { \
+  }
+
+#define macro_with_while \
+  x = 4;                 \
+  while (x < 5) {        \
+    ++x;                 \
+  }
+
 // CHECK: main
 // CHECK-NEXT: File 0, [[@LINE+1]]:12 -> {{[0-9]+}}:2 = #0
 int main() {
@@ -64,6 +75,11 @@ int main() {
     simple_stmt;
   ends_a_scope
 
+  // CHECK-NEXT: Expansion,File 0, [[@LINE+1]]:3 -> [[@LINE+1]]:17 = #0
+  macro_with_for
+  // CHECK-NEXT: Expansion,File 0, [[@LINE+1]]:3 -> [[@LINE+1]]:19 = #0
+  macro_with_while
+
   return 0;
 }
 
@@ -103,3 +119,10 @@ int main() {
 // CHECK-NEXT: File 11, 22:31 -> 22:36 = (#0 + #9)
 // CHECK-NEXT: File 12, 23:21 -> 23:24 = #9
 // CHECK-NEXT: File 13, 6:3 -> 7:4 = #9
+// CHECK-NEXT: File 14, 26:3 -> 28:4 = #0
+// CHECK-NEXT: File 14, 27:19 -> 27:24 = (#0 + #10)
+// CHECK-NEXT: File 14, 27:26 -> 27:29 = #10
+// CHECK-NEXT: File 14, 27:31 -> 28:4 = #10
+// CHECK-NEXT: File 15, 31:3 -> 34:4 = #0
+// CHECK-NEXT: File 15, 32:10 -> 32:15 = (#0 + #11)
+// CHECK-NEXT: File 15, 32:17 -> 34:4 = #11
diff --git a/test/CoverageMapping/md.cpp b/test/CoverageMapping/md.cpp
index fff0df3526517..20c696c7dfbba 100644
--- a/test/CoverageMapping/md.cpp
+++ b/test/CoverageMapping/md.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -std=c++11 %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -std=c++11 %s | FileCheck %s
 
 #define BREAK break
 
diff --git a/test/CoverageMapping/moremacros.c b/test/CoverageMapping/moremacros.c
index d4a8f87a481af..56662270d073b 100644
--- a/test/CoverageMapping/moremacros.c
+++ b/test/CoverageMapping/moremacros.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name macro-expansion.c %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name macro-expansion.c %s | FileCheck %s
 
 #define LBRAC {
 #define RBRAC }
diff --git a/test/CoverageMapping/nestedclass.cpp b/test/CoverageMapping/nestedclass.cpp
index be4e0ba0b261d..6cbddebe9fcd5 100644
--- a/test/CoverageMapping/nestedclass.cpp
+++ b/test/CoverageMapping/nestedclass.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name nestedclass.cpp %s > %tmapping
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name nestedclass.cpp %s > %tmapping
 // RUN: FileCheck -input-file %tmapping %s --check-prefix=CHECK-OUTER
 // RUN: FileCheck -input-file %tmapping %s --check-prefix=CHECK-INNER
 // RUN: FileCheck -input-file %tmapping %s --check-prefix=CHECK-INNERMOST
diff --git a/test/CoverageMapping/objc.m b/test/CoverageMapping/objc.m
index 8456dc3842b21..89da5da3203ed 100644
--- a/test/CoverageMapping/objc.m
+++ b/test/CoverageMapping/objc.m
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name objc.m -triple x86_64-apple-darwin -fobjc-runtime=macosx-fragile-10.5 %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name objc.m -triple x86_64-apple-darwin -fobjc-runtime=macosx-fragile-10.5 %s | FileCheck %s
 
 @interface A
 - (void)bork:(int)msg;
diff --git a/test/CoverageMapping/preprocessor.c b/test/CoverageMapping/preprocessor.c
index cdd448cd2090a..bd82b3939ed50 100644
--- a/test/CoverageMapping/preprocessor.c
+++ b/test/CoverageMapping/preprocessor.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name preprocessor.c %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name preprocessor.c %s | FileCheck %s
 
                  // CHECK: func
 void func() {    // CHECK: File 0, [[@LINE]]:13 -> [[@LINE+5]]:2 = #0
diff --git a/test/CoverageMapping/return.c b/test/CoverageMapping/return.c
index ab63c2c28637a..1b190b0eb7333 100644
--- a/test/CoverageMapping/return.c
+++ b/test/CoverageMapping/return.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name return.c %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name return.c %s | FileCheck %s
 
                                 // CHECK: func
 void func() {                   // CHECK: File 0, [[@LINE]]:13 -> [[@LINE+3]]:2 = #0
diff --git a/test/CoverageMapping/switch.c b/test/CoverageMapping/switch.c
index 3c0b0323f69ed..6aa2b31426f1e 100644
--- a/test/CoverageMapping/switch.c
+++ b/test/CoverageMapping/switch.c
@@ -1,44 +1,44 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name switch.c %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name switch.c %s | FileCheck %s
                     // CHECK: foo
 void foo(int i) {   // CHECK-NEXT: File 0, [[@LINE]]:17 -> [[@LINE+8]]:2 = #0
-  switch(i) {
+  switch(i) {       // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+5]]:4 = #1
   case 1:           // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+3]]:10 = #2
     return;
   case 2:           // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+1]]:10 = #3
     break;
   }
-  int x = 0;        // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+1]]:2 = #1
+  int x = 0;
 }
 
 void nop() {}
 
                     // CHECK: bar
 void bar(int i) {   // CHECK-NEXT: File 0, [[@LINE]]:17 -> [[@LINE+20]]:2 = #0
-  switch (i)
+  switch (i)        // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+1]]:6 = #1
     ;               // CHECK-NEXT: File 0, [[@LINE]]:5 -> [[@LINE]]:6 = 0
 
-  switch (i) {      // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+16]]:2 = #1
+  switch (i) {      // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+1]]:4 = #2
   }
 
-  switch (i)        // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+13]]:2 = #2
+  switch (i)        // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+1]]:10 = #3
     nop();          // CHECK-NEXT: File 0, [[@LINE]]:5 -> [[@LINE]]:10 = 0
 
-  switch (i)        // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+10]]:2 = #3
+  switch (i)        // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+2]]:10 = #4
   case 1:           // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+1]]:10 = #5
     nop();
 
-  switch (i) {      // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+6]]:2 = #4
+  switch (i) {      // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+4]]:4 = #6
     nop();          // CHECK-NEXT: File 0, [[@LINE]]:5 -> [[@LINE+2]]:10 = 0
   case 1:           // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+1]]:10 = #7
     nop();
   }
-  nop();            // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+1]]:2 = #6
+  nop();
 }
 
                     // CHECK-NEXT: main
 int main() {        // CHECK-NEXT: File 0, [[@LINE]]:12 -> [[@LINE+34]]:2 = #0
   int i = 0;
-  switch(i) {
+  switch(i) {       // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+9]]:4 = #1
   case 0:           // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+7]]:10 = #2
     i = 1;
     break;
@@ -48,7 +48,7 @@ int main() {        // CHECK-NEXT: File 0, [[@LINE]]:12 -> [[@LINE+34]]:2 = #0
   default:          // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+1]]:10 = #4
     break;
   }
-  switch(i) {       // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+22]]:2 = #1
+  switch(i) {       // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+8]]:4 = #5
   case 0:           // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+6]]:10 = #6
     i = 1;
     break;
@@ -58,7 +58,7 @@ int main() {        // CHECK-NEXT: File 0, [[@LINE]]:12 -> [[@LINE+34]]:2 = #0
     break;
   }
 
-  switch(i) {       // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+12]]:2 = #5
+  switch(i) {       // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+7]]:4 = #9
   case 1:           // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+5]]:11 = #10
   case 2:           // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+4]]:11 = (#10 + #11)
     i = 11;
@@ -67,7 +67,7 @@ int main() {        // CHECK-NEXT: File 0, [[@LINE]]:12 -> [[@LINE+34]]:2 = #0
     i = 99;
   }
 
-  foo(1);           // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+2]]:11 = #9
+  foo(1);
   bar(1);
   return 0;
 }
diff --git a/test/CoverageMapping/switchmacro.c b/test/CoverageMapping/switchmacro.c
index 431d5c72ea52e..f83d26fd1688a 100644
--- a/test/CoverageMapping/switchmacro.c
+++ b/test/CoverageMapping/switchmacro.c
@@ -1,14 +1,14 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name switchmacro.c %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name switchmacro.c %s | FileCheck %s
 
 #define FOO(x) (void)x
 
 // CHECK: foo
 int foo(int i) { // CHECK-NEXT: File 0, [[@LINE]]:16 -> {{[0-9]+}}:2 = #0
-  switch (i) {
+  switch (i) {   // CHECK-NEXT: File 0, [[@LINE]]:3 -> {{[0-9]+}}:4 = #1
   default:       // CHECK-NEXT: File 0, [[@LINE]]:3 -> {{[0-9]+}}:11 = #2
     if (i == 1)  // CHECK-NEXT: File 0, [[@LINE]]:9 -> [[@LINE]]:15 = #2
       return 0;  // CHECK-NEXT: File 0, [[@LINE]]:7 -> [[@LINE]]:15 = #3
-    // CHECK-NEXT: Expansion,File 0, [[@LINE+2]]:5 -> [[@LINE+2]]:8 = (#2 - #3)
+    // CHECK-NEXT: Expansion,File 0, [[@LINE+2]]:5 -> [[@LINE+2]]:8 = (#2 - #3) (Expanded file = 1)
     // CHECK-NEXT: File 0, [[@LINE+1]]:8 -> {{[0-9]+}}:11 = (#2 - #3)
     FOO(1);
   case 0:        // CHECK-NEXT: File 0, [[@LINE]]:3 -> [[@LINE+1]]:13 = ((#2 + #4) - #3)
@@ -22,6 +22,24 @@ int foo(int i) { // CHECK-NEXT: File 0, [[@LINE]]:16 -> {{[0-9]+}}:2 = #0
   }
 }
 
+// PR26825 - Crash when exiting macro expansion containing a switch
+// CHECK: bar
+#define START { while (0) { switch (0) {
+#define END   }}}
+void bar() {
+  START      // CHECK: File 0, [[@LINE]]:8 -> [[@LINE+2]]:6
+default: ;
+  END
+}
+
+// PR27948 - Crash when handling a switch partially covered by a macro
+// CHECK: baz
+#define START2 switch (0) default:
+void baz() {
+  for (;;)
+    START2 return; // CHECK: Expansion,File 0, [[@LINE]]:5 -> [[@LINE]]:11 = #1 (Expanded file = 1)
+}
+
 int main(int argc, const char *argv[]) {
   foo(3);
   return 0;
diff --git a/test/CoverageMapping/system_macro.c b/test/CoverageMapping/system_macro.c
index b0ce360005a86..bddc822b0d90b 100644
--- a/test/CoverageMapping/system_macro.c
+++ b/test/CoverageMapping/system_macro.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name system_macro.c -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name system_macro.c -o - %s | FileCheck %s
 
 #ifdef IS_SYSHEADER
 
@@ -13,9 +13,8 @@
 
 // CHECK-LABEL: doSomething:
 void doSomething(int x) { // CHECK: File 0, [[@LINE]]:25 -> {{[0-9:]+}} = #0
-  Func(x); // CHECK: Expansion,File 0, [[@LINE]]:3 -> [[@LINE]]:7
+  Func(x);
   return;
-  // CHECK: Expansion,File 0, [[@LINE+1]]:3 -> [[@LINE+1]]:11
   SomeType *f; // CHECK: File 0, [[@LINE]]:11 -> {{[0-9:]+}} = 0
 }
 
diff --git a/test/CoverageMapping/templates.cpp b/test/CoverageMapping/templates.cpp
index fcb92e1e9f570..bdba1d41f0b34 100644
--- a/test/CoverageMapping/templates.cpp
+++ b/test/CoverageMapping/templates.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name templates.cpp %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name templates.cpp %s | FileCheck %s
 
 template<typename T>
 void unused(T x) {
diff --git a/test/CoverageMapping/test.c b/test/CoverageMapping/test.c
index a274ce432bd09..5affbaadfd1d5 100644
--- a/test/CoverageMapping/test.c
+++ b/test/CoverageMapping/test.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name test.c %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name test.c %s | FileCheck %s
 
 void bar();
 static void static_func();
diff --git a/test/CoverageMapping/trycatch.cpp b/test/CoverageMapping/trycatch.cpp
index 2d0f629952dbd..01d8fb9307401 100644
--- a/test/CoverageMapping/trycatch.cpp
+++ b/test/CoverageMapping/trycatch.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++11 -fexceptions -fcxx-exceptions -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name trycatch.cpp %s | FileCheck %s
+// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++11 -fexceptions -fcxx-exceptions -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name trycatch.cpp %s | FileCheck %s
 
 class Error {
 };
@@ -23,7 +23,7 @@ void func(int i) {                    // CHECK-NEXT: File 0, [[@LINE]]:18 -> {{[
                                       // CHECK-NEXT: main
 int main() {                          // CHECK-NEXT: File 0, [[@LINE]]:12 -> [[@LINE+13]]:2 = #0
   int j = 1;
-  try {
+  try {                               // CHECK-NEXT: File 0, [[@LINE]]:7 -> [[@LINE+2]]:4 = #0
     func(j);
   } catch(const Error &e) {           // CHECK-NEXT: File 0, [[@LINE]]:27 -> [[@LINE+2]]:4 = #2
     j = 1;
diff --git a/test/CoverageMapping/trymacro.cpp b/test/CoverageMapping/trymacro.cpp
index 949186d961599..32f44381b9388 100644
--- a/test/CoverageMapping/trymacro.cpp
+++ b/test/CoverageMapping/trymacro.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++11 -fexceptions -fcxx-exceptions -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name trymacro.cpp %s | FileCheck %s
+// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++11 -fexceptions -fcxx-exceptions -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name trymacro.cpp %s | FileCheck %s
 
 // CHECK: Z3fn1v:
 void fn1() try { return; } // CHECK: [[@LINE]]:12 -> [[@LINE+1]]:14 = #1
@@ -17,8 +17,27 @@ catch(...) {}               // CHECK: [[@LINE]]:12 -> [[@LINE]]:14 = #2
 void fn3() TRY { return; } // CHECK: [[@LINE]]:15 -> [[@LINE+1]]:14 = #1
 CATCH(...) {}              // CHECK: [[@LINE]]:12 -> [[@LINE]]:14 = #2
 
+// CHECK: Z3fn4v:
+#define TRY2 try { // CHECK-DAG: File 1, [[@LINE]]:18 -> [[@LINE]]:19 = #1
+void fn4() TRY2 // CHECK-DAG: Expansion,File 0, [[@LINE]]:12 -> [[@LINE]]:16 = #1 (Expanded file = 1)
+  for (;;)
+    return;
+}
+catch (...) {}
+
+// CHECK: Z3fn5v:
+#define TRY3 try { return; } catch (...) // CHECK-DAG: File 2, [[@LINE]]:18 -> [[@LINE]]:29 = #1
+#define TRY4 try { TRY3 { return; } } catch (...) // CHECK-DAG: Expansion,File 1, [[@LINE]]:20 -> [[@LINE]]:24 = #1 (Expanded file = 2)
+void fn5() {
+  for (;;) {
+    TRY4 { return; } // CHECK-DAG: Expansion,File 0, [[@LINE]]:5 -> [[@LINE]]:9 = #1 (Expanded file = 1)
+  }                  // CHECK-DAG: File 0, [[@LINE-1]]:10 -> [[@LINE-1]]:21 = #5
+}
+
 int main() {
   fn1();
   fn2();
   fn3();
+  fn4();
+  fn5();
 }
diff --git a/test/CoverageMapping/unreachable-macro.c b/test/CoverageMapping/unreachable-macro.c
index 4b33a23e7bdc4..b9d4f3616ffa5 100644
--- a/test/CoverageMapping/unreachable-macro.c
+++ b/test/CoverageMapping/unreachable-macro.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only %s | FileCheck %s
 
 #define WHILE while (0) {}
 
diff --git a/test/CoverageMapping/unused_function.cpp b/test/CoverageMapping/unused_function.cpp
new file mode 100644
index 0000000000000..6a46b1d2af1f9
--- /dev/null
+++ b/test/CoverageMapping/unused_function.cpp
@@ -0,0 +1,37 @@
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only %s | FileCheck %s
+
+#define START_SCOPE {
+#define END_SCOPE }
+
+// CHECK: {{_Z2f0v|\?f0@@YAXXZ}}:
+// CHECK-NEXT: File 0, [[@LINE+1]]:18 -> [[@LINE+1]]:20 = 0
+inline void f0() {}
+
+// CHECK: {{_Z2f1v|\?f1@@YAXXZ}}:
+// CHECK-NEXT: File 0, [[@LINE+1]]:18 -> [[@LINE+1]]:31 = 0
+inline void f1() START_SCOPE }
+
+// CHECK: {{_Z2f2v|\?f2@@YAXXZ}}:
+// CHECK-NEXT: File 0, [[@LINE+1]]:18 -> [[@LINE+1]]:29 = 0
+inline void f2() { END_SCOPE
+
+// CHECK: {{_Z2f3v|\?f3@@YAXXZ}}:
+// CHECK-NEXT: File 0, [[@LINE+1]]:18 -> [[@LINE+1]]:39 = 0
+inline void f3() START_SCOPE END_SCOPE
+
+// CHECK: {{_Z2f4v|\?f4@@YAXXZ}}:
+// CHECK-NEXT: File 0, [[@LINE+2]]:10 -> [[@LINE+3]]:2 = 0
+inline void f4()
+#include "Inputs/starts_a_scope_only"
+}
+
+// CHECK: {{_Z2f5v|\?f5@@YAXXZ}}:
+// CHECK-NEXT: File 0, [[@LINE+1]]:18 -> [[@LINE+2]]:36 = 0
+inline void f5() {
+#include "Inputs/ends_a_scope_only"
+
+// CHECK: {{_Z2f6v|\?f6@@YAXXZ}}:
+// CHECK-NEXT: File 0, [[@LINE+2]]:10 -> [[@LINE+3]]:36 = 0
+inline void f6()
+#include "Inputs/starts_a_scope_only"
+#include "Inputs/ends_a_scope_only"
diff --git a/test/CoverageMapping/unused_names.c b/test/CoverageMapping/unused_names.c
index 00941b8631586..a03d18b6655d9 100644
--- a/test/CoverageMapping/unused_names.c
+++ b/test/CoverageMapping/unused_names.c
@@ -1,12 +1,13 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -emit-llvm -main-file-name unused_names.c -o - %s > %t
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -emit-llvm -main-file-name unused_names.c -o - %s > %t
 // RUN: FileCheck -input-file %t %s
 // RUN: FileCheck -check-prefix=SYSHEADER -input-file %t %s
 
 // Since foo is never emitted, there should not be a profile name for it.
 
-// CHECK-DAG: @__profn_bar = {{.*}} [3 x i8] c"bar", section "{{.*}}__llvm_prf_names"
-// CHECK-DAG: @__profn_baz = {{.*}} [3 x i8] c"baz", section "{{.*}}__llvm_prf_names"
-// CHECK-DAG: @__profn_unused_names.c_qux = {{.*}} [18 x i8] c"unused_names.c:qux", section "{{.*}}__llvm_prf_names"
+// CHECK-DAG: @__profn_bar = {{.*}} [3 x i8] c"bar"
+// CHECK-DAG: @__profn_baz = {{.*}} [3 x i8] c"baz"
+// CHECK-DAG: @__profn_unused_names.c_qux = {{.*}} [18 x i8] c"unused_names.c:qux"
+// CHECK-DAG: @__llvm_prf_nm = private constant {{.*}}, section "{{.*}}__llvm_prf_names"
 
 // SYSHEADER-NOT: @__profn_foo =
 
diff --git a/test/CoverageMapping/while.c b/test/CoverageMapping/while.c
index a85957ff8ac5d..7f09e4b0d7271 100644
--- a/test/CoverageMapping/while.c
+++ b/test/CoverageMapping/while.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fprofile-instr-generate -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name loops.cpp %s | FileCheck %s
+// RUN: %clang_cc1 -fprofile-instrument=clang -fcoverage-mapping -dump-coverage-mapping -emit-llvm-only -main-file-name loops.cpp %s | FileCheck %s
 
                                     // CHECK: main
 int main() {                        // CHECK-NEXT: File 0, [[@LINE]]:12 -> [[@LINE+8]]:2 = #0
diff --git a/test/Driver/Inputs/CUDA/usr/local/cuda/bin/.keep b/test/Driver/Inputs/CUDA/usr/local/cuda/bin/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/CUDA/usr/local/cuda/bin/.keep
diff --git a/test/Driver/Inputs/CUDA_80/usr/local/cuda/bin/.keep b/test/Driver/Inputs/CUDA_80/usr/local/cuda/bin/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/CUDA_80/usr/local/cuda/bin/.keep
diff --git a/test/Driver/Inputs/CUDA_80/usr/local/cuda/include/.keep b/test/Driver/Inputs/CUDA_80/usr/local/cuda/include/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/CUDA_80/usr/local/cuda/include/.keep
diff --git a/test/Driver/Inputs/CUDA_80/usr/local/cuda/lib/.keep b/test/Driver/Inputs/CUDA_80/usr/local/cuda/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/CUDA_80/usr/local/cuda/lib/.keep
diff --git a/test/Driver/Inputs/CUDA_80/usr/local/cuda/lib64/.keep b/test/Driver/Inputs/CUDA_80/usr/local/cuda/lib64/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/CUDA_80/usr/local/cuda/lib64/.keep
diff --git a/test/Driver/Inputs/CUDA_80/usr/local/cuda/nvvm/libdevice/libdevice.compute_20.10.bc b/test/Driver/Inputs/CUDA_80/usr/local/cuda/nvvm/libdevice/libdevice.compute_20.10.bc
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/CUDA_80/usr/local/cuda/nvvm/libdevice/libdevice.compute_20.10.bc
diff --git a/test/Driver/Inputs/CUDA_80/usr/local/cuda/nvvm/libdevice/libdevice.compute_35.10.bc b/test/Driver/Inputs/CUDA_80/usr/local/cuda/nvvm/libdevice/libdevice.compute_35.10.bc
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/CUDA_80/usr/local/cuda/nvvm/libdevice/libdevice.compute_35.10.bc
diff --git a/test/Driver/Inputs/CUDA_80/usr/local/cuda/version.txt b/test/Driver/Inputs/CUDA_80/usr/local/cuda/version.txt
new file mode 100644
index 0000000000000..ee238af951a03
--- /dev/null
+++ b/test/Driver/Inputs/CUDA_80/usr/local/cuda/version.txt
@@ -0,0 +1 @@
+CUDA Version 8.0.42
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/aarch64-linux-android/bin/ld b/test/Driver/Inputs/basic_android_ndk_tree/aarch64-linux-android/bin/ld
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/aarch64-linux-android/bin/ld
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/aarch64-linux-android/bin/ld.bfd b/test/Driver/Inputs/basic_android_ndk_tree/aarch64-linux-android/bin/ld.bfd
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/aarch64-linux-android/bin/ld.bfd
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/aarch64-linux-android/bin/ld.gold b/test/Driver/Inputs/basic_android_ndk_tree/aarch64-linux-android/bin/ld.gold
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/aarch64-linux-android/bin/ld.gold
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/aarch64-linux-android/lib/.keep b/test/Driver/Inputs/basic_android_ndk_tree/aarch64-linux-android/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/aarch64-linux-android/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/arm-linux-androideabi/bin/ld b/test/Driver/Inputs/basic_android_ndk_tree/arm-linux-androideabi/bin/ld
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/arm-linux-androideabi/bin/ld
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/arm-linux-androideabi/bin/ld.bfd b/test/Driver/Inputs/basic_android_ndk_tree/arm-linux-androideabi/bin/ld.bfd
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/arm-linux-androideabi/bin/ld.bfd
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/arm-linux-androideabi/bin/ld.gold b/test/Driver/Inputs/basic_android_ndk_tree/arm-linux-androideabi/bin/ld.gold
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/arm-linux-androideabi/bin/ld.gold
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/arm-linux-androideabi/lib/armv7-a/thumb/.keep b/test/Driver/Inputs/basic_android_ndk_tree/arm-linux-androideabi/lib/armv7-a/thumb/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/arm-linux-androideabi/lib/armv7-a/thumb/.keep
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/arm-linux-androideabi/lib/thumb/.keep b/test/Driver/Inputs/basic_android_ndk_tree/arm-linux-androideabi/lib/thumb/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/arm-linux-androideabi/lib/thumb/.keep
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/bin/aarch64-linux-android-ld b/test/Driver/Inputs/basic_android_ndk_tree/bin/aarch64-linux-android-ld
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/bin/aarch64-linux-android-ld
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/bin/aarch64-linux-android-ld.bfd b/test/Driver/Inputs/basic_android_ndk_tree/bin/aarch64-linux-android-ld.bfd
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/bin/aarch64-linux-android-ld.bfd
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/bin/aarch64-linux-android-ld.gold b/test/Driver/Inputs/basic_android_ndk_tree/bin/aarch64-linux-android-ld.gold
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/bin/aarch64-linux-android-ld.gold
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/bin/arm-linux-androideabi-ld b/test/Driver/Inputs/basic_android_ndk_tree/bin/arm-linux-androideabi-ld
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/bin/arm-linux-androideabi-ld
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/bin/arm-linux-androideabi-ld.bfd b/test/Driver/Inputs/basic_android_ndk_tree/bin/arm-linux-androideabi-ld.bfd
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/bin/arm-linux-androideabi-ld.bfd
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/bin/arm-linux-androideabi-ld.gold b/test/Driver/Inputs/basic_android_ndk_tree/bin/arm-linux-androideabi-ld.gold
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/bin/arm-linux-androideabi-ld.gold
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/bin/i686-linux-android-ld b/test/Driver/Inputs/basic_android_ndk_tree/bin/i686-linux-android-ld
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/bin/i686-linux-android-ld
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/bin/i686-linux-android-ld.bfd b/test/Driver/Inputs/basic_android_ndk_tree/bin/i686-linux-android-ld.bfd
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/bin/i686-linux-android-ld.bfd
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/bin/i686-linux-android-ld.gold b/test/Driver/Inputs/basic_android_ndk_tree/bin/i686-linux-android-ld.gold
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/bin/i686-linux-android-ld.gold
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/bin/mipsel-linux-android-ld b/test/Driver/Inputs/basic_android_ndk_tree/bin/mipsel-linux-android-ld
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/bin/mipsel-linux-android-ld
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/bin/mipsel-linux-android-ld.bfd b/test/Driver/Inputs/basic_android_ndk_tree/bin/mipsel-linux-android-ld.bfd
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/bin/mipsel-linux-android-ld.bfd
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/bin/mipsel-linux-android-ld.gold b/test/Driver/Inputs/basic_android_ndk_tree/bin/mipsel-linux-android-ld.gold
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/bin/mipsel-linux-android-ld.gold
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/i686-linux-android/bin/ld b/test/Driver/Inputs/basic_android_ndk_tree/i686-linux-android/bin/ld
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/i686-linux-android/bin/ld
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/i686-linux-android/bin/ld.bfd b/test/Driver/Inputs/basic_android_ndk_tree/i686-linux-android/bin/ld.bfd
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/i686-linux-android/bin/ld.bfd
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/i686-linux-android/bin/ld.gold b/test/Driver/Inputs/basic_android_ndk_tree/i686-linux-android/bin/ld.gold
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/i686-linux-android/bin/ld.gold
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/i686-linux-android/lib/.keep b/test/Driver/Inputs/basic_android_ndk_tree/i686-linux-android/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/i686-linux-android/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/include/c++/4.9/aarch64-linux-android/.keep b/test/Driver/Inputs/basic_android_ndk_tree/include/c++/4.9/aarch64-linux-android/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/include/c++/4.9/aarch64-linux-android/.keep
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/include/c++/4.9/arm-linux-androideabi/armv7-a/thumb/.keep b/test/Driver/Inputs/basic_android_ndk_tree/include/c++/4.9/arm-linux-androideabi/armv7-a/thumb/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/include/c++/4.9/arm-linux-androideabi/armv7-a/thumb/.keep
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/include/c++/4.9/arm-linux-androideabi/thumb/.keep b/test/Driver/Inputs/basic_android_ndk_tree/include/c++/4.9/arm-linux-androideabi/thumb/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/include/c++/4.9/arm-linux-androideabi/thumb/.keep
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/include/c++/4.9/backward/.keep b/test/Driver/Inputs/basic_android_ndk_tree/include/c++/4.9/backward/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/include/c++/4.9/backward/.keep
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/include/c++/4.9/i686-linux-android/.keep b/test/Driver/Inputs/basic_android_ndk_tree/include/c++/4.9/i686-linux-android/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/include/c++/4.9/i686-linux-android/.keep
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/include/c++/4.9/mipsel-linux-android/mips-r2/.keep b/test/Driver/Inputs/basic_android_ndk_tree/include/c++/4.9/mipsel-linux-android/mips-r2/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/include/c++/4.9/mipsel-linux-android/mips-r2/.keep
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/include/c++/4.9/mipsel-linux-android/mips-r6/.keep b/test/Driver/Inputs/basic_android_ndk_tree/include/c++/4.9/mipsel-linux-android/mips-r6/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/include/c++/4.9/mipsel-linux-android/mips-r6/.keep
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/aarch64-linux-android/4.9/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/aarch64-linux-android/4.9/crtbegin.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/aarch64-linux-android/4.9/crtbegin.o
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/aarch64-linux-android/4.9/crtend.o b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/aarch64-linux-android/4.9/crtend.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/aarch64-linux-android/4.9/crtend.o
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/aarch64-linux-android/4.9/include/.keep b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/aarch64-linux-android/4.9/include/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/aarch64-linux-android/4.9/include/.keep
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/arm-linux-androideabi/4.9/armv7-a/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/arm-linux-androideabi/4.9/armv7-a/crtbegin.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/arm-linux-androideabi/4.9/armv7-a/crtbegin.o
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/arm-linux-androideabi/4.9/armv7-a/crtend.o b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/arm-linux-androideabi/4.9/armv7-a/crtend.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/arm-linux-androideabi/4.9/armv7-a/crtend.o
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/arm-linux-androideabi/4.9/armv7-a/thumb/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/arm-linux-androideabi/4.9/armv7-a/thumb/crtbegin.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/arm-linux-androideabi/4.9/armv7-a/thumb/crtbegin.o
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/arm-linux-androideabi/4.9/armv7-a/thumb/crtend.o b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/arm-linux-androideabi/4.9/armv7-a/thumb/crtend.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/arm-linux-androideabi/4.9/armv7-a/thumb/crtend.o
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/arm-linux-androideabi/4.9/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/arm-linux-androideabi/4.9/crtbegin.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/arm-linux-androideabi/4.9/crtbegin.o
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/arm-linux-androideabi/4.9/crtend.o b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/arm-linux-androideabi/4.9/crtend.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/arm-linux-androideabi/4.9/crtend.o
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/arm-linux-androideabi/4.9/include/.keep b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/arm-linux-androideabi/4.9/include/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/arm-linux-androideabi/4.9/include/.keep
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/arm-linux-androideabi/4.9/thumb/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/arm-linux-androideabi/4.9/thumb/crtbegin.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/arm-linux-androideabi/4.9/thumb/crtbegin.o
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/arm-linux-androideabi/4.9/thumb/crtend.o b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/arm-linux-androideabi/4.9/thumb/crtend.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/arm-linux-androideabi/4.9/thumb/crtend.o
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/i686-linux-android/4.9/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/i686-linux-android/4.9/crtbegin.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/i686-linux-android/4.9/crtbegin.o
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/i686-linux-android/4.9/crtend.o b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/i686-linux-android/4.9/crtend.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/i686-linux-android/4.9/crtend.o
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/i686-linux-android/4.9/include/.keep b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/i686-linux-android/4.9/include/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/i686-linux-android/4.9/include/.keep
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/mipsel-linux-android/4.9/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/mipsel-linux-android/4.9/crtbegin.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/mipsel-linux-android/4.9/crtbegin.o
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/mipsel-linux-android/4.9/crtend.o b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/mipsel-linux-android/4.9/crtend.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/mipsel-linux-android/4.9/crtend.o
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/mipsel-linux-android/4.9/include/.keep b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/mipsel-linux-android/4.9/include/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/mipsel-linux-android/4.9/include/.keep
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/mipsel-linux-android/4.9/mips-r2/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/mipsel-linux-android/4.9/mips-r2/crtbegin.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/mipsel-linux-android/4.9/mips-r2/crtbegin.o
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/mipsel-linux-android/4.9/mips-r2/crtend.o b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/mipsel-linux-android/4.9/mips-r2/crtend.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/mipsel-linux-android/4.9/mips-r2/crtend.o
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/mipsel-linux-android/4.9/mips-r6/crtbegin.o b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/mipsel-linux-android/4.9/mips-r6/crtbegin.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/mipsel-linux-android/4.9/mips-r6/crtbegin.o
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/mipsel-linux-android/4.9/mips-r6/crtend.o b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/mipsel-linux-android/4.9/mips-r6/crtend.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/lib/gcc/mipsel-linux-android/4.9/mips-r6/crtend.o
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/mipsel-linux-android/bin/ld b/test/Driver/Inputs/basic_android_ndk_tree/mipsel-linux-android/bin/ld
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/mipsel-linux-android/bin/ld
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/mipsel-linux-android/bin/ld.bfd b/test/Driver/Inputs/basic_android_ndk_tree/mipsel-linux-android/bin/ld.bfd
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/mipsel-linux-android/bin/ld.bfd
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/mipsel-linux-android/bin/ld.gold b/test/Driver/Inputs/basic_android_ndk_tree/mipsel-linux-android/bin/ld.gold
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/mipsel-linux-android/bin/ld.gold
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/mipsel-linux-android/lib/.keep b/test/Driver/Inputs/basic_android_ndk_tree/mipsel-linux-android/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/mipsel-linux-android/lib/.keep
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/sysroot/usr/include/.keep b/test/Driver/Inputs/basic_android_ndk_tree/sysroot/usr/include/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/sysroot/usr/include/.keep
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/sysroot/usr/lib/crtbegin_dynamic.o b/test/Driver/Inputs/basic_android_ndk_tree/sysroot/usr/lib/crtbegin_dynamic.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/sysroot/usr/lib/crtbegin_dynamic.o
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/sysroot/usr/lib/crtbegin_so.o b/test/Driver/Inputs/basic_android_ndk_tree/sysroot/usr/lib/crtbegin_so.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/sysroot/usr/lib/crtbegin_so.o
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/sysroot/usr/lib/crtbegin_static.o b/test/Driver/Inputs/basic_android_ndk_tree/sysroot/usr/lib/crtbegin_static.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/sysroot/usr/lib/crtbegin_static.o
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/sysroot/usr/lib/crtend_android.o b/test/Driver/Inputs/basic_android_ndk_tree/sysroot/usr/lib/crtend_android.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/sysroot/usr/lib/crtend_android.o
diff --git a/test/Driver/Inputs/basic_android_ndk_tree/sysroot/usr/lib/crtend_so.o b/test/Driver/Inputs/basic_android_ndk_tree/sysroot/usr/lib/crtend_so.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/basic_android_ndk_tree/sysroot/usr/lib/crtend_so.o
diff --git a/test/Driver/Inputs/gcc_version_parsing5/bin/.keep b/test/Driver/Inputs/gcc_version_parsing5/bin/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/gcc_version_parsing5/bin/.keep
diff --git a/test/Driver/Inputs/gcc_version_parsing5/lib/gcc/i386-unknown-linux/4.9.2/crtbegin.o b/test/Driver/Inputs/gcc_version_parsing5/lib/gcc/i386-unknown-linux/4.9.2/crtbegin.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/gcc_version_parsing5/lib/gcc/i386-unknown-linux/4.9.2/crtbegin.o
diff --git a/test/Driver/Inputs/gcc_version_parsing5/lib/gcc/i386-unknown-linux/5/crtbegin.o b/test/Driver/Inputs/gcc_version_parsing5/lib/gcc/i386-unknown-linux/5/crtbegin.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/gcc_version_parsing5/lib/gcc/i386-unknown-linux/5/crtbegin.o
diff --git a/test/Driver/Inputs/gentoo_linux_gcc_4.9.3_tree/usr/include/.keep b/test/Driver/Inputs/gentoo_linux_gcc_4.9.3_tree/usr/include/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/gentoo_linux_gcc_4.9.3_tree/usr/include/.keep
diff --git a/test/Driver/Inputs/gentoo_linux_gcc_4.9.3_tree/usr/lib/gcc/x86_64-pc-linux-gnu/4.9.3/crtbegin.o b/test/Driver/Inputs/gentoo_linux_gcc_4.9.3_tree/usr/lib/gcc/x86_64-pc-linux-gnu/4.9.3/crtbegin.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/gentoo_linux_gcc_4.9.3_tree/usr/lib/gcc/x86_64-pc-linux-gnu/4.9.3/crtbegin.o
diff --git a/test/Driver/Inputs/gentoo_linux_gcc_4.9.3_tree/usr/lib/gcc/x86_64-pc-linux-gnu/4.9.3/include/g++-v4.9.3/.keep b/test/Driver/Inputs/gentoo_linux_gcc_4.9.3_tree/usr/lib/gcc/x86_64-pc-linux-gnu/4.9.3/include/g++-v4.9.3/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/gentoo_linux_gcc_4.9.3_tree/usr/lib/gcc/x86_64-pc-linux-gnu/4.9.3/include/g++-v4.9.3/.keep
diff --git a/test/Driver/Inputs/gentoo_linux_gcc_4.9.3_tree/usr/x86_64-pc-linux-gnu/lib/.keep b/test/Driver/Inputs/gentoo_linux_gcc_4.9.3_tree/usr/x86_64-pc-linux-gnu/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/gentoo_linux_gcc_4.9.3_tree/usr/x86_64-pc-linux-gnu/lib/.keep
diff --git a/test/Driver/Inputs/header0.h b/test/Driver/Inputs/header0.h
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/header0.h
diff --git a/test/Driver/Inputs/header1.h b/test/Driver/Inputs/header1.h
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/header1.h
diff --git a/test/Driver/Inputs/header2.h b/test/Driver/Inputs/header2.h
new file mode 100644
index 0000000000000..243468d879c7d
--- /dev/null
+++ b/test/Driver/Inputs/header2.h
@@ -0,0 +1 @@
+#include "header1.h"
diff --git a/test/Driver/Inputs/header3.h b/test/Driver/Inputs/header3.h
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/header3.h
diff --git a/test/Driver/Inputs/header4.h b/test/Driver/Inputs/header4.h
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/header4.h
diff --git a/test/Driver/Inputs/mips_img_v2_tree/bin/.keep b/test/Driver/Inputs/mips_img_v2_tree/bin/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/bin/.keep
diff --git a/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/include/.keep b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/include/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/include/.keep
diff --git a/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/micromips-r6-hard/lib/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/micromips-r6-hard/lib/crtbegin.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/micromips-r6-hard/lib/crtbegin.o
diff --git a/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/micromips-r6-hard/lib/crtend.o b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/micromips-r6-hard/lib/crtend.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/micromips-r6-hard/lib/crtend.o
diff --git a/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/micromips-r6-soft/lib/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/micromips-r6-soft/lib/crtbegin.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/micromips-r6-soft/lib/crtbegin.o
diff --git a/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/micromips-r6-soft/lib/crtend.o b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/micromips-r6-soft/lib/crtend.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/micromips-r6-soft/lib/crtend.o
diff --git a/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/micromipsel-r6-hard/lib/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/micromipsel-r6-hard/lib/crtbegin.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/micromipsel-r6-hard/lib/crtbegin.o
diff --git a/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/micromipsel-r6-hard/lib/crtend.o b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/micromipsel-r6-hard/lib/crtend.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/micromipsel-r6-hard/lib/crtend.o
diff --git a/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/micromipsel-r6-soft/lib/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/micromipsel-r6-soft/lib/crtbegin.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/micromipsel-r6-soft/lib/crtbegin.o
diff --git a/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/micromipsel-r6-soft/lib/crtend.o b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/micromipsel-r6-soft/lib/crtend.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/micromipsel-r6-soft/lib/crtend.o
diff --git a/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mips-r6-hard/lib/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mips-r6-hard/lib/crtbegin.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mips-r6-hard/lib/crtbegin.o
diff --git a/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mips-r6-hard/lib/crtend.o b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mips-r6-hard/lib/crtend.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mips-r6-hard/lib/crtend.o
diff --git a/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mips-r6-hard/lib32/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mips-r6-hard/lib32/crtbegin.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mips-r6-hard/lib32/crtbegin.o
diff --git a/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mips-r6-hard/lib32/crtend.o b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mips-r6-hard/lib32/crtend.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mips-r6-hard/lib32/crtend.o
diff --git a/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mips-r6-hard/lib64/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mips-r6-hard/lib64/crtbegin.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mips-r6-hard/lib64/crtbegin.o
diff --git a/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mips-r6-hard/lib64/crtend.o b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mips-r6-hard/lib64/crtend.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mips-r6-hard/lib64/crtend.o
diff --git a/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mips-r6-soft/lib/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mips-r6-soft/lib/crtbegin.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mips-r6-soft/lib/crtbegin.o
diff --git a/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mips-r6-soft/lib/crtend.o b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mips-r6-soft/lib/crtend.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mips-r6-soft/lib/crtend.o
diff --git a/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mipsel-r6-hard/lib/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mipsel-r6-hard/lib/crtbegin.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mipsel-r6-hard/lib/crtbegin.o
diff --git a/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mipsel-r6-hard/lib/crtend.o b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mipsel-r6-hard/lib/crtend.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mipsel-r6-hard/lib/crtend.o
diff --git a/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mipsel-r6-hard/lib32/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mipsel-r6-hard/lib32/crtbegin.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mipsel-r6-hard/lib32/crtbegin.o
diff --git a/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mipsel-r6-hard/lib32/crtend.o b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mipsel-r6-hard/lib32/crtend.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mipsel-r6-hard/lib32/crtend.o
diff --git a/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mipsel-r6-hard/lib64/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mipsel-r6-hard/lib64/crtbegin.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mipsel-r6-hard/lib64/crtbegin.o
diff --git a/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mipsel-r6-hard/lib64/crtend.o b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mipsel-r6-hard/lib64/crtend.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mipsel-r6-hard/lib64/crtend.o
diff --git a/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mipsel-r6-soft/lib/crtbegin.o b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mipsel-r6-soft/lib/crtbegin.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mipsel-r6-soft/lib/crtbegin.o
diff --git a/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mipsel-r6-soft/lib/crtend.o b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mipsel-r6-soft/lib/crtend.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/lib/gcc/mips-img-linux-gnu/4.9.2/mipsel-r6-soft/lib/crtend.o
diff --git a/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/bin/.keep b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/bin/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/bin/.keep
diff --git a/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/micromips-r6-hard/lib/.keep b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/micromips-r6-hard/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/micromips-r6-hard/lib/.keep
diff --git a/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/micromips-r6-soft/lib/.keep b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/micromips-r6-soft/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/micromips-r6-soft/lib/.keep
diff --git a/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/micromipsel-r6-hard/lib/.keep b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/micromipsel-r6-hard/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/micromipsel-r6-hard/lib/.keep
diff --git a/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/micromipsel-r6-soft/lib/.keep b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/micromipsel-r6-soft/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/micromipsel-r6-soft/lib/.keep
diff --git a/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/mips-r6-hard/lib/.keep b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/mips-r6-hard/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/mips-r6-hard/lib/.keep
diff --git a/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/mips-r6-hard/lib32/.keep b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/mips-r6-hard/lib32/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/mips-r6-hard/lib32/.keep
diff --git a/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/mips-r6-hard/lib64/.keep b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/mips-r6-hard/lib64/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/mips-r6-hard/lib64/.keep
diff --git a/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/mips-r6-soft/lib/.keep b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/mips-r6-soft/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/mips-r6-soft/lib/.keep
diff --git a/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/mipsel-r6-hard/lib/.keep b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/mipsel-r6-hard/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/mipsel-r6-hard/lib/.keep
diff --git a/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/mipsel-r6-hard/lib32/.keep b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/mipsel-r6-hard/lib32/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/mipsel-r6-hard/lib32/.keep
diff --git a/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/mipsel-r6-hard/lib64/.keep b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/mipsel-r6-hard/lib64/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/mipsel-r6-hard/lib64/.keep
diff --git a/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/mipsel-r6-soft/lib/.keep b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/mipsel-r6-soft/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/mipsel-r6-soft/lib/.keep
diff --git a/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/micromips-r6-hard/lib/.keep b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/micromips-r6-hard/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/micromips-r6-hard/lib/.keep
diff --git a/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/micromips-r6-soft/lib/.keep b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/micromips-r6-soft/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/micromips-r6-soft/lib/.keep
diff --git a/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/micromipsel-r6-hard/lib/.keep b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/micromipsel-r6-hard/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/micromipsel-r6-hard/lib/.keep
diff --git a/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/micromipsel-r6-soft/lib/.keep b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/micromipsel-r6-soft/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/micromipsel-r6-soft/lib/.keep
diff --git a/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/mips-r6-hard/lib/.keep b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/mips-r6-hard/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/mips-r6-hard/lib/.keep
diff --git a/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/mips-r6-hard/lib32/.keep b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/mips-r6-hard/lib32/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/mips-r6-hard/lib32/.keep
diff --git a/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/mips-r6-hard/lib64/.keep b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/mips-r6-hard/lib64/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/mips-r6-hard/lib64/.keep
diff --git a/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/mips-r6-soft/lib/.keep b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/mips-r6-soft/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/mips-r6-soft/lib/.keep
diff --git a/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/mipsel-r6-hard/lib/.keep b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/mipsel-r6-hard/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/mipsel-r6-hard/lib/.keep
diff --git a/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/mipsel-r6-hard/lib32/.keep b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/mipsel-r6-hard/lib32/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/mipsel-r6-hard/lib32/.keep
diff --git a/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/mipsel-r6-hard/lib64/.keep b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/mipsel-r6-hard/lib64/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/mipsel-r6-hard/lib64/.keep
diff --git a/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/mipsel-r6-soft/lib/.keep b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/mipsel-r6-soft/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/mips-img-linux-gnu/lib/mipsel-r6-soft/lib/.keep
diff --git a/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromips-r6-hard/lib/.keep b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromips-r6-hard/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromips-r6-hard/lib/.keep
diff --git a/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromips-r6-hard/usr/include/.keep b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromips-r6-hard/usr/include/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromips-r6-hard/usr/include/.keep
diff --git a/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromips-r6-hard/usr/lib/crt1.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromips-r6-hard/usr/lib/crt1.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromips-r6-hard/usr/lib/crt1.o
diff --git a/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromips-r6-hard/usr/lib/crti.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromips-r6-hard/usr/lib/crti.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromips-r6-hard/usr/lib/crti.o
diff --git a/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromips-r6-hard/usr/lib/crtn.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromips-r6-hard/usr/lib/crtn.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromips-r6-hard/usr/lib/crtn.o
diff --git a/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromips-r6-soft/lib/.keep b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromips-r6-soft/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromips-r6-soft/lib/.keep
diff --git a/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromips-r6-soft/usr/include/.keep b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromips-r6-soft/usr/include/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromips-r6-soft/usr/include/.keep
diff --git a/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromips-r6-soft/usr/lib/crt1.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromips-r6-soft/usr/lib/crt1.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromips-r6-soft/usr/lib/crt1.o
diff --git a/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromips-r6-soft/usr/lib/crti.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromips-r6-soft/usr/lib/crti.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromips-r6-soft/usr/lib/crti.o
diff --git a/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromips-r6-soft/usr/lib/crtn.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromips-r6-soft/usr/lib/crtn.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromips-r6-soft/usr/lib/crtn.o
diff --git a/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromipsel-r6-hard/lib/.keep b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromipsel-r6-hard/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromipsel-r6-hard/lib/.keep
diff --git a/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromipsel-r6-hard/usr/include/.keep b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromipsel-r6-hard/usr/include/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromipsel-r6-hard/usr/include/.keep
diff --git a/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromipsel-r6-hard/usr/lib/crt1.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromipsel-r6-hard/usr/lib/crt1.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromipsel-r6-hard/usr/lib/crt1.o
diff --git a/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromipsel-r6-hard/usr/lib/crti.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromipsel-r6-hard/usr/lib/crti.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromipsel-r6-hard/usr/lib/crti.o
diff --git a/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromipsel-r6-hard/usr/lib/crtn.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromipsel-r6-hard/usr/lib/crtn.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromipsel-r6-hard/usr/lib/crtn.o
diff --git a/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromipsel-r6-soft/lib/.keep b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromipsel-r6-soft/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromipsel-r6-soft/lib/.keep
diff --git a/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromipsel-r6-soft/usr/include/.keep b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromipsel-r6-soft/usr/include/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromipsel-r6-soft/usr/include/.keep
diff --git a/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromipsel-r6-soft/usr/lib/crt1.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromipsel-r6-soft/usr/lib/crt1.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromipsel-r6-soft/usr/lib/crt1.o
diff --git a/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromipsel-r6-soft/usr/lib/crti.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromipsel-r6-soft/usr/lib/crti.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromipsel-r6-soft/usr/lib/crti.o
diff --git a/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromipsel-r6-soft/usr/lib/crtn.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromipsel-r6-soft/usr/lib/crtn.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/sysroot/micromipsel-r6-soft/usr/lib/crtn.o
diff --git a/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/lib/.keep b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/lib/.keep
diff --git a/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/lib32/.keep b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/lib32/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/lib32/.keep
diff --git a/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/lib64/.keep b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/lib64/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/lib64/.keep
diff --git a/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/usr/include/.keep b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/usr/include/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/usr/include/.keep
diff --git a/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/usr/lib/crt1.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/usr/lib/crt1.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/usr/lib/crt1.o
diff --git a/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/usr/lib/crti.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/usr/lib/crti.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/usr/lib/crti.o
diff --git a/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/usr/lib/crtn.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/usr/lib/crtn.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/usr/lib/crtn.o
diff --git a/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/usr/lib32/crt1.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/usr/lib32/crt1.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/usr/lib32/crt1.o
diff --git a/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/usr/lib32/crti.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/usr/lib32/crti.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/usr/lib32/crti.o
diff --git a/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/usr/lib32/crtn.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/usr/lib32/crtn.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/usr/lib32/crtn.o
diff --git a/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/usr/lib64/crt1.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/usr/lib64/crt1.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/usr/lib64/crt1.o
diff --git a/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/usr/lib64/crti.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/usr/lib64/crti.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/usr/lib64/crti.o
diff --git a/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/usr/lib64/crtn.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/usr/lib64/crtn.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-hard/usr/lib64/crtn.o
diff --git a/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-soft/lib/.keep b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-soft/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-soft/lib/.keep
diff --git a/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-soft/usr/include/.keep b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-soft/usr/include/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-soft/usr/include/.keep
diff --git a/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-soft/usr/lib/crt1.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-soft/usr/lib/crt1.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-soft/usr/lib/crt1.o
diff --git a/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-soft/usr/lib/crti.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-soft/usr/lib/crti.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-soft/usr/lib/crti.o
diff --git a/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-soft/usr/lib/crtn.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-soft/usr/lib/crtn.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mips-r6-soft/usr/lib/crtn.o
diff --git a/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/lib/.keep b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/lib/.keep
diff --git a/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/lib32/.keep b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/lib32/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/lib32/.keep
diff --git a/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/lib64/.keep b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/lib64/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/lib64/.keep
diff --git a/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/usr/include/.keep b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/usr/include/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/usr/include/.keep
diff --git a/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/usr/lib/crt1.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/usr/lib/crt1.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/usr/lib/crt1.o
diff --git a/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/usr/lib/crti.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/usr/lib/crti.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/usr/lib/crti.o
diff --git a/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/usr/lib/crtn.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/usr/lib/crtn.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/usr/lib/crtn.o
diff --git a/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/usr/lib32/crt1.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/usr/lib32/crt1.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/usr/lib32/crt1.o
diff --git a/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/usr/lib32/crti.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/usr/lib32/crti.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/usr/lib32/crti.o
diff --git a/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/usr/lib32/crtn.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/usr/lib32/crtn.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/usr/lib32/crtn.o
diff --git a/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/usr/lib64/crt1.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/usr/lib64/crt1.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/usr/lib64/crt1.o
diff --git a/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/usr/lib64/crti.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/usr/lib64/crti.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/usr/lib64/crti.o
diff --git a/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/usr/lib64/crtn.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/usr/lib64/crtn.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-hard/usr/lib64/crtn.o
diff --git a/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-soft/lib/.keep b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-soft/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-soft/lib/.keep
diff --git a/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-soft/usr/include/.keep b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-soft/usr/include/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-soft/usr/include/.keep
diff --git a/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-soft/usr/lib/crt1.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-soft/usr/lib/crt1.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-soft/usr/lib/crt1.o
diff --git a/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-soft/usr/lib/crti.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-soft/usr/lib/crti.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-soft/usr/lib/crti.o
diff --git a/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-soft/usr/lib/crtn.o b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-soft/usr/lib/crtn.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_img_v2_tree/sysroot/mipsel-r6-soft/usr/lib/crtn.o
diff --git a/test/Driver/Inputs/mips_mti_tree/bin/.keep b/test/Driver/Inputs/mips_mti_tree/bin/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/bin/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/include/.keep b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/include/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/include/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/micromipsel-r2-hard-nan2008/lib/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/micromipsel-r2-hard-nan2008/lib/crtbegin.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/micromipsel-r2-hard-nan2008/lib/crtbegin.o
diff --git a/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/micromipsel-r2-hard-nan2008/lib/crtend.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/micromipsel-r2-hard-nan2008/lib/crtend.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/micromipsel-r2-hard-nan2008/lib/crtend.o
diff --git a/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/micromipsel-r2-soft/lib/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/micromipsel-r2-soft/lib/crtbegin.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/micromipsel-r2-soft/lib/crtbegin.o
diff --git a/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/micromipsel-r2-soft/lib/crtend.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/micromipsel-r2-soft/lib/crtend.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/micromipsel-r2-soft/lib/crtend.o
diff --git a/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard-nan2008-uclibc/lib/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard-nan2008-uclibc/lib/crtbegin.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard-nan2008-uclibc/lib/crtbegin.o
diff --git a/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard-nan2008-uclibc/lib/crtend.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard-nan2008-uclibc/lib/crtend.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard-nan2008-uclibc/lib/crtend.o
diff --git a/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard-nan2008/lib/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard-nan2008/lib/crtbegin.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard-nan2008/lib/crtbegin.o
diff --git a/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard-nan2008/lib/crtend.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard-nan2008/lib/crtend.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard-nan2008/lib/crtend.o
diff --git a/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard-uclibc/lib/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard-uclibc/lib/crtbegin.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard-uclibc/lib/crtbegin.o
diff --git a/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard-uclibc/lib/crtend.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard-uclibc/lib/crtend.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard-uclibc/lib/crtend.o
diff --git a/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard/lib/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard/lib/crtbegin.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard/lib/crtbegin.o
diff --git a/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard/lib/crtend.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard/lib/crtend.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard/lib/crtend.o
diff --git a/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard/lib32/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard/lib32/crtbegin.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard/lib32/crtbegin.o
diff --git a/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard/lib32/crtend.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard/lib32/crtend.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard/lib32/crtend.o
diff --git a/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard/lib64/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard/lib64/crtbegin.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard/lib64/crtbegin.o
diff --git a/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard/lib64/crtend.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard/lib64/crtend.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-hard/lib64/crtend.o
diff --git a/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-soft/lib/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-soft/lib/crtbegin.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-soft/lib/crtbegin.o
diff --git a/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-soft/lib/crtend.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-soft/lib/crtend.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mips-r2-soft/lib/crtend.o
diff --git a/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard-nan2008-uclibc/lib/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard-nan2008-uclibc/lib/crtbegin.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard-nan2008-uclibc/lib/crtbegin.o
diff --git a/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard-nan2008-uclibc/lib/crtend.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard-nan2008-uclibc/lib/crtend.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard-nan2008-uclibc/lib/crtend.o
diff --git a/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard-nan2008/lib/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard-nan2008/lib/crtbegin.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard-nan2008/lib/crtbegin.o
diff --git a/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard-nan2008/lib/crtend.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard-nan2008/lib/crtend.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard-nan2008/lib/crtend.o
diff --git a/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard-uclibc/lib/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard-uclibc/lib/crtbegin.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard-uclibc/lib/crtbegin.o
diff --git a/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard-uclibc/lib/crtend.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard-uclibc/lib/crtend.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard-uclibc/lib/crtend.o
diff --git a/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard/lib/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard/lib/crtbegin.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard/lib/crtbegin.o
diff --git a/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard/lib/crtend.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard/lib/crtend.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard/lib/crtend.o
diff --git a/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard/lib32/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard/lib32/crtbegin.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard/lib32/crtbegin.o
diff --git a/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard/lib32/crtend.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard/lib32/crtend.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard/lib32/crtend.o
diff --git a/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard/lib64/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard/lib64/crtbegin.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard/lib64/crtbegin.o
diff --git a/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard/lib64/crtend.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard/lib64/crtend.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-hard/lib64/crtend.o
diff --git a/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-soft/lib/crtbegin.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-soft/lib/crtbegin.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-soft/lib/crtbegin.o
diff --git a/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-soft/lib/crtend.o b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-soft/lib/crtend.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/lib/gcc/mips-mti-linux-gnu/4.9.2/mipsel-r2-soft/lib/crtend.o
diff --git a/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/bin/.keep b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/bin/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/bin/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/micromipsel-r2-hard-nan2008/lib/.keep b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/micromipsel-r2-hard-nan2008/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/micromipsel-r2-hard-nan2008/lib/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/micromipsel-r2-soft/lib/.keep b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/micromipsel-r2-soft/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/micromipsel-r2-soft/lib/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mips-r2-hard-nan2008-uclibc/lib/.keep b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mips-r2-hard-nan2008-uclibc/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mips-r2-hard-nan2008-uclibc/lib/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mips-r2-hard-nan2008/lib/.keep b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mips-r2-hard-nan2008/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mips-r2-hard-nan2008/lib/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mips-r2-hard-uclibc/lib/.keep b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mips-r2-hard-uclibc/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mips-r2-hard-uclibc/lib/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mips-r2-hard/lib/.keep b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mips-r2-hard/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mips-r2-hard/lib/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mips-r2-hard/lib32/.keep b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mips-r2-hard/lib32/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mips-r2-hard/lib32/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mips-r2-hard/lib64/.keep b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mips-r2-hard/lib64/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mips-r2-hard/lib64/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mips-r2-soft/lib/.keep b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mips-r2-soft/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mips-r2-soft/lib/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mipsel-r2-hard-nan2008-uclibc/lib/.keep b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mipsel-r2-hard-nan2008-uclibc/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mipsel-r2-hard-nan2008-uclibc/lib/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mipsel-r2-hard-nan2008/lib/.keep b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mipsel-r2-hard-nan2008/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mipsel-r2-hard-nan2008/lib/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mipsel-r2-hard-uclibc/lib/.keep b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mipsel-r2-hard-uclibc/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mipsel-r2-hard-uclibc/lib/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mipsel-r2-hard/lib/.keep b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mipsel-r2-hard/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mipsel-r2-hard/lib/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mipsel-r2-hard/lib32/.keep b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mipsel-r2-hard/lib32/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mipsel-r2-hard/lib32/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mipsel-r2-hard/lib64/.keep b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mipsel-r2-hard/lib64/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mipsel-r2-hard/lib64/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mipsel-r2-soft/lib/.keep b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mipsel-r2-soft/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/include/c++/4.9.2/mipsel-r2-soft/lib/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/micromipsel-r2-hard-nan2008/lib/.keep b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/micromipsel-r2-hard-nan2008/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/micromipsel-r2-hard-nan2008/lib/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/micromipsel-r2-soft/lib/.keep b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/micromipsel-r2-soft/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/micromipsel-r2-soft/lib/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mips-r2-hard-nan2008-uclibc/lib/.keep b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mips-r2-hard-nan2008-uclibc/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mips-r2-hard-nan2008-uclibc/lib/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mips-r2-hard-nan2008/lib/.keep b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mips-r2-hard-nan2008/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mips-r2-hard-nan2008/lib/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mips-r2-hard-uclibc/lib/.keep b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mips-r2-hard-uclibc/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mips-r2-hard-uclibc/lib/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mips-r2-hard/lib/.keep b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mips-r2-hard/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mips-r2-hard/lib/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mips-r2-hard/lib32/.keep b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mips-r2-hard/lib32/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mips-r2-hard/lib32/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mips-r2-hard/lib64/.keep b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mips-r2-hard/lib64/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mips-r2-hard/lib64/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mips-r2-soft/lib/.keep b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mips-r2-soft/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mips-r2-soft/lib/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mipsel-r2-hard-nan2008-uclibc/lib/.keep b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mipsel-r2-hard-nan2008-uclibc/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mipsel-r2-hard-nan2008-uclibc/lib/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mipsel-r2-hard-nan2008/lib/.keep b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mipsel-r2-hard-nan2008/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mipsel-r2-hard-nan2008/lib/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mipsel-r2-hard-uclibc/lib/.keep b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mipsel-r2-hard-uclibc/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mipsel-r2-hard-uclibc/lib/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mipsel-r2-hard/lib/.keep b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mipsel-r2-hard/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mipsel-r2-hard/lib/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mipsel-r2-hard/lib32/.keep b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mipsel-r2-hard/lib32/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mipsel-r2-hard/lib32/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mipsel-r2-hard/lib64/.keep b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mipsel-r2-hard/lib64/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mipsel-r2-hard/lib64/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mipsel-r2-soft/lib/.keep b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mipsel-r2-soft/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/mips-mti-linux-gnu/lib/mipsel-r2-soft/lib/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-hard-nan2008/lib/.keep b/test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-hard-nan2008/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-hard-nan2008/lib/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-hard-nan2008/usr/include/.keep b/test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-hard-nan2008/usr/include/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-hard-nan2008/usr/include/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-hard-nan2008/usr/lib/.keep b/test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-hard-nan2008/usr/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-hard-nan2008/usr/lib/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-hard-nan2008/usr/lib/crt1.o b/test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-hard-nan2008/usr/lib/crt1.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-hard-nan2008/usr/lib/crt1.o
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-hard-nan2008/usr/lib/crti.o b/test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-hard-nan2008/usr/lib/crti.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-hard-nan2008/usr/lib/crti.o
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-hard-nan2008/usr/lib/crtn.o b/test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-hard-nan2008/usr/lib/crtn.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-hard-nan2008/usr/lib/crtn.o
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-soft/lib/.keep b/test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-soft/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-soft/lib/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-soft/usr/include/.keep b/test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-soft/usr/include/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-soft/usr/include/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-soft/usr/lib/.keep b/test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-soft/usr/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-soft/usr/lib/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-soft/usr/lib/crt1.o b/test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-soft/usr/lib/crt1.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-soft/usr/lib/crt1.o
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-soft/usr/lib/crti.o b/test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-soft/usr/lib/crti.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-soft/usr/lib/crti.o
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-soft/usr/lib/crtn.o b/test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-soft/usr/lib/crtn.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/micromipsel-r2-soft/usr/lib/crtn.o
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008-uclibc/lib/.keep b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008-uclibc/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008-uclibc/lib/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008-uclibc/usr/include/.keep b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008-uclibc/usr/include/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008-uclibc/usr/include/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008-uclibc/usr/lib/.keep b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008-uclibc/usr/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008-uclibc/usr/lib/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008-uclibc/usr/lib/crt1.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008-uclibc/usr/lib/crt1.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008-uclibc/usr/lib/crt1.o
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008-uclibc/usr/lib/crti.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008-uclibc/usr/lib/crti.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008-uclibc/usr/lib/crti.o
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008-uclibc/usr/lib/crtn.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008-uclibc/usr/lib/crtn.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008-uclibc/usr/lib/crtn.o
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008/lib/.keep b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008/lib/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008/usr/include/.keep b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008/usr/include/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008/usr/include/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008/usr/lib/.keep b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008/usr/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008/usr/lib/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008/usr/lib/crt1.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008/usr/lib/crt1.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008/usr/lib/crt1.o
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008/usr/lib/crti.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008/usr/lib/crti.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008/usr/lib/crti.o
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008/usr/lib/crtn.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008/usr/lib/crtn.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-nan2008/usr/lib/crtn.o
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-uclibc/lib/.keep b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-uclibc/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-uclibc/lib/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-uclibc/usr/include/.keep b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-uclibc/usr/include/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-uclibc/usr/include/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-uclibc/usr/lib/.keep b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-uclibc/usr/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-uclibc/usr/lib/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-uclibc/usr/lib/crt1.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-uclibc/usr/lib/crt1.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-uclibc/usr/lib/crt1.o
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-uclibc/usr/lib/crti.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-uclibc/usr/lib/crti.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-uclibc/usr/lib/crti.o
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-uclibc/usr/lib/crtn.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-uclibc/usr/lib/crtn.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard-uclibc/usr/lib/crtn.o
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/lib/.keep b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/lib/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/lib32/.keep b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/lib32/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/lib32/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/lib64/.keep b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/lib64/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/lib64/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/usr/include/.keep b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/usr/include/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/usr/include/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/usr/lib/.keep b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/usr/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/usr/lib/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/usr/lib/crt1.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/usr/lib/crt1.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/usr/lib/crt1.o
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/usr/lib/crti.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/usr/lib/crti.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/usr/lib/crti.o
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/usr/lib/crtn.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/usr/lib/crtn.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/usr/lib/crtn.o
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/usr/lib32/crt1.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/usr/lib32/crt1.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/usr/lib32/crt1.o
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/usr/lib32/crti.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/usr/lib32/crti.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/usr/lib32/crti.o
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/usr/lib32/crtn.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/usr/lib32/crtn.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/usr/lib32/crtn.o
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/usr/lib64/crt1.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/usr/lib64/crt1.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/usr/lib64/crt1.o
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/usr/lib64/crti.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/usr/lib64/crti.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/usr/lib64/crti.o
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/usr/lib64/crtn.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/usr/lib64/crtn.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-hard/usr/lib64/crtn.o
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-soft/lib/.keep b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-soft/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-soft/lib/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-soft/usr/include/.keep b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-soft/usr/include/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-soft/usr/include/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-soft/usr/lib/.keep b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-soft/usr/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-soft/usr/lib/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-soft/usr/lib/crt1.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-soft/usr/lib/crt1.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-soft/usr/lib/crt1.o
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-soft/usr/lib/crti.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-soft/usr/lib/crti.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-soft/usr/lib/crti.o
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-soft/usr/lib/crtn.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-soft/usr/lib/crtn.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mips-r2-soft/usr/lib/crtn.o
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008-uclibc/lib/.keep b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008-uclibc/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008-uclibc/lib/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008-uclibc/usr/include/.keep b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008-uclibc/usr/include/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008-uclibc/usr/include/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008-uclibc/usr/lib/.keep b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008-uclibc/usr/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008-uclibc/usr/lib/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008-uclibc/usr/lib/crt1.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008-uclibc/usr/lib/crt1.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008-uclibc/usr/lib/crt1.o
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008-uclibc/usr/lib/crti.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008-uclibc/usr/lib/crti.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008-uclibc/usr/lib/crti.o
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008-uclibc/usr/lib/crtn.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008-uclibc/usr/lib/crtn.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008-uclibc/usr/lib/crtn.o
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008/lib/.keep b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008/lib/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008/usr/include/.keep b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008/usr/include/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008/usr/include/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008/usr/lib/.keep b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008/usr/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008/usr/lib/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008/usr/lib/crt1.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008/usr/lib/crt1.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008/usr/lib/crt1.o
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008/usr/lib/crti.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008/usr/lib/crti.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008/usr/lib/crti.o
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008/usr/lib/crtn.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008/usr/lib/crtn.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-nan2008/usr/lib/crtn.o
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-uclibc/lib/.keep b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-uclibc/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-uclibc/lib/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-uclibc/usr/include/.keep b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-uclibc/usr/include/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-uclibc/usr/include/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-uclibc/usr/lib/.keep b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-uclibc/usr/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-uclibc/usr/lib/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-uclibc/usr/lib/crt1.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-uclibc/usr/lib/crt1.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-uclibc/usr/lib/crt1.o
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-uclibc/usr/lib/crti.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-uclibc/usr/lib/crti.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-uclibc/usr/lib/crti.o
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-uclibc/usr/lib/crtn.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-uclibc/usr/lib/crtn.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard-uclibc/usr/lib/crtn.o
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/lib/.keep b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/lib/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/lib32/.keep b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/lib32/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/lib32/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/lib64/.keep b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/lib64/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/lib64/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/usr/include/.keep b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/usr/include/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/usr/include/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/usr/lib/.keep b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/usr/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/usr/lib/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/usr/lib/crt1.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/usr/lib/crt1.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/usr/lib/crt1.o
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/usr/lib/crti.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/usr/lib/crti.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/usr/lib/crti.o
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/usr/lib/crtn.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/usr/lib/crtn.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/usr/lib/crtn.o
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/usr/lib32/crt1.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/usr/lib32/crt1.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/usr/lib32/crt1.o
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/usr/lib32/crti.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/usr/lib32/crti.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/usr/lib32/crti.o
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/usr/lib32/crtn.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/usr/lib32/crtn.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/usr/lib32/crtn.o
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/usr/lib64/crt1.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/usr/lib64/crt1.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/usr/lib64/crt1.o
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/usr/lib64/crti.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/usr/lib64/crti.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/usr/lib64/crti.o
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/usr/lib64/crtn.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/usr/lib64/crtn.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-hard/usr/lib64/crtn.o
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-soft/lib/.keep b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-soft/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-soft/lib/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-soft/usr/include/.keep b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-soft/usr/include/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-soft/usr/include/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-soft/usr/lib/.keep b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-soft/usr/lib/.keep
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-soft/usr/lib/.keep
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-soft/usr/lib/crt1.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-soft/usr/lib/crt1.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-soft/usr/lib/crt1.o
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-soft/usr/lib/crti.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-soft/usr/lib/crti.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-soft/usr/lib/crti.o
diff --git a/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-soft/usr/lib/crtn.o b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-soft/usr/lib/crtn.o
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/mips_mti_tree/sysroot/mipsel-r2-soft/usr/lib/crtn.o
diff --git a/test/Driver/Inputs/pchfile.cpp b/test/Driver/Inputs/pchfile.cpp
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/pchfile.cpp
diff --git a/test/Driver/Inputs/pchfile.h b/test/Driver/Inputs/pchfile.h
new file mode 100644
index 0000000000000..1aafaeebe8102
--- /dev/null
+++ b/test/Driver/Inputs/pchfile.h
@@ -0,0 +1,3 @@
+#if defined(ERR_HEADER)
+#error nope1
+#endif
diff --git a/test/Driver/Inputs/resource_dir/vtables_blacklist.txt b/test/Driver/Inputs/resource_dir/vtables_blacklist.txt
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Driver/Inputs/resource_dir/vtables_blacklist.txt
diff --git a/test/Driver/aarch64-cpus.c b/test/Driver/aarch64-cpus.c
index 7b0fac442472a..3a9116dde84a8 100644
--- a/test/Driver/aarch64-cpus.c
+++ b/test/Driver/aarch64-cpus.c
@@ -74,6 +74,20 @@
 // RUN: %clang -target arm64 -mlittle-endian -mtune=cortex-a72 -### -c %s 2>&1 | FileCheck -check-prefix=ARM64-CA72 %s
 // ARM64-CA72: "-cc1"{{.*}} "-triple" "arm64{{.*}}" "-target-cpu" "cortex-a72"
 
+// RUN: %clang -target aarch64 -mcpu=cortex-a73 -### -c %s 2>&1 | FileCheck -check-prefix=CORTEX-A73 %s
+// RUN: %clang -target aarch64 -mlittle-endian -mcpu=cortex-a73 -### -c %s 2>&1 | FileCheck -check-prefix=CORTEX-A73 %s
+// RUN: %clang -target aarch64_be -mlittle-endian -mcpu=cortex-a73 -### -c %s 2>&1 | FileCheck -check-prefix=CORTEX-A73 %s
+// RUN: %clang -target aarch64 -mtune=cortex-a73 -### -c %s 2>&1 | FileCheck -check-prefix=CORTEX-A73 %s
+// RUN: %clang -target aarch64 -mlittle-endian -mtune=cortex-a73 -### -c %s 2>&1 | FileCheck -check-prefix=CORTEX-A73 %s
+// RUN: %clang -target aarch64_be -mlittle-endian -mtune=cortex-a73 -### -c %s 2>&1 | FileCheck -check-prefix=CORTEX-A73 %s
+// CORTEX-A73: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "cortex-a73"
+
+// RUN: %clang -target arm64 -mcpu=cortex-a73 -### -c %s 2>&1 | FileCheck -check-prefix=ARM64-CORTEX-A73 %s
+// RUN: %clang -target arm64 -mlittle-endian -mcpu=cortex-a73 -### -c %s 2>&1 | FileCheck -check-prefix=ARM64-CORTEX-A73 %s
+// RUN: %clang -target arm64 -mtune=cortex-a73 -### -c %s 2>&1 | FileCheck -check-prefix=ARM64-CORTEX-A73 %s
+// RUN: %clang -target arm64 -mlittle-endian -mtune=cortex-a73 -### -c %s 2>&1 | FileCheck -check-prefix=ARM64-CORTEX-A73 %s
+// ARM64-CORTEX-A73: "-cc1"{{.*}} "-triple" "arm64{{.*}}" "-target-cpu" "cortex-a73"
+
 // RUN: %clang -target aarch64 -mcpu=exynos-m1 -### -c %s 2>&1 | FileCheck -check-prefix=M1 %s
 // RUN: %clang -target aarch64 -mlittle-endian -mcpu=exynos-m1 -### -c %s 2>&1 | FileCheck -check-prefix=M1 %s
 // RUN: %clang -target aarch64_be -mlittle-endian -mcpu=exynos-m1 -### -c %s 2>&1 | FileCheck -check-prefix=M1 %s
@@ -88,6 +102,32 @@
 // RUN: %clang -target arm64 -mlittle-endian -mtune=exynos-m1 -### -c %s 2>&1 | FileCheck -check-prefix=ARM64-M1 %s
 // ARM64-M1: "-cc1"{{.*}} "-triple" "arm64{{.*}}" "-target-cpu" "exynos-m1"
 
+// RUN: %clang -target aarch64 -mcpu=kryo -### -c %s 2>&1 | FileCheck -check-prefix=KRYO %s
+// RUN: %clang -target aarch64 -mlittle-endian -mcpu=kryo -### -c %s 2>&1 | FileCheck -check-prefix=KRYO %s
+// RUN: %clang -target aarch64 -mtune=kryo -### -c %s 2>&1 | FileCheck -check-prefix=KRYO %s
+// RUN: %clang -target aarch64 -mlittle-endian -mtune=kryo -### -c %s 2>&1 | FileCheck -check-prefix=KRYO %s
+// KRYO: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "kryo"
+
+// RUN: %clang -target arm64 -mcpu=kryo -### -c %s 2>&1 | FileCheck -check-prefix=ARM64-KRYO %s
+// RUN: %clang -target arm64 -mlittle-endian -mcpu=kryo -### -c %s 2>&1 | FileCheck -check-prefix=ARM64-KRYO %s
+// RUN: %clang -target arm64 -mtune=kryo -### -c %s 2>&1 | FileCheck -check-prefix=ARM64-KRYO %s
+// RUN: %clang -target arm64 -mlittle-endian -mtune=kryo -### -c %s 2>&1 | FileCheck -check-prefix=ARM64-KRYO %s
+// ARM64-KRYO: "-cc1"{{.*}} "-triple" "arm64{{.*}}" "-target-cpu" "kryo"
+
+// RUN: %clang -target aarch64 -mcpu=vulcan -### -c %s 2>&1 | FileCheck -check-prefix=VULCAN %s
+// RUN: %clang -target aarch64 -mlittle-endian -mcpu=vulcan -### -c %s 2>&1 | FileCheck -check-prefix=VULCAN %s
+// RUN: %clang -target aarch64_be -mlittle-endian -mcpu=vulcan -### -c %s 2>&1 | FileCheck -check-prefix=VULCAN %s
+// RUN: %clang -target aarch64 -mtune=vulcan -### -c %s 2>&1 | FileCheck -check-prefix=VULCAN %s
+// RUN: %clang -target aarch64 -mlittle-endian -mtune=vulcan -### -c %s 2>&1 | FileCheck -check-prefix=VULCAN %s
+// RUN: %clang -target aarch64_be -mlittle-endian -mtune=vulcan -### -c %s 2>&1 | FileCheck -check-prefix=VULCAN %s
+// VULCAN: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "vulcan"
+
+// RUN: %clang -target arm64 -mcpu=vulcan -### -c %s 2>&1 | FileCheck -check-prefix=ARM64-VULCAN %s
+// RUN: %clang -target arm64 -mlittle-endian -mcpu=vulcan -### -c %s 2>&1 | FileCheck -check-prefix=ARM64-VULCAN %s
+// RUN: %clang -target arm64 -mtune=vulcan -### -c %s 2>&1 | FileCheck -check-prefix=ARM64-VULCAN %s
+// RUN: %clang -target arm64 -mlittle-endian -mtune=vulcan -### -c %s 2>&1 | FileCheck -check-prefix=ARM64-VULCAN %s
+// ARM64-VULCAN: "-cc1"{{.*}} "-triple" "arm64{{.*}}" "-target-cpu" "vulcan"
+
 // RUN: %clang -target aarch64_be -### -c %s 2>&1 | FileCheck -check-prefix=GENERIC-BE %s
 // RUN: %clang -target aarch64 -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=GENERIC-BE %s
 // RUN: %clang -target aarch64_be -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=GENERIC-BE %s
@@ -125,6 +165,14 @@
 // RUN: %clang -target aarch64_be -mbig-endian -mtune=cortex-a72 -### -c %s 2>&1 | FileCheck -check-prefix=CA72-BE %s
 // CA72-BE: "-cc1"{{.*}} "-triple" "aarch64_be{{.*}}" "-target-cpu" "cortex-a72"
 
+// RUN: %clang -target aarch64_be -mcpu=cortex-a73 -### -c %s 2>&1 | FileCheck -check-prefix=CORTEX-A73-BE %s
+// RUN: %clang -target aarch64 -mbig-endian -mcpu=cortex-a73 -### -c %s 2>&1 | FileCheck -check-prefix=CORTEX-A73-BE %s
+// RUN: %clang -target aarch64_be -mbig-endian -mcpu=cortex-a73 -### -c %s 2>&1 | FileCheck -check-prefix=CORTEX-A73-BE %s
+// RUN: %clang -target aarch64_be -mtune=cortex-a73 -### -c %s 2>&1 | FileCheck -check-prefix=CORTEX-A73-BE %s
+// RUN: %clang -target aarch64 -mbig-endian -mtune=cortex-a73 -### -c %s 2>&1 | FileCheck -check-prefix=CORTEX-A73-BE %s
+// RUN: %clang -target aarch64_be -mbig-endian -mtune=cortex-a73 -### -c %s 2>&1 | FileCheck -check-prefix=CORTEX-A73-BE %s
+// CORTEX-A73-BE: "-cc1"{{.*}} "-triple" "aarch64_be{{.*}}" "-target-cpu" "cortex-a73"
+
 // RUN: %clang -target aarch64_be -mcpu=exynos-m1 -### -c %s 2>&1 | FileCheck -check-prefix=M1-BE %s
 // RUN: %clang -target aarch64 -mbig-endian -mcpu=exynos-m1 -### -c %s 2>&1 | FileCheck -check-prefix=M1-BE %s
 // RUN: %clang -target aarch64_be -mbig-endian -mcpu=exynos-m1 -### -c %s 2>&1 | FileCheck -check-prefix=M1-BE %s
@@ -133,10 +181,21 @@
 // RUN: %clang -target aarch64_be -mbig-endian -mtune=exynos-m1 -### -c %s 2>&1 | FileCheck -check-prefix=M1-BE %s
 // M1-BE: "-cc1"{{.*}} "-triple" "aarch64_be{{.*}}" "-target-cpu" "exynos-m1"
 
+// RUN: %clang -target aarch64_be -mcpu=vulcan -### -c %s 2>&1 | FileCheck -check-prefix=VULCAN-BE %s
+// RUN: %clang -target aarch64 -mbig-endian -mcpu=vulcan -### -c %s 2>&1 | FileCheck -check-prefix=VULCAN-BE %s
+// RUN: %clang -target aarch64_be -mbig-endian -mcpu=vulcan -### -c %s 2>&1 | FileCheck -check-prefix=VULCAN-BE %s
+// RUN: %clang -target aarch64_be -mtune=vulcan -### -c %s 2>&1 | FileCheck -check-prefix=VULCAN-BE %s
+// RUN: %clang -target aarch64 -mbig-endian -mtune=vulcan -### -c %s 2>&1 | FileCheck -check-prefix=VULCAN-BE %s
+// RUN: %clang -target aarch64_be -mbig-endian -mtune=vulcan -### -c %s 2>&1 | FileCheck -check-prefix=VULCAN-BE %s
+// VULCAN-BE: "-cc1"{{.*}} "-triple" "aarch64_be{{.*}}" "-target-cpu" "vulcan"
+
 // RUN: %clang -target aarch64 -mcpu=cortex-a57 -mtune=cortex-a53 -### -c %s 2>&1 | FileCheck -check-prefix=MCPU-MTUNE %s
 // RUN: %clang -target aarch64 -mtune=cortex-a53 -mcpu=cortex-a57  -### -c %s 2>&1 | FileCheck -check-prefix=MCPU-MTUNE %s
 // RUN: %clang -target aarch64 -mcpu=cortex-a72 -mtune=cortex-a53 -### -c %s 2>&1 | FileCheck -check-prefix=MCPU-MTUNE %s
 // RUN: %clang -target aarch64 -mtune=cortex-a53 -mcpu=cortex-a72  -### -c %s 2>&1 | FileCheck -check-prefix=MCPU-MTUNE %s
+// RUN: %clang -target aarch64 -mtune=cortex-a53 -mcpu=cortex-a73     -### -c %s 2>&1 | FileCheck -check-prefix=MCPU-MTUNE %s
+// RUN: %clang -target aarch64 -mcpu=vulcan -mtune=cortex-a53 -### -c %s 2>&1 | FileCheck -check-prefix=MCPU-MTUNE %s
+// RUN: %clang -target aarch64 -mtune=cortex-a53 -mcpu=vulcan  -### -c %s 2>&1 | FileCheck -check-prefix=MCPU-MTUNE %s
 // MCPU-MTUNE: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "cortex-a53"
 
 // RUN: %clang -target aarch64 -march=armv8.1a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV81A %s
diff --git a/test/Driver/aarch64-ras.c b/test/Driver/aarch64-ras.c
new file mode 100644
index 0000000000000..fe038eaf159e3
--- /dev/null
+++ b/test/Driver/aarch64-ras.c
@@ -0,0 +1,7 @@
+// RUN: %clang -target aarch64-none-none-eabi -march=armv8a+ras -### -c %s 2>&1 | FileCheck --check-prefix=CHECK-RAS %s
+// RUN: %clang -target aarch64-none-none-eabi -mcpu=generic+ras -### -c %s 2>&1 | FileCheck --check-prefix=CHECK-RAS %s
+// CHECK-RAS: "-target-feature" "+ras"
+
+// RUN: %clang -target aarch64-none-none-eabi -march=armv8a+noras -### -c %s 2>&1 | FileCheck --check-prefix=CHECK-NORAS %s
+// RUN: %clang -target aarch64-none-none-eabi -mcpu=generic+noras -### -c %s 2>&1 | FileCheck --check-prefix=CHECK-NORAS %s
+// CHECK-NORAS: "-target-feature" "-ras"
diff --git a/test/Driver/amdgpu-features.c b/test/Driver/amdgpu-features.c
new file mode 100644
index 0000000000000..235b88f13bec7
--- /dev/null
+++ b/test/Driver/amdgpu-features.c
@@ -0,0 +1,7 @@
+// RUN: %clang -### -target amdgcn -x cl -S -emit-llvm -mcpu=kaveri -mamdgpu-debugger-abi=0.0 %s -o 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-MAMDGPU-DEBUGGER-ABI-0-0 %s
+// CHECK-MAMDGPU-DEBUGGER-ABI-0-0: the clang compiler does not support '-mamdgpu-debugger-abi=0.0'
+
+// RUN: %clang -### -target amdgcn -x cl -S -emit-llvm -mcpu=kaveri -mamdgpu-debugger-abi=1.0 %s -o 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-MAMDGPU-DEBUGGER-ABI-1-0 %s
+// CHECK-MAMDGPU-DEBUGGER-ABI-1-0: "-target-feature" "+amdgpu-debugger-insert-nops" "-target-feature" "+amdgpu-debugger-reserve-regs" "-target-feature" "+amdgpu-debugger-emit-prologue"
diff --git a/test/Driver/amdgpu-toolchain.c b/test/Driver/amdgpu-toolchain.c
index c84a154c7c181..52a71975b7f9a 100644
--- a/test/Driver/amdgpu-toolchain.c
+++ b/test/Driver/amdgpu-toolchain.c
@@ -1,3 +1,6 @@
 // RUN: %clang -### -target amdgcn--amdhsa -x assembler -mcpu=kaveri %s 2>&1 | FileCheck -check-prefix=AS_LINK %s
 // AS_LINK: clang{{.*}} "-cc1as"
-// AS_LINK: ld.lld{{.*}}
+// AS_LINK: ld.lld{{.*}} "-shared"
+
+// RUN: %clang -### -g -target amdgcn--amdhsa -mcpu=kaveri %s 2>&1 | FileCheck -check-prefix=DWARF_VER %s
+// DWARF_VER: "-dwarf-version=2"
diff --git a/test/Driver/android-ndk-standalone.cpp b/test/Driver/android-ndk-standalone.cpp
new file mode 100644
index 0000000000000..1ca044d58b861
--- /dev/null
+++ b/test/Driver/android-ndk-standalone.cpp
@@ -0,0 +1,283 @@
+// Test header and library paths when Clang is used with Android standalone
+// toolchain.
+//
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:     -target arm-linux-androideabi -stdlib=libstdc++ \
+// RUN:     -B%S/Inputs/basic_android_ndk_tree \
+// RUN:     --sysroot=%S/Inputs/basic_android_ndk_tree/sysroot \
+// RUN:   | FileCheck  %s
+// CHECK: {{.*}}clang{{.*}}" "-cc1"
+// CHECK: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
+// CHECK: "-internal-isystem" "{{.*}}/include/c++/4.9"
+// CHECK-NOT: "-internal-isystem" "{{.*}}/include/c++/4.9/arm-linux-androideabi/armv7-a/thumb"
+// CHECK-NOT: "-internal-isystem" "{{.*}}/include/c++/4.9/arm-linux-androideabi/armv7-a"
+// CHECK-NOT: "-internal-isystem" "{{.*}}/include/c++/4.9/arm-linux-androideabi/thumb"
+// CHECK: "-internal-isystem" "{{.*}}/include/c++/4.9/arm-linux-androideabi"
+// CHECK-NOT: "-internal-isystem" "{{.*}}/include/c++/4.9/arm-linux-androideabi/armv7-a/thumb"
+// CHECK-NOT: "-internal-isystem" "{{.*}}/include/c++/4.9/arm-linux-androideabi/armv7-a"
+// CHECK-NOT: "-internal-isystem" "{{.*}}/include/c++/4.9/arm-linux-androideabi/thumb"
+// CHECK: "-internal-isystem" "{{.*}}/include/c++/4.9/backward"
+// CHECK: "-internal-isystem" "{{.*}}/sysroot/usr/local/include"
+// CHECK: "-internal-isystem" "[[RESOURCE_DIR]]{{(/|\\\\)}}include"
+// CHECK: "-internal-externc-isystem" "{{.*}}/sysroot/include"
+// CHECK: "-internal-externc-isystem" "{{.*}}/sysroot/usr/include"
+// CHECK: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/armv7-a/thumb"
+// CHECK-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/armv7-a"
+// CHECK-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/thumb"
+// CHECK: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9"
+// CHECK-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/armv7-a/thumb"
+// CHECK-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/armv7-a"
+// CHECK-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/thumb"
+// CHECK-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/../{{[^ ]*}}/lib/armv7-a/thumb"
+// CHECK-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/../{{[^ ]*}}/lib/armv7-a"
+// CHECK-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/../{{[^ ]*}}/lib/thumb"
+// CHECK: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/../{{[^ ]*}}/arm-linux-androideabi/lib"
+// CHECK-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/../{{[^ ]*}}/lib/armv7-a/thumb"
+// CHECK-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/../{{[^ ]*}}/lib/armv7-a"
+// CHECK-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/../{{[^ ]*}}/lib/thumb"
+// CHECK: "-L{{.*}}/sysroot/usr/lib"
+//
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:     -target armv7a-none-linux-androideabi -stdlib=libstdc++ \
+// RUN:     -B%S/Inputs/basic_android_ndk_tree \
+// RUN:     --sysroot=%S/Inputs/basic_android_ndk_tree/sysroot \
+// RUN:   | FileCheck  --check-prefix=CHECK-ARMV7 %s
+// CHECK-ARMV7: {{.*}}clang{{.*}}" "-cc1"
+// CHECK-ARMV7: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
+// CHECK-ARMV7: "-internal-isystem" "{{.*}}/include/c++/4.9"
+// CHECK-ARMV7-NOT: "-internal-isystem" "{{.*}}/include/c++/4.9/arm-linux-androideabi/armv7-a/thumb"
+// CHECK-ARMV7-NOT: "-internal-isystem" "{{.*}}/include/c++/4.9/arm-linux-androideabi/thumb"
+// CHECK-ARMV7-NOT: "-internal-isystem" "{{.*}}/include/c++/4.9/arm-linux-androideabi"
+// CHECK-ARMV7: "-internal-isystem" "{{.*}}/include/c++/4.9/arm-linux-androideabi/armv7-a"
+// CHECK-ARMV7-NOT: "-internal-isystem" "{{.*}}/include/c++/4.9/arm-linux-androideabi/armv7-a/thumb"
+// CHECK-ARMV7-NOT: "-internal-isystem" "{{.*}}/include/c++/4.9/arm-linux-androideabi/thumb"
+// CHECK-ARMV7-NOT: "-internal-isystem" "{{.*}}/include/c++/4.9/arm-linux-androideabi"
+// CHECK-ARMV7: "-internal-isystem" "{{.*}}/include/c++/4.9/backward"
+// CHECK-ARMV7: "-internal-isystem" "{{.*}}/sysroot/usr/local/include"
+// CHECK-ARMV7: "-internal-isystem" "[[RESOURCE_DIR]]{{(/|\\\\)}}include"
+// CHECK-ARMV7: "-internal-externc-isystem" "{{.*}}/sysroot/include"
+// CHECK-ARMV7: "-internal-externc-isystem" "{{.*}}/sysroot/usr/include"
+// CHECK-ARMV7: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-ARMV7-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/armv7-a/thumb"
+// CHECK-ARMV7-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/thumb"
+// CHECK-ARMV7-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9"
+// CHECK-ARMV7: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/armv7-a"
+// CHECK-ARMV7-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/armv7-a/thumb"
+// CHECK-ARMV7-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/thumb"
+// CHECK-ARMV7-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9"
+// CHECK-ARMV7-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/../{{[^ ]*}}/lib/thumb"
+// CHECK-ARMV7-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/../{{[^ ]*}}/lib/armv7-a/thumb"
+// CHECK-ARMV7-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/../{{[^ ]*}}/lib"
+// CHECK-ARMV7: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/../{{[^ ]*}}/lib/armv7-a"
+// CHECK-ARMV7-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/../{{[^ ]*}}/lib/thumb"
+// CHECK-ARMV7-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/../{{[^ ]*}}/lib/armv7-a/thumb"
+// CHECK-ARMV7-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/../{{[^ ]*}}/lib"
+// CHECK-ARMV7: "-L{{.*}}/sysroot/usr/lib"
+//
+// Other flags that can trigger armv7 mode.
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:     -target arm-linux-androideabi -stdlib=libstdc++ \
+// RUN:     -march=armv7 \
+// RUN:     -B%S/Inputs/basic_android_ndk_tree \
+// RUN:     --sysroot=%S/Inputs/basic_android_ndk_tree/sysroot \
+// RUN:   | FileCheck  --check-prefix=CHECK-ARMV7 %s
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:     -target arm-linux-androideabi -stdlib=libstdc++ \
+// RUN:     -march=armv7a \
+// RUN:     -B%S/Inputs/basic_android_ndk_tree \
+// RUN:     --sysroot=%S/Inputs/basic_android_ndk_tree/sysroot \
+// RUN:   | FileCheck  --check-prefix=CHECK-ARMV7 %s
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:     -target arm-linux-androideabi -stdlib=libstdc++ \
+// RUN:     -march=armv7-a \
+// RUN:     -B%S/Inputs/basic_android_ndk_tree \
+// RUN:     --sysroot=%S/Inputs/basic_android_ndk_tree/sysroot \
+// RUN:   | FileCheck  --check-prefix=CHECK-ARMV7 %s
+//
+// ARM thumb mode.
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:     -target arm-linux-androideabi -stdlib=libstdc++ \
+// RUN:     -mthumb \
+// RUN:     -B%S/Inputs/basic_android_ndk_tree \
+// RUN:     --sysroot=%S/Inputs/basic_android_ndk_tree/sysroot \
+// RUN:   | FileCheck  --check-prefix=CHECK-THUMB %s
+// CHECK-THUMB: {{.*}}clang{{.*}}" "-cc1"
+// CHECK-THUMB: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
+// CHECK-THUMB: "-internal-isystem" "{{.*}}/include/c++/4.9"
+// CHECK-THUMB-NOT: "-internal-isystem" "{{.*}}/include/c++/4.9/arm-linux-androideabi/armv7/thumb"
+// CHECK-THUMB-NOT: "-internal-isystem" "{{.*}}/include/c++/4.9/arm-linux-androideabi/armv7"
+// CHECK-THUMB-NOT: "-internal-isystem" "{{.*}}/include/c++/4.9/arm-linux-androideabi"
+// CHECK-THUMB: "-internal-isystem" "{{.*}}/include/c++/4.9/arm-linux-androideabi/thumb"
+// CHECK-THUMB-NOT: "-internal-isystem" "{{.*}}/include/c++/4.9/arm-linux-androideabi/armv7/thumb"
+// CHECK-THUMB-NOT: "-internal-isystem" "{{.*}}/include/c++/4.9/arm-linux-androideabi/armv7"
+// CHECK-THUMB-NOT: "-internal-isystem" "{{.*}}/include/c++/4.9/arm-linux-androideabi"
+// CHECK-THUMB: "-internal-isystem" "{{.*}}/include/c++/4.9/backward"
+// CHECK-THUMB: "-internal-isystem" "{{.*}}/sysroot/usr/local/include"
+// CHECK-THUMB: "-internal-isystem" "[[RESOURCE_DIR]]{{(/|\\\\)}}include"
+// CHECK-THUMB: "-internal-externc-isystem" "{{.*}}/sysroot/include"
+// CHECK-THUMB: "-internal-externc-isystem" "{{.*}}/sysroot/usr/include"
+// CHECK-THUMB: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-THUMB-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/armv7-a/thumb"
+// CHECK-THUMB-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/armv7-a"
+// CHECK-THUMB-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9"
+// CHECK-THUMB: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/thumb"
+// CHECK-THUMB-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/armv7-a/thumb"
+// CHECK-THUMB-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/armv7-a"
+// CHECK-THUMB-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9"
+// CHECK-THUMB-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/../{{[^ ]*}}/lib/armv7"
+// CHECK-THUMB-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/../{{[^ ]*}}/lib/armv7/thumb"
+// CHECK-THUMB-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/../{{[^ ]*}}/lib"
+// CHECK-THUMB: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/../{{[^ ]*}}/lib/thumb"
+// CHECK-THUMB-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/../{{[^ ]*}}/lib/armv7"
+// CHECK-THUMB-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/../{{[^ ]*}}/lib/armv7/thumb"
+// CHECK-THUMB-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/../{{[^ ]*}}/lib"
+// CHECK-THUMB: "-L{{.*}}/sysroot/usr/lib"
+//
+// ARM V7 thumb mode.
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:     -target arm-linux-androideabi -stdlib=libstdc++ \
+// RUN:     -march=armv7-a -mthumb \
+// RUN:     -B%S/Inputs/basic_android_ndk_tree \
+// RUN:     --sysroot=%S/Inputs/basic_android_ndk_tree/sysroot \
+// RUN:   | FileCheck  --check-prefix=CHECK-ARMV7THUMB %s
+// CHECK-ARMV7THUMB: {{.*}}clang{{.*}}" "-cc1"
+// CHECK-ARMV7THUMB: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
+// CHECK-ARMV7THUMB: "-internal-isystem" "{{.*}}/include/c++/4.9"
+// CHECK-ARMV7THUMB-NOT: "-internal-isystem" "{{.*}}/include/c++/4.9/arm-linux-androideabi/armv7-a"
+// CHECK-ARMV7THUMB-NOT: "-internal-isystem" "{{.*}}/include/c++/4.9/arm-linux-androideabi/thumb"
+// CHECK-ARMV7THUMB-NOT: "-internal-isystem" "{{.*}}/include/c++/4.9/arm-linux-androideabi"
+// CHECK-ARMV7THUMB: "-internal-isystem" "{{.*}}/include/c++/4.9/arm-linux-androideabi/armv7-a/thumb"
+// CHECK-ARMV7THUMB-NOT: "-internal-isystem" "{{.*}}/include/c++/4.9/arm-linux-androideabi/armv7-a"
+// CHECK-ARMV7THUMB-NOT: "-internal-isystem" "{{.*}}/include/c++/4.9/arm-linux-androideabi/thumb"
+// CHECK-ARMV7THUMB-NOT: "-internal-isystem" "{{.*}}/include/c++/4.9/arm-linux-androideabi"
+// CHECK-ARMV7THUMB: "-internal-isystem" "{{.*}}/include/c++/4.9/backward"
+// CHECK-ARMV7THUMB: "-internal-isystem" "{{.*}}/sysroot/usr/local/include"
+// CHECK-ARMV7THUMB: "-internal-isystem" "[[RESOURCE_DIR]]{{(/|\\\\)}}include"
+// CHECK-ARMV7THUMB: "-internal-externc-isystem" "{{.*}}/sysroot/include"
+// CHECK-ARMV7THUMB: "-internal-externc-isystem" "{{.*}}/sysroot/usr/include"
+// CHECK-ARMV7THUMB: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-ARMV7THUMB-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/thumb"
+// CHECK-ARMV7THUMB-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/armv7-a"
+// CHECK-ARMV7THUMB-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9"
+// CHECK-ARMV7THUMB: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/armv7-a/thumb"
+// CHECK-ARMV7THUMB-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/thumb"
+// CHECK-ARMV7THUMB-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/armv7-a"
+// CHECK-ARMV7THUMB-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9"
+// CHECK-ARMV7THUMB-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/../{{[^ ]*}}/lib/thumb"
+// CHECK-ARMV7THUMB-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/../{{[^ ]*}}/lib/armv7-a"
+// CHECK-ARMV7THUMB-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/../{{[^ ]*}}/lib"
+// CHECK-ARMV7THUMB: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/../{{[^ ]*}}/lib/armv7-a/thumb"
+// CHECK-ARMV7THUMB-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/../{{[^ ]*}}/lib/thumb"
+// CHECK-ARMV7THUMB-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/../{{[^ ]*}}/lib/armv7-a"
+// CHECK-ARMV7THUMB-NOT: "-L{{.*}}/lib/gcc/arm-linux-androideabi/4.9/../{{[^ ]*}}/lib"
+// CHECK-ARMV7THUMB: "-L{{.*}}/sysroot/usr/lib"
+//
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:     -target armv7a-none-linux-androideabi -stdlib=libstdc++ \
+// RUN:     -mthumb \
+// RUN:     -B%S/Inputs/basic_android_ndk_tree \
+// RUN:     --sysroot=%S/Inputs/basic_android_ndk_tree/sysroot \
+// RUN:   | FileCheck  --check-prefix=CHECK-ARMV7THUMB %s
+//
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:     -target aarch64-linux-android -stdlib=libstdc++ \
+// RUN:     -B%S/Inputs/basic_android_ndk_tree \
+// RUN:     --sysroot=%S/Inputs/basic_android_ndk_tree/sysroot \
+// RUN:   | FileCheck --check-prefix=CHECK-AARCH64 %s
+// CHECK-AARCH64: {{.*}}clang{{.*}}" "-cc1"
+// CHECK-AARCH64: "-internal-isystem" "{{.*}}/include/c++/4.9"
+// CHECK-AARCH64: "-internal-isystem" "{{.*}}/include/c++/4.9/aarch64-linux-android"
+// CHECK-AARCH64: "-internal-isystem" "{{.*}}/include/c++/4.9/backward"
+// CHECK-AARCH64: "-internal-externc-isystem" "{{.*}}/sysroot/include"
+// CHECK-AARCH64: "-internal-externc-isystem" "{{.*}}/sysroot/usr/include"
+// CHECK-AARCH64: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-AARCH64: "-L{{.*}}/lib/gcc/aarch64-linux-android/4.9"
+// CHECK-AARCH64: "-L{{.*}}/lib/gcc/aarch64-linux-android/4.9/../../../../aarch64-linux-android/lib"
+// CHECK-AARCH64: "-L{{.*}}/sysroot/usr/lib"
+//
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:     -target arm64-linux-android -stdlib=libstdc++ \
+// RUN:     -B%S/Inputs/basic_android_ndk_tree \
+// RUN:     --sysroot=%S/Inputs/basic_android_ndk_tree/sysroot \
+// RUN:   | FileCheck --check-prefix=CHECK-ARM64 %s
+// CHECK-ARM64: {{.*}}clang{{.*}}" "-cc1"
+// CHECK-ARM64: "-internal-isystem" "{{.*}}/include/c++/4.9"
+// CHECK-ARM64: "-internal-isystem" "{{.*}}/include/c++/4.9/aarch64-linux-android"
+// CHECK-ARM64: "-internal-isystem" "{{.*}}/include/c++/4.9/backward"
+// CHECK-ARM64: "-internal-externc-isystem" "{{.*}}/sysroot/include"
+// CHECK-ARM64: "-internal-externc-isystem" "{{.*}}/sysroot/usr/include"
+// CHECK-ARM64: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-ARM64: "-L{{.*}}/lib/gcc/aarch64-linux-android/4.9"
+// CHECK-ARM64: "-L{{.*}}/lib/gcc/aarch64-linux-android/4.9/../../../../aarch64-linux-android/lib"
+// CHECK-ARM64: "-L{{.*}}/sysroot/usr/lib"
+//
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:     -target mipsel-linux-android \
+// RUN:     -mips32 -stdlib=libstdc++ \
+// RUN:     -B%S/Inputs/basic_android_ndk_tree \
+// RUN:     --sysroot=%S/Inputs/basic_android_ndk_tree/sysroot \
+// RUN:   | FileCheck --check-prefix=CHECK-MIPS %s
+// CHECK-MIPS: {{.*}}clang{{.*}}" "-cc1"
+// CHECK-MIPS: "-internal-isystem" "{{.*}}/include/c++/4.9"
+// CHECK-MIPS: "-internal-isystem" "{{.*}}/include/c++/4.9/mipsel-linux-android"
+// CHECK-MIPS: "-internal-isystem" "{{.*}}/include/c++/4.9/backward"
+// CHECK-MIPS: "-internal-externc-isystem" "{{.*}}/sysroot/include"
+// CHECK-MIPS: "-internal-externc-isystem" "{{.*}}/sysroot/usr/include"
+// CHECK-MIPS: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-MIPS: "-L{{.*}}/lib/gcc/mipsel-linux-android/4.9"
+// CHECK-MIPS: "-L{{.*}}/lib/gcc/mipsel-linux-android/4.9/../../../../mipsel-linux-android/lib"
+// CHECK-MIPS: "-L{{.*}}/sysroot/usr/lib"
+//
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:     -target mipsel-linux-android \
+// RUN:     -march=mips32 -mips32r2 -stdlib=libstdc++ \
+// RUN:     -B%S/Inputs/basic_android_ndk_tree \
+// RUN:     --sysroot=%S/Inputs/basic_android_ndk_tree/sysroot \
+// RUN:   | FileCheck --check-prefix=CHECK-MIPSR2 %s
+// CHECK-MIPSR2: {{.*}}clang{{.*}}" "-cc1"
+// CHECK-MIPSR2: "-internal-isystem" "{{.*}}/include/c++/4.9"
+// NOT-YET-CHECK-MIPSR2: "-internal-isystem" "{{.*}}/include/c++/4.9/mipsel-linux-android/mips-r2"
+// CHECK-MIPSR2: "-internal-isystem" "{{.*}}/include/c++/4.9/mipsel-linux-android"
+// CHECK-MIPSR2: "-internal-isystem" "{{.*}}/include/c++/4.9/backward"
+// CHECK-MIPSR2: "-internal-externc-isystem" "{{.*}}/sysroot/include"
+// CHECK-MIPSR2: "-internal-externc-isystem" "{{.*}}/sysroot/usr/include"
+// CHECK-MIPSR2: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-MIPSR2: "-L{{.*}}/lib/gcc/mipsel-linux-android/4.9/mips-r2"
+// CHECK-MIPSR2: "-L{{.*}}/lib/gcc/mipsel-linux-android/4.9/../../../../mipsel-linux-android/lib"
+// CHECK-MIPSR2: "-L{{.*}}/sysroot/usr/lib"
+//
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:     -target mipsel-linux-android \
+// RUN:     -mips32r6 -stdlib=libstdc++ \
+// RUN:     -B%S/Inputs/basic_android_ndk_tree \
+// RUN:     --sysroot=%S/Inputs/basic_android_ndk_tree/sysroot \
+// RUN:   | FileCheck --check-prefix=CHECK-MIPSR6 %s
+// CHECK-MIPSR6: {{.*}}clang{{.*}}" "-cc1"
+// CHECK-MIPSR6: "-internal-isystem" "{{.*}}/include/c++/4.9"
+// NOT-YET-CHECK-MIPSR6: "-internal-isystem" "{{.*}}/include/c++/4.9/mipsel-linux-android/mips-r6"
+// CHECK-MIPSR6: "-internal-isystem" "{{.*}}/include/c++/4.9/mipsel-linux-android"
+// CHECK-MIPSR6: "-internal-isystem" "{{.*}}/include/c++/4.9/backward"
+// CHECK-MIPSR6: "-internal-externc-isystem" "{{.*}}/sysroot/include"
+// CHECK-MIPSR6: "-internal-externc-isystem" "{{.*}}/sysroot/usr/include"
+// CHECK-MIPSR6: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-MIPSR6: "-L{{.*}}/lib/gcc/mipsel-linux-android/4.9/mips-r6"
+// CHECK-MIPSR6: "-L{{.*}}/lib/gcc/mipsel-linux-android/4.9/../../../../mipsel-linux-android/lib"
+// CHECK-MIPSR6: "-L{{.*}}/sysroot/usr/lib"
+//
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:     -target i686-linux-android \
+// RUN:     -stdlib=libstdc++ \
+// RUN:     -B%S/Inputs/basic_android_ndk_tree \
+// RUN:     --sysroot=%S/Inputs/basic_android_ndk_tree/sysroot \
+// RUN:   | FileCheck --check-prefix=CHECK-I686 %s
+// CHECK-I686: {{.*}}clang{{.*}}" "-cc1"
+// CHECK-I686: "-internal-isystem" "{{.*}}/include/c++/4.9"
+// CHECK-I686: "-internal-isystem" "{{.*}}/include/c++/4.9/i686-linux-android"
+// CHECK-I686: "-internal-isystem" "{{.*}}/include/c++/4.9/backward"
+// CHECK-I686: "-internal-externc-isystem" "{{.*}}/sysroot/include"
+// CHECK-I686: "-internal-externc-isystem" "{{.*}}/sysroot/usr/include"
+// CHECK-I686: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-I686: "-L{{.*}}/lib/gcc/i686-linux-android/4.9"
+// CHECK-I686: "-L{{.*}}/lib/gcc/i686-linux-android/4.9/../../../../i686-linux-android/lib"
+// CHECK-I686: "-L{{.*}}/sysroot/usr/lib"
diff --git a/test/Driver/android-standalone.cpp b/test/Driver/android-standalone.cpp
index d563debdf40a0..047f1707c999b 100644
--- a/test/Driver/android-standalone.cpp
+++ b/test/Driver/android-standalone.cpp
@@ -2,7 +2,7 @@
 // toolchain.
 //
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     -target arm-linux-androideabi \
+// RUN:     -target arm-linux-androideabi -stdlib=libstdc++ \
 // RUN:     -B%S/Inputs/basic_android_tree \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot \
 // RUN:   | FileCheck  %s
@@ -17,7 +17,7 @@
 // CHECK: "-L{{.*}}/sysroot/usr/lib"
 //
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     -target aarch64-linux-android \
+// RUN:     -target aarch64-linux-android -stdlib=libstdc++ \
 // RUN:     -B%S/Inputs/basic_android_tree \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot \
 // RUN:   | FileCheck --check-prefix=CHECK-AARCH64 %s
@@ -32,7 +32,7 @@
 // CHECK-AARCH64: "-L{{.*}}/sysroot/usr/lib"
 //
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     -target arm64-linux-android \
+// RUN:     -target arm64-linux-android -stdlib=libstdc++ \
 // RUN:     -B%S/Inputs/basic_android_tree \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot \
 // RUN:   | FileCheck --check-prefix=CHECK-ARM64 %s
@@ -48,7 +48,7 @@
 //
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     -target mipsel-linux-android \
-// RUN:     -mips32 \
+// RUN:     -mips32 -stdlib=libstdc++ \
 // RUN:     -B%S/Inputs/basic_android_tree \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot \
 // RUN:   | FileCheck --check-prefix=CHECK-MIPS %s
@@ -64,7 +64,7 @@
 //
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     -target mipsel-linux-android \
-// RUN:     -march=mips32 -mips32r2 \
+// RUN:     -march=mips32 -mips32r2 -stdlib=libstdc++ \
 // RUN:     -B%S/Inputs/basic_android_tree \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot \
 // RUN:   | FileCheck --check-prefix=CHECK-MIPSR2 %s
@@ -80,7 +80,7 @@
 //
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     -target mipsel-linux-android \
-// RUN:     -mips32 -march=mips32r2 \
+// RUN:     -mips32 -march=mips32r2 -stdlib=libstdc++ \
 // RUN:     -B%S/Inputs/basic_android_tree \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot \
 // RUN:   | FileCheck --check-prefix=CHECK-MIPSR2-A %s
@@ -96,7 +96,7 @@
 //
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     -target mipsel-linux-android \
-// RUN:     -mips32r6 \
+// RUN:     -mips32r6 -stdlib=libstdc++ \
 // RUN:     -B%S/Inputs/basic_android_tree \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree/sysroot \
 // RUN:   | FileCheck --check-prefix=CHECK-MIPSR6 %s
diff --git a/test/Driver/arc.c b/test/Driver/arc.c
index 97d00baf4cc93..0025297b33294 100644
--- a/test/Driver/arc.c
+++ b/test/Driver/arc.c
@@ -3,7 +3,7 @@
 // RUN: not %clang -x objective-c++ -target i386-apple-darwin10 -m32 -fobjc-arc %s -fsyntax-only 2>&1 | FileCheck %s
 // RUN: not %clang -x c -target i386-apple-darwin10 -m32 -fobjc-arc %s -fsyntax-only 2>&1 | FileCheck -check-prefix NOTOBJC %s
 // RUN: not %clang -x c++ -target i386-apple-darwin10 -m32 -fobjc-arc %s -fsyntax-only 2>&1 | FileCheck -check-prefix NOTOBJC %s
-// RUN: not %clang -x objective-c -target x86_64-apple-darwin11 -mmacosx-version-min=10.5 -fobjc-arc %s -fsyntax-only 2>&1 | FileCheck -check-prefix UNSUPPORTED %s
+// RUN: not %clang -x objective-c -target x86_64-apple-darwin11 -mmacosx-version-min=10.5 -fobjc-arc %s -fsyntax-only 2>&1 | FileCheck -check-prefix NOTSUPPORTED %s
 
 // Just to test clang is working.
 # foo
@@ -14,4 +14,4 @@
 // NOTOBJC-NOT: error: -fobjc-arc is not supported on platforms using the legacy runtime
 // NOTOBJC: invalid preprocessing directive
 
-// UNSUPPORTED: error: -fobjc-arc is not supported on versions of OS X prior to 10.6
+// NOTSUPPORTED: error: -fobjc-arc is not supported on versions of OS X prior to 10.6
diff --git a/test/Driver/arch-armv7k.c b/test/Driver/arch-armv7k.c
index f5c33cfd9f587..5ebdd388abda5 100644
--- a/test/Driver/arch-armv7k.c
+++ b/test/Driver/arch-armv7k.c
@@ -5,9 +5,10 @@
 // CHECK-NOT: "-fsjlj-exceptions"
 
 // "thumbv7k-apple-ios" is a bit of a weird triple, but since the backend is
-// going to choose to use sjlj-based exceptions for it, the front-end needs to
+// going to choose to use dwarf-based exceptions for it, the front-end needs to
 // match.
 
-// RUN: %clang -target x86_64-apple-macosx10.9 -arch armv7k -miphoneos-version-min=9.0 -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SJLJ
-// CHECK-SJLJ: "-cc1"{{.*}} "-target-cpu" "cortex-a7"
+// RUN: %clang -target x86_64-apple-macosx10.9 -arch armv7k -miphoneos-version-min=9.0 -c %s -### 2>&1 | FileCheck %s
+
+// RUN: %clang -target x86_64-apple-macosx10.9 -arch armv7 -mwatchos-version-min=9.0 -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SJLJ
 // CHECK-SJLJ: "-fsjlj-exceptions"
diff --git a/test/Driver/arm-abi.c b/test/Driver/arm-abi.c
index 812a849110ec4..897c108048d49 100644
--- a/test/Driver/arm-abi.c
+++ b/test/Driver/arm-abi.c
@@ -28,13 +28,17 @@
 // RUN: %clang -target arm--netbsd-eabihf %s -### -o %t.o 2>&1 \
 // RUN:   | FileCheck -check-prefix=CHECK-AAPCS %s
 
-// Otherwise, ABI is celected based on environment
+// Otherwise, ABI is selected based on environment
 // RUN: %clang -target arm---android %s -### -o %t.o 2>&1 \
 // RUN:   | FileCheck -check-prefix=CHECK-AAPCS-LINUX %s
 // RUN: %clang -target arm---gnueabi %s -### -o %t.o 2>&1 \
 // RUN:   | FileCheck -check-prefix=CHECK-AAPCS-LINUX %s
 // RUN: %clang -target arm---gnueabihf %s -### -o %t.o 2>&1 \
 // RUN:   | FileCheck -check-prefix=CHECK-AAPCS-LINUX %s
+// RUN: %clang -target arm---musleabi %s -### -o %t.o 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-AAPCS-LINUX %s
+// RUN: %clang -target arm---musleabihf %s -### -o %t.o 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-AAPCS-LINUX %s
 // RUN: %clang -target arm---eabi %s -### -o %t.o 2>&1 \
 // RUN:   | FileCheck -check-prefix=CHECK-AAPCS %s
 // RUN: %clang -target arm---eabihf %s -### -o %t.o 2>&1 \
diff --git a/test/Driver/arm-alignment.c b/test/Driver/arm-alignment.c
index 3e21652039563..e0b4946d0a4b1 100644
--- a/test/Driver/arm-alignment.c
+++ b/test/Driver/arm-alignment.c
@@ -83,11 +83,13 @@
 // CHECK-ALIGNED-ARM: "-target-feature" "+strict-align"
 // CHECK-ALIGNED-AARCH64: "-target-feature" "+strict-align"
 
-// Make sure that v6M cores always trigger the unsupported aligned accesses error
-// for all supported architecture triples.
+// Make sure that v6M cores and v8M Baseline always trigger the unsupported
+// aligned accesses error for all supported architecture triples.
 // RUN: not %clang -c -target thumbv6m-none-gnueabi -mcpu=cortex-m0 -munaligned-access %s 2>&1 | \
 // RUN:   FileCheck --check-prefix CHECK-UNALIGN-NOT-SUPPORTED %s
 // RUN: not %clang -c -target thumb-none-gnueabi -mcpu=cortex-m0 -munaligned-access %s 2>&1 | \
 // RUN:   FileCheck --check-prefix CHECK-UNALIGN-NOT-SUPPORTED %s
+// RUN: not %clang -c -target thumbv8m.base-none-gnueabi -munaligned-access %s 2>&1 | \
+// RUN:   FileCheck --check-prefix CHECK-UNALIGN-NOT-SUPPORTED %s
 
-// CHECK-UNALIGN-NOT-SUPPORTED: error: the v6m sub-architecture does not support unaligned accesses
+// CHECK-UNALIGN-NOT-SUPPORTED: error: the {{.*}} sub-architecture does not support unaligned accesses
diff --git a/test/Driver/arm-cortex-cpus.c b/test/Driver/arm-cortex-cpus.c
index 6a4d2d631b367..4783a10596a8f 100644
--- a/test/Driver/arm-cortex-cpus.c
+++ b/test/Driver/arm-cortex-cpus.c
@@ -204,7 +204,7 @@
 // RUN: %clang -mcpu=generic -target armv8.1a -mlittle-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V81A %s
 // RUN: %clang -mcpu=generic -target arm -march=armv8.1a -mlittle-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V81A %s
 // RUN: %clang -mcpu=generic -target arm -mlittle-endian -march=armv8.1-a -mlittle-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V81A %s
-// CHECK-V81A: "-cc1"{{.*}} "-triple" "armv8.1a-{{.*}}" "-target-cpu" "generic" "-target-feature" "+v8.1a"
+// CHECK-V81A: "-cc1"{{.*}} "-triple" "armv8.1a-{{.*}}" "-target-cpu" "generic"
 
 // RUN: %clang -target armebv8.1a -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V81A %s
 // RUN: %clang -target armeb -march=armebv8.1a -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V81A %s
@@ -212,7 +212,7 @@
 // RUN: %clang -target armv8.1a -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V81A %s
 // RUN: %clang -target arm -march=armebv8.1a -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V81A %s
 // RUN: %clang -target arm -march=armebv8.1-a -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V81A %s
-// CHECK-BE-V81A: "-cc1"{{.*}} "-triple" "armebv8.1a-{{.*}}" "-target-cpu" "generic" "-target-feature" "+v8.1a"
+// CHECK-BE-V81A: "-cc1"{{.*}} "-triple" "armebv8.1a-{{.*}}" "-target-cpu" "generic"
 
 // RUN: %clang -target armv8.1a -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V81A-THUMB %s
 // RUN: %clang -target arm -march=armv8.1a -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V81A-THUMB %s
@@ -220,7 +220,7 @@
 // RUN: %clang -target armv8.1a -mlittle-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V81A-THUMB %s
 // RUN: %clang -target arm -march=armv8.1a -mlittle-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V81A-THUMB %s
 // RUN: %clang -target arm -march=armv8.1-a -mlittle-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V81A-THUMB %s
-// CHECK-V81A-THUMB: "-cc1"{{.*}} "-triple" "thumbv8.1a-{{.*}}" "-target-cpu" "generic" "-target-feature" "+v8.1a"
+// CHECK-V81A-THUMB: "-cc1"{{.*}} "-triple" "thumbv8.1a-{{.*}}" "-target-cpu" "generic"
 
 // RUN: %clang -target armebv8.1a -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V81A-THUMB %s
 // RUN: %clang -target armeb -march=armebv8.1a -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V81A-THUMB %s
@@ -228,7 +228,68 @@
 // RUN: %clang -target armv8.1a -mbig-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V81A-THUMB %s
 // RUN: %clang -target arm -march=armebv8.1a -mbig-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V81A-THUMB %s
 // RUN: %clang -target arm -march=armebv8.1-a -mbig-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V81A-THUMB %s
-// CHECK-BE-V81A-THUMB: "-cc1"{{.*}} "-triple" "thumbebv8.1a-{{.*}}" "-target-cpu" "generic" "-target-feature" "+v8.1a"
+// CHECK-BE-V81A-THUMB: "-cc1"{{.*}} "-triple" "thumbebv8.1a-{{.*}}" "-target-cpu" "generic"
+
+// RUN: %clang -target armv8.2a -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V82A %s
+// RUN: %clang -target arm -march=armv8.2a -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V82A %s
+// RUN: %clang -target arm -march=armv8.2-a -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V82A %s
+// RUN: %clang -target arm -march=armv8.2a -mlittle-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V82A %s
+// RUN: %clang -target armv8.2a -mlittle-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V82A %s
+// RUN: %clang -target arm -march=armv8.2a -mlittle-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V82A %s
+// RUN: %clang -target arm -mlittle-endian -march=armv8.2-a -mlittle-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V82A %s
+// CHECK-V82A: "-cc1"{{.*}} "-triple" "armv8.2{{.*}}" "-target-cpu" "generic"
+
+// RUN: %clang -target armebv8.2a -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V82A %s
+// RUN: %clang -target armv8.2a -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V82A %s
+// RUN: %clang -target armeb -march=armebv8.2a -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V82A %s
+// RUN: %clang -target armeb -march=armebv8.2-a -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V82A %s
+// RUN: %clang -target arm -march=armebv8.2a -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V82A %s
+// RUN: %clang -target arm -march=armebv8.2-a -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V82A %s
+// CHECK-BE-V82A: "-cc1"{{.*}} "-triple" "armebv8.2{{.*}}" "-target-cpu" "generic"
+
+// RUN: %clang -target armv8.2a -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V82A-THUMB %s
+// RUN: %clang -target arm -march=armv8.2a -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V82A-THUMB %s
+// RUN: %clang -target arm -march=armv8.2-a -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V82A-THUMB %s
+// RUN: %clang -target armv8.2a -mlittle-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V82A-THUMB %s
+// RUN: %clang -target arm -march=armv8.2a -mlittle-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V82A-THUMB %s
+// RUN: %clang -target arm -march=armv8.2-a -mlittle-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V82A-THUMB %s
+// CHECK-V82A-THUMB: "-cc1"{{.*}} "-triple" "thumbv8.2a-{{.*}}" "-target-cpu" "generic"
+
+// RUN: %clang -target armebv8.2a -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V82A-THUMB %s
+// RUN: %clang -target armeb -march=armebv8.2a -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V82A-THUMB %s
+// RUN: %clang -target armeb -march=armebv8.2-a -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V82A-THUMB %s
+// RUN: %clang -target armv8.2a -mbig-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V82A-THUMB %s
+// RUN: %clang -target arm -march=armebv8.2a -mbig-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V82A-THUMB %s
+// RUN: %clang -target arm -march=armebv8.2-a -mbig-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V82A-THUMB %s
+// CHECK-BE-V82A-THUMB: "-cc1"{{.*}} "-triple" "thumbebv8.2a-{{.*}}" "-target-cpu" "generic"
+
+// RUN: %clang -target armv8a -march=armv8.2-a+fp16 -### -c %s 2>&1 | FileCheck --check-prefix CHECK-V82A-FP16 %s
+// CHECK-V82A-FP16: "-cc1"{{.*}} "-triple" "armv8.2{{.*}}" "-target-cpu" "generic" {{.*}}"-target-feature" "+fullfp16"
+
+// Once we have CPUs with optional v8.2-A FP16, we will need a way to turn it
+// on and off. Cortex-A53 is a placeholder for now.
+// RUN: %clang -target armv8a -mcpu=cortex-a53+fp16 -### -c %s 2>&1 | FileCheck --check-prefix CHECK-CORTEX-A53-FP16 %s
+// RUN: %clang -target armv8a -mcpu=cortex-a53+nofp16 -### -c %s 2>&1 | FileCheck --check-prefix CHECK-CORTEX-A53-NOFP16 %s
+// CHECK-CORTEX-A53-FP16: "-cc1" {{.*}}"-target-cpu" "cortex-a53" {{.*}}"-target-feature" "+fullfp16"
+// CHECK-CORTEX-A53-NOFP16: "-cc1" {{.*}}"-target-cpu" "cortex-a53" {{.*}}"-target-feature" "-fullfp16"
+
+// RUN: %clang -target armv8m.base %s -### -c 2>&1 | FileCheck %s --check-prefix=V8M_BASELINE
+// RUN: %clang -target arm -march=armv8-m.base %s -### -c 2>&1 | FileCheck %s --check-prefix=V8M_BASELINE
+// RUN: %clang -target arm -march=armv8m.base %s -### -c 2>&1 | FileCheck %s --check-prefix=V8M_BASELINE
+// RUN: %clang -target armv8m.base -mbig-endian %s -### -c 2>&1 | FileCheck %s --check-prefix=EBV8M_BASELINE
+// RUN: %clang -target arm -march=armv8-m.base -mbig-endian %s -### -c 2>&1 | FileCheck %s --check-prefix=EBV8M_BASELINE
+// RUN: %clang -target arm -march=armv8m.base -mbig-endian %s -### -c 2>&1 | FileCheck %s --check-prefix=EBV8M_BASELINE
+// V8M_BASELINE: "-cc1"{{.*}} "-triple" "thumbv8m.base-{{.*}} "-target-cpu" "generic"
+// EBV8M_BASELINE: "-cc1"{{.*}} "-triple" "thumbebv8m.base-{{.*}} "-target-cpu" "generic"
+
+// RUN: %clang -target armv8m.main %s -### -c 2>&1 | FileCheck %s --check-prefix=V8M_MAINLINE
+// RUN: %clang -target arm -march=armv8-m.main %s -### -c 2>&1 | FileCheck %s --check-prefix=V8M_MAINLINE
+// RUN: %clang -target arm -march=armv8m.main %s -### -c 2>&1 | FileCheck %s --check-prefix=V8M_MAINLINE
+// RUN: %clang -target armv8m.main -mbig-endian %s -### -c 2>&1 | FileCheck %s --check-prefix=EBV8M_MAINLINE
+// RUN: %clang -target arm -march=armv8-m.main -mbig-endian %s -### -c 2>&1 | FileCheck %s --check-prefix=EBV8M_MAINLINE
+// RUN: %clang -target arm -march=armv8m.main -mbig-endian %s -### -c 2>&1 | FileCheck %s --check-prefix=EBV8M_MAINLINE
+// V8M_MAINLINE: "-cc1"{{.*}} "-triple" "thumbv8m.main-{{.*}} "-target-cpu" "generic"
+// EBV8M_MAINLINE: "-cc1"{{.*}} "-triple" "thumbebv8m.main-{{.*}} "-target-cpu" "generic"
 
 // ================== Check that a bogus architecture gives an error
 // RUN: %clang -target arm -march=armbogusv6 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BOGUS %s
@@ -358,90 +419,123 @@
 // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-r4f -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV7R %s
 // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-r5 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV7R %s
 // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-r7 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV7R %s
+// RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-r8 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV7R %s
 // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-r4 -mlittle-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV7R %s
 // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-r4f -mlittle-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV7R %s
 // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-r5 -mlittle-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV7R %s
 // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-r7 -mlittle-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV7R %s
+// RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-r8 -mlittle-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV7R %s
 // CHECK-CPUV7R: "-cc1"{{.*}} "-triple" "armv7r-{{.*}}
 
 // RUN: %clang -target armeb-linux-gnueabi -mcpu=cortex-r4 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV7R %s
 // RUN: %clang -target armeb-linux-gnueabi -mcpu=cortex-r4f -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV7R %s
 // RUN: %clang -target armeb-linux-gnueabi -mcpu=cortex-r5 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV7R %s
 // RUN: %clang -target armeb-linux-gnueabi -mcpu=cortex-r7 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV7R %s
+// RUN: %clang -target armeb-linux-gnueabi -mcpu=cortex-r8 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV7R %s
 // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-r4 -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV7R %s
 // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-r4f -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV7R %s
 // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-r5 -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV7R %s
 // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-r7 -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV7R %s
+// RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-r8 -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV7R %s
 // CHECK-BE-CPUV7R: "-cc1"{{.*}} "-triple" "armebv7r-{{.*}}
 
 // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-r4 -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV7R-THUMB %s
 // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-r4f -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV7R-THUMB %s
 // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-r5 -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV7R-THUMB %s
 // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-r7 -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV7R-THUMB %s
+// RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-r8 -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV7R-THUMB %s
 // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-r4 -mlittle-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV7R-THUMB %s
 // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-r4f -mlittle-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV7R-THUMB %s
 // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-r5 -mlittle-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV7R-THUMB %s
 // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-r7 -mlittle-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV7R-THUMB %s
+// RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-r8 -mlittle-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV7R-THUMB %s
 // CHECK-CPUV7R-THUMB: "-cc1"{{.*}} "-triple" "thumbv7r-{{.*}}
 
 // RUN: %clang -target armeb-linux-gnueabi -mcpu=cortex-r4 -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV7R-THUMB %s
 // RUN: %clang -target armeb-linux-gnueabi -mcpu=cortex-r4f -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV7R-THUMB %s
 // RUN: %clang -target armeb-linux-gnueabi -mcpu=cortex-r5 -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV7R-THUMB %s
 // RUN: %clang -target armeb-linux-gnueabi -mcpu=cortex-r7 -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV7R-THUMB %s
+// RUN: %clang -target armeb-linux-gnueabi -mcpu=cortex-r8 -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV7R-THUMB %s
 // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-r4 -mbig-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV7R-THUMB %s
 // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-r4f -mbig-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV7R-THUMB %s
 // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-r5 -mbig-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV7R-THUMB %s
 // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-r7 -mbig-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV7R-THUMB %s
+// RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-r8 -mbig-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV7R-THUMB %s
 // CHECK-BE-CPUV7R-THUMB: "-cc1"{{.*}} "-triple" "thumbebv7r-{{.*}}
 
+// RUN: %clang -target arm -mcpu=cortex-a32 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A %s
 // RUN: %clang -target arm -mcpu=cortex-a35 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A %s
 // RUN: %clang -target arm -mcpu=cortex-a53 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A %s
 // RUN: %clang -target arm -mcpu=cortex-a57 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A %s
 // RUN: %clang -target arm -mcpu=cortex-a72 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A %s
+// RUN: %clang -target arm -mcpu=cortex-a73 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A %s
 // RUN: %clang -target arm -mcpu=exynos-m1 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A %s
+// RUN: %clang -target arm -mcpu=cortex-a32 -mlittle-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A %s
 // RUN: %clang -target arm -mcpu=cortex-a35 -mlittle-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A %s
 // RUN: %clang -target arm -mcpu=cortex-a53 -mlittle-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A %s
 // RUN: %clang -target arm -mcpu=cortex-a57 -mlittle-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A %s
 // RUN: %clang -target arm -mcpu=cortex-a72 -mlittle-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A %s
+// RUN: %clang -target arm -mcpu=cortex-a73 -mlittle-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A %s
 // RUN: %clang -target arm -mcpu=exynos-m1 -mlittle-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A %s
 // CHECK-CPUV8A: "-cc1"{{.*}} "-triple" "armv8-{{.*}}
 
+// RUN: %clang -target armeb -mcpu=cortex-a32 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A %s
 // RUN: %clang -target armeb -mcpu=cortex-a35 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A %s
 // RUN: %clang -target armeb -mcpu=cortex-a53 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A %s
 // RUN: %clang -target armeb -mcpu=cortex-a57 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A %s
 // RUN: %clang -target armeb -mcpu=cortex-a72 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A %s
+// RUN: %clang -target armeb -mcpu=cortex-a73 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A %s
 // RUN: %clang -target armeb -mcpu=exynos-m1 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A %s
+// RUN: %clang -target arm -mcpu=cortex-a32 -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A %s
 // RUN: %clang -target arm -mcpu=cortex-a35 -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A %s
 // RUN: %clang -target arm -mcpu=cortex-a53 -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A %s
 // RUN: %clang -target arm -mcpu=cortex-a57 -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A %s
 // RUN: %clang -target arm -mcpu=cortex-a72 -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A %s
+// RUN: %clang -target arm -mcpu=cortex-a73 -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A %s
 // RUN: %clang -target arm -mcpu=exynos-m1 -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A %s
 // CHECK-BE-CPUV8A: "-cc1"{{.*}} "-triple" "armebv8-{{.*}}
 
+// RUN: %clang -target arm -mcpu=cortex-a32 -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A-THUMB %s
 // RUN: %clang -target arm -mcpu=cortex-a35 -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A-THUMB %s
 // RUN: %clang -target arm -mcpu=cortex-a53 -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A-THUMB %s
 // RUN: %clang -target arm -mcpu=cortex-a57 -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A-THUMB %s
 // RUN: %clang -target arm -mcpu=cortex-a72 -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A-THUMB %s
+// RUN: %clang -target arm -mcpu=cortex-a73 -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A-THUMB %s
 // RUN: %clang -target arm -mcpu=exynos-m1 -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A-THUMB %s
+// RUN: %clang -target arm -mcpu=cortex-a32 -mlittle-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A-THUMB %s
 // RUN: %clang -target arm -mcpu=cortex-a35 -mlittle-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A-THUMB %s
 // RUN: %clang -target arm -mcpu=cortex-a53 -mlittle-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A-THUMB %s
 // RUN: %clang -target arm -mcpu=cortex-a57 -mlittle-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A-THUMB %s
 // RUN: %clang -target arm -mcpu=cortex-a72 -mlittle-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A-THUMB %s
+// RUN: %clang -target arm -mcpu=cortex-a73 -mlittle-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A-THUMB %s
 // RUN: %clang -target arm -mcpu=exynos-m1 -mlittle-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CPUV8A-THUMB %s
 // CHECK-CPUV8A-THUMB: "-cc1"{{.*}} "-triple" "thumbv8-{{.*}}
 
+// RUN: %clang -target armeb -mcpu=cortex-a32 -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A-THUMB %s
 // RUN: %clang -target armeb -mcpu=cortex-a35 -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A-THUMB %s
 // RUN: %clang -target armeb -mcpu=cortex-a53 -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A-THUMB %s
 // RUN: %clang -target armeb -mcpu=cortex-a57 -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A-THUMB %s
 // RUN: %clang -target armeb -mcpu=cortex-a72 -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A-THUMB %s
+// RUN: %clang -target armeb -mcpu=cortex-a73 -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A-THUMB %s
 // RUN: %clang -target armeb -mcpu=exynos-m1 -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A-THUMB %s
+// RUN: %clang -target arm -mcpu=cortex-a32 -mbig-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A-THUMB %s
 // RUN: %clang -target arm -mcpu=cortex-a35 -mbig-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A-THUMB %s
 // RUN: %clang -target arm -mcpu=cortex-a53 -mbig-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A-THUMB %s
 // RUN: %clang -target arm -mcpu=cortex-a57 -mbig-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A-THUMB %s
 // RUN: %clang -target arm -mcpu=cortex-a72 -mbig-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A-THUMB %s
+// RUN: %clang -target arm -mcpu=cortex-a73 -mbig-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A-THUMB %s
 // RUN: %clang -target arm -mcpu=exynos-m1 -mbig-endian -mthumb -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-CPUV8A-THUMB %s
 // CHECK-BE-CPUV8A-THUMB: "-cc1"{{.*}} "-triple" "thumbebv8-{{.*}}
 
+// RUN: %clang -target armv8a-arm-none-eabi -mcpu=cortex-a73 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CORTEX-A73 %s
+// RUN: %clang -target armv8a-arm-none-eabi -mcpu=cortex-a73 -mfpu=crypto-neon-fp-armv8 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CORTEX-A73-MFPU %s
+// RUN: %clang -target armv8a-arm-none-eabi -mcpu=cortex-a73 -mfloat-abi=soft -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CORTEX-A73-SOFT %s
+// CHECK-CORTEX-A73: "-cc1"{{.*}} "-triple" "armv8-{{.*}} "-target-cpu" "cortex-a73"
+// CHECK-CORTEX-A73-MFPU: "-cc1"{{.*}} "-target-feature" "+fp-armv8"
+// CHECK-CORTEX-A73-MFPU: "-target-feature" "+crypto"
+// CHECK-CORTEX-A73-SOFT: "-target-feature" "+soft-float"
+// CHECK-CORTEX-A73-SOFT: "-target-feature" "+soft-float-abi"
+
 // ================== Check whether -mcpu accepts mixed-case values.
 // RUN: %clang -target arm-linux-gnueabi -mcpu=Cortex-a5 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CASE-INSENSITIVE-CPUV7A %s
 // RUN: %clang -target arm-linux-gnueabi -mcpu=cortex-A7 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CASE-INSENSITIVE-CPUV7A %s
diff --git a/test/Driver/arm-features.c b/test/Driver/arm-features.c
index eb197da935a03..74cedf3bd83b7 100644
--- a/test/Driver/arm-features.c
+++ b/test/Driver/arm-features.c
@@ -4,6 +4,9 @@
 // RUN: %clang -target arm-none-none-eabi -mcpu=generic+crypto -march=armv8a -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CRYPTO %s
 // RUN: %clang -target arm-none-none-eabi -mcpu=generic -march=armv8a+crypto -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-CRYPTO %s
 // CHECK-CRYPTO: "-cc1"{{.*}} "-triple" "armv8-{{.*}} "-target-cpu" "generic"{{.*}} "-target-feature" "+crypto"
+// RUN: %clang -target arm-none-none-eabi -mcpu=generic+dsp -march=armv8m.main -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-DSP %s
+// RUN: %clang -target arm-none-none-eabi -mcpu=generic -march=armv8m.main+dsp -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-DSP %s
+// CHECK-DSP: "-cc1"{{.*}} "-triple" "thumbv8m.main-{{.*}} "-target-cpu" "generic"{{.*}} "-target-feature" "+dsp"
 
 // RUN: %clang -target arm-none-none-eabi -mcpu=generic+nocrc -march=armv8a -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-NOCRC %s
 // RUN: %clang -target arm-none-none-eabi -mcpu=generic -march=armv8a+nocrc -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-NOCRC %s
@@ -11,3 +14,6 @@
 // RUN: %clang -target arm-none-none-eabi -mcpu=generic+nocrypto -march=armv8a -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-NOCRYPTO %s
 // RUN: %clang -target arm-none-none-eabi -mcpu=generic -march=armv8a+nocrypto -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-NOCRYPTO %s
 // CHECK-NOCRYPTO: "-cc1"{{.*}} "-triple" "armv8-{{.*}} "-target-cpu" "generic"{{.*}} "-target-feature" "-crypto"
+// RUN: %clang -target arm-none-none-eabi -mcpu=generic+nodsp -march=armv8m.main -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-NODSP %s
+// RUN: %clang -target arm-none-none-eabi -mcpu=generic -march=armv8m.main+nodsp -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-NODSP %s
+// CHECK-NODSP: "-cc1"{{.*}} "-triple" "thumbv8m.main-{{.*}} "-target-cpu" "generic"{{.*}} "-target-feature" "-dsp"
diff --git a/test/Driver/arm-mfpu.c b/test/Driver/arm-mfpu.c
index 93fb0a8eb4360..2e1c00dbcced2 100644
--- a/test/Driver/arm-mfpu.c
+++ b/test/Driver/arm-mfpu.c
@@ -207,6 +207,8 @@
 
 // RUN: %clang -target arm-linux-gnueabihf %s -### 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-HF %s
+// RUN: %clang -target arm-linux-musleabihf %s -### 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-HF %s
 // CHECK-HF: "-target-cpu" "arm1176jzf-s"
 
 // RUN: %clang -target armv7-apple-darwin -x assembler %s -### -c 2>&1 \
diff --git a/test/Driver/arm-ras.c b/test/Driver/arm-ras.c
new file mode 100644
index 0000000000000..6d2168c160bf6
--- /dev/null
+++ b/test/Driver/arm-ras.c
@@ -0,0 +1,7 @@
+// RUN: %clang -target arm-none-none-eabi -march=armv8a+ras -### -c %s 2>&1 | FileCheck --check-prefix=CHECK-RAS %s
+// RUN: %clang -target arm-none-none-eabi -mcpu=generic+ras -### -c %s 2>&1 | FileCheck --check-prefix=CHECK-RAS %s
+// CHECK-RAS: "-target-feature" "+ras"
+
+// RUN: %clang -target arm-none-none-eabi -march=armv8a+noras -### -c %s 2>&1 | FileCheck --check-prefix=CHECK-NORAS %s
+// RUN: %clang -target arm-none-none-eabi -mcpu=generic+noras -### -c %s 2>&1 | FileCheck --check-prefix=CHECK-NORAS %s
+// CHECK-NORAS: "-target-feature" "-ras"
diff --git a/test/Driver/as-dwarf-cie.s b/test/Driver/as-dwarf-cie.s
new file mode 100644
index 0000000000000..73d987afd4287
--- /dev/null
+++ b/test/Driver/as-dwarf-cie.s
@@ -0,0 +1,37 @@
+# REQUIRES: x86-registered-target
+# Test that there is a sane default CIE version.
+# RUN: %clang -cc1as -triple i386-apple-darwin -filetype obj %s -o %t
+# RUN: llvm-objdump -dwarf=frames %t | FileCheck %s
+# CHECK: .debug_frame contents:
+# CHECK: CIE
+# CHECK: Version:               1
+	.section	__TEXT,__text,regular,pure_instructions
+	.globl	_f
+	.p2align	4, 0x90
+_f:                                     ## @f
+Lfunc_begin0:
+	.file	1 "test.c"
+	.loc	1 1 0                   ## test.c:1:0
+	.cfi_startproc
+## BB#0:                                ## %entry
+	pushl	%ebp
+Ltmp0:
+	.cfi_def_cfa_offset 8
+Ltmp1:
+	.cfi_offset %ebp, -8
+	movl	%esp, %ebp
+Ltmp2:
+	.cfi_def_cfa_register %ebp
+Ltmp3:
+	.loc	1 1 11 prologue_end     ## test.c:1:11
+	popl	%ebp
+	retl
+Ltmp4:
+Lfunc_end0:
+	.cfi_endproc
+	.cfi_sections .debug_frame
+
+.subsections_via_symbols
+	.section	__DWARF,__debug_line,regular,debug
+Lsection_line:
+Lline_table_start0:
diff --git a/test/Driver/at_file.c b/test/Driver/at_file.c
index 0541ece88b1fd..56cc5c684c434 100644
--- a/test/Driver/at_file.c
+++ b/test/Driver/at_file.c
@@ -14,9 +14,9 @@
 // CHECK-NEXT: foo9'bar9'zed9
 // CHECK-NEXT: foo10"bar10"zed10
 // CHECK: bar
-// CHECK: zed12
+// CHECK: zed1
 // CHECK: one\two
-// CHECK: c:\foo\bar.c
+// CHECK: c:foobar.c
 
 foo1
 foo2
diff --git a/test/Driver/at_file.c.args b/test/Driver/at_file.c.args
index 8739000e3c577..ccedd8295c7c7 100644
--- a/test/Driver/at_file.c.args
+++ b/test/Driver/at_file.c.args
@@ -8,6 +8,7 @@
 -Dfoo9=foo9\'bar9\'zed9
 -Dfoo10=foo10\"bar10\"zed10
 -D foo11
--Dfoo12=zed12\
+-Dfoo12=zed1\
+2
 -Dfoo13='one\\two'
 -Dfoo14='c:\foo\bar.c'
diff --git a/test/Driver/at_file_missing.c b/test/Driver/at_file_missing.c
index 0189a8bb7571b..23645a5d3f93a 100644
--- a/test/Driver/at_file_missing.c
+++ b/test/Driver/at_file_missing.c
@@ -1,7 +1,7 @@
 // Make sure that arguments that begin with @ are left as is in the argument
 // stream, and also that @file arguments continue to be processed.
 
-// RUN: echo "%s -D FOO" > %t.args
-// RUN: %clang -rpath @executable_path/../lib @%t.args -### 2>&1 | FileCheck %s
+// RUN: echo "-D FOO" > %t.args
+// RUN: %clang -rpath @executable_path/../lib @%t.args %s -### 2>&1 | FileCheck %s
 // CHECK: "-D" "FOO"
 // CHECK: "-rpath" "@executable_path/../lib"
diff --git a/test/Driver/at_file_win.c b/test/Driver/at_file_win.c
new file mode 100644
index 0000000000000..9a8ede548a214
--- /dev/null
+++ b/test/Driver/at_file_win.c
@@ -0,0 +1,34 @@
+// RUN: %clang --rsp-quoting=windows -E %s @%s.args -o %t.log
+// RUN: FileCheck --input-file=%t.log %s
+
+// CHECK: bar1
+// CHECK-NEXT: bar2 zed2
+// CHECK-NEXT: bar3 zed3
+// CHECK-NEXT: bar4 zed4
+// CHECK-NEXT: bar5 zed5
+// CHECK-NEXT: 'bar6 zed6'
+// CHECK-NEXT: 'bar7 zed7'
+// CHECK-NEXT: foo8bar8zed8
+// CHECK-NEXT: foo9\'bar9\'zed9
+// CHECK-NEXT: foo10"bar10"zed10
+// CHECK: bar
+// CHECK: zed12
+// CHECK: one\two
+// CHECK: c:\foo\bar.c
+
+foo1
+foo2
+foo3
+foo4
+foo5
+foo6
+foo7
+foo8
+foo9
+foo10
+#ifdef foo11
+bar
+#endif
+foo12
+foo13
+foo14
diff --git a/test/Driver/at_file_win.c.args b/test/Driver/at_file_win.c.args
new file mode 100644
index 0000000000000..df109e4d5fd53
--- /dev/null
+++ b/test/Driver/at_file_win.c.args
@@ -0,0 +1,13 @@
+-Dfoo1=bar1 -Dfoo2="bar2 zed2"
+-Dfoo3="bar3 zed3"
+"-Dfoo4=bar4 zed4"
+"-Dfoo5=bar5 zed5"
+-Dfoo6="'bar6 zed6'"
+-Dfoo7='"bar7 zed7"'
+-Dfoo8=foo8"bar8"zed8
+-Dfoo9=foo9\'bar9\'zed9
+-Dfoo10=foo10\"bar10\"zed10
+-D foo11
+-Dfoo12=zed12
+-Dfoo13=one\two
+-Dfoo14=c:\foo\bar.c
diff --git a/test/Driver/bitrig.c b/test/Driver/bitrig.c
index 934cb02f22403..a20a95aecdcca 100644
--- a/test/Driver/bitrig.c
+++ b/test/Driver/bitrig.c
@@ -3,7 +3,7 @@
 // CHECK-LD-C: clang{{.*}}" "-cc1" "-triple" "amd64-pc-bitrig"
 // CHECK-LD-C: ld{{.*}}" {{.*}} "-lc" "-lclang_rt.amd64"
 
-// RUN: %clangxx -no-canonical-prefixes -target amd64-pc-bitrig %s -### 2>&1 \
+// RUN: %clangxx -stdlib=platform -no-canonical-prefixes -target amd64-pc-bitrig %s -### 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-LD-CXX-STDLIB %s
 // CHECK-LD-CXX-STDLIB: clang{{.*}}" "-cc1" "-triple" "amd64-pc-bitrig"
 // CHECK-LD-CXX-STDLIB: ld{{.*}}" {{.*}} "-lc++" "-lc++abi" "-lpthread" "-lm" "-lc" "-lclang_rt.amd64"
diff --git a/test/Driver/cl-eh.cpp b/test/Driver/cl-eh.cpp
index 1745616ea9b1a..c54544b385c1b 100644
--- a/test/Driver/cl-eh.cpp
+++ b/test/Driver/cl-eh.cpp
@@ -21,6 +21,11 @@
 // EHs_EHa: "-fcxx-exceptions"
 // EHs_EHa: "-fexceptions"
 
+// RUN: %clang_cl /c /EHa /EHc -### -- %s 2>&1 | FileCheck -check-prefix=EHa_EHc %s
+// EHa_EHc: "-fcxx-exceptions"
+// EHa_EHc: "-fexceptions"
+// EHa_EHc-NOT: "-fexternc-nounwind"
+
 // RUN: %clang_cl /c /EHinvalid -### -- %s 2>&1 | FileCheck -check-prefix=EHinvalid %s
 // EHinvalid: error: invalid value 'invalid' in '/EH'
 // EHinvalid-NOT: error:
diff --git a/test/Driver/cl-fallback.c b/test/Driver/cl-fallback.c
index e5ebde5c01821..e73f7c03c2630 100644
--- a/test/Driver/cl-fallback.c
+++ b/test/Driver/cl-fallback.c
@@ -1,8 +1,9 @@
 // Note: %s must be preceded by --, otherwise it may be interpreted as a
 // command-line option, e.g. on Mac where %s is commonly under /Users.
 
-// RUN: %clang_cl --target=i686-pc-win32 /fallback /Dfoo=bar /Ubaz /Ifoo /O0 /Ox /GR /GR- /Gy /Gy- \
+// RUN: %clang_cl --target=i686-pc-win32 /fallback /Dfoo=bar /Ubaz /Ifoo /O0 /Ox /GR /GR- /GS /GS- /Gy /Gy- \
 // RUN:   /Gw /Gw- /LD /LDd /EHs /EHs- /Zl /MD /MDd /MTd /MT /FImyheader.h /Zi \
+// RUN:   -garbage -moregarbage \
 // RUN:   -### -- %s 2>&1 \
 // RUN:   | FileCheck %s
 // CHECK: "-fdiagnostics-format" "msvc-fallback"
@@ -21,6 +22,7 @@
 // CHECK: "/Oy"
 // CHECK: "/GF"
 // CHECK: "/GR-"
+// CHECK: "/GS-"
 // CHECK: "/Gy-"
 // CHECK: "/Gw-"
 // CHECK: "/Z7"
@@ -31,6 +33,8 @@
 // CHECK: "/EHs-"
 // CHECK: "/Zl"
 // CHECK: "/MT"
+// CHECK: "-garbage"
+// CHECK: "-moregarbage"
 // CHECK: "/Tc" "{{.*cl-fallback.c}}"
 // CHECK: "/Fo{{.*cl-fallback.*.obj}}"
 
@@ -38,6 +42,10 @@
 // GR: cl.exe
 // GR: "/GR-"
 
+// RUN: %clang_cl /fallback /GS- -### -- %s 2>&1 | FileCheck -check-prefix=GS %s
+// GS: cl.exe
+// GS: "/GS-"
+
 // RUN: %clang_cl /fallback /Od -### -- %s 2>&1 | FileCheck -check-prefix=O0 %s
 // O0: cl.exe
 // O0: "/Od"
diff --git a/test/Driver/cl-link.c b/test/Driver/cl-link.c
index 9813c51d80af6..026c43383cb90 100644
--- a/test/Driver/cl-link.c
+++ b/test/Driver/cl-link.c
@@ -3,6 +3,7 @@
 // under /Users.
 
 // RUN: %clang_cl /Tc%s -### /link foo bar baz 2>&1 | FileCheck --check-prefix=LINK %s
+// RUN: %clang_cl /Tc%s -### /linkfoo bar baz 2>&1 | FileCheck --check-prefix=LINK %s
 // LINK: link.exe
 // LINK: "foo"
 // LINK: "bar"
@@ -42,3 +43,11 @@
 // RUN: %clang_cl /Zi /Tc%s -### 2>&1 | FileCheck --check-prefix=DEBUG %s
 // DEBUG: link.exe
 // DEBUG: "-debug"
+
+// PR27234
+// RUN: %clang_cl /Tc%s nonexistent.obj -### /link /libpath:somepath 2>&1 | FileCheck --check-prefix=NONEXISTENT %s
+// RUN: %clang_cl /Tc%s nonexistent.lib -### /link /libpath:somepath 2>&1 | FileCheck --check-prefix=NONEXISTENT %s
+// NONEXISTENT-NOT: no such file
+// NONEXISTENT: link.exe
+// NONEXISTENT: "/libpath:somepath"
+// NONEXISTENT: nonexistent
diff --git a/test/Driver/cl-options.c b/test/Driver/cl-options.c
index c5985a9c2b247..223d37e5f440c 100644
--- a/test/Driver/cl-options.c
+++ b/test/Driver/cl-options.c
@@ -59,6 +59,16 @@
 // RUN: %clang_cl /GR- -### -- %s 2>&1 | FileCheck -check-prefix=GR_ %s
 // GR_: -fno-rtti
 
+// Security Buffer Check is on by default.
+// RUN: %clang_cl -### -- %s 2>&1 | FileCheck -check-prefix=GS-default %s
+// GS-default: "-stack-protector" "2"
+
+// RUN: %clang_cl /GS -### -- %s 2>&1 | FileCheck -check-prefix=GS %s
+// GS: "-stack-protector" "2"
+
+// RUN: %clang_cl /GS- -### -- %s 2>&1 | FileCheck -check-prefix=GS_ %s
+// GS_-NOT: -stack-protector
+
 // RUN: %clang_cl /Gy -### -- %s 2>&1 | FileCheck -check-prefix=Gy %s
 // Gy: -ffunction-sections
 
@@ -82,6 +92,12 @@
 // RUN: %clang_cl /I myincludedir -### -- %s 2>&1 | FileCheck -check-prefix=SLASH_I %s
 // SLASH_I: "-I" "myincludedir"
 
+// RUN: %clang_cl /imsvcmyincludedir -### -- %s 2>&1 | FileCheck -check-prefix=SLASH_imsvc %s
+// RUN: %clang_cl /imsvc myincludedir -### -- %s 2>&1 | FileCheck -check-prefix=SLASH_imsvc %s
+// Clang's resource header directory should be first:
+// SLASH_imsvc: "-internal-isystem" "{{[^"]*}}lib{{(64)?/|\\\\}}clang{{[^"]*}}include"
+// SLASH_imsvc: "-internal-isystem" "myincludedir"
+
 // RUN: %clang_cl /J -### -- %s 2>&1 | FileCheck -check-prefix=J %s
 // J: -fno-signed-char
 
@@ -91,6 +107,16 @@
 // RUN: %clang_cl /Ob0 -### -- %s 2>&1 | FileCheck -check-prefix=Ob0 %s
 // Ob0: -fno-inline
 
+// RUN: %clang_cl /Ob2 -### -- %s 2>&1 | FileCheck -check-prefix=Ob2 %s
+// RUN: %clang_cl /Odb2 -### -- %s 2>&1 | FileCheck -check-prefix=Ob2 %s
+// RUN: %clang_cl /O2 /Ob2 -### -- %s 2>&1 | FileCheck -check-prefix=Ob2 %s
+// Ob2-NOT: warning: argument unused during compilation: '/O2'
+// Ob2: -finline-functions
+
+// RUN: %clang_cl /Ob1 -### -- %s 2>&1 | FileCheck -check-prefix=Ob1 %s
+// RUN: %clang_cl /Odb1 -### -- %s 2>&1 | FileCheck -check-prefix=Ob1 %s
+// Ob1: -finline-hint-functions
+
 // RUN: %clang_cl /Od -### -- %s 2>&1 | FileCheck -check-prefix=Od %s
 // Od: -O0
 
@@ -123,9 +149,13 @@
 // PR24003: -momit-leaf-frame-pointer
 // PR24003: -Os
 
-// RUN: %clang_cl /Zs /Oy -- %s 2>&1
+// RUN: %clang_cl --target=i686-pc-win32 -Werror /Oy- /O2 -### -- %s 2>&1 | FileCheck -check-prefix=Oy_2 %s
+// Oy_2: -momit-leaf-frame-pointer
+// Oy_2: -O2
+
+// RUN: %clang_cl /Zs -Werror /Oy -- %s 2>&1
 
-// RUN: %clang_cl --target=i686-pc-win32 /Oy- -### -- %s 2>&1 | FileCheck -check-prefix=Oy_ %s
+// RUN: %clang_cl --target=i686-pc-win32 -Werror /Oy- -### -- %s 2>&1 | FileCheck -check-prefix=Oy_ %s
 // Oy_: -mdisable-fp-elim
 
 // RUN: %clang_cl /Qvec -### -- %s 2>&1 | FileCheck -check-prefix=Qvec %s
@@ -207,6 +237,15 @@
 // RUN: %clang_cl /FI asdf.h -### -- %s 2>&1 | FileCheck -check-prefix=FI_ %s
 // FI_: "-include" "asdf.h"
 
+// RUN: %clang_cl /TP /c -### -- %s 2>&1 | FileCheck -check-prefix=NO-GX %s
+// NO-GX-NOT: "-fcxx-exceptions" "-fexceptions"
+
+// RUN: %clang_cl /TP /c /GX -### -- %s 2>&1 | FileCheck -check-prefix=GX %s
+// GX: "-fcxx-exceptions" "-fexceptions"
+
+// RUN: %clang_cl /TP /c /GX /GX- -### -- %s 2>&1 | FileCheck -check-prefix=GX_ %s
+// GX_-NOT: "-fcxx-exceptions" "-fexceptions"
+
 // We forward any unrecognized -W diagnostic options to cc1.
 // RUN: %clang_cl -Wunused-pragmas -### -- %s 2>&1 | FileCheck -check-prefix=WJoined %s
 // WJoined: "-cc1"
@@ -234,8 +273,10 @@
 // RUN:    /bigobj \
 // RUN:    /cgthreads4 \
 // RUN:    /cgthreads8 \
+// RUN:    /d2FastFail \
 // RUN:    /d2Zi+ \
 // RUN:    /errorReport:foo \
+// RUN:    /FC \
 // RUN:    /Fdfoo \
 // RUN:    /FS \
 // RUN:    /Gd \
@@ -243,8 +284,6 @@
 // RUN:    /GS- \
 // RUN:    /kernel- \
 // RUN:    /nologo \
-// RUN:    /Ob1 \
-// RUN:    /Ob2 \
 // RUN:    /openmp- \
 // RUN:    /RTC1 \
 // RUN:    /sdl \
@@ -366,7 +405,7 @@
 // RTTI-NOT: "-fno-rtti"
 
 // thread safe statics are off for versions < 19.
-// RUN: %clang_cl /c -### -- %s 2>&1 | FileCheck -check-prefix=NoThreadSafeStatics %s
+// RUN: %clang_cl /c -### -fms-compatibility-version=18 -- %s 2>&1 | FileCheck -check-prefix=NoThreadSafeStatics %s
 // RUN: %clang_cl /Zc:threadSafeInit /Zc:threadSafeInit- /c -### -- %s 2>&1 | FileCheck -check-prefix=NoThreadSafeStatics %s
 // NoThreadSafeStatics: "-fno-threadsafe-statics"
 
@@ -375,11 +414,15 @@
 
 // RUN: %clang_cl /Zi /c -### -- %s 2>&1 | FileCheck -check-prefix=Zi %s
 // Zi: "-gcodeview"
-// Zi: "-debug-info-kind=line-tables-only"
+// Zi: "-debug-info-kind=limited"
 
 // RUN: %clang_cl /Z7 /c -### -- %s 2>&1 | FileCheck -check-prefix=Z7 %s
 // Z7: "-gcodeview"
-// Z7: "-debug-info-kind=line-tables-only"
+// Z7: "-debug-info-kind=limited"
+
+// RUN: %clang_cl /Zd /c -### -- %s 2>&1 | FileCheck -check-prefix=Z7GMLT %s
+// Z7GMLT: "-gcodeview"
+// Z7GMLT: "-debug-info-kind=line-tables-only"
 
 // RUN: %clang_cl /c -### -- %s 2>&1 | FileCheck -check-prefix=BreproDefault %s
 // BreproDefault: "-mincremental-linker-compatible"
@@ -409,6 +452,12 @@
 // RUN: %clang_cl -fmsc-version=1900 -TP -### -- %s 2>&1 | FileCheck -check-prefix=CXX14 %s
 // CXX14: -std=c++14
 
+// RUN: %clang_cl -fmsc-version=1900 -TP -std:c++14 -### -- %s 2>&1 | FileCheck -check-prefix=STDCXX14 %s
+// STDCXX14: -std=c++14
+
+// RUN: %clang_cl -fmsc-version=1900 -TP -std:c++latest -### -- %s 2>&1 | FileCheck -check-prefix=STDCXXLATEST %s
+// STDCXXLATEST: -std=c++1z
+
 // RUN: env CL="/Gy" %clang_cl -### -- %s 2>&1 | FileCheck -check-prefix=ENV-CL %s
 // ENV-CL: "-ffunction-sections"
 
@@ -434,6 +483,7 @@
 // RUN:     -fms-extensions \
 // RUN:     -fno-ms-extensions \
 // RUN:     -mllvm -disable-llvm-optzns \
+// RUN:     -resource-dir \
 // RUN:     -Wunused-variable \
 // RUN:     -fmacro-backtrace-limit=0 \
 // RUN:     -Werror /Zs -- %s 2>&1
diff --git a/test/Driver/cl-pch-errorhandling.cpp b/test/Driver/cl-pch-errorhandling.cpp
new file mode 100644
index 0000000000000..879c8cf8e7dad
--- /dev/null
+++ b/test/Driver/cl-pch-errorhandling.cpp
@@ -0,0 +1,15 @@
+// Note: %s and %S must be preceded by --, otherwise it may be interpreted as a
+// command-line option, e.g. on Mac where %s is commonly under /Users.
+
+// /Yc but pch generation fails => main file not compiled
+// This is a separate file since executing this failure path requires
+// code generation, which makes this test require an x86 backend.
+// REQUIRES: x86-registered-target
+
+// RUN: not %clang_cl -Werror /Yc%S/Inputs/pchfile.h /FI%S/Inputs/pchfile.h /Fp%t.pch /c -DERR_HEADER -- %s 2>&1 \
+// RUN:   | FileCheck %s
+
+// CHECK: nope1
+// CHECK-NOT: nope2
+
+#error nope2
diff --git a/test/Driver/cl-pch-search.cpp b/test/Driver/cl-pch-search.cpp
new file mode 100644
index 0000000000000..118cf19efbd54
--- /dev/null
+++ b/test/Driver/cl-pch-search.cpp
@@ -0,0 +1,6 @@
+// Note: %s and %S must be preceded by --, otherwise it may be interpreted as a
+// command-line option, e.g. on Mac where %s is commonly under /Users.
+
+// REQUIRES: x86-registered-target
+// Check that pchfile.h next to to pchfile.cc is found correctly.
+// RUN: %clang_cl -Werror /Ycpchfile.h /FIpchfile.h /c /Fo%t.obj /Fp%t.pch -- %S/Inputs/pchfile.cpp 
diff --git a/test/Driver/cl-pch-showincludes.cpp b/test/Driver/cl-pch-showincludes.cpp
new file mode 100644
index 0000000000000..8115e6b5d380d
--- /dev/null
+++ b/test/Driver/cl-pch-showincludes.cpp
@@ -0,0 +1,50 @@
+// Note: %s and %S must be preceded by --, otherwise it may be interpreted as a
+// command-line option, e.g. on Mac where %s is commonly under /Users.
+
+// Tests interaction of /Yc / /Yu with /showIncludes
+// REQUIRES: x86-registered-target
+
+#include "header3.h"
+
+// When building the pch, header1.h (included by header2.h), header2.h (the pch
+// input itself) and header3.h (included directly, above) should be printed.
+// RUN: %clang_cl -Werror /showIncludes /I%S/Inputs /Ycheader2.h /FIheader2.h /Fp%t.pch /c /Fo%t -- %s \
+// RUN:   | FileCheck --strict-whitespace -check-prefix=CHECK-YC %s
+// CHECK-YC: Note: including file: {{[^ ]*header2.h}}
+// CHECK-YC: Note: including file:  {{[^ ]*header1.h}}
+// CHECK-YC: Note: including file: {{[^ ]*header3.h}}
+
+// When using the pch, only the direct include is printed.
+// RUN: %clang_cl -Werror /showIncludes /I%S/Inputs /Yuheader2.h /FIheader2.h /Fp%t.pch /c /Fo%t -- %s \
+// RUN:   | FileCheck --strict-whitespace -check-prefix=CHECK-YU %s
+// CHECK-YU-NOT: Note: including file: {{.*pch}}
+// CHECK-YU-NOT: Note: including file: {{.*header1.h}}
+// CHECK-YU-NOT: Note: including file: {{.*header2.h}}
+// CHECK-YU: Note: including file: {{[^ ]*header3.h}}
+
+// When not using pch at all, all the /FI files are printed.
+// RUN: %clang_cl -Werror /showIncludes /I%S/Inputs /FIheader2.h /c /Fo%t -- %s \
+// RUN:   | FileCheck --strict-whitespace -check-prefix=CHECK-FI %s
+// CHECK-FI: Note: including file: {{[^ ]*header2.h}}
+// CHECK-FI: Note: including file:  {{[^ ]*header1.h}}
+// CHECK-FI: Note: including file: {{[^ ]*header3.h}}
+
+// Also check that /FI arguments before the /Yc / /Yu flags are printed right.
+
+// /FI flags before the /Yc arg should be printed, /FI flags after it shouldn't.
+// RUN: %clang_cl -Werror /showIncludes /I%S/Inputs /Ycheader2.h /FIheader0.h /FIheader2.h /FIheader4.h /Fp%t.pch /c /Fo%t -- %s \
+// RUN:   | FileCheck --strict-whitespace -check-prefix=CHECK-YCFI %s
+// CHECK-YCFI: Note: including file: {{[^ ]*header0.h}}
+// CHECK-YCFI: Note: including file: {{[^ ]*header2.h}}
+// CHECK-YCFI: Note: including file:  {{[^ ]*header1.h}}
+// CHECK-YCFI: Note: including file: {{[^ ]*header4.h}}
+// CHECK-YCFI: Note: including file: {{[^ ]*header3.h}}
+
+// RUN: %clang_cl -Werror /showIncludes /I%S/Inputs /Yuheader2.h /FIheader0.h /FIheader2.h /FIheader4.h /Fp%t.pch /c /Fo%t -- %s \
+// RUN:   | FileCheck --strict-whitespace -check-prefix=CHECK-YUFI %s
+// CHECK-YUFI-NOT: Note: including file: {{.*pch}}
+// CHECK-YUFI-NOT: Note: including file: {{.*header0.h}}
+// CHECK-YUFI-NOT: Note: including file: {{.*header2.h}}
+// CHECK-YUFI-NOT: Note: including file: {{.*header1.h}}
+// CHECK-YUFI: Note: including file: {{[^ ]*header4.h}}
+// CHECK-YUFI: Note: including file: {{[^ ]*header3.h}}
diff --git a/test/Driver/cl-pch.c b/test/Driver/cl-pch.c
new file mode 100644
index 0000000000000..3372c184bbf32
--- /dev/null
+++ b/test/Driver/cl-pch.c
@@ -0,0 +1,45 @@
+// Note: %s and %S must be preceded by --, otherwise it may be interpreted as a
+// command-line option, e.g. on Mac where %s is commonly under /Users.
+
+// The main test for clang-cl pch handling is cl-pch.cpp.  This file only checks
+// a few things for .c inputs.
+
+// /Yc with a .c file should build a c pch file.
+// RUN: %clang_cl -Werror /Ycpchfile.h /FIpchfile.h /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YC %s
+// CHECK-YC: cc1
+// CHECK-YC: -emit-pch
+// CHECK-YC: -o
+// CHECK-YC: pchfile.pch
+// CHECK-YC: -x
+// CHECK-YC: "c"
+
+// But not if /TP changes the input language to C++.
+// RUN: %clang_cl /TP -Werror /Ycpchfile.h /FIpchfile.h /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YCTP %s
+// CHECK-YCTP: cc1
+// CHECK-YCTP: -emit-pch
+// CHECK-YCTP: -o
+// CHECK-YCTP: pchfile.pch
+// CHECK-YCTP: -x
+// CHECK-YCTP: "c++"
+
+// Except if a later /TC changes it back.
+// RUN: %clang_cl -Werror /Ycpchfile.h /FIpchfile.h /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YCTPTC %s
+// CHECK-YCTPTC: cc1
+// CHECK-YCTPTC: -emit-pch
+// CHECK-YCTPTC: -o
+// CHECK-YCTPTC: pchfile.pch
+// CHECK-YCTPTC: -x
+// CHECK-YCTPTC: "c"
+
+// Also check lower-case /Tp flag.
+// RUN: %clang_cl -Werror /Tp%s /Ycpchfile.h /FIpchfile.h /c -### 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YCTp %s
+// CHECK-YCTp: cc1
+// CHECK-YCTp: -emit-pch
+// CHECK-YCTp: -o
+// CHECK-YCTp: pchfile.pch
+// CHECK-YCTp: -x
+// CHECK-YCTp: "c++"
diff --git a/test/Driver/cl-pch.cpp b/test/Driver/cl-pch.cpp
new file mode 100644
index 0000000000000..8d701da5b0b2b
--- /dev/null
+++ b/test/Driver/cl-pch.cpp
@@ -0,0 +1,324 @@
+// Note: %s and %S must be preceded by --, otherwise it may be interpreted as a
+// command-line option, e.g. on Mac where %s is commonly under /Users.
+
+// /Yc
+// RUN: %clang_cl -Werror /Ycpchfile.h /FIpchfile.h /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YC %s
+// 1. Build .pch file.
+// CHECK-YC: cc1
+// CHECK-YC: -emit-pch
+// CHECK-YC: -o
+// CHECK-YC: pchfile.pch
+// CHECK-YC: -x
+// CHECK-YC: "c++"
+// 2. Use .pch file.
+// CHECK-YC: cc1
+// CHECK-YC: -emit-obj
+// CHECK-YC: -include-pch
+// CHECK-YC: pchfile.pch
+
+// /Yc /Fo
+// /Fo overrides the .obj output filename, but not the .pch filename
+// RUN: %clang_cl -Werror /Fomyobj.obj /Ycpchfile.h /FIpchfile.h /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YCO %s
+// 1. Build .pch file.
+// CHECK-YCO: cc1
+// CHECK-YCO: -emit-pch
+// CHECK-YCO: -o
+// CHECK-YCO: pchfile.pch
+// 2. Use .pch file.
+// CHECK-YCO: cc1
+// CHECK-YCO: -emit-obj
+// CHECK-YCO: -include-pch
+// CHECK-YCO: pchfile.pch
+// CHECK-YCO: -o
+// CHECK-YCO: myobj.obj
+
+// /Yc /Y-
+// /Y- disables pch generation
+// RUN: %clang_cl -Werror /Y- /Ycpchfile.h /FIpchfile.h /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YC-Y_ %s
+// CHECK-YC-Y_-NOT: -emit-pch
+// CHECK-YC-Y_-NOT: -include-pch
+
+// /Yu
+// RUN: %clang_cl -Werror /Yupchfile.h /FIpchfile.h /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YU %s
+// Use .pch file, but don't build it.
+// CHECK-YU-NOT: -emit-pch
+// CHECK-YU: cc1
+// CHECK-YU: -emit-obj
+// CHECK-YU: -include-pch
+// CHECK-YU: pchfile.pch
+
+// /Yu /Y-
+// RUN: %clang_cl -Werror /Y- /Yupchfile.h /FIpchfile.h /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YU-Y_ %s
+// CHECK-YU-Y_-NOT: -emit-pch
+// CHECK-YU-Y_-NOT: -include-pch
+
+// /Yc /Yu -- /Yc overrides /Yc if they both refer to the same file
+// RUN: %clang_cl -Werror /Ycpchfile.h /Yupchfile.h /FIpchfile.h /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YC-YU %s
+// 1. Build .pch file.
+// CHECK-YC-YU: cc1
+// CHECK-YC-YU: -emit-pch
+// CHECK-YC-YU: -o
+// CHECK-YC-YU: pchfile.pch
+// 2. Use .pch file.
+// CHECK-YC-YU: cc1
+// CHECK-YC-YU: -emit-obj
+// CHECK-YC-YU: -include-pch
+// CHECK-YC-YU: pchfile.pch
+
+// If /Yc /Yu refer to different files, semantics are pretty wonky.  Since this
+// doesn't seem like something that's important in practice, just punt for now.
+// RUN: %clang_cl -Werror /Ycfoo1.h /Yufoo2.h /FIfoo1.h /FIfoo2.h /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YC-YU-MISMATCH %s
+// CHECK-YC-YU-MISMATCH: error: support for '/Yc' and '/Yu' with different filenames not implemented yet; flags ignored
+
+// Similarly, punt on /Yc with more than one input file.
+// RUN: %clang_cl -Werror /Ycfoo1.h /FIfoo1.h /c -### -- %s %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YC-MULTIINPUT %s
+// CHECK-YC-MULTIINPUT: error: support for '/Yc' with more than one source file not implemented yet; flag ignored
+
+// /Yc /Yu /Y-
+// RUN: %clang_cl -Werror /Ycpchfile.h /Yupchfile.h /FIpchfile.h /Y- /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YC-YU-Y_ %s
+// CHECK-YC-YU-Y_-NOT: -emit-pch
+// CHECK-YC-YU-Y_-NOT: -include-pch
+
+// Test computation of pch filename in various cases.
+
+// /Yu /Fpout.pch => out.pch is filename
+// RUN: %clang_cl -Werror /Yupchfile.h /FIpchfile.h /Fpout.pch /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YUFP1 %s
+// Use .pch file, but don't build it.
+// CHECK-YUFP1: -include-pch
+// CHECK-YUFP1: out.pch
+
+// /Yu /Fpout => out.pch is filename (.pch gets added if no extension present)
+// RUN: %clang_cl -Werror /Yupchfile.h /FIpchfile.h /Fpout.pch /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YUFP2 %s
+// Use .pch file, but don't build it.
+// CHECK-YUFP2: -include-pch
+// CHECK-YUFP2: out.pch
+
+// /Yu /Fpout.bmp => out.bmp is filename (.pch not added when extension present)
+// RUN: %clang_cl -Werror /Yupchfile.h /FIpchfile.h /Fpout.bmp /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YUFP3 %s
+// Use .pch file, but don't build it.
+// CHECK-YUFP3: -include-pch
+// CHECK-YUFP3: out.bmp
+
+// /Yusub/dir.h => sub/dir.pch
+// RUN: %clang_cl -Werror /Yusub/pchfile.h /FIsub/pchfile.h /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YUFP4 %s
+// Use .pch file, but don't build it.
+// CHECK-YUFP4: -include-pch
+// CHECK-YUFP4: sub/pchfile.pch
+
+// /Yudir.h /Isub => dir.pch
+// RUN: %clang_cl -Werror /Yupchfile.h /FIpchfile.h /Isub /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YUFP5 %s
+// Use .pch file, but don't build it.
+// CHECK-YUFP5: -include-pch
+// CHECK-YUFP5: pchfile.pch
+
+// FIXME: /Fpdir: use dir/VCx0.pch when dir is directory, where x is major MSVS
+// version in use.
+
+// Spot-check one use of /Fp with /Yc too, else trust the /Yu test cases above
+// also all assume to /Yc.
+// RUN: %clang_cl -Werror /Ycpchfile.h /FIpchfile.h /Fpsub/file.pch /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YCFP %s
+// 1. Build .pch file.
+// CHECK-YCFP: cc1
+// CHECK-YCFP: -emit-pch
+// CHECK-YCFP: -o
+// CHECK-YCFP: sub/file.pch
+// 2. Use .pch file.
+// CHECK-YCFP: cc1
+// CHECK-YCFP: -emit-obj
+// CHECK-YCFP: -include-pch
+// CHECK-YCFP: sub/file.pch
+
+// /Ycfoo2.h /FIfoo1.h /FIfoo2.h /FIfoo3.h
+// => foo1 and foo2 go into pch, foo3 into main compilation
+// /Yc
+// RUN: %clang_cl -Werror /Ycfoo2.h /FIfoo1.h /FIfoo2.h /FIfoo3.h /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YCFIFIFI %s
+// 1. Build .pch file: Includes foo1.h (but NOT foo3.h) and compiles foo2.h
+// CHECK-YCFIFIFI: cc1
+// CHECK-YCFIFIFI: -emit-pch
+// CHECK-YCFIFIFI: -include
+// CHECK-YCFIFIFI: foo1.h
+// CHECK-YCFIFIFI-NOT: foo2.h
+// CHECK-YCFIFIFI-NOT: foo3.h
+// CHECK-YCFIFIFI: -o
+// CHECK-YCFIFIFI: foo2.pch
+// CHECK-YCFIFIFI: -x
+// CHECK-YCFIFIFI: "c++"
+// CHECK-YCFIFIFI: foo2.h
+// 2. Use .pch file: Inlucdes foo2.pch and foo3.h
+// CHECK-YCFIFIFI: cc1
+// CHECK-YCFIFIFI: -emit-obj
+// CHECK-YCFIFIFI-NOT: foo1.h
+// CHECK-YCFIFIFI-NOT: foo2.h
+// CHECK-YCFIFIFI: -include-pch
+// CHECK-YCFIFIFI: foo2.pch
+// CHECK-YCFIFIFI: -include
+// CHECK-YCFIFIFI: foo3.h
+
+// /Yucfoo2.h /FIfoo1.h /FIfoo2.h /FIfoo3.h
+// => foo1 foo2 filtered out, foo3 into main compilation
+// RUN: %clang_cl -Werror /Yufoo2.h /FIfoo1.h /FIfoo2.h /FIfoo3.h /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YUFIFIFI %s
+// Use .pch file, but don't build it.
+// CHECK-YUFIFIFI-NOT: -emit-pch
+// CHECK-YUFIFIFI: cc1
+// CHECK-YUFIFIFI: -emit-obj
+// CHECK-YUFIFIFI-NOT: foo1.h
+// CHECK-YUFIFIFI-NOT: foo2.h
+// CHECK-YUFIFIFI: -include-pch
+// CHECK-YUFIFIFI: foo2.pch
+// CHECK-YUFIFIFI: -include
+// CHECK-YUFIFIFI: foo3.h
+
+// FIXME: Implement support for /Ycfoo.h / /Yufoo.h without /FIfoo.h
+// RUN: %clang_cl -Werror /Ycfoo.h /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YC-NOFI %s
+// CHECK-YC-NOFI: error: support for '/Yc' without a corresponding /FI flag not implemented yet; flag ignored
+// RUN: %clang_cl -Werror /Yufoo.h /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YU-NOFI %s
+// CHECK-YU-NOFI: error: support for '/Yu' without a corresponding /FI flag not implemented yet; flag ignored
+
+// /Yc and /FI relative to /I paths...
+// The rules are:
+// Yu/Yc and FI parameter must match exactly, else it's not found
+// Must match literally exactly: /FI./foo.h /Ycfoo.h does _not_ work.
+// However, the path can be relative to /I paths.
+// FIXME: Update the error messages below once /FI is no longer required, but
+// these test cases all should stay failures as they fail with cl.exe.
+
+// Check that ./ isn't canonicalized away.
+// RUN: %clang_cl -Werror /Ycpchfile.h /FI./pchfile.h /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YC-I1 %s
+// CHECK-YC-I1: support for '/Yc' without a corresponding /FI flag not implemented yet; flag ignored
+
+// Check that ./ isn't canonicalized away.
+// RUN: %clang_cl -Werror /Yc./pchfile.h /FIpchfile.h /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YC-I2 %s
+// CHECK-YC-I2: support for '/Yc' without a corresponding /FI flag not implemented yet; flag ignored
+
+// With an actual /I argument.
+// RUN: %clang_cl -Werror /Ifoo /Ycpchfile.h /FIpchfile.h /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YC-I3 %s
+// 1. This writes pchfile.pch into the root dir, even if this will pick up
+//    foo/pchfile.h
+// CHECK-YC-I3: cc1
+// CHECK-YC-I3: -emit-pch
+// CHECK-YC-I3: -o
+// CHECK-YC-I3: pchfile.pch
+// 2. Use .pch file.
+// CHECK-YC-I3: cc1
+// CHECK-YC-I3: -emit-obj
+// CHECK-YC-I3: -include-pch
+// CHECK-YC-I3: pchfile.pch
+
+// Check that ./ isn't canonicalized away for /Yu either.
+// RUN: %clang_cl -Werror /Yupchfile.h /FI./pchfile.h /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YU-I1 %s
+// CHECK-YU-I1: support for '/Yu' without a corresponding /FI flag not implemented yet; flag ignored
+
+// But /FIfoo/bar.h /Ycfoo\bar.h does work, as does /FIfOo.h /Ycfoo.H
+// FIXME: This part isn't implemented yet. The following two tests should not
+// show an error but do regular /Yu handling.
+// RUN: %clang_cl -Werror /YupchFILE.h /FI./pchfile.h /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YU-CASE %s
+// CHECK-YU-CASE: support for '/Yu' without a corresponding /FI flag not implemented yet; flag ignored
+// RUN: %clang_cl -Werror /Yu./pchfile.h /FI.\pchfile.h /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YU-SLASH %s
+// CHECK-YU-SLASH: support for '/Yu' without a corresponding /FI flag not implemented yet; flag ignored
+
+// cl.exe warns on multiple /Yc, /Yu, /Fp arguments, but clang-cl silently just
+// uses the last one.  This is true for e.g. /Fo too, so not warning on this
+// is self-consistent with clang-cl's flag handling.
+
+// Interaction with /fallback
+
+// /Yc /fallback => /Yc not passed on (but /FI is)
+// RUN: %clang_cl -Werror /Ycpchfile.h /FIpchfile.h /Fpfoo.pch /fallback /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YC-FALLBACK %s
+// Note that in /fallback builds, if creation of the pch fails the main compile
+// does still run so that /fallback can have an effect (this part is not tested)
+// CHECK-YC-FALLBACK: cc1
+// CHECK-YC-FALLBACK: -emit-obj
+// CHECK-YC-FALLBACK: -include-pch
+// CHECK-YC-FALLBACK: foo.pch
+// CHECK-YC-FALLBACK: ||
+// CHECK-YC-FALLBACK: cl.exe
+// CHECK-YC-FALLBACK-NOT: -include-pch
+// CHECK-YC-FALLBACK-NOT: /Ycpchfile.h
+// CHECK-YC-FALLBACK: /FIpchfile.h
+// CHECK-YC-FALLBACK-NOT: /Fpfoo.pch
+
+// /Yu /fallback => /Yu not passed on (but /FI is)
+// RUN: %clang_cl -Werror /Yupchfile.h /FIpchfile.h /Fpfoo.pch /fallback /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YU-FALLBACK %s
+// CHECK-YU-FALLBACK-NOT: -emit-pch
+// CHECK-YU-FALLBACK: cc1
+// CHECK-YU-FALLBACK: -emit-obj
+// CHECK-YU-FALLBACK: -include-pch
+// CHECK-YU-FALLBACK: foo.pch
+// CHECK-YU-FALLBACK: ||
+// CHECK-YU-FALLBACK: cl.exe
+// CHECK-YU-FALLBACK-NOT: -include-pch
+// CHECK-YU-FALLBACK-NOT: /Yupchfile.h
+// CHECK-YU-FALLBACK: /FIpchfile.h
+// CHECK-YU-FALLBACK-NOT: /Fpfoo.pch
+
+// /FI without /Yu => pch file not used, even if it exists (different from
+// -include, which picks up .gch files if they exist).
+// RUN: touch %t.pch
+// RUN: %clang_cl -Werror /FI%t.pch /Fp%t.pch /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-FI %s
+// CHECK-FI-NOT: -include-pch
+// CHECK-FI: -include
+
+// Test interaction of /Yc with language mode flags.
+
+// If /TC changes the input language to C, a c pch file should be produced.
+// RUN: %clang_cl /TC -Werror /Ycpchfile.h /FIpchfile.h /c -### -- %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YCTC %s
+// CHECK-YCTC: cc1
+// CHECK-YCTC: -emit-pch
+// CHECK-YCTC: -o
+// CHECK-YCTC: pchfile.pch
+// CHECK-YCTC: -x
+// CHECK-YCTC: "c"
+
+// Also check lower-case /Tc variant.
+// RUN: %clang_cl -Werror /Ycpchfile.h /FIpchfile.h /c -### /Tc%s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-YCTc %s
+// CHECK-YCTc: cc1
+// CHECK-YCTc: -emit-pch
+// CHECK-YCTc: -o
+// CHECK-YCTc: pchfile.pch
+// CHECK-YCTc: -x
+// CHECK-YCTc: "c"
+
+// Don't crash when a non-source file is passed.
+// RUN: %clang_cl -Werror /Ycpchfile.h /FIpchfile.h /c -### -- %S/Inputs/file.prof 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-NoSource %s
+// CHECK-NoSource: file.prof:{{.*}}input unused
+
+// ...but if an explicit file turns the file into a source file, handle it:
+// RUN: %clang_cl /TP -Werror /Ycpchfile.h /FIpchfile.h /c -### -- %S/Inputs/file.prof 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-NoSourceTP %s
+// CHECK-NoSourceTP: cc1
+// CHECK-NoSourceTP: -emit-pch
+// CHECK-NoSourceTP: -o
+// CHECK-NoSourceTP: pchfile.pch
+// CHECK-NoSourceTP: -x
+// CHECK-NoSourceTP: "c++"
diff --git a/test/Driver/cl-runtime-flags.c b/test/Driver/cl-runtime-flags.c
index a54aa1a14aee3..3fa036d20199e 100644
--- a/test/Driver/cl-runtime-flags.c
+++ b/test/Driver/cl-runtime-flags.c
@@ -13,6 +13,7 @@
 // CHECK-MT-NOT: "-D_DEBUG"
 // CHECK-MT: "-D_MT"
 // CHECK-MT-NOT: "-D_DLL"
+// CHECK-MT: "-flto-visibility-public-std"
 // CHECK-MT: "--dependent-lib=libcmt"
 // CHECK-MT: "--dependent-lib=oldnames"
 
@@ -21,6 +22,7 @@
 // CHECK-MTd: "-D_DEBUG"
 // CHECK-MTd: "-D_MT"
 // CHECK-MTd-NOT: "-D_DLL"
+// CHECK-MTd: "-flto-visibility-public-std"
 // CHECK-MTd: "--dependent-lib=libcmtd"
 // CHECK-MTd: "--dependent-lib=oldnames"
 
diff --git a/test/Driver/clang-translation.c b/test/Driver/clang-translation.c
index 422aa139346d9..cc3cd1b58b1f4 100644
--- a/test/Driver/clang-translation.c
+++ b/test/Driver/clang-translation.c
@@ -245,9 +245,20 @@
 // RUN: FileCheck -check-prefix=MIPSEL-ANDROID %s
 // MIPSEL-ANDROID: clang
 // MIPSEL-ANDROID: "-cc1"
-// MIPSEL-ANDROID: "-target-cpu" "mips32r2"
+// MIPSEL-ANDROID: "-target-cpu" "mips32"
+// MIPSEL-ANDROID: "-target-feature" "+fpxx"
+// MIPSEL-ANDROID: "-target-feature" "+nooddspreg"
 // MIPSEL-ANDROID: "-mfloat-abi" "hard"
 
+// RUN: %clang -target mipsel-linux-android -### -S %s -mcpu=mips32r6 2>&1 | \
+// RUN: FileCheck -check-prefix=MIPSEL-ANDROID-R6 %s
+// MIPSEL-ANDROID-R6: clang
+// MIPSEL-ANDROID-R6: "-cc1"
+// MIPSEL-ANDROID-R6: "-target-cpu" "mips32r6"
+// MIPSEL-ANDROID-R6: "-target-feature" "+fp64"
+// MIPSEL-ANDROID-R6: "-target-feature" "+nooddspreg"
+// MIPSEL-ANDROID-R6: "-mfloat-abi" "hard"
+
 // RUN: %clang -target mips64-linux-gnu -### -S %s 2>&1 | \
 // RUN: FileCheck -check-prefix=MIPS64 %s
 // MIPS64: clang
diff --git a/test/Driver/clang_f_opts.c b/test/Driver/clang_f_opts.c
index 25a1930bdd6d5..ea5b8d1ecd526 100644
--- a/test/Driver/clang_f_opts.c
+++ b/test/Driver/clang_f_opts.c
@@ -66,7 +66,7 @@
 // CHECK-PROFILE-ARCS: "-femit-coverage-data"
 // CHECK-NO-PROFILE-ARCS-NOT: "-femit-coverage-data"
 
-// RUN: %clang -### -S -fprofile-generate %s 2>&1 | FileCheck -check-prefix=CHECK-PROFILE-GENERATE %s
+// RUN: %clang -### -S -fprofile-generate %s 2>&1 | FileCheck -check-prefix=CHECK-PROFILE-GENERATE-LLVM %s
 // RUN: %clang -### -S -fprofile-instr-generate %s 2>&1 | FileCheck -check-prefix=CHECK-PROFILE-GENERATE %s
 // RUN: %clang -### -S -fprofile-generate=/some/dir %s 2>&1 | FileCheck -check-prefix=CHECK-PROFILE-GENERATE-DIR %s
 // RUN: %clang -### -S -fprofile-instr-generate=/tmp/somefile.profraw %s 2>&1 | FileCheck -check-prefix=CHECK-PROFILE-GENERATE-FILE %s
@@ -87,9 +87,9 @@
 // RUN: %clang -### -S -fprofile-generate=dir -fprofile-instr-use %s 2>&1 | FileCheck -check-prefix=CHECK-NO-MIX-GEN-USE %s
 // RUN: %clang -### -S -fprofile-generate=dir -fprofile-instr-use=file %s 2>&1 | FileCheck -check-prefix=CHECK-NO-MIX-GEN-USE %s
 // RUN: %clang -### -S -fprofile-instr-generate=file -fno-profile-instr-generate %s 2>&1 | FileCheck -check-prefix=CHECK-DISABLE-GEN %s
-// RUN: %clang -### -S -fprofile-instr-generate=file -fno-profile-generate %s 2>&1 | FileCheck -check-prefix=CHECK-DISABLE-GEN %s
+// RUN: %clang -### -S -fprofile-instr-generate -fprofile-generate %s 2>&1 | FileCheck -check-prefix=CHECK-NO-MIX-GENERATE %s
+// RUN: %clang -### -S -fprofile-instr-generate -fprofile-generate=file %s 2>&1 | FileCheck -check-prefix=CHECK-NO-MIX-GENERATE %s
 // RUN: %clang -### -S -fprofile-generate=dir -fno-profile-generate %s 2>&1 | FileCheck -check-prefix=CHECK-DISABLE-GEN %s
-// RUN: %clang -### -S -fprofile-generate=dir -fno-profile-instr-generate %s 2>&1 | FileCheck -check-prefix=CHECK-DISABLE-GEN %s
 // RUN: %clang -### -S -fprofile-instr-use=file -fno-profile-instr-use %s 2>&1 | FileCheck -check-prefix=CHECK-DISABLE-USE %s
 // RUN: %clang -### -S -fprofile-instr-use=file -fno-profile-use %s 2>&1 | FileCheck -check-prefix=CHECK-DISABLE-USE %s
 // RUN: %clang -### -S -fprofile-use=file -fno-profile-use %s 2>&1 | FileCheck -check-prefix=CHECK-DISABLE-USE %s
@@ -97,11 +97,13 @@
 // RUN: %clang -### -S -fcoverage-mapping %s 2>&1 | FileCheck -check-prefix=CHECK-COVERAGE-AND-GEN %s
 // RUN: %clang -### -S -fcoverage-mapping -fno-coverage-mapping %s 2>&1 | FileCheck -check-prefix=CHECK-DISABLE-COVERAGE %s
 // RUN: %clang -### -S -fprofile-instr-generate -fcoverage-mapping -fno-coverage-mapping %s 2>&1 | FileCheck -check-prefix=CHECK-DISABLE-COVERAGE %s
-// CHECK-PROFILE-GENERATE: "-fprofile-instr-generate"
-// CHECK-PROFILE-GENERATE-DIR: "-fprofile-instr-generate=/some/dir{{/|\\\\}}default.profraw"
-// CHECK-PROFILE-GENERATE-FILE: "-fprofile-instr-generate=/tmp/somefile.profraw"
+// CHECK-PROFILE-GENERATE: "-fprofile-instrument=clang"
+// CHECK-PROFILE-GENERATE-LLVM: "-fprofile-instrument=llvm"
+// CHECK-PROFILE-GENERATE-DIR: "-fprofile-instrument-path=/some/dir{{/|\\\\}}default.profraw"
+// CHECK-PROFILE-GENERATE-FILE: "-fprofile-instrument-path=/tmp/somefile.profraw"
 // CHECK-NO-MIX-GEN-USE: '{{[a-z=-]*}}' not allowed with '{{[a-z=-]*}}'
-// CHECK-DISABLE-GEN-NOT: "-fprofile-instr-generate"
+// CHECK-NO-MIX-GENERATE: '{{[a-z=-]*}}' not allowed with '{{[a-z=-]*}}'
+// CHECK-DISABLE-GEN-NOT: "-fprofile-instrument=clang"
 // CHECK-DISABLE-USE-NOT: "-fprofile-instr-use"
 // CHECK-COVERAGE-AND-GEN: '-fcoverage-mapping' only allowed with '-fprofile-instr-generate'
 // CHECK-DISABLE-COVERAGE-NOT: "-fcoverage-mapping"
@@ -111,9 +113,9 @@
 // RUN: mkdir -p %t.d/some/dir
 // RUN: %clang -### -S -fprofile-use=%t.d/some/dir %s 2>&1 | FileCheck -check-prefix=CHECK-PROFILE-USE-DIR %s
 // RUN: %clang -### -S -fprofile-instr-use=/tmp/somefile.prof %s 2>&1 | FileCheck -check-prefix=CHECK-PROFILE-USE-FILE %s
-// CHECK-PROFILE-USE: "-fprofile-instr-use=default.profdata"
-// CHECK-PROFILE-USE-DIR: "-fprofile-instr-use={{.*}}.d/some/dir{{/|\\\\}}default.profdata"
-// CHECK-PROFILE-USE-FILE: "-fprofile-instr-use=/tmp/somefile.prof"
+// CHECK-PROFILE-USE: "-fprofile-instrument-use-path=default.profdata"
+// CHECK-PROFILE-USE-DIR: "-fprofile-instrument-use-path={{.*}}.d/some/dir{{/|\\\\}}default.profdata"
+// CHECK-PROFILE-USE-FILE: "-fprofile-instrument-use-path=/tmp/somefile.prof"
 
 // RUN: %clang -### -S -fvectorize %s 2>&1 | FileCheck -check-prefix=CHECK-VECTORIZE %s
 // RUN: %clang -### -S -fno-vectorize -fvectorize %s 2>&1 | FileCheck -check-prefix=CHECK-VECTORIZE %s
@@ -285,7 +287,6 @@
 // RUN: -fexpensive-optimizations                                             \
 // RUN: -fno-expensive-optimizations                                          \
 // RUN: -fno-defer-pop                                                        \
-// RUN: -finline-functions                                                    \
 // RUN: -fkeep-inline-functions                                               \
 // RUN: -fno-keep-inline-functions                                            \
 // RUN: -freorder-blocks                                                      \
@@ -353,7 +354,6 @@
 // CHECK-WARNING-DAG: optimization flag '-fexpensive-optimizations' is not supported
 // CHECK-WARNING-DAG: optimization flag '-fno-expensive-optimizations' is not supported
 // CHECK-WARNING-DAG: optimization flag '-fno-defer-pop' is not supported
-// CHECK-WARNING-DAG: optimization flag '-finline-functions' is not supported
 // CHECK-WARNING-DAG: optimization flag '-fkeep-inline-functions' is not supported
 // CHECK-WARNING-DAG: optimization flag '-fno-keep-inline-functions' is not supported
 // CHECK-WARNING-DAG: optimization flag '-freorder-blocks' is not supported
diff --git a/test/Driver/cloudabi.c b/test/Driver/cloudabi.c
index 99a2bc24f6534..7a72f6189515b 100644
--- a/test/Driver/cloudabi.c
+++ b/test/Driver/cloudabi.c
@@ -1,3 +1,8 @@
-// RUN: %clang %s -### -target x86_64-unknown-cloudabi 2>&1 | FileCheck %s
-// CHECK: "-cc1" "-triple" "x86_64-unknown-cloudabi" {{.*}} "-ffunction-sections" "-fdata-sections"
-// CHECK: "-Bstatic" "--eh-frame-hdr" "--gc-sections" "-o" "a.out" "crt0.o" "crtbegin.o" "{{.*}}" "{{.*}}" "-lc" "-lcompiler_rt" "crtend.o"
+// RUN: %clang %s -### -target x86_64-unknown-cloudabi 2>&1 | FileCheck %s -check-prefix=SAFESTACK
+// SAFESTACK: "-cc1" "-triple" "x86_64-unknown-cloudabi" {{.*}} "-ffunction-sections" "-fdata-sections" {{.*}} "-fsanitize=safe-stack"
+// SAFESTACK: "-Bstatic" "-pie" "--no-dynamic-linker" "-zrelro" "--eh-frame-hdr" "--gc-sections" "-o" "a.out" "crt0.o" "crtbegin.o" "{{.*}}" "{{.*}}" "-lc" "-lcompiler_rt" "crtend.o"
+
+// RUN: %clang %s -### -target x86_64-unknown-cloudabi -fno-sanitize=safe-stack 2>&1 | FileCheck %s -check-prefix=NOSAFESTACK
+// NOSAFESTACK: "-cc1" "-triple" "x86_64-unknown-cloudabi" {{.*}} "-ffunction-sections" "-fdata-sections"
+// NOSAFESTACK-NOT: "-fsanitize=safe-stack"
+// NOSAFESTACK: "-Bstatic" "-pie" "--no-dynamic-linker" "-zrelro" "--eh-frame-hdr" "--gc-sections" "-o" "a.out" "crt0.o" "crtbegin.o" "{{.*}}" "{{.*}}" "-lc" "-lcompiler_rt" "crtend.o"
diff --git a/test/Driver/cloudabi.cpp b/test/Driver/cloudabi.cpp
index c3b68ae88099c..dd52ac36acc81 100644
--- a/test/Driver/cloudabi.cpp
+++ b/test/Driver/cloudabi.cpp
@@ -1,3 +1,8 @@
-// RUN: %clangxx %s -### -target x86_64-unknown-cloudabi 2>&1 | FileCheck %s
-// CHECK: "-cc1" "-triple" "x86_64-unknown-cloudabi" {{.*}} "-ffunction-sections" "-fdata-sections"
-// CHECK: "-Bstatic" "--eh-frame-hdr" "--gc-sections" "-o" "a.out" "crt0.o" "crtbegin.o" "{{.*}}" "{{.*}}" "-lc++" "-lc++abi" "-lunwind" "-lc" "-lcompiler_rt" "crtend.o"
+// RUN: %clangxx %s -### -target x86_64-unknown-cloudabi 2>&1 | FileCheck %s -check-prefix=SAFESTACK
+// SAFESTACK: "-cc1" "-triple" "x86_64-unknown-cloudabi" {{.*}} "-ffunction-sections" "-fdata-sections" {{.*}} "-fsanitize=safe-stack"
+// SAFESTACK: "-Bstatic" "-pie" "--no-dynamic-linker" "-zrelro" "--eh-frame-hdr" "--gc-sections" "-o" "a.out" "crt0.o" "crtbegin.o" "{{.*}}" "{{.*}}" "-lc++" "-lc++abi" "-lunwind" "-lc" "-lcompiler_rt" "crtend.o"
+
+// RUN: %clangxx %s -### -target x86_64-unknown-cloudabi -fno-sanitize=safe-stack 2>&1 | FileCheck %s -check-prefix=NOSAFESTACk
+// NOSAFESTACK: "-cc1" "-triple" "x86_64-unknown-cloudabi" {{.*}} "-ffunction-sections" "-fdata-sections"
+// NOSAFESTACK-NOT: "-fsanitize=safe-stack"
+// NOSAFESTACk: "-Bstatic" "-pie" "--no-dynamic-linker" "-zrelro" "--eh-frame-hdr" "--gc-sections" "-o" "a.out" "crt0.o" "crtbegin.o" "{{.*}}" "{{.*}}" "-lc++" "-lc++abi" "-lunwind" "-lc" "-lcompiler_rt" "crtend.o"
diff --git a/test/Driver/cuda-arch-translation.cu b/test/Driver/cuda-arch-translation.cu
new file mode 100644
index 0000000000000..64ddb3178cd24
--- /dev/null
+++ b/test/Driver/cuda-arch-translation.cu
@@ -0,0 +1,37 @@
+// Tests that "sm_XX" gets correctly converted to "compute_YY" when we invoke
+// fatbinary.
+//
+// REQUIRES: clang-driver
+// REQUIRES: x86-registered-target
+// REQUIRES: nvptx-registered-target
+
+// CHECK:fatbinary
+
+// RUN: %clang -### -target x86_64-linux-gnu -c --cuda-gpu-arch=sm_20 %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM20 %s
+// RUN: %clang -### -target x86_64-linux-gnu -c --cuda-gpu-arch=sm_21 %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM21 %s
+// RUN: %clang -### -target x86_64-linux-gnu -c --cuda-gpu-arch=sm_30 %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM30 %s
+// RUN: %clang -### -target x86_64-linux-gnu -c --cuda-gpu-arch=sm_32 %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM32 %s
+// RUN: %clang -### -target x86_64-linux-gnu -c --cuda-gpu-arch=sm_35 %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM35 %s
+// RUN: %clang -### -target x86_64-linux-gnu -c --cuda-gpu-arch=sm_37 %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM37 %s
+// RUN: %clang -### -target x86_64-linux-gnu -c --cuda-gpu-arch=sm_50 %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM50 %s
+// RUN: %clang -### -target x86_64-linux-gnu -c --cuda-gpu-arch=sm_52 %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM52 %s
+// RUN: %clang -### -target x86_64-linux-gnu -c --cuda-gpu-arch=sm_53 %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM53 %s
+
+// SM20:--image=profile=sm_20{{.*}}--image=profile=compute_20
+// SM21:--image=profile=sm_21{{.*}}--image=profile=compute_20
+// SM30:--image=profile=sm_30{{.*}}--image=profile=compute_30
+// SM32:--image=profile=sm_32{{.*}}--image=profile=compute_32
+// SM35:--image=profile=sm_35{{.*}}--image=profile=compute_35
+// SM37:--image=profile=sm_37{{.*}}--image=profile=compute_37
+// SM50:--image=profile=sm_50{{.*}}--image=profile=compute_50
+// SM52:--image=profile=sm_52{{.*}}--image=profile=compute_52
+// SM53:--image=profile=sm_53{{.*}}--image=profile=compute_53
diff --git a/test/Driver/cuda-constructor-alias.cu b/test/Driver/cuda-constructor-alias.cu
new file mode 100644
index 0000000000000..e0fd329abf627
--- /dev/null
+++ b/test/Driver/cuda-constructor-alias.cu
@@ -0,0 +1,13 @@
+// REQUIRES: clang-driver
+// REQUIRES: x86-registered-target
+// REQUIRES: nvptx-registered-target
+
+// Check that we don't pass -mconstructor-aliases to CUDA device-side
+// compilation, but we do pass it to host-side compilation.
+
+// RUN: %clang -### -target x86_64-linux-gnu %s 2>&1 | FileCheck %s
+// CHECK: "-cc1"
+// CHECK-NOT: "-fcuda-is-device" {{.*}}"-mconstructor-aliases"
+// CHECK-NOT: "-mconstructor-aliases" {{.*}}"-fcuda-is-device"
+// CHECK: "-cc1"
+// CHECK-SAME: "-mconstructor-aliases"
diff --git a/test/Driver/cuda-external-tools.cu b/test/Driver/cuda-external-tools.cu
new file mode 100644
index 0000000000000..280c60966bd0e
--- /dev/null
+++ b/test/Driver/cuda-external-tools.cu
@@ -0,0 +1,103 @@
+// Tests that ptxas and fatbinary are correctly during CUDA compilation.
+//
+// REQUIRES: clang-driver
+// REQUIRES: x86-registered-target
+// REQUIRES: nvptx-registered-target
+
+// Regular compiles with -O{0,1,2,3,4,fast}.  -O4 and -Ofast map to ptxas O3.
+// RUN: %clang -### -target x86_64-linux-gnu -O0 -c %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM20 -check-prefix OPT0 %s
+// RUN: %clang -### -target x86_64-linux-gnu -O1 -c %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM20 -check-prefix OPT1 %s
+// RUN: %clang -### -target x86_64-linux-gnu -O2 -c %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM20 -check-prefix OPT2 %s
+// RUN: %clang -### -target x86_64-linux-gnu -O3 -c %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM20 -check-prefix OPT3 %s
+// RUN: %clang -### -target x86_64-linux-gnu -O4 -c %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM20 -check-prefix OPT3 %s
+// RUN: %clang -### -target x86_64-linux-gnu -Ofast -c %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM20 -check-prefix OPT3 %s
+
+// With debugging enabled, ptxas should be run with with no ptxas optimizations.
+// RUN: %clang -### -target x86_64-linux-gnu --cuda-noopt-device-debug -O2 -c %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM20 -check-prefix DBG %s
+
+// --no-cuda-noopt-device-debug overrides --cuda-noopt-device-debug.
+// RUN: %clang -### -target x86_64-linux-gnu --cuda-noopt-debug \
+// RUN:   --no-cuda-noopt-debug -O2 -c %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM20 -check-prefix OPT2 %s
+
+// Regular compile without -O.  This should result in us passing -O0 to ptxas.
+// RUN: %clang -### -target x86_64-linux-gnu -c %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM20 -check-prefix OPT0 %s
+
+// Regular compiles with -Os and -Oz.  For lack of a better option, we map
+// these to ptxas -O3.
+// RUN: %clang -### -target x86_64-linux-gnu -Os -c %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM20 -check-prefix OPT2 %s
+// RUN: %clang -### -target x86_64-linux-gnu -Oz -c %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM20 -check-prefix OPT2 %s
+
+// Regular compile targeting sm_35.
+// RUN: %clang -### -target x86_64-linux-gnu --cuda-gpu-arch=sm_35 -c %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM35 %s
+
+// 32-bit compile.
+// RUN: %clang -### -target x86_32-linux-gnu -c %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH32 -check-prefix SM20 %s
+
+// Compile with -fintegrated-as.  This should still cause us to invoke ptxas.
+// RUN: %clang -### -target x86_64-linux-gnu -fintegrated-as -c %s 2>&1 \
+// RUN: | FileCheck -check-prefix ARCH64 -check-prefix SM20 -check-prefix OPT0 %s
+
+// Check -Xcuda-ptxas and -Xcuda-fatbinary
+// RUN: %clang -### -target x86_64-linux-gnu -c -Xcuda-ptxas -foo1 \
+// RUN:   -Xcuda-fatbinary -bar1 -Xcuda-ptxas -foo2 -Xcuda-fatbinary -bar2 %s 2>&1 \
+// RUN: | FileCheck -check-prefix SM20 -check-prefix PTXAS-EXTRA \
+// RUN:   -check-prefix FATBINARY-EXTRA %s
+
+// Match clang job that produces PTX assembly.
+// CHECK: "-cc1" "-triple" "nvptx64-nvidia-cuda"
+// SM20: "-target-cpu" "sm_20"
+// SM35: "-target-cpu" "sm_35"
+// SM20: "-o" "[[PTXFILE:[^"]*]]"
+// SM35: "-o" "[[PTXFILE:[^"]*]]"
+
+// Match the call to ptxas (which assembles PTX to SASS).
+// CHECK:ptxas
+// ARCH64: "-m64"
+// ARCH32: "-m32"
+// OPT0: "-O0"
+// OPT0-NOT: "-g"
+// OPT1: "-O1"
+// OPT1-NOT: "-g"
+// OPT2: "-O2"
+// OPT2-NOT: "-g"
+// OPT3: "-O3"
+// OPT3-NOT: "-g"
+// DBG: "-g" "--dont-merge-basicblocks" "--return-at-end"
+// SM20: "--gpu-name" "sm_20"
+// SM35: "--gpu-name" "sm_35"
+// SM20: "--output-file" "[[CUBINFILE:[^"]*]]"
+// SM35: "--output-file" "[[CUBINFILE:[^"]*]]"
+// PTXAS-EXTRA: "-foo1"
+// PTXAS-EXTRA-SAME: "-foo2"
+// CHECK-SAME: "[[PTXFILE]]"
+
+// Match the call to fatbinary (which combines all our PTX and SASS into one
+// blob).
+// CHECK:fatbinary
+// CHECK-DAG: "--cuda"
+// ARCH64-DAG: "-64"
+// ARCH32-DAG: "-32"
+// CHECK-DAG: "--create" "[[FATBINARY:[^"]*]]"
+// SM20-DAG: "--image=profile=compute_20,file=[[PTXFILE]]"
+// SM35-DAG: "--image=profile=compute_35,file=[[PTXFILE]]"
+// SM20-DAG: "--image=profile=sm_20,file=[[CUBINFILE]]"
+// SM35-DAG: "--image=profile=sm_35,file=[[CUBINFILE]]"
+// FATBINARY-EXTRA: "-bar1"
+// FATBINARY-EXTRA-SAME: "-bar2"
+
+// Match the clang job for host compilation.
+// CHECK: "-cc1" "-triple" "x86_64--linux-gnu"
+// CHECK-SAME: "-fcuda-include-gpubinary" "[[FATBINARY]]"
diff --git a/test/Driver/cuda-march.cu b/test/Driver/cuda-march.cu
new file mode 100644
index 0000000000000..123b6617e0a5c
--- /dev/null
+++ b/test/Driver/cuda-march.cu
@@ -0,0 +1,22 @@
+// Checks that cuda compilation does the right thing when passed -march.
+// (Specifically, we want to pass it to host compilation, but not to device
+// compilation or ptxas!)
+//
+// REQUIRES: clang-driver
+// REQUIRES: x86-registered-target
+// REQUIRES: nvptx-registered-target
+
+// RUN: %clang -no-canonical-prefixes -### -target x86_64-linux-gnu -c \
+// RUN: -march=haswell %s 2>&1 | FileCheck %s
+// RUN: %clang -no-canonical-prefixes -### -target x86_64-linux-gnu -c \
+// RUN: -march=haswell --cuda-gpu-arch=sm_20 %s 2>&1 | FileCheck %s
+
+// CHECK: {{.*}}clang{{.*}}" "-cc1"
+// CHECK-SAME: "-triple" "nvptx
+// CHECK-SAME: "-target-cpu" "sm_20"
+
+// CHECK: ptxas
+// CHECK-SAME: "--gpu-name" "sm_20"
+
+// CHECK: {{.*}}clang{{.*}}" "-cc1"
+// CHECK-SAME: "-target-cpu" "haswell"
diff --git a/test/Driver/cuda-not-found.cu b/test/Driver/cuda-not-found.cu
new file mode 100644
index 0000000000000..b63623ae56c1f
--- /dev/null
+++ b/test/Driver/cuda-not-found.cu
@@ -0,0 +1,12 @@
+// REQUIRES: clang-driver
+
+// Check that we raise an error if we're trying to compile CUDA code but can't
+// find a CUDA install, unless -nocudainc was passed.
+
+// RUN: %clang -### --sysroot=%s/no-cuda-there %s 2>&1 | FileCheck %s --check-prefix ERR
+// RUN: %clang -### --cuda-path=%s/no-cuda-there %s 2>&1 | FileCheck %s --check-prefix ERR
+// ERR: cannot find CUDA installation
+
+// RUN: %clang -### -nocudainc --sysroot=%s/no-cuda-there %s 2>&1 | FileCheck %s --check-prefix OK
+// RUN: %clang -### -nocudainc --cuda-path=%s/no-cuda-there %s 2>&1 | FileCheck %s --check-prefix OK
+// OK-NOT: cannot find CUDA installation
diff --git a/test/Driver/cuda-options.cu b/test/Driver/cuda-options.cu
index bf71633a4cf93..5d650761fb228 100644
--- a/test/Driver/cuda-options.cu
+++ b/test/Driver/cuda-options.cu
@@ -22,29 +22,45 @@
 // RUN: | FileCheck -check-prefix NODEVICE -check-prefix HOST \
 // RUN:    -check-prefix NOINCLUDES-DEVICE -check-prefix LINK %s
 
-// Same test as above, but with preceeding --cuda-device-only to make sure only
-// the last option has an effect.
-// RUN: %clang -### -target x86_64-linux-gnu --cuda-device-only --cuda-host-only %s 2>&1 \
+// Verify that --cuda-device-only disables host-side compilation and linking.
+// RUN: %clang -### -target x86_64-linux-gnu --cuda-device-only %s 2>&1 \
+// RUN: | FileCheck -check-prefix DEVICE -check-prefix DEVICE-NOSAVE \
+// RUN:    -check-prefix NOHOST -check-prefix NOLINK %s
+
+// Check that the last of --cuda-compile-host-device, --cuda-host-only, and
+// --cuda-device-only wins.
+
+// RUN: %clang -### -target x86_64-linux-gnu --cuda-device-only \
+// RUN:    --cuda-host-only %s 2>&1 \
 // RUN: | FileCheck -check-prefix NODEVICE -check-prefix HOST \
 // RUN:    -check-prefix NOINCLUDES-DEVICE -check-prefix LINK %s
 
-// Verify that --cuda-device-only disables host-side compilation and linking.
-// RUN: %clang -### -target x86_64-linux-gnu --cuda-device-only %s 2>&1 \
+// RUN: %clang -### -target x86_64-linux-gnu --cuda-compile-host-device \
+// RUN:    --cuda-host-only %s 2>&1 \
+// RUN: | FileCheck -check-prefix NODEVICE -check-prefix HOST \
+// RUN:    -check-prefix NOINCLUDES-DEVICE -check-prefix LINK %s
+
+// RUN: %clang -### -target x86_64-linux-gnu --cuda-host-only \
+// RUN:    --cuda-device-only %s 2>&1 \
 // RUN: | FileCheck -check-prefix DEVICE -check-prefix DEVICE-NOSAVE \
 // RUN:    -check-prefix NOHOST -check-prefix NOLINK %s
 
-// Same test as above, but with preceeding --cuda-host-only to make sure only
-// the last option has an effect.
-// RUN: %clang -### -target x86_64-linux-gnu --cuda-host-only --cuda-device-only %s 2>&1 \
+// RUN: %clang -### -target x86_64-linux-gnu --cuda-compile-host-device \
+// RUN:    --cuda-device-only %s 2>&1 \
 // RUN: | FileCheck -check-prefix DEVICE -check-prefix DEVICE-NOSAVE \
 // RUN:    -check-prefix NOHOST -check-prefix NOLINK %s
 
-// Verify that with -S we compile host and device sides to assembly and
-// incorporate device code into the host side.
-// RUN: %clang -### -target x86_64-linux-gnu -S -c %s 2>&1 \
+// RUN: %clang -### -target x86_64-linux-gnu --cuda-host-only \
+// RUN:   --cuda-compile-host-device %s 2>&1 \
 // RUN: | FileCheck -check-prefix DEVICE -check-prefix DEVICE-NOSAVE \
 // RUN:    -check-prefix HOST -check-prefix INCLUDES-DEVICE \
-// RUN:    -check-prefix NOLINK %s
+// RUN:    -check-prefix LINK %s
+
+// RUN: %clang -### -target x86_64-linux-gnu --cuda-device-only \
+// RUN:   --cuda-compile-host-device %s 2>&1 \
+// RUN: | FileCheck -check-prefix DEVICE -check-prefix DEVICE-NOSAVE \
+// RUN:    -check-prefix HOST -check-prefix INCLUDES-DEVICE \
+// RUN:    -check-prefix LINK %s
 
 // Verify that --cuda-gpu-arch option passes the correct GPU archtecture to
 // device compilation.
@@ -61,7 +77,7 @@
 // RUN:    -check-prefix DEVICE2 -check-prefix DEVICE-SM35 \
 // RUN:    -check-prefix DEVICE2-SM30 -check-prefix HOST \
 // RUN:    -check-prefix HOST-NOSAVE -check-prefix INCLUDES-DEVICE \
-// RUN:    -check-prefix INCLUDES-DEVICE2 -check-prefix NOLINK %s
+// RUN:    -check-prefix NOLINK %s
 
 // Verify that device-side results are passed to the correct tool when
 // -save-temps is used.
@@ -92,10 +108,16 @@
 // DEVICE-NOSAVE-SAME: "-aux-triple" "x86_64--linux-gnu"
 // DEVICE-SAME: "-fcuda-is-device"
 // DEVICE-SM35-SAME: "-target-cpu" "sm_35"
-// DEVICE-SAME: "-o" "[[GPUBINARY1:[^"]*]]"
+// DEVICE-SAME: "-o" "[[PTXFILE:[^"]*]]"
 // DEVICE-NOSAVE-SAME: "-x" "cuda"
 // DEVICE-SAVE-SAME: "-x" "ir"
 
+// Match the call to ptxas (which assembles PTX to SASS).
+// DEVICE:ptxas
+// DEVICE-SM35-DAG: "--gpu-name" "sm_35"
+// DEVICE-DAG: "--output-file" "[[CUBINFILE:[^"]*]]"
+// DEVICE-DAG: "[[PTXFILE]]"
+
 // Match another device-side compilation.
 // DEVICE2: "-cc1" "-triple" "nvptx64-nvidia-cuda"
 // DEVICE2-SAME: "-aux-triple" "x86_64--linux-gnu"
@@ -106,23 +128,27 @@
 
 // Match no device-side compilation.
 // NODEVICE-NOT: "-cc1" "-triple" "nvptx64-nvidia-cuda"
-// NODEVICE-SAME-NOT: "-fcuda-is-device"
+// NODEVICE-NOT: "-fcuda-is-device"
+
+// INCLUDES-DEVICE:fatbinary
+// INCLUDES-DEVICE-DAG: "--create" "[[FATBINARY:[^"]*]]"
+// INCLUDES-DEVICE-DAG: "--image=profile=sm_{{[0-9]+}},file=[[CUBINFILE]]"
+// INCLUDES-DEVICE-DAG: "--image=profile=compute_{{[0-9]+}},file=[[PTXFILE]]"
 
 // Match host-side preprocessor job with -save-temps.
 // HOST-SAVE: "-cc1" "-triple" "x86_64--linux-gnu"
 // HOST-SAVE-SAME: "-aux-triple" "nvptx64-nvidia-cuda"
-// HOST-SAVE-SAME-NOT: "-fcuda-is-device"
+// HOST-SAVE-NOT: "-fcuda-is-device"
 // HOST-SAVE-SAME: "-x" "cuda"
 
 // Match host-side compilation.
 // HOST: "-cc1" "-triple" "x86_64--linux-gnu"
 // HOST-SAME: "-aux-triple" "nvptx64-nvidia-cuda"
-// HOST-SAME-NOT: "-fcuda-is-device"
+// HOST-NOT: "-fcuda-is-device"
 // HOST-SAME: "-o" "[[HOSTOUTPUT:[^"]*]]"
 // HOST-NOSAVE-SAME: "-x" "cuda"
 // HOST-SAVE-SAME: "-x" "cuda-cpp-output"
-// INCLUDES-DEVICE-SAME: "-fcuda-include-gpubinary" "[[GPUBINARY1]]"
-// INCLUDES-DEVICE2-SAME: "-fcuda-include-gpubinary" "[[GPUBINARY2]]"
+// INCLUDES-DEVICE-SAME: "-fcuda-include-gpubinary" "[[FATBINARY]]"
 
 // Match external assembler that uses compilation output.
 // HOST-AS: "-o" "{{.*}}.o" "[[HOSTOUTPUT]]"
@@ -132,7 +158,7 @@
 
 // Match no host compilation.
 // NOHOST-NOT: "-cc1" "-triple"
-// NOHOST-SAME-NOT: "-x" "cuda"
+// NOHOST-NOT: "-x" "cuda"
 
 // Match linker.
 // LINK: "{{.*}}{{ld|link}}{{(.exe)?}}"
diff --git a/test/Driver/cuda-output-asm.cu b/test/Driver/cuda-output-asm.cu
new file mode 100644
index 0000000000000..af62478ac27e7
--- /dev/null
+++ b/test/Driver/cuda-output-asm.cu
@@ -0,0 +1,29 @@
+// Tests CUDA compilation with -S.
+
+// REQUIRES: clang-driver
+// REQUIRES: x86-registered-target
+// REQUIRES: nvptx-registered-target
+
+// RUN: %clang -### -S -target x86_64-linux-gnu --cuda-gpu-arch=sm_20 %s 2>&1 \
+// RUN:   | FileCheck -check-prefix HOST -check-prefix SM20 %s
+// RUN: %clang -### -S -target x86_64-linux-gnu --cuda-host-only -o foo.s %s 2>&1 \
+// RUN:   | FileCheck -check-prefix HOST %s
+// RUN: %clang -### -S -target x86_64-linux-gnu --cuda-gpu-arch=sm_20 \
+// RUN:   --cuda-device-only -o foo.s %s 2>&1 \
+// RUN:   | FileCheck -check-prefix SM20 %s
+// RUN: %clang -### -S -target x86_64-linux-gnu --cuda-gpu-arch=sm_20 \
+// RUN:   --cuda-gpu-arch=sm_30 --cuda-device-only %s 2>&1 \
+// RUN:   | FileCheck -check-prefix SM20 -check-prefix SM30 %s
+
+// HOST-DAG: "-cc1" "-triple" "x86_64--linux-gnu"
+// SM20-DAG: "-cc1" "-triple" "nvptx64-nvidia-cuda"
+// SM20-same: "-target-cpu" "sm_20"
+// SM30-DAG: "-cc1" "-triple" "nvptx64-nvidia-cuda"
+// SM30-same: "-target-cpu" "sm_30"
+
+// RUN: %clang -### -S -target x86_64-linux-gnu -o foo.s %s 2>&1 \
+// RUN:   | FileCheck -check-prefix MULTIPLE-OUTPUT-FILES %s
+// RUN: %clang -### -S -target x86_64-linux-gnu --cuda-device-only \
+// RUN:   --cuda-gpu-arch=sm_20 --cuda-gpu-arch=sm_30 -o foo.s %s 2>&1 \
+// RUN:   | FileCheck -check-prefix MULTIPLE-OUTPUT-FILES %s
+// MULTIPLE-OUTPUT-FILES: error: cannot specify -o when generating multiple output files
diff --git a/test/Driver/cuda-unused-arg-warning.cu b/test/Driver/cuda-unused-arg-warning.cu
index e8daad6cdaf6b..cbbb893129b6f 100644
--- a/test/Driver/cuda-unused-arg-warning.cu
+++ b/test/Driver/cuda-unused-arg-warning.cu
@@ -4,11 +4,16 @@
 // REQUIRES: x86-registered-target
 // REQUIRES: nvptx-registered-target
 
-// --cuda-host-only should never trigger unused arg warning.
+// --cuda-host-only and --cuda-compile-host-device should never trigger an
+// unused arg warning.
 // RUN: %clang -### -target x86_64-linux-gnu --cuda-host-only -c %s 2>&1 | \
 // RUN:    FileCheck %s
 // RUN: %clang -### -target x86_64-linux-gnu --cuda-host-only -x c -c %s 2>&1 | \
 // RUN:    FileCheck %s
+// RUN: %clang -### -target x86_64-linux-gnu --cuda-compile-host-device -c %s 2>&1 | \
+// RUN:    FileCheck %s
+// RUN: %clang -### -target x86_64-linux-gnu --cuda-compile-host-device -x c -c %s 2>&1 | \
+// RUN:    FileCheck %s
 
 // --cuda-device-only should warn during non-CUDA compilation.
 // RUN: %clang -### -target x86_64-linux-gnu --cuda-device-only -x c -c %s 2>&1 | \
@@ -19,5 +24,6 @@
 // RUN:    FileCheck -check-prefix NO-UNUSED-WARNING %s
 
 // CHECK-NOT: warning: argument unused during compilation: '--cuda-host-only'
+// CHECK-NOT: warning: argument unused during compilation: '--cuda-compile-host-device'
 // UNUSED-WARNING: warning: argument unused during compilation: '--cuda-device-only'
 // NO-UNUSED-WARNING-NOT: warning: argument unused during compilation: '--cuda-device-only'
diff --git a/test/Driver/cuda-version-check.cu b/test/Driver/cuda-version-check.cu
new file mode 100644
index 0000000000000..65bdd16f96ded
--- /dev/null
+++ b/test/Driver/cuda-version-check.cu
@@ -0,0 +1,51 @@
+// REQUIRES: clang-driver
+// REQUIRES: x86-registered-target
+// REQUIRES: nvptx-registered-target
+
+// RUN: %clang -v -### --cuda-gpu-arch=sm_20 --sysroot=%S/Inputs/CUDA 2>&1 %s | \
+// RUN:    FileCheck %s --check-prefix=OK
+// RUN: %clang -v -### --cuda-gpu-arch=sm_20 --sysroot=%S/Inputs/CUDA_80 2>&1 %s | \
+// RUN:    FileCheck %s --check-prefix=OK
+// RUN: %clang -v -### --cuda-gpu-arch=sm_60 --sysroot=%S/Inputs/CUDA_80 2>&1 %s | \
+// RUN:    FileCheck %s --check-prefix=OK
+
+// The installation at Inputs/CUDA is CUDA 7.0, which doesn't support sm_60.
+// RUN: %clang -v -### --cuda-gpu-arch=sm_60 --sysroot=%S/Inputs/CUDA 2>&1 %s | \
+// RUN:    FileCheck %s --check-prefix=ERR_SM60
+
+// This should only complain about sm_60, not sm_35.
+// RUN: %clang -v -### --cuda-gpu-arch=sm_60 --cuda-gpu-arch=sm_35 \
+// RUN:    --sysroot=%S/Inputs/CUDA 2>&1 %s | \
+// RUN:    FileCheck %s --check-prefix=ERR_SM60 --check-prefix=OK_SM35
+
+// We should get two errors here, one for sm_60 and one for sm_61.
+// RUN: %clang -v -### --cuda-gpu-arch=sm_60 --cuda-gpu-arch=sm_61 \
+// RUN:    --sysroot=%S/Inputs/CUDA 2>&1 %s | \
+// RUN:    FileCheck %s --check-prefix=ERR_SM60 --check-prefix=ERR_SM61
+
+// We should still get an error if we pass -nocudainc, because this compilation
+// would invoke ptxas, and we do a version check on that, too.
+// RUN: %clang -v -### --cuda-gpu-arch=sm_60 -nocudainc --sysroot=%S/Inputs/CUDA 2>&1 %s | \
+// RUN:    FileCheck %s --check-prefix=ERR_SM60
+
+// If with -nocudainc and -E, we don't touch the CUDA install, so we
+// shouldn't get an error.
+// RUN: %clang -v -### -E --cuda-device-only --cuda-gpu-arch=sm_60 -nocudainc \
+// RUN:    --sysroot=%S/Inputs/CUDA 2>&1 %s | \
+// RUN:    FileCheck %s --check-prefix=OK
+
+// --no-cuda-version-check should suppress all of these errors.
+// RUN: %clang -v -### --cuda-gpu-arch=sm_60 --sysroot=%S/Inputs/CUDA 2>&1 \
+// RUN:    --no-cuda-version-check %s | \
+// RUN:    FileCheck %s --check-prefix=OK
+
+// OK-NOT: error: GPU arch
+
+// OK_SM35-NOT: error: GPU arch sm_35
+
+// We should only get one error per architecture.
+// ERR_SM60: error: GPU arch sm_60 {{.*}}
+// ERR_SM60-NOT: error: GPU arch sm_60
+
+// ERR_SM61: error: GPU arch sm_61 {{.*}}
+// ERR_SM61-NOT: error: GPU arch sm_61
diff --git a/test/Driver/cuda_phases.cu b/test/Driver/cuda_phases.cu
new file mode 100644
index 0000000000000..6cfb61aba72a1
--- /dev/null
+++ b/test/Driver/cuda_phases.cu
@@ -0,0 +1,206 @@
+// Tests the phases generated for a CUDA offloading target for different
+// combinations of:
+// - Number of gpu architectures;
+// - Host/device-only compilation;
+// - User-requested final phase - binary or assembly.
+
+// REQUIRES: clang-driver
+// REQUIRES: powerpc-registered-target
+// REQUIRES: nvptx-registered-target
+
+//
+// Test single gpu architecture with complete compilation.
+//
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s 2>&1 \
+// RUN: | FileCheck -check-prefix=BIN %s
+// BIN: 0: input, "{{.*}}cuda_phases.cu", cuda, (host-cuda)
+// BIN: 1: preprocessor, {0}, cuda-cpp-output, (host-cuda)
+// BIN: 2: compiler, {1}, ir, (host-cuda)
+// BIN: 3: input, "{{.*}}cuda_phases.cu", cuda, (device-cuda, sm_30)
+// BIN: 4: preprocessor, {3}, cuda-cpp-output, (device-cuda, sm_30)
+// BIN: 5: compiler, {4}, ir, (device-cuda, sm_30)
+// BIN: 6: backend, {5}, assembler, (device-cuda, sm_30)
+// BIN: 7: assembler, {6}, object, (device-cuda, sm_30)
+// BIN: 8: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {7}, object
+// BIN: 9: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {6}, assembler
+// BIN: 10: linker, {8, 9}, cuda-fatbin, (device-cuda)
+// BIN: 11: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {2}, "device-cuda (nvptx64-nvidia-cuda)" {10}, ir
+// BIN: 12: backend, {11}, assembler, (host-cuda)
+// BIN: 13: assembler, {12}, object, (host-cuda)
+// BIN: 14: linker, {13}, image, (host-cuda)
+
+//
+// Test single gpu architecture up to the assemble phase.
+//
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s -S 2>&1 \
+// RUN: | FileCheck -check-prefix=ASM %s
+// ASM: 0: input, "{{.*}}cuda_phases.cu", cuda, (device-cuda, sm_30)
+// ASM: 1: preprocessor, {0}, cuda-cpp-output, (device-cuda, sm_30)
+// ASM: 2: compiler, {1}, ir, (device-cuda, sm_30)
+// ASM: 3: backend, {2}, assembler, (device-cuda, sm_30)
+// ASM: 4: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {3}, assembler
+// ASM: 5: input, "{{.*}}cuda_phases.cu", cuda, (host-cuda)
+// ASM: 6: preprocessor, {5}, cuda-cpp-output, (host-cuda)
+// ASM: 7: compiler, {6}, ir, (host-cuda)
+// ASM: 8: backend, {7}, assembler, (host-cuda)
+
+//
+// Test two gpu architectures with complete compilation.
+//
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s 2>&1 \
+// RUN: | FileCheck -check-prefix=BIN2 %s
+// BIN2: 0: input, "{{.*}}cuda_phases.cu", cuda, (host-cuda)
+// BIN2: 1: preprocessor, {0}, cuda-cpp-output, (host-cuda)
+// BIN2: 2: compiler, {1}, ir, (host-cuda)
+// BIN2: 3: input, "{{.*}}cuda_phases.cu", cuda, (device-cuda, sm_30)
+// BIN2: 4: preprocessor, {3}, cuda-cpp-output, (device-cuda, sm_30)
+// BIN2: 5: compiler, {4}, ir, (device-cuda, sm_30)
+// BIN2: 6: backend, {5}, assembler, (device-cuda, sm_30)
+// BIN2: 7: assembler, {6}, object, (device-cuda, sm_30)
+// BIN2: 8: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {7}, object
+// BIN2: 9: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {6}, assembler
+// BIN2: 10: input, "{{.*}}cuda_phases.cu", cuda, (device-cuda, sm_35)
+// BIN2: 11: preprocessor, {10}, cuda-cpp-output, (device-cuda, sm_35)
+// BIN2: 12: compiler, {11}, ir, (device-cuda, sm_35)
+// BIN2: 13: backend, {12}, assembler, (device-cuda, sm_35)
+// BIN2: 14: assembler, {13}, object, (device-cuda, sm_35)
+// BIN2: 15: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {14}, object
+// BIN2: 16: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {13}, assembler
+// BIN2: 17: linker, {8, 9, 15, 16}, cuda-fatbin, (device-cuda)
+// BIN2: 18: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {2}, "device-cuda (nvptx64-nvidia-cuda)" {17}, ir
+// BIN2: 19: backend, {18}, assembler, (host-cuda)
+// BIN2: 20: assembler, {19}, object, (host-cuda)
+// BIN2: 21: linker, {20}, image, (host-cuda)
+
+//
+// Test two gpu architecturess up to the assemble phase.
+//
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s -S 2>&1 \
+// RUN: | FileCheck -check-prefix=ASM2 %s
+// ASM2: 0: input, "{{.*}}cuda_phases.cu", cuda, (device-cuda, sm_30)
+// ASM2: 1: preprocessor, {0}, cuda-cpp-output, (device-cuda, sm_30)
+// ASM2: 2: compiler, {1}, ir, (device-cuda, sm_30)
+// ASM2: 3: backend, {2}, assembler, (device-cuda, sm_30)
+// ASM2: 4: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {3}, assembler
+// ASM2: 5: input, "{{.*}}cuda_phases.cu", cuda, (device-cuda, sm_35)
+// ASM2: 6: preprocessor, {5}, cuda-cpp-output, (device-cuda, sm_35)
+// ASM2: 7: compiler, {6}, ir, (device-cuda, sm_35)
+// ASM2: 8: backend, {7}, assembler, (device-cuda, sm_35)
+// ASM2: 9: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {8}, assembler
+// ASM2: 10: input, "{{.*}}cuda_phases.cu", cuda, (host-cuda)
+// ASM2: 11: preprocessor, {10}, cuda-cpp-output, (host-cuda)
+// ASM2: 12: compiler, {11}, ir, (host-cuda)
+// ASM2: 13: backend, {12}, assembler, (host-cuda)
+
+//
+// Test single gpu architecture with complete compilation in host-only
+// compilation mode.
+//
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-host-only 2>&1 \
+// RUN: | FileCheck -check-prefix=HBIN %s
+// HBIN: 0: input, "{{.*}}cuda_phases.cu", cuda, (host-cuda)
+// HBIN: 1: preprocessor, {0}, cuda-cpp-output, (host-cuda)
+// HBIN: 2: compiler, {1}, ir, (host-cuda)
+// HBIN: 3: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {2}, ir
+// HBIN: 4: backend, {3}, assembler, (host-cuda)
+// HBIN: 5: assembler, {4}, object, (host-cuda)
+// HBIN: 6: linker, {5}, image, (host-cuda)
+
+//
+// Test single gpu architecture up to the assemble phase in host-only
+// compilation mode.
+//
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-host-only -S 2>&1 \
+// RUN: | FileCheck -check-prefix=HASM %s
+// HASM: 0: input, "{{.*}}cuda_phases.cu", cuda, (host-cuda)
+// HASM: 1: preprocessor, {0}, cuda-cpp-output, (host-cuda)
+// HASM: 2: compiler, {1}, ir, (host-cuda)
+// HASM: 3: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {2}, ir
+// HASM: 4: backend, {3}, assembler, (host-cuda)
+
+//
+// Test two gpu architectures with complete compilation in host-only
+// compilation mode.
+//
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-host-only 2>&1 \
+// RUN: | FileCheck -check-prefix=HBIN2 %s
+// HBIN2: 0: input, "{{.*}}cuda_phases.cu", cuda, (host-cuda)
+// HBIN2: 1: preprocessor, {0}, cuda-cpp-output, (host-cuda)
+// HBIN2: 2: compiler, {1}, ir, (host-cuda)
+// HBIN2: 3: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {2}, ir
+// HBIN2: 4: backend, {3}, assembler, (host-cuda)
+// HBIN2: 5: assembler, {4}, object, (host-cuda)
+// HBIN2: 6: linker, {5}, image, (host-cuda)
+
+//
+// Test two gpu architectures up to the assemble phase in host-only
+// compilation mode.
+//
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-host-only -S 2>&1 \
+// RUN: | FileCheck -check-prefix=HASM2 %s
+// HASM2: 0: input, "{{.*}}cuda_phases.cu", cuda, (host-cuda)
+// HASM2: 1: preprocessor, {0}, cuda-cpp-output, (host-cuda)
+// HASM2: 2: compiler, {1}, ir, (host-cuda)
+// HASM2: 3: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {2}, ir
+// HASM2: 4: backend, {3}, assembler, (host-cuda)
+
+//
+// Test single gpu architecture with complete compilation in device-only
+// compilation mode.
+//
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-device-only 2>&1 \
+// RUN: | FileCheck -check-prefix=DBIN %s
+// DBIN: 0: input, "{{.*}}cuda_phases.cu", cuda, (device-cuda, sm_30)
+// DBIN: 1: preprocessor, {0}, cuda-cpp-output, (device-cuda, sm_30)
+// DBIN: 2: compiler, {1}, ir, (device-cuda, sm_30)
+// DBIN: 3: backend, {2}, assembler, (device-cuda, sm_30)
+// DBIN: 4: assembler, {3}, object, (device-cuda, sm_30)
+// DBIN: 5: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {4}, object
+
+//
+// Test single gpu architecture up to the assemble phase in device-only
+// compilation mode.
+//
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 %s --cuda-device-only -S 2>&1 \
+// RUN: | FileCheck -check-prefix=DASM %s
+// DASM: 0: input, "{{.*}}cuda_phases.cu", cuda, (device-cuda, sm_30)
+// DASM: 1: preprocessor, {0}, cuda-cpp-output, (device-cuda, sm_30)
+// DASM: 2: compiler, {1}, ir, (device-cuda, sm_30)
+// DASM: 3: backend, {2}, assembler, (device-cuda, sm_30)
+// DASM: 4: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {3}, assembler
+
+//
+// Test two gpu architectures with complete compilation in device-only
+// compilation mode.
+//
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-device-only 2>&1 \
+// RUN: | FileCheck -check-prefix=DBIN2 %s
+// DBIN2: 0: input, "{{.*}}cuda_phases.cu", cuda, (device-cuda, sm_30)
+// DBIN2: 1: preprocessor, {0}, cuda-cpp-output, (device-cuda, sm_30)
+// DBIN2: 2: compiler, {1}, ir, (device-cuda, sm_30)
+// DBIN2: 3: backend, {2}, assembler, (device-cuda, sm_30)
+// DBIN2: 4: assembler, {3}, object, (device-cuda, sm_30)
+// DBIN2: 5: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {4}, object
+// DBIN2: 6: input, "{{.*}}cuda_phases.cu", cuda, (device-cuda, sm_35)
+// DBIN2: 7: preprocessor, {6}, cuda-cpp-output, (device-cuda, sm_35)
+// DBIN2: 8: compiler, {7}, ir, (device-cuda, sm_35)
+// DBIN2: 9: backend, {8}, assembler, (device-cuda, sm_35)
+// DBIN2: 10: assembler, {9}, object, (device-cuda, sm_35)
+// DBIN2: 11: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {10}, object
+
+//
+// Test two gpu architectures up to the assemble phase in device-only
+// compilation mode.
+//
+// RUN: %clang -target powerpc64le-ibm-linux-gnu -ccc-print-phases --cuda-gpu-arch=sm_30 --cuda-gpu-arch=sm_35 %s --cuda-device-only -S 2>&1 \
+// RUN: | FileCheck -check-prefix=DASM2 %s
+// DASM2: 0: input, "{{.*}}cuda_phases.cu", cuda, (device-cuda, sm_30)
+// DASM2: 1: preprocessor, {0}, cuda-cpp-output, (device-cuda, sm_30)
+// DASM2: 2: compiler, {1}, ir, (device-cuda, sm_30)
+// DASM2: 3: backend, {2}, assembler, (device-cuda, sm_30)
+// DASM2: 4: offload, "device-cuda (nvptx64-nvidia-cuda:sm_30)" {3}, assembler
+// DASM2: 5: input, "{{.*}}cuda_phases.cu", cuda, (device-cuda, sm_35)
+// DASM2: 6: preprocessor, {5}, cuda-cpp-output, (device-cuda, sm_35)
+// DASM2: 7: compiler, {6}, ir, (device-cuda, sm_35)
+// DASM2: 8: backend, {7}, assembler, (device-cuda, sm_35)
+// DASM2: 9: offload, "device-cuda (nvptx64-nvidia-cuda:sm_35)" {8}, assembler
diff --git a/test/Driver/darwin-embedded.c b/test/Driver/darwin-embedded.c
index 66b7bd9fa7fbd..beb8b195c42d1 100644
--- a/test/Driver/darwin-embedded.c
+++ b/test/Driver/darwin-embedded.c
@@ -1,6 +1,6 @@
 // RUN: %clang -target x86_64-apple-darwin -arch armv6m -resource-dir=%S/Inputs/resource_dir %s -### 2> %t
 // RUN: %clang -target x86_64-apple-darwin -arch armv7em -resource-dir=%S/Inputs/resource_dir %s -### 2>> %t
-// RUN: %clang -target x86_64-apple-darwin -arch armv7em -mhard-float -resource-dir=%S/Inputs/resource_dir %s -### 2>> %t
+// RUN: %clang -target x86_64-apple-darwin -arch armv7em -mfloat-abi=soft -resource-dir=%S/Inputs/resource_dir %s -### 2>> %t
 
 // RUN: %clang -target x86_64-apple-darwin -arch armv7m -fPIC -resource-dir=%S/Inputs/resource_dir %s -### 2>> %t
 // RUN: %clang -target x86_64-apple-darwin -arch armv7em -fPIC -mfloat-abi=hard -resource-dir=%S/Inputs/resource_dir %s -### 2>> %t
@@ -17,17 +17,18 @@
 // CHECK: "-mfloat-abi" "soft"
 // CHECK: libclang_rt.soft_static.a
 
-// ARMv7em does, but defaults to soft
+// ARMv7em does
 // CHECK-LABEL: Target:
 // CHECK-NOT: warning: unknown platform
-// CHECK: "-mfloat-abi" "soft"
-// CHECK: libclang_rt.soft_static.a
+// CHECK: "-mfloat-abi" "hard"
+// CHECK: libclang_rt.hard_static.a
 
-// Which can be overridden
+// but the ABI can be overridden
 // CHECK-LABEL: Target:
 // CHECK-NOT: warning: unknown platform
-// CHECK: "-mfloat-abi" "hard"
-// CHECK: libclang_rt.hard_static.a
+// CHECK: "-target-feature" "+soft-float"
+// CHECK: "-mfloat-abi" "soft"
+// CHECK: libclang_rt.soft_static.a
 
 // ARMv7m has no float either
 // CHECK-LABEL: Target:
diff --git a/test/Driver/darwin-iphone-defaults.m b/test/Driver/darwin-iphone-defaults.m
index 3e2a9125db5a8..63bbbe0a9c33e 100644
--- a/test/Driver/darwin-iphone-defaults.m
+++ b/test/Driver/darwin-iphone-defaults.m
@@ -1,4 +1,4 @@
-// RUN: %clang -target i386-apple-darwin9 -miphoneos-version-min=3.0 -arch armv7 -flto -S -o - %s | FileCheck %s
+// RUN: %clang -target i386-apple-darwin9 -miphoneos-version-min=3.0 -arch armv7 -stdlib=platform -flto -S -o - %s | FileCheck %s
 
 // CHECK: @f0() [[F0:#[0-9]+]]
 // CHECK: @__f0_block_invoke
diff --git a/test/Driver/darwin-ld.c b/test/Driver/darwin-ld.c
index ffef3b14785c6..a7681fad8b63e 100644
--- a/test/Driver/darwin-ld.c
+++ b/test/Driver/darwin-ld.c
@@ -152,6 +152,15 @@
 // RUN: FileCheck -check-prefix=LINK_NO_IOS_ARM64_CRT1 %s < %t.log
 // LINK_NO_IOS_ARM64_CRT1-NOT: crt
 
+// RUN: %clang -target x86_64-apple-ios6.0 -miphoneos-version-min=6.0 -fprofile-instr-generate -### %t.o 2> %t.log
+// RUN: FileCheck -check-prefix=LINK_IOSSIM_PROFILE %s < %t.log
+// LINK_IOSSIM_PROFILE: {{ld(.exe)?"}}
+// LINK_IOSSIM_PROFILE: libclang_rt.profile_iossim.a
+
+// FIXME: Currently the builtin library is only added to the command line if it,
+// so we can't check for it here
+// FIXME_LINK_IOSSIM_PROFILE: libclang_rt.ios.a
+
 // RUN: %clang -target arm64-apple-tvos8.3 -mtvos-version-min=8.3 -### %t.o 2> %t.log
 // RUN: FileCheck -check-prefix=LINK_TVOS_ARM64 %s < %t.log
 // LINK_TVOS_ARM64: {{ld(.exe)?"}}
@@ -294,3 +303,27 @@
 // RUN:   FileCheck --check-prefix=LINK-IFRAMEWORK %s
 // LINK-IFRAMEWORK: {{ld(.exe)?"}}
 // LINK-IFRAMEWORK: "-FBar"
+
+// Check ld64 accepts up to 5 digits with no extra characters
+// RUN: %clang -target x86_64-apple-darwin12 %s -### -o %t \
+// RUN:   -mlinker-version=133.3 2> %t.log
+// RUN: %clang -target x86_64-apple-darwin12 %s -### -o %t \
+// RUN:   -mlinker-version=133.3.0 2>> %t.log
+// RUN: %clang -target x86_64-apple-darwin12 %s -### -o %t \
+// RUN:   -mlinker-version=133.3.0.1 2>> %t.log
+// RUN: %clang -target x86_64-apple-darwin12 %s -### -o %t \
+// RUN:   -mlinker-version=133.3.0.1.2 2>> %t.log
+// RUN: %clang -target x86_64-apple-darwin12 %s -### -o %t \
+// RUN:   -mlinker-version=133.3.0.1.2.6 2>> %t.log
+// RUN: %clang -target x86_64-apple-darwin12 %s -### -o %t \
+// RUN:   -mlinker-version=133.3.0.1.a 2>> %t.log
+// RUN: %clang -target x86_64-apple-darwin12 %s -### -o %t \
+// RUN:   -mlinker-version=133.3.0.1a 2>> %t.log
+// RUN: FileCheck -check-prefix=LINK_VERSION_DIGITS %s < %t.log
+// LINK_VERSION_DIGITS-NOT: invalid version number in '-mlinker-version=133.3'
+// LINK_VERSION_DIGITS-NOT: invalid version number in '-mlinker-version=133.3.0'
+// LINK_VERSION_DIGITS-NOT: invalid version number in '-mlinker-version=133.3.0.1'
+// LINK_VERSION_DIGITS-NOT: invalid version number in '-mlinker-version=133.3.0.1.2'
+// LINK_VERSION_DIGITS: invalid version number in '-mlinker-version=133.3.0.1.2.6'
+// LINK_VERSION_DIGITS: invalid version number in '-mlinker-version=133.3.0.1.a'
+// LINK_VERSION_DIGITS: invalid version number in '-mlinker-version=133.3.0.1a'
diff --git a/test/Driver/darwin-multiarch-arm.c b/test/Driver/darwin-multiarch-arm.c
new file mode 100644
index 0000000000000..32d4c1f3826b7
--- /dev/null
+++ b/test/Driver/darwin-multiarch-arm.c
@@ -0,0 +1,14 @@
+// Check that we compile correctly with multiple ARM -arch options.
+//
+// RUN: %clang -target arm7-apple-darwin10 -### \
+// RUN:   -arch armv7 -arch armv7s %s 2>&1 | FileCheck %s
+
+// CHECK: "-cc1" "-triple" "thumbv7-apple-ios5.0.0"
+// CHECK-SAME: "-o" "[[CC_OUT1:[^"]*]]"
+// CHECK:ld{{(\.exe)?}}" {{.*}} "-o" "[[LD_OUT1:[^"]*]]" {{.*}} "[[CC_OUT1]]"
+// CHECK:"-cc1" "-triple" "thumbv7s-apple-ios5.0.0"
+// CHECK-SAME: "-o" "[[CC_OUT2:[^"]*]]"
+// CHECK:ld{{(\.exe)?}}" {{.*}} "-o" "[[LD_OUT2:[^"]*]]" {{.*}} "[[CC_OUT2]]"
+// CHECK:lipo"
+// CHECK-DAG: "[[LD_OUT1]]"
+// CHECK-DAG: "[[LD_OUT2]]"
diff --git a/test/Driver/darwin-objc-gc.m b/test/Driver/darwin-objc-gc.m
index 06e3aea9847b2..aac6dc16c3728 100644
--- a/test/Driver/darwin-objc-gc.m
+++ b/test/Driver/darwin-objc-gc.m
@@ -1,6 +1,6 @@
 // Check that we warn, but accept, -fobjc-gc for iPhone OS.
 
-// RUN: %clang -target i386-apple-darwin9 -miphoneos-version-min=3.0 -fobjc-gc -flto -S -o %t %s 2> %t.err
+// RUN: %clang -target i386-apple-darwin9 -miphoneos-version-min=3.0 -stdlib=platform -fobjc-gc -flto -S -o %t %s 2> %t.err
 // RUN: FileCheck --check-prefix=IPHONE_OBJC_GC_LL %s < %t 
 // RUN: FileCheck --check-prefix=IPHONE_OBJC_GC_STDERR %s < %t.err
 
diff --git a/test/Driver/darwin-sanitizer-ld.c b/test/Driver/darwin-sanitizer-ld.c
index fb318ebd4130f..53c7fce115e71 100644
--- a/test/Driver/darwin-sanitizer-ld.c
+++ b/test/Driver/darwin-sanitizer-ld.c
@@ -1,26 +1,17 @@
 // Test sanitizer link flags on Darwin.
 
 // RUN: %clang -no-canonical-prefixes -### -target x86_64-darwin \
-// RUN:   -fsanitize=address %s -o %t.o 2>&1 \
+// RUN:   -stdlib=platform -fsanitize=address %s -o %t.o 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-ASAN %s
 
 // CHECK-ASAN: "{{.*}}ld{{(.exe)?}}"
-// CHECK-ASAN: stdc++
+// CHECK-ASAN-NOT: "-lstdc++"
+// CHECK-ASAN-NOT: "-lc++"
 // CHECK-ASAN: libclang_rt.asan_osx_dynamic.dylib"
 // CHECK-ASAN: "-rpath" "@executable_path"
 // CHECK-ASAN: "-rpath" "{{.*}}lib{{.*}}darwin"
 
 // RUN: %clang -no-canonical-prefixes -### -target x86_64-darwin \
-// RUN:   -fsanitize=address -mios-simulator-version-min=7.0 %s -o %t.o 2>&1 \
-// RUN:   | FileCheck --check-prefix=CHECK-ASAN-IOSSIM %s
-
-// CHECK-ASAN-IOSSIM: "{{.*}}ld{{(.exe)?}}"
-// CHECK-ASAN-IOSSIM: lc++
-// CHECK-ASAN-IOSSIM: libclang_rt.asan_iossim_dynamic.dylib"
-// CHECK-ASAN-IOSSIM: "-rpath" "@executable_path"
-// CHECK-ASAN-IOSSIM: "-rpath" "{{.*}}lib{{.*}}darwin"
-
-// RUN: %clang -no-canonical-prefixes -### -target x86_64-darwin \
 // RUN:   -fPIC -shared -fsanitize=address %s -o %t.so 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-DYN-ASAN %s
 
@@ -31,11 +22,12 @@
 // CHECK-DYN-ASAN: "-rpath" "{{.*}}lib{{.*}}darwin"
 
 // RUN: %clang -no-canonical-prefixes -### -target x86_64-darwin \
-// RUN:   -fsanitize=undefined %s -o %t.o 2>&1 \
+// RUN:   -stdlib=platform -fsanitize=undefined %s -o %t.o 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-UBSAN %s
 
 // CHECK-UBSAN: "{{.*}}ld{{(.exe)?}}"
-// CHECK-UBSAN: stdc++
+// CHECK-UBSAN-NOT: "-lstdc++"
+// CHECK-UBSAN-NOT: "-lc++"
 // CHECK-UBSAN: libclang_rt.ubsan_osx_dynamic.dylib"
 // CHECK-UBSAN: "-rpath" "@executable_path"
 // CHECK-UBSAN: "-rpath" "{{.*}}lib{{.*}}darwin"
@@ -65,3 +57,71 @@
 
 // CHECK-DYN-BOUNDS: "{{.*}}ld{{(.exe)?}}"
 // CHECK-DYN-BOUNDS-NOT: ubsan_osx
+
+// RUN: %clang -no-canonical-prefixes -### -target x86_64-darwin \
+// RUN:   -stdlib=platform -fsanitize=address -mios-simulator-version-min=7.0 \
+// RUN:   %s -o %t.o 2>&1 | FileCheck --check-prefix=CHECK-ASAN-IOSSIM %s
+
+// CHECK-ASAN-IOSSIM: "{{.*}}ld{{(.exe)?}}"
+// CHECK-ASAN-IOSSIM-NOT: "-lstdc++"
+// CHECK-ASAN-IOSSIM-NOT: "-lc++"
+// CHECK-ASAN-IOSSIM: libclang_rt.asan_iossim_dynamic.dylib"
+// CHECK-ASAN-IOSSIM: "-rpath" "@executable_path"
+// CHECK-ASAN-IOSSIM: "-rpath" "{{.*}}lib{{.*}}darwin"
+
+// RUN: %clang -no-canonical-prefixes -### -target x86_64-darwin \
+// RUN:   -stdlib=platform -fsanitize=address \
+// RUN:   -mtvos-simulator-version-min=8.3.0 %s -o %t.o 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-ASAN-TVOSSIM %s
+
+// CHECK-ASAN-TVOSSIM: "{{.*}}ld{{(.exe)?}}"
+// CHECK-ASAN-TVOSSIM-NOT: "-lstdc++"
+// CHECK-ASAN-TVOSSIM-NOT: "-lc++"
+// CHECK-ASAN-TVOSSIM: libclang_rt.asan_tvossim_dynamic.dylib"
+// CHECK-ASAN-TVOSSIM: "-rpath" "@executable_path"
+// CHECK-ASAN-TVOSSIM: "-rpath" "{{.*}}lib{{.*}}darwin"
+
+// RUN: %clang -no-canonical-prefixes -### -target x86_64-darwin \
+// RUN:   -stdlib=platform -fsanitize=address \
+// RUN:   -mwatchos-simulator-version-min=2.0.0 %s -o %t.o 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-ASAN-WATCHOSSIM %s
+
+// CHECK-ASAN-WATCHOSSIM: "{{.*}}ld{{(.exe)?}}"
+// CHECK-ASAN-WATCHOSSIM-NOT: "-lstdc++"
+// CHECK-ASAN-WATCHOSSIM-NOT: "-lc++"
+// CHECK-ASAN-WATCHOSSIM: libclang_rt.asan_watchossim_dynamic.dylib"
+// CHECK-ASAN-WATCHOSSIM: "-rpath" "@executable_path"
+// CHECK-ASAN-WATCHOSSIM: "-rpath" "{{.*}}lib{{.*}}darwin"
+
+// RUN: %clang -no-canonical-prefixes -### -target armv7-apple-ios  \
+// RUN:   -stdlib=platform -fsanitize=address -miphoneos-version-min=7 \
+// RUN:   %s -o %t.o 2>&1 | FileCheck --check-prefix=CHECK-ASAN-IOS %s
+
+// CHECK-ASAN-IOS: "{{.*}}ld{{(.exe)?}}"
+// CHECK-ASAN-IOS-NOT: "-lstdc++"
+// CHECK-ASAN-IOS-NOT: "-lc++"
+// CHECK-ASAN-IOS: libclang_rt.asan_ios_dynamic.dylib"
+// CHECK-ASAN-IOS: "-rpath" "@executable_path"
+// CHECK-ASAN-IOS: "-rpath" "{{.*}}lib{{.*}}darwin"
+
+// RUN: %clang -no-canonical-prefixes -### -target arm64-apple-tvos \
+// RUN:   -stdlib=platform -fsanitize=address -mtvos-version-min=8.3 \
+// RUN:   %s -o %t.o 2>&1 | FileCheck --check-prefix=CHECK-ASAN-TVOS %s
+
+// CHECK-ASAN-TVOS: "{{.*}}ld{{(.exe)?}}"
+// CHECK-ASAN-TVOS-NOT: "-lstdc++"
+// CHECK-ASAN-TVOS-NOT: "-lc++"
+// CHECK-ASAN-TVOS: libclang_rt.asan_tvos_dynamic.dylib"
+// CHECK-ASAN-TVOS: "-rpath" "@executable_path"
+// CHECK-ASAN-TVOS: "-rpath" "{{.*}}lib{{.*}}darwin"
+
+// RUN: %clang -no-canonical-prefixes -### -target armv7k-apple-watchos \
+// RUN:   -stdlib=platform -fsanitize=address -mwatchos-version-min=2.0 \
+// RUN:   %s -o %t.o 2>&1 | FileCheck --check-prefix=CHECK-ASAN-WATCHOS %s
+
+// CHECK-ASAN-WATCHOS: "{{.*}}ld{{(.exe)?}}"
+// CHECK-ASAN-WATCHOS-NOT: "-lstdc++"
+// CHECK-ASAN-WATCHOS-NOT: "-lc++"
+// CHECK-ASAN-WATCHOS: libclang_rt.asan_watchos_dynamic.dylib"
+// CHECK-ASAN-WATCHOS: "-rpath" "@executable_path"
+// CHECK-ASAN-WATCHOS: "-rpath" "{{.*}}lib{{.*}}darwin"
diff --git a/test/Driver/darwin-stdlib.cpp b/test/Driver/darwin-stdlib.cpp
new file mode 100644
index 0000000000000..c9be6075bb074
--- /dev/null
+++ b/test/Driver/darwin-stdlib.cpp
@@ -0,0 +1,16 @@
+// RUN: %clang -target x86_64-apple-darwin -arch arm64 -miphoneos-version-min=7.0 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LIBCXX
+// RUN: %clang -target x86_64-apple-darwin -mmacosx-version-min=10.8 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LIBSTDCXX
+// RUN: %clang -target x86_64-apple-darwin -mmacosx-version-min=10.9 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LIBCXX
+// RUN: %clang -target x86_64-apple-darwin -arch armv7s -miphoneos-version-min=6.1 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LIBSTDCXX
+// RUN: %clang -target x86_64-apple-darwin -arch armv7s -miphoneos-version-min=7.0 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LIBCXX
+// RUN: %clang -target x86_64-apple-darwin -arch armv7k %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-LIBCXX
+
+// The purpose of this test is that the libc++ headers should be found
+// properly. At the moment this is done by passing -stdlib=libc++ down to the
+// cc1 invocation. If and when we change to finding them in the driver this test
+// should reflect that.
+
+// CHECK-LIBCXX: -stdlib=libc++
+
+// CHECK-LIBSTDCXX-NOT: -stdlib=libc++
+// CHECK-LIBSTDCXX-NOT: -stdlib=libstdc++
diff --git a/test/Driver/dyld-prefix.c b/test/Driver/dyld-prefix.c
index 2c2bc4ff88ea8..5a79874b56745 100644
--- a/test/Driver/dyld-prefix.c
+++ b/test/Driver/dyld-prefix.c
@@ -1,10 +1,10 @@
 // RUN: touch %t.o
 
 // RUN: %clang -target i386-unknown-linux --dyld-prefix /foo -### %t.o 2>&1 | FileCheck --check-prefix=CHECK-32 %s
-// CHECK-32: "-dynamic-linker" "/foo/lib/ld-linux.so.2"
+// CHECK-32: "-dynamic-linker" "/foo{{(/usr/i386-unknown-linux)?}}/lib/ld-linux.so.2"
 
 // RUN: %clang -target x86_64-unknown-linux --dyld-prefix /foo -### %t.o 2>&1 | FileCheck --check-prefix=CHECK-64 %s
-// CHECK-64: "-dynamic-linker" "/foo/lib64/ld-linux-x86-64.so.2"
+// CHECK-64: "-dynamic-linker" "/foo{{(/usr/x86_64-unknown-linux)?}}/lib{{(64)?}}/ld-linux-x86-64.so.2"
 
 // RUN: %clang -target x86_64-unknown-linux-gnux32 --dyld-prefix /foo -### %t.o 2>&1 | FileCheck --check-prefix=CHECK-X32 %s
-// CHECK-X32: "-dynamic-linker" "/foo/libx32/ld-linux-x32.so.2"
+// CHECK-X32: "-dynamic-linker" "/foo{{(/x86_64-unknown-linux-gnux32)?}}/lib{{(x32)?}}/ld-linux-x32.so.2"
diff --git a/test/Driver/dynamic-linker.c b/test/Driver/dynamic-linker.c
new file mode 100644
index 0000000000000..c7579f4af15bd
--- /dev/null
+++ b/test/Driver/dynamic-linker.c
@@ -0,0 +1,32 @@
+// RUN: %clang -target armv7-unknown-linux-gnueabi -### /dev/null -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-DYNAMIC-LINKER %s
+// RUN: %clang -target i386-unknown-linux-gnu -### /dev/null -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-DYNAMIC-LINKER %s
+// RUN: %clang -target mips64-unknown-linux-gnu -### /dev/null -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-DYNAMIC-LINKER %s
+// RUN: %clang -target powerpc64-unknown-linux-gnu -### /dev/null -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-DYNAMIC-LINKER %s
+// RUN: %clang -target x86_64-unknown-linux-gnu -### /dev/null -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-DYNAMIC-LINKER %s
+
+// RUN: %clang -target armv7-unknown-linux-gnueabi -### -shared /dev/null -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-SHARED %s
+// RUN: %clang -target i386-unknown-linux-gnu -### -shared /dev/null -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-SHARED %s
+// RUN: %clang -target mips64-unknown-linux-gnu -### -shared /dev/null -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-SHARED %s
+// RUN: %clang -target powerpc64-unknown-linux-gnu -### -shared /dev/null -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-SHARED %s
+// RUN: %clang -target x86_64-unknown-linux-gnu -### -shared /dev/null -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-SHARED %s
+
+
+// RUN: %clang -target armv7-unknown-linux-gnueabi -### -shared -rdynamic /dev/null -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-SHARED -check-prefix CHECK-RDYNAMIC %s
+// RUN: %clang -target i386-unknown-linux-gnu -### -shared -rdynamic /dev/null -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-SHARED -check-prefix CHECK-RDYNAMIC %s
+// RUN: %clang -target mips64-unknown-linux-gnu -### -shared -rdynamic /dev/null -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-SHARED -check-prefix CHECK-RDYNAMIC %s
+// RUN: %clang -target powerpc64-unknown-linux-gnu -### -shared -rdynamic /dev/null -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-SHARED -check-prefix CHECK-RDYNAMIC %s
+// RUN: %clang -target x86_64-unknown-linux-gnu -### -shared -rdynamic /dev/null -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-SHARED -check-prefix CHECK-RDYNAMIC %s
+
+// RUN: %clang -target armv7-unknown-linux-gnueabi -### -static /dev/null -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-STATIC %s
+// RUN: %clang -target i386-unknown-linux-gnu -### -static /dev/null -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-STATIC %s
+// RUN: %clang -target mips64-unknown-linux-gnu -### -static /dev/null -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-STATIC %s
+// RUN: %clang -target powerpc64-unknown-linux-gnu -### -static /dev/null -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-STATIC %s
+// RUN: %clang -target x86_64-unknown-linux-gnu -### -static /dev/null -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-STATIC %s
+
+// CHECK-RDYNAMIC: "-export-dynamic"
+// CHECK-SHARED: "-shared"
+// CHECK-STATIC: "-{{B?}}static"
+// CHECK-DYNAMIC-LINKER: "-dynamic-linker"
+// CHECK-SHARED-NOT: "-dynamic-linker"
+// CHECK-STATIC-NOT: "-dynamic-linker"
+
diff --git a/test/Driver/embed-bitcode.c b/test/Driver/embed-bitcode.c
new file mode 100644
index 0000000000000..da60da3fff7b5
--- /dev/null
+++ b/test/Driver/embed-bitcode.c
@@ -0,0 +1,43 @@
+// RUN: %clang -ccc-print-bindings -c %s -fembed-bitcode 2>&1 | FileCheck %s
+// CHECK: clang
+// CHECK: clang
+
+// RUN: %clang %s -c -fembed-bitcode -fintegrated-as 2>&1 -### | FileCheck %s -check-prefix=CHECK-CC
+// CHECK-CC: -cc1
+// CHECK-CC: -emit-llvm-bc
+// CHECK-CC: -cc1
+// CHECK-CC: -emit-obj
+// CHECK-CC: -fembed-bitcode=all
+
+// RUN: %clang %s -c -fembed-bitcode=bitcode -fintegrated-as 2>&1 -### | FileCheck %s -check-prefix=CHECK-BITCODE
+// CHECK-BITCODE: -cc1
+// CHECK-BITCODE: -emit-llvm-bc
+// CHECK-BITCODE: -cc1
+// CHECK-BITCODE: -emit-obj
+// CHECK-BITCODE: -fembed-bitcode=bitcode
+//
+// RUN: %clang %s -c -save-temps -fembed-bitcode -fintegrated-as 2>&1 -### | FileCheck %s -check-prefix=CHECK-SAVE-TEMP
+// CHECK-SAVE-TEMP: -cc1
+// CHECK-SAVE-TEMP: -E
+// CHECK-SAVE-TEMP: -cc1
+// CHECK-SAVE-TEMP: -emit-llvm-bc
+// CHECK-SAVE-TEMP: -cc1
+// CHECK-SAVE-TEMP: -S
+// CHECK-SAVE-TEMP: -fembed-bitcode=all
+// CHECK-SAVE-TEMP: -cc1as
+
+// RUN: %clang -c %s -flto -fembed-bitcode 2>&1 -### | FileCheck %s -check-prefix=CHECK-LTO
+// RUN: %clang -c %s -flto=full -fembed-bitcode 2>&1 -### | FileCheck %s -check-prefix=CHECK-LTO
+// RUN: %clang -c %s -flto=thin -fembed-bitcode 2>&1 -### | FileCheck %s -check-prefix=CHECK-LTO
+// CHECK-LTO: -cc1
+// CHECK-LTO: -emit-llvm-bc
+// CHECK-LTO-NOT: warning: argument unused during compilation: '-fembed-bitcode'
+// CHECK-LTO-NOT: -cc1
+// CHECK-LTO-NOT: -fembed-bitcode=all
+
+// RUN: %clang -c %s -fembed-bitcode-marker -fintegrated-as 2>&1 -### | FileCheck %s -check-prefix=CHECK-MARKER
+// CHECK-MARKER: -cc1
+// CHECK-MARKER: -emit-obj
+// CHECK-MARKER: -fembed-bitcode=marker
+// CHECK-MARKER-NOT: -cc1
+
diff --git a/test/Driver/emulated-tls.cpp b/test/Driver/emulated-tls.cpp
new file mode 100644
index 0000000000000..a18c2e220bf87
--- /dev/null
+++ b/test/Driver/emulated-tls.cpp
@@ -0,0 +1,5 @@
+// Cygwin uses emutls. Clang should pass -femulated-tls to cc1 and cc1 should pass EmulatedTLS to LLVM CodeGen.
+// FIXME: Add more targets here to use emutls.
+// RUN: %clang -### -std=c++11 -target i686-pc-cygwin %s 2>&1 | FileCheck %s
+
+// CHECK: "-cc1" {{.*}}"-femulated-tls"
diff --git a/test/Driver/esan.c b/test/Driver/esan.c
new file mode 100644
index 0000000000000..795104114065a
--- /dev/null
+++ b/test/Driver/esan.c
@@ -0,0 +1,12 @@
+// RUN: %clang     -target x86_64-unknown-linux -fsanitize=efficiency-cache-frag %s -S -emit-llvm -o - | FileCheck %s
+// RUN: %clang -O1 -target x86_64-unknown-linux -fsanitize=efficiency-cache-frag %s -S -emit-llvm -o - | FileCheck %s
+// RUN: %clang -O2 -target x86_64-unknown-linux -fsanitize=efficiency-cache-frag %s -S -emit-llvm -o - | FileCheck %s
+// RUN: %clang -O3 -target x86_64-unknown-linux -fsanitize=efficiency-cache-frag %s -S -emit-llvm -o - | FileCheck %s
+// RUN: %clang     -target x86_64-unknown-linux -fsanitize=efficiency-working-set %s -S -emit-llvm -o - | FileCheck %s
+// RUN: %clang -O1 -target x86_64-unknown-linux -fsanitize=efficiency-working-set %s -S -emit-llvm -o - | FileCheck %s
+// RUN: %clang -O2 -target x86_64-unknown-linux -fsanitize=efficiency-working-set %s -S -emit-llvm -o - | FileCheck %s
+// RUN: %clang -O3 -target x86_64-unknown-linux -fsanitize=efficiency-working-set %s -S -emit-llvm -o - | FileCheck %s
+// Verify that -fsanitize=efficiency-* invokes esan instrumentation.
+
+int foo(int *a) { return *a; }
+// CHECK: __esan_init
diff --git a/test/Driver/frame-pointer.c b/test/Driver/frame-pointer.c
index 1d63f2c42920a..cec168636c134 100644
--- a/test/Driver/frame-pointer.c
+++ b/test/Driver/frame-pointer.c
@@ -10,6 +10,7 @@
 // RUN: %clang -target x86_64-pc-linux -### -S -O2 %s -o %t.s 2>&1 | FileCheck -check-prefix=CHECK2-64 %s
 // RUN: %clang -target x86_64-pc-linux -### -S -O3 %s -o %t.s 2>&1 | FileCheck -check-prefix=CHECK3-64 %s
 // RUN: %clang -target x86_64-pc-linux -### -S -Os %s -o %t.s 2>&1 | FileCheck -check-prefix=CHECKs-64 %s
+// RUN: %clang -target x86_64-pc-win32-macho -### -S -O3 %s -o %t.s 2>&1 | FileCheck -check-prefix=CHECK-MACHO-64 %s
 
 // Trust the above to get the optimizations right, and just test other targets
 // that want this by default.
@@ -36,3 +37,4 @@
 // CHECK2-64-NOT: -mdisable-fp-elim
 // CHECK3-64-NOT: -mdisable-fp-elim
 // CHECKs-64-NOT: -mdisable-fp-elim
+// CHECK-MACHO-64: -mdisable-fp-elim
diff --git a/test/Driver/freebsd-mips-as.c b/test/Driver/freebsd-mips-as.c
index 7555888e066ec..af02c38693da2 100644
--- a/test/Driver/freebsd-mips-as.c
+++ b/test/Driver/freebsd-mips-as.c
@@ -45,11 +45,6 @@
 // RUN:   | FileCheck -check-prefix=MIPS64-DEF-EL-AS %s
 // MIPS64-DEF-EL-AS: as{{(.exe)?}}" "-march" "mips64r2" "-mabi" "64" "-EL"
 //
-// RUN: %clang -target mips-unknown-freebsd -mabi=eabi -### \
-// RUN:   -no-integrated-as -c %s 2>&1 \
-// RUN:   | FileCheck -check-prefix=MIPS-EABI %s
-// MIPS-EABI: as{{(.exe)?}}" "-march" "mips32r2" "-mabi" "eabi" "-EB"
-//
 // RUN: %clang -target mips64-unknown-freebsd -mabi=n32 -### \
 // RUN:   -no-integrated-as -c %s 2>&1 \
 // RUN:   | FileCheck -check-prefix=MIPS-N32 %s
diff --git a/test/Driver/freebsd.c b/test/Driver/freebsd.c
index 45e92043619bc..f008b76b93ae4 100644
--- a/test/Driver/freebsd.c
+++ b/test/Driver/freebsd.c
@@ -82,6 +82,7 @@
 // RUN: %clang -no-canonical-prefixes -target x86_64-pc-freebsd8 -static %s \
 // RUN:   --sysroot=%S/Inputs/multiarch_freebsd64_tree -### 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-STATIC %s
+// CHECK-STATIC: ld{{.*}}" "--eh-frame-hdr" "-Bstatic"
 // CHECK-STATIC: crt1.o
 // CHECK-STATIC: crtbeginT.o
 
diff --git a/test/Driver/freebsd.cpp b/test/Driver/freebsd.cpp
index 175b873bf4025..baf52f77dd07f 100644
--- a/test/Driver/freebsd.cpp
+++ b/test/Driver/freebsd.cpp
@@ -1,13 +1,13 @@
-// RUN: %clangxx %s -### -o %t.o -target amd64-unknown-freebsd10.0 2>&1 \
+// RUN: %clangxx %s -### -o %t.o -target amd64-unknown-freebsd10.0 -stdlib=platform 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-TEN %s
-// RUN: %clangxx %s -### -o %t.o -target amd64-unknown-freebsd9.2 2>&1 \
+// RUN: %clangxx %s -### -o %t.o -target amd64-unknown-freebsd9.2 -stdlib=platform 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-NINE %s
 // CHECK-TEN: "-lc++" "-lm"
 // CHECK-NINE: "-lstdc++" "-lm"
 
-// RUN: %clangxx %s -### -pg -o %t.o -target amd64-unknown-freebsd10.0 2>&1 \
+// RUN: %clangxx %s -### -pg -o %t.o -target amd64-unknown-freebsd10.0 -stdlib=platform 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-PG-TEN %s
-// RUN: %clangxx %s -### -pg -o %t.o -target amd64-unknown-freebsd9.2 2>&1 \
+// RUN: %clangxx %s -### -pg -o %t.o -target amd64-unknown-freebsd9.2 -stdlib=platform 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-PG-NINE %s
 // CHECK-PG-TEN: "-lc++_p" "-lm_p"
 // CHECK-PG-NINE: "-lstdc++_p" "-lm_p"
diff --git a/test/Driver/fsanitize-coverage.c b/test/Driver/fsanitize-coverage.c
index fdaa9faf8902e..16c5dfe09937d 100644
--- a/test/Driver/fsanitize-coverage.c
+++ b/test/Driver/fsanitize-coverage.c
@@ -2,41 +2,51 @@
 // RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=edge -fsanitize-coverage=0 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-0
 // RUN: %clang -target x86_64-linux-gnu -fsanitize=address %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-0
 // CHECK-SANITIZE-COVERAGE-0-NOT: fsanitize-coverage-type
+// CHECK-SANITIZE-COVERAGE-0: -fsanitize=address
 
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=1 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-1
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=memory -fsanitize-coverage=1 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-1
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=leak -fsanitize-coverage=1 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-1
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=undefined -fsanitize-coverage=1 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-1
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=bool -fsanitize-coverage=1 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-1
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=dataflow -fsanitize-coverage=1 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-1
-// CHECK-SANITIZE-COVERAGE-1: fsanitize-coverage-type=1
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=func %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=memory -fsanitize-coverage=func %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=leak -fsanitize-coverage=func %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=undefined -fsanitize-coverage=func %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=bool -fsanitize-coverage=func %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=dataflow -fsanitize-coverage=func %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
+// CHECK-SANITIZE-COVERAGE-FUNC: fsanitize-coverage-type=1
 
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=2 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-2
-// CHECK-SANITIZE-COVERAGE-2: fsanitize-coverage-type=2
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=bb %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-BB
+// CHECK-SANITIZE-COVERAGE-BB: fsanitize-coverage-type=2
 
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=3 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-3
-// CHECK-SANITIZE-COVERAGE-3: fsanitize-coverage-type=3
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=edge %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-EDGE
+// CHECK-SANITIZE-COVERAGE-EDGE: fsanitize-coverage-type=3
 
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=4 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-4
-// CHECK-SANITIZE-COVERAGE-4: fsanitize-coverage-type=3
-// CHECK-SANITIZE-COVERAGE-4: fsanitize-coverage-indirect-calls
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=edge,indirect-calls %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC_INDIR
+// CHECK-SANITIZE-COVERAGE-FUNC_INDIR: fsanitize-coverage-type=3
+// CHECK-SANITIZE-COVERAGE-FUNC_INDIR: fsanitize-coverage-indirect-calls
 
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=1 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-1
+// CHECK-SANITIZE-COVERAGE-1: warning: argument '-fsanitize-coverage=1' is deprecated, use '-fsanitize-coverage=func' instead
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=2 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-2
+// CHECK-SANITIZE-COVERAGE-2: warning: argument '-fsanitize-coverage=2' is deprecated, use '-fsanitize-coverage=bb' instead
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=3 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-3
+// CHECK-SANITIZE-COVERAGE-3: warning: argument '-fsanitize-coverage=3' is deprecated, use '-fsanitize-coverage=edge' instead
+//
 // RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=5 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-5
 // CHECK-SANITIZE-COVERAGE-5: error: unsupported argument '5' to option 'fsanitize-coverage='
 
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=thread   -fsanitize-coverage=1 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-UNUSED
-// RUN: %clang -target x86_64-linux-gnu                     -fsanitize-coverage=1 %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-UNUSED
-// CHECK-SANITIZE-COVERAGE-UNUSED: argument unused during compilation: '-fsanitize-coverage=1'
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=thread   -fsanitize-coverage=func %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-UNUSED
+// RUN: %clang -target x86_64-linux-gnu                     -fsanitize-coverage=func %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FUNC
+// CHECK-SANITIZE-COVERAGE-UNUSED: argument unused during compilation: '-fsanitize-coverage=func'
+// CHECK-SANITIZE-COVERAGE-UNUSED-NOT: -fsanitize-coverage-type=1
 
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=1 -fno-sanitize=address %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-SAN-DISABLED
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=func -fno-sanitize=address %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-SAN-DISABLED
 // CHECK-SANITIZE-COVERAGE-SAN-DISABLED-NOT: argument unused
 
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=edge,indirect-calls,trace-bb,trace-cmp,8bit-counters %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FEATURES
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=edge,indirect-calls,trace-bb,trace-pc,trace-cmp,8bit-counters %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANITIZE-COVERAGE-FEATURES
 // CHECK-SANITIZE-COVERAGE-FEATURES: -fsanitize-coverage-type=3
 // CHECK-SANITIZE-COVERAGE-FEATURES: -fsanitize-coverage-indirect-calls
 // CHECK-SANITIZE-COVERAGE-FEATURES: -fsanitize-coverage-trace-bb
 // CHECK-SANITIZE-COVERAGE-FEATURES: -fsanitize-coverage-trace-cmp
 // CHECK-SANITIZE-COVERAGE-FEATURES: -fsanitize-coverage-8bit-counters
+// CHECK-SANITIZE-COVERAGE-FEATURES: -fsanitize-coverage-trace-pc
 
 // RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=func,edge,indirect-calls,trace-bb,trace-cmp -fno-sanitize-coverage=edge,indirect-calls,trace-bb %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-MASK
 // CHECK-MASK: -fsanitize-coverage-type=1
@@ -52,19 +62,27 @@
 // RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=8bit-counters %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-MISSING-TYPE
 // CHECK-MISSING-TYPE: error: invalid argument '-fsanitize-coverage=8bit-counters' only allowed with '-fsanitize-coverage=(func|bb|edge)'
 
+// RUN: %clang -target x86_64-linux-gnu -fsanitize-coverage=trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-TRACE_PC_EDGE
+// RUN: %clang -target x86_64-linux-gnu -fsanitize-coverage=edge,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-TRACE_PC_EDGE
+// CHECK-TRACE_PC_EDGE: -fsanitize-coverage-type=3
+// CHECK-TRACE_PC_EDGE: -fsanitize-coverage-trace-pc
+// RUN: %clang -target x86_64-linux-gnu -fsanitize-coverage=func,trace-pc %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-TRACE_PC_FUNC
+// CHECK-TRACE_PC_FUNC: -fsanitize-coverage-type=1
+// CHECK-TRACE_PC_FUNC: -fsanitize-coverage-trace-pc
+
 // RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=trace-cmp,indirect-calls %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NO-TYPE-NECESSARY
 // CHECK-NO-TYPE-NECESSARY-NOT: error:
 // CHECK-NO-TYPE-NECESSARY: -fsanitize-coverage-indirect-calls
 // CHECK-NO-TYPE-NECESSARY: -fsanitize-coverage-trace-cmp
 
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=1 -fsanitize-coverage=trace-cmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-EXTEND-LEGACY
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-coverage=func -fsanitize-coverage=trace-cmp %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-EXTEND-LEGACY
 // CHECK-EXTEND-LEGACY: -fsanitize-coverage-type=1
 // CHECK-EXTEND-LEGACY: -fsanitize-coverage-trace-cmp
 
-// RUN: %clang_cl --target=i386-pc-win32 -fsanitize=address -fsanitize-coverage=1 -c -### -- %s 2>&1 | FileCheck %s -check-prefix=CLANG-CL-COVERAGE
+// RUN: %clang_cl --target=i386-pc-win32 -fsanitize=address -fsanitize-coverage=func -c -### -- %s 2>&1 | FileCheck %s -check-prefix=CLANG-CL-COVERAGE
 // CLANG-CL-COVERAGE-NOT: error:
 // CLANG-CL-COVERAGE-NOT: warning:
 // CLANG-CL-COVERAGE-NOT: argument unused
 // CLANG-CL-COVERAGE-NOT: unknown argument
-// CLANG-CL-COVERAGE: -fsanitize=address
 // CLANG-CL-COVERAGE: -fsanitize-coverage-type=1
+// CLANG-CL-COVERAGE: -fsanitize=address
diff --git a/test/Driver/fsanitize.c b/test/Driver/fsanitize.c
index 3d7713dfd7b3a..b0cef81bc2544 100644
--- a/test/Driver/fsanitize.c
+++ b/test/Driver/fsanitize.c
@@ -20,10 +20,15 @@
 // RUN: %clang -target i386-pc-win32 -fsanitize=undefined -x c++ %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-UNDEFINED-WIN --check-prefix=CHECK-UNDEFINED-WIN32 --check-prefix=CHECK-UNDEFINED-WIN-CXX
 // RUN: %clang -target x86_64-pc-win32 -fsanitize=undefined %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-UNDEFINED-WIN --check-prefix=CHECK-UNDEFINED-WIN64
 // RUN: %clang -target x86_64-pc-win32 -fsanitize=undefined -x c++ %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-UNDEFINED-WIN --check-prefix=CHECK-UNDEFINED-WIN64 --check-prefix=CHECK-UNDEFINED-WIN-CXX
-// CHECK-UNDEFINED-WIN: "-fsanitize={{((signed-integer-overflow|integer-divide-by-zero|float-divide-by-zero|shift-base|shift-exponent|unreachable|return|vla-bound|alignment|null|object-size|float-cast-overflow|array-bounds|enum|bool|returns-nonnull-attribute|nonnull-attribute),?){17}"}}
-// CHECK-UNDEFINED-WIN32-SAME: "--dependent-lib={{[^"]*}}ubsan_standalone-i386.lib"
-// CHECK-UNDEFINED-WIN64-SAME: "--dependent-lib={{[^"]*}}ubsan_standalone-x86_64.lib"
-// CHECK-UNDEFINED-WIN-CXX-SAME: "--dependent-lib={{[^"]*}}ubsan_standalone_cxx{{[^"]*}}.lib"
+// CHECK-UNDEFINED-WIN32: "--dependent-lib={{[^"]*}}ubsan_standalone-i386.lib"
+// CHECK-UNDEFINED-WIN64: "--dependent-lib={{[^"]*}}ubsan_standalone-x86_64.lib"
+// CHECK-UNDEFINED-WIN-CXX: "--dependent-lib={{[^"]*}}ubsan_standalone_cxx{{[^"]*}}.lib"
+// CHECK-UNDEFINED-WIN-SAME: "-fsanitize={{((signed-integer-overflow|integer-divide-by-zero|float-divide-by-zero|shift-base|shift-exponent|unreachable|return|vla-bound|alignment|null|object-size|float-cast-overflow|array-bounds|enum|bool|returns-nonnull-attribute|nonnull-attribute),?){17}"}}
+
+// RUN: %clang -target i386-pc-win32 -fsanitize-coverage=bb %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-COVERAGE-WIN32
+// CHECK-COVERAGE-WIN32: "--dependent-lib={{[^"]*}}ubsan_standalone-i386.lib"
+// RUN: %clang -target x86_64-pc-win32 -fsanitize-coverage=bb %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-COVERAGE-WIN64
+// CHECK-COVERAGE-WIN64: "--dependent-lib={{[^"]*}}ubsan_standalone-x86_64.lib"
 
 // RUN: %clang -target x86_64-linux-gnu -fsanitize=integer %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-INTEGER
 // CHECK-INTEGER: "-fsanitize={{((signed-integer-overflow|unsigned-integer-overflow|integer-divide-by-zero|shift-base|shift-exponent),?){5}"}}
@@ -83,6 +88,35 @@
 // RUN: %clang -target x86_64-linux-gnu -fsanitize=kernel-address,leak -pie -fno-rtti %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANKA-SANL
 // CHECK-SANKA-SANL: '-fsanitize=kernel-address' not allowed with '-fsanitize=leak'
 
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=efficiency-cache-frag,address -pie -fno-rtti %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANE-SANA
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=efficiency-working-set,address -pie -fno-rtti %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANE-SANA
+// CHECK-SANE-SANA: '-fsanitize=efficiency-{{.*}}' not allowed with '-fsanitize=address'
+
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=efficiency-cache-frag,leak -pie -fno-rtti %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANE-SANL
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=efficiency-working-set,leak -pie -fno-rtti %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANE-SANL
+// CHECK-SANE-SANL: '-fsanitize=efficiency-{{.*}}' not allowed with '-fsanitize=leak'
+
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=efficiency-cache-frag,thread -pie -fno-rtti %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANE-SANT
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=efficiency-working-set,thread -pie -fno-rtti %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANE-SANT
+// CHECK-SANE-SANT: '-fsanitize=efficiency-{{.*}}' not allowed with '-fsanitize=thread'
+
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=efficiency-cache-frag,memory -pie -fno-rtti %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANE-SANM
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=efficiency-working-set,memory -pie -fno-rtti %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANE-SANM
+// CHECK-SANE-SANM: '-fsanitize=efficiency-{{.*}}' not allowed with '-fsanitize=memory'
+
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=efficiency-cache-frag,kernel-address -pie -fno-rtti %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANE-SANKA
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=efficiency-working-set,kernel-address -pie -fno-rtti %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANE-SANKA
+// CHECK-SANE-SANKA: '-fsanitize=efficiency-{{.*}}' not allowed with '-fsanitize=kernel-address'
+
+// RUN: %clang -target x86_64-linux-gnu -fsanitize-address-use-after-scope %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-ONLY-USE-AFTER-SCOPE
+// CHECK-ONLY-USE-AFTER-SCOPE: '-fsanitize-address-use-after-scope' only allowed with '-fsanitize=address'
+
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=address -fsanitize-address-use-after-scope %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-USE-AFTER-SCOPE
+// CHECK-USE-AFTER-SCOPE: -cc1{{.*}}-fsanitize-address-use-after-scope
+
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=address %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-ASAN-NO-USE-AFTER-SCOPE
+// CHECK-ASAN-NO-USE-AFTER-SCOPE-NOT: -cc1{{.*}}-fsanitize-address-use-after-scope
+
 // RUN: %clang -target x86_64-linux-gnu -fsanitize-memory-track-origins -pie %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-ONLY-TRACK-ORIGINS
 // CHECK-ONLY-TRACK-ORIGINS: warning: argument unused during compilation: '-fsanitize-memory-track-origins'
 
@@ -154,7 +188,7 @@
 // CHECK-NO-PIE: "-mrelocation-model" "static"
 // CHECK-NO-PIE-NOT: "-pie"
 
-// CHECK-PIE: "-mrelocation-model" "pic" "-pic-level" "2" "-pie-level" "2"
+// CHECK-PIE: "-mrelocation-model" "pic" "-pic-level" "2" "-pic-is-pie"
 // CHECK-PIE: "-pie"
 
 // RUN: %clang -target arm-linux-androideabi %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-ANDROID-NO-ASAN
@@ -181,8 +215,8 @@
 // CHECK-DIAG-RECOVER: unsupported argument 'unreachable' to option 'fsanitize-recover='
 
 // RUN: %clang -target x86_64-linux-gnu %s -fsanitize=undefined -fsanitize-recover -fno-sanitize-recover -### 2>&1 | FileCheck %s --check-prefix=CHECK-DEPRECATED-RECOVER
-// CHECK-DEPRECATED-RECOVER: argument '-fsanitize-recover' is deprecated, use '-fsanitize-recover=undefined,integer' instead
-// CHECK-DEPRECATED-RECOVER: argument '-fno-sanitize-recover' is deprecated, use '-fno-sanitize-recover=undefined,integer' instead
+// CHECK-DEPRECATED-RECOVER: argument '-fsanitize-recover' is deprecated, use '-fsanitize-recover=undefined,integer' or '-fsanitize-recover=all' instead
+// CHECK-DEPRECATED-RECOVER: argument '-fno-sanitize-recover' is deprecated, use '-fno-sanitize-recover=undefined,integer' or '-fno-sanitize-recover=all' instead
 // CHECK-DEPRECATED-RECOVER-NOT: is deprecated
 
 // RUN: %clang -target x86_64-linux-gnu -fsanitize=leak %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-SANL
@@ -217,6 +251,27 @@
 // CHECK-TSAN-MSAN-MSAN-DARWIN: unsupported option '-fsanitize=memory' for target 'x86_64-apple-darwin10'
 // CHECK-TSAN-MSAN-MSAN-DARWIN-NOT: unsupported option
 
+// RUN: %clang -target x86_64-apple-darwin -fsanitize=thread %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-TSAN-X86-64-DARWIN
+// CHECK-TSAN-X86-64-DARWIN-NOT: unsupported option
+
+// RUN: %clang -target x86_64-apple-iossimulator -fsanitize=thread %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-TSAN-X86-64-IOSSIMULATOR
+// CHECK-TSAN-X86-64-IOSSIMULATOR-NOT: unsupported option
+
+// RUN: %clang -target x86_64-apple-tvossimulator -fsanitize=thread %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-TSAN-X86-64-TVOSSIMULATOR
+// CHECK-TSAN-X86-64-TVOSSIMULATOR-NOT: unsupported option
+
+// RUN: %clang -target i386-apple-darwin -fsanitize=thread %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-TSAN-I386-DARWIN
+// CHECK-TSAN-I386-DARWIN: unsupported option '-fsanitize=thread' for target 'i386-apple-darwin'
+
+// RUN: %clang -target arm-apple-ios -fsanitize=thread %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-TSAN-ARM-IOS
+// CHECK-TSAN-ARM-IOS: unsupported option '-fsanitize=thread' for target 'arm-apple-ios'
+
+// RUN: %clang -target i386-apple-iossimulator -fsanitize=thread %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-TSAN-I386-IOSSIMULATOR
+// CHECK-TSAN-I386-IOSSIMULATOR: unsupported option '-fsanitize=thread' for target 'i386-apple-iossimulator'
+
+// RUN: %clang -target i386-apple-tvossimulator -fsanitize=thread %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-TSAN-I386-TVOSSIMULATOR
+// CHECK-TSAN-I386-TVOSSIMULATOR: unsupported option '-fsanitize=thread' for target 'i386-apple-tvossimulator'
+
 // RUN: %clang -target x86_64-apple-darwin10 -fsanitize=function %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-FSAN-DARWIN
 // CHECK-FSAN-DARWIN: unsupported option '-fsanitize=function' for target 'x86_64-apple-darwin10'
 
@@ -230,26 +285,60 @@
 // CHECK-VPTR-DARWIN-NEW: -fsanitize=alignment,vptr
 
 // RUN: %clang -target armv7-apple-ios7 -miphoneos-version-min=7.0 -fsanitize=address %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-ASAN-IOS
-// CHECK-ASAN-IOS: unsupported option '-fsanitize=address' for target 'arm-apple-ios7'
+// CHECK-ASAN-IOS: -fsanitize=address
 
 // RUN: %clang -target i386-pc-openbsd -fsanitize=address %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-ASAN-OPENBSD
 // CHECK-ASAN-OPENBSD: unsupported option '-fsanitize=address' for target 'i386-pc-openbsd'
 
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=cfi -flto -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CFI
-// RUN: %clang -target x86_64-apple-darwin10 -fsanitize=cfi -flto -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CFI
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=cfi-derived-cast -flto -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CFI-DCAST
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=cfi-unrelated-cast -flto -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CFI-UCAST
-// RUN: %clang -target x86_64-linux-gnu -flto -fsanitize=cfi-nvcall -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CFI-NVCALL
-// RUN: %clang -target x86_64-linux-gnu -flto -fsanitize=cfi-vcall -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CFI-VCALL
+// RUN: %clang -target i686-linux-gnu -fsanitize=efficiency-cache-frag %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-ESAN-X86
+// RUN: %clang -target i686-linux-gnu -fsanitize=efficiency-working-set %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-ESAN-X86
+// CHECK-ESAN-X86: error: unsupported option '-fsanitize=efficiency-{{.*}}' for target 'i686--linux-gnu'
+
+// RUN: %clang -target x86_64-apple-darwin10 -fsanitize=efficiency-cache-frag %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-ESAN-DARWIN
+// RUN: %clang -target x86_64-apple-darwin10 -fsanitize=efficiency-working-set %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-ESAN-DARWIN
+// CHECK-ESAN-DARWIN: unsupported option '-fsanitize=efficiency-{{.*}}' for target 'x86_64-apple-darwin10'
+
+// RUN: %clang -target i386-apple-darwin -fsanitize=efficiency-cache-frag %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-ESAN-I386-DARWIN
+// RUN: %clang -target i386-apple-darwin -fsanitize=efficiency-working-set %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-ESAN-I386-DARWIN
+// CHECK-ESAN-I386-DARWIN: unsupported option '-fsanitize=efficiency-{{.*}}' for target 'i386-apple-darwin'
+
+// RUN: %clang -target arm-apple-ios -fsanitize=efficiency-cache-frag %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-ESAN-ARM-IOS
+// RUN: %clang -target arm-apple-ios -fsanitize=efficiency-working-set %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-ESAN-ARM-IOS
+// CHECK-ESAN-ARM-IOS: unsupported option '-fsanitize=efficiency-{{.*}}' for target 'arm-apple-ios'
+
+// RUN: %clang -target i386-apple-iossimulator -fsanitize=efficiency-cache-frag %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-ESAN-I386-IOSSIMULATOR
+// RUN: %clang -target i386-apple-iossimulator -fsanitize=efficiency-working-set %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-ESAN-I386-IOSSIMULATOR
+// CHECK-ESAN-I386-IOSSIMULATOR: unsupported option '-fsanitize=efficiency-{{.*}}' for target 'i386-apple-iossimulator'
+
+// RUN: %clang -target i386-apple-tvossimulator -fsanitize=efficiency-cache-frag %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-ESAN-I386-TVOSSIMULATOR
+// RUN: %clang -target i386-apple-tvossimulator -fsanitize=efficiency-working-set %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-ESAN-I386-TVOSSIMULATOR
+// CHECK-ESAN-I386-TVOSSIMULATOR: unsupported option '-fsanitize=efficiency-{{.*}}' for target 'i386-apple-tvossimulator'
+
+
+
+// RUN: %clang -target x86_64-linux-gnu -fvisibility=hidden -fsanitize=cfi -flto -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CFI
+// RUN: %clang -target x86_64-apple-darwin10 -fvisibility=hidden -fsanitize=cfi -flto -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CFI
+// RUN: %clang -target x86_64-linux-gnu -fvisibility=hidden -fsanitize=cfi-derived-cast -flto -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CFI-DCAST
+// RUN: %clang -target x86_64-linux-gnu -fvisibility=hidden -fsanitize=cfi-unrelated-cast -flto -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CFI-UCAST
+// RUN: %clang -target x86_64-linux-gnu -flto -fvisibility=hidden -fsanitize=cfi-nvcall -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CFI-NVCALL
+// RUN: %clang -target x86_64-linux-gnu -flto -fvisibility=hidden -fsanitize=cfi-vcall -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CFI-VCALL
 // CHECK-CFI: -emit-llvm-bc{{.*}}-fsanitize=cfi-derived-cast,cfi-icall,cfi-unrelated-cast,cfi-nvcall,cfi-vcall
 // CHECK-CFI-DCAST: -emit-llvm-bc{{.*}}-fsanitize=cfi-derived-cast
 // CHECK-CFI-UCAST: -emit-llvm-bc{{.*}}-fsanitize=cfi-unrelated-cast
 // CHECK-CFI-NVCALL: -emit-llvm-bc{{.*}}-fsanitize=cfi-nvcall
 // CHECK-CFI-VCALL: -emit-llvm-bc{{.*}}-fsanitize=cfi-vcall
 
-// RUN: %clang -target x86_64-linux-gnu -flto -fsanitize=cfi-derived-cast -fno-lto -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CFI-NOLTO
+// RUN: %clang -target x86_64-linux-gnu -fvisibility=hidden -flto -fsanitize=cfi-derived-cast -fno-lto -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CFI-NOLTO
 // CHECK-CFI-NOLTO: '-fsanitize=cfi-derived-cast' only allowed with '-flto'
 
+// RUN: %clang -target x86_64-linux-gnu -flto -fsanitize=cfi-derived-cast -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CFI-NOVIS
+// CHECK-CFI-NOVIS: '-fsanitize=cfi-derived-cast' only allowed with '-fvisibility='
+
+// RUN: %clang -target x86_64-pc-win32 -flto -fsanitize=cfi-derived-cast -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CFI-NOVIS-NOERROR
+// RUN: echo > %t.o
+// RUN: %clang -target x86_64-linux-gnu -flto -fsanitize=cfi-derived-cast %t.o -### 2>&1 | FileCheck %s --check-prefix=CHECK-CFI-NOVIS-NOERROR
+// CHECK-CFI-NOVIS-NOERROR-NOT: only allowed with
+
 // RUN: %clang -target mips-unknown-linux -fsanitize=cfi-icall %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CFI-ICALL-MIPS
 // CHECK-CFI-ICALL-MIPS: unsupported option '-fsanitize=cfi-icall' for target 'mips-unknown-linux'
 
@@ -272,6 +361,9 @@
 // CHECK-CFI-NO-CROSS-DSO: -emit-llvm-bc
 // CHECK-CFI-NO-CROSS-DSO-NOT: -fsanitize-cfi-cross-dso
 
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=cfi -fsanitize-stats -flto -c %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-CFI-STATS
+// CHECK-CFI-STATS: -fsanitize-stats
+
 // RUN: %clang_cl -fsanitize=address -c -MDd -### -- %s 2>&1 | FileCheck %s -check-prefix=CHECK-ASAN-DEBUGRTL
 // RUN: %clang_cl -fsanitize=address -c -MTd -### -- %s 2>&1 | FileCheck %s -check-prefix=CHECK-ASAN-DEBUGRTL
 // RUN: %clang_cl -fsanitize=address -c -LDd -### -- %s 2>&1 | FileCheck %s -check-prefix=CHECK-ASAN-DEBUGRTL
@@ -293,21 +385,30 @@
 // RUN: %clang -fno-sanitize=safe-stack -### %s 2>&1 | FileCheck %s -check-prefix=NOSP
 // NOSP-NOT: "-fsanitize=safe-stack"
 
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=safe-stack -### %s 2>&1 | FileCheck %s -check-prefix=SP
-// RUN: %clang -target x86_64-linux-gnu -fsanitize=address,safe-stack -### %s 2>&1 | FileCheck %s -check-prefix=SP-ASAN
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=safe-stack -### %s 2>&1 | FileCheck %s -check-prefix=NO-SP
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=address,safe-stack -### %s 2>&1 | FileCheck %s -check-prefix=NO-SP-ASAN
 // RUN: %clang -target x86_64-linux-gnu -fstack-protector -fsanitize=safe-stack -### %s 2>&1 | FileCheck %s -check-prefix=SP
 // RUN: %clang -target x86_64-linux-gnu -fsanitize=safe-stack -fstack-protector-all -### %s 2>&1 | FileCheck %s -check-prefix=SP
-// RUN: %clang -target arm-linux-androideabi -fsanitize=safe-stack -### %s 2>&1 | FileCheck %s -check-prefix=SP
-// RUN: %clang -target aarch64-linux-android -fsanitize=safe-stack -### %s 2>&1 | FileCheck %s -check-prefix=SP
-// SP-NOT: stack-protector
+// RUN: %clang -target arm-linux-androideabi -fsanitize=safe-stack -### %s 2>&1 | FileCheck %s -check-prefix=NO-SP
+// RUN: %clang -target aarch64-linux-android -fsanitize=safe-stack -### %s 2>&1 | FileCheck %s -check-prefix=NO-SP
+// NO-SP-NOT: stack-protector
+// NO-SP: "-fsanitize=safe-stack"
 // SP: "-fsanitize=safe-stack"
-// SP-ASAN-NOT: stack-protector
-// SP-ASAN: "-fsanitize=address,safe-stack"
+// SP: -stack-protector
+// NO-SP-NOT: stack-protector
+
+// NO-SP-ASAN-NOT: stack-protector
+// NO-SP-ASAN: "-fsanitize=address,safe-stack"
+// NO-SP-ASAN-NOT: stack-protector
 
 // RUN: %clang -target powerpc64-unknown-linux-gnu -fsanitize=memory %s -### 2>&1 | FileCheck %s -check-prefix=CHECK-SANM
 // RUN: %clang -target powerpc64le-unknown-linux-gnu -fsanitize=memory %s -### 2>&1 | FileCheck %s -check-prefix=CHECK-SANM
 // CHECK-SANM: "-fsanitize=memory"
 
+// RUN: %clang -target aarch64-unknown-cloudabi -fsanitize=safe-stack %s -### 2>&1 | FileCheck %s -check-prefix=SAFESTACK-CLOUDABI
+// RUN: %clang -target x86_64-unknown-cloudabi -fsanitize=safe-stack %s -### 2>&1 | FileCheck %s -check-prefix=SAFESTACK-CLOUDABI
+// SAFESTACK-CLOUDABI: "-fsanitize=safe-stack"
+
 // RUN: %clang -target x86_64-scei-ps4 -fsanitize=function -fsanitize=undefined %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-FSAN-UBSAN-PS4
 // CHECK-FSAN-UBSAN-PS4: unsupported option '-fsanitize=function' for target 'x86_64-scei-ps4'
 // RUN: %clang -target x86_64-scei-ps4 -fsanitize=function %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-FSAN-PS4
@@ -320,6 +421,9 @@
 // CHECK-MSAN-PS4: unsupported option '-fsanitize=memory' for target 'x86_64-scei-ps4'
 // RUN: %clang -target x86_64-scei-ps4 -fsanitize=thread %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-TSAN-PS4
 // CHECK-TSAN-PS4: unsupported option '-fsanitize=thread' for target 'x86_64-scei-ps4'
+// RUN: %clang -target x86_64-scei-ps4 -fsanitize=efficiency-cache-frag %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-ESAN-PS4
+// RUN: %clang -target x86_64-scei-ps4 -fsanitize=efficiency-working-set %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-ESAN-PS4
+// CHECK-ESAN-PS4: unsupported option '-fsanitize=efficiency-{{.*}}' for target 'x86_64-scei-ps4'
 // RUN: %clang -target x86_64-scei-ps4 -fsanitize=address %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-ASAN-PS4
 // Make sure there are no *.{o,bc} or -l passed before the ASan library.
 // CHECK-ASAN-PS4-NOT: {{(\.(o|bc)"? |-l).*-lSceDbgAddressSanitizer_stub_weak}}
diff --git a/test/Driver/fsjlj-exceptions.c b/test/Driver/fsjlj-exceptions.c
new file mode 100644
index 0000000000000..f44d5b3e1ef7d
--- /dev/null
+++ b/test/Driver/fsjlj-exceptions.c
@@ -0,0 +1,8 @@
+// RUN: %clang -target armv7-apple-ios -fexceptions -c %s -o /dev/null -### 2>&1 | FileCheck -check-prefix CHECK-IOS %s
+// RUN: %clang -target i686-windows-gnu -fexceptions -c %s -o /dev/null -### 2>&1 | FileCheck -check-prefix CHECK-MINGW-DEFAULT %s
+// RUN: %clang -target i686-windows-gnu -fexceptions -fsjlj-exceptions -c %s -o /dev/null -### 2>&1 | FileCheck -check-prefix CHECK-MINGW-SJLJ %s
+
+// CHECK-IOS: -fsjlj-exceptions
+// CHECK-MINGW-DEFAULT-NOT: -fsjlj-exceptions
+// CHECK-MINGW-SJLJ: -fsjlj-exceptions
+
diff --git a/test/Driver/fubsan-strip-path-components.cpp b/test/Driver/fubsan-strip-path-components.cpp
new file mode 100644
index 0000000000000..130024142f20d
--- /dev/null
+++ b/test/Driver/fubsan-strip-path-components.cpp
@@ -0,0 +1,2 @@
+// RUN: %clang %s -### -o %t.o -fsanitize-undefined-strip-path-components=42 2>&1 | FileCheck %s
+// CHECK: "-fsanitize-undefined-strip-path-components=42"
diff --git a/test/Driver/fuse-ld.c b/test/Driver/fuse-ld.c
index bd25b8deb3289..ca89eb997165a 100644
--- a/test/Driver/fuse-ld.c
+++ b/test/Driver/fuse-ld.c
@@ -1,4 +1,10 @@
 // RUN: %clang %s -### \
+// RUN:     -fuse-ld=/usr/local/bin/or1k-linux-ld 2>&1 \
+// RUN:   | FileCheck %s --check-prefix=CHECK-ABSOLUTE-LD
+// CHECK-ABSOLUTE-LD: /usr/local/bin/or1k-linux-ld
+
+
+// RUN: %clang %s -### \
 // RUN:     -target x86_64-unknown-freebsd 2>&1 \
 // RUN:   | FileCheck %s --check-prefix=CHECK-FREEBSD-LD
 // CHECK-FREEBSD-LD: ld
diff --git a/test/Driver/gcc-toolchain.cpp b/test/Driver/gcc-toolchain.cpp
index aa0e078160ef8..ca96757a2bbcf 100644
--- a/test/Driver/gcc-toolchain.cpp
+++ b/test/Driver/gcc-toolchain.cpp
@@ -1,13 +1,13 @@
 // Test that gcc-toolchain option is working correctly
 //
 // RUN: %clangxx -no-canonical-prefixes %s -### -o %t 2>&1 \
-// RUN:     --target=i386-unknown-linux \
+// RUN:     --target=i386-unknown-linux -stdlib=libstdc++ \
 // RUN:     --gcc-toolchain=%S/Inputs/ubuntu_11.04_multiarch_tree/usr \
 // RUN:   | FileCheck %s
 //
 // Additionally check that the legacy spelling of the flag works.
 // RUN: %clangxx -no-canonical-prefixes %s -### -o %t 2>&1 \
-// RUN:     --target=i386-unknown-linux \
+// RUN:     --target=i386-unknown-linux -stdlib=libstdc++ \
 // RUN:     -gcc-toolchain %S/Inputs/ubuntu_11.04_multiarch_tree/usr \
 // RUN:   | FileCheck %s
 //
diff --git a/test/Driver/hexagon-toolchain-elf.c b/test/Driver/hexagon-toolchain-elf.c
index e3a54ddf861ff..827c19186b441 100644
--- a/test/Driver/hexagon-toolchain-elf.c
+++ b/test/Driver/hexagon-toolchain-elf.c
@@ -41,7 +41,7 @@
 // RUN:   %s 2>&1 \
 // RUN:   | FileCheck -check-prefix=CHECK012 %s
 // CHECK012: "-cc1"
-// CHECK012-DAG-NOT: "-internal-isystem"
+// CHECK012-NOT: "-internal-isystem"
 // CHECK012-DAG: "-internal-externc-isystem" "{{.*}}/Inputs/hexagon_tree/Tools/bin/../target/hexagon/include"
 
 // RUN: %clangxx -### -target hexagon-unknown-elf -fno-integrated-as    \
@@ -51,8 +51,8 @@
 // RUN:   %s 2>&1 \
 // RUN:   | FileCheck -check-prefix=CHECK013 %s
 // CHECK013: "-cc1"
-// CHECK013-DAG-NOT: "-internal-isystem"
-// CHECK013-DAG-NOT: "-internal-externc-isystem"
+// CHECK013-NOT: "-internal-isystem"
+// CHECK013-NOT: "-internal-externc-isystem"
 
 // -----------------------------------------------------------------------------
 // Test -mcpu=<cpuname> -mv<number>
@@ -63,7 +63,7 @@
 // RUN:   %s 2>&1 \
 // RUN:   | FileCheck -check-prefix=CHECK020 %s
 // CHECK020: "-cc1" {{.*}} "-target-cpu" "hexagonv4"
-// CHECK020: "hexagon-link" {{.*}}/Inputs/hexagon_tree/Tools/bin/../target/hexagon/lib/v4/crt0
+// CHECK020: hexagon-link{{.*}}/Inputs/hexagon_tree/Tools/bin/../target/hexagon/lib/v4/crt0
 
 // RUN: %clang -### -target hexagon-unknown-elf \
 // RUN:   -ccc-install-dir %S/Inputs/hexagon_tree/Tools/bin \
@@ -71,7 +71,7 @@
 // RUN:   %s 2>&1 \
 // RUN:   | FileCheck -check-prefix=CHECK021 %s
 // CHECK021: "-cc1" {{.*}} "-target-cpu" "hexagonv5"
-// CHECK021: "hexagon-link" {{.*}}/Inputs/hexagon_tree/Tools/bin/../target/hexagon/lib/v5/crt0
+// CHECK021: hexagon-link{{.*}}/Inputs/hexagon_tree/Tools/bin/../target/hexagon/lib/v5/crt0
 
 // RUN: %clang -### -target hexagon-unknown-elf \
 // RUN:   -ccc-install-dir %S/Inputs/hexagon_tree/Tools/bin \
@@ -79,7 +79,7 @@
 // RUN:   %s 2>&1 \
 // RUN:   | FileCheck -check-prefix=CHECK022 %s
 // CHECK022: "-cc1" {{.*}} "-target-cpu" "hexagonv55"
-// CHECK022: "hexagon-link" {{.*}}/Inputs/hexagon_tree/Tools/bin/../target/hexagon/lib/v55/crt0
+// CHECK022: hexagon-link{{.*}}/Inputs/hexagon_tree/Tools/bin/../target/hexagon/lib/v55/crt0
 
 // RUN: %clang -### -target hexagon-unknown-elf \
 // RUN:   -ccc-install-dir %S/Inputs/hexagon_tree/Tools/bin \
@@ -87,7 +87,7 @@
 // RUN:   %s 2>&1 \
 // RUN:   | FileCheck -check-prefix=CHECK023 %s
 // CHECK023: "-cc1" {{.*}} "-target-cpu" "hexagonv60"
-// CHECK023: "hexagon-link" {{.*}}/Inputs/hexagon_tree/Tools/bin/../target/hexagon/lib/v60/crt0
+// CHECK023: hexagon-link{{.*}}/Inputs/hexagon_tree/Tools/bin/../target/hexagon/lib/v60/crt0
 
 // -----------------------------------------------------------------------------
 // Test Linker related args
diff --git a/test/Driver/incompatible_sysroot.c b/test/Driver/incompatible_sysroot.c
new file mode 100644
index 0000000000000..876f1f058dc99
--- /dev/null
+++ b/test/Driver/incompatible_sysroot.c
@@ -0,0 +1,15 @@
+// REQUIRES: x86-registered-target
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang -target x86_64-apple-darwin -Wincompatible-sysroot -isysroot SDKs/MacOSX10.9.sdk -mios-version-min=9.0 -S -o - %s 2>&1 | FileCheck -check-prefix CHECK-OSX-IOS %s
+// RUN: %clang -target arm64-apple-darwin -Wincompatible-sysroot -isysroot SDKs/iPhoneOS9.2.sdk -mwatchos-version-min=2.0 -S -o - %s 2>&1 | FileCheck -check-prefix CHECK-IOS-WATCHOS %s
+// RUN: %clang -target arm64-apple-darwin -Wincompatible-sysroot -isysroot SDKs/iPhoneOS9.2.sdk -mtvos-version-min=9.0 -S -o - %s 2>&1 | FileCheck -check-prefix CHECK-IOS-TVOS %s
+// RUN: %clang -target x86_64-apple-darwin -Wincompatible-sysroot -isysroot SDKs/iPhoneSimulator9.2.sdk -mios-version-min=9.0 -S -o - %s 2>&1 | FileCheck -check-prefix CHECK-IOS-IOSSIM %s
+// RUN: %clang -target x86_64-apple-darwin -Wno-incompatible-sysroot -isysroot SDKs/MacOSX10.9.sdk -mios-version-min=9.0 -S -o - %s 2>&1 | FileCheck -check-prefix CHECK-OSX-IOS-DISABLED %s
+
+int main() { return 0; }
+// CHECK-OSX-IOS: warning: using sysroot for 'MacOSX' but targeting 'iPhone'
+// CHECK-IOS-WATCHOS: warning: using sysroot for 'iPhoneOS' but targeting 'Watch'
+// CHECK-IOS-TVOS: warning: using sysroot for 'iPhoneOS' but targeting 'AppleTV'
+// CHECK-IOS-IOSSIM-NOT: warning: using sysroot for '{{.*}}' but targeting '{{.*}}'
+// CHECK-OSX-IOS-DISABLED-NOT: warning: using sysroot for '{{.*}}' but targeting '{{.*}}'
diff --git a/test/Driver/lanai-toolchain.c b/test/Driver/lanai-toolchain.c
new file mode 100644
index 0000000000000..55236665a24fe
--- /dev/null
+++ b/test/Driver/lanai-toolchain.c
@@ -0,0 +1,2 @@
+// RUN: %clang -target lanai-unknown-unknown -v 2> %t
+// RUN: grep 'Target: lanai-unknown-unknown' %t
diff --git a/test/Driver/lanai-unknown-unknown.cpp b/test/Driver/lanai-unknown-unknown.cpp
new file mode 100644
index 0000000000000..5ce0adf9f3a46
--- /dev/null
+++ b/test/Driver/lanai-unknown-unknown.cpp
@@ -0,0 +1,86 @@
+// RUN: %clang -target lanai-unknown-unknown -### %s -emit-llvm-only -c 2>&1 \
+// RUN:   | FileCheck %s -check-prefix=ECHO
+// RUN: %clang -target lanai-unknown-unknown %s -emit-llvm -S -o - \
+// RUN:   | FileCheck %s
+
+// ECHO: {{.*}} "-cc1" {{.*}}lanai-unknown-unknown.c
+
+typedef __builtin_va_list va_list;
+typedef __SIZE_TYPE__ size_t;
+typedef __PTRDIFF_TYPE__ ptrdiff_t;
+
+extern "C" {
+
+// CHECK: @align_c = global i32 1
+int align_c = __alignof(char);
+
+// CHECK: @align_s = global i32 2
+int align_s = __alignof(short);
+
+// CHECK: @align_i = global i32 4
+int align_i = __alignof(int);
+
+// CHECK: @align_l = global i32 4
+int align_l = __alignof(long);
+
+// CHECK: @align_ll = global i32 8
+int align_ll = __alignof(long long);
+
+// CHECK: @align_p = global i32 4
+int align_p = __alignof(void*);
+
+// CHECK: @align_vl = global i32 4
+int align_vl = __alignof(va_list);
+
+// Check types
+
+// CHECK: signext i8 @check_char()
+char check_char() { return 0; }
+
+// CHECK: signext i16 @check_short()
+short check_short() { return 0; }
+
+// CHECK: i32 @check_int()
+int check_int() { return 0; }
+
+// CHECK: i32 @check_long()
+long check_long() { return 0; }
+
+// CHECK: i64 @check_longlong()
+long long check_longlong() { return 0; }
+
+// CHECK: zeroext i8 @check_uchar()
+unsigned char check_uchar() { return 0; }
+
+// CHECK: zeroext i16 @check_ushort()
+unsigned short check_ushort() { return 0; }
+
+// CHECK: i32 @check_uint()
+unsigned int check_uint() { return 0; }
+
+// CHECK: i32 @check_ulong()
+unsigned long check_ulong() { return 0; }
+
+// CHECK: i64 @check_ulonglong()
+unsigned long long check_ulonglong() { return 0; }
+
+// CHECK: i32 @check_size_t()
+size_t check_size_t() { return 0; }
+
+}
+
+template<int> void Switch();
+template<> void Switch<4>();
+template<> void Switch<8>();
+template<> void Switch<16>();
+
+void check_pointer_size() {
+  // CHECK: SwitchILi4
+  Switch<sizeof(void*)>();
+
+  // CHECK: SwitchILi8
+  Switch<sizeof(long long)>();
+
+  // CHECK: SwitchILi4
+  Switch<sizeof(va_list)>();
+}
diff --git a/test/Driver/linker-opts.c b/test/Driver/linker-opts.c
index 24866a63b1fd9..29ef136c8b71f 100644
--- a/test/Driver/linker-opts.c
+++ b/test/Driver/linker-opts.c
@@ -1,3 +1,6 @@
+// RUN: rm -rf %t
+// RUN: mkdir %t
+//
 // RUN: env LIBRARY_PATH=%T/test1 %clang -x c %s -### 2>&1 | FileCheck %s
 // CHECK: "-L{{.*}}/test1"
 
@@ -9,3 +12,12 @@
 // Make sure that LIBRARY_PATH works for both i386 and x86_64 on Darwin.
 // RUN: env LIBRARY_PATH=%T/test1 %clang -target x86_64-apple-darwin %s -### 2>&1 | FileCheck %s
 // RUN: env LIBRARY_PATH=%T/test1 %clang -target i386-apple-darwin  %s -### 2>&1 | FileCheck %s
+//
+// Make sure that we don't warn on unused compiler arguments.
+// RUN: %clang -Xclang -I. -x c %s -c -o %t/tmp.o
+// RUN: %clang -Xclang -I. %t/tmp.o -o %t/tmp -### 2>&1 | FileCheck %s --check-prefix=NO-UNUSED
+// NO-UNUSED-NOT: warning:{{.*}}unused
+//
+// Make sure that we do warn in other cases.
+// RUN: %clang %s -lfoo -c -o %t/tmp2.o -### 2>&1 | FileCheck %s --check-prefix=UNUSED
+// UNUSED: warning:{{.*}}unused
diff --git a/test/Driver/linux-header-search.cpp b/test/Driver/linux-header-search.cpp
index bd1da4976d12a..5f6ac504a036c 100644
--- a/test/Driver/linux-header-search.cpp
+++ b/test/Driver/linux-header-search.cpp
@@ -64,7 +64,7 @@
 //
 // Test a very broken version of multiarch that shipped in Ubuntu 11.04.
 // RUN: %clang -no-canonical-prefixes %s -### -fsyntax-only 2>&1 \
-// RUN:     -target i386-unknown-linux \
+// RUN:     -target i386-unknown-linux -stdlib=libstdc++ \
 // RUN:     --sysroot=%S/Inputs/ubuntu_11.04_multiarch_tree \
 // RUN:     --gcc-toolchain="" \
 // RUN:   | FileCheck --check-prefix=CHECK-UBUNTU-11-04 %s
@@ -80,7 +80,7 @@
 // CHECK-UBUNTU-11-04: "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
 //
 // RUN: %clang -no-canonical-prefixes %s -### -fsyntax-only 2>&1 \
-// RUN:     -target x86_64-unknown-linux-gnu \
+// RUN:     -target x86_64-unknown-linux-gnu -stdlib=libstdc++ \
 // RUN:     --sysroot=%S/Inputs/ubuntu_13.04_multiarch_tree \
 // RUN:     --gcc-toolchain="" \
 // RUN:   | FileCheck --check-prefix=CHECK-UBUNTU-13-04 %s
@@ -97,7 +97,7 @@
 // CHECK-UBUNTU-13-04: "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
 //
 // RUN: %clang -no-canonical-prefixes %s -### -fsyntax-only 2>&1 \
-// RUN:     -target x86_64-unknown-linux-gnux32 \
+// RUN:     -target x86_64-unknown-linux-gnux32 -stdlib=libstdc++ \
 // RUN:     --sysroot=%S/Inputs/ubuntu_14.04_multiarch_tree \
 // RUN:     --gcc-toolchain="" \
 // RUN:   | FileCheck --check-prefix=CHECK-UBUNTU-14-04 %s
@@ -114,7 +114,7 @@
 // CHECK-UBUNTU-14-04: "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
 ///
 // RUN: %clang -no-canonical-prefixes %s -### -fsyntax-only 2>&1 \
-// RUN:     -target arm-linux-gnueabihf \
+// RUN:     -target arm-linux-gnueabihf -stdlib=libstdc++ \
 // RUN:     --sysroot=%S/Inputs/ubuntu_13.04_multiarch_tree \
 // RUN:     --gcc-toolchain="" \
 // RUN:   | FileCheck --check-prefix=CHECK-UBUNTU-13-04-CROSS %s
@@ -131,7 +131,7 @@
 //
 // Test Ubuntu/Debian's new version of multiarch, with -m32.
 // RUN: %clang -no-canonical-prefixes %s -### -fsyntax-only 2>&1 \
-// RUN:     -target x86_64-unknown-linux-gnu -m32 \
+// RUN:     -target x86_64-unknown-linux-gnu -m32 -stdlib=libstdc++ \
 // RUN:     --sysroot=%S/Inputs/ubuntu_13.04_multiarch_tree \
 // RUN:     --gcc-toolchain="" \
 // RUN:   | FileCheck --check-prefix=CHECK-UBUNTU-13-04-M32 %s
@@ -145,7 +145,7 @@
 // Test Ubuntu/Debian's Ubuntu 14.04 config variant, with -m32
 // and an empty 4.9 directory.
 // RUN: %clang -no-canonical-prefixes %s -### -fsyntax-only 2>&1 \
-// RUN:     -target x86_64-unknown-linux-gnu -m32 \
+// RUN:     -target x86_64-unknown-linux-gnu -m32 -stdlib=libstdc++ \
 // RUN:     --sysroot=%S/Inputs/ubuntu_14.04_multiarch_tree \
 // RUN:     --gcc-toolchain="" \
 // RUN:   | FileCheck --check-prefix=CHECK-UBUNTU-14-04-M32 %s
@@ -160,7 +160,7 @@
 // installed rather than relying on multilib. Also happens to look like an
 // actual i686 Ubuntu system.
 // RUN: %clang -no-canonical-prefixes %s -### -fsyntax-only 2>&1 \
-// RUN:     -target x86_64-unknown-linux-gnu -m32 \
+// RUN:     -target x86_64-unknown-linux-gnu -m32 -stdlib=libstdc++ \
 // RUN:     --sysroot=%S/Inputs/ubuntu_14.04_multiarch_tree2 \
 // RUN:     --gcc-toolchain="" \
 // RUN:   | FileCheck --check-prefix=CHECK-UBUNTU-14-04-I686 %s
@@ -173,7 +173,7 @@
 //
 // Test Ubuntu/Debian's Ubuntu 14.04 for powerpc64le
 // RUN: %clang -no-canonical-prefixes %s -### -fsyntax-only 2>&1 \
-// RUN:     -target powerpc64le-unknown-linux-gnu -m32 \
+// RUN:     -target powerpc64le-unknown-linux-gnu -m32 -stdlib=libstdc++ \
 // RUN:     --sysroot=%S/Inputs/ubuntu_14.04_multiarch_tree \
 // RUN:     --gcc-toolchain="" \
 // RUN:   | FileCheck --check-prefix=CHECK-UBUNTU-14-04-PPC64LE %s
@@ -189,7 +189,7 @@
 //
 // Thoroughly exercise the Debian multiarch environment.
 // RUN: %clang -no-canonical-prefixes %s -### -fsyntax-only 2>&1 \
-// RUN:     -target i686-linux-gnu \
+// RUN:     -target i686-linux-gnu -stdlib=libstdc++ \
 // RUN:     --sysroot=%S/Inputs/debian_multiarch_tree \
 // RUN:     --gcc-toolchain="" \
 // RUN:   | FileCheck --check-prefix=CHECK-DEBIAN-X86 %s
@@ -205,7 +205,7 @@
 // CHECK-DEBIAN-X86: "-internal-externc-isystem" "[[SYSROOT]]/include"
 // CHECK-DEBIAN-X86: "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
 // RUN: %clang -no-canonical-prefixes %s -### -fsyntax-only 2>&1 \
-// RUN:     -target x86_64-linux-gnu \
+// RUN:     -target x86_64-linux-gnu -stdlib=libstdc++ \
 // RUN:     --sysroot=%S/Inputs/debian_multiarch_tree \
 // RUN:     --gcc-toolchain="" \
 // RUN:   | FileCheck --check-prefix=CHECK-DEBIAN-X86-64 %s
@@ -221,7 +221,7 @@
 // CHECK-DEBIAN-X86-64: "-internal-externc-isystem" "[[SYSROOT]]/include"
 // CHECK-DEBIAN-X86-64: "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
 // RUN: %clang -no-canonical-prefixes %s -### -fsyntax-only 2>&1 \
-// RUN:     -target powerpc-linux-gnu \
+// RUN:     -target powerpc-linux-gnu -stdlib=libstdc++ \
 // RUN:     --sysroot=%S/Inputs/debian_multiarch_tree \
 // RUN:     --gcc-toolchain="" \
 // RUN:   | FileCheck --check-prefix=CHECK-DEBIAN-PPC %s
@@ -237,7 +237,7 @@
 // CHECK-DEBIAN-PPC: "-internal-externc-isystem" "[[SYSROOT]]/include"
 // CHECK-DEBIAN-PPC: "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
 // RUN: %clang -no-canonical-prefixes %s -### -fsyntax-only 2>&1 \
-// RUN:     -target powerpc64-linux-gnu \
+// RUN:     -target powerpc64-linux-gnu -stdlib=libstdc++ \
 // RUN:     --sysroot=%S/Inputs/debian_multiarch_tree \
 // RUN:     --gcc-toolchain="" \
 // RUN:   | FileCheck --check-prefix=CHECK-DEBIAN-PPC64 %s
@@ -256,7 +256,7 @@
 // Test Gentoo's weirdness both before and after they changed it in their GCC
 // 4.6.4 release.
 // RUN: %clang -no-canonical-prefixes %s -### -fsyntax-only 2>&1 \
-// RUN:     -target x86_64-unknown-linux-gnu \
+// RUN:     -target x86_64-unknown-linux-gnu -stdlib=libstdc++ \
 // RUN:     --sysroot=%S/Inputs/gentoo_linux_gcc_4.6.2_tree \
 // RUN:     --gcc-toolchain="" \
 // RUN:   | FileCheck --check-prefix=CHECK-GENTOO-4-6-2 %s
@@ -271,7 +271,7 @@
 // CHECK-GENTOO-4-6-2: "-internal-externc-isystem" "[[SYSROOT]]/include"
 // CHECK-GENTOO-4-6-2: "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
 // RUN: %clang -no-canonical-prefixes %s -### -fsyntax-only 2>&1 \
-// RUN:     -target x86_64-unknown-linux-gnu \
+// RUN:     -target x86_64-unknown-linux-gnu -stdlib=libstdc++ \
 // RUN:     --sysroot=%S/Inputs/gentoo_linux_gcc_4.6.4_tree \
 // RUN:     --gcc-toolchain="" \
 // RUN:   | FileCheck --check-prefix=CHECK-GENTOO-4-6-4 %s
@@ -285,10 +285,25 @@
 // CHECK-GENTOO-4-6-4: "-internal-isystem" "[[RESOURCE_DIR]]{{/|\\\\}}include"
 // CHECK-GENTOO-4-6-4: "-internal-externc-isystem" "[[SYSROOT]]/include"
 // CHECK-GENTOO-4-6-4: "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
+// RUN: %clang -no-canonical-prefixes %s -### -fsyntax-only 2>&1 \
+// RUN:     -target x86_64-unknown-linux-gnu -stdlib=libstdc++ \
+// RUN:     --sysroot=%S/Inputs/gentoo_linux_gcc_4.9.3_tree \
+// RUN:     --gcc-toolchain="" \
+// RUN:   | FileCheck --check-prefix=CHECK-GENTOO-4-9-3 %s
+// CHECK-GENTOO-4-9-3: "{{.*}}clang{{.*}}" "-cc1"
+// CHECK-GENTOO-4-9-3: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
+// CHECK-GENTOO-4-9-3: "-isysroot" "[[SYSROOT:[^"]+]]"
+// CHECK-GENTOO-4-9-3: "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/x86_64-pc-linux-gnu/4.9.3/include/g++-v4.9.3"
+// CHECK-GENTOO-4-9-3: "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/x86_64-pc-linux-gnu/4.9.3/include/g++-v4.9.3/x86_64-pc-linux-gnu"
+// CHECK-GENTOO-4-9-3: "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/x86_64-pc-linux-gnu/4.9.3/include/g++-v4.9.3/backward"
+// CHECK-GENTOO-4-9-3: "-internal-isystem" "[[SYSROOT]]/usr/local/include"
+// CHECK-GENTOO-4-9-3: "-internal-isystem" "[[RESOURCE_DIR]]{{/|\\\\}}include"
+// CHECK-GENTOO-4-9-3: "-internal-externc-isystem" "[[SYSROOT]]/include"
+// CHECK-GENTOO-4-9-3: "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
 //
 // Check header search on Debian 6 / MIPS64
 // RUN: %clang -no-canonical-prefixes %s -### -fsyntax-only 2>&1 \
-// RUN:     -target mips64-unknown-linux-gnuabi64 \
+// RUN:     -target mips64-unknown-linux-gnuabi64 -stdlib=libstdc++ \
 // RUN:     --sysroot=%S/Inputs/debian_6_mips64_tree \
 // RUN:     --gcc-toolchain="" \
 // RUN:   | FileCheck --check-prefix=CHECK-MIPS64-GNUABI %s
@@ -306,7 +321,7 @@
 //
 // Check header search on Debian 6 / MIPS64
 // RUN: %clang -no-canonical-prefixes %s -### -fsyntax-only 2>&1 \
-// RUN:     -target mips64el-unknown-linux-gnuabi64 \
+// RUN:     -target mips64el-unknown-linux-gnuabi64 -stdlib=libstdc++ \
 // RUN:     --sysroot=%S/Inputs/debian_6_mips64_tree \
 // RUN:     --gcc-toolchain="" \
 // RUN:   | FileCheck --check-prefix=CHECK-MIPS64EL-GNUABI %s
@@ -324,7 +339,7 @@
 
 // Check header search on Debian 8 / Sparc
 // RUN: %clang -no-canonical-prefixes %s -### -fsyntax-only 2>&1 \
-// RUN:     -target sparc-unknown-linux-gnu \
+// RUN:     -target sparc-unknown-linux-gnu -stdlib=libstdc++ \
 // RUN:     --sysroot=%S/Inputs/debian_8_sparc_multilib_tree \
 // RUN:     --gcc-toolchain="" \
 // RUN:   | FileCheck --check-prefix=CHECK-DEBIAN-SPARC32 %s
@@ -342,7 +357,7 @@
 
 // Check header search on Debian 8 / Sparc, with the oldstyle multilib packages
 // RUN: %clang -no-canonical-prefixes -m64 %s -### -fsyntax-only 2>&1 \
-// RUN:     -target sparc-unknown-linux-gnu \
+// RUN:     -target sparc-unknown-linux-gnu -stdlib=libstdc++ \
 // RUN:     --sysroot=%S/Inputs/debian_8_sparc_multilib_tree \
 // RUN:     --gcc-toolchain="" \
 // RUN:   | FileCheck --check-prefix=CHECK-DEBIAN-SPARC32-LIB64 %s
@@ -363,7 +378,7 @@
 
 // Check header search on Debian 8 / Sparc64
 // RUN: %clang -no-canonical-prefixes %s -### -fsyntax-only 2>&1 \
-// RUN:     -target sparc64-unknown-linux-gnu \
+// RUN:     -target sparc64-unknown-linux-gnu -stdlib=libstdc++ \
 // RUN:     --sysroot=%S/Inputs/debian_8_sparc64_tree \
 // RUN:     --gcc-toolchain="" \
 // RUN:   | FileCheck --check-prefix=CHECK-DEBIAN-SPARC64 %s
diff --git a/test/Driver/linux-ld.c b/test/Driver/linux-ld.c
index c15e24d294a98..f9f4b482c920d 100644
--- a/test/Driver/linux-ld.c
+++ b/test/Driver/linux-ld.c
@@ -388,6 +388,15 @@
 // CHECK-GCC-VERSION4: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
 // CHECK-GCC-VERSION4: "{{.*}}/Inputs/gcc_version_parsing4/bin/../lib/gcc/i386-unknown-linux/4.7.99{{/|\\\\}}crtbegin.o"
 // CHECK-GCC-VERSION4: "-L{{.*}}/Inputs/gcc_version_parsing4/bin/../lib/gcc/i386-unknown-linux/4.7.99"
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:     --target=i386-unknown-linux -m32 \
+// RUN:     -ccc-install-dir %S/Inputs/gcc_version_parsing5/bin \
+// RUN:     --gcc-toolchain="" \
+// RUN:     --sysroot=%S/Inputs/basic_linux_tree \
+// RUN:   | FileCheck --check-prefix=CHECK-GCC-VERSION5 %s
+// CHECK-GCC-VERSION5: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
+// CHECK-GCC-VERSION5: "{{.*}}/Inputs/gcc_version_parsing5/bin/../lib/gcc/i386-unknown-linux/5{{/|\\\\}}crtbegin.o"
+// CHECK-GCC-VERSION5: "-L{{.*}}/Inputs/gcc_version_parsing5/bin/../lib/gcc/i386-unknown-linux/5"
 //
 // Test a simulated installation of libc++ on Linux, both through sysroot and
 // the installation path of Clang.
@@ -474,7 +483,7 @@
 // RUN:     --sysroot=%S/Inputs/x86-64_ubuntu_13.10 \
 // RUN:   | FileCheck --check-prefix=CHECK-X86-64-UBUNTU-13-10-ARM-HF %s
 // CHECK-X86-64-UBUNTU-13-10-ARM-HF: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
-// CHECK-X86-64-UBUNTU-13-10-ARM-HF: "-dynamic-linker" "/lib/ld-linux-armhf.so.3"
+// CHECK-X86-64-UBUNTU-13-10-ARM-HF: "-dynamic-linker" "{{(/usr/arm--linux-gnueabihf)?}}/lib/ld-linux-armhf.so.3"
 // CHECK-X86-64-UBUNTU-13-10-ARM-HF: "{{.*}}/usr/lib/gcc-cross/arm-linux-gnueabihf/4.8/../../../../arm-linux-gnueabihf/lib/../lib{{/|\\\\}}crt1.o"
 // CHECK-X86-64-UBUNTU-13-10-ARM-HF: "{{.*}}/usr/lib/gcc-cross/arm-linux-gnueabihf/4.8/../../../../arm-linux-gnueabihf/lib/../lib{{/|\\\\}}crti.o"
 // CHECK-X86-64-UBUNTU-13-10-ARM-HF: "{{.*}}/usr/lib/gcc-cross/arm-linux-gnueabihf/4.8{{/|\\\\}}crtbegin.o"
@@ -493,7 +502,7 @@
 // RUN:     --sysroot=%S/Inputs/x86-64_ubuntu_13.10 \
 // RUN:   | FileCheck --check-prefix=CHECK-X86-64-UBUNTU-13-10-ARM %s
 // CHECK-X86-64-UBUNTU-13-10-ARM: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
-// CHECK-X86-64-UBUNTU-13-10-ARM: "-dynamic-linker" "/lib/ld-linux.so.3"
+// CHECK-X86-64-UBUNTU-13-10-ARM: "-dynamic-linker" "{{(/usr/arm--linux-gnueabi)?}}/lib/ld-linux.so.3"
 // CHECK-X86-64-UBUNTU-13-10-ARM: "{{.*}}/usr/lib/gcc-cross/arm-linux-gnueabi/4.7/../../../../arm-linux-gnueabi/lib/../lib{{/|\\\\}}crt1.o"
 // CHECK-X86-64-UBUNTU-13-10-ARM: "{{.*}}/usr/lib/gcc-cross/arm-linux-gnueabi/4.7/../../../../arm-linux-gnueabi/lib/../lib{{/|\\\\}}crti.o"
 // CHECK-X86-64-UBUNTU-13-10-ARM: "{{.*}}/usr/lib/gcc-cross/arm-linux-gnueabi/4.7{{/|\\\\}}crtbegin.o"
@@ -636,7 +645,7 @@
 // RUN:   | FileCheck --check-prefix=CHECK-PPC64 %s
 // CHECK-PPC64: "{{.*}}ld{{(.exe)?}}"
 // CHECK-PPC64: "-m" "elf64ppc"
-// CHECK-PPC64: "-dynamic-linker" "{{.*}}/lib64/ld64.so.1"
+// CHECK-PPC64: "-dynamic-linker" "{{.*}}/lib{{(64)?}}/ld64.so.1"
 //
 // RUN: %clang %s -### -o %t.o 2>&1 \
 // RUN:     --target=powerpc64-linux-gnu -mabi=elfv1 \
@@ -646,35 +655,35 @@
 // RUN:   | FileCheck --check-prefix=CHECK-PPC64-ELFv1 %s
 // CHECK-PPC64-ELFv1: "{{.*}}ld{{(.exe)?}}"
 // CHECK-PPC64-ELFv1: "-m" "elf64ppc"
-// CHECK-PPC64-ELFv1: "-dynamic-linker" "{{.*}}/lib64/ld64.so.1"
+// CHECK-PPC64-ELFv1: "-dynamic-linker" "{{.*}}/lib{{(64)?}}/ld64.so.1"
 //
 // RUN: %clang %s -### -o %t.o 2>&1 \
 // RUN:     --target=powerpc64-linux-gnu -mabi=elfv2 \
 // RUN:   | FileCheck --check-prefix=CHECK-PPC64-ELFv2 %s
 // CHECK-PPC64-ELFv2: "{{.*}}ld{{(.exe)?}}"
 // CHECK-PPC64-ELFv2: "-m" "elf64ppc"
-// CHECK-PPC64-ELFv2: "-dynamic-linker" "{{.*}}/lib64/ld64.so.2"
+// CHECK-PPC64-ELFv2: "-dynamic-linker" "{{.*}}/lib{{(64)?}}/ld64.so.2"
 //
 // RUN: %clang %s -### -o %t.o 2>&1 \
 // RUN:     --target=powerpc64le-linux-gnu \
 // RUN:   | FileCheck --check-prefix=CHECK-PPC64LE %s
 // CHECK-PPC64LE: "{{.*}}ld{{(.exe)?}}"
 // CHECK-PPC64LE: "-m" "elf64lppc"
-// CHECK-PPC64LE: "-dynamic-linker" "{{.*}}/lib64/ld64.so.2"
+// CHECK-PPC64LE: "-dynamic-linker" "{{.*}}/lib{{(64)?}}/ld64.so.2"
 //
 // RUN: %clang %s -### -o %t.o 2>&1 \
 // RUN:     --target=powerpc64le-linux-gnu -mabi=elfv1 \
 // RUN:   | FileCheck --check-prefix=CHECK-PPC64LE-ELFv1 %s
 // CHECK-PPC64LE-ELFv1: "{{.*}}ld{{(.exe)?}}"
 // CHECK-PPC64LE-ELFv1: "-m" "elf64lppc"
-// CHECK-PPC64LE-ELFv1: "-dynamic-linker" "{{.*}}/lib64/ld64.so.1"
+// CHECK-PPC64LE-ELFv1: "-dynamic-linker" "{{.*}}/lib{{(64)?}}/ld64.so.1"
 //
 // RUN: %clang %s -### -o %t.o 2>&1 \
 // RUN:     --target=powerpc64le-linux-gnu -mabi=elfv2 \
 // RUN:   | FileCheck --check-prefix=CHECK-PPC64LE-ELFv2 %s
 // CHECK-PPC64LE-ELFv2: "{{.*}}ld{{(.exe)?}}"
 // CHECK-PPC64LE-ELFv2: "-m" "elf64lppc"
-// CHECK-PPC64LE-ELFv2: "-dynamic-linker" "{{.*}}/lib64/ld64.so.2"
+// CHECK-PPC64LE-ELFv2: "-dynamic-linker" "{{.*}}/lib{{(64)?}}/ld64.so.2"
 //
 // Check that we do not pass --hash-style=gnu and --hash-style=both to linker
 // and provide correct path to the dynamic linker and emulation mode when build
@@ -714,7 +723,7 @@
 // RUN:   | FileCheck --check-prefix=CHECK-MIPS64 %s
 // CHECK-MIPS64: "{{.*}}ld{{(.exe)?}}"
 // CHECK-MIPS64: "-m" "elf64btsmip"
-// CHECK-MIPS64: "-dynamic-linker" "{{.*}}/lib64/ld.so.1"
+// CHECK-MIPS64: "-dynamic-linker" "{{.*}}/lib{{(64)?}}/ld.so.1"
 // CHECK-MIPS64-NOT: "--hash-style={{gnu|both}}"
 //
 // RUN: %clang %s -### -o %t.o 2>&1 \
@@ -722,21 +731,21 @@
 // RUN:   | FileCheck --check-prefix=CHECK-MIPS64EL %s
 // CHECK-MIPS64EL: "{{.*}}ld{{(.exe)?}}"
 // CHECK-MIPS64EL: "-m" "elf64ltsmip"
-// CHECK-MIPS64EL: "-dynamic-linker" "{{.*}}/lib64/ld.so.1"
+// CHECK-MIPS64EL: "-dynamic-linker" "{{.*}}/lib{{(64)?}}/ld.so.1"
 // CHECK-MIPS64EL-NOT: "--hash-style={{gnu|both}}"
 //
 // RUN: %clang %s -### -o %t.o 2>&1 --target=mips64el-linux-gnu -mnan=2008 \
 // RUN:   | FileCheck --check-prefix=CHECK-MIPS64EL-NAN2008 %s
 // CHECK-MIPS64EL-NAN2008: "{{.*}}ld{{(.exe)?}}"
 // CHECK-MIPS64EL-NAN2008: "-m" "elf64ltsmip"
-// CHECK-MIPS64EL-NAN2008: "-dynamic-linker" "{{.*}}/lib64/ld-linux-mipsn8.so.1"
+// CHECK-MIPS64EL-NAN2008: "-dynamic-linker" "{{.*}}/lib{{(64)?}}/ld-linux-mipsn8.so.1"
 // CHECK-MIPS64EL-NAN2008-NOT: "--hash-style={{gnu|both}}"
 //
 // RUN: %clang %s -### -o %t.o 2>&1 --target=mips64el-linux-gnu -mcpu=mips64r6 \
 // RUN:   | FileCheck --check-prefix=CHECK-MIPS64R6EL %s
 // CHECK-MIPS64R6EL: "{{.*}}ld{{(.exe)?}}"
 // CHECK-MIPS64R6EL: "-m" "elf64ltsmip"
-// CHECK-MIPS64R6EL: "-dynamic-linker" "{{.*}}/lib64/ld-linux-mipsn8.so.1"
+// CHECK-MIPS64R6EL: "-dynamic-linker" "{{.*}}/lib{{(64)?}}/ld-linux-mipsn8.so.1"
 // CHECK-MIPS64R6EL-NOT: "--hash-style={{gnu|both}}"
 //
 // RUN: %clang %s -### -o %t.o 2>&1 \
@@ -744,7 +753,7 @@
 // RUN:   | FileCheck --check-prefix=CHECK-MIPS64-N32 %s
 // CHECK-MIPS64-N32: "{{.*}}ld{{(.exe)?}}"
 // CHECK-MIPS64-N32: "-m" "elf32btsmipn32"
-// CHECK-MIPS64-N32: "-dynamic-linker" "{{.*}}/lib32/ld.so.1"
+// CHECK-MIPS64-N32: "-dynamic-linker" "{{.*}}/lib{{(32)?}}/ld.so.1"
 // CHECK-MIPS64-N32-NOT: "--hash-style={{gnu|both}}"
 //
 // RUN: %clang %s -### -o %t.o 2>&1 \
@@ -752,36 +761,44 @@
 // RUN:   | FileCheck --check-prefix=CHECK-MIPS64EL-N32 %s
 // CHECK-MIPS64EL-N32: "{{.*}}ld{{(.exe)?}}"
 // CHECK-MIPS64EL-N32: "-m" "elf32ltsmipn32"
-// CHECK-MIPS64EL-N32: "-dynamic-linker" "{{.*}}/lib32/ld.so.1"
+// CHECK-MIPS64EL-N32: "-dynamic-linker" "{{.*}}/lib{{(32)?}}/ld.so.1"
 // CHECK-MIPS64EL-N32-NOT: "--hash-style={{gnu|both}}"
 //
 // RUN: %clang %s -### -o %t.o 2>&1 --target=mips64el-linux-gnu -mabi=n32 \
 // RUN:   -mnan=2008 | FileCheck --check-prefix=CHECK-MIPS64EL-N32-NAN2008 %s
 // CHECK-MIPS64EL-N32-NAN2008: "{{.*}}ld{{(.exe)?}}"
 // CHECK-MIPS64EL-N32-NAN2008: "-m" "elf32ltsmipn32"
-// CHECK-MIPS64EL-N32-NAN2008: "-dynamic-linker" "{{.*}}/lib32/ld-linux-mipsn8.so.1"
+// CHECK-MIPS64EL-N32-NAN2008: "-dynamic-linker" "{{.*}}/lib{{(32)?}}/ld-linux-mipsn8.so.1"
 // CHECK-MIPS64EL-N32-NAN2008-NOT: "--hash-style={{gnu|both}}"
 //
+// RUN: %clang %s -### -o %t.o 2>&1 --target=mips64el-redhat-linux \
+// RUN:   | FileCheck --check-prefix=CHECK-MIPS64EL-REDHAT %s
+// CHECK-MIPS64EL-REDHAT: "{{.*}}ld{{(.exe)?}}"
+// CHECK-MIPS64EL-REDHAT: "-m" "elf64ltsmip"
+// CHECK-MIPS64EL-REDHAT: "-dynamic-linker" "{{.*}}/lib{{(64)?}}/ld.so.1"
+// CHECK-MIPS64EL-REDHAT-NOT: "-dynamic-linker" "{{.*}}/lib{{(64)?}}/ld-musl-mipsel.so.1"
+// CHECK-MIPS64EL-REDHAT-NOT: "--hash-style={{gnu|both}}"
+//
 // RUN: %clang %s -### -o %t.o 2>&1 \
 // RUN:     --target=sparc-unknown-linux-gnu \
 // RUN:   | FileCheck --check-prefix=CHECK-SPARCV8 %s
 // CHECK-SPARCV8: "{{.*}}ld{{(.exe)?}}"
 // CHECK-SPARCV8: "-m" "elf32_sparc"
-// CHECK-SPARCV8: "-dynamic-linker" "/lib/ld-linux.so.2"
+// CHECK-SPARCV8: "-dynamic-linker" "{{(/usr/sparc-unknown-linux-gnu)?}}/lib/ld-linux.so.2"
 //
 // RUN: %clang %s -### -o %t.o 2>&1 \
 // RUN:     --target=sparcel-unknown-linux-gnu \
 // RUN:   | FileCheck --check-prefix=CHECK-SPARCV8EL %s
 // CHECK-SPARCV8EL: "{{.*}}ld{{(.exe)?}}"
 // CHECK-SPARCV8EL: "-m" "elf32_sparc"
-// CHECK-SPARCV8EL: "-dynamic-linker" "/lib/ld-linux.so.2"
+// CHECK-SPARCV8EL: "-dynamic-linker" "{{(/usr/sparcel-unknown-linux-gnu)?}}/lib/ld-linux.so.2"
 //
 // RUN: %clang %s -### -o %t.o 2>&1 \
 // RUN:     --target=sparcv9-unknown-linux-gnu \
 // RUN:   | FileCheck --check-prefix=CHECK-SPARCV9 %s
 // CHECK-SPARCV9: "{{.*}}ld{{(.exe)?}}"
 // CHECK-SPARCV9: "-m" "elf64_sparc"
-// CHECK-SPARCV9: "-dynamic-linker" "/lib64/ld-linux.so.2"
+// CHECK-SPARCV9: "-dynamic-linker" "{{(/usr/sparcv9-unknown-linux-gnu)?}}/lib{{(64)?}}/ld-linux.so.2"
 //
 // Thoroughly exercise the Debian multiarch environment.
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
@@ -1553,7 +1570,7 @@
 // RUN:   | FileCheck --check-prefix=CHECK-ARMEB %s
 // CHECK-ARMEB: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
 // CHECK-ARMEB-NOT: "--be8"
-// CHECK-ARMEB: "-m" "armebelf_linux_eabi"
+// CHECK-ARMEB: "-m" "armelfb_linux_eabi"
 
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=armebv7-unknown-linux \
@@ -1562,4 +1579,74 @@
 // RUN:   | FileCheck --check-prefix=CHECK-ARMV7EB %s
 // CHECK-ARMV7EB: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
 // CHECK-ARMV7EB: "--be8"
-// CHECK-ARMV7EB: "-m" "armebelf_linux_eabi"
+// CHECK-ARMV7EB: "-m" "armelfb_linux_eabi"
+
+// Check dynamic-linker for musl-libc
+// RUN: %clang %s -### -o %t.o 2>&1 \
+// RUN:     --target=i386-pc-linux-musl \
+// RUN:   | FileCheck --check-prefix=CHECK-MUSL-X86 %s
+// RUN: %clang %s -### -o %t.o 2>&1 \
+// RUN:     --target=x86_64-pc-linux-musl \
+// RUN:   | FileCheck --check-prefix=CHECK-MUSL-X86_64 %s
+// RUN: %clang %s -### -o %t.o 2>&1 \
+// RUN:     --target=mips-pc-linux-musl \
+// RUN:   | FileCheck --check-prefix=CHECK-MUSL-MIPS %s
+// RUN: %clang %s -### -o %t.o 2>&1 \
+// RUN:     --target=mipsel-pc-linux-musl \
+// RUN:   | FileCheck --check-prefix=CHECK-MUSL-MIPSEL %s
+// RUN: %clang %s -### -o %t.o 2>&1 \
+// RUN:     --target=mips64-pc-linux-musl \
+// RUN:   | FileCheck --check-prefix=CHECK-MUSL-MIPS64 %s
+// RUN: %clang %s -### -o %t.o 2>&1 \
+// RUN:     --target=mips64el-pc-linux-musl \
+// RUN:   | FileCheck --check-prefix=CHECK-MUSL-MIPS64EL %s
+// RUN: %clang %s -### -o %t.o 2>&1 \
+// RUN:     --target=powerpc-pc-linux-musl \
+// RUN:   | FileCheck --check-prefix=CHECK-MUSL-PPC %s
+// RUN: %clang %s -### -o %t.o 2>&1 \
+// RUN:     --target=powerpc64-pc-linux-musl \
+// RUN:   | FileCheck --check-prefix=CHECK-MUSL-PPC64 %s
+// RUN: %clang %s -### -o %t.o 2>&1 \
+// RUN:     --target=thumb-pc-linux-musleabi \
+// RUN:   | FileCheck --check-prefix=CHECK-MUSL-ARM %s
+// RUN: %clang %s -### -o %t.o 2>&1 \
+// RUN:     --target=thumb-pc-linux-musleabihf \
+// RUN:   | FileCheck --check-prefix=CHECK-MUSL-ARMHF %s
+// RUN: %clang %s -### -o %t.o 2>&1 \
+// RUN:     --target=thumbeb-pc-linux-musleabi \
+// RUN:   | FileCheck --check-prefix=CHECK-MUSL-ARMEB %s
+// RUN: %clang %s -### -o %t.o 2>&1 \
+// RUN:     --target=thumbeb-pc-linux-musleabihf \
+// RUN:   | FileCheck --check-prefix=CHECK-MUSL-ARMEBHF %s
+// RUN: %clang %s -### -o %t.o 2>&1 \
+// RUN:     --target=arm-pc-linux-musleabi \
+// RUN:   | FileCheck --check-prefix=CHECK-MUSL-ARM %s
+// RUN: %clang %s -### -o %t.o 2>&1 \
+// RUN:     --target=arm-pc-linux-musleabihf \
+// RUN:   | FileCheck --check-prefix=CHECK-MUSL-ARMHF %s
+// RUN: %clang %s -### -o %t.o 2>&1 \
+// RUN:     --target=armeb-pc-linux-musleabi \
+// RUN:   | FileCheck --check-prefix=CHECK-MUSL-ARMEB %s
+// RUN: %clang %s -### -o %t.o 2>&1 \
+// RUN:     --target=armeb-pc-linux-musleabihf \
+// RUN:   | FileCheck --check-prefix=CHECK-MUSL-ARMEBHF %s
+// RUN: %clang %s -### -o %t.o 2>&1 \
+// RUN:     --target=aarch64-pc-linux-musleabi \
+// RUN:   | FileCheck --check-prefix=CHECK-MUSL-AARCH64 %s
+// RUN: %clang %s -### -o %t.o 2>&1 \
+// RUN:     --target=aarch64_be-pc-linux-musleabi \
+// RUN:   | FileCheck --check-prefix=CHECK-MUSL-AARCH64_BE %s
+// CHECK-MUSL-X86:        "-dynamic-linker" "/lib/ld-musl-i386.so.1"
+// CHECK-MUSL-X86_64:     "-dynamic-linker" "/lib/ld-musl-x86_64.so.1"
+// CHECK-MUSL-MIPS:       "-dynamic-linker" "/lib/ld-musl-mips.so.1"
+// CHECK-MUSL-MIPSEL:     "-dynamic-linker" "/lib/ld-musl-mipsel.so.1"
+// CHECK-MUSL-MIPS64:     "-dynamic-linker" "/lib/ld-musl-mips64.so.1"
+// CHECK-MUSL-MIPS64EL:   "-dynamic-linker" "/lib/ld-musl-mips64el.so.1"
+// CHECK-MUSL-PPC:        "-dynamic-linker" "/lib/ld-musl-powerpc.so.1"
+// CHECK-MUSL-PPC64:      "-dynamic-linker" "/lib/ld-musl-powerpc64.so.1"
+// CHECK-MUSL-ARM:        "-dynamic-linker" "/lib/ld-musl-arm.so.1"
+// CHECK-MUSL-ARMHF:      "-dynamic-linker" "/lib/ld-musl-armhf.so.1"
+// CHECK-MUSL-ARMEB:      "-dynamic-linker" "/lib/ld-musl-armeb.so.1"
+// CHECK-MUSL-ARMEBHF:    "-dynamic-linker" "/lib/ld-musl-armebhf.so.1"
+// CHECK-MUSL-AARCH64:    "-dynamic-linker" "/lib/ld-musl-aarch64.so.1"
+// CHECK-MUSL-AARCH64_BE: "-dynamic-linker" "/lib/ld-musl-aarch64_be.so.1"
diff --git a/test/Driver/lit.local.cfg b/test/Driver/lit.local.cfg
index 6c2373bd21789..ff831e7fe4882 100644
--- a/test/Driver/lit.local.cfg
+++ b/test/Driver/lit.local.cfg
@@ -1,5 +1,5 @@
 config.suffixes = ['.c', '.cpp', '.h', '.m', '.mm', '.S', '.s', '.f90', '.f95',
-                   '.cu']
+                   '.cu', '.rs', '.cl']
 config.substitutions = list(config.substitutions)
 config.substitutions.insert(0,
     ('%clang_cc1',
diff --git a/test/Driver/lto.c b/test/Driver/lto.c
index 3f66274ee6fc8..d2f68f571afd2 100644
--- a/test/Driver/lto.c
+++ b/test/Driver/lto.c
@@ -49,3 +49,12 @@
 // RUN: FileCheck -check-prefix=CHECK-LINK-NOLTO-ACTION < %t %s
 //
 // CHECK-LINK-NOLTO-ACTION-NOT: "-plugin" "{{.*}}/LLVMgold.so"
+
+// -flto passes along an explicit debugger tuning argument.
+// RUN: %clang -target x86_64-unknown-linux -### %s -flto -glldb 2> %t
+// RUN: FileCheck -check-prefix=CHECK-TUNING-LLDB < %t %s
+// RUN: %clang -target x86_64-unknown-linux -### %s -flto -g 2> %t
+// RUN: FileCheck -check-prefix=CHECK-NO-TUNING < %t %s
+//
+// CHECK-TUNING-LLDB:   "-plugin-opt=-debugger-tune=lldb"
+// CHECK-NO-TUNING-NOT: "-plugin-opt=-debugger-tune
diff --git a/test/Driver/miamcu-opt.c b/test/Driver/miamcu-opt.c
new file mode 100644
index 0000000000000..7f96998837e9a
--- /dev/null
+++ b/test/Driver/miamcu-opt.c
@@ -0,0 +1,36 @@
+// REQUIRES: clang-driver
+// REQUIRES: x86-registered-target
+//
+// RUN: %clang -miamcu -no-canonical-prefixes %s -### -o %t.o 2>&1 | FileCheck %s
+// RUN: %clang -miamcu -no-canonical-prefixes -m32 %s -### -o %t.o 2>&1 | FileCheck %s
+// RUN: %clang -miamcu -no-canonical-prefixes -target x86_64-unknown-linux-gnu %s -### -o %t.o 2>&1 | FileCheck %s
+// RUN: %clang -mno-iamcu -miamcu -no-canonical-prefixes %s -### -o %t.o 2>&1 | FileCheck %s
+// RUN: %clang -miamcu -no-canonical-prefixes -m64 %s -### -o %t.o 2>&1 | FileCheck %s -check-prefix=M64
+// RUN: %clang -miamcu -no-canonical-prefixes -dynamic %s -### -o %t.o 2>&1 | FileCheck %s -check-prefix=DYNAMIC
+// RUN: %clang -miamcu -no-canonical-prefixes  -target armv8-eabi %s -### -o %t.o 2>&1 | FileCheck %s -check-prefix=NOT-X86
+// RUN: %clang -miamcu -mno-iamcu -no-canonical-prefixes -target x86_64-unknown-linux-gnu %s -### -o %t.o 2>&1 | FileCheck %s -check-prefix=MNOIAMCU
+
+// M64: error: invalid argument '-miamcu' not allowed with '-m64'
+
+// DYNAMIC: error: invalid argument '-dynamic' not allowed with '-static'
+
+// NOT-X86: error: unsupported option '-miamcu' for target 'armv8---eabi'
+
+// MNOIAMCU-NOT: "-triple" "i586-intel-elfiamcu"
+
+// CHECK: "{{.*}}clang{{.*}}" "-cc1"
+// CHECK: "-triple" "i586-intel-elfiamcu"
+// CHECK: "-static-define"
+// CHECK: "-mfloat-abi" "soft"
+// CHECK: "-mstack-alignment=4"
+
+// CHECK: "{{.*}}ld{{(.exe)?}}"
+// CHECK: "-m" "elf_iamcu"
+// CHECK: "-static"
+// CHECK-NOT: crt1
+// CHECK-NOT: crti
+// CHECK-NOT: ctrbegin
+// CHECK: crt0
+// CHECK: "--start-group" "-lgcc" "-lc" "-lgloss" "--end-group" "--as-needed" "-lsoftfp" "--no-as-needed"
+// CHECK-NOT: crtend
+// CHECK-NOT: ctrn
diff --git a/test/Driver/miamcu-opt.cpp b/test/Driver/miamcu-opt.cpp
new file mode 100644
index 0000000000000..6c8d7552bc133
--- /dev/null
+++ b/test/Driver/miamcu-opt.cpp
@@ -0,0 +1,3 @@
+// RUN: %clang -miamcu %s -### -o %t.o 2>&1 | FileCheck %s
+
+// CHECK: error: the clang compiler does not support 'C++ for IAMCU'
diff --git a/test/Driver/mips-abi.c b/test/Driver/mips-abi.c
index cede6850a023d..8e3f7c0cd12c3 100644
--- a/test/Driver/mips-abi.c
+++ b/test/Driver/mips-abi.c
@@ -1,14 +1,38 @@
 // Check passing Mips ABI options to the backend.
 //
 // RUN: %clang -target mips-linux-gnu -### -c %s 2>&1 \
-// RUN:   | FileCheck -check-prefix=MIPS-DEF %s
-// MIPS-DEF: "-target-cpu" "mips32r2"
-// MIPS-DEF: "-target-abi" "o32"
+// RUN:   | FileCheck -check-prefix=MIPS32R2-O32 %s
+// RUN: %clang -target mips64-linux-gnu -mips32r2 -mabi=32 -### -c %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=MIPS32R2-O32 %s
+// MIPS32R2-O32: "-target-cpu" "mips32r2"
+// MIPS32R2-O32: "-target-abi" "o32"
+//
+// FIXME: This is a valid combination of options but we reject it at the moment
+//        because the backend can't handle it.
+// RUN: not %clang -target mips-linux-gnu -c %s \
+// RUN:        -march=mips64r2 -mabi=32 2>&1 \
+// RUN:   | FileCheck -check-prefix=MIPS64R2-O32 %s
+// MIPS64R2-O32: error: ABI 'o32' is not supported on CPU 'mips64r2'
 //
 // RUN: %clang -target mips64-linux-gnu -### -c %s 2>&1 \
-// RUN:   | FileCheck -check-prefix=MIPS64-DEF %s
-// MIPS64-DEF: "-target-cpu" "mips64r2"
-// MIPS64-DEF: "-target-abi" "n64"
+// RUN:   | FileCheck -check-prefix=MIPS64R2-N64 %s
+// RUN: %clang -target mips-img-linux-gnu -mips64r2 -### -c %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=MIPS64R2-N64 %s
+// RUN: %clang -target mips-mti-linux-gnu -mips64r2 -### -c %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=MIPS64R2-N64 %s
+// RUN: %clang -target mips-linux-gnu -mips64r2 -mabi=64 -### -c %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=MIPS64R2-N64 %s
+// MIPS64R2-N64: "-target-cpu" "mips64r2"
+// MIPS64R2-N64: "-target-abi" "n64"
+//
+// RUN: %clang -target mips64-linux-gnu -### -mips64r3 -c %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=MIPS64R3-N64 %s
+// RUN: %clang -target mips-img-linux-gnu -mips64r3 -### -c %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=MIPS64R3-N64 %s
+// RUN: %clang -target mips-mti-linux-gnu -mips64r3 -### -c %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=MIPS64R3-N64 %s
+// MIPS64R3-N64: "-target-cpu" "mips64r3"
+// MIPS64R3-N64: "-target-abi" "n64"
 //
 // RUN: %clang -target mips-linux-gnu -### -c %s \
 // RUN:        -mabi=32 2>&1 \
@@ -45,12 +69,6 @@
 // RUN:   | FileCheck -check-prefix=MIPS-ABI-O64 %s
 // MIPS-ABI-O64: error: unknown target ABI 'o64'
 //
-// RUN: %clang -target mips-linux-gnu -### -c %s \
-// RUN:        -mabi=eabi 2>&1 \
-// RUN:   | FileCheck -check-prefix=MIPS-ABI-EABI %s
-// MIPS-ABI-EABI: "-target-cpu" "mips32r2"
-// MIPS-ABI-EABI: "-target-abi" "eabi"
-//
 // RUN: not %clang -target mips-linux-gnu -c %s \
 // RUN:        -mabi=unknown 2>&1 \
 // RUN:   | FileCheck -check-prefix=MIPS-ABI-UNKNOWN %s
@@ -104,6 +122,11 @@
 // MIPS-ARCH-P5600: "-target-cpu" "p5600"
 // MIPS-ARCH-P5600: "-target-abi" "o32"
 //
+// RUN: not %clang -target mips-linux-gnu -c %s \
+// RUN:        -march=p5600 -mabi=64 2>&1 \
+// RUN:   | FileCheck -check-prefix=MIPS-ARCH-P5600-N64 %s
+// MIPS-ARCH-P5600-N64: error: ABI 'n64' is not supported on CPU 'p5600'
+//
 // RUN: %clang -target mips-linux-gnu -### -c %s \
 // RUN:        -march=mips64 2>&1 \
 // RUN:   | FileCheck -check-prefix=MIPS-ARCH-3264 %s
@@ -131,7 +154,7 @@
 // RUN: not %clang -target mips64-linux-gnu -c %s \
 // RUN:        -march=mips32 2>&1 \
 // RUN:   | FileCheck -check-prefix=MIPS-ARCH-6432 %s
-// MIPS-ARCH-6432: error: unknown target CPU 'mips32'
+// MIPS-ARCH-6432: error: ABI 'n64' is not supported on CPU 'mips32'
 //
 // RUN: not %clang -target mips-linux-gnu -c %s \
 // RUN:        -march=unknown 2>&1 \
diff --git a/test/Driver/mips-as.c b/test/Driver/mips-as.c
index 63fc64c3f74ab..4d956208f0642 100644
--- a/test/Driver/mips-as.c
+++ b/test/Driver/mips-as.c
@@ -30,11 +30,6 @@
 // RUN:   | FileCheck -check-prefix=MIPS64R2-DEF-EL-AS %s
 // MIPS64R2-DEF-EL-AS: as{{(.exe)?}}" "-march" "mips64r2" "-mabi" "64"  "-mno-shared" "-KPIC" "-EL"
 //
-// RUN: %clang -target mips-linux-gnu -mabi=eabi -### \
-// RUN:   -no-integrated-as -c %s 2>&1 \
-// RUN:   | FileCheck -check-prefix=MIPS-EABI %s
-// MIPS-EABI: as{{(.exe)?}}" "-march" "mips32r2" "-mabi" "eabi" "-mno-shared" "-call_nonpic" "-EB"
-//
 // RUN: %clang -target mips64-linux-gnu -mabi=n32 -### \
 // RUN:   -no-integrated-as -c %s 2>&1 \
 // RUN:   | FileCheck -check-prefix=MIPS-N32 %s
diff --git a/test/Driver/mips-cs.cpp b/test/Driver/mips-cs.cpp
index 62a90f0b9110f..bca2ab9fa26ff 100644
--- a/test/Driver/mips-cs.cpp
+++ b/test/Driver/mips-cs.cpp
@@ -3,7 +3,7 @@
 // = Big-endian, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mips-linux-gnu \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_cs_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-HF-32 %s
 // CHECK-BE-HF-32: "-internal-isystem"
 // CHECK-BE-HF-32: "[[TC:[^"]+/lib/gcc/mips-linux-gnu/4.6.3]]/../../../../mips-linux-gnu/include/c++/4.6.3"
@@ -31,7 +31,7 @@
 // = Big-endian, hard float, uclibc
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mips-linux-gnu -muclibc \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_cs_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-UC-HF-32 %s
 // CHECK-BE-UC-HF-32: "-internal-isystem"
 // CHECK-BE-UC-HF-32: "[[TC:[^"]+/lib/gcc/mips-linux-gnu/4.6.3]]/../../../../mips-linux-gnu/include/c++/4.6.3"
@@ -60,7 +60,7 @@
 // = Big-endian, hard float, mips16
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mips-linux-gnu -mips16 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_cs_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-HF-16 %s
 // CHECK-BE-HF-16: "-internal-isystem"
 // CHECK-BE-HF-16: "[[TC:[^"]+/lib/gcc/mips-linux-gnu/4.6.3]]/../../../../mips-linux-gnu/include/c++/4.6.3"
@@ -89,7 +89,7 @@
 // = Big-endian, hard float, mmicromips
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mips-linux-gnu -mmicromips \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_cs_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-HF-MICRO %s
 // CHECK-BE-HF-MICRO: "-internal-isystem"
 // CHECK-BE-HF-MICRO: "[[TC:[^"]+/lib/gcc/mips-linux-gnu/4.6.3]]/../../../../mips-linux-gnu/include/c++/4.6.3"
@@ -118,7 +118,7 @@
 // = Big-endian, hard float, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mips-linux-gnu -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_cs_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-HF-NAN %s
 // CHECK-BE-HF-NAN: "-internal-isystem"
 // CHECK-BE-HF-NAN: "[[TC:[^"]+/lib/gcc/mips-linux-gnu/4.6.3]]/../../../../mips-linux-gnu/include/c++/4.6.3"
@@ -147,7 +147,7 @@
 // = Big-endian, hard float, uclibc, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mips-linux-gnu -muclibc -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_cs_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-UC-HF-NAN %s
 // CHECK-BE-UC-HF-NAN: "-internal-isystem"
 // CHECK-BE-UC-HF-NAN: "[[TC:[^"]+/lib/gcc/mips-linux-gnu/4.6.3]]/../../../../mips-linux-gnu/include/c++/4.6.3"
@@ -176,7 +176,7 @@
 // = Big-endian, soft float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mips-linux-gnu -msoft-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_cs_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-SF-32 %s
 // CHECK-BE-SF-32: "-internal-isystem"
 // CHECK-BE-SF-32: "[[TC:[^"]+/lib/gcc/mips-linux-gnu/4.6.3]]/../../../../mips-linux-gnu/include/c++/4.6.3"
@@ -205,7 +205,7 @@
 // = Big-endian, soft float, uclibc
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mips-linux-gnu -muclibc -msoft-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_cs_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-UC-SF-32 %s
 // CHECK-BE-UC-SF-32: "-internal-isystem"
 // CHECK-BE-UC-SF-32: "[[TC:[^"]+/lib/gcc/mips-linux-gnu/4.6.3]]/../../../../mips-linux-gnu/include/c++/4.6.3"
@@ -234,7 +234,7 @@
 // = Big-endian, soft float, mips16
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mips-linux-gnu -msoft-float -mips16 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_cs_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-SF-16 %s
 // CHECK-BE-SF-16: "-internal-isystem"
 // CHECK-BE-SF-16: "[[TC:[^"]+/lib/gcc/mips-linux-gnu/4.6.3]]/../../../../mips-linux-gnu/include/c++/4.6.3"
@@ -263,7 +263,7 @@
 // = Big-endian, soft float, micromips
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mips-linux-gnu -msoft-float -mmicromips \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_cs_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-SF-MICRO %s
 // CHECK-BE-SF-MICRO: "-internal-isystem"
 // CHECK-BE-SF-MICRO: "[[TC:[^"]+/lib/gcc/mips-linux-gnu/4.6.3]]/../../../../mips-linux-gnu/include/c++/4.6.3"
@@ -292,7 +292,7 @@
 // = Big-endian, hard float, 64-bit
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mips64-linux-gnu \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_cs_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-HF-64 %s
 // CHECK-BE-HF-64: "-internal-isystem"
 // CHECK-BE-HF-64: "[[TC:[^"]+/lib/gcc/mips-linux-gnu/4.6.3]]/../../../../mips-linux-gnu/include/c++/4.6.3"
@@ -321,7 +321,7 @@
 // = Big-endian, soft float, 64-bit
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mips64-linux-gnu -msoft-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_cs_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-SF-64 %s
 // CHECK-BE-SF-64: "-internal-isystem"
 // CHECK-BE-SF-64: "[[TC:[^"]+/lib/gcc/mips-linux-gnu/4.6.3]]/../../../../mips-linux-gnu/include/c++/4.6.3"
@@ -350,7 +350,7 @@
 // = Little-endian, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mipsel-linux-gnu -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_cs_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-HF-32 %s
 // CHECK-EL-HF-32: "-internal-isystem"
 // CHECK-EL-HF-32: "[[TC:[^"]+/lib/gcc/mips-linux-gnu/4.6.3]]/../../../../mips-linux-gnu/include/c++/4.6.3"
@@ -379,7 +379,7 @@
 // = Little-endian, hard float, uclibc
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mipsel-linux-gnu -mhard-float -muclibc \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_cs_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-UC-HF-32 %s
 // CHECK-EL-UC-HF-32: "-internal-isystem"
 // CHECK-EL-UC-HF-32: "[[TC:[^"]+/lib/gcc/mips-linux-gnu/4.6.3]]/../../../../mips-linux-gnu/include/c++/4.6.3"
@@ -408,7 +408,7 @@
 // = Little-endian, hard float, mips16
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mipsel-linux-gnu -mips16 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_cs_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-HF-16 %s
 // CHECK-EL-HF-16: "-internal-isystem"
 // CHECK-EL-HF-16: "[[TC:[^"]+/lib/gcc/mips-linux-gnu/4.6.3]]/../../../../mips-linux-gnu/include/c++/4.6.3"
@@ -437,7 +437,7 @@
 // = Little-endian, hard float, micromips
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mipsel-linux-gnu -mmicromips \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_cs_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-HF-MICRO %s
 // CHECK-EL-HF-MICRO: "-internal-isystem"
 // CHECK-EL-HF-MICRO: "[[TC:[^"]+/lib/gcc/mips-linux-gnu/4.6.3]]/../../../../mips-linux-gnu/include/c++/4.6.3"
@@ -466,7 +466,7 @@
 // = Little-endian, hard float, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mipsel-linux-gnu -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_cs_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-HF-NAN %s
 // CHECK-EL-HF-NAN: "-internal-isystem"
 // CHECK-EL-HF-NAN: "[[TC:[^"]+/lib/gcc/mips-linux-gnu/4.6.3]]/../../../../mips-linux-gnu/include/c++/4.6.3"
@@ -495,7 +495,7 @@
 // = Little-endian, hard float, uclibc, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mipsel-linux-gnu -muclibc -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_cs_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-UC-HF-NAN %s
 // CHECK-EL-UC-HF-NAN: "-internal-isystem"
 // CHECK-EL-UC-HF-NAN: "[[TC:[^"]+/lib/gcc/mips-linux-gnu/4.6.3]]/../../../../mips-linux-gnu/include/c++/4.6.3"
@@ -524,7 +524,7 @@
 // = Little-endian, soft float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mipsel-linux-gnu -mfloat-abi=soft \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_cs_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-SF-32 %s
 // CHECK-EL-SF-32: "-internal-isystem"
 // CHECK-EL-SF-32: "[[TC:[^"]+/lib/gcc/mips-linux-gnu/4.6.3]]/../../../../mips-linux-gnu/include/c++/4.6.3"
@@ -553,7 +553,7 @@
 // = Little-endian, soft float, uclibc
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mipsel-linux-gnu -mfloat-abi=soft -muclibc \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_cs_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-UC-SF-32 %s
 // CHECK-EL-UC-SF-32: "-internal-isystem"
 // CHECK-EL-UC-SF-32: "[[TC:[^"]+/lib/gcc/mips-linux-gnu/4.6.3]]/../../../../mips-linux-gnu/include/c++/4.6.3"
@@ -582,7 +582,7 @@
 // = Little-endian, soft float, mips16
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mipsel-linux-gnu -mips16 -msoft-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_cs_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-SF-16 %s
 // CHECK-EL-SF-16: "-internal-isystem"
 // CHECK-EL-SF-16: "[[TC:[^"]+/lib/gcc/mips-linux-gnu/4.6.3]]/../../../../mips-linux-gnu/include/c++/4.6.3"
@@ -611,7 +611,7 @@
 // = Little-endian, soft float, micromips
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mipsel-linux-gnu -mmicromips -msoft-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_cs_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-SF-MICRO %s
 // CHECK-EL-SF-MICRO: "-internal-isystem"
 // CHECK-EL-SF-MICRO: "[[TC:[^"]+/lib/gcc/mips-linux-gnu/4.6.3]]/../../../../mips-linux-gnu/include/c++/4.6.3"
@@ -640,7 +640,7 @@
 // = Little-endian, hard float, 64-bit
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mips64el-linux-gnu \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_cs_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-HF-64 %s
 // CHECK-EL-HF-64: "-internal-isystem"
 // CHECK-EL-HF-64: "[[TC:[^"]+/lib/gcc/mips-linux-gnu/4.6.3]]/../../../../mips-linux-gnu/include/c++/4.6.3"
@@ -669,7 +669,7 @@
 // = Little-endian, soft float, 64-bit
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mips64el-linux-gnu -msoft-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_cs_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_cs_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-SF-64 %s
 // CHECK-EL-SF-64: "-internal-isystem"
 // CHECK-EL-SF-64: "[[TC:[^"]+/lib/gcc/mips-linux-gnu/4.6.3]]/../../../../mips-linux-gnu/include/c++/4.6.3"
diff --git a/test/Driver/mips-features.c b/test/Driver/mips-features.c
index 461d778182bed..69fc20e1f245e 100644
--- a/test/Driver/mips-features.c
+++ b/test/Driver/mips-features.c
@@ -116,6 +116,24 @@
 // RUN:   | FileCheck --check-prefix=CHECK-NANLEGACY %s
 // CHECK-NANLEGACY: "-target-feature" "-nan2008"
 //
+// -mcompact-branches=never
+// RUN: %clang -target mips-linux-gnu -march=mips32r6 -### -c %s \
+// RUN:     -mcompact-branches=never 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-CBNEVER %s
+// CHECK-CBNEVER: "-mllvm" "-mips-compact-branches=never"
+//
+// -mcompact-branches=optimal
+// RUN: %clang -target mips-linux-gnu -march=mips32r6 -### -c %s \
+// RUN:     -mcompact-branches=optimal 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-CBOPTIMAL %s
+// CHECK-CBOPTIMAL: "-mllvm" "-mips-compact-branches=optimal"
+//
+// -mcompact-branches=always
+// RUN: %clang -target mips-linux-gnu -march=mips32r6 -### -c %s \
+// RUN:     -mcompact-branches=always 2>&1 \
+// RUN:   | FileCheck --check-prefix=CHECK-CBALWAYS %s
+// CHECK-CBALWAYS: "-mllvm" "-mips-compact-branches=always"
+//
 // -mxgot
 // RUN: %clang -target mips-linux-gnu -### -c %s \
 // RUN:     -mno-xgot -mxgot 2>&1 \
diff --git a/test/Driver/mips-fsf.cpp b/test/Driver/mips-fsf.cpp
index e39b24e4ab2ca..68ee490a88ac6 100644
--- a/test/Driver/mips-fsf.cpp
+++ b/test/Driver/mips-fsf.cpp
@@ -2,8 +2,8 @@
 //
 // = Big-endian, mips32, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mips32 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mips32 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-HF-32 %s
 // CHECK-BE-HF-32: "-internal-isystem"
 // CHECK-BE-HF-32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -29,8 +29,8 @@
 //
 // = Big-endian, mips32, hard float, fp64
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mips32 -mfp64 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mips32 -mfp64 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-HF64-32 %s
 // CHECK-BE-HF64-32: "-internal-isystem"
 // CHECK-BE-HF64-32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -56,8 +56,8 @@
 //
 // = Big-endian, mips32, soft float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mips32 -msoft-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mips32 -msoft-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-SF-32 %s
 // CHECK-BE-SF-32: "-internal-isystem"
 // CHECK-BE-SF-32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -83,8 +83,8 @@
 //
 // = Big-endian, mips16 / mips32, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mips32 -mips16 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mips32 -mips16 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-HF-16 %s
 // CHECK-BE-HF-16: "-internal-isystem"
 // CHECK-BE-HF-16: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -110,8 +110,8 @@
 //
 // = Big-endian, mips16 / mips32, hard float, fp64
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mips32 -mips16 -mfp64 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mips32 -mips16 -mfp64 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-HF64-16 %s
 // CHECK-BE-HF64-16: "-internal-isystem"
 // CHECK-BE-HF64-16: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -137,8 +137,8 @@
 //
 // = Big-endian, mips16 / mips32, soft float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mips32 -mips16 -msoft-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mips32 -mips16 -msoft-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-SF-16 %s
 // CHECK-BE-SF-16: "-internal-isystem"
 // CHECK-BE-SF-16: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -164,8 +164,8 @@
 //
 // = Big-endian, mips32 / mips16, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mips32 -mips16 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mips32 -mips16 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-NAN-16 %s
 // CHECK-BE-NAN-16: "-internal-isystem"
 // CHECK-BE-NAN-16: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -191,8 +191,8 @@
 //
 // = Big-endian, mips32 / mips16, fp64, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mips32 -mips16 -mfp64 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mips32 -mips16 -mfp64 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-NAN64-16 %s
 // CHECK-BE-NAN64-16: "-internal-isystem"
 // CHECK-BE-NAN64-16: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -218,8 +218,8 @@
 //
 // = Big-endian, mips32, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mips32 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mips32 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-NAN-32 %s
 // CHECK-BE-NAN-32: "-internal-isystem"
 // CHECK-BE-NAN-32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -245,8 +245,8 @@
 //
 // = Big-endian, mips32, fp64, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mips32 -mfp64 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mips32 -mfp64 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-NAN64-32 %s
 // CHECK-BE-NAN64-32: "-internal-isystem"
 // CHECK-BE-NAN64-32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -272,8 +272,8 @@
 //
 // = Big-endian, mips32r2, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mips32r2 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mips32r2 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-HF-32R2 %s
 // CHECK-BE-HF-32R2: "-internal-isystem"
 // CHECK-BE-HF-32R2: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -299,8 +299,8 @@
 //
 // = Big-endian, mips32r2, hard float, uclibc
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mips32r2 -mhard-float -muclibc \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mips32r2 -mhard-float -muclibc \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-UC-HF-32R2 %s
 // CHECK-BE-UC-HF-32R2: "-internal-isystem"
 // CHECK-BE-UC-HF-32R2: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -326,8 +326,8 @@
 //
 // = Big-endian, mips32r2, fp64, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mips32r2 -mfp64 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mips32r2 -mfp64 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-HF64-32R2 %s
 // CHECK-BE-HF64-32R2: "-internal-isystem"
 // CHECK-BE-HF64-32R2: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -353,8 +353,8 @@
 //
 // = Big-endian, mips32r2, soft float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mips32r2 -msoft-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mips32r2 -msoft-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-SF-32R2 %s
 // CHECK-BE-SF-32R2: "-internal-isystem"
 // CHECK-BE-SF-32R2: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -380,8 +380,8 @@
 //
 // = Big-endian, mips32r2, soft float, uclibc
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mips32r2 -msoft-float -muclibc \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mips32r2 -msoft-float -muclibc \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-UC-SF-32R2 %s
 // CHECK-BE-UC-SF-32R2: "-internal-isystem"
 // CHECK-BE-UC-SF-32R2: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -407,8 +407,8 @@
 //
 // = Big-endian, mips32r2 / mips16, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mips32r2 -mips16 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mips32r2 -mips16 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-HF-16R2 %s
 // CHECK-BE-HF-16R2: "-internal-isystem"
 // CHECK-BE-HF-16R2: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -434,8 +434,8 @@
 //
 // = Big-endian, mips32r2 / mips16, fp64, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mips32r2 -mips16 -mfp64 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mips32r2 -mips16 -mfp64 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-HF64-16R2 %s
 // CHECK-BE-HF64-16R2: "-internal-isystem"
 // CHECK-BE-HF64-16R2: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -461,8 +461,8 @@
 //
 // = Big-endian, mips32r2 / mips16, soft float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mips32r2 -mips16 -msoft-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mips32r2 -mips16 -msoft-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-SF-16R2 %s
 // CHECK-BE-SF-16R2: "-internal-isystem"
 // CHECK-BE-SF-16R2: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -488,8 +488,8 @@
 //
 // = Big-endian, mips32r2 / mips16, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mips32r2 -mips16 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mips32r2 -mips16 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-NAN-16R2 %s
 // CHECK-BE-NAN-16R2: "-internal-isystem"
 // CHECK-BE-NAN-16R2: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -515,8 +515,8 @@
 //
 // = Big-endian, mips32r2 / mips16, fp64, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mips32r2 -mips16 -mfp64 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mips32r2 -mips16 -mfp64 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-NAN64-16R2 %s
 // CHECK-BE-NAN64-16R2: "-internal-isystem"
 // CHECK-BE-NAN64-16R2: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -542,8 +542,8 @@
 //
 // = Big-endian, mips32r2, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mips32r2 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mips32r2 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-NAN-32R2 %s
 // CHECK-BE-NAN-32R2: "-internal-isystem"
 // CHECK-BE-NAN-32R2: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -569,8 +569,8 @@
 //
 // = Big-endian, mips32r2, nan2008, uclibc
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mips32r2 -mnan=2008 -muclibc \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mips32r2 -mnan=2008 -muclibc \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-UC-NAN-32R2 %s
 // CHECK-BE-UC-NAN-32R2: "-internal-isystem"
 // CHECK-BE-UC-NAN-32R2: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -596,8 +596,8 @@
 //
 // = Big-endian, mips32r2, fp64, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mips32r2 -mfp64 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mips32r2 -mfp64 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-NAN64-32R2 %s
 // CHECK-BE-NAN64-32R2: "-internal-isystem"
 // CHECK-BE-NAN64-32R2: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -623,8 +623,8 @@
 //
 // = Big-endian, default (mips32r2), fp64, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mfp64 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mfp64 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-NAN64-32R2-DEF %s
 // CHECK-BE-NAN64-32R2-DEF: "-internal-isystem"
 // CHECK-BE-NAN64-32R2-DEF: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -650,8 +650,8 @@
 //
 // = Big-endian, micromips, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mmicromips -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mmicromips -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-HF-MM %s
 // CHECK-BE-HF-MM: "-internal-isystem"
 // CHECK-BE-HF-MM: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -677,8 +677,8 @@
 //
 // = Big-endian, micromips, fp64, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mmicromips -mfp64 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mmicromips -mfp64 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-HF64-MM %s
 // CHECK-BE-HF64-MM: "-internal-isystem"
 // CHECK-BE-HF64-MM: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -704,8 +704,8 @@
 //
 // = Big-endian, micromips, soft float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mmicromips -msoft-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mmicromips -msoft-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-SF-MM %s
 // CHECK-BE-SF-MM: "-internal-isystem"
 // CHECK-BE-SF-MM: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -731,8 +731,8 @@
 //
 // = Big-endian, micromips, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mmicromips -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mmicromips -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-NAN-MM %s
 // CHECK-BE-NAN-MM: "-internal-isystem"
 // CHECK-BE-NAN-MM: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -758,8 +758,8 @@
 //
 // = Big-endian, micromips, fp64, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mmicromips -mfp64 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mmicromips -mfp64 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-NAN64-MM %s
 // CHECK-BE-NAN64-MM: "-internal-isystem"
 // CHECK-BE-NAN64-MM: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -785,8 +785,8 @@
 //
 // = Big-endian, mips64, ABI n32, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64-linux-gnu -mips64 -mabi=n32 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64-mti-linux-gnu -mips64 -mabi=n32 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-HF-64-N32 %s
 // CHECK-BE-HF-64-N32: "-internal-isystem"
 // CHECK-BE-HF-64-N32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -812,8 +812,8 @@
 //
 // = Big-endian, mips64, ABI n32, fp64, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64-linux-gnu -mips64 -mabi=n32 -mfp64 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64-mti-linux-gnu -mips64 -mabi=n32 -mfp64 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-HF64-64-N32 %s
 // CHECK-BE-HF64-64-N32: "-internal-isystem"
 // CHECK-BE-HF64-64-N32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -839,8 +839,8 @@
 //
 // = Big-endian, mips64, ABI n32, soft float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64-linux-gnu -mips64 -mabi=n32 -msoft-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64-mti-linux-gnu -mips64 -mabi=n32 -msoft-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-SF-64-N32 %s
 // CHECK-BE-SF-64-N32: "-internal-isystem"
 // CHECK-BE-SF-64-N32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -866,8 +866,8 @@
 //
 // = Big-endian, mips64, ABI n32, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64-linux-gnu -mips64 -mabi=n32 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64-mti-linux-gnu -mips64 -mabi=n32 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-NAN-64-N32 %s
 // CHECK-BE-NAN-64-N32: "-internal-isystem"
 // CHECK-BE-NAN-64-N32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -893,8 +893,8 @@
 //
 // = Big-endian, mips64, ABI n32, fp64, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64-linux-gnu -mips64 -mabi=n32 -mfp64 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64-mti-linux-gnu -mips64 -mabi=n32 -mfp64 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-NAN64-64-N32 %s
 // CHECK-BE-NAN64-64-N32: "-internal-isystem"
 // CHECK-BE-NAN64-64-N32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -920,8 +920,8 @@
 //
 // = Big-endian, mips64, ABI 64, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64-linux-gnu -mips64 -mabi=64 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64-mti-linux-gnu -mips64 -mabi=64 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-HF-64-64 %s
 // CHECK-BE-HF-64-64: "-internal-isystem"
 // CHECK-BE-HF-64-64: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -947,8 +947,8 @@
 //
 // = Big-endian, mips64, ABI 64, fp64, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64-linux-gnu -mips64 -mabi=64 -mfp64 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64-mti-linux-gnu -mips64 -mabi=64 -mfp64 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-HF64-64-64 %s
 // CHECK-BE-HF64-64-64: "-internal-isystem"
 // CHECK-BE-HF64-64-64: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -974,8 +974,8 @@
 //
 // = Big-endian, mips64, ABI 64, soft float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64-linux-gnu -mips64 -mabi=64 -msoft-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64-mti-linux-gnu -mips64 -mabi=64 -msoft-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-SF-64-64 %s
 // CHECK-BE-SF-64-64: "-internal-isystem"
 // CHECK-BE-SF-64-64: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1001,8 +1001,8 @@
 //
 // = Big-endian, mips64, ABI 64, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64-linux-gnu -mips64 -mabi=64 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64-mti-linux-gnu -mips64 -mabi=64 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-NAN-64-64 %s
 // CHECK-BE-NAN-64-64: "-internal-isystem"
 // CHECK-BE-NAN-64-64: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1028,8 +1028,8 @@
 //
 // = Big-endian, mips64, ABI 64, fp64, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64-linux-gnu -mips64 -mabi=64 -mfp64 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64-mti-linux-gnu -mips64 -mabi=64 -mfp64 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-NAN64-64-64 %s
 // CHECK-BE-NAN64-64-64: "-internal-isystem"
 // CHECK-BE-NAN64-64-64: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1055,8 +1055,8 @@
 //
 // = Big-endian, mips64r2, ABI n32, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64-linux-gnu -mips64r2 -mabi=n32 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64-mti-linux-gnu -mips64r2 -mabi=n32 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-HF-64R2-N32 %s
 // CHECK-BE-HF-64R2-N32: "-internal-isystem"
 // CHECK-BE-HF-64R2-N32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1082,8 +1082,8 @@
 //
 // = Big-endian, mips64r2, ABI n32, fp64, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64-linux-gnu -mips64r2 -mabi=n32 -mfp64 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64-mti-linux-gnu -mips64r2 -mabi=n32 -mfp64 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-HF64-64R2-N32 %s
 // CHECK-BE-HF64-64R2-N32: "-internal-isystem"
 // CHECK-BE-HF64-64R2-N32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1109,8 +1109,8 @@
 //
 // = Big-endian, mips64r2, ABI n32, soft float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64-linux-gnu -mips64r2 -mabi=n32 -msoft-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64-mti-linux-gnu -mips64r2 -mabi=n32 -msoft-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-SF-64R2-N32 %s
 // CHECK-BE-SF-64R2-N32: "-internal-isystem"
 // CHECK-BE-SF-64R2-N32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1136,8 +1136,8 @@
 //
 // = Big-endian, mips64r2, ABI n32, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64-linux-gnu -mips64r2 -mabi=n32 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64-mti-linux-gnu -mips64r2 -mabi=n32 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-NAN-64R2-N32 %s
 // CHECK-BE-NAN-64R2-N32: "-internal-isystem"
 // CHECK-BE-NAN-64R2-N32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1163,8 +1163,8 @@
 //
 // = Big-endian, mips64r2, ABI n32, fp64, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64-linux-gnu -mips64r2 -mabi=n32 -mfp64 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64-mti-linux-gnu -mips64r2 -mabi=n32 -mfp64 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-NAN64-64R2-N32 %s
 // CHECK-BE-NAN64-64R2-N32: "-internal-isystem"
 // CHECK-BE-NAN64-64R2-N32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1190,8 +1190,8 @@
 //
 // = Big-endian, mips64r2, ABI 64, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64-linux-gnu -mips64r2 -mabi=64 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64-mti-linux-gnu -mips64r2 -mabi=64 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-HF-64R2-64 %s
 // CHECK-BE-HF-64R2-64: "-internal-isystem"
 // CHECK-BE-HF-64R2-64: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1217,8 +1217,8 @@
 //
 // = Big-endian, mips64r2, ABI 64, fp64, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64-linux-gnu -mips64r2 -mabi=64 -mfp64 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64-mti-linux-gnu -mips64r2 -mabi=64 -mfp64 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-HF64-64R2-64 %s
 // CHECK-BE-HF64-64R2-64: "-internal-isystem"
 // CHECK-BE-HF64-64R2-64: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1244,8 +1244,8 @@
 //
 // = Big-endian, mips64r2, ABI 64, soft float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64-linux-gnu -mips64r2 -mabi=64 -msoft-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64-mti-linux-gnu -mips64r2 -mabi=64 -msoft-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-SF-64R2-64 %s
 // CHECK-BE-SF-64R2-64: "-internal-isystem"
 // CHECK-BE-SF-64R2-64: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1271,8 +1271,8 @@
 //
 // = Big-endian, mips64r2, ABI 64, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64-linux-gnu -mips64r2 -mabi=64 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64-mti-linux-gnu -mips64r2 -mabi=64 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-NAN-64R2-64 %s
 // CHECK-BE-NAN-64R2-64: "-internal-isystem"
 // CHECK-BE-NAN-64R2-64: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1298,8 +1298,8 @@
 //
 // = Big-endian, mips64r2, ABI 64, fp64, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64-linux-gnu -mips64r2 -mabi=64 -mfp64 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64-mti-linux-gnu -mips64r2 -mabi=64 -mfp64 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-NAN64-64R2-64 %s
 // CHECK-BE-NAN64-64R2-64: "-internal-isystem"
 // CHECK-BE-NAN64-64R2-64: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1325,8 +1325,8 @@
 //
 // = Big-endian, default (mips64r2), ABI 64, fp64, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64-linux-gnu -mabi=64 -mfp64 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64-mti-linux-gnu -mabi=64 -mfp64 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-NAN64-64R2-64-DEF %s
 // CHECK-BE-NAN64-64R2-64-DEF: "-internal-isystem"
 // CHECK-BE-NAN64-64R2-64-DEF: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1352,8 +1352,8 @@
 //
 // = Little-endian, mips32, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-gnu -mips32 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mipsel-mti-linux-gnu -mips32 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-HF-32 %s
 // CHECK-EL-HF-32: "-internal-isystem"
 // CHECK-EL-HF-32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1379,8 +1379,8 @@
 //
 // = Little-endian, mips32, fp64, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-gnu -mips32 -mfp64 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mipsel-mti-linux-gnu -mips32 -mfp64 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-HF64-32 %s
 // CHECK-EL-HF64-32: "-internal-isystem"
 // CHECK-EL-HF64-32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1406,8 +1406,8 @@
 //
 // = Little-endian, mips32, soft float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-gnu -mips32 -msoft-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mipsel-mti-linux-gnu -mips32 -msoft-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-SF-32 %s
 // CHECK-EL-SF-32: "-internal-isystem"
 // CHECK-EL-SF-32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1433,8 +1433,8 @@
 //
 // = Little-endian, mips32 / mips16, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-gnu -mips32 -mips16 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mipsel-mti-linux-gnu -mips32 -mips16 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-HF-16 %s
 // CHECK-EL-HF-16: "-internal-isystem"
 // CHECK-EL-HF-16: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1460,8 +1460,8 @@
 //
 // = Little-endian, mips32 / mips16, fp64, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-gnu -mips32 -mips16 -mfp64 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mipsel-mti-linux-gnu -mips32 -mips16 -mfp64 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-HF64-16 %s
 // CHECK-EL-HF64-16: "-internal-isystem"
 // CHECK-EL-HF64-16: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1487,8 +1487,8 @@
 //
 // = Little-endian, mips32 / mips16, soft float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-gnu -mips32 -mips16 -msoft-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mipsel-mti-linux-gnu -mips32 -mips16 -msoft-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-SF-16 %s
 // CHECK-EL-SF-16: "-internal-isystem"
 // CHECK-EL-SF-16: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1514,8 +1514,8 @@
 //
 // = Little-endian, mips32 / mips16, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-gnu -mips32 -mips16 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mipsel-mti-linux-gnu -mips32 -mips16 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-NAN-16 %s
 // CHECK-EL-NAN-16: "-internal-isystem"
 // CHECK-EL-NAN-16: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1541,8 +1541,8 @@
 //
 // = Little-endian, mips32 / mips16, fp64, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-gnu -mips32 -mips16 -mfp64 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mipsel-mti-linux-gnu -mips32 -mips16 -mfp64 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-NAN64-16 %s
 // CHECK-EL-NAN64-16: "-internal-isystem"
 // CHECK-EL-NAN64-16: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1568,8 +1568,8 @@
 //
 // = Little-endian, mips32, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-gnu -mips32 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mipsel-mti-linux-gnu -mips32 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-NAN-32 %s
 // CHECK-EL-NAN-32: "-internal-isystem"
 // CHECK-EL-NAN-32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1595,8 +1595,8 @@
 //
 // = Little-endian, mips32, fp64, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-gnu -mips32 -mfp64 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mipsel-mti-linux-gnu -mips32 -mfp64 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-NAN64-32 %s
 // CHECK-EL-NAN64-32: "-internal-isystem"
 // CHECK-EL-NAN64-32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1622,8 +1622,8 @@
 //
 // = Little-endian, mips32r2, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-gnu -mips32r2 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mipsel-mti-linux-gnu -mips32r2 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-HF-32R2 %s
 // CHECK-EL-HF-32R2: "-internal-isystem"
 // CHECK-EL-HF-32R2: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1649,8 +1649,8 @@
 //
 // = Little-endian, mips32r2, hard float, uclibc
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-gnu -mips32r2 -mhard-float -muclibc \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mipsel-mti-linux-gnu -mips32r2 -mhard-float -muclibc \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-UC-HF-32R2 %s
 // CHECK-EL-UC-HF-32R2: "-internal-isystem"
 // CHECK-EL-UC-HF-32R2: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1676,8 +1676,8 @@
 //
 // = Little-endian, mips32r2, fp64, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-gnu -mips32r2 -mfp64 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mipsel-mti-linux-gnu -mips32r2 -mfp64 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-HF64-32R2 %s
 // CHECK-EL-HF64-32R2: "-internal-isystem"
 // CHECK-EL-HF64-32R2: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1703,8 +1703,8 @@
 //
 // = Little-endian, mips32r2, soft float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-gnu -mips32r2 -msoft-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mipsel-mti-linux-gnu -mips32r2 -msoft-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-SF-32R2 %s
 // CHECK-EL-SF-32R2: "-internal-isystem"
 // CHECK-EL-SF-32R2: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1730,8 +1730,8 @@
 //
 // = Little-endian, mips32r2, soft float, uclibc
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-gnu -mips32r2 -msoft-float -muclibc \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mipsel-mti-linux-gnu -mips32r2 -msoft-float -muclibc \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-UC-SF-32R2 %s
 // CHECK-EL-UC-SF-32R2: "-internal-isystem"
 // CHECK-EL-UC-SF-32R2: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1757,8 +1757,8 @@
 //
 // = Little-endian, mips32r2 / mips16, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-gnu -mips32r2 -mips16 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mipsel-mti-linux-gnu -mips32r2 -mips16 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-HF-16R2 %s
 // CHECK-EL-HF-16R2: "-internal-isystem"
 // CHECK-EL-HF-16R2: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1784,8 +1784,8 @@
 //
 // = Little-endian, mips32r2 / mips16, fp64, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-gnu -mips32r2 -mips16 -mfp64 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mipsel-mti-linux-gnu -mips32r2 -mips16 -mfp64 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-HF64-16R2 %s
 // CHECK-EL-HF64-16R2: "-internal-isystem"
 // CHECK-EL-HF64-16R2: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1811,8 +1811,8 @@
 //
 // = Little-endian, mips32r2 / mips16, soft float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-gnu -mips32r2 -mips16 -msoft-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mipsel-mti-linux-gnu -mips32r2 -mips16 -msoft-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-SF-16R2 %s
 // CHECK-EL-SF-16R2: "-internal-isystem"
 // CHECK-EL-SF-16R2: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1838,8 +1838,8 @@
 //
 // = Little-endian, mips32r2 / mips16, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-gnu -mips32r2 -mips16 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mipsel-mti-linux-gnu -mips32r2 -mips16 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-NAN-16R2 %s
 // CHECK-EL-NAN-16R2: "-internal-isystem"
 // CHECK-EL-NAN-16R2: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1865,8 +1865,8 @@
 //
 // = Little-endian, mips32r2 / mips16, fp64, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-gnu -mips32r2 -mips16 -mfp64 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mipsel-mti-linux-gnu -mips32r2 -mips16 -mfp64 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-NAN64-16R2 %s
 // CHECK-EL-NAN64-16R2: "-internal-isystem"
 // CHECK-EL-NAN64-16R2: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1892,8 +1892,8 @@
 //
 // = Little-endian, mips32r2, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-gnu -mips32r2 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mipsel-mti-linux-gnu -mips32r2 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-NAN-32R2 %s
 // CHECK-EL-NAN-32R2: "-internal-isystem"
 // CHECK-EL-NAN-32R2: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1919,8 +1919,8 @@
 //
 // = Little-endian, mips32r2, nan2008, uclibc
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-gnu -mips32r2 -mnan=2008 -muclibc \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mipsel-mti-linux-gnu -mips32r2 -mnan=2008 -muclibc \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-UC-NAN-32R2 %s
 // CHECK-EL-UC-NAN-32R2: "-internal-isystem"
 // CHECK-EL-UC-NAN-32R2: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1946,8 +1946,8 @@
 //
 // = Little-endian, mips32r2, fp64, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-gnu -mips32r2 -mfp64 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mipsel-mti-linux-gnu -mips32r2 -mfp64 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-NAN64-32R2 %s
 // CHECK-EL-NAN64-32R2: "-internal-isystem"
 // CHECK-EL-NAN64-32R2: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -1973,8 +1973,8 @@
 //
 // = Little-endian, default (mips32r2), fp64, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-gnu -mfp64 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mipsel-mti-linux-gnu -mfp64 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-NAN64-32R2-DEF %s
 // CHECK-EL-NAN64-32R2-DEF: "-internal-isystem"
 // CHECK-EL-NAN64-32R2-DEF: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2000,8 +2000,8 @@
 //
 // = Little-endian, micromips, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-gnu -mmicromips -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mipsel-mti-linux-gnu -mmicromips -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-HF-MM %s
 // CHECK-EL-HF-MM: "-internal-isystem"
 // CHECK-EL-HF-MM: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2027,8 +2027,8 @@
 //
 // = Little-endian, micromips, fp64, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-gnu -mmicromips -mfp64 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mipsel-mti-linux-gnu -mmicromips -mfp64 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-HF64-MM %s
 // CHECK-EL-HF64-MM: "-internal-isystem"
 // CHECK-EL-HF64-MM: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2054,8 +2054,8 @@
 //
 // = Little-endian, micromips, soft float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-gnu -mmicromips -msoft-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mipsel-mti-linux-gnu -mmicromips -msoft-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-SF-MM %s
 // CHECK-EL-SF-MM: "-internal-isystem"
 // CHECK-EL-SF-MM: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2081,8 +2081,8 @@
 //
 // = Little-endian, micromips, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-gnu -mmicromips -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mipsel-mti-linux-gnu -mmicromips -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-NAN-MM %s
 // CHECK-EL-NAN-MM: "-internal-isystem"
 // CHECK-EL-NAN-MM: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2108,8 +2108,8 @@
 //
 // = Little-endian, micromips, fp64, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mipsel-linux-gnu -mmicromips -mfp64 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mipsel-mti-linux-gnu -mmicromips -mfp64 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-NAN64-MM %s
 // CHECK-EL-NAN64-MM: "-internal-isystem"
 // CHECK-EL-NAN64-MM: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2135,8 +2135,8 @@
 //
 // = Little-endian, mips64, ABI n32, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64el-linux-gnu -mips64 -mabi=n32 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64el-mti-linux-gnu -mips64 -mabi=n32 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-HF-64-N32 %s
 // CHECK-EL-HF-64-N32: "-internal-isystem"
 // CHECK-EL-HF-64-N32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2162,8 +2162,8 @@
 //
 // = Little-endian, mips64, ABI n32, fp64, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64el-linux-gnu -mips64 -mabi=n32 -mfp64 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64el-mti-linux-gnu -mips64 -mabi=n32 -mfp64 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-HF64-64-N32 %s
 // CHECK-EL-HF64-64-N32: "-internal-isystem"
 // CHECK-EL-HF64-64-N32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2189,8 +2189,8 @@
 //
 // = Little-endian, mips64, ABI n32, soft float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64el-linux-gnu -mips64 -mabi=n32 -msoft-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64el-mti-linux-gnu -mips64 -mabi=n32 -msoft-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-SF-64-N32 %s
 // CHECK-EL-SF-64-N32: "-internal-isystem"
 // CHECK-EL-SF-64-N32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2216,8 +2216,8 @@
 //
 // = Little-endian, mips64, ABI n32, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64el-linux-gnu -mips64 -mabi=n32 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64el-mti-linux-gnu -mips64 -mabi=n32 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-NAN-64-N32 %s
 // CHECK-EL-NAN-64-N32: "-internal-isystem"
 // CHECK-EL-NAN-64-N32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2243,8 +2243,8 @@
 //
 // = Little-endian, mips64, ABI n32, fp64, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64el-linux-gnu -mips64 -mabi=n32 -mfp64 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64el-mti-linux-gnu -mips64 -mabi=n32 -mfp64 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-NAN64-64-N32 %s
 // CHECK-EL-NAN64-64-N32: "-internal-isystem"
 // CHECK-EL-NAN64-64-N32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2270,8 +2270,8 @@
 //
 // = Little-endian, mips64, ABI 64, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64el-linux-gnu -mips64 -mabi=64 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64el-mti-linux-gnu -mips64 -mabi=64 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-HF-64-64 %s
 // CHECK-EL-HF-64-64: "-internal-isystem"
 // CHECK-EL-HF-64-64: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2297,8 +2297,8 @@
 //
 // = Little-endian, mips64, ABI 64, fp64, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64el-linux-gnu -mips64 -mabi=64 -mfp64 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64el-mti-linux-gnu -mips64 -mabi=64 -mfp64 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-HF64-64-64 %s
 // CHECK-EL-HF64-64-64: "-internal-isystem"
 // CHECK-EL-HF64-64-64: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2324,8 +2324,8 @@
 //
 // = Little-endian, mips64, ABI 64, soft float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64el-linux-gnu -mips64 -mabi=64 -msoft-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64el-mti-linux-gnu -mips64 -mabi=64 -msoft-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-SF-64-64 %s
 // CHECK-EL-SF-64-64: "-internal-isystem"
 // CHECK-EL-SF-64-64: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2351,8 +2351,8 @@
 //
 // = Little-endian, mips64, ABI 64, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64el-linux-gnu -mips64 -mabi=64 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64el-mti-linux-gnu -mips64 -mabi=64 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-NAN-64-64 %s
 // CHECK-EL-NAN-64-64: "-internal-isystem"
 // CHECK-EL-NAN-64-64: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2378,8 +2378,8 @@
 //
 // = Little-endian, mips64, ABI 64, fp64, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64el-linux-gnu -mips64 -mabi=64 -mfp64 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64el-mti-linux-gnu -mips64 -mabi=64 -mfp64 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-NAN64-64-64 %s
 // CHECK-EL-NAN64-64-64: "-internal-isystem"
 // CHECK-EL-NAN64-64-64: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2405,8 +2405,8 @@
 //
 // = Little-endian, mips64r2, ABI n32, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64el-linux-gnu -mips64r2 -mabi=n32 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64el-mti-linux-gnu -mips64r2 -mabi=n32 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-HF-64R2-N32 %s
 // CHECK-EL-HF-64R2-N32: "-internal-isystem"
 // CHECK-EL-HF-64R2-N32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2432,8 +2432,8 @@
 //
 // = Little-endian, mips64r2, ABI n32, fp64, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64el-linux-gnu -mips64r2 -mabi=n32 -mfp64 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64el-mti-linux-gnu -mips64r2 -mabi=n32 -mfp64 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-HF64-64R2-N32 %s
 // CHECK-EL-HF64-64R2-N32: "-internal-isystem"
 // CHECK-EL-HF64-64R2-N32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2459,8 +2459,8 @@
 //
 // = Little-endian, mips64r2, ABI n32, soft float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64el-linux-gnu -mips64r2 -mabi=n32 -msoft-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64el-mti-linux-gnu -mips64r2 -mabi=n32 -msoft-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-SF-64R2-N32 %s
 // CHECK-EL-SF-64R2-N32: "-internal-isystem"
 // CHECK-EL-SF-64R2-N32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2486,8 +2486,8 @@
 //
 // = Little-endian, mips64r2, ABI n32, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64el-linux-gnu -mips64r2 -mabi=n32 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64el-mti-linux-gnu -mips64r2 -mabi=n32 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-NAN-64R2-N32 %s
 // CHECK-EL-NAN-64R2-N32: "-internal-isystem"
 // CHECK-EL-NAN-64R2-N32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2513,8 +2513,8 @@
 //
 // = Little-endian, mips64r2, ABI n32, fp64, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64el-linux-gnu -mips64r2 -mabi=n32 -mfp64 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64el-mti-linux-gnu -mips64r2 -mabi=n32 -mfp64 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-NAN64-64R2-N32 %s
 // CHECK-EL-NAN64-64R2-N32: "-internal-isystem"
 // CHECK-EL-NAN64-64R2-N32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2540,8 +2540,8 @@
 //
 // = Little-endian, mips64r2, ABI 64, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64el-linux-gnu -mips64r2 -mabi=64 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64el-mti-linux-gnu -mips64r2 -mabi=64 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-HF-64R2-64 %s
 // CHECK-EL-HF-64R2-64: "-internal-isystem"
 // CHECK-EL-HF-64R2-64: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2567,8 +2567,8 @@
 //
 // = Little-endian, mips64r2, ABI 64, fp64, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64el-linux-gnu -mips64r2 -mabi=64 -mfp64 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64el-mti-linux-gnu -mips64r2 -mabi=64 -mfp64 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-HF64-64R2-64 %s
 // CHECK-EL-HF64-64R2-64: "-internal-isystem"
 // CHECK-EL-HF64-64R2-64: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2594,8 +2594,8 @@
 //
 // = Little-endian, mips64r2, ABI 64, soft float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64el-linux-gnu -mips64r2 -mabi=64 -msoft-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64el-mti-linux-gnu -mips64r2 -mabi=64 -msoft-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-SF-64R2-64 %s
 // CHECK-EL-SF-64R2-64: "-internal-isystem"
 // CHECK-EL-SF-64R2-64: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2621,8 +2621,8 @@
 //
 // = Little-endian, mips64r2, ABI 64, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64el-linux-gnu -mips64r2 -mabi=64 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64el-mti-linux-gnu -mips64r2 -mabi=64 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-NAN-64R2-64 %s
 // CHECK-EL-NAN-64R2-64: "-internal-isystem"
 // CHECK-EL-NAN-64R2-64: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2648,8 +2648,8 @@
 //
 // = Little-endian, mips64r2, ABI 64, fp64, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64el-linux-gnu -mips64r2 -mabi=64 -mfp64 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64el-mti-linux-gnu -mips64r2 -mabi=64 -mfp64 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-NAN64-64R2-64 %s
 // CHECK-EL-NAN64-64R2-64: "-internal-isystem"
 // CHECK-EL-NAN64-64R2-64: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2675,8 +2675,8 @@
 //
 // = Little-endian, default (mips64r2), ABI 64, fp64, nan2008
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64el-linux-gnu -mabi=64 -mfp64 -mnan=2008 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64el-mti-linux-gnu -mabi=64 -mfp64 -mnan=2008 \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-EL-NAN64-64R2-64-DEF %s
 // CHECK-EL-NAN64-64R2-64-DEF: "-internal-isystem"
 // CHECK-EL-NAN64-64R2-64-DEF: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2704,8 +2704,8 @@
 //
 // = Big-endian, mips32r3, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mips32r3 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mips32r3 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-HF-32R3 %s
 // CHECK-BE-HF-32R3: "-internal-isystem"
 // CHECK-BE-HF-32R3: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2731,8 +2731,8 @@
 //
 // = Big-endian, mips32r5, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips-linux-gnu -mips32r5 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips-mti-linux-gnu -mips32r5 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-HF-32R5 %s
 // CHECK-BE-HF-32R5: "-internal-isystem"
 // CHECK-BE-HF-32R5: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2758,8 +2758,8 @@
 //
 // = Big-endian, mips64r3, ABI 64, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64-linux-gnu -mips64r3 -mabi=64 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64-mti-linux-gnu -mips64r3 -mabi=64 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-HF-64R3-64 %s
 // CHECK-BE-HF-64R3-64: "-internal-isystem"
 // CHECK-BE-HF-64R3-64: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
@@ -2785,8 +2785,8 @@
 //
 // = Big-endian, mips64r5, ABI 64, hard float
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=mips64-linux-gnu -mips64r5 -mabi=64 -mhard-float \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_fsf_tree \
+// RUN:     --target=mips64-mti-linux-gnu -mips64r5 -mabi=64 -mhard-float \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_fsf_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-HF-64R5-64 %s
 // CHECK-BE-HF-64R5-64: "-internal-isystem"
 // CHECK-BE-HF-64R5-64: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.0]]/../../../../mips-mti-linux-gnu/include/c++/4.9.0"
diff --git a/test/Driver/mips-ias-Wa.s b/test/Driver/mips-ias-Wa.s
index 233d062d6deb2..bc65872d99c78 100644
--- a/test/Driver/mips-ias-Wa.s
+++ b/test/Driver/mips-ias-Wa.s
@@ -47,3 +47,94 @@
 // RUN:   FileCheck -check-prefix=MSOFT-FLOAT-BOTH-MHARD-FLOAT-FIRST %s
 // MSOFT-FLOAT-BOTH-MHARD-FLOAT-FIRST: -cc1as
 // MSOFT-FLOAT-BOTH-MHARD-FLOAT-FIRST: "-target-feature" "-soft-float" "-target-feature" "+soft-float"
+
+// RUN: %clang -target mips-linux-gnu -### -fintegrated-as -c %s -Wa,-mips1 2>&1 | \
+// RUN:   FileCheck -check-prefix=MIPS1 %s
+// MIPS1: -cc1as
+// MIPS1: "-target-feature" "+mips1"
+
+// RUN: %clang -target mips-linux-gnu -### -fintegrated-as -c %s -Wa,-mips2 2>&1 | \
+// RUN:   FileCheck -check-prefix=MIPS2 %s
+// MIPS2: -cc1as
+// MIPS2: "-target-feature" "+mips2"
+
+// RUN: %clang -target mips-linux-gnu -### -fintegrated-as -c %s -Wa,-mips3 2>&1 | \
+// RUN:   FileCheck -check-prefix=MIPS3 %s
+// MIPS3: -cc1as
+// MIPS3: "-target-feature" "+mips3"
+
+// RUN: %clang -target mips-linux-gnu -### -fintegrated-as -c %s -Wa,-mips4 2>&1 | \
+// RUN:   FileCheck -check-prefix=MIPS4 %s
+// MIPS4: -cc1as
+// MIPS4: "-target-feature" "+mips4"
+
+// RUN: %clang -target mips-linux-gnu -### -fintegrated-as -c %s -Wa,-mips5 2>&1 | \
+// RUN:   FileCheck -check-prefix=MIPS5 %s
+// MIPS5: -cc1as
+// MIPS5: "-target-feature" "+mips5"
+
+// RUN: %clang -target mips-linux-gnu -### -fintegrated-as -c %s -Wa,-mips32 2>&1 | \
+// RUN:   FileCheck -check-prefix=MIPS32 %s
+// MIPS32: -cc1as
+// MIPS32: "-target-feature" "+mips32"
+
+// RUN: %clang -target mips-linux-gnu -### -fintegrated-as -c %s -Wa,-mips32r2 2>&1 | \
+// RUN:   FileCheck -check-prefix=MIPS32R2 %s
+// MIPS32R2: -cc1as
+// MIPS32R2: "-target-feature" "+mips32r2"
+
+// RUN: %clang -target mips-linux-gnu -### -fintegrated-as -c %s -Wa,-mips32r3 2>&1 | \
+// RUN:   FileCheck -check-prefix=MIPS32R3 %s
+// MIPS32R3: -cc1as
+// MIPS32R3: "-target-feature" "+mips32r3"
+
+// RUN: %clang -target mips-linux-gnu -### -fintegrated-as -c %s -Wa,-mips32r5 2>&1 | \
+// RUN:   FileCheck -check-prefix=MIPS32R5 %s
+// MIPS32R5: -cc1as
+// MIPS32R5: "-target-feature" "+mips32r5"
+
+// RUN: %clang -target mips-linux-gnu -### -fintegrated-as -c %s -Wa,-mips32r6 2>&1 | \
+// RUN:   FileCheck -check-prefix=MIPS32R6 %s
+// MIPS32R6: -cc1as
+// MIPS32R6: "-target-feature" "+mips32r6"
+
+// RUN: %clang -target mips-linux-gnu -### -fintegrated-as -c %s -Wa,-mips64 2>&1 | \
+// RUN:   FileCheck -check-prefix=MIPS64 %s
+// MIPS64: -cc1as
+// MIPS64: "-target-feature" "+mips64"
+
+// RUN: %clang -target mips-linux-gnu -### -fintegrated-as -c %s -Wa,-mips64r2 2>&1 | \
+// RUN:   FileCheck -check-prefix=MIPS64R2 %s
+// MIPS64R2: -cc1as
+// MIPS64R2: "-target-feature" "+mips64r2"
+
+// RUN: %clang -target mips-linux-gnu -### -fintegrated-as -c %s -Wa,-mips64r3 2>&1 | \
+// RUN:   FileCheck -check-prefix=MIPS64R3 %s
+// MIPS64R3: -cc1as
+// MIPS64R3: "-target-feature" "+mips64r3"
+
+// RUN: %clang -target mips-linux-gnu -### -fintegrated-as -c %s -Wa,-mips64r5 2>&1 | \
+// RUN:   FileCheck -check-prefix=MIPS64R5 %s
+// MIPS64R5: -cc1as
+// MIPS64R5: "-target-feature" "+mips64r5"
+
+// RUN: %clang -target mips-linux-gnu -### -fintegrated-as -c %s -Wa,-mips64r6 2>&1 | \
+// RUN:   FileCheck -check-prefix=MIPS64R6 %s
+// MIPS64R6: -cc1as
+// MIPS64R6: "-target-feature" "+mips64r6"
+
+// RUN: %clang -target mips-linux-gnu -### -fintegrated-as -c %s -Wa,-mips64r2,-mips4 2>&1 | \
+// RUN:   FileCheck -check-prefix=MIPS64R2-MIPS4 %s
+// MIPS64R2-MIPS4: -cc1as
+// MIPS64R2-MIPS4-NOT: "-target-feature" "+mips64r2"
+// MIPS64R2-MIPS4: "-target-feature" "+mips4"
+// MIPS64R2-MIPS4-NOT: "-target-feature" "+mips64r2"
+
+// RUN: %clang -target mips-linux-gnu -### -fintegrated-as -c %s -Wa,-mips64,-mips32,-mips32r2 2>&1 | \
+// RUN:   FileCheck -check-prefix=MIPS64-MIPS32-MIPS32R2 %s
+// MIPS64-MIPS32-MIPS32R2: -cc1as
+// MIPS64-MIPS32-MIPS32R2-NOT: "-target-feature" "+mips64"
+// MIPS64-MIPS32-MIPS32R2-NOT: "-target-feature" "+mips32"
+// MIPS64-MIPS32-MIPS32R2: "-target-feature" "+mips32r2"
+// MIPS64-MIPS32-MIPS32R2-NOT: "-target-feature" "+mips64"
+// MIPS64-MIPS32-MIPS32R2-NOT: "-target-feature" "+mips32"
diff --git a/test/Driver/mips-img-v2.cpp b/test/Driver/mips-img-v2.cpp
new file mode 100644
index 0000000000000..34cf3d726e2d0
--- /dev/null
+++ b/test/Driver/mips-img-v2.cpp
@@ -0,0 +1,337 @@
+// Check frontend and linker invocations on the IMG v2 MIPS toolchain.
+
+// -EB -mips32r6 -mhard-float -mabi=32
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:        --target=mips-img-linux-gnu \
+// RUN:        --gcc-toolchain=%S/Inputs/mips_img_v2_tree \
+// RUN:        -stdlib=libstdc++ \
+// RUN:        -EB -mips32r6 -mhard-float -mabi=32 \
+// RUN:   | FileCheck --check-prefix=EB-HARD-O32 %s
+// EB-HARD-O32: "-internal-isystem"
+// EB-HARD-O32: "[[TC:[^"]+/lib/gcc/mips-img-linux-gnu/4.9.2]]/../../../../mips-img-linux-gnu/include/c++/4.9.2"
+// EB-HARD-O32: "-internal-isystem"
+// EB-HARD-O32: "[[TC]]/../../../../mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/mips-r6-hard/lib"
+// EB-HARD-O32: "-internal-isystem"
+// EB-HARD-O32: "[[TC]]/../../../../mips-img-linux-gnu/include/c++/4.9.2/backward"
+// EB-HARD-O32: "-internal-externc-isystem"
+// EB-HARD-O32: "[[TC]]/../../../../sysroot/mips-r6-hard/lib/../usr/include"
+// EB-HARD-O32: "{{.*}}ld{{(.exe)?}}"
+// EB-HARD-O32: "--sysroot=[[TC]]/../../../../sysroot/mips-r6-hard"
+// EB-HARD-O32: "-dynamic-linker" "/lib/ld-linux-mipsn8.so.1"
+// EB-HARD-O32: "[[TC]]/../../../../sysroot/mips-r6-hard/usr/lib/../lib{{/|\\\\}}crt1.o"
+// EB-HARD-O32: "[[TC]]/../../../../sysroot/mips-r6-hard/usr/lib/../lib{{/|\\\\}}crti.o"
+// EB-HARD-O32: "[[TC]]/mips-r6-hard/lib{{/|\\\\}}crtbegin.o"
+// EB-HARD-O32: "-L[[TC]]/../../../../mips-img-linux-gnu/lib/mips-r6-hard/lib"
+// EB-HARD-O32: "-L[[TC]]/mips-r6-hard/lib"
+// EB-HARD-O32: "-L[[TC]]/../../../../sysroot/mips-r6-hard/lib/../lib"
+// EB-HARD-O32: "-L[[TC]]/../../../../sysroot/mips-r6-hard/usr/lib/../lib"
+// EB-HARD-O32: "[[TC]]/mips-r6-hard/lib{{/|\\\\}}crtend.o"
+// EB-HARD-O32: "[[TC]]/../../../../sysroot/mips-r6-hard/usr/lib/../lib{{/|\\\\}}crtn.o"
+
+// -EB -mips64r6 -mhard-float -mabi=n32
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:        --target=mips-img-linux-gnu \
+// RUN:        --gcc-toolchain=%S/Inputs/mips_img_v2_tree \
+// RUN:        -stdlib=libstdc++ \
+// RUN:        -EB -mips64r6 -mhard-float -mabi=n32 \
+// RUN:   | FileCheck --check-prefix=EB-HARD-N32 %s
+// EB-HARD-N32: "-internal-isystem"
+// EB-HARD-N32: "[[TC:[^"]+/lib/gcc/mips-img-linux-gnu/4.9.2]]/../../../../mips-img-linux-gnu/include/c++/4.9.2"
+// EB-HARD-N32: "-internal-isystem"
+// EB-HARD-N32: "[[TC]]/../../../../mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/mips-r6-hard/lib32"
+// EB-HARD-N32: "-internal-isystem"
+// EB-HARD-N32: "[[TC]]/../../../../mips-img-linux-gnu/include/c++/4.9.2/backward"
+// EB-HARD-N32: "-internal-externc-isystem"
+// EB-HARD-N32: "[[TC]]/../../../../sysroot/mips-r6-hard/lib32/../usr/include"
+// EB-HARD-N32: "{{.*}}ld{{(.exe)?}}"
+// EB-HARD-N32: "--sysroot=[[TC]]/../../../../sysroot/mips-r6-hard"
+// EB-HARD-N32: "-dynamic-linker" "/lib32/ld-linux-mipsn8.so.1"
+// EB-HARD-N32: "[[TC]]/../../../../sysroot/mips-r6-hard/usr/lib/../lib32{{/|\\\\}}crt1.o"
+// EB-HARD-N32: "[[TC]]/../../../../sysroot/mips-r6-hard/usr/lib/../lib32{{/|\\\\}}crti.o"
+// EB-HARD-N32: "[[TC]]/mips-r6-hard/lib32{{/|\\\\}}crtbegin.o"
+// EB-HARD-N32: "-L[[TC]]/../../../../mips-img-linux-gnu/lib/mips-r6-hard/lib32"
+// EB-HARD-N32: "-L[[TC]]/mips-r6-hard/lib32"
+// EB-HARD-N32: "-L[[TC]]/../../../../sysroot/mips-r6-hard/lib/../lib32"
+// EB-HARD-N32: "-L[[TC]]/../../../../sysroot/mips-r6-hard/usr/lib/../lib32"
+// EB-HARD-N32: "[[TC]]/mips-r6-hard/lib32{{/|\\\\}}crtend.o"
+// EB-HARD-N32: "[[TC]]/../../../../sysroot/mips-r6-hard/usr/lib/../lib32{{/|\\\\}}crtn.o"
+
+// -EB -mips64r6 -mhard-float -mabi=64
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:        --target=mips64-img-linux-gnu \
+// RUN:        --gcc-toolchain=%S/Inputs/mips_img_v2_tree \
+// RUN:        -stdlib=libstdc++ \
+// RUN:        -EB -mips64r6 -mhard-float -mabi=64 \
+// RUN:   | FileCheck --check-prefix=EB-HARD-N64 %s
+// EB-HARD-N64: "-internal-isystem"
+// EB-HARD-N64: "[[TC:[^"]+/lib/gcc/mips-img-linux-gnu/4.9.2]]/../../../../mips-img-linux-gnu/include/c++/4.9.2"
+// EB-HARD-N64: "-internal-isystem"
+// EB-HARD-N64: "[[TC]]/../../../../mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/mips-r6-hard/lib64"
+// EB-HARD-N64: "-internal-isystem"
+// EB-HARD-N64: "[[TC]]/../../../../mips-img-linux-gnu/include/c++/4.9.2/backward"
+// EB-HARD-N64: "-internal-externc-isystem"
+// EB-HARD-N64: "[[TC]]/../../../../sysroot/mips-r6-hard/lib64/../usr/include"
+// EB-HARD-N64: "{{.*}}ld{{(.exe)?}}"
+// EB-HARD-N64: "--sysroot=[[TC]]/../../../../sysroot/mips-r6-hard"
+// EB-HARD-N64: "-dynamic-linker" "/lib64/ld-linux-mipsn8.so.1"
+// EB-HARD-N64: "[[TC]]/../../../../sysroot/mips-r6-hard/usr/lib/../lib64{{/|\\\\}}crt1.o"
+// EB-HARD-N64: "[[TC]]/../../../../sysroot/mips-r6-hard/usr/lib/../lib64{{/|\\\\}}crti.o"
+// EB-HARD-N64: "[[TC]]/mips-r6-hard/lib64{{/|\\\\}}crtbegin.o"
+// EB-HARD-N64: "-L[[TC]]/../../../../mips-img-linux-gnu/lib/mips-r6-hard/lib64"
+// EB-HARD-N64: "-L[[TC]]/mips-r6-hard/lib64"
+// EB-HARD-N64: "-L[[TC]]/../../../../sysroot/mips-r6-hard/lib/../lib64"
+// EB-HARD-N64: "-L[[TC]]/../../../../sysroot/mips-r6-hard/usr/lib/../lib64"
+// EB-HARD-N64: "[[TC]]/mips-r6-hard/lib64{{/|\\\\}}crtend.o"
+// EB-HARD-N64: "[[TC]]/../../../../sysroot/mips-r6-hard/usr/lib/../lib64{{/|\\\\}}crtn.o"
+
+// -EL -mips32r6 -mhard-float -mabi=32
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:        --target=mips-img-linux-gnu \
+// RUN:        --gcc-toolchain=%S/Inputs/mips_img_v2_tree \
+// RUN:        -stdlib=libstdc++ \
+// RUN:        -EL -mips32r6 -mhard-float -mabi=32 \
+// RUN:   | FileCheck --check-prefix=EL-HARD-O32 %s
+// EL-HARD-O32: "-internal-isystem"
+// EL-HARD-O32: "[[TC:[^"]+/lib/gcc/mips-img-linux-gnu/4.9.2]]/../../../../mips-img-linux-gnu/include/c++/4.9.2"
+// EL-HARD-O32: "-internal-isystem"
+// EL-HARD-O32: "[[TC]]/../../../../mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/mipsel-r6-hard/lib"
+// EL-HARD-O32: "-internal-isystem"
+// EL-HARD-O32: "[[TC]]/../../../../mips-img-linux-gnu/include/c++/4.9.2/backward"
+// EL-HARD-O32: "-internal-externc-isystem"
+// EL-HARD-O32: "[[TC]]/../../../../sysroot/mipsel-r6-hard/lib/../usr/include"
+// EL-HARD-O32: "{{.*}}ld{{(.exe)?}}"
+// EL-HARD-O32: "--sysroot=[[TC]]/../../../../sysroot/mipsel-r6-hard"
+// EL-HARD-O32: "-dynamic-linker" "/lib/ld-linux-mipsn8.so.1"
+// EL-HARD-O32: "[[TC]]/../../../../sysroot/mipsel-r6-hard/usr/lib/../lib{{/|\\\\}}crt1.o"
+// EL-HARD-O32: "[[TC]]/../../../../sysroot/mipsel-r6-hard/usr/lib/../lib{{/|\\\\}}crti.o"
+// EL-HARD-O32: "[[TC]]/mipsel-r6-hard/lib{{/|\\\\}}crtbegin.o"
+// EL-HARD-O32: "-L[[TC]]/../../../../mips-img-linux-gnu/lib/mipsel-r6-hard/lib"
+// EL-HARD-O32: "-L[[TC]]/mipsel-r6-hard/lib"
+// EL-HARD-O32: "-L[[TC]]/../../../../sysroot/mipsel-r6-hard/lib/../lib"
+// EL-HARD-O32: "-L[[TC]]/../../../../sysroot/mipsel-r6-hard/usr/lib/../lib"
+// EL-HARD-O32: "[[TC]]/mipsel-r6-hard/lib{{/|\\\\}}crtend.o"
+// EL-HARD-O32: "[[TC]]/../../../../sysroot/mipsel-r6-hard/usr/lib/../lib{{/|\\\\}}crtn.o"
+
+// -EL -mips64r6 -mhard-float -mabi=n32
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:        --target=mips-img-linux-gnu \
+// RUN:        --gcc-toolchain=%S/Inputs/mips_img_v2_tree \
+// RUN:        -stdlib=libstdc++ \
+// RUN:        -EL -mips64r6 -mhard-float -mabi=n32 \
+// RUN:   | FileCheck --check-prefix=EL-HARD-N32 %s
+// EL-HARD-N32: "-internal-isystem"
+// EL-HARD-N32: "[[TC:[^"]+/lib/gcc/mips-img-linux-gnu/4.9.2]]/../../../../mips-img-linux-gnu/include/c++/4.9.2"
+// EL-HARD-N32: "-internal-isystem"
+// EL-HARD-N32: "[[TC]]/../../../../mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/mipsel-r6-hard/lib32"
+// EL-HARD-N32: "-internal-isystem"
+// EL-HARD-N32: "[[TC]]/../../../../mips-img-linux-gnu/include/c++/4.9.2/backward"
+// EL-HARD-N32: "-internal-externc-isystem"
+// EL-HARD-N32: "[[TC]]/../../../../sysroot/mipsel-r6-hard/lib32/../usr/include"
+// EL-HARD-N32: "{{.*}}ld{{(.exe)?}}"
+// EL-HARD-N32: "--sysroot=[[TC]]/../../../../sysroot/mipsel-r6-hard"
+// EL-HARD-N32: "-dynamic-linker" "/lib32/ld-linux-mipsn8.so.1"
+// EL-HARD-N32: "[[TC]]/../../../../sysroot/mipsel-r6-hard/usr/lib/../lib32{{/|\\\\}}crt1.o"
+// EL-HARD-N32: "[[TC]]/../../../../sysroot/mipsel-r6-hard/usr/lib/../lib32{{/|\\\\}}crti.o"
+// EL-HARD-N32: "[[TC]]/mipsel-r6-hard/lib32{{/|\\\\}}crtbegin.o"
+// EL-HARD-N32: "-L[[TC]]/../../../../mips-img-linux-gnu/lib/mipsel-r6-hard/lib32"
+// EL-HARD-N32: "-L[[TC]]/mipsel-r6-hard/lib32"
+// EL-HARD-N32: "-L[[TC]]/../../../../sysroot/mipsel-r6-hard/lib/../lib32"
+// EL-HARD-N32: "-L[[TC]]/../../../../sysroot/mipsel-r6-hard/usr/lib/../lib32"
+// EL-HARD-N32: "[[TC]]/mipsel-r6-hard/lib32{{/|\\\\}}crtend.o"
+// EL-HARD-N32: "[[TC]]/../../../../sysroot/mipsel-r6-hard/usr/lib/../lib32{{/|\\\\}}crtn.o"
+
+// -EL -mips64r6 -mhard-float -mabi=64
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:        --target=mips64-img-linux-gnu \
+// RUN:        --gcc-toolchain=%S/Inputs/mips_img_v2_tree \
+// RUN:        -stdlib=libstdc++ \
+// RUN:        -EL -mips64r6 -mhard-float -mabi=64 \
+// RUN:   | FileCheck --check-prefix=EL-HARD-N64 %s
+// EL-HARD-N64: "-internal-isystem"
+// EL-HARD-N64: "[[TC:[^"]+/lib/gcc/mips-img-linux-gnu/4.9.2]]/../../../../mips-img-linux-gnu/include/c++/4.9.2"
+// EL-HARD-N64: "-internal-isystem"
+// EL-HARD-N64: "[[TC]]/../../../../mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/mipsel-r6-hard/lib64"
+// EL-HARD-N64: "-internal-isystem"
+// EL-HARD-N64: "[[TC]]/../../../../mips-img-linux-gnu/include/c++/4.9.2/backward"
+// EL-HARD-N64: "-internal-externc-isystem"
+// EL-HARD-N64: "[[TC]]/../../../../sysroot/mipsel-r6-hard/lib64/../usr/include"
+// EL-HARD-N64: "{{.*}}ld{{(.exe)?}}"
+// EL-HARD-N64: "--sysroot=[[TC]]/../../../../sysroot/mipsel-r6-hard"
+// EL-HARD-N64: "-dynamic-linker" "/lib64/ld-linux-mipsn8.so.1"
+// EL-HARD-N64: "[[TC]]/../../../../sysroot/mipsel-r6-hard/usr/lib/../lib64{{/|\\\\}}crt1.o"
+// EL-HARD-N64: "[[TC]]/../../../../sysroot/mipsel-r6-hard/usr/lib/../lib64{{/|\\\\}}crti.o"
+// EL-HARD-N64: "[[TC]]/mipsel-r6-hard/lib64{{/|\\\\}}crtbegin.o"
+// EL-HARD-N64: "-L[[TC]]/../../../../mips-img-linux-gnu/lib/mipsel-r6-hard/lib64"
+// EL-HARD-N64: "-L[[TC]]/mipsel-r6-hard/lib64"
+// EL-HARD-N64: "-L[[TC]]/../../../../sysroot/mipsel-r6-hard/lib/../lib64"
+// EL-HARD-N64: "-L[[TC]]/../../../../sysroot/mipsel-r6-hard/usr/lib/../lib64"
+// EL-HARD-N64: "[[TC]]/mipsel-r6-hard/lib64{{/|\\\\}}crtend.o"
+// EL-HARD-N64: "[[TC]]/../../../../sysroot/mipsel-r6-hard/usr/lib/../lib64{{/|\\\\}}crtn.o"
+
+// -EB -mips32r6 -msoft-float
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:        --target=mips-img-linux-gnu \
+// RUN:        --gcc-toolchain=%S/Inputs/mips_img_v2_tree \
+// RUN:        -stdlib=libstdc++ \
+// RUN:        -EB -mips32r6 -msoft-float \
+// RUN:   | FileCheck --check-prefix=EB-SOFT %s
+// EB-SOFT: "-internal-isystem"
+// EB-SOFT: "[[TC:[^"]+/lib/gcc/mips-img-linux-gnu/4.9.2]]/../../../../mips-img-linux-gnu/include/c++/4.9.2"
+// EB-SOFT: "-internal-isystem"
+// EB-SOFT: "[[TC]]/../../../../mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/mips-r6-soft/lib"
+// EB-SOFT: "-internal-isystem"
+// EB-SOFT: "[[TC]]/../../../../mips-img-linux-gnu/include/c++/4.9.2/backward"
+// EB-SOFT: "-internal-externc-isystem"
+// EB-SOFT: "[[TC]]/../../../../sysroot/mips-r6-soft/lib/../usr/include"
+// EB-SOFT: "{{.*}}ld{{(.exe)?}}"
+// EB-SOFT: "--sysroot=[[TC]]/../../../../sysroot/mips-r6-soft"
+// EB-SOFT: "-dynamic-linker" "/lib/ld-linux-mipsn8.so.1"
+// EB-SOFT: "[[TC]]/../../../../sysroot/mips-r6-soft/usr/lib/../lib{{/|\\\\}}crt1.o"
+// EB-SOFT: "[[TC]]/../../../../sysroot/mips-r6-soft/usr/lib/../lib{{/|\\\\}}crti.o"
+// EB-SOFT: "[[TC]]/mips-r6-soft/lib{{/|\\\\}}crtbegin.o"
+// EB-SOFT: "-L[[TC]]/../../../../mips-img-linux-gnu/lib/mips-r6-soft/lib"
+// EB-SOFT: "-L[[TC]]/mips-r6-soft/lib"
+// EB-SOFT: "-L[[TC]]/../../../../sysroot/mips-r6-soft/lib/../lib"
+// EB-SOFT: "-L[[TC]]/../../../../sysroot/mips-r6-soft/usr/lib/../lib"
+// EB-SOFT: "[[TC]]/mips-r6-soft/lib{{/|\\\\}}crtend.o"
+// EB-SOFT: "[[TC]]/../../../../sysroot/mips-r6-soft/usr/lib/../lib{{/|\\\\}}crtn.o"
+
+// -EL -mips32r6 -msoft-float
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:        --target=mips-img-linux-gnu \
+// RUN:        --gcc-toolchain=%S/Inputs/mips_img_v2_tree \
+// RUN:        -stdlib=libstdc++ \
+// RUN:        -EL -mips32r6 -msoft-float \
+// RUN:   | FileCheck --check-prefix=EL-SOFT %s
+// EL-SOFT: "-internal-isystem"
+// EL-SOFT: "[[TC:[^"]+/lib/gcc/mips-img-linux-gnu/4.9.2]]/../../../../mips-img-linux-gnu/include/c++/4.9.2"
+// EL-SOFT: "-internal-isystem"
+// EL-SOFT: "[[TC]]/../../../../mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/mipsel-r6-soft/lib"
+// EL-SOFT: "-internal-isystem"
+// EL-SOFT: "[[TC]]/../../../../mips-img-linux-gnu/include/c++/4.9.2/backward"
+// EL-SOFT: "-internal-externc-isystem"
+// EL-SOFT: "[[TC]]/../../../../sysroot/mipsel-r6-soft/lib/../usr/include"
+// EL-SOFT: "{{.*}}ld{{(.exe)?}}"
+// EL-SOFT: "--sysroot=[[TC]]/../../../../sysroot/mipsel-r6-soft"
+// EL-SOFT: "-dynamic-linker" "/lib/ld-linux-mipsn8.so.1"
+// EL-SOFT: "[[TC]]/../../../../sysroot/mipsel-r6-soft/usr/lib/../lib{{/|\\\\}}crt1.o"
+// EL-SOFT: "[[TC]]/../../../../sysroot/mipsel-r6-soft/usr/lib/../lib{{/|\\\\}}crti.o"
+// EL-SOFT: "[[TC]]/mipsel-r6-soft/lib{{/|\\\\}}crtbegin.o"
+// EL-SOFT: "-L[[TC]]/../../../../mips-img-linux-gnu/lib/mipsel-r6-soft/lib"
+// EL-SOFT: "-L[[TC]]/mipsel-r6-soft/lib"
+// EL-SOFT: "-L[[TC]]/../../../../sysroot/mipsel-r6-soft/lib/../lib"
+// EL-SOFT: "-L[[TC]]/../../../../sysroot/mipsel-r6-soft/usr/lib/../lib"
+// EL-SOFT: "[[TC]]/mipsel-r6-soft/lib{{/|\\\\}}crtend.o"
+// EL-SOFT: "[[TC]]/../../../../sysroot/mipsel-r6-soft/usr/lib/../lib{{/|\\\\}}crtn.o"
+
+// -EB -mips32r6 -mhard-float -mmicromips
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:        --target=mips-img-linux-gnu \
+// RUN:        --gcc-toolchain=%S/Inputs/mips_img_v2_tree \
+// RUN:        -stdlib=libstdc++ \
+// RUN:        -EB -mips32r6 -mhard-float -mmicromips \
+// RUN:   | FileCheck --check-prefix=EB-HARD-MICRO %s
+// EB-HARD-MICRO: "-internal-isystem"
+// EB-HARD-MICRO: "[[TC:[^"]+/lib/gcc/mips-img-linux-gnu/4.9.2]]/../../../../mips-img-linux-gnu/include/c++/4.9.2"
+// EB-HARD-MICRO: "-internal-isystem"
+// EB-HARD-MICRO: "[[TC]]/../../../../mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/micromips-r6-hard/lib"
+// EB-HARD-MICRO: "-internal-isystem"
+// EB-HARD-MICRO: "[[TC]]/../../../../mips-img-linux-gnu/include/c++/4.9.2/backward"
+// EB-HARD-MICRO: "-internal-externc-isystem"
+// EB-HARD-MICRO: "[[TC]]/../../../../sysroot/micromips-r6-hard/lib/../usr/include"
+// EB-HARD-MICRO: "{{.*}}ld{{(.exe)?}}"
+// EB-HARD-MICRO: "--sysroot=[[TC]]/../../../../sysroot/micromips-r6-hard"
+// EB-HARD-MICRO: "-dynamic-linker" "/lib/ld-linux-mipsn8.so.1"
+// EB-HARD-MICRO: "[[TC]]/../../../../sysroot/micromips-r6-hard/usr/lib/../lib{{/|\\\\}}crt1.o"
+// EB-HARD-MICRO: "[[TC]]/../../../../sysroot/micromips-r6-hard/usr/lib/../lib{{/|\\\\}}crti.o"
+// EB-HARD-MICRO: "[[TC]]/micromips-r6-hard/lib{{/|\\\\}}crtbegin.o"
+// EB-HARD-MICRO: "-L[[TC]]/../../../../mips-img-linux-gnu/lib/micromips-r6-hard/lib"
+// EB-HARD-MICRO: "-L[[TC]]/micromips-r6-hard/lib"
+// EB-HARD-MICRO: "-L[[TC]]/../../../../sysroot/micromips-r6-hard/lib/../lib"
+// EB-HARD-MICRO: "-L[[TC]]/../../../../sysroot/micromips-r6-hard/usr/lib/../lib"
+// EB-HARD-MICRO: "[[TC]]/micromips-r6-hard/lib{{/|\\\\}}crtend.o"
+// EB-HARD-MICRO: "[[TC]]/../../../../sysroot/micromips-r6-hard/usr/lib/../lib{{/|\\\\}}crtn.o"
+
+// -EB -mips32r6 -msoft-float -mmicromips
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:        --target=mips-img-linux-gnu \
+// RUN:        --gcc-toolchain=%S/Inputs/mips_img_v2_tree \
+// RUN:        -stdlib=libstdc++ \
+// RUN:        -EB -mips32r6 -msoft-float -mmicromips \
+// RUN:   | FileCheck --check-prefix=EB-SOFT-MICRO %s
+// EB-SOFT-MICRO: "-internal-isystem"
+// EB-SOFT-MICRO: "[[TC:[^"]+/lib/gcc/mips-img-linux-gnu/4.9.2]]/../../../../mips-img-linux-gnu/include/c++/4.9.2"
+// EB-SOFT-MICRO: "-internal-isystem"
+// EB-SOFT-MICRO: "[[TC]]/../../../../mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/micromips-r6-soft/lib"
+// EB-SOFT-MICRO: "-internal-isystem"
+// EB-SOFT-MICRO: "[[TC]]/../../../../mips-img-linux-gnu/include/c++/4.9.2/backward"
+// EB-SOFT-MICRO: "-internal-externc-isystem"
+// EB-SOFT-MICRO: "[[TC]]/../../../../sysroot/micromips-r6-soft/lib/../usr/include"
+// EB-SOFT-MICRO: "{{.*}}ld{{(.exe)?}}"
+// EB-SOFT-MICRO: "--sysroot=[[TC]]/../../../../sysroot/micromips-r6-soft"
+// EB-SOFT-MICRO: "-dynamic-linker" "/lib/ld-linux-mipsn8.so.1"
+// EB-SOFT-MICRO: "[[TC]]/../../../../sysroot/micromips-r6-soft/usr/lib/../lib{{/|\\\\}}crt1.o"
+// EB-SOFT-MICRO: "[[TC]]/../../../../sysroot/micromips-r6-soft/usr/lib/../lib{{/|\\\\}}crti.o"
+// EB-SOFT-MICRO: "[[TC]]/micromips-r6-soft/lib{{/|\\\\}}crtbegin.o"
+// EB-SOFT-MICRO: "-L[[TC]]/../../../../mips-img-linux-gnu/lib/micromips-r6-soft/lib"
+// EB-SOFT-MICRO: "-L[[TC]]/micromips-r6-soft/lib"
+// EB-SOFT-MICRO: "-L[[TC]]/../../../../sysroot/micromips-r6-soft/lib/../lib"
+// EB-SOFT-MICRO: "-L[[TC]]/../../../../sysroot/micromips-r6-soft/usr/lib/../lib"
+// EB-SOFT-MICRO: "[[TC]]/micromips-r6-soft/lib{{/|\\\\}}crtend.o"
+// EB-SOFT-MICRO: "[[TC]]/../../../../sysroot/micromips-r6-soft/usr/lib/../lib{{/|\\\\}}crtn.o"
+
+// -EL -mips32r6 -mhard-float -mmicromips
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:        --target=mips-img-linux-gnu \
+// RUN:        --gcc-toolchain=%S/Inputs/mips_img_v2_tree \
+// RUN:        -stdlib=libstdc++ \
+// RUN:        -EL -mips32r6 -mhard-float -mmicromips \
+// RUN:   | FileCheck --check-prefix=EL-HARD-MICRO %s
+// EL-HARD-MICRO: "-internal-isystem"
+// EL-HARD-MICRO: "[[TC:[^"]+/lib/gcc/mips-img-linux-gnu/4.9.2]]/../../../../mips-img-linux-gnu/include/c++/4.9.2"
+// EL-HARD-MICRO: "-internal-isystem"
+// EL-HARD-MICRO: "[[TC]]/../../../../mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/micromipsel-r6-hard/lib"
+// EL-HARD-MICRO: "-internal-isystem"
+// EL-HARD-MICRO: "[[TC]]/../../../../mips-img-linux-gnu/include/c++/4.9.2/backward"
+// EL-HARD-MICRO: "-internal-externc-isystem"
+// EL-HARD-MICRO: "[[TC]]/../../../../sysroot/micromipsel-r6-hard/lib/../usr/include"
+// EL-HARD-MICRO: "{{.*}}ld{{(.exe)?}}"
+// EL-HARD-MICRO: "--sysroot=[[TC]]/../../../../sysroot/micromipsel-r6-hard"
+// EL-HARD-MICRO: "-dynamic-linker" "/lib/ld-linux-mipsn8.so.1"
+// EL-HARD-MICRO: "[[TC]]/../../../../sysroot/micromipsel-r6-hard/usr/lib/../lib{{/|\\\\}}crt1.o"
+// EL-HARD-MICRO: "[[TC]]/../../../../sysroot/micromipsel-r6-hard/usr/lib/../lib{{/|\\\\}}crti.o"
+// EL-HARD-MICRO: "[[TC]]/micromipsel-r6-hard/lib{{/|\\\\}}crtbegin.o"
+// EL-HARD-MICRO: "-L[[TC]]/../../../../mips-img-linux-gnu/lib/micromipsel-r6-hard/lib"
+// EL-HARD-MICRO: "-L[[TC]]/micromipsel-r6-hard/lib"
+// EL-HARD-MICRO: "-L[[TC]]/../../../../sysroot/micromipsel-r6-hard/lib/../lib"
+// EL-HARD-MICRO: "-L[[TC]]/../../../../sysroot/micromipsel-r6-hard/usr/lib/../lib"
+// EL-HARD-MICRO: "[[TC]]/micromipsel-r6-hard/lib{{/|\\\\}}crtend.o"
+// EL-HARD-MICRO: "[[TC]]/../../../../sysroot/micromipsel-r6-hard/usr/lib/../lib{{/|\\\\}}crtn.o"
+
+// -EL -mips32r6 -msoft-float -mmicromips
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:        --target=mips-img-linux-gnu \
+// RUN:        --gcc-toolchain=%S/Inputs/mips_img_v2_tree \
+// RUN:        -stdlib=libstdc++ \
+// RUN:        -EL -mips32r6 -msoft-float -mmicromips \
+// RUN:   | FileCheck --check-prefix=EL-SOFT-MICRO %s
+// EL-SOFT-MICRO: "-internal-isystem"
+// EL-SOFT-MICRO: "[[TC:[^"]+/lib/gcc/mips-img-linux-gnu/4.9.2]]/../../../../mips-img-linux-gnu/include/c++/4.9.2"
+// EL-SOFT-MICRO: "-internal-isystem"
+// EL-SOFT-MICRO: "[[TC]]/../../../../mips-img-linux-gnu/include/c++/4.9.2/mips-img-linux-gnu/micromipsel-r6-soft/lib"
+// EL-SOFT-MICRO: "-internal-isystem"
+// EL-SOFT-MICRO: "[[TC]]/../../../../mips-img-linux-gnu/include/c++/4.9.2/backward"
+// EL-SOFT-MICRO: "-internal-externc-isystem"
+// EL-SOFT-MICRO: "[[TC]]/../../../../sysroot/micromipsel-r6-soft/lib/../usr/include"
+// EL-SOFT-MICRO: "{{.*}}ld{{(.exe)?}}"
+// EL-SOFT-MICRO: "--sysroot=[[TC]]/../../../../sysroot/micromipsel-r6-soft"
+// EL-SOFT-MICRO: "-dynamic-linker" "/lib/ld-linux-mipsn8.so.1"
+// EL-SOFT-MICRO: "[[TC]]/../../../../sysroot/micromipsel-r6-soft/usr/lib/../lib{{/|\\\\}}crt1.o"
+// EL-SOFT-MICRO: "[[TC]]/../../../../sysroot/micromipsel-r6-soft/usr/lib/../lib{{/|\\\\}}crti.o"
+// EL-SOFT-MICRO: "[[TC]]/micromipsel-r6-soft/lib{{/|\\\\}}crtbegin.o"
+// EL-SOFT-MICRO: "-L[[TC]]/../../../../mips-img-linux-gnu/lib/micromipsel-r6-soft/lib"
+// EL-SOFT-MICRO: "-L[[TC]]/micromipsel-r6-soft/lib"
+// EL-SOFT-MICRO: "-L[[TC]]/../../../../sysroot/micromipsel-r6-soft/lib/../lib"
+// EL-SOFT-MICRO: "-L[[TC]]/../../../../sysroot/micromipsel-r6-soft/usr/lib/../lib"
+// EL-SOFT-MICRO: "[[TC]]/micromipsel-r6-soft/lib{{/|\\\\}}crtend.o"
+// EL-SOFT-MICRO: "[[TC]]/../../../../sysroot/micromipsel-r6-soft/usr/lib/../lib{{/|\\\\}}crtn.o"
diff --git a/test/Driver/mips-img.cpp b/test/Driver/mips-img.cpp
index 389e0f741082e..9d8cfba13ef6d 100644
--- a/test/Driver/mips-img.cpp
+++ b/test/Driver/mips-img.cpp
@@ -3,7 +3,7 @@
 // = Big-endian, mips32r6
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mips-img-linux-gnu -mips32r6 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_img_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_img_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-32R6 %s
 // CHECK-BE-32R6: "-internal-isystem"
 // CHECK-BE-32R6: "[[TC:[^"]+/lib/gcc/mips-img-linux-gnu/4.9.0]]/../../../../mips-img-linux-gnu/include/c++/4.9.0"
@@ -30,7 +30,7 @@
 // = Little-endian, mips32r6
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mips-img-linux-gnu -mips32r6 -EL \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_img_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_img_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-LE-32R6 %s
 // CHECK-LE-32R6: "-internal-isystem"
 // CHECK-LE-32R6: "[[TC:[^"]+/lib/gcc/mips-img-linux-gnu/4.9.0]]/../../../../mips-img-linux-gnu/include/c++/4.9.0"
@@ -57,7 +57,7 @@
 // = Big-endian, mips64r6, N32
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mips64-img-linux-gnu -mips64r6 -mabi=n32 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_img_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_img_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-64R6-N32 %s
 // CHECK-BE-64R6-N32: "-internal-isystem"
 // CHECK-BE-64R6-N32: "[[TC:[^"]+/lib/gcc/mips-img-linux-gnu/4.9.0]]/../../../../mips-img-linux-gnu/include/c++/4.9.0"
@@ -84,7 +84,7 @@
 // = Little-endian, mips64r6, N32
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mips64-img-linux-gnu -mips64r6 -EL -mabi=n32 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_img_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_img_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-LE-64R6-N32 %s
 // CHECK-LE-64R6-N32: "-internal-isystem"
 // CHECK-LE-64R6-N32: "[[TC:[^"]+/lib/gcc/mips-img-linux-gnu/4.9.0]]/../../../../mips-img-linux-gnu/include/c++/4.9.0"
@@ -111,7 +111,7 @@
 // = Big-endian, mips64r6, N64
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mips64-img-linux-gnu -mips64r6 -mabi=64 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_img_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_img_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-BE-64R6-N64 %s
 // CHECK-BE-64R6-N64: "-internal-isystem"
 // CHECK-BE-64R6-N64: "[[TC:[^"]+/lib/gcc/mips-img-linux-gnu/4.9.0]]/../../../../mips-img-linux-gnu/include/c++/4.9.0"
@@ -138,7 +138,7 @@
 // = Little-endian, mips64r6, N64
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=mips64-img-linux-gnu -mips64r6 -EL -mabi=64 \
-// RUN:     --gcc-toolchain=%S/Inputs/mips_img_tree \
+// RUN:     -stdlib=libstdc++ --gcc-toolchain=%S/Inputs/mips_img_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-LE-64R6-N64 %s
 // CHECK-LE-64R6-N64: "-internal-isystem"
 // CHECK-LE-64R6-N64: "[[TC:[^"]+/lib/gcc/mips-img-linux-gnu/4.9.0]]/../../../../mips-img-linux-gnu/include/c++/4.9.0"
diff --git a/test/Driver/mips-mti.cpp b/test/Driver/mips-mti.cpp
new file mode 100644
index 0000000000000..147239cf3759e
--- /dev/null
+++ b/test/Driver/mips-mti.cpp
@@ -0,0 +1,449 @@
+// Check frontend and linker invocations on the MTI MIPS toolchain.
+
+// -EB -mhard-float -mabi=32
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:        --target=mips-mti-linux-gnu \
+// RUN:        --gcc-toolchain=%S/Inputs/mips_mti_tree \
+// RUN:        -stdlib=libstdc++ \
+// RUN:        -EB -mhard-float -mabi=32 \
+// RUN:   | FileCheck --check-prefix=EB-HARD-O32 %s
+// EB-HARD-O32: "-internal-isystem"
+// EB-HARD-O32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.2]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2"
+// EB-HARD-O32: "-internal-isystem"
+// EB-HARD-O32: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/mips-mti-linux-gnu/mips-r2-hard/lib"
+// EB-HARD-O32: "-internal-isystem"
+// EB-HARD-O32: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/backward"
+// EB-HARD-O32: "-internal-externc-isystem"
+// EB-HARD-O32: "[[TC]]/../../../../sysroot/mips-r2-hard/lib/../usr/include"
+// EB-HARD-O32: "{{.*}}ld{{(.exe)?}}"
+// EB-HARD-O32: "--sysroot=[[TC]]/../../../../sysroot/mips-r2-hard"
+// EB-HARD-O32: "-dynamic-linker" "/lib/ld.so.1"
+// EB-HARD-O32: "[[TC]]/../../../../sysroot/mips-r2-hard/usr/lib/../lib{{/|\\\\}}crt1.o"
+// EB-HARD-O32: "[[TC]]/../../../../sysroot/mips-r2-hard/usr/lib/../lib{{/|\\\\}}crti.o"
+// EB-HARD-O32: "[[TC]]/mips-r2-hard/lib{{/|\\\\}}crtbegin.o"
+// EB-HARD-O32: "-L[[TC]]/../../../../mips-mti-linux-gnu/lib/mips-r2-hard/lib"
+// EB-HARD-O32: "-L[[TC]]/mips-r2-hard/lib"
+// EB-HARD-O32: "-L[[TC]]/../../../../sysroot/mips-r2-hard/lib/../lib"
+// EB-HARD-O32: "-L[[TC]]/../../../../sysroot/mips-r2-hard/usr/lib/../lib"
+// EB-HARD-O32: "[[TC]]/mips-r2-hard/lib{{/|\\\\}}crtend.o"
+// EB-HARD-O32: "[[TC]]/../../../../sysroot/mips-r2-hard/usr/lib/../lib{{/|\\\\}}crtn.o"
+
+// -EB -mhard-float -mabi=n32
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:        --target=mips-mti-linux-gnu \
+// RUN:        --gcc-toolchain=%S/Inputs/mips_mti_tree \
+// RUN:        -stdlib=libstdc++ \
+// RUN:        -EB -mhard-float -mabi=n32 \
+// RUN:   | FileCheck --check-prefix=EB-HARD-N32 %s
+// EB-HARD-N32: "-internal-isystem"
+// EB-HARD-N32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.2]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2"
+// EB-HARD-N32: "-internal-isystem"
+// EB-HARD-N32: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/mips-mti-linux-gnu/mips-r2-hard/lib32"
+// EB-HARD-N32: "-internal-isystem"
+// EB-HARD-N32: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/backward"
+// EB-HARD-N32: "-internal-externc-isystem"
+// EB-HARD-N32: "[[TC]]/../../../../sysroot/mips-r2-hard/lib32/../usr/include"
+// EB-HARD-N32: "{{.*}}ld{{(.exe)?}}"
+// EB-HARD-N32: "--sysroot=[[TC]]/../../../../sysroot/mips-r2-hard"
+// EB-HARD-N32: "-dynamic-linker" "/lib32/ld.so.1"
+// EB-HARD-N32: "[[TC]]/../../../../sysroot/mips-r2-hard/usr/lib/../lib32{{/|\\\\}}crt1.o"
+// EB-HARD-N32: "[[TC]]/../../../../sysroot/mips-r2-hard/usr/lib/../lib32{{/|\\\\}}crti.o"
+// EB-HARD-N32: "[[TC]]/mips-r2-hard/lib32{{/|\\\\}}crtbegin.o"
+// EB-HARD-N32: "-L[[TC]]/../../../../mips-mti-linux-gnu/lib/mips-r2-hard/lib32"
+// EB-HARD-N32: "-L[[TC]]/mips-r2-hard/lib32"
+// EB-HARD-N32: "-L[[TC]]/../../../../sysroot/mips-r2-hard/lib/../lib32"
+// EB-HARD-N32: "-L[[TC]]/../../../../sysroot/mips-r2-hard/usr/lib/../lib32"
+// EB-HARD-N32: "[[TC]]/mips-r2-hard/lib32{{/|\\\\}}crtend.o"
+// EB-HARD-N32: "[[TC]]/../../../../sysroot/mips-r2-hard/usr/lib/../lib32{{/|\\\\}}crtn.o"
+
+// -EB -mhard-float -mabi=64
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:        --target=mips64-mti-linux-gnu \
+// RUN:        --gcc-toolchain=%S/Inputs/mips_mti_tree \
+// RUN:        -stdlib=libstdc++ \
+// RUN:        -EB -mhard-float -mabi=64 \
+// RUN:   | FileCheck --check-prefix=EB-HARD-N64 %s
+// EB-HARD-N64: "-internal-isystem"
+// EB-HARD-N64: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.2]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2"
+// EB-HARD-N64: "-internal-isystem"
+// EB-HARD-N64: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/mips-mti-linux-gnu/mips-r2-hard/lib64"
+// EB-HARD-N64: "-internal-isystem"
+// EB-HARD-N64: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/backward"
+// EB-HARD-N64: "-internal-externc-isystem"
+// EB-HARD-N64: "[[TC]]/../../../../sysroot/mips-r2-hard/lib64/../usr/include"
+// EB-HARD-N64: "{{.*}}ld{{(.exe)?}}"
+// EB-HARD-N64: "--sysroot=[[TC]]/../../../../sysroot/mips-r2-hard"
+// EB-HARD-N64: "-dynamic-linker" "/lib64/ld.so.1"
+// EB-HARD-N64: "[[TC]]/../../../../sysroot/mips-r2-hard/usr/lib/../lib64{{/|\\\\}}crt1.o"
+// EB-HARD-N64: "[[TC]]/../../../../sysroot/mips-r2-hard/usr/lib/../lib64{{/|\\\\}}crti.o"
+// EB-HARD-N64: "[[TC]]/mips-r2-hard/lib64{{/|\\\\}}crtbegin.o"
+// EB-HARD-N64: "-L[[TC]]/../../../../mips-mti-linux-gnu/lib/mips-r2-hard/lib64"
+// EB-HARD-N64: "-L[[TC]]/mips-r2-hard/lib64"
+// EB-HARD-N64: "-L[[TC]]/../../../../sysroot/mips-r2-hard/lib/../lib64"
+// EB-HARD-N64: "-L[[TC]]/../../../../sysroot/mips-r2-hard/usr/lib/../lib64"
+// EB-HARD-N64: "[[TC]]/mips-r2-hard/lib64{{/|\\\\}}crtend.o"
+// EB-HARD-N64: "[[TC]]/../../../../sysroot/mips-r2-hard/usr/lib/../lib64{{/|\\\\}}crtn.o"
+
+// -EL -mhard-float -mabi=32
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:        --target=mips-mti-linux-gnu \
+// RUN:        --gcc-toolchain=%S/Inputs/mips_mti_tree \
+// RUN:        -stdlib=libstdc++ \
+// RUN:        -EL -mhard-float -mabi=32 \
+// RUN:   | FileCheck --check-prefix=EL-HARD-O32 %s
+// EL-HARD-O32: "-internal-isystem"
+// EL-HARD-O32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.2]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2"
+// EL-HARD-O32: "-internal-isystem"
+// EL-HARD-O32: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/mips-mti-linux-gnu/mipsel-r2-hard/lib"
+// EL-HARD-O32: "-internal-isystem"
+// EL-HARD-O32: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/backward"
+// EL-HARD-O32: "-internal-externc-isystem"
+// EL-HARD-O32: "[[TC]]/../../../../sysroot/mipsel-r2-hard/lib/../usr/include"
+// EL-HARD-O32: "{{.*}}ld{{(.exe)?}}"
+// EL-HARD-O32: "--sysroot=[[TC]]/../../../../sysroot/mipsel-r2-hard"
+// EL-HARD-O32: "-dynamic-linker" "/lib/ld.so.1"
+// EL-HARD-O32: "[[TC]]/../../../../sysroot/mipsel-r2-hard/usr/lib/../lib{{/|\\\\}}crt1.o"
+// EL-HARD-O32: "[[TC]]/../../../../sysroot/mipsel-r2-hard/usr/lib/../lib{{/|\\\\}}crti.o"
+// EL-HARD-O32: "[[TC]]/mipsel-r2-hard/lib{{/|\\\\}}crtbegin.o"
+// EL-HARD-O32: "-L[[TC]]/../../../../mips-mti-linux-gnu/lib/mipsel-r2-hard/lib"
+// EL-HARD-O32: "-L[[TC]]/mipsel-r2-hard/lib"
+// EL-HARD-O32: "-L[[TC]]/../../../../sysroot/mipsel-r2-hard/lib/../lib"
+// EL-HARD-O32: "-L[[TC]]/../../../../sysroot/mipsel-r2-hard/usr/lib/../lib"
+// EL-HARD-O32: "[[TC]]/mipsel-r2-hard/lib{{/|\\\\}}crtend.o"
+// EL-HARD-O32: "[[TC]]/../../../../sysroot/mipsel-r2-hard/usr/lib/../lib{{/|\\\\}}crtn.o"
+
+// -EL -mhard-float -mabi=n32
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:        --target=mips-mti-linux-gnu \
+// RUN:        --gcc-toolchain=%S/Inputs/mips_mti_tree \
+// RUN:        -stdlib=libstdc++ \
+// RUN:        -EL -mhard-float -mabi=n32 \
+// RUN:   | FileCheck --check-prefix=EL-HARD-N32 %s
+// EL-HARD-N32: "-internal-isystem"
+// EL-HARD-N32: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.2]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2"
+// EL-HARD-N32: "-internal-isystem"
+// EL-HARD-N32: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/mips-mti-linux-gnu/mipsel-r2-hard/lib32"
+// EL-HARD-N32: "-internal-isystem"
+// EL-HARD-N32: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/backward"
+// EL-HARD-N32: "-internal-externc-isystem"
+// EL-HARD-N32: "[[TC]]/../../../../sysroot/mipsel-r2-hard/lib32/../usr/include"
+// EL-HARD-N32: "{{.*}}ld{{(.exe)?}}"
+// EL-HARD-N32: "--sysroot=[[TC]]/../../../../sysroot/mipsel-r2-hard"
+// EL-HARD-N32: "-dynamic-linker" "/lib32/ld.so.1"
+// EL-HARD-N32: "[[TC]]/../../../../sysroot/mipsel-r2-hard/usr/lib/../lib32{{/|\\\\}}crt1.o"
+// EL-HARD-N32: "[[TC]]/../../../../sysroot/mipsel-r2-hard/usr/lib/../lib32{{/|\\\\}}crti.o"
+// EL-HARD-N32: "[[TC]]/mipsel-r2-hard/lib32{{/|\\\\}}crtbegin.o"
+// EL-HARD-N32: "-L[[TC]]/../../../../mips-mti-linux-gnu/lib/mipsel-r2-hard/lib32"
+// EL-HARD-N32: "-L[[TC]]/mipsel-r2-hard/lib32"
+// EL-HARD-N32: "-L[[TC]]/../../../../sysroot/mipsel-r2-hard/lib/../lib32"
+// EL-HARD-N32: "-L[[TC]]/../../../../sysroot/mipsel-r2-hard/usr/lib/../lib32"
+// EL-HARD-N32: "[[TC]]/mipsel-r2-hard/lib32{{/|\\\\}}crtend.o"
+// EL-HARD-N32: "[[TC]]/../../../../sysroot/mipsel-r2-hard/usr/lib/../lib32{{/|\\\\}}crtn.o"
+
+// -EL -mhard-float -mabi=64
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:        --target=mips64-mti-linux-gnu \
+// RUN:        --gcc-toolchain=%S/Inputs/mips_mti_tree \
+// RUN:        -stdlib=libstdc++ \
+// RUN:        -EL -mhard-float -mabi=64 \
+// RUN:   | FileCheck --check-prefix=EL-HARD-N64 %s
+// EL-HARD-N64: "-internal-isystem"
+// EL-HARD-N64: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.2]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2"
+// EL-HARD-N64: "-internal-isystem"
+// EL-HARD-N64: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/mips-mti-linux-gnu/mipsel-r2-hard/lib64"
+// EL-HARD-N64: "-internal-isystem"
+// EL-HARD-N64: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/backward"
+// EL-HARD-N64: "-internal-externc-isystem"
+// EL-HARD-N64: "[[TC]]/../../../../sysroot/mipsel-r2-hard/lib64/../usr/include"
+// EL-HARD-N64: "{{.*}}ld{{(.exe)?}}"
+// EL-HARD-N64: "--sysroot=[[TC]]/../../../../sysroot/mipsel-r2-hard"
+// EL-HARD-N64: "-dynamic-linker" "/lib64/ld.so.1"
+// EL-HARD-N64: "[[TC]]/../../../../sysroot/mipsel-r2-hard/usr/lib/../lib64{{/|\\\\}}crt1.o"
+// EL-HARD-N64: "[[TC]]/../../../../sysroot/mipsel-r2-hard/usr/lib/../lib64{{/|\\\\}}crti.o"
+// EL-HARD-N64: "[[TC]]/mipsel-r2-hard/lib64{{/|\\\\}}crtbegin.o"
+// EL-HARD-N64: "-L[[TC]]/../../../../mips-mti-linux-gnu/lib/mipsel-r2-hard/lib64"
+// EL-HARD-N64: "-L[[TC]]/mipsel-r2-hard/lib64"
+// EL-HARD-N64: "-L[[TC]]/../../../../sysroot/mipsel-r2-hard/lib/../lib64"
+// EL-HARD-N64: "-L[[TC]]/../../../../sysroot/mipsel-r2-hard/usr/lib/../lib64"
+// EL-HARD-N64: "[[TC]]/mipsel-r2-hard/lib64{{/|\\\\}}crtend.o"
+// EL-HARD-N64: "[[TC]]/../../../../sysroot/mipsel-r2-hard/usr/lib/../lib64{{/|\\\\}}crtn.o"
+
+// -EB -msoft-float
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:        --target=mips-mti-linux-gnu \
+// RUN:        --gcc-toolchain=%S/Inputs/mips_mti_tree \
+// RUN:        -stdlib=libstdc++ \
+// RUN:        -EB -msoft-float \
+// RUN:   | FileCheck --check-prefix=EB-SOFT %s
+// EB-SOFT: "-internal-isystem"
+// EB-SOFT: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.2]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2"
+// EB-SOFT: "-internal-isystem"
+// EB-SOFT: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/mips-mti-linux-gnu/mips-r2-soft/lib"
+// EB-SOFT: "-internal-isystem"
+// EB-SOFT: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/backward"
+// EB-SOFT: "-internal-externc-isystem"
+// EB-SOFT: "[[TC]]/../../../../sysroot/mips-r2-soft/lib/../usr/include"
+// EB-SOFT: "{{.*}}ld{{(.exe)?}}"
+// EB-SOFT: "--sysroot=[[TC]]/../../../../sysroot/mips-r2-soft"
+// EB-SOFT: "-dynamic-linker" "/lib/ld.so.1"
+// EB-SOFT: "[[TC]]/../../../../sysroot/mips-r2-soft/usr/lib/../lib{{/|\\\\}}crt1.o"
+// EB-SOFT: "[[TC]]/../../../../sysroot/mips-r2-soft/usr/lib/../lib{{/|\\\\}}crti.o"
+// EB-SOFT: "[[TC]]/mips-r2-soft/lib{{/|\\\\}}crtbegin.o"
+// EB-SOFT: "-L[[TC]]/../../../../mips-mti-linux-gnu/lib/mips-r2-soft/lib"
+// EB-SOFT: "-L[[TC]]/mips-r2-soft/lib"
+// EB-SOFT: "-L[[TC]]/../../../../sysroot/mips-r2-soft/lib/../lib"
+// EB-SOFT: "-L[[TC]]/../../../../sysroot/mips-r2-soft/usr/lib/../lib"
+// EB-SOFT: "[[TC]]/mips-r2-soft/lib{{/|\\\\}}crtend.o"
+// EB-SOFT: "[[TC]]/../../../../sysroot/mips-r2-soft/usr/lib/../lib{{/|\\\\}}crtn.o"
+
+// -EL -msoft-float
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:        --target=mips-mti-linux-gnu \
+// RUN:        --gcc-toolchain=%S/Inputs/mips_mti_tree \
+// RUN:        -stdlib=libstdc++ \
+// RUN:        -EL -msoft-float \
+// RUN:   | FileCheck --check-prefix=EL-SOFT %s
+// EL-SOFT: "-internal-isystem"
+// EL-SOFT: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.2]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2"
+// EL-SOFT: "-internal-isystem"
+// EL-SOFT: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/mips-mti-linux-gnu/mipsel-r2-soft/lib"
+// EL-SOFT: "-internal-isystem"
+// EL-SOFT: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/backward"
+// EL-SOFT: "-internal-externc-isystem"
+// EL-SOFT: "[[TC]]/../../../../sysroot/mipsel-r2-soft/lib/../usr/include"
+// EL-SOFT: "{{.*}}ld{{(.exe)?}}"
+// EL-SOFT: "--sysroot=[[TC]]/../../../../sysroot/mipsel-r2-soft"
+// EL-SOFT: "-dynamic-linker" "/lib/ld.so.1"
+// EL-SOFT: "[[TC]]/../../../../sysroot/mipsel-r2-soft/usr/lib/../lib{{/|\\\\}}crt1.o"
+// EL-SOFT: "[[TC]]/../../../../sysroot/mipsel-r2-soft/usr/lib/../lib{{/|\\\\}}crti.o"
+// EL-SOFT: "[[TC]]/mipsel-r2-soft/lib{{/|\\\\}}crtbegin.o"
+// EL-SOFT: "-L[[TC]]/../../../../mips-mti-linux-gnu/lib/mipsel-r2-soft/lib"
+// EL-SOFT: "-L[[TC]]/mipsel-r2-soft/lib"
+// EL-SOFT: "-L[[TC]]/../../../../sysroot/mipsel-r2-soft/lib/../lib"
+// EL-SOFT: "-L[[TC]]/../../../../sysroot/mipsel-r2-soft/usr/lib/../lib"
+// EL-SOFT: "[[TC]]/mipsel-r2-soft/lib{{/|\\\\}}crtend.o"
+// EL-SOFT: "[[TC]]/../../../../sysroot/mipsel-r2-soft/usr/lib/../lib{{/|\\\\}}crtn.o"
+
+// -EB -mhard-float -muclibc
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:        --target=mips-mti-linux-gnu \
+// RUN:        --gcc-toolchain=%S/Inputs/mips_mti_tree \
+// RUN:        -stdlib=libstdc++ \
+// RUN:        -EB -mhard-float -muclibc \
+// RUN:   | FileCheck --check-prefix=EB-HARD-UCLIBC %s
+// EB-HARD-UCLIBC: "-internal-isystem"
+// EB-HARD-UCLIBC: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.2]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2"
+// EB-HARD-UCLIBC: "-internal-isystem"
+// EB-HARD-UCLIBC: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/mips-mti-linux-gnu/mips-r2-hard-uclibc/lib"
+// EB-HARD-UCLIBC: "-internal-isystem"
+// EB-HARD-UCLIBC: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/backward"
+// EB-HARD-UCLIBC: "-internal-externc-isystem"
+// EB-HARD-UCLIBC: "[[TC]]/../../../../sysroot/mips-r2-hard-uclibc/lib/../usr/include"
+// EB-HARD-UCLIBC: "{{.*}}ld{{(.exe)?}}"
+// EB-HARD-UCLIBC: "--sysroot=[[TC]]/../../../../sysroot/mips-r2-hard-uclibc"
+// EB-HARD-UCLIBC: "-dynamic-linker" "/lib/ld-uClibc.so.0"
+// EB-HARD-UCLIBC: "[[TC]]/../../../../sysroot/mips-r2-hard-uclibc/usr/lib/../lib{{/|\\\\}}crt1.o"
+// EB-HARD-UCLIBC: "[[TC]]/../../../../sysroot/mips-r2-hard-uclibc/usr/lib/../lib{{/|\\\\}}crti.o"
+// EB-HARD-UCLIBC: "[[TC]]/mips-r2-hard-uclibc/lib{{/|\\\\}}crtbegin.o"
+// EB-HARD-UCLIBC: "-L[[TC]]/../../../../mips-mti-linux-gnu/lib/mips-r2-hard-uclibc/lib"
+// EB-HARD-UCLIBC: "-L[[TC]]/mips-r2-hard-uclibc/lib"
+// EB-HARD-UCLIBC: "-L[[TC]]/../../../../sysroot/mips-r2-hard-uclibc/lib/../lib"
+// EB-HARD-UCLIBC: "-L[[TC]]/../../../../sysroot/mips-r2-hard-uclibc/usr/lib/../lib"
+// EB-HARD-UCLIBC: "[[TC]]/mips-r2-hard-uclibc/lib{{/|\\\\}}crtend.o"
+// EB-HARD-UCLIBC: "[[TC]]/../../../../sysroot/mips-r2-hard-uclibc/usr/lib/../lib{{/|\\\\}}crtn.o"
+
+// -EL -mhard-float -muclibc
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:        --target=mips-mti-linux-gnu \
+// RUN:        --gcc-toolchain=%S/Inputs/mips_mti_tree \
+// RUN:        -stdlib=libstdc++ \
+// RUN:        -EL -mhard-float -muclibc \
+// RUN:   | FileCheck --check-prefix=EL-HARD-UCLIBC %s
+// EL-HARD-UCLIBC: "-internal-isystem"
+// EL-HARD-UCLIBC: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.2]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2"
+// EL-HARD-UCLIBC: "-internal-isystem"
+// EL-HARD-UCLIBC: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/mips-mti-linux-gnu/mipsel-r2-hard-uclibc/lib"
+// EL-HARD-UCLIBC: "-internal-isystem"
+// EL-HARD-UCLIBC: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/backward"
+// EL-HARD-UCLIBC: "-internal-externc-isystem"
+// EL-HARD-UCLIBC: "[[TC]]/../../../../sysroot/mipsel-r2-hard-uclibc/lib/../usr/include"
+// EL-HARD-UCLIBC: "{{.*}}ld{{(.exe)?}}"
+// EL-HARD-UCLIBC: "--sysroot=[[TC]]/../../../../sysroot/mipsel-r2-hard-uclibc"
+// EL-HARD-UCLIBC: "-dynamic-linker" "/lib/ld-uClibc.so.0"
+// EL-HARD-UCLIBC: "[[TC]]/../../../../sysroot/mipsel-r2-hard-uclibc/usr/lib/../lib{{/|\\\\}}crt1.o"
+// EL-HARD-UCLIBC: "[[TC]]/../../../../sysroot/mipsel-r2-hard-uclibc/usr/lib/../lib{{/|\\\\}}crti.o"
+// EL-HARD-UCLIBC: "[[TC]]/mipsel-r2-hard-uclibc/lib{{/|\\\\}}crtbegin.o"
+// EL-HARD-UCLIBC: "-L[[TC]]/../../../../mips-mti-linux-gnu/lib/mipsel-r2-hard-uclibc/lib"
+// EL-HARD-UCLIBC: "-L[[TC]]/mipsel-r2-hard-uclibc/lib"
+// EL-HARD-UCLIBC: "-L[[TC]]/../../../../sysroot/mipsel-r2-hard-uclibc/lib/../lib"
+// EL-HARD-UCLIBC: "-L[[TC]]/../../../../sysroot/mipsel-r2-hard-uclibc/usr/lib/../lib"
+// EL-HARD-UCLIBC: "[[TC]]/mipsel-r2-hard-uclibc/lib{{/|\\\\}}crtend.o"
+// EL-HARD-UCLIBC: "[[TC]]/../../../../sysroot/mipsel-r2-hard-uclibc/usr/lib/../lib{{/|\\\\}}crtn.o"
+
+// -EB -mhard-float -mnan=2008
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:        --target=mips-mti-linux-gnu \
+// RUN:        --gcc-toolchain=%S/Inputs/mips_mti_tree \
+// RUN:        -stdlib=libstdc++ \
+// RUN:        -EB -mhard-float -mnan=2008 \
+// RUN:   | FileCheck --check-prefix=EB-HARD-NAN2008 %s
+// EB-HARD-NAN2008: "-internal-isystem"
+// EB-HARD-NAN2008: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.2]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2"
+// EB-HARD-NAN2008: "-internal-isystem"
+// EB-HARD-NAN2008: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/mips-mti-linux-gnu/mips-r2-hard-nan2008/lib"
+// EB-HARD-NAN2008: "-internal-isystem"
+// EB-HARD-NAN2008: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/backward"
+// EB-HARD-NAN2008: "-internal-externc-isystem"
+// EB-HARD-NAN2008: "[[TC]]/../../../../sysroot/mips-r2-hard-nan2008/lib/../usr/include"
+// EB-HARD-NAN2008: "{{.*}}ld{{(.exe)?}}"
+// EB-HARD-NAN2008: "--sysroot=[[TC]]/../../../../sysroot/mips-r2-hard-nan2008"
+// EB-HARD-NAN2008: "-dynamic-linker" "/lib/ld-linux-mipsn8.so.1"
+// EB-HARD-NAN2008: "[[TC]]/../../../../sysroot/mips-r2-hard-nan2008/usr/lib/../lib{{/|\\\\}}crt1.o"
+// EB-HARD-NAN2008: "[[TC]]/../../../../sysroot/mips-r2-hard-nan2008/usr/lib/../lib{{/|\\\\}}crti.o"
+// EB-HARD-NAN2008: "[[TC]]/mips-r2-hard-nan2008/lib{{/|\\\\}}crtbegin.o"
+// EB-HARD-NAN2008: "-L[[TC]]/../../../../mips-mti-linux-gnu/lib/mips-r2-hard-nan2008/lib"
+// EB-HARD-NAN2008: "-L[[TC]]/mips-r2-hard-nan2008/lib"
+// EB-HARD-NAN2008: "-L[[TC]]/../../../../sysroot/mips-r2-hard-nan2008/lib/../lib"
+// EB-HARD-NAN2008: "-L[[TC]]/../../../../sysroot/mips-r2-hard-nan2008/usr/lib/../lib"
+// EB-HARD-NAN2008: "[[TC]]/mips-r2-hard-nan2008/lib{{/|\\\\}}crtend.o"
+// EB-HARD-NAN2008: "[[TC]]/../../../../sysroot/mips-r2-hard-nan2008/usr/lib/../lib{{/|\\\\}}crtn.o"
+
+// -EL -mhard-float -mnan=2008
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:        --target=mips-mti-linux-gnu \
+// RUN:        --gcc-toolchain=%S/Inputs/mips_mti_tree \
+// RUN:        -stdlib=libstdc++ \
+// RUN:        -EL -mhard-float -mnan=2008 \
+// RUN:   | FileCheck --check-prefix=EL-HARD-NAN2008 %s
+// EL-HARD-NAN2008: "-internal-isystem"
+// EL-HARD-NAN2008: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.2]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2"
+// EL-HARD-NAN2008: "-internal-isystem"
+// EL-HARD-NAN2008: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/mips-mti-linux-gnu/mipsel-r2-hard-nan2008/lib"
+// EL-HARD-NAN2008: "-internal-isystem"
+// EL-HARD-NAN2008: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/backward"
+// EL-HARD-NAN2008: "-internal-externc-isystem"
+// EL-HARD-NAN2008: "[[TC]]/../../../../sysroot/mipsel-r2-hard-nan2008/lib/../usr/include"
+// EL-HARD-NAN2008: "{{.*}}ld{{(.exe)?}}"
+// EL-HARD-NAN2008: "--sysroot=[[TC]]/../../../../sysroot/mipsel-r2-hard-nan2008"
+// EL-HARD-NAN2008: "-dynamic-linker" "/lib/ld-linux-mipsn8.so.1"
+// EL-HARD-NAN2008: "[[TC]]/../../../../sysroot/mipsel-r2-hard-nan2008/usr/lib/../lib{{/|\\\\}}crt1.o"
+// EL-HARD-NAN2008: "[[TC]]/../../../../sysroot/mipsel-r2-hard-nan2008/usr/lib/../lib{{/|\\\\}}crti.o"
+// EL-HARD-NAN2008: "[[TC]]/mipsel-r2-hard-nan2008/lib{{/|\\\\}}crtbegin.o"
+// EL-HARD-NAN2008: "-L[[TC]]/../../../../mips-mti-linux-gnu/lib/mipsel-r2-hard-nan2008/lib"
+// EL-HARD-NAN2008: "-L[[TC]]/mipsel-r2-hard-nan2008/lib"
+// EL-HARD-NAN2008: "-L[[TC]]/../../../../sysroot/mipsel-r2-hard-nan2008/lib/../lib"
+// EL-HARD-NAN2008: "-L[[TC]]/../../../../sysroot/mipsel-r2-hard-nan2008/usr/lib/../lib"
+// EL-HARD-NAN2008: "[[TC]]/mipsel-r2-hard-nan2008/lib{{/|\\\\}}crtend.o"
+// EL-HARD-NAN2008: "[[TC]]/../../../../sysroot/mipsel-r2-hard-nan2008/usr/lib/../lib{{/|\\\\}}crtn.o"
+
+// -EB -mhard-float -muclibc -mnan=2008
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:        --target=mips-mti-linux-gnu \
+// RUN:        --gcc-toolchain=%S/Inputs/mips_mti_tree \
+// RUN:        -stdlib=libstdc++ \
+// RUN:        -EB -mhard-float -muclibc -mnan=2008 \
+// RUN:   | FileCheck --check-prefix=EB-HARD-UCLIBC-NAN2008 %s
+// EB-HARD-UCLIBC-NAN2008: "-internal-isystem"
+// EB-HARD-UCLIBC-NAN2008: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.2]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2"
+// EB-HARD-UCLIBC-NAN2008: "-internal-isystem"
+// EB-HARD-UCLIBC-NAN2008: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/mips-mti-linux-gnu/mips-r2-hard-nan2008-uclibc/lib"
+// EB-HARD-UCLIBC-NAN2008: "-internal-isystem"
+// EB-HARD-UCLIBC-NAN2008: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/backward"
+// EB-HARD-UCLIBC-NAN2008: "-internal-externc-isystem"
+// EB-HARD-UCLIBC-NAN2008: "[[TC]]/../../../../sysroot/mips-r2-hard-nan2008-uclibc/lib/../usr/include"
+// EB-HARD-UCLIBC-NAN2008: "{{.*}}ld{{(.exe)?}}"
+// EB-HARD-UCLIBC-NAN2008: "--sysroot=[[TC]]/../../../../sysroot/mips-r2-hard-nan2008-uclibc"
+// EB-HARD-UCLIBC-NAN2008: "-dynamic-linker" "/lib/ld-uClibc-mipsn8.so.0"
+// EB-HARD-UCLIBC-NAN2008: "[[TC]]/../../../../sysroot/mips-r2-hard-nan2008-uclibc/usr/lib/../lib{{/|\\\\}}crt1.o"
+// EB-HARD-UCLIBC-NAN2008: "[[TC]]/../../../../sysroot/mips-r2-hard-nan2008-uclibc/usr/lib/../lib{{/|\\\\}}crti.o"
+// EB-HARD-UCLIBC-NAN2008: "[[TC]]/mips-r2-hard-nan2008-uclibc/lib{{/|\\\\}}crtbegin.o"
+// EB-HARD-UCLIBC-NAN2008: "-L[[TC]]/../../../../mips-mti-linux-gnu/lib/mips-r2-hard-nan2008-uclibc/lib"
+// EB-HARD-UCLIBC-NAN2008: "-L[[TC]]/mips-r2-hard-nan2008-uclibc/lib"
+// EB-HARD-UCLIBC-NAN2008: "-L[[TC]]/../../../../sysroot/mips-r2-hard-nan2008-uclibc/lib/../lib"
+// EB-HARD-UCLIBC-NAN2008: "-L[[TC]]/../../../../sysroot/mips-r2-hard-nan2008-uclibc/usr/lib/../lib"
+// EB-HARD-UCLIBC-NAN2008: "[[TC]]/mips-r2-hard-nan2008-uclibc/lib{{/|\\\\}}crtend.o"
+// EB-HARD-UCLIBC-NAN2008: "[[TC]]/../../../../sysroot/mips-r2-hard-nan2008-uclibc/usr/lib/../lib{{/|\\\\}}crtn.o"
+
+// -EL -mhard-float -muclibc -mnan=2008
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:        --target=mips-mti-linux-gnu \
+// RUN:        --gcc-toolchain=%S/Inputs/mips_mti_tree \
+// RUN:        -stdlib=libstdc++ \
+// RUN:        -EL -mhard-float -muclibc -mnan=2008 \
+// RUN:   | FileCheck --check-prefix=EL-HARD-UCLIBC-NAN2008 %s
+// EL-HARD-UCLIBC-NAN2008: "-internal-isystem"
+// EL-HARD-UCLIBC-NAN2008: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.2]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2"
+// EL-HARD-UCLIBC-NAN2008: "-internal-isystem"
+// EL-HARD-UCLIBC-NAN2008: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/mips-mti-linux-gnu/mipsel-r2-hard-nan2008-uclibc/lib"
+// EL-HARD-UCLIBC-NAN2008: "-internal-isystem"
+// EL-HARD-UCLIBC-NAN2008: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/backward"
+// EL-HARD-UCLIBC-NAN2008: "-internal-externc-isystem"
+// EL-HARD-UCLIBC-NAN2008: "[[TC]]/../../../../sysroot/mipsel-r2-hard-nan2008-uclibc/lib/../usr/include"
+// EL-HARD-UCLIBC-NAN2008: "{{.*}}ld{{(.exe)?}}"
+// EL-HARD-UCLIBC-NAN2008: "--sysroot=[[TC]]/../../../../sysroot/mipsel-r2-hard-nan2008-uclibc"
+// EL-HARD-UCLIBC-NAN2008: "-dynamic-linker" "/lib/ld-uClibc-mipsn8.so.0"
+// EL-HARD-UCLIBC-NAN2008: "[[TC]]/../../../../sysroot/mipsel-r2-hard-nan2008-uclibc/usr/lib/../lib{{/|\\\\}}crt1.o"
+// EL-HARD-UCLIBC-NAN2008: "[[TC]]/../../../../sysroot/mipsel-r2-hard-nan2008-uclibc/usr/lib/../lib{{/|\\\\}}crti.o"
+// EL-HARD-UCLIBC-NAN2008: "[[TC]]/mipsel-r2-hard-nan2008-uclibc/lib{{/|\\\\}}crtbegin.o"
+// EL-HARD-UCLIBC-NAN2008: "-L[[TC]]/../../../../mips-mti-linux-gnu/lib/mipsel-r2-hard-nan2008-uclibc/lib"
+// EL-HARD-UCLIBC-NAN2008: "-L[[TC]]/mipsel-r2-hard-nan2008-uclibc/lib"
+// EL-HARD-UCLIBC-NAN2008: "-L[[TC]]/../../../../sysroot/mipsel-r2-hard-nan2008-uclibc/lib/../lib"
+// EL-HARD-UCLIBC-NAN2008: "-L[[TC]]/../../../../sysroot/mipsel-r2-hard-nan2008-uclibc/usr/lib/../lib"
+// EL-HARD-UCLIBC-NAN2008: "[[TC]]/mipsel-r2-hard-nan2008-uclibc/lib{{/|\\\\}}crtend.o"
+// EL-HARD-UCLIBC-NAN2008: "[[TC]]/../../../../sysroot/mipsel-r2-hard-nan2008-uclibc/usr/lib/../lib{{/|\\\\}}crtn.o"
+
+// -EL -msoft-float -mmicromips
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:        --target=mips-mti-linux-gnu \
+// RUN:        --gcc-toolchain=%S/Inputs/mips_mti_tree \
+// RUN:        -stdlib=libstdc++ \
+// RUN:        -EL -msoft-float -mmicromips \
+// RUN:   | FileCheck --check-prefix=EL-SOFT-MICRO %s
+// EL-SOFT-MICRO: "-internal-isystem"
+// EL-SOFT-MICRO: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.2]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2"
+// EL-SOFT-MICRO: "-internal-isystem"
+// EL-SOFT-MICRO: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/mips-mti-linux-gnu/micromipsel-r2-soft/lib"
+// EL-SOFT-MICRO: "-internal-isystem"
+// EL-SOFT-MICRO: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/backward"
+// EL-SOFT-MICRO: "-internal-externc-isystem"
+// EL-SOFT-MICRO: "[[TC]]/../../../../sysroot/micromipsel-r2-soft/lib/../usr/include"
+// EL-SOFT-MICRO: "{{.*}}ld{{(.exe)?}}"
+// EL-SOFT-MICRO: "--sysroot=[[TC]]/../../../../sysroot/micromipsel-r2-soft"
+// EL-SOFT-MICRO: "-dynamic-linker" "/lib/ld.so.1"
+// EL-SOFT-MICRO: "[[TC]]/../../../../sysroot/micromipsel-r2-soft/usr/lib/../lib{{/|\\\\}}crt1.o"
+// EL-SOFT-MICRO: "[[TC]]/../../../../sysroot/micromipsel-r2-soft/usr/lib/../lib{{/|\\\\}}crti.o"
+// EL-SOFT-MICRO: "[[TC]]/micromipsel-r2-soft/lib{{/|\\\\}}crtbegin.o"
+// EL-SOFT-MICRO: "-L[[TC]]/../../../../mips-mti-linux-gnu/lib/micromipsel-r2-soft/lib"
+// EL-SOFT-MICRO: "-L[[TC]]/micromipsel-r2-soft/lib"
+// EL-SOFT-MICRO: "-L[[TC]]/../../../../sysroot/micromipsel-r2-soft/lib/../lib"
+// EL-SOFT-MICRO: "-L[[TC]]/../../../../sysroot/micromipsel-r2-soft/usr/lib/../lib"
+// EL-SOFT-MICRO: "[[TC]]/micromipsel-r2-soft/lib{{/|\\\\}}crtend.o"
+// EL-SOFT-MICRO: "[[TC]]/../../../../sysroot/micromipsel-r2-soft/usr/lib/../lib{{/|\\\\}}crtn.o"
+
+// -EL -mhard-float -mmicromips -mnan=2008
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:        --target=mips-mti-linux-gnu \
+// RUN:        --gcc-toolchain=%S/Inputs/mips_mti_tree \
+// RUN:        -stdlib=libstdc++ \
+// RUN:        -EL -mhard-float -mmicromips -mnan=2008 \
+// RUN:   | FileCheck --check-prefix=EL-SOFT-MICRO-NAN2008 %s
+// EL-SOFT-MICRO-NAN2008: "-internal-isystem"
+// EL-SOFT-MICRO-NAN2008: "[[TC:[^"]+/lib/gcc/mips-mti-linux-gnu/4.9.2]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2"
+// EL-SOFT-MICRO-NAN2008: "-internal-isystem"
+// EL-SOFT-MICRO-NAN2008: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/mips-mti-linux-gnu/micromipsel-r2-hard-nan2008/lib"
+// EL-SOFT-MICRO-NAN2008: "-internal-isystem"
+// EL-SOFT-MICRO-NAN2008: "[[TC]]/../../../../mips-mti-linux-gnu/include/c++/4.9.2/backward"
+// EL-SOFT-MICRO-NAN2008: "-internal-externc-isystem"
+// EL-SOFT-MICRO-NAN2008: "[[TC]]/../../../../sysroot/micromipsel-r2-hard-nan2008/lib/../usr/include"
+// EL-SOFT-MICRO-NAN2008: "{{.*}}ld{{(.exe)?}}"
+// EL-SOFT-MICRO-NAN2008: "--sysroot=[[TC]]/../../../../sysroot/micromipsel-r2-hard-nan2008"
+// EL-SOFT-MICRO-NAN2008: "-dynamic-linker" "/lib/ld-linux-mipsn8.so.1"
+// EL-SOFT-MICRO-NAN2008: "[[TC]]/../../../../sysroot/micromipsel-r2-hard-nan2008/usr/lib/../lib{{/|\\\\}}crt1.o"
+// EL-SOFT-MICRO-NAN2008: "[[TC]]/../../../../sysroot/micromipsel-r2-hard-nan2008/usr/lib/../lib{{/|\\\\}}crti.o"
+// EL-SOFT-MICRO-NAN2008: "[[TC]]/micromipsel-r2-hard-nan2008/lib{{/|\\\\}}crtbegin.o"
+// EL-SOFT-MICRO-NAN2008: "-L[[TC]]/../../../../mips-mti-linux-gnu/lib/micromipsel-r2-hard-nan2008/lib"
+// EL-SOFT-MICRO-NAN2008: "-L[[TC]]/micromipsel-r2-hard-nan2008/lib"
+// EL-SOFT-MICRO-NAN2008: "-L[[TC]]/../../../../sysroot/micromipsel-r2-hard-nan2008/lib/../lib"
+// EL-SOFT-MICRO-NAN2008: "-L[[TC]]/../../../../sysroot/micromipsel-r2-hard-nan2008/usr/lib/../lib"
+// EL-SOFT-MICRO-NAN2008: "[[TC]]/micromipsel-r2-hard-nan2008/lib{{/|\\\\}}crtend.o"
+// EL-SOFT-MICRO-NAN2008: "[[TC]]/../../../../sysroot/micromipsel-r2-hard-nan2008/usr/lib/../lib{{/|\\\\}}crtn.o"
diff --git a/test/Driver/msc-version.c b/test/Driver/msc-version.c
index 18fe731eeba8e..924633508bcd3 100644
--- a/test/Driver/msc-version.c
+++ b/test/Driver/msc-version.c
@@ -1,15 +1,4 @@
 //
-// Verify defaults
-//
-
-// RUN: %clang -target i686-windows -fms-compatibility -dM -E - </dev/null -o - | FileCheck %s -check-prefix CHECK-NO-MSC-VERSION
-
-// CHECK-NO-MSC-VERSION: _MSC_BUILD 1
-// CHECK-NO-MSC-VERSION: _MSC_FULL_VER 180000000
-// CHECK-NO-MSC-VERSION: _MSC_VER 1800
-
-
-//
 // Verify -fms-compatibility-version parsing
 //
 
diff --git a/test/Driver/msvc-compiler-rt.c b/test/Driver/msvc-compiler-rt.c
new file mode 100644
index 0000000000000..abbca507ae7f4
--- /dev/null
+++ b/test/Driver/msvc-compiler-rt.c
@@ -0,0 +1,5 @@
+// RUN: %clang -target x86_64-pc-windows-msvc --rtlib=compiler-rt -### %s 2>&1 | FileCheck %s -check-prefix MSVC-COMPILER-RT
+// RUN: not %clang %s -target x86_64-pc-windows-msvc --rtlib=libgcc 2>&1 | FileCheck %s -check-prefix CHECK-ERROR
+
+// MSVC-COMPILER-RT: "{{.*}}clang_rt.builtins{{.*}}"
+// CHECK-ERROR: unsupported runtime library 'libgcc' for platform 'MSVC'
diff --git a/test/Driver/msvc-link.c b/test/Driver/msvc-link.c
index b44e3826199ca..8fe57331ee478 100644
--- a/test/Driver/msvc-link.c
+++ b/test/Driver/msvc-link.c
@@ -10,3 +10,9 @@
 // DLL: "-defaultlib:libcmt"
 // DLL: "-nologo"
 // DLL: "-dll"
+
+// RUN: %clang -target i686-pc-windows-msvc -L/var/empty -L/usr/lib -### %s 2>&1 | FileCheck --check-prefix LIBPATH %s
+// LIBPATH: "-libpath:/var/empty"
+// LIBPATH: "-libpath:/usr/lib"
+// LIBPATH: "-nologo"
+
diff --git a/test/Driver/msvc-triple.c b/test/Driver/msvc-triple.c
index f181b3199a3f3..cb0c338f84155 100644
--- a/test/Driver/msvc-triple.c
+++ b/test/Driver/msvc-triple.c
@@ -1,9 +1,7 @@
-// RUN: %clang -target i686-pc-windows-msvc   -S -emit-llvm %s -o - | FileCheck %s --check-prefix=DEFAULT
 // RUN: %clang -target i686-pc-windows-msvc19 -S -emit-llvm %s -o - | FileCheck %s --check-prefix=TARGET-19
 // RUN: %clang -target i686-pc-windows-msvc   -S -emit-llvm %s -o - -fms-compatibility-version=19 | FileCheck %s --check-prefix=OVERRIDE-19
 // RUN: %clang -target i686-pc-windows-msvc-elf -S -emit-llvm %s -o - | FileCheck %s --check-prefix=ELF-DEFAULT
 
-// DEFAULT:     target triple = "i686-pc-windows-msvc18.0.0"
 // TARGET-19:   target triple = "i686-pc-windows-msvc19.0.0"
 // OVERRIDE-19: target triple = "i686-pc-windows-msvc19.0.0"
-// ELF-DEFAULT: target triple = "i686-pc-windows-msvc18.0.0-elf"
+// ELF-DEFAULT: target triple = "i686-pc-windows-msvc{{.*}}-elf"
diff --git a/test/Driver/myriad-toolchain.c b/test/Driver/myriad-toolchain.c
index 6c94cff6801c9..3e580ac77e313 100644
--- a/test/Driver/myriad-toolchain.c
+++ b/test/Driver/myriad-toolchain.c
@@ -36,10 +36,10 @@
 // As such, we test only for a trailing quote in its rendering.
 // The same goes for "moviAsm".
 
-// RUN: %clang -target shave-myriad -c -### %s -isystem somewhere -Icommon -Wa,-yippee 2>&1 \
+// RUN: %clang -target shave-myriad -mcpu=myriad2.2 -c -### %s -isystem somewhere -Icommon -Wa,-yippee 2>&1 \
 // RUN:   | FileCheck %s -check-prefix=MOVICOMPILE
-// MOVICOMPILE: moviCompile{{(.exe)?}}" "-S" "-fno-exceptions" "-mcpu=myriad2" "-DMYRIAD2" "-isystem" "somewhere" "-I" "common"
-// MOVICOMPILE: moviAsm{{(.exe)?}}" "-no6thSlotCompression" "-cv:myriad2" "-noSPrefixing" "-a"
+// MOVICOMPILE: moviCompile{{(.exe)?}}" "-S" "-fno-exceptions" "-DMYRIAD2" "-mcpu=myriad2.2" "-isystem" "somewhere" "-I" "common"
+// MOVICOMPILE: moviAsm{{(.exe)?}}" "-no6thSlotCompression" "-cv:myriad2.2" "-noSPrefixing" "-a"
 // MOVICOMPILE: "-yippee" "-i:somewhere" "-i:common" "-elf"
 
 // RUN: %clang -target shave-myriad -c -### %s -DEFINE_ME -UNDEFINE_ME 2>&1 \
@@ -58,15 +58,15 @@
 
 // RUN: %clang -target shave-myriad -c %s -o foo.o -### -MD -MF dep.d 2>&1 \
 // RUN:   | FileCheck %s -check-prefix=MDMF
-// MDMF: "-S" "-fno-exceptions" "-mcpu=myriad2" "-DMYRIAD2" "-MD" "-MF" "dep.d" "-MT" "foo.o"
+// MDMF: "-S" "-fno-exceptions" "-DMYRIAD2" "-MD" "-MF" "dep.d" "-MT" "foo.o"
 
-// RUN: %clang -target shave-myriad -std=gnu++11 -S %s -o foo.o -### 2>&1 \
+// RUN: %clang -target shave-myriad -std=gnu++11 -mcpu=anothercpu -S %s -o foo.o -### 2>&1 \
 // RUN:   | FileCheck %s -check-prefix=STDEQ
-// STDEQ: "-S" "-fno-exceptions" "-mcpu=myriad2" "-DMYRIAD2" "-std=gnu++11"
+// STDEQ: "-S" "-fno-exceptions" "-DMYRIAD2" "-std=gnu++11" "-mcpu=anothercpu"
 
 // RUN: %clang -target shave-myriad -E -Ifoo %s -o foo.i -### 2>&1 \
 // RUN:   | FileCheck %s -check-prefix=PREPROCESS
-// PREPROCESS: "-E" "-mcpu=myriad2" "-DMYRIAD2" "-I" "foo"
+// PREPROCESS: "-E" "-DMYRIAD2" "-I" "foo"
 
 // RUN: %clang -target sparc-myriad -### --driver-mode=g++ %s 2>&1 | FileCheck %s --check-prefix=STDLIBCXX
 // STDLIBCXX: "-lstdc++" "-lc" "-lgcc"
@@ -77,3 +77,7 @@
 
 // RUN: %clang -### -c -g %s -target sparc-myriad 2>&1 | FileCheck -check-prefix=G_SPARC %s
 // G_SPARC: "-debug-info-kind=limited" "-dwarf-version=2"
+
+// RUN: %clang -### -c %s -target sparc-myriad-elf -fuse-init-array 2>&1 \
+// RUN: | FileCheck -check-prefix=USE-INIT-ARRAY %s
+// USE-INIT-ARRAY-NOT: argument unused
diff --git a/test/Driver/netbsd.c b/test/Driver/netbsd.c
index 351fbdf9ee9c1..1a87d8e1a6a99 100644
--- a/test/Driver/netbsd.c
+++ b/test/Driver/netbsd.c
@@ -1,5 +1,15 @@
 // RUN: %clang -no-canonical-prefixes -target x86_64--netbsd \
 // RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: | FileCheck -check-prefix=STATIC %s
+// RUN: %clang -no-canonical-prefixes -target x86_64--netbsd \
+// RUN: -pie --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: | FileCheck -check-prefix=PIE %s
+// RUN: %clang -no-canonical-prefixes -target x86_64--netbsd \
+// RUN: -shared --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: | FileCheck -check-prefix=SHARED %s
+
+// RUN: %clang -no-canonical-prefixes -target x86_64--netbsd \
+// RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=X86_64 %s
 // RUN: %clang -no-canonical-prefixes -target x86_64--netbsd7.0.0 \
 // RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
@@ -105,6 +115,32 @@
 // RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=S-POWERPC64 %s
 
+// STATIC: ld{{.*}}" "--eh-frame-hdr"
+// STATIC-NOT: "-pie"
+// STATIC-NOT: "-Bshareable"
+// STATIC: "-dynamic-linker" "/libexec/ld.elf_so"
+// STATIC-NOT: "-pie"
+// STATIC-NOT: "-Bshareable"
+// STATIC: "{{.*}}/usr/lib{{/|\\\\}}crt0.o"
+// STATIC: "{{.*}}/usr/lib{{/|\\\\}}crti.o" "{{.*}}/usr/lib{{/|\\\\}}crtbegin.o"
+// STATIC: "{{.*}}/usr/lib{{/|\\\\}}crtend.o" "{{.*}}/usr/lib{{/|\\\\}}crtn.o"
+
+// SHARED: ld{{.*}}" "--eh-frame-hdr"
+// SHARED-NOT: "-pie"
+// SHARED-NOT: "-dynamic-linker"
+// SHARED-NOT: "{{.*}}/usr/lib{{/|\\\\}}crt0.o"
+// SHARED: "{{.*}}/usr/lib{{/|\\\\}}crti.o" "{{.*}}/usr/lib{{/|\\\\}}crtbeginS.o"
+// SHARED: "{{.*}}/usr/lib{{/|\\\\}}crtendS.o" "{{.*}}/usr/lib{{/|\\\\}}crtn.o"
+
+// PIE: ld{{.*}}" "--eh-frame-hdr"
+// PIE-NOT: "-Bshareable"
+// PIE "-pie" "-dynamic-linker" "/libexec/ld.elf_so"
+// PIE-NOT: "-Bshareable"
+// PIE: "{{.*}}/usr/lib{{/|\\\\}}crt0.o" "{{.*}}/usr/lib{{/|\\\\}}crti.o"
+// PIE: "{{.*}}/usr/lib{{/|\\\\}}crtbeginS.o"
+// PIE: "{{.*}}/usr/lib{{/|\\\\}}crtendS.o"
+// PIE: "{{.*}}/usr/lib{{/|\\\\}}crtn.o"
+
 // X86_64: clang{{.*}}" "-cc1" "-triple" "x86_64--netbsd"
 // X86_64: ld{{.*}}" "--eh-frame-hdr" "-dynamic-linker" "/libexec/ld.elf_so"
 // X86_64: "-o" "a.out" "{{.*}}/usr/lib{{/|\\\\}}crt0.o" "{{.*}}/usr/lib{{/|\\\\}}crti.o"
diff --git a/test/Driver/netbsd.cpp b/test/Driver/netbsd.cpp
index 856b6ccc669df..104d03eba1911 100644
--- a/test/Driver/netbsd.cpp
+++ b/test/Driver/netbsd.cpp
@@ -1,93 +1,93 @@
 // RUN: %clangxx -no-canonical-prefixes -target x86_64--netbsd \
-// RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=X86_64 %s
 // RUN: %clangxx -no-canonical-prefixes -target x86_64--netbsd7.0.0 \
-// RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=X86_64-7 %s
 // RUN: %clangxx -no-canonical-prefixes -target x86_64--netbsd6.0.0 \
-// RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=X86_64-6 %s
 // RUN: %clangxx -no-canonical-prefixes -target arm--netbsd6.0.0-eabi \
-// RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=ARM %s
 // RUN: %clangxx -no-canonical-prefixes -target arm--netbsd7.0.0-eabi \
-// RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=ARM-7 %s
 // RUN: %clangxx -no-canonical-prefixes -target aarch64--netbsd \
-// RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=AARCH64 %s
 // RUN: %clangxx -no-canonical-prefixes -target aarch64--netbsd7.0.0 \
-// RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=AARCH64-7 %s
 // RUN: %clangxx -no-canonical-prefixes -target sparc--netbsd \
-// RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=SPARC %s
 // RUN: %clangxx -no-canonical-prefixes -target sparc--netbsd6.0.0 \
-// RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=SPARC-6 %s
 // RUN: %clangxx -no-canonical-prefixes -target sparc--netbsd7.0.0 \
-// RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=SPARC-7 %s
 // RUN: %clangxx -no-canonical-prefixes -target sparc64--netbsd \
-// RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=SPARC64 %s
 // RUN: %clangxx -no-canonical-prefixes -target sparc64--netbsd6.0.0 \
-// RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=SPARC64-6 %s
 // RUN: %clangxx -no-canonical-prefixes -target sparc64--netbsd7.0.0 \
-// RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=SPARC64-7 %s
 // RUN: %clangxx -no-canonical-prefixes -target powerpc--netbsd \
-// RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=POWERPC %s
 // RUN: %clangxx -no-canonical-prefixes -target powerpc64--netbsd \
-// RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=POWERPC64 %s
 
 // RUN: %clangxx -no-canonical-prefixes -target x86_64--netbsd -static \
-// RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=S-X86_64 %s
 // RUN: %clangxx -no-canonical-prefixes -target x86_64--netbsd7.0.0 -static \
-// RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=S-X86_64-7 %s
 // RUN: %clangxx -no-canonical-prefixes -target x86_64--netbsd6.0.0 -static \
-// RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=S-X86_64-6 %s
 // RUN: %clangxx -no-canonical-prefixes -target arm--netbsd6.0.0-eabi -static \
-// RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=S-ARM %s
 // RUN: %clangxx -no-canonical-prefixes -target arm--netbsd7.0.0-eabi -static \
-// RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=S-ARM-7 %s
 // RUN: %clangxx -no-canonical-prefixes -target aarch64--netbsd -static \
-// RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=S-AARCH64 %s
 // RUN: %clangxx -no-canonical-prefixes -target aarch64--netbsd7.0.0 -static \
-// RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=S-AARCH64-7 %s
 // RUN: %clangxx -no-canonical-prefixes -target sparc--netbsd -static \
-// RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=S-SPARC %s
 // RUN: %clangxx -no-canonical-prefixes -target sparc--netbsd6.0.0 -static \
-// RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=S-SPARC-6 %s
 // RUN: %clangxx -no-canonical-prefixes -target sparc--netbsd7.0.0 -static \
-// RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=S-SPARC-7 %s
 // RUN: %clangxx -no-canonical-prefixes -target sparc64--netbsd -static \
-// RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=S-SPARC64 %s
 // RUN: %clangxx -no-canonical-prefixes -target sparc64--netbsd6.0.0 -static \
-// RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=S-SPARC64-6 %s
 // RUN: %clangxx -no-canonical-prefixes -target sparc64--netbsd7.0.0 -static \
-// RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=S-SPARC64-7 %s
 // RUN: %clangxx -no-canonical-prefixes -target powerpc--netbsd -static \
-// RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=S-POWERPC %s
 // RUN: %clangxx -no-canonical-prefixes -target powerpc64--netbsd -static \
-// RUN: --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: -stdlib=platform --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=S-POWERPC64 %s
 
 // X86_64: clang{{.*}}" "-cc1" "-triple" "x86_64--netbsd"
diff --git a/test/Driver/noinline.c b/test/Driver/noinline.c
index e665b2f0aa247..70f950cf52970 100644
--- a/test/Driver/noinline.c
+++ b/test/Driver/noinline.c
@@ -3,7 +3,7 @@
 
 // RUN: %clang -target x86_64-apple-darwin10 \
 // RUN:   -fno-inline -fno-inline-functions -### -fsyntax-only %s 2> %t
-// RUN: FileCheck --check-prefix=CHECK < %t %s
+// RUN: FileCheck < %t %s
 
 // CHECK: clang
 // CHECK: "-fno-inline"
diff --git a/test/Driver/nostdlib.c b/test/Driver/nostdlib.c
index 47c6f8bacd464..6e7bc0eb93ec9 100644
--- a/test/Driver/nostdlib.c
+++ b/test/Driver/nostdlib.c
@@ -22,6 +22,10 @@
 // RUN:     -resource-dir=%S/Inputs/resource_dir -lclang_rt.builtins-i686 \
 // RUN:   | FileCheck --check-prefix=CHECK-LINUX-NOSTDLIB %s
 //
+// RUN: %clang -target x86_64-pc-windows-msvc -nostdlib --rtlib=compiler-rt -### %s 2>&1 | FileCheck %s -check-prefix CHECK-MSVC-NOSTDLIB
+// RUN: %clang -target x86_64-pc-windows-msvc --rtlib=compiler-rt -nostdlib -### %s 2>&1 | FileCheck %s -check-prefix CHECK-MSVC-NOSTDLIB
+//
 // CHECK-LINUX-NOSTDLIB: warning: argument unused during compilation: '--rtlib=compiler-rt'
 // CHECK-LINUX-NOSTDLIB: "{{(.*[^.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
 // CHECK-LINUX-NOSTDLIB-NOT: "{{.*}}/Inputs/resource_dir{{/|\\\\}}lib{{/|\\\\}}linux{{/|\\\\}}libclang_rt.builtins-i686.a"
+// CHECK-MSVC-NOSTDLIB: warning: argument unused during compilation: '--rtlib=compiler-rt'
diff --git a/test/Driver/nozlibcompress.c b/test/Driver/nozlibcompress.c
index 4eac066165555..9986c85d79aea 100644
--- a/test/Driver/nozlibcompress.c
+++ b/test/Driver/nozlibcompress.c
@@ -1,5 +1,5 @@
 // RUN: %clang -c %s -Wa,--compress-debug-sections 2>&1 | FileCheck %s
-// RUN: %clang -c %s -Wa,--compress-debug-sections -Wa,--nocompress-debug-sections 2>&1 | FileCheck --check-prefix=NOWARN %s
+// RUN: %clang -c %s -Wa,--compress-debug-sections -Wa,--nocompress-debug-sections 2>&1 | FileCheck --allow-empty --check-prefix=NOWARN %s
 // REQUIRES: nozlib
 
 // CHECK: warning: cannot compress debug sections (zlib not installed)
diff --git a/test/Driver/objc-weak.m b/test/Driver/objc-weak.m
index ff60759f2c9ca..68ae26e1bcbcf 100644
--- a/test/Driver/objc-weak.m
+++ b/test/Driver/objc-weak.m
@@ -10,9 +10,9 @@
 // ARC-NO-WEAK: -fobjc-arc
 // ARC-NO-WEAK: -fno-objc-weak
 
-// RUN: %clang -target x86_64-apple-macosx -mmacosx-version-min=10.5 -S -### %s -fobjc-arc -fobjc-weak 2>&1 | FileCheck %s --check-prefix ARC-WEAK-UNSUPPORTED
-// RUN: %clang -target x86_64-apple-macosx -mmacosx-version-min=10.5 -S -### %s -fno-objc-weak -fobjc-weak -fobjc-arc  2>&1 | FileCheck %s --check-prefix ARC-WEAK-UNSUPPORTED
-// ARC-WEAK-UNSUPPORTED: error: -fobjc-weak is not supported on the current deployment target
+// RUN: %clang -target x86_64-apple-macosx -mmacosx-version-min=10.5 -S -### %s -fobjc-arc -fobjc-weak 2>&1 | FileCheck %s --check-prefix ARC-WEAK-NOTSUPPORTED
+// RUN: %clang -target x86_64-apple-macosx -mmacosx-version-min=10.5 -S -### %s -fno-objc-weak -fobjc-weak -fobjc-arc  2>&1 | FileCheck %s --check-prefix ARC-WEAK-NOTSUPPORTED
+// ARC-WEAK-NOTSUPPORTED: error: -fobjc-weak is not supported on the current deployment target
 
 // RUN: %clang -target x86_64-apple-macosx -mmacosx-version-min=10.7 -S -### %s -fobjc-weak 2>&1 | FileCheck %s --check-prefix MRC-WEAK
 // RUN: %clang -target x86_64-apple-macosx -mmacosx-version-min=10.7 -S -### %s -fno-objc-weak -fobjc-weak 2>&1 | FileCheck %s --check-prefix MRC-WEAK
@@ -22,6 +22,6 @@
 // RUN: %clang -target x86_64-apple-macosx -mmacosx-version-min=10.7 -S -### %s -fobjc-weak -fno-objc-weak 2>&1 | FileCheck %s --check-prefix MRC-NO-WEAK
 // MRC-NO-WEAK: -fno-objc-weak
 
-// RUN: %clang -target x86_64-apple-macosx -mmacosx-version-min=10.5 -S -### %s -fobjc-weak 2>&1 | FileCheck %s --check-prefix MRC-WEAK-UNSUPPORTED
-// RUN: %clang -target x86_64-apple-macosx -mmacosx-version-min=10.5 -S -### %s -fno-objc-weak -fobjc-weak 2>&1 | FileCheck %s --check-prefix MRC-WEAK-UNSUPPORTED
-// MRC-WEAK-UNSUPPORTED: error: -fobjc-weak is not supported on the current deployment target
+// RUN: %clang -target x86_64-apple-macosx -mmacosx-version-min=10.5 -S -### %s -fobjc-weak 2>&1 | FileCheck %s --check-prefix MRC-WEAK-NOTSUPPORTED
+// RUN: %clang -target x86_64-apple-macosx -mmacosx-version-min=10.5 -S -### %s -fno-objc-weak -fobjc-weak 2>&1 | FileCheck %s --check-prefix MRC-WEAK-NOTSUPPORTED
+// MRC-WEAK-NOTSUPPORTED: error: -fobjc-weak is not supported on the current deployment target
diff --git a/test/Driver/opencl.cl b/test/Driver/opencl.cl
new file mode 100644
index 0000000000000..b2656c2c501d6
--- /dev/null
+++ b/test/Driver/opencl.cl
@@ -0,0 +1,35 @@
+// RUN: %clang -S -### -cl-std=CL %s 2>&1 | FileCheck --check-prefix=CHECK-CL %s
+// RUN: %clang -S -### -cl-std=CL1.1 %s 2>&1 | FileCheck --check-prefix=CHECK-CL11 %s
+// RUN: %clang -S -### -cl-std=CL1.2 %s 2>&1 | FileCheck --check-prefix=CHECK-CL12 %s
+// RUN: %clang -S -### -cl-std=CL2.0 %s 2>&1 | FileCheck --check-prefix=CHECK-CL20 %s
+// RUN: %clang -S -### -cl-opt-disable %s 2>&1 | FileCheck --check-prefix=CHECK-OPT-DISABLE %s
+// RUN: %clang -S -### -cl-strict-aliasing %s 2>&1 | FileCheck --check-prefix=CHECK-STRICT-ALIASING %s
+// RUN: %clang -S -### -cl-single-precision-constant %s 2>&1 | FileCheck --check-prefix=CHECK-SINGLE-PRECISION-CONST %s
+// RUN: %clang -S -### -cl-finite-math-only %s 2>&1 | FileCheck --check-prefix=CHECK-FINITE-MATH-ONLY %s
+// RUN: %clang -S -### -cl-kernel-arg-info %s 2>&1 | FileCheck --check-prefix=CHECK-KERNEL-ARG-INFO %s
+// RUN: %clang -S -### -cl-unsafe-math-optimizations %s 2>&1 | FileCheck --check-prefix=CHECK-UNSAFE-MATH-OPT %s
+// RUN: %clang -S -### -cl-fast-relaxed-math %s 2>&1 | FileCheck --check-prefix=CHECK-FAST-RELAXED-MATH %s
+// RUN: %clang -S -### -cl-mad-enable %s 2>&1 | FileCheck --check-prefix=CHECK-MAD-ENABLE %s
+// RUN: %clang -S -### -cl-no-signed-zeros %s 2>&1 | FileCheck --check-prefix=CHECK-NO-SIGNED-ZEROS %s
+// RUN: %clang -S -### -cl-denorms-are-zero %s 2>&1 | FileCheck --check-prefix=CHECK-DENORMS-ARE-ZERO %s
+// RUN: not %clang -cl-std=c99 -DOPENCL %s 2>&1 | FileCheck --check-prefix=CHECK-C99 %s
+// RUN: not %clang -cl-std=invalid -DOPENCL %s 2>&1 | FileCheck --check-prefix=CHECK-INVALID %s
+
+// CHECK-CL: "-cc1" {{.*}} "-cl-std=CL"
+// CHECK-CL11: "-cc1" {{.*}} "-cl-std=CL1.1"
+// CHECK-CL12: "-cc1" {{.*}} "-cl-std=CL1.2"
+// CHECK-CL20: "-cc1" {{.*}} "-cl-std=CL2.0"
+// CHECK-OPT-DISABLE: "-cc1" {{.*}} "-cl-opt-disable"
+// CHECK-STRICT-ALIASING: "-cc1" {{.*}} "-cl-strict-aliasing"
+// CHECK-SINGLE-PRECISION-CONST: "-cc1" {{.*}} "-cl-single-precision-constant"
+// CHECK-FINITE-MATH-ONLY: "-cc1" {{.*}} "-cl-finite-math-only"
+// CHECK-KERNEL-ARG-INFO: "-cc1" {{.*}} "-cl-kernel-arg-info"
+// CHECK-UNSAFE-MATH-OPT: "-cc1" {{.*}} "-cl-unsafe-math-optimizations"
+// CHECK-FAST-RELAXED-MATH: "-cc1" {{.*}} "-cl-fast-relaxed-math"
+// CHECK-MAD-ENABLE: "-cc1" {{.*}} "-cl-mad-enable"
+// CHECK-NO-SIGNED-ZEROS: "-cc1" {{.*}} "-cl-no-signed-zeros"
+// CHECK-DENORMS-ARE-ZERO: "-cc1" {{.*}} "-cl-denorms-are-zero"
+// CHECK-C99: error: invalid value 'c99' in '-cl-std=c99'
+// CHECK-INVALID: error: invalid value 'invalid' in '-cl-std=invalid'
+
+kernel void func(void);
diff --git a/test/Driver/output-file-cleanup.c b/test/Driver/output-file-cleanup.c
index 065df8f5b87be..314af4d6d37ef 100644
--- a/test/Driver/output-file-cleanup.c
+++ b/test/Driver/output-file-cleanup.c
@@ -1,3 +1,5 @@
+// RUN: rm -f "%t.d" "%t1.s" "%t2.s" "%t3.s" "%t4.s" "%t5.s"
+//
 // RUN: touch %t.s
 // RUN: not %clang -S -DCRASH -o %t.s -MMD -MF %t.d %s
 // RUN: test ! -f %t.s
@@ -36,6 +38,9 @@ invalid C code
 // RUN: test -f %t1.s
 // RUN: test ! -f %t2.s
 
+// When given multiple .c files to compile, clang compiles them in order until
+// it hits an error, at which point it stops.
+//
 // RUN: touch %t1.c
 // RUN: echo "invalid C code" > %t2.c
 // RUN: touch %t3.c
@@ -44,6 +49,6 @@ invalid C code
 // RUN: cd %T && not %clang -S %t1.c %t2.c %t3.c %t4.c %t5.c
 // RUN: test -f %t1.s
 // RUN: test ! -f %t2.s
-// RUN: test -f %t3.s
+// RUN: test ! -f %t3.s
 // RUN: test ! -f %t4.s
-// RUN: test -f %t5.s
+// RUN: test ! -f %t5.s
diff --git a/test/Driver/pic.c b/test/Driver/pic.c
index aeb2ee33114c9..9f9d09c54cf08 100644
--- a/test/Driver/pic.c
+++ b/test/Driver/pic.c
@@ -3,24 +3,26 @@
 //
 // CHECK-NO-PIC: "-mrelocation-model" "static"
 // CHECK-NO-PIC-NOT: "-pic-level"
-// CHECK-NO-PIC-NOT: "-pie-level"
+// CHECK-NO-PIC-NOT: "-pic-is-pie"
 //
 // CHECK-PIC1: "-mrelocation-model" "pic"
 // CHECK-PIC1: "-pic-level" "1"
+// CHECK-PIC1-NOT: "-pic-is-pie"
 //
 // CHECK-PIC2: "-mrelocation-model" "pic"
 // CHECK-PIC2: "-pic-level" "2"
+// CHECK-PIC2-NOT: "-pic-is-pie"
 //
 // CHECK-STATIC: "-static"
 // CHECK-NO-STATIC-NOT: "-static"
 //
 // CHECK-PIE1: "-mrelocation-model" "pic"
 // CHECK-PIE1: "-pic-level" "1"
-// CHECK-PIE1: "-pie-level" "1"
+// CHECK-PIE1: "-pic-is-pie"
 //
 // CHECK-PIE2: "-mrelocation-model" "pic"
 // CHECK-PIE2: "-pic-level" "2"
-// CHECK-PIE2: "-pie-level" "2"
+// CHECK-PIE2: "-pic-is-pie"
 //
 // CHECK-PIE-LD: "{{.*}}ld{{(.exe)?}}"
 // CHECK-PIE-LD: "-pie"
@@ -31,11 +33,11 @@
 //
 // CHECK-DYNAMIC-NO-PIC-32: "-mrelocation-model" "dynamic-no-pic"
 // CHECK-DYNAMIC-NO-PIC-32-NOT: "-pic-level"
-// CHECK-DYNAMIC-NO-PIC-32-NOT: "-pie-level"
+// CHECK-DYNAMIC-NO-PIC-32-NOT: "-pic-is-pie"
 //
 // CHECK-DYNAMIC-NO-PIC-64: "-mrelocation-model" "dynamic-no-pic"
 // CHECK-DYNAMIC-NO-PIC-64: "-pic-level" "2"
-// CHECK-DYNAMIC-NO-PIC-64-NOT: "-pie-level"
+// CHECK-DYNAMIC-NO-PIC-64-NOT: "-pic-is-pie"
 //
 // CHECK-NON-DARWIN-DYNAMIC-NO-PIC: error: unsupported option '-mdynamic-no-pic' for target 'i386-unknown-unknown'
 //
@@ -151,10 +153,9 @@
 // RUN:   | FileCheck %s --check-prefix=CHECK-NO-PIE
 //
 // Darwin is a beautiful and unique snowflake when it comes to these flags.
-// When targeting a 32-bit darwin system, the -fno-* flag variants work and
-// disable PIC, but any other flag enables PIC (*not* PIE) even if the flag
-// specifies PIE. On 64-bit targets, there is simply nothing you can do, there
-// is no PIE, there is only PIC when it comes to compilation.
+// When targeting a 32-bit darwin system, only level 2 is supported. On 64-bit
+// targets, there is simply nothing you can do, there is no PIE, there is only
+// PIC when it comes to compilation.
 // RUN: %clang -c %s -target i386-apple-darwin -### 2>&1 \
 // RUN:   | FileCheck %s --check-prefix=CHECK-PIC2
 // RUN: %clang -c %s -target i386-apple-darwin -fpic -### 2>&1 \
@@ -162,9 +163,9 @@
 // RUN: %clang -c %s -target i386-apple-darwin -fPIC -### 2>&1 \
 // RUN:   | FileCheck %s --check-prefix=CHECK-PIC2
 // RUN: %clang -c %s -target i386-apple-darwin -fpie -### 2>&1 \
-// RUN:   | FileCheck %s --check-prefix=CHECK-PIC2
+// RUN:   | FileCheck %s --check-prefix=CHECK-PIE2
 // RUN: %clang -c %s -target i386-apple-darwin -fPIE -### 2>&1 \
-// RUN:   | FileCheck %s --check-prefix=CHECK-PIC2
+// RUN:   | FileCheck %s --check-prefix=CHECK-PIE2
 // RUN: %clang -c %s -target i386-apple-darwin -fno-PIC -### 2>&1 \
 // RUN:   | FileCheck %s --check-prefix=CHECK-NO-PIC
 // RUN: %clang -c %s -target i386-apple-darwin -fno-PIE -### 2>&1 \
@@ -172,7 +173,7 @@
 // RUN: %clang -c %s -target i386-apple-darwin -fno-PIC -fpic -### 2>&1 \
 // RUN:   | FileCheck %s --check-prefix=CHECK-PIC2
 // RUN: %clang -c %s -target i386-apple-darwin -fno-PIC -fPIE -### 2>&1 \
-// RUN:   | FileCheck %s --check-prefix=CHECK-PIC2
+// RUN:   | FileCheck %s --check-prefix=CHECK-PIE2
 // RUN: %clang -c %s -target x86_64-apple-darwin -fno-PIC -### 2>&1 \
 // RUN:   | FileCheck %s --check-prefix=CHECK-PIC2
 // RUN: %clang -c %s -target x86_64-apple-darwin -fno-PIE -### 2>&1 \
diff --git a/test/Driver/ps4-header-search.c b/test/Driver/ps4-header-search.c
index 15e093f60e67c..3afef698d2642 100644
--- a/test/Driver/ps4-header-search.c
+++ b/test/Driver/ps4-header-search.c
@@ -1,6 +1,6 @@
 // REQUIRES: x86-registered-target
 
-// RUN: env SCE_PS4_SDK_DIR=%S/Inputs/scei-ps4_tree %clang -target x86_64-scei-ps4 -E -v %s 2>&1 | FileCheck %s --check-prefix=ENVPS4
+// RUN: env SCE_ORBIS_SDK_DIR=%S/Inputs/scei-ps4_tree %clang -target x86_64-scei-ps4 -E -v %s 2>&1 | FileCheck %s --check-prefix=ENVPS4
 // ENVPS4: Inputs/scei-ps4_tree/target/include{{$}}
 // ENVPS4: Inputs/scei-ps4_tree/target/include_common{{$}}
 
diff --git a/test/Driver/ps4-linker-non-win.c b/test/Driver/ps4-linker-non-win.c
index 1fce6d6077f1e..e2f8386b2c788 100644
--- a/test/Driver/ps4-linker-non-win.c
+++ b/test/Driver/ps4-linker-non-win.c
@@ -2,9 +2,9 @@
 // REQUIRES: x86-registered-target
 
 // RUN: mkdir -p %T/Output
-// RUN: rm -f %T/Output/ps4-ld
-// RUN: touch %T/Output/ps4-ld
-// RUN: chmod +x %T/Output/ps4-ld
+// RUN: rm -f %T/Output/orbis-ld
+// RUN: touch %T/Output/orbis-ld
+// RUN: chmod +x %T/Output/orbis-ld
 
 // RUN: env "PATH=%T/Output:%PATH%" %clang -### -target x86_64-scei-ps4  %s -fuse-ld=gold 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-PS4-LINKER %s
@@ -18,4 +18,4 @@
 // RUN: env "PATH=%T/Output:%PATH%" %clang -### -target x86_64-scei-ps4  %s -shared \
 // RUN:     -fuse-ld=ps4 2>&1 | FileCheck --check-prefix=CHECK-PS4-LINKER %s
 
-// CHECK-PS4-LINKER: Output/ps4-ld
+// CHECK-PS4-LINKER: /orbis-ld
diff --git a/test/Driver/ps4-linker-win.c b/test/Driver/ps4-linker-win.c
index e42fc963dceea..6fbd84bb6dee1 100644
--- a/test/Driver/ps4-linker-win.c
+++ b/test/Driver/ps4-linker-win.c
@@ -7,21 +7,20 @@
 
 // REQUIRES: system-windows, x86-registered-target
 
-// RUN: touch %T/ps4-ld.exe
-// RUN: touch %T/ps4-ld.gold.exe
+// RUN: touch %T/orbis-ld.exe
+// RUN: touch %T/orbis-ld.gold.exe
 
-// RUN: env "PATH=%T;%PATH%" %clang -target x86_64-scei-ps4  %s -fuse-ld=gold -### 2>&1 \
+// RUN: env "PATH=%T;%PATH%;" %clang -target x86_64-scei-ps4  %s -fuse-ld=gold -### 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-PS4-GOLD %s
-// RUN: env "PATH=%T;%PATH%" %clang -target x86_64-scei-ps4  %s -shared -### 2>&1 \
+// RUN: env "PATH=%T;%PATH%;" %clang -target x86_64-scei-ps4  %s -shared -### 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-PS4-GOLD %s
 
-// RUN: env "PATH=%T;%PATH%" %clang -target x86_64-scei-ps4  %s -### 2>&1 \
+// RUN: env "PATH=%T;%PATH%;" %clang -target x86_64-scei-ps4  %s -### 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-PS4-LINKER %s
-// RUN: env "PATH=%T;%PATH%" %clang -target x86_64-scei-ps4  %s -fuse-ld=ps4 -### 2>&1 \
+// RUN: env "PATH=%T;%PATH%;" %clang -target x86_64-scei-ps4  %s -fuse-ld=ps4 -### 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-PS4-LINKER %s
-// RUN: env "PATH=%T;%PATH%" %clang -target x86_64-scei-ps4  %s -shared \
+// RUN: env "PATH=%T;%PATH%;" %clang -target x86_64-scei-ps4  %s -shared \
 // RUN:     -fuse-ld=ps4 -### 2>&1 | FileCheck --check-prefix=CHECK-PS4-LINKER %s
 
-// FIXME: "Output\\" is hardcoded part of %T.
-// CHECK-PS4-GOLD: Output\\ps4-ld.gold.exe"
-// CHECK-PS4-LINKER: Output\\ps4-ld.exe"
+// CHECK-PS4-GOLD: \\orbis-ld.gold
+// CHECK-PS4-LINKER: \\orbis-ld
diff --git a/test/Driver/ps4-pic.c b/test/Driver/ps4-pic.c
index 0cf9ad5f19921..c023dcfd0c355 100644
--- a/test/Driver/ps4-pic.c
+++ b/test/Driver/ps4-pic.c
@@ -6,7 +6,7 @@
 //
 // CHECK-NO-PIC: "-mrelocation-model" "static"
 // CHECK-NO-PIC-NOT: "-pic-level"
-// CHECK-NO-PIC-NOT: "-pie-level"
+// CHECK-NO-PIC-NOT: "-pic-is-pie"
 //
 // CHECK-DYNAMIC-NO-PIC2: unsupported option '-mdynamic-no-pic'
 // CHECK-DYNAMIC-NO-PIC2: "-mrelocation-model" "dynamic-no-pic"
@@ -15,7 +15,7 @@
 // CHECK-PIC2: "-pic-level" "2"
 //
 // CHECK-PIE2: "-mrelocation-model" "pic"
-// CHECK-PIE2: "-pie-level" "2"
+// CHECK-PIE2: "-pic-is-pie"
 //
 // CHECK-NOPIC-IGNORED: using '-fPIC'
 // CHECK-NOPIC-IGNORED: "-mrelocation-model" "pic"
diff --git a/test/Driver/ps4-sdk-root.c b/test/Driver/ps4-sdk-root.c
index f40a963ac9e98..ee22d6c8f0cfe 100644
--- a/test/Driver/ps4-sdk-root.c
+++ b/test/Driver/ps4-sdk-root.c
@@ -1,45 +1,45 @@
 // REQUIRES: x86-registered-target
 
-// Check that ps4-clang doesn't report a warning message when locating
-// system header files (either by looking at the value of SCE_PS4_SDK_DIR
+// Check that PS4 clang doesn't report a warning message when locating
+// system header files (either by looking at the value of SCE_ORBIS_SDK_DIR
 // or relative to the location of the compiler driver), if "-nostdinc",
 // "--sysroot" or "-isysroot" option is specified on the command line.
-// Otherwise, check that ps4-clang reports a warning.
+// Otherwise, check that PS4 clang reports a warning.
 
-// Check that clang doesn't report a warning message when locating
-// system libraries (either by looking at the value of SCE_PS4_SDK_DIR
+// Check that PS4 clang doesn't report a warning message when locating
+// system libraries (either by looking at the value of SCE_ORBIS_SDK_DIR
 // or relative to the location of the compiler driver), if "-c", "-S", "-E",
 // "--sysroot", "-nostdlib" or "-nodefaultlibs" option is specified on
 // the command line.
-// Otherwise, check that ps4-clang reports a warning.
-
-// setting up SCE_PS4_SDK_DIR to existing location, which is not a PS4 SDK.
-// RUN: env SCE_PS4_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-SYS-HEADERS -check-prefix=WARN-SYS-LIBS -check-prefix=NO-WARN %s
-
-// RUN: env SCE_PS4_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -c -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-SYS-HEADERS -check-prefix=NO-WARN %s
-// RUN: env SCE_PS4_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -S -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-SYS-HEADERS -check-prefix=NO-WARN %s
-// RUN: env SCE_PS4_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -E -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-SYS-HEADERS -check-prefix=NO-WARN %s
-// RUN: env SCE_PS4_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -emit-ast -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-SYS-HEADERS -check-prefix=NO-WARN %s
-// RUN: env SCE_PS4_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -isysroot foo -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-ISYSROOT -check-prefix=WARN-SYS-LIBS -check-prefix=NO-WARN %s
-
-// RUN: env SCE_PS4_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -c -nostdinc -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=NO-WARN %s
-// RUN: env SCE_PS4_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -S -nostdinc -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=NO-WARN %s
-// RUN: env SCE_PS4_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -E -nostdinc -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=NO-WARN %s
-// RUN: env SCE_PS4_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -emit-ast -nostdinc -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=NO-WARN %s
-
-// RUN: env SCE_PS4_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -c --sysroot=foo/ -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=NO-WARN %s
-// RUN: env SCE_PS4_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -S --sysroot=foo/ -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=NO-WARN %s
-// RUN: env SCE_PS4_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -E --sysroot=foo/ -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=NO-WARN %s
-// RUN: env SCE_PS4_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -emit-ast --sysroot=foo/ -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=NO-WARN %s
-
-// RUN: env SCE_PS4_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -c -isysroot foo -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-ISYSROOT -check-prefix=NO-WARN %s
-// RUN: env SCE_PS4_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -S -isysroot foo -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-ISYSROOT -check-prefix=NO-WARN %s
-// RUN: env SCE_PS4_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -E -isysroot foo -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-ISYSROOT -check-prefix=NO-WARN %s
-// RUN: env SCE_PS4_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -emit-ast -isysroot foo -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-ISYSROOT -check-prefix=NO-WARN %s
-// RUN: env SCE_PS4_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### --sysroot=foo/ -isysroot foo -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-ISYSROOT -check-prefix=NO-WARN %s
-
-// RUN: env SCE_PS4_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -nostdlib -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-SYS-HEADERS -check-prefix=NO-WARN %s
-// RUN: env SCE_PS4_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -nodefaultlibs -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-SYS-HEADERS -check-prefix=NO-WARN %s
+// Otherwise, check that PS4 clang reports a warning.
+
+// Setting up SCE_ORBIS_SDK_DIR to existing location, which is not a PS4 SDK.
+// RUN: env SCE_ORBIS_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-SYS-HEADERS -check-prefix=WARN-SYS-LIBS -check-prefix=NO-WARN %s
+
+// RUN: env SCE_ORBIS_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -c -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-SYS-HEADERS -check-prefix=NO-WARN %s
+// RUN: env SCE_ORBIS_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -S -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-SYS-HEADERS -check-prefix=NO-WARN %s
+// RUN: env SCE_ORBIS_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -E -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-SYS-HEADERS -check-prefix=NO-WARN %s
+// RUN: env SCE_ORBIS_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -emit-ast -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-SYS-HEADERS -check-prefix=NO-WARN %s
+// RUN: env SCE_ORBIS_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -isysroot foo -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-ISYSROOT -check-prefix=WARN-SYS-LIBS -check-prefix=NO-WARN %s
+
+// RUN: env SCE_ORBIS_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -c -nostdinc -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=NO-WARN %s
+// RUN: env SCE_ORBIS_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -S -nostdinc -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=NO-WARN %s
+// RUN: env SCE_ORBIS_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -E -nostdinc -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=NO-WARN %s
+// RUN: env SCE_ORBIS_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -emit-ast -nostdinc -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=NO-WARN %s
+
+// RUN: env SCE_ORBIS_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -c --sysroot=foo/ -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=NO-WARN %s
+// RUN: env SCE_ORBIS_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -S --sysroot=foo/ -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=NO-WARN %s
+// RUN: env SCE_ORBIS_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -E --sysroot=foo/ -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=NO-WARN %s
+// RUN: env SCE_ORBIS_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -emit-ast --sysroot=foo/ -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=NO-WARN %s
+
+// RUN: env SCE_ORBIS_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -c -isysroot foo -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-ISYSROOT -check-prefix=NO-WARN %s
+// RUN: env SCE_ORBIS_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -S -isysroot foo -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-ISYSROOT -check-prefix=NO-WARN %s
+// RUN: env SCE_ORBIS_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -E -isysroot foo -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-ISYSROOT -check-prefix=NO-WARN %s
+// RUN: env SCE_ORBIS_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -emit-ast -isysroot foo -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-ISYSROOT -check-prefix=NO-WARN %s
+// RUN: env SCE_ORBIS_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### --sysroot=foo/ -isysroot foo -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-ISYSROOT -check-prefix=NO-WARN %s
+
+// RUN: env SCE_ORBIS_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -nostdlib -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-SYS-HEADERS -check-prefix=NO-WARN %s
+// RUN: env SCE_ORBIS_SDK_DIR=.. %clang -Winvalid-or-nonexistent-directory -### -nodefaultlibs -target x86_64-scei-ps4 %s 2>&1 | FileCheck -check-prefix=WARN-SYS-HEADERS -check-prefix=NO-WARN %s
 
 // NO-WARN-NOT: {{warning:|error:}}
 // WARN-SYS-HEADERS: warning: unable to find PS4 system headers directory
diff --git a/test/Driver/r600-mcpu.cl b/test/Driver/r600-mcpu.cl
index 4fbec0c83bf91..325e57174c33f 100644
--- a/test/Driver/r600-mcpu.cl
+++ b/test/Driver/r600-mcpu.cl
@@ -38,6 +38,8 @@ t// Check that -mcpu works for all supported GPUs
 // RUN: %clang -### -target amdgcn -x cl -S -emit-llvm -mcpu=tonga %s -o - 2>&1 | FileCheck --check-prefix=TONGA-CHECK %s
 // RUN: %clang -### -target amdgcn -x cl -S -emit-llvm -mcpu=iceland %s -o - 2>&1 | FileCheck --check-prefix=ICELAND-CHECK %s
 // RUN: %clang -### -target amdgcn -x cl -S -emit-llvm -mcpu=carrizo %s -o - 2>&1 | FileCheck --check-prefix=CARRIZO-CHECK %s
+// RUN: %clang -### -target amdgcn -x cl -S -emit-llvm -mcpu=fiji %s -o - 2>&1 | FileCheck --check-prefix=FIJI-CHECK %s
+// RUN: %clang -### -target amdgcn -x cl -S -emit-llvm -mcpu=stoney %s -o - 2>&1 | FileCheck --check-prefix=STONEY-CHECK %s
 
 // R600-CHECK:  "-target-cpu" "r600"
 // RS880-CHECK: "-target-cpu" "rs880"
@@ -66,3 +68,5 @@ t// Check that -mcpu works for all supported GPUs
 // TONGA-CHECK: "-target-cpu" "tonga"
 // ICELAND-CHECK: "-target-cpu" "iceland"
 // CARRIZO-CHECK: "-target-cpu" "carrizo"
+// FIJI-CHECK: "-target-cpu" "fiji"
+// STONEY-CHECK: "-target-cpu" "stoney"
diff --git a/test/Driver/relax.c b/test/Driver/relax.c
new file mode 100644
index 0000000000000..170d2751b2910
--- /dev/null
+++ b/test/Driver/relax.c
@@ -0,0 +1,4 @@
+// RUN: %clang -### -c -integrated-as -Wa,--mrelax-relocations=yes %s 2>&1 | FileCheck  %s
+
+// CHECK: "-cc1"
+// CHECK: "--mrelax-relocations"
diff --git a/test/Driver/relax.s b/test/Driver/relax.s
new file mode 100644
index 0000000000000..d2941e2f173f3
--- /dev/null
+++ b/test/Driver/relax.s
@@ -0,0 +1,12 @@
+// REQUIRES: x86-registered-target
+// RUN: %clang -### -c -integrated-as -Wa,--mrelax-relocations=yes %s 2>&1 | FileCheck  %s
+
+// CHECK: "-cc1as"
+// CHECK: "--mrelax-relocations"
+
+// RUN: %clang -cc1as -triple x86_64-pc-linux --mrelax-relocations %s -o %t  -filetype obj
+// RUN: llvm-readobj -r %t | FileCheck --check-prefix=REL %s
+
+// REL: R_X86_64_REX_GOTPCRELX foo
+
+        movq	foo@GOTPCREL(%rip), %rax
diff --git a/test/Driver/renderscript.rs b/test/Driver/renderscript.rs
new file mode 100644
index 0000000000000..84f5dc4de777a
--- /dev/null
+++ b/test/Driver/renderscript.rs
@@ -0,0 +1,3 @@
+// RUN: %clang -### 2>&1 %s | FileCheck %s
+
+// CHECK: "-x" "renderscript"
diff --git a/test/Driver/response-file-extra-whitespace.c b/test/Driver/response-file-extra-whitespace.c
new file mode 100644
index 0000000000000..93b32bb505420
--- /dev/null
+++ b/test/Driver/response-file-extra-whitespace.c
@@ -0,0 +1,12 @@
+// Check that clang is able to process response files with extra whitespace.
+// We generate a dos-style file with \r\n for line endings, and then split
+// some joined arguments (like "-x c") across lines to ensure that regular
+// clang (not clang-cl) can process it correctly.
+//
+// RUN: echo -en "-x\r\nc\r\n-DTEST\r\n" > %t.0.txt
+// RUN: %clang -E @%t.0.txt %s -v 2>&1 | FileCheck %s -check-prefix=SHORT
+// SHORT: extern int it_works;
+
+#ifdef TEST
+extern int it_works;
+#endif
diff --git a/test/Driver/response-file.c b/test/Driver/response-file.c
index 208a941e8723b..bd336309adf8e 100644
--- a/test/Driver/response-file.c
+++ b/test/Driver/response-file.c
@@ -4,7 +4,7 @@
 // Since this is a short response file, clang must not use a response file
 // to pass its parameters to other tools. This is only necessary for a large
 // number of parameters.
-// RUN: echo "-DTEST" >> %t.0.txt
+// RUN: echo "-DTEST" > %t.0.txt
 // RUN: %clang -E @%t.0.txt %s -v 2>&1 | FileCheck %s -check-prefix=SHORT
 // SHORT-NOT: Arguments passed via response file
 // SHORT: extern int it_works;
diff --git a/test/Driver/sanitize_unwind_tables.c b/test/Driver/sanitize_unwind_tables.c
index 8b7889966657c..b78843ef8f2b7 100644
--- a/test/Driver/sanitize_unwind_tables.c
+++ b/test/Driver/sanitize_unwind_tables.c
@@ -7,5 +7,7 @@
 // RUN: %clang -target x86_64-linux-gnu -fsanitize=memory %s -### 2>&1 |  FileCheck %s
 // RUN: %clang -target x86_64-linux-gnu -fsanitize=thread %s -### 2>&1 |  FileCheck %s
 // RUN: %clang -target x86_64-linux-gnu -fsanitize=dataflow %s -### 2>&1 |  FileCheck %s
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=efficiency-cache-frag %s -### 2>&1 |  FileCheck %s
+// RUN: %clang -target x86_64-linux-gnu -fsanitize=efficiency-working-set %s -### 2>&1 |  FileCheck %s
 
 // CHECK: -munwind-tables
diff --git a/test/Driver/sanitizer-ld.c b/test/Driver/sanitizer-ld.c
index 0e9c596fa2598..4d4ea293a68e1 100644
--- a/test/Driver/sanitizer-ld.c
+++ b/test/Driver/sanitizer-ld.c
@@ -76,7 +76,7 @@
 // CHECK-ASAN-FREEBSD-LDL-NOT: "-ldl"
 
 // RUN: %clangxx -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     -target i386-unknown-linux -fsanitize=address \
+// RUN:     -target i386-unknown-linux -stdlib=platform -fsanitize=address \
 // RUN:     -resource-dir=%S/Inputs/empty_resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-ASAN-LINUX-CXX %s
@@ -93,8 +93,8 @@
 // CHECK-ASAN-LINUX-CXX: "-ldl"
 
 // RUN: %clang -no-canonical-prefixes %s -### -o /dev/null -fsanitize=address \
-// RUN:     -target i386-unknown-linux --sysroot=%S/Inputs/basic_linux_tree \
-// RUN:     -lstdc++ -static 2>&1 \
+// RUN:     -target i386-unknown-linux -stdlib=platform \
+// RUN:     --sysroot=%S/Inputs/basic_linux_tree -lstdc++ -static 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-ASAN-LINUX-CXX-STATIC %s
 //
 // CHECK-ASAN-LINUX-CXX-STATIC: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
@@ -152,7 +152,8 @@
 // CHECK-ASAN-ANDROID-SHARED-NOT: "-lpthread"
 
 // RUN: %clangxx -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     -target x86_64-unknown-linux -lstdc++ -fsanitize=thread \
+// RUN:     -target x86_64-unknown-linux -stdlib=platform -lstdc++ \
+// RUN:     -fsanitize=thread \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-TSAN-LINUX-CXX %s
@@ -170,7 +171,8 @@
 // CHECK-TSAN-LINUX-CXX: "-ldl"
 
 // RUN: %clangxx -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     -target x86_64-unknown-linux -lstdc++ -fsanitize=memory \
+// RUN:     -target x86_64-unknown-linux -stdlib=platform -lstdc++ \
+// RUN:     -fsanitize=memory \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-MSAN-LINUX-CXX %s
@@ -209,7 +211,7 @@
 // CHECK-UBSAN-LINUX-LINK-CXX-NOT: "-lstdc++"
 
 // RUN: %clangxx -fsanitize=undefined %s -### -o %t.o 2>&1 \
-// RUN:     -target i386-unknown-linux \
+// RUN:     -target i386-unknown-linux -stdlib=platform \
 // RUN:     -resource-dir=%S/Inputs/resource_dir \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-UBSAN-LINUX-CXX %s
@@ -234,7 +236,7 @@
 // CHECK-ASAN-UBSAN-LINUX: "-lpthread"
 
 // RUN: %clangxx -fsanitize=address,undefined %s -### -o %t.o 2>&1 \
-// RUN:     -target i386-unknown-linux \
+// RUN:     -target i386-unknown-linux -stdlib=platform \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-ASAN-UBSAN-LINUX-CXX %s
 // CHECK-ASAN-UBSAN-LINUX-CXX: "{{.*}}ld{{(.exe)?}}"
@@ -291,6 +293,54 @@
 // CHECK-LSAN-ASAN-LINUX: libclang_rt.asan-x86_64
 // CHECK-LSAN-ASAN-LINUX-NOT: libclang_rt.lsan
 
+// RUN: %clang -fsanitize=address -fsanitize-coverage=func %s -### -o %t.o 2>&1 \
+// RUN:     -target x86_64-unknown-linux \
+// RUN:     --sysroot=%S/Inputs/basic_linux_tree \
+// RUN:   | FileCheck --check-prefix=CHECK-ASAN-COV-LINUX %s
+// CHECK-ASAN-COV-LINUX: "{{.*}}ld{{(.exe)?}}"
+// CHECK-ASAN-COV-LINUX: "-whole-archive" "{{.*}}libclang_rt.asan-x86_64.a" "-no-whole-archive"
+// CHECK-ASAN-COV-LINUX-NOT: libclang_rt.ubsan
+// CHECK-ASAN-COV-LINUX-NOT: "-lstdc++"
+// CHECK-ASAN-COV-LINUX: "-lpthread"
+
+// RUN: %clang -fsanitize=memory -fsanitize-coverage=func %s -### -o %t.o 2>&1 \
+// RUN:     -target x86_64-unknown-linux \
+// RUN:     --sysroot=%S/Inputs/basic_linux_tree \
+// RUN:   | FileCheck --check-prefix=CHECK-MSAN-COV-LINUX %s
+// CHECK-MSAN-COV-LINUX: "{{.*}}ld{{(.exe)?}}"
+// CHECK-MSAN-COV-LINUX: "-whole-archive" "{{.*}}libclang_rt.msan-x86_64.a" "-no-whole-archive"
+// CHECK-MSAN-COV-LINUX-NOT: libclang_rt.ubsan
+// CHECK-MSAN-COV-LINUX-NOT: "-lstdc++"
+// CHECK-MSAN-COV-LINUX: "-lpthread"
+
+// RUN: %clang -fsanitize=dataflow -fsanitize-coverage=func %s -### -o %t.o 2>&1 \
+// RUN:     -target x86_64-unknown-linux \
+// RUN:     --sysroot=%S/Inputs/basic_linux_tree \
+// RUN:   | FileCheck --check-prefix=CHECK-DFSAN-COV-LINUX %s
+// CHECK-DFSAN-COV-LINUX: "{{.*}}ld{{(.exe)?}}"
+// CHECK-DFSAN-COV-LINUX: "-whole-archive" "{{.*}}libclang_rt.dfsan-x86_64.a" "-no-whole-archive"
+// CHECK-DFSAN-COV-LINUX-NOT: libclang_rt.ubsan
+// CHECK-DFSAN-COV-LINUX-NOT: "-lstdc++"
+// CHECK-DFSAN-COV-LINUX: "-lpthread"
+
+// RUN: %clang -fsanitize=undefined -fsanitize-coverage=func %s -### -o %t.o 2>&1 \
+// RUN:     -target x86_64-unknown-linux \
+// RUN:     --sysroot=%S/Inputs/basic_linux_tree \
+// RUN:   | FileCheck --check-prefix=CHECK-UBSAN-COV-LINUX %s
+// CHECK-UBSAN-COV-LINUX: "{{.*}}ld{{(.exe)?}}"
+// CHECK-UBSAN-COV-LINUX: "-whole-archive" "{{.*}}libclang_rt.ubsan_standalone-x86_64.a" "-no-whole-archive"
+// CHECK-UBSAN-COV-LINUX-NOT: "-lstdc++"
+// CHECK-UBSAN-COV-LINUX: "-lpthread"
+
+// RUN: %clang -fsanitize-coverage=func %s -### -o %t.o 2>&1 \
+// RUN:     -target x86_64-unknown-linux \
+// RUN:     --sysroot=%S/Inputs/basic_linux_tree \
+// RUN:   | FileCheck --check-prefix=CHECK-COV-LINUX %s
+// CHECK-COV-LINUX: "{{.*}}ld{{(.exe)?}}"
+// CHECK-COV-LINUX: "-whole-archive" "{{.*}}libclang_rt.ubsan_standalone-x86_64.a" "-no-whole-archive"
+// CHECK-COV-LINUX-NOT: "-lstdc++"
+// CHECK-COV-LINUX: "-lpthread"
+
 // CFI by itself does not link runtime libraries.
 // RUN: %clang -fsanitize=cfi %s -### -o %t.o 2>&1 \
 // RUN:     -target x86_64-unknown-linux \
@@ -327,7 +377,7 @@
 
 // RUN: %clangxx -fsanitize=address %s -### -o %t.o 2>&1 \
 // RUN:     -mmacosx-version-min=10.6 \
-// RUN:     -target x86_64-apple-darwin13.4.0 \
+// RUN:     -target x86_64-apple-darwin13.4.0 -stdlib=platform \
 // RUN:     --sysroot=%S/Inputs/basic_linux_tree \
 // RUN:   | FileCheck --check-prefix=CHECK-ASAN-DARWIN106-CXX %s
 // CHECK-ASAN-DARWIN106-CXX: "{{.*}}ld{{(.exe)?}}"
@@ -345,6 +395,39 @@
 // CHECK-SAFESTACK-LINUX: "-lpthread"
 // CHECK-SAFESTACK-LINUX: "-ldl"
 
+// RUN: %clang -fsanitize=cfi -fsanitize-stats %s -### -o %t.o 2>&1 \
+// RUN:     -target x86_64-unknown-linux \
+// RUN:     --sysroot=%S/Inputs/basic_linux_tree \
+// RUN:   | FileCheck --check-prefix=CHECK-CFI-STATS-LINUX %s
+// CHECK-CFI-STATS-LINUX: "{{.*}}ld{{(.exe)?}}"
+// CHECK-CFI-STATS-LINUX: "-whole-archive" "{{[^"]*}}libclang_rt.stats_client-x86_64.a" "-no-whole-archive"
+// CHECK-CFI-STATS-LINUX-NOT: "-whole-archive"
+// CHECK-CFI-STATS-LINUX: "{{[^"]*}}libclang_rt.stats-x86_64.a"
+
+// RUN: %clang -fsanitize=cfi -fsanitize-stats %s -### -o %t.o 2>&1 \
+// RUN:     -target x86_64-apple-darwin \
+// RUN:     --sysroot=%S/Inputs/basic_linux_tree \
+// RUN:   | FileCheck --check-prefix=CHECK-CFI-STATS-DARWIN %s
+// CHECK-CFI-STATS-DARWIN: "{{.*}}ld{{(.exe)?}}"
+// CHECK-CFI-STATS-DARWIN: "{{[^"]*}}libclang_rt.stats_client_osx.a"
+// CHECK-CFI-STATS-DARWIN: "{{[^"]*}}libclang_rt.stats_osx_dynamic.dylib"
+
+// RUN: %clang -fsanitize=cfi -fsanitize-stats %s -### -o %t.o 2>&1 \
+// RUN:     -target x86_64-pc-windows \
+// RUN:     --sysroot=%S/Inputs/basic_linux_tree \
+// RUN:   | FileCheck --check-prefix=CHECK-CFI-STATS-WIN64 %s
+// CHECK-CFI-STATS-WIN64: "--dependent-lib={{[^"]*}}clang_rt.stats_client-x86_64.lib"
+// CHECK-CFI-STATS-WIN64: "--dependent-lib={{[^"]*}}clang_rt.stats-x86_64.lib"
+// CHECK-CFI-STATS-WIN64: "--linker-option=/include:__sanitizer_stats_register"
+
+// RUN: %clang -fsanitize=cfi -fsanitize-stats %s -### -o %t.o 2>&1 \
+// RUN:     -target i686-pc-windows \
+// RUN:     --sysroot=%S/Inputs/basic_linux_tree \
+// RUN:   | FileCheck --check-prefix=CHECK-CFI-STATS-WIN32 %s
+// CHECK-CFI-STATS-WIN32: "--dependent-lib={{[^"]*}}clang_rt.stats_client-i386.lib"
+// CHECK-CFI-STATS-WIN32: "--dependent-lib={{[^"]*}}clang_rt.stats-i386.lib"
+// CHECK-CFI-STATS-WIN32: "--linker-option=/include:___sanitizer_stats_register"
+
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     -target arm-linux-androideabi -fsanitize=safe-stack \
 // RUN:     --sysroot=%S/Inputs/basic_android_tree \
@@ -389,3 +472,13 @@
 // RUN:   | FileCheck --check-prefix=CHECK-AUBSAN-PS4 %s
 // CHECK-AUBSAN-PS4: "{{.*}}ld{{(.gold)?(.exe)?}}"
 // CHECK-AUBSAN-PS4: -lSceDbgAddressSanitizer_stub_weak
+
+// RUN: %clang -fsanitize=efficiency-cache-frag %s -### -o %t.o 2>&1 \
+// RUN:     -target x86_64-unknown-linux \
+// RUN:   | FileCheck --check-prefix=CHECK-ESAN-LINUX %s
+// RUN: %clang -fsanitize=efficiency-working-set %s -### -o %t.o 2>&1 \
+// RUN:     -target x86_64-unknown-linux \
+// RUN:   | FileCheck --check-prefix=CHECK-ESAN-LINUX %s
+//
+// CHECK-ESAN-LINUX: "{{(.*[^-.0-9A-Z_a-z])?}}ld{{(.exe)?}}"
+// CHECK-ESAN-LINUX: libclang_rt.esan-x86_64.a
diff --git a/test/Driver/save-temps.c b/test/Driver/save-temps.c
index c974d1582c320..29d1b7d9ac8d7 100644
--- a/test/Driver/save-temps.c
+++ b/test/Driver/save-temps.c
@@ -77,3 +77,8 @@
 // CHECK-OBJ-NOO: "-o" "save-temps.s"
 // CHECK-OBJ-NOO: "-o" "save-temps.o"
 // CHECK-OBJ-NOO: "-o" "a.out"
+
+// RUN: %clang -target i386-unknown-freebsd -save-temps -g -c %s -### 2>&1 \
+// RUN:   | FileCheck %s -check-prefix=CHECK-SAVE-TEMPS
+// CHECK-SAVE-TEMPS: "-cc1as"
+// CHECK-SAVE-TEMPS: "-dwarf-version={{.}}"
diff --git a/test/Driver/sparc-as.c b/test/Driver/sparc-as.c
index 5b939956cb38f..80122cf6dc120 100644
--- a/test/Driver/sparc-as.c
+++ b/test/Driver/sparc-as.c
@@ -76,6 +76,38 @@
 // RUN: -no-integrated-as --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
 // RUN: | FileCheck -check-prefix=SPARC-V8PLUSD %s
 
+// RUN: %clang -mcpu=leon2 -no-canonical-prefixes -target sparc \
+// RUN: -no-integrated-as --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: | FileCheck -check-prefix=SPARC-V8 %s
+
+// RUN: %clang -mcpu=at697e -no-canonical-prefixes -target sparc \
+// RUN: -no-integrated-as --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: | FileCheck -check-prefix=SPARC-V8 %s
+
+// RUN: %clang -mcpu=at697f -no-canonical-prefixes -target sparc \
+// RUN: -no-integrated-as --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: | FileCheck -check-prefix=SPARC-V8 %s
+
+// RUN: %clang -mcpu=leon3 -no-canonical-prefixes -target sparc \
+// RUN: -no-integrated-as --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: | FileCheck -check-prefix=SPARC-V8 %s
+
+// RUN: %clang -mcpu=ut699 -no-canonical-prefixes -target sparc \
+// RUN: -no-integrated-as --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: | FileCheck -check-prefix=SPARC-V8 %s
+
+// RUN: %clang -mcpu=gr712rc -no-canonical-prefixes -target sparc \
+// RUN: -no-integrated-as --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: | FileCheck -check-prefix=SPARC-V8 %s
+
+// RUN: %clang -mcpu=leon4 -no-canonical-prefixes -target sparc \
+// RUN: -no-integrated-as --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: | FileCheck -check-prefix=SPARC-V8 %s
+
+// RUN: %clang -mcpu=gr740 -no-canonical-prefixes -target sparc \
+// RUN: -no-integrated-as --sysroot=%S/Inputs/basic_netbsd_tree %s -### 2>&1 \
+// RUN: | FileCheck -check-prefix=SPARC-V8 %s
+
 // SPARC: as{{.*}}" "-32" "-Av8" "-o"
 // SPARC-V8: as{{.*}}" "-32" "-Av8" "-o"
 // SPARC-SPARCLITE: as{{.*}}" "-32" "-Asparclite" "-o"
diff --git a/test/Driver/sparc-float.c b/test/Driver/sparc-float.c
index 6fa47f00cc7a3..c205f5db17ec1 100644
--- a/test/Driver/sparc-float.c
+++ b/test/Driver/sparc-float.c
@@ -18,7 +18,25 @@
 // RUN: %clang -c %s -### -o %t.o 2>&1 \
 // RUN:     -target sparc-linux-gnu -msoft-float \
 // RUN:   | FileCheck --check-prefix=CHECK-SOFT %s
-// CHECK-SOFT: error: unsupported option '-msoft-float'
+// CHECK-SOFT: "-target-feature" "+soft-float"
+//
+// -mfloat-abi=soft
+// RUN: %clang -c %s -### -o %t.o 2>&1 \
+// RUN:     -target sparc-linux-gnu -mfloat-abi=soft \
+// RUN:   | FileCheck --check-prefix=CHECK-FLOATABISOFT %s
+// CHECK-FLOATABISOFT: "-target-feature" "+soft-float"
+//
+// -mfloat-abi=hard
+// RUN: %clang -c %s -### -o %t.o 2>&1 \
+// RUN:     -target sparc-linux-gnu -mfloat-abi=hard \
+// RUN:   | FileCheck --check-prefix=CHECK-FLOATABIHARD %s
+// CHECK-FLOATABIHARD-NOT: "-target-feature" "+soft-float"
+//
+// check invalid -mfloat-abi
+// RUN: %clang -c %s -### -o %t.o 2>&1 \
+// RUN:     -target sparc-linux-gnu -mfloat-abi=x \
+// RUN:   | FileCheck --check-prefix=CHECK-ERRMSG %s
+// CHECK-ERRMSG: error: invalid float ABI '-mfloat-abi=x'
 //
 // Default sparc64
 // RUN: %clang -c %s -### -o %t.o 2>&1 \
@@ -37,4 +55,22 @@
 // RUN: %clang -c %s -### -o %t.o 2>&1 \
 // RUN:     -target sparc64-linux-gnu -msoft-float \
 // RUN:   | FileCheck --check-prefix=CHECK-SOFT-SPARC64 %s
-// CHECK-SOFT-SPARC64: error: unsupported option '-msoft-float'
+// CHECK-SOFT-SPARC64: "-target-feature" "+soft-float"
+//
+// -mfloat-abi=soft
+// RUN: %clang -c %s -### -o %t.o 2>&1 \
+// RUN:     -target sparc64-linux-gnu -mfloat-abi=soft \
+// RUN:   | FileCheck --check-prefix=CHECK-FLOATABISOFT64 %s
+// CHECK-FLOATABISOFT64: "-target-feature" "+soft-float"
+//
+// -mfloat-abi=hard
+// RUN: %clang -c %s -### -o %t.o 2>&1 \
+// RUN:     -target sparc64-linux-gnu -mfloat-abi=hard \
+// RUN:   | FileCheck --check-prefix=CHECK-FLOATABIHARD64 %s
+// CHECK-FLOATABIHARD64-NOT: "-target-feature" "+soft-float"
+//
+// check invalid -mfloat-abi
+// RUN: %clang -c %s -### -o %t.o 2>&1 \
+// RUN:     -target sparc64-linux-gnu -mfloat-abi=x \
+// RUN:   | FileCheck --check-prefix=CHECK-ERRMSG64 %s
+// CHECK-ERRMSG64: error: invalid float ABI '-mfloat-abi=x'
diff --git a/test/Driver/split-debug.h b/test/Driver/split-debug.h
new file mode 100644
index 0000000000000..bb05f30b67541
--- /dev/null
+++ b/test/Driver/split-debug.h
@@ -0,0 +1,15 @@
+// Check that we aren't splitting debug output for modules builds that don't produce object files.
+//
+// RUN: %clang -target x86_64-unknown-linux-gnu -gsplit-dwarf -c -fmodules -### %s 2> %t
+// RUN: FileCheck -check-prefix=CHECK-NO-ACTIONS < %t %s
+//
+// RUN: %clang -target x86_64-unknown-linux-gnu -gsplit-dwarf -c -fmodules -emit-module -fmodules-embed-all-files -fno-implicit-modules -fno-implicit-module-maps -### %s 2> %t
+// RUN: FileCheck -check-prefix=CHECK-NO-ACTIONS < %t %s
+//
+// FIXME: This should fail using clang, except that the type of the output for
+// an object output with modules is given as clang::driver::types::TY_PCH
+// rather than TY_Object.
+// RUN: %clang -target x86_64-unknown-linux-gnu -gsplit-dwarf -c -fmodules -fmodule-format=obj -### %s 2> %t
+// RUN: FileCheck -check-prefix=CHECK-NO-ACTIONS < %t %s
+//
+// CHECK-NO-ACTIONS-NOT: objcopy
diff --git a/test/Driver/split-stack-ld.c b/test/Driver/split-stack-ld.c
new file mode 100644
index 0000000000000..3441d542cb83c
--- /dev/null
+++ b/test/Driver/split-stack-ld.c
@@ -0,0 +1,17 @@
+// Test split stack ld flags.
+//
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:     -target i386-unknown-linux -fsplit-stack \
+// RUN:     -resource-dir=%S/Inputs/resource_dir \
+// RUN:     --sysroot=%S/Inputs/basic_linux_tree \
+// RUN:   | FileCheck --check-prefix=CHECK-LINUX-I386 %s
+//
+// CHECK-LINUX-I386: "--wrap=pthread_create"
+//
+// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
+// RUN:     -target x86_64-unknown-linux -fsplit-stack \
+// RUN:     -resource-dir=%S/Inputs/resource_dir \
+// RUN:     --sysroot=%S/Inputs/basic_linux_tree \
+// RUN:   | FileCheck --check-prefix=CHECK-LINUX-X86-64 %s
+//
+// CHECK-LINUX-X86-64: "--wrap=pthread_create"
diff --git a/test/Driver/unknown-arg.c b/test/Driver/unknown-arg.c
index f834a0e8db92a..755d29f1089da 100644
--- a/test/Driver/unknown-arg.c
+++ b/test/Driver/unknown-arg.c
@@ -1,13 +1,35 @@
-// RUN: not %clang %s -cake-is-lie -%0 -%d -HHHH -munknown-to-clang-option -print-stats -funknown-to-clang-option 2>&1 | \
+// RUN: %clang %s -cake-is-lie -%0 -%d -HHHH -munknown-to-clang-option -print-stats -funknown-to-clang-option -### 2>&1 | \
 // RUN: FileCheck %s
+// RUN: %clang_cl -cake-is-lie -%0 -%d -HHHH -munknown-to-clang-option -print-stats -funknown-to-clang-option -### -c -- %s 2>&1 | \
+// RUN: FileCheck %s --check-prefix=CL
+// RUN: %clang_cl -cake-is-lie -%0 -%d -HHHH -munknown-to-clang-option -print-stats -funknown-to-clang-option -c -Werror=unknown-argument -### -- %s 2>&1 | \
+// RUN: FileCheck %s --check-prefix=CL-ERROR
+// RUN: %clang_cl -cake-is-lie -%0 -%d -HHHH -munknown-to-clang-option -print-stats -funknown-to-clang-option -c -Wno-unknown-argument -### -- %s 2>&1 | \
+// RUN: FileCheck %s --check-prefix=SILENT
 
-// CHECK: unknown argument: '-cake-is-lie'
-// CHECK: unknown argument: '-%0'
-// CHECK: unknown argument: '-%d'
-// CHECK: unknown argument: '-HHHH'
-// CHECK: unknown argument: '-munknown-to-clang-option'
-// CHECK: unknown argument: '-print-stats'
-// CHECK: unknown argument: '-funknown-to-clang-option'
+// CHECK: error: unknown argument: '-cake-is-lie'
+// CHECK: error: unknown argument: '-%0'
+// CHECK: error: unknown argument: '-%d'
+// CHECK: error: unknown argument: '-HHHH'
+// CHECK: error: unknown argument: '-munknown-to-clang-option'
+// CHECK: error: unknown argument: '-print-stats'
+// CHECK: error: unknown argument: '-funknown-to-clang-option'
+// CL: warning: unknown argument ignored in clang-cl: '-cake-is-lie'
+// CL: warning: unknown argument ignored in clang-cl: '-%0'
+// CL: warning: unknown argument ignored in clang-cl: '-%d'
+// CL: warning: unknown argument ignored in clang-cl: '-HHHH'
+// CL: warning: unknown argument ignored in clang-cl: '-munknown-to-clang-option'
+// CL: warning: unknown argument ignored in clang-cl: '-print-stats'
+// CL: warning: unknown argument ignored in clang-cl: '-funknown-to-clang-option'
+// CL-ERROR: error: unknown argument ignored in clang-cl: '-cake-is-lie'
+// CL-ERROR: error: unknown argument ignored in clang-cl: '-%0'
+// CL-ERROR: error: unknown argument ignored in clang-cl: '-%d'
+// CL-ERROR: error: unknown argument ignored in clang-cl: '-HHHH'
+// CL-ERROR: error: unknown argument ignored in clang-cl: '-munknown-to-clang-option'
+// CL-ERROR: error: unknown argument ignored in clang-cl: '-print-stats'
+// CL-ERROR: error: unknown argument ignored in clang-cl: '-funknown-to-clang-option'
+// SILENT-NOT: error:
+// SILENT-NOT: warning:
 
 
 // RUN: %clang -S %s -o %t.s  -Wunknown-to-clang-option 2>&1 | FileCheck --check-prefix=IGNORED %s
diff --git a/test/Driver/wasm-toolchain.c b/test/Driver/wasm-toolchain.c
index b9685b1601923..d0b0293038912 100644
--- a/test/Driver/wasm-toolchain.c
+++ b/test/Driver/wasm-toolchain.c
@@ -25,20 +25,20 @@
 
 // A basic C link command-line.
 
-// RUN: %clang -### -no-canonical-prefixes -target wasm32-unknown-unknown %s 2>&1 | FileCheck -check-prefix=LINK %s
+// RUN: %clang -### -no-canonical-prefixes -target wasm32-unknown-unknown --sysroot=/foo %s 2>&1 | FileCheck -check-prefix=LINK %s
 // LINK: clang{{.*}}" "-cc1" {{.*}} "-o" "[[temp:[^"]*]]"
-// LINK: lld{{.*}}" "-flavor" "ld" "[[temp]]" "-o" "a.out"
+// LINK: lld{{.*}}" "-flavor" "ld" "-L/foo/lib32" "crt1.o" "crti.o" "[[temp]]" "-lc" "-lcompiler_rt" "crtn.o" "-o" "a.out"
 
 // A basic C link command-line with optimization. WebAssembly is somewhat
 // special in enabling --gc-sections by default.
 
-// RUN: %clang -### -O2 -no-canonical-prefixes -target wasm32-unknown-unknown %s 2>&1 | FileCheck -check-prefix=LINK_OPT %s
+// RUN: %clang -### -O2 -no-canonical-prefixes -target wasm32-unknown-unknown --sysroot=/foo %s 2>&1 | FileCheck -check-prefix=LINK_OPT %s
 // LINK_OPT: clang{{.*}}" "-cc1" {{.*}} "-o" "[[temp:[^"]*]]"
-// LINK_OPT: lld{{.*}}" "-flavor" "ld" "--gc-sections" "[[temp]]" "-o" "a.out"
+// LINK_OPT: lld{{.*}}" "-flavor" "ld" "--gc-sections" "-L/foo/lib32" "crt1.o" "crti.o" "[[temp]]" "-lc" "-lcompiler_rt" "crtn.o" "-o" "a.out"
 
 // Ditto, but ensure that a user --no-gc-sections comes after the
 // default --gc-sections.
 
-// RUN: %clang -### -O2 -no-canonical-prefixes -target wasm32-unknown-unknown -Wl,--no-gc-sections %s 2>&1 | FileCheck -check-prefix=NO_GC_SECTIONS %s
+// RUN: %clang -### -O2 -no-canonical-prefixes -target wasm32-unknown-unknown --sysroot=/foo -Wl,--no-gc-sections %s 2>&1 | FileCheck -check-prefix=NO_GC_SECTIONS %s
 // NO_GC_SECTIONS: clang{{.*}}" "-cc1" {{.*}} "-o" "[[temp:[^"]*]]"
-// NO_GC_SECTIONS: lld{{.*}}" "-flavor" "ld" "--gc-sections" "--no-gc-sections" "[[temp]]" "-o" "a.out"
+// NO_GC_SECTIONS: lld{{.*}}" "-flavor" "ld" "--gc-sections" "-L/foo/lib32" "crt1.o" "crti.o" "--no-gc-sections" "[[temp]]" "-lc" "-lcompiler_rt" "crtn.o" "-o" "a.out"
diff --git a/test/Driver/whole-program-vtables.c b/test/Driver/whole-program-vtables.c
new file mode 100644
index 0000000000000..4ca985e6d71de
--- /dev/null
+++ b/test/Driver/whole-program-vtables.c
@@ -0,0 +1,2 @@
+// RUN: %clang -target x86_64-unknown-linux -fwhole-program-vtables -### %s 2>&1 | FileCheck --check-prefix=NO-LTO %s
+// NO-LTO: invalid argument '-fwhole-program-vtables' only allowed with '-flto'
diff --git a/test/Driver/win-macho-unwind.c b/test/Driver/win-macho-unwind.c
new file mode 100644
index 0000000000000..a2895d2fe8c0d
--- /dev/null
+++ b/test/Driver/win-macho-unwind.c
@@ -0,0 +1,4 @@
+// RUN: %clang -target x86_64-pc-win32-macho -### -S %s -o %t.s 2>&1 | FileCheck %s
+ 
+// Do not add function attribute "uwtable" for macho targets.
+// CHECK-NOT: -munwind-tables
diff --git a/test/Driver/windows-cross.c b/test/Driver/windows-cross.c
index d355fbcb4982d..3812287c8aa9d 100644
--- a/test/Driver/windows-cross.c
+++ b/test/Driver/windows-cross.c
@@ -1,9 +1,9 @@
-// RUN: %clang -### -target armv7-windows-itanium --sysroot %S/Inputs/Windows/ARM/8.1 -B %S/Inputs/Windows/ARM/8.1/usr/bin -o /dev/null %s 2>&1 \
+// RUN: %clang -### -target armv7-windows-itanium --sysroot %S/Inputs/Windows/ARM/8.1 -B %S/Inputs/Windows/ARM/8.1/usr/bin -stdlib=libstdc++ -o /dev/null %s 2>&1 \
 // RUN:   | FileCheck %s --check-prefix CHECK-BASIC
 
 // CHECK-BASIC: armv7-windows-itanium-ld" "--sysroot={{.*}}/Inputs/Windows/ARM/8.1" "-m" "thumb2pe" "-Bdynamic" "--entry" "mainCRTStartup" "--allow-multiple-definition" "-o" "{{[^"]*}}" "{{.*}}/Inputs/Windows/ARM/8.1/usr/lib/crtbegin.obj" "-L{{.*}}/Inputs/Windows/ARM/8.1/usr/lib" "-L{{.*}}/Inputs/Windows/ARM/8.1/usr/lib/gcc" "{{.*}}.o" "-lmsvcrt" "-lgcc" "--as-needed" "-lgcc_s" "--no-as-needed"
 
-// RUN: %clang -### -target armv7-windows-itanium --sysroot %s/Inputs/Windows/ARM/8.1 -B %S/Inputs/Windows/ARM/8.1/usr/bin -rtlib=compiler-rt -o /dev/null %s 2>&1 \
+// RUN: %clang -### -target armv7-windows-itanium --sysroot %s/Inputs/Windows/ARM/8.1 -B %S/Inputs/Windows/ARM/8.1/usr/bin -rtlib=compiler-rt -stdlib=libstdc++ -o /dev/null %s 2>&1 \
 // RUN:   | FileCheck %s --check-prefix CHECK-RTLIB
 
 // CHECK-RTLIB: armv7-windows-itanium-ld" "--sysroot={{.*}}/Inputs/Windows/ARM/8.1" "-m" "thumb2pe" "-Bdynamic" "--entry" "mainCRTStartup" "--allow-multiple-definition" "-o" "{{[^"]*}}" "{{.*}}/Inputs/Windows/ARM/8.1/usr/lib/crtbegin.obj" "-L{{.*}}/Inputs/Windows/ARM/8.1/usr/lib" "-L{{.*}}/Inputs/Windows/ARM/8.1/usr/lib/gcc" "{{.*}}.o" "-lmsvcrt" "{{.*[\\/]}}clang_rt.builtins-arm.lib"
@@ -33,7 +33,7 @@
 
 // CHECK-STANDALONE: armv7-windows-itanium-ld" "--sysroot={{.*}}/Inputs/Windows/ARM/8.1" "-m" "thumb2pe" "-shared" "-Bdynamic" "--enable-auto-image-base" "--entry" "_DllMainCRTStartup" "--allow-multiple-definition" "-o" "shared.dll" "--out-implib" "shared.lib" "{{.*}}.o"
 
-// RUN: %clang -### -target armv7-windows-itanium --sysroot %S/Inputs/Windows/ARM/8.1 -B %/Inputs/Windows/ARM/8.1/usr/bin -shared -o shared.dll -x c++ %s 2>&1 \
+// RUN: %clang -### -target armv7-windows-itanium --sysroot %S/Inputs/Windows/ARM/8.1 -B %/Inputs/Windows/ARM/8.1/usr/bin -stdlib=libstdc++ -shared -o shared.dll -x c++ %s 2>&1 \
 // RUN:    | FileCheck %s --check-prefix CHECK-LIBSTDCXX
 
 // CHECK-LIBSTDCXX:  "-internal-isystem" "{{.*}}/usr/include/c++" "-internal-isystem" "{{.*}}/usr/include/c++/armv7--windows-itanium" "-internal-isystem" "{{.*}}/usr/include/c++/backwards"
@@ -67,3 +67,11 @@
 // CHECK-SANITIZE-TSAN: error: unsupported argument 'tsan' to option 'fsanitize='
 // CHECK-SANITIZE-TSAN-NOT: "-fsanitize={{.*}}"
 
+// RUN: %clang -### -target armv7-windows-itanium -isystem-after "Windows Kits/10/Include/10.0.10586.0/ucrt" -isystem-after "Windows Kits/10/Include/10.0.10586.0/um" -isystem-after "Windows Kits/10/Include/10.0.10586.0/shared" -c %s -o /dev/null 2>&1 \
+// RUN:     | FileCheck %s --check-prefix CHECK-ISYSTEM-AFTER
+// CHECK-ISYSTEM-AFTER: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
+// CHECK-ISYSTEM-AFTER: "-internal-isystem" "[[RESOURCE_DIR]]{{(/|\\\\)}}include"
+// CHECK-ISYSTEM-AFTER: "-internal-isystem" "Windows Kits{{[/\\]}}10{{[/\\]}}Include{{[/\\]}}10.0.10586.0{{[/\\]}}ucrt"
+// CHECK-ISYSTEM-AFTER: "-internal-isystem" "Windows Kits{{[/\\]}}10{{[/\\]}}Include{{[/\\]}}10.0.10586.0{{[/\\]}}um"
+// CHECK-ISYSTEM-AFTER: "-internal-isystem" "Windows Kits{{[/\\]}}10{{[/\\]}}Include{{[/\\]}}10.0.10586.0{{[/\\]}}shared"
+
diff --git a/test/Driver/x86-target-features.c b/test/Driver/x86-target-features.c
new file mode 100644
index 0000000000000..ce35b2cfd0dc1
--- /dev/null
+++ b/test/Driver/x86-target-features.c
@@ -0,0 +1,51 @@
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mx87 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=X87 %s
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-x87 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-X87 %s
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -m80387 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=X87 %s
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-80387 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-X87 %s
+// X87: "-target-feature" "+x87"
+// NO-X87: "-target-feature" "-x87"
+
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mmmx -m3dnow -m3dnowa %s -### -o %t.o 2>&1 | FileCheck -check-prefix=MMX %s
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-mmx -mno-3dnow -mno-3dnowa %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-MMX %s
+// MMX: "-target-feature" "+mmx" "-target-feature" "+3dnow" "-target-feature" "+3dnowa"
+// NO-MMX: "-target-feature" "-mmx" "-target-feature" "-3dnow" "-target-feature" "-3dnowa"
+
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -msse -msse2 -msse3 -mssse3 -msse4a -msse4.1 -msse4.2 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=SSE %s
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-sse -mno-sse2 -mno-sse3 -mno-ssse3 -mno-sse4a -mno-sse4.1 -mno-sse4.2 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-SSE %s
+// SSE: "-target-feature" "+sse" "-target-feature" "+sse2" "-target-feature" "+sse3" "-target-feature" "+ssse3" "-target-feature" "+sse4a" "-target-feature" "+sse4.1" "-target-feature" "+sse4.2"
+// NO-SSE: "-target-feature" "-sse" "-target-feature" "-sse2" "-target-feature" "-sse3" "-target-feature" "-ssse3" "-target-feature" "-sse4a" "-target-feature" "-sse4.1" "-target-feature" "-sse4.2"
+
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -msse4 -maes %s -### -o %t.o 2>&1 | FileCheck -check-prefix=SSE4-AES %s
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-sse4 -mno-aes %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-SSE4-AES %s
+// SSE4-AES: "-target-feature" "+sse4.2" "-target-feature" "+aes"
+// NO-SSE4-AES: "-target-feature" "-sse4.1" "-target-feature" "-aes"
+
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mavx -mavx2 -mavx512f -mavx512cd -mavx512er -mavx512pf -mavx512dq -mavx512bw -mavx512vl -mavx512vbmi -mavx512ifma %s -### -o %t.o 2>&1 | FileCheck -check-prefix=AVX %s
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-avx -mno-avx2 -mno-avx512f -mno-avx512cd -mno-avx512er -mno-avx512pf -mno-avx512dq -mno-avx512bw -mno-avx512vl -mno-avx512vbmi -mno-avx512ifma %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-AVX %s
+// AVX: "-target-feature" "+avx" "-target-feature" "+avx2" "-target-feature" "+avx512f" "-target-feature" "+avx512cd" "-target-feature" "+avx512er" "-target-feature" "+avx512pf" "-target-feature" "+avx512dq" "-target-feature" "+avx512bw" "-target-feature" "+avx512vl" "-target-feature" "+avx512vbmi" "-target-feature" "+avx512ifma"
+// NO-AVX: "-target-feature" "-avx" "-target-feature" "-avx2" "-target-feature" "-avx512f" "-target-feature" "-avx512cd" "-target-feature" "-avx512er" "-target-feature" "-avx512pf" "-target-feature" "-avx512dq" "-target-feature" "-avx512bw" "-target-feature" "-avx512vl" "-target-feature" "-avx512vbmi" "-target-feature" "-avx512ifma"
+
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mpclmul -mrdrnd -mfsgsbase -mbmi -mbmi2 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=BMI %s
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-pclmul -mno-rdrnd -mno-fsgsbase -mno-bmi -mno-bmi2 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-BMI %s
+// BMI: "-target-feature" "+pclmul" "-target-feature" "+rdrnd" "-target-feature" "+fsgsbase" "-target-feature" "+bmi" "-target-feature" "+bmi2"
+// NO-BMI: "-target-feature" "-pclmul" "-target-feature" "-rdrnd" "-target-feature" "-fsgsbase" "-target-feature" "-bmi" "-target-feature" "-bmi2"
+
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mlzcnt -mpopcnt -mtbm -mfma -mfma4 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=FMA %s
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-lzcnt -mno-popcnt -mno-tbm -mno-fma -mno-fma4 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-FMA %s
+// FMA: "-target-feature" "+lzcnt" "-target-feature" "+popcnt" "-target-feature" "+tbm" "-target-feature" "+fma" "-target-feature" "+fma4"
+// NO-FMA: "-target-feature" "-lzcnt" "-target-feature" "-popcnt" "-target-feature" "-tbm" "-target-feature" "-fma" "-target-feature" "-fma4"
+
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mxop -mf16c -mrtm -mprfchw -mrdseed %s -### -o %t.o 2>&1 | FileCheck -check-prefix=XOP %s
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-xop -mno-f16c -mno-rtm -mno-prfchw -mno-rdseed %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-XOP %s
+// XOP: "-target-feature" "+xop" "-target-feature" "+f16c" "-target-feature" "+rtm" "-target-feature" "+prfchw" "-target-feature" "+rdseed"
+// NO-XOP: "-target-feature" "-xop" "-target-feature" "-f16c" "-target-feature" "-rtm" "-target-feature" "-prfchw" "-target-feature" "-rdseed"
+
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -msha -mpku -madx -mcx16 -mfxsr %s -### -o %t.o 2>&1 | FileCheck -check-prefix=SHA %s
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-sha -mno-pku -mno-adx -mno-cx16 -mno-fxsr %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-SHA %s
+// SHA: "-target-feature" "+sha" "-target-feature" "+pku" "-target-feature" "+adx" "-target-feature" "+cx16" "-target-feature" "+fxsr"
+// NO-SHA: "-target-feature" "-sha" "-target-feature" "-pku" "-target-feature" "-adx" "-target-feature" "-cx16" "-target-feature" "-fxsr"
+
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mxsave -mxsaveopt -mxsavec -mxsaves %s -### -o %t.o 2>&1 | FileCheck -check-prefix=XSAVE %s
+// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-xsave -mno-xsaveopt -mno-xsavec -mno-xsaves %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-XSAVE %s
+// XSAVE: "-target-feature" "+xsave" "-target-feature" "+xsaveopt" "-target-feature" "+xsavec" "-target-feature" "+xsaves"
+// NO-XSAVE: "-target-feature" "-xsave" "-target-feature" "-xsaveopt" "-target-feature" "-xsavec" "-target-feature" "-xsaves"
diff --git a/test/FixIt/fixit-errors.c b/test/FixIt/fixit-errors.c
index d727adb6fdfbe..1ac9d1c643206 100644
--- a/test/FixIt/fixit-errors.c
+++ b/test/FixIt/fixit-errors.c
@@ -22,6 +22,8 @@ void test_point() {
   (void)get_origin->x; // expected-error {{base of member reference is a function; perhaps you meant to call it with no arguments?}}
 }
 
+// These errors require C11.
+#if __STDC_VERSION__ > 199901L
 void noreturn_1() _Noreturn; // expected-error {{must precede function declarator}}
 void noreturn_1() {
   return; // expected-warning {{should not return}}
@@ -29,3 +31,4 @@ void noreturn_1() {
 void noreturn_2() _Noreturn { // expected-error {{must precede function declarator}}
   return; // expected-warning {{should not return}}
 }
+#endif
diff --git a/test/FixIt/fixit-interface-as-param.m b/test/FixIt/fixit-interface-as-param.m
index 75da5720d7f01..748d83e07005a 100644
--- a/test/FixIt/fixit-interface-as-param.m
+++ b/test/FixIt/fixit-interface-as-param.m
@@ -1,11 +1,20 @@
-// RUN: not %clang_cc1 -triple x86_64-apple-darwin10  -fdiagnostics-parseable-fixits -x objective-c %s 2>&1 | FileCheck %s
+// RUN: not %clang_cc1 -triple x86_64-apple-darwin10 -fblocks -fdiagnostics-parseable-fixits -x objective-c %s 2>&1 | FileCheck %s
 // rdar://11311333
 
 @interface NSView @end
 
 @interface INTF
 - (void) drawRect : inView:(NSView)view;
+- (void)test:(NSView )a;
+- (void)foo;
 @end
 
 // CHECK: {7:35-7:35}:"*"
-
+// CHECK: {8:21-8:21}:"*"
+@implementation INTF
+-(void)foo {
+  ^(NSView view) {
+  };
+}
+@end
+// CHECK: {16:11-16:11}:"*"
diff --git a/test/FixIt/fixit-objc.m b/test/FixIt/fixit-objc.m
index f41f75f1d7be9..3e9ff605233b8 100644
--- a/test/FixIt/fixit-objc.m
+++ b/test/FixIt/fixit-objc.m
@@ -67,3 +67,11 @@ void sentinel_test(Sentinel *a) {
   sentinel(1, 2, 3); // expected-warning{{missing sentinel in function call}}
   [a sentinel:1, 2, 3]; // expected-warning{{missing sentinel in method dispatch}}
 }
+
+@interface A
+@property (class) int c;
+@end
+
+int test(A *a) {
+  return a.c; // expected-error {{property 'c' is a class property; did you mean to access it with class 'A'}}
+}
diff --git a/test/FixIt/fixit-vexing-parse.cpp b/test/FixIt/fixit-vexing-parse.cpp
index 0232f5dbb04e9..71d3eff5329ad 100644
--- a/test/FixIt/fixit-vexing-parse.cpp
+++ b/test/FixIt/fixit-vexing-parse.cpp
@@ -60,7 +60,7 @@ namespace N {
     VO m(int (*p)[4]);
 
     // Don't emit warning and fixit because direct initializer is not permitted here.
-    if (int n(int())){} // expected-error {{function type is not allowed here}} expected-error {{condition must have an initializer}}
+    if (int n(int())){} // expected-error {{function type is not allowed here}}
 
     // CHECK: fix-it:"{{.*}}":{66:8-66:10}:" = {}"
     U u(); // expected-warning {{function declaration}} expected-note {{replace parentheses with an initializer}}
diff --git a/test/FixIt/typo.m b/test/FixIt/typo.m
index 381233f95c29a..53afe7257d2a0 100644
--- a/test/FixIt/typo.m
+++ b/test/FixIt/typo.m
@@ -103,7 +103,7 @@ void test2(Collide *a) {
 @end
 
 @interface Sub : Super
-- (int)method;
+- (int)method; // expected-note{{also found}}
 @end
 
 @implementation Sub
@@ -113,8 +113,6 @@ void test2(Collide *a) {
   
 @end
 
-double *isupper(int);
-
 @interface Sub2 : Super
 - (int)method2;
 @end
diff --git a/test/Frontend/backend-option.c b/test/Frontend/backend-option.c
new file mode 100644
index 0000000000000..e17757422e4ba
--- /dev/null
+++ b/test/Frontend/backend-option.c
@@ -0,0 +1,4 @@
+// RUN: %clang_cc1 %s -emit-llvm -backend-option -time-passes -o - 2>&1 | FileCheck %s
+// RUN: %clang_cc1 %s -emit-llvm -backend-option -time-passes -o - -triple spir-unknown-unknown 2>&1 | FileCheck %s
+// CHECK: Pass execution timing report
+
diff --git a/test/Frontend/dependency-gen.c b/test/Frontend/dependency-gen.c
index 054aa79e3d495..e4b0feea16a96 100644
--- a/test/Frontend/dependency-gen.c
+++ b/test/Frontend/dependency-gen.c
@@ -21,7 +21,7 @@
 // RUN: %clang -MD -MF - %s -fsyntax-only -I ./ | FileCheck -check-prefix=CHECK-SIX %s
 // CHECK-SIX: {{ }}x.h
 // RUN: echo "fun:foo" > %t.blacklist
-// RUN: %clang -MD -MF - %s -fsyntax-only -fsanitize=cfi-vcall -flto -fsanitize-blacklist=%t.blacklist -I ./ | FileCheck -check-prefix=CHECK-SEVEN %s
+// RUN: %clang -MD -MF - %s -fsyntax-only -fsanitize=cfi-vcall -flto -fvisibility=hidden -fsanitize-blacklist=%t.blacklist -I ./ | FileCheck -check-prefix=CHECK-SEVEN %s
 // CHECK-SEVEN: .blacklist
 // CHECK-SEVEN: {{ }}x.h
 #ifndef INCLUDE_FLAG_TEST
diff --git a/test/Frontend/embed-bitcode.ll b/test/Frontend/embed-bitcode.ll
new file mode 100644
index 0000000000000..bd2afb44bb0fd
--- /dev/null
+++ b/test/Frontend/embed-bitcode.ll
@@ -0,0 +1,59 @@
+; REQUIRES: arm-registered-target
+; REQUIRES: aarch64-registered-target
+; check .ll input
+; RUN: %clang_cc1 -triple thumbv7-apple-ios8.0.0 -emit-llvm \
+; RUN:    -fembed-bitcode=all -x ir %s -o - \
+; RUN:    | FileCheck %s
+; RUN: %clang_cc1 -triple thumbv7-apple-ios8.0.0 -emit-llvm \
+; RUN:    -fembed-bitcode=marker -x ir %s -o - \
+; RUN:    | FileCheck %s -check-prefix=CHECK-MARKER
+; RUN: %clang_cc1 -triple aarch64-unknown-linux-gnueabi -emit-llvm \
+; RUN:    -fembed-bitcode=all -x ir %s -o - \
+; RUN:    | FileCheck %s -check-prefix=CHECK-ELF
+
+; check .bc input
+; RUN: %clang_cc1 -triple thumbv7-apple-ios8.0.0 -emit-llvm-bc \
+; RUN:    -x ir %s -o %t.bc
+; RUN: %clang_cc1 -triple thumbv7-apple-ios8.0.0 -emit-llvm \
+; RUN:    -fembed-bitcode=all -x ir %t.bc -o - \
+; RUN:    | FileCheck %s
+; RUN: %clang_cc1 -triple thumbv7-apple-ios8.0.0 -emit-llvm \
+; RUN:    -fembed-bitcode=bitcode -x ir %t.bc -o - \
+; RUN:    | FileCheck %s -check-prefix=CHECK-ONLY-BITCODE
+; RUN: %clang_cc1 -triple thumbv7-apple-ios8.0.0 -emit-llvm \
+; RUN:    -fembed-bitcode=marker -x ir %t.bc -o - \
+; RUN:    | FileCheck %s -check-prefix=CHECK-MARKER
+
+; run through -fembed-bitcode twice and make sure it doesn't crash
+; RUN: %clang_cc1 -triple thumbv7-apple-ios8.0.0 -emit-llvm-bc \
+; RUN:    -fembed-bitcode=all -x ir %s -o - \
+; RUN: | %clang_cc1 -triple thumbv7-apple-ios8.0.0 -emit-llvm \
+; RUN:    -fembed-bitcode=all -x ir - -o /dev/null
+
+; check the magic number of bitcode at the beginning of the string
+; CHECK: @llvm.embedded.module = private constant
+; CHECK: c"\DE\C0\17\0B
+; CHECK: section "__LLVM,__bitcode"
+; CHECK: @llvm.cmdline = private constant
+; CHECK: section "__LLVM,__cmdline"
+
+; CHECK-ELF: @llvm.embedded.module
+; CHECK-ELF: section ".llvmbc"
+; CHECK-ELF: @llvm.cmdline
+; CHECK-ELF: section ".llvmcmd"
+
+; CHECK-ONLY-BITCODE: @llvm.embedded.module = private constant
+; CHECK-ONLY-BITCODE: c"\DE\C0\17\0B
+; CHECK-ONLY-BITCODE: section "__LLVM,__bitcode"
+; CHECK-ONLY-BITCODE-NOT: @llvm.cmdline = private constant
+; CHECK-ONLY-BITCODE-NOT: section "__LLVM,__cmdline"
+
+; CHECK-MARKER: @llvm.embedded.module
+; CHECK-MARKER: constant [0 x i8] zeroinitializer
+; CHECK-MARKER: section "__LLVM,__bitcode"
+; CHECK-MARKER: @llvm.cmdline
+; CHECK-MARKER: section "__LLVM,__cmdline"
+
+define i32 @f0() {
+  ret i32 0
+}
diff --git a/test/Frontend/gnu-mcount.c b/test/Frontend/gnu-mcount.c
new file mode 100644
index 0000000000000..c279b89ddadf8
--- /dev/null
+++ b/test/Frontend/gnu-mcount.c
@@ -0,0 +1,78 @@
+// RUN: %clang -target armv7-unknown-none-eabi -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM-EABI
+// RUN: %clang -target armv7-unknown-none-eabi -pg -meabi gnu -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM-EABI-MEABI-GNU
+// RUN: %clang -target aarch64-unknown-none-eabi -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM64-EABI
+// RUN: %clang -target aarch64-unknown-none-eabi -pg -meabi gnu -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM64-EABI-MEABI-GNU
+// RUN: %clang -target armv7-unknown-linux-gnueabi -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM-EABI
+// RUN: %clang -target armv7-unknown-linux-gnueabi -meabi gnu -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM-EABI-MEABI-GNU
+// RUN: %clang -target aarch64-unknown-linux-gnueabi -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM64-EABI
+// RUN: %clang -target aarch64-unknown-linux-gnueabi -meabi gnu -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM64-EABI-MEABI-GNU
+// RUN: %clang -target armv7-unknown-linux-gnueabihf -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM-EABI
+// RUN: %clang -target armv7-unknown-linux-gnueabihf -meabi gnu -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM-EABI-MEABI-GNU
+// RUN: %clang -target aarch64-unknown-linux-gnueabihf -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM64-EABI
+// RUN: %clang -target aarch64-unknown-linux-gnueabihf -meabi gnu -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM64-EABI-MEABI-GNU
+// RUN: %clang -target armv7-unknown-freebsd-gnueabihf -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM-EABI-FREEBSD
+// RUN: %clang -target armv7-unknown-freebsd-gnueabihf -meabi gnu -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM-EABI-FREEBSD
+// RUN: %clang -target aarch64-unknown-freebsd-gnueabihf -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM64-EABI-FREEBSD
+// RUN: %clang -target aarch64-unknown-freebsd-gnueabihf -meabi gnu -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM64-EABI-FREEBSD
+// RUN: %clang -target armv7-unknown-openbsd-gnueabihf -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM-EABI-OPENBSD
+// RUN: %clang -target armv7-unknown-openbsd-gnueabihf -meabi gnu -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM-EABI-OPENBSD
+// RUN: %clang -target aarch64-unknown-openbsd-gnueabihf -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM64-EABI-OPENBSD
+// RUN: %clang -target aarch64-unknown-openbsd-gnueabihf -meabi gnu -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM64-EABI-OPENBSD
+// RUN: %clang -target armv7-unknown-netbsd-gnueabihf -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM-EABI-NETBSD
+// RUN: %clang -target armv7-unknown-netbsd-gnueabihf -meabi gnu -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM-EABI-NETBSD
+// RUN: %clang -target aarch64-unknown-netbsd-gnueabihf -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM64-EABI-NETBSD
+// RUN: %clang -target aarch64-unknown-netbsd-gnueabihf -meabi gnu -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM64-EABI-NETBSD
+// RUN: %clang -target armv7-apple-ios -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM-IOS
+// RUN: %clang -target armv7-apple-ios -pg -meabi gnu -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM-IOS
+// RUN: %clang -target arm64-apple-ios -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM-IOS
+// RUN: %clang -target arm64-apple-ios -pg -meabi gnu -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM-IOS
+// RUN: %clang -target armv7-unknown-bitrig-gnueabihf -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM-EABI-BIGRIG
+// RUN: %clang -target armv7-unknown-bitrig-gnueabihf -meabi gnu -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM-EABI-BIGRIG
+// RUN: %clang -target aarch64-unknown-bitrig-gnueabihf -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM64-EABI-BITRIG
+// RUN: %clang -target aarch64-unknown-bitrig-gnueabihf -meabi gnu -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM64-EABI-BITRIG
+// RUN: %clang -target armv7-unknown-rtems-gnueabihf -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM-EABI-RTEMS
+// RUN: %clang -target armv7-unknown-rtems-gnueabihf -meabi gnu -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM-EABI-RTEMS
+// RUN: %clang -target aarch64-unknown-rtems-gnueabihf -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM64-EABI-RTEMS
+// RUN: %clang -target aarch64-unknown-rtems-gnueabihf -meabi gnu -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM64-EABI-RTEMS
+// RUN: %clang -target armv7-unknown-cloudabi-gnueabihf -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM-EABI-CLOUDABI
+// RUN: %clang -target armv7-unknown-cloudabi-gnueabihf -meabi gnu -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM-EABI-CLOUDABI
+// RUN: %clang -target aarch64-unknown-cloudabi-gnueabihf -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM64-EABI-CLOUDABI
+// RUN: %clang -target aarch64-unknown-cloudabi-gnueabihf -meabi gnu -pg -S -emit-llvm -o - %s | FileCheck %s -check-prefix CHECK -check-prefix CHECK-ARM64-EABI-CLOUDABI
+
+int f() {
+  return 0;
+}
+
+// CHECK-LABEL: f
+// CHECK-ARM-IOS-NOT: call void @_mcount()
+// CHECK-ARM-IOS-NOT: call void @"\01__gnu_mcount_nc"()
+// CHECK-ARM-EABI: call void @"\01mcount"()
+// CHECK-ARM-EABI-NOT: call void @"\01__gnu_mcount_nc"()
+// CHECK-ARM64-EABI: call void @mcount()
+// CHECK-ARM64-EABI-MEABI-GNU: call void @"\01_mcount"()
+// CHECK-ARM64-EABI-NOT: call void @"\01__gnu_mcount_nc"()
+// CHECK-ARM-EABI-FREEBSD: call void @__mcount()
+// CHECK-ARM-EABI-FREEBSD-NOT: call void @"\01__gnu_mcount_nc"()
+// CHECK-ARM64-EABI-FREEBSD: call void @.mcount()
+// CHECK-ARM64-EABI-FREEBSD-NOT: call void @"\01__gnu_mcount_nc"()
+// CHECK-ARM-EABI-NETBSD: call void @_mcount()
+// CHECK-ARM-EABI-NETBSD-NOT: call void @"\01__gnu_mcount_nc"()
+// CHECK-ARM-EABI-OPENBSD: call void @__mcount()
+// CHECK-ARM-EABI-OPENBSD-NOT: call void @"\01__gnu_mcount_nc"()
+// CHECK-ARM64-EABI-OPENBSD: call void @mcount()
+// CHECK-ARM64-EABI-OPENBSD-NOT: call void @"\01__gnu_mcount_nc"()
+// CHECK-ARM-EABI-MEABI-GNU-NOT: call void @mcount()
+// CHECK-ARM-EABI-MEABI-GNU: call void @"\01__gnu_mcount_nc"()
+// CHECK-ARM-EABI-BITRIG: call void @__mcount()
+// CHECK-ARM-EABI-BITRIG-NOT: call void @"\01__gnu_mcount_nc"()
+// CHECK-ARM54-EABI-BITRIG: call void @mcount()
+// CHECK-ARM54-EABI-BITRIG-NOT: call void @"\01__gnu_mcount_nc"()
+// CHECK-ARM-EABI-RTEMS: call void @mcount()
+// CHECK-ARM-EABI-RTEMS-NOT: call void @"\01__gnu_mcount_nc"()
+// CHECK-ARM64-EABI-RTEMS: call void @mcount()
+// CHECK-ARM64-EABI-RTEMS-NOT: call void @"\01__gnu_mcount_nc"()
+// CHECK-ARM-EABI-CLOUDABI: call void @mcount()
+// CHECK-ARM-EABI-CLOUDABI-NOT: call void @"\01__gnu_mcount_nc"()
+// CHECK-ARM64-EABI-CLOUDABI: call void @mcount()
+// CHECK-ARM64-EABI-CLOUDABI-NOT: call void @"\01__gnu_mcount_nc"()
+
diff --git a/test/Frontend/lit.local.cfg b/test/Frontend/lit.local.cfg
index c11fb6d611a58..7a05c5dfd2597 100644
--- a/test/Frontend/lit.local.cfg
+++ b/test/Frontend/lit.local.cfg
@@ -1 +1 @@
-config.suffixes = ['.c', '.cpp', '.m', '.mm', '.ll']
+config.suffixes = ['.c', '.cpp', '.m', '.mm', '.ll', '.cl']
diff --git a/test/Frontend/opencl.cl b/test/Frontend/opencl.cl
new file mode 100644
index 0000000000000..95b5f14716252
--- /dev/null
+++ b/test/Frontend/opencl.cl
@@ -0,0 +1,27 @@
+// RUN: %clang_cc1 %s -verify -fsyntax-only
+// RUN: %clang_cc1 %s -verify -fsyntax-only -cl-std=CL1.1
+// RUN: %clang_cc1 %s -verify -fsyntax-only -cl-std=CL1.2
+// RUN: %clang_cc1 %s -verify -fsyntax-only -cl-std=CL2.0
+// RUN: %clang_cc1 %s -verify -fsyntax-only -fblocks -DBLOCKS
+// RUN: %clang_cc1 %s -verify -fsyntax-only -cl-std=CL1.1 -fblocks -DBLOCKS
+// RUN: %clang_cc1 %s -verify -fsyntax-only -cl-std=CL1.2 -fblocks -DBLOCKS
+// RUN: %clang_cc1 %s -triple amdgcn--amdhsa -x c -std=c99 -verify -fsyntax-only
+// RUN: %clang_cc1 -cl-std=CL1.1 -cl-strict-aliasing -fblocks %s 2>&1 | FileCheck --check-prefix=CHECK-INVALID-OPENCL-VERSION11 %s
+// RUN: %clang_cc1 -cl-std=CL1.2 -cl-strict-aliasing -fblocks %s 2>&1 | FileCheck --check-prefix=CHECK-INVALID-OPENCL-VERSION12 %s
+// RUN: %clang_cc1 -cl-std=CL2.0 -cl-strict-aliasing %s 2>&1 | FileCheck --check-prefix=CHECK-INVALID-OPENCL-VERSION20 %s
+
+void f(void (^g)(void)) {
+#ifdef __OPENCL_C_VERSION__
+#if __OPENCL_C_VERSION__ < CL_VERSION_2_0 && !defined(BLOCKS)
+  // expected-error@-3{{blocks support disabled - compile with -fblocks or for OpenCL 2.0 or above}}
+#else
+  // expected-no-diagnostics
+#endif
+#else
+  // expected-error@-8{{blocks support disabled - compile with -fblocks or pick a deployment target that supports them}}
+#endif
+}
+
+// CHECK-INVALID-OPENCL-VERSION11: warning: OpenCL version 1.1 does not support the option '-cl-strict-aliasing'
+// CHECK-INVALID-OPENCL-VERSION12: warning: OpenCL version 1.2 does not support the option '-cl-strict-aliasing'
+// CHECK-INVALID-OPENCL-VERSION20: warning: OpenCL version 2.0 does not support the option '-cl-strict-aliasing'
diff --git a/test/Frontend/optimization-remark-analysis.c b/test/Frontend/optimization-remark-analysis.c
index 5b4d9aec41ae0..b396327976186 100644
--- a/test/Frontend/optimization-remark-analysis.c
+++ b/test/Frontend/optimization-remark-analysis.c
@@ -1,8 +1,8 @@
 // RUN: %clang -O1 -fvectorize -target x86_64-unknown-unknown -emit-llvm -Rpass-analysis -S %s -o - 2>&1 | FileCheck %s --check-prefix=RPASS
 // RUN: %clang -O1 -fvectorize -target x86_64-unknown-unknown -emit-llvm -S %s -o - 2>&1 | FileCheck %s
 
-// RPASS: {{.*}}:21:1: remark: loop not vectorized: loop contains a switch statement
-// CHECK-NOT: {{.*}}:21:1: remark: loop not vectorized: loop contains a switch statement
+// RPASS: {{.*}}:7:8: remark: loop not vectorized: loop contains a switch statement
+// CHECK-NOT: {{.*}}:7:8: remark: loop not vectorized: loop contains a switch statement
 
 double foo(int N, int *Array) {
   double v = 0.0;
diff --git a/test/Frontend/optimization-remark-options.c b/test/Frontend/optimization-remark-options.c
index 74fbeaf5d0c17..a2d717a2422df 100644
--- a/test/Frontend/optimization-remark-options.c
+++ b/test/Frontend/optimization-remark-options.c
@@ -11,7 +11,7 @@ double foo(int N) {
   return v;
 }
 
-// CHECK: {{.*}}:18:13: remark: loop not vectorized: cannot prove it is safe to reorder memory operations; allow reordering by specifying '#pragma clang loop vectorize(enable)' before the loop. If the arrays will always be independent specify '#pragma clang loop vectorize(assume_safety)' before the loop or provide the '__restrict__' qualifier with the independent array arguments. Erroneous results will occur if these options are incorrectly applied!
+// CHECK: {{.*}}:17:3: remark: loop not vectorized: cannot prove it is safe to reorder memory operations; allow reordering by specifying '#pragma clang loop vectorize(enable)' before the loop. If the arrays will always be independent specify '#pragma clang loop vectorize(assume_safety)' before the loop or provide the '__restrict__' qualifier with the independent array arguments. Erroneous results will occur if these options are incorrectly applied!
 
 void foo2(int *dw, int *uw, int *A, int *B, int *C, int *D, int N) {
   for (int i = 0; i < N; i++) {
diff --git a/test/Frontend/optimization-remark.c b/test/Frontend/optimization-remark.c
index e5bd75cbdb30b..72cad8ff608c4 100644
--- a/test/Frontend/optimization-remark.c
+++ b/test/Frontend/optimization-remark.c
@@ -27,9 +27,10 @@
 // CHECK: , !dbg !
 // CHECK-NOT: DW_TAG_base_type
 
-// But llvm.dbg.cu should be missing (to prevent writing debug info to
+// The CU should be marked NoDebug (to prevent writing debug info to
 // the final output).
-// CHECK-NOT: !llvm.dbg.cu = !{
+// CHECK: !llvm.dbg.cu = !{![[CU:.*]]}
+// CHECK: ![[CU]] = distinct !DICompileUnit({{.*}}emissionKind: NoDebug
 
 int foo(int x, int y) __attribute__((always_inline));
 int foo(int x, int y) { return x + y; }
diff --git a/test/Frontend/plugin-annotate-functions.c b/test/Frontend/plugin-annotate-functions.c
new file mode 100644
index 0000000000000..b8baf7ce77ee0
--- /dev/null
+++ b/test/Frontend/plugin-annotate-functions.c
@@ -0,0 +1,25 @@
+// RUN: %clang -fplugin=%llvmshlibdir/AnnotateFunctions%pluginext -emit-llvm -DPRAGMA_ON -S %s -o - | FileCheck %s --check-prefix=PRAGMA
+// RUN: %clang -fplugin=%llvmshlibdir/AnnotateFunctions%pluginext -emit-llvm -S %s -o - | FileCheck %s --check-prefix=NOPRAGMA
+// RUN: not %clang -fplugin=%llvmshlibdir/AnnotateFunctions%pluginext -emit-llvm -DBAD_PRAGMA -S %s -o - 2>&1 | FileCheck %s --check-prefix=BADPRAGMA
+// REQUIRES: plugins, examples
+
+#ifdef PRAGMA_ON
+#pragma enable_annotate
+#endif
+
+// BADPRAGMA: warning: extra tokens at end of #pragma directive
+#ifdef BAD_PRAGMA
+#pragma enable_annotate something
+#endif
+
+// PRAGMA: [[STR_VAR:@.+]] = private unnamed_addr constant [19 x i8] c"example_annotation\00"
+// PRAGMA: @llvm.global.annotations = {{.*}}@fn1{{.*}}[[STR_VAR]]{{.*}}@fn2{{.*}}[[STR_VAR]]
+// NOPRAGMA-NOT: [[STR_VAR:@.+]] = private unnamed_addr constant [19 x i8] c"example_annotation\00"
+// NOPRAGMA-NOT: @llvm.global.annotations = {{.*}}@fn1{{.*}}[[STR_VAR]]{{.*}}@fn2{{.*}}[[STR_VAR]]
+void fn1() { }
+void fn2() { }
+
+// BADPRAGMA: error: #pragma enable_annotate not allowed after declarations
+#ifdef BAD_PRAGMA
+#pragma enable_annotate
+#endif
diff --git a/test/Frontend/print-header-includes.c b/test/Frontend/print-header-includes.c
index 3f2b0696cd1ed..045c02b941bc8 100644
--- a/test/Frontend/print-header-includes.c
+++ b/test/Frontend/print-header-includes.c
@@ -1,24 +1,24 @@
-// RUN: cd %S
-// RUN: %clang_cc1 -include Inputs/test3.h -E -H -o %t.out %s 2> %t.stderr
+// RUN: %clang_cc1 -I%S -include Inputs/test3.h -E -H -o /dev/null %s 2> %t.stderr
 // RUN: FileCheck < %t.stderr %s
 
 // CHECK-NOT: test3.h
 // CHECK: . {{.*test.h}}
 // CHECK: .. {{.*test2.h}}
 
-// RUN: %clang_cc1 -include Inputs/test3.h -E --show-includes -o %t.out %s > %t.stdout
-// RUN: FileCheck --check-prefix=MS < %t.stdout %s
-// MS-NOT: test3.h
-// MS: Note: including file: {{.*test.h}}
-// MS: Note: including file:  {{.*test2.h}}
+// RUN: %clang_cc1 -I%S -include Inputs/test3.h -E --show-includes -o /dev/null %s | \
+// RUN:     FileCheck --strict-whitespace --check-prefix=MS %s
+// MS-NOT: <command line>
+// MS: Note: including file: {{[^ ]*test3.h}}
+// MS: Note: including file: {{[^ ]*test.h}}
+// MS: Note: including file:  {{[^ ]*test2.h}}
 // MS-NOT: Note
 
 // RUN: echo "fun:foo" > %t.blacklist
-// RUN: %clang_cc1 -fsanitize=address -fdepfile-entry=%t.blacklist -E --show-includes -o %t.out %s > %t.stdout
-// RUN: FileCheck --check-prefix=MS-BLACKLIST < %t.stdout %s
-// MS-BLACKLIST: Note: including file: {{.*\.blacklist}}
-// MS-BLACKLIST: Note: including file: {{.*test.h}}
-// MS-BLACKLIST: Note: including file:  {{.*test2.h}}
+// RUN: %clang_cc1 -I%S -fsanitize=address -fdepfile-entry=%t.blacklist -E --show-includes -o /dev/null %s | \
+// RUN:     FileCheck --strict-whitespace --check-prefix=MS-BLACKLIST %s
+// MS-BLACKLIST: Note: including file: {{[^ ]*\.blacklist}}
+// MS-BLACKLIST: Note: including file: {{[^ ]*test.h}}
+// MS-BLACKLIST: Note: including file:  {{[^ ]*test2.h}}
 // MS-BLACKLIST-NOT: Note
 
 #include "Inputs/test.h"
diff --git a/test/Frontend/profile-sample-use-loc-tracking.c b/test/Frontend/profile-sample-use-loc-tracking.c
index 31faad68d39e1..6d722c2985252 100644
--- a/test/Frontend/profile-sample-use-loc-tracking.c
+++ b/test/Frontend/profile-sample-use-loc-tracking.c
@@ -10,9 +10,10 @@
 // CHECK: , !dbg !
 // CHECK-NOT: DW_TAG_base_type
 
-// But llvm.dbg.cu should be missing (to prevent writing debug info to
+// The CU should be marked NoDebug (to prevent writing debug info to
 // the final output).
-// CHECK-NOT: !llvm.dbg.cu = !{
+// CHECK: !llvm.dbg.cu = !{![[CU:.*]]}
+// CHECK: ![[CU]] = distinct !DICompileUnit({{.*}}emissionKind: NoDebug
 
 int bar(int j) {
   return (j + j - 2) * (j - 2) * j;
diff --git a/test/Frontend/std.cl b/test/Frontend/std.cl
deleted file mode 100644
index b811b64dabcb8..0000000000000
--- a/test/Frontend/std.cl
+++ /dev/null
@@ -1,9 +0,0 @@
-// RUN: %clang_cc1 %s -fsyntax-only -cl-std=CL
-// RUN: %clang_cc1 %s -fsyntax-only -cl-std=CL1.1
-// RUN: %clang_cc1 %s -fsyntax-only -cl-std=CL1.2
-// RUN: %clang_cc1 %s -fsyntax-only -cl-std=CL2.0
-// RUN: not %clang_cc1 %s -fsyntax-only -cl-std=invalid -DINVALID 2>&1 | FileCheck %s
-
-#ifdef INVALID 
-// CHECK: invalid value 'invalid' in '-cl-std=invalid'
-#endif
diff --git a/test/Frontend/stdlang.c b/test/Frontend/stdlang.c
index 71997f1403c39..9c3c3078aaf2a 100644
--- a/test/Frontend/stdlang.c
+++ b/test/Frontend/stdlang.c
@@ -1,6 +1,17 @@
 // RUN: %clang_cc1 -x cuda -std=c++11 -DCUDA %s
-// RUN: %clang_cc1 -x cl -std=c99 -DOPENCL %s
-// expected-no-diagnostics
+// RUN: %clang_cc1 -x cl -DOPENCL %s
+// RUN: %clang_cc1 -x cl -cl-std=cl -DOPENCL %s
+// RUN: %clang_cc1 -x cl -cl-std=cl1.1 -DOPENCL %s
+// RUN: %clang_cc1 -x cl -cl-std=cl1.2 -DOPENCL %s
+// RUN: %clang_cc1 -x cl -cl-std=cl2.0 -DOPENCL %s
+// RUN: %clang_cc1 -x cl -cl-std=CL -DOPENCL %s
+// RUN: %clang_cc1 -x cl -cl-std=CL1.1 -DOPENCL %s
+// RUN: %clang_cc1 -x cl -cl-std=CL1.2 -DOPENCL %s
+// RUN: %clang_cc1 -x cl -cl-std=CL2.0 -DOPENCL %s
+// RUN: not %clang_cc1 -x cl -std=c99 -DOPENCL %s 2>&1 | FileCheck --check-prefix=CHECK-C99 %s
+// RUN: not %clang_cc1 -x cl -cl-std=invalid -DOPENCL %s 2>&1 | FileCheck --check-prefix=CHECK-INVALID %s
+// CHECK-C99: error: invalid argument '-std=c99' not allowed with 'OpenCL'
+// CHECK-INVALID: error: invalid value 'invalid' in '-cl-std=invalid'
 
 #if defined(CUDA)
   __attribute__((device)) void f_device();
diff --git a/test/Headers/cxx11.cpp b/test/Headers/cxx11.cpp
index 0b35a7c2bdfa1..324bd9958560a 100644
--- a/test/Headers/cxx11.cpp
+++ b/test/Headers/cxx11.cpp
@@ -1,6 +1,7 @@
 // RUN: rm -rf %t
 // RUN: %clang_cc1 -ffreestanding -fsyntax-only -std=c++11 %s
 // RUN: %clang_cc1 -ffreestanding -fsyntax-only -std=c++11 -fmodules -fmodules-cache-path=%t %s
+// RUN: %clang_cc1 -ffreestanding -fsyntax-only -std=c++11 -fmodules -fmodules-cache-path=%t -fmodules-local-submodule-visibility %s
 
 // This test fails on systems with older OS X 10.9 SDK headers, see PR18322.
 
diff --git a/test/Headers/float.c b/test/Headers/float.c
new file mode 100644
index 0000000000000..46e9cc3fd9b03
--- /dev/null
+++ b/test/Headers/float.c
@@ -0,0 +1,219 @@
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c89 -ffreestanding %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c99 -ffreestanding %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c11 -ffreestanding %s
+// expected-no-diagnostics
+
+/* Basic floating point conformance checks against:
+    - N1570 draft of C11 Std.
+    - N1256 draft of C99 Std.
+    - http://port70.net/~nsz/c/c89/c89-draft.html draft of C89/C90 Std.
+*/
+/*
+    C11,    5.2.4.2.2p11,   pp. 30
+    C99,    5.2.4.2.2p9,    pp. 25
+    C89,    2.2.4.2 
+*/
+#include <float.h>
+
+#ifndef FLT_RADIX
+    #error "Mandatory macro FLT_RADIX is missing."
+#elif   FLT_RADIX < 2
+    #error "Mandatory macro FLT_RADIX is invalid."
+#endif
+
+
+#ifndef FLT_MANT_DIG
+    #error "Mandatory macro FLT_MANT_DIG is missing."
+#elif   FLT_MANT_DIG < 2
+    #error "Mandatory macro FLT_MANT_DIG is invalid."
+#endif
+#ifndef DBL_MANT_DIG
+    #error "Mandatory macro DBL_MANT_DIG is missing."
+#elif   DBL_MANT_DIG < 2
+    #error "Mandatory macro DBL_MANT_DIG is invalid."
+#endif
+#ifndef LDBL_MANT_DIG
+    #error "Mandatory macro LDBL_MANT_DIG is missing."
+#elif   LDBL_MANT_DIG < 2
+    #error "Mandatory macro LDBL_MANT_DIG is invalid."
+#endif
+#if ((FLT_MANT_DIG > DBL_MANT_DIG) || (DBL_MANT_DIG > LDBL_MANT_DIG))
+    #error "Mandatory macros {FLT,DBL,LDBL}_MANT_DIG are invalid."
+#endif
+
+
+#if __STDC_VERSION__ >= 201112L || !defined(__STRICT_ANSI__)
+    #ifndef FLT_DECIMAL_DIG
+        #error "Mandatory macro FLT_DECIMAL_DIG is missing."
+    #elif   FLT_DECIMAL_DIG < 6
+        #error "Mandatory macro FLT_DECIMAL_DIG is invalid."
+    #endif
+    #ifndef DBL_DECIMAL_DIG
+        #error "Mandatory macro DBL_DECIMAL_DIG is missing."
+    #elif   DBL_DECIMAL_DIG < 10
+        #error "Mandatory macro DBL_DECIMAL_DIG is invalid."
+    #endif
+    #ifndef LDBL_DECIMAL_DIG
+        #error "Mandatory macro LDBL_DECIMAL_DIG is missing."
+    #elif   LDBL_DECIMAL_DIG < 10
+        #error "Mandatory macro LDBL_DECIMAL_DIG is invalid."
+    #endif
+    #if ((FLT_DECIMAL_DIG > DBL_DECIMAL_DIG) || (DBL_DECIMAL_DIG > LDBL_DECIMAL_DIG))
+        #error "Mandatory macros {FLT,DBL,LDBL}_DECIMAL_DIG are invalid."
+    #endif
+#else
+    #ifdef FLT_DECIMAL_DIG
+        #error "Macro FLT_DECIMAL_DIG should not be defined."
+    #endif
+    #ifdef DBL_DECIMAL_DIG
+        #error "Macro DBL_DECIMAL_DIG should not be defined."
+    #endif
+    #ifdef LDBL_DECIMAL_DIG
+        #error "Macro LDBL_DECIMAL_DIG should not be defined."
+    #endif
+#endif
+
+
+#if __STDC_VERSION__ >= 199901L || !defined(__STRICT_ANSI__)
+    #ifndef DECIMAL_DIG
+        #error "Mandatory macro DECIMAL_DIG is missing."
+    #elif   DECIMAL_DIG < 10
+        #error "Mandatory macro DECIMAL_DIG is invalid."
+    #endif
+#else
+    #ifdef DECIMAL_DIG
+        #error "Macro DECIMAL_DIG should not be defined."
+    #endif
+#endif
+
+
+#ifndef FLT_DIG
+    #error "Mandatory macro FLT_DIG is missing."
+#elif   FLT_DIG < 6
+    #error "Mandatory macro FLT_DIG is invalid."
+#endif
+#ifndef DBL_DIG
+    #error "Mandatory macro DBL_DIG is missing."
+#elif   DBL_DIG < 10
+    #error "Mandatory macro DBL_DIG is invalid."
+#endif
+#ifndef LDBL_DIG
+    #error "Mandatory macro LDBL_DIG is missing."
+#elif   LDBL_DIG < 10
+    #error "Mandatory macro LDBL_DIG is invalid."
+#endif
+#if ((FLT_DIG > DBL_DIG) || (DBL_DIG > LDBL_DIG))
+    #error "Mandatory macros {FLT,DBL,LDBL}_DIG, are invalid."
+#endif
+
+
+#ifndef FLT_MIN_EXP
+    #error "Mandatory macro FLT_MIN_EXP is missing."
+#elif   FLT_MIN_EXP > -1
+    #error "Mandatory macro FLT_MIN_EXP is invalid."
+#endif
+#ifndef DBL_MIN_EXP
+    #error "Mandatory macro DBL_MIN_EXP is missing."
+#elif   DBL_MIN_EXP > -1
+    #error "Mandatory macro DBL_MIN_EXP is invalid."
+#endif
+#ifndef LDBL_MIN_EXP
+    #error "Mandatory macro LDBL_MIN_EXP is missing."
+#elif   LDBL_MIN_EXP > -1
+    #error "Mandatory macro LDBL_MIN_EXP is invalid."
+#endif
+
+
+#ifndef FLT_MIN_10_EXP
+    #error "Mandatory macro FLT_MIN_10_EXP is missing."
+#elif   FLT_MIN_10_EXP > -37
+    #error "Mandatory macro FLT_MIN_10_EXP is invalid."
+#endif
+#ifndef DBL_MIN_10_EXP
+    #error "Mandatory macro DBL_MIN_10_EXP is missing."
+#elif   DBL_MIN_10_EXP > -37
+    #error "Mandatory macro DBL_MIN_10_EXP is invalid."
+#endif
+#ifndef LDBL_MIN_10_EXP
+    #error "Mandatory macro LDBL_MIN_10_EXP is missing."
+#elif   LDBL_MIN_10_EXP > -37
+    #error "Mandatory macro LDBL_MIN_10_EXP is invalid."
+#endif
+
+
+#ifndef FLT_MAX_EXP
+    #error "Mandatory macro FLT_MAX_EXP is missing."
+#elif   FLT_MAX_EXP < 1
+    #error "Mandatory macro FLT_MAX_EXP is invalid."
+#endif
+#ifndef DBL_MAX_EXP
+    #error "Mandatory macro DBL_MAX_EXP is missing."
+#elif   DBL_MAX_EXP < 1
+    #error "Mandatory macro DBL_MAX_EXP is invalid."
+#endif
+#ifndef LDBL_MAX_EXP
+    #error "Mandatory macro LDBL_MAX_EXP is missing."
+#elif   LDBL_MAX_EXP < 1
+    #error "Mandatory macro LDBL_MAX_EXP is invalid."
+#endif
+#if ((FLT_MAX_EXP > DBL_MAX_EXP) || (DBL_MAX_EXP > LDBL_MAX_EXP))
+    #error "Mandatory macros {FLT,DBL,LDBL}_MAX_EXP are invalid."
+#endif
+
+
+#ifndef FLT_MAX_10_EXP
+    #error "Mandatory macro FLT_MAX_10_EXP is missing."
+#elif   FLT_MAX_10_EXP < 37
+    #error "Mandatory macro FLT_MAX_10_EXP is invalid."
+#endif
+#ifndef DBL_MAX_10_EXP
+    #error "Mandatory macro DBL_MAX_10_EXP is missing."
+#elif   DBL_MAX_10_EXP < 37
+    #error "Mandatory macro DBL_MAX_10_EXP is invalid."
+#endif
+#ifndef LDBL_MAX_10_EXP
+    #error "Mandatory macro LDBL_MAX_10_EXP is missing."
+#elif   LDBL_MAX_10_EXP < 37
+    #error "Mandatory macro LDBL_MAX_10_EXP is invalid."
+#endif
+#if ((FLT_MAX_10_EXP > DBL_MAX_10_EXP) || (DBL_MAX_10_EXP > LDBL_MAX_10_EXP))
+    #error "Mandatory macros {FLT,DBL,LDBL}_MAX_10_EXP are invalid."
+#endif
+
+
+/* Internal consistency checks */
+_Static_assert(FLT_RADIX == __FLT_RADIX__, "");
+
+_Static_assert(FLT_MANT_DIG == __FLT_MANT_DIG__, "");
+_Static_assert(DBL_MANT_DIG == __DBL_MANT_DIG__, "");
+_Static_assert(LDBL_MANT_DIG == __LDBL_MANT_DIG__, "");
+
+#if __STDC_VERSION__ >= 201112L || !defined(__STRICT_ANSI__)
+_Static_assert(FLT_DECIMAL_DIG == __FLT_DECIMAL_DIG__, "");
+_Static_assert(DBL_DECIMAL_DIG == __DBL_DECIMAL_DIG__, "");
+_Static_assert(LDBL_DECIMAL_DIG == __LDBL_DECIMAL_DIG__, "");
+#endif
+
+#if __STDC_VERSION__ >= 199901L || !defined(__STRICT_ANSI__)
+_Static_assert(DECIMAL_DIG == __DECIMAL_DIG__, "");
+#endif
+
+_Static_assert(FLT_DIG == __FLT_DIG__, "");
+_Static_assert(DBL_DIG == __DBL_DIG__, "");
+_Static_assert(LDBL_DIG == __LDBL_DIG__, "");
+
+_Static_assert(FLT_MIN_EXP == __FLT_MIN_EXP__, "");
+_Static_assert(DBL_MIN_EXP == __DBL_MIN_EXP__, "");
+_Static_assert(LDBL_MIN_EXP == __LDBL_MIN_EXP__, "");
+
+_Static_assert(FLT_MIN_10_EXP == __FLT_MIN_10_EXP__, "");
+_Static_assert(DBL_MIN_10_EXP == __DBL_MIN_10_EXP__, "");
+_Static_assert(LDBL_MIN_10_EXP == __LDBL_MIN_10_EXP__, "");
+
+_Static_assert(FLT_MAX_EXP == __FLT_MAX_EXP__, "");
+_Static_assert(DBL_MAX_EXP == __DBL_MAX_EXP__, "");
+_Static_assert(LDBL_MAX_EXP == __LDBL_MAX_EXP__, "");
+
+_Static_assert(FLT_MAX_10_EXP == __FLT_MAX_10_EXP__, "");
+_Static_assert(DBL_MAX_10_EXP == __DBL_MAX_10_EXP__, "");
+_Static_assert(LDBL_MAX_10_EXP == __LDBL_MAX_10_EXP__, "");
diff --git a/test/Headers/ms-intrin.cpp b/test/Headers/ms-intrin.cpp
index 9356d21a23686..c697428553818 100644
--- a/test/Headers/ms-intrin.cpp
+++ b/test/Headers/ms-intrin.cpp
@@ -20,11 +20,11 @@
 
 // REQUIRES: x86-registered-target
 
-// Intrin.h needs size_t, but -ffreestanding prevents us from getting it from
+// intrin.h needs size_t, but -ffreestanding prevents us from getting it from
 // stddef.h.  Work around it with this typedef.
 typedef __SIZE_TYPE__ size_t;
 
-#include <Intrin.h>
+#include <intrin.h>
 
 // Use some C++ to make sure we closed the extern "C" brackets.
 template <typename T>
diff --git a/test/Headers/opencl-c-header.cl b/test/Headers/opencl-c-header.cl
new file mode 100644
index 0000000000000..37239357bc090
--- /dev/null
+++ b/test/Headers/opencl-c-header.cl
@@ -0,0 +1,90 @@
+// RUN: %clang_cc1 -triple spir-unknown-unknown -internal-isystem ../../lib/Headers -include opencl-c.h -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple spir-unknown-unknown -internal-isystem ../../lib/Headers -include opencl-c.h -emit-llvm -o - %s -cl-std=CL1.1| FileCheck %s
+// RUN: %clang_cc1 -triple spir-unknown-unknown -internal-isystem ../../lib/Headers -include opencl-c.h -emit-llvm -o - %s -cl-std=CL1.2| FileCheck %s
+// RUN: %clang_cc1 -triple spir-unknown-unknown -internal-isystem ../../lib/Headers -include opencl-c.h -fblocks -emit-llvm -o - %s -cl-std=CL2.0| FileCheck --check-prefix=CHECK20 %s
+// RUN: %clang_cc1 -triple spir64-unknown-unknown -internal-isystem ../../lib/Headers -include opencl-c.h -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple spir64-unknown-unknown -internal-isystem ../../lib/Headers -include opencl-c.h -emit-llvm -o - %s -cl-std=CL1.1| FileCheck %s
+// RUN: %clang_cc1 -triple spir64-unknown-unknown -internal-isystem ../../lib/Headers -include opencl-c.h -emit-llvm -o - %s -cl-std=CL1.2| FileCheck %s
+// RUN: %clang_cc1 -triple spir64-unknown-unknown -internal-isystem ../../lib/Headers -include opencl-c.h -fblocks -emit-llvm -o - %s -cl-std=CL2.0| FileCheck --check-prefix=CHECK20 %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-amdhsa -internal-isystem ../../lib/Headers -include opencl-c.h -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-amdhsa -internal-isystem ../../lib/Headers -include opencl-c.h -emit-llvm -o - %s -cl-std=CL1.1| FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-amdhsa -internal-isystem ../../lib/Headers -include opencl-c.h -emit-llvm -o - %s -cl-std=CL1.2| FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-amdhsa -internal-isystem ../../lib/Headers -include opencl-c.h -fblocks -emit-llvm -o - %s -cl-std=CL2.0| FileCheck --check-prefix=CHECK20 %s
+// RUN: %clang_cc1 -triple ppc64-unknown-unknown -internal-isystem ../../lib/Headers -include opencl-c.h -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple ppc64-unknown-unknown -internal-isystem ../../lib/Headers -include opencl-c.h -emit-llvm -o - %s -cl-std=CL1.1| FileCheck %s
+// RUN: %clang_cc1 -triple ppc64-unknown-unknown -internal-isystem ../../lib/Headers -include opencl-c.h -emit-llvm -o - %s -cl-std=CL1.2| FileCheck %s
+// RUN: %clang_cc1 -triple ppc64-unknown-unknown -internal-isystem ../../lib/Headers -include opencl-c.h -fblocks -emit-llvm -o - %s -cl-std=CL2.0| FileCheck --check-prefix=CHECK20 %s
+// RUN: %clang_cc1 -triple powerpc-unknown-unknown -internal-isystem ../../lib/Headers -include opencl-c.h -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple powerpc-unknown-unknown -internal-isystem ../../lib/Headers -include opencl-c.h -emit-llvm -o - %s -cl-std=CL1.1| FileCheck %s
+// RUN: %clang_cc1 -triple powerpc-unknown-unknown -internal-isystem ../../lib/Headers -include opencl-c.h -emit-llvm -o - %s -cl-std=CL1.2| FileCheck %s
+// RUN: %clang_cc1 -triple powerpc-unknown-unknown -internal-isystem ../../lib/Headers -include opencl-c.h -fblocks -emit-llvm -o - %s -cl-std=CL2.0| FileCheck --check-prefix=CHECK20 %s
+// RUN: %clang_cc1 -triple nvptx-unknown-unknown -internal-isystem ../../lib/Headers -include opencl-c.h -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple nvptx-unknown-unknown -internal-isystem ../../lib/Headers -include opencl-c.h -emit-llvm -o - %s -cl-std=CL1.1| FileCheck %s
+// RUN: %clang_cc1 -triple nvptx-unknown-unknown -internal-isystem ../../lib/Headers -include opencl-c.h -emit-llvm -o - %s -cl-std=CL1.2| FileCheck %s
+// RUN: %clang_cc1 -triple nvptx-unknown-unknown -internal-isystem ../../lib/Headers -include opencl-c.h -fblocks -emit-llvm -o - %s -cl-std=CL2.0| FileCheck --check-prefix=CHECK20 %s
+// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -internal-isystem ../../lib/Headers -include opencl-c.h -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -internal-isystem ../../lib/Headers -include opencl-c.h -emit-llvm -o - %s -cl-std=CL1.1| FileCheck %s
+// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -internal-isystem ../../lib/Headers -include opencl-c.h -emit-llvm -o - %s -cl-std=CL1.2| FileCheck %s
+// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -internal-isystem ../../lib/Headers -include opencl-c.h -fblocks -emit-llvm -o - %s -cl-std=CL2.0| FileCheck --check-prefix=CHECK20 %s
+
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -finclude-default-header -emit-llvm -o - %s | FileCheck %s
+// CHECK: _Z16convert_char_rtec
+// CHECK-NOT: _Z3ctzc
+// CHECK20: _Z3ctzc
+// CHECK20-NOT: _Z16convert_char_rtec
+// CHECK-MOD: Reading modules
+
+// Test including the default header as a module.
+// The module should be compiled only once and loaded from cache afterwards.
+// Change the directory mode to read only to make sure no new modules are created.
+// Check time report to make sure module is used.
+
+// ===
+// Clear current directory.
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+
+// ===
+// Compile for OpenCL 1.0 for the first time. A module should be generated.
+// RUN: %clang_cc1 -triple spir-unknown-unknown -emit-llvm -o - -finclude-default-header -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -fdisable-module-hash -ftime-report %s 2>&1 | FileCheck --check-prefix=CHECK --check-prefix=CHECK-MOD %s
+// RUN: chmod u-w %t/opencl_c.pcm
+
+// ===
+// Compile for OpenCL 1.0 for the second time. The module should not be re-created.
+// RUN: %clang_cc1 -triple spir-unknown-unknown -emit-llvm -o - -finclude-default-header -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -fdisable-module-hash -ftime-report %s 2>&1 | FileCheck --check-prefix=CHECK --check-prefix=CHECK-MOD %s
+// RUN: chmod u+w %t/opencl_c.pcm
+// RUN: mv %t/opencl_c.pcm %t/1_0.pcm
+
+// ===
+// Compile for OpenCL 2.0 for the first time. The module should change.
+// RUN: %clang_cc1 -triple spir-unknown-unknown -emit-llvm -o - -cl-std=CL2.0 -finclude-default-header -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -fdisable-module-hash -ftime-report %s 2>&1 | FileCheck --check-prefix=CHECK20 --check-prefix=CHECK-MOD %s
+// RUN: not diff %t/1_0.pcm %t/opencl_c.pcm
+// RUN: chmod u-w %t/opencl_c.pcm
+
+// ===
+// Compile for OpenCL 2.0 for the second time. The module should not change.
+// RUN: %clang_cc1 -triple spir-unknown-unknown -emit-llvm -o - -cl-std=CL2.0 -finclude-default-header -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -fdisable-module-hash -ftime-report %s 2>&1 | FileCheck --check-prefix=CHECK20 --check-prefix=CHECK-MOD %s
+
+// Check cached module works for different OpenCL versions.
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: %clang_cc1 -triple spir-unknown-unknown -emit-llvm -o - -finclude-default-header -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -ftime-report %s 2>&1 | FileCheck --check-prefix=CHECK --check-prefix=CHECK-MOD %s
+// RUN: %clang_cc1 -triple spir-unknown-unknown -emit-llvm -o - -cl-std=CL2.0 -finclude-default-header -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -ftime-report %s 2>&1 | FileCheck --check-prefix=CHECK20 --check-prefix=CHECK-MOD %s
+// RUN: %clang_cc1 -triple amdgcn--amdhsa -emit-llvm -o - -cl-std=CL2.0  -finclude-default-header -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -ftime-report %s 2>&1 | FileCheck --check-prefix=CHECK20 --check-prefix=CHECK-MOD %s
+// RUN: chmod u-w %t 
+// RUN: %clang_cc1 -triple spir-unknown-unknown -emit-llvm -o - -finclude-default-header -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -ftime-report %s 2>&1 | FileCheck --check-prefix=CHECK --check-prefix=CHECK-MOD %s
+// RUN: %clang_cc1 -triple spir-unknown-unknown -emit-llvm -o - -cl-std=CL2.0 -finclude-default-header -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -ftime-report %s 2>&1 | FileCheck --check-prefix=CHECK20 --check-prefix=CHECK-MOD %s
+// RUN: %clang_cc1 -triple amdgcn--amdhsa -emit-llvm -o - -cl-std=CL2.0 -finclude-default-header -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -ftime-report %s 2>&1 | FileCheck --check-prefix=CHECK20 --check-prefix=CHECK-MOD %s
+// RUN: chmod u+w %t
+
+char f(char x) {
+#if __OPENCL_C_VERSION__ != CL_VERSION_2_0
+  return convert_char_rte(x);
+#ifdef NO_HEADER
+  //expected-warning@-2{{implicit declaration of function 'convert_char_rte' is invalid in C99}}
+#endif //NO_HEADER
+
+#else //__OPENCL_C_VERSION__
+  return ctz(x);
+#endif //__OPENCL_C_VERSION__
+}
diff --git a/test/Headers/x86intrin-2.c b/test/Headers/x86intrin-2.c
index f98fdbd13a8a4..9be8545a2cd62 100644
--- a/test/Headers/x86intrin-2.c
+++ b/test/Headers/x86intrin-2.c
@@ -72,7 +72,7 @@ __mmask8 __attribute__((__target__("avx512vl"))) mm_cmpeq_epi32_mask_wrap(__m128
   return _mm_cmpeq_epi32_mask(a, b);
 }
 
-__v64qi __attribute__((__target__("avx512bw"))) mm512_setzero_qi_wrap(void) {
+__m512i __attribute__((__target__("avx512bw"))) mm512_setzero_qi_wrap(void) {
   return _mm512_setzero_qi();
 }
 
diff --git a/test/Headers/xmmintrin.c b/test/Headers/xmmintrin.c
index 39743c96b783c..c617504b85da5 100644
--- a/test/Headers/xmmintrin.c
+++ b/test/Headers/xmmintrin.c
@@ -7,6 +7,9 @@
 // REQUIRES: x86-registered-target
 #include <xmmintrin.h>
 
+// CHECK: @c = common global i8 0, align 16
+_MM_ALIGN16 char c;
+
 // Make sure the last step of _mm_cvtps_pi16 converts <4 x i32> to <4 x i16> by
 // checking that clang emits PACKSSDW instead of PACKSSWB.
 
@@ -23,3 +26,7 @@ __m128 test_xmmintrin_provides_emmintrin(__m128d __a, __m128d __b) {
   return _mm_add_sd(__a, __b);
 }
 
+#if __STDC_HOSTED__
+// Make sure stdlib.h symbols are accessible.
+void *p = NULL;
+#endif
diff --git a/test/Index/Core/Inputs/module/ModA.h b/test/Index/Core/Inputs/module/ModA.h
new file mode 100644
index 0000000000000..081d86cc4516c
--- /dev/null
+++ b/test/Index/Core/Inputs/module/ModA.h
@@ -0,0 +1,2 @@
+
+void ModA_func(void);
diff --git a/test/Index/Core/Inputs/module/module.modulemap b/test/Index/Core/Inputs/module/module.modulemap
new file mode 100644
index 0000000000000..a132562eafd8b
--- /dev/null
+++ b/test/Index/Core/Inputs/module/module.modulemap
@@ -0,0 +1 @@
+module ModA { header "ModA.h" export * }
diff --git a/test/Index/Core/index-source.cpp b/test/Index/Core/index-source.cpp
new file mode 100644
index 0000000000000..75c6396da553c
--- /dev/null
+++ b/test/Index/Core/index-source.cpp
@@ -0,0 +1,9 @@
+// RUN: c-index-test core -print-source-symbols -- %s -target x86_64-apple-macosx10.7 | FileCheck %s
+
+template <typename TemplArg>
+class TemplCls {
+// CHECK: [[@LINE-1]]:7 | class/C++ | TemplCls | c:@ST>1#T@TemplCls | <no-cgname> | Def | rel: 0
+  TemplCls(int x);
+  // CHECK: [[@LINE-1]]:3 | constructor/C++ | TemplCls | c:@ST>1#T@TemplCls@F@TemplCls#I# | <no-cgname> | Decl,RelChild | rel: 1
+  // CHECK-NEXT: RelChild | TemplCls | c:@ST>1#T@TemplCls
+};
diff --git a/test/Index/Core/index-source.m b/test/Index/Core/index-source.m
new file mode 100644
index 0000000000000..2e9c01c02d481
--- /dev/null
+++ b/test/Index/Core/index-source.m
@@ -0,0 +1,107 @@
+// RUN: c-index-test core -print-source-symbols -- %s -target x86_64-apple-macosx10.7 | FileCheck %s
+
+@interface Base
+// CHECK: [[@LINE-1]]:12 | class/ObjC | Base | c:objc(cs)Base | _OBJC_CLASS_$_Base | Decl | rel: 0
+-(void)meth;
+// CHECK: [[@LINE-1]]:1 | instance-method/ObjC | meth | c:objc(cs)Base(im)meth | -[Base meth] | Decl,Dyn,RelChild | rel: 1
+// CHECK-NEXT: RelChild | Base | c:objc(cs)Base
+@end
+
+void foo();
+// CHECK: [[@LINE+1]]:6 | function/C | goo | c:@F@goo | _goo | Def | rel: 0
+void goo(Base *b) {
+  // CHECK: [[@LINE+2]]:3 | function/C | foo | c:@F@foo | _foo | Ref,Call,RelCall | rel: 1
+  // CHECK-NEXT: RelCall | goo | c:@F@goo
+  foo();
+  // CHECK: [[@LINE+3]]:6 | instance-method/ObjC | meth | c:objc(cs)Base(im)meth | -[Base meth] | Ref,Call,Dyn,RelRec,RelCall | rel: 2
+  // CHECK-NEXT: RelCall | goo | c:@F@goo
+  // CHECK-NEXT: RelRec | Base | c:objc(cs)Base
+  [b meth];
+}
+
+// CHECK: [[@LINE+1]]:11 | protocol/ObjC | Prot1 | c:objc(pl)Prot1 | <no-cgname> | Decl | rel: 0
+@protocol Prot1
+@end
+
+// CHECK: [[@LINE+3]]:11 | protocol/ObjC | Prot2 | c:objc(pl)Prot2 | <no-cgname> | Decl | rel: 0
+// CHECK: [[@LINE+2]]:17 | protocol/ObjC | Prot1 | c:objc(pl)Prot1 | <no-cgname> | Ref,RelBase | rel: 1
+// CHECK-NEXT: RelBase | Prot2 | c:objc(pl)Prot2
+@protocol Prot2<Prot1>
+@end
+
+// CHECK: [[@LINE+7]]:12 | class/ObjC | Sub | c:objc(cs)Sub | _OBJC_CLASS_$_Sub | Decl | rel: 0
+// CHECK: [[@LINE+6]]:18 | class/ObjC | Base | c:objc(cs)Base | _OBJC_CLASS_$_Base | Ref,RelBase | rel: 1
+// CHECK-NEXT: RelBase | Sub | c:objc(cs)Sub
+// CHECK: [[@LINE+4]]:23 | protocol/ObjC | Prot2 | c:objc(pl)Prot2 | <no-cgname> | Ref,RelBase | rel: 1
+// CHECK-NEXT: RelBase | Sub | c:objc(cs)Sub
+// CHECK: [[@LINE+2]]:30 | protocol/ObjC | Prot1 | c:objc(pl)Prot1 | <no-cgname> | Ref,RelBase | rel: 1
+// CHECK-NEXT: RelBase | Sub | c:objc(cs)Sub
+@interface Sub : Base<Prot2, Prot1>
+@end
+
+@interface NSArray<ObjectType> : Base
+// CHECK-NOT: ObjectType
+-(ObjectType)getit;
+@end
+
+// CHECK: [[@LINE+1]]:6 | function/C | over_func | c:@F@over_func#I# | __Z9over_funci | Decl | rel: 0
+void over_func(int x) __attribute__((overloadable));
+// CHECK: [[@LINE+1]]:6 | function/C | over_func | c:@F@over_func#f# | __Z9over_funcf | Decl | rel: 0
+void over_func(float x) __attribute__((overloadable));
+
+// CHECK: [[@LINE+1]]:6 | enum/C | MyEnum | c:@E@MyEnum | <no-cgname> | Def | rel: 0
+enum MyEnum {
+  // CHECK: [[@LINE+2]]:3 | enumerator/C | EnumeratorInNamed | c:@E@MyEnum@EnumeratorInNamed | <no-cgname> | Def,RelChild | rel: 1
+  // CHECK-NEXT: RelChild | MyEnum | c:@E@MyEnum
+  EnumeratorInNamed
+};
+
+// CHECK: [[@LINE+1]]:1 | enum/C | <no-name> | c:@Ea@One | <no-cgname> | Def | rel: 0
+enum {
+  // CHECK: [[@LINE+2]]:3 | enumerator/C | One | c:@Ea@One@One | <no-cgname> | Def,RelChild | rel: 1
+  // CHECK-NEXT: RelChild | <no-name> | c:@Ea@One
+  One,
+  // CHECK: [[@LINE+2]]:3 | enumerator/C | Two | c:@Ea@One@Two | <no-cgname> | Def,RelChild | rel: 1
+  // CHECK-NEXT: RelChild | <no-name> | c:@Ea@One
+  Two,
+};
+
+// CHECK: [[@LINE+1]]:13 | type-alias/C | jmp_buf | c:index-source.m@T@jmp_buf | <no-cgname> | Def | rel: 0
+typedef int jmp_buf[(18)];
+// CHECK: [[@LINE+2]]:12 | function/C | setjmp | c:@F@setjmp | _setjmp | Decl | rel: 0
+// CHECK: [[@LINE+1]]:19 | type-alias/C | jmp_buf | c:index-source.m@T@jmp_buf | <no-cgname> | Ref | rel: 0
+extern int setjmp(jmp_buf);
+
+@class I1;
+@interface I1
+// CHECK: [[@LINE+1]]:1 | instance-method/ObjC | meth | c:objc(cs)I1(im)meth | -[I1 meth] | Decl,Dyn,RelChild | rel: 1
+-(void)meth;
+@end
+
+@interface I2
+@property (readwrite) id prop;
+@end
+
+// CHECK: [[@LINE+2]]:17 | field/ObjC | _prop | c:objc(cs)I2@_prop | <no-cgname> | Def,Impl,RelChild | rel: 1
+// CHECK-NEXT: RelChild | I2 | c:objc(cs)I2
+@implementation I2
+// CHECK: [[@LINE+5]]:13 | instance-property/ObjC | prop | c:objc(cs)I2(py)prop | <no-cgname> | Ref | rel: 0
+// CHECK: [[@LINE+4]]:13 | instance-method/ObjC | prop | c:objc(cs)I2(im)prop | -[I2 prop] | Def,RelChild | rel: 1
+// CHECK-NEXT: RelChild | I2 | c:objc(cs)I2
+// CHECK: [[@LINE+2]]:13 | instance-method/ObjC | setProp: | c:objc(cs)I2(im)setProp: | -[I2 setProp:] | Def,RelChild | rel: 1
+// CHECK-NEXT: RelChild | I2 | c:objc(cs)I2
+@synthesize prop = _prop;
+@end
+
+@interface I3
+@property (readwrite) id prop;
+-(id)prop;
+-(void)setProp:(id)p;
+@end
+
+@implementation I3
+// CHECK: [[@LINE+3]]:13 | instance-property/ObjC | prop | c:objc(cs)I3(py)prop | <no-cgname> | Ref | rel: 0
+// CHECK: [[@LINE+2]]:13 | instance-method/ObjC | prop | c:objc(cs)I3(im)prop | -[I3 prop] | Def,RelChild | rel: 1
+// CHECK: [[@LINE+1]]:13 | instance-method/ObjC | setProp: | c:objc(cs)I3(im)setProp: | -[I3 setProp:] | Def,RelChild | rel: 1
+@synthesize prop = _prop;
+@end
diff --git a/test/Index/Core/index-source.mm b/test/Index/Core/index-source.mm
new file mode 100644
index 0000000000000..049a0bdaf6474
--- /dev/null
+++ b/test/Index/Core/index-source.mm
@@ -0,0 +1,11 @@
+// RUN: c-index-test core -print-source-symbols -- %s -target x86_64-apple-macosx10.7 | FileCheck %s
+
+@interface MyCls
+@end
+
+@protocol P1,P2;
+
+// CHECK: [[@LINE+1]]:6 | function/C | foo | c:@F@foo#*$objc(cs)MyCls# | __Z3fooP5MyCls | Decl | rel: 0
+void foo(MyCls *o);
+// CHECK: [[@LINE+1]]:6 | function/C | foo | c:@F@foo#*Qoobjc(pl)P1objc(pl)P2# | __Z3fooPU15objcproto2P12P211objc_object | Decl | rel: 0
+void foo(id<P2, P1> o);
diff --git a/test/Index/Core/index-subkinds.m b/test/Index/Core/index-subkinds.m
new file mode 100644
index 0000000000000..e668d84ffa89c
--- /dev/null
+++ b/test/Index/Core/index-subkinds.m
@@ -0,0 +1,48 @@
+// RUN: c-index-test core -print-source-symbols -- %s -target x86_64-apple-macosx10.7 | FileCheck %s
+
+// CHECK: [[@LINE+1]]:12 | class/ObjC | XCTestCase | c:objc(cs)XCTestCase | _OBJC_CLASS_$_XCTestCase | Decl | rel: 0
+@interface XCTestCase
+@end
+
+// CHECK: [[@LINE+1]]:12 | class(test)/ObjC | MyTestCase | c:objc(cs)MyTestCase | _OBJC_CLASS_$_MyTestCase | Decl | rel: 0
+@interface MyTestCase : XCTestCase
+@end
+// CHECK: [[@LINE+1]]:17 | class(test)/ObjC | MyTestCase | c:objc(cs)MyTestCase | <no-cgname> | Def | rel: 0
+@implementation MyTestCase
+// CHECK: [[@LINE+1]]:1 | instance-method(test)/ObjC | testMe | c:objc(cs)MyTestCase(im)testMe | -[MyTestCase testMe] | Def,Dyn,RelChild | rel: 1
+-(void)testMe {}
+// CHECK: [[@LINE+1]]:1 | instance-method/ObjC | testResult | c:objc(cs)MyTestCase(im)testResult | -[MyTestCase testResult] | Def,Dyn,RelChild | rel: 1
+-(id)testResult { return 0; }
+// CHECK: [[@LINE+1]]:1 | instance-method/ObjC | testWithInt: | c:objc(cs)MyTestCase(im)testWithInt: | -[MyTestCase testWithInt:] | Def,Dyn,RelChild | rel: 1
+-(void)testWithInt:(int)i {}
+@end
+
+// CHECK: [[@LINE+1]]:12 | class(test)/ObjC | SubTestCase | c:objc(cs)SubTestCase | _OBJC_CLASS_$_SubTestCase | Decl | rel: 0
+@interface SubTestCase : MyTestCase
+@end
+// CHECK: [[@LINE+1]]:17 | class(test)/ObjC | SubTestCase | c:objc(cs)SubTestCase | <no-cgname> | Def | rel: 0
+@implementation SubTestCase
+// CHECK: [[@LINE+1]]:1 | instance-method(test)/ObjC | testIt2 | c:objc(cs)SubTestCase(im)testIt2 | -[SubTestCase testIt2] | Def,Dyn,RelChild | rel: 1
+-(void)testIt2 {}
+@end
+
+// CHECK: [[@LINE+1]]:12 | extension/ObjC | cat | c:objc(cy)MyTestCase@cat | <no-cgname> | Decl | rel: 0
+@interface MyTestCase(cat)
+@end
+// CHECK: [[@LINE+1]]:17 | extension/ObjC | MyTestCase | c:objc(cy)MyTestCase@cat | <no-cgname> | Def | rel: 0
+@implementation MyTestCase(cat)
+// CHECK: [[@LINE+1]]:1 | instance-method(test)/ObjC | testInCat | c:objc(cs)MyTestCase(im)testInCat | -[MyTestCase(cat) testInCat] | Def,Dyn,RelChild | rel: 1
+- (void)testInCat {}
+@end
+
+
+@class NSButton;
+@interface IBCls
+// CHECK: [[@LINE+2]]:34 | instance-method/ObjC | prop | c:objc(cs)IBCls(im)prop | -[IBCls prop] | Decl,Dyn,RelChild | rel: 1
+// CHECK: [[@LINE+1]]:34 | instance-property(IB)/ObjC | prop | c:objc(cs)IBCls(py)prop | <no-cgname> | Decl,RelChild | rel: 1
+@property (readonly) IBOutlet id prop;
+// CHECK: [[@LINE+1]]:54 | instance-property(IB,IBColl)/ObjC | propColl | c:objc(cs)IBCls(py)propColl | <no-cgname> | Decl,RelChild | rel: 1
+@property (readonly) IBOutletCollection(NSButton) id propColl;
+// CHECK: [[@LINE+1]]:1 | instance-method(IB)/ObjC | doIt | c:objc(cs)IBCls(im)doIt | -[IBCls doIt] | Decl,Dyn,RelChild | rel: 1
+-(IBAction)doIt;
+@end
diff --git a/test/Index/Core/index-with-module.m b/test/Index/Core/index-with-module.m
new file mode 100644
index 0000000000000..03fd5cd5b8d31
--- /dev/null
+++ b/test/Index/Core/index-with-module.m
@@ -0,0 +1,12 @@
+// RUN: rm -rf %t.mcp
+// RUN: c-index-test core -print-source-symbols -- %s -I %S/Inputs/module -fmodules -fmodules-cache-path=%t.mcp | FileCheck %s
+
+// CHECK: [[@LINE+1]]:9 | module/C | ModA | Decl |
+@import ModA;
+// CHECK: [[@LINE+1]]:1 | module/C | ModA | Decl,Impl |
+#include "ModA.h"
+
+void foo() {
+  // CHECK: [[@LINE+1]]:3 | function/C | ModA_func | c:@F@ModA_func | {{.*}} | Ref,Call,RelCall | rel: 1
+  ModA_func();
+}
diff --git a/test/Index/Inputs/module.map b/test/Index/Inputs/module.map
index 4bfc109a8b136..10712accb1c24 100644
--- a/test/Index/Inputs/module.map
+++ b/test/Index/Inputs/module.map
@@ -6,3 +6,17 @@ module ModuleNeedsVFS {
 framework module * { }
 
 module ModuleUndef { header "module-undef.h" }
+
+module PreambleWithImplicitImport {
+  module A {
+    header "preamble-with-implicit-import-A.h"
+  }
+  module B {
+    header "preamble-with-implicit-import-B.h"
+    export *
+  }
+  module C {
+    header "preamble-with-implicit-import-C.h"
+    export *
+  }
+}
diff --git a/test/Index/Inputs/preamble-with-implicit-import-A.h b/test/Index/Inputs/preamble-with-implicit-import-A.h
new file mode 100644
index 0000000000000..c68390159094a
--- /dev/null
+++ b/test/Index/Inputs/preamble-with-implicit-import-A.h
@@ -0,0 +1 @@
+// preamble-with-implicit-import-A
diff --git a/test/Index/Inputs/preamble-with-implicit-import-B.h b/test/Index/Inputs/preamble-with-implicit-import-B.h
new file mode 100644
index 0000000000000..17c138dfb5aa4
--- /dev/null
+++ b/test/Index/Inputs/preamble-with-implicit-import-B.h
@@ -0,0 +1,3 @@
+#pragma once
+#include "preamble-with-implicit-import-C.h" // Circular
+typedef struct { char x; } Typo;
diff --git a/test/Index/Inputs/preamble-with-implicit-import-C.h b/test/Index/Inputs/preamble-with-implicit-import-C.h
new file mode 100644
index 0000000000000..a3fc1d4fea06c
--- /dev/null
+++ b/test/Index/Inputs/preamble-with-implicit-import-C.h
@@ -0,0 +1,2 @@
+#pragma once
+#include "preamble-with-implicit-import-B.h" // Circular
diff --git a/test/Index/Inputs/preamble-with-implicit-import.h b/test/Index/Inputs/preamble-with-implicit-import.h
new file mode 100644
index 0000000000000..1b429678f2135
--- /dev/null
+++ b/test/Index/Inputs/preamble-with-implicit-import.h
@@ -0,0 +1,4 @@
+#include "preamble-with-implicit-import-A.h"
+
+// Typo is defined in B, which is not imported.
+void useTypeFromB(Typo *);
diff --git a/test/Index/annotate-comments-availability-attrs.cpp b/test/Index/annotate-comments-availability-attrs.cpp
index 74a57b9beeb20..f31c4e16ccd63 100644
--- a/test/Index/annotate-comments-availability-attrs.cpp
+++ b/test/Index/annotate-comments-availability-attrs.cpp
@@ -13,12 +13,12 @@
 void attr_availability_1() __attribute__((availability(macosx,obsoleted=10.0,introduced=8.0,deprecated=9.0, message="use availability_test in <foo.h>")))
                            __attribute__((availability(ios,unavailable, message="not for iOS")));
 
-// CHECK: FullCommentAsXML=[<Function file="{{[^"]+}}annotate-comments-availability-attrs.cpp" line="[[@LINE-3]]" column="6"><Name>attr_availability_1</Name><USR>c:@F@attr_availability_1#</USR><Declaration>void attr_availability_1()</Declaration><Abstract><Para> Aaa.</Para></Abstract><Availability distribution="iOS"><DeprecationSummary>not for iOS</DeprecationSummary><Unavailable/></Availability><Availability distribution="OS X"><IntroducedInVersion>8.0</IntroducedInVersion><DeprecatedInVersion>9.0</DeprecatedInVersion><RemovedAfterVersion>10.0</RemovedAfterVersion><DeprecationSummary>use availability_test in &lt;foo.h&gt;</DeprecationSummary></Availability></Function>]
+// CHECK: FullCommentAsXML=[<Function file="{{[^"]+}}annotate-comments-availability-attrs.cpp" line="[[@LINE-3]]" column="6"><Name>attr_availability_1</Name><USR>c:@F@attr_availability_1#</USR><Declaration>void attr_availability_1()</Declaration><Abstract><Para> Aaa.</Para></Abstract><Availability distribution="iOS"><DeprecationSummary>not for iOS</DeprecationSummary><Unavailable/></Availability><Availability distribution="macOS"><IntroducedInVersion>8.0</IntroducedInVersion><DeprecatedInVersion>9.0</DeprecatedInVersion><RemovedAfterVersion>10.0</RemovedAfterVersion><DeprecationSummary>use availability_test in &lt;foo.h&gt;</DeprecationSummary></Availability></Function>]
 
 /// Aaa.
 void attr_availability_2() __attribute__((availability(macosx,obsoleted=10.0.1,introduced=8.0.1,deprecated=9.0.1)));
 
-// CHECK: FullCommentAsXML=[<Function file="{{[^"]+}}annotate-comments-availability-attrs.cpp" line="[[@LINE-2]]" column="6"><Name>attr_availability_2</Name><USR>c:@F@attr_availability_2#</USR><Declaration>void attr_availability_2()</Declaration><Abstract><Para> Aaa.</Para></Abstract><Availability distribution="OS X"><IntroducedInVersion>8.0.1</IntroducedInVersion><DeprecatedInVersion>9.0.1</DeprecatedInVersion><RemovedAfterVersion>10.0.1</RemovedAfterVersion></Availability></Function>]
+// CHECK: FullCommentAsXML=[<Function file="{{[^"]+}}annotate-comments-availability-attrs.cpp" line="[[@LINE-2]]" column="6"><Name>attr_availability_2</Name><USR>c:@F@attr_availability_2#</USR><Declaration>void attr_availability_2()</Declaration><Abstract><Para> Aaa.</Para></Abstract><Availability distribution="macOS"><IntroducedInVersion>8.0.1</IntroducedInVersion><DeprecatedInVersion>9.0.1</DeprecatedInVersion><RemovedAfterVersion>10.0.1</RemovedAfterVersion></Availability></Function>]
 
 /// Aaa.
 void attr_deprecated_1() __attribute__((deprecated));
diff --git a/test/Index/annotate-tokens.c b/test/Index/annotate-tokens.c
index 2f95ca6f458c9..c72e4f725fcce 100644
--- a/test/Index/annotate-tokens.c
+++ b/test/Index/annotate-tokens.c
@@ -80,10 +80,10 @@ void test() {
 // CHECK: Punctuation: "(" [5:3 - 5:4] CStyleCastExpr=
 // CHECK: Keyword: "void" [5:4 - 5:8] CStyleCastExpr=
 // CHECK: Punctuation: ")" [5:8 - 5:9] CStyleCastExpr=
-// CHECK: Keyword: "sizeof" [5:9 - 5:15] UnexposedExpr=
-// CHECK: Punctuation: "(" [5:15 - 5:16] UnexposedExpr=
+// CHECK: Keyword: "sizeof" [5:9 - 5:15] UnaryExpr=
+// CHECK: Punctuation: "(" [5:15 - 5:16] UnaryExpr=
 // CHECK: Identifier: "T" [5:16 - 5:17] TypeRef=T:1:13
-// CHECK: Punctuation: ")" [5:17 - 5:18] UnexposedExpr=
+// CHECK: Punctuation: ")" [5:17 - 5:18] UnaryExpr=
 // CHECK: Punctuation: ";" [5:18 - 5:19] CompoundStmt=
 // CHECK: Keyword: "struct" [7:3 - 7:9] VarDecl=x:7:12 (Definition)
 // CHECK: Identifier: "X" [7:10 - 7:11] TypeRef=struct X:2:8
diff --git a/test/Index/availability.c b/test/Index/availability.c
index cfe0ff5bea81b..b9d2c6f449c1e 100644
--- a/test/Index/availability.c
+++ b/test/Index/availability.c
@@ -14,7 +14,7 @@ enum {
 // RUN: FileCheck -check-prefix=CHECK-1 %s < %t
 // RUN: FileCheck -check-prefix=CHECK-2 %s < %t
 // CHECK-1: (ios, introduced=3.2, deprecated=4.1) 
-// CHECK-2: (macosx, introduced=10.4, deprecated=10.5, obsoleted=10.7)
+// CHECK-2: (macos, introduced=10.4, deprecated=10.5, obsoleted=10.7)
 
 // CHECK-2: EnumConstantDecl=old_enum:6:3 (Definition) (deprecated)
-// CHECK-2: EnumConstantDecl=old_enum_plat:10:3 {{.*}} (macosx, introduced=10.4, deprecated=10.5, obsoleted=10.7)
+// CHECK-2: EnumConstantDecl=old_enum_plat:10:3 {{.*}} (macos, introduced=10.4, deprecated=10.5, obsoleted=10.7)
diff --git a/test/Index/availability.cpp b/test/Index/availability.cpp
index d6f90385f0bc8..d8cd3bfb65414 100644
--- a/test/Index/availability.cpp
+++ b/test/Index/availability.cpp
@@ -10,4 +10,4 @@ struct Foo {
 // CHECK: FunctionDecl=foo:1:6 (unavailable) [type=void ()] [typekind=FunctionProto] [resulttype=void] [resulttypekind=Void] [isPOD=0]
 // CHECK: StructDecl=Foo:3:8 (Definition) [type=Foo] [typekind=Record] [isPOD=1]
 // CHECK: CXXMethod=foo:4:7 (unavailable) [type=int (){{.*}}] [typekind=FunctionProto] [resulttype=int] [resulttypekind=Int] [isPOD=0]
-// CHECK: CXXConstructor=Foo:5:3 (unavailable) [type=void (){{.*}}] [typekind=FunctionProto] [resulttype=void] [resulttypekind=Void] [isPOD=0]
+// CHECK: CXXConstructor=Foo:5:3 (unavailable) (default constructor) [type=void (){{.*}}] [typekind=FunctionProto] [resulttype=void] [resulttypekind=Void] [isPOD=0]
diff --git a/test/Index/c-index-api-loadTU-test.m b/test/Index/c-index-api-loadTU-test.m
index 29c37397673a8..52f3a56d98008 100644
--- a/test/Index/c-index-api-loadTU-test.m
+++ b/test/Index/c-index-api-loadTU-test.m
@@ -73,6 +73,7 @@ struct X0  {};
 @interface TestAttributes()
 // <rdar://problem/9561076>
 @property (retain) IBOutlet id anotherOutlet;
+@property (class) int cProp;
 @end
 
 // CHECK: c-index-api-loadTU-test.m:4:12: ObjCInterfaceDecl=Foo:4:12 Extent=[4:1 - 12:5]
@@ -167,7 +168,7 @@ struct X0  {};
 // CHECK: c-index-api-loadTU-test.m:69:16: TypeRef=struct X0:71:8 Extent=[69:16 - 69:18]
 // CHECK: c-index-api-loadTU-test.m:70:8: StructDecl=X0:70:8 Extent=[70:1 - 70:10]
 // CHECK: c-index-api-loadTU-test.m:71:8: StructDecl=X0:71:8 (Definition) Extent=[71:1 - 71:14]
-// CHECK: c-index-api-loadTU-test.m:73:12: ObjCCategoryDecl=:73:12 Extent=[73:1 - 76:5]
+// CHECK: c-index-api-loadTU-test.m:73:12: ObjCCategoryDecl=:73:12 Extent=[73:1 - 77:5]
 // CHECK: c-index-api-loadTU-test.m:73:12: ObjCClassRef=TestAttributes:62:12 Extent=[73:12 - 73:26]
 // CHECK: c-index-api-loadTU-test.m:75:32: ObjCPropertyDecl=anotherOutlet:75:32 [retain,] Extent=[75:1 - 75:45]
 // CHECK: c-index-api-loadTU-test.m:75:20: attribute(iboutlet)= Extent=[75:20 - 75:28]
@@ -175,3 +176,7 @@ struct X0  {};
 // CHECK: c-index-api-loadTU-test.m:75:32: ObjCInstanceMethodDecl=anotherOutlet:75:32 Extent=[75:32 - 75:45]
 // CHECK: c-index-api-loadTU-test.m:75:32: ObjCInstanceMethodDecl=setAnotherOutlet::75:32 Extent=[75:32 - 75:45]
 // CHECK: c-index-api-loadTU-test.m:75:32: ParmDecl=anotherOutlet:75:32 (Definition) Extent=[75:32 - 75:45]
+// CHECK: c-index-api-loadTU-test.m:76:23: ObjCPropertyDecl=cProp:76:23 [class,] Extent=[76:1 - 76:28]
+// CHECK: c-index-api-loadTU-test.m:76:23: ObjCClassMethodDecl=cProp:76:23 Extent=[76:23 - 76:28]
+// CHECK: c-index-api-loadTU-test.m:76:23: ObjCClassMethodDecl=setCProp::76:23 Extent=[76:23 - 76:28]
+// CHECK: c-index-api-loadTU-test.m:76:23: ParmDecl=cProp:76:23 (Definition) Extent=[76:23 - 76:28]
diff --git a/test/Index/evaluate-cursor.cpp b/test/Index/evaluate-cursor.cpp
new file mode 100644
index 0000000000000..28a9c368795ab
--- /dev/null
+++ b/test/Index/evaluate-cursor.cpp
@@ -0,0 +1,30 @@
+// Test is line- and column-sensitive. Run lines are below.
+
+struct Foo {
+  int x = 10;
+};
+
+void foo() {
+  int p = 11;
+}
+
+#define FUNC_MAC(x) x
+
+void goo() {
+  int p = FUNC_MAC(1);
+  int a = __LINE__;
+}
+
+// RUN: c-index-test -evaluate-cursor-at=%s:4:7 \
+// RUN:    -evaluate-cursor-at=%s:8:7 \
+// RUN:    -evaluate-cursor-at=%s:8:11 -std=c++11 %s | FileCheck %s
+// CHECK: Value: 10
+// CHECK: Value: 11
+// CHECK: Value: 11
+
+// RUN: c-index-test -get-macro-info-cursor-at=%s:11:9 \
+// RUN:    -get-macro-info-cursor-at=%s:14:11 \
+// RUN:    -get-macro-info-cursor-at=%s:15:11 -std=c++11 %s | FileCheck -check-prefix=CHECK-MACRO %s
+// CHECK-MACRO: [function macro]
+// CHECK-MACRO: [function macro]
+// CHECK-MACRO: [builtin macro]
diff --git a/test/Index/file-refs.cpp b/test/Index/file-refs.cpp
index a96d27c630718..c5a728b434e61 100644
--- a/test/Index/file-refs.cpp
+++ b/test/Index/file-refs.cpp
@@ -59,7 +59,7 @@ void f() {
 // RUN:  -file-refs-at=%s:2:9 \
 // CHECK-NEXT: ClassDecl=C:2:9 (Definition)
 // CHECK-NEXT: ClassDecl=C:2:9 (Definition) =[2:9 - 2:10]
-// CHECK-NEXT: CXXConstructor=C:4:5 (Definition) =[4:5 - 4:6]
+// CHECK-NEXT: CXXConstructor=C:4:5 (Definition) (default constructor) =[4:5 - 4:6]
 // CHECK-NEXT: TypeRef=class NS::C:2:9 =[9:10 - 9:11]
 // CHECK-NEXT: TypeRef=class NS::C:2:9 =[10:3 - 10:4]
 // CHECK-NEXT: TypeRef=class NS::C:2:9 =[15:7 - 15:8]
@@ -69,7 +69,7 @@ void f() {
 // RUN:  -file-refs-at=%s:16:18 \
 // CHECK-NEXT: CallExpr=C:4:5
 // CHECK-NEXT: ClassDecl=C:2:9 (Definition) =[2:9 - 2:10]
-// CHECK-NEXT: CXXConstructor=C:4:5 (Definition) =[4:5 - 4:6]
+// CHECK-NEXT: CXXConstructor=C:4:5 (Definition) (default constructor) =[4:5 - 4:6]
 // CHECK-NEXT: TypeRef=class NS::C:2:9 =[9:10 - 9:11]
 // CHECK-NEXT: TypeRef=class NS::C:2:9 =[10:3 - 10:4]
 // CHECK-NEXT: TypeRef=class NS::C:2:9 =[15:7 - 15:8]
@@ -91,7 +91,7 @@ void f() {
 // CHECK-NEXT: CallExpr=S:35:3
 // CHECK-NEXT: StructDecl=S:34:8 (Definition) =[34:8 - 34:9]
 // CHECK-NEXT: CXXConstructor=S:35:3 =[35:3 - 35:4]
-// CHECK-NEXT: CXXConstructor=S:36:3 =[36:3 - 36:4]
+// CHECK-NEXT: CXXConstructor=S:36:3 (default constructor) =[36:3 - 36:4]
 // CHECK-NEXT: TypeRef=struct Test2::S:34:8 =[39:9 - 39:10]
 // CHECK-NEXT: TypeRef=struct Test2::S:34:8 =[43:14 - 43:15]
 
diff --git a/test/Index/get-cursor.cpp b/test/Index/get-cursor.cpp
index 60aab5f7ed79d..a2c46937e48e9 100644
--- a/test/Index/get-cursor.cpp
+++ b/test/Index/get-cursor.cpp
@@ -208,7 +208,7 @@ void test(TestColl coll) {
 // CHECK-TEMPLSPEC: 66:23 ClassDecl=TC:66:23 (Definition) [Specialization of TC:59:7] Extent=[66:1 - 66:31] Spelling=TC ([66:23 - 66:25])
 
 // RUN: c-index-test -cursor-at=%s:69:3 -cursor-at=%s:70:11 -cursor-at=%s:73:6 -cursor-at=%s:74:6 -cursor-at=%s:77:8 -cursor-at=%s:78:8 -cursor-at=%s:79:8 -cursor-at=%s:80:8 -cursor-at=%s:81:8 -cursor-at=%s:82:8 -cursor-at=%s:85:6 -cursor-at=%s:86:6 -cursor-at=%s:87:6 -cursor-at=%s:88:6 -cursor-at=%s:91:5 -cursor-at=%s:92:5 -cursor-at=%s:93:5 -cursor-at=%s:94:5 -cursor-at=%s:95:5 -cursor-at=%s:96:5 -cursor-at=%s:97:5 -cursor-at=%s:98:5 -cursor-at=%s:100:5 -cursor-at=%s:101:5 -cursor-at=%s:104:6 -cursor-at=%s:105:6 -cursor-at=%s:106:6 -cursor-at=%s:107:6 -cursor-at=%s:108:6 -cursor-at=%s:109:6 -cursor-at=%s:110:6 -cursor-at=%s:111:6 -cursor-at=%s:113:6 -cursor-at=%s:114:6 -cursor-at=%s:117:8 -cursor-at=%s:118:8 -cursor-at=%s:120:8 -cursor-at=%s:121:8 -cursor-at=%s:122:8 -cursor-at=%s:123:8 -cursor-at=%s:124:8 -cursor-at=%s:125:8 -cursor-at=%s:128:6 -cursor-at=%s:129:6 -cursor-at=%s:130:6 -cursor-at=%s:132:3 -std=c++11 %s | FileCheck -check-prefix=CHECK-SPELLING %s
-// CHECK-SPELLING: 69:3 CXXConstructor=A:69:3 Extent=[69:3 - 69:6] Spelling=A ([69:3 - 69:4])
+// CHECK-SPELLING: 69:3 CXXConstructor=A:69:3 (default constructor) Extent=[69:3 - 69:6] Spelling=A ([69:3 - 69:4])
 // CHECK-SPELLING: 70:11 CXXDestructor=~A:70:11 (virtual) Extent=[70:3 - 70:15] Spelling=~A ([70:11 - 70:13])
 // CHECK-SPELLING: 73:6 CXXMethod=operator=:73:6 Extent=[73:3 - 73:25] Spelling=operator= ([73:6 - 73:15])
 // CHECK-SPELLING: 74:6 CXXMethod=operator=:74:6 Extent=[74:3 - 74:29] Spelling=operator= ([74:6 - 74:15])
diff --git a/test/Index/index-decls.m b/test/Index/index-decls.m
index a405abc78c884..a39d9e3bfa6b7 100644
--- a/test/Index/index-decls.m
+++ b/test/Index/index-decls.m
@@ -48,6 +48,13 @@ int test1() {
 }
 @end
 
+// rdar://25372906
+@class I5;
+@interface I5
+-(void)meth;
+@property (class) int c;
+@end
+
 // RUN: c-index-test -index-file %s -target x86_64-apple-macosx10.7 > %t
 // RUN: FileCheck %s -input-file=%t
 // CHECK: [indexDeclaration]: kind: objc-class | name: I | {{.*}} | loc: 1:12
@@ -71,5 +78,9 @@ int test1() {
 // CHECK: [indexEntityReference]: kind: function | name: extfn | {{.*}} | loc: 33:10
 
 // CHECK: [indexDeclaration]: kind: objc-class | name: I4 | {{.*}} | loc: 36:12
+// CHECK: [indexEntityReference]: kind: objc-property | name: prop | {{.*}} | cursor: ObjCSynthesizeDecl=prop:37:34 (Definition) | loc: 43:13 | <parent>:: kind: objc-class | name: I4 | {{.*}} | container: [I4:42:17] | refkind: direct
 // CHECK-NOT: [indexDeclaration]: kind: objc-instance-method {{.*}} loc: 37:
 // CHECK-NOT: [indexDeclaration]: kind: objc-instance-method {{.*}} loc: 43:
+
+// CHECK: [indexDeclaration]: kind: objc-instance-method | name: meth | {{.*}} loc: 54:1 | {{.*}} | isRedecl: 0 | isDef: 0 |
+// CHECK: [indexDeclaration]: kind: objc-property | name: c | USR: c:objc(cs)I5(cpy)c | lang: ObjC | cursor: ObjCPropertyDecl=c:55:23 [class,] | loc: 55:23
diff --git a/test/Index/index-file.cpp b/test/Index/index-file.cpp
index f1ae68a2508de..f2dbabbae7b59 100644
--- a/test/Index/index-file.cpp
+++ b/test/Index/index-file.cpp
@@ -27,7 +27,18 @@ template class A<int>;
 class B {
   mutable int x_;
   int y_;
+
+  B() = default;
+  B(int);
+  explicit B(double);
+  B(const B&);
+  B(B&&);
+};
+
+class C {
+  explicit C(const C&);
 };
+
 // RUN: c-index-test -index-file %s > %t
 // RUN: FileCheck %s -input-file=%t
 
@@ -37,3 +48,9 @@ class B {
 // CHECK: [indexDeclaration]: kind: c++-instance-method | name: meth | {{.*}} | loc: 23:26
 // CHECK: [indexDeclaration]: kind: field | name: x_ | USR: c:@S@B@FI@x_ | lang: C++ | cursor: FieldDecl=x_:28:15 (Definition) (mutable) | loc: 28:15 | semantic-container: [B:27:7] | lexical-container: [B:27:7] | isRedecl: 0 | isDef: 1 | isContainer: 0 | isImplicit: 0
 // CHECK: [indexDeclaration]: kind: field | name: y_ | USR: c:@S@B@FI@y_ | lang: C++ | cursor: FieldDecl=y_:29:7 (Definition) | loc: 29:7 | semantic-container: [B:27:7] | lexical-container: [B:27:7] | isRedecl: 0 | isDef: 1 | isContainer: 0 | isImplicit: 0
+// CHECK: [indexDeclaration]: kind: constructor | name: B | {{.*}} (default constructor) (defaulted) | loc: 31:3
+// CHECK: [indexDeclaration]: kind: constructor | name: B | {{.*}} (converting constructor) | loc: 32:3
+// CHECK: [indexDeclaration]: kind: constructor | name: B | {{.*}} | loc: 33:12
+// CHECK: [indexDeclaration]: kind: constructor | name: B | {{.*}} (copy constructor) (converting constructor) | loc: 34:3
+// CHECK: [indexDeclaration]: kind: constructor | name: B | {{.*}} (move constructor) (converting constructor) | loc: 35:3
+// CHECK: [indexDeclaration]: kind: constructor | name: C | {{.*}} (copy constructor) | loc: 39:12
diff --git a/test/Index/index-many-logical-ops.c b/test/Index/index-many-logical-ops.c
index 0fd4e75236f1c..7940a21a39b7f 100644
--- a/test/Index/index-many-logical-ops.c
+++ b/test/Index/index-many-logical-ops.c
@@ -4,7 +4,7 @@
 // Check that we don't get stack overflow trying to index a huge number of
 // logical operators.
 
-// UBSan increses stack usage.
+// UBSan increases stack usage.
 // REQUIRES: not_ubsan
 
 // CHECK: [indexDeclaration]: kind: function | name: foo
diff --git a/test/Index/index-module.m b/test/Index/index-module.m
index ff512592b6708..d1c0b36b389ed 100644
--- a/test/Index/index-module.m
+++ b/test/Index/index-module.m
@@ -3,9 +3,11 @@
 @import DependsOnModule;
 int glob;
 
-// RUN: rm -rf %t.cache
+// RUN: rm -rf %t.cache %t.cache.sys
 // RUN: c-index-test -index-file %s -fmodules-cache-path=%t.cache -fmodules -F %S/../Modules/Inputs \
 // RUN:      -Xclang -fdisable-module-hash | FileCheck %s
+// RUN: c-index-test -index-file %s -fmodules-cache-path=%t.cache.sys -fmodules -iframework %S/../Modules/Inputs \
+// RUN:      -Xclang -fdisable-module-hash | FileCheck %s
 // RUN: c-index-test -index-file %s -fmodules-cache-path=%t.cache -fmodules -gmodules -F %S/../Modules/Inputs \
 // RUN:      -Xclang -fdisable-module-hash | FileCheck %s
 
@@ -18,6 +20,7 @@ int glob;
 // CHECK-NOT: [indexDeclaration]
 
 // RUN: c-index-test -index-tu %t.cache/DependsOnModule.pcm | FileCheck %s -check-prefix=CHECK-DMOD
+// RUN: c-index-test -index-tu %t.cache.sys/DependsOnModule.pcm | FileCheck %s -check-prefix=CHECK-DMOD-AST
 
 // CHECK-DMOD:      [startedTranslationUnit]
 // CHECK-DMOD-NEXT: [ppIncludedFile]: [[DMOD_MODULE_H:.*/Modules/Inputs/DependsOnModule\.framework[/\\]Headers[/\\]DependsOnModule\.h]] | {{.*}} | hash loc: <invalid> | {{.*}} | module: DependsOnModule
@@ -27,7 +30,7 @@ int glob;
 // CHECK-DMOD-NEXT: [ppIncludedFile]: [[DMOD_SUB_H:.*/Modules/Inputs/DependsOnModule\.framework[/\\]Frameworks[/\\]SubFramework\.framework[/\\]Headers[/\\]SubFramework\.h]] | {{.*}} | hash loc: <invalid> | {{.*}} | module: DependsOnModule.SubFramework
 // CHECK-DMOD-NEXT: [ppIncludedFile]: [[DMOD_SUB_OTHER_H:.*/Modules/Inputs/DependsOnModule.framework[/\\]Frameworks/SubFramework\.framework/Headers/Other\.h]] | name: "SubFramework/Other.h" | hash loc: [[DMOD_SUB_H]]:1:1 | isImport: 0 | isAngled: 0 | isModule: 0 | module: DependsOnModule.SubFramework.Other
 // CHECK-DMOD-NEXT: [ppIncludedFile]: [[DMOD_PRIVATE_H:.*/Modules/Inputs/DependsOnModule.framework[/\\]PrivateHeaders[/\\]DependsOnModulePrivate.h]] | {{.*}} | hash loc: <invalid> | {{.*}} | module: DependsOnModule.Private.DependsOnModule
-// CHECK-DMOD-NEXT: [importedASTFile]: {{.*}}.cache{{[/\\]}}Module.pcm | loc: [[DMOD_MODULE_H]]:1:1 | name: "Module" | isImplicit: 1
+// CHECK-DMOD-NEXT: [importedASTFile]: {{.*}}.cache{{(.sys)?[/\\]}}Module.pcm | loc: [[DMOD_MODULE_H]]:1:1 | name: "Module" | isImplicit: 1
 // CHECK-DMOD-NEXT: [indexDeclaration]: kind: variable | name: depends_on_module_other | {{.*}} | loc: [[DMOD_OTHER_H]]:1:5
 // CHECK-DMOD-NEXT: [indexDeclaration]: kind: variable | name: template | {{.*}} | loc: [[DMOD_NOT_CXX_H]]:1:12
 // CHECK-DMOD-NEXT: [indexDeclaration]: kind: variable | name: sub_framework | {{.*}} | loc: [[DMOD_SUB_H]]:2:8
@@ -35,6 +38,8 @@ int glob;
 // CHECK-DMOD-NEXT: [indexDeclaration]: kind: variable | name: depends_on_module_private | {{.*}} | loc: [[DMOD_PRIVATE_H]]:1:5
 // CHECK-DMOD-NOT: [indexDeclaration]
 
+// CHECK-DMOD-AST: [importedASTFile]: {{.*}}.cache.sys{{[/\\]}}Module.pcm | loc: {{.*}}DependsOnModule.h:1:1 | name: "Module" | isImplicit: 1
+
 // RUN: c-index-test -index-tu %t.cache/Module.pcm | FileCheck %s -check-prefix=CHECK-TMOD
 
 // CHECK-TMOD:      [startedTranslationUnit]
diff --git a/test/Index/index-refs.cpp b/test/Index/index-refs.cpp
index adbf02a7c6e22..5a1ba1e8b43ae 100644
--- a/test/Index/index-refs.cpp
+++ b/test/Index/index-refs.cpp
@@ -69,6 +69,8 @@ void foo5() {
   struct S2 s = { .y = 1, .x = 4};
 }
 
+int ginitlist[] = {EnumVal};
+
 // RUN: c-index-test -index-file %s | FileCheck %s
 // CHECK:      [indexDeclaration]: kind: namespace | name: NS
 // CHECK-NEXT: [indexDeclaration]: kind: variable | name: gx
@@ -119,3 +121,9 @@ void foo5() {
 
 // CHECK:      [indexEntityReference]: kind: field | name: y | {{.*}} | loc: 69:20
 // CHECK-NEXT: [indexEntityReference]: kind: field | name: x | {{.*}} | loc: 69:28
+// CHECK-NOT:  [indexEntityReference]: kind: field | name: y | {{.*}} | loc: 69:20
+// CHECK-NOT:  [indexEntityReference]: kind: field | name: x | {{.*}} | loc: 69:28
+
+// CHECK:      [indexDeclaration]: kind: variable | name: ginitlist |
+// CHECK:      [indexEntityReference]: kind: enumerator | name: EnumVal | {{.*}} | loc: 72:20
+// CHECK-NOT:  [indexEntityReference]: kind: enumerator | name: EnumVal | {{.*}} | loc: 72:20
diff --git a/test/Index/index-refs.m b/test/Index/index-refs.m
index f25013b882f6e..457712bcbc7ec 100644
--- a/test/Index/index-refs.m
+++ b/test/Index/index-refs.m
@@ -21,7 +21,12 @@ void foo2() {
   [I clsMeth];
 }
 
+@protocol ForwardProt;
+
 // RUN: c-index-test -index-file %s | FileCheck %s
 // CHECK: [indexEntityReference]: kind: objc-protocol | name: Prot | {{.*}} | loc: 12:27
 // CHECK: [indexEntityReference]: kind: struct | name: FooS | {{.*}} | loc: 13:18
 // CHECK: [indexEntityReference]: kind: objc-class | name: I | {{.*}} | loc: 21:4
+
+// CHECK: [indexDeclaration]: kind: objc-protocol | name: ForwardProt | {{.*}} | loc: 24:11
+// CHECK-NEXT: <ObjCContainerInfo>: kind: forward-ref
diff --git a/test/Index/keep-going.cpp b/test/Index/keep-going.cpp
new file mode 100644
index 0000000000000..a25d1c4b04124
--- /dev/null
+++ b/test/Index/keep-going.cpp
@@ -0,0 +1,29 @@
+#include "missing1.h"
+
+template<class T>
+class A { T a; };
+
+class B : public A<int> { };
+
+#include "missing2.h"
+
+class C : public A<float> { };
+
+// RUN: env CINDEXTEST_EDITING=1 CINDEXTEST_KEEP_GOING=1 c-index-test -test-print-type %s -std=c++03 2> %t.stderr.txt  | FileCheck %s
+// RUN: FileCheck -check-prefix CHECK-DIAG %s < %t.stderr.txt
+
+// CHECK: inclusion directive=missing1.h ((null)) [type=] [typekind=Invalid] [isPOD=0]
+// CHECK: inclusion directive=missing2.h ((null)) [type=] [typekind=Invalid] [isPOD=0]
+// CHECK: ClassTemplate=A:4:7 (Definition) [type=] [typekind=Invalid] [isPOD=0]
+// CHECK: TemplateTypeParameter=T:3:16 (Definition) [type=T] [typekind=Unexposed] [canonicaltype=type-parameter-0-0] [canonicaltypekind=Unexposed] [isPOD=0]
+// CHECK: FieldDecl=a:4:13 (Definition) [type=T] [typekind=Unexposed] [canonicaltype=type-parameter-0-0] [canonicaltypekind=Unexposed] [isPOD=0]
+// CHECK: TypeRef=T:3:16 [type=T] [typekind=Unexposed] [canonicaltype=type-parameter-0-0] [canonicaltypekind=Unexposed] [isPOD=0]
+// CHECK: ClassDecl=B:6:7 (Definition) [type=B] [typekind=Record] [isPOD=0]
+// CHECK: C++ base class specifier=A<int>:4:7 [access=public isVirtual=false] [type=A<int>] [typekind=Unexposed] [canonicaltype=A<int>] [canonicaltypekind=Record] [templateargs/1= [type=int] [typekind=Int]] [isPOD=0] [nbFields=1]
+// CHECK: TemplateRef=A:4:7 [type=] [typekind=Invalid] [isPOD=0]
+// CHECK: ClassDecl=C:10:7 (Definition) [type=C] [typekind=Record] [isPOD=0]
+// CHECK: C++ base class specifier=A<float>:4:7 [access=public isVirtual=false] [type=A<float>] [typekind=Unexposed] [canonicaltype=A<float>] [canonicaltypekind=Record] [templateargs/1= [type=float] [typekind=Float]] [isPOD=0] [nbFields=1]
+// CHECK: TemplateRef=A:4:7 [type=] [typekind=Invalid] [isPOD=0]
+
+// CHECK-DIAG: keep-going.cpp:1:10: error: 'missing1.h' file not found
+// CHECK-DIAG: keep-going.cpp:8:10: error: 'missing2.h' file not found
diff --git a/test/Index/load-classes.cpp b/test/Index/load-classes.cpp
index 3b66be5398afe..f527db521089e 100644
--- a/test/Index/load-classes.cpp
+++ b/test/Index/load-classes.cpp
@@ -25,10 +25,10 @@ X::X(int value) {
 
 // RUN: c-index-test -test-load-source all %s | FileCheck %s
 // CHECK: load-classes.cpp:3:8: StructDecl=X:3:8 (Definition) Extent=[3:1 - 21:2]
-// CHECK: load-classes.cpp:4:3: CXXConstructor=X:4:3 Extent=[4:3 - 4:15] [access=public]
+// CHECK: load-classes.cpp:4:3: CXXConstructor=X:4:3 (converting constructor) Extent=[4:3 - 4:15] [access=public]
 // FIXME: missing TypeRef in the constructor name
 // CHECK: load-classes.cpp:4:9: ParmDecl=value:4:9 (Definition) Extent=[4:5 - 4:14]
-// CHECK: load-classes.cpp:5:3: CXXConstructor=X:5:3 Extent=[5:3 - 5:16] [access=public]
+// CHECK: load-classes.cpp:5:3: CXXConstructor=X:5:3 (copy constructor) (converting constructor) Extent=[5:3 - 5:16] [access=public]
 // FIXME: missing TypeRef in the constructor name
 // CHECK: load-classes.cpp:5:14: ParmDecl=x:5:14 (Definition) Extent=[5:5 - 5:15]
 // CHECK: load-classes.cpp:5:11: TypeRef=struct X:3:8 Extent=[5:11 - 5:12]
@@ -46,7 +46,7 @@ X::X(int value) {
 // CHECK: load-classes.cpp:16:21: TemplateTypeParameter=T:16:21 (Definition) Extent=[16:12 - 16:22] [access=public]
 // CHECK: load-classes.cpp:19:16: CXXMethod=virtualMemberFunction:19:16 (virtual) Extent=[19:3 - 19:39] [access=private]
 // CHECK: load-classes.cpp:20:16: CXXMethod=pureVirtualMemberFunction:20:16 (virtual) (pure) Extent=[20:3 - 20:47] [access=private]
-// CHECK: load-classes.cpp:23:4: CXXConstructor=X:23:4 (Definition) Extent=[23:1 - 24:2] [access=public]
+// CHECK: load-classes.cpp:23:4: CXXConstructor=X:23:4 (Definition) (converting constructor) Extent=[23:1 - 24:2] [access=public]
 // CHECK: load-classes.cpp:23:1: TypeRef=struct X:3:8 Extent=[23:1 - 23:2]
 // CHECK: load-classes.cpp:23:10: ParmDecl=value:23:10 (Definition) Extent=[23:6 - 23:15]
 // CHECK: load-classes.cpp:23:17: CompoundStmt= Extent=[23:17 - 24:2]
diff --git a/test/Index/pch-warn-as-error-code-split.cpp b/test/Index/pch-warn-as-error-code-split.cpp
new file mode 100644
index 0000000000000..f9efc8f95db45
--- /dev/null
+++ b/test/Index/pch-warn-as-error-code-split.cpp
@@ -0,0 +1,17 @@
+// RUN: env CINDEXTEST_EDITING=1 c-index-test -test-load-source local %s -Wuninitialized -Werror=unused 2>&1 | FileCheck -check-prefix=DIAGS %s
+
+// Make sure -Wuninitialized works even though the header had a warn-as-error occurrence.
+
+// DIAGS: error: unused variable 'x'
+// DIAGS: warning: variable 'x1' is uninitialized
+// DIAGS-NOT: error: use of undeclared identifier
+// DIAGS: warning: variable 'x1' is uninitialized
+
+#include "pch-warn-as-error-code-split.h"
+
+void test() {
+  int x1; // expected-note {{initialize}}
+  int x2 = x1; // expected-warning {{uninitialized}}
+  (void)x2;
+  foo_head();
+}
diff --git a/test/Index/pch-warn-as-error-code-split.h b/test/Index/pch-warn-as-error-code-split.h
new file mode 100644
index 0000000000000..5893ee2a3f760
--- /dev/null
+++ b/test/Index/pch-warn-as-error-code-split.h
@@ -0,0 +1,4 @@
+
+static void foo_head() {
+  int x;
+}
diff --git a/test/Index/pch-warn-as-error-code.cpp b/test/Index/pch-warn-as-error-code.cpp
new file mode 100644
index 0000000000000..6a7924a09e631
--- /dev/null
+++ b/test/Index/pch-warn-as-error-code.cpp
@@ -0,0 +1,27 @@
+// RUN: rm -f %t.head.h.pch
+// RUN: c-index-test -write-pch %t.head.h.pch %s -Wuninitialized -Werror=unused 2>&1 | FileCheck -check-prefix=HEAD_DIAGS %s
+// RUN: c-index-test -test-load-source local %s -include %t.head.h -Wuninitialized -Werror=unused 2>&1 | FileCheck -check-prefix=MAIN_DIAGS %s
+
+// Make sure -Wuninitialized works even though the header had a warn-as-error occurrence.
+
+// HEAD_DIAGS: error: unused variable 'x'
+// MAIN_DIAGS: warning: variable 'x1' is uninitialized
+// MAIN_DIAGS-NOT: error: use of undeclared identifier
+
+#ifndef HEADER
+#define HEADER
+
+static void foo_head() {
+  int x;
+}
+
+#else
+
+void test() {
+  int x1; // expected-note {{initialize}}
+  int x2 = x1; // expected-warning {{uninitialized}}
+  (void)x2;
+  foo_head();
+}
+
+#endif
diff --git a/test/Index/preamble-with-implicit-import.m b/test/Index/preamble-with-implicit-import.m
new file mode 100644
index 0000000000000..e3d0e8b1a62bc
--- /dev/null
+++ b/test/Index/preamble-with-implicit-import.m
@@ -0,0 +1,6 @@
+// RUN: rm -rf %t
+// RUN: env CINDEXTEST_EDITING=1 c-index-test -test-load-source-reparse 2 none %s -I %S/Inputs -fmodules -fmodules-cache-path=%t -fspell-checking 2>&1 | FileCheck %s
+// CHECK: error: declaration of 'Typo' must be imported
+// CHECK: error: declaration of 'Typo' must be imported
+
+#include "preamble-with-implicit-import.h"
diff --git a/test/Index/print-cxx-manglings.cpp b/test/Index/print-cxx-manglings.cpp
index aae2993017276..27d7988eb79a1 100644
--- a/test/Index/print-cxx-manglings.cpp
+++ b/test/Index/print-cxx-manglings.cpp
@@ -64,3 +64,33 @@ struct v {
 
 // MSVC: CXXConstructor=v{{.*}}[mangled=??0v@@QAE@H@Z] [mangled=??_Fv@@QAEXXZ]
 
+struct w {
+  virtual int m(int);
+};
+
+// ITANIUM: CXXMethod=m{{.*}} (virtual) [mangled=_ZN1w1mEi]
+
+// MACHO: CXXMethod=m{{.*}} (virtual) [mangled=__ZN1w1mEi]
+
+// MSVC: CXXMethod=m{{.*}} (virtual) [mangled=?m@w@@UAEHH@Z]
+
+struct x {
+  virtual int m(int);
+};
+
+// ITANIUM: CXXMethod=m{{.*}} (virtual) [mangled=_ZN1x1mEi]
+
+// MACHO: CXXMethod=m{{.*}} (virtual) [mangled=__ZN1x1mEi]
+
+// MSVC: CXXMethod=m{{.*}} (virtual) [mangled=?m@x@@UAEHH@Z]
+
+struct y : w, x {
+  virtual int m(int);
+};
+
+// ITANIUM: CXXMethod=m{{.*}} (virtual) {{.*}} [mangled=_ZN1y1mEi] [mangled=_ZThn4_N1y1mEi]
+
+// MACHO: CXXMethod=m{{.*}} (virtual) {{.*}} [mangled=__ZN1y1mEi] [mangled=__ZThn4_N1y1mEi]
+
+// MSVC: CXXMethod=m{{.*}} (virtual) {{.*}} [mangled=?m@y@@UAEHH@Z] [mangled=?m@y@@W3AEHH@Z]
+
diff --git a/test/Index/print-type.c b/test/Index/print-type.c
index 35aab711db3d8..ebe42970eeb3c 100644
--- a/test/Index/print-type.c
+++ b/test/Index/print-type.c
@@ -12,6 +12,9 @@ typedef int __attribute__((vector_size(16))) int4_t;
 
 int f2(int incompletearray[]);
 
+enum Enum{i}; enum Enum elaboratedEnumType();
+struct Struct{}; struct Struct elaboratedStructType();
+
 // RUN: c-index-test -test-print-type %s | FileCheck %s
 // CHECK: FunctionDecl=f:3:6 (Definition) [type=int *(int *, char *, FooType, int *, void (*)(int))] [typekind=FunctionProto] [canonicaltype=int *(int *, char *, int, int *, void (*)(int))] [canonicaltypekind=FunctionProto] [resulttype=int *] [resulttypekind=Pointer] [args= [int *] [Pointer] [char *] [Pointer] [FooType] [Typedef] [int [5]] [ConstantArray] [void (*)(int)] [Pointer]] [isPOD=0]
 // CHECK: ParmDecl=p:3:13 (Definition) [type=int *] [typekind=Pointer] [isPOD=1] [pointeetype=int] [pointeekind=Int]
@@ -45,3 +48,8 @@ int f2(int incompletearray[]);
 // CHECK: VarDecl=x:10:38 [type=__attribute__((__vector_size__(4 * sizeof(int)))) int] [typekind=Vector] [isPOD=1]
 // CHECK: TypedefDecl=int4_t:11:46 (Definition) [type=int4_t] [typekind=Typedef] [canonicaltype=__attribute__((__vector_size__(4 * sizeof(int)))) int] [canonicaltypekind=Vector] [isPOD=1]
 // CHECK: ParmDecl=incompletearray:13:12 (Definition) [type=int []] [typekind=IncompleteArray] [isPOD=1]
+// CHECK: FunctionDecl=elaboratedEnumType:15:25 [type=enum Enum ()] [typekind=FunctionNoProto] [canonicaltype=enum Enum ()] [canonicaltypekind=FunctionNoProto] [resulttype=enum Enum] [resulttypekind=Elaborated] [isPOD=0]
+// CHECK: TypeRef=enum Enum:15:6 [type=enum Enum] [typekind=Enum] [isPOD=1]
+// CHECK: StructDecl=Struct:16:8 (Definition) [type=struct Struct] [typekind=Record] [isPOD=1]
+// CHECK: FunctionDecl=elaboratedStructType:16:32 [type=struct Struct ()] [typekind=FunctionNoProto] [canonicaltype=struct Struct ()] [canonicaltypekind=FunctionNoProto] [resulttype=struct Struct] [resulttypekind=Elaborated] [isPOD=0]
+// CHECK: TypeRef=struct Struct:16:8 [type=struct Struct] [typekind=Record] [isPOD=1]
diff --git a/test/Index/print-type.cpp b/test/Index/print-type.cpp
index 61135e3e8ace2..44fc11c365a48 100644
--- a/test/Index/print-type.cpp
+++ b/test/Index/print-type.cpp
@@ -48,7 +48,7 @@ struct Blob {
 };
 int Blob::*member_pointer;
 
-
+namespace NS { struct Type{}; } NS::Type elaboratedNamespaceType(const NS::Type t);
 
 auto autoI = 0;
 auto autoTbar = tbar<int>(0);
@@ -68,8 +68,8 @@ decltype(auto) autoInt = 5;
 // CHECK: TemplateTemplateParameter=W:8:60 (Definition) [type=] [typekind=Invalid] [isPOD=0]
 // CHECK: Namespace=inner:14:11 (Definition) [type=] [typekind=Invalid] [isPOD=0]
 // CHECK: StructDecl=Bar:16:8 (Definition) [type=outer::inner::Bar] [typekind=Record] [isPOD=0] [nbFields=3]
-// CHECK: CXXConstructor=Bar:17:3 (Definition) [type=void (outer::Foo<bool> *){{.*}}] [typekind=FunctionProto] [canonicaltype=void (outer::Foo<bool> *){{.*}}] [canonicaltypekind=FunctionProto] [resulttype=void] [resulttypekind=Void] [args= [outer::Foo<bool> *] [Pointer]] [isPOD=0]
-// CHECK: ParmDecl=foo:17:25 (Definition) [type=outer::Foo<bool> *] [typekind=Pointer] [canonicaltype=outer::Foo<bool> *] [canonicaltypekind=Pointer] [isPOD=1] [pointeetype=outer::Foo<bool>] [pointeekind=Unexposed]
+// CHECK: CXXConstructor=Bar:17:3 (Definition) (converting constructor) [type=void (outer::Foo<bool> *){{.*}}] [typekind=FunctionProto] [canonicaltype=void (outer::Foo<bool> *){{.*}}] [canonicaltypekind=FunctionProto] [resulttype=void] [resulttypekind=Void] [args= [outer::Foo<bool> *] [Pointer]] [isPOD=0]
+// CHECK: ParmDecl=foo:17:25 (Definition) [type=outer::Foo<bool> *] [typekind=Pointer] [canonicaltype=outer::Foo<bool> *] [canonicaltypekind=Pointer] [isPOD=1] [pointeetype=outer::Foo<bool>] [pointeekind=Elaborated]
 // CHECK: NamespaceRef=outer:1:11 [type=] [typekind=Invalid] [isPOD=0]
 // CHECK: TemplateRef=Foo:4:8 [type=] [typekind=Invalid] [isPOD=0]
 // CHECK: CompoundStmt= [type=] [typekind=Invalid] [isPOD=0]
@@ -127,6 +127,10 @@ decltype(auto) autoInt = 5;
 // CHECK: StructDecl=Blob:45:8 (Definition) [type=Blob] [typekind=Record] [isPOD=1] [nbFields=2]
 // CHECK: FieldDecl=i:46:7 (Definition) [type=int] [typekind=Int] [isPOD=1]
 // CHECK: VarDecl=member_pointer:49:12 (Definition) [type=int Blob::*] [typekind=MemberPointer] [isPOD=1]
+// CHECK: FunctionDecl=elaboratedNamespaceType:51:42 [type=NS::Type (const NS::Type)] [typekind=FunctionProto] [canonicaltype=NS::Type (NS::Type)] [canonicaltypekind=FunctionProto] [resulttype=NS::Type] [resulttypekind=Elaborated] [args= [const NS::Type] [Elaborated]] [isPOD=0]
+// CHECK: NamespaceRef=NS:51:11 [type=] [typekind=Invalid] [isPOD=0]
+// CHECK: TypeRef=struct NS::Type:51:23 [type=NS::Type] [typekind=Record] [isPOD=1]
+// CHECK: ParmDecl=t:51:81 (Definition) [type=const NS::Type] [typekind=Elaborated] const [canonicaltype=const NS::Type] [canonicaltypekind=Record] [isPOD=1]
 // CHECK: VarDecl=autoI:53:6 (Definition) [type=int] [typekind=Auto] [canonicaltype=int] [canonicaltypekind=Int] [isPOD=1]
 // CHECK: IntegerLiteral= [type=int] [typekind=Int] [isPOD=1]
 // CHECK: VarDecl=autoTbar:54:6 (Definition) [type=int] [typekind=Auto] [canonicaltype=int] [canonicaltypekind=Int] [isPOD=1]
diff --git a/test/Index/print-type.m b/test/Index/print-type.m
index 777069b3a58b3..392399a573f1c 100644
--- a/test/Index/print-type.m
+++ b/test/Index/print-type.m
@@ -4,6 +4,7 @@
 -(const id) mymethod2:(id)x blah:(Class)y boo:(SEL)z;
 -(bycopy)methodIn:(in int)i andOut:(out short *)j , ...;
 -(void)kindof_meth:(__kindof Foo *)p;
+@property (class) int classProp;
 @end
 
 // RUN: c-index-test -test-print-type %s | FileCheck %s
@@ -15,3 +16,4 @@
 // CHECK: ParmDecl=i:5:27 (Definition) [In,] [type=int] [typekind=Int] [isPOD=1]
 // CHECK: ParmDecl=j:5:49 (Definition) [Out,] [type=short *] [typekind=Pointer] [isPOD=1] [pointeetype=short] [pointeekind=Short]
 // CHECK: ParmDecl=p:6:36 (Definition) [type=__kindof Foo *] [typekind=ObjCObjectPointer] [canonicaltype=__kindof Foo *] [canonicaltypekind=ObjCObjectPointer] [isPOD=1] [pointeetype=Foo] [pointeekind=ObjCInterface]
+// CHECK: ObjCPropertyDecl=classProp:7:23 [class,] [type=int] [typekind=Int] [isPOD=1]
diff --git a/test/Index/properties-class-extensions.m b/test/Index/properties-class-extensions.m
index 0fa0ecba6b634..7af6553fbd9d7 100644
--- a/test/Index/properties-class-extensions.m
+++ b/test/Index/properties-class-extensions.m
@@ -70,7 +70,7 @@
 // CHECK-NOT: properties-class-extensions.m:16:25: ObjCInstanceMethodDecl=bar:16:25 Extent=[16:25 - 16:28]
 // CHECK: properties-class-extensions.m:19:26: ObjCInstanceMethodDecl=setBar::19:26 Extent=[19:26 - 19:29]
 // CHECK: properties-class-extensions.m:19:26: ParmDecl=bar:19:26 (Definition) Extent=[19:26 - 19:29]
-// CHECK: properties-class-extensions.m:24:8: ObjCInterfaceDecl=Rdar8467189_Bar:24:8 Extent=[24:1 - 24:23]
+// CHECK-NOT: properties-class-extensions.m:24:8: ObjCInterfaceDecl=Rdar8467189_Bar:24:8
 // CHECK: properties-class-extensions.m:24:8: ObjCClassRef=Rdar8467189_Bar:24:8 Extent=[24:8 - 24:23]
 // CHECK: properties-class-extensions.m:25:11: ObjCProtocolDecl=Rdar8467189_FooProtocol:25:11 (Definition) Extent=[25:1 - 27:5]
 // CHECK: properties-class-extensions.m:26:39: ObjCPropertyDecl=Rdar8467189_Bar:26:39 [readonly,] Extent=[26:1 - 26:54]
diff --git a/test/Index/recursive-cxx-member-calls.cpp b/test/Index/recursive-cxx-member-calls.cpp
index 34a56529548f9..36d617f48b09d 100644
--- a/test/Index/recursive-cxx-member-calls.cpp
+++ b/test/Index/recursive-cxx-member-calls.cpp
@@ -1653,7 +1653,7 @@ AttributeList::Kind AttributeList::getKind(const IdentifierInfo * Name) {
 // CHECK: 45:58: DeclRefExpr=a:45:28 Extent=[45:58 - 45:59]
 // CHECK: 45:62: DeclRefExpr=b:45:38 Extent=[45:62 - 45:63]
 // CHECK: 46:1: CXXAccessSpecifier=:46:1 (Definition) Extent=[46:1 - 46:8]
-// CHECK: 47:3: CXXConstructor=StringRef:47:3 (Definition) Extent=[47:3 - 47:37]
+// CHECK: 47:3: CXXConstructor=StringRef:47:3 (Definition) (default constructor) Extent=[47:3 - 47:37]
 // CHECK: 47:16: MemberRef=Data:43:15 Extent=[47:16 - 47:20]
 // CHECK: 47:21: UnexposedExpr= Extent=[47:21 - 47:22]
 // CHECK: 47:21: IntegerLiteral= Extent=[47:21 - 47:22]
@@ -1661,7 +1661,7 @@ AttributeList::Kind AttributeList::getKind(const IdentifierInfo * Name) {
 // CHECK: 47:32: UnexposedExpr= Extent=[47:32 - 47:33]
 // CHECK: 47:32: IntegerLiteral= Extent=[47:32 - 47:33]
 // CHECK: 47:35: CompoundStmt= Extent=[47:35 - 47:37]
-// CHECK: 48:3: CXXConstructor=StringRef:48:3 (Definition) Extent=[48:3 - 48:71]
+// CHECK: 48:3: CXXConstructor=StringRef:48:3 (Definition) (converting constructor) Extent=[48:3 - 48:71]
 // CHECK: 48:25: ParmDecl=Str:48:25 (Definition) Extent=[48:13 - 48:28]
 // CHECK: 48:32: MemberRef=Data:43:15 Extent=[48:32 - 48:36]
 // CHECK: 48:37: DeclRefExpr=Str:48:25 Extent=[48:37 - 48:40]
@@ -1768,7 +1768,7 @@ AttributeList::Kind AttributeList::getKind(const IdentifierInfo * Name) {
 // CHECK: 65:11: Namespace=clang:65:11 (Definition) Extent=[65:1 - 81:2]
 // CHECK: 66:7: ClassDecl=IdentifierInfo:66:7 (Definition) Extent=[66:1 - 80:2]
 // CHECK: 67:1: CXXAccessSpecifier=:67:1 (Definition) Extent=[67:1 - 67:8]
-// CHECK: 67:8: CXXConstructor=IdentifierInfo:67:8 Extent=[67:8 - 67:24]
+// CHECK: 67:8: CXXConstructor=IdentifierInfo:67:8 (default constructor) Extent=[67:8 - 67:24]
 // CHECK: 68:15: CXXMethod=getNameStart:68:15 (Definition) (const) Extent=[68:3 - 71:4] [access=public]
 // CHECK: 68:36: CompoundStmt= Extent=[68:36 - 71:4]
 // CHECK: 69:5: DeclStmt= Extent=[69:5 - 69:65]
diff --git a/test/Index/remap-load.c b/test/Index/remap-load.c
index f433fa7eac756..f886cea7da754 100644
--- a/test/Index/remap-load.c
+++ b/test/Index/remap-load.c
@@ -1,4 +1,4 @@
-// RUN: c-index-test -test-load-source all -remap-file="%s,%S/Inputs/remap-load-to.c" %s | FileCheck -check-prefix=CHECK %s
+// RUN: c-index-test -test-load-source all -remap-file="%s,%S/Inputs/remap-load-to.c" %s | FileCheck %s
 
 // CHECK: remap-load.c:1:5: FunctionDecl=foo:1:5 (Definition) Extent=[1:1 - 3:2]
 // CHECK: remap-load.c:1:13: ParmDecl=parm1:1:13 (Definition) Extent=[1:9 - 1:18]
diff --git a/test/Index/retain-comments-from-system-headers.c b/test/Index/retain-comments-from-system-headers.c
index 490699dbd884f..ac4f4fa8e3cd4 100644
--- a/test/Index/retain-comments-from-system-headers.c
+++ b/test/Index/retain-comments-from-system-headers.c
@@ -13,7 +13,7 @@ int user_function(int a);
 // RUN: c-index-test -test-load-source all %s -fretain-comments-from-system-headers -I %S/Inputs | FileCheck %s -check-prefix=CHECK-RETAIN
 
 // Modules:
-// RUN: c-index-test -test-load-source all %s -I %S/Inputs -fmodules -fmodules-cache-path=%t/cache -fmodule-map-file=%S/Inputs/retain-comments-from-system-headers-module.map | FileCheck %s -check-prefix=CHECK
+// RUN: c-index-test -test-load-source all %s -I %S/Inputs -fmodules -fmodules-cache-path=%t/cache -fmodule-map-file=%S/Inputs/retain-comments-from-system-headers-module.map | FileCheck %s
 // RUN: c-index-test -test-load-source all %s -fretain-comments-from-system-headers -I %S/Inputs -fmodules -fmodules-cache-path=%t/cache -fmodule-map-file=%S/Inputs/retain-comments-from-system-headers-module.map | FileCheck %s -check-prefix=CHECK-RETAIN
 
 // CHECK: retain-comments-from-system-headers.h:7:5: FunctionDecl=system_function:7:5 Extent=[7:1 - 7:27]
diff --git a/test/Index/skip-parsed-bodies/compile_commands.json b/test/Index/skip-parsed-bodies/compile_commands.json
index 30ede0db1015d..62303cbc87dbc 100644
--- a/test/Index/skip-parsed-bodies/compile_commands.json
+++ b/test/Index/skip-parsed-bodies/compile_commands.json
@@ -1,22 +1,21 @@
 [
 {
   "directory": ".",
-  "command": "/usr/bin/clang++ -fsyntax-only t1.cpp",
+  "command": "/usr/bin/clang++ -fsyntax-only -fno-ms-compatibility -fno-delayed-template-parsing t1.cpp",
   "file": "t1.cpp"
 },
 {
   "directory": ".",
-  "command": "/usr/bin/clang++ -fsyntax-only t2.cpp -DBLAH",
+  "command": "/usr/bin/clang++ -fsyntax-only -fno-ms-compatibility -fno-delayed-template-parsing t2.cpp -DBLAH",
   "file": "t2.cpp"
 },
 {
   "directory": ".",
-  "command": "/usr/bin/clang++ -fsyntax-only t3.cpp -DBLAH",
+  "command": "/usr/bin/clang++ -fsyntax-only -fno-ms-compatibility -fno-delayed-template-parsing t3.cpp -DBLAH",
   "file": "t2.cpp"
 }
 ]
 
-// XFAIL: mingw32,win32,windows-gnu
 // RUN: c-index-test -index-compile-db %s | FileCheck %s
 
 // CHECK:      [startedTranslationUnit]
diff --git a/test/Index/usrs.m b/test/Index/usrs.m
index fc3fbc9105789..92c3a3fafee2c 100644
--- a/test/Index/usrs.m
+++ b/test/Index/usrs.m
@@ -110,12 +110,12 @@ int test_multi_declaration(void) {
 // CHECK: usrs.m c:usrs.m@F@my_helper Extent=[3:1 - 3:60]
 // CHECK: usrs.m c:usrs.m@95@F@my_helper@x Extent=[3:29 - 3:34]
 // CHECK: usrs.m c:usrs.m@102@F@my_helper@y Extent=[3:36 - 3:41]
-// CHECK: usrs.m c:usrs.m@Ea Extent=[5:1 - 8:2]
-// CHECK: usrs.m c:usrs.m@Ea@ABA Extent=[6:3 - 6:6]
-// CHECK: usrs.m c:usrs.m@Ea@CADABA Extent=[7:3 - 7:9]
-// CHECK: usrs.m c:usrs.m@Ea Extent=[10:1 - 13:2]
-// CHECK: usrs.m c:usrs.m@Ea@FOO Extent=[11:3 - 11:6]
-// CHECK: usrs.m c:usrs.m@Ea@BAR Extent=[12:3 - 12:6]
+// CHECK: usrs.m c:@Ea@ABA Extent=[5:1 - 8:2]
+// CHECK: usrs.m c:@Ea@ABA@ABA Extent=[6:3 - 6:6]
+// CHECK: usrs.m c:@Ea@ABA@CADABA Extent=[7:3 - 7:9]
+// CHECK: usrs.m c:@Ea@FOO Extent=[10:1 - 13:2]
+// CHECK: usrs.m c:@Ea@FOO@FOO Extent=[11:3 - 11:6]
+// CHECK: usrs.m c:@Ea@FOO@BAR Extent=[12:3 - 12:6]
 // CHECK: usrs.m c:@SA@MyStruct Extent=[15:9 - 18:2]
 // CHECK: usrs.m c:@SA@MyStruct@FI@wa Extent=[16:3 - 16:9]
 // CHECK: usrs.m c:@SA@MyStruct@FI@moo Extent=[17:3 - 17:10]
diff --git a/test/Layout/ms-x86-declspec-empty_bases.cpp b/test/Layout/ms-x86-declspec-empty_bases.cpp
new file mode 100644
index 0000000000000..cc13a980cb5db
--- /dev/null
+++ b/test/Layout/ms-x86-declspec-empty_bases.cpp
@@ -0,0 +1,266 @@
+// RUN: %clang_cc1 -fno-rtti -emit-llvm-only -triple i686-pc-win32 -fms-extensions -fdump-record-layouts -fsyntax-only %s 2>/dev/null \
+// RUN:            | FileCheck %s
+// RUN: %clang_cc1 -fno-rtti -emit-llvm-only -triple x86_64-pc-win32 -fms-extensions -fdump-record-layouts -fsyntax-only %s 2>/dev/null \
+// RUN:            | FileCheck %s
+
+namespace test1 {
+
+struct A {
+  int a;
+};
+struct B {
+  int b;
+};
+struct C {};
+struct __declspec(align(16)) D {};
+struct __declspec(empty_bases) X : A, D, B, C {
+};
+
+// CHECK: *** Dumping AST Record Layout
+// CHECK-NEXT:          0 | struct test1::A
+// CHECK-NEXT:          0 |   int a
+// CHECK-NEXT:            | [sizeof=4, align=4,
+// CHECK-NEXT:            |  nvsize=4, nvalign=4]
+
+// CHECK: *** Dumping AST Record Layout
+// CHECK-NEXT:          0 | struct test1::D (empty)
+// CHECK-NEXT:            | [sizeof=16, align=16,
+// CHECK-NEXT:            |  nvsize=0, nvalign=16]
+
+// CHECK: *** Dumping AST Record Layout
+// CHECK-NEXT:          0 | struct test1::B
+// CHECK-NEXT:          0 |   int b
+// CHECK-NEXT:            | [sizeof=4, align=4,
+// CHECK-NEXT:            |  nvsize=4, nvalign=4]
+
+// CHECK: *** Dumping AST Record Layout
+// CHECK-NEXT:          0 | struct test1::C (empty)
+// CHECK-NEXT:            | [sizeof=1, align=1,
+// CHECK-NEXT:            |  nvsize=0, nvalign=1]
+
+// CHECK: *** Dumping AST Record Layout
+// CHECK-NEXT:          0 | struct test1::X
+// CHECK-NEXT:          0 |   struct test1::A (base)
+// CHECK-NEXT:          0 |     int a
+// CHECK-NEXT:          0 |   struct test1::D (base) (empty)
+// CHECK-NEXT:          0 |   struct test1::C (base) (empty)
+// CHECK-NEXT:          4 |   struct test1::B (base)
+// CHECK-NEXT:          4 |     int b
+// CHECK-NEXT:            | [sizeof=16, align=16,
+// CHECK-NEXT:            |  nvsize=16, nvalign=16]
+
+int _ = sizeof(X);
+}
+
+namespace test2 {
+struct A {
+  int a;
+};
+struct __declspec(empty_bases) B {};
+struct C : A {
+  B b;
+};
+
+struct D {};
+struct E {
+  int e;
+};
+struct F : D, E {};
+
+struct G : C, F {};
+
+int _ = sizeof(G);
+
+// CHECK: *** Dumping AST Record Layout
+// CHECK-NEXT:          0 | struct test2::A
+// CHECK-NEXT:          0 |   int a
+// CHECK-NEXT:            | [sizeof=4, align=4,
+// CHECK-NEXT:            |  nvsize=4, nvalign=4]
+
+// CHECK: *** Dumping AST Record Layout
+// CHECK-NEXT:          0 | struct test2::B (empty)
+// CHECK-NEXT:            | [sizeof=1, align=1,
+// CHECK-NEXT:            |  nvsize=0, nvalign=1]
+
+// CHECK: *** Dumping AST Record Layout
+// CHECK-NEXT:          0 | struct test2::C
+// CHECK-NEXT:          0 |   struct test2::A (base)
+// CHECK-NEXT:          0 |     int a
+// CHECK-NEXT:          4 |   struct test2::B b (empty)
+// CHECK-NEXT:            | [sizeof=8, align=4,
+// CHECK-NEXT:            |  nvsize=8, nvalign=4]
+
+// CHECK: *** Dumping AST Record Layout
+// CHECK-NEXT:          0 | struct test2::D (empty)
+// CHECK-NEXT:            | [sizeof=1, align=1,
+// CHECK-NEXT:            |  nvsize=0, nvalign=1]
+
+// CHECK: *** Dumping AST Record Layout
+// CHECK-NEXT:          0 | struct test2::E
+// CHECK-NEXT:          0 |   int e
+// CHECK-NEXT:            | [sizeof=4, align=4,
+// CHECK-NEXT:            |  nvsize=4, nvalign=4]
+
+// CHECK: *** Dumping AST Record Layout
+// CHECK-NEXT:          0 | struct test2::F
+// CHECK-NEXT:          0 |   struct test2::D (base) (empty)
+// CHECK-NEXT:          0 |   struct test2::E (base)
+// CHECK-NEXT:          0 |     int e
+// CHECK-NEXT:            | [sizeof=4, align=4,
+// CHECK-NEXT:            |  nvsize=4, nvalign=4]
+
+// CHECK: *** Dumping AST Record Layout
+// CHECK-NEXT:          0 | struct test2::G
+// CHECK-NEXT:          0 |   struct test2::C (base)
+// CHECK-NEXT:          0 |     struct test2::A (base)
+// CHECK-NEXT:          0 |       int a
+// CHECK-NEXT:          4 |     struct test2::B b (empty)
+// CHECK-NEXT:          8 |   struct test2::F (base)
+// CHECK-NEXT:          8 |     struct test2::D (base) (empty)
+// CHECK-NEXT:          8 |     struct test2::E (base)
+// CHECK-NEXT:          8 |       int e
+// CHECK-NEXT:            | [sizeof=12, align=4,
+// CHECK-NEXT:            |  nvsize=12, nvalign=4]
+}
+
+namespace test3 {
+struct A {
+  int a;
+};
+struct B {};
+struct C : A {
+  B b;
+};
+
+struct D {};
+struct E {
+  int e;
+};
+struct F : D, E {};
+
+struct __declspec(empty_bases) G : C, F {};
+
+int _ = sizeof(G);
+
+// CHECK: *** Dumping AST Record Layout
+// CHECK-NEXT:          0 | struct test3::A
+// CHECK-NEXT:          0 |   int a
+// CHECK-NEXT:            | [sizeof=4, align=4,
+// CHECK-NEXT:            |  nvsize=4, nvalign=4]
+
+// CHECK: *** Dumping AST Record Layout
+// CHECK-NEXT:          0 | struct test3::B (empty)
+// CHECK-NEXT:            | [sizeof=1, align=1,
+// CHECK-NEXT:            |  nvsize=0, nvalign=1]
+
+// CHECK: *** Dumping AST Record Layout
+// CHECK-NEXT:          0 | struct test3::C
+// CHECK-NEXT:          0 |   struct test3::A (base)
+// CHECK-NEXT:          0 |     int a
+// CHECK-NEXT:          4 |   struct test3::B b (empty)
+// CHECK-NEXT:            | [sizeof=8, align=4,
+// CHECK-NEXT:            |  nvsize=8, nvalign=4]
+
+// CHECK: *** Dumping AST Record Layout
+// CHECK-NEXT:          0 | struct test3::D (empty)
+// CHECK-NEXT:            | [sizeof=1, align=1,
+// CHECK-NEXT:            |  nvsize=0, nvalign=1]
+
+// CHECK: *** Dumping AST Record Layout
+// CHECK-NEXT:          0 | struct test3::E
+// CHECK-NEXT:          0 |   int e
+// CHECK-NEXT:            | [sizeof=4, align=4,
+// CHECK-NEXT:            |  nvsize=4, nvalign=4]
+
+// CHECK: *** Dumping AST Record Layout
+// CHECK-NEXT:          0 | struct test3::F
+// CHECK-NEXT:          0 |   struct test3::D (base) (empty)
+// CHECK-NEXT:          0 |   struct test3::E (base)
+// CHECK-NEXT:          0 |     int e
+// CHECK-NEXT:            | [sizeof=4, align=4,
+// CHECK-NEXT:            |  nvsize=4, nvalign=4]
+
+// CHECK: *** Dumping AST Record Layout
+// CHECK-NEXT:          0 | struct test3::G
+// CHECK-NEXT:          0 |   struct test3::C (base)
+// CHECK-NEXT:          0 |     struct test3::A (base)
+// CHECK-NEXT:          0 |       int a
+// CHECK-NEXT:          4 |     struct test3::B b (empty)
+// CHECK-NEXT:          8 |   struct test3::F (base)
+// CHECK-NEXT:          8 |     struct test3::D (base) (empty)
+// CHECK-NEXT:          8 |     struct test3::E (base)
+// CHECK-NEXT:          8 |       int e
+// CHECK-NEXT:            | [sizeof=12, align=4,
+// CHECK-NEXT:            |  nvsize=12, nvalign=4]
+}
+
+namespace test4 {
+struct A {
+  int a;
+};
+struct B {};
+struct C : A {
+  B b;
+};
+
+struct __declspec(empty_bases) D {};
+struct E {
+  int e;
+};
+struct F : D, E {};
+
+struct G : C, F {};
+
+int _ = sizeof(G);
+
+// CHECK: *** Dumping AST Record Layout
+// CHECK-NEXT:          0 | struct test4::A
+// CHECK-NEXT:          0 |   int a
+// CHECK-NEXT:            | [sizeof=4, align=4,
+// CHECK-NEXT:            |  nvsize=4, nvalign=4]
+
+// CHECK: *** Dumping AST Record Layout
+// CHECK-NEXT:          0 | struct test4::B (empty)
+// CHECK-NEXT:            | [sizeof=1, align=1,
+// CHECK-NEXT:            |  nvsize=0, nvalign=1]
+
+// CHECK: *** Dumping AST Record Layout
+// CHECK-NEXT:          0 | struct test4::C
+// CHECK-NEXT:          0 |   struct test4::A (base)
+// CHECK-NEXT:          0 |     int a
+// CHECK-NEXT:          4 |   struct test4::B b (empty)
+// CHECK-NEXT:            | [sizeof=8, align=4,
+// CHECK-NEXT:            |  nvsize=8, nvalign=4]
+
+// CHECK: *** Dumping AST Record Layout
+// CHECK-NEXT:          0 | struct test4::D (empty)
+// CHECK-NEXT:            | [sizeof=1, align=1,
+// CHECK-NEXT:            |  nvsize=0, nvalign=1]
+
+// CHECK: *** Dumping AST Record Layout
+// CHECK-NEXT:          0 | struct test4::E
+// CHECK-NEXT:          0 |   int e
+// CHECK-NEXT:            | [sizeof=4, align=4,
+// CHECK-NEXT:            |  nvsize=4, nvalign=4]
+
+// CHECK: *** Dumping AST Record Layout
+// CHECK-NEXT:          0 | struct test4::F
+// CHECK-NEXT:          0 |   struct test4::D (base) (empty)
+// CHECK-NEXT:          0 |   struct test4::E (base)
+// CHECK-NEXT:          0 |     int e
+// CHECK-NEXT:            | [sizeof=4, align=4,
+// CHECK-NEXT:            |  nvsize=4, nvalign=4]
+
+// CHECK: *** Dumping AST Record Layout
+// CHECK-NEXT:          0 | struct test4::G
+// CHECK-NEXT:          0 |   struct test4::C (base)
+// CHECK-NEXT:          0 |     struct test4::A (base)
+// CHECK-NEXT:          0 |       int a
+// CHECK-NEXT:          4 |     struct test4::B b (empty)
+// CHECK-NEXT:          8 |   struct test4::F (base)
+// CHECK-NEXT:          8 |     struct test4::D (base) (empty)
+// CHECK-NEXT:          8 |     struct test4::E (base)
+// CHECK-NEXT:          8 |       int e
+// CHECK-NEXT:            | [sizeof=12, align=4,
+// CHECK-NEXT:            |  nvsize=12, nvalign=4]
+}
diff --git a/test/Lexer/Inputs/case-insensitive-include.h b/test/Lexer/Inputs/case-insensitive-include.h
new file mode 100644
index 0000000000000..954090fc93b45
--- /dev/null
+++ b/test/Lexer/Inputs/case-insensitive-include.h
@@ -0,0 +1,8 @@
+#ifndef CASE_INSENSITIVE_INCLUDE_H
+#define CASE_INSENSITIVE_INCLUDE_H
+
+struct S {
+  int x;
+};
+
+#endif
diff --git a/test/Lexer/case-insensitive-include-ms.c b/test/Lexer/case-insensitive-include-ms.c
new file mode 100644
index 0000000000000..86bd8bba68ebd
--- /dev/null
+++ b/test/Lexer/case-insensitive-include-ms.c
@@ -0,0 +1,18 @@
+// REQUIRES: case-insensitive-filesystem
+
+// RUN: mkdir -p %T/apath
+// RUN: cp %S/Inputs/case-insensitive-include.h %T
+// RUN: cd %T
+// RUN: %clang_cc1 -fsyntax-only -fms-compatibility %s -include %s -I %T -verify
+// RUN: %clang_cc1 -fsyntax-only -fms-compatibility -fdiagnostics-parseable-fixits %s -include %s -I %T 2>&1 | FileCheck %s
+
+#include "..\Output\.\case-insensitive-include.h"
+#include "..\Output\.\Case-Insensitive-Include.h" // expected-warning {{non-portable path}}
+// CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:10-[[@LINE-1]]:50}:"\"..\\Output\\.\\case-insensitive-include.h\""
+#include "..\output\.\case-insensitive-include.h" // expected-warning {{non-portable path}}
+// CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:10-[[@LINE-1]]:50}:"\"..\\Output\\.\\case-insensitive-include.h\""
+
+#include "apath\..\.\case-insensitive-include.h"
+#include "apath\..\.\Case-Insensitive-Include.h" // expected-warning {{non-portable path}}
+// CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:10-[[@LINE-1]]:49}:"\"apath\\..\\.\\case-insensitive-include.h\""
+#include "APath\..\.\case-insensitive-include.h" // For the sake of efficiency, this case is not diagnosed. :-(
diff --git a/test/Lexer/case-insensitive-include.c b/test/Lexer/case-insensitive-include.c
new file mode 100644
index 0000000000000..13e5b5994269d
--- /dev/null
+++ b/test/Lexer/case-insensitive-include.c
@@ -0,0 +1,35 @@
+// REQUIRES: case-insensitive-filesystem
+
+// RUN: mkdir -p %T/apath
+// RUN: mkdir -p %T/asystempath
+// RUN: cp %S/Inputs/case-insensitive-include.h %T
+// RUN: cp %S/Inputs/case-insensitive-include.h %T/asystempath/case-insensitive-include2.h
+// RUN: cd %T
+// RUN: %clang_cc1 -fsyntax-only %s -include %s -I %T -isystem %T/asystempath -verify
+// RUN: %clang_cc1 -fsyntax-only -fdiagnostics-parseable-fixits %s -include %s -I %T -isystem %T/asystempath 2>&1 | FileCheck %s
+
+// Known standard header, so warn:
+#include <StdDef.h> // expected-warning {{non-portable path}}
+// CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:10-[[@LINE-1]]:20}:"<stddef.h>"
+
+#include "case-insensitive-include.h"
+#include "Case-Insensitive-Include.h" // expected-warning {{non-portable path}}
+// CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:10-[[@LINE-1]]:38}:"\"case-insensitive-include.h\""
+
+#include "../Output/./case-insensitive-include.h"
+#include "../Output/./Case-Insensitive-Include.h" // expected-warning {{non-portable path}}
+// CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:10-[[@LINE-1]]:50}:"\"../Output/./case-insensitive-include.h\""
+#include "../output/./case-insensitive-include.h" // expected-warning {{non-portable path}}
+// CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:10-[[@LINE-1]]:50}:"\"../Output/./case-insensitive-include.h\""
+
+#include "apath/.././case-insensitive-include.h"
+#include "apath/.././Case-Insensitive-Include.h" // expected-warning {{non-portable path}}
+// CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:10-[[@LINE-1]]:49}:"\"apath/.././case-insensitive-include.h\""
+#include "APath/.././case-insensitive-include.h" // For the sake of efficiency, this case is not diagnosed. :-(
+
+#include "../Output/./apath/.././case-insensitive-include.h"
+#include "../Output/./APath/.././case-insensitive-include.h" // For the sake of efficiency, this case is not diagnosed. :-(
+#include "../output/./apath/.././case-insensitive-include.h" // expected-warning {{non-portable path}}
+// CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:10-[[@LINE-1]]:61}:"\"../Output/./apath/.././case-insensitive-include.h\""
+
+#include "CASE-INSENSITIVE-INCLUDE2.H" // Found in an -isystem directory. No warning.
diff --git a/test/Lexer/case-insensitive-system-include.c b/test/Lexer/case-insensitive-system-include.c
new file mode 100644
index 0000000000000..9d5289cda2da2
--- /dev/null
+++ b/test/Lexer/case-insensitive-system-include.c
@@ -0,0 +1,10 @@
+// REQUIRES: case-insensitive-filesystem
+
+// RUN: mkdir -p %T/asystempath
+// RUN: cp %S/Inputs/case-insensitive-include.h %T/asystempath/
+// RUN: cd %T
+// RUN: %clang_cc1 -fsyntax-only %s -include %s -isystem %T/asystempath -verify -Wnonportable-system-include-path
+// RUN: %clang_cc1 -fsyntax-only -fdiagnostics-parseable-fixits %s -include %s -isystem %T/asystempath -Wnonportable-system-include-path 2>&1 | FileCheck %s
+
+#include "CASE-INSENSITIVE-INCLUDE.H" // expected-warning {{non-portable path}}
+// CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:10-[[@LINE-1]]:38}:"\"case-insensitive-include.h\""
diff --git a/test/Lexer/cxx-features.cpp b/test/Lexer/cxx-features.cpp
index 6c4a092b1c48f..e047ec3dfb13b 100644
--- a/test/Lexer/cxx-features.cpp
+++ b/test/Lexer/cxx-features.cpp
@@ -1,132 +1,137 @@
 // RUN: %clang_cc1 -std=c++98 -verify %s
 // RUN: %clang_cc1 -std=c++11 -verify %s
 // RUN: %clang_cc1 -std=c++1y -fsized-deallocation -verify %s
-// RUN: %clang_cc1 -std=c++1y -fsized-deallocation -fconcepts-ts -DCONCEPTS_TS=1 -verify %s
+// RUN: %clang_cc1 -std=c++14 -fsized-deallocation -verify %s
+// RUN: %clang_cc1 -std=c++1z -fsized-deallocation -verify %s
+// RUN: %clang_cc1 -std=c++1z -fsized-deallocation -fconcepts-ts -DCONCEPTS_TS=1 -verify %s
 // RUN: %clang_cc1 -fcoroutines -DCOROUTINES -verify %s
 
 // expected-no-diagnostics
 
+// FIXME using `defined` in a macro has undefined behavior.
 #if __cplusplus < 201103L
-#define check(macro, cxx98, cxx11, cxx1y) cxx98 == 0 ? defined(__cpp_##macro) : __cpp_##macro != cxx98
-#elif __cplusplus < 201304L
-#define check(macro, cxx98, cxx11, cxx1y) cxx11 == 0 ? defined(__cpp_##macro) : __cpp_##macro != cxx11
+#define check(macro, cxx98, cxx11, cxx14, cxx1z) cxx98 == 0 ? defined(__cpp_##macro) : __cpp_##macro != cxx98
+#elif __cplusplus < 201402L
+#define check(macro, cxx98, cxx11, cxx14, cxx1z) cxx11 == 0 ? defined(__cpp_##macro) : __cpp_##macro != cxx11
+#elif __cplusplus < 201406L
+#define check(macro, cxx98, cxx11, cxx14, cxx1z) cxx14 == 0 ? defined(__cpp_##macro) : __cpp_##macro != cxx14
 #else
-#define check(macro, cxx98, cxx11, cxx1y) cxx1y == 0 ? defined(__cpp_##macro) : __cpp_##macro != cxx1y
+#define check(macro, cxx98, cxx11, cxx14, cxx1z) cxx1z == 0 ? defined(__cpp_##macro) : __cpp_##macro != cxx1z
 #endif
 
-#if check(binary_literals, 0, 0, 201304)
+#if check(binary_literals, 0, 0, 201304, 201304)
 #error "wrong value for __cpp_binary_literals"
 #endif
 
-#if check(digit_separators, 0, 0, 201309)
+#if check(digit_separators, 0, 0, 201309, 201309)
 #error "wrong value for __cpp_digit_separators"
 #endif
 
-#if check(init_captures, 0, 0, 201304)
+#if check(init_captures, 0, 0, 201304, 201304)
 #error "wrong value for __cpp_init_captures"
 #endif
 
-#if check(generic_lambdas, 0, 0, 201304)
+#if check(generic_lambdas, 0, 0, 201304, 201304)
 #error "wrong value for __cpp_generic_lambdas"
 #endif
 
-#if check(sized_deallocation, 0, 0, 201309)
+#if check(sized_deallocation, 0, 0, 201309, 201309)
 #error "wrong value for __cpp_sized_deallocation"
 #endif
 
-#if check(constexpr, 0, 200704, 201304)
+#if check(constexpr, 0, 200704, 201304, 201304)
 #error "wrong value for __cpp_constexpr"
 #endif
 
-#if check(decltype_auto, 0, 0, 201304)
+#if check(decltype_auto, 0, 0, 201304, 201304)
 #error "wrong value for __cpp_decltype_auto"
 #endif
 
-#if check(return_type_deduction, 0, 0, 201304)
+#if check(return_type_deduction, 0, 0, 201304, 201304)
 #error "wrong value for __cpp_return_type_deduction"
 #endif
 
-#if check(runtime_arrays, 0, 0, 0)
+#if check(runtime_arrays, 0, 0, 0, 0)
 #error "wrong value for __cpp_runtime_arrays"
 #endif
 
-#if check(aggregate_nsdmi, 0, 0, 201304)
+#if check(aggregate_nsdmi, 0, 0, 201304, 201304)
 #error "wrong value for __cpp_aggregate_nsdmi"
 #endif
 
-#if check(variable_templates, 0, 0, 201304)
+#if check(variable_templates, 0, 0, 201304, 201304)
 #error "wrong value for __cpp_variable_templates"
 #endif
 
-#if check(unicode_characters, 0, 200704, 200704)
+#if check(unicode_characters, 0, 200704, 200704, 200704)
 #error "wrong value for __cpp_unicode_characters"
 #endif
 
-#if check(raw_strings, 0, 200710, 200710)
+#if check(raw_strings, 0, 200710, 200710, 200710)
 #error "wrong value for __cpp_raw_strings"
 #endif
 
-#if check(unicode_literals, 0, 200710, 200710)
+#if check(unicode_literals, 0, 200710, 200710, 200710)
 #error "wrong value for __cpp_unicode_literals"
 #endif
 
-#if check(user_defined_literals, 0, 200809, 200809)
+#if check(user_defined_literals, 0, 200809, 200809, 200809)
 #error "wrong value for __cpp_user_defined_literals"
 #endif
 
-#if check(lambdas, 0, 200907, 200907)
+#if check(lambdas, 0, 200907, 200907, 200907)
 #error "wrong value for __cpp_lambdas"
 #endif
 
-#if check(range_based_for, 0, 200907, 200907)
+#if check(range_based_for, 0, 200907, 200907, 200907)
 #error "wrong value for __cpp_range_based_for"
 #endif
 
-#if check(static_assert, 0, 200410, 200410)
+#if check(static_assert, 0, 200410, 200410, 200410)
 #error "wrong value for __cpp_static_assert"
 #endif
 
-#if check(decltype, 0, 200707, 200707)
+#if check(decltype, 0, 200707, 200707, 200707)
 #error "wrong value for __cpp_decltype"
 #endif
 
-#if check(attributes, 0, 200809, 200809)
+#if check(attributes, 0, 200809, 200809, 200809)
 #error "wrong value for __cpp_attributes"
 #endif
 
-#if check(rvalue_references, 0, 200610, 200610)
+#if check(rvalue_references, 0, 200610, 200610, 200610)
 #error "wrong value for __cpp_rvalue_references"
 #endif
 
-#if check(variadic_templates, 0, 200704, 200704)
+#if check(variadic_templates, 0, 200704, 200704, 200704)
 #error "wrong value for __cpp_variadic_templates"
 #endif
 
-#if check(initializer_lists, 0, 200806, 200806)
+#if check(initializer_lists, 0, 200806, 200806, 200806)
 #error "wrong value for __cpp_initializer_lists"
 #endif
 
-#if check(delegating_constructors, 0, 200604, 200604)
+#if check(delegating_constructors, 0, 200604, 200604, 200604)
 #error "wrong value for __cpp_delegating_constructors"
 #endif
 
-#if check(nsdmi, 0, 200809, 200809)
+#if check(nsdmi, 0, 200809, 200809, 200809)
 #error "wrong value for __cpp_nsdmi"
 #endif
 
-#if check(inheriting_constructors, 0, 200802, 200802)
+#if check(inheriting_constructors, 0, 200802, 200802, 200802)
 #error "wrong value for __cpp_inheriting_constructors"
 #endif
 
-#if check(ref_qualifiers, 0, 200710, 200710)
+#if check(ref_qualifiers, 0, 200710, 200710, 200710)
 #error "wrong value for __cpp_ref_qualifiers"
 #endif
 
-#if check(alias_templates, 0, 200704, 200704)
+#if check(alias_templates, 0, 200704, 200704, 200704)
 #error "wrong value for __cpp_alias_templates"
 #endif
 
-#if check(experimental_concepts, 0, 0, CONCEPTS_TS)
+#if check(experimental_concepts, 0, 0, CONCEPTS_TS, CONCEPTS_TS)
 #error "wrong value for __cpp_experimental_concepts"
 #endif
 
diff --git a/test/Lexer/cxx1y_digit_separators.cpp b/test/Lexer/cxx1y_digit_separators.cpp
index c4c6aee963da0..55366342eda0d 100644
--- a/test/Lexer/cxx1y_digit_separators.cpp
+++ b/test/Lexer/cxx1y_digit_separators.cpp
@@ -48,6 +48,9 @@ namespace floating {
   float r = 0.'0e1; // expected-error {{digit separator cannot appear at start of digit sequence}}
   float s = 0.0'e1; // expected-error {{digit separator cannot appear at end of digit sequence}}
   float t = 0.0e'1; // expected-error {{digit separator cannot appear at start of digit sequence}}
+  float u = 0x.'p1f; // expected-error {{hexadecimal floating literal requires a significand}}
+  float v = 0e'f; // expected-error {{exponent has no digits}}
+  float w = 0x0p'f; // expected-error {{exponent has no digits}}
 }
 
 #line 123'456
diff --git a/test/Lexer/eof-conflict-marker.c b/test/Lexer/eof-conflict-marker.c
new file mode 100644
index 0000000000000..e0c35401ccbf6
--- /dev/null
+++ b/test/Lexer/eof-conflict-marker.c
@@ -0,0 +1,11 @@
+// RUN: %clang_cc1 %s -verify -fsyntax-only
+// vim: set binary noeol:
+
+// This file intentionally ends without a \n on the last line.  Make sure your
+// editor doesn't add one.
+
+>>>> ORIGINAL
+// expected-error@-1 {{version control conflict marker in file}}
+<<<<
+// expected-error@-1 {{expected identifier or '('}}
+<<<<
+\ No newline at end of file
diff --git a/test/Lexer/half-literal.cpp b/test/Lexer/half-literal.cpp
new file mode 100644
index 0000000000000..32af3faef9aa1
--- /dev/null
+++ b/test/Lexer/half-literal.cpp
@@ -0,0 +1,3 @@
+// RUN: %clang_cc1 -fsyntax-only -verify -pedantic %s
+float a = 1.0h; // expected-error{{invalid suffix 'h' on floating constant}}
+float b = 1.0H; // expected-error{{invalid suffix 'H' on floating constant}}
diff --git a/test/Lexer/has_feature_efficiency_sanitizer.cpp b/test/Lexer/has_feature_efficiency_sanitizer.cpp
new file mode 100644
index 0000000000000..ef9e273602558
--- /dev/null
+++ b/test/Lexer/has_feature_efficiency_sanitizer.cpp
@@ -0,0 +1,12 @@
+// RUN: %clang_cc1 -E -fsanitize=efficiency-cache-frag %s -o - | FileCheck --check-prefix=CHECK-ESAN %s
+// RUN: %clang_cc1 -E -fsanitize=efficiency-working-set %s -o - | FileCheck --check-prefix=CHECK-ESAN %s
+// RUN: %clang_cc1 -E  %s -o - | FileCheck --check-prefix=CHECK-NO-ESAN %s
+
+#if __has_feature(efficiency_sanitizer)
+int EfficiencySanitizerEnabled();
+#else
+int EfficiencySanitizerDisabled();
+#endif
+
+// CHECK-ESAN: EfficiencySanitizerEnabled
+// CHECK-NO-ESAN: EfficiencySanitizerDisabled
diff --git a/test/Lexer/hexfloat.cpp b/test/Lexer/hexfloat.cpp
index 6985c7fbe2a49..163db72f56f29 100644
--- a/test/Lexer/hexfloat.cpp
+++ b/test/Lexer/hexfloat.cpp
@@ -1,15 +1,31 @@
-// RUN: %clang_cc1 -fsyntax-only -verify -pedantic %s
+// RUN: %clang_cc1 -std=c++98 -fsyntax-only -verify -pedantic %s
 // RUN: %clang_cc1 -std=c++11 -fsyntax-only -verify -pedantic %s
-float f = 0x1p+1; // expected-warning{{hexadecimal floating constants are a C99 feature}}
-double e = 0x.p0; //expected-error{{hexadecimal floating constants require a significand}}
-double d = 0x.2p2; // expected-warning{{hexadecimal floating constants are a C99 feature}}
-float g = 0x1.2p2; // expected-warning{{hexadecimal floating constants are a C99 feature}}
-double h = 0x1.p2; // expected-warning{{hexadecimal floating constants are a C99 feature}}
+// RUN: %clang_cc1 -std=c++14 -fsyntax-only -verify -pedantic %s
+// RUN: %clang_cc1 -std=c++1z -fsyntax-only -verify -pedantic %s
+double e = 0x.p0; // expected-error-re {{hexadecimal floating {{constant|literal}} requires a significand}}
+
+float f = 0x1p+1;
+double d = 0x.2p2;
+float g = 0x1.2p2;
+double h = 0x1.p2;
+#if __cplusplus <= 201402L
+// expected-warning@-5 {{hexadecimal floating literals are a C++1z feature}}
+// expected-warning@-5 {{hexadecimal floating literals are a C++1z feature}}
+// expected-warning@-5 {{hexadecimal floating literals are a C++1z feature}}
+// expected-warning@-5 {{hexadecimal floating literals are a C++1z feature}}
+#endif
 
 // PR12717: In order to minimally diverge from the C++ standard, we do not lex
 // 'p[+-]' as part of a pp-number unless the token starts 0x and doesn't contain
 // an underscore.
-double i = 0p+3; // expected-error{{invalid suffix 'p' on integer constant}}
+double i = 0p+3; // expected-error {{invalid suffix 'p' on integer constant}}
 #define PREFIX(x) foo ## x
 double foo0p = 1, j = PREFIX(0p+3); // ok
-double k = 0x42_amp+3; // expected-error-re{{{{invalid suffix '_amp' on integer constant|no matching literal operator for call to 'operator""_amp'}}}}
+double k = 0x42_amp+3;
+#if __cplusplus > 201402L
+// expected-error@-2 {{no matching literal operator for call to 'operator""_amp+3'}}
+#elif __cplusplus >= 201103L
+// expected-error@-4 {{no matching literal operator for call to 'operator""_amp'}}
+#else
+// expected-error@-6 {{invalid suffix '_amp' on integer constant}}
+#endif
diff --git a/test/Lexer/opencl-half-literal.cl b/test/Lexer/opencl-half-literal.cl
new file mode 100644
index 0000000000000..42ca5146b1fb5
--- /dev/null
+++ b/test/Lexer/opencl-half-literal.cl
@@ -0,0 +1,10 @@
+// RUN: %clang_cc1 %s -fsyntax-only -verify -triple spir-unknown-unknown
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+constant half a = 1.0h; 
+constant half aa = 1.0H;
+constant half b = 1.0hh; // expected-error{{invalid suffix 'hh' on floating constant}}
+constant half c = 1.0fh; // expected-error{{invalid suffix 'fh' on floating constant}}
+constant half d = 1.0lh; // expected-error{{invalid suffix 'lh' on floating constant}}
+constant half e = 1.0hf; // expected-error{{invalid suffix 'hf' on floating constant}}
diff --git a/test/Makefile b/test/Makefile
deleted file mode 100644
index 5cb8a8b71a419..0000000000000
--- a/test/Makefile
+++ /dev/null
@@ -1,77 +0,0 @@
-CLANG_LEVEL := ..
-include $(CLANG_LEVEL)/Makefile
-
-# Test in all immediate subdirectories if unset.
-ifdef TESTSUITE
-TESTDIRS := $(TESTSUITE:%=$(PROJ_SRC_DIR)/%)
-else
-TESTDIRS ?= $(PROJ_SRC_DIR)
-endif
-
-# 'lit' wants objdir paths, so it will pick up the lit.site.cfg.
-TESTDIRS := $(TESTDIRS:$(PROJ_SRC_DIR)%=$(PROJ_OBJ_DIR)%)
-
-# Allow EXTRA_TESTDIRS to provide additional test directories.
-TESTDIRS += $(EXTRA_TESTDIRS)
-
-ifndef TESTARGS
-ifdef VERBOSE
-TESTARGS = -v
-else
-TESTARGS = -s -v
-endif
-endif
-
-# Make sure any extra test suites can find the main site config.
-LIT_ARGS := --param clang_site_config=$(PROJ_OBJ_DIR)/lit.site.cfg
-
-ifdef VG
-  LIT_ARGS += "--vg"
-endif
-
-all:: lit.site.cfg Unit/lit.site.cfg
-	@ echo '--- Running clang tests for $(TARGET_TRIPLE) ---'
-	@ $(PYTHON) $(LLVM_SRC_ROOT)/utils/lit/lit.py \
-	  $(LIT_ARGS) $(TESTARGS) $(TESTDIRS)
-
-FORCE:
-
-lit.site.cfg: FORCE
-	@echo "Making Clang 'lit.site.cfg' file..."
-	@$(ECHOPATH) s=@LLVM_SOURCE_DIR@=$(LLVM_SRC_ROOT)=g > lit.tmp
-	@$(ECHOPATH) s=@LLVM_BINARY_DIR@=$(LLVM_OBJ_ROOT)=g >> lit.tmp
-	@$(ECHOPATH) s=@LLVM_TOOLS_DIR@=$(ToolDir)=g >> lit.tmp
-	@$(ECHOPATH) s=@LLVM_LIBS_DIR@=$(LibDir)=g >> lit.tmp
-	@$(ECHOPATH) s=@CLANG_SOURCE_DIR@=$(PROJ_SRC_DIR)/..=g >> lit.tmp
-	@$(ECHOPATH) s=@CLANG_BINARY_DIR@=$(PROJ_OBJ_DIR)/..=g >> lit.tmp
-	@$(ECHOPATH) s=@CLANG_TOOLS_DIR@=$(ToolDir)=g >> lit.tmp
-	@$(ECHOPATH) s=@TARGET_TRIPLE@=$(TARGET_TRIPLE)=g >> lit.tmp
-	@$(ECHOPATH) s=@LLVM_HOST_TRIPLE@=$(HOST_TRIPLE)=g >> lit.tmp
-	@$(ECHOPATH) s=@ENABLE_CLANG_ARCMT@=$(ENABLE_CLANG_ARCMT)=g >> lit.tmp
-	@$(ECHOPATH) s=@ENABLE_CLANG_STATIC_ANALYZER@=$(ENABLE_CLANG_STATIC_ANALYZER)=g >> lit.tmp
-	@$(ECHOPATH) s=@ENABLE_CLANG_EXAMPLES@=$(ENABLE_CLANG_EXAMPLES)=g >> lit.tmp
-	@$(ECHOPATH) s=@ENABLE_SHARED@=$(ENABLE_SHARED)=g >> lit.tmp
-	@sed -f lit.tmp $(PROJ_SRC_DIR)/lit.site.cfg.in > $@
-	@-rm -f lit.tmp
-
-Unit/lit.site.cfg: FORCE
-	@echo "Making Clang 'Unit/lit.site.cfg' file..."
-	@$(MKDIR) $(dir $@)
-	@$(ECHOPATH) s=@LLVM_SOURCE_DIR@=$(LLVM_SRC_ROOT)=g > unit.tmp
-	@$(ECHOPATH) s=@LLVM_BINARY_DIR@=$(LLVM_OBJ_ROOT)=g >> unit.tmp
-	@$(ECHOPATH) s=@LLVM_TOOLS_DIR@=$(ToolDir)=g >> unit.tmp
-	@$(ECHOPATH) s=@LLVM_LIBS_DIR@=$(LibDir)=g >> unit.tmp
-	@$(ECHOPATH) s=@CLANG_SOURCE_DIR@=$(PROJ_SRC_DIR)/..=g >> unit.tmp
-	@$(ECHOPATH) s=@CLANG_BINARY_DIR@=$(PROJ_OBJ_DIR)/..=g >> unit.tmp
-	@$(ECHOPATH) s=@TARGET_TRIPLE@=$(TARGET_TRIPLE)=g >> unit.tmp
-	@$(ECHOPATH) s=@LLVM_BUILD_MODE@=$(BuildMode)=g >> unit.tmp
-	@$(ECHOPATH) s=@ENABLE_SHARED@=$(ENABLE_SHARED)=g >> unit.tmp
-	@$(ECHOPATH) s=@SHLIBDIR@=$(SharedLibDir)=g >> unit.tmp
-	@$(ECHOPATH) s=@SHLIBPATH_VAR@=$(SHLIBPATH_VAR)=g >> unit.tmp
-	@sed -f unit.tmp $(PROJ_SRC_DIR)/Unit/lit.site.cfg.in > $@
-	@-rm -f unit.tmp
-
-clean::
-	@ find . -name Output | xargs rm -fr
-
-.PHONY: all report clean
diff --git a/test/Misc/amdgcn.languageOptsOpenCL.cl b/test/Misc/amdgcn.languageOptsOpenCL.cl
new file mode 100644
index 0000000000000..3d1f9b4d98266
--- /dev/null
+++ b/test/Misc/amdgcn.languageOptsOpenCL.cl
@@ -0,0 +1,200 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -x cl -cl-std=CL %s -verify -triple amdgcn-unknown-unknown
+// RUN: %clang_cc1 -x cl -cl-std=CL1.1 %s -verify -triple amdgcn-unknown-unknown
+// RUN: %clang_cc1 -x cl -cl-std=CL1.2 %s -verify -triple amdgcn-unknown-unknown
+// RUN: %clang_cc1 -x cl -cl-std=CL2.0 %s -verify -triple amdgcn-unknown-unknown
+// RUN: %clang_cc1 -x cl -cl-std=CL %s -verify -triple amdgcn-unknown-unknown -Wpedantic-core-features -DTEST_CORE_FEATURES
+// RUN: %clang_cc1 -x cl -cl-std=CL1.1 %s -verify -triple amdgcn-unknown-unknown -Wpedantic-core-features -DTEST_CORE_FEATURES
+// RUN: %clang_cc1 -x cl -cl-std=CL1.2 %s -verify -triple amdgcn-unknown-unknown -Wpedantic-core-features -DTEST_CORE_FEATURES
+// RUN: %clang_cc1 -x cl -cl-std=CL2.0 %s -verify -triple amdgcn-unknown-unknown -Wpedantic-core-features -DTEST_CORE_FEATURES
+
+// Extensions in all versions
+#ifndef cl_clang_storage_class_specifiers
+#error "Missing cl_clang_storage_class_specifiers define"
+#endif
+#pragma OPENCL EXTENSION cl_clang_storage_class_specifiers: enable
+
+#ifndef cl_khr_fp16
+#error "Missing cl_khr_fp16 define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_fp16: enable
+
+#ifndef cl_khr_int64_base_atomics
+#error "Missing cl_khr_int64_base_atomics define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics: enable
+
+#ifndef cl_khr_int64_extended_atomics
+#error "Missing cl_khr_int64_extended_atomics define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics: enable
+
+#ifdef cl_khr_gl_sharing
+#error "Incorrect cl_khr_gl_sharing define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_gl_sharing: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_gl_sharing' - ignoring}}
+
+#ifndef cl_khr_icd
+#error "Missing cl_khr_icd define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_icd: enable
+
+// Core features in CL 1.1
+
+#ifndef cl_khr_byte_addressable_store
+#error "Missing cl_khr_byte_addressable_store define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_byte_addressable_store: enable
+#if (__OPENCL_C_VERSION__ >= 110) && defined TEST_CORE_FEATURES
+// expected-warning@-2{{OpenCL extension 'cl_khr_byte_addressable_store' is core feature or supported optional core feature - ignoring}}
+#endif
+
+#ifndef cl_khr_global_int32_base_atomics
+#error "Missing cl_khr_global_int32_base_atomics define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics: enable
+#if (__OPENCL_C_VERSION__ >= 110) && defined TEST_CORE_FEATURES
+// expected-warning@-2{{OpenCL extension 'cl_khr_global_int32_base_atomics' is core feature or supported optional core feature - ignoring}}
+#endif
+
+#ifndef cl_khr_global_int32_extended_atomics
+#error "Missing cl_khr_global_int32_extended_atomics define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics: enable
+#if (__OPENCL_C_VERSION__ >= 110) && defined TEST_CORE_FEATURES
+// expected-warning@-2{{OpenCL extension 'cl_khr_global_int32_extended_atomics' is core feature or supported optional core feature - ignoring}}
+#endif
+
+#ifndef cl_khr_local_int32_base_atomics
+#error "Missing cl_khr_local_int32_base_atomics define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics: enable
+#if (__OPENCL_C_VERSION__ >= 110) && defined TEST_CORE_FEATURES
+// expected-warning@-2{{OpenCL extension 'cl_khr_local_int32_base_atomics' is core feature or supported optional core feature - ignoring}}
+#endif
+
+#ifndef cl_khr_local_int32_extended_atomics
+#error "Missing cl_khr_local_int32_extended_atomics define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics: enable
+#if (__OPENCL_C_VERSION__ >= 110) && defined TEST_CORE_FEATURES
+// expected-warning@-2{{OpenCL extension 'cl_khr_local_int32_extended_atomics' is core feature or supported optional core feature - ignoring}}
+#endif
+
+#ifdef cl_khr_select_fprounding_mode
+#error "Incorrect cl_khr_select_fprounding_mode define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_select_fprounding_mode: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_select_fprounding_mode' - ignoring}}
+
+
+// Core feature in CL 1.2
+#ifndef cl_khr_fp64
+#error "Missing cl_khr_fp64 define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_fp64: enable
+#if (__OPENCL_C_VERSION__ >= 120) && defined TEST_CORE_FEATURES
+// expected-warning@-2{{OpenCL extension 'cl_khr_fp64' is core feature or supported optional core feature - ignoring}}
+#endif
+
+//Core feature in CL 2.0
+#ifndef cl_khr_3d_image_writes
+#error "Missing cl_khr_3d_image_writes define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_3d_image_writes: enable
+#if (__OPENCL_C_VERSION__ >= 200) && defined TEST_CORE_FEATURES
+// expected-warning@-2{{OpenCL extension 'cl_khr_3d_image_writes' is core feature or supported optional core feature - ignoring}}
+#endif
+
+
+
+#ifdef cl_khr_gl_event
+#error "Incorrect cl_khr_gl_event define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_gl_event: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_gl_event' - ignoring}}
+
+#ifdef cl_khr_d3d10_sharing
+#error "Incorrect cl_khr_d3d10_sharing define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_d3d10_sharing: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_d3d10_sharing' - ignoring}}
+
+#ifdef cl_khr_context_abort
+#error "Incorrect cl_context_abort define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_context_abort: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_context_abort' - ignoring}}
+
+#ifdef cl_khr_d3d11_sharing
+#error "Incorrect cl_khr_d3d11_sharing define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_d3d11_sharing: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_d3d11_sharing' - ignoring}}
+
+#ifdef cl_khr_dx9_media_sharing
+#error "Incorrect cl_khr_dx9_media_sharing define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_dx9_media_sharing: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_dx9_media_sharing' - ignoring}}
+
+#ifdef cl_khr_image2d_from_buffer
+#error "Incorrect cl_khr_image2d_from_buffer define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_image2d_from_buffer: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_image2d_from_buffer' - ignoring}}
+
+#ifdef cl_khr_initialize_memory
+#error "Incorrect cl_khr_initialize_memory define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_initialize_memory: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_initialize_memory' - ignoring}}
+
+#ifdef cl_khr_gl_depth_images
+#error "Incorrect cl_khr_gl_depth_images define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_gl_depth_images: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_gl_depth_images' - ignoring}}
+
+#ifdef cl_khr_gl_msaa_sharing
+#error "Incorrect cl_khr_gl_msaa_sharing define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_gl_msaa_sharing: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_gl_msaa_sharing' - ignoring}}
+
+#ifdef cl_khr_spir
+#error "Incorrect cl_khr_spir define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_spir: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_spir' - ignoring}}
+
+#ifdef cl_khr_egl_event
+#error "Incorrect cl_khr_egl_event define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_egl_event: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_egl_event' - ignoring}}
+
+#ifdef cl_khr_egl_image
+#error "Incorrect cl_khr_egl_image define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_egl_image: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_egl_image' - ignoring}}
+
+#ifdef cl_khr_srgb_image_writes
+#error "Incorrect cl_khr_srgb_image_writes define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_srgb_image_writes: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_srgb_image_writes' - ignoring}}
+
+#ifdef cl_khr_subgroups
+#error "Incorrect cl_khr_subgroups define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_subgroups: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_subgroups' - ignoring}}
+
+#ifdef cl_khr_terminate_context
+#error "Incorrect cl_khr_terminate_context define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_terminate_context: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_terminate_context' - ignoring}}
diff --git a/test/Misc/ast-dump-color.cpp b/test/Misc/ast-dump-color.cpp
index e93274e33acbb..852d689794001 100644
--- a/test/Misc/ast-dump-color.cpp
+++ b/test/Misc/ast-dump-color.cpp
@@ -34,7 +34,7 @@ struct Invalid {
 //CHECK: {{^}}[[Blue]]|-[[RESET]][[GREEN]]TypedefDecl[[RESET]][[Yellow]] 0x{{[0-9a-fA-F]*}}[[RESET]] <[[Yellow]]<invalid sloc>[[RESET]]> [[Yellow]]<invalid sloc>[[RESET]] implicit[[CYAN]] __uint128_t[[RESET]] [[Green]]'unsigned __int128'[[RESET]]{{$}}
 //CHECK: {{^}}[[Blue]]|-[[RESET]][[GREEN]]TypedefDecl[[RESET]][[Yellow]] 0x{{[0-9a-fA-F]*}}[[RESET]] <[[Yellow]]<invalid sloc>[[RESET]]> [[Yellow]]<invalid sloc>[[RESET]] implicit[[CYAN]] __builtin_va_list[[RESET]] [[Green]]'struct __va_list_tag [1]'[[RESET]]{{$}}
 //CHECK: {{^}}[[Blue]]|-[[RESET]][[GREEN]]VarDecl[[RESET]][[Yellow]] 0x{{[0-9a-fA-F]*}}[[RESET]] <[[Yellow]]{{.*}}ast-dump-color.cpp:6:1[[RESET]], [[Yellow]]col:5[[RESET]]> [[Yellow]]col:5[[RESET]][[CYAN]] Test[[RESET]] [[Green]]'int'[[RESET]]
-//CHECK: {{^}}[[Blue]]| |-[[RESET]][[BLUE:.\[0;1;34m]]UnusedAttr[[RESET]][[Yellow]] 0x{{[0-9a-fA-F]*}}[[RESET]] <[[Yellow]]col:25[[RESET]]>{{$}}
+//CHECK: {{^}}[[Blue]]| |-[[RESET]][[BLUE:.\[0;1;34m]]UnusedAttr[[RESET]][[Yellow]] 0x{{[0-9a-fA-F]*}}[[RESET]] <[[Yellow]]col:25[[RESET]]> unused{{$}}
 //CHECK: {{^}}[[Blue]]| `-[[RESET]][[Blue]]FullComment[[RESET]][[Yellow]] 0x{{[0-9a-fA-F]*}}[[RESET]] <[[Yellow]]line:4:4[[RESET]], [[Yellow]]line:5:8[[RESET]]>{{$}}
 //CHECK: {{^}}[[Blue]]|   `-[[RESET]][[Blue]]ParagraphComment[[RESET]][[Yellow]] 0x{{[0-9a-fA-F]*}}[[RESET]] <[[Yellow]]line:4:4[[RESET]], [[Yellow]]line:5:8[[RESET]]>{{$}}
 //CHECK: {{^}}[[Blue]]|     |-[[RESET]][[Blue]]TextComment[[RESET]][[Yellow]] 0x{{[0-9a-fA-F]*}}[[RESET]] <[[Yellow]]line:4:4[[RESET]]> Text=" "{{$}}
diff --git a/test/Misc/ast-dump-decl.mm b/test/Misc/ast-dump-decl.mm
index 06ab5155110c4..be245f7ef5cdf 100644
--- a/test/Misc/ast-dump-decl.mm
+++ b/test/Misc/ast-dump-decl.mm
@@ -21,3 +21,13 @@
 // CHECK-NEXT:     CXXConstructExpr
 // CHECK-NEXT:   ObjCIvarDecl{{.*}} X
 // CHECK-NEXT:   ObjCMethodDecl{{.*}} foo
+
+// @() boxing expressions.
+template <typename T>
+struct BoxingTest {
+  static id box(T value) {
+    return @(value);
+  }
+};
+
+// CHECK: ObjCBoxedExpr{{.*}} '<dependent type>'{{$}}
diff --git a/test/Misc/ast-dump-invalid.cpp b/test/Misc/ast-dump-invalid.cpp
index 7b02ba111339e..aa6cd526929ff 100644
--- a/test/Misc/ast-dump-invalid.cpp
+++ b/test/Misc/ast-dump-invalid.cpp
@@ -34,6 +34,7 @@ int g(int i) {
 // CHECK-NEXT:   `-CompoundStmt
 // CHECK-NEXT:     `-IfStmt {{.*}} <line:25:3, line:28:12>
 // CHECK-NEXT:       |-<<<NULL>>>
+// CHECK-NEXT:       |-<<<NULL>>>
 // CHECK-NEXT:       |-OpaqueValueExpr {{.*}} <<invalid sloc>> '_Bool'
 // CHECK-NEXT:       |-ReturnStmt {{.*}} <line:26:5, col:12>
 // CHECK-NEXT:       | `-IntegerLiteral {{.*}} <col:12> 'int' 4
@@ -41,3 +42,23 @@ int g(int i) {
 // CHECK-NEXT:         `-ImplicitCastExpr {{.*}} <col:12> 'int' <LValueToRValue>
 // CHECK-NEXT:           `-DeclRefExpr {{.*}} <col:12> 'int' lvalue ParmVar {{.*}} 'i' 'int'
 
+namespace TestInvalidFunctionDecl {
+struct Str {
+   double foo1(double, invalid_type);
+};
+double Str::foo1(double, invalid_type)
+{ return 45; }
+}
+// CHECK: NamespaceDecl {{.*}} <{{.*}}> {{.*}} TestInvalidFunctionDecl
+// CHECK-NEXT: |-CXXRecordDecl {{.*}} <line:46:1, line:48:1> line:46:8 struct Str definition
+// CHECK-NEXT: | |-CXXRecordDecl {{.*}} <col:1, col:8> col:8 implicit struct Str
+// CHECK-NEXT: | `-CXXMethodDecl {{.*}} <line:47:4, col:36> col:11 invalid foo1 'double (double, int)'
+// CHECK-NEXT: |   |-ParmVarDecl {{.*}} <col:16> col:22 'double'
+// CHECK-NEXT: |   `-ParmVarDecl {{.*}} <col:24, <invalid sloc>> col:36 invalid 'int'
+// CHECK-NEXT: `-CXXMethodDecl {{.*}} parent {{.*}} <line:49:1, line:50:14> line:49:13 invalid foo1 'double (double, int)'
+// CHECK-NEXT:   |-ParmVarDecl {{.*}} <col:18> col:24 'double'
+// CHECK-NEXT:   |-ParmVarDecl {{.*}} <col:26, <invalid sloc>> col:38 invalid 'int'
+// CHECK-NEXT:   `-CompoundStmt {{.*}} <line:50:1, col:14>
+// CHECK-NEXT:     `-ReturnStmt {{.*}} <col:3, col:10>
+// CHECK-NEXT:       `-ImplicitCastExpr {{.*}} <col:10> 'double' <IntegralToFloating>
+// CHECK-NEXT:         `-IntegerLiteral {{.*}} <col:10> 'int' 45
diff --git a/test/Misc/ast-dump-pipe.cl b/test/Misc/ast-dump-pipe.cl
new file mode 100644
index 0000000000000..1690e5c17a053
--- /dev/null
+++ b/test/Misc/ast-dump-pipe.cl
@@ -0,0 +1,4 @@
+// RUN: %clang_cc1 -triple spir64 -cl-std=CL2.0 -ast-dump -ast-dump-filter pipetype %s | FileCheck -strict-whitespace %s
+typedef pipe int pipetype;
+// CHECK:      PipeType {{.*}} 'pipe int'
+// CHECK-NEXT:   BuiltinType {{.*}} 'int'
diff --git a/test/Misc/ast-print-char-literal.cpp b/test/Misc/ast-print-char-literal.cpp
index bb5daa2444da9..614b3ca9d73cb 100644
--- a/test/Misc/ast-print-char-literal.cpp
+++ b/test/Misc/ast-print-char-literal.cpp
@@ -13,6 +13,8 @@ void i() {
   h<u8'2'>();
 }
 
+char j = '\xFF';
+
 // CHECK: char c = u8'1';
 // CHECK-NEXT: char d = '1';
 // CHECK-NEXT: char e = U'1';
@@ -22,3 +24,4 @@ void i() {
 // CHECK: template <char c = u8'1'>
 
 // CHECK: h<u8'2'>();
+// CHECK: char j = '\xff';
diff --git a/test/Misc/ast-print-objectivec.m b/test/Misc/ast-print-objectivec.m
index ef0fcaa9e32f6..6293b43f55e42 100644
--- a/test/Misc/ast-print-objectivec.m
+++ b/test/Misc/ast-print-objectivec.m
@@ -20,22 +20,22 @@
 @end
 
 // CHECK: @protocol P
-// CHECK: - (void) MethP __attribute__((availability(macosx, introduced=10.1.0, deprecated=10.2)));
+// CHECK: - (void) MethP __attribute__((availability(macos, introduced=10.1.0, deprecated=10.2)));
 // CHECK: @end
 
 // CHECK: @interface I : NSObject<P> 
-// CHECK: - (void) MethI __attribute__((availability(macosx, introduced=10.1.0, deprecated=10.2)));
+// CHECK: - (void) MethI __attribute__((availability(macos, introduced=10.1.0, deprecated=10.2)));
 // CHECK: @end
 
 // CHECK: @interface I(CAT)
-// CHECK: - (void) MethCAT __attribute__((availability(macosx, introduced=10_1_0, deprecated=10_2)));
+// CHECK: - (void) MethCAT __attribute__((availability(macos, introduced=10_1_0, deprecated=10_2)));
 // CHECK: @end
 
 // CHECK: @implementation I
-// CHECK: - (void) MethP __attribute__((availability(macosx, introduced=10.1.0, deprecated=10.2))) {
+// CHECK: - (void) MethP __attribute__((availability(macos, introduced=10.1.0, deprecated=10.2))) {
 // CHECK: }
 
-// CHECK: - (void) MethI __attribute__((availability(macosx, introduced=10.1.0, deprecated=10.2))) {
+// CHECK: - (void) MethI __attribute__((availability(macos, introduced=10.1.0, deprecated=10.2))) {
 // CHECK: }
 
 // CHECK: @end
diff --git a/test/Misc/ast-print-pragmas.cpp b/test/Misc/ast-print-pragmas.cpp
index c4fe1e23b1e62..5840c1a2c3bec 100644
--- a/test/Misc/ast-print-pragmas.cpp
+++ b/test/Misc/ast-print-pragmas.cpp
@@ -19,7 +19,9 @@ void test(int *List, int Length) {
 
 // CHECK: #pragma clang loop interleave(disable)
 // CHECK-NEXT: #pragma clang loop vectorize(enable)
+// CHECK-NEXT: #pragma clang loop distribute(disable)
 
+#pragma clang loop distribute(disable)
 #pragma clang loop vectorize(enable)
 #pragma clang loop interleave(disable)
 // CHECK-NEXT: while (i - 1 < Length)
@@ -30,7 +32,9 @@ void test(int *List, int Length) {
 
 // CHECK: #pragma clang loop interleave(enable)
 // CHECK-NEXT: #pragma clang loop vectorize(disable)
+// CHECK-NEXT: #pragma clang loop distribute(enable)
 
+#pragma clang loop distribute(enable)
 #pragma clang loop vectorize(disable)
 #pragma clang loop interleave(enable)
 // CHECK-NEXT: while (i - 2 < Length)
diff --git a/test/Misc/backend-optimization-failure-nodbg.cpp b/test/Misc/backend-optimization-failure-nodbg.cpp
index 3c32646014248..1e847185e81ca 100644
--- a/test/Misc/backend-optimization-failure-nodbg.cpp
+++ b/test/Misc/backend-optimization-failure-nodbg.cpp
@@ -4,7 +4,7 @@
 // Test verifies optimization failures generated by the backend are handled
 // correctly by clang. LLVM tests verify all of the failure conditions.
 
-void test_switch(int *A, int *B, int Length) {
+void test_switch(int *A, int *B, int Length) { /* expected-warning {{loop not vectorized: failed explicitly specified loop vectorization}} */
 #pragma clang loop vectorize(enable) unroll(disable)
   for (int i = 0; i < Length; i++) {
     switch (A[i]) {
@@ -18,4 +18,4 @@ void test_switch(int *A, int *B, int Length) {
       B[i] = 3;
     }
   }
-/* expected-warning {{loop not vectorized: failed explicitly specified loop vectorization}} */ }
+}
diff --git a/test/Misc/backend-optimization-failure.cpp b/test/Misc/backend-optimization-failure.cpp
index c0f3bf46f081b..bb50e96aaa578 100644
--- a/test/Misc/backend-optimization-failure.cpp
+++ b/test/Misc/backend-optimization-failure.cpp
@@ -7,7 +7,7 @@
 void test_switch(int *A, int *B, int Length) {
 #pragma clang loop vectorize(enable) unroll(disable)
   for (int i = 0; i < Length; i++) {
-/* expected-warning {{loop not vectorized: failed explicitly specified loop vectorization}} */ switch (A[i]) {
+/* expected-warning@-1 {{loop not vectorized: failed explicitly specified loop vectorization}} */ switch (A[i]) {
     case 0:
       B[i] = 1;
       break;
diff --git a/test/Misc/backend-resource-limit-diagnostics.cl b/test/Misc/backend-resource-limit-diagnostics.cl
new file mode 100644
index 0000000000000..6e7619babe83b
--- /dev/null
+++ b/test/Misc/backend-resource-limit-diagnostics.cl
@@ -0,0 +1,9 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: not %clang_cc1 -emit-codegen-only -triple=amdgcn-- %s 2>&1 | FileCheck %s
+
+// CHECK: error: local memory limit exceeded (480000) in use_huge_lds
+kernel void use_huge_lds()
+{
+    volatile local int huge[120000];
+    huge[0] = 2;
+}
diff --git a/test/Misc/diag-format.c b/test/Misc/diag-format.c
index 8e30cf76afd2a..d34d25a3cf940 100644
--- a/test/Misc/diag-format.c
+++ b/test/Misc/diag-format.c
@@ -4,27 +4,27 @@
 //
 // RUN: %clang -fsyntax-only -fdiagnostics-format=msvc -fmsc-version=1300  %s 2>&1 | FileCheck %s -check-prefix=MSVC2010
 // RUN: %clang -fsyntax-only -fdiagnostics-format=msvc -fms-compatibility-version=13.00  %s 2>&1 | FileCheck %s -check-prefix=MSVC2010
-// RUN: %clang -fsyntax-only -fdiagnostics-format=msvc  %s 2>&1 | FileCheck %s -check-prefix=MSVC
 // RUN: %clang -fsyntax-only -fdiagnostics-format=msvc -fmsc-version=1300 -target x86_64-pc-win32 %s 2>&1 | FileCheck %s -check-prefix=MSVC2010
 // RUN: %clang -fsyntax-only -fdiagnostics-format=msvc -fms-compatibility-version=13.00 -target x86_64-pc-win32 %s 2>&1 | FileCheck %s -check-prefix=MSVC2010
-// RUN: %clang -fsyntax-only -fdiagnostics-format=msvc -target x86_64-pc-win32 %s 2>&1 | FileCheck %s -check-prefix=MSVC
 // RUN: %clang -fsyntax-only -fdiagnostics-format=msvc -fmsc-version=1300 -target x86_64-pc-win32 -fshow-column %s 2>&1 | FileCheck %s -check-prefix=MSVC2010
+// RUN: %clang -fsyntax-only -fdiagnostics-format=msvc -fmsc-version=1800 -target x86_64-pc-win32 %s 2>&1 | FileCheck %s -check-prefix=MSVC2013
+// RUN: %clang -fsyntax-only -fdiagnostics-format=msvc -target x86_64-pc-win32 %s 2>&1 | FileCheck %s -check-prefix=MSVC
+// RUN: %clang -fsyntax-only -fdiagnostics-format=msvc -fmsc-version=1900 -target x86_64-pc-win32 %s 2>&1 | FileCheck %s -check-prefix=MSVC2015
 // RUN: %clang -fsyntax-only -fdiagnostics-format=msvc -fms-compatibility-version=13.00 -target x86_64-pc-win32 -fshow-column %s 2>&1 | FileCheck %s -check-prefix=MSVC2010
+// RUN: %clang -fsyntax-only -fdiagnostics-format=msvc -fmsc-version=1800 -target x86_64-pc-win32 -fshow-column %s 2>&1 | FileCheck %s -check-prefix=MSVC2013
 // RUN: %clang -fsyntax-only -fdiagnostics-format=msvc -target x86_64-pc-win32 -fshow-column %s 2>&1 | FileCheck %s -check-prefix=MSVC
+// RUN: %clang -fsyntax-only -fdiagnostics-format=msvc -fmsc-version=1900 -target x86_64-pc-win32 -fshow-column %s 2>&1 | FileCheck %s -check-prefix=MSVC2015
 //
 // RUN: %clang -fsyntax-only -fdiagnostics-format=vi    %s 2>&1 | FileCheck %s -check-prefix=VI
 //
-// RUN: %clang -fsyntax-only -fdiagnostics-format=msvc -fno-show-column %s 2>&1 | FileCheck %s -check-prefix=MSVC_ORIG
+// RUN: %clang -fsyntax-only -fdiagnostics-format=msvc -fno-show-column -fmsc-version=1900 %s 2>&1 | FileCheck %s -check-prefix=MSVC2015_ORIG
 //
 // RUN: %clang -fsyntax-only -fno-show-column %s 2>&1 | FileCheck %s -check-prefix=NO_COLUMN
 //
 // RUN: not %clang -fsyntax-only -Werror -fdiagnostics-format=msvc-fallback -fmsc-version=1300 %s 2>&1 | FileCheck %s -check-prefix=MSVC2010-FALLBACK
 // RUN: not %clang -fsyntax-only -Werror -fdiagnostics-format=msvc-fallback -fms-compatibility-version=13.00 %s 2>&1 | FileCheck %s -check-prefix=MSVC2010-FALLBACK
-// RUN: not %clang -fsyntax-only -Werror -fdiagnostics-format=msvc-fallback %s 2>&1 | FileCheck %s -check-prefix=MSVC-FALLBACK
-
-
-
-
+// RUN: not %clang -fsyntax-only -Werror -fdiagnostics-format=msvc-fallback -fmsc-version=1800 %s 2>&1 | FileCheck %s -check-prefix=MSVC2013-FALLBACK
+// RUN: not %clang -fsyntax-only -Werror -fdiagnostics-format=msvc-fallback -fmsc-version=1900 %s 2>&1 | FileCheck %s -check-prefix=MSVC2015-FALLBACK
 
 
 
@@ -36,10 +36,13 @@
 #endif bad // extension!
 // DEFAULT: {{.*}}:36:8: warning: extra tokens at end of #endif directive [-Wextra-tokens]
 // MSVC2010: {{.*}}(36,7) : warning: extra tokens at end of #endif directive [-Wextra-tokens]
-// MSVC: {{.*}}(36,8) : warning: extra tokens at end of #endif directive [-Wextra-tokens]
+// MSVC2013: {{.*}}(36,8) : warning: extra tokens at end of #endif directive [-Wextra-tokens]
+// MSVC: {{.*}}(36,8){{ ?}}: warning: extra tokens at end of #endif directive [-Wextra-tokens]
+// MSVC2015: {{.*}}(36,8): warning: extra tokens at end of #endif directive [-Wextra-tokens]
 // VI: {{.*}} +36:8: warning: extra tokens at end of #endif directive [-Wextra-tokens]
-// MSVC_ORIG: {{.*}}(36) : warning: extra tokens at end of #endif directive [-Wextra-tokens]
+// MSVC2015_ORIG: {{.*}}(36): warning: extra tokens at end of #endif directive [-Wextra-tokens]
 // NO_COLUMN: {{.*}}:36: warning: extra tokens at end of #endif directive [-Wextra-tokens]
 // MSVC2010-FALLBACK: {{.*}}(36,7) : error(clang): extra tokens at end of #endif directive
-// MSVC-FALLBACK: {{.*}}(36,8) : error(clang): extra tokens at end of #endif directive
+// MSVC2013-FALLBACK: {{.*}}(36,8) : error(clang): extra tokens at end of #endif directive
+// MSVC2015-FALLBACK: {{.*}}(36,8): error(clang): extra tokens at end of #endif directive
 int x;
diff --git a/test/Misc/diag-null-bytes-in-line.cpp b/test/Misc/diag-null-bytes-in-line.cpp
new file mode 100644
index 0000000000000..1eba91f6b2392
--- /dev/null
+++ b/test/Misc/diag-null-bytes-in-line.cpp
diff --git a/test/Misc/diag-template-diffing-color.cpp b/test/Misc/diag-template-diffing-color.cpp
index bf203153d8d41..2010344e79103 100644
--- a/test/Misc/diag-template-diffing-color.cpp
+++ b/test/Misc/diag-template-diffing-color.cpp
@@ -34,42 +34,38 @@ void set16(vector<vector<int> >) {}
 void test16() {
   set16(vector<const vector<int> >());
 }
-// CHECK: {{.*}}candidate function not viable: no known conversion from 'vector<[[CYAN]]const{{ ?}}[[RESET]]{{ ?}}vector<[...]>>' to 'vector<vector<[...]>>' for 1st argument
+// CHECK: {{.*}}candidate function not viable: no known conversion from 'vector<[[CYAN]]const{{ ?}}[[RESET]]{{ ?}}vector<...>>' to 'vector<vector<...>>' for 1st argument
 // TREE: {{.*}}candidate function not viable: no known conversion from argument type to parameter type for 1st argument
 // TREE:   vector<
-// TREE:     {{\[}}[[CYAN]]const{{ ?}}[[RESET]]{{ ?}}!= [[CYAN]](no qualifiers){{ ?}}[[RESET]]]{{ ?}}vector<
-// TREE:       [...]>>
+// TREE:     {{\[}}[[CYAN]]const{{ ?}}[[RESET]]{{ ?}}!= [[CYAN]](no qualifiers){{ ?}}[[RESET]]]{{ ?}}vector<...>>
 
 void set17(vector<const vector<int> >) {}
 void test17() {
   set17(vector<vector<int> >());
 }
-// CHECK: candidate function not viable: no known conversion from 'vector<vector<[...]>>' to 'vector<[[CYAN]]const{{ ?}}[[RESET]]{{ ?}}vector<[...]>>' for 1st argument
+// CHECK: candidate function not viable: no known conversion from 'vector<vector<...>>' to 'vector<[[CYAN]]const{{ ?}}[[RESET]]{{ ?}}vector<...>>' for 1st argument
 // TREE: candidate function not viable: no known conversion from argument type to parameter type for 1st argument
 // TREE:   vector<
-// TREE:     {{\[}}[[CYAN]](no qualifiers){{ ?}}[[RESET]]{{ ?}}!= [[CYAN]]const[[RESET]]] vector<
-// TREE:       [...]>>
+// TREE:     {{\[}}[[CYAN]](no qualifiers){{ ?}}[[RESET]]{{ ?}}!= [[CYAN]]const[[RESET]]] vector<...>>
 
 void set18(vector<volatile vector<int> >) {}
 void test18() {
   set18(vector<const vector<int> >());
 }
-// CHECK: candidate function not viable: no known conversion from 'vector<[[CYAN]]const{{ ?}}[[RESET]]{{ ?}}vector<[...]>>' to 'vector<[[CYAN]]volatile{{ ?}}[[RESET]]{{ ?}}vector<[...]>>' for 1st argument
+// CHECK: candidate function not viable: no known conversion from 'vector<[[CYAN]]const{{ ?}}[[RESET]]{{ ?}}vector<...>>' to 'vector<[[CYAN]]volatile{{ ?}}[[RESET]]{{ ?}}vector<...>>' for 1st argument
 // TREE: no matching function for call to 'set18'
 // TREE: candidate function not viable: no known conversion from argument type to parameter type for 1st argument
 // TREE:   vector<
-// TREE:     {{\[}}[[CYAN]]const{{ ?}}[[RESET]]{{ ?}}!= [[CYAN]]volatile[[RESET]]] vector<
-// TREE:       [...]>>
+// TREE:     {{\[}}[[CYAN]]const{{ ?}}[[RESET]]{{ ?}}!= [[CYAN]]volatile[[RESET]]] vector<...>>
 
 void set19(vector<const volatile vector<int> >) {}
 void test19() {
   set19(vector<const vector<int> >());
 }
-// CHECK: candidate function not viable: no known conversion from 'vector<const vector<[...]>>' to 'vector<const [[CYAN]]volatile{{ ?}}[[RESET]]{{ ?}}vector<[...]>>' for 1st argument
+// CHECK: candidate function not viable: no known conversion from 'vector<const vector<...>>' to 'vector<const [[CYAN]]volatile{{ ?}}[[RESET]]{{ ?}}vector<...>>' for 1st argument
 // TREE: candidate function not viable: no known conversion from argument type to parameter type for 1st argument
 // TREE:   vector<
-// TREE:     [const != const [[CYAN]]volatile[[RESET]]] vector<
-// TREE:       [...]>>
+// TREE:     [const != const [[CYAN]]volatile[[RESET]]] vector<...>>
 
 namespace default_args {
   template <int x, int y = 1+1, int z = 2>
diff --git a/test/Misc/diag-template-diffing-cxx98.cpp b/test/Misc/diag-template-diffing-cxx98.cpp
index 9fa46127920cf..7b1a08c6b8691 100644
--- a/test/Misc/diag-template-diffing-cxx98.cpp
+++ b/test/Misc/diag-template-diffing-cxx98.cpp
@@ -45,5 +45,5 @@ namespace qualifiers {
     foo(bar, V);
   }
 
-  // CHECK: candidate template ignored: deduced conflicting types for parameter 'T' ('const vector<[...]>' vs. 'volatile vector<[...]>')
+  // CHECK: candidate template ignored: deduced conflicting types for parameter 'T' ('const vector<...>' vs. 'volatile vector<...>')
 }
diff --git a/test/Misc/diag-template-diffing.cpp b/test/Misc/diag-template-diffing.cpp
index 70d5e7c87091c..90bcf6b2d17e2 100644
--- a/test/Misc/diag-template-diffing.cpp
+++ b/test/Misc/diag-template-diffing.cpp
@@ -479,14 +479,13 @@ void test17() {
   set17(vector<const vector<int>>());
 }
 // CHECK-ELIDE-NOTREE: no matching function for call to 'set17'
-// CHECK-ELIDE-NOTREE: candidate function not viable: no known conversion from 'vector<const vector<[...]>>' to 'vector<vector<[...]>>' for 1st argument
+// CHECK-ELIDE-NOTREE: candidate function not viable: no known conversion from 'vector<const vector<...>>' to 'vector<vector<...>>' for 1st argument
 // CHECK-NOELIDE-NOTREE: no matching function for call to 'set17'
 // CHECK-NOELIDE-NOTREE: candidate function not viable: no known conversion from 'vector<const vector<int>>' to 'vector<vector<int>>' for 1st argument
 // CHECK-ELIDE-TREE: no matching function for call to 'set17'
 // CHECK-ELIDE-TREE: candidate function not viable: no known conversion from argument type to parameter type for 1st argument
 // CHECK-ELIDE-TREE:   vector<
-// CHECK-ELIDE-TREE:     [const != (no qualifiers)] vector<
-// CHECK-ELIDE-TREE:       [...]>>
+// CHECK-ELIDE-TREE:     [const != (no qualifiers)] vector<...>>
 // CHECK-NOELIDE-TREE: no matching function for call to 'set17'
 // CHECK-NOELIDE-TREE: candidate function not viable: no known conversion from argument type to parameter type for 1st argument
 // CHECK-NOELIDE-TREE:   vector<
@@ -498,14 +497,13 @@ void test18() {
   set18(vector<vector<int>>());
 }
 // CHECK-ELIDE-NOTREE: no matching function for call to 'set18'
-// CHECK-ELIDE-NOTREE: candidate function not viable: no known conversion from 'vector<vector<[...]>>' to 'vector<const vector<[...]>>' for 1st argument
+// CHECK-ELIDE-NOTREE: candidate function not viable: no known conversion from 'vector<vector<...>>' to 'vector<const vector<...>>' for 1st argument
 // CHECK-NOELIDE-NOTREE: no matching function for call to 'set18'
 // CHECK-NOELIDE-NOTREE: candidate function not viable: no known conversion from 'vector<vector<int>>' to 'vector<const vector<int>>' for 1st argument
 // CHECK-ELIDE-TREE: no matching function for call to 'set18'
 // CHECK-ELIDE-TREE: candidate function not viable: no known conversion from argument type to parameter type for 1st argument
 // CHECK-ELIDE-TREE:   vector<
-// CHECK-ELIDE-TREE:     [(no qualifiers) != const] vector<
-// CHECK-ELIDE-TREE:       [...]>>
+// CHECK-ELIDE-TREE:     [(no qualifiers) != const] vector<...>>
 // CHECK-NOELIDE-TREE: no matching function for call to 'set18'
 // CHECK-NOELIDE-TREE: candidate function not viable: no known conversion from argument type to parameter type for 1st argument
 // CHECK-NOELIDE-TREE:   vector<
@@ -517,14 +515,13 @@ void test19() {
   set19(vector<const vector<int>>());
 }
 // CHECK-ELIDE-NOTREE: no matching function for call to 'set19'
-// CHECK-ELIDE-NOTREE: candidate function not viable: no known conversion from 'vector<const vector<[...]>>' to 'vector<volatile vector<[...]>>' for 1st argument
+// CHECK-ELIDE-NOTREE: candidate function not viable: no known conversion from 'vector<const vector<...>>' to 'vector<volatile vector<...>>' for 1st argument
 // CHECK-NOELIDE-NOTREE: no matching function for call to 'set19'
 // CHECK-NOELIDE-NOTREE: candidate function not viable: no known conversion from 'vector<const vector<int>>' to 'vector<volatile vector<int>>' for 1st argument
 // CHECK-ELIDE-TREE: no matching function for call to 'set19'
 // CHECK-ELIDE-TREE: candidate function not viable: no known conversion from argument type to parameter type for 1st argument
 // CHECK-ELIDE-TREE:   vector<
-// CHECK-ELIDE-TREE:     [const != volatile] vector<
-// CHECK-ELIDE-TREE:       [...]>>
+// CHECK-ELIDE-TREE:     [const != volatile] vector<...>>
 // CHECK-NOELIDE-TREE: no matching function for call to 'set19'
 // CHECK-NOELIDE-TREE: candidate function not viable: no known conversion from argument type to parameter type for 1st argument
 // CHECK-NOELIDE-TREE:   vector<
@@ -536,14 +533,13 @@ void test20() {
   set20(vector<const vector<int>>());
 }
 // CHECK-ELIDE-NOTREE: no matching function for call to 'set20'
-// CHECK-ELIDE-NOTREE: candidate function not viable: no known conversion from 'vector<const vector<[...]>>' to 'vector<const volatile vector<[...]>>' for 1st argument
+// CHECK-ELIDE-NOTREE: candidate function not viable: no known conversion from 'vector<const vector<...>>' to 'vector<const volatile vector<...>>' for 1st argument
 // CHECK-NOELIDE-NOTREE: no matching function for call to 'set20'
 // CHECK-NOELIDE-NOTREE: candidate function not viable: no known conversion from 'vector<const vector<int>>' to 'vector<const volatile vector<int>>' for 1st argument
 // CHECK-ELIDE-TREE: no matching function for call to 'set20'
 // CHECK-ELIDE-TREE: candidate function not viable: no known conversion from argument type to parameter type for 1st argument
 // CHECK-ELIDE-TREE:   vector<
-// CHECK-ELIDE-TREE:     [const != const volatile] vector<
-// CHECK-ELIDE-TREE:       [...]>>
+// CHECK-ELIDE-TREE:     [const != const volatile] vector<...>>
 // CHECK-NOELIDE-TREE: no matching function for call to 'set20'
 // CHECK-NOELIDE-TREE: candidate function not viable: no known conversion from argument type to parameter type for 1st argument
 // CHECK-NOELIDE-TREE:   vector<
@@ -557,14 +553,13 @@ template<typename T> using U21 = volatile S21<T>;
 int f21(vector<const U21<int>>);
 int k21 = f21(vector<U21<int>>());
 // CHECK-ELIDE-NOTREE: no matching function for call to 'f21'
-// CHECK-ELIDE-NOTREE: candidate function not viable: no known conversion from 'vector<U21<[...]>>' to 'vector<const U21<[...]>>' for 1st argument 
+// CHECK-ELIDE-NOTREE: candidate function not viable: no known conversion from 'vector<U21<...>>' to 'vector<const U21<...>>' for 1st argument 
 // CHECK-NOELIDE-NOTREE: no matching function for call to 'f21'
 // CHECK-NOELIDE-NOTREE: candidate function not viable: no known conversion from 'vector<U21<int>>' to 'vector<const U21<int>>' for 1st argument
 // CHECK-ELIDE-TREE: no matching function for call to 'f21'
 // CHECK-ELIDE-TREE: candidate function not viable: no known conversion from argument type to parameter type for 1st argument
 // CHECK-ELIDE-TREE:    vector<
-// CHECK-ELIDE-TREE:      [(no qualifiers) != const] U21<
-// CHECK-ELIDE-TREE:        [...]>>
+// CHECK-ELIDE-TREE:      [(no qualifiers) != const] U21<...>>
 // CHECK-NOELIDE-TREE: no matching function for call to 'f21'
 // CHECK-NOELIDE-TREE: candidate function not viable: no known conversion from argument type to parameter type for 1st argument
 // CHECK-NOELIDE-TREE:    vector<
@@ -577,14 +572,13 @@ template<typename T> using U22 = volatile S22<T>;
 int f22(vector<volatile const U22<int>>);
 int k22 = f22(vector<volatile U22<int>>());
 // CHECK-ELIDE-NOTREE: no matching function for call to 'f22'
-// CHECK-ELIDE-NOTREE: candidate function not viable: no known conversion from 'vector<U22<[...]>>' to 'vector<const U22<[...]>>' for 1st argument 
+// CHECK-ELIDE-NOTREE: candidate function not viable: no known conversion from 'vector<U22<...>>' to 'vector<const U22<...>>' for 1st argument
 // CHECK-NOELIDE-NOTREE: no matching function for call to 'f22'
 // CHECK-NOELIDE-NOTREE: candidate function not viable: no known conversion from 'vector<U22<int>>' to 'vector<const U22<int>>' for 1st argument
 // CHECK-ELIDE-TREE: no matching function for call to 'f22'
 // CHECK-ELIDE-TREE: candidate function not viable: no known conversion from argument type to parameter type for 1st argument
 // CHECK-ELIDE-TREE:    vector<
-// CHECK-ELIDE-TREE:      [(no qualifiers) != const] U22<
-// CHECK-ELIDE-TREE:        [...]>>
+// CHECK-ELIDE-TREE:      [(no qualifiers) != const] U22<...>>
 // CHECK-NOELIDE-TREE: no matching function for call to 'f22'
 // CHECK-NOELIDE-TREE: candidate function not viable: no known conversion from argument type to parameter type for 1st argument
 // CHECK-NOELIDE-TREE:    vector<
@@ -1258,7 +1252,7 @@ using T = condition<(is_const())>;
 void foo(const T &t) {
   T &t2 = t;
 }
-// CHECK-ELIDE-NOTREE: binding value of type 'const condition<[...]>' to reference to type 'condition<[...]>' drops 'const' qualifier
+// CHECK-ELIDE-NOTREE: binding value of type 'const condition<...>' to reference to type 'condition<...>' drops 'const' qualifier
 }
 
 namespace BoolArgumentBitExtended {
@@ -1390,7 +1384,7 @@ namespace DefaultNonTypeArgWithDependentType {
 template <typename SizeType = int, SizeType = 0> struct A {};
 template <typename R = A<>> R bar();
 A<> &foo() { return bar(); }
-// CHECK-ELIDE-NOTREE: error: non-const lvalue reference to type 'A<[2 * ...]>' cannot bind to a temporary of type 'A<[2 * ...]>'
+// CHECK-ELIDE-NOTREE: error: non-const lvalue reference to type 'A<...>' cannot bind to a temporary of type 'A<...>'
 // CHECK-NOELIDE-NOTREE: error: non-const lvalue reference to type 'A<int, 0>' cannot bind to a temporary of type 'A<int, 0>'
 }
 
@@ -1423,8 +1417,45 @@ B<const A<>> b4 = B<>();
 // CHECK-ELIDE-NOTREE: error: no viable conversion from 'A<1>' to 'A<(default) 0>'
 // CHECK-ELIDE-NOTREE: error: no viable conversion from 'B<int>' to 'B<(default) ZeroArgs::A<0>>'
 // CHECK-ELIDE-NOTREE: error: no viable conversion from 'B<(default) ZeroArgs::A<0>>' to 'B<int>'
-// CHECK-ELIDE-NOTREE: error: no viable conversion from 'B<const A<[...]>>' to 'B<A<[...]>>'
-// CHECK-ELIDE-NOTREE: error: no viable conversion from 'B<A<[...]>>' to 'B<const A<[...]>>'
+// CHECK-ELIDE-NOTREE: error: no viable conversion from 'B<const A<...>>' to 'B<A<...>>'
+// CHECK-ELIDE-NOTREE: error: no viable conversion from 'B<A<...>>' to 'B<const A<...>>'
+}
+
+namespace TypeAlias {
+
+template <typename T> class vector {};
+
+template <int Dimension> class Point;
+template <int dimension, typename T> using Polygon = vector<Point<dimension>>;
+
+void foo(Polygon<3, float>);
+void bar() { foo(Polygon<2, float>()); }
+
+// CHECK-ELIDE-NOTREE: error: no matching function for call to 'foo'
+// CHECK-ELIDE-NOTREE: note: candidate function not viable: no known conversion from 'Polygon<2, [...]>' to 'Polygon<3, [...]>' for 1st argument
+
+enum class X {
+  X1,
+  X2,
+};
+
+template<X x> struct EnumToType;
+
+template <> struct EnumToType<X::X1> { using type = int; };
+
+template <> struct EnumToType<X::X2> { using type = double; };
+
+
+template <X x> using VectorType = vector<typename EnumToType<x>::type>;
+
+template <X x> void D(const VectorType<x>&);
+
+void run() {
+  D<X::X1>(VectorType<X::X2>());
+}
+// CHECK-ELIDE-NOTREE: error: no matching function for call to 'D'
+// CHECK-ELIDE-NOTREE: note: candidate function [with x = TypeAlias::X::X1] not viable: no known conversion from 'VectorType<X::X2>' to 'const VectorType<(TypeAlias::X)0>' for 1st argument
+
 }
 
 // CHECK-ELIDE-NOTREE: {{[0-9]*}} errors generated.
diff --git a/test/Misc/languageOptsOpenCL.cl b/test/Misc/languageOptsOpenCL.cl
index 82a8f3614f378..9651f01a7dc59 100644
--- a/test/Misc/languageOptsOpenCL.cl
+++ b/test/Misc/languageOptsOpenCL.cl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -x cl %s -verify
+// RUN: %clang_cc1 -x cl %s -verify -triple spir-unknown-unknown
 // expected-no-diagnostics
 
 // Test the forced language options for OpenCL are set correctly.
diff --git a/test/Misc/nvptx.languageOptsOpenCL.cl b/test/Misc/nvptx.languageOptsOpenCL.cl
new file mode 100644
index 0000000000000..4c7e1539aa3ae
--- /dev/null
+++ b/test/Misc/nvptx.languageOptsOpenCL.cl
@@ -0,0 +1,211 @@
+// REQUIRES: nvptx-registered-target
+// RUN: %clang_cc1 -x cl -cl-std=CL %s -verify -triple nvptx-unknown-unknown
+// RUN: %clang_cc1 -x cl -cl-std=CL1.1 %s -verify -triple nvptx-unknown-unknown
+// RUN: %clang_cc1 -x cl -cl-std=CL1.2 %s -verify -triple nvptx-unknown-unknown
+// RUN: %clang_cc1 -x cl -cl-std=CL2.0 %s -verify -triple nvptx-unknown-unknown
+// RUN: %clang_cc1 -x cl -cl-std=CL %s -verify -triple nvptx-unknown-unknown -Wpedantic-core-features -DTEST_CORE_FEATURES
+// RUN: %clang_cc1 -x cl -cl-std=CL1.1 %s -verify -triple nvptx-unknown-unknown -Wpedantic-core-features -DTEST_CORE_FEATURES
+// RUN: %clang_cc1 -x cl -cl-std=CL1.2 %s -verify -triple nvptx-unknown-unknown -Wpedantic-core-features -DTEST_CORE_FEATURES
+// RUN: %clang_cc1 -x cl -cl-std=CL2.0 %s -verify -triple nvptx-unknown-unknown -Wpedantic-core-features -DTEST_CORE_FEATURES
+// RUN: %clang_cc1 -x cl -cl-std=CL %s -verify -triple nvptx64-unknown-unknown
+// RUN: %clang_cc1 -x cl -cl-std=CL1.1 %s -verify -triple nvptx64-unknown-unknown
+// RUN: %clang_cc1 -x cl -cl-std=CL1.2 %s -verify -triple nvptx64-unknown-unknown
+// RUN: %clang_cc1 -x cl -cl-std=CL2.0 %s -verify -triple nvptx64-unknown-unknown
+// RUN: %clang_cc1 -x cl -cl-std=CL %s -verify -triple nvptx64-unknown-unknown -Wpedantic-core-features -DTEST_CORE_FEATURES
+// RUN: %clang_cc1 -x cl -cl-std=CL1.1 %s -verify -triple nvptx64-unknown-unknown -Wpedantic-core-features -DTEST_CORE_FEATURES
+// RUN: %clang_cc1 -x cl -cl-std=CL1.2 %s -verify -triple nvptx64-unknown-unknown -Wpedantic-core-features -DTEST_CORE_FEATURES
+// RUN: %clang_cc1 -x cl -cl-std=CL2.0 %s -verify -triple nvptx64-unknown-unknown -Wpedantic-core-features -DTEST_CORE_FEATURES
+
+// Extensions in all versions
+#ifndef cl_clang_storage_class_specifiers
+#error "Missing cl_clang_storage_class_specifiers define"
+#endif
+#pragma OPENCL EXTENSION cl_clang_storage_class_specifiers: enable
+
+#ifdef cl_khr_fp16
+#error "Incorrect cl_khr_fp16 define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_fp16: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_fp16' - ignoring}}
+
+#ifdef cl_khr_int64_base_atomics
+#error "Incorrect cl_khr_int64_base_atomics define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_int64_base_atomics' - ignoring}}
+
+#ifdef cl_khr_int64_extended_atomics
+#error "Incorrect cl_khr_int64_extended_atomics define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_int64_extended_atomics' - ignoring}}
+
+#ifndef cl_khr_gl_sharing
+#error "Missing cl_khr_gl_sharing define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_gl_sharing: enable
+
+#ifndef cl_khr_icd
+#error "Missing cl_khr_icd define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_icd: enable
+
+// Core features in CL 1.1
+
+#ifndef cl_khr_byte_addressable_store
+#error "Missing cl_khr_byte_addressable_store define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_byte_addressable_store: enable
+#if (__OPENCL_C_VERSION__ >= 110) && defined TEST_CORE_FEATURES
+// expected-warning@-2{{OpenCL extension 'cl_khr_byte_addressable_store' is core feature or supported optional core feature - ignoring}}
+#endif
+
+#ifndef cl_khr_global_int32_base_atomics
+#error "Missing cl_khr_global_int32_base_atomics define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics: enable
+#if (__OPENCL_C_VERSION__ >= 110) && defined TEST_CORE_FEATURES
+// expected-warning@-2{{OpenCL extension 'cl_khr_global_int32_base_atomics' is core feature or supported optional core feature - ignoring}}
+#endif
+
+#ifndef cl_khr_global_int32_extended_atomics
+#error "Missing cl_khr_global_int32_extended_atomics define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics: enable
+#if (__OPENCL_C_VERSION__ >= 110) && defined TEST_CORE_FEATURES
+// expected-warning@-2{{OpenCL extension 'cl_khr_global_int32_extended_atomics' is core feature or supported optional core feature - ignoring}}
+#endif
+
+#ifndef cl_khr_local_int32_base_atomics
+#error "Missing cl_khr_local_int32_base_atomics define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics: enable
+#if (__OPENCL_C_VERSION__ >= 110) && defined TEST_CORE_FEATURES
+// expected-warning@-2{{OpenCL extension 'cl_khr_local_int32_base_atomics' is core feature or supported optional core feature - ignoring}}
+#endif
+
+#ifndef cl_khr_local_int32_extended_atomics
+#error "Missing cl_khr_local_int32_extended_atomics define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics: enable
+#if (__OPENCL_C_VERSION__ >= 110) && defined TEST_CORE_FEATURES
+// expected-warning@-2{{OpenCL extension 'cl_khr_local_int32_extended_atomics' is core feature or supported optional core feature - ignoring}}
+#endif
+
+#if (__OPENCL_C_VERSION__ < 110)
+// Deprecated above 1.0
+#ifdef cl_khr_select_fprounding_mode
+#error "Incorrect cl_khr_select_fprounding_mode define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_select_fprounding_mode: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_select_fprounding_mode' - ignoring}}
+#endif
+
+
+// Core feature in CL 1.2
+#ifndef cl_khr_fp64
+#error "Missing cl_khr_fp64 define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_fp64: enable
+#if (__OPENCL_C_VERSION__ >= 120) && defined TEST_CORE_FEATURES
+// expected-warning@-2{{OpenCL extension 'cl_khr_fp64' is core feature or supported optional core feature - ignoring}}
+#endif
+
+// Core feature in CL 2.0, but not supported on nvptx
+#ifdef cl_khr_3d_image_writes
+#error "Incorrect cl_khr_3d_image_writes define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_3d_image_writes: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_3d_image_writes' - ignoring}}
+
+
+
+#ifdef cl_khr_gl_event
+#error "Incorrect cl_khr_gl_event define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_gl_event: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_gl_event' - ignoring}}
+
+#ifdef cl_khr_d3d10_sharing
+#error "Incorrect cl_khr_d3d10_sharing define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_d3d10_sharing: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_d3d10_sharing' - ignoring}}
+
+#ifdef cl_khr_context_abort
+#error "Incorrect cl_context_abort define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_context_abort: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_context_abort' - ignoring}}
+
+#ifdef cl_khr_d3d11_sharing
+#error "Incorrect cl_khr_d3d11_sharing define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_d3d11_sharing: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_d3d11_sharing' - ignoring}}
+
+#ifdef cl_khr_dx9_media_sharing
+#error "Incorrect cl_khr_dx9_media_sharing define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_dx9_media_sharing: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_dx9_media_sharing' - ignoring}}
+
+#ifdef cl_khr_image2d_from_buffer
+#error "Incorrect cl_khr_image2d_from_buffer define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_image2d_from_buffer: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_image2d_from_buffer' - ignoring}}
+
+#ifdef cl_khr_initialize_memory
+#error "Incorrect cl_khr_initialize_memory define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_initialize_memory: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_initialize_memory' - ignoring}}
+
+#ifdef cl_khr_gl_depth_images
+#error "Incorrect cl_khr_gl_depth_images define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_gl_depth_images: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_gl_depth_images' - ignoring}}
+
+#ifdef cl_khr_gl_msaa_sharing
+#error "Incorrect cl_khr_gl_msaa_sharing define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_gl_msaa_sharing: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_gl_msaa_sharing' - ignoring}}
+
+#ifdef cl_khr_spir
+#error "Incorrect cl_khr_spir define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_spir: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_spir' - ignoring}}
+
+#ifdef cl_khr_egl_event
+#error "Incorrect cl_khr_egl_event define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_egl_event: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_egl_event' - ignoring}}
+
+#ifdef cl_khr_egl_image
+#error "Missing cl_khr_egl_image define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_egl_image: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_egl_image' - ignoring}}
+
+#ifdef cl_khr_srgb_image_writes
+#error "Incorrect cl_khr_srgb_image_writes define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_srgb_image_writes: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_srgb_image_writes' - ignoring}}
+
+#ifdef cl_khr_subgroups
+#error "Incorrect cl_khr_subgroups define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_subgroups: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_subgroups' - ignoring}}
+
+#ifdef cl_khr_terminate_context
+#error "Incorrect cl_khr_terminate_context define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_terminate_context: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_terminate_context' - ignoring}}
diff --git a/test/Misc/r600.languageOptsOpenCL.cl b/test/Misc/r600.languageOptsOpenCL.cl
new file mode 100644
index 0000000000000..58444cf7688a6
--- /dev/null
+++ b/test/Misc/r600.languageOptsOpenCL.cl
@@ -0,0 +1,225 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -x cl -cl-std=CL %s -verify -triple r600-unknown-unknown -target-cpu cayman
+// RUN: %clang_cc1 -x cl -cl-std=CL1.1 %s -verify -triple r600-unknown-unknown -target-cpu cayman
+// RUN: %clang_cc1 -x cl -cl-std=CL1.2 %s -verify -triple r600-unknown-unknown -target-cpu cayman
+// RUN: %clang_cc1 -x cl -cl-std=CL2.0 %s -verify -triple r600-unknown-unknown -target-cpu cayman
+// RUN: %clang_cc1 -x cl -cl-std=CL %s -verify -triple r600-unknown-unknown -Wpedantic-core-features -DTEST_CORE_FEATURES -target-cpu cayman
+// RUN: %clang_cc1 -x cl -cl-std=CL1.1 %s -verify -triple r600-unknown-unknown -Wpedantic-core-features -DTEST_CORE_FEATURES -target-cpu cayman
+// RUN: %clang_cc1 -x cl -cl-std=CL1.2 %s -verify -triple r600-unknown-unknown -Wpedantic-core-features -DTEST_CORE_FEATURES -target-cpu cayman
+// RUN: %clang_cc1 -x cl -cl-std=CL2.0 %s -verify -triple r600-unknown-unknown -Wpedantic-core-features -DTEST_CORE_FEATURES -target-cpu cayman
+// RUN: %clang_cc1 -x cl -cl-std=CL %s -verify -triple r600-unknown-unknown -target-cpu cypress
+// RUN: %clang_cc1 -x cl -cl-std=CL1.1 %s -verify -triple r600-unknown-unknown -target-cpu cypress
+// RUN: %clang_cc1 -x cl -cl-std=CL1.2 %s -verify -triple r600-unknown-unknown -target-cpu cypress
+// RUN: %clang_cc1 -x cl -cl-std=CL2.0 %s -verify -triple r600-unknown-unknown -target-cpu cypress
+// RUN: %clang_cc1 -x cl -cl-std=CL %s -verify -triple r600-unknown-unknown -Wpedantic-core-features -DTEST_CORE_FEATURES -target-cpu cypress
+// RUN: %clang_cc1 -x cl -cl-std=CL1.1 %s -verify -triple r600-unknown-unknown -Wpedantic-core-features -DTEST_CORE_FEATURES -target-cpu cypress
+// RUN: %clang_cc1 -x cl -cl-std=CL1.2 %s -verify -triple r600-unknown-unknown -Wpedantic-core-features -DTEST_CORE_FEATURES -target-cpu cypress
+// RUN: %clang_cc1 -x cl -cl-std=CL2.0 %s -verify -triple r600-unknown-unknown -Wpedantic-core-features -DTEST_CORE_FEATURES -target-cpu cypress
+// RUN: %clang_cc1 -x cl -cl-std=CL %s -verify -triple r600-unknown-unknown -target-cpu turks
+// RUN: %clang_cc1 -x cl -cl-std=CL1.1 %s -verify -triple r600-unknown-unknown -target-cpu turks
+// RUN: %clang_cc1 -x cl -cl-std=CL1.2 %s -verify -triple r600-unknown-unknown -target-cpu turks
+// RUN: %clang_cc1 -x cl -cl-std=CL2.0 %s -verify -triple r600-unknown-unknown -target-cpu turks
+// RUN: %clang_cc1 -x cl -cl-std=CL %s -verify -triple r600-unknown-unknown -Wpedantic-core-features -DTEST_CORE_FEATURES -target-cpu turks
+// RUN: %clang_cc1 -x cl -cl-std=CL1.1 %s -verify -triple r600-unknown-unknown -Wpedantic-core-features -DTEST_CORE_FEATURES -target-cpu turks
+// RUN: %clang_cc1 -x cl -cl-std=CL1.2 %s -verify -triple r600-unknown-unknown -Wpedantic-core-features -DTEST_CORE_FEATURES -target-cpu turks
+// RUN: %clang_cc1 -x cl -cl-std=CL2.0 %s -verify -triple r600-unknown-unknown -Wpedantic-core-features -DTEST_CORE_FEATURES -target-cpu turks
+
+// Extensions in all versions
+#ifndef cl_clang_storage_class_specifiers
+#error "Missing cl_clang_storage_class_specifiers define"
+#endif
+#pragma OPENCL EXTENSION cl_clang_storage_class_specifiers: enable
+
+#ifdef cl_khr_fp16
+#error "Incorrect cl_khr_fp16 define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_fp16: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_fp16' - ignoring}}
+
+#ifdef cl_khr_int64_base_atomics
+#error "Incorrect cl_khr_int64_base_atomics define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_int64_base_atomics' - ignoring}}
+
+#ifdef cl_khr_int64_extended_atomics
+#error "Incorrect cl_khr_int64_extended_atomics define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_int64_extended_atomics' - ignoring}}
+
+#ifdef cl_khr_gl_sharing
+#error "Incorrect cl_khr_gl_sharing define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_gl_sharing: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_gl_sharing' - ignoring}}
+
+#ifndef cl_khr_icd
+#error "Missing cl_khr_icd define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_icd: enable
+
+// Core features in CL 1.1
+
+#ifndef cl_khr_byte_addressable_store
+#error "Missing cl_khr_byte_addressable_store define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_byte_addressable_store: enable
+#if (__OPENCL_C_VERSION__ >= 110) && defined TEST_CORE_FEATURES
+// expected-warning@-2{{OpenCL extension 'cl_khr_byte_addressable_store' is core feature or supported optional core feature - ignoring}}
+#endif
+
+#ifndef cl_khr_global_int32_base_atomics
+#error "Missing cl_khr_global_int32_base_atomics define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics: enable
+#if (__OPENCL_C_VERSION__ >= 110) && defined TEST_CORE_FEATURES
+// expected-warning@-2{{OpenCL extension 'cl_khr_global_int32_base_atomics' is core feature or supported optional core feature - ignoring}}
+#endif
+
+#ifndef cl_khr_global_int32_extended_atomics
+#error "Missing cl_khr_global_int32_extended_atomics define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics: enable
+#if (__OPENCL_C_VERSION__ >= 110) && defined TEST_CORE_FEATURES
+// expected-warning@-2{{OpenCL extension 'cl_khr_global_int32_extended_atomics' is core feature or supported optional core feature - ignoring}}
+#endif
+
+#ifndef cl_khr_local_int32_base_atomics
+#error "Missing cl_khr_local_int32_base_atomics define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics: enable
+#if (__OPENCL_C_VERSION__ >= 110) && defined TEST_CORE_FEATURES
+// expected-warning@-2{{OpenCL extension 'cl_khr_local_int32_base_atomics' is core feature or supported optional core feature - ignoring}}
+#endif
+
+#ifndef cl_khr_local_int32_extended_atomics
+#error "Missing cl_khr_local_int32_extended_atomics define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics: enable
+#if (__OPENCL_C_VERSION__ >= 110) && defined TEST_CORE_FEATURES
+// expected-warning@-2{{OpenCL extension 'cl_khr_local_int32_extended_atomics' is core feature or supported optional core feature - ignoring}}
+#endif
+
+// Deprecated abvoe 1.0
+#ifdef cl_khr_select_fprounding_mode
+#error "Incorrect cl_khr_select_fprounding_mode define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_select_fprounding_mode: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_select_fprounding_mode' - ignoring}}
+
+
+// Core feature in CL 1.2
+#ifdef __HAS_FP64__
+#ifndef cl_khr_fp64
+#error "Missing cl_khr_fp64 define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_fp64: enable
+#if (__OPENCL_C_VERSION__ >= 120) && defined TEST_CORE_FEATURES
+// expected-warning@-2{{OpenCL extension 'cl_khr_fp64' is core feature or supported optional core feature - ignoring}}
+#endif
+#else
+#ifdef cl_khr_fp64
+#error "Incorrect cl_khr_fp64 define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_fp64: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_fp64' - ignoring}}
+#endif // __HAS_FP64__
+
+//Core feature in CL 2.0
+#ifdef cl_khr_3d_image_writes
+#error "Incorrect cl_khr_3d_image_writes define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_3d_image_writes: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_3d_image_writes' - ignoring}}
+
+
+#ifdef cl_khr_gl_event
+#error "Incorrect cl_khr_gl_event define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_gl_event: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_gl_event' - ignoring}}
+
+#ifdef cl_khr_d3d10_sharing
+#error "Incorrect cl_khr_d3d10_sharing define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_d3d10_sharing: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_d3d10_sharing' - ignoring}}
+
+#ifdef cl_khr_context_abort
+#error "Incorrect cl_context_abort define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_context_abort: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_context_abort' - ignoring}}
+
+#ifdef cl_khr_d3d11_sharing
+#error "Incorrect cl_khr_d3d11_sharing define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_d3d11_sharing: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_d3d11_sharing' - ignoring}}
+
+#ifdef cl_khr_dx9_media_sharing
+#error "Incorrect cl_khr_dx9_media_sharing define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_dx9_media_sharing: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_dx9_media_sharing' - ignoring}}
+
+#ifdef cl_khr_image2d_from_buffer
+#error "Incorrect cl_khr_image2d_from_buffer define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_image2d_from_buffer: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_image2d_from_buffer' - ignoring}}
+
+#ifdef cl_khr_initialize_memory
+#error "Incorrect cl_khr_initialize_memory define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_initialize_memory: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_initialize_memory' - ignoring}}
+
+#ifdef cl_khr_gl_depth_images
+#error "Incorrect cl_khr_gl_depth_images define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_gl_depth_images: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_gl_depth_images' - ignoring}}
+
+#ifdef cl_khr_gl_msaa_sharing
+#error "Incorrect cl_khr_gl_msaa_sharing define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_gl_msaa_sharing: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_gl_msaa_sharing' - ignoring}}
+
+#ifdef cl_khr_spir
+#error "Incorrect cl_khr_spir define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_spir: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_spir' - ignoring}}
+
+#ifdef cl_khr_egl_event
+#error "Incorrect cl_khr_egl_event define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_egl_event: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_egl_event' - ignoring}}
+
+#ifdef cl_khr_egl_image
+#error "Incorrect cl_khr_egl_image define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_egl_image: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_egl_image' - ignoring}}
+
+#ifdef cl_khr_srgb_image_writes
+#error "Incorrect cl_khr_srgb_image_writes define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_srgb_image_writes: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_srgb_image_writes' - ignoring}}
+
+#ifdef cl_khr_subgroups
+#error "Incorrect cl_khr_subgroups define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_subgroups: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_subgroups' - ignoring}}
+
+#ifdef cl_khr_terminate_context
+#error "Incorrect cl_khr_terminate_context define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_terminate_context: enable
+// expected-warning@-1{{unsupported OpenCL extension 'cl_khr_terminate_context' - ignoring}}
diff --git a/test/Misc/serialized-diags-driver.c b/test/Misc/serialized-diags-driver.c
index ad07d666c8c12..617ac8c6ef928 100644
--- a/test/Misc/serialized-diags-driver.c
+++ b/test/Misc/serialized-diags-driver.c
@@ -5,10 +5,10 @@
 // doesn't litter the user's system with preprocessed output.
 
 // RUN: rm -f %t
-// RUN: %clang -Wx-unknown-warning -Wall -fsyntax-only --serialize-diagnostics %t.diag %s
+// RUN: %clang -Wx-typoed-warning -Wall -fsyntax-only --serialize-diagnostics %t.diag %s
 // RUN: c-index-test -read-diagnostics %t.diag 2>&1 | FileCheck %s
 
-// CHECK: warning: unknown warning option '-Wx-unknown-warning' [-Wunknown-warning-option] []
+// CHECK: warning: unknown warning option '-Wx-typoed-warning' [-Wunknown-warning-option] []
 
 // CHECK: warning: variable 'voodoo' is uninitialized when used here [-Wuninitialized]
 // CHECK: note: initialize the variable 'voodoo' to silence this warning []
diff --git a/test/Misc/target-parser.c b/test/Misc/target-parser.c
new file mode 100644
index 0000000000000..fb1c8300d7361
--- /dev/null
+++ b/test/Misc/target-parser.c
@@ -0,0 +1,2 @@
+// RUN: not %clang_cc1 -triple armv7--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s
+// CHECK: error: unknown target CPU 'not-a-cpu'
diff --git a/test/Misc/thinlto.c b/test/Misc/thinlto.c
index 9134cbe5c5eec..ec0f03f5a2af2 100644
--- a/test/Misc/thinlto.c
+++ b/test/Misc/thinlto.c
@@ -1,9 +1,4 @@
 // RUN: %clang_cc1 -flto=thin -emit-llvm-bc < %s | llvm-bcanalyzer -dump | FileCheck %s
-// CHECK: <FUNCTION_SUMMARY_BLOCK
-// CHECK-NEXT: <PERMODULE_ENTRY
-// CHECK-NEXT: <PERMODULE_ENTRY
-// CHECK-NEXT: </FUNCTION_SUMMARY_BLOCK
-
-__attribute__((noinline)) void foo() {}
-
-int main() { foo(); }
+// ; Check that the -flto=thin option emits a summary
+// CHECK: <GLOBALVAL_SUMMARY_BLOCK
+int main() {}
diff --git a/test/Modules/DebugInfoSubmoduleImport.c b/test/Modules/DebugInfoSubmoduleImport.c
index 9fb5d9c6d5c95..1b31aada9c6a2 100644
--- a/test/Modules/DebugInfoSubmoduleImport.c
+++ b/test/Modules/DebugInfoSubmoduleImport.c
@@ -1,7 +1,7 @@
 // RUN: rm -rf %t
 // RUN: %clang_cc1 -fmodules -fmodule-format=obj -debug-info-kind=limited -dwarf-ext-refs \
 // RUN:     -fimplicit-module-maps -x c -fmodules-cache-path=%t -I %S/Inputs \
-// RUN:     %s -emit-llvm -o - | FileCheck %s
+// RUN:     %s -emit-llvm -debugger-tuning=lldb -o - | FileCheck %s
 #include "DebugSubmoduleA.h"
 #include "DebugSubmoduleB.h"
 
diff --git a/test/Modules/DebugInfoTransitiveImport.m b/test/Modules/DebugInfoTransitiveImport.m
index 206be2e42e92c..034a909333cd8 100644
--- a/test/Modules/DebugInfoTransitiveImport.m
+++ b/test/Modules/DebugInfoTransitiveImport.m
@@ -1,7 +1,7 @@
 // RUN: rm -rf %t
 // RUN: %clang_cc1 -fmodules -fmodule-format=obj -debug-info-kind=limited -dwarf-ext-refs \
 // RUN:     -fimplicit-module-maps -fmodules-cache-path=%t -I %S/Inputs \
-// RUN:     %s -mllvm -debug-only=pchcontainer 2>&1 | FileCheck %s
+// RUN:     %s -mllvm -debug-only=pchcontainer -debugger-tuning=lldb 2>&1 | FileCheck %s
 // REQUIRES: asserts
 
 @import diamond_left;
@@ -20,3 +20,9 @@
 // Skeleton for top:
 // CHECK: !DICompileUnit({{.*}}splitDebugFilename: {{.*}}diamond_top{{.*}}dwoId:
 
+// RUN: rm -rf %t
+// RUN: %clang_cc1 -fmodules -fmodule-format=obj -debug-info-kind=limited -dwarf-ext-refs \
+// RUN:     -fimplicit-module-maps -fmodules-cache-path=%t -I %S/Inputs \
+// RUN:     %s -mllvm -debug-only=pchcontainer 2>&1 | FileCheck %s --check-prefix=NOIMPORT
+
+// NOIMPORT-NOT: !DIImportedEntity
diff --git a/test/Modules/ExtDebugInfo.cpp b/test/Modules/ExtDebugInfo.cpp
index b255042cbfec4..dbf79f4ff148c 100644
--- a/test/Modules/ExtDebugInfo.cpp
+++ b/test/Modules/ExtDebugInfo.cpp
@@ -2,7 +2,8 @@
 // Test that only forward declarations are emitted for types dfined in modules.
 
 // Modules:
-// RUN: %clang_cc1 -x objective-c++ -std=c++11 -debug-info-kind=limited -dwarf-ext-refs -fmodules \
+// RUN: %clang_cc1 -x objective-c++ -std=c++11 -debug-info-kind=standalone \
+// RUN:     -dwarf-ext-refs -fmodules                                   \
 // RUN:     -fmodule-format=obj -fimplicit-module-maps -DMODULES \
 // RUN:     -triple %itanium_abi_triple \
 // RUN:     -fmodules-cache-path=%t %s -I %S/Inputs -I %t -emit-llvm -o %t-mod.ll
@@ -12,10 +13,12 @@
 // RUN: %clang_cc1 -x c++ -std=c++11 -fmodule-format=obj -emit-pch -I%S/Inputs \
 // RUN:     -triple %itanium_abi_triple \
 // RUN:     -o %t.pch %S/Inputs/DebugCXX.h
-// RUN: %clang_cc1 -std=c++11 -debug-info-kind=limited -dwarf-ext-refs -fmodule-format=obj \
+// RUN: %clang_cc1 -std=c++11 -debug-info-kind=standalone \
+// RUN:     -dwarf-ext-refs -fmodule-format=obj \
 // RUN:     -triple %itanium_abi_triple \
 // RUN:     -include-pch %t.pch %s -emit-llvm -o %t-pch.ll %s
 // RUN: cat %t-pch.ll |  FileCheck %s
+// RUN: cat %t-pch.ll |  FileCheck %s --check-prefix=CHECK-PCH
 
 #ifdef MODULES
 @import DebugCXX;
@@ -25,9 +28,13 @@ using DebugCXX::Struct;
 
 Struct s;
 DebugCXX::Enum e;
+
+// Template instantiations.
 DebugCXX::Template<long> implicitTemplate;
 DebugCXX::Template<int> explicitTemplate;
-DebugCXX::FloatInstatiation typedefTemplate;
+DebugCXX::FloatInstantiation typedefTemplate;
+DebugCXX::B anchoredTemplate;
+
 int Struct::static_member = -1;
 enum {
   e3 = -1
@@ -35,38 +42,149 @@ enum {
 auto anon_enum = DebugCXX::e2;
 char _anchor = anon_enum + conflicting_uid;
 
-// CHECK: ![[NS:.*]] = !DINamespace(name: "DebugCXX", scope: ![[MOD:[0-9]+]],
-// CHECK: ![[MOD]] = !DIModule(scope: null, name: {{.*}}DebugCXX
+TypedefUnion tdu;
+TypedefEnum tde;
+TypedefStruct tds;
+TypedefTemplate tdt;
+Template1<int> explicitTemplate1;
 
-// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "Struct",
-// CHECK-SAME:             scope: ![[NS]],
+template <class T> class FwdDeclTemplate { T t; };
+TypedefFwdDeclTemplate tdfdt;
+
+InAnonymousNamespace anon;
+
+// Types that are forward-declared in the module and defined here.
+struct PureFwdDecl { int i; };
+PureFwdDecl definedLocally;
+
+struct Specialized<int>::Member { int i; };
+struct Specialized<int>::Member definedLocally2;
+
+template <class T> struct FwdDeclTemplateMember<T>::Member { T t; };
+TypedefFwdDeclTemplateMember tdfdtm;
+
+void foo() {
+  anon.i = GlobalStruct.i = GlobalUnion.i = GlobalEnum;
+}
+
+
+// CHECK: ![[STRUCT:.*]] = !DICompositeType(tag: DW_TAG_structure_type, name: "Struct",
+// CHECK-SAME:             scope: ![[NS:[0-9]+]],
 // CHECK-SAME:             flags: DIFlagFwdDecl,
 // CHECK-SAME:             identifier: "_ZTSN8DebugCXX6StructE")
 
+// CHECK: ![[NS]] = !DINamespace(name: "DebugCXX", scope: ![[MOD:[0-9]+]],
+// CHECK: ![[MOD]] = !DIModule(scope: null, name: {{.*}}DebugCXX
+
 // CHECK: !DICompositeType(tag: DW_TAG_enumeration_type, name: "Enum",
 // CHECK-SAME:             scope: ![[NS]],
 // CHECK-SAME:             flags: DIFlagFwdDecl,
 // CHECK-SAME:             identifier:  "_ZTSN8DebugCXX4EnumE")
 
+// This type is anchored in the module by an explicit template instantiation.
 // CHECK: !DICompositeType(tag: DW_TAG_class_type,
+// CHECK-SAME:             name: "Template<long, DebugCXX::traits<long> >",
+// CHECK-SAME:             scope: ![[NS]],
+// CHECK-SAME:             flags: DIFlagFwdDecl,
+// CHECK-SAME:             identifier: "_ZTSN8DebugCXX8TemplateIlNS_6traitsIlEEEE")
 
+// This type is anchored in the module by an explicit template instantiation.
 // CHECK: !DICompositeType(tag: DW_TAG_class_type,
 // CHECK-SAME:             name: "Template<int, DebugCXX::traits<int> >",
 // CHECK-SAME:             scope: ![[NS]],
 // CHECK-SAME:             flags: DIFlagFwdDecl,
 // CHECK-SAME:             identifier: "_ZTSN8DebugCXX8TemplateIiNS_6traitsIiEEEE")
 
+// This type isn't, however, even with standalone non-module debug info this
+// type is a forward declaration.
+// CHECK-NOT: !DICompositeType(tag: DW_TAG_structure_type, name: "traits<int>",
+
+// This one isn't.
 // CHECK: !DICompositeType(tag: DW_TAG_class_type,
 // CHECK-SAME:             name: "Template<float, DebugCXX::traits<float> >",
 // CHECK-SAME:             scope: ![[NS]],
-// CHECK-SAME:             flags: DIFlagFwdDecl,
+// CHECK-SAME:             templateParams:
 // CHECK-SAME:             identifier: "_ZTSN8DebugCXX8TemplateIfNS_6traitsIfEEEE")
 
+// This type is anchored in the module by an explicit template instantiation.
+// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "traits<float>",
+// CHECK-SAME:             flags: DIFlagFwdDecl,
+// CHECK-SAME:             identifier: "_ZTSN8DebugCXX6traitsIfEE")
+
+
+// This type is anchored in the module by an explicit template instantiation.
+// CHECK: !DICompositeType(tag: DW_TAG_class_type, name: "A<void>",
+// CHECK-SAME:             scope: ![[NS]],
+// CHECK-SAME:             flags: DIFlagFwdDecl,
+// CHECK-SAME:             identifier: "_ZTSN8DebugCXX1AIJvEEE")
+
 // CHECK: !DIDerivedType(tag: DW_TAG_member, name: "static_member",
-// CHECK-SAME:           scope: !"_ZTSN8DebugCXX6StructE"
+// CHECK-SAME:           scope: ![[STRUCT]]
+
+// CHECK: !DICompositeType(tag: DW_TAG_union_type,
+// CHECK-SAME:             flags: DIFlagFwdDecl,
+// CHECK-SAME:             identifier: "_ZTS12TypedefUnion")
+// CHECK: !DICompositeType(tag: DW_TAG_enumeration_type,
+// CHECK-SAME:             flags: DIFlagFwdDecl,
+// CHECK-SAME:             identifier: "_ZTS11TypedefEnum")
+// CHECK: !DICompositeType(tag: DW_TAG_structure_type,
+// CHECK-SAME:             flags: DIFlagFwdDecl,
+// CHECK-SAME:             identifier: "_ZTS13TypedefStruct")
+
+// This one isn't.
+// CHECK: !DICompositeType(tag: DW_TAG_class_type, name: "Template1<void *>",
+// CHECK-SAME:             templateParams:
+// CHECK-SAME:             identifier: "_ZTS9Template1IPvE")
+
+// This type is anchored in the module by an explicit template instantiation.
+// CHECK: !DICompositeType(tag: DW_TAG_class_type, name: "Template1<int>",
+// CHECK-SAME:             flags: DIFlagFwdDecl,
+// CHECK-SAME:             identifier: "_ZTS9Template1IiE")
+
+// CHECK: !DICompositeType(tag: DW_TAG_class_type, name: "FwdDeclTemplate<int>",
+// CHECK-SAME:             templateParams:
+// CHECK-SAME:             identifier: "_ZTS15FwdDeclTemplateIiE")
+
+// This type is defined locally and forward-declared in the module.
+// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "PureFwdDecl",
+// CHECK-SAME:             elements:
+// CHECK-SAME:             identifier: "_ZTS11PureFwdDecl")
+
+// This type is defined locally and forward-declared in the module.
+// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "Member",
+// CHECK-SAME:             elements:
+// CHECK-SAME:             identifier: "_ZTSN11SpecializedIiE6MemberE")
+
+// This type is defined locally and forward-declared in the module.
+// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "Member",
+// CHECK-SAME:             elements:
+// CHECK-SAME:             identifier: "_ZTSN21FwdDeclTemplateMemberIiE6MemberE")
+
 
 // CHECK: !DIGlobalVariable(name: "anon_enum", {{.*}}, type: ![[ANON_ENUM:[0-9]+]]
 // CHECK: !DICompositeType(tag: DW_TAG_enumeration_type, scope: ![[NS]],
 // CHECK-SAME:             line: 16
 
-// CHECK: !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !0, entity: !"_ZTSN8DebugCXX6StructE", line: 24)
+// CHECK: !DIGlobalVariable(name: "GlobalUnion",
+// CHECK-SAME:              type: ![[GLOBAL_UNION:[0-9]+]]
+// CHECK: ![[GLOBAL_UNION]] = distinct !DICompositeType(tag: DW_TAG_union_type,
+// CHECK-SAME:                elements: !{{[0-9]+}})
+// CHECK: !DIGlobalVariable(name: "GlobalStruct",
+// CHECK-SAME:              type: ![[GLOBAL_STRUCT:[0-9]+]]
+// CHECK: ![[GLOBAL_STRUCT]] = distinct !DICompositeType(tag: DW_TAG_structure_type,
+// CHECK-SAME:                elements: !{{[0-9]+}})
+
+
+// CHECK: !DIGlobalVariable(name: "anon",
+// CHECK-SAME:              type: ![[GLOBAL_ANON:[0-9]+]]
+// CHECK: ![[GLOBAL_ANON]] = !DICompositeType(tag: DW_TAG_structure_type,
+// CHECK-SAME:              name: "InAnonymousNamespace", {{.*}}DIFlagFwdDecl)
+
+
+// CHECK: !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !0, entity: ![[STRUCT]], line: 27)
+
+// CHECK: !DICompileUnit(
+// CHECK-SAME:           splitDebugFilename:
+// CHECK-SAME:           dwoId:
+// CHECK-PCH: !DICompileUnit({{.*}}splitDebugFilename:
+// CHECK-PCH:                dwoId: 18446744073709551614
diff --git a/test/Modules/ExtDebugInfo.m b/test/Modules/ExtDebugInfo.m
index 8e063f09048dd..71ca853fb7d95 100644
--- a/test/Modules/ExtDebugInfo.m
+++ b/test/Modules/ExtDebugInfo.m
@@ -18,18 +18,59 @@
 @import DebugObjC;
 #endif
 
+@implementation ObjCClassWithPrivateIVars {
+  int hidden_ivar;
+}
+@end
+
+TypedefUnion tdu;
+TypedefEnum tde;
+TypedefStruct tds;
+
 int foo(ObjCClass *c) {
   InnerEnum e = e0;
+  GlobalStruct.i = GlobalUnion.i = GlobalEnum;
   [c instanceMethodWithInt: 0];
   return [c property];
 }
 
-// CHECK-NOT: !DICompositeType(tag: DW_TAG_structure_type,
+// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "ObjCClassWithPrivateIVars",
+// CHECK-SAME:             flags: DIFlagObjcClassComplete
+
+// CHECK: ![[MOD:.*]] = !DIModule(scope: null, name: "DebugObjC
+
+// CHECK: !DIDerivedType(tag: DW_TAG_member, name: "hidden_ivar",
+// CHECK-SAME:           flags: DIFlagPrivate)
+
+// CHECK: !DIGlobalVariable(name: "GlobalUnion",
+// CHECK-SAME:              type: ![[GLOBAL_UNION:[0-9]+]]
+// CHECK: ![[GLOBAL_UNION]] = distinct !DICompositeType(tag: DW_TAG_union_type,
+// CHECK-SAME:                elements: !{{[0-9]+}})
+
+// CHECK: !DIGlobalVariable(name: "GlobalStruct",
+// CHECK-SAME:              type: ![[GLOBAL_STRUCT:[0-9]+]]
+// CHECK: ![[GLOBAL_STRUCT]] = distinct !DICompositeType(tag: DW_TAG_structure_type,
+// CHECK-SAME:                elements: !{{[0-9]+}})
+
+// CHECK: !DIDerivedType(tag: DW_TAG_typedef, name: "TypedefUnion",
+// CHECK-SAME:           baseType: ![[TD_UNION:.*]])
+// CHECK: ![[TD_UNION]] = !DICompositeType(tag: DW_TAG_union_type,
+// CHECK-SAME:             flags: DIFlagFwdDecl)
+
+// CHECK: !DIDerivedType(tag: DW_TAG_typedef, name: "TypedefEnum",
+// CHECK-SAME:           baseType: ![[TD_ENUM:.*]])
+// CHECK: ![[TD_ENUM]] = !DICompositeType(tag: DW_TAG_enumeration_type,
+// CHECK-SAME:             flags: DIFlagFwdDecl)
+
+// CHECK: !DIDerivedType(tag: DW_TAG_typedef, name: "TypedefStruct",
+// CHECK-SAME:           baseType: ![[TD_STRUCT:.*]])
+// CHECK: ![[TD_STRUCT]] = !DICompositeType(tag: DW_TAG_structure_type,
+// CHECK-SAME:             flags: DIFlagFwdDecl)
+
 // CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "ObjCClass",
-// CHECK-SAME:             scope: ![[MOD:[0-9]+]],
+// CHECK-SAME:             scope: ![[MOD]],
 // CHECK-SAME:             flags: DIFlagFwdDecl)
-// CHECK-NOT: !DICompositeType(tag: DW_TAG_structure_type,
-// CHECK: ![[MOD]] = !DIModule(scope: null, name: {{.*}}DebugObjC
+
 // CHECK-NOT: !DICompositeType(tag: DW_TAG_structure_type,
 // CHECK: !DICompositeType(tag: DW_TAG_enumeration_type,
 // CHECK-SAME:             scope: ![[MOD]],
diff --git a/test/Modules/Inputs/DebugCXX.h b/test/Modules/Inputs/DebugCXX.h
index b6a52579fc3a1..793ad209f8bf2 100644
--- a/test/Modules/Inputs/DebugCXX.h
+++ b/test/Modules/Inputs/DebugCXX.h
@@ -24,10 +24,11 @@ namespace DebugCXX {
           > class Template {
     T member;
   };
+  // Explicit template instantiation.
   extern template class Template<int>;
 
   extern template struct traits<float>;
-  typedef class Template<float> FloatInstatiation;
+  typedef class Template<float> FloatInstantiation;
 
   inline void fn() {
     Template<long> invisible;
@@ -48,6 +49,7 @@ namespace DebugCXX {
   template <typename...> class A;
   template <typename T> class A<T> {};
   typedef A<void> B;
+  // Anchored by a function parameter.
   void foo(B) {}
 }
 
@@ -58,3 +60,48 @@ class FwdVirtual {
 };
 
 struct PureForwardDecl;
+
+typedef union { int i; } TypedefUnion;
+typedef enum { e0 = 0 } TypedefEnum;
+typedef struct { int i; } TypedefStruct;
+
+union { int i; } GlobalUnion;
+struct { int i; } GlobalStruct;
+enum { e5 = 5 } GlobalEnum;
+
+namespace {
+  namespace {
+    struct InAnonymousNamespace { int i; };
+  }
+}
+
+class Base;
+class A {
+  virtual Base *getParent() const;
+};
+class Base {};
+class Derived : Base {
+  class B : A {
+    Derived *getParent() const override;
+  };
+};
+
+template <class T>
+class Template1 {
+  T t;
+};
+typedef Template1<void *> TypedefTemplate;
+extern template class Template1<int>;
+
+template <class T> class FwdDeclTemplate;
+typedef FwdDeclTemplate<int> TypedefFwdDeclTemplate;
+
+// Member classes of class template specializations.
+template <typename T> struct Specialized {};
+
+template <> struct Specialized<int> {
+  struct Member;
+};
+
+template <class T> struct FwdDeclTemplateMember { struct Member; };
+typedef FwdDeclTemplateMember<int>::Member TypedefFwdDeclTemplateMember;
diff --git a/test/Modules/Inputs/DebugObjC.h b/test/Modules/Inputs/DebugObjC.h
index bde463abfd61d..af1cd981ce1c2 100644
--- a/test/Modules/Inputs/DebugObjC.h
+++ b/test/Modules/Inputs/DebugObjC.h
@@ -1,3 +1,4 @@
+// -*- ObjC -*-
 @class FwdDecl;
 
 @interface ObjCClass {
@@ -9,6 +10,11 @@
 @property int property;
 @end
 
+@interface ObjCClassWithPrivateIVars {
+  int public_ivar;
+}
+@end
+
 @interface ObjCClass (Category)
 - categoryMethod;
 @end
@@ -22,3 +28,17 @@ typedef enum {
 + (InnerEnum)protocolMethod;
 
 @end
+
+struct FwdDeclared;
+struct FwdDeclared {
+  int i;
+};
+struct PureForwardDecl;
+
+typedef union { int i; } TypedefUnion;
+typedef enum { e1 = 1 } TypedefEnum;
+typedef struct { int i; } TypedefStruct;
+
+union { int i; } GlobalUnion;
+struct { int i; } GlobalStruct;
+enum { e2 = 2 } GlobalEnum;
diff --git a/test/Modules/Inputs/MacroFabs1.h b/test/Modules/Inputs/MacroFabs1.h
new file mode 100644
index 0000000000000..a78c5539f4a20
--- /dev/null
+++ b/test/Modules/Inputs/MacroFabs1.h
@@ -0,0 +1,6 @@
+
+#undef fabs
+#define fabs(x) (x)
+
+#undef my_fabs
+#define my_fabs(x) (x)
diff --git a/test/Modules/Inputs/MethodPoolCombined1.h b/test/Modules/Inputs/MethodPoolCombined1.h
new file mode 100644
index 0000000000000..057b7388b35fa
--- /dev/null
+++ b/test/Modules/Inputs/MethodPoolCombined1.h
@@ -0,0 +1,6 @@
+
+@import MethodPoolString1;
+@interface A
+- (int)stringValue;
+@end
+
diff --git a/test/Modules/Inputs/MethodPoolCombined2.h b/test/Modules/Inputs/MethodPoolCombined2.h
new file mode 100644
index 0000000000000..166906e32e391
--- /dev/null
+++ b/test/Modules/Inputs/MethodPoolCombined2.h
@@ -0,0 +1 @@
+@import MethodPoolString2;
diff --git a/test/Modules/Inputs/MethodPoolString1.h b/test/Modules/Inputs/MethodPoolString1.h
new file mode 100644
index 0000000000000..c64ad950033e8
--- /dev/null
+++ b/test/Modules/Inputs/MethodPoolString1.h
@@ -0,0 +1,4 @@
+
+@interface S1
+- (int)stringValue;
+@end
diff --git a/test/Modules/Inputs/MethodPoolString2.h b/test/Modules/Inputs/MethodPoolString2.h
new file mode 100644
index 0000000000000..30e9bfb0027d3
--- /dev/null
+++ b/test/Modules/Inputs/MethodPoolString2.h
@@ -0,0 +1,4 @@
+
+@interface S2
+- (int)stringValue;
+@end
diff --git a/test/Modules/Inputs/PR21547/FirstHeader.h b/test/Modules/Inputs/PR21547/FirstHeader.h
new file mode 100644
index 0000000000000..a01c6075932f9
--- /dev/null
+++ b/test/Modules/Inputs/PR21547/FirstHeader.h
@@ -0,0 +1,13 @@
+template<class Element> struct TMatrixT;
+typedef TMatrixT<double> TMatrixD;
+
+void f(const TMatrixD &m);
+
+template<class Element> struct TMatrixT {
+  template <class Element2> TMatrixT(const TMatrixT<Element2> &);
+  ~TMatrixT() {}
+  void Determinant () { f(*this); }
+};
+
+template struct TMatrixT<float>;
+template struct TMatrixT<double>;
diff --git a/test/Modules/Inputs/PR21547/module.modulemap b/test/Modules/Inputs/PR21547/module.modulemap
new file mode 100644
index 0000000000000..8ca0643888b8b
--- /dev/null
+++ b/test/Modules/Inputs/PR21547/module.modulemap
@@ -0,0 +1,4 @@
+module M {
+  header "FirstHeader.h"
+  export *
+}
diff --git a/test/Modules/Inputs/PR24954/A.h b/test/Modules/Inputs/PR24954/A.h
new file mode 100644
index 0000000000000..5e5d5bf92ce40
--- /dev/null
+++ b/test/Modules/Inputs/PR24954/A.h
@@ -0,0 +1,10 @@
+#include "B.h"
+
+template <class T>
+class Expr {
+public:
+   void print(B::basic_ostream<char>& os) {
+     os << B::setw(42);
+     os << B::endl;
+  }
+};
diff --git a/test/Modules/Inputs/PR24954/B.h b/test/Modules/Inputs/PR24954/B.h
new file mode 100644
index 0000000000000..a8ddc71892709
--- /dev/null
+++ b/test/Modules/Inputs/PR24954/B.h
@@ -0,0 +1,30 @@
+namespace B {
+
+  template <class _CharT>
+  struct basic_ostream {
+    basic_ostream& operator<<(basic_ostream& (*__pf)());
+  };
+
+
+  template <class _CharT> basic_ostream<_CharT>&
+  endl();
+
+  struct S1 {
+    template <class _CharT> friend void
+    operator<<(basic_ostream<_CharT>& __os, const S1& __x);
+  };
+
+  S1 setw(int __n);
+
+  template <class _CharT> class S2;
+
+  template <class _CharT> void
+  operator<<(basic_ostream<_CharT>& __os, const S2<_CharT>& __x);
+
+  template <class _CharT>
+  struct S2 {
+    template <class _Cp> friend void
+    operator<<(basic_ostream<_Cp>& __os, const S2<_Cp>& __x);
+  };
+
+}
diff --git a/test/Modules/Inputs/PR24954/module.modulemap b/test/Modules/Inputs/PR24954/module.modulemap
new file mode 100644
index 0000000000000..49374181d758c
--- /dev/null
+++ b/test/Modules/Inputs/PR24954/module.modulemap
@@ -0,0 +1,9 @@
+module A {
+  header "A.h"
+  export *
+}
+
+module B {
+  header "B.h"
+  export *
+}
diff --git a/test/Modules/Inputs/PR25501/Vector.h b/test/Modules/Inputs/PR25501/Vector.h
new file mode 100644
index 0000000000000..9da48303a3a85
--- /dev/null
+++ b/test/Modules/Inputs/PR25501/Vector.h
@@ -0,0 +1,5 @@
+template <typename> struct _Vector_base {};
+struct vector {
+  vector() {}
+  vector(_Vector_base<int>);
+};
diff --git a/test/Modules/Inputs/PR25501/a0.h b/test/Modules/Inputs/PR25501/a0.h
new file mode 100644
index 0000000000000..1a0d306ca7cf6
--- /dev/null
+++ b/test/Modules/Inputs/PR25501/a0.h
@@ -0,0 +1 @@
+#include "Vector.h"
diff --git a/test/Modules/Inputs/PR25501/a1.h b/test/Modules/Inputs/PR25501/a1.h
new file mode 100644
index 0000000000000..1a0d306ca7cf6
--- /dev/null
+++ b/test/Modules/Inputs/PR25501/a1.h
@@ -0,0 +1 @@
+#include "Vector.h"
diff --git a/test/Modules/Inputs/PR25501/a2.h b/test/Modules/Inputs/PR25501/a2.h
new file mode 100644
index 0000000000000..7876f310e85fc
--- /dev/null
+++ b/test/Modules/Inputs/PR25501/a2.h
@@ -0,0 +1,3 @@
+#include "a0.h"
+vector aaa = vector();
+#include "a1.h"
diff --git a/test/Modules/Inputs/PR25501/b.h b/test/Modules/Inputs/PR25501/b.h
new file mode 100644
index 0000000000000..7b519836953a0
--- /dev/null
+++ b/test/Modules/Inputs/PR25501/b.h
@@ -0,0 +1,2 @@
+#include "Vector.h"
+vector aaa = vector();
diff --git a/test/Modules/Inputs/PR25501/module.modulemap b/test/Modules/Inputs/PR25501/module.modulemap
new file mode 100644
index 0000000000000..c6c8d5ca40f39
--- /dev/null
+++ b/test/Modules/Inputs/PR25501/module.modulemap
@@ -0,0 +1,4 @@
+module "a0" { header "a0.h" export * }
+module "a1" { header "a1.h" export * }
+module "a2" { header "a2.h" export * }
+module "b" { header "b.h" export * }
diff --git a/test/Modules/Inputs/PR26014/A.h b/test/Modules/Inputs/PR26014/A.h
new file mode 100644
index 0000000000000..49de5ba479af2
--- /dev/null
+++ b/test/Modules/Inputs/PR26014/A.h
@@ -0,0 +1,13 @@
+#ifndef _LIBCPP_TYPE_TRAITS
+#define _LIBCPP_TYPE_TRAITS
+
+
+template <class _Tp>
+struct underlying_type
+{
+    typedef __underlying_type(_Tp) type;
+};
+
+#endif  // _LIBCPP_TYPE_TRAITS
+
+#include "B.h"
diff --git a/test/Modules/Inputs/PR26014/B.h b/test/Modules/Inputs/PR26014/B.h
new file mode 100644
index 0000000000000..58d1f8ff2a3f5
--- /dev/null
+++ b/test/Modules/Inputs/PR26014/B.h
@@ -0,0 +1,10 @@
+#ifndef _LIBCPP_TYPE_TRAITS
+#define _LIBCPP_TYPE_TRAITS
+
+template <class _Tp>
+struct underlying_type
+{
+    typedef __underlying_type(_Tp) type;
+};
+
+#endif  // _LIBCPP_TYPE_TRAITS
diff --git a/test/Modules/Inputs/PR26014/module.modulemap b/test/Modules/Inputs/PR26014/module.modulemap
new file mode 100644
index 0000000000000..49374181d758c
--- /dev/null
+++ b/test/Modules/Inputs/PR26014/module.modulemap
@@ -0,0 +1,9 @@
+module A {
+  header "A.h"
+  export *
+}
+
+module B {
+  header "B.h"
+  export *
+}
diff --git a/test/Modules/Inputs/PR26179/A.h b/test/Modules/Inputs/PR26179/A.h
new file mode 100644
index 0000000000000..c264f4cf9bfb3
--- /dev/null
+++ b/test/Modules/Inputs/PR26179/A.h
@@ -0,0 +1,2 @@
+#include "basic_string.h"
+#include "B.h"
diff --git a/test/Modules/Inputs/PR26179/B.h b/test/Modules/Inputs/PR26179/B.h
new file mode 100644
index 0000000000000..46a109efdb597
--- /dev/null
+++ b/test/Modules/Inputs/PR26179/B.h
@@ -0,0 +1 @@
+#include "basic_string.h"
diff --git a/test/Modules/Inputs/PR26179/basic_string.h b/test/Modules/Inputs/PR26179/basic_string.h
new file mode 100644
index 0000000000000..653ce078451c4
--- /dev/null
+++ b/test/Modules/Inputs/PR26179/basic_string.h
@@ -0,0 +1,12 @@
+#ifndef _GLIBCXX_STRING
+#define _GLIBCXX_STRING 1
+
+template<typename T>
+struct basic_string {
+  static T _S_empty_rep_storage[];
+};
+
+template<typename T>
+T basic_string<T>::_S_empty_rep_storage[sizeof(T)];
+
+#endif
diff --git a/test/Modules/Inputs/PR26179/module.modulemap b/test/Modules/Inputs/PR26179/module.modulemap
new file mode 100644
index 0000000000000..49374181d758c
--- /dev/null
+++ b/test/Modules/Inputs/PR26179/module.modulemap
@@ -0,0 +1,9 @@
+module A {
+  header "A.h"
+  export *
+}
+
+module B {
+  header "B.h"
+  export *
+}
diff --git a/test/Modules/Inputs/PR27041/Rtypes.h b/test/Modules/Inputs/PR27041/Rtypes.h
new file mode 100644
index 0000000000000..8b137891791fe
--- /dev/null
+++ b/test/Modules/Inputs/PR27041/Rtypes.h
@@ -0,0 +1 @@
+
diff --git a/test/Modules/Inputs/PR27041/TGenericClassInfo.h b/test/Modules/Inputs/PR27041/TGenericClassInfo.h
new file mode 100644
index 0000000000000..b43b2c90f22ba
--- /dev/null
+++ b/test/Modules/Inputs/PR27041/TGenericClassInfo.h
@@ -0,0 +1,3 @@
+namespace std {}
+namespace std { enum float_round_style { denorm_present }; }
+#include "TSchemaHelper.h"
diff --git a/test/Modules/Inputs/PR27041/TSchemaHelper.h b/test/Modules/Inputs/PR27041/TSchemaHelper.h
new file mode 100644
index 0000000000000..31f726f4d75f7
--- /dev/null
+++ b/test/Modules/Inputs/PR27041/TSchemaHelper.h
@@ -0,0 +1 @@
+namespace std { enum float_round_style { denorm_present }; }
diff --git a/test/Modules/Inputs/PR27041/module.modulemap b/test/Modules/Inputs/PR27041/module.modulemap
new file mode 100644
index 0000000000000..f0147cdf97624
--- /dev/null
+++ b/test/Modules/Inputs/PR27041/module.modulemap
@@ -0,0 +1,2 @@
+module "Rtypes.h" { header "Rtypes.h" header "TGenericClassInfo.h" }
+module "TSchemaHelper.h" { header "TSchemaHelper.h" }
diff --git a/test/Modules/Inputs/PR27186/Rtypes.h b/test/Modules/Inputs/PR27186/Rtypes.h
new file mode 100644
index 0000000000000..ecbe10d34d29f
--- /dev/null
+++ b/test/Modules/Inputs/PR27186/Rtypes.h
@@ -0,0 +1,2 @@
+#include <stddef.h>
+typedef struct timespec timespec_t;
diff --git a/test/Modules/Inputs/PR27186/module.modulemap b/test/Modules/Inputs/PR27186/module.modulemap
new file mode 100644
index 0000000000000..58ce19d0d55cb
--- /dev/null
+++ b/test/Modules/Inputs/PR27186/module.modulemap
@@ -0,0 +1,5 @@
+module "Rtypes.h" { header "Rtypes.h" }
+module a [extern_c] {
+  header "stddef.h"
+  header "time.h"
+}
diff --git a/test/Modules/Inputs/PR27186/stddef.h b/test/Modules/Inputs/PR27186/stddef.h
new file mode 100644
index 0000000000000..8b137891791fe
--- /dev/null
+++ b/test/Modules/Inputs/PR27186/stddef.h
@@ -0,0 +1 @@
+
diff --git a/test/Modules/Inputs/PR27186/time.h b/test/Modules/Inputs/PR27186/time.h
new file mode 100644
index 0000000000000..9ac2ace3734b4
--- /dev/null
+++ b/test/Modules/Inputs/PR27186/time.h
@@ -0,0 +1 @@
+struct timespec;
diff --git a/test/Modules/Inputs/PR27401/a.h b/test/Modules/Inputs/PR27401/a.h
new file mode 100644
index 0000000000000..63d6b707f46f1
--- /dev/null
+++ b/test/Modules/Inputs/PR27401/a.h
@@ -0,0 +1,17 @@
+#ifndef _LIBCPP_ALGORITHM
+#define _LIBCPP_ALGORITHM
+template <class _Tp, _Tp>
+struct integral_constant {
+  static const _Tp value = _Tp();
+};
+
+template <class _Tp>
+struct is_nothrow_default_constructible
+	: integral_constant<bool, __is_constructible(_Tp)> {};
+
+template <class _Tp>
+struct is_nothrow_move_constructible
+    : integral_constant<bool, __is_constructible(_Tp, _Tp)> {};
+
+class allocator {};
+#endif
diff --git a/test/Modules/Inputs/PR27401/b.h b/test/Modules/Inputs/PR27401/b.h
new file mode 100644
index 0000000000000..2b4e7f14fbb40
--- /dev/null
+++ b/test/Modules/Inputs/PR27401/b.h
@@ -0,0 +1,21 @@
+#include "a.h"
+#ifndef _LIBCPP_VECTOR
+template <class, class _Allocator>
+class __vector_base {
+protected:
+  _Allocator __alloc() const;
+  __vector_base(_Allocator);
+};
+
+template <class _Tp, class _Allocator = allocator>
+class vector : __vector_base<_Tp, _Allocator> {
+public:
+  vector() noexcept(is_nothrow_default_constructible<_Allocator>::value);
+  vector(const vector &);
+  vector(vector &&)
+      noexcept(is_nothrow_move_constructible<_Allocator>::value);
+};
+
+#endif
+void GetUniquePtrType() { vector<char> v; }
+
diff --git a/test/Modules/Inputs/PR27401/module.modulemap b/test/Modules/Inputs/PR27401/module.modulemap
new file mode 100644
index 0000000000000..a0efadaa0eaf5
--- /dev/null
+++ b/test/Modules/Inputs/PR27401/module.modulemap
@@ -0,0 +1 @@
+module "b" { header "b.h" export * }
diff --git a/test/Modules/Inputs/PR27513/a.h b/test/Modules/Inputs/PR27513/a.h
new file mode 100644
index 0000000000000..7eecbf4773d8b
--- /dev/null
+++ b/test/Modules/Inputs/PR27513/a.h
@@ -0,0 +1,5 @@
+#include "b.h"
+
+inline void f() { basic_string<char> s; }
+
+#include "c.h"
diff --git a/test/Modules/Inputs/PR27513/b.h b/test/Modules/Inputs/PR27513/b.h
new file mode 100644
index 0000000000000..b514c1e3476af
--- /dev/null
+++ b/test/Modules/Inputs/PR27513/b.h
@@ -0,0 +1,3 @@
+#include "mystring.h"
+#include "b1.h"
+#include "b2.h"
diff --git a/test/Modules/Inputs/PR27513/b1.h b/test/Modules/Inputs/PR27513/b1.h
new file mode 100644
index 0000000000000..a12b29f71bf92
--- /dev/null
+++ b/test/Modules/Inputs/PR27513/b1.h
@@ -0,0 +1 @@
+#include "b11.h"
diff --git a/test/Modules/Inputs/PR27513/b11.h b/test/Modules/Inputs/PR27513/b11.h
new file mode 100644
index 0000000000000..e7bfaec9ce7ce
--- /dev/null
+++ b/test/Modules/Inputs/PR27513/b11.h
@@ -0,0 +1,2 @@
+#include "mystring.h"
+#include "b111.h"
diff --git a/test/Modules/Inputs/PR27513/b111.h b/test/Modules/Inputs/PR27513/b111.h
new file mode 100644
index 0000000000000..b7a63b5c2324b
--- /dev/null
+++ b/test/Modules/Inputs/PR27513/b111.h
@@ -0,0 +1,3 @@
+#include "mystring.h"
+#include "b1111.h"
+#include "b1112.h"
diff --git a/test/Modules/Inputs/PR27513/b1111.h b/test/Modules/Inputs/PR27513/b1111.h
new file mode 100644
index 0000000000000..3f9cf449b3792
--- /dev/null
+++ b/test/Modules/Inputs/PR27513/b1111.h
@@ -0,0 +1 @@
+#include "mystring.h"
diff --git a/test/Modules/Inputs/PR27513/b1112.h b/test/Modules/Inputs/PR27513/b1112.h
new file mode 100644
index 0000000000000..3f9cf449b3792
--- /dev/null
+++ b/test/Modules/Inputs/PR27513/b1112.h
@@ -0,0 +1 @@
+#include "mystring.h"
diff --git a/test/Modules/Inputs/PR27513/b2.h b/test/Modules/Inputs/PR27513/b2.h
new file mode 100644
index 0000000000000..3f9cf449b3792
--- /dev/null
+++ b/test/Modules/Inputs/PR27513/b2.h
@@ -0,0 +1 @@
+#include "mystring.h"
diff --git a/test/Modules/Inputs/PR27513/c.h b/test/Modules/Inputs/PR27513/c.h
new file mode 100644
index 0000000000000..3f9cf449b3792
--- /dev/null
+++ b/test/Modules/Inputs/PR27513/c.h
@@ -0,0 +1 @@
+#include "mystring.h"
diff --git a/test/Modules/Inputs/PR27513/module.modulemap b/test/Modules/Inputs/PR27513/module.modulemap
new file mode 100644
index 0000000000000..ee2a9ce4aa58d
--- /dev/null
+++ b/test/Modules/Inputs/PR27513/module.modulemap
@@ -0,0 +1,7 @@
+module "c.h" {header "c.h" export *}
+module "b2.h" { header "b2.h" export *}
+module "b.h" {header "b.h" export *}
+module "b111.h" { header "b111.h" export *}
+module "b11.h" { header "b11.h" export *}
+module "b1111.h" { header "b1111.h" export *}
+module "b1112.h" { header "b1112.h" export *}
diff --git a/test/Modules/Inputs/PR27513/mystring.h b/test/Modules/Inputs/PR27513/mystring.h
new file mode 100644
index 0000000000000..95680ed8837be
--- /dev/null
+++ b/test/Modules/Inputs/PR27513/mystring.h
@@ -0,0 +1,8 @@
+#ifndef _GLIBCXX_STRING
+#define _GLIBCXX_STRING
+template<typename> struct basic_string {
+  struct _Alloc_hider {} _M_dataplus;
+  ~basic_string() { _Alloc_hider h; } 
+};
+extern template class basic_string<char>;
+#endif
diff --git a/test/Modules/Inputs/PR27739/DataInputHandler.h b/test/Modules/Inputs/PR27739/DataInputHandler.h
new file mode 100644
index 0000000000000..1ef02ecb8da08
--- /dev/null
+++ b/test/Modules/Inputs/PR27739/DataInputHandler.h
@@ -0,0 +1,19 @@
+template < typename > struct vector {};
+
+#include <map>
+#include "Types.h"
+
+struct TString {
+   TString (char *);
+};
+
+struct TreeInfo {};
+
+class DataInputHandler {
+   void AddTree ();
+   void SignalTreeInfo () {
+      fInputTrees[(char*)""];
+   }
+   map <TString, vector <TreeInfo> >fInputTrees;
+   map <string, bool> fExplicitTrainTest;
+};
diff --git a/test/Modules/Inputs/PR27739/Types.h b/test/Modules/Inputs/PR27739/Types.h
new file mode 100644
index 0000000000000..6d458a8d98261
--- /dev/null
+++ b/test/Modules/Inputs/PR27739/Types.h
@@ -0,0 +1 @@
+#include <map>
diff --git a/test/Modules/Inputs/PR27739/map b/test/Modules/Inputs/PR27739/map
new file mode 100644
index 0000000000000..612685c9e5bd4
--- /dev/null
+++ b/test/Modules/Inputs/PR27739/map
@@ -0,0 +1,20 @@
+#ifndef _GLIBCXX_MAP
+#define _GLIBCXX_MAP
+struct basic_string {
+  basic_string(char *);
+} typedef string;
+
+template <typename> class D;
+template <typename _Elements> struct D {
+  _Elements _M_;
+  D(D &) = default;
+};
+
+template <typename _Elements> D<_Elements &&> forward_as_tuple(_Elements);
+
+template <typename _Key, typename _Tp> struct map {
+  _Tp operator[](_Key p1) {
+    auto b = &forward_as_tuple(p1);
+  }
+};
+#endif
diff --git a/test/Modules/Inputs/PR27739/module.modulemap b/test/Modules/Inputs/PR27739/module.modulemap
new file mode 100644
index 0000000000000..d611e8079e51b
--- /dev/null
+++ b/test/Modules/Inputs/PR27739/module.modulemap
@@ -0,0 +1,2 @@
+module "DataInputHandler.h" { header "DataInputHandler.h" export * }
+module "Types.h" { header "Types.h" export *}
diff --git a/test/Modules/Inputs/PR27754/RConversionRuleParser.h b/test/Modules/Inputs/PR27754/RConversionRuleParser.h
new file mode 100644
index 0000000000000..057dd14698a9c
--- /dev/null
+++ b/test/Modules/Inputs/PR27754/RConversionRuleParser.h
@@ -0,0 +1,4 @@
+#include "algobase.h"
+typedef integral_constant<bool, true> true_type;
+class _Rb_tree { _Rb_tree() { true_type(); } };
+#include "TSchemaType.h"
diff --git a/test/Modules/Inputs/PR27754/TMetaUtils.h b/test/Modules/Inputs/PR27754/TMetaUtils.h
new file mode 100644
index 0000000000000..835b7c6dc607e
--- /dev/null
+++ b/test/Modules/Inputs/PR27754/TMetaUtils.h
@@ -0,0 +1,2 @@
+#include "RConversionRuleParser.h"
+void fn1() { true_type(); }
diff --git a/test/Modules/Inputs/PR27754/TSchemaType.h b/test/Modules/Inputs/PR27754/TSchemaType.h
new file mode 100644
index 0000000000000..2c477931707b9
--- /dev/null
+++ b/test/Modules/Inputs/PR27754/TSchemaType.h
@@ -0,0 +1,2 @@
+#include "algobase.h"
+struct A : integral_constant<bool, true> {};
diff --git a/test/Modules/Inputs/PR27754/algobase.h b/test/Modules/Inputs/PR27754/algobase.h
new file mode 100644
index 0000000000000..f5e47d8dc7d3f
--- /dev/null
+++ b/test/Modules/Inputs/PR27754/algobase.h
@@ -0,0 +1,4 @@
+#ifndef _STL_ALGOBASE_H
+#define _STL_ALGOBASE_H
+template<typename _Tp, _Tp> struct integral_constant {};
+#endif
diff --git a/test/Modules/Inputs/PR27754/module.modulemap b/test/Modules/Inputs/PR27754/module.modulemap
new file mode 100644
index 0000000000000..90dcdbb92b4e4
--- /dev/null
+++ b/test/Modules/Inputs/PR27754/module.modulemap
@@ -0,0 +1,3 @@
+module "RConversionRuleParser.h" { header "RConversionRuleParser.h" }
+module "TMetaUtils.h" { header "TMetaUtils.h" }
+module "TSchemaType.h" { header "TSchemaType.h" }
diff --git a/test/Modules/Inputs/PR27890/a.h b/test/Modules/Inputs/PR27890/a.h
new file mode 100644
index 0000000000000..9c6e562116000
--- /dev/null
+++ b/test/Modules/Inputs/PR27890/a.h
@@ -0,0 +1,9 @@
+template <class DataType> DataType values(DataType) { __builtin_va_list ValueArgs; return DataType(); }
+
+template <class DataType>
+class opt {
+public:
+  template <class Mods>
+  opt(Mods) {}
+};
+
diff --git a/test/Modules/Inputs/PR27890/module.modulemap b/test/Modules/Inputs/PR27890/module.modulemap
new file mode 100644
index 0000000000000..85074e8078977
--- /dev/null
+++ b/test/Modules/Inputs/PR27890/module.modulemap
@@ -0,0 +1 @@
+module A { header "a.h" export * }
diff --git a/test/Modules/Inputs/UseAfterFree/UseAfterFreePrivate.h b/test/Modules/Inputs/UseAfterFree/UseAfterFreePrivate.h
new file mode 100644
index 0000000000000..e8ed9fda58745
--- /dev/null
+++ b/test/Modules/Inputs/UseAfterFree/UseAfterFreePrivate.h
@@ -0,0 +1 @@
+@import UseAfterFreePublic;
diff --git a/test/Modules/Inputs/UseAfterFree/UseAfterFreePublic.h b/test/Modules/Inputs/UseAfterFree/UseAfterFreePublic.h
new file mode 100644
index 0000000000000..d017b0f310b41
--- /dev/null
+++ b/test/Modules/Inputs/UseAfterFree/UseAfterFreePublic.h
@@ -0,0 +1,2 @@
+@interface S1
+@end
diff --git a/test/Modules/Inputs/UseAfterFree/module.map b/test/Modules/Inputs/UseAfterFree/module.map
new file mode 100644
index 0000000000000..35c0b3d230d53
--- /dev/null
+++ b/test/Modules/Inputs/UseAfterFree/module.map
@@ -0,0 +1,3 @@
+module UseAfterFreePublic {
+  header "UseAfterFreePublic.h"
+}
diff --git a/test/Modules/Inputs/UseAfterFree/module_private.map b/test/Modules/Inputs/UseAfterFree/module_private.map
new file mode 100644
index 0000000000000..75eec4162c4ec
--- /dev/null
+++ b/test/Modules/Inputs/UseAfterFree/module_private.map
@@ -0,0 +1,3 @@
+module UseAfterFreePrivate {
+  header "UseAfterFreePrivate.h"
+}
diff --git a/test/Modules/Inputs/builtin.h b/test/Modules/Inputs/builtin.h
index 7be90177d1948..4717ff2a52a28 100644
--- a/test/Modules/Inputs/builtin.h
+++ b/test/Modules/Inputs/builtin.h
@@ -1,3 +1,7 @@
 int i;
 int *p = &i;
 
+void use_constant_string_builtins(void) {
+  (void)__builtin___CFStringMakeConstantString("");
+  (void)__builtin___NSStringMakeConstantString("");
+}
diff --git a/test/Modules/Inputs/category_right.h b/test/Modules/Inputs/category_right.h
index 3c83624c76165..d8dedf888d15a 100644
--- a/test/Modules/Inputs/category_right.h
+++ b/test/Modules/Inputs/category_right.h
@@ -1,4 +1,5 @@
 @import category_top;
+#import "category_right_sub.h"
 
 @interface Foo(Right1)
 -(void)right1;
diff --git a/test/Modules/Inputs/crash-recovery/Frameworks/A.framework/Headers/A.h b/test/Modules/Inputs/crash-recovery/Frameworks/A.framework/Headers/A.h
new file mode 100644
index 0000000000000..49c9fe07531d8
--- /dev/null
+++ b/test/Modules/Inputs/crash-recovery/Frameworks/A.framework/Headers/A.h
@@ -0,0 +1 @@
+#include <B/B.h>
diff --git a/test/Modules/Inputs/crash-recovery/Frameworks/B.framework/Headers/B.h b/test/Modules/Inputs/crash-recovery/Frameworks/B.framework/Headers/B.h
new file mode 100644
index 0000000000000..761540b09cb33
--- /dev/null
+++ b/test/Modules/Inputs/crash-recovery/Frameworks/B.framework/Headers/B.h
@@ -0,0 +1 @@
+// B.h
diff --git a/test/Modules/Inputs/crash-recovery/Frameworks/B.framework/Modules/module.modulemap b/test/Modules/Inputs/crash-recovery/Frameworks/B.framework/Modules/module.modulemap
new file mode 100644
index 0000000000000..f6c6e7bf66812
--- /dev/null
+++ b/test/Modules/Inputs/crash-recovery/Frameworks/B.framework/Modules/module.modulemap
@@ -0,0 +1,5 @@
+framework module B [extern_c] {
+  umbrella header "B.h"
+  export *
+  module * { export * }
+}
diff --git a/test/Modules/Inputs/crash-recovery/Frameworks/I.framework/Headers/I.h b/test/Modules/Inputs/crash-recovery/Frameworks/I.framework/Headers/I.h
new file mode 100644
index 0000000000000..f98baaa7bd3f7
--- /dev/null
+++ b/test/Modules/Inputs/crash-recovery/Frameworks/I.framework/Headers/I.h
@@ -0,0 +1,2 @@
+
+#import <A/A.h>
diff --git a/test/Modules/Inputs/crash-recovery/Frameworks/I.framework/Modules/module.modulemap b/test/Modules/Inputs/crash-recovery/Frameworks/I.framework/Modules/module.modulemap
new file mode 100644
index 0000000000000..912d39ecf3be6
--- /dev/null
+++ b/test/Modules/Inputs/crash-recovery/Frameworks/I.framework/Modules/module.modulemap
@@ -0,0 +1,5 @@
+framework module I [extern_c] {
+  umbrella header "I.h"
+  export *
+  module * { export * }
+}
diff --git a/test/Modules/Inputs/crash-recovery/Frameworks/module.modulemap b/test/Modules/Inputs/crash-recovery/Frameworks/module.modulemap
new file mode 100644
index 0000000000000..0f6fcc0b5ada2
--- /dev/null
+++ b/test/Modules/Inputs/crash-recovery/Frameworks/module.modulemap
@@ -0,0 +1,2 @@
+framework module * [extern_c] {
+}
diff --git a/test/Modules/Inputs/crash-recovery/usr/include/module.map b/test/Modules/Inputs/crash-recovery/usr/include/module.map
new file mode 100644
index 0000000000000..9b429160db502
--- /dev/null
+++ b/test/Modules/Inputs/crash-recovery/usr/include/module.map
@@ -0,0 +1,16 @@
+module cstd [system] {
+  // Only in system headers directory
+  module stdio {
+    header "stdio.h"
+  }
+
+  module pthread {
+    header "pthread.h"
+    export *
+
+    module impl {
+      header "pthread_impl.h"
+      export *
+    }
+  }
+}
diff --git a/test/Modules/Inputs/crash-recovery/usr/include/pthread.h b/test/Modules/Inputs/crash-recovery/usr/include/pthread.h
new file mode 100644
index 0000000000000..44ac9fce9520e
--- /dev/null
+++ b/test/Modules/Inputs/crash-recovery/usr/include/pthread.h
@@ -0,0 +1 @@
+#include "pthread/pthread_impl.h"
diff --git a/test/Modules/Inputs/crash-recovery/usr/include/pthread/pthread_impl.h b/test/Modules/Inputs/crash-recovery/usr/include/pthread/pthread_impl.h
new file mode 100644
index 0000000000000..21a720a7c3cd1
--- /dev/null
+++ b/test/Modules/Inputs/crash-recovery/usr/include/pthread/pthread_impl.h
@@ -0,0 +1 @@
+#define _PTHREAD_MUTEX_SIG_init   0x32AAABA7
diff --git a/test/Modules/Inputs/crash-recovery/usr/include/pthread_impl.h b/test/Modules/Inputs/crash-recovery/usr/include/pthread_impl.h
new file mode 100644
index 0000000000000..21a720a7c3cd1
--- /dev/null
+++ b/test/Modules/Inputs/crash-recovery/usr/include/pthread_impl.h
@@ -0,0 +1 @@
+#define _PTHREAD_MUTEX_SIG_init   0x32AAABA7
diff --git a/test/Modules/Inputs/crash-recovery/usr/include/stdio.h b/test/Modules/Inputs/crash-recovery/usr/include/stdio.h
new file mode 100644
index 0000000000000..f41e09c35a412
--- /dev/null
+++ b/test/Modules/Inputs/crash-recovery/usr/include/stdio.h
@@ -0,0 +1,3 @@
+typedef struct { int id; } FILE;
+int fprintf(FILE*restrict, const char* restrict format, ...);
+extern FILE *__stderrp;
diff --git a/test/Modules/Inputs/crash-recovery/usr/include/tcl-private/header.h b/test/Modules/Inputs/crash-recovery/usr/include/tcl-private/header.h
new file mode 100644
index 0000000000000..0e8fb64a7121e
--- /dev/null
+++ b/test/Modules/Inputs/crash-recovery/usr/include/tcl-private/header.h
@@ -0,0 +1,2 @@
+// tcl-private/header.h
+#define TCL_PRIVATE 1
diff --git a/test/Modules/Inputs/cxx-decls-imported.h b/test/Modules/Inputs/cxx-decls-imported.h
index a4910fee753e7..0a172150fc145 100644
--- a/test/Modules/Inputs/cxx-decls-imported.h
+++ b/test/Modules/Inputs/cxx-decls-imported.h
@@ -50,3 +50,8 @@ namespace Alias = Aliased;
 
 struct InhCtorA { InhCtorA(int); };
 struct InhCtorB : InhCtorA { using InhCtorA::InhCtorA; };
+
+struct ClassWithVBases : HasFriends, virtual HasNontrivialDefaultConstructor {
+  int n;
+};
+struct ClassWithVBases;
diff --git a/test/Modules/Inputs/cxx-templates-common.h b/test/Modules/Inputs/cxx-templates-common.h
index a9ca624486718..8e730c8a852c5 100644
--- a/test/Modules/Inputs/cxx-templates-common.h
+++ b/test/Modules/Inputs/cxx-templates-common.h
@@ -53,4 +53,21 @@ template<typename T> struct WithAnonymousDecls {
   typedef int X;
 };
 
+namespace hidden_specializations {
+  template<typename T> void fn() {}
+
+  template<typename T> struct cls {
+    static void nested_fn() {}
+    struct nested_cls {};
+    static int nested_var;
+    enum class nested_enum {};
+
+    template<typename U> static void nested_fn_t() {}
+    template<typename U> struct nested_cls_t {};
+    template<typename U> static int nested_var_t;
+  };
+
+  template<typename T> int var;
+}
+
 #include "cxx-templates-textual.h"
diff --git a/test/Modules/Inputs/cxx-templates-unimported.h b/test/Modules/Inputs/cxx-templates-unimported.h
new file mode 100644
index 0000000000000..c2b6b915924b0
--- /dev/null
+++ b/test/Modules/Inputs/cxx-templates-unimported.h
@@ -0,0 +1,43 @@
+#include "cxx-templates-common.h"
+
+namespace hidden_specializations {
+  // explicit specializations
+  template<> void fn<int>() {}
+  template<> struct cls<int> {
+    void nested_fn();
+    struct nested_cls;
+    static int nested_var;
+    enum nested_enum : int;
+  };
+  template<> int var<int>;
+
+  // partial specializations
+  template<typename T> struct cls<T*> {
+    void nested_fn();
+    struct nested_cls;
+    static int nested_var;
+    enum nested_enum : int;
+  };
+  template<typename T> int var<T*>;
+
+  // member specializations
+  template<> void cls<void>::nested_fn() {}
+  template<> struct cls<void>::nested_cls {};
+  template<> int cls<void>::nested_var;
+  template<> enum class cls<void>::nested_enum { e };
+  template<> template<typename U> void cls<void>::nested_fn_t() {}
+  template<> template<typename U> struct cls<void>::nested_cls_t {};
+  template<> template<typename U> int cls<void>::nested_var_t;
+
+  // specializations instantiated here are ok if their pattern is
+  inline void use_stuff() {
+    fn<char>();
+    cls<char>();
+    (void)var<char>;
+    cls<char*>();
+    (void)var<char*>;
+    cls<void>::nested_fn_t<char>();
+    cls<void>::nested_cls_t<char>();
+    (void)cls<void>::nested_var_t<char>;
+  }
+}
diff --git a/test/Modules/Inputs/explicit-build/a.h b/test/Modules/Inputs/explicit-build/a.h
index 5e3602f58ffe7..a52f7357ff75a 100644
--- a/test/Modules/Inputs/explicit-build/a.h
+++ b/test/Modules/Inputs/explicit-build/a.h
@@ -1,4 +1,4 @@
-#if !__building_module(a)
+#if !__building_module(a) && !BUILDING_A_PCH
 #error "should only get here when building module a"
 #endif
 
diff --git a/test/Modules/Inputs/getSourceDescriptor-crash/h1.h b/test/Modules/Inputs/getSourceDescriptor-crash/h1.h
new file mode 100644
index 0000000000000..6f70f09beec22
--- /dev/null
+++ b/test/Modules/Inputs/getSourceDescriptor-crash/h1.h
@@ -0,0 +1 @@
+#pragma once
diff --git a/test/Modules/Inputs/getSourceDescriptor-crash/module.modulemap b/test/Modules/Inputs/getSourceDescriptor-crash/module.modulemap
new file mode 100644
index 0000000000000..2006ed5fde22a
--- /dev/null
+++ b/test/Modules/Inputs/getSourceDescriptor-crash/module.modulemap
@@ -0,0 +1,3 @@
+module foo {
+       header "h1.h"
+}
diff --git a/test/Modules/Inputs/merge-decl-context/a.h b/test/Modules/Inputs/merge-decl-context/a.h
index 89cc7120fd171..7be90b1535de2 100644
--- a/test/Modules/Inputs/merge-decl-context/a.h
+++ b/test/Modules/Inputs/merge-decl-context/a.h
@@ -21,4 +21,8 @@ inline A<int> ff(int i) {
   return fff<A<int>>(&i);
 }
 
+struct Aggregate {
+  int member;
+};
+
 #endif
diff --git a/test/Modules/Inputs/module.map b/test/Modules/Inputs/module.map
index 632517dd363fb..6846690c84986 100644
--- a/test/Modules/Inputs/module.map
+++ b/test/Modules/Inputs/module.map
@@ -215,6 +215,8 @@ module cxx_linkage_cache {
 
 module cxx_templates_common {
   header "cxx-templates-common.h"
+
+  explicit module unimported { header "cxx-templates-unimported.h" }
 }
 
 module cxx_templates_a {
@@ -393,3 +395,26 @@ module ElaboratedTypeStructs {
     header "elaborated-type-structs.h"
   }
 }
+
+// We import a module, then declare a method with selector stringValue in
+// MethodPoolCombined1.h. In MethodPoolCombined2.h, we import another module
+// that also contains a method for selector stringValue. We make sure that
+// the method pool entry for stringValue in this module is complete.
+module MethodPoolCombined {
+  header "MethodPoolCombined1.h"
+  header "MethodPoolCombined2.h"
+}
+
+module MethodPoolString1 {
+  header "MethodPoolString1.h"
+}
+
+module MethodPoolString2 {
+  header "MethodPoolString2.h"
+}
+
+module Empty {}
+
+module MacroFabs1 {
+  header "MacroFabs1.h"
+}
diff --git a/test/Modules/Inputs/non-module.h b/test/Modules/Inputs/non-module.h
new file mode 100644
index 0000000000000..c295900bd1084
--- /dev/null
+++ b/test/Modules/Inputs/non-module.h
@@ -0,0 +1,4 @@
+#ifndef NON_MODULE_H
+#define NON_MODULE_H
+
+#endif
diff --git a/test/Modules/Inputs/pch-import-module-with-macro.pch b/test/Modules/Inputs/pch-import-module-with-macro.pch
new file mode 100644
index 0000000000000..c06d7728b6202
--- /dev/null
+++ b/test/Modules/Inputs/pch-import-module-with-macro.pch
@@ -0,0 +1,3 @@
+
+@import MacroFabs1;
+
diff --git a/test/Modules/Inputs/suggest-include/empty.h b/test/Modules/Inputs/suggest-include/empty.h
new file mode 100644
index 0000000000000..e69de29bb2d1d
--- /dev/null
+++ b/test/Modules/Inputs/suggest-include/empty.h
diff --git a/test/Modules/Inputs/suggest-include/module.modulemap b/test/Modules/Inputs/suggest-include/module.modulemap
new file mode 100644
index 0000000000000..46afd7b2c218f
--- /dev/null
+++ b/test/Modules/Inputs/suggest-include/module.modulemap
@@ -0,0 +1,22 @@
+module X {
+  module Empty { header "empty.h" }
+
+  exclude header "textual1.h"
+  textual header "textual2.h"
+  textual header "textual3.h"
+
+  module A { header "usetextual1.h" }
+  module B { header "usetextual2.h" }
+  module C { header "usetextual3.h" }
+  module D { header "usetextual4.h" }
+  module E { header "usetextual5.h" }
+
+  module P { private header "private1.h" }
+  module Q { private header "private2.h" }
+  module R { private header "private3.h" }
+  module S { header "useprivate1.h" export * }
+  module T { header "useprivate3.h" }
+}
+
+module Other { textual header "textual4.h" }
+
diff --git a/test/Modules/Inputs/suggest-include/private1.h b/test/Modules/Inputs/suggest-include/private1.h
new file mode 100644
index 0000000000000..afc7ac71bb46d
--- /dev/null
+++ b/test/Modules/Inputs/suggest-include/private1.h
@@ -0,0 +1 @@
+extern int private1;
diff --git a/test/Modules/Inputs/suggest-include/private2.h b/test/Modules/Inputs/suggest-include/private2.h
new file mode 100644
index 0000000000000..24a1893d31eb4
--- /dev/null
+++ b/test/Modules/Inputs/suggest-include/private2.h
@@ -0,0 +1 @@
+extern int private2;
diff --git a/test/Modules/Inputs/suggest-include/private3.h b/test/Modules/Inputs/suggest-include/private3.h
new file mode 100644
index 0000000000000..26852af2a6a82
--- /dev/null
+++ b/test/Modules/Inputs/suggest-include/private3.h
@@ -0,0 +1 @@
+extern int private3;
diff --git a/test/Modules/Inputs/suggest-include/textual1.h b/test/Modules/Inputs/suggest-include/textual1.h
new file mode 100644
index 0000000000000..5b18bfb36856b
--- /dev/null
+++ b/test/Modules/Inputs/suggest-include/textual1.h
@@ -0,0 +1 @@
+#define FOO(X) X
diff --git a/test/Modules/Inputs/suggest-include/textual2.h b/test/Modules/Inputs/suggest-include/textual2.h
new file mode 100644
index 0000000000000..0c06d4ea454bd
--- /dev/null
+++ b/test/Modules/Inputs/suggest-include/textual2.h
@@ -0,0 +1 @@
+EXPAND_MACRO
diff --git a/test/Modules/Inputs/suggest-include/textual3.h b/test/Modules/Inputs/suggest-include/textual3.h
new file mode 100644
index 0000000000000..1e52521161033
--- /dev/null
+++ b/test/Modules/Inputs/suggest-include/textual3.h
@@ -0,0 +1 @@
+extern int textual3;
diff --git a/test/Modules/Inputs/suggest-include/textual4.h b/test/Modules/Inputs/suggest-include/textual4.h
new file mode 100644
index 0000000000000..091e0c0ac19dc
--- /dev/null
+++ b/test/Modules/Inputs/suggest-include/textual4.h
@@ -0,0 +1 @@
+extern int textual4;
diff --git a/test/Modules/Inputs/suggest-include/textual5.h b/test/Modules/Inputs/suggest-include/textual5.h
new file mode 100644
index 0000000000000..d808617d50003
--- /dev/null
+++ b/test/Modules/Inputs/suggest-include/textual5.h
@@ -0,0 +1 @@
+extern int textual5;
diff --git a/test/Modules/Inputs/suggest-include/useprivate1.h b/test/Modules/Inputs/suggest-include/useprivate1.h
new file mode 100644
index 0000000000000..817b900ecccf1
--- /dev/null
+++ b/test/Modules/Inputs/suggest-include/useprivate1.h
@@ -0,0 +1 @@
+#include "private1.h"
diff --git a/test/Modules/Inputs/suggest-include/useprivate3.h b/test/Modules/Inputs/suggest-include/useprivate3.h
new file mode 100644
index 0000000000000..5d5d221b87b83
--- /dev/null
+++ b/test/Modules/Inputs/suggest-include/useprivate3.h
@@ -0,0 +1 @@
+#include "private3.h"
diff --git a/test/Modules/Inputs/suggest-include/usetextual1.h b/test/Modules/Inputs/suggest-include/usetextual1.h
new file mode 100644
index 0000000000000..34ab1c76bccd2
--- /dev/null
+++ b/test/Modules/Inputs/suggest-include/usetextual1.h
@@ -0,0 +1,2 @@
+#include "textual1.h"
+FOO(extern int usetextual1;)
diff --git a/test/Modules/Inputs/suggest-include/usetextual2.h b/test/Modules/Inputs/suggest-include/usetextual2.h
new file mode 100644
index 0000000000000..95b2445736364
--- /dev/null
+++ b/test/Modules/Inputs/suggest-include/usetextual2.h
@@ -0,0 +1,2 @@
+#define EXPAND_MACRO extern int usetextual2;
+#include "textual2.h"
diff --git a/test/Modules/Inputs/suggest-include/usetextual3.h b/test/Modules/Inputs/suggest-include/usetextual3.h
new file mode 100644
index 0000000000000..15a75cc839f84
--- /dev/null
+++ b/test/Modules/Inputs/suggest-include/usetextual3.h
@@ -0,0 +1 @@
+#include "textual3.h"
diff --git a/test/Modules/Inputs/suggest-include/usetextual4.h b/test/Modules/Inputs/suggest-include/usetextual4.h
new file mode 100644
index 0000000000000..395bb6fd65b34
--- /dev/null
+++ b/test/Modules/Inputs/suggest-include/usetextual4.h
@@ -0,0 +1 @@
+#include "textual4.h"
diff --git a/test/Modules/Inputs/suggest-include/usetextual5.h b/test/Modules/Inputs/suggest-include/usetextual5.h
new file mode 100644
index 0000000000000..a7335d37aeb8f
--- /dev/null
+++ b/test/Modules/Inputs/suggest-include/usetextual5.h
@@ -0,0 +1 @@
+#include "textual5.h"
diff --git a/test/Modules/Inputs/typo.h b/test/Modules/Inputs/typo.h
new file mode 100644
index 0000000000000..764c00b7c387f
--- /dev/null
+++ b/test/Modules/Inputs/typo.h
@@ -0,0 +1,6 @@
+@import Empty;
+
+@interface NSString
++ (id)alloc;
+@end
+
diff --git a/test/Modules/ModuleDebugInfo.cpp b/test/Modules/ModuleDebugInfo.cpp
index bbe36cb225d0a..998d36327ef5d 100644
--- a/test/Modules/ModuleDebugInfo.cpp
+++ b/test/Modules/ModuleDebugInfo.cpp
@@ -8,7 +8,6 @@
 // RUN: %clang_cc1 -triple %itanium_abi_triple -x objective-c++ -std=c++11 -debug-info-kind=limited -fmodules -fmodule-format=obj -fimplicit-module-maps -DMODULES -fmodules-cache-path=%t %s -I %S/Inputs -I %t -emit-llvm -o %t.ll -mllvm -debug-only=pchcontainer &>%t-mod.ll
 // RUN: cat %t-mod.ll | FileCheck %s
 // RUN: cat %t-mod.ll | FileCheck --check-prefix=CHECK-NEG %s
-// RUN: cat %t-mod.ll | FileCheck --check-prefix=CHECK-DWO %s
 
 // PCH:
 // RUN: %clang_cc1 -triple %itanium_abi_triple -x c++ -std=c++11 -emit-pch -fmodule-format=obj -I %S/Inputs -o %t.pch %S/Inputs/DebugCXX.h -mllvm -debug-only=pchcontainer &>%t-pch.ll
@@ -21,25 +20,66 @@
 
 // CHECK: distinct !DICompileUnit(language: DW_LANG_{{.*}}C_plus_plus,
 // CHECK-SAME:                    isOptimized: false,
-// CHECK-SAME-NOT:                splitDebugFilename:
-// CHECK-DWO:                     dwoId:
+// CHECK-NOT:                     splitDebugFilename:
+// CHECK-SAME:                    dwoId:
 
 // CHECK: !DICompositeType(tag: DW_TAG_enumeration_type, name: "Enum"
 // CHECK-SAME:             identifier: "_ZTSN8DebugCXX4EnumE")
 // CHECK: !DINamespace(name: "DebugCXX"
 
+// CHECK: !DICompositeType(tag: DW_TAG_enumeration_type,
+// CHECK-NOT:              name:
+// CHECK-SAME:             )
+
+// CHECK: !DICompositeType(tag: DW_TAG_enumeration_type,
+// CHECK-NOT:              name:
+// CHECK-SAME:             )
+
+// CHECK: !DICompositeType(tag: DW_TAG_enumeration_type,
+// CHECK-NOT:              name:
+// CHECK-SAME:             identifier: "_ZTS11TypedefEnum")
+
+// CHECK: !DICompositeType(tag: DW_TAG_enumeration_type,
+// CHECK-NOT:              name:
+// CHECK-SAME:             )
+// CHECK: !DIEnumerator(name: "e5", value: 5)
+
+// CHECK: !DIDerivedType(tag: DW_TAG_typedef, name: "B",
+// no mangled name here yet.
+
+// This type is anchored by a function parameter.
+// CHECK: !DICompositeType(tag: DW_TAG_class_type, name: "A<void>"
+// CHECK-SAME:             templateParams:
+// CHECK-SAME:             identifier: "_ZTSN8DebugCXX1AIJvEEE")
+
 // CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "Struct"
 // CHECK-SAME:             identifier: "_ZTSN8DebugCXX6StructE")
 
+// This type is anchored by an explicit template instantiation.
 // CHECK: !DICompositeType(tag: DW_TAG_class_type,
 // CHECK-SAME:             name: "Template<int, DebugCXX::traits<int> >"
+// CHECK-SAME:             templateParams:
 // CHECK-SAME:             identifier: "_ZTSN8DebugCXX8TemplateIiNS_6traitsIiEEEE")
 
-// CHECK: !DICompositeType(tag: DW_TAG_class_type, name: "A<void>"
-// CHECK-SAME:             identifier: "_ZTSN8DebugCXX1AIJvEEE")
+// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "traits<int>"
+// CHECK-SAME:             flags: DIFlagFwdDecl
+// CHECK-SAME:             identifier: "_ZTSN8DebugCXX6traitsIiEE")
+
+// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "traits<float>"
+// CHECK-SAME:             templateParams:
+// CHECK-SAME:             identifier: "_ZTSN8DebugCXX6traitsIfEE")
+
+// CHECK: !DICompositeType(tag: DW_TAG_class_type,
+// CHECK-SAME:             name: "Template<long, DebugCXX::traits<long> >"
+// CHECK-SAME:             templateParams:
+// CHECK-SAME:             identifier: "_ZTSN8DebugCXX8TemplateIlNS_6traitsIlEEEE")
+
+// CHECK: !DIDerivedType(tag: DW_TAG_typedef, name: "FloatInstantiation"
+// no mangled name here yet.
 
 // CHECK: !DICompositeType(tag: DW_TAG_class_type,
 // CHECK-SAME:             name: "Template<float, DebugCXX::traits<float> >"
+// CHECK-SAME:             flags: DIFlagFwdDecl
 // CHECK-SAME:             identifier: "_ZTSN8DebugCXX8TemplateIfNS_6traitsIfEEEE")
 
 // CHECK: !DICompositeType(tag: DW_TAG_class_type, name: "FwdVirtual"
@@ -47,10 +87,52 @@
 // CHECK-SAME:             identifier: "_ZTS10FwdVirtual")
 // CHECK: !DIDerivedType(tag: DW_TAG_member, name: "_vptr$FwdVirtual"
 
-// CHECK: !DIDerivedType(tag: DW_TAG_typedef, name: "FloatInstatiation"
-// no mangled name here yet.
+// CHECK: !DICompositeType(tag: DW_TAG_union_type,
+// CHECK-NOT:              name:
+// CHECK-SAME:             identifier: "_ZTS12TypedefUnion")
 
-// CHECK: !DIDerivedType(tag: DW_TAG_typedef, name: "B",
-// no mangled name here yet.
+// CHECK: !DICompositeType(tag: DW_TAG_structure_type,
+// CHECK-NOT:              name:
+// CHECK-SAME:             identifier: "_ZTS13TypedefStruct")
+
+// CHECK: !DICompositeType(tag: DW_TAG_union_type,
+// CHECK-NOT:              name:
+// CHECK-SAME:             )
+
+// CHECK: !DICompositeType(tag: DW_TAG_structure_type,
+// CHECK-NOT:              name:
+// CHECK-SAME:             )
+
+// CHECK: !DICompositeType(tag: DW_TAG_structure_type,
+// CHECK-SAME:             name: "InAnonymousNamespace",
+// CHECK-SAME:             elements: !{{[0-9]+}})
+
+// CHECK: ![[DERIVED:.*]] = {{.*}}!DICompositeType(tag: DW_TAG_class_type, name: "Derived",
+// CHECK-SAME:                                     identifier: "_ZTS7Derived")
+// CHECK: !DICompositeType(tag: DW_TAG_class_type, name: "B", scope: ![[DERIVED]],
+// CHECK-SAME:             elements: ![[B_MBRS:.*]], vtableHolder:
+// CHECK: ![[B_MBRS]] = !{{{.*}}, ![[GET_PARENT:.*]]}
+// CHECK: ![[GET_PARENT]] = !DISubprogram(name: "getParent"
+
+// CHECK: !DIDerivedType(tag: DW_TAG_typedef, name: "TypedefTemplate",
+// CHECK-SAME:           baseType: ![[BASE:.*]])
+// CHECK: ![[BASE]] = !DICompositeType(tag: DW_TAG_class_type,
+// CHECK-SAME:                         name: "Template1<void *>",
+// CHECK-SAME:                         flags: DIFlagFwdDecl,
+// CHECK-SAME:                         identifier: "_ZTS9Template1IPvE")
+
+// Explicit instatiation.
+// CHECK: !DICompositeType(tag: DW_TAG_class_type, name: "Template1<int>",
+// CHECK-SAME:             templateParams:
+// CHECK-SAME:             identifier: "_ZTS9Template1IiE")
+
+// CHECK: !DICompositeType(tag: DW_TAG_class_type, name: "FwdDeclTemplate<int>",
+// CHECK-SAME:             flags: DIFlagFwdDecl
+// CHECK-SAME:             identifier: "_ZTS15FwdDeclTemplateIiE")
+
+// Forward-declared member of a template.
+// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "Member",
+// CHECK-SAME:             flags: DIFlagFwdDecl
+// CHECK-SAME:             identifier: "_ZTSN21FwdDeclTemplateMemberIiE6MemberE")
 
 // CHECK-NEG-NOT: !DICompositeType(tag: DW_TAG_structure_type, name: "PureForwardDecl"
diff --git a/test/Modules/ModuleDebugInfo.m b/test/Modules/ModuleDebugInfo.m
index 0974f38cc22e9..ce35c7c8361ff 100644
--- a/test/Modules/ModuleDebugInfo.m
+++ b/test/Modules/ModuleDebugInfo.m
@@ -10,13 +10,14 @@
 // RUN:   -I %S/Inputs -I %t -emit-llvm -o %t.ll \
 // RUN:   -mllvm -debug-only=pchcontainer &>%t-mod.ll
 // RUN: cat %t-mod.ll | FileCheck %s
-// RUN: cat %t-mod.ll | FileCheck %s --check-prefix=MODULE-CHECK
+// RUN: cat %t-mod.ll | FileCheck %s --check-prefix=CHECK2
 
 // PCH:
 // RUN: %clang_cc1 -x objective-c -emit-pch -fmodule-format=obj -I %S/Inputs \
 // RUN:   -o %t.pch %S/Inputs/DebugObjC.h \
 // RUN:   -mllvm -debug-only=pchcontainer &>%t-pch.ll
 // RUN: cat %t-pch.ll | FileCheck %s
+// RUN: cat %t-pch.ll | FileCheck %s --check-prefix=CHECK2
 
 #ifdef MODULES
 @import DebugObjC;
@@ -24,28 +25,69 @@
 
 // CHECK: distinct !DICompileUnit(language: DW_LANG_ObjC
 // CHECK-SAME:                    isOptimized: false,
-// CHECK: !DICompositeType(tag: DW_TAG_structure_type,
-// CHECK-SAME:             name: "FwdDecl",
-// CHECK: !DICompositeType(tag: DW_TAG_structure_type,
-// CHECK-SAME:             name: "ObjCClass",
-// CHECK: !DIObjCProperty(name: "property",
-// CHECK: !DIDerivedType(tag: DW_TAG_member, name: "ivar"
-// CHECK: !DIDerivedType(tag: DW_TAG_typedef, name: "InnerEnum"
-// CHECK: !DISubprogram(name: "+[ObjCClass classMethod]"
-// CHECK: !DISubprogram(name: "-[ObjCClass instanceMethodWithInt:]"
-// CHECK: !DISubprogram(name: "-[Category(Category) categoryMethod]"
-
-// MODULE-CHECK: !DICompositeType(tag: DW_TAG_enumeration_type,
-// MODULE-CHECK-SAME:             scope: ![[MODULE:[0-9]+]],
-// MODULE-CHECK: ![[MODULE]] = !DIModule(scope: null, name: "DebugObjC"
-// MODULE-CHECK: !DICompositeType(tag: DW_TAG_structure_type,
-// MODULE-CHECK-SAME:             name: "FwdDecl",
-// MODULE-CHECK-SAME:             scope: ![[MODULE]],
-// MODULE-CHECK: !DICompositeType(tag: DW_TAG_structure_type,
-// MODULE-CHECK-SAME:             name: "ObjCClass",
-// MODULE-CHECK-SAME:             scope: ![[MODULE]],
-// MODULE-CHECK: !DISubprogram(name: "+[ObjCClass classMethod]",
-// MODULE-CHECK-SAME:          scope: ![[MODULE]],
+
+// CHECK: !DICompositeType(tag: DW_TAG_enumeration_type,
+// CHECK-SAME:             scope: ![[MODULE:[0-9]+]],
+// CHECK: ![[MODULE]] = !DIModule(scope: null, name: "DebugObjC
+
+// CHECK: ![[TD_ENUM:.*]] = !DICompositeType(tag: DW_TAG_enumeration_type,
+// CHECK-NOT:              name:
+// CHECK-SAME:             elements:
+
+// CHECK: !DISubprogram(name: "+[ObjCClass classMethod]",
+// CHECK-SAME:          scope: ![[MODULE]],
+
+// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "ObjCClass",
+// CHECK-SAME:             scope: ![[MODULE]],
+// CHECK-SAME:             elements
 
 // The forward declaration should not be in the module scope.
-// MODULE-CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "OpaqueData", file
+// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "OpaqueData", file
+
+// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "FwdDecl",
+// CHECK-SAME:             scope: ![[MODULE]],
+
+// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "ObjCClassWithPrivateIVars",
+// CHECK-SAME:             scope: ![[MODULE]],
+// CHECK-SAME:             elements
+
+// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "FwdDeclared"
+// CHECK-SAME:             elements:
+
+// CHECK: ![[TD_UNION:.*]] = distinct !DICompositeType(tag: DW_TAG_union_type,
+// CHECK-NOT:              name:
+// CHECK-SAME:             elements:
+
+// CHECK: !DIDerivedType(tag: DW_TAG_typedef, name: "TypedefUnion",
+// CHECK-SAME:           baseType: ![[TD_UNION]])
+
+// CHECK: !DIDerivedType(tag: DW_TAG_typedef, name: "TypedefEnum",
+// CHECK-SAME:           baseType: ![[TD_ENUM:.*]])
+
+// CHECK: ![[TD_STRUCT:.*]] = distinct !DICompositeType(tag: DW_TAG_structure_type,
+// CHECK-NOT:              name:
+// CHECK-SAME:             elements:
+// CHECK: !DIDerivedType(tag: DW_TAG_typedef, name: "TypedefStruct",
+// CHECK-SAME:           baseType: ![[TD_STRUCT]])
+
+// CHECK: !DICompositeType(tag: DW_TAG_union_type,
+// CHECK-NOT:              name:
+// CHECK-SAME:             )
+
+// CHECK: !DICompositeType(tag: DW_TAG_structure_type,
+// CHECK-NOT:              name:
+// CHECK-SAME:             )
+
+// CHECK-NEG-NOT: !DICompositeType(tag: DW_TAG_structure_type, name: "PureForwardDecl"
+
+// The output order is sublty different for module vs. pch,
+// so these are checked separately:
+//
+// CHECK2: !DISubprogram(name: "+[ObjCClass classMethod]"
+// CHECK2: !DISubprogram(name: "-[ObjCClass instanceMethodWithInt:]"
+// CHECK2: !DICompositeType(tag: DW_TAG_structure_type, name: "ObjCClass",
+// CHECK2: !DIObjCProperty(name: "property",
+// CHECK2: !DIDerivedType(tag: DW_TAG_member, name: "ivar"
+// CHECK2: !DISubprogram(name: "-[Category(Category) categoryMethod]"
+// CHECK2: !DICompositeType(tag: DW_TAG_structure_type, name: "FwdDecl",
+// CHECK2: !DIDerivedType(tag: DW_TAG_typedef, name: "InnerEnum"
diff --git a/test/Modules/builtins.m b/test/Modules/builtins.m
index 33d23979ce79e..2480e6379cc85 100644
--- a/test/Modules/builtins.m
+++ b/test/Modules/builtins.m
@@ -1,10 +1,26 @@
-@import builtin;
+// RUN: rm -rf %t
+// RUN: %clang_cc1 -fmodules-cache-path=%t -fmodules -fimplicit-module-maps -I %S/Inputs %s -verify
+// RUN: %clang_cc1 -fmodules-cache-path=%t -fmodules -fimplicit-module-maps -I %S/Inputs -x c %s -verify
+// RUN: %clang_cc1 -fmodules-cache-path=%t -fmodules -fimplicit-module-maps -I %S/Inputs -x objective-c++ %s -verify
+
+// RUN: rm -rf %t.pch.cache
+// RUN: %clang_cc1 -fmodules-cache-path=%t.pch.cache -fmodules -fimplicit-module-maps -I %S/Inputs -emit-pch -o %t.pch -x objective-c-header %S/Inputs/use-builtin.h
+// RUN: %clang_cc1 -fmodules-cache-path=%t.pch.cache -fmodules -fimplicit-module-maps -I %S/Inputs %s -include-pch %t.pch %s -verify
+
+// expected-no-diagnostics
+
+void use_constant_string_builtins1(void) {
+  (void)__builtin___CFStringMakeConstantString("");
+  (void)__builtin___NSStringMakeConstantString("");
+}
+
+#include "builtin.h"
 
 int foo() {
   return __builtin_object_size(p, 0);
 }
 
-@import builtin.sub;
+#include "builtin_sub.h"
 
 int bar() {
   return __builtin_object_size(p, 0);
@@ -14,11 +30,7 @@ int baz() {
   return IS_CONST(0);
 }
 
-// RUN: rm -rf %t
-// RUN: %clang_cc1 -fmodules-cache-path=%t -fmodules -fimplicit-module-maps -I %S/Inputs %s -verify
-
-// RUN: rm -rf %t.pch.cache
-// RUN: %clang_cc1 -fmodules-cache-path=%t.pch.cache -fmodules -fimplicit-module-maps -I %S/Inputs -emit-pch -o %t.pch -x objective-c-header %S/Inputs/use-builtin.h
-// RUN: %clang_cc1 -fmodules-cache-path=%t.pch.cache -fmodules -fimplicit-module-maps -I %S/Inputs %s -include-pch %t.pch %s -verify
-
-// expected-no-diagnostics
+void use_constant_string_builtins2(void) {
+  (void)__builtin___CFStringMakeConstantString("");
+  (void)__builtin___NSStringMakeConstantString("");
+}
diff --git a/test/Modules/crash-vfs-path-emptydir-entries.m b/test/Modules/crash-vfs-path-emptydir-entries.m
new file mode 100644
index 0000000000000..01560984ff710
--- /dev/null
+++ b/test/Modules/crash-vfs-path-emptydir-entries.m
@@ -0,0 +1,49 @@
+// REQUIRES: crash-recovery, shell
+
+// FIXME: This XFAIL is cargo-culted from crash-report.c. Do we need it?
+// XFAIL: mingw32
+
+// Test clang can collect symbolic link headers used in modules.
+// crash reproducer if there's a symbolic link header file used in a module.
+
+// RUN: rm -rf %t
+// RUN: mkdir -p %t/i %t/m %t %t/sysroot
+// RUN: cp -a %S/Inputs/crash-recovery/usr %t/i/
+
+// RUN: not env FORCE_CLANG_DIAGNOSTICS_CRASH= TMPDIR=%t TEMP=%t TMP=%t \
+// RUN: %clang -fsyntax-only %s -I %/t/i -isysroot %/t/sysroot/ \
+// RUN:     -fmodules -fmodules-cache-path=%t/m/ 2>&1 | FileCheck %s
+
+// RUN: FileCheck --check-prefix=CHECKSRC %s -input-file %t/crash-vfs-*.m
+// RUN: FileCheck --check-prefix=CHECKSH %s -input-file %t/crash-vfs-*.sh
+// RUN: FileCheck --check-prefix=CHECKYAML %s -input-file \
+// RUN: %t/crash-vfs-*.cache/vfs/vfs.yaml
+// RUN: find %t/crash-vfs-*.cache/vfs | \
+// RUN:   grep "usr/include/stdio.h" | count 1
+
+#include "usr/include/stdio.h"
+
+// CHECK: Preprocessed source(s) and associated run script(s) are located at:
+// CHECK-NEXT: note: diagnostic msg: {{.*}}.m
+// CHECK-NEXT: note: diagnostic msg: {{.*}}.cache
+
+// CHECKSRC: @import cstd.stdio;
+
+// CHECKSH: # Crash reproducer
+// CHECKSH-NEXT: # Driver args: "-fsyntax-only"
+// CHECKSH-NEXT: # Original command: {{.*$}}
+// CHECKSH-NEXT: "-cc1"
+// CHECKSH: "-isysroot" "{{[^"]*}}/sysroot/"
+// CHECKSH-NOT: "-fmodules-cache-path="
+// CHECKSH: "crash-vfs-{{[^ ]*}}.m"
+// CHECKSH: "-ivfsoverlay" "crash-vfs-{{[^ ]*}}.cache/vfs/vfs.yaml"
+// CHECKSH: "-fmodules-cache-path=crash-vfs-{{[^ ]*}}.cache/modules"
+
+// CHECKYAML: 'type': 'directory',
+// CHECKYAML: 'name': "",
+// CHECKYAML-NEXT: 'contents': [
+// CHECKYAML-NEXT:   {
+// CHECKYAML-NEXT:     'type': 'file',
+// CHECKYAML-NEXT:     'name': "pthread_impl.h",
+// CHECKYAML-NEXT:     'external-contents': "/{{.*}}/i/usr/include/pthread_impl.h"
+// CHECKYAML-NEXT:   },
diff --git a/test/Modules/crash-vfs-path-symlink-component.m b/test/Modules/crash-vfs-path-symlink-component.m
new file mode 100644
index 0000000000000..6a41188b271a1
--- /dev/null
+++ b/test/Modules/crash-vfs-path-symlink-component.m
@@ -0,0 +1,68 @@
+// REQUIRES: crash-recovery, shell
+
+// FIXME: This XFAIL is cargo-culted from crash-report.c. Do we need it?
+// XFAIL: mingw32
+
+// Test that clang is capable of collecting the right header files in the
+// crash reproducer if there's a symbolic link component in the path.
+
+// RUN: rm -rf %t
+// RUN: mkdir -p %t/i %t/m %t %t/sysroot
+// RUN: cp -a %S/Inputs/crash-recovery/usr %t/i/
+// RUN: ln -s include/tcl-private %t/i/usr/x
+
+// RUN: not env FORCE_CLANG_DIAGNOSTICS_CRASH= TMPDIR=%t TEMP=%t TMP=%t \
+// RUN: %clang -fsyntax-only %s -I %/t/i -isysroot %/t/sysroot/ \
+// RUN:     -fmodules -fmodules-cache-path=%t/m/ 2>&1 | FileCheck %s
+
+// RUN: FileCheck --check-prefix=CHECKSRC %s -input-file %t/crash-vfs-*.m
+// RUN: FileCheck --check-prefix=CHECKSH %s -input-file %t/crash-vfs-*.sh
+// RUN: FileCheck --check-prefix=CHECKYAML %s -input-file \
+// RUN: %t/crash-vfs-*.cache/vfs/vfs.yaml
+// RUN: find %t/crash-vfs-*.cache/vfs | \
+// RUN:   grep "usr/include/stdio.h" | count 1
+
+#include "usr/x/../stdio.h"
+
+// CHECK: Preprocessed source(s) and associated run script(s) are located at:
+// CHECK-NEXT: note: diagnostic msg: {{.*}}.m
+// CHECK-NEXT: note: diagnostic msg: {{.*}}.cache
+
+// CHECKSRC: @import cstd.stdio;
+
+// CHECKSH: # Crash reproducer
+// CHECKSH-NEXT: # Driver args: "-fsyntax-only"
+// CHECKSH-NEXT: # Original command: {{.*$}}
+// CHECKSH-NEXT: "-cc1"
+// CHECKSH: "-isysroot" "{{[^"]*}}/sysroot/"
+// CHECKSH-NOT: "-fmodules-cache-path="
+// CHECKSH: "crash-vfs-{{[^ ]*}}.m"
+// CHECKSH: "-ivfsoverlay" "crash-vfs-{{[^ ]*}}.cache/vfs/vfs.yaml"
+// CHECKSH: "-fmodules-cache-path=crash-vfs-{{[^ ]*}}.cache/modules"
+
+// CHECKYAML: 'case-sensitive':
+// CHECKYAML-NEXT: 'use-external-names': 'false',
+// CHECKYAML-NEXT: 'overlay-relative': 'true',
+
+// CHECKYAML: 'type': 'directory'
+// CHECKYAML: 'name': "/[[PATH:.*]]/i/usr",
+// CHECKYAML-NEXT: 'contents': [
+// CHECKYAML-NEXT:   {
+// CHECKYAML-NEXT:     'type': 'file',
+// CHECKYAML-NEXT:     'name': "module.map",
+// CHECKYAML-NEXT:     'external-contents': "/[[PATH]]/i/usr/include/module.map"
+// CHECKYAML-NEXT:   },
+
+// Test that by using the previous generated YAML file clang is able to find the
+// right files inside the overlay and map the virtual request for a path that
+// previously contained a symlink to work. To make sure of this, wipe out the
+// %/t/i directory containing the symlink component.
+
+// RUN: rm -rf %/t/i
+// RUN: unset FORCE_CLANG_DIAGNOSTICS_CRASH
+// RUN: %clang -E %s -I %/t/i -isysroot %/t/sysroot/ \
+// RUN:     -ivfsoverlay %t/crash-vfs-*.cache/vfs/vfs.yaml -fmodules \
+// RUN:     -fmodules-cache-path=%t/m/ 2>&1 \
+// RUN:     | FileCheck %s --check-prefix=CHECKOVERLAY
+
+// CHECKOVERLAY: @import cstd.stdio; /* clang -E: implicit import for "/{{[^ ].*}}/i/usr/x/../stdio.h" */
diff --git a/test/Modules/crash-vfs-path-symlink-topheader.m b/test/Modules/crash-vfs-path-symlink-topheader.m
new file mode 100644
index 0000000000000..72b666a8127cd
--- /dev/null
+++ b/test/Modules/crash-vfs-path-symlink-topheader.m
@@ -0,0 +1,51 @@
+// REQUIRES: crash-recovery, shell
+
+// FIXME: This XFAIL is cargo-culted from crash-report.c. Do we need it?
+// XFAIL: mingw32
+
+// Test clang can collect symbolic link headers used in modules.
+// crash reproducer if there's a symbolic link header file used in a module.
+
+// RUN: rm -rf %t
+// RUN: mkdir -p %t/i %t/m %t %t/sysroot
+// RUN: cp -a %S/Inputs/crash-recovery/usr %t/i/
+// RUN: rm -f %t/i/usr/include/pthread_impl.h
+// RUN: ln -s pthread/pthread_impl.h %t/i/usr/include/pthread_impl.h
+
+// RUN: not env FORCE_CLANG_DIAGNOSTICS_CRASH= TMPDIR=%t TEMP=%t TMP=%t \
+// RUN: %clang -fsyntax-only %s -I %/t/i -isysroot %/t/sysroot/ \
+// RUN:     -fmodules -fmodules-cache-path=%t/m/ 2>&1 | FileCheck %s
+
+// RUN: FileCheck --check-prefix=CHECKSRC %s -input-file %t/crash-vfs-*.m
+// RUN: FileCheck --check-prefix=CHECKSH %s -input-file %t/crash-vfs-*.sh
+// RUN: FileCheck --check-prefix=CHECKYAML %s -input-file \
+// RUN: %t/crash-vfs-*.cache/vfs/vfs.yaml
+// RUN: find %t/crash-vfs-*.cache/vfs | \
+// RUN:   grep "usr/include/pthread_impl.h" | count 1
+
+#include "usr/include/stdio.h"
+
+// CHECK: Preprocessed source(s) and associated run script(s) are located at:
+// CHECK-NEXT: note: diagnostic msg: {{.*}}.m
+// CHECK-NEXT: note: diagnostic msg: {{.*}}.cache
+
+// CHECKSRC: @import cstd.stdio;
+
+// CHECKSH: # Crash reproducer
+// CHECKSH-NEXT: # Driver args: "-fsyntax-only"
+// CHECKSH-NEXT: # Original command: {{.*$}}
+// CHECKSH-NEXT: "-cc1"
+// CHECKSH: "-isysroot" "{{[^"]*}}/sysroot/"
+// CHECKSH-NOT: "-fmodules-cache-path="
+// CHECKSH: "crash-vfs-{{[^ ]*}}.m"
+// CHECKSH: "-ivfsoverlay" "crash-vfs-{{[^ ]*}}.cache/vfs/vfs.yaml"
+// CHECKSH: "-fmodules-cache-path=crash-vfs-{{[^ ]*}}.cache/modules"
+
+// CHECKYAML: 'type': 'directory',
+// CHECKYAML: 'name': "",
+// CHECKYAML-NEXT: 'contents': [
+// CHECKYAML-NEXT:   {
+// CHECKYAML-NEXT:     'type': 'file',
+// CHECKYAML-NEXT:     'name': "pthread_impl.h",
+// CHECKYAML-NEXT:     'external-contents': "/{{.*}}/i/usr/include/pthread_impl.h"
+// CHECKYAML-NEXT:   },
diff --git a/test/Modules/crash-vfs-path-traversal.m b/test/Modules/crash-vfs-path-traversal.m
new file mode 100644
index 0000000000000..3377de47d0b2e
--- /dev/null
+++ b/test/Modules/crash-vfs-path-traversal.m
@@ -0,0 +1,65 @@
+// REQUIRES: crash-recovery, shell, non-ms-sdk, non-ps4-sdk
+
+// FIXME: Canonicalizing paths to remove relative traversal components
+// currenty fails a unittest on windows and is disable by default.
+// FIXME: This XFAIL is cargo-culted from crash-report.c. Do we need it?
+// XFAIL: mingw32
+
+// RUN: rm -rf %t
+// RUN: mkdir -p %t/i %t/m %t
+
+// RUN: not env FORCE_CLANG_DIAGNOSTICS_CRASH= TMPDIR=%t TEMP=%t TMP=%t \
+// RUN: %clang -fsyntax-only %s -I %S/Inputs/crash-recovery -isysroot %/t/i/    \
+// RUN: -fmodules -fmodules-cache-path=%t/m/ 2>&1 | FileCheck %s
+
+// RUN: FileCheck --check-prefix=CHECKSRC %s -input-file %t/crash-vfs-*.m
+// RUN: FileCheck --check-prefix=CHECKSH %s -input-file %t/crash-vfs-*.sh
+// RUN: FileCheck --check-prefix=CHECKYAML %s -input-file \
+// RUN: %t/crash-vfs-*.cache/vfs/vfs.yaml
+// RUN: find %t/crash-vfs-*.cache/vfs | \
+// RUN:   grep "Inputs/crash-recovery/usr/include/stdio.h" | count 1
+
+#include "usr/././//////include/../include/./././../include/stdio.h"
+
+// CHECK: Preprocessed source(s) and associated run script(s) are located at:
+// CHECK-NEXT: note: diagnostic msg: {{.*}}.m
+// CHECK-NEXT: note: diagnostic msg: {{.*}}.cache
+
+// CHECKSRC: @import cstd.stdio;
+
+// CHECKSH: # Crash reproducer
+// CHECKSH-NEXT: # Driver args: "-fsyntax-only"
+// CHECKSH-NEXT: # Original command: {{.*$}}
+// CHECKSH-NEXT: "-cc1"
+// CHECKSH: "-isysroot" "{{[^"]*}}/i/"
+// CHECKSH-NOT: "-fmodules-cache-path="
+// CHECKSH: "crash-vfs-{{[^ ]*}}.m"
+// CHECKSH: "-ivfsoverlay" "crash-vfs-{{[^ ]*}}.cache/vfs/vfs.yaml"
+// CHECKSH: "-fmodules-cache-path=crash-vfs-{{[^ ]*}}.cache/modules"
+
+// CHECKYAML: 'case-sensitive':
+// CHECKYAML-NEXT: 'use-external-names': 'false',
+// CHECKYAML-NEXT: 'overlay-relative': 'true',
+// CHECKYAML:     'type': 'directory'
+// CHECKYAML:     'name': "/[[PATH:.*]]/Inputs/crash-recovery/usr/include",
+// CHECKYAML-NEXT: 'contents': [
+// CHECKYAML-NEXT:   {
+// CHECKYAML-NEXT:     'type': 'file',
+// CHECKYAML-NEXT:     'name': "module.map",
+// CHECKYAML-NEXT:     'external-contents': "/[[PATH]]/Inputs/crash-recovery/usr/include/module.map"
+// CHECKYAML-NEXT:   },
+
+// Replace the paths in the YAML files with relative ".." traversals
+// and fed into clang to test whether we're correctly representing them
+// in the VFS overlay.
+
+// RUN: sed -e "s@usr/include@usr/include/../include@g" \
+// RUN:     %t/crash-vfs-*.cache/vfs/vfs.yaml > %t/vfs.yaml
+// RUN: cp %t/vfs.yaml %t/crash-vfs-*.cache/vfs/vfs.yaml
+// RUN: unset FORCE_CLANG_DIAGNOSTICS_CRASH
+// RUN: %clang -E %s -I %S/Inputs/crash-recovery -isysroot %/t/i/ \
+// RUN:     -ivfsoverlay %t/crash-vfs-*.cache/vfs/vfs.yaml -fmodules \
+// RUN:     -fmodules-cache-path=%t/m/ 2>&1 \
+// RUN:     | FileCheck %s --check-prefix=CHECKOVERLAY
+
+// CHECKOVERLAY: @import cstd.stdio; /* clang -E: implicit import for "/{{[^ ].*}}/usr/././//////include/../include/./././../include/stdio.h" */
diff --git a/test/Modules/crash-vfs-relative-overlay.m b/test/Modules/crash-vfs-relative-overlay.m
new file mode 100644
index 0000000000000..870987c58abdb
--- /dev/null
+++ b/test/Modules/crash-vfs-relative-overlay.m
@@ -0,0 +1,61 @@
+// REQUIRES: crash-recovery, shell
+
+// FIXME: This XFAIL is cargo-culted from crash-report.c. Do we need it?
+// XFAIL: mingw32
+
+// RUN: rm -rf %t
+// RUN: mkdir -p %t/i %t/m %t
+
+// RUN: not env FORCE_CLANG_DIAGNOSTICS_CRASH= TMPDIR=%t TEMP=%t TMP=%t \
+// RUN: %clang -fsyntax-only -nostdinc %s \
+// RUN:     -I %S/Inputs/crash-recovery/usr/include -isysroot %/t/i/ \
+// RUN:     -fmodules -fmodules-cache-path=%t/m/ 2>&1 | FileCheck %s
+
+// RUN: FileCheck --check-prefix=CHECKSRC %s -input-file %t/crash-vfs-*.m
+// RUN: FileCheck --check-prefix=CHECKSH %s -input-file %t/crash-vfs-*.sh
+// RUN: FileCheck --check-prefix=CHECKYAML %s -input-file \
+// RUN: %t/crash-vfs-*.cache/vfs/vfs.yaml
+// RUN: find %t/crash-vfs-*.cache/vfs | \
+// RUN:   grep "Inputs/crash-recovery/usr/include/stdio.h" | count 1
+
+#include <stdio.h>
+
+// CHECK: Preprocessed source(s) and associated run script(s) are located at:
+// CHECK-NEXT: note: diagnostic msg: {{.*}}.m
+// CHECK-NEXT: note: diagnostic msg: {{.*}}.cache
+
+// CHECKSRC: @import cstd.stdio;
+
+// CHECKSH: # Crash reproducer
+// CHECKSH-NEXT: # Driver args: "-fsyntax-only"
+// CHECKSH-NEXT: # Original command: {{.*$}}
+// CHECKSH-NEXT: "-cc1"
+// CHECKSH: "-resource-dir"
+// CHECKSH: "-isysroot" "{{[^"]*}}/i/"
+// CHECKSH: "crash-vfs-{{[^ ]*}}.m"
+// CHECKSH: "-ivfsoverlay" "crash-vfs-{{[^ ]*}}.cache/vfs/vfs.yaml"
+// CHECKSH: "-fmodules-cache-path=crash-vfs-{{[^ ]*}}.cache/modules"
+
+// CHECKYAML: 'case-sensitive':
+// CHECKYAML-NEXT: 'use-external-names': 'false',
+// CHECKYAML-NEXT: 'overlay-relative': 'true',
+// CHECKYAML: 'type': 'directory'
+// CHECKYAML: 'name': "/[[PATH:.*]]/Inputs/crash-recovery/usr/include",
+// CHECKYAML-NEXT: 'contents': [
+// CHECKYAML-NEXT:   {
+// CHECKYAML-NEXT:     'type': 'file',
+// CHECKYAML-NEXT:     'name': "module.map",
+// CHECKYAML-NOT:      'external-contents': "{{[^ ]*}}.cache
+// CHECKYAML-NEXT:     'external-contents': "/[[PATH]]/Inputs/crash-recovery/usr/include/module.map"
+// CHECKYAML-NEXT:   },
+
+// Test that reading the YAML file will yield the correct path after
+// the overlay dir is prefixed to access headers in .cache/vfs directory.
+
+// RUN: unset FORCE_CLANG_DIAGNOSTICS_CRASH
+// RUN: %clang -E %s -I %S/Inputs/crash-recovery/usr/include -isysroot %/t/i/ \
+// RUN:     -ivfsoverlay %t/crash-vfs-*.cache/vfs/vfs.yaml -fmodules \
+// RUN:     -fmodules-cache-path=%t/m/ 2>&1 \
+// RUN:     | FileCheck %s --check-prefix=CHECKOVERLAY
+
+// CHECKOVERLAY: @import cstd.stdio; /* clang -E: implicit import for "/{{[^ ].*}}/usr/include/stdio.h" */
diff --git a/test/Modules/crash-vfs-run-reproducer.m b/test/Modules/crash-vfs-run-reproducer.m
new file mode 100644
index 0000000000000..d0eaa931c9767
--- /dev/null
+++ b/test/Modules/crash-vfs-run-reproducer.m
@@ -0,0 +1,57 @@
+// REQUIRES: crash-recovery, shell, system-darwin
+
+// RUN: rm -rf %t
+// RUN: mkdir -p %t/i %t/m %t
+
+// RUN: not env FORCE_CLANG_DIAGNOSTICS_CRASH= TMPDIR=%t TEMP=%t TMP=%t \
+// RUN: %clang -fsyntax-only -nostdinc %s \
+// RUN:     -I %S/Inputs/crash-recovery/usr/include -isysroot %/t/i/ \
+// RUN:     -fmodules -fmodules-cache-path=%t/m/ 2>&1 | FileCheck %s
+
+// RUN: FileCheck --check-prefix=CHECKSRC %s -input-file %t/crash-vfs-*.m
+// RUN: FileCheck --check-prefix=CHECKSH %s -input-file %t/crash-vfs-*.sh
+// RUN: FileCheck --check-prefix=CHECKYAML %s -input-file \
+// RUN: %t/crash-vfs-*.cache/vfs/vfs.yaml
+// RUN: find %t/crash-vfs-*.cache/vfs | \
+// RUN:   grep "Inputs/crash-recovery/usr/include/stdio.h" | count 1
+
+#include <stdio.h>
+
+// CHECK: Preprocessed source(s) and associated run script(s) are located at:
+// CHECK-NEXT: note: diagnostic msg: {{.*}}.m
+// CHECK-NEXT: note: diagnostic msg: {{.*}}.cache
+
+// CHECKSRC: @import cstd.stdio;
+
+// CHECKSH: # Crash reproducer
+// CHECKSH-NEXT: # Driver args: "-fsyntax-only"
+// CHECKSH-NEXT: # Original command: {{.*$}}
+// CHECKSH-NEXT: "-cc1"
+// CHECKSH: "-resource-dir"
+// CHECKSH: "-isysroot" "{{[^"]*}}/i/"
+// CHECKSH: "crash-vfs-{{[^ ]*}}.m"
+// CHECKSH: "-ivfsoverlay" "crash-vfs-{{[^ ]*}}.cache/vfs/vfs.yaml"
+// CHECKSH: "-fmodules-cache-path=crash-vfs-{{[^ ]*}}.cache/modules"
+
+// CHECKYAML: 'case-sensitive':
+// CHECKYAML-NEXT: 'use-external-names': 'false',
+// CHECKYAML-NEXT: 'overlay-relative': 'true',
+// CHECKYAML: 'type': 'directory'
+// CHECKYAML: 'name': "/[[PATH:.*]]/Inputs/crash-recovery/usr/include",
+// CHECKYAML-NEXT: 'contents': [
+// CHECKYAML-NEXT:   {
+// CHECKYAML-NEXT:     'type': 'file',
+// CHECKYAML-NEXT:     'name': "module.map",
+// CHECKYAML-NOT:      'external-contents': "{{[^ ]*}}.cache
+// CHECKYAML-NEXT:     'external-contents': "/[[PATH]]/Inputs/crash-recovery/usr/include/module.map"
+// CHECKYAML-NEXT:   },
+
+// Run the reproducer script - regular exit code is enough to test it works.
+// Note that we don't yet support reusing the modules pcm; what we do
+// support is re-building the modules relying solely on the header files dumped
+// inside .cache/vfs, mapped by .cache/vfs/vfs.yaml.
+
+// RUN: cd %t
+// RUN: rm -rf crash-vfs-run-reproducer-*.cache/modules/*
+// RUN: chmod 755 crash-vfs-*.sh
+// RUN: ./crash-vfs-*.sh
diff --git a/test/Modules/crash-vfs-umbrella-frameworks.m b/test/Modules/crash-vfs-umbrella-frameworks.m
new file mode 100644
index 0000000000000..0c3981ddaa887
--- /dev/null
+++ b/test/Modules/crash-vfs-umbrella-frameworks.m
@@ -0,0 +1,55 @@
+// REQUIRES: crash-recovery, shell
+
+// FIXME: This XFAIL is cargo-culted from crash-report.c. Do we need it?
+// XFAIL: mingw32
+
+// RUN: rm -rf %t
+// RUN: mkdir -p %t/i %t/m %t
+// RUN: cp -a %S/Inputs/crash-recovery/Frameworks %t/i/
+// RUN: mkdir -p %t/i/Frameworks/A.framework/Frameworks
+// RUN: ln -s ../../B.framework %t/i/Frameworks/A.framework/Frameworks/B.framework
+
+// RUN: not env FORCE_CLANG_DIAGNOSTICS_CRASH= TMPDIR=%t TEMP=%t TMP=%t \
+// RUN: %clang -nostdinc -fsyntax-only %s \
+// RUN:     -F %/t/i/Frameworks -fmodules \
+// RUN:     -fmodules-cache-path=%t/m/ 2>&1 | FileCheck %s
+
+// RUN: FileCheck --check-prefix=CHECKYAML %s -input-file \
+// RUN:         %t/crash-vfs-*.cache/vfs/vfs.yaml
+// RUN: find %t/crash-vfs-*.cache/vfs | \
+// RUN:   grep "B.framework/Headers/B.h" | count 1
+
+// CHECK: Preprocessed source(s) and associated run script(s) are located at:
+// CHECK-NEXT: note: diagnostic msg: {{.*}}.m
+// CHECK-NEXT: note: diagnostic msg: {{.*}}.cache
+
+// CHECKYAML:      'type': 'directory',
+// CHECKYAML:      'name': "/[[PATH:.*]]/i/Frameworks/A.framework/Frameworks/B.framework/Headers",
+// CHECKYAML-NEXT:      'contents': [
+// CHECKYAML-NEXT:        {
+// CHECKYAML-NEXT:          'type': 'file',
+// CHECKYAML-NEXT:          'name': "B.h",
+// CHECKYAML-NEXT:          'external-contents': "/[[PATH]]/i/Frameworks/B.framework/Headers/B.h"
+
+// CHECKYAML:      'type': 'directory',
+// CHECKYAML:      'name': "/[[PATH]]/i/Frameworks/B.framework/Headers",
+// CHECKYAML-NEXT:      'contents': [
+// CHECKYAML-NEXT:        {
+// CHECKYAML-NEXT:          'type': 'file',
+// CHECKYAML-NEXT:          'name': "B.h",
+// CHECKYAML-NEXT:          'external-contents': "/[[PATH]]/i/Frameworks/B.framework/Headers/B.h"
+
+@import I;
+
+// Run the reproducer script - regular exit code is enough to test it works. The
+// intent here is to guarantee that the collect umbrella headers into the VFS
+// can be used, testing that vfs::recursive_directory_iterator is used correctly
+// Make sure to erase the include paths used to build the modules to guarantee
+// that the VFS overlay won't fallback to use it. Also wipe out the module cache
+// to force header search.
+//
+// RUN: cd %t
+// RUN: rm -rf i
+// RUN: rm -rf crash-vfs-umbrella-*.cache/modules/*
+// RUN: chmod 755 crash-vfs-*.sh
+// RUN: ./crash-vfs-*.sh
diff --git a/test/Modules/cxx-templates.cpp b/test/Modules/cxx-templates.cpp
index ef4e4e420d04f..12dfdd0546eb0 100644
--- a/test/Modules/cxx-templates.cpp
+++ b/test/Modules/cxx-templates.cpp
@@ -1,9 +1,9 @@
 // RUN: rm -rf %t
-// RUN: not %clang_cc1 -x objective-c++ -fmodules -fimplicit-module-maps -fno-modules-error-recovery -fmodules-cache-path=%t -I %S/Inputs %s -std=c++11 -ast-dump-lookups | FileCheck %s --check-prefix=CHECK-GLOBAL
-// RUN: not %clang_cc1 -x objective-c++ -fmodules -fimplicit-module-maps -fno-modules-error-recovery -fmodules-cache-path=%t -I %S/Inputs %s -std=c++11 -ast-dump-lookups -ast-dump-filter N | FileCheck %s --check-prefix=CHECK-NAMESPACE-N
-// RUN: not %clang_cc1 -x objective-c++ -fmodules -fimplicit-module-maps -fno-modules-error-recovery -fmodules-cache-path=%t -I %S/Inputs %s -std=c++11 -ast-dump -ast-dump-filter SomeTemplate | FileCheck %s --check-prefix=CHECK-DUMP
-// RUN: %clang_cc1 -x objective-c++ -fmodules -fimplicit-module-maps -fno-modules-error-recovery -fmodules-cache-path=%t -I %S/Inputs %s -verify -std=c++11
-// RUN: %clang_cc1 -x objective-c++ -fmodules -fimplicit-module-maps -fno-modules-error-recovery -fmodules-cache-path=%t -I %S/Inputs %s -verify -std=c++11 -DEARLY_IMPORT
+// RUN: not %clang_cc1 -x objective-c++ -fmodules -fimplicit-module-maps -fno-modules-error-recovery -fmodules-cache-path=%t -I %S/Inputs %s -std=c++14 -ast-dump-lookups 2>/dev/null | FileCheck %s --check-prefix=CHECK-GLOBAL
+// RUN: not %clang_cc1 -x objective-c++ -fmodules -fimplicit-module-maps -fno-modules-error-recovery -fmodules-cache-path=%t -I %S/Inputs %s -std=c++14 -ast-dump-lookups -ast-dump-filter N 2>/dev/null | FileCheck %s --check-prefix=CHECK-NAMESPACE-N
+// RUN: not %clang_cc1 -x objective-c++ -fmodules -fimplicit-module-maps -fno-modules-error-recovery -fmodules-cache-path=%t -I %S/Inputs %s -std=c++14 -ast-dump -ast-dump-filter SomeTemplate 2>/dev/null | FileCheck %s --check-prefix=CHECK-DUMP
+// RUN: %clang_cc1 -x objective-c++ -fmodules -fimplicit-module-maps -fno-modules-error-recovery -fmodules-cache-path=%t -I %S/Inputs %s -verify -std=c++14
+// RUN: %clang_cc1 -x objective-c++ -fmodules -fimplicit-module-maps -fno-modules-error-recovery -fmodules-cache-path=%t -I %S/Inputs %s -verify -std=c++14 -DEARLY_IMPORT
 
 #ifdef EARLY_IMPORT
 #include "cxx-templates-textual.h"
@@ -105,7 +105,8 @@ void g() {
 
   TemplateInstantiationVisibility<char[1]> tiv1;
   TemplateInstantiationVisibility<char[2]> tiv2;
-  TemplateInstantiationVisibility<char[3]> tiv3; // expected-error 2{{must be imported from module 'cxx_templates_b_impl'}}
+  TemplateInstantiationVisibility<char[3]> tiv3; // expected-error 5{{must be imported from module 'cxx_templates_b_impl'}}
+  // expected-note@cxx-templates-b-impl.h:10 3{{explicit specialization declared here}}
   // expected-note@cxx-templates-b-impl.h:10 2{{previous definition is here}}
   TemplateInstantiationVisibility<char[4]> tiv4;
 
@@ -172,6 +173,63 @@ bool testFriendInClassTemplate(Std::WithFriend<int> wfi) {
   return wfi != wfi;
 }
 
+namespace hidden_specializations {
+  // expected-note@cxx-templates-unimported.h:* 1+{{here}}
+  void test() {
+    // For functions, uses that would trigger instantiations of definitions are
+    // not allowed.
+    fn<void>(); // ok
+    fn<char>(); // ok
+    fn<int>(); // expected-error 1+{{explicit specialization of 'fn<int>' must be imported}}
+    cls<void>::nested_fn(); // expected-error 1+{{explicit specialization of 'nested_fn' must be imported}}
+    cls<void>::nested_fn_t<int>(); // expected-error 1+{{explicit specialization of 'nested_fn_t' must be imported}}
+    cls<void>::nested_fn_t<char>(); // expected-error 1+{{explicit specialization of 'nested_fn_t' must be imported}}
+
+    // For classes, uses that would trigger instantiations of definitions are
+    // not allowed.
+    cls<void> *k0; // ok
+    cls<char> *k1; // ok
+    cls<int> *k2; // ok
+    cls<int*> *k3; // ok
+    cls<void>::nested_cls *nk1; // ok
+    cls<void>::nested_cls_t<int> *nk2; // ok
+    cls<void>::nested_cls_t<char> *nk3; // ok
+    cls<int> uk1; // expected-error 1+{{explicit specialization of 'cls<int>' must be imported}} expected-error 1+{{definition of}}
+    cls<int*> uk3; // expected-error 1+{{partial specialization of 'cls<type-parameter-0-0 *>' must be imported}} expected-error 1+{{definition of}}
+    cls<char*> uk4; // expected-error 1+{{partial specialization of 'cls<type-parameter-0-0 *>' must be imported}} expected-error 1+{{definition of}}
+    cls<void>::nested_cls unk1; // expected-error 1+{{explicit specialization of 'nested_cls' must be imported}} expected-error 1+{{definition of}}
+    cls<void>::nested_cls_t<int> unk2; // expected-error 1+{{explicit specialization of 'nested_cls_t' must be imported}} expected-error 1+{{definition of}}
+    cls<void>::nested_cls_t<char> unk3; // expected-error 1+{{explicit specialization of 'nested_cls_t' must be imported}}
+
+    // For enums, uses that would trigger instantiations of definitions are not
+    // allowed.
+    cls<void>::nested_enum e; // ok
+    (void)cls<void>::nested_enum::e; // expected-error 1+{{definition of 'nested_enum' must be imported}} expected-error 1+{{declaration of 'e'}}
+
+    // For variable template specializations, no uses are allowed because
+    // specializations can change the type.
+    (void)sizeof(var<void>); // ok
+    (void)sizeof(var<char>); // ok
+    (void)sizeof(var<int>); // expected-error 1+{{explicit specialization of 'var<int>' must be imported}}
+    (void)sizeof(var<int*>); // expected-error 1+{{partial specialization of 'var<type-parameter-0-0 *>' must be imported}}
+    (void)sizeof(var<char*>); // expected-error 1+{{partial specialization of 'var<type-parameter-0-0 *>' must be imported}}
+    (void)sizeof(cls<void>::nested_var); // ok
+    (void)cls<void>::nested_var; // expected-error 1+{{explicit specialization of 'nested_var' must be imported}}
+    (void)sizeof(cls<void>::nested_var_t<int>); // expected-error 1+{{explicit specialization of 'nested_var_t' must be imported}}
+    (void)sizeof(cls<void>::nested_var_t<char>); // expected-error 1+{{explicit specialization of 'nested_var_t' must be imported}}
+  }
+
+  void cls<int>::nested_fn() {} // expected-error 1+{{explicit specialization of 'cls<int>' must be imported}} expected-error 1+{{definition of}}
+  struct cls<int>::nested_cls {}; // expected-error 1+{{explicit specialization of 'cls<int>' must be imported}} expected-error 1+{{definition of}}
+  int cls<int>::nested_var; // expected-error 1+{{explicit specialization of 'cls<int>' must be imported}} expected-error 1+{{definition of}}
+  enum cls<int>::nested_enum : int {}; // expected-error 1+{{explicit specialization of 'cls<int>' must be imported}} expected-error 1+{{definition of}}
+
+  template<typename T> void cls<T*>::nested_fn() {} // expected-error 1+{{partial specialization of 'cls<type-parameter-0-0 *>' must be imported}}
+  template<typename T> struct cls<T*>::nested_cls {}; // expected-error 1+{{partial specialization of 'cls<type-parameter-0-0 *>' must be imported}}
+  template<typename T> int cls<T*>::nested_var; // expected-error 1+{{partial specialization of 'cls<type-parameter-0-0 *>' must be imported}}
+  template<typename T> enum cls<T*>::nested_enum : int {}; // expected-error 1+{{partial specialization of 'cls<type-parameter-0-0 *>' must be imported}}
+}
+
 namespace Std {
   void g(); // expected-error {{functions that differ only in their return type cannot be overloaded}}
   // expected-note@cxx-templates-common.h:21 {{previous}}
diff --git a/test/Modules/debug-info-moduleimport.m b/test/Modules/debug-info-moduleimport.m
index bb0ea3149efbc..bf60690be40bd 100644
--- a/test/Modules/debug-info-moduleimport.m
+++ b/test/Modules/debug-info-moduleimport.m
@@ -1,10 +1,16 @@
 // RUN: rm -rf %t
-// RUN: %clang_cc1 -debug-info-kind=limited -fmodules -DGREETING="Hello World" -UNDEBUG -fimplicit-module-maps -fmodules-cache-path=%t %s -I %S/Inputs -isysroot /tmp/.. -I %t -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -debug-info-kind=limited -fmodules -DGREETING="Hello World" -UNDEBUG -fimplicit-module-maps -fmodules-cache-path=%t %s -I %S/Inputs -isysroot /tmp/.. -I %t -emit-llvm -o - | FileCheck %s --check-prefix=NOIMPORT
+
+// NOIMPORT-NOT: !DIImportedEntity
+// NOIMPORT-NOT: !DIModule
+
+// RUN: rm -rf %t
+// RUN: %clang_cc1 -debug-info-kind=limited -fmodules -DGREETING="Hello World" -UNDEBUG -fimplicit-module-maps -fmodules-cache-path=%t %s -I %S/Inputs -isysroot /tmp/.. -I %t -emit-llvm -debugger-tuning=lldb -o - | FileCheck %s
 
 // CHECK: ![[CU:.*]] = distinct !DICompileUnit
 @import DebugObjC;
 // CHECK: !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: ![[CU]],
-// CHECK-SAME:              entity: ![[MODULE:.*]], line: 5)
+// CHECK-SAME:              entity: ![[MODULE:.*]], line: [[@LINE-2]])
 // CHECK: ![[MODULE]] = !DIModule(scope: null, name: "DebugObjC",
 // CHECK-SAME:  configMacros: "\22-DGREETING=Hello World\22 \22-UNDEBUG\22",
 // CHECK-SAME:  includePath: "{{.*}}test{{.*}}Modules{{.*}}Inputs",
diff --git a/test/Modules/embed-files-compressed.cpp b/test/Modules/embed-files-compressed.cpp
new file mode 100644
index 0000000000000..cf33a662f91f7
--- /dev/null
+++ b/test/Modules/embed-files-compressed.cpp
@@ -0,0 +1,23 @@
+// REQUIRES: zlib
+// REQUIRES: shell
+//
+// RUN: rm -rf %t
+// RUN: mkdir %t
+// RUN: echo '//////////////////////////////////////////////////////////////////////' > %t/a.h
+// RUN: cat %t/a.h %t/a.h %t/a.h %t/a.h > %t/b.h
+// RUN: cat %t/b.h %t/b.h %t/b.h %t/b.h > %t/a.h
+// RUN: cat %t/a.h %t/a.h %t/a.h %t/a.h > %t/b.h
+// RUN: cat %t/b.h %t/b.h %t/b.h %t/b.h > %t/a.h
+// RUN: cat %t/a.h %t/a.h %t/a.h %t/a.h > %t/b.h
+// RUN: cat %t/b.h %t/b.h %t/b.h %t/b.h > %t/a.h
+// RUN: cat %t/a.h %t/a.h %t/a.h %t/a.h > %t/b.h
+// RUN: cat %t/b.h %t/b.h %t/b.h %t/b.h > %t/a.h
+// RUN: echo 'module a { header "a.h" }' > %t/modulemap
+//
+// RUN: %clang_cc1 -fmodules -I%t -fmodules-cache-path=%t -fmodule-name=a -emit-module %t/modulemap -fmodules-embed-all-files -o %t/a.pcm
+//
+// The above embeds ~4.5MB of highly-predictable /s and \ns into the pcm file.
+// Check that the resulting file is under 40KB:
+//
+// RUN: wc -c %t/a.pcm | FileCheck --check-prefix=CHECK-SIZE %s
+// CHECK-SIZE: {{(^|[^0-9])[123][0-9][0-9][0-9][0-9]($|[^0-9])}}
diff --git a/test/Modules/embed-files.cpp b/test/Modules/embed-files.cpp
index a1db21852d05a..f300558dd7f71 100644
--- a/test/Modules/embed-files.cpp
+++ b/test/Modules/embed-files.cpp
@@ -1,11 +1,17 @@
 // RUN: rm -rf %t
 // RUN: mkdir %t
-// RUN: echo 'module a { header "a.h" } module b { header "b.h" }' > %t/modulemap
+// RUN: echo 'module a { header "a.h" header "x.h" } module b { header "b.h" }' > %t/modulemap
 // RUN: echo 'extern int t;' > %t/t.h
 // RUN: echo '#include "t.h"' > %t/a.h
 // RUN: echo '#include "t.h"' > %t/b.h
+// RUN: echo '#include "t.h"' > %t/x.h
 
 // RUN: %clang_cc1 -fmodules -I%t -fmodules-cache-path=%t -fmodule-map-file=%t/modulemap -fmodules-embed-all-files %s -verify
+//
+// RUN: %clang_cc1 -fmodules -I%t -fmodules-embed-all-files %t/modulemap -fmodule-name=a -x c++ -emit-module -o %t/a.pcm
+// RUN: %clang_cc1 -fmodules -I%t -fmodules-embed-all-files %t/modulemap -fmodule-name=b -x c++ -emit-module -o %t/b.pcm
+// RUN: rm %t/x.h
+// RUN: %clang_cc1 -fmodules -I%t -fmodule-map-file=%t/modulemap -fmodule-file=%t/a.pcm -fmodule-file=%t/b.pcm %s -verify
 #include "a.h"
 char t; // expected-error {{different type}}
 // expected-note@t.h:1 {{here}}
diff --git a/test/Modules/explicit-build-flags.cpp b/test/Modules/explicit-build-flags.cpp
index 6ced215a06d1b..61300435e2cd1 100644
--- a/test/Modules/explicit-build-flags.cpp
+++ b/test/Modules/explicit-build-flags.cpp
@@ -7,8 +7,7 @@
 // Can use the module.
 // RUN: %clang_cc1 -fmodules -DFOO=1 -x c++ -fmodule-map-file=%t/map -fmodule-file=%t/tmp.pcm -verify -I%t %s
 
-// Can use the module if an input file is newer. (This happens on
-// remote file systems.)
+// Can use the module if an input file is newer. (This happens on remote file systems.)
 // RUN: sleep 1
 // RUN: touch %t/tmp.h
 // RUN: %clang_cc1 -fmodules -DFOO=1 -x c++ -fmodule-map-file=%t/map -fmodule-file=%t/tmp.pcm -verify -I%t %s
@@ -23,6 +22,22 @@
 // Can use the module if -I flags change.
 // RUN: %clang_cc1 -fmodules -DBAR=2 -I. -x c++ -fmodule-map-file=%t/map -fmodule-file=%t/tmp.pcm -verify -I%t %s
 
+// Can use the module if -fPIC/-fPIE flags change.
+// RUN: %clang_cc1 -fmodules -DBAR=2 -pic-level 2 -x c++ -fmodule-map-file=%t/map -fmodule-file=%t/tmp.pcm -verify -I%t %s
+// RUN: %clang_cc1 -fmodules -DBAR=2 -pic-level 1 -pic-is-pie -x c++ -fmodule-map-file=%t/map -fmodule-file=%t/tmp.pcm -verify -I%t %s
+
+// Can use the module if -static flag changes.
+// RUN: %clang_cc1 -fmodules -DBAR=2 -static-define -x c++ -fmodule-map-file=%t/map -fmodule-file=%t/tmp.pcm -verify -I%t %s
+
+// Can use the module if -fsanitize= flags change.
+// RUN: %clang_cc1 -fmodules -DBAR=2 -fsanitize=address -x c++ -fmodule-map-file=%t/map -fmodule-file=%t/tmp.pcm -verify -I%t %s
+//
+// RUN: %clang_cc1 -fmodules -DFOO=1 -fsanitize=address -x c++ -fmodule-name=tmp %t/map -emit-module -o %t/tmp-san.pcm
+// RUN: %clang_cc1 -fmodules -DBAR=2 -x c++ -fmodule-map-file=%t/map -fmodule-file=%t/tmp-san.pcm -verify -I%t %s
+
+// -fno-assume-sane-operator-new is implied by the driver -fsanitize=address flag.
+// RUN: %clang_cc1 -fmodules -DBAR=2 -fno-assume-sane-operator-new -x c++ -fmodule-map-file=%t/map -fmodule-file=%t/tmp.pcm -verify -I%t %s
+
 // Can use the module if -O flags change.
 // RUN: %clang_cc1 -fmodules -DBAR=2 -Os -x c++ -fmodule-map-file=%t/map -fmodule-file=%t/tmp.pcm -verify -I%t %s
 //
diff --git a/test/Modules/explicit-build-missing-files.cpp b/test/Modules/explicit-build-missing-files.cpp
index 1ee65d9c5e0f0..e36b5051e8319 100644
--- a/test/Modules/explicit-build-missing-files.cpp
+++ b/test/Modules/explicit-build-missing-files.cpp
@@ -3,7 +3,7 @@
 // RUN: echo 'extern int a; template<typename T> int a2 = T::error;' > %t/a.h
 // RUN: echo 'extern int b;' > %t/b.h
 // RUN: echo 'extern int c = 0;' > %t/c.h
-// RUN: echo 'module a { header "a.h" header "b.h" header "c.h" }' > %t/modulemap
+// RUN: echo 'module a { module aa { header "a.h" header "b.h" header "c.h" } }' > %t/modulemap
 // RUN: echo 'module other {}' > %t/other.modulemap
 
 // We lazily check that the files referenced by an explicitly-specified .pcm
@@ -18,7 +18,7 @@
 // RUN:            -fmodules-embed-all-files
 // RUN: %clang_cc1 -fmodules -I %t -fmodule-file=%t/a.pcm %s
 // RUN: not %clang_cc1 -fmodules -I %t -fmodule-file=%t/a.pcm %s -DERRORS 2>&1 | FileCheck %s
-// RUN: rm %t/modulemap
+// RUN: mv %t/modulemap %t/modulemap.moved
 // RUN: %clang_cc1 -fmodules -I %t -fmodule-file=%t/a.pcm %s
 // RUN: not %clang_cc1 -fmodules -I %t -fmodule-file=%t/a.pcm %s -DERRORS 2>&1 | FileCheck %s
 // RUN: rm %t/other.modulemap
@@ -32,6 +32,9 @@
 // RUN: %clang_cc1 -fmodules -I %t -fmodule-file=%t/a.pcm %s
 // RUN: %clang_cc1 -fmodules -I %t -fmodule-file=%t/b.pcm %s
 // RUN: not %clang_cc1 -fmodules -I %t -fmodule-file=%t/a.pcm %s -DERRORS 2>&1 | FileCheck %s --check-prefix=MISSING-B
+// RUN: %clang_cc1 -fmodules -I %t -fmodule-file=%t/a.pcm -fmodule-map-file=%t/modulemap.moved %s
+// RUN: not %clang_cc1 -fmodules -I %t -fmodule-file=%t/a.pcm -fmodule-map-file=%t/modulemap.moved -std=c++1z %s
+// RUN: %clang_cc1 -fmodules -I %t -fmodule-file=%t/a.pcm -fmodule-map-file=%t/modulemap.moved -std=c++1z -Wno-module-file-config-mismatch %s -Db=a
 // RUN: rm %t/a.h
 // RUN: %clang_cc1 -fmodules -I %t -fmodule-file=%t/a.pcm %s -verify
 // RUN: %clang_cc1 -fmodules -I %t -fmodule-file=%t/b.pcm %s -verify
diff --git a/test/Modules/explicit-build.cpp b/test/Modules/explicit-build.cpp
index 2a5b70dce6cf0..a6f6a6268c15f 100644
--- a/test/Modules/explicit-build.cpp
+++ b/test/Modules/explicit-build.cpp
@@ -143,7 +143,7 @@
 // -------------------------------
 // Try to import a PCH with -fmodule-file=
 // RUN: %clang_cc1 -x c++ -std=c++11 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -Rmodule-build -fno-modules-error-recovery \
-// RUN:            -fmodule-name=a -emit-pch %S/Inputs/explicit-build/a.h -o %t/a.pch \
+// RUN:            -fmodule-name=a -emit-pch %S/Inputs/explicit-build/a.h -o %t/a.pch -DBUILDING_A_PCH \
 // RUN:            2>&1 | FileCheck --check-prefix=CHECK-NO-IMPLICIT-BUILD %s --allow-empty
 //
 // RUN: not %clang_cc1 -x c++ -std=c++11 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -Rmodule-build -fno-modules-error-recovery \
diff --git a/test/Modules/getSourceDescriptor-crash.cpp b/test/Modules/getSourceDescriptor-crash.cpp
new file mode 100644
index 0000000000000..84e527aef91b5
--- /dev/null
+++ b/test/Modules/getSourceDescriptor-crash.cpp
@@ -0,0 +1,9 @@
+// RUN: %clang_cc1 -I %S/Inputs/getSourceDescriptor-crash -S -emit-llvm -debug-info-kind=limited -debugger-tuning=lldb -fimplicit-module-maps %s -o - | FileCheck %s
+
+#include "h1.h"
+#include "h1.h"
+
+// CHECK: DIImportedEntity
+// CHECK-SAME: entity: ![[ENTITY:[0-9]+]]
+// CHECK: ![[ENTITY]] = !DIModule
+// CHECK-SAME: name: "foo"
diff --git a/test/Modules/implementation-of-module.m b/test/Modules/implementation-of-module.m
index 37e2cfbe30fd8..712f12c56549d 100644
--- a/test/Modules/implementation-of-module.m
+++ b/test/Modules/implementation-of-module.m
@@ -1,7 +1,3 @@
-// RUN: not %clang_cc1 -fmodule-implementation-of Foo -fmodule-name=Bar %s 2>&1 \
-// RUN:     | FileCheck -check-prefix=CHECK-IMPL-OF-ERR %s
-// CHECK-IMPL-OF-ERR: conflicting module names specified: '-fmodule-name=Bar' and '-fmodule-implementation-of Foo'
-
 // RUN: rm -rf %t
 // RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -w -Werror=auto-import %s -I %S/Inputs \
 // RUN:     -fmodule-implementation-of category_right -fsyntax-only
diff --git a/test/Modules/implicit-build-config-out-of-date.m b/test/Modules/implicit-build-config-out-of-date.m
new file mode 100644
index 0000000000000..c8c02ff0a8084
--- /dev/null
+++ b/test/Modules/implicit-build-config-out-of-date.m
@@ -0,0 +1,6 @@
+// RUN: rm -rf %t
+// Use -DA=0 so that there is at least one preprocessor option serialized after the diagnostic options.
+// RUN: %clang_cc1 -fmodules -fmodules-cache-path=%t -fimplicit-module-maps -I %S/Inputs %s -DA=0 -Rmodule-build -verify
+// RUN: %clang_cc1 -fmodules -fmodules-cache-path=%t -fimplicit-module-maps -I %S/Inputs %s -DA=0 -Werror -Rmodule-build -verify
+
+@import category_top; // expected-remark {{building module}} expected-remark {{finished building}}
diff --git a/test/Modules/import-self.m b/test/Modules/import-self.m
index aa74371323723..e59801592450a 100644
--- a/test/Modules/import-self.m
+++ b/test/Modules/import-self.m
@@ -6,6 +6,6 @@
 // RUN: not %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t \
 // RUN:                -I %S/Inputs/submodules -fmodule-name=import_self %s \
 // RUN:     2>&1 |  FileCheck -check-prefix=CHECK-fmodule-name %s
-// CHECK-fmodule-name: import of module 'import_self.b' appears within same top-level module 'import_self'
+// CHECK-fmodule-name: @import of module 'import_self.b' in implementation of 'import_self'
 
 @import import_self.b;
diff --git a/test/Modules/include-own-headers.m b/test/Modules/include-own-headers.m
new file mode 100644
index 0000000000000..a5a85312ec3d0
--- /dev/null
+++ b/test/Modules/include-own-headers.m
@@ -0,0 +1,4 @@
+// RUN: rm -rf %t
+// RUN: %clang_cc1 -fmodules -fmodule-name=Module -fimplicit-module-maps -fmodules-cache-path=%t -Werror=non-modular-include-in-framework-module -F%S/Inputs -I%S -fsyntax-only %s
+#include "Module/Module.h"
+#include "Inputs/non-module.h"
diff --git a/test/Modules/merge-decl-context.cpp b/test/Modules/merge-decl-context.cpp
index 55219ed587b0d..5dbf3d1bd95f6 100644
--- a/test/Modules/merge-decl-context.cpp
+++ b/test/Modules/merge-decl-context.cpp
@@ -18,7 +18,13 @@
 // RUN:     -fmodule-map-file=%S/Inputs/merge-decl-context/merge-decl-context.modulemap -I%S/Inputs \
 // RUN:     -emit-llvm -o %t/test.o %s
 
+// RUN: %clang_cc1 -x c++ -std=c++11 -fmodules -fmodules-cache-path=%t \
+// RUN:     -fmodule-map-file=%S/Inputs/merge-decl-context/merge-decl-context.modulemap -I%S/Inputs \
+// RUN:     -emit-llvm -o %t/test.o -DNO_TEXTUAL_INCLUSION %s
+
+#ifndef NO_TEXTUAL_INCLUSION
 #include "Inputs/merge-decl-context/a.h"
+#endif
 #include "Inputs/merge-decl-context/b.h"
 #include "Inputs/merge-decl-context/c.h"
 #include "Inputs/merge-decl-context/d.h"
@@ -26,3 +32,5 @@
 void t() {
   ff(42);
 }
+
+static_assert(Aggregate{.member = 1}.member == 1, "");
diff --git a/test/Modules/method_pool_write.m b/test/Modules/method_pool_write.m
new file mode 100644
index 0000000000000..b7f8ac64b4d3e
--- /dev/null
+++ b/test/Modules/method_pool_write.m
@@ -0,0 +1,11 @@
+// RUN: rm -rf %t
+// RUN: %clang_cc1 -fmodules-cache-path=%t -fmodules -fimplicit-module-maps -fsyntax-only -I %S/Inputs %s -verify
+// expected-no-diagnostics
+
+@import MethodPoolCombined;
+@import MethodPoolString2;
+
+void message_kindof_object(__kindof S2 *kindof_S2) {
+  [kindof_S2 stringValue];
+}
+
diff --git a/test/Modules/minimal-identifier-tables.cpp b/test/Modules/minimal-identifier-tables.cpp
new file mode 100644
index 0000000000000..0674746e07fec
--- /dev/null
+++ b/test/Modules/minimal-identifier-tables.cpp
@@ -0,0 +1,10 @@
+// RUN: rm -rf %t
+// RUN: mkdir %t
+// RUN: echo 'extern int some_long_variable_name;' > %t/x.h
+// RUN: echo 'extern int some_long_variable_name;' > %t/y.h
+// RUN: echo 'module X { header "x.h" } module Y { header "y.h" }' > %t/map
+// RUN: %clang_cc1 -fmodules -x c++ -fmodule-name=X %t/map -emit-module -o %t/x.pcm
+// RUN: %clang_cc1 -fmodules -x c++ -fmodule-name=Y %t/map -fmodule-file=%t/x.pcm -emit-module -o %t/y.pcm
+// RUN: cat %t/y.pcm | FileCheck %s
+//
+// CHECK-NOT: some_long_variable_name
diff --git a/test/Modules/no-implicit-builds.cpp b/test/Modules/no-implicit-builds.cpp
index 374ed5e4181ef..fa4d5cb3c5308 100644
--- a/test/Modules/no-implicit-builds.cpp
+++ b/test/Modules/no-implicit-builds.cpp
@@ -1,5 +1,9 @@
 // RUN: rm -rf %t
 
+// RUN: %clang -x c++ -std=c++11 -fmodules -fno-implicit-modules /dev/null -### \
+// RUN:     2>&1 | FileCheck --check-prefix=CHECK-NO-MODULE-CACHE %s
+// CHECK-NO-MODULE-CACHE-NOT: -fmodules-cache-path
+
 // Produce an error if a module is needed, but not found.
 // RUN: %clang_cc1 -x c++ -std=c++11 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t \
 // RUN:     -fmodule-map-file=%S/Inputs/no-implicit-builds/b.modulemap \
diff --git a/test/Modules/objc-categories.m b/test/Modules/objc-categories.m
index e8549fabb50ee..42baf352fbf59 100644
--- a/test/Modules/objc-categories.m
+++ b/test/Modules/objc-categories.m
@@ -9,7 +9,7 @@
 @import category_bottom;
 
 // expected-note@Inputs/category_left.h:14 {{previous definition}}
-// expected-warning@Inputs/category_right.h:11 {{duplicate definition of category}}
+// expected-warning@Inputs/category_right.h:12 {{duplicate definition of category}}
 // expected-note@Inputs/category_top.h:1 {{receiver is instance of class declared here}}
 
 @interface Foo(Source)
diff --git a/test/Modules/parse-attributes.modulemap b/test/Modules/parse-attributes.modulemap
new file mode 100644
index 0000000000000..0d18325580a7a
--- /dev/null
+++ b/test/Modules/parse-attributes.modulemap
@@ -0,0 +1,12 @@
+// RUN: rm -rf %t.modules
+// RUN: not %clang_cc1 -fmodules -fmodules-cache-path=%t.modules \
+// RUN:   -fmodule-map-file=%s -I%S -include "Inputs/empty.h" \
+// RUN:   -fsyntax-only -x c++ /dev/null 2>&1 | FileCheck %s
+
+// CHECK: error: expected ']' to close attribute
+// CHECK-NOT: error: expected '{' to start module 'A'
+
+module A [system {
+  header "Inputs/empty.h"
+  private header "Inputs/empty.h"
+}
diff --git a/test/Modules/pch-module-macro.m b/test/Modules/pch-module-macro.m
new file mode 100644
index 0000000000000..cc9f687374516
--- /dev/null
+++ b/test/Modules/pch-module-macro.m
@@ -0,0 +1,9 @@
+// RUN: rm -rf %t
+// RUN: %clang_cc1 -emit-pch -fmodules-cache-path=%t -fmodules -fimplicit-module-maps -o %t.pch -I %S/Inputs -x objective-c-header %S/Inputs/pch-import-module-with-macro.pch
+// RUN: %clang_cc1 -fmodules-cache-path=%t -fmodules -fimplicit-module-maps -fsyntax-only -I %S/Inputs -include-pch %t.pch %s -verify
+// expected-no-diagnostics
+
+int test(int x) {
+  return my_fabs(x) + fabs(x);
+}
+
diff --git a/test/Modules/pr21547.cpp b/test/Modules/pr21547.cpp
new file mode 100644
index 0000000000000..c6275b4cc7f11
--- /dev/null
+++ b/test/Modules/pr21547.cpp
@@ -0,0 +1,8 @@
+// RUN: rm -rf %t
+// RUN: %clang_cc1 -I%S/Inputs/PR21547 -verify %s
+// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -I%S/Inputs/PR21547 -verify %s
+// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -I%S/Inputs/PR21547 -emit-llvm-only %s
+
+#include "Inputs/PR21547/FirstHeader.h"
+
+//expected-no-diagnostics
diff --git a/test/Modules/pr24954.cpp b/test/Modules/pr24954.cpp
new file mode 100644
index 0000000000000..407ee06e40250
--- /dev/null
+++ b/test/Modules/pr24954.cpp
@@ -0,0 +1,7 @@
+// RUN: rm -rf %t
+// RUN: %clang_cc1 -I%S/Inputs/PR24954 -verify %s
+// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -I%S/Inputs/PR24954 -verify %s
+
+#include "A.h"
+
+// expected-no-diagnostics
diff --git a/test/Modules/pr25501.cpp b/test/Modules/pr25501.cpp
new file mode 100644
index 0000000000000..18002d6dff3fa
--- /dev/null
+++ b/test/Modules/pr25501.cpp
@@ -0,0 +1,9 @@
+// RUN: rm -rf %t
+// RUN: %clang_cc1 -std=c++11 -fmodules -fmodule-map-file=%S/Inputs/PR25501/module.modulemap -fmodules-cache-path=%t -I%S/Inputs/PR25501 -verify %s
+
+#include "a2.h"
+#include "b.h"
+
+auto use = aaa;
+
+// expected-no-diagnostics
diff --git a/test/Modules/pr26014.cpp b/test/Modules/pr26014.cpp
new file mode 100644
index 0000000000000..f9ebd4e1dd7a5
--- /dev/null
+++ b/test/Modules/pr26014.cpp
@@ -0,0 +1,7 @@
+// RUN: rm -rf %t
+// RUN: %clang_cc1 -I%S/Inputs/PR26014 -verify %s
+// RUN: %clang_cc1 -fmodules -fmodule-map-file=%S/Inputs/PR26014/module.modulemap -fmodules-cache-path=%t -I%S/Inputs/PR26014 -verify %s
+
+#include "A.h"
+
+// expected-no-diagnostics
diff --git a/test/Modules/pr26179.cpp b/test/Modules/pr26179.cpp
new file mode 100644
index 0000000000000..f25f1ce24bdaf
--- /dev/null
+++ b/test/Modules/pr26179.cpp
@@ -0,0 +1,7 @@
+// RUN: rm -rf %t
+// RUN: %clang_cc1 -I%S/Inputs/PR26179 -verify %s
+// RUN: %clang_cc1 -fmodules -fmodule-map-file=%S/Inputs/PR26179/module.modulemap -fmodules-cache-path=%t -I%S/Inputs/PR26179 -verify %s
+
+#include "A.h"
+
+// expected-no-diagnostics
diff --git a/test/Modules/pr27041.cpp b/test/Modules/pr27041.cpp
new file mode 100644
index 0000000000000..9d06468b97bdc
--- /dev/null
+++ b/test/Modules/pr27041.cpp
@@ -0,0 +1,7 @@
+// RUN: rm -rf %t
+// RUN: %clang_cc1 -std=c++11 -I%S/Inputs/PR27041 -verify %s
+// RUN: %clang_cc1 -std=c++11 -fmodules -fmodule-map-file=%S/Inputs/PR27041/module.modulemap -fmodules-cache-path=%t -I%S/Inputs/PR27041 -verify %s
+
+#include "Rtypes.h"
+
+// expected-no-diagnostics
diff --git a/test/Modules/pr27186.cpp b/test/Modules/pr27186.cpp
new file mode 100644
index 0000000000000..02a8fe5b00e8a
--- /dev/null
+++ b/test/Modules/pr27186.cpp
@@ -0,0 +1,7 @@
+// RUN: rm -rf %t
+// RUN: %clang_cc1 -std=c++11 -I%S/Inputs/PR27186  -I%S/Inputs/PR27186/subdir/ -verify %s
+// RUN: %clang_cc1  -nostdsysteminc -std=c++11 -fmodules  -fmodule-map-file=%S/Inputs/PR27186/module.modulemap -fmodules-cache-path=%t -I%S/Inputs/PR27186/ -verify %s
+
+#include "Rtypes.h"
+
+// expected-no-diagnostics
diff --git a/test/Modules/pr27401.cpp b/test/Modules/pr27401.cpp
new file mode 100644
index 0000000000000..7d5479cb92434
--- /dev/null
+++ b/test/Modules/pr27401.cpp
@@ -0,0 +1,38 @@
+// RUN: rm -rf %t
+// RUN: %clang_cc1 -std=c++11 -I%S/Inputs/PR27401 -verify %s
+// RUN: %clang_cc1 -std=c++11 -fmodules -fmodule-map-file=%S/Inputs/PR27401/module.modulemap -fmodules-cache-path=%t -I%S/Inputs/PR27401 -verify %s
+
+#include "a.h"
+#define _LIBCPP_VECTOR
+template <class, class _Allocator>
+class __vector_base {
+protected:
+  _Allocator __alloc() const;
+  __vector_base(_Allocator);
+};
+
+template <class _Tp, class _Allocator = allocator>
+class vector : __vector_base<_Tp, _Allocator> {
+public:
+  vector() noexcept(is_nothrow_default_constructible<_Allocator>::value);
+  vector(const vector &);
+  vector(vector &&)
+      noexcept(is_nothrow_move_constructible<_Allocator>::value);
+};
+
+template <class _Tp, class _Allocator>
+vector<_Tp, _Allocator>::vector(const vector &__x) : __vector_base<_Tp, _Allocator>(__x.__alloc()) {}
+
+  struct CommentOptions {
+    vector<char>  ParseAllComments;
+    CommentOptions() {}
+  };
+  struct PrintingPolicy {
+    PrintingPolicy(CommentOptions LO) : LangOpts(LO) {}
+    CommentOptions LangOpts;
+  };
+
+#include "b.h"
+CommentOptions fn1() { return fn1(); }
+
+// expected-no-diagnostics
diff --git a/test/Modules/pr27513.cpp b/test/Modules/pr27513.cpp
new file mode 100644
index 0000000000000..28fbe5bd82551
--- /dev/null
+++ b/test/Modules/pr27513.cpp
@@ -0,0 +1,7 @@
+// RUN: rm -rf %t
+// RUN: %clang_cc1 -std=c++11 -I%S/Inputs/PR27513 -verify %s
+// RUN: %clang_cc1 -std=c++11 -fmodules -fmodule-map-file=%S/Inputs/PR27513/module.modulemap -fmodules-cache-path=%t -I%S/Inputs/PR27513 -verify %s
+
+#include "Inputs/PR27513/a.h"
+
+//expected-no-diagnostics
diff --git a/test/Modules/pr27739.cpp b/test/Modules/pr27739.cpp
new file mode 100644
index 0000000000000..b27dc1b09371f
--- /dev/null
+++ b/test/Modules/pr27739.cpp
@@ -0,0 +1,12 @@
+// RUN: rm -rf %t
+// RUN: %clang_cc1 -std=c++11 -internal-externc-isystem %S/Inputs/PR27739 -verify %s
+// RUN: %clang_cc1 -std=c++11 -fmodules -fmodule-map-file=%S/Inputs/PR27739/module.modulemap -fmodules-cache-path=%t -internal-externc-isystem %S/Inputs/PR27739/ -verify %s
+
+#include "DataInputHandler.h"
+
+void DataInputHandler::AddTree() {
+   fInputTrees[(char*)""];
+   fExplicitTrainTest[(char*)""];
+}
+
+// expected-no-diagnostics
diff --git a/test/Modules/pr27754.cpp b/test/Modules/pr27754.cpp
new file mode 100644
index 0000000000000..0482595429484
--- /dev/null
+++ b/test/Modules/pr27754.cpp
@@ -0,0 +1,7 @@
+// RUN: rm -rf %t
+// RUN: %clang_cc1 -std=c++11 -I%S/Inputs/PR27754 -verify %s
+// RUN: %clang_cc1 -std=c++11 -fmodules -fmodule-map-file=%S/Inputs/PR27754/module.modulemap -fmodules-cache-path=%t -I%S/Inputs/PR27754/ -verify %s
+
+#include "TMetaUtils.h"
+
+// expected-no-diagnostics
diff --git a/test/Modules/pr27890.cpp b/test/Modules/pr27890.cpp
new file mode 100644
index 0000000000000..8bb9a9fd5c917
--- /dev/null
+++ b/test/Modules/pr27890.cpp
@@ -0,0 +1,9 @@
+// RUN: rm -rf %t
+// RUN: %clang_cc1 -std=c++11 -I%S/Inputs/PR27890 -verify %s
+// RUN: %clang_cc1 -std=c++11 -fmodules -fmodule-map-file=%S/Inputs/PR27890/module.modulemap -fmodules-cache-path=%t -I%S/Inputs/PR27890 -verify %s
+
+#include "a.h"
+enum ActionType {};
+opt<ActionType> a(values(""));
+
+// expected-no-diagnostics
+\ No newline at end of file
diff --git a/test/Modules/preprocess.cpp b/test/Modules/preprocess.cpp
new file mode 100644
index 0000000000000..0615331c8bd79
--- /dev/null
+++ b/test/Modules/preprocess.cpp
@@ -0,0 +1,24 @@
+// RUN: rm -rf %t
+// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -I %S/Inputs -x c++ -E %s | \
+// RUN:   FileCheck -strict-whitespace %s --check-prefix=CHECK --check-prefix=CXX --check-prefix=CXX-DASHE
+// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -I %S/Inputs -x objective-c -E %s | \
+// RUN:   FileCheck -strict-whitespace %s --check-prefix=CHECK --check-prefix=OBJC
+// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -I %S/Inputs -x c++ -E -frewrite-includes %s | \
+// RUN:   FileCheck -strict-whitespace %s --check-prefix=CHECK --check-prefix=CXX
+// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -I %S/Inputs -x objective-c -E -frewrite-includes %s | \
+// RUN:   FileCheck -strict-whitespace %s --check-prefix=CHECK --check-prefix=OBJC
+#include "dummy.h"
+#include "dummy.h"
+foo bar baz
+
+// The weird {{ }} here is to prevent the -frewrite-includes test from matching its own CHECK lines.
+
+// CXX: #include{{ }}"dummy.h"
+// CXX-DASHE-SAME: /* clang -E: implicit import for module dummy */
+// CXX: #include{{ }}"dummy.h"
+// CXX-DASHE-SAME: /* clang -E: implicit import for module dummy */
+// CXX: foo bar baz
+
+// OBJC: @import{{ }}dummy; /* clang 
+// OBJC: @import{{ }}dummy; /* clang 
+// OBJC: foo bar baz
diff --git a/test/Modules/submodules-merge-defs.cpp b/test/Modules/submodules-merge-defs.cpp
index 23d1f5cfb12b5..4ab822a022bd8 100644
--- a/test/Modules/submodules-merge-defs.cpp
+++ b/test/Modules/submodules-merge-defs.cpp
@@ -58,6 +58,11 @@ G::A pre_ga // expected-error +{{must be imported}}
 decltype(G::h) pre_gh = G::h; // expected-error +{{must be imported}}
 // expected-note@defs.h:51 +{{here}}
 
+int pre_h = H(); // expected-error +{{must be imported}}
+// expected-note@defs.h:56 +{{here}}
+using pre_i = I<>; // expected-error +{{must be imported}}
+// expected-note@defs.h:57 +{{here}}
+
 J<> pre_j; // expected-error {{declaration of 'J' must be imported}}
 #ifdef IMPORT_USE_2
 // expected-error-re@-2 {{default argument of 'J' must be imported from one of {{.*}}stuff.use{{.*}}stuff.use-2}}
@@ -99,6 +104,8 @@ int post_ff = F<char>().f();
 int post_fg = F<char>().g<int>();
 G::A post_ga = G::a;
 decltype(G::h) post_gh = G::h;
+int post_h = H();
+using post_i = I<>;
 J<> post_j;
 template<typename T, int N, template<typename> class K> struct J;
 J<> post_j2;
diff --git a/test/Modules/suggest-include.cpp b/test/Modules/suggest-include.cpp
new file mode 100644
index 0000000000000..e10c3f38aba28
--- /dev/null
+++ b/test/Modules/suggest-include.cpp
@@ -0,0 +1,33 @@
+// RUN: rm -rf %t
+// RUN: %clang_cc1 -fmodules -fmodules-cache-path=%t -fimplicit-module-maps -I%S/Inputs/suggest-include %s -verify
+
+#include "empty.h" // import the module file
+
+// expected-note@usetextual1.h:2 {{previous}}
+// expected-note@textual2.h:1 {{previous}}
+// expected-note@textual3.h:1 {{previous}}
+// expected-note@textual4.h:1 {{previous}}
+// expected-note@textual5.h:1 {{previous}}
+// expected-note@private1.h:1 {{previous}}
+// expected-note@private2.h:1 {{previous}}
+// expected-note@private3.h:1 {{previous}}
+
+void f() {
+  (void)::usetextual1; // expected-error {{missing '#include "usetextual1.h"'}}
+  (void)::usetextual2; // expected-error {{missing '#include "usetextual2.h"'}}
+  (void)::textual3; // expected-error-re {{{{^}}missing '#include "usetextual3.h"'}}
+  // Don't suggest a #include that includes the entity via a path that leaves
+  // the module. In that case we can't be sure that we've picked the right header.
+  (void)::textual4; // expected-error-re {{{{^}}declaration of 'textual4'}}
+  (void)::textual5; // expected-error-re {{{{^}}declaration of 'textual5'}}
+
+  // Don't suggest #including a private header.
+  // FIXME: We could suggest including "useprivate1.h" here, as it's the only
+  // public way to get at this declaration.
+  (void)::private1; // expected-error-re {{{{^}}declaration of 'private1'}}
+  // FIXME: Should we be suggesting an import at all here? Should declarations
+  // in private headers be visible when the surrounding module is imported?
+  (void)::private2; // expected-error-re {{{{^}}declaration of 'private2'}}
+  // Even if we suggest an include for private1, we should not do so here.
+  (void)::private3; // expected-error-re {{{{^}}declaration of 'private3'}}
+}
diff --git a/test/Modules/typo.m b/test/Modules/typo.m
new file mode 100644
index 0000000000000..7e5108df6bede
--- /dev/null
+++ b/test/Modules/typo.m
@@ -0,0 +1,8 @@
+// RUN: rm -rf %t
+// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -I %S/Inputs -x objective-c-header %S/Inputs/typo.h -emit-pch -o %t.pch
+// RUN: %clang_cc1 -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -I %S/Inputs -include-pch %t.pch %s -verify
+
+void test() {
+  [Nsstring alloc]; // expected-error {{unknown receiver 'Nsstring'; did you mean 'NSString'}}
+                    // expected-note@typo.h:* {{here}}
+}
diff --git a/test/Modules/use-after-free.m b/test/Modules/use-after-free.m
new file mode 100644
index 0000000000000..fa7200933afd6
--- /dev/null
+++ b/test/Modules/use-after-free.m
@@ -0,0 +1,24 @@
+// RUN: rm -rf %t
+
+// Here, we build the module without "non-modular-include-in-framework-module".
+// RUN: echo '@import UseAfterFreePublic;' | \
+// RUN:   %clang_cc1 -fmodules -fimplicit-module-maps \
+// RUN:     -fmodules-cache-path=%t -isystem %S/Inputs/UseAfterFree/ -fsyntax-only \
+// RUN:     -x objective-c -
+
+// RUN:   %clang_cc1 -fmodules -fimplicit-module-maps \
+// RUN:     -fmodules-cache-path=%t -isystem %S/Inputs/UseAfterFree/ -fsyntax-only \
+// RUN:     -Wnon-modular-include-in-framework-module -Werror=non-modular-include-in-framework-module \
+// RUN:     -x objective-c %s -verify
+// expected-no-diagnostics
+
+// Here, we load the module UseAfterFreePublic, it is treated as a system module,
+// we ignore the inconsistency for "non-modular-include-in-framework-module".
+@import UseAfterFreePublic;
+
+// We start a thread to build the module for UseAfterFreePrivate.h. In the thread,
+// we load UseAfterFreePublic and should treat it as a system module as well.
+// If not, we will invalidate UseAfterFreePublic because of the inconsistency
+// for "non-modular-include-in-framework-module", and have a use-after-free error
+// of the FileEntry.
+#import <UseAfterFreePrivate.h>
diff --git a/test/OpenMP/critical_codegen.cpp b/test/OpenMP/critical_codegen.cpp
index e44e2202e9a2e..be749a65f0cb7 100644
--- a/test/OpenMP/critical_codegen.cpp
+++ b/test/OpenMP/critical_codegen.cpp
@@ -39,7 +39,11 @@ int main() {
 #pragma omp critical(the_name1) hint(23)
   foo();
 // CHECK:       call {{.*}}void @__kmpc_critical([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], [8 x i32]* [[THE_NAME_LOCK]])
+// CHECK:       br label
 // CHECK-NOT:   call {{.*}}void @__kmpc_end_critical(
+// CHECK:       br label
+// CHECK-NOT:   call {{.*}}void @__kmpc_end_critical(
+// CHECK:       br label
   if (a)
 #pragma omp critical(the_name)
     while (1)
@@ -60,6 +64,8 @@ void critical_ref(S &s) {
   // CHECK: [[S_REF:%.+]] = load %struct.S*, %struct.S** [[S_ADDR]],
   // CHECK: [[S_A_REF:%.+]] = getelementptr inbounds %struct.S, %struct.S* [[S_REF]], i32 0, i32 0
   ++s.a;
+  // CHECK: [[S_REF:%.+]] = load %struct.S*, %struct.S** [[S_ADDR]],
+  // CHECK: store %struct.S* [[S_REF]], %struct.S** [[S_ADDR:%.+]],
   // CHECK: call void @__kmpc_critical(
 #pragma omp critical
   // CHECK: [[S_REF:%.+]] = load %struct.S*, %struct.S** [[S_ADDR]],
diff --git a/test/OpenMP/declare_reduction_ast_print.c b/test/OpenMP/declare_reduction_ast_print.c
new file mode 100644
index 0000000000000..7b97a7c1da7c3
--- /dev/null
+++ b/test/OpenMP/declare_reduction_ast_print.c
@@ -0,0 +1,42 @@
+// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+#pragma omp declare reduction(+ : int, char : omp_out *= omp_in)
+// CHECK: #pragma omp declare reduction (+ : int : omp_out *= omp_in)
+// CHECK-NEXT: #pragma omp declare reduction (+ : char : omp_out *= omp_in)
+
+#pragma omp declare reduction(fun : float : omp_out += omp_in) initializer(omp_priv = omp_orig + 15)
+// CHECK: #pragma omp declare reduction (fun : float : omp_out += omp_in) initializer(omp_priv = omp_orig + 15)
+
+// CHECK: struct SSS {
+struct SSS {
+  int field;
+#pragma omp declare reduction(+ : int, char : omp_out *= omp_in)
+  // CHECK: #pragma omp declare reduction (+ : int : omp_out *= omp_in)
+  // CHECK-NEXT: #pragma omp declare reduction (+ : char : omp_out *= omp_in)
+};
+// CHECK: };
+
+void init(struct SSS *priv, struct SSS orig);
+
+#pragma omp declare reduction(fun : struct SSS : omp_out = omp_in) initializer(init(&omp_priv, omp_orig))
+// CHECK: #pragma omp declare reduction (fun : struct SSS : omp_out = omp_in) initializer(init(&omp_priv, omp_orig))
+
+// CHECK: int main() {
+int main() {
+#pragma omp declare reduction(fun : struct SSS : omp_out = omp_in) initializer(init(&omp_priv, omp_orig))
+  // CHECK: #pragma omp declare reduction (fun : struct SSS : omp_out = omp_in) initializer(init(&omp_priv, omp_orig))
+  {
+#pragma omp declare reduction(fun : struct SSS : omp_out = omp_in) initializer(init(&omp_priv, omp_orig))
+  // CHECK: #pragma omp declare reduction (fun : struct SSS : omp_out = omp_in) initializer(init(&omp_priv, omp_orig))
+  }
+  return 0;
+}
+// CHECK: }
+
+#endif
diff --git a/test/OpenMP/declare_reduction_ast_print.cpp b/test/OpenMP/declare_reduction_ast_print.cpp
new file mode 100644
index 0000000000000..26b2ff96ac022
--- /dev/null
+++ b/test/OpenMP/declare_reduction_ast_print.cpp
@@ -0,0 +1,69 @@
+// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+#pragma omp declare reduction(+ : int, char : omp_out *= omp_in)
+// CHECK: #pragma omp declare reduction (+ : int : omp_out *= omp_in)
+// CHECK-NEXT: #pragma omp declare reduction (+ : char : omp_out *= omp_in)
+
+// CHECK: #pragma omp declare reduction (fun : int : omp_out += omp_in) initializer(omp_priv = omp_orig + 15)
+
+template <class T>
+class SSS {
+public:
+#pragma omp declare reduction(fun : T : omp_out += omp_in) initializer(omp_priv = omp_orig + 15)
+  // CHECK: #pragma omp declare reduction (fun : T : omp_out += omp_in) initializer(omp_priv = omp_orig + 15)
+};
+
+SSS<int> d;
+
+void init(SSS<int> &lhs, SSS<int> rhs);
+
+#pragma omp declare reduction(fun : SSS < int > : omp_out = omp_in) initializer(init(omp_priv, omp_orig))
+// CHECK: #pragma omp declare reduction (fun : SSS<int> : omp_out = omp_in) initializer(init(omp_priv, omp_orig))
+
+// CHECK: template <typename T = int> int foo(int a) {
+// CHECK: #pragma omp declare reduction (fun : int : omp_out += omp_in) initializer(omp_priv = omp_orig + 15);
+// CHECK: {
+// CHECK: #pragma omp declare reduction (fun : int : omp_out += omp_in) initializer(omp_priv = omp_orig + 15);
+// CHECK: }
+// CHECK: return a;
+// CHECK: }
+
+// CHECK: template <typename T> T foo(T a) {
+// CHECK: #pragma omp declare reduction (fun : T : omp_out += omp_in) initializer(omp_priv = omp_orig + 15);
+// CHECK: {
+// CHECK: #pragma omp declare reduction (fun : T : omp_out += omp_in) initializer(omp_priv = omp_orig + 15);
+// CHECK: }
+// CHECK: return a;
+// CHECK: }
+template <typename T>
+T foo(T a) {
+#pragma omp declare reduction(fun : T : omp_out += omp_in) initializer(omp_priv = omp_orig + 15)
+  {
+#pragma omp declare reduction(fun : T : omp_out += omp_in) initializer(omp_priv = omp_orig + 15)
+  }
+  return a;
+}
+
+int main() {
+  int i = 0;
+  SSS<int> sss;
+  // TODO: Add support for scoped reduction identifiers
+  //  #pragma omp parallel reduction(SSS<int>::fun : i)
+  // TODO-CHECK: #pragma omp parallel reduction(SSS<int>::fun: i)
+  {
+    i += 1;
+  }
+  // #pragma omp parallel reduction(::fun:sss)
+  // TODO-CHECK: #pragma omp parallel reduction(::fun: sss)
+  {
+  }
+  return foo(15);
+}
+
+#endif
diff --git a/test/OpenMP/declare_reduction_codegen.c b/test/OpenMP/declare_reduction_codegen.c
new file mode 100644
index 0000000000000..ecb97269d4db5
--- /dev/null
+++ b/test/OpenMP/declare_reduction_codegen.c
@@ -0,0 +1,158 @@
+// RUN: %clang_cc1 -verify -fopenmp -x c -emit-llvm %s -triple %itanium_abi_triple -o - -femit-all-decls -disable-llvm-optzns | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c -triple %itanium_abi_triple -emit-pch -o %t %s -femit-all-decls -disable-llvm-optzns
+// RUN: %clang_cc1 -fopenmp -x c -triple %itanium_abi_triple -include-pch %t -verify %s -emit-llvm -o - -femit-all-decls -disable-llvm-optzns | FileCheck --check-prefix=CHECK-LOAD %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+// CHECK: [[SSS_INT:.+]] = type { i32 }
+// CHECK-LOAD: [[SSS_INT:.+]] = type { i32 }
+
+#pragma omp declare reduction(+ : int, char : omp_out *= omp_in)
+// CHECK: define internal {{.*}}void @{{[^(]+}}(i32* noalias, i32* noalias)
+// CHECK: [[MUL:%.+]] = mul nsw i32
+// CHECK-NEXT: store i32 [[MUL]], i32*
+// CHECK-NEXT: ret void
+// CHECK-NEXT: }
+// CHECK-LOAD: define internal {{.*}}void @{{[^(]+}}(i32* noalias, i32* noalias)
+// CHECK-LOAD: [[MUL:%.+]] = mul nsw i32
+// CHECK-LOAD-NEXT: store i32 [[MUL]], i32*
+// CHECK-LOAD-NEXT: ret void
+// CHECK-LOAD-NEXT: }
+
+// CHECK: define internal {{.*}}void @{{[^(]+}}(i8* noalias, i8* noalias)
+// CHECK: sext i8
+// CHECK: sext i8
+// CHECK: [[MUL:%.+]] = mul nsw i32
+// CHECK-NEXT: [[TRUNC:%.+]] = trunc i32 [[MUL]] to i8
+// CHECK-NEXT: store i8 [[TRUNC]], i8*
+// CHECK-NEXT: ret void
+// CHECK-NEXT: }
+// CHECK-LOAD: define internal {{.*}}void @{{[^(]+}}(i8* noalias, i8* noalias)
+// CHECK-LOAD: sext i8
+// CHECK-LOAD: sext i8
+// CHECK-LOAD: [[MUL:%.+]] = mul nsw i32
+// CHECK-LOAD-NEXT: [[TRUNC:%.+]] = trunc i32 [[MUL]] to i8
+// CHECK-LOAD-NEXT: store i8 [[TRUNC]], i8*
+// CHECK-LOAD-NEXT: ret void
+// CHECK-LOAD-NEXT: }
+
+#pragma omp declare reduction(fun : float : omp_out += omp_in) initializer(omp_priv = 15 + omp_orig)
+// CHECK: define internal {{.*}}void @{{[^(]+}}(float* noalias, float* noalias)
+// CHECK: [[ADD:%.+]] = fadd float
+// CHECK-NEXT: store float [[ADD]], float*
+// CHECK-NEXT: ret void
+// CHECK-NEXT: }
+// CHECK: define internal {{.*}}void @{{[^(]+}}(float* noalias, float* noalias)
+// CHECK: [[ADD:%.+]] = fadd float 1.5
+// CHECK-NEXT: store float [[ADD]], float*
+// CHECK-NEXT: ret void
+// CHECK-NEXT: }
+// CHECK-LOAD: define internal {{.*}}void @{{[^(]+}}(float* noalias, float* noalias)
+// CHECK-LOAD: [[ADD:%.+]] = fadd float
+// CHECK-LOAD-NEXT: store float [[ADD]], float*
+// CHECK-LOAD-NEXT: ret void
+// CHECK-LOAD-NEXT: }
+// CHECK-LOAD: define internal {{.*}}void @{{[^(]+}}(float* noalias, float* noalias)
+// CHECK-LOAD: [[ADD:%.+]] = fadd float 1.5
+// CHECK-LOAD-NEXT: store float [[ADD]], float*
+// CHECK-LOAD-NEXT: ret void
+// CHECK-LOAD-NEXT: }
+
+struct SSS {
+  int field;
+#pragma omp declare reduction(+ : int, char : omp_out *= omp_in)
+  // CHECK: define internal {{.*}}void @{{[^(]+}}(i32* noalias, i32* noalias)
+  // CHECK: [[MUL:%.+]] = mul nsw i32
+  // CHECK-NEXT: store i32 [[MUL]], i32*
+  // CHECK-NEXT: ret void
+  // CHECK-NEXT: }
+
+  // CHECK: define internal {{.*}}void @{{[^(]+}}(i8* noalias, i8* noalias)
+  // CHECK: sext i8
+  // CHECK: sext i8
+  // CHECK: [[MUL:%.+]] = mul nsw i32
+  // CHECK-NEXT: [[TRUNC:%.+]] = trunc i32 [[MUL]] to i8
+  // CHECK-NEXT: store i8 [[TRUNC]], i8*
+  // CHECK-NEXT: ret void
+  // CHECK-NEXT: }
+};
+
+void init(struct SSS *priv, struct SSS orig);
+
+#pragma omp declare reduction(fun : struct SSS : omp_out = omp_in) initializer(init(&omp_priv, omp_orig))
+// CHECK: define internal {{.*}}void @{{[^(]+}}([[SSS_INT]]* noalias, [[SSS_INT]]* noalias)
+// CHECK: call void @llvm.memcpy
+// CHECK-NEXT: ret void
+// CHECK-NEXT: }
+// CHECK: define internal {{.*}}void @{{[^(]+}}([[SSS_INT]]* noalias, [[SSS_INT]]* noalias)
+// CHECK: call void @init(
+// CHECK-NEXT: ret void
+// CHECK-NEXT: }
+// CHECK-LOAD: define internal {{.*}}void @{{[^(]+}}([[SSS_INT]]* noalias, [[SSS_INT]]* noalias)
+// CHECK-LOAD: call void @llvm.memcpy
+// CHECK-LOAD-NEXT: ret void
+// CHECK-LOAD-NEXT: }
+// CHECK-LOAD: define internal {{.*}}void @{{[^(]+}}([[SSS_INT]]* noalias, [[SSS_INT]]* noalias)
+// CHECK-LOAD: call void @init(
+// CHECK-LOAD-NEXT: ret void
+// CHECK-LOAD-NEXT: }
+
+// CHECK-LABEL: @main
+// CHECK-LOAD-LABEL: @main
+int main() {
+#pragma omp declare reduction(fun : struct SSS : omp_out = omp_in) initializer(init(&omp_priv, omp_orig))
+  // CHECK: define internal {{.*}}void @{{[^(]+}}([[SSS_INT]]* noalias, [[SSS_INT]]* noalias)
+  // CHECK: call void @llvm.memcpy
+  // CHECK-NEXT: ret void
+  // CHECK-NEXT: }
+  // CHECK: define internal {{.*}}void @{{[^(]+}}([[SSS_INT]]* noalias, [[SSS_INT]]* noalias)
+  // CHECK: call void @init(
+  // CHECK-NEXT: ret void
+  // CHECK-NEXT: }
+  // CHECK-LOAD: define internal {{.*}}void @{{[^(]+}}([[SSS_INT]]* noalias, [[SSS_INT]]* noalias)
+  // CHECK-LOAD: call void @llvm.memcpy
+  // CHECK-LOAD-NEXT: ret void
+  // CHECK-LOAD-NEXT: }
+  // CHECK-LOAD: define internal {{.*}}void @{{[^(]+}}([[SSS_INT]]* noalias, [[SSS_INT]]* noalias)
+  // CHECK-LOAD: call void @init(
+  // CHECK-LOAD-NEXT: ret void
+  // CHECK-LOAD-NEXT: }
+  {
+#pragma omp declare reduction(fun : struct SSS : omp_out = omp_in) initializer(init(&omp_priv, omp_orig))
+    // CHECK: define internal {{.*}}void @{{[^(]+}}([[SSS_INT]]* noalias, [[SSS_INT]]* noalias)
+    // CHECK: call void @llvm.memcpy
+    // CHECK-NEXT: ret void
+    // CHECK-NEXT: }
+    // CHECK: define internal {{.*}}void @{{[^(]+}}([[SSS_INT]]* noalias, [[SSS_INT]]* noalias)
+    // CHECK: call void @init(
+    // CHECK-NEXT: ret void
+    // CHECK-NEXT: }
+    // CHECK-LOAD: define internal {{.*}}void @{{[^(]+}}([[SSS_INT]]* noalias, [[SSS_INT]]* noalias)
+    // CHECK-LOAD: call void @llvm.memcpy
+    // CHECK-LOAD-NEXT: ret void
+    // CHECK-LOAD-NEXT: }
+    // CHECK-LOAD: define internal {{.*}}void @{{[^(]+}}([[SSS_INT]]* noalias, [[SSS_INT]]* noalias)
+    // CHECK-LOAD: call void @init(
+    // CHECK-LOAD-NEXT: ret void
+    // CHECK-LOAD-NEXT: }
+  }
+  return 0;
+}
+
+// CHECK-LOAD: define internal {{.*}}void @{{[^(]+}}(i32* noalias, i32* noalias)
+// CHECK-LOAD: [[MUL:%.+]] = mul nsw i32
+// CHECK-LOAD-NEXT: store i32 [[MUL]], i32*
+// CHECK-LOAD-NEXT: ret void
+// CHECK-LOAD-NEXT: }
+
+// CHECK-LOAD: define internal {{.*}}void @{{[^(]+}}(i8* noalias, i8* noalias)
+// CHECK-LOAD: sext i8
+// CHECK-LOAD: sext i8
+// CHECK-LOAD: [[MUL:%.+]] = mul nsw i32
+// CHECK-LOAD-NEXT: [[TRUNC:%.+]] = trunc i32 [[MUL]] to i8
+// CHECK-LOAD-NEXT: store i8 [[TRUNC]], i8*
+// CHECK-LOAD-NEXT: ret void
+// CHECK-LOAD-NEXT: }
+#endif
diff --git a/test/OpenMP/declare_reduction_codegen.cpp b/test/OpenMP/declare_reduction_codegen.cpp
new file mode 100644
index 0000000000000..a18e73f8a10f7
--- /dev/null
+++ b/test/OpenMP/declare_reduction_codegen.cpp
@@ -0,0 +1,182 @@
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -emit-llvm %s -triple %itanium_abi_triple -fexceptions -fcxx-exceptions -o - -femit-all-decls -disable-llvm-optzns | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple %itanium_abi_triple -fexceptions -fcxx-exceptions -emit-pch -o %t %s -femit-all-decls -disable-llvm-optzns
+// RUN: %clang_cc1 -fopenmp -x c++ -triple %itanium_abi_triple -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - -femit-all-decls -disable-llvm-optzns | FileCheck --check-prefix=CHECK-LOAD %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+// CHECK: [[SSS_INT:.+]] = type { i32 }
+// CHECK-LOAD: [[SSS_INT:.+]] = type { i32 }
+
+#pragma omp declare reduction(+ : int, char : omp_out *= omp_in)
+// CHECK: define internal {{.*}}void @{{[^(]+}}(i32* noalias, i32* noalias)
+// CHECK: [[MUL:%.+]] = mul nsw i32
+// CHECK-NEXT: store i32 [[MUL]], i32*
+// CHECK-NEXT: ret void
+// CHECK-NEXT: }
+// CHECK-LOAD: define internal {{.*}}void @{{[^(]+}}(i32* noalias, i32* noalias)
+// CHECK-LOAD: [[MUL:%.+]] = mul nsw i32
+// CHECK-LOAD-NEXT: store i32 [[MUL]], i32*
+// CHECK-LOAD-NEXT: ret void
+// CHECK-LOAD-NEXT: }
+
+// CHECK: define internal {{.*}}void @{{[^(]+}}(i8* noalias, i8* noalias)
+// CHECK: sext i8
+// CHECK: sext i8
+// CHECK: [[MUL:%.+]] = mul nsw i32
+// CHECK-NEXT: [[TRUNC:%.+]] = trunc i32 [[MUL]] to i8
+// CHECK-NEXT: store i8 [[TRUNC]], i8*
+// CHECK-NEXT: ret void
+// CHECK-NEXT: }
+
+// CHECK-LOAD: define internal {{.*}}void @{{[^(]+}}(i8* noalias, i8* noalias)
+// CHECK-LOAD: sext i8
+// CHECK-LOAD: sext i8
+// CHECK-LOAD: [[MUL:%.+]] = mul nsw i32
+// CHECK-LOAD-NEXT: [[TRUNC:%.+]] = trunc i32 [[MUL]] to i8
+// CHECK-LOAD-NEXT: store i8 [[TRUNC]], i8*
+// CHECK-LOAD-NEXT: ret void
+// CHECK-LOAD-NEXT: }
+
+template <class T>
+struct SSS {
+  T a;
+  SSS() : a() {}
+#pragma omp declare reduction(fun : T : omp_out ^= omp_in) initializer(omp_priv = 24 + omp_orig)
+};
+
+SSS<int> d;
+
+// CHECK: define internal {{.*}}void @{{[^(]+}}(i32* noalias, i32* noalias)
+// CHECK: [[XOR:%.+]] = xor i32
+// CHECK-NEXT: store i32 [[XOR]], i32*
+// CHECK-NEXT: ret void
+// CHECK-NEXT: }
+
+// CHECK: define internal {{.*}}void @{{[^(]+}}(i32* noalias, i32* noalias)
+// CHECK: [[ADD:%.+]] = add nsw i32 24,
+// CHECK-NEXT: store i32 [[ADD]], i32*
+// CHECK-NEXT: ret void
+// CHECK-NEXT: }
+
+// CHECK: define {{.*}}void [[INIT:@[^(]+]]([[SSS_INT]]*
+// CHECK-LOAD: define {{.*}}void [[INIT:@[^(]+]]([[SSS_INT]]*
+void init(SSS<int> &lhs, SSS<int> &rhs) {}
+
+#pragma omp declare reduction(fun : SSS < int > : omp_out = omp_in) initializer(init(omp_priv, omp_orig))
+// CHECK: define internal {{.*}}void @{{[^(]+}}([[SSS_INT]]* noalias, [[SSS_INT]]* noalias)
+// CHECK: call void @llvm.memcpy
+// CHECK-NEXT: ret void
+// CHECK-NEXT: }
+// CHECK: define internal {{.*}}void @{{[^(]+}}([[SSS_INT]]* noalias, [[SSS_INT]]* noalias)
+// CHECK: call {{.*}}void [[INIT]](
+// CHECK-NEXT: ret void
+// CHECK-NEXT: }
+
+// CHECK-LOAD: define internal {{.*}}void @{{[^(]+}}([[SSS_INT]]* noalias, [[SSS_INT]]* noalias)
+// CHECK-LOAD: call void @llvm.memcpy
+// CHECK-LOAD-NEXT: ret void
+// CHECK-LOAD-NEXT: }
+// CHECK-LOAD: define internal {{.*}}void @{{[^(]+}}([[SSS_INT]]* noalias, [[SSS_INT]]* noalias)
+// CHECK-LOAD: call {{.*}}void [[INIT]](
+// CHECK-LOAD-NEXT: ret void
+// CHECK-LOAD-NEXT: }
+
+template <typename T>
+T foo(T a) {
+#pragma omp declare reduction(fun : T : omp_out += omp_in) initializer(omp_priv = 15 * omp_orig)
+  {
+#pragma omp declare reduction(fun : T : omp_out /= omp_in) initializer(omp_priv = 11 - omp_orig)
+  }
+  return a;
+}
+
+// CHECK-LABEL: @main
+int main() {
+  int i = 0;
+  SSS<int> sss;
+#pragma omp parallel reduction(SSS < int > ::fun : i)
+  {
+    i += 1;
+  }
+#pragma omp parallel reduction(::fun : sss)
+  {
+  }
+#pragma omp declare reduction(fun : SSS < int > : init(omp_out, omp_in))
+#pragma omp parallel reduction(fun : sss)
+  {
+  }
+  // CHECK: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(
+  // CHECK: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(
+  // CHECK: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call({{[^@]*}} @{{[^@]*}}[[REGION:@[^ ]+]]
+  // CHECK-LABEL: foo
+  return foo(15);
+}
+
+// CHECK: define internal {{.*}}void [[REGION]](
+// CHECK: [[SSS_PRIV:%.+]] = alloca %struct.SSS,
+// CHECK: invoke {{.*}} @_ZN3SSSIiEC1Ev(%struct.SSS* [[SSS_PRIV]])
+// CHECK-NOT: {{call |invoke }}
+// CHECK: call {{.*}}i32 @__kmpc_reduce_nowait(
+
+// CHECK-LABEL: i32 @{{.+}}foo{{[^(].+}}(i32
+// CHECK-LOAD-LABEL: i32 @{{.+}}foo{{[^(].+}}(i32
+
+// CHECK-LOAD: define internal {{.*}}void @{{[^(]+}}(i32* noalias, i32* noalias)
+// CHECK-LOAD: [[XOR:%.+]] = xor i32
+// CHECK-LOAD-NEXT: store i32 [[XOR]], i32*
+// CHECK-LOAD-NEXT: ret void
+// CHECK-LOAD-NEXT: }
+
+// CHECK-LOAD: define internal {{.*}}void @{{[^(]+}}(i32* noalias, i32* noalias)
+// CHECK-LOAD: [[ADD:%.+]] = add nsw i32 24,
+// CHECK-LOAD-NEXT: store i32 [[ADD]], i32*
+// CHECK-LOAD-NEXT: ret void
+// CHECK-LOAD-NEXT: }
+
+// CHECK: define internal {{.*}}void @{{[^(]+}}(i32* noalias, i32* noalias)
+// CHECK: [[ADD:%.+]] = add nsw i32
+// CHECK-NEXT: store i32 [[ADD]], i32*
+// CHECK-NEXT: ret void
+// CHECK-NEXT: }
+// CHECK-LOAD: define internal {{.*}}void @{{[^(]+}}(i32* noalias, i32* noalias)
+// CHECK-LOAD: [[ADD:%.+]] = add nsw i32
+// CHECK-LOAD-NEXT: store i32 [[ADD]], i32*
+// CHECK-LOAD-NEXT: ret void
+// CHECK-LOAD-NEXT: }
+
+// CHECK: define internal {{.*}}void @{{[^(]+}}(i32* noalias, i32* noalias)
+// CHECK: [[MUL:%.+]] = mul nsw i32 15,
+// CHECK-NEXT: store i32 [[MUL]], i32*
+// CHECK-NEXT: ret void
+// CHECK-NEXT: }
+// CHECK-LOAD: define internal {{.*}}void @{{[^(]+}}(i32* noalias, i32* noalias)
+// CHECK-LOAD: [[MUL:%.+]] = mul nsw i32 15,
+// CHECK-LOAD-NEXT: store i32 [[MUL]], i32*
+// CHECK-LOAD-NEXT: ret void
+// CHECK-LOAD-NEXT: }
+
+// CHECK: define internal {{.*}}void @{{[^(]+}}(i32* noalias, i32* noalias)
+// CHECK: [[DIV:%.+]] = sdiv i32
+// CHECK-NEXT: store i32 [[DIV]], i32*
+// CHECK-NEXT: ret void
+// CHECK-NEXT: }
+// CHECK-LOAD: define internal {{.*}}void @{{[^(]+}}(i32* noalias, i32* noalias)
+// CHECK-LOAD: [[DIV:%.+]] = sdiv i32
+// CHECK-LOAD-NEXT: store i32 [[DIV]], i32*
+// CHECK-LOAD-NEXT: ret void
+// CHECK-LOAD-NEXT: }
+
+// CHECK: define internal {{.*}}void @{{[^(]+}}(i32* noalias, i32* noalias)
+// CHECK: [[SUB:%.+]] = sub nsw i32 11,
+// CHECK-NEXT: store i32 [[SUB]], i32*
+// CHECK-NEXT: ret void
+// CHECK-NEXT: }
+// CHECK-LOAD: define internal {{.*}}void @{{[^(]+}}(i32* noalias, i32* noalias)
+// CHECK-LOAD: [[SUB:%.+]] = sub nsw i32 11,
+// CHECK-LOAD-NEXT: store i32 [[SUB]], i32*
+// CHECK-LOAD-NEXT: ret void
+// CHECK-LOAD-NEXT: }
+
+#endif
diff --git a/test/OpenMP/declare_reduction_messages.c b/test/OpenMP/declare_reduction_messages.c
new file mode 100644
index 0000000000000..fb9eacc90aa26
--- /dev/null
+++ b/test/OpenMP/declare_reduction_messages.c
@@ -0,0 +1,52 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
+
+int temp; // expected-note 6 {{'temp' declared here}}
+
+#pragma omp declare reduction                                              // expected-error {{expected '(' after 'declare reduction'}}
+#pragma omp declare reduction {                                            // expected-error {{expected '(' after 'declare reduction'}}
+#pragma omp declare reduction(                                             // expected-error {{expected identifier or one of the following operators: '+', '-', '*', '&', '|', '^', '&&', or '||'}}
+#pragma omp declare reduction(#                                            // expected-error {{expected identifier or one of the following operators: '+', '-', '*', '&', '|', '^', '&&', or '||'}}
+#pragma omp declare reduction(/                                            // expected-error {{expected identifier or one of the following operators: '+', '-', '*', '&', '|', '^', '&&', or '||'}}
+#pragma omp declare reduction(+                                            // expected-error {{expected ':'}}
+#pragma omp declare reduction(for                                          // expected-error {{expected identifier or one of the following operators: '+', '-', '*', '&', '|', '^', '&&', or '||'}}
+#pragma omp declare reduction(if:                                          // expected-error {{expected identifier or one of the following operators: '+', '-', '*', '&', '|', '^', '&&', or '||'}} expected-error {{expected a type}}
+#pragma omp declare reduction(oper:                                        // expected-error {{expected a type}}
+#pragma omp declare reduction(oper;                                        // expected-error {{expected ':'}} expected-error {{expected a type}}
+#pragma omp declare reduction(fun : int                                    // expected-error {{expected ':'}} expected-error {{expected expression}}
+#pragma omp declare reduction(+ : const int:                               // expected-error {{reduction type cannot be qualified with 'const', 'volatile' or 'restrict'}}
+#pragma omp declare reduction(- : volatile int:                            // expected-error {{reduction type cannot be qualified with 'const', 'volatile' or 'restrict'}}
+#pragma omp declare reduction(* : int;                                     // expected-error {{expected ','}} expected-error {{expected a type}}
+#pragma omp declare reduction(& : double char:                             // expected-error {{cannot combine with previous 'double' declaration specifier}} expected-error {{expected expression}}
+#pragma omp declare reduction(^ : double, char, :                          // expected-error {{expected a type}} expected-error {{expected expression}}
+#pragma omp declare reduction(&& : int, S:                                 // expected-error {{unknown type name 'S'}} expected-error {{expected expression}}
+#pragma omp declare reduction(|| : int, double : temp += omp_in)           // expected-error 2 {{only 'omp_in' or 'omp_out' variables are allowed in combiner expression}}
+#pragma omp declare reduction(| : char, float : omp_out += temp)           // expected-error 2 {{only 'omp_in' or 'omp_out' variables are allowed in combiner expression}}
+#pragma omp declare reduction(fun : long : omp_out += omp_in) {            // expected-error {{expected 'initializer'}} expected-warning {{extra tokens at the end of '#pragma omp declare reduction' are ignored}}
+#pragma omp declare reduction(fun : unsigned : omp_out += temp))           // expected-error {{expected 'initializer'}} expected-warning {{extra tokens at the end of '#pragma omp declare reduction' are ignored}} expected-error {{only 'omp_in' or 'omp_out' variables are allowed in combiner expression}}
+#pragma omp declare reduction(fun : long(void) : omp_out += omp_in)        // expected-error {{reduction type cannot be a function type}}
+#pragma omp declare reduction(fun : long[3] : omp_out += omp_in)           // expected-error {{reduction type cannot be an array type}}
+#pragma omp declare reduction(fun23 : long, int, long : omp_out += omp_in) // expected-error {{redefinition of user-defined reduction for type 'long'}} expected-note {{previous definition is here}}
+
+#pragma omp declare reduction(fun222 : long : omp_out += omp_in)
+#pragma omp declare reduction(fun1 : long : omp_out += omp_in) initializer                 // expected-error {{expected '(' after 'initializer'}}
+#pragma omp declare reduction(fun2 : long : omp_out += omp_in) initializer {               // expected-error {{expected '(' after 'initializer'}} expected-error {{expected expression}} expected-warning {{extra tokens at the end of '#pragma omp declare reduction' are ignored}}
+#pragma omp declare reduction(fun3 : long : omp_out += omp_in) initializer[                // expected-error {{expected '(' after 'initializer'}} expected-error {{expected expression}} expected-warning {{extra tokens at the end of '#pragma omp declare reduction' are ignored}}
+#pragma omp declare reduction(fun4 : long : omp_out += omp_in) initializer()               // expected-error {{expected expression}}
+#pragma omp declare reduction(fun5 : long : omp_out += omp_in) initializer(temp)           // expected-error {{only 'omp_priv' or 'omp_orig' variables are allowed in initializer expression}}
+#pragma omp declare reduction(fun6 : long : omp_out += omp_in) initializer(omp_orig        // expected-error {{expected ')'}} expected-note {{to match this '('}}
+#pragma omp declare reduction(fun7 : long : omp_out += omp_in) initializer(omp_priv 12)    // expected-error {{expected ')'}} expected-note {{to match this '('}}
+#pragma omp declare reduction(fun8 : long : omp_out += omp_in) initializer(omp_priv = 23)  // expected-note {{previous definition is here}}
+#pragma omp declare reduction(fun8 : long : omp_out += omp_in) initializer(omp_priv = 23)) // expected-warning {{extra tokens at the end of '#pragma omp declare reduction' are ignored}} expected-error {{redefinition of user-defined reduction for type 'long'}}
+#pragma omp declare reduction(fun9 : long : omp_out += omp_in) initializer(omp_priv = )    // expected-error {{expected expression}}
+
+int fun(int arg) {
+#pragma omp declare reduction(red : int : omp_out++)
+  {
+#pragma omp declare reduction(red : int : omp_out++) // expected-note {{previous definition is here}}
+#pragma omp declare reduction(red : int : omp_out++) // expected-error {{redefinition of user-defined reduction for type 'int'}}
+    {
+#pragma omp declare reduction(red : int : omp_out++)
+    }
+  }
+  return arg;
+}
diff --git a/test/OpenMP/declare_reduction_messages.cpp b/test/OpenMP/declare_reduction_messages.cpp
new file mode 100644
index 0000000000000..a1373b1dcb9bd
--- /dev/null
+++ b/test/OpenMP/declare_reduction_messages.cpp
@@ -0,0 +1,124 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
+
+int temp; // expected-note 7 {{'temp' declared here}}
+
+#pragma omp declare reduction                                              // expected-error {{expected '(' after 'declare reduction'}}
+#pragma omp declare reduction {                                            // expected-error {{expected '(' after 'declare reduction'}}
+#pragma omp declare reduction(                                             // expected-error {{expected identifier or one of the following operators: '+', '-', '*', '&', '|', '^', '&&', or '||'}}
+#pragma omp declare reduction(#                                            // expected-error {{expected identifier or one of the following operators: '+', '-', '*', '&', '|', '^', '&&', or '||'}}
+#pragma omp declare reduction(/                                            // expected-error {{expected identifier or one of the following operators: '+', '-', '*', '&', '|', '^', '&&', or '||'}}
+#pragma omp declare reduction(+                                            // expected-error {{expected ':'}}
+#pragma omp declare reduction(operator                                     // expected-error {{expected identifier or one of the following operators: '+', '-', '*', '&', '|', '^', '&&', or '||'}}
+#pragma omp declare reduction(operator:                                    // expected-error {{expected identifier or one of the following operators: '+', '-', '*', '&', '|', '^', '&&', or '||'}} expected-error {{expected a type}}
+#pragma omp declare reduction(oper:                                        // expected-error {{expected a type}}
+#pragma omp declare reduction(oper;                                        // expected-error {{expected ':'}} expected-error {{expected a type}}
+#pragma omp declare reduction(fun : int                                    // expected-error {{expected ':'}} expected-error {{expected expression}}
+#pragma omp declare reduction(+ : const int:                               // expected-error {{reduction type cannot be qualified with 'const', 'volatile' or 'restrict'}}
+#pragma omp declare reduction(- : volatile int:                            // expected-error {{reduction type cannot be qualified with 'const', 'volatile' or 'restrict'}}
+#pragma omp declare reduction(* : int;                                     // expected-error {{expected ','}} expected-error {{expected a type}}
+#pragma omp declare reduction(& : double char:                             // expected-error {{cannot combine with previous 'double' declaration specifier}} expected-error {{expected expression}}
+#pragma omp declare reduction(^ : double, char, :                          // expected-error {{expected a type}} expected-error {{expected expression}}
+#pragma omp declare reduction(&& : int, S:                                 // expected-error {{unknown type name 'S'}} expected-error {{expected expression}}
+#pragma omp declare reduction(|| : int, double : temp += omp_in)           // expected-error 2 {{only 'omp_in' or 'omp_out' variables are allowed in combiner expression}}
+#pragma omp declare reduction(| : char, float : omp_out += ::temp)         // expected-error 2 {{only 'omp_in' or 'omp_out' variables are allowed in combiner expression}}
+#pragma omp declare reduction(fun : long : omp_out += omp_in) {            // expected-warning {{extra tokens at the end of '#pragma omp declare reduction' are ignored}} expected-error {{expected 'initializer'}}
+#pragma omp declare reduction(fun : unsigned : omp_out += ::temp))         // expected-warning {{extra tokens at the end of '#pragma omp declare reduction' are ignored}} expected-error {{expected 'initializer'}} expected-error {{only 'omp_in' or 'omp_out' variables are allowed in combiner expression}}
+#pragma omp declare reduction(fun : long & : omp_out += omp_in)            // expected-error {{reduction type cannot be a reference type}}
+#pragma omp declare reduction(fun : long(void) : omp_out += omp_in)        // expected-error {{reduction type cannot be a function type}}
+#pragma omp declare reduction(fun : long[3] : omp_out += omp_in)           // expected-error {{reduction type cannot be an array type}}
+#pragma omp declare reduction(fun23 : long, int, long : omp_out += omp_in) // expected-error {{redefinition of user-defined reduction for type 'long'}} expected-note {{previous definition is here}}
+
+template <class T>
+class Class1 {
+ T a;
+public:
+  Class1() : a() {}
+#pragma omp declare reduction(fun : T : temp)               // expected-error {{only 'omp_in' or 'omp_out' variables are allowed in combiner expression}}
+#pragma omp declare reduction(fun1 : T : omp_out++)         // expected-note {{previous definition is here}} expected-error {{reduction type cannot be a reference type}}
+#pragma omp declare reduction(fun1 : T : omp_out += omp_in) // expected-error {{redefinition of user-defined reduction for type 'T'}}
+#pragma omp declare reduction(fun2 : T, T : omp_out++)      // expected-error {{reduction type cannot be a reference type}} expected-error {{redefinition of user-defined reduction for type 'T'}} expected-note {{previous definition is here}}
+#pragma omp declare reduction(foo : T : omp_out += this->a) // expected-error {{invalid use of 'this' outside of a non-static member function}}
+};
+
+Class1<char &> e; // expected-note {{in instantiation of template class 'Class1<char &>' requested here}}
+
+template <class T>
+class Class2 : public Class1<T> {
+#pragma omp declare reduction(fun : T : omp_out += omp_in)
+};
+
+#pragma omp declare reduction(fun222 : long : omp_out += omp_in)                                        // expected-note {{previous definition is here}}
+#pragma omp declare reduction(fun222 : long : omp_out += omp_in)                                        // expected-error {{redefinition of user-defined reduction for type 'long'}}
+#pragma omp declare reduction(fun1 : long : omp_out += omp_in) initializer                              // expected-error {{expected '(' after 'initializer'}}
+#pragma omp declare reduction(fun2 : long : omp_out += omp_in) initializer {                            // expected-error {{expected '(' after 'initializer'}} expected-error {{expected expression}} expected-warning {{extra tokens at the end of '#pragma omp declare reduction' are ignored}}
+#pragma omp declare reduction(fun3 : long : omp_out += omp_in) initializer[                             // expected-error {{expected '(' after 'initializer'}} expected-error {{expected expression}} expected-warning {{extra tokens at the end of '#pragma omp declare reduction' are ignored}}
+#pragma omp declare reduction(fun4 : long : omp_out += omp_in) initializer()                            // expected-error {{expected expression}}
+#pragma omp declare reduction(fun5 : long : omp_out += omp_in) initializer(temp)                        // expected-error {{only 'omp_priv' or 'omp_orig' variables are allowed in initializer expression}}
+#pragma omp declare reduction(fun6 : long : omp_out += omp_in) initializer(omp_orig                     // expected-error {{expected ')'}} expected-note {{to match this '('}}
+#pragma omp declare reduction(fun7 : long : omp_out += omp_in) initializer(omp_priv Class1 < int > ())  // expected-error {{expected ')'}} expected-note {{to match this '('}}
+#pragma omp declare reduction(fun77 : long : omp_out += omp_in) initializer(omp_priv Class2 < int > ()) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+#pragma omp declare reduction(fun8 : long : omp_out += omp_in) initializer(omp_priv 23)                 // expected-error {{expected ')'}} expected-note {{to match this '('}}
+#pragma omp declare reduction(fun88 : long : omp_out += omp_in) initializer(omp_priv 23))               // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-warning {{extra tokens at the end of '#pragma omp declare reduction' are ignored}}
+#pragma omp declare reduction(fun9 : long : omp_out += omp_priv) initializer(omp_in = 23)               // expected-error {{use of undeclared identifier 'omp_priv'; did you mean 'omp_in'?}} expected-note {{'omp_in' declared here}}
+#pragma omp declare reduction(fun10 : long : omp_out += omp_in) initializer(omp_priv = 23)
+
+template <typename T>
+T fun(T arg) {
+#pragma omp declare reduction(red : T : omp_out++)
+  {
+#pragma omp declare reduction(red : T : omp_out++) // expected-note {{previous definition is here}}
+#pragma omp declare reduction(red : T : omp_out++) // expected-error {{redefinition of user-defined reduction for type 'T'}}
+#pragma omp declare reduction(fun : T : omp_out += omp_in) initializer(omp_priv = 23)
+  }
+  return arg;
+}
+
+template <typename T>
+T foo(T arg) {
+  T i;
+  {
+#pragma omp declare reduction(red : T : omp_out++)
+#pragma omp declare reduction(red1 : T : omp_out++)   // expected-note {{previous definition is here}}
+#pragma omp declare reduction(red1 : int : omp_out++) // expected-error {{redefinition of user-defined reduction for type 'int'}}
+  #pragma omp parallel reduction (red : i)
+  {
+  }
+  #pragma omp parallel reduction (red1 : i)
+  {
+  }
+  #pragma omp parallel reduction (red2 : i) // expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'int'}}
+  {
+  }
+  }
+  {
+#pragma omp declare reduction(red1 : int : omp_out++) // expected-note {{previous definition is here}}
+#pragma omp declare reduction(red : T : omp_out++)
+#pragma omp declare reduction(red1 : T : omp_out++) // expected-error {{redefinition of user-defined reduction for type 'int'}}
+  #pragma omp parallel reduction (red : i)
+  {
+  }
+  #pragma omp parallel reduction (red1 : i)
+  {
+  }
+  #pragma omp parallel reduction (red2 : i) // expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'int'}}
+  {
+  }
+  }
+  return arg;
+}
+
+#pragma omp declare reduction(foo : int : ({int a = omp_in; a = a * 2; omp_out += a; }))
+int main() {
+  Class1<int> c1;
+  int i;
+  #pragma omp parallel reduction (::fun : c1)
+  {
+  }
+  #pragma omp parallel reduction (::Class1<int>::fun : c1)
+  {
+  }
+  #pragma omp parallel reduction (::Class2<int>::fun : i) // expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'int'}}
+  {
+  }
+  return fun(15) + foo(15); // expected-note {{in instantiation of function template specialization 'foo<int>' requested here}}
+}
diff --git a/test/OpenMP/declare_simd_ast_print.c b/test/OpenMP/declare_simd_ast_print.c
new file mode 100644
index 0000000000000..04fd73f272d8b
--- /dev/null
+++ b/test/OpenMP/declare_simd_ast_print.c
@@ -0,0 +1,21 @@
+// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+#pragma omp declare simd aligned(b : 64)
+#pragma omp declare simd simdlen(32) aligned(d, b)
+#pragma omp declare simd inbranch, uniform(d) linear(val(s1, s2) : 32)
+#pragma omp declare simd notinbranch simdlen(2), uniform(s1, s2) linear(d: s1)
+void add_1(float *d, int s1, float *s2, double b[]) __attribute__((cold));
+
+// CHECK: #pragma omp declare simd notinbranch simdlen(2) uniform(s1, s2) linear(val(d): s1)
+// CHECK-NEXT: #pragma omp declare simd inbranch uniform(d) linear(val(s1): 32) linear(val(s2): 32)
+// CHECK-NEXT: #pragma omp declare simd simdlen(32) aligned(d) aligned(b)
+// CHECK-NEXT: #pragma omp declare simd aligned(b: 64)
+// CHECK-NEXT: void add_1(float *d, int s1, float *s2, double b[]) __attribute__((cold))
+
+#endif
diff --git a/test/OpenMP/declare_simd_ast_print.cpp b/test/OpenMP/declare_simd_ast_print.cpp
new file mode 100644
index 0000000000000..5a32e61d9922f
--- /dev/null
+++ b/test/OpenMP/declare_simd_ast_print.cpp
@@ -0,0 +1,146 @@
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+#pragma omp declare simd linear(d: 8)
+#pragma omp declare simd inbranch simdlen(32)
+#pragma omp declare simd notinbranch
+void add_1(float *d) __attribute__((cold));
+
+// CHECK: #pragma omp declare simd notinbranch
+// CHECK-NEXT: #pragma omp declare simd inbranch simdlen(32)
+// CHECK-NEXT: #pragma omp declare simd linear(val(d): 8)
+// CHECK-NEXT: void add_1(float *d) __attribute__((cold));
+//
+
+#pragma omp declare simd aligned(hp, hp2)
+template <class C> void h(C *hp, C *hp2, C *hq, C *lin) {
+}
+
+// CHECK: #pragma omp declare simd aligned(hp) aligned(hp2)
+// CHECK-NEXT: template <class C = int> void h(int *hp, int *hp2, int *hq, int *lin) {
+// CHECK-NEXT: h((float *)hp, (float *)hp2, (float *)hq, (float *)lin);
+// CHECK-NEXT: }
+
+// CHECK: #pragma omp declare simd  aligned(hp) aligned(hp2)
+// CHECK-NEXT: template <class C = float> void h(float *hp, float *hp2, float *hq, float *lin) {
+// CHECK-NEXT: }
+
+// CHECK: #pragma omp declare simd aligned(hp) aligned(hp2)
+// CHECK: template <class C> void h(C *hp, C *hp2, C *hq, C *lin) {
+// CHECK-NEXT: }
+//
+
+// Explicit specialization with <C=int>.
+// Pragmas need to be same, otherwise standard says that's undefined behavior.
+#pragma omp declare simd aligned(hp, hp2)
+template <>
+void h(int *hp, int *hp2, int *hq, int *lin)
+{
+  // Implicit specialization with <C=float>.
+  // This is special case where the directive is stored by Sema and is
+  // generated together with the (pending) function instatiation.
+  h((float*) hp, (float*) hp2, (float*) hq, (float*) lin);
+}
+
+class VV {
+  // CHECK: #pragma omp declare simd uniform(this, a) linear(val(b): a)
+  // CHECK-NEXT: int add(int a, int b) __attribute__((cold))    {
+  // CHECK-NEXT: return a + b;
+  // CHECK-NEXT: }
+  #pragma omp declare simd uniform(this, a) linear(val(b): a)
+  int add(int a, int b) __attribute__((cold)) { return a + b; }
+
+  // CHECK: #pragma omp declare simd aligned(b: 4) aligned(a) linear(ref(b): 4) linear(val(this)) linear(val(a))
+  // CHECK-NEXT: float taddpf(float *a, float *&b)     {
+  // CHECK-NEXT: return *a + *b;
+  // CHECK-NEXT: }
+  #pragma omp declare simd aligned (b: 4) aligned(a) linear(ref(b): 4) linear(this, a)
+  float taddpf(float *a, float *&b) { return *a + *b; }
+
+// CHECK: #pragma omp declare simd aligned(b: 8)
+// CHECK-NEXT: #pragma omp declare simd linear(uval(c): 8)
+// CHECK-NEXT: int tadd(int (&b)[], int &c) {
+// CHECK-NEXT: return this->x[b[0]] + b[0];
+// CHECK-NEXT: }
+  #pragma omp declare simd linear(uval(c): 8)
+  #pragma omp declare simd aligned(b : 8)
+  int tadd(int (&b)[], int &c) { return x[b[0]] + b[0]; }
+
+private:
+  int x[10];
+};
+
+// CHECK: template <int X = 16, typename T = float> class TVV {
+// CHECK: #pragma omp declare simd
+// CHECK-NEXT: int tadd(int a, int b);
+// CHECK: #pragma omp declare simd aligned(a: 16 * 2) aligned(b) linear(ref(b): 16)
+// CHECK-NEXT: float taddpf(float *a, float *&b) {
+// CHECK-NEXT: return *a + *b;
+// CHECK-NEXT: }
+// CHECK: #pragma omp declare simd
+// CHECK-NEXT: #pragma omp declare simd
+// CHECK-NEXT: int tadd(int b) {
+// CHECK-NEXT: return this->x[b] + b;
+// CHECK-NEXT: }
+// CHECK: }
+template <int X, typename T>
+class TVV {
+public:
+// CHECK: template <int X, typename T> class TVV {
+  #pragma omp declare simd simdlen(X)
+  int tadd(int a, int b) { return a + b; }
+
+// CHECK: #pragma omp declare simd simdlen(X)
+// CHECK-NEXT: int tadd(int a, int b) {
+// CHECK-NEXT: return a + b;
+// CHECK-NEXT: }
+
+  #pragma omp declare simd aligned(a : X * 2) aligned(b) linear(ref(b): X)
+  float taddpf(float *a, T *&b) { return *a + *b; }
+
+// CHECK: #pragma omp declare simd aligned(a: X * 2) aligned(b)
+// CHECK-NEXT: float taddpf(float *a, T *&b) {
+// CHECK-NEXT: return *a + *b;
+// CHECK-NEXT: }
+
+  #pragma omp declare simd
+  #pragma omp declare simd uniform(this, b)
+  int tadd(int b) { return x[b] + b; }
+
+// CHECK: #pragma omp declare simd uniform(this, b)
+// CHECK-NEXT: #pragma omp declare simd
+// CHECK-NEXT: int tadd(int b) {
+// CHECK-NEXT: return this->x[b] + b;
+// CHECK-NEXT: }
+
+private:
+  int x[X];
+};
+// CHECK: };
+
+// CHECK: #pragma omp declare simd simdlen(64) aligned(b: 64 * 2) linear(uval(c): 64)
+// CHECK: template <int N = 64> void foo(int (&b)[64], float *&c)
+// CHECK: #pragma omp declare simd simdlen(N) aligned(b: N * 2) linear(uval(c): N)
+// CHECK: template <int N> void foo(int (&b)[N], float *&c)
+#pragma omp declare simd simdlen(N) aligned(b : N * 2) linear(uval(c): N)
+template <int N>
+void foo(int (&b)[N], float *&c);
+
+// CHECK: TVV<16, float> t16;
+TVV<16, float> t16;
+
+void f() {
+  float a = 1.0f, b = 2.0f;
+  float *p = &b;
+  float r = t16.taddpf(&a, p);
+  int res = t16.tadd(b);
+  int c[64];
+  foo(c, p);
+}
+
+#endif
diff --git a/test/OpenMP/declare_simd_codegen.cpp b/test/OpenMP/declare_simd_codegen.cpp
new file mode 100644
index 0000000000000..4ed7fb2801b8f
--- /dev/null
+++ b/test/OpenMP/declare_simd_codegen.cpp
@@ -0,0 +1,288 @@
+// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -x c++ -emit-llvm %s -o - -femit-all-decls | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - -femit-all-decls | FileCheck %s
+// expected-no-diagnostics
+// REQUIRES: x86-registered-target
+#ifndef HEADER
+#define HEADER
+
+#pragma omp declare simd linear(d : 8)
+#pragma omp declare simd inbranch simdlen(32)
+#pragma omp declare simd notinbranch
+void add_1(float *d) {}
+
+#pragma omp declare simd aligned(hp, hp2)
+template <class C>
+void h(C *hp, C *hp2, C *hq, C *lin) {
+}
+
+// Explicit specialization with <C=int>.
+// Pragmas need to be same, otherwise standard says that's undefined behavior.
+#pragma omp declare simd aligned(hp, hp2)
+template <>
+void h(int *hp, int *hp2, int *hq, int *lin) {
+  // Implicit specialization with <C=float>.
+  // This is special case where the directive is stored by Sema and is
+  // generated together with the (pending) function instatiation.
+  h((float *)hp, (float *)hp2, (float *)hq, (float *)lin);
+}
+
+class VV {
+public:
+#pragma omp declare simd uniform(this, a) linear(val(b) : a)
+  int add(int a, int b) __attribute__((cold)) { return a + b; }
+
+#pragma omp declare simd aligned(b : 4) aligned(a) linear(ref(b) : 4) linear(this, a)
+  float taddpf(float *a, float *&b) { return *a + *b; }
+
+#pragma omp declare simd linear(uval(c) : 8)
+#pragma omp declare simd aligned(b : 8)
+  int tadd(int (&b)[], int &c) { return x[b[0]] + b[0]; }
+
+private:
+  int x[10];
+} vv;
+
+template <int X, typename T>
+class TVV {
+public:
+#pragma omp declare simd simdlen(X)
+  int tadd(int a, int b) { return a + b; }
+
+#pragma omp declare simd aligned(a : X * 2) aligned(b) linear(ref(b) : X)
+  float taddpf(float *a, T *&b) { return *a + *b; }
+
+#pragma omp declare simd
+#pragma omp declare simd uniform(this, b)
+  int tadd(int b) { return x[b] + b; }
+
+private:
+  int x[X];
+};
+
+#pragma omp declare simd simdlen(N) aligned(b : N * 2) linear(uval(c) : N)
+template <int N>
+void foo(int (&b)[N], float *&c) {}
+
+TVV<16, float> t16;
+
+void f(int (&g)[]) {
+  float a = 1.0f, b = 2.0f;
+  float *p = &b;
+  float r = t16.taddpf(&a, p);
+  int res = t16.tadd(b);
+  int c[64];
+  vv.add(res, res);
+  vv.taddpf(p, p);
+  vv.tadd(g, res);
+  foo(c, p);
+}
+
+#pragma omp declare simd
+#pragma omp declare simd notinbranch aligned(a : 32)
+int bar(VV v, float *a) { return 0; }
+#pragma omp declare simd
+#pragma omp declare simd notinbranch aligned(a)
+float baz(VV v, int a[]) { return 0; }
+#pragma omp declare simd
+#pragma omp declare simd notinbranch aligned(a)
+double bay(VV v, double *&a) { return 0; }
+#pragma omp declare simd
+#pragma omp declare simd inbranch linear(a : b) uniform(v, b)
+void bax(VV v, double *a, int b) {}
+#pragma omp declare simd uniform(q) aligned(q : 16) linear(k : 1)
+float foo(float *q, float x, int k) { return 0; }
+#pragma omp declare simd notinbranch
+double foo(double x) { return 0; }
+
+// CHECK-DAG: define {{.+}}@_Z5add_1Pf(
+// CHECK-DAG: define {{.+}}@_Z1hIiEvPT_S1_S1_S1_(
+// CHECK-DAG: define {{.+}}@_Z1hIfEvPT_S1_S1_S1_(
+// CHECK-DAG: define {{.+}}@_ZN2VV3addEii(
+// CHECK-DAG: define {{.+}}@_ZN2VV6taddpfEPfRS0_(
+// CHECK-DAG: define {{.+}}@_ZN2VV4taddERA_iRi(
+// CHECK-DAG: define {{.+}}@_Z1fRA_i(
+// CHECK-DAG: define {{.+}}@_ZN3TVVILi16EfE6taddpfEPfRS1_(
+// CHECK-DAG: define {{.+}}@_ZN3TVVILi16EfE4taddEi(
+// CHECK-DAG: define {{.+}}@_Z3fooILi64EEvRAT__iRPf(
+// CHECK-DAG: define {{.+}}@_Z3bar2VVPf(
+// CHECK-DAG: define {{.+}}@_Z3baz2VVPi(
+// CHECK-DAG: define {{.+}}@_Z3bay2VVRPd(
+// CHECK-DAG: define {{.+}}@_Z3bax2VVPdi(
+// CHECK-DAG: define {{.+}}@_Z3fooPffi(
+// CHECK-DAG: define {{.+}}@_Z3food(
+
+// CHECK-DAG: "_ZGVbM4l8__Z5add_1Pf"
+// CHECK-DAG: "_ZGVbN4l8__Z5add_1Pf"
+// CHECK-DAG: "_ZGVcM8l8__Z5add_1Pf"
+// CHECK-DAG: "_ZGVcN8l8__Z5add_1Pf"
+// CHECK-DAG: "_ZGVdM8l8__Z5add_1Pf"
+// CHECK-DAG: "_ZGVdN8l8__Z5add_1Pf"
+// CHECK-DAG: "_ZGVeM16l8__Z5add_1Pf"
+// CHECK-DAG: "_ZGVeN16l8__Z5add_1Pf"
+// CHECK-DAG: "_ZGVbM32v__Z5add_1Pf"
+// CHECK-DAG: "_ZGVcM32v__Z5add_1Pf"
+// CHECK-DAG: "_ZGVdM32v__Z5add_1Pf"
+// CHECK-DAG: "_ZGVeM32v__Z5add_1Pf"
+// CHECK-DAG: "_ZGVbN2v__Z5add_1Pf"
+// CHECK-DAG: "_ZGVcN4v__Z5add_1Pf"
+// CHECK-DAG: "_ZGVdN4v__Z5add_1Pf"
+// CHECK-DAG: "_ZGVeN8v__Z5add_1Pf"
+
+// CHECK-DAG: "_ZGVbM2va16va16vv__Z1hIiEvPT_S1_S1_S1_"
+// CHECK-DAG: "_ZGVbN2va16va16vv__Z1hIiEvPT_S1_S1_S1_"
+// CHECK-DAG: "_ZGVcM4va16va16vv__Z1hIiEvPT_S1_S1_S1_"
+// CHECK-DAG: "_ZGVcN4va16va16vv__Z1hIiEvPT_S1_S1_S1_"
+// CHECK-DAG: "_ZGVdM4va16va16vv__Z1hIiEvPT_S1_S1_S1_"
+// CHECK-DAG: "_ZGVdN4va16va16vv__Z1hIiEvPT_S1_S1_S1_"
+// CHECK-DAG: "_ZGVeM8va16va16vv__Z1hIiEvPT_S1_S1_S1_"
+// CHECK-DAG: "_ZGVeN8va16va16vv__Z1hIiEvPT_S1_S1_S1_"
+
+// CHECK-DAG: "_ZGVbM2va16va16vv__Z1hIfEvPT_S1_S1_S1_"
+// CHECK-DAG: "_ZGVbN2va16va16vv__Z1hIfEvPT_S1_S1_S1_"
+// CHECK-DAG: "_ZGVcM4va16va16vv__Z1hIfEvPT_S1_S1_S1_"
+// CHECK-DAG: "_ZGVcN4va16va16vv__Z1hIfEvPT_S1_S1_S1_"
+// CHECK-DAG: "_ZGVdM4va16va16vv__Z1hIfEvPT_S1_S1_S1_"
+// CHECK-DAG: "_ZGVdN4va16va16vv__Z1hIfEvPT_S1_S1_S1_"
+// CHECK-DAG: "_ZGVeM8va16va16vv__Z1hIfEvPT_S1_S1_S1_"
+// CHECK-DAG: "_ZGVeN8va16va16vv__Z1hIfEvPT_S1_S1_S1_"
+
+// CHECK-DAG: "_ZGVbM4uus1__ZN2VV3addEii"
+// CHECK-DAG: "_ZGVbN4uus1__ZN2VV3addEii"
+// CHECK-DAG: "_ZGVcM8uus1__ZN2VV3addEii"
+// CHECK-DAG: "_ZGVcN8uus1__ZN2VV3addEii"
+// CHECK-DAG: "_ZGVdM8uus1__ZN2VV3addEii"
+// CHECK-DAG: "_ZGVdN8uus1__ZN2VV3addEii"
+// CHECK-DAG: "_ZGVeM16uus1__ZN2VV3addEii"
+// CHECK-DAG: "_ZGVeN16uus1__ZN2VV3addEii"
+
+// CHECK-DAG: "_ZGVbM4lla16l4a4__ZN2VV6taddpfEPfRS0_"
+// CHECK-DAG: "_ZGVbN4lla16l4a4__ZN2VV6taddpfEPfRS0_"
+// CHECK-DAG: "_ZGVcM8lla16l4a4__ZN2VV6taddpfEPfRS0_"
+// CHECK-DAG: "_ZGVcN8lla16l4a4__ZN2VV6taddpfEPfRS0_"
+// CHECK-DAG: "_ZGVdM8lla16l4a4__ZN2VV6taddpfEPfRS0_"
+// CHECK-DAG: "_ZGVdN8lla16l4a4__ZN2VV6taddpfEPfRS0_"
+// CHECK-DAG: "_ZGVeM16lla16l4a4__ZN2VV6taddpfEPfRS0_"
+// CHECK-DAG: "_ZGVeN16lla16l4a4__ZN2VV6taddpfEPfRS0_"
+
+// CHECK-DAG: "_ZGVbM4vvl8__ZN2VV4taddERA_iRi"
+// CHECK-DAG: "_ZGVbN4vvl8__ZN2VV4taddERA_iRi"
+// CHECK-DAG: "_ZGVcM8vvl8__ZN2VV4taddERA_iRi"
+// CHECK-DAG: "_ZGVcN8vvl8__ZN2VV4taddERA_iRi"
+// CHECK-DAG: "_ZGVdM8vvl8__ZN2VV4taddERA_iRi"
+// CHECK-DAG: "_ZGVdN8vvl8__ZN2VV4taddERA_iRi"
+// CHECK-DAG: "_ZGVeM16vvl8__ZN2VV4taddERA_iRi"
+// CHECK-DAG: "_ZGVeN16vvl8__ZN2VV4taddERA_iRi"
+// CHECK-DAG: "_ZGVbM4vva8v__ZN2VV4taddERA_iRi"
+// CHECK-DAG: "_ZGVbN4vva8v__ZN2VV4taddERA_iRi"
+// CHECK-DAG: "_ZGVcM8vva8v__ZN2VV4taddERA_iRi"
+// CHECK-DAG: "_ZGVcN8vva8v__ZN2VV4taddERA_iRi"
+// CHECK-DAG: "_ZGVdM8vva8v__ZN2VV4taddERA_iRi"
+// CHECK-DAG: "_ZGVdN8vva8v__ZN2VV4taddERA_iRi"
+// CHECK-DAG: "_ZGVeM16vva8v__ZN2VV4taddERA_iRi"
+// CHECK-DAG: "_ZGVeN16vva8v__ZN2VV4taddERA_iRi"
+
+// CHECK-DAG: "_ZGVbM4vva32l16a16__ZN3TVVILi16EfE6taddpfEPfRS1_"
+// CHECK-DAG: "_ZGVbN4vva32l16a16__ZN3TVVILi16EfE6taddpfEPfRS1_"
+// CHECK-DAG: "_ZGVcM8vva32l16a16__ZN3TVVILi16EfE6taddpfEPfRS1_"
+// CHECK-DAG: "_ZGVcN8vva32l16a16__ZN3TVVILi16EfE6taddpfEPfRS1_"
+// CHECK-DAG: "_ZGVdM8vva32l16a16__ZN3TVVILi16EfE6taddpfEPfRS1_"
+// CHECK-DAG: "_ZGVdN8vva32l16a16__ZN3TVVILi16EfE6taddpfEPfRS1_"
+// CHECK-DAG: "_ZGVeM16vva32l16a16__ZN3TVVILi16EfE6taddpfEPfRS1_"
+// CHECK-DAG: "_ZGVeN16vva32l16a16__ZN3TVVILi16EfE6taddpfEPfRS1_"
+
+// CHECK-DAG: "_ZGVbM4uu__ZN3TVVILi16EfE4taddEi"
+// CHECK-DAG: "_ZGVbN4uu__ZN3TVVILi16EfE4taddEi"
+// CHECK-DAG: "_ZGVcM8uu__ZN3TVVILi16EfE4taddEi"
+// CHECK-DAG: "_ZGVcN8uu__ZN3TVVILi16EfE4taddEi"
+// CHECK-DAG: "_ZGVdM8uu__ZN3TVVILi16EfE4taddEi"
+// CHECK-DAG: "_ZGVdN8uu__ZN3TVVILi16EfE4taddEi"
+// CHECK-DAG: "_ZGVeM16uu__ZN3TVVILi16EfE4taddEi"
+// CHECK-DAG: "_ZGVeN16uu__ZN3TVVILi16EfE4taddEi"
+// CHECK-DAG: "_ZGVbM4vv__ZN3TVVILi16EfE4taddEi"
+// CHECK-DAG: "_ZGVbN4vv__ZN3TVVILi16EfE4taddEi"
+// CHECK-DAG: "_ZGVcM8vv__ZN3TVVILi16EfE4taddEi"
+// CHECK-DAG: "_ZGVcN8vv__ZN3TVVILi16EfE4taddEi"
+// CHECK-DAG: "_ZGVdM8vv__ZN3TVVILi16EfE4taddEi"
+// CHECK-DAG: "_ZGVdN8vv__ZN3TVVILi16EfE4taddEi"
+// CHECK-DAG: "_ZGVeM16vv__ZN3TVVILi16EfE4taddEi"
+// CHECK-DAG: "_ZGVeN16vv__ZN3TVVILi16EfE4taddEi"
+
+// CHECK-DAG: "_ZGVbM64va128l64__Z3fooILi64EEvRAT__iRPf"
+// CHECK-DAG: "_ZGVbN64va128l64__Z3fooILi64EEvRAT__iRPf"
+// CHECK-DAG: "_ZGVcM64va128l64__Z3fooILi64EEvRAT__iRPf"
+// CHECK-DAG: "_ZGVcN64va128l64__Z3fooILi64EEvRAT__iRPf"
+// CHECK-DAG: "_ZGVdM64va128l64__Z3fooILi64EEvRAT__iRPf"
+// CHECK-DAG: "_ZGVdN64va128l64__Z3fooILi64EEvRAT__iRPf"
+// CHECK-DAG: "_ZGVeM64va128l64__Z3fooILi64EEvRAT__iRPf"
+// CHECK-DAG: "_ZGVeN64va128l64__Z3fooILi64EEvRAT__iRPf"
+
+// CHECK-DAG: "_ZGVbM4vv__Z3bar2VVPf"
+// CHECK-DAG: "_ZGVbN4vv__Z3bar2VVPf"
+// CHECK-DAG: "_ZGVcM8vv__Z3bar2VVPf"
+// CHECK-DAG: "_ZGVcN8vv__Z3bar2VVPf"
+// CHECK-DAG: "_ZGVdM8vv__Z3bar2VVPf"
+// CHECK-DAG: "_ZGVdN8vv__Z3bar2VVPf"
+// CHECK-DAG: "_ZGVeM16vv__Z3bar2VVPf"
+// CHECK-DAG: "_ZGVeN16vv__Z3bar2VVPf"
+// CHECK-DAG: "_ZGVbN4vva32__Z3bar2VVPf"
+// CHECK-DAG: "_ZGVcN8vva32__Z3bar2VVPf"
+// CHECK-DAG: "_ZGVdN8vva32__Z3bar2VVPf"
+// CHECK-DAG: "_ZGVeN16vva32__Z3bar2VVPf"
+
+// CHECK-DAG: "_ZGVbM4vv__Z3baz2VVPi"
+// CHECK-DAG: "_ZGVbN4vv__Z3baz2VVPi"
+// CHECK-DAG: "_ZGVcM8vv__Z3baz2VVPi"
+// CHECK-DAG: "_ZGVcN8vv__Z3baz2VVPi"
+// CHECK-DAG: "_ZGVdM8vv__Z3baz2VVPi"
+// CHECK-DAG: "_ZGVdN8vv__Z3baz2VVPi"
+// CHECK-DAG: "_ZGVeM16vv__Z3baz2VVPi"
+// CHECK-DAG: "_ZGVeN16vv__Z3baz2VVPi"
+// CHECK-DAG: "_ZGVbN4vva16__Z3baz2VVPi"
+// CHECK-DAG: "_ZGVcN8vva16__Z3baz2VVPi"
+// CHECK-DAG: "_ZGVdN8vva16__Z3baz2VVPi"
+// CHECK-DAG: "_ZGVeN16vva16__Z3baz2VVPi"
+
+// CHECK-DAG: "_ZGVbM2vv__Z3bay2VVRPd"
+// CHECK-DAG: "_ZGVbN2vv__Z3bay2VVRPd"
+// CHECK-DAG: "_ZGVcM4vv__Z3bay2VVRPd"
+// CHECK-DAG: "_ZGVcN4vv__Z3bay2VVRPd"
+// CHECK-DAG: "_ZGVdM4vv__Z3bay2VVRPd"
+// CHECK-DAG: "_ZGVdN4vv__Z3bay2VVRPd"
+// CHECK-DAG: "_ZGVeM8vv__Z3bay2VVRPd"
+// CHECK-DAG: "_ZGVeN8vv__Z3bay2VVRPd"
+// CHECK-DAG: "_ZGVbN2vva16__Z3bay2VVRPd"
+// CHECK-DAG: "_ZGVcN4vva16__Z3bay2VVRPd"
+// CHECK-DAG: "_ZGVdN4vva16__Z3bay2VVRPd"
+// CHECK-DAG: "_ZGVeN8vva16__Z3bay2VVRPd"
+
+// CHECK-DAG: "_ZGVbM4us2u__Z3bax2VVPdi"
+// CHECK-DAG: "_ZGVcM8us2u__Z3bax2VVPdi"
+// CHECK-DAG: "_ZGVdM8us2u__Z3bax2VVPdi"
+// CHECK-DAG: "_ZGVeM16us2u__Z3bax2VVPdi"
+// CHECK-DAG: "_ZGVbM4vvv__Z3bax2VVPdi"
+// CHECK-DAG: "_ZGVbN4vvv__Z3bax2VVPdi"
+// CHECK-DAG: "_ZGVcM8vvv__Z3bax2VVPdi"
+// CHECK-DAG: "_ZGVcN8vvv__Z3bax2VVPdi"
+// CHECK-DAG: "_ZGVdM8vvv__Z3bax2VVPdi"
+// CHECK-DAG: "_ZGVdN8vvv__Z3bax2VVPdi"
+// CHECK-DAG: "_ZGVeM16vvv__Z3bax2VVPdi"
+// CHECK-DAG: "_ZGVeN16vvv__Z3bax2VVPdi"
+
+// CHECK-DAG: "_ZGVbM4ua16vl1__Z3fooPffi"
+// CHECK-DAG: "_ZGVbN4ua16vl1__Z3fooPffi"
+// CHECK-DAG: "_ZGVcM8ua16vl1__Z3fooPffi"
+// CHECK-DAG: "_ZGVcN8ua16vl1__Z3fooPffi"
+// CHECK-DAG: "_ZGVdM8ua16vl1__Z3fooPffi"
+// CHECK-DAG: "_ZGVdN8ua16vl1__Z3fooPffi"
+// CHECK-DAG: "_ZGVeM16ua16vl1__Z3fooPffi"
+// CHECK-DAG: "_ZGVeN16ua16vl1__Z3fooPffi"
+
+// CHECK-DAG: "_ZGVbN2v__Z3food"
+// CHECK-DAG: "_ZGVcN4v__Z3food"
+// CHECK-DAG: "_ZGVdN4v__Z3food"
+// CHECK-DAG: "_ZGVeN8v__Z3food"
+
+// CHECK-NOT: "_ZGV{{.+}}__Z1fRA_i
+
+#endif
diff --git a/test/OpenMP/declare_simd_messages.cpp b/test/OpenMP/declare_simd_messages.cpp
new file mode 100644
index 0000000000000..15971eb14de5a
--- /dev/null
+++ b/test/OpenMP/declare_simd_messages.cpp
@@ -0,0 +1,231 @@
+// RUN: %clang_cc1 -triple=x86_64-pc-win32 -verify -fopenmp -x c++ -std=c++11 -fms-extensions %s
+
+// expected-error@+1 {{expected an OpenMP directive}}
+#pragma omp declare
+
+// expected-error@+2 {{'#pragma omp declare simd' can only be applied to functions}}
+#pragma omp declare simd
+int a;
+// expected-error@+2 {{'#pragma omp declare simd' can only be applied to functions}}
+#pragma omp declare simd
+#pragma omp threadprivate(a)
+int var;
+#pragma omp threadprivate(var)
+
+// expected-error@+2 {{expected an OpenMP directive}} expected-error@+1 {{function declaration is expected after 'declare simd' directive}}
+#pragma omp declare simd
+#pragma omp declare
+
+// expected-error@+3 {{function declaration is expected after 'declare simd' directive}}
+// expected-error@+1 {{function declaration is expected after 'declare simd' directive}}
+#pragma omp declare simd
+#pragma omp declare simd
+#pragma options align=packed
+int main();
+
+// expected-error@+3 {{function declaration is expected after 'declare simd' directive}}
+// expected-error@+1 {{function declaration is expected after 'declare simd' directive}}
+#pragma omp declare simd
+#pragma omp declare simd
+#pragma init_seg(compiler)
+int main();
+
+// expected-error@+1 {{single declaration is expected after 'declare simd' directive}}
+#pragma omp declare simd
+// expected-note@+1 {{declared here}}
+int b, c;
+
+// expected-error@+1 {{'C' does not refer to a value}}
+#pragma omp declare simd simdlen(C)
+// expected-note@+1 {{declared here}}
+template <class C>
+void h(C *hp, C *hp2, C *hq, C *lin) {
+  b = 0;
+}
+
+#pragma omp declare simd
+template <>
+void h(int *hp, int *hp2, int *hq, int *lin) {
+  h((float *)hp, (float *)hp2, (float *)hq, (float *)lin);
+}
+
+#pragma omp declare simd inbranch inbranch
+#pragma omp declare simd notinbranch notinbranch
+#pragma omp declare simd inbranch inbranch notinbranch // expected-error {{unexpected 'notinbranch' clause, 'inbranch' is specified already}}
+#pragma omp declare simd notinbranch notinbranch inbranch // expected-error {{unexpected 'inbranch' clause, 'notinbranch' is specified already}}
+// expected-note@+2 {{read of non-const variable 'b' is not allowed in a constant expression}}
+// expected-error@+1 {{expression is not an integral constant expression}}
+#pragma omp declare simd simdlen(b)
+// expected-error@+1 {{directive '#pragma omp declare simd' cannot contain more than one 'simdlen' clause}}
+#pragma omp declare simd simdlen(32) simdlen(c)
+// expected-error@+1 {{expected '(' after 'simdlen'}}
+#pragma omp declare simd simdlen
+// expected-note@+3 {{to match this '('}}
+// expected-error@+2 {{expected ')'}}
+// expected-error@+1 {{expected expression}}
+#pragma omp declare simd simdlen(
+// expected-error@+2 {{expected '(' after 'simdlen'}}
+// expected-error@+1 {{expected expression}}
+#pragma omp declare simd simdlen(), simdlen
+// expected-error@+1 2 {{expected expression}}
+#pragma omp declare simd simdlen(), simdlen()
+// expected-warning@+3 {{extra tokens at the end of '#pragma omp declare simd' are ignored}}
+// expected-error@+2 {{expected '(' after 'simdlen'}}
+// expected-error@+1 {{expected expression}}
+#pragma omp declare simd simdlen() simdlen)
+void foo();
+
+// expected-error@+3 4 {{expected reference to one of the parameters of function 'foo'}}
+// expected-error@+2 {{invalid use of 'this' outside of a non-static member function}}
+// expected-error@+1 {{argument to 'simdlen' clause must be a strictly positive integer value}}
+#pragma omp declare simd simdlen(N) uniform(this, var) aligned(var)
+template<int N>
+void foo() {}
+
+void test() {
+  // expected-note@+1 {{in instantiation of function template specialization 'foo<-3>' requested here}}
+  foo<-3>();
+}
+
+// expected-error@+1 {{expected '(' after 'uniform'}}
+#pragma omp declare simd uniform
+// expected-note@+3 {{to match this '('}}
+// expected-error@+2 {{expected ')'}}
+// expected-error@+1 {{expected expression}}
+#pragma omp declare simd uniform(
+// expected-error@+1 {{expected expression}}
+#pragma omp declare simd uniform()
+// expected-note@+3 {{to match this '('}}
+// expected-error@+2 {{expected ')'}}
+// expected-error@+1 {{invalid use of 'this' outside of a non-static member function}}
+#pragma omp declare simd uniform(this
+// expected-note@+3 {{to match this '('}}
+// expected-error@+2 {{expected ')'}}
+// expected-error@+1 {{invalid use of 'this' outside of a non-static member function}}
+#pragma omp declare simd uniform(this,a
+// expected-error@+1 {{expected expression}}
+#pragma omp declare simd uniform(,a)
+// expected-error@+1 {{expected '(' after 'aligned'}}
+#pragma omp declare simd aligned
+// expected-note@+3 {{to match this '('}}
+// expected-error@+2 {{expected ')'}}
+// expected-error@+1 {{expected expression}}
+#pragma omp declare simd aligned(
+// expected-error@+1 {{expected expression}}
+#pragma omp declare simd aligned()
+// expected-note@+3 {{to match this '('}}
+// expected-error@+2 {{expected ')'}}
+// expected-error@+1 {{expected expression}}
+#pragma omp declare simd aligned(a:
+// expected-error@+1 {{expected expression}}
+#pragma omp declare simd aligned(a:)
+// expected-warning@+2 {{extra tokens at the end of '#pragma omp declare simd' are ignored}}
+// expected-error@+1 {{expected '(' after 'aligned'}}
+#pragma omp declare simd aligned :)
+// expected-note@+3 {{to match this '('}}
+// expected-error@+2 {{expected ')'}}
+// expected-error@+1 {{invalid use of 'this' outside of a non-static member function}}
+#pragma omp declare simd aligned(this
+// expected-note@+3 {{to match this '('}}
+// expected-error@+2 {{expected ')'}}
+// expected-error@+1 {{invalid use of 'this' outside of a non-static member function}}
+#pragma omp declare simd aligned(this,b
+// expected-error@+1 {{expected expression}}
+#pragma omp declare simd aligned(, b)
+// expected-note@+4 {{defined as aligned}}
+// expected-error@+3 {{a parameter cannot appear in more than one aligned clause}}
+// expected-error@+2 {{expected expression}}
+// expected-error@+1 {{expected ',' or ')' in 'aligned' clause}}
+#pragma omp declare simd aligned(b) aligned(b ; 64)
+// expected-note@+2 {{defined as aligned}}
+// expected-error@+1 {{a parameter cannot appear in more than one aligned clause}}
+#pragma omp declare simd aligned(b) aligned(b: 64)
+// expected-error@+1 {{argument to 'aligned' clause must be a strictly positive integer value}}
+#pragma omp declare simd aligned(b: -1)
+// expected-warning@+1 {{aligned clause will be ignored because the requested alignment is not a power of 2}}
+#pragma omp declare simd aligned(b: 3)
+// expected-error@+1 {{expected '(' after 'linear'}}
+#pragma omp declare simd linear
+// expected-note@+3 {{to match this '('}}
+// expected-error@+2 {{expected ')'}}
+// expected-error@+1 {{expected expression}}
+#pragma omp declare simd linear(
+// expected-error@+1 {{expected expression}}
+#pragma omp declare simd linear()
+// expected-note@+3 {{to match this '('}}
+// expected-error@+2 {{expected ')'}}
+// expected-error@+1 {{expected expression}}
+#pragma omp declare simd linear(a:
+// expected-error@+1 {{expected expression}}
+#pragma omp declare simd linear(a:)
+// expected-warning@+2 {{extra tokens at the end of '#pragma omp declare simd' are ignored}}
+// expected-error@+1 {{expected '(' after 'linear'}}
+#pragma omp declare simd linear :)
+// expected-note@+3 {{to match this '('}}
+// expected-error@+2 {{expected ')'}}
+// expected-error@+1 {{invalid use of 'this' outside of a non-static member function}}
+#pragma omp declare simd linear(this
+// expected-note@+3 {{to match this '('}}
+// expected-error@+2 {{expected ')'}}
+// expected-error@+1 {{invalid use of 'this' outside of a non-static member function}}
+#pragma omp declare simd linear(this,b
+// expected-error@+1 {{expected expression}}
+#pragma omp declare simd linear(, b)
+// expected-note@+4 {{defined as linear}}
+// expected-error@+3 {{linear variable cannot be linear}}
+// expected-error@+2 {{expected expression}}
+// expected-error@+1 {{expected ',' or ')' in 'linear' clause}}
+#pragma omp declare simd linear(b) linear(b ; 64)
+// expected-note@+2 {{defined as linear}}
+// expected-error@+1 {{linear variable cannot be linear}}
+#pragma omp declare simd linear(b) linear(b: 64)
+#pragma omp declare simd linear(b: -1)
+#pragma omp declare simd linear(b: 3)
+// expected-error@+1 {{expected a reference to a parameter specified in a 'uniform' clause}}
+#pragma omp declare simd linear(b: a)
+// expected-note@+2 {{defined as uniform}}
+// expected-error@+1 {{linear variable cannot be uniform}}
+#pragma omp declare simd uniform(a), linear(a: 4)
+// expected-note@+2 {{defined as uniform}}
+// expected-error@+1 {{linear variable cannot be uniform}}
+#pragma omp declare simd linear(a: 4) uniform(a)
+// expected-error@+1 {{variable of non-reference type 'int *' can be used only with 'val' modifier, but used with 'uval'}}
+#pragma omp declare simd linear(uval(b))
+// expected-error@+1 {{variable of non-reference type 'int *' can be used only with 'val' modifier, but used with 'ref'}}
+#pragma omp declare simd linear(ref(b))
+// expected-error@+1 {{expected one of 'ref', val' or 'uval' modifiers}}
+#pragma omp declare simd linear(uref(b))
+void bar(int a, int *b);
+
+template <class T>
+struct St {
+// expected-error@+2 {{function declaration is expected after 'declare simd' directive}}
+#pragma init_seg(compiler)
+#pragma omp declare simd
+#pragma init_seg(compiler)
+// expected-note@+7 {{defined as uniform}}
+// expected-error@+6 {{expected a reference to a parameter specified in a 'uniform' clause}}
+// expected-error@+5 {{linear variable cannot be uniform}}
+// expected-note@+4 {{defined as aligned}}
+// expected-error@+3 {{argument to 'aligned' clause must be a strictly positive integer value}}
+// expected-error@+2 {{'this' cannot appear in more than one aligned clause}}
+// expected-error@+1 {{use of undeclared identifier 't'}}
+#pragma omp declare simd uniform(this, t) aligned(this: 4) aligned(this: -4) linear(this: hp)
+  void h(T *hp) {
+// expected-error@+1 {{unexpected OpenMP directive '#pragma omp declare simd'}}
+#pragma omp declare simd
+    *hp = *t;
+  }
+
+private:
+  T t;
+};
+
+namespace N {
+  // expected-error@+1 {{function declaration is expected after 'declare simd' directive}}
+  #pragma omp declare simd
+}
+// expected-error@+1 {{function declaration is expected after 'declare simd' directive}}
+#pragma omp declare simd
+// expected-error@+1 {{function declaration is expected after 'declare simd' directive}}
+#pragma omp declare simd
diff --git a/test/OpenMP/declare_target_ast_print.cpp b/test/OpenMP/declare_target_ast_print.cpp
new file mode 100644
index 0000000000000..78a9cf634af89
--- /dev/null
+++ b/test/OpenMP/declare_target_ast_print.cpp
@@ -0,0 +1,138 @@
+// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+#pragma omp declare target
+// CHECK: #pragma omp declare target
+void foo() {}
+// CHECK-NEXT: void foo()
+#pragma omp end declare target
+// CHECK: #pragma omp end declare target
+
+extern "C" {
+#pragma omp declare target
+// CHECK: #pragma omp declare target
+void foo_c() {}
+// CHECK-NEXT: void foo_c()
+#pragma omp end declare target
+// CHECK: #pragma omp end declare target
+}
+
+extern "C++" {
+#pragma omp declare target
+// CHECK: #pragma omp declare target
+void foo_cpp() {}
+// CHECK-NEXT: void foo_cpp()
+#pragma omp end declare target
+// CHECK: #pragma omp end declare target
+}
+
+#pragma omp declare target
+template <class T>
+struct C {
+// CHECK: template <class T = int> struct C
+  T t;
+// CHECK-NEXT: int t;
+  static T ts;
+// CHECK-NEXT: #pragma omp declare target
+// CHECK-NEXT: static int ts;
+// CHECK: #pragma omp end declare target
+
+  C(T t) : t(t) {
+  }
+// CHECK: #pragma omp declare target
+// CHECK-NEXT: C(int t) : t(t) {
+// CHECK-NEXT: }
+// CHECK: #pragma omp end declare target
+
+  T foo() {
+    return t;
+  }
+// CHECK: #pragma omp declare target
+// CHECK-NEXT: int foo() {
+// CHECK-NEXT: return this->t;
+// CHECK-NEXT: }
+// CHECK: #pragma omp end declare target
+};
+
+// CHECK: template <class T> struct C {
+// CHECK: #pragma omp declare target
+// CHECK-NEXT: static T ts;
+// CHECK-NEXT: #pragma omp end declare target
+
+template<class T>
+T C<T>::ts = 1;
+// CHECK: #pragma omp declare target
+// CHECK: T ts = 1;
+// CHECK: #pragma omp end declare target
+
+// CHECK: #pragma omp declare target
+// CHECK: int test1()
+int test1() {
+  C<int> c(1);
+  return c.foo() + c.ts;
+}
+#pragma omp end declare target
+// CHECK: #pragma omp end declare target
+
+int a1;
+void f1() {
+}
+#pragma omp declare target (a1, f1)
+// CHECK: #pragma omp declare target
+// CHECK: int a1;
+// CHECK: #pragma omp end declare target
+// CHECK: #pragma omp declare target
+// CHECK: void f1()
+// CHECK: #pragma omp end declare target
+
+int b1, b2, b3;
+void f2() {
+}
+#pragma omp declare target to(b1) to(b2), to(b3, f2)
+// CHECK: #pragma omp declare target
+// CHECK: int b1;
+// CHECK: #pragma omp end declare target
+// CHECK: #pragma omp declare target
+// CHECK: int b2;
+// CHECK: #pragma omp end declare target
+// CHECK: #pragma omp declare target
+// CHECK: int b3;
+// CHECK: #pragma omp end declare target
+// CHECK: #pragma omp declare target
+// CHECK: void f2()
+// CHECK: #pragma omp end declare target
+
+int c1, c2, c3;
+void f3() {
+}
+#pragma omp declare target link(c1) link(c2), link(c3, f3)
+// CHECK: #pragma omp declare target link
+// CHECK: int c1;
+// CHECK: #pragma omp end declare target
+// CHECK: #pragma omp declare target link
+// CHECK: int c2;
+// CHECK: #pragma omp end declare target
+// CHECK: #pragma omp declare target link
+// CHECK: int c3;
+// CHECK: #pragma omp end declare target
+// CHECK: #pragma omp declare target link
+// CHECK: void f3()
+// CHECK: #pragma omp end declare target
+
+int main (int argc, char **argv) {
+  foo();
+  foo_c();
+  foo_cpp();
+  test1();
+  return (0);
+}
+
+// CHECK: #pragma omp declare target
+// CHECK-NEXT: int ts = 1;
+// CHECK-NEXT: #pragma omp end declare target
+#endif
diff --git a/test/OpenMP/declare_target_messages.cpp b/test/OpenMP/declare_target_messages.cpp
new file mode 100644
index 0000000000000..b858d53c1ecf5
--- /dev/null
+++ b/test/OpenMP/declare_target_messages.cpp
@@ -0,0 +1,101 @@
+// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify -fopenmp -fnoopenmp-use-tls -ferror-limit 100 -o - %s
+
+#pragma omp end declare target // expected-error {{unexpected OpenMP directive '#pragma omp end declare target'}}
+
+int a, b; // expected-warning {{declaration is not declared in any declare target region}}
+__thread int t; // expected-note {{defined as threadprivate or thread local}}
+
+#pragma omp declare target . // expected-error {{expected '(' after 'declare target'}}
+
+#pragma omp declare target
+void f();
+#pragma omp end declare target shared(a) // expected-warning {{extra tokens at the end of '#pragma omp end declare target' are ignored}}
+
+#pragma omp declare target map(a) // expected-error {{unexpected 'map' clause, only 'to' or 'link' clauses expected}}
+
+void c(); // expected-warning {{declaration is not declared in any declare target region}}
+
+extern int b;
+
+struct NonT {
+  int a;
+};
+
+typedef int sint;
+
+#pragma omp declare target // expected-note {{to match this '#pragma omp declare target'}}
+#pragma omp threadprivate(a) // expected-note {{defined as threadprivate or thread local}}
+extern int b;
+int g;
+
+struct T { // expected-note {{mappable type cannot be polymorphic}}
+  int a;
+  virtual int method();
+};
+
+class VC { // expected-note {{mappable type cannot be polymorphic}}
+  T member;
+  NonT member1;
+  public:
+    virtual int method() { T a; return 0; } // expected-error {{type 'T' is not mappable to target}}
+};
+
+struct C {
+  NonT a;
+  sint b;
+  int method();
+  int method1();
+};
+
+int C::method1() {
+  return 0;
+}
+
+void foo() {
+  a = 0; // expected-error {{threadprivate variables cannot be used in target constructs}}
+  b = 0; // expected-note {{used here}}
+  t = 1; // expected-error {{threadprivate variables cannot be used in target constructs}}
+  C object;
+  VC object1; // expected-error {{type 'VC' is not mappable to target}}
+  g = object.method();
+  g += object.method1();
+  g += object1.method();
+  f();
+  c(); // expected-note {{used here}}
+}
+#pragma omp declare target // expected-error {{expected '#pragma omp end declare target'}}
+void foo1() {}
+#pragma omp end declare target
+#pragma omp end declare target // expected-error {{unexpected OpenMP directive '#pragma omp end declare target'}}
+
+int C::method() {
+  return 0;
+}
+
+struct S {
+#pragma omp declare target // expected-error {{directive must be at file or namespace scope}}
+  int v;
+#pragma omp end declare target // expected-error {{unexpected OpenMP directive '#pragma omp end declare target'}}
+};
+
+int main (int argc, char **argv) {
+#pragma omp declare target // expected-error {{unexpected OpenMP directive '#pragma omp declare target'}}
+  int v;
+#pragma omp end declare target // expected-error {{unexpected OpenMP directive '#pragma omp end declare target'}}
+  foo();
+  return (0);
+}
+
+namespace {
+#pragma omp declare target // expected-note {{to match this '#pragma omp declare target'}}
+  int x;
+} //  expected-error {{expected '#pragma omp end declare target'}}
+#pragma omp end declare target // expected-error {{unexpected OpenMP directive '#pragma omp end declare target'}}
+
+#pragma omp declare target link(S) // expected-error {{'S' used in declare target directive is not a variable or a function name}}
+
+#pragma omp declare target (x, x) // expected-error {{'x' appears multiple times in clauses on the same declare target directive}}
+#pragma omp declare target to(x) to(x) // expected-error {{'x' appears multiple times in clauses on the same declare target directive}}
+#pragma omp declare target link(x) // expected-error {{'x' must not appear in both clauses 'to' and 'link'}}
+
+#pragma omp declare target // expected-error {{expected '#pragma omp end declare target'}} expected-note {{to match this '#pragma omp declare target'}}
diff --git a/test/OpenMP/distribute_ast_print.cpp b/test/OpenMP/distribute_ast_print.cpp
index c3a175a5e5e84..5748fc79ed67f 100644
--- a/test/OpenMP/distribute_ast_print.cpp
+++ b/test/OpenMP/distribute_ast_print.cpp
@@ -8,6 +8,75 @@
 
 void foo() {}
 
+struct S {
+  S(): a(0) {}
+  S(int v) : a(v) {}
+  int a;
+  typedef int type;
+};
+
+template <typename T>
+class S7 : public T {
+protected:
+  T a;
+  S7() : a(0) {}
+
+public:
+  S7(typename T::type v) : a(v) {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute private(a) private(this->a) private(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S7 &operator=(S7 &s) {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute private(a) private(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+// CHECK: #pragma omp target
+// CHECK-NEXT: #pragma omp teams
+// CHECK-NEXT: #pragma omp distribute private(this->a) private(this->a) private(this->S::a)
+// CHECK: #pragma omp target
+// CHECK-NEXT: #pragma omp teams
+// CHECK-NEXT: #pragma omp distribute private(this->a) private(this->a) private(T::a)
+// CHECK: #pragma omp target
+// CHECK-NEXT: #pragma omp teams
+// CHECK-NEXT: #pragma omp distribute private(this->a) private(this->a)
+
+class S8 : public S7<S> {
+  S8() {}
+
+public:
+  S8(int v) : S7<S>(v){
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute private(a) private(this->a) private(S7<S>::a) 
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S8 &operator=(S8 &s) {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute private(a) private(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+// CHECK: #pragma omp target
+// CHECK-NEXT: #pragma omp teams
+// CHECK-NEXT: #pragma omp distribute private(this->a) private(this->a) private(this->S7<S>::a)
+// CHECK: #pragma omp target
+// CHECK-NEXT: #pragma omp teams
+// CHECK-NEXT: #pragma omp distribute private(this->a) private(this->a)
+
 template <class T, int N>
 T tmain(T argc) {
   T b = argc, c, d, e, f, g;
diff --git a/test/OpenMP/distribute_codegen.cpp b/test/OpenMP/distribute_codegen.cpp
new file mode 100644
index 0000000000000..37f00f09178b0
--- /dev/null
+++ b/test/OpenMP/distribute_codegen.cpp
@@ -0,0 +1,263 @@
+// Test host codegen.
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64  --check-prefix HCHECK
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32  --check-prefix HCHECK
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix HCHECK
+
+// Test target codegen - host bc file has to be created first. (no significant differences with host version of target region)
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+// CHECK-DAG: %ident_t = type { i32, i32, i32, i32, i8* }
+// CHECK-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00"
+// CHECK-DAG: [[DEF_LOC_0:@.+]] = private unnamed_addr constant %ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
+
+// CHECK-LABEL: define {{.*void}} @{{.*}}without_schedule_clause{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}})
+void without_schedule_clause(float *a, float *b, float *c, float *d) {
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute
+  for (int i = 33; i < 32000000; i += 7) {
+    a[i] = b[i] * c[i] * d[i];
+  }
+}
+
+// CHECK: define {{.*}}void @.omp_outlined.(i32* noalias [[GBL_TIDP:%.+]], i32* noalias [[BND_TID:%.+]], float** dereferenceable({{[0-9]+}}) [[APTR:%.+]], float** dereferenceable({{[0-9]+}}) [[BPTR:%.+]], float** dereferenceable({{[0-9]+}}) [[CPTR:%.+]], float** dereferenceable({{[0-9]+}}) [[DPTR:%.+]])
+// CHECK:  [[TID_ADDR:%.+]] = alloca i32*
+// CHECK:  [[IV:%.+iv]] = alloca i32
+// CHECK:  [[LB:%.+lb]] = alloca i32
+// CHECK:  [[UB:%.+ub]] = alloca i32
+// CHECK:  [[ST:%.+stride]] = alloca i32
+// CHECK:  [[LAST:%.+last]] = alloca i32
+// CHECK-DAG:  store i32* [[GBL_TIDP]], i32** [[TID_ADDR]]
+// CHECK-DAG:  store i32 0, i32* [[LB]]
+// CHECK-DAG:  store i32 4571423, i32* [[UB]]
+// CHECK-DAG:  store i32 1, i32* [[ST]]
+// CHECK-DAG:  store i32 0, i32* [[LAST]]
+// CHECK-DAG:  [[GBL_TID:%.+]] = load i32*, i32** [[TID_ADDR]]
+// CHECK-DAG:  [[GBL_TIDV:%.+]] = load i32, i32* [[GBL_TID]]
+// CHECK:  call void @__kmpc_for_static_init_{{.+}}(%ident_t* [[DEF_LOC_0]], i32 [[GBL_TIDV]], i32 92, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1)
+// CHECK-DAG:  [[UBV0:%.+]] = load i32, i32* [[UB]]
+// CHECK-DAG:  [[USWITCH:%.+]] = icmp sgt i32 [[UBV0]], 4571423
+// CHECK:  br i1 [[USWITCH]], label %[[BBCT:.+]], label %[[BBCF:.+]]
+// CHECK-DAG:  [[BBCT]]:
+// CHECK-DAG:  br label %[[BBCE:.+]]
+// CHECK-DAG:  [[BBCF]]:
+// CHECK-DAG:  [[UBV1:%.+]] = load i32, i32* [[UB]]
+// CHECK-DAG:  br label %[[BBCE]]
+// CHECK:  [[BBCE]]:
+// CHECK:  [[SELUB:%.+]] = phi i32 [ 4571423, %[[BBCT]] ], [ [[UBV1]], %[[BBCF]] ]
+// CHECK:  store i32 [[SELUB]], i32* [[UB]]
+// CHECK:  [[LBV0:%.+]] = load i32, i32* [[LB]]
+// CHECK:  store i32 [[LBV0]], i32* [[IV]]
+// CHECK:  br label %[[BBINNFOR:.+]]
+// CHECK:  [[BBINNFOR]]:
+// CHECK:  [[IVVAL0:%.+]] = load i32, i32* [[IV]]
+// CHECK:  [[UBV2:%.+]] = load i32, i32* [[UB]]
+// CHECK:  [[IVLEUB:%.+]] = icmp sle i32 [[IVVAL0]], [[UBV2]]
+// CHECK:  br i1 [[IVLEUB]], label %[[BBINNBODY:.+]], label %[[BBINNEND:.+]]
+// CHECK:  [[BBINNBODY]]:
+// CHECK:  {{.+}} = load i32, i32* [[IV]]
+// ... loop body ...
+// CHECK:  br label %[[BBBODYCONT:.+]]
+// CHECK:  [[BBBODYCONT]]:
+// CHECK:  br label %[[BBINNINC:.+]]
+// CHECK:  [[BBINNINC]]:
+// CHECK:  [[IVVAL1:%.+]] = load i32, i32* [[IV]]
+// CHECK:  [[IVINC:%.+]] = add nsw i32 [[IVVAL1]], 1
+// CHECK:  store i32 [[IVINC]], i32* [[IV]]
+// CHECK:  br label %[[BBINNFOR]]
+// CHECK:  [[BBINNEND]]:
+// CHECK:  br label %[[LPEXIT:.+]]
+// CHECK:  [[LPEXIT]]:
+// CHECK:  call void @__kmpc_for_static_fini(%ident_t* [[DEF_LOC_0]], i32 [[GBL_TIDV]])
+// CHECK:  ret void
+
+
+// CHECK-LABEL: define {{.*void}} @{{.*}}static_not_chunked{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}})
+void static_not_chunked(float *a, float *b, float *c, float *d) {
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute dist_schedule(static)
+  for (int i = 32000000; i > 33; i += -7) {
+        a[i] = b[i] * c[i] * d[i];
+  }
+}
+
+// CHECK: define {{.*}}void @.omp_outlined.{{.*}}(i32* noalias [[GBL_TIDP:%.+]], i32* noalias [[BND_TID:%.+]], float** dereferenceable({{[0-9]+}}) [[APTR:%.+]], float** dereferenceable({{[0-9]+}}) [[BPTR:%.+]], float** dereferenceable({{[0-9]+}}) [[CPTR:%.+]], float** dereferenceable({{[0-9]+}}) [[DPTR:%.+]])
+// CHECK:  [[TID_ADDR:%.+]] = alloca i32*
+// CHECK:  [[IV:%.+iv]] = alloca i32
+// CHECK:  [[LB:%.+lb]] = alloca i32
+// CHECK:  [[UB:%.+ub]] = alloca i32
+// CHECK:  [[ST:%.+stride]] = alloca i32
+// CHECK:  [[LAST:%.+last]] = alloca i32
+// CHECK-DAG:  store i32* [[GBL_TIDP]], i32** [[TID_ADDR]]
+// CHECK-DAG:  store i32 0, i32* [[LB]]
+// CHECK-DAG:  store i32 4571423, i32* [[UB]]
+// CHECK-DAG:  store i32 1, i32* [[ST]]
+// CHECK-DAG:  store i32 0, i32* [[LAST]]
+// CHECK-DAG:  [[GBL_TID:%.+]] = load i32*, i32** [[TID_ADDR]]
+// CHECK-DAG:  [[GBL_TIDV:%.+]] = load i32, i32* [[GBL_TID]]
+// CHECK:  call void @__kmpc_for_static_init_{{.+}}(%ident_t* [[DEF_LOC_0]], i32 [[GBL_TIDV]], i32 92, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1)
+// CHECK-DAG:  [[UBV0:%.+]] = load i32, i32* [[UB]]
+// CHECK-DAG:  [[USWITCH:%.+]] = icmp sgt i32 [[UBV0]], 4571423
+// CHECK:  br i1 [[USWITCH]], label %[[BBCT:.+]], label %[[BBCF:.+]]
+// CHECK-DAG:  [[BBCT]]:
+// CHECK-DAG:  br label %[[BBCE:.+]]
+// CHECK-DAG:  [[BBCF]]:
+// CHECK-DAG:  [[UBV1:%.+]] = load i32, i32* [[UB]]
+// CHECK-DAG:  br label %[[BBCE]]
+// CHECK:  [[BBCE]]:
+// CHECK:  [[SELUB:%.+]] = phi i32 [ 4571423, %[[BBCT]] ], [ [[UBV1]], %[[BBCF]] ]
+// CHECK:  store i32 [[SELUB]], i32* [[UB]]
+// CHECK:  [[LBV0:%.+]] = load i32, i32* [[LB]]
+// CHECK:  store i32 [[LBV0]], i32* [[IV]]
+// CHECK:  br label %[[BBINNFOR:.+]]
+// CHECK:  [[BBINNFOR]]:
+// CHECK:  [[IVVAL0:%.+]] = load i32, i32* [[IV]]
+// CHECK:  [[UBV2:%.+]] = load i32, i32* [[UB]]
+// CHECK:  [[IVLEUB:%.+]] = icmp sle i32 [[IVVAL0]], [[UBV2]]
+// CHECK:  br i1 [[IVLEUB]], label %[[BBINNBODY:.+]], label %[[BBINNEND:.+]]
+// CHECK:  [[BBINNBODY]]:
+// CHECK:  {{.+}} = load i32, i32* [[IV]]
+// ... loop body ...
+// CHECK:  br label %[[BBBODYCONT:.+]]
+// CHECK:  [[BBBODYCONT]]:
+// CHECK:  br label %[[BBINNINC:.+]]
+// CHECK:  [[BBINNINC]]:
+// CHECK:  [[IVVAL1:%.+]] = load i32, i32* [[IV]]
+// CHECK:  [[IVINC:%.+]] = add nsw i32 [[IVVAL1]], 1
+// CHECK:  store i32 [[IVINC]], i32* [[IV]]
+// CHECK:  br label %[[BBINNFOR]]
+// CHECK:  [[BBINNEND]]:
+// CHECK:  br label %[[LPEXIT:.+]]
+// CHECK:  [[LPEXIT]]:
+// CHECK:  call void @__kmpc_for_static_fini(%ident_t* [[DEF_LOC_0]], i32 [[GBL_TIDV]])
+// CHECK:  ret void
+
+
+// CHECK-LABEL: define {{.*void}} @{{.*}}static_chunked{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}})
+void static_chunked(float *a, float *b, float *c, float *d) {
+  #pragma omp target
+  #pragma omp teams
+#pragma omp distribute dist_schedule(static, 5)
+  for (unsigned i = 131071; i <= 2147483647; i += 127) {
+    a[i] = b[i] * c[i] * d[i];
+  }
+}
+
+// CHECK: define {{.*}}void @.omp_outlined.{{.*}}(i32* noalias [[GBL_TIDP:%.+]], i32* noalias [[BND_TID:%.+]], float** dereferenceable({{[0-9]+}}) [[APTR:%.+]], float** dereferenceable({{[0-9]+}}) [[BPTR:%.+]], float** dereferenceable({{[0-9]+}}) [[CPTR:%.+]], float** dereferenceable({{[0-9]+}}) [[DPTR:%.+]])
+// CHECK:  [[TID_ADDR:%.+]] = alloca i32*
+// CHECK:  [[IV:%.+iv]] = alloca i32
+// CHECK:  [[LB:%.+lb]] = alloca i32
+// CHECK:  [[UB:%.+ub]] = alloca i32
+// CHECK:  [[ST:%.+stride]] = alloca i32
+// CHECK:  [[LAST:%.+last]] = alloca i32
+// CHECK-DAG:  store i32* [[GBL_TIDP]], i32** [[TID_ADDR]]
+// CHECK-DAG:  store i32 0, i32* [[LB]]
+// CHECK-DAG:  store i32 16908288, i32* [[UB]]
+// CHECK-DAG:  store i32 1, i32* [[ST]]
+// CHECK-DAG:  store i32 0, i32* [[LAST]]
+// CHECK-DAG:  [[GBL_TID:%.+]] = load i32*, i32** [[TID_ADDR]]
+// CHECK-DAG:  [[GBL_TIDV:%.+]] = load i32, i32* [[GBL_TID]]
+// CHECK:  call void @__kmpc_for_static_init_{{.+}}(%ident_t* [[DEF_LOC_0]], i32 [[GBL_TIDV]], i32 91, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 5)
+// CHECK-DAG:  [[UBV0:%.+]] = load i32, i32* [[UB]]
+// CHECK-DAG:  [[USWITCH:%.+]] = icmp ugt i32 [[UBV0]], 16908288
+// CHECK:  br i1 [[USWITCH]], label %[[BBCT:.+]], label %[[BBCF:.+]]
+// CHECK-DAG:  [[BBCT]]:
+// CHECK-DAG:  br label %[[BBCE:.+]]
+// CHECK-DAG:  [[BBCF]]:
+// CHECK-DAG:  [[UBV1:%.+]] = load i32, i32* [[UB]]
+// CHECK-DAG:  br label %[[BBCE]]
+// CHECK:  [[BBCE]]:
+// CHECK:  [[SELUB:%.+]] = phi i32 [ 16908288, %[[BBCT]] ], [ [[UBV1]], %[[BBCF]] ]
+// CHECK:  store i32 [[SELUB]], i32* [[UB]]
+// CHECK:  [[LBV0:%.+]] = load i32, i32* [[LB]]
+// CHECK:  store i32 [[LBV0]], i32* [[IV]]
+// CHECK:  br label %[[BBINNFOR:.+]]
+// CHECK:  [[BBINNFOR]]:
+// CHECK:  [[IVVAL0:%.+]] = load i32, i32* [[IV]]
+// CHECK:  [[UBV2:%.+]] = load i32, i32* [[UB]]
+// CHECK:  [[IVLEUB:%.+]] = icmp ule i32 [[IVVAL0]], [[UBV2]]
+// CHECK:  br i1 [[IVLEUB]], label %[[BBINNBODY:.+]], label %[[BBINNEND:.+]]
+// CHECK:  [[BBINNBODY]]:
+// CHECK:  {{.+}} = load i32, i32* [[IV]]
+// ... loop body ...
+// CHECK:  br label %[[BBBODYCONT:.+]]
+// CHECK:  [[BBBODYCONT]]:
+// CHECK:  br label %[[BBINNINC:.+]]
+// CHECK:  [[BBINNINC]]:
+// CHECK:  [[IVVAL1:%.+]] = load i32, i32* [[IV]]
+// CHECK:  [[IVINC:%.+]] = add i32 [[IVVAL1]], 1
+// CHECK:  store i32 [[IVINC]], i32* [[IV]]
+// CHECK:  br label %[[BBINNFOR]]
+// CHECK:  [[BBINNEND]]:
+// CHECK:  br label %[[LPEXIT:.+]]
+// CHECK:  [[LPEXIT]]:
+// CHECK:  call void @__kmpc_for_static_fini(%ident_t* [[DEF_LOC_0]], i32 [[GBL_TIDV]])
+// CHECK:  ret void
+
+// CHECK-LABEL: test_precond
+void test_precond() {
+  char a = 0;
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute
+  for(char i = a; i < 10; ++i);
+}
+
+// a is passed as a parameter to the outlined functions
+// CHECK:  define {{.*}}void @.omp_outlined.{{.*}}(i32* noalias [[GBL_TIDP:%.+]], i32* noalias [[BND_TID:%.+]], i8* dereferenceable({{[0-9]+}}) [[APARM:%.+]])
+// CHECK:  store i8* [[APARM]], i8** [[APTRADDR:%.+]]
+// ..many loads of %0..
+// CHECK:  [[A2:%.+]] = load i8*, i8** [[APTRADDR]]
+// CHECK:  [[AVAL0:%.+]] = load i8, i8* [[A2]]
+// CHECK:  store i8 [[AVAL0]], i8* [[CAP_EXPR:%.+]],
+// CHECK:  [[AVAL1:%.+]] = load i8, i8* [[CAP_EXPR]]
+// CHECK:  load i8, i8* [[CAP_EXPR]]
+// CHECK:  [[AVAL2:%.+]] = load i8, i8* [[CAP_EXPR]]
+// CHECK:  [[ACONV:%.+]] = sext i8 [[AVAL2]] to i32
+// CHECK:  [[ACMP:%.+]] = icmp slt i32 [[ACONV]], 10
+// CHECK:  br i1 [[ACMP]], label %[[PRECOND_THEN:.+]], label %[[PRECOND_END:.+]]
+// CHECK:  [[PRECOND_THEN]]
+// CHECK:  call void @__kmpc_for_static_init_4
+// CHECK:  call void @__kmpc_for_static_fini
+// CHECK:  [[PRECOND_END]]
+
+// no templates for now, as these require special handling in target regions and/or declare target
+
+// HCHECK-LABEL: fint
+// HCHECK: call {{.*}}i32 {{.+}}ftemplate
+// HCHECK: ret i32
+
+// HCHECK: load i16, i16*
+// HCHECK: store i16 %
+// HCHECK: call i32 @__tgt_target_teams(
+// HCHECK: call void @__kmpc_for_static_init_4(
+template <typename T>
+T ftemplate() {
+  short aa = 0;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute dist_schedule(static, aa)
+  for (int i = 0; i < 100; i++) {
+  }
+  return T();
+}
+
+int fint(void) { return ftemplate<int>(); }
+
+#endif
diff --git a/test/OpenMP/distribute_dist_schedule_ast_print.cpp b/test/OpenMP/distribute_dist_schedule_ast_print.cpp
new file mode 100644
index 0000000000000..32389ef1496e1
--- /dev/null
+++ b/test/OpenMP/distribute_dist_schedule_ast_print.cpp
@@ -0,0 +1,106 @@
+// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+void foo() {}
+
+template <class T, int N>
+T tmain(T argc) {
+  T b = argc, c, d, e, f, g;
+  static T a;
+// CHECK: static T a;
+#pragma omp distribute dist_schedule(static,10)
+// CHECK-NEXT: #pragma omp distribute dist_schedule(static, 10)
+  for (int i=0; i < 2; ++i)a=2;
+// CHECK-NEXT: for (int i = 0; i < 2; ++i)
+// CHECK-NEXT: a = 2;
+#pragma omp distribute dist_schedule(static,a)
+// CHECK-NEXT: #pragma omp distribute dist_schedule(static, a)
+  for (int i=0; i < 2; ++i)a=2;
+// CHECK-NEXT: for (int i = 0; i < 2; ++i)
+// CHECK-NEXT: a = 2;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute dist_schedule(static,2)
+  for (int i = 0; i < 10; ++i)
+  for (int j = 0; j < 10; ++j)foo();
+// CHECK-NEXT: #pragma omp target
+// CHECK-NEXT: #pragma omp teams
+// CHECK-NEXT: #pragma omp distribute dist_schedule(static, 2)
+// CHECK-NEXT: for (int i = 0; i < 10; ++i)
+// CHECK-NEXT: for (int j = 0; j < 10; ++j)
+// CHECK-NEXT: foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute dist_schedule(static,a)
+  for (int i = 0; i < 10; ++i)
+  for (int j = 0; j < 10; ++j)foo();
+// CHECK-NEXT: #pragma omp target
+// CHECK-NEXT: #pragma omp teams
+// CHECK-NEXT: #pragma omp distribute dist_schedule(static, a)
+// CHECK-NEXT: for (int i = 0; i < 10; ++i)
+// CHECK-NEXT: for (int j = 0; j < 10; ++j)
+// CHECK-NEXT: foo();
+  for (int i = 0; i < 10; ++i)foo();
+// CHECK-NEXT: for (int i = 0; i < 10; ++i)
+// CHECK-NEXT: foo();
+#pragma omp distribute
+// CHECK: #pragma omp distribute
+  for (int i = 0; i < 10; ++i)foo();
+// CHECK-NEXT: for (int i = 0; i < 10; ++i)
+// CHECK-NEXT: foo();  
+  return T();
+}
+
+int main (int argc, char **argv) {
+  int b = argc, c, d, e, f, g;
+  static int a;
+// CHECK: static int a;
+#pragma omp distribute dist_schedule(static,10)
+// CHECK-NEXT: #pragma omp distribute dist_schedule(static, 10)
+  for (int i=0; i < 2; ++i)a=2;
+// CHECK-NEXT: for (int i = 0; i < 2; ++i)
+// CHECK-NEXT: a = 2;
+#pragma omp distribute dist_schedule(static,b)
+// CHECK-NEXT: #pragma omp distribute dist_schedule(static, b)
+  for (int i=0; i < 2; ++i)a=2;
+// CHECK-NEXT: for (int i = 0; i < 2; ++i)
+// CHECK-NEXT: a = 2;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute dist_schedule(static,2)
+  for (int i = 0; i < 10; ++i)
+  for (int j = 0; j < 10; ++j)foo();
+// CHECK-NEXT: #pragma omp target
+// CHECK-NEXT: #pragma omp teams
+// CHECK-NEXT: #pragma omp distribute dist_schedule(static, 2)
+// CHECK-NEXT: for (int i = 0; i < 10; ++i)
+// CHECK-NEXT: for (int j = 0; j < 10; ++j)
+// CHECK-NEXT: foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute dist_schedule(static,a)
+  for (int i = 0; i < 10; ++i)
+  for (int j = 0; j < 10; ++j)foo();
+// CHECK-NEXT: #pragma omp target
+// CHECK-NEXT: #pragma omp teams
+// CHECK-NEXT: #pragma omp distribute dist_schedule(static, a)
+// CHECK-NEXT: for (int i = 0; i < 10; ++i)
+// CHECK-NEXT: for (int j = 0; j < 10; ++j)
+// CHECK-NEXT: foo();
+  for (int i = 0; i < 10; ++i)foo();
+// CHECK-NEXT: for (int i = 0; i < 10; ++i)
+// CHECK-NEXT: foo();
+#pragma omp distribute
+// CHECK: #pragma omp distribute
+  for (int i = 0; i < 10; ++i)foo();
+// CHECK-NEXT: for (int i = 0; i < 10; ++i)
+// CHECK-NEXT: foo();
+  return (tmain<int, 5>(argc) + tmain<char, 1>(argv[0][0]));
+}
+
+#endif
diff --git a/test/OpenMP/distribute_dist_schedule_messages.cpp b/test/OpenMP/distribute_dist_schedule_messages.cpp
new file mode 100644
index 0000000000000..98652a20034ea
--- /dev/null
+++ b/test/OpenMP/distribute_dist_schedule_messages.cpp
@@ -0,0 +1,63 @@
+// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify -fopenmp -ferror-limit 100 -o - %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}} expected-note {{declared here}}
+
+template <class T, int N>
+T tmain(T argc) {
+  T b = argc, c, d, e, f, g;
+  char ** argv;
+  static T a;
+// CHECK: static T a;
+  #pragma omp distribute dist_schedule // expected-error {{expected '(' after 'dist_schedule'}}
+  for (int i = 0; i < 10; ++i) foo();
+  #pragma omp distribute dist_schedule ( // expected-error {{expected 'static' in OpenMP clause 'dist_schedule'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i) foo();
+  #pragma omp distribute dist_schedule () // expected-error {{expected 'static' in OpenMP clause 'dist_schedule'}}
+  for (int i = 0; i < 10; ++i) foo();
+  #pragma omp distribute dist_schedule (static // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i) foo();
+  #pragma omp distribute dist_schedule (static, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i) foo();
+  #pragma omp distribute dist_schedule (argc)) // expected-error {{expected 'static' in OpenMP clause 'dist_schedule'}} expected-warning {{extra tokens at the end of '#pragma omp distribute' are ignored}}
+  for (int i = 0; i < 10; ++i) foo();
+  #pragma omp distribute dist_schedule (static, argc > 0 ? argv[1] : argv[2]) // expected-error2 {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  for (int i = 0; i < 10; ++i) foo();
+  #pragma omp distribute dist_schedule (static), dist_schedule (static, 1) // expected-error {{directive '#pragma omp distribute' cannot contain more than one 'dist_schedule' clause}}
+  for (int i = 0; i < 10; ++i) foo();
+  #pragma omp distribute dist_schedule (static, S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 0; i < 10; ++i) foo();
+  #pragma omp distribute dist_schedule (static, argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error3 {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  for (int i = 0; i < 10; ++i) foo();
+  return T();
+}
+
+int main(int argc, char **argv) {
+  #pragma omp distribute dist_schedule // expected-error {{expected '(' after 'dist_schedule'}}
+  for (int i = 0; i < 10; ++i) foo();
+  #pragma omp distribute dist_schedule ( // expected-error {{expected 'static' in OpenMP clause 'dist_schedule'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i) foo();
+  #pragma omp distribute dist_schedule () // expected-error {{expected 'static' in OpenMP clause 'dist_schedule'}}
+  for (int i = 0; i < 10; ++i) foo();
+  #pragma omp distribute dist_schedule (static // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i) foo();
+  #pragma omp distribute dist_schedule (static, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i) foo();
+  #pragma omp distribute dist_schedule (argc)) // expected-error {{expected 'static' in OpenMP clause 'dist_schedule'}} expected-warning {{extra tokens at the end of '#pragma omp distribute' are ignored}}
+  for (int i = 0; i < 10; ++i) foo();
+  #pragma omp distribute dist_schedule (static, argc > 0 ? argv[1] : argv[2]) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  for (int i = 0; i < 10; ++i) foo();
+  #pragma omp distribute dist_schedule (static), dist_schedule (static, 1) // expected-error {{directive '#pragma omp distribute' cannot contain more than one 'dist_schedule' clause}}
+  for (int i = 0; i < 10; ++i) foo();
+  #pragma omp distribute dist_schedule (static, S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 0; i < 10; ++i) foo();
+  #pragma omp distribute dist_schedule (static, argv[1]=2) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i) foo();
+  return (tmain<int, 5>(argc) + tmain<char, 1>(argv[0][0])); // expected-note {{in instantiation of function template specialization 'tmain<int, 5>' requested here}} expected-note {{in instantiation of function template specialization 'tmain<char, 1>' requested here}}
+}
diff --git a/test/OpenMP/distribute_parallel_for_ast_print.cpp b/test/OpenMP/distribute_parallel_for_ast_print.cpp
new file mode 100644
index 0000000000000..993cc2a7151f7
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_ast_print.cpp
@@ -0,0 +1,139 @@
+// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+struct S {
+  S(): a(0) {}
+  S(int v) : a(v) {}
+  int a;
+  typedef int type;
+};
+
+template <typename T>
+class S7 : public T {
+protected:
+  T a;
+  S7() : a(0) {}
+
+public:
+  S7(typename T::type v) : a(v) {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(a) private(this->a) private(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S7 &operator=(S7 &s) {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(a) private(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+// CHECK: #pragma omp distribute parallel for private(this->a) private(this->a) private(this->S::a)
+// CHECK: #pragma omp distribute parallel for private(this->a) private(this->a) private(T::a)
+// CHECK: #pragma omp distribute parallel for private(this->a) private(this->a)
+
+class S8 : public S7<S> {
+  S8() {}
+
+public:
+  S8(int v) : S7<S>(v){
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(a) private(this->a) private(S7<S>::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S8 &operator=(S8 &s) {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(a) private(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+// CHECK: #pragma omp distribute parallel for private(this->a) private(this->a) private(this->S7<S>::a)
+// CHECK: #pragma omp distribute parallel for private(this->a) private(this->a)
+
+template <class T, int N>
+T tmain(T argc) {
+  T b = argc, c, d, e, f, h;
+  static T a;
+// CHECK: static T a;
+  static T g;
+#pragma omp threadprivate(g)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for dist_schedule(static, a) schedule(dynamic) default(none) copyin(g) firstprivate(a)
+  // CHECK: #pragma omp distribute parallel for dist_schedule(static, a) schedule(dynamic) default(none) copyin(g)
+  for (int i = 0; i < 2; ++i)
+    a = 2;
+// CHECK-NEXT: for (int i = 0; i < 2; ++i)
+// CHECK-NEXT: a = 2;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(argc, b), firstprivate(c, d), lastprivate(d, f) collapse(N) schedule(static, N) if (parallel :argc) num_threads(N) default(shared) shared(e) reduction(+ : h) dist_schedule(static,N)
+  for (int i = 0; i < 2; ++i)
+    for (int j = 0; j < 2; ++j)
+      for (int j = 0; j < 2; ++j)
+        for (int j = 0; j < 2; ++j)
+          for (int j = 0; j < 2; ++j)
+  for (int i = 0; i < 2; ++i)
+    for (int j = 0; j < 2; ++j)
+      for (int j = 0; j < 2; ++j)
+        for (int j = 0; j < 2; ++j)
+          for (int j = 0; j < 2; ++j)
+	    a++;
+  // CHECK: #pragma omp distribute parallel for private(argc,b) firstprivate(c,d) lastprivate(d,f) collapse(N) schedule(static, N) if(parallel: argc) num_threads(N) default(shared) shared(e) reduction(+: h) dist_schedule(static, N)
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: a++;
+  return T();
+}
+
+int main(int argc, char **argv) {
+  int b = argc, c, d, e, f, h;
+  static int a;
+// CHECK: static int a;
+  static float g;
+#pragma omp threadprivate(g)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule(guided, argc) default(none) copyin(g) dist_schedule(static, a) private(a)
+  // CHECK: #pragma omp distribute parallel for schedule(guided, argc) default(none) copyin(g) dist_schedule(static, a) private(a)
+  for (int i = 0; i < 2; ++i)
+    a = 2;
+// CHECK-NEXT: for (int i = 0; i < 2; ++i)
+// CHECK-NEXT: a = 2;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(argc, b), firstprivate(argv, c), lastprivate(d, f) collapse(2) schedule(auto) if (argc) num_threads(a) default(shared) shared(e) reduction(+ : h) dist_schedule(static, b)
+  for (int i = 0; i < 10; ++i)
+    for (int j = 0; j < 10; ++j)
+      a++;
+  // CHECK: #pragma omp distribute parallel for private(argc,b) firstprivate(argv,c) lastprivate(d,f) collapse(2) schedule(auto) if(argc) num_threads(a) default(shared) shared(e) reduction(+: h) dist_schedule(static, b)
+ // CHECK-NEXT: for (int i = 0; i < 10; ++i)
+  // CHECK-NEXT: for (int j = 0; j < 10; ++j)
+  // CHECK-NEXT: a++;
+  return (tmain<int, 5>(argc) + tmain<char, 1>(argv[0][0]));
+}
+
+#endif
diff --git a/test/OpenMP/distribute_parallel_for_collapse_messages.cpp b/test/OpenMP/distribute_parallel_for_collapse_messages.cpp
new file mode 100644
index 0000000000000..41976a69ee2c9
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_collapse_messages.cpp
@@ -0,0 +1,154 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
+
+void foo() {
+}
+
+#if __cplusplus >= 201103L
+// expected-note@+2 4 {{declared here}}
+#endif
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, typename S, int N, int ST> // expected-note {{declared here}}
+T tmain(T argc, S **argv) { //expected-note 2 {{declared here}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for collapse // expected-error {{expected '(' after 'collapse'}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for collapse ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for collapse () // expected-error {{expected expression}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+  // expected-error@+3 {{expected ')'}} expected-note@+3 {{to match this '('}}
+  // expected-error@+2 2 {{expression is not an integral constant expression}}
+  // expected-note@+1 2 {{read of non-const variable 'argc' is not allowed in a constant expression}}
+#pragma omp distribute parallel for collapse (argc 
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+  // expected-error@+1 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
+#pragma omp distribute parallel for collapse (ST // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for collapse (1)) // expected-warning {{extra tokens at the end of '#pragma omp distribute parallel for' are ignored}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for collapse ((ST > 0) ? 1 + ST : 2) // expected-note 2 {{as specified in 'collapse' clause}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST]; // expected-error 2 {{expected 2 for loops after '#pragma omp distribute parallel for', but found only 1}}
+  // expected-error@+8 2 {{directive '#pragma omp distribute parallel for' cannot contain more than one 'collapse' clause}}
+  // expected-error@+7 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
+  // expected-error@+6 2 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+4 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for collapse (foobool(argc)), collapse (true), collapse (-5)
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for collapse (S) // expected-error {{'S' does not refer to a value}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#if __cplusplus <= 199711L
+  // expected-error@+6 2 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+4 2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for collapse (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for collapse (1)
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for collapse (N) // expected-error {{argument to 'collapse' clause must be a strictly positive integer value}}
+  for (T i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for collapse (2) // expected-note {{as specified in 'collapse' clause}}
+  foo(); // expected-error {{expected 2 for loops after '#pragma omp distribute parallel for'}}
+  return argc;
+}
+
+int main(int argc, char **argv) {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for collapse // expected-error {{expected '(' after 'collapse'}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for collapse ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for collapse () // expected-error {{expected expression}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for collapse (4 // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-note {{as specified in 'collapse' clause}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; // expected-error {{expected 4 for loops after '#pragma omp distribute parallel for', but found only 1}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for collapse (2+2)) // expected-warning {{extra tokens at the end of '#pragma omp distribute parallel for' are ignored}}  expected-note {{as specified in 'collapse' clause}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; // expected-error {{expected 4 for loops after '#pragma omp distribute parallel for', but found only 1}}
+  // expected-error@+6 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for collapse (foobool(1) > 0 ? 1 : 2)
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  // expected-error@+8 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+6{{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+  // expected-error@+4 2 {{directive '#pragma omp distribute parallel for' cannot contain more than one 'collapse' clause}}
+  // expected-error@+3 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for collapse (foobool(argc)), collapse (true), collapse (-5) 
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for collapse (S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+#if __cplusplus <= 199711L
+  // expected-error@+6 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+4 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for collapse (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  // expected-error@+5 {{statement after '#pragma omp distribute parallel for' must be a for loop}}
+  // expected-note@+3 {{in instantiation of function template specialization 'tmain<int, char, -1, -2>' requested here}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for collapse(collapse(tmain<int, char, -1, -2>(argc, argv) // expected-error 2 {{expected ')'}} expected-note 2 {{to match this '('}}
+  foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for collapse (2) // expected-note {{as specified in 'collapse' clause}}
+  foo(); // expected-error {{expected 2 for loops after '#pragma omp distribute parallel for'}}
+  // expected-note@+1 {{in instantiation of function template specialization 'tmain<int, char, 1, 0>' requested here}}
+  return tmain<int, char, 1, 0>(argc, argv);
+}
+
diff --git a/test/OpenMP/distribute_parallel_for_copyin_messages.cpp b/test/OpenMP/distribute_parallel_for_copyin_messages.cpp
new file mode 100644
index 0000000000000..7d703413488df
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_copyin_messages.cpp
@@ -0,0 +1,190 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -o - %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note 2 {{declared here}}
+class S2 {
+  mutable int a;
+
+public:
+  S2() : a(0) {}
+  S2 &operator=(S2 &s2) { return *this; }
+};
+class S3 {
+  int a;
+
+public:
+  S3() : a(0) {}
+  S3 &operator=(S3 &s3) { return *this; }
+};
+class S4 {
+  int a;
+  S4();
+  S4 &operator=(const S4 &s4); // expected-note 3 {{implicitly declared private here}}
+
+public:
+  S4(int v) : a(v) {}
+};
+class S5 {
+  int a;
+  S5() : a(0) {}
+  S5 &operator=(const S5 &s5) { return *this; } // expected-note 3 {{implicitly declared private here}}
+
+public:
+  S5(int v) : a(v) {}
+};
+template <class T>
+class ST {
+public:
+  static T s;
+};
+
+S2 k;
+S3 h;
+S4 l(3);
+S5 m(4);
+#pragma omp threadprivate(h, k, l, m)
+
+namespace A {
+double x;
+#pragma omp threadprivate(x)
+}
+namespace B {
+using A::x;
+}
+
+template <class T, typename S, int N>
+T tmain(T argc, S **argv) {
+  T i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for copyin // expected-error {{expected '(' after 'copyin'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for copyin( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for copyin() // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for copyin(k // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for copyin(h, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for copyin(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for copyin(l) // expected-error 2 {{'operator=' is a private member of 'S4'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for copyin(S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for copyin(argv[1]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for copyin(i) // expected-error {{copyin variable must be threadprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for copyin(m) // expected-error 2 {{'operator=' is a private member of 'S5'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for copyin(ST<int>::s, B::x) // expected-error {{copyin variable must be threadprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+}
+
+int main(int argc, char **argv) {
+  int i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for copyin // expected-error {{expected '(' after 'copyin'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for copyin( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for copyin() // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for copyin(k // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for copyin(h, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for copyin(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for copyin(l) // expected-error {{'operator=' is a private member of 'S4'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for copyin(S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for copyin(argv[1]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for copyin(i) // expected-error {{copyin variable must be threadprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for copyin(m) // expected-error {{'operator=' is a private member of 'S5'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for copyin(ST<int>::s, B::x) // expected-error {{copyin variable must be threadprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+
+  return tmain<int, char, 3>(argc, argv); // expected-note {{in instantiation of function template specialization 'tmain<int, char, 3>' requested here}}
+}
diff --git a/test/OpenMP/distribute_parallel_for_default_messages.cpp b/test/OpenMP/distribute_parallel_for_default_messages.cpp
new file mode 100644
index 0000000000000..3437bd55cf3fb
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_default_messages.cpp
@@ -0,0 +1,100 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -o - %s
+
+void foo();
+
+template <class T, int N>
+T tmain(T argc) {
+  int i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for default // expected-error {{expected '(' after 'default'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for default( // expected-error {{expected 'none' or 'shared' in OpenMP clause 'default'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for default() // expected-error {{expected 'none' or 'shared' in OpenMP clause 'default'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for default(none // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) // expected-error 2 {{variable 'argc' must have explicitly specified data sharing attributes}}
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for default(shared), default(shared) // expected-error {{directive '#pragma omp distribute parallel for' cannot contain more than one 'default' clause}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for default(x) // expected-error {{expected 'none' or 'shared' in OpenMP clause 'default'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for default(none)
+  for (i = 0; i < argc; ++i)  // expected-error 2 {{variable 'argc' must have explicitly specified data sharing attributes}}
+    foo();
+
+#pragma omp parallel default(none)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for default(shared)
+  for (i = 0; i < argc; ++i)
+    foo();
+
+  return T();
+}
+
+int main(int argc, char **argv) {
+  int i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for default // expected-error {{expected '(' after 'default'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for default( // expected-error {{expected 'none' or 'shared' in OpenMP clause 'default'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for default() // expected-error {{expected 'none' or 'shared' in OpenMP clause 'default'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for default(none // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) // expected-error {{variable 'argc' must have explicitly specified data sharing attributes}}
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for default(shared), default(shared) // expected-error {{directive '#pragma omp distribute parallel for' cannot contain more than one 'default' clause}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for default(x) // expected-error {{expected 'none' or 'shared' in OpenMP clause 'default'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for default(none)
+  for (i = 0; i < argc; ++i)  // expected-error {{variable 'argc' must have explicitly specified data sharing attributes}}
+    foo();
+
+#pragma omp parallel default(none)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for default(shared)
+  for (i = 0; i < argc; ++i)
+    foo();
+
+  return (tmain<int, 5>(argc) + tmain<char, 1>(argv[0][0])); // expected-note {{in instantiation of function template specialization 'tmain<int, 5>' requested here}} expected-note {{in instantiation of function template specialization 'tmain<char, 1>' requested here}}
+}
diff --git a/test/OpenMP/distribute_parallel_for_dist_schedule_messages.cpp b/test/OpenMP/distribute_parallel_for_dist_schedule_messages.cpp
new file mode 100644
index 0000000000000..0f5820ed4bbb8
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_dist_schedule_messages.cpp
@@ -0,0 +1,103 @@
+// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify -fopenmp -ferror-limit 100 -o - %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}} expected-note {{declared here}}
+
+template <class T, int N>
+T tmain(T argc) {
+  T b = argc, c, d, e, f, g;
+  char ** argv;
+  static T a;
+// CHECK: static T a;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for dist_schedule // expected-error {{expected '(' after 'dist_schedule'}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for dist_schedule ( // expected-error {{expected 'static' in OpenMP clause 'dist_schedule'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for dist_schedule () // expected-error {{expected 'static' in OpenMP clause 'dist_schedule'}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for dist_schedule (static // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for dist_schedule (static, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for dist_schedule (argc)) // expected-error {{expected 'static' in OpenMP clause 'dist_schedule'}} expected-warning {{extra tokens at the end of '#pragma omp distribute parallel for' are ignored}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for dist_schedule (static, argc > 0 ? argv[1] : argv[2]) // expected-error2 {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for dist_schedule (static), dist_schedule (static, 1) // expected-error {{directive '#pragma omp distribute parallel for' cannot contain more than one 'dist_schedule' clause}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for dist_schedule (static, S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for dist_schedule (static, argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error3 {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  for (int i = 0; i < 10; ++i) foo();
+  return T();
+}
+
+int main(int argc, char **argv) {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for dist_schedule // expected-error {{expected '(' after 'dist_schedule'}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for dist_schedule ( // expected-error {{expected 'static' in OpenMP clause 'dist_schedule'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for dist_schedule () // expected-error {{expected 'static' in OpenMP clause 'dist_schedule'}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for dist_schedule (static // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for dist_schedule (static, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for dist_schedule (argc)) // expected-error {{expected 'static' in OpenMP clause 'dist_schedule'}} expected-warning {{extra tokens at the end of '#pragma omp distribute parallel for' are ignored}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for dist_schedule (static, argc > 0 ? argv[1] : argv[2]) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for dist_schedule (static), dist_schedule (static, 1) // expected-error {{directive '#pragma omp distribute parallel for' cannot contain more than one 'dist_schedule' clause}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for dist_schedule (static, S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for dist_schedule (static, argv[1]=2) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i) foo();
+  return (tmain<int, 5>(argc) + tmain<char, 1>(argv[0][0])); // expected-note {{in instantiation of function template specialization 'tmain<int, 5>' requested here}} expected-note {{in instantiation of function template specialization 'tmain<char, 1>' requested here}}
+}
diff --git a/test/OpenMP/distribute_parallel_for_firstprivate_messages.cpp b/test/OpenMP/distribute_parallel_for_firstprivate_messages.cpp
new file mode 100644
index 0000000000000..3e288c37d42ce
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_firstprivate_messages.cpp
@@ -0,0 +1,359 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note 2 {{declared here}} expected-note 2 {{forward declaration of 'S1'}}
+extern S1 a;
+class S2 {
+  mutable int a;
+
+public:
+  S2() : a(0) {}
+  S2(const S2 &s2) : a(s2.a) {}
+  static float S2s;
+  static const float S2sc;
+};
+const float S2::S2sc = 0;
+const S2 b;
+const S2 ba[5];
+class S3 {
+  int a;
+  S3 &operator=(const S3 &s3);
+
+public:
+  S3() : a(0) {}
+  S3(const S3 &s3) : a(s3.a) {}
+};
+const S3 c;
+const S3 ca[5];
+extern const int f;
+class S4 {
+  int a;
+  S4();
+  S4(const S4 &s4); // expected-note 2 {{implicitly declared private here}}
+
+public:
+  S4(int v) : a(v) {}
+};
+class S5 {
+  int a;
+  S5(const S5 &s5) : a(s5.a) {} // expected-note 4 {{implicitly declared private here}}
+
+public:
+  S5() : a(0) {}
+  S5(int v) : a(v) {}
+};
+class S6 {
+  int a;
+  S6() : a(0) {}
+
+public:
+  S6(const S6 &s6) : a(s6.a) {}
+  S6(int v) : a(v) {}
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class I, class C>
+int foomain(int argc, char **argv) {
+  I e(4);
+  C g(5);
+  int i;
+  int &j = i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate // expected-error {{expected '(' after 'firstprivate'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate() // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(argc)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(a, b) // expected-error {{firstprivate variable with incomplete type 'S1'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(h) // expected-error {{threadprivate or thread local variable cannot be firstprivate}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp parallel
+  {
+    int v = 0;
+    int i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(i)
+    for (int k = 0; k < argc; ++k) {
+      i = k;
+      v += i;
+    }
+  }
+#pragma omp parallel shared(i)
+#pragma omp parallel private(i)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(j)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(i)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(g) firstprivate(g) // expected-error {{calling a private constructor of class 'S5'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp parallel private(i)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(i) // expected-note {{defined as firstprivate}}
+  for (i = 0; i < argc; ++i) // expected-error {{loop iteration variable in the associated loop of 'omp distribute parallel for' directive may not be firstprivate, predetermined as private}}
+    foo();
+#pragma omp parallel reduction(+ : i)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(i) // expected-note {{defined as firstprivate}}
+  for (i = 0; i < argc; ++i) // expected-error {{loop iteration variable in the associated loop of 'omp distribute parallel for' directive may not be firstprivate, predetermined as private}}
+    foo();
+  return 0;
+}
+
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
+int main(int argc, char **argv) {
+  const int d = 5;
+  const int da[5] = {0};
+  S4 e(4);
+  S5 g(5);
+  S3 m;
+  S6 n(2);
+  int i;
+  int &j = i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate // expected-error {{expected '(' after 'firstprivate'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate() // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(argc)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(a, b, c, d, f) // expected-error {{firstprivate variable with incomplete type 'S1'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(argv[1]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(2 * 2) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(ba) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(ca) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(da) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+  int xa;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(xa) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(S2::S2s) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(S2::S2sc) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for safelen(5) // expected-error {{unexpected OpenMP clause 'safelen' in directive '#pragma omp distribute parallel for'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(m) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be firstprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(xa), firstprivate(xa) // expected-error {{private variable cannot be firstprivate}} expected-note {{defined as private}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(i) // expected-note {{defined as firstprivate}}
+  for (i = 0; i < argc; ++i)    // expected-error {{loop iteration variable in the associated loop of 'omp distribute parallel for' directive may not be firstprivate, predetermined as private}}
+    foo();
+#pragma omp parallel shared(xa)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(xa) // OK: may be firstprivate
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(j)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(g) firstprivate(g) // expected-error {{calling a private constructor of class 'S5'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(n) firstprivate(n) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp parallel
+  {
+    int v = 0;
+    int i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(i)
+    for (int k = 0; k < argc; ++k) {
+      i = k;
+      v += i;
+    }
+  }
+#pragma omp parallel private(i)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(i) // expected-note {{defined as firstprivate}}
+  for (i = 0; i < argc; ++i) // expected-error {{loop iteration variable in the associated loop of 'omp distribute parallel for' directive may not be firstprivate, predetermined as private}}
+    foo();
+#pragma omp parallel reduction(+ : i)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(i) // expected-note {{defined as firstprivate}}
+  for (i = 0; i < argc; ++i) // expected-error {{loop iteration variable in the associated loop of 'omp distribute parallel for' directive may not be firstprivate, predetermined as private}}
+    foo();
+  static int si;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(si) // OK
+  for (i = 0; i < argc; ++i)
+    si = i + 1;
+
+  return foomain<S4, S5>(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<S4, S5>' requested here}}
+}
diff --git a/test/OpenMP/distribute_parallel_for_if_messages.cpp b/test/OpenMP/distribute_parallel_for_if_messages.cpp
new file mode 100644
index 0000000000000..c864340a46bfb
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_if_messages.cpp
@@ -0,0 +1,179 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, class S> // expected-note {{declared here}}
+int tmain(T argc, S **argv) {
+  T i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if // expected-error {{expected '(' after 'if'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if () // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if (argc)) // expected-warning {{extra tokens at the end of '#pragma omp distribute parallel for' are ignored}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if (argc > 0 ? argv[1] : argv[2])
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if (foobool(argc)), if (true) // expected-error {{directive '#pragma omp distribute parallel for' cannot contain more than one 'if' clause}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if (S) // expected-error {{'S' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if (argc argc) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if(argc)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if(parallel // expected-warning {{missing ':' after directive name modifier - ignoring}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if(parallel : // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if(parallel : argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if(parallel : argc)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if(parallel : argc) if (for:argc) // expected-error {{directive name modifier 'for' is not allowed for '#pragma omp distribute parallel for'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if(parallel : argc) if (parallel:argc) // expected-error {{directive '#pragma omp distribute parallel for' cannot contain more than one 'if' clause with 'parallel' name modifier}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if(parallel : argc) if (argc) // expected-error {{no more 'if' clause is allowed}} expected-note {{previous clause with directive name modifier specified here}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if(distribute : argc) // expected-error {{directive name modifier 'distribute' is not allowed for '#pragma omp distribute parallel for'}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return 0;
+}
+
+int main(int argc, char **argv) {
+  int i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if // expected-error {{expected '(' after 'if'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if () // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if (argc)) // expected-warning {{extra tokens at the end of '#pragma omp distribute parallel for' are ignored}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if (argc > 0 ? argv[1] : argv[2])
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if (foobool(argc)), if (true) // expected-error {{directive '#pragma omp distribute parallel for' cannot contain more than one 'if' clause}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if (S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if (argc argc) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if (1 0) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if(if(tmain(argc, argv) // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if(parallel // expected-warning {{missing ':' after directive name modifier - ignoring}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if(parallel : // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if(parallel : argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if(parallel : argc)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if(parallel : argc) if (for:argc) // expected-error {{directive name modifier 'for' is not allowed for '#pragma omp distribute parallel for'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if(parallel : argc) if (parallel:argc) // expected-error {{directive '#pragma omp distribute parallel for' cannot contain more than one 'if' clause with 'parallel' name modifier}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if(parallel : argc) if (argc) // expected-error {{no more 'if' clause is allowed}} expected-note {{previous clause with directive name modifier specified here}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for if(distribute : argc) // expected-error {{directive name modifier 'distribute' is not allowed for '#pragma omp distribute parallel for'}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return tmain(argc, argv);
+}
diff --git a/test/OpenMP/distribute_parallel_for_lastprivate_messages.cpp b/test/OpenMP/distribute_parallel_for_lastprivate_messages.cpp
new file mode 100644
index 0000000000000..745007fc48a8d
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_lastprivate_messages.cpp
@@ -0,0 +1,333 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note 2 {{declared here}} expected-note 2 {{forward declaration of 'S1'}}
+extern S1 a;
+class S2 {
+  mutable int a;
+
+public:
+  S2() : a(0) {}
+  S2(S2 &s2) : a(s2.a) {}
+  const S2 &operator =(const S2&) const;
+  S2 &operator =(const S2&);
+  static float S2s; // expected-note {{static data member is predetermined as shared}}
+  static const float S2sc;
+};
+const float S2::S2sc = 0; // expected-note {{static data member is predetermined as shared}}
+const S2 b;
+const S2 ba[5];
+class S3 {
+  int a;
+  S3 &operator=(const S3 &s3); // expected-note 2 {{implicitly declared private here}}
+
+public:
+  S3() : a(0) {}
+  S3(S3 &s3) : a(s3.a) {}
+};
+const S3 c;         // expected-note {{global variable is predetermined as shared}}
+const S3 ca[5];     // expected-note {{global variable is predetermined as shared}}
+extern const int f; // expected-note {{global variable is predetermined as shared}}
+class S4 {
+  int a;
+  S4();             // expected-note 3 {{implicitly declared private here}}
+  S4(const S4 &s4);
+
+public:
+  S4(int v) : a(v) {}
+};
+class S5 {
+  int a;
+  S5() : a(0) {} // expected-note {{implicitly declared private here}}
+
+public:
+  S5(const S5 &s5) : a(s5.a) {}
+  S5(int v) : a(v) {}
+};
+class S6 {
+  int a;
+  S6() : a(0) {}
+
+public:
+  S6(const S6 &s6) : a(s6.a) {}
+  S6(int v) : a(v) {}
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class I, class C>
+int foomain(int argc, char **argv) {
+  I e(4);
+  I g(5);
+  int i;
+  int &j = i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate // expected-error {{expected '(' after 'lastprivate'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate() // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(argc)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(a, b) // expected-error {{lastprivate variable with incomplete type 'S1'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(e, g) // expected-error 2 {{calling a private constructor of class 'S4'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(h) // expected-error {{threadprivate or thread local variable cannot be lastprivate}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+
+  int v = 0;
+#pragma omp target
+#pragma omp teams
+  {
+#pragma omp distribute parallel for lastprivate(i)
+    for (int k = 0; k < argc; ++k) {
+      i = k;
+      v += i;
+    }
+  }
+#pragma omp target
+#pragma omp teams private(i)
+#pragma omp distribute parallel for lastprivate(j)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(i)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+  return 0;
+}
+
+void bar(S4 a[2]) {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(a)
+  for (int i = 0; i < 2; ++i)
+    foo();
+}
+
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
+int main(int argc, char **argv) {
+  const int d = 5;       // expected-note {{constant variable is predetermined as shared}}
+  const int da[5] = {0}; // expected-note {{constant variable is predetermined as shared}}
+  S4 e(4);
+  S5 g(5);
+  S3 m;
+  S6 n(2);
+  int i;
+  int &j = i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate // expected-error {{expected '(' after 'lastprivate'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate() // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(argc)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(a, b, c, d, f) // expected-error {{lastprivate variable with incomplete type 'S1'}} expected-error 3 {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(argv[1]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(2 * 2) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(ba)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(ca) // expected-error {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(da) // expected-error {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+  int xa;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(xa) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(S2::S2s) // expected-error {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(S2::S2sc) // expected-error {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for safelen(5) // expected-error {{unexpected OpenMP clause 'safelen' in directive '#pragma omp distribute parallel for'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(m) // expected-error {{'operator=' is a private member of 'S3'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(h) // expected-error {{threadprivate or thread local variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(B::x) // expected-error {{threadprivate or thread local variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(xa), lastprivate(xa) // expected-error {{private variable cannot be lastprivate}} expected-note {{defined as private}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(i)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(xa)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(xa)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(j)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(m) lastprivate(m) // expected-error {{'operator=' is a private member of 'S3'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(n) firstprivate(n) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+  static int si;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for lastprivate(si) // OK
+  for (i = 0; i < argc; ++i)
+    si = i + 1;
+  return foomain<S4, S5>(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<S4, S5>' requested here}}
+}
diff --git a/test/OpenMP/distribute_parallel_for_num_threads_messages.cpp b/test/OpenMP/distribute_parallel_for_num_threads_messages.cpp
new file mode 100644
index 0000000000000..7939514249a07
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_num_threads_messages.cpp
@@ -0,0 +1,107 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, typename S, int N> // expected-note {{declared here}}
+T tmain(T argc, S **argv) {
+  T i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for num_threads // expected-error {{expected '(' after 'num_threads'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for num_threads ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for num_threads () // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for num_threads (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for num_threads (argc)) // expected-warning {{extra tokens at the end of '#pragma omp distribute parallel for' are ignored}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for num_threads ((argc > 0) ? argv[1] : argv[2]) // expected-error 2 {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for num_threads (foobool(argc)), num_threads (true), num_threads (-5) // expected-error 2 {{directive '#pragma omp distribute parallel for' cannot contain more than one 'num_threads' clause}} expected-error {{argument to 'num_threads' clause must be a strictly positive integer value}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for num_threads (S) // expected-error {{'S' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for num_threads (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error 2 {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for num_threads (argc)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for num_threads (N) // expected-error {{argument to 'num_threads' clause must be a strictly positive integer value}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return argc;
+}
+
+int main(int argc, char **argv) {
+  int i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for num_threads // expected-error {{expected '(' after 'num_threads'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for num_threads ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for num_threads () // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for num_threads (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for num_threads (argc)) // expected-warning {{extra tokens at the end of '#pragma omp distribute parallel for' are ignored}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for num_threads (argc > 0 ? argv[1] : argv[2]) // expected-error {{integral }}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for num_threads (foobool(argc)), num_threads (true), num_threads (-5) // expected-error 2 {{directive '#pragma omp distribute parallel for' cannot contain more than one 'num_threads' clause}} expected-error {{argument to 'num_threads' clause must be a strictly positive integer value}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for num_threads (S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for num_threads (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for num_threads (num_threads(tmain<int, char, -1>(argc, argv) // expected-error 2 {{expected ')'}} expected-note 2 {{to match this '('}} expected-note {{in instantiation of function template specialization 'tmain<int, char, -1>' requested here}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return tmain<int, char, 3>(argc, argv); // expected-note {{in instantiation of function template specialization 'tmain<int, char, 3>' requested here}}
+}
diff --git a/test/OpenMP/distribute_parallel_for_private_messages.cpp b/test/OpenMP/distribute_parallel_for_private_messages.cpp
new file mode 100644
index 0000000000000..465357a433886
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_private_messages.cpp
@@ -0,0 +1,315 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note 2 {{declared here}} expected-note 2 {{forward declaration of 'S1'}}
+extern S1 a;
+class S2 {
+  mutable int a;
+
+public:
+  S2() : a(0) {}
+};
+const S2 b;
+const S2 ba[5];
+class S3 {
+  int a;
+
+public:
+  S3() : a(0) {}
+};
+const S3 ca[5];
+class S4 {
+  int a;
+  S4(); // expected-note {{implicitly declared private here}}
+
+public:
+  S4(int v) : a(v) {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(a) private(this->a)
+    for (int k = 0; k < v; ++k)
+      ++this->a;
+  }
+};
+class S5 {
+  int a;
+  S5() : a(0) {} // expected-note {{implicitly declared private here}}
+
+public:
+  S5(int v) : a(v) {}
+  S5 &operator=(S5 &s) {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T>
+class S6 {
+public:
+  T a;
+
+  S6() : a(0) {}
+  S6(T v) : a(v) {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(a) private(this->a)
+    for (int k = 0; k < v; ++k)
+      ++this->a;
+  }
+  S6 &operator=(S6 &s) {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T>
+class S7 : public T {
+  T a;
+  S7() : a(0) {}
+
+public:
+  S7(T v) : a(v) {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(a) private(this->a) private(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S7 &operator=(S7 &s) {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(a) private(this->a) private(s.a) private(s.T::a) // expected-error 2 {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class I, class C>
+int foomain(I argc, C **argv) {
+  I e(4);
+  I g(5);
+  int i;
+  int &j = i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private // expected-error {{expected '(' after 'private'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private() // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(argc)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(a, b) // expected-error {{private variable with incomplete type 'S1'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(e, g)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(h) // expected-error {{threadprivate or thread local variable cannot be private}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for nowait // expected-error {{unexpected OpenMP clause 'nowait' in directive '#pragma omp distribute parallel for'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp parallel
+  {
+    int v = 0;
+    int i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(i)
+    for (int k = 0; k < argc; ++k) {
+      i = k;
+      v += i;
+    }
+  }
+#pragma omp parallel shared(i)
+#pragma omp parallel private(i)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(j)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(i)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+  return 0;
+}
+
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
+int main(int argc, char **argv) {
+  S4 e(4);
+  S5 g(5);
+  S6<float> s6(0.0) , s6_0(1.0);
+  S7<S6<float> > s7(0.0) , s7_0(1.0);
+  int i;
+  int &j = i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private // expected-error {{expected '(' after 'private'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private() // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(argc)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(a, b) // expected-error {{private variable with incomplete type 'S1'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be private}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for nowait // expected-error {{unexpected OpenMP clause 'nowait' in directive '#pragma omp distribute parallel for'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp parallel
+  {
+    int i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(i)
+    for (int k = 0; k < argc; ++k)
+      ++k;
+  }
+#pragma omp parallel shared(i)
+#pragma omp parallel private(i)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(j)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(i)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+  static int m;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(m)
+  for (int k = 0; k < argc; ++k)
+    m = k + 2;
+
+  s6 = s6_0; // expected-note {{in instantiation of member function 'S6<float>::operator=' requested here}}
+  s7 = s7_0; // expected-note {{in instantiation of member function 'S7<S6<float> >::operator=' requested here}}
+  return foomain(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<int, char>' requested here}}
+}
+
diff --git a/test/OpenMP/distribute_parallel_for_proc_bind_messages.cpp b/test/OpenMP/distribute_parallel_for_proc_bind_messages.cpp
new file mode 100644
index 0000000000000..9898f9dc65b69
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_proc_bind_messages.cpp
@@ -0,0 +1,101 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -o - %s
+
+void foo();
+
+template <class T, typename S, int N>
+T tmain(T argc, S **argv) {
+  T i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for proc_bind // expected-error {{expected '(' after 'proc_bind'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for proc_bind( // expected-error {{expected 'master', 'close' or 'spread' in OpenMP clause 'proc_bind'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for proc_bind() // expected-error {{expected 'master', 'close' or 'spread' in OpenMP clause 'proc_bind'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for proc_bind(master // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for proc_bind(close), proc_bind(spread) // expected-error {{directive '#pragma omp distribute parallel for' cannot contain more than one 'proc_bind' clause}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for proc_bind(x) // expected-error {{expected 'master', 'close' or 'spread' in OpenMP clause 'proc_bind'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for proc_bind(master)
+  for (i = 0; i < argc; ++i)
+    foo();
+
+#pragma omp parallel proc_bind(close)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for proc_bind(spread)
+  for (i = 0; i < argc; ++i)
+    foo();
+
+  return T();
+}
+
+int main(int argc, char **argv) {
+  int i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for proc_bind // expected-error {{expected '(' after 'proc_bind'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for proc_bind( // expected-error {{expected 'master', 'close' or 'spread' in OpenMP clause 'proc_bind'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for proc_bind() // expected-error {{expected 'master', 'close' or 'spread' in OpenMP clause 'proc_bind'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for proc_bind(master // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for proc_bind(close), proc_bind(spread) // expected-error {{directive '#pragma omp distribute parallel for' cannot contain more than one 'proc_bind' clause}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for proc_bind(x) // expected-error {{expected 'master', 'close' or 'spread' in OpenMP clause 'proc_bind'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for proc_bind(master)
+  for (i = 0; i < argc; ++i)
+    foo();
+
+#pragma omp parallel proc_bind(close)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for proc_bind(spread)
+  for (i = 0; i < argc; ++i)
+    foo();
+  return tmain<int, char, 3>(argc, argv);
+}
diff --git a/test/OpenMP/distribute_parallel_for_reduction_messages.cpp b/test/OpenMP/distribute_parallel_for_reduction_messages.cpp
new file mode 100644
index 0000000000000..f23a25e28c029
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_reduction_messages.cpp
@@ -0,0 +1,441 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 150 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 -ferror-limit 150 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 -ferror-limit 150 -o - %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}} expected-note 4 {{forward declaration of 'S1'}}
+extern S1 a;
+class S2 {
+  mutable int a;
+  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 3 {{implicitly declared private here}}
+
+public:
+  S2() : a(0) {}
+  S2(S2 &s2) : a(s2.a) {}
+  static float S2s; // expected-note 2 {{static data member is predetermined as shared}}
+  static const float S2sc;
+};
+const float S2::S2sc = 0; // expected-note 2 {{'S2sc' defined here}}
+S2 b;                     // expected-note 3 {{'b' defined here}}
+const S2 ba[5];           // expected-note 2 {{'ba' defined here}}
+class S3 {
+  int a;
+
+public:
+  int b;
+  S3() : a(0) {}
+  S3(const S3 &s3) : a(s3.a) {}
+  S3 operator+(const S3 &arg1) { return arg1; }
+};
+int operator+(const S3 &arg1, const S3 &arg2) { return 5; }
+S3 c;               // expected-note 3 {{'c' defined here}}
+const S3 ca[5];     // expected-note 2 {{'ca' defined here}}
+extern const int f; // expected-note 4 {{'f' declared here}}
+class S4 {
+  int a;
+  S4(); // expected-note {{implicitly declared private here}}
+  S4(const S4 &s4);
+  S4 &operator+(const S4 &arg) { return (*this); }
+
+public:
+  S4(int v) : a(v) {}
+};
+S4 &operator&=(S4 &arg1, S4 &arg2) { return arg1; }
+class S5 {
+  int a;
+  S5() : a(0) {} // expected-note {{implicitly declared private here}}
+  S5(const S5 &s5) : a(s5.a) {}
+  S5 &operator+(const S5 &arg);
+
+public:
+  S5(int v) : a(v) {}
+};
+class S6 { // expected-note 3 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
+#if __cplusplus >= 201103L // C++11 or later
+// expected-note@-2 3 {{candidate function (the implicit move assignment operator) not viable}}
+#endif
+  int a;
+
+public:
+  S6() : a(6) {}
+  operator int() { return 6; }
+} o;
+
+S3 h, k;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class T>       // expected-note {{declared here}}
+T tmain(T argc) {
+  const T d = T();       // expected-note 4 {{'d' defined here}}
+  const T da[5] = {T()}; // expected-note 2 {{'da' defined here}}
+  T qa[5] = {T()};
+  T i;
+  T &j = i;                        // expected-note 4 {{'j' defined here}}
+  S3 &p = k;                       // expected-note 2 {{'p' defined here}}
+  const T &r = da[(int)i];         // expected-note 2 {{'r' defined here}}
+  T &q = qa[(int)i];               // expected-note 2 {{'q' defined here}}
+  T fl;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction // expected-error {{expected '(' after 'reduction'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction + // expected-error {{expected '(' after 'reduction'}} expected-warning {{extra tokens at the end of '#pragma omp distribute parallel for' are ignored}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction( // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(- // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction() // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(*) // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(\) // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(& : argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(| : argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(|| : argc ? i : argc) // expected-error 2 {{expected variable name, array element or array section}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(foo : argc) //expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'float'}} expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'int'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(&& : argc)
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(^ : T) // expected-error {{'T' does not refer to a value}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified list item cannot be reduction}} expected-error 2 {{'operator+' is a private member of 'S2'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 4 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 3 {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(max : h.b) // expected-error {{expected variable name, array element or array section}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}} expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(&& : S2::S2s) // expected-error {{shared variable cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(&& : S2::S2sc) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(+ : h, k) // expected-error {{threadprivate or thread local variable cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(+ : o) // expected-error 2 {{no viable overloaded '='}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(i), reduction(+ : j), reduction(+ : q) // expected-error 4 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel private(k)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(+ : p), reduction(+ : p) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(+ : p), reduction(+ : p) // expected-error 2 {{variable can appear only once in OpenMP 'reduction' clause}} expected-note 2 {{previously referenced here}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(+ : r) // expected-error 2 {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel shared(i)
+#pragma omp parallel reduction(min : i)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(max : j) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel private(fl)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(+ : fl)
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel reduction(* : fl)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(+ : fl)
+  for (int i = 0; i < 10; ++i)
+    foo();
+
+  return T();
+}
+
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
+int main(int argc, char **argv) {
+  const int d = 5;       // expected-note 2 {{'d' defined here}}
+  const int da[5] = {0}; // expected-note {{'da' defined here}}
+  int qa[5] = {0};
+  S4 e(4);
+  S5 g(5);
+  int i;
+  int &j = i;                      // expected-note 2 {{'j' defined here}}
+  S3 &p = k;                       // expected-note 2 {{'p' defined here}}
+  const int &r = da[i];            // expected-note {{'r' defined here}}
+  int &q = qa[i];                  // expected-note {{'q' defined here}}
+  float fl;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction // expected-error {{expected '(' after 'reduction'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction + // expected-error {{expected '(' after 'reduction'}} expected-warning {{extra tokens at the end of '#pragma omp distribute parallel for' are ignored}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction( // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(- // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction() // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(*) // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(\) // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(foo : argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(| : argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(|| : argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name, array element or array section}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(~ : argc) // expected-error {{expected unqualified-id}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(&& : argc)
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(^ : S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{const-qualified list item cannot be reduction}} expected-error {{'operator+' is a private member of 'S2'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 2 {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(max : h.b) // expected-error {{expected variable name, array element or array section}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(&& : S2::S2s) // expected-error {{shared variable cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(&& : S2::S2sc) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(& : e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{invalid operands to binary expression ('S4' and 'S4')}} expected-error {{calling a private constructor of class 'S5'}} expected-error {{invalid operands to binary expression ('S5' and 'S5')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(+ : h, k, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(+ : o) // expected-error {{no viable overloaded '='}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(i), reduction(+ : j), reduction(+ : q) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel private(k)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(+ : p), reduction(+ : p) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(+ : p), reduction(+ : p) // expected-error {{variable can appear only once in OpenMP 'reduction' clause}} expected-note {{previously referenced here}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(+ : r) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel shared(i)
+#pragma omp parallel reduction(min : i)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(max : j) // expected-error {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel private(fl)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(+ : fl)
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel reduction(* : fl)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(+ : fl)
+  for (int i = 0; i < 10; ++i)
+    foo();
+  static int m;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for reduction(+ : m) // OK
+  for (int i = 0; i < 10; ++i)
+    m++;
+
+  return tmain(argc) + tmain(fl); // expected-note {{in instantiation of function template specialization 'tmain<int>' requested here}} expected-note {{in instantiation of function template specialization 'tmain<float>' requested here}}
+}
diff --git a/test/OpenMP/distribute_parallel_for_schedule_messages.cpp b/test/OpenMP/distribute_parallel_for_schedule_messages.cpp
new file mode 100644
index 0000000000000..6363cd7caef01
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_schedule_messages.cpp
@@ -0,0 +1,151 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, typename S, int N, int ST> // expected-note {{declared here}}
+T tmain(T argc, S **argv) {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule // expected-error {{expected '(' after 'schedule'}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule ( // expected-error {{expected 'static', 'dynamic', 'guided', 'auto', 'runtime', 'monotonic', 'nonmonotonic' or 'simd' in OpenMP clause 'schedule'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule () // expected-error {{expected 'static', 'dynamic', 'guided', 'auto', 'runtime', 'monotonic', 'nonmonotonic' or 'simd' in OpenMP clause 'schedule'}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule (auto // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule (auto_dynamic // expected-error {{expected 'static', 'dynamic', 'guided', 'auto', 'runtime', 'monotonic', 'nonmonotonic' or 'simd' in OpenMP clause 'schedule'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule (auto,  // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule (runtime, 3)  // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  // expected-error@+3 {{expected ')'}} expected-note@+3 {{to match this '('}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule (guided argc
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  // expected-error@+3 2 {{argument to 'schedule' clause must be a strictly positive integer value}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule (static, ST // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule (dynamic, 1)) // expected-warning {{extra tokens at the end of '#pragma omp distribute parallel for' are ignored}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule (guided, (ST > 0) ? 1 + ST : 2)
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  // expected-error@+4 2 {{directive '#pragma omp distribute parallel for' cannot contain more than one 'schedule' clause}}
+  // expected-error@+3 {{argument to 'schedule' clause must be a strictly positive integer value}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule (static, foobool(argc)), schedule (dynamic, true), schedule (guided, -5)
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule (static, S) // expected-error {{'S' does not refer to a value}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  // expected-error@+3 2 {{expression must have integral or unscoped enumeration type, not 'char *'}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule (guided, argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule (dynamic, 1)
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule (static, N) // expected-error {{argument to 'schedule' clause must be a strictly positive integer value}}
+  for (T i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  return argc;
+}
+
+int main(int argc, char **argv) {
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule // expected-error {{expected '(' after 'schedule'}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule ( // expected-error {{expected 'static', 'dynamic', 'guided', 'auto', 'runtime', 'monotonic', 'nonmonotonic' or 'simd' in OpenMP clause 'schedule'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule () // expected-error {{expected 'static', 'dynamic', 'guided', 'auto', 'runtime', 'monotonic', 'nonmonotonic' or 'simd' in OpenMP clause 'schedule'}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule (auto // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule (auto_dynamic // expected-error {{expected 'static', 'dynamic', 'guided', 'auto', 'runtime', 'monotonic', 'nonmonotonic' or 'simd' in OpenMP clause 'schedule'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule (auto,  // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule (runtime, 3)  // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule (guided, 4 // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule (static, 2+2)) // expected-warning {{extra tokens at the end of '#pragma omp distribute parallel for' are ignored}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule (dynamic, foobool(1) > 0 ? 1 : 2)
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  // expected-error@+4 2 {{directive '#pragma omp distribute parallel for' cannot contain more than one 'schedule' clause}}
+  // expected-error@+3 {{argument to 'schedule' clause must be a strictly positive integer value}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule (guided, foobool(argc)), schedule (static, true), schedule (dynamic, -5)
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule (guided, S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  // expected-error@+3 {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule (static, argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  // expected-error@+5 {{statement after '#pragma omp distribute parallel for' must be a for loop}}
+  // expected-note@+3 {{in instantiation of function template specialization 'tmain<int, char, -1, -2>' requested here}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for schedule(dynamic, schedule(tmain<int, char, -1, -2>(argc, argv) // expected-error 2 {{expected ')'}} expected-note 2 {{to match this '('}}
+  foo();
+  // expected-note@+1 {{in instantiation of function template specialization 'tmain<int, char, 1, 0>' requested here}}
+  return tmain<int, char, 1, 0>(argc, argv);
+}
+
diff --git a/test/OpenMP/distribute_parallel_for_shared_messages.cpp b/test/OpenMP/distribute_parallel_for_shared_messages.cpp
new file mode 100644
index 0000000000000..d5725e7f1b09d
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_shared_messages.cpp
@@ -0,0 +1,396 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
+
+
+struct S1; // expected-note 2 {{declared here}}
+extern S1 a;
+class S2 {
+  mutable int a;
+public:
+  S2():a(0) { }
+  S2(S2 &s2):a(s2.a) { }
+};
+const S2 b;
+const S2 ba[5];
+class S3 {
+  int a;
+public:
+  S3():a(0) { }
+  S3(S3 &s3):a(s3.a) { }
+};
+const S3 c;
+const S3 ca[5];
+extern const int f;
+class S4 {
+  int a;
+  S4();
+  S4(const S4 &s4);
+public:
+  S4(int v):a(v) { }
+};
+class S5 {
+  int a;
+  S5():a(0) {}
+  S5(const S5 &s5):a(s5.a) { }
+public:
+  S5(int v):a(v) { }
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note 2 {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
+template <class T, typename S, int N>
+T tmain(T argc, S **argv) {
+  const int d = 5;
+  const int da[5] = { 0 };
+  S4 e(4);
+  S5 g(5);
+  int i;
+  int &j = i;
+  int acc = 0;
+  int n = 1000;
+  
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared // expected-error {{expected '(' after 'shared'}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared () // expected-error {{expected expression}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared (argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared (argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared (argc)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared (S1) // expected-error {{'S1' does not refer to a value}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared (a, b, c, d, f)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared (argv[1]) // expected-error {{expected variable name}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared(ba)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared(ca)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared(da)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared(e, g)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be shared}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(i), shared(i) // expected-error {{private variable cannot be shared}} expected-note {{defined as private}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(i), shared(i) // expected-error {{firstprivate variable cannot be shared}} expected-note {{defined as firstprivate}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(i)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared(i)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared(j)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(i)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared(i)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared(j)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+return T();
+}
+
+
+int main(int argc, char **argv) {
+  const int d = 5;
+  const int da[5] = { 0 };
+  S4 e(4);
+  S5 g(5);
+  int i;
+  int &j = i;
+  int acc = 0;
+  int n = argc;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared // expected-error {{expected '(' after 'shared'}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared () // expected-error {{expected expression}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared (argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared (argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared (argc)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared (S1) // expected-error {{'S1' does not refer to a value}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared (a, b, c, d, f)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared (argv[1]) // expected-error {{expected variable name}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared(ba)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared(ca)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared(da)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared(e, g)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be shared}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(i), shared(i) // expected-error {{private variable cannot be shared}} expected-note {{defined as private}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(i), shared(i) // expected-error {{firstprivate variable cannot be shared}} expected-note {{defined as firstprivate}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for private(i)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared(i)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared(j)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for firstprivate(i)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared(i)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for shared(j)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+return tmain<int, char, 1000>(argc, argv); // expected-note {{in instantiation of function template specialization 'tmain<int, char, 1000>' requested here}}
+}
diff --git a/test/OpenMP/distribute_parallel_for_simd_aligned_messages.cpp b/test/OpenMP/distribute_parallel_for_simd_aligned_messages.cpp
new file mode 100644
index 0000000000000..9c9f3dda281ab
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_simd_aligned_messages.cpp
@@ -0,0 +1,306 @@
+// RUN: %clang_cc1 -x c++ -std=c++11 -verify -fopenmp %s
+
+struct B {
+  static int ib[20]; // expected-note 0 {{'B::ib' declared here}}
+  static constexpr int bfoo() { return 8; }
+};
+namespace X {
+  B x; // expected-note {{'x' defined here}}
+};
+constexpr int bfoo() { return 4; }
+
+int **z;
+const int C1 = 1;
+const int C2 = 2;
+void test_aligned_colons(int *&rp)
+{
+  int *B = 0;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned(B:bfoo())
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned(B::ib:B:bfoo()) // expected-error {{unexpected ':' in nested name specifier; did you mean '::'}}
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned(B:B::bfoo())
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned(z:B:bfoo()) // expected-error {{unexpected ':' in nested name specifier; did you mean '::'?}}
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned(B:B::bfoo())
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned(X::x : ::z) // expected-error {{integral constant expression must have integral or unscoped enumeration type, not 'int **'}} expected-error {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'B'}}
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned(B,rp,::z: X::x) // expected-error {{integral constant expression must have integral or unscoped enumeration type, not 'B'}}
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned(::z)
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp distribute parallel for simd aligned(B::bfoo()) // expected-error {{expected variable name}}
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned(B::ib,B:C1+C2) // expected-warning {{aligned clause will be ignored because the requested alignment is not a power of 2}}
+  for (int i = 0; i < 10; ++i) ;
+}
+
+// expected-note@+1 {{'num' defined here}}
+template<int L, class T, class N> T test_template(T* arr, N num) {
+  N i;
+  T sum = (T)0;
+  T ind2 = - num * L;
+  // Negative number is passed as L.
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned(arr:L) // expected-error {{argument to 'aligned' clause must be a strictly positive integer value}}
+  for (i = 0; i < num; ++i) {
+    T cur = arr[(int)ind2];
+    ind2 += L;
+    sum += cur;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned(num:4) // expected-error {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'int'}}
+  for (i = 0; i < num; ++i);
+
+  return T();
+}
+
+template<int LEN> int test_warn() {
+  int *ind2 = 0;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned(ind2:LEN) // expected-error {{argument to 'aligned' clause must be a strictly positive integer value}}
+  for (int i = 0; i < 100; i++) {
+    ind2 += LEN;
+  }
+  return 0;
+}
+
+struct S1; // expected-note 2 {{declared here}}
+extern S1 a; // expected-note {{'a' declared here}}
+class S2 {
+  mutable int a;
+public:
+  S2():a(0) { }
+};
+const S2 b; // expected-note 1 {{'b' defined here}}
+const S2 ba[5];
+class S3 {
+  int a;
+public:
+  S3():a(0) { }
+};
+const S3 ca[5];
+class S4 {
+  int a;
+  S4();
+public:
+  S4(int v):a(v) { }
+};
+class S5 {
+  int a;
+  S5():a(0) {}
+public:
+  S5(int v):a(v) { }
+};
+
+S3 h; // expected-note 2 {{'h' defined here}}
+#pragma omp threadprivate(h)
+
+template<class I, class C> int foomain(I argc, C **argv) {
+  I e(argc);
+  I g(argc);
+  int i; // expected-note {{declared here}} expected-note {{'i' defined here}}
+  // expected-note@+2 {{declared here}}
+  // expected-note@+1 {{reference to 'i' is not a constant expression}}
+  int &j = i;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned // expected-error {{expected '(' after 'aligned'}}
+  for (I k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (I k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned () // expected-error {{expected expression}}
+  for (I k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (I k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned (argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (I k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned (argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (I k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned (argc : 5) // expected-warning {{aligned clause will be ignored because the requested alignment is not a power of 2}}
+  for (I k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned (S1) // expected-error {{'S1' does not refer to a value}}
+  for (I k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned (argv[1]) // expected-error {{expected variable name}}
+  for (I k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned(e, g)
+  for (I k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned(h) // expected-error {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'S3'}}
+  for (I k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned(i) // expected-error {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'int'}}
+  for (I k = 0; k < argc; ++k) ++k;
+
+  #pragma omp parallel
+  {
+    int *v = 0;
+    I i;
+    #pragma omp target
+    #pragma omp teams
+    #pragma omp distribute parallel for simd aligned(v:16)
+      for (I k = 0; k < argc; ++k) { i = k; v += 2; }
+  }
+  float *f;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned(f)
+  for (I k = 0; k < argc; ++k) ++k;
+
+  int v = 0;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned(f:j) // expected-note {{initializer of 'j' is not a constant expression}} expected-error {{expression is not an integral constant expression}}
+
+  for (I k = 0; k < argc; ++k) { ++k; v += j; }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned(f)
+  for (I k = 0; k < argc; ++k) ++k;
+
+  return 0;
+}
+
+// expected-note@+1 2 {{'argc' defined here}}
+int main(int argc, char **argv) {
+  double darr[100];
+  // expected-note@+1 {{in instantiation of function template specialization 'test_template<-4, double, int>' requested here}}
+  test_template<-4>(darr, 4);
+  test_warn<4>(); // ok
+  // expected-note@+1 {{in instantiation of function template specialization 'test_warn<0>' requested here}}
+  test_warn<0>();
+
+  int i;
+  int &j = i;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned // expected-error {{expected '(' after 'aligned'}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned () // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned (argv // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned (argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'int'}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned (argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned (argc) // expected-error {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'int'}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned (S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned (a, b) // expected-error {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'S1'}} expected-error {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'S2'}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned (argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned(h)  // expected-error {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'S3'}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+  int *pargc = &argc;
+  // expected-note@+1 {{in instantiation of function template specialization 'foomain<int *, char>' requested here}}
+  foomain<int*,char>(pargc,argv);
+  return 0;
+}
+
diff --git a/test/OpenMP/distribute_parallel_for_simd_ast_print.cpp b/test/OpenMP/distribute_parallel_for_simd_ast_print.cpp
new file mode 100644
index 0000000000000..56809dcd8bd29
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_simd_ast_print.cpp
@@ -0,0 +1,152 @@
+// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+struct S {
+  S(): a(0) {}
+  S(int v) : a(v) {}
+  int a;
+  typedef int type;
+};
+
+template <typename T>
+class S7 : public T {
+protected:
+  T a;
+  S7() : a(0) {}
+
+public:
+  S7(typename T::type v) : a(v) {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(a) private(this->a) private(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S7 &operator=(S7 &s) {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(a) private(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+// CHECK: #pragma omp distribute parallel for simd private(this->a) private(this->a) private(this->S::a)
+// CHECK: #pragma omp distribute parallel for simd private(this->a) private(this->a) private(T::a)
+// CHECK: #pragma omp distribute parallel for simd private(this->a) private(this->a)
+
+class S8 : public S7<S> {
+  S8() {}
+
+public:
+  S8(int v) : S7<S>(v){
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(a) private(this->a) private(S7<S>::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S8 &operator=(S8 &s) {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(a) private(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+// CHECK: #pragma omp distribute parallel for simd private(this->a) private(this->a) private(this->S7<S>::a)
+// CHECK: #pragma omp distribute parallel for simd private(this->a) private(this->a)
+
+template <class T, int N>
+T tmain(T argc) {
+  T b = argc, c, d, e, f, h;
+  static T a;
+// CHECK: static T a;
+  static T g;
+#pragma omp threadprivate(g)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd dist_schedule(static, a) schedule(dynamic) default(none) copyin(g) firstprivate(a)
+  // CHECK: #pragma omp distribute parallel for simd dist_schedule(static, a) schedule(dynamic) default(none) copyin(g)
+  for (int i = 0; i < 2; ++i)
+    a = 2;
+// CHECK-NEXT: for (int i = 0; i < 2; ++i)
+// CHECK-NEXT: a = 2;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(argc, b), firstprivate(c, d), lastprivate(d, f) collapse(N) schedule(static, N) if (parallel :argc) num_threads(N) default(shared) shared(e) reduction(+ : h) dist_schedule(static,N)
+  for (int i = 0; i < 2; ++i)
+    for (int j = 0; j < 2; ++j)
+      for (int j = 0; j < 2; ++j)
+        for (int j = 0; j < 2; ++j)
+          for (int j = 0; j < 2; ++j)
+  for (int i = 0; i < 2; ++i)
+    for (int j = 0; j < 2; ++j)
+      for (int j = 0; j < 2; ++j)
+        for (int j = 0; j < 2; ++j)
+          for (int j = 0; j < 2; ++j)
+	    a++;
+  // CHECK: #pragma omp distribute parallel for simd private(argc,b) firstprivate(c,d) lastprivate(d,f) collapse(N) schedule(static, N) if(parallel: argc) num_threads(N) default(shared) shared(e) reduction(+: h) dist_schedule(static, N)
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: a++;
+  return T();
+}
+
+int main(int argc, char **argv) {
+  int b = argc, c, d, e, f, h;
+  int x[200];
+  static int a;
+// CHECK: static int a;
+  static float g;
+#pragma omp threadprivate(g)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule(guided, argc) default(none) copyin(g) dist_schedule(static, a) private(a)
+  // CHECK: #pragma omp distribute parallel for simd schedule(guided, argc) default(none) copyin(g) dist_schedule(static, a) private(a)
+  for (int i = 0; i < 2; ++i)
+    a = 2;
+// CHECK-NEXT: for (int i = 0; i < 2; ++i)
+// CHECK-NEXT: a = 2;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(argc, b), firstprivate(argv, c), lastprivate(d, f) collapse(2) schedule(auto) if (argc) num_threads(a) default(shared) shared(e) reduction(+ : h) dist_schedule(static, b)
+  for (int i = 0; i < 10; ++i)
+    for (int j = 0; j < 10; ++j)
+      a++;
+  // CHECK: #pragma omp distribute parallel for simd private(argc,b) firstprivate(argv,c) lastprivate(d,f) collapse(2) schedule(auto) if(argc) num_threads(a) default(shared) shared(e) reduction(+: h) dist_schedule(static, b)
+  // CHECK-NEXT: for (int i = 0; i < 10; ++i)
+  // CHECK-NEXT: for (int j = 0; j < 10; ++j)
+  // CHECK-NEXT: a++;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned(x:8) linear(h:2) safelen(8) simdlen(8)
+  for (int i = 0; i < 100; i++)
+    for (int j = 0; j < 200; j++)
+      a += h + x[j];
+  // CHECK: #pragma omp distribute parallel for simd aligned(x: 8) linear(h: 2) safelen(8) simdlen(8)
+  // CHECK-NEXT: for (int i = 0; i < 100; i++)
+  // CHECK-NEXT: for (int j = 0; j < 200; j++)
+  // CHECK-NEXT: a += h + x[j];
+
+  return (tmain<int, 5>(argc) + tmain<char, 1>(argv[0][0]));
+}
+
+#endif
diff --git a/test/OpenMP/distribute_parallel_for_simd_collapse_messages.cpp b/test/OpenMP/distribute_parallel_for_simd_collapse_messages.cpp
new file mode 100644
index 0000000000000..8690b4aac432a
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_simd_collapse_messages.cpp
@@ -0,0 +1,154 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
+
+void foo() {
+}
+
+#if __cplusplus >= 201103L
+// expected-note@+2 4 {{declared here}}
+#endif
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, typename S, int N, int ST> // expected-note {{declared here}}
+T tmain(T argc, S **argv) { //expected-note 2 {{declared here}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd collapse // expected-error {{expected '(' after 'collapse'}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd collapse ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd collapse () // expected-error {{expected expression}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+  // expected-error@+3 {{expected ')'}} expected-note@+3 {{to match this '('}}
+  // expected-error@+2 2 {{expression is not an integral constant expression}}
+  // expected-note@+1 2 {{read of non-const variable 'argc' is not allowed in a constant expression}}
+#pragma omp distribute parallel for simd collapse (argc 
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+  // expected-error@+1 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
+#pragma omp distribute parallel for simd collapse (ST // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd collapse (1)) // expected-warning {{extra tokens at the end of '#pragma omp distribute parallel for simd' are ignored}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd collapse ((ST > 0) ? 1 + ST : 2) // expected-note 2 {{as specified in 'collapse' clause}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST]; // expected-error 2 {{expected 2 for loops after '#pragma omp distribute parallel for simd', but found only 1}}
+  // expected-error@+8 2 {{directive '#pragma omp distribute parallel for simd' cannot contain more than one 'collapse' clause}}
+  // expected-error@+7 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
+  // expected-error@+6 2 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+4 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd collapse (foobool(argc)), collapse (true), collapse (-5)
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd collapse (S) // expected-error {{'S' does not refer to a value}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#if __cplusplus <= 199711L
+  // expected-error@+6 2 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+4 2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd collapse (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd collapse (1)
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd collapse (N) // expected-error {{argument to 'collapse' clause must be a strictly positive integer value}}
+  for (T i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd collapse (2) // expected-note {{as specified in 'collapse' clause}}
+  foo(); // expected-error {{expected 2 for loops after '#pragma omp distribute parallel for simd'}}
+  return argc;
+}
+
+int main(int argc, char **argv) {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd collapse // expected-error {{expected '(' after 'collapse'}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd collapse ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd collapse () // expected-error {{expected expression}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd collapse (4 // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-note {{as specified in 'collapse' clause}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; // expected-error {{expected 4 for loops after '#pragma omp distribute parallel for simd', but found only 1}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd collapse (2+2)) // expected-warning {{extra tokens at the end of '#pragma omp distribute parallel for simd' are ignored}}  expected-note {{as specified in 'collapse' clause}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; // expected-error {{expected 4 for loops after '#pragma omp distribute parallel for simd', but found only 1}}
+  // expected-error@+6 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd collapse (foobool(1) > 0 ? 1 : 2)
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  // expected-error@+8 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+6{{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+  // expected-error@+4 2 {{directive '#pragma omp distribute parallel for simd' cannot contain more than one 'collapse' clause}}
+  // expected-error@+3 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd collapse (foobool(argc)), collapse (true), collapse (-5) 
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd collapse (S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+#if __cplusplus <= 199711L
+  // expected-error@+6 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+4 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd collapse (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  // expected-error@+5 {{statement after '#pragma omp distribute parallel for simd' must be a for loop}}
+  // expected-note@+3 {{in instantiation of function template specialization 'tmain<int, char, -1, -2>' requested here}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd collapse(collapse(tmain<int, char, -1, -2>(argc, argv) // expected-error 2 {{expected ')'}} expected-note 2 {{to match this '('}}
+  foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd collapse (2) // expected-note {{as specified in 'collapse' clause}}
+  foo(); // expected-error {{expected 2 for loops after '#pragma omp distribute parallel for simd'}}
+  // expected-note@+1 {{in instantiation of function template specialization 'tmain<int, char, 1, 0>' requested here}}
+  return tmain<int, char, 1, 0>(argc, argv);
+}
+
diff --git a/test/OpenMP/distribute_parallel_for_simd_copyin_messages.cpp b/test/OpenMP/distribute_parallel_for_simd_copyin_messages.cpp
new file mode 100644
index 0000000000000..2d1a3dc5124b8
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_simd_copyin_messages.cpp
@@ -0,0 +1,190 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -o - %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note 2 {{declared here}}
+class S2 {
+  mutable int a;
+
+public:
+  S2() : a(0) {}
+  S2 &operator=(S2 &s2) { return *this; }
+};
+class S3 {
+  int a;
+
+public:
+  S3() : a(0) {}
+  S3 &operator=(S3 &s3) { return *this; }
+};
+class S4 {
+  int a;
+  S4();
+  S4 &operator=(const S4 &s4); // expected-note 3 {{implicitly declared private here}}
+
+public:
+  S4(int v) : a(v) {}
+};
+class S5 {
+  int a;
+  S5() : a(0) {}
+  S5 &operator=(const S5 &s5) { return *this; } // expected-note 3 {{implicitly declared private here}}
+
+public:
+  S5(int v) : a(v) {}
+};
+template <class T>
+class ST {
+public:
+  static T s;
+};
+
+S2 k;
+S3 h;
+S4 l(3);
+S5 m(4);
+#pragma omp threadprivate(h, k, l, m)
+
+namespace A {
+double x;
+#pragma omp threadprivate(x)
+}
+namespace B {
+using A::x;
+}
+
+template <class T, typename S, int N>
+T tmain(T argc, S **argv) {
+  T i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd copyin // expected-error {{expected '(' after 'copyin'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd copyin( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd copyin() // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd copyin(k // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd copyin(h, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd copyin(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd copyin(l) // expected-error 2 {{'operator=' is a private member of 'S4'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd copyin(S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd copyin(argv[1]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd copyin(i) // expected-error {{copyin variable must be threadprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd copyin(m) // expected-error 2 {{'operator=' is a private member of 'S5'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd copyin(ST<int>::s, B::x) // expected-error {{copyin variable must be threadprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+}
+
+int main(int argc, char **argv) {
+  int i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd copyin // expected-error {{expected '(' after 'copyin'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd copyin( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd copyin() // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd copyin(k // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd copyin(h, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd copyin(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd copyin(l) // expected-error {{'operator=' is a private member of 'S4'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd copyin(S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd copyin(argv[1]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd copyin(i) // expected-error {{copyin variable must be threadprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd copyin(m) // expected-error {{'operator=' is a private member of 'S5'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd copyin(ST<int>::s, B::x) // expected-error {{copyin variable must be threadprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+
+  return tmain<int, char, 3>(argc, argv); // expected-note {{in instantiation of function template specialization 'tmain<int, char, 3>' requested here}}
+}
diff --git a/test/OpenMP/distribute_parallel_for_simd_default_messages.cpp b/test/OpenMP/distribute_parallel_for_simd_default_messages.cpp
new file mode 100644
index 0000000000000..5c32306d96d16
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_simd_default_messages.cpp
@@ -0,0 +1,100 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -o - %s
+
+void foo();
+
+template <class T, int N>
+T tmain(T argc) {
+  int i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd default // expected-error {{expected '(' after 'default'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd default( // expected-error {{expected 'none' or 'shared' in OpenMP clause 'default'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd default() // expected-error {{expected 'none' or 'shared' in OpenMP clause 'default'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd default(none // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) // expected-error 2 {{variable 'argc' must have explicitly specified data sharing attributes}}
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd default(shared), default(shared) // expected-error {{directive '#pragma omp distribute parallel for simd' cannot contain more than one 'default' clause}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd default(x) // expected-error {{expected 'none' or 'shared' in OpenMP clause 'default'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd default(none)
+  for (i = 0; i < argc; ++i)  // expected-error 2 {{variable 'argc' must have explicitly specified data sharing attributes}}
+    foo();
+
+#pragma omp parallel default(none)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd default(shared)
+  for (i = 0; i < argc; ++i)
+    foo();
+
+  return T();
+}
+
+int main(int argc, char **argv) {
+  int i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd default // expected-error {{expected '(' after 'default'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd default( // expected-error {{expected 'none' or 'shared' in OpenMP clause 'default'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd default() // expected-error {{expected 'none' or 'shared' in OpenMP clause 'default'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd default(none // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) // expected-error {{variable 'argc' must have explicitly specified data sharing attributes}}
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd default(shared), default(shared) // expected-error {{directive '#pragma omp distribute parallel for simd' cannot contain more than one 'default' clause}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd default(x) // expected-error {{expected 'none' or 'shared' in OpenMP clause 'default'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd default(none)
+  for (i = 0; i < argc; ++i)  // expected-error {{variable 'argc' must have explicitly specified data sharing attributes}}
+    foo();
+
+#pragma omp parallel default(none)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd default(shared)
+  for (i = 0; i < argc; ++i)
+    foo();
+
+  return (tmain<int, 5>(argc) + tmain<char, 1>(argv[0][0])); // expected-note {{in instantiation of function template specialization 'tmain<int, 5>' requested here}} expected-note {{in instantiation of function template specialization 'tmain<char, 1>' requested here}}
+}
diff --git a/test/OpenMP/distribute_parallel_for_simd_dist_schedule_messages.cpp b/test/OpenMP/distribute_parallel_for_simd_dist_schedule_messages.cpp
new file mode 100644
index 0000000000000..2e3ee2b1d50e4
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_simd_dist_schedule_messages.cpp
@@ -0,0 +1,103 @@
+// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify -fopenmp -ferror-limit 100 -o - %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}} expected-note {{declared here}}
+
+template <class T, int N>
+T tmain(T argc) {
+  T b = argc, c, d, e, f, g;
+  char ** argv;
+  static T a;
+// CHECK: static T a;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd dist_schedule // expected-error {{expected '(' after 'dist_schedule'}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd dist_schedule ( // expected-error {{expected 'static' in OpenMP clause 'dist_schedule'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd dist_schedule () // expected-error {{expected 'static' in OpenMP clause 'dist_schedule'}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd dist_schedule (static // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd dist_schedule (static, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd dist_schedule (argc)) // expected-error {{expected 'static' in OpenMP clause 'dist_schedule'}} expected-warning {{extra tokens at the end of '#pragma omp distribute parallel for simd' are ignored}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd dist_schedule (static, argc > 0 ? argv[1] : argv[2]) // expected-error2 {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd dist_schedule (static), dist_schedule (static, 1) // expected-error {{directive '#pragma omp distribute parallel for simd' cannot contain more than one 'dist_schedule' clause}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd dist_schedule (static, S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd dist_schedule (static, argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error3 {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  for (int i = 0; i < 10; ++i) foo();
+  return T();
+}
+
+int main(int argc, char **argv) {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd dist_schedule // expected-error {{expected '(' after 'dist_schedule'}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd dist_schedule ( // expected-error {{expected 'static' in OpenMP clause 'dist_schedule'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd dist_schedule () // expected-error {{expected 'static' in OpenMP clause 'dist_schedule'}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd dist_schedule (static // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd dist_schedule (static, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd dist_schedule (argc)) // expected-error {{expected 'static' in OpenMP clause 'dist_schedule'}} expected-warning {{extra tokens at the end of '#pragma omp distribute parallel for simd' are ignored}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd dist_schedule (static, argc > 0 ? argv[1] : argv[2]) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd dist_schedule (static), dist_schedule (static, 1) // expected-error {{directive '#pragma omp distribute parallel for simd' cannot contain more than one 'dist_schedule' clause}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd dist_schedule (static, S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd dist_schedule (static, argv[1]=2) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i) foo();
+  return (tmain<int, 5>(argc) + tmain<char, 1>(argv[0][0])); // expected-note {{in instantiation of function template specialization 'tmain<int, 5>' requested here}} expected-note {{in instantiation of function template specialization 'tmain<char, 1>' requested here}}
+}
diff --git a/test/OpenMP/distribute_parallel_for_simd_firstprivate_messages.cpp b/test/OpenMP/distribute_parallel_for_simd_firstprivate_messages.cpp
new file mode 100644
index 0000000000000..07d30e48312bf
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_simd_firstprivate_messages.cpp
@@ -0,0 +1,359 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note 2 {{declared here}} expected-note 2 {{forward declaration of 'S1'}}
+extern S1 a;
+class S2 {
+  mutable int a;
+
+public:
+  S2() : a(0) {}
+  S2(const S2 &s2) : a(s2.a) {}
+  static float S2s;
+  static const float S2sc;
+};
+const float S2::S2sc = 0;
+const S2 b;
+const S2 ba[5];
+class S3 {
+  int a;
+  S3 &operator=(const S3 &s3);
+
+public:
+  S3() : a(0) {}
+  S3(const S3 &s3) : a(s3.a) {}
+};
+const S3 c;
+const S3 ca[5];
+extern const int f;
+class S4 {
+  int a;
+  S4();
+  S4(const S4 &s4); // expected-note 2 {{implicitly declared private here}}
+
+public:
+  S4(int v) : a(v) {}
+};
+class S5 {
+  int a;
+  S5(const S5 &s5) : a(s5.a) {} // expected-note 4 {{implicitly declared private here}}
+
+public:
+  S5() : a(0) {}
+  S5(int v) : a(v) {}
+};
+class S6 {
+  int a;
+  S6() : a(0) {}
+
+public:
+  S6(const S6 &s6) : a(s6.a) {}
+  S6(int v) : a(v) {}
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class I, class C>
+int foomain(int argc, char **argv) {
+  I e(4);
+  C g(5);
+  int i;
+  int &j = i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate // expected-error {{expected '(' after 'firstprivate'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate() // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(argc)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(a, b) // expected-error {{firstprivate variable with incomplete type 'S1'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(h) // expected-error {{threadprivate or thread local variable cannot be firstprivate}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp parallel
+  {
+    int v = 0;
+    int i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(i)
+    for (int k = 0; k < argc; ++k) {
+      i = k;
+      v += i;
+    }
+  }
+#pragma omp parallel shared(i)
+#pragma omp parallel private(i)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(j)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(i)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(g) firstprivate(g) // expected-error {{calling a private constructor of class 'S5'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp parallel private(i)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(i) // expected-note {{defined as firstprivate}}
+  for (i = 0; i < argc; ++i) // expected-error {{loop iteration variable in the associated loop of 'omp distribute parallel for simd' directive may not be firstprivate, predetermined as linear}}
+    foo();
+#pragma omp parallel reduction(+ : i)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(i) // expected-note {{defined as firstprivate}}
+  for (i = 0; i < argc; ++i) // expected-error {{loop iteration variable in the associated loop of 'omp distribute parallel for simd' directive may not be firstprivate, predetermined as linear}}
+    foo();
+  return 0;
+}
+
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
+int main(int argc, char **argv) {
+  const int d = 5;
+  const int da[5] = {0};
+  S4 e(4);
+  S5 g(5);
+  S3 m;
+  S6 n(2);
+  int i;
+  int &j = i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate // expected-error {{expected '(' after 'firstprivate'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate() // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(argc)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(a, b, c, d, f) // expected-error {{firstprivate variable with incomplete type 'S1'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(argv[1]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(2 * 2) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(ba) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(ca) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(da) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+  int xa;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(xa) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(S2::S2s) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(S2::S2sc) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd safelen(5) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(m) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be firstprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(xa), firstprivate(xa) // expected-error {{private variable cannot be firstprivate}} expected-note {{defined as private}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(i) // expected-note {{defined as firstprivate}}
+  for (i = 0; i < argc; ++i) // expected-error {{loop iteration variable in the associated loop of 'omp distribute parallel for simd' directive may not be firstprivate, predetermined as linear}}
+    foo();
+#pragma omp parallel shared(xa)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(xa) // OK: may be firstprivate
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(j)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(g) firstprivate(g) // expected-error {{calling a private constructor of class 'S5'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(n) firstprivate(n) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp parallel
+  {
+    int v = 0;
+    int i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(i)
+    for (int k = 0; k < argc; ++k) {
+      i = k;
+      v += i;
+    }
+  }
+#pragma omp parallel private(i)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(i) // expected-note {{defined as firstprivate}}
+  for (i = 0; i < argc; ++i) // expected-error {{loop iteration variable in the associated loop of 'omp distribute parallel for simd' directive may not be firstprivate, predetermined as linear}}
+    foo();
+#pragma omp parallel reduction(+ : i)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(i) // expected-note {{defined as firstprivate}}
+  for (i = 0; i < argc; ++i) // expected-error {{loop iteration variable in the associated loop of 'omp distribute parallel for simd' directive may not be firstprivate, predetermined as linear}}
+    foo();
+  static int si;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(si) // OK
+  for (i = 0; i < argc; ++i)
+    si = i + 1;
+
+  return foomain<S4, S5>(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<S4, S5>' requested here}}
+}
diff --git a/test/OpenMP/distribute_parallel_for_simd_if_messages.cpp b/test/OpenMP/distribute_parallel_for_simd_if_messages.cpp
new file mode 100644
index 0000000000000..01236b5563aba
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_simd_if_messages.cpp
@@ -0,0 +1,179 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, class S> // expected-note {{declared here}}
+int tmain(T argc, S **argv) {
+  T i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if // expected-error {{expected '(' after 'if'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if () // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if (argc)) // expected-warning {{extra tokens at the end of '#pragma omp distribute parallel for simd' are ignored}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if (argc > 0 ? argv[1] : argv[2])
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if (foobool(argc)), if (true) // expected-error {{directive '#pragma omp distribute parallel for simd' cannot contain more than one 'if' clause}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if (S) // expected-error {{'S' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if (argc argc) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if(argc)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if(parallel // expected-warning {{missing ':' after directive name modifier - ignoring}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if(parallel : // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if(parallel : argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if(parallel : argc)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if(parallel : argc) if (for:argc) // expected-error {{directive name modifier 'for' is not allowed for '#pragma omp distribute parallel for simd'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if(parallel : argc) if (parallel:argc) // expected-error {{directive '#pragma omp distribute parallel for simd' cannot contain more than one 'if' clause with 'parallel' name modifier}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if(parallel : argc) if (argc) // expected-error {{no more 'if' clause is allowed}} expected-note {{previous clause with directive name modifier specified here}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if(distribute : argc) // expected-error {{directive name modifier 'distribute' is not allowed for '#pragma omp distribute parallel for simd'}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return 0;
+}
+
+int main(int argc, char **argv) {
+  int i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if // expected-error {{expected '(' after 'if'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if () // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if (argc)) // expected-warning {{extra tokens at the end of '#pragma omp distribute parallel for simd' are ignored}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if (argc > 0 ? argv[1] : argv[2])
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if (foobool(argc)), if (true) // expected-error {{directive '#pragma omp distribute parallel for simd' cannot contain more than one 'if' clause}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if (S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if (argc argc) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if (1 0) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if(if(tmain(argc, argv) // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if(parallel // expected-warning {{missing ':' after directive name modifier - ignoring}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if(parallel : // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if(parallel : argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if(parallel : argc)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if(parallel : argc) if (for:argc) // expected-error {{directive name modifier 'for' is not allowed for '#pragma omp distribute parallel for simd'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if(parallel : argc) if (parallel:argc) // expected-error {{directive '#pragma omp distribute parallel for simd' cannot contain more than one 'if' clause with 'parallel' name modifier}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if(parallel : argc) if (argc) // expected-error {{no more 'if' clause is allowed}} expected-note {{previous clause with directive name modifier specified here}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd if(distribute : argc) // expected-error {{directive name modifier 'distribute' is not allowed for '#pragma omp distribute parallel for simd'}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return tmain(argc, argv);
+}
diff --git a/test/OpenMP/distribute_parallel_for_simd_lastprivate_messages.cpp b/test/OpenMP/distribute_parallel_for_simd_lastprivate_messages.cpp
new file mode 100644
index 0000000000000..109fde0577c40
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_simd_lastprivate_messages.cpp
@@ -0,0 +1,333 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note 2 {{declared here}} expected-note 2 {{forward declaration of 'S1'}}
+extern S1 a;
+class S2 {
+  mutable int a;
+
+public:
+  S2() : a(0) {}
+  S2(S2 &s2) : a(s2.a) {}
+  const S2 &operator =(const S2&) const;
+  S2 &operator =(const S2&);
+  static float S2s; // expected-note {{static data member is predetermined as shared}}
+  static const float S2sc;
+};
+const float S2::S2sc = 0; // expected-note {{static data member is predetermined as shared}}
+const S2 b;
+const S2 ba[5];
+class S3 {
+  int a;
+  S3 &operator=(const S3 &s3); // expected-note 2 {{implicitly declared private here}}
+
+public:
+  S3() : a(0) {}
+  S3(S3 &s3) : a(s3.a) {}
+};
+const S3 c;         // expected-note {{global variable is predetermined as shared}}
+const S3 ca[5];     // expected-note {{global variable is predetermined as shared}}
+extern const int f; // expected-note {{global variable is predetermined as shared}}
+class S4 {
+  int a;
+  S4();             // expected-note 3 {{implicitly declared private here}}
+  S4(const S4 &s4);
+
+public:
+  S4(int v) : a(v) {}
+};
+class S5 {
+  int a;
+  S5() : a(0) {} // expected-note {{implicitly declared private here}}
+
+public:
+  S5(const S5 &s5) : a(s5.a) {}
+  S5(int v) : a(v) {}
+};
+class S6 {
+  int a;
+  S6() : a(0) {}
+
+public:
+  S6(const S6 &s6) : a(s6.a) {}
+  S6(int v) : a(v) {}
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class I, class C>
+int foomain(int argc, char **argv) {
+  I e(4);
+  I g(5);
+  int i;
+  int &j = i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate // expected-error {{expected '(' after 'lastprivate'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate() // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(argc)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(a, b) // expected-error {{lastprivate variable with incomplete type 'S1'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(e, g) // expected-error 2 {{calling a private constructor of class 'S4'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(h) // expected-error {{threadprivate or thread local variable cannot be lastprivate}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+
+  int v = 0;
+#pragma omp target
+#pragma omp teams
+  {
+#pragma omp distribute parallel for simd lastprivate(i)
+    for (int k = 0; k < argc; ++k) {
+      i = k;
+      v += i;
+    }
+  }
+#pragma omp target
+#pragma omp teams private(i)
+#pragma omp distribute parallel for simd lastprivate(j)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(i)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+  return 0;
+}
+
+void bar(S4 a[2]) {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(a)
+  for (int i = 0; i < 2; ++i)
+    foo();
+}
+
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
+int main(int argc, char **argv) {
+  const int d = 5;       // expected-note {{constant variable is predetermined as shared}}
+  const int da[5] = {0}; // expected-note {{constant variable is predetermined as shared}}
+  S4 e(4);
+  S5 g(5);
+  S3 m;
+  S6 n(2);
+  int i;
+  int &j = i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate // expected-error {{expected '(' after 'lastprivate'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate() // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(argc)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(a, b, c, d, f) // expected-error {{lastprivate variable with incomplete type 'S1'}} expected-error 3 {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(argv[1]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(2 * 2) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(ba)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(ca) // expected-error {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(da) // expected-error {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+  int xa;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(xa) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(S2::S2s) // expected-error {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(S2::S2sc) // expected-error {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd safelen(5) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(m) // expected-error {{'operator=' is a private member of 'S3'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(h) // expected-error {{threadprivate or thread local variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(B::x) // expected-error {{threadprivate or thread local variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(xa), lastprivate(xa) // expected-error {{private variable cannot be lastprivate}} expected-note {{defined as private}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(i) // expected-note {{defined as lastprivate}}
+  for (i = 0; i < argc; ++i) // expected-error{{loop iteration variable in the associated loop of 'omp distribute parallel for simd' directive may not be lastprivate, predetermined as linear}}
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(xa)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(xa)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(j)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(m) lastprivate(m) // expected-error {{'operator=' is a private member of 'S3'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(n) firstprivate(n) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+  static int si;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(si) // OK
+  for (i = 0; i < argc; ++i)
+    si = i + 1;
+  return foomain<S4, S5>(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<S4, S5>' requested here}}
+}
diff --git a/test/OpenMP/distribute_parallel_for_simd_linear_messages.cpp b/test/OpenMP/distribute_parallel_for_simd_linear_messages.cpp
new file mode 100644
index 0000000000000..632ef066c0fac
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_simd_linear_messages.cpp
@@ -0,0 +1,338 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+namespace X {
+  int x;
+};
+
+struct B {
+  static int ib; // expected-note {{'B::ib' declared here}}
+  static int bfoo() { return 8; }
+};
+
+int bfoo() { return 4; }
+
+int z;
+const int C1 = 1;
+const int C2 = 2;
+void test_linear_colons()
+{
+  int B = 0;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear(B:bfoo())
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear(B::ib:B:bfoo()) // expected-error {{unexpected ':' in nested name specifier; did you mean '::'}}
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear(B:ib) // expected-error {{use of undeclared identifier 'ib'; did you mean 'B::ib'}}
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear(z:B:ib) // expected-error {{unexpected ':' in nested name specifier; did you mean '::'?}}
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear(B:B::bfoo())
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear(X::x : ::z)
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear(B,::z, X::x)
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear(::z)
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear(B::bfoo()) // expected-error {{expected variable name}}
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear(B::ib,B:C1+C2)
+  for (int i = 0; i < 10; ++i) ;
+}
+
+template<int L, class T, class N> T test_template(T* arr, N num) {
+  N i;
+  T sum = (T)0;
+  T ind2 = - num * L; // expected-note {{'ind2' defined here}}
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear(ind2:L) // expected-error {{argument of a linear clause should be of integral or pointer type}}
+  for (i = 0; i < num; ++i) {
+    T cur = arr[(int)ind2];
+    ind2 += L;
+    sum += cur;
+  }
+  return T();
+}
+
+template<int LEN> int test_warn() {
+  int ind2 = 0;
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp parallel for simd linear(ind2:LEN) // expected-warning {{zero linear step (ind2 should probably be const)}}
+  for (int i = 0; i < 100; i++) {
+    ind2 += LEN;
+  }
+  return ind2;
+}
+
+struct S1; // expected-note 2 {{declared here}} expected-note 2 {{forward declaration of 'S1'}}
+extern S1 a;
+class S2 {
+  mutable int a;
+public:
+  S2():a(0) { }
+};
+const S2 b; // expected-note 2 {{'b' defined here}}
+const S2 ba[5];
+class S3 {
+  int a;
+public:
+  S3():a(0) { }
+};
+const S3 ca[5];
+class S4 {
+  int a;
+  S4();
+public:
+  S4(int v):a(v) { }
+};
+class S5 {
+  int a;
+  S5():a(0) {}
+public:
+  S5(int v):a(v) { }
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template<class I, class C> int foomain(I argc, C **argv) {
+  I e(4);
+  I g(5);
+  int i;
+  int &j = i;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear // expected-error {{expected '(' after 'linear'}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear () // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear (argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear (argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear (argc : 5)
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear (S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear (a, b:B::ib) // expected-error {{linear variable with incomplete type 'S1'}} expected-error {{const-qualified variable cannot be linear}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear (argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear(e, g)
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear(h) // expected-error {{threadprivate or thread local variable cannot be linear}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear(i)
+  for (int k = 0; k < argc; ++k) ++k;
+
+  #pragma omp parallel
+  {
+    int v = 0;
+    int i;
+    #pragma omp target
+    #pragma omp teams
+    #pragma omp distribute parallel for simd linear(v:i)
+    for (int k = 0; k < argc; ++k) { i = k; v += i; }
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp parallel for simd linear(j)
+  for (int k = 0; k < argc; ++k) ++k;
+
+  int v = 0;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear(v:j)
+  for (int k = 0; k < argc; ++k) { ++k; v += j; }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear(i)
+  for (int k = 0; k < argc; ++k) ++k;
+  return 0;
+}
+
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace C {
+using A::x;
+}
+
+int main(int argc, char **argv) {
+  double darr[100];
+  // expected-note@+1 {{in instantiation of function template specialization 'test_template<-4, double, int>' requested here}}
+  test_template<-4>(darr, 4);
+  // expected-note@+1 {{in instantiation of function template specialization 'test_warn<0>' requested here}}
+  test_warn<0>();
+
+  S4 e(4); // expected-note {{'e' defined here}}
+  S5 g(5); // expected-note {{'g' defined here}}
+  int i;
+  int &j = i;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear // expected-error {{expected '(' after 'linear'}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear () // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear (argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear (argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear (argc)
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear (S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear (a, b) // expected-error {{linear variable with incomplete type 'S1'}} expected-error {{const-qualified variable cannot be linear}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear (argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear(e, g) // expected-error {{argument of a linear clause should be of integral or pointer type, not 'S4'}} expected-error {{argument of a linear clause should be of integral or pointer type, not 'S5'}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear(h, C::x) // expected-error 2 {{threadprivate or thread local variable cannot be linear}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+  #pragma omp parallel
+  {
+    int i;
+    #pragma omp target
+    #pragma omp teams
+    #pragma omp distribute parallel for simd linear(i)
+      for (int k = 0; k < argc; ++k) ++k;
+
+    #pragma omp target
+    #pragma omp teams
+    #pragma omp distribute parallel for simd linear(i : 4)
+      for (int k = 0; k < argc; ++k) { ++k; i += 4; }
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear(j)
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear(i)
+  for (int k = 0; k < argc; ++k) ++k;
+
+  foomain<int,char>(argc,argv); // expected-note {{in instantiation of function template specialization 'foomain<int, char>' requested here}}
+  return 0;
+}
+
diff --git a/test/OpenMP/distribute_parallel_for_simd_loop_messages.cpp b/test/OpenMP/distribute_parallel_for_simd_loop_messages.cpp
new file mode 100644
index 0000000000000..6c322e6f60be0
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_simd_loop_messages.cpp
@@ -0,0 +1,816 @@
+// RUN: %clang_cc1 -fsyntax-only -fopenmp -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify %s
+
+class S {
+  int a;
+  S() : a(0) {}
+
+public:
+  S(int v) : a(v) {}
+  S(const S &s) : a(s.a) {}
+};
+
+static int sii;
+// expected-note@+1 {{defined as threadprivate or thread local}}
+#pragma omp threadprivate(sii)
+static int globalii;
+
+int test_iteration_spaces() {
+  const int N = 100;
+  float a[N], b[N], c[N];
+  int ii, jj, kk;
+  float fii;
+  double dii;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; i += 1) {
+    c[i] = a[i] + b[i];
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (char i = 0; i < 10; i++) {
+    c[i] = a[i] + b[i];
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (char i = 0; i < 10; i += '\1') {
+    c[i] = a[i] + b[i];
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (long long i = 0; i < 10; i++) {
+    c[i] = a[i] + b[i];
+  }
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expression must have integral or unscoped enumeration type, not 'double'}}
+#pragma omp distribute parallel for simd
+  for (long long i = 0; i < 10; i += 1.5) {
+    c[i] = a[i] + b[i];
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (long long i = 0; i < 'z'; i += 1u) {
+    c[i] = a[i] + b[i];
+  }
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{variable must be of integer or random access iterator type}}
+#pragma omp distribute parallel for simd
+  for (float fi = 0; fi < 10.0; fi++) {
+    c[(int)fi] = a[(int)fi] + b[(int)fi];
+  }
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{variable must be of integer or random access iterator type}}
+#pragma omp distribute parallel for simd
+  for (double fi = 0; fi < 10.0; fi++) {
+    c[(int)fi] = a[(int)fi] + b[(int)fi];
+  }
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp distribute parallel for simd
+  for (int &ref = ii; ref < 10; ref++) {
+  }
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp distribute parallel for simd
+  for (int i; i < 10; i++)
+    c[i] = a[i];
+
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp distribute parallel for simd
+  for (int i = 0, j = 0; i < 10; ++i)
+    c[i] = a[i];
+
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp distribute parallel for simd
+  for (; ii < 10; ++ii)
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams
+// expected-warning@+3 {{expression result unused}}
+// expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp distribute parallel for simd
+  for (ii + 1; ii < 10; ++ii)
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp distribute parallel for simd
+  for (c[ii] = 0; ii < 10; ++ii)
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams
+// Ok to skip parenthesises.
+#pragma omp distribute parallel for simd
+  for (((ii)) = 0; ii < 10; ++ii)
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'i'}}
+#pragma omp distribute parallel for simd
+  for (int i = 0; i; i++)
+    c[i] = a[i];
+
+#pragma omp target
+#pragma omp teams
+// expected-error@+3 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'i'}}
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'i'}}
+#pragma omp distribute parallel for simd
+  for (int i = 0; jj < kk; ii++)
+    c[i] = a[i];
+
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'i'}}
+#pragma omp distribute parallel for simd
+  for (int i = 0; !!i; i++)
+    c[i] = a[i];
+
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'i'}}
+#pragma omp distribute parallel for simd
+  for (int i = 0; i != 1; i++)
+    c[i] = a[i];
+
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'i'}}
+#pragma omp distribute parallel for simd
+  for (int i = 0;; i++)
+    c[i] = a[i];
+
+// Ok.
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 11; i > 10; i--)
+    c[i] = a[i];
+
+// Ok.
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i)
+    c[i] = a[i];
+
+// Ok.
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (ii = 0; ii < 10; ++ii)
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+#pragma omp distribute parallel for simd
+  for (ii = 0; ii < 10; ++jj)
+    c[ii] = a[jj];
+
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+#pragma omp distribute parallel for simd
+  for (ii = 0; ii < 10; ++++ii)
+    c[ii] = a[ii];
+
+// Ok but undefined behavior (in general, cannot check that incr
+// is really loop-invariant).
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (ii = 0; ii < 10; ii = ii + ii)
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expression must have integral or unscoped enumeration type, not 'float'}}
+#pragma omp distribute parallel for simd
+  for (ii = 0; ii < 10; ii = ii + 1.0f)
+    c[ii] = a[ii];
+
+// Ok - step was converted to integer type.
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (ii = 0; ii < 10; ii = ii + (int)1.1f)
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+#pragma omp distribute parallel for simd
+  for (ii = 0; ii < 10; jj = ii + 2)
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams
+// expected-warning@+3 {{relational comparison result unused}}
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+#pragma omp distribute parallel for simd
+  for (ii = 0; ii<10; jj> kk + 2)
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+#pragma omp distribute parallel for simd
+  for (ii = 0; ii < 10;)
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams
+// expected-warning@+3 {{expression result unused}}
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+#pragma omp distribute parallel for simd
+  for (ii = 0; ii < 10; !ii)
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+#pragma omp distribute parallel for simd
+  for (ii = 0; ii < 10; ii ? ++ii : ++jj)
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+#pragma omp distribute parallel for simd
+  for (ii = 0; ii < 10; ii = ii < 10)
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+#pragma omp distribute parallel for simd
+  for (ii = 0; ii < 10; ii = ii + 0)
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+#pragma omp distribute parallel for simd
+  for (ii = 0; ii < 10; ii = ii + (int)(0.8 - 0.45))
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+#pragma omp distribute parallel for simd
+  for (ii = 0; (ii) < 10; ii -= 25)
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+#pragma omp distribute parallel for simd
+  for (ii = 0; (ii < 10); ii -= 0)
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams
+// expected-note@+3 {{loop step is expected to be negative due to this condition}}
+// expected-error@+2 {{increment expression must cause 'ii' to decrease on each iteration of OpenMP for loop}}
+#pragma omp distribute parallel for simd
+  for (ii = 0; ii > 10; (ii += 0))
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+#pragma omp distribute parallel for simd
+  for (ii = 0; ii < 10; (ii) = (1 - 1) + (ii))
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams
+// expected-note@+3 {{loop step is expected to be negative due to this condition}}
+// expected-error@+2 {{increment expression must cause 'ii' to decrease on each iteration of OpenMP for loop}}
+#pragma omp distribute parallel for simd
+  for ((ii = 0); ii > 10; (ii -= 0))
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+#pragma omp distribute parallel for simd
+  for (ii = 0; (ii < 10); (ii -= 0))
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams
+// expected-note@+2  {{defined as firstprivate}}
+// expected-error@+2 {{loop iteration variable in the associated loop of 'omp distribute parallel for simd' directive may not be firstprivate, predetermined as linear}}
+#pragma omp distribute parallel for simd firstprivate(ii)
+  for (ii = 0; ii < 10; ii++)
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear(ii)
+  for (ii = 0; ii < 10; ii++)
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams
+// expected-note@+2 {{defined as private}}
+// expected-error@+2 {{loop iteration variable in the associated loop of 'omp distribute parallel for simd' directive may not be private, predetermined as linear}}
+#pragma omp distribute parallel for simd private(ii)
+  for (ii = 0; ii < 10; ii++)
+    c[ii] = a[ii];
+
+#pragma omp target
+#pragma omp teams
+// expected-note@+2 {{defined as lastprivate}}
+// expected-error@+2 {{loop iteration variable in the associated loop of 'omp distribute parallel for simd' directive may not be lastprivate, predetermined as linear}}
+#pragma omp distribute parallel for simd lastprivate(ii)
+  for (ii = 0; ii < 10; ii++)
+    c[ii] = a[ii];
+
+  {
+// expected-error@+2 {{loop iteration variable in the associated loop of 'omp distribute parallel for simd' directive may not be threadprivate or thread local, predetermined as linear}}
+#pragma omp distribute parallel for simd
+    for (sii = 0; sii < 10; sii += 1)
+      c[sii] = a[sii];
+  }
+
+  {
+#pragma omp distribute parallel for simd
+    for (globalii = 0; globalii < 10; globalii += 1)
+      c[globalii] = a[globalii];
+  }
+
+  {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd collapse(2)
+    for (ii = 0; ii < 10; ii += 1)
+    for (globalii = 0; globalii < 10; globalii += 1)
+      c[globalii] += a[globalii] + ii;
+  }
+
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{statement after '#pragma omp distribute parallel for simd' must be a for loop}}
+#pragma omp distribute parallel for simd
+  for (auto &item : a) {
+    item = item + 1;
+  }
+
+#pragma omp target
+#pragma omp teams
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'i' to increase on each iteration of OpenMP for loop}}
+#pragma omp distribute parallel for simd
+  for (unsigned i = 9; i < 10; i--) {
+    c[i] = a[i] + b[i];
+  }
+
+  int(*lb)[4] = nullptr;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int(*p)[4] = lb; p < lb + 8; ++p) {
+  }
+
+#pragma omp target
+#pragma omp teams
+// expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp distribute parallel for simd
+  for (int a{0}; a < 10; ++a) {
+  }
+
+  return 0;
+}
+
+// Iterators allowed in openmp for-loops.
+namespace std {
+struct random_access_iterator_tag {};
+template <class Iter>
+struct iterator_traits {
+  typedef typename Iter::difference_type difference_type;
+  typedef typename Iter::iterator_category iterator_category;
+};
+template <class Iter>
+typename iterator_traits<Iter>::difference_type
+distance(Iter first, Iter last) { return first - last; }
+}
+class Iter0 {
+public:
+  Iter0() {}
+  Iter0(const Iter0 &) {}
+  Iter0 operator++() { return *this; }
+  Iter0 operator--() { return *this; }
+  bool operator<(Iter0 a) { return true; }
+};
+// expected-note@+2 {{candidate function not viable: no known conversion from 'GoodIter' to 'Iter0' for 1st argument}}
+// expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'Iter0' for 1st argument}}
+int operator-(Iter0 a, Iter0 b) { return 0; }
+class Iter1 {
+public:
+  Iter1(float f = 0.0f, double d = 0.0) {}
+  Iter1(const Iter1 &) {}
+  Iter1 operator++() { return *this; }
+  Iter1 operator--() { return *this; }
+  bool operator<(Iter1 a) { return true; }
+  bool operator>=(Iter1 a) { return false; }
+};
+class GoodIter {
+public:
+  GoodIter() {}
+  GoodIter(const GoodIter &) {}
+  GoodIter(int fst, int snd) {}
+  GoodIter &operator=(const GoodIter &that) { return *this; }
+  GoodIter &operator=(const Iter0 &that) { return *this; }
+  GoodIter &operator+=(int x) { return *this; }
+  explicit GoodIter(void *) {}
+  GoodIter operator++() { return *this; }
+  GoodIter operator--() { return *this; }
+  bool operator!() { return true; }
+  bool operator<(GoodIter a) { return true; }
+  bool operator<=(GoodIter a) { return true; }
+  bool operator>=(GoodIter a) { return false; }
+  typedef int difference_type;
+  typedef std::random_access_iterator_tag iterator_category;
+};
+// expected-note@+2 {{candidate function not viable: no known conversion from 'const Iter0' to 'GoodIter' for 2nd argument}}
+// expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'GoodIter' for 1st argument}}
+int operator-(GoodIter a, GoodIter b) { return 0; }
+// expected-note@+1 3 {{candidate function not viable: requires single argument 'a', but 2 arguments were provided}}
+GoodIter operator-(GoodIter a) { return a; }
+// expected-note@+2 {{candidate function not viable: no known conversion from 'const Iter0' to 'int' for 2nd argument}}
+// expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'GoodIter' for 1st argument}}
+GoodIter operator-(GoodIter a, int v) { return GoodIter(); }
+// expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter0' to 'GoodIter' for 1st argument}}
+GoodIter operator+(GoodIter a, int v) { return GoodIter(); }
+// expected-note@+2 {{candidate function not viable: no known conversion from 'GoodIter' to 'int' for 1st argument}}
+// expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'int' for 1st argument}}
+GoodIter operator-(int v, GoodIter a) { return GoodIter(); }
+// expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter0' to 'int' for 1st argument}}
+GoodIter operator+(int v, GoodIter a) { return GoodIter(); }
+
+int test_with_random_access_iterator() {
+  GoodIter begin, end;
+  Iter0 begin0, end0;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (GoodIter I = begin; I < end; ++I)
+    ++I;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp distribute parallel for simd
+  for (GoodIter &I = begin; I < end; ++I)
+    ++I;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (GoodIter I = begin; I >= end; --I)
+    ++I;
+#pragma omp target
+#pragma omp teams
+// expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp distribute parallel for simd
+  for (GoodIter I(begin); I < end; ++I)
+    ++I;
+#pragma omp target
+#pragma omp teams
+// expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp distribute parallel for simd
+  for (GoodIter I(nullptr); I < end; ++I)
+    ++I;
+#pragma omp target
+#pragma omp teams
+// expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp distribute parallel for simd
+  for (GoodIter I(0); I < end; ++I)
+    ++I;
+#pragma omp target
+#pragma omp teams
+// expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp distribute parallel for simd
+  for (GoodIter I(1, 2); I < end; ++I)
+    ++I;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (begin = GoodIter(0); begin < end; ++begin)
+    ++begin;
+#pragma omp target
+#pragma omp teams
+// expected-error@+3 {{invalid operands to binary expression ('GoodIter' and 'const Iter0')}}
+// expected-error@+2 {{could not calculate number of iterations calling 'operator-' with upper and lower loop bounds}}
+#pragma omp distribute parallel for simd
+  for (begin = begin0; begin < end; ++begin)
+    ++begin;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp distribute parallel for simd
+  for (++begin; begin < end; ++begin)
+    ++begin;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (begin = end; begin < end; ++begin)
+    ++begin;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'I'}}
+#pragma omp distribute parallel for simd
+  for (GoodIter I = begin; I - I; ++I)
+    ++I;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'I'}}
+#pragma omp distribute parallel for simd
+  for (GoodIter I = begin; begin < end; ++I)
+    ++I;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'I'}}
+#pragma omp distribute parallel for simd
+  for (GoodIter I = begin; !I; ++I)
+    ++I;
+#pragma omp target
+#pragma omp teams
+// expected-note@+3 {{loop step is expected to be negative due to this condition}}
+// expected-error@+2 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+#pragma omp distribute parallel for simd
+  for (GoodIter I = begin; I >= end; I = I + 1)
+    ++I;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (GoodIter I = begin; I >= end; I = I - 1)
+    ++I;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'I'}}
+#pragma omp distribute parallel for simd
+  for (GoodIter I = begin; I >= end; I = -I)
+    ++I;
+#pragma omp target
+#pragma omp teams
+// expected-note@+3 {{loop step is expected to be negative due to this condition}}
+// expected-error@+2 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+#pragma omp distribute parallel for simd
+  for (GoodIter I = begin; I >= end; I = 2 + I)
+    ++I;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'I'}}
+#pragma omp distribute parallel for simd
+  for (GoodIter I = begin; I >= end; I = 2 - I)
+    ++I;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{invalid operands to binary expression ('Iter0' and 'int')}}
+#pragma omp distribute parallel for simd
+  for (Iter0 I = begin0; I < end0; ++I)
+    ++I;
+#pragma omp target
+#pragma omp teams
+// Initializer is constructor without params.
+// expected-error@+3 {{invalid operands to binary expression ('Iter0' and 'int')}}
+// expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp distribute parallel for simd
+  for (Iter0 I; I < end0; ++I)
+    ++I;
+  Iter1 begin1, end1;
+#pragma omp target
+#pragma omp teams
+// expected-error@+3 {{invalid operands to binary expression ('Iter1' and 'Iter1')}}
+// expected-error@+2 {{could not calculate number of iterations calling 'operator-' with upper and lower loop bounds}}
+#pragma omp distribute parallel for simd
+  for (Iter1 I = begin1; I < end1; ++I)
+    ++I;
+#pragma omp target
+#pragma omp teams
+// expected-note@+3 {{loop step is expected to be negative due to this condition}}
+// expected-error@+2 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+#pragma omp distribute parallel for simd
+  for (Iter1 I = begin1; I >= end1; ++I)
+    ++I;
+#pragma omp target
+#pragma omp teams
+// expected-error@+5 {{invalid operands to binary expression ('Iter1' and 'float')}}
+// expected-error@+4 {{could not calculate number of iterations calling 'operator-' with upper and lower loop bounds}}
+// Initializer is constructor with all default params.
+// expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp distribute parallel for simd
+  for (Iter1 I; I < end1; ++I) {
+  }
+  return 0;
+}
+
+template <typename IT, int ST>
+class TC {
+public:
+  int dotest_lt(IT begin, IT end) {
+#pragma omp target
+#pragma omp teams
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'I' to increase on each iteration of OpenMP for loop}}
+#pragma omp distribute parallel for simd
+    for (IT I = begin; I < end; I = I + ST) {
+      ++I;
+    }
+#pragma omp target
+#pragma omp teams
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'I' to increase on each iteration of OpenMP for loop}}
+#pragma omp distribute parallel for simd
+    for (IT I = begin; I <= end; I += ST) {
+      ++I;
+    }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+    for (IT I = begin; I < end; ++I) {
+      ++I;
+    }
+  }
+
+  static IT step() {
+    return IT(ST);
+  }
+};
+template <typename IT, int ST = 0>
+int dotest_gt(IT begin, IT end) {
+#pragma omp target
+#pragma omp teams
+// expected-note@+3 2 {{loop step is expected to be negative due to this condition}}
+// expected-error@+2 2 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+#pragma omp distribute parallel for simd
+  for (IT I = begin; I >= end; I = I + ST) {
+    ++I;
+  }
+#pragma omp target
+#pragma omp teams
+// expected-note@+3 2 {{loop step is expected to be negative due to this condition}}
+// expected-error@+2 2 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+#pragma omp distribute parallel for simd
+  for (IT I = begin; I >= end; I += ST) {
+    ++I;
+  }
+
+#pragma omp target
+#pragma omp teams
+// expected-note@+3 {{loop step is expected to be negative due to this condition}}
+// expected-error@+2 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+#pragma omp distribute parallel for simd
+  for (IT I = begin; I >= end; ++I) {
+    ++I;
+  }
+
+#pragma omp distribute parallel for simd
+  for (IT I = begin; I < end; I += TC<int, ST>::step()) {
+    ++I;
+  }
+}
+
+void test_with_template() {
+  GoodIter begin, end;
+  TC<GoodIter, 100> t1;
+  TC<GoodIter, -100> t2;
+  t1.dotest_lt(begin, end);
+  t2.dotest_lt(begin, end);         // expected-note {{in instantiation of member function 'TC<GoodIter, -100>::dotest_lt' requested here}}
+  dotest_gt(begin, end);            // expected-note {{in instantiation of function template specialization 'dotest_gt<GoodIter, 0>' requested here}}
+  dotest_gt<unsigned, -10>(0, 100); // expected-note {{in instantiation of function template specialization 'dotest_gt<unsigned int, -10>' requested here}}
+}
+
+void test_loop_break() {
+  const int N = 100;
+  float a[N], b[N], c[N];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; i++) {
+    c[i] = a[i] + b[i];
+    for (int j = 0; j < 10; ++j) {
+      if (a[i] > b[j])
+        break; // OK in nested loop
+    }
+    switch (i) {
+    case 1:
+      b[i]++;
+      break;
+    default:
+      break;
+    }
+    if (c[i] > 10)
+      break; // expected-error {{'break' statement cannot be used in OpenMP for loop}}
+
+    if (c[i] > 11)
+      break; // expected-error {{'break' statement cannot be used in OpenMP for loop}}
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; i++) {
+    for (int j = 0; j < 10; j++) {
+      c[i] = a[i] + b[i];
+      if (c[i] > 10) {
+        if (c[i] < 20) {
+          break; // OK
+        }
+      }
+    }
+  }
+}
+
+void test_loop_eh() {
+  const int N = 100;
+  float a[N], b[N], c[N];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; i++) {
+    c[i] = a[i] + b[i];
+    try { // expected-error {{'try' statement cannot be used in OpenMP simd region}}
+      for (int j = 0; j < 10; ++j) {
+        if (a[i] > b[j])
+          throw a[i]; // expected-error {{'throw' statement cannot be used in OpenMP simd region}}
+      }
+      throw a[i]; // expected-error {{'throw' statement cannot be used in OpenMP simd region}}
+    } catch (float f) {
+      if (f > 0.1)
+        throw a[i]; // expected-error {{'throw' statement cannot be used in OpenMP simd region}}
+      return; // expected-error {{cannot return from OpenMP region}}
+    }
+    switch (i) {
+    case 1:
+      b[i]++;
+      break;
+    default:
+      break;
+    }
+    for (int j = 0; j < 10; j++) {
+      if (c[i] > 10)
+        throw c[i]; // expected-error {{'throw' statement cannot be used in OpenMP simd region}}
+    }
+  }
+  if (c[9] > 10)
+    throw c[9]; // OK
+
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+    struct S {
+      void g() { throw 0; }
+    };
+  }
+}
+
+void test_loop_firstprivate_lastprivate() {
+  S s(4);
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(s) firstprivate(s)
+  for (int i = 0; i < 16; ++i)
+    ;
+}
+
+void test_ordered() {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd ordered // expected-error {{unexpected OpenMP clause 'ordered' in directive '#pragma omp distribute parallel for simd'}}
+  for (int i = 0; i < 16; ++i)
+    ;
+}
+
+void test_nowait() {
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 2 {{unexpected OpenMP clause 'nowait' in directive '#pragma omp distribute parallel for simd'}}
+#pragma omp distribute parallel for simd nowait nowait // expected-error {{directive '#pragma omp distribute parallel for simd' cannot contain more than one 'nowait' clause}}
+  for (int i = 0; i < 16; ++i)
+    ;
+}
+
diff --git a/test/OpenMP/distribute_parallel_for_simd_misc_messages.c b/test/OpenMP/distribute_parallel_for_simd_misc_messages.c
new file mode 100644
index 0000000000000..01c079e3dac13
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_simd_misc_messages.c
@@ -0,0 +1,971 @@
+// RUN: %clang_cc1 -fsyntax-only -fopenmp -verify %s
+
+// expected-error@+1 {{unexpected OpenMP directive '#pragma omp distribute parallel for simd'}}
+#pragma omp distribute parallel for simd
+
+// expected-error@+1 {{unexpected OpenMP directive '#pragma omp distribute parallel for simd'}}
+#pragma omp distribute parallel for simd foo
+
+void test_no_clause() {
+  int i;
+#pragma omp distribute parallel for simd
+  for (i = 0; i < 16; ++i)
+    ;
+
+// expected-error@+2 {{statement after '#pragma omp distribute parallel for simd' must be a for loop}}
+#pragma omp distribute parallel for simd
+  ++i;
+}
+
+void test_branch_protected_scope() {
+  int i = 0;
+L1:
+  ++i;
+
+  int x[24];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (i = 0; i < 16; ++i) {
+    if (i == 5)
+      goto L1; // expected-error {{use of undeclared label 'L1'}}
+    else if (i == 6)
+      return; // expected-error {{cannot return from OpenMP region}}
+    else if (i == 7)
+      goto L2;
+    else if (i == 8) {
+    L2:
+      x[i]++;
+    }
+  }
+
+  if (x[0] == 0)
+    goto L2; // expected-error {{use of undeclared label 'L2'}}
+  else if (x[1] == 1)
+    goto L1;
+}
+
+void test_invalid_clause() {
+  int i;
+#pragma omp target
+#pragma omp teams
+// expected-warning@+1 {{extra tokens at the end of '#pragma omp distribute parallel for simd' are ignored}}
+#pragma omp distribute parallel for simd foo bar
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+void test_non_identifiers() {
+  int i, x;
+
+#pragma omp target
+#pragma omp teams
+// expected-warning@+1 {{extra tokens at the end of '#pragma omp distribute parallel for simd' are ignored}}
+#pragma omp distribute parallel for simd;
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-warning@+1 {{extra tokens at the end of '#pragma omp distribute parallel for simd' are ignored}}
+#pragma omp distribute parallel for simd linear(x);
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+// expected-warning@+1 {{extra tokens at the end of '#pragma omp distribute parallel for simd' are ignored}}
+#pragma omp distribute parallel for simd private(x);
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+// expected-warning@+1 {{extra tokens at the end of '#pragma omp distribute parallel for simd' are ignored}}
+#pragma omp distribute parallel for simd, private(x);
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+extern int foo();
+void test_safelen() {
+  int i;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected '('}}
+#pragma omp distribute parallel for simd safelen
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd safelen(
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute parallel for simd safelen()
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd safelen(,
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}  expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd safelen(, )
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-warning@+2 {{extra tokens at the end of '#pragma omp distribute parallel for simd' are ignored}}
+// expected-error@+1 {{expected '('}}
+#pragma omp distribute parallel for simd safelen 4)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd safelen(4
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd safelen(4,
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd safelen(4, )
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd safelen(4)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd safelen(4 4)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd safelen(4, , 4)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd safelen(4)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd safelen(4, 8)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expression is not an integer constant expression}}
+#pragma omp distribute parallel for simd safelen(2.5)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expression is not an integer constant expression}}
+#pragma omp distribute parallel for simd safelen(foo())
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{argument to 'safelen' clause must be a strictly positive integer value}}
+#pragma omp distribute parallel for simd safelen(-5)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{argument to 'safelen' clause must be a strictly positive integer value}}
+#pragma omp distribute parallel for simd safelen(0)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{argument to 'safelen' clause must be a strictly positive integer value}}
+#pragma omp distribute parallel for simd safelen(5 - 5)
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+void test_simdlen() {
+  int i;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected '('}}
+#pragma omp distribute parallel for simd simdlen
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd simdlen(
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute parallel for simd simdlen()
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd simdlen(,
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}  expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd simdlen(, )
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-warning@+2 {{extra tokens at the end of '#pragma omp distribute parallel for simd' are ignored}}
+// expected-error@+1 {{expected '('}}
+#pragma omp distribute parallel for simd simdlen 4)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd simdlen(4
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd simdlen(4,
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd simdlen(4, )
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd simdlen(4)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd simdlen(4 4)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd simdlen(4, , 4)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd simdlen(4)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd simdlen(4, 8)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expression is not an integer constant expression}}
+#pragma omp distribute parallel for simd simdlen(2.5)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expression is not an integer constant expression}}
+#pragma omp distribute parallel for simd simdlen(foo())
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{argument to 'simdlen' clause must be a strictly positive integer value}}
+#pragma omp distribute parallel for simd simdlen(-5)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{argument to 'simdlen' clause must be a strictly positive integer value}}
+#pragma omp distribute parallel for simd simdlen(0)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{argument to 'simdlen' clause must be a strictly positive integer value}}
+#pragma omp distribute parallel for simd simdlen(5 - 5)
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+void test_safelen_simdlen() {
+  int i;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{the value of 'simdlen' parameter must be less than or equal to the value of the 'safelen' parameter}}
+#pragma omp distribute parallel for simd simdlen(6) safelen(5)
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{the value of 'simdlen' parameter must be less than or equal to the value of the 'safelen' parameter}}
+#pragma omp distribute parallel for simd safelen(5) simdlen(6)
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+void test_collapse() {
+  int i;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected '('}}
+#pragma omp distribute parallel for simd collapse
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd collapse(
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute parallel for simd collapse()
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd collapse(,
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}  expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd collapse(, )
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-warning@+2 {{extra tokens at the end of '#pragma omp distribute parallel for simd' are ignored}}
+// expected-error@+1 {{expected '('}}
+#pragma omp distribute parallel for simd collapse 4)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}} expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp distribute parallel for simd collapse(4
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp distribute parallel for simd', but found only 1}}
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}} expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp distribute parallel for simd collapse(4,
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp distribute parallel for simd', but found only 1}}
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}} expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp distribute parallel for simd collapse(4, )
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp distribute parallel for simd', but found only 1}}
+#pragma omp target
+#pragma omp teams
+// expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp distribute parallel for simd collapse(4)
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp distribute parallel for simd', but found only 1}}
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}} expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp distribute parallel for simd collapse(4 4)
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp distribute parallel for simd', but found only 1}}
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}} expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp distribute parallel for simd collapse(4, , 4)
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp distribute parallel for simd', but found only 1}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd collapse(4)
+  for (int i1 = 0; i1 < 16; ++i1)
+    for (int i2 = 0; i2 < 16; ++i2)
+      for (int i3 = 0; i3 < 16; ++i3)
+        for (int i4 = 0; i4 < 16; ++i4)
+          foo();
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}} expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp distribute parallel for simd collapse(4, 8)
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp distribute parallel for simd', but found only 1}}
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expression is not an integer constant expression}}
+#pragma omp distribute parallel for simd collapse(2.5)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expression is not an integer constant expression}}
+#pragma omp distribute parallel for simd collapse(foo())
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{argument to 'collapse' clause must be a strictly positive integer value}}
+#pragma omp distribute parallel for simd collapse(-5)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{argument to 'collapse' clause must be a strictly positive integer value}}
+#pragma omp distribute parallel for simd collapse(0)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{argument to 'collapse' clause must be a strictly positive integer value}}
+#pragma omp distribute parallel for simd collapse(5 - 5)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd collapse(2)
+  for (i = 0; i < 16; ++i)
+    for (int j = 0; j < 16; ++j)
+// expected-error@+1 {{OpenMP constructs may not be nested inside a simd region}}
+#pragma omp distribute parallel for simd reduction(+ : i, j)
+      for (int k = 0; k < 16; ++k)
+        i += j;
+}
+
+void test_linear() {
+  int i;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd linear(
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected expression}}
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd linear(,
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected expression}}
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute parallel for simd linear(, )
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute parallel for simd linear()
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute parallel for simd linear(int)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected variable name}}
+#pragma omp distribute parallel for simd linear(0)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{use of undeclared identifier 'x'}}
+#pragma omp distribute parallel for simd linear(x)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{use of undeclared identifier 'x'}}
+// expected-error@+1 {{use of undeclared identifier 'y'}}
+#pragma omp distribute parallel for simd linear(x, y)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+3 {{use of undeclared identifier 'x'}}
+// expected-error@+2 {{use of undeclared identifier 'y'}}
+// expected-error@+1 {{use of undeclared identifier 'z'}}
+#pragma omp distribute parallel for simd linear(x, y, z)
+  for (i = 0; i < 16; ++i)
+    ;
+
+  int x, y;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute parallel for simd linear(x :)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd linear(x :, )
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear(x : 1)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd linear(x : 2 * 2)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd linear(x : 1, y)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd linear(x : 1, y, z : 1)
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+// expected-note@+2 {{defined as linear}}
+// expected-error@+1 {{linear variable cannot be linear}}
+#pragma omp distribute parallel for simd linear(x) linear(x)
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+// expected-note@+2 {{defined as private}}
+// expected-error@+1 {{private variable cannot be linear}}
+#pragma omp distribute parallel for simd private(x) linear(x)
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+// expected-note@+2 {{defined as linear}}
+// expected-error@+1 {{linear variable cannot be private}}
+#pragma omp distribute parallel for simd linear(x) private(x)
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+// expected-warning@+1 {{zero linear step (x and other variables in clause should probably be const)}}
+#pragma omp distribute parallel for simd linear(x, y : 0)
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+// expected-note@+2 {{defined as linear}}
+// expected-error@+1 {{linear variable cannot be lastprivate}}
+#pragma omp distribute parallel for simd linear(x) lastprivate(x)
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+// expected-note@+2 {{defined as lastprivate}}
+// expected-error@+1 {{lastprivate variable cannot be linear}}
+#pragma omp distribute parallel for simd lastprivate(x) linear(x)
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+void test_aligned() {
+  int i;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd aligned(
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+2 {{expected expression}}
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd aligned(,
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected expression}}
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute parallel for simd aligned(, )
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute parallel for simd aligned()
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute parallel for simd aligned(int)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected variable name}}
+#pragma omp distribute parallel for simd aligned(0)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{use of undeclared identifier 'x'}}
+#pragma omp distribute parallel for simd aligned(x)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{use of undeclared identifier 'x'}}
+// expected-error@+1 {{use of undeclared identifier 'y'}}
+#pragma omp distribute parallel for simd aligned(x, y)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+3 {{use of undeclared identifier 'x'}}
+// expected-error@+2 {{use of undeclared identifier 'y'}}
+// expected-error@+1 {{use of undeclared identifier 'z'}}
+#pragma omp distribute parallel for simd aligned(x, y, z)
+  for (i = 0; i < 16; ++i)
+    ;
+
+  int *x, y, z[25]; // expected-note 4 {{'y' defined here}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned(x)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned(z)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute parallel for simd aligned(x :)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd aligned(x :, )
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned(x : 1)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd aligned(x : 2 * 2)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd aligned(x : 1, y)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd aligned(x : 1, y, z : 1)
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{argument of aligned clause should be array or pointer, not 'int'}}
+#pragma omp distribute parallel for simd aligned(x, y)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{argument of aligned clause should be array or pointer, not 'int'}}
+#pragma omp distribute parallel for simd aligned(x, y, z)
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+// expected-note@+2 {{defined as aligned}}
+// expected-error@+1 {{a variable cannot appear in more than one aligned clause}}
+#pragma omp distribute parallel for simd aligned(x) aligned(z, x)
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+// expected-note@+3 {{defined as aligned}}
+// expected-error@+2 {{a variable cannot appear in more than one aligned clause}}
+// expected-error@+1 2 {{argument of aligned clause should be array or pointer, not 'int'}}
+#pragma omp distribute parallel for simd aligned(x, y, z) aligned(y, z)
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+
+void test_private() {
+  int i;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected expression}}
+// expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute parallel for simd private(
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}} expected-note@+2 {{to match this '('}}
+// expected-error@+1 2 {{expected expression}}
+#pragma omp distribute parallel for simd private(,
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 2 {{expected expression}}
+#pragma omp distribute parallel for simd private(, )
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute parallel for simd private()
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute parallel for simd private(int)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected variable name}}
+#pragma omp distribute parallel for simd private(0)
+  for (i = 0; i < 16; ++i)
+    ;
+
+  int x, y, z;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(x)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(x, y)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(x, y, z)
+  for (i = 0; i < 16; ++i) {
+    x = y * i + z;
+  }
+}
+
+void test_lastprivate() {
+  int i;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}} expected-note@+2 {{to match this '('}}
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute parallel for simd lastprivate(
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}} expected-note@+2 {{to match this '('}}
+// expected-error@+1 2 {{expected expression}}
+#pragma omp distribute parallel for simd lastprivate(,
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 2 {{expected expression}}
+#pragma omp distribute parallel for simd lastprivate(, )
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute parallel for simd lastprivate()
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute parallel for simd lastprivate(int)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected variable name}}
+#pragma omp distribute parallel for simd lastprivate(0)
+  for (i = 0; i < 16; ++i)
+    ;
+
+  int x, y, z;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(x)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(x, y)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(x, y, z)
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+void test_firstprivate() {
+  int i;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}} expected-note@+2 {{to match this '('}}
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute parallel for simd firstprivate(
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}} expected-note@+2 {{to match this '('}}
+// expected-error@+1 2 {{expected expression}}
+#pragma omp distribute parallel for simd firstprivate(,
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 2 {{expected expression}}
+#pragma omp distribute parallel for simd firstprivate(, )
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute parallel for simd firstprivate()
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute parallel for simd firstprivate(int)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected variable name}}
+#pragma omp distribute parallel for simd firstprivate(0)
+  for (i = 0; i < 16; ++i)
+    ;
+
+  int x, y, z;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(x) firstprivate(x)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(x, y) firstprivate(x, y)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd lastprivate(x, y, z) firstprivate(x, y, z)
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+void test_loop_messages() {
+  float a[100], b[100], c[100];
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{variable must be of integer or pointer type}}
+#pragma omp distribute parallel for simd
+  for (float fi = 0; fi < 10.0; fi++) {
+    c[(int)fi] = a[(int)fi] + b[(int)fi];
+  }
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{variable must be of integer or pointer type}}
+#pragma omp distribute parallel for simd
+  for (double fi = 0; fi < 10.0; fi++) {
+    c[(int)fi] = a[(int)fi] + b[(int)fi];
+  }
+}
+
diff --git a/test/OpenMP/distribute_parallel_for_simd_num_threads_messages.cpp b/test/OpenMP/distribute_parallel_for_simd_num_threads_messages.cpp
new file mode 100644
index 0000000000000..0d6376d76b7ba
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_simd_num_threads_messages.cpp
@@ -0,0 +1,107 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, typename S, int N> // expected-note {{declared here}}
+T tmain(T argc, S **argv) {
+  T i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd num_threads // expected-error {{expected '(' after 'num_threads'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd num_threads ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd num_threads () // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd num_threads (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd num_threads (argc)) // expected-warning {{extra tokens at the end of '#pragma omp distribute parallel for simd' are ignored}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd num_threads ((argc > 0) ? argv[1] : argv[2]) // expected-error 2 {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd num_threads (foobool(argc)), num_threads (true), num_threads (-5) // expected-error 2 {{directive '#pragma omp distribute parallel for simd' cannot contain more than one 'num_threads' clause}} expected-error {{argument to 'num_threads' clause must be a strictly positive integer value}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd num_threads (S) // expected-error {{'S' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd num_threads (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error 2 {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd num_threads (argc)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd num_threads (N) // expected-error {{argument to 'num_threads' clause must be a strictly positive integer value}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return argc;
+}
+
+int main(int argc, char **argv) {
+  int i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd num_threads // expected-error {{expected '(' after 'num_threads'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd num_threads ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd num_threads () // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd num_threads (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd num_threads (argc)) // expected-warning {{extra tokens at the end of '#pragma omp distribute parallel for simd' are ignored}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd num_threads (argc > 0 ? argv[1] : argv[2]) // expected-error {{integral }}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd num_threads (foobool(argc)), num_threads (true), num_threads (-5) // expected-error 2 {{directive '#pragma omp distribute parallel for simd' cannot contain more than one 'num_threads' clause}} expected-error {{argument to 'num_threads' clause must be a strictly positive integer value}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd num_threads (S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd num_threads (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd num_threads (num_threads(tmain<int, char, -1>(argc, argv) // expected-error 2 {{expected ')'}} expected-note 2 {{to match this '('}} expected-note {{in instantiation of function template specialization 'tmain<int, char, -1>' requested here}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return tmain<int, char, 3>(argc, argv); // expected-note {{in instantiation of function template specialization 'tmain<int, char, 3>' requested here}}
+}
diff --git a/test/OpenMP/distribute_parallel_for_simd_private_messages.cpp b/test/OpenMP/distribute_parallel_for_simd_private_messages.cpp
new file mode 100644
index 0000000000000..9df368890c8d5
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_simd_private_messages.cpp
@@ -0,0 +1,315 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note 2 {{declared here}} expected-note 2 {{forward declaration of 'S1'}}
+extern S1 a;
+class S2 {
+  mutable int a;
+
+public:
+  S2() : a(0) {}
+};
+const S2 b;
+const S2 ba[5];
+class S3 {
+  int a;
+
+public:
+  S3() : a(0) {}
+};
+const S3 ca[5];
+class S4 {
+  int a;
+  S4(); // expected-note {{implicitly declared private here}}
+
+public:
+  S4(int v) : a(v) {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(a) private(this->a)
+    for (int k = 0; k < v; ++k)
+      ++this->a;
+  }
+};
+class S5 {
+  int a;
+  S5() : a(0) {} // expected-note {{implicitly declared private here}}
+
+public:
+  S5(int v) : a(v) {}
+  S5 &operator=(S5 &s) {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T>
+class S6 {
+public:
+  T a;
+
+  S6() : a(0) {}
+  S6(T v) : a(v) {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(a) private(this->a)
+    for (int k = 0; k < v; ++k)
+      ++this->a;
+  }
+  S6 &operator=(S6 &s) {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T>
+class S7 : public T {
+  T a;
+  S7() : a(0) {}
+
+public:
+  S7(T v) : a(v) {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(a) private(this->a) private(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S7 &operator=(S7 &s) {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(a) private(this->a) private(s.a) private(s.T::a) // expected-error 2 {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class I, class C>
+int foomain(I argc, C **argv) {
+  I e(4);
+  I g(5);
+  int i;
+  int &j = i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private // expected-error {{expected '(' after 'private'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private() // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(argc)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(a, b) // expected-error {{private variable with incomplete type 'S1'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(e, g)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(h) // expected-error {{threadprivate or thread local variable cannot be private}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd nowait // expected-error {{unexpected OpenMP clause 'nowait' in directive '#pragma omp distribute parallel for simd'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp parallel
+  {
+    int v = 0;
+    int i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(i)
+    for (int k = 0; k < argc; ++k) {
+      i = k;
+      v += i;
+    }
+  }
+#pragma omp parallel shared(i)
+#pragma omp parallel private(i)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(j)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(i)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+  return 0;
+}
+
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
+int main(int argc, char **argv) {
+  S4 e(4);
+  S5 g(5);
+  S6<float> s6(0.0) , s6_0(1.0);
+  S7<S6<float> > s7(0.0) , s7_0(1.0);
+  int i;
+  int &j = i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private // expected-error {{expected '(' after 'private'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private() // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(argc)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(a, b) // expected-error {{private variable with incomplete type 'S1'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be private}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd nowait // expected-error {{unexpected OpenMP clause 'nowait' in directive '#pragma omp distribute parallel for simd'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp parallel
+  {
+    int i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(i)
+    for (int k = 0; k < argc; ++k)
+      ++k;
+  }
+#pragma omp parallel shared(i)
+#pragma omp parallel private(i)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(j)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(i)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+  static int m;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(m)
+  for (int k = 0; k < argc; ++k)
+    m = k + 2;
+
+  s6 = s6_0; // expected-note {{in instantiation of member function 'S6<float>::operator=' requested here}}
+  s7 = s7_0; // expected-note {{in instantiation of member function 'S7<S6<float> >::operator=' requested here}}
+  return foomain(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<int, char>' requested here}}
+}
+
diff --git a/test/OpenMP/distribute_parallel_for_simd_proc_bind_messages.cpp b/test/OpenMP/distribute_parallel_for_simd_proc_bind_messages.cpp
new file mode 100644
index 0000000000000..6b64cc3879ea4
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_simd_proc_bind_messages.cpp
@@ -0,0 +1,101 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -o - %s
+
+void foo();
+
+template <class T, typename S, int N>
+T tmain(T argc, S **argv) {
+  T i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd proc_bind // expected-error {{expected '(' after 'proc_bind'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd proc_bind( // expected-error {{expected 'master', 'close' or 'spread' in OpenMP clause 'proc_bind'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd proc_bind() // expected-error {{expected 'master', 'close' or 'spread' in OpenMP clause 'proc_bind'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd proc_bind(master // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd proc_bind(close), proc_bind(spread) // expected-error {{directive '#pragma omp distribute parallel for simd' cannot contain more than one 'proc_bind' clause}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd proc_bind(x) // expected-error {{expected 'master', 'close' or 'spread' in OpenMP clause 'proc_bind'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd proc_bind(master)
+  for (i = 0; i < argc; ++i)
+    foo();
+
+#pragma omp parallel proc_bind(close)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd proc_bind(spread)
+  for (i = 0; i < argc; ++i)
+    foo();
+
+  return T();
+}
+
+int main(int argc, char **argv) {
+  int i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd proc_bind // expected-error {{expected '(' after 'proc_bind'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd proc_bind( // expected-error {{expected 'master', 'close' or 'spread' in OpenMP clause 'proc_bind'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd proc_bind() // expected-error {{expected 'master', 'close' or 'spread' in OpenMP clause 'proc_bind'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd proc_bind(master // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd proc_bind(close), proc_bind(spread) // expected-error {{directive '#pragma omp distribute parallel for simd' cannot contain more than one 'proc_bind' clause}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd proc_bind(x) // expected-error {{expected 'master', 'close' or 'spread' in OpenMP clause 'proc_bind'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd proc_bind(master)
+  for (i = 0; i < argc; ++i)
+    foo();
+
+#pragma omp parallel proc_bind(close)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd proc_bind(spread)
+  for (i = 0; i < argc; ++i)
+    foo();
+  return tmain<int, char, 3>(argc, argv);
+}
diff --git a/test/OpenMP/distribute_parallel_for_simd_reduction_messages.cpp b/test/OpenMP/distribute_parallel_for_simd_reduction_messages.cpp
new file mode 100644
index 0000000000000..7b7e9ea53ccb4
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_simd_reduction_messages.cpp
@@ -0,0 +1,441 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 150 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 -ferror-limit 150 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 -ferror-limit 150 -o - %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}} expected-note 4 {{forward declaration of 'S1'}}
+extern S1 a;
+class S2 {
+  mutable int a;
+  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 3 {{implicitly declared private here}}
+
+public:
+  S2() : a(0) {}
+  S2(S2 &s2) : a(s2.a) {}
+  static float S2s; // expected-note 2 {{static data member is predetermined as shared}}
+  static const float S2sc;
+};
+const float S2::S2sc = 0; // expected-note 2 {{'S2sc' defined here}}
+S2 b;                     // expected-note 3 {{'b' defined here}}
+const S2 ba[5];           // expected-note 2 {{'ba' defined here}}
+class S3 {
+  int a;
+
+public:
+  int b;
+  S3() : a(0) {}
+  S3(const S3 &s3) : a(s3.a) {}
+  S3 operator+(const S3 &arg1) { return arg1; }
+};
+int operator+(const S3 &arg1, const S3 &arg2) { return 5; }
+S3 c;               // expected-note 3 {{'c' defined here}}
+const S3 ca[5];     // expected-note 2 {{'ca' defined here}}
+extern const int f; // expected-note 4 {{'f' declared here}}
+class S4 {
+  int a;
+  S4(); // expected-note {{implicitly declared private here}}
+  S4(const S4 &s4);
+  S4 &operator+(const S4 &arg) { return (*this); }
+
+public:
+  S4(int v) : a(v) {}
+};
+S4 &operator&=(S4 &arg1, S4 &arg2) { return arg1; }
+class S5 {
+  int a;
+  S5() : a(0) {} // expected-note {{implicitly declared private here}}
+  S5(const S5 &s5) : a(s5.a) {}
+  S5 &operator+(const S5 &arg);
+
+public:
+  S5(int v) : a(v) {}
+};
+class S6 { // expected-note 3 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
+#if __cplusplus >= 201103L // C++11 or later
+// expected-note@-2 3 {{candidate function (the implicit move assignment operator) not viable}}
+#endif
+  int a;
+
+public:
+  S6() : a(6) {}
+  operator int() { return 6; }
+} o;
+
+S3 h, k;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class T>       // expected-note {{declared here}}
+T tmain(T argc) {
+  const T d = T();       // expected-note 4 {{'d' defined here}}
+  const T da[5] = {T()}; // expected-note 2 {{'da' defined here}}
+  T qa[5] = {T()};
+  T i;
+  T &j = i;                        // expected-note 4 {{'j' defined here}}
+  S3 &p = k;                       // expected-note 2 {{'p' defined here}}
+  const T &r = da[(int)i];         // expected-note 2 {{'r' defined here}}
+  T &q = qa[(int)i];               // expected-note 2 {{'q' defined here}}
+  T fl;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction // expected-error {{expected '(' after 'reduction'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction + // expected-error {{expected '(' after 'reduction'}} expected-warning {{extra tokens at the end of '#pragma omp distribute parallel for simd' are ignored}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction( // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(- // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction() // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(*) // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(\) // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(& : argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(| : argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(|| : argc ? i : argc) // expected-error 2 {{expected variable name, array element or array section}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(foo : argc) //expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'float'}} expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'int'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(&& : argc)
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(^ : T) // expected-error {{'T' does not refer to a value}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified list item cannot be reduction}} expected-error 2 {{'operator+' is a private member of 'S2'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 4 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 3 {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(max : h.b) // expected-error {{expected variable name, array element or array section}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}} expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(&& : S2::S2s) // expected-error {{shared variable cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(&& : S2::S2sc) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(+ : h, k) // expected-error {{threadprivate or thread local variable cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(+ : o) // expected-error 2 {{no viable overloaded '='}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(i), reduction(+ : j), reduction(+ : q) // expected-error 4 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel private(k)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(+ : p), reduction(+ : p) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(+ : p), reduction(+ : p) // expected-error 2 {{variable can appear only once in OpenMP 'reduction' clause}} expected-note 2 {{previously referenced here}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(+ : r) // expected-error 2 {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel shared(i)
+#pragma omp parallel reduction(min : i)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(max : j) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel private(fl)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(+ : fl)
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel reduction(* : fl)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(+ : fl)
+  for (int i = 0; i < 10; ++i)
+    foo();
+
+  return T();
+}
+
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
+int main(int argc, char **argv) {
+  const int d = 5;       // expected-note 2 {{'d' defined here}}
+  const int da[5] = {0}; // expected-note {{'da' defined here}}
+  int qa[5] = {0};
+  S4 e(4);
+  S5 g(5);
+  int i;
+  int &j = i;                      // expected-note 2 {{'j' defined here}}
+  S3 &p = k;                       // expected-note 2 {{'p' defined here}}
+  const int &r = da[i];            // expected-note {{'r' defined here}}
+  int &q = qa[i];                  // expected-note {{'q' defined here}}
+  float fl;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction // expected-error {{expected '(' after 'reduction'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction + // expected-error {{expected '(' after 'reduction'}} expected-warning {{extra tokens at the end of '#pragma omp distribute parallel for simd' are ignored}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction( // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(- // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction() // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(*) // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(\) // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(foo : argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(| : argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(|| : argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name, array element or array section}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(~ : argc) // expected-error {{expected unqualified-id}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(&& : argc)
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(^ : S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{const-qualified list item cannot be reduction}} expected-error {{'operator+' is a private member of 'S2'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 2 {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(max : h.b) // expected-error {{expected variable name, array element or array section}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(&& : S2::S2s) // expected-error {{shared variable cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(&& : S2::S2sc) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(& : e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{invalid operands to binary expression ('S4' and 'S4')}} expected-error {{calling a private constructor of class 'S5'}} expected-error {{invalid operands to binary expression ('S5' and 'S5')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(+ : h, k, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(+ : o) // expected-error {{no viable overloaded '='}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(i), reduction(+ : j), reduction(+ : q) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel private(k)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(+ : p), reduction(+ : p) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(+ : p), reduction(+ : p) // expected-error {{variable can appear only once in OpenMP 'reduction' clause}} expected-note {{previously referenced here}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(+ : r) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel shared(i)
+#pragma omp parallel reduction(min : i)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(max : j) // expected-error {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel private(fl)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(+ : fl)
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel reduction(* : fl)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(+ : fl)
+  for (int i = 0; i < 10; ++i)
+    foo();
+  static int m;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd reduction(+ : m) // OK
+  for (int i = 0; i < 10; ++i)
+    m++;
+
+  return tmain(argc) + tmain(fl); // expected-note {{in instantiation of function template specialization 'tmain<int>' requested here}} expected-note {{in instantiation of function template specialization 'tmain<float>' requested here}}
+}
diff --git a/test/OpenMP/distribute_parallel_for_simd_safelen_messages.cpp b/test/OpenMP/distribute_parallel_for_simd_safelen_messages.cpp
new file mode 100644
index 0000000000000..a5fd1aeb8436a
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_simd_safelen_messages.cpp
@@ -0,0 +1,177 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
+
+void foo() {
+}
+
+#if __cplusplus >= 201103L
+// expected-note@+2 4 {{declared here}}
+#endif
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, typename S, int N, int ST> // expected-note {{declared here}}
+T tmain(T argc, S **argv) { //expected-note 2 {{declared here}}
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd safelen // expected-error {{expected '(' after 'safelen'}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd safelen ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd safelen () // expected-error {{expected expression}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd safelen (argc  // expected-note {{to match this '('}} expected-error 2 {{expression is not an integral constant expression}} expected-note 2 {{read of non-const variable 'argc' is not allowed in a constant expression}} expected-error {{expected ')'}}
+  for (int i = ST; i < N; i++) 
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+  
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd safelen (ST // expected-error {{argument to 'safelen' clause must be a strictly positive integer value}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd safelen (1)) // expected-warning {{extra tokens at the end of '#pragma omp distribute parallel for simd' are ignored}}
+  for (int i = ST; i < N; i++)
+     argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd safelen ((ST > 0) ? 1 + ST : 2)
+  for (int i = ST; i < N; i++) 
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#if __cplusplus >= 201103L
+  // expected-note@+4 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd safelen (foobool(argc)), safelen (true), safelen (-5) // expected-error 2 {{directive '#pragma omp distribute parallel for simd' cannot contain more than one 'safelen' clause}} expected-error 2 {{argument to 'safelen' clause must be a strictly positive integer value}} expected-error 2 {{expression is not an integral constant expression}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd safelen (S) // expected-error {{'S' does not refer to a value}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#if __cplusplus <= 199711L
+  // expected-error@+6 2 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+4 2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd safelen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd safelen (4)
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd safelen (N) // expected-error {{argument to 'safelen' clause must be a strictly positive integer value}}
+  for (T i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+  return argc;
+}
+
+int main(int argc, char **argv) {
+#pragma omp target
+#pragma omp teams
+#pragma omp parallel for simd safelen // expected-error {{expected '(' after 'safelen'}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp parallel for simd safelen ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp parallel for simd safelen () // expected-error {{expected expression}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp parallel for simd safelen (4 // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp parallel for simd safelen (2+2)) // expected-warning {{extra tokens at the end of '#pragma omp parallel for simd' are ignored}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+  
+#if __cplusplus >= 201103L
+  // expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp parallel for simd safelen (foobool(1) > 0 ? 1 : 2) // expected-error {{expression is not an integral constant expression}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#if __cplusplus >= 201103L
+  // expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp parallel for simd safelen (foobool(argc)), safelen (true), safelen (-5) // expected-error 2 {{argument to 'safelen' clause must be a strictly positive integer value}} expected-error 2 {{directive '#pragma omp parallel for simd' cannot contain more than one 'safelen' clause}} expected-error {{expression is not an integral constant expression}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp parallel for simd safelen (S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#if __cplusplus <= 199711L
+  // expected-error@+6 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+4 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd safelen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+  // expected-note@+3 {{in instantiation of function template specialization 'tmain<int, char, -1, -2>' requested here}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd safelen(safelen(tmain<int, char, -1, -2>(argc, argv) // expected-error 2 {{expected ')'}} expected-note 2 {{to match this '('}}
+  foo(); // expected-error {{statement after '#pragma omp distribute parallel for simd' must be a for loop}}
+
+  // expected-note@+1 {{in instantiation of function template specialization 'tmain<int, char, 12, 4>' requested here}}
+  return tmain<int, char, 12, 4>(argc, argv);
+}
+
diff --git a/test/OpenMP/distribute_parallel_for_simd_schedule_messages.cpp b/test/OpenMP/distribute_parallel_for_simd_schedule_messages.cpp
new file mode 100644
index 0000000000000..b3003dd9f2a1b
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_simd_schedule_messages.cpp
@@ -0,0 +1,151 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, typename S, int N, int ST> // expected-note {{declared here}}
+T tmain(T argc, S **argv) {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule // expected-error {{expected '(' after 'schedule'}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule ( // expected-error {{expected 'static', 'dynamic', 'guided', 'auto', 'runtime', 'monotonic', 'nonmonotonic' or 'simd' in OpenMP clause 'schedule'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule () // expected-error {{expected 'static', 'dynamic', 'guided', 'auto', 'runtime', 'monotonic', 'nonmonotonic' or 'simd' in OpenMP clause 'schedule'}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule (auto // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule (auto_dynamic // expected-error {{expected 'static', 'dynamic', 'guided', 'auto', 'runtime', 'monotonic', 'nonmonotonic' or 'simd' in OpenMP clause 'schedule'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule (auto,  // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule (runtime, 3)  // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  // expected-error@+3 {{expected ')'}} expected-note@+3 {{to match this '('}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule (guided argc
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  // expected-error@+3 2 {{argument to 'schedule' clause must be a strictly positive integer value}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule (static, ST // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule (dynamic, 1)) // expected-warning {{extra tokens at the end of '#pragma omp distribute parallel for simd' are ignored}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule (guided, (ST > 0) ? 1 + ST : 2)
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  // expected-error@+4 2 {{directive '#pragma omp distribute parallel for simd' cannot contain more than one 'schedule' clause}}
+  // expected-error@+3 {{argument to 'schedule' clause must be a strictly positive integer value}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule (static, foobool(argc)), schedule (dynamic, true), schedule (guided, -5)
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule (static, S) // expected-error {{'S' does not refer to a value}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  // expected-error@+3 2 {{expression must have integral or unscoped enumeration type, not 'char *'}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule (guided, argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule (dynamic, 1)
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule (static, N) // expected-error {{argument to 'schedule' clause must be a strictly positive integer value}}
+  for (T i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  return argc;
+}
+
+int main(int argc, char **argv) {
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule // expected-error {{expected '(' after 'schedule'}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule ( // expected-error {{expected 'static', 'dynamic', 'guided', 'auto', 'runtime', 'monotonic', 'nonmonotonic' or 'simd' in OpenMP clause 'schedule'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule () // expected-error {{expected 'static', 'dynamic', 'guided', 'auto', 'runtime', 'monotonic', 'nonmonotonic' or 'simd' in OpenMP clause 'schedule'}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule (auto // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule (auto_dynamic // expected-error {{expected 'static', 'dynamic', 'guided', 'auto', 'runtime', 'monotonic', 'nonmonotonic' or 'simd' in OpenMP clause 'schedule'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule (auto,  // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule (runtime, 3)  // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule (guided, 4 // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule (static, 2+2)) // expected-warning {{extra tokens at the end of '#pragma omp distribute parallel for simd' are ignored}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule (dynamic, foobool(1) > 0 ? 1 : 2)
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  // expected-error@+4 2 {{directive '#pragma omp distribute parallel for simd' cannot contain more than one 'schedule' clause}}
+  // expected-error@+3 {{argument to 'schedule' clause must be a strictly positive integer value}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule (guided, foobool(argc)), schedule (static, true), schedule (dynamic, -5)
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule (guided, S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  // expected-error@+3 {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  #pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule (static, argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  // expected-error@+5 {{statement after '#pragma omp distribute parallel for simd' must be a for loop}}
+  // expected-note@+3 {{in instantiation of function template specialization 'tmain<int, char, -1, -2>' requested here}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd schedule(dynamic, schedule(tmain<int, char, -1, -2>(argc, argv) // expected-error 2 {{expected ')'}} expected-note 2 {{to match this '('}}
+  foo();
+  // expected-note@+1 {{in instantiation of function template specialization 'tmain<int, char, 1, 0>' requested here}}
+  return tmain<int, char, 1, 0>(argc, argv);
+}
+
diff --git a/test/OpenMP/distribute_parallel_for_simd_shared_messages.cpp b/test/OpenMP/distribute_parallel_for_simd_shared_messages.cpp
new file mode 100644
index 0000000000000..134b852579333
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_simd_shared_messages.cpp
@@ -0,0 +1,396 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
+
+
+struct S1; // expected-note 2 {{declared here}}
+extern S1 a;
+class S2 {
+  mutable int a;
+public:
+  S2():a(0) { }
+  S2(S2 &s2):a(s2.a) { }
+};
+const S2 b;
+const S2 ba[5];
+class S3 {
+  int a;
+public:
+  S3():a(0) { }
+  S3(S3 &s3):a(s3.a) { }
+};
+const S3 c;
+const S3 ca[5];
+extern const int f;
+class S4 {
+  int a;
+  S4();
+  S4(const S4 &s4);
+public:
+  S4(int v):a(v) { }
+};
+class S5 {
+  int a;
+  S5():a(0) {}
+  S5(const S5 &s5):a(s5.a) { }
+public:
+  S5(int v):a(v) { }
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note 2 {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
+template <class T, typename S, int N>
+T tmain(T argc, S **argv) {
+  const int d = 5;
+  const int da[5] = { 0 };
+  S4 e(4);
+  S5 g(5);
+  int i;
+  int &j = i;
+  int acc = 0;
+  int n = 1000;
+  
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared // expected-error {{expected '(' after 'shared'}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared () // expected-error {{expected expression}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared (argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared (argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared (argc)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared (S1) // expected-error {{'S1' does not refer to a value}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared (a, b, c, d, f)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared (argv[1]) // expected-error {{expected variable name}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared(ba)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared(ca)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared(da)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared(e, g)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be shared}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(i), shared(i) // expected-error {{private variable cannot be shared}} expected-note {{defined as private}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(i), shared(i) // expected-error {{firstprivate variable cannot be shared}} expected-note {{defined as firstprivate}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(i)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared(i)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared(j)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(i)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared(i)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared(j)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+return T();
+}
+
+
+int main(int argc, char **argv) {
+  const int d = 5;
+  const int da[5] = { 0 };
+  S4 e(4);
+  S5 g(5);
+  int i;
+  int &j = i;
+  int acc = 0;
+  int n = argc;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared // expected-error {{expected '(' after 'shared'}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared () // expected-error {{expected expression}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared (argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared (argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared (argc)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared (S1) // expected-error {{'S1' does not refer to a value}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared (a, b, c, d, f)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared (argv[1]) // expected-error {{expected variable name}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared(ba)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared(ca)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared(da)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared(e, g)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be shared}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(i), shared(i) // expected-error {{private variable cannot be shared}} expected-note {{defined as private}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(i), shared(i) // expected-error {{firstprivate variable cannot be shared}} expected-note {{defined as firstprivate}}
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd private(i)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared(i)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared(j)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd firstprivate(i)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared(i)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd shared(j)
+  for(int k = 0 ; k < n ; k++) {
+    acc++;
+  }
+
+return tmain<int, char, 1000>(argc, argv); // expected-note {{in instantiation of function template specialization 'tmain<int, char, 1000>' requested here}}
+}
diff --git a/test/OpenMP/distribute_parallel_for_simd_simdlen_messages.cpp b/test/OpenMP/distribute_parallel_for_simd_simdlen_messages.cpp
new file mode 100644
index 0000000000000..2d813ec232adc
--- /dev/null
+++ b/test/OpenMP/distribute_parallel_for_simd_simdlen_messages.cpp
@@ -0,0 +1,181 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
+
+void foo() {
+}
+
+#if __cplusplus >= 201103L
+// expected-note@+2 4 {{declared here}}
+#endif
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, typename S, int N, int ST> // expected-note {{declared here}}
+T tmain(T argc, S **argv) { //expected-note 2 {{declared here}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd simdlen // expected-error {{expected '(' after 'simdlen'}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd simdlen ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd simdlen () // expected-error {{expected expression}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+// expected-error@+5 {{expected ')'}} expected-note@+5 {{to match this '('}}
+// expected-error@+4 2 {{expression is not an integral constant expression}}
+// expected-note@+3 2 {{read of non-const variable 'argc' is not allowed in a constant expression}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd simdlen (argc 
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+// expected-error@+3 {{argument to 'simdlen' clause must be a strictly positive integer value}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd simdlen (ST // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd simdlen (1)) // expected-warning {{extra tokens at the end of '#pragma omp distribute parallel for simd' are ignored}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd simdlen ((ST > 0) ? 1 + ST : 2)
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#if __cplusplus >= 201103L
+  // expected-note@+4 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd simdlen (foobool(argc)), simdlen (true), simdlen (-5) // expected-error 2 {{directive '#pragma omp distribute parallel for simd' cannot contain more than one 'simdlen' clause}} expected-error 2 {{argument to 'simdlen' clause must be a strictly positive integer value}} expected-error 2 {{expression is not an integral constant expression}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd simdlen (S) // expected-error {{'S' does not refer to a value}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#if __cplusplus <= 199711L
+  // expected-error@+6 2 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+4 2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd simdlen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd simdlen (4)
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd simdlen (N) // expected-error {{argument to 'simdlen' clause must be a strictly positive integer value}}
+  for (T i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+  return argc;
+}
+
+int main(int argc, char **argv) {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd simdlen // expected-error {{expected '(' after 'simdlen'}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd simdlen ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd simdlen () // expected-error {{expected expression}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd simdlen (4 // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd simdlen (2+2)) // expected-warning {{extra tokens at the end of '#pragma omp distribute parallel for simd' are ignored}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#if __cplusplus >= 201103L
+  // expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd simdlen (foobool(1) > 0 ? 1 : 2) // expected-error {{expression is not an integral constant expression}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+
+#if __cplusplus >= 201103L
+  // expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd simdlen (foobool(argc)), simdlen (true), simdlen (-5) // expected-error {{expression is not an integral constant expression}} expected-error 2 {{directive '#pragma omp distribute parallel for simd' cannot contain more than one 'simdlen' clause}} expected-error 2 {{argument to 'simdlen' clause must be a strictly positive integer value}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd simdlen (S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#if __cplusplus <= 199711L
+  // expected-error@+6 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+4 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd simdlen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd simdlen(simdlen(tmain<int, char, -1, -2>(argc, argv) // expected-error 2 {{expected ')'}} expected-note 2 {{to match this '('}} expected-note {{in instantiation of function template specialization 'tmain<int, char, -1, -2>' requested here}}
+  foo(); // expected-error {{statement after '#pragma omp distribute parallel for simd' must be a for loop}}
+
+  // expected-note@+1 {{in instantiation of function template specialization 'tmain<int, char, 12, 4>' requested here}}
+  return tmain<int, char, 12, 4>(argc, argv);
+}
+
diff --git a/test/OpenMP/distribute_private_messages.cpp b/test/OpenMP/distribute_private_messages.cpp
index 94ba4659c6841..518b64d9e9ca0 100644
--- a/test/OpenMP/distribute_private_messages.cpp
+++ b/test/OpenMP/distribute_private_messages.cpp
@@ -98,6 +98,7 @@ int main(int argc, char **argv) {
   #pragma omp target
   #pragma omp teams firstprivate(i)
   #pragma omp parallel private(i)
+  {}
   #pragma omp target
   #pragma omp teams reduction(+:i)
   #pragma omp distribute private(i)
@@ -113,20 +114,20 @@ int main(int argc, char **argv) {
   #pragma omp teams
   #pragma omp distribute firstprivate(i)
   for (int k = 0; k < 10; ++k) {
-    #pragma omp target
-    #pragma omp teams firstprivate(i)
-    #pragma omp distribute private(i)
-    for (int x = 0; x < 10; ++x) foo();
   }
   #pragma omp target
+  #pragma omp teams firstprivate(i)
+  #pragma omp distribute private(i)
+  for (int x = 0; x < 10; ++x) foo();
+  #pragma omp target
   #pragma omp teams reduction(+:i)
   #pragma omp distribute
   for (int k = 0; k < 10; ++k) {
-    #pragma omp target
-    #pragma omp teams reduction(+:i)
-    #pragma omp distribute private(i)
-    for (int x = 0; x < 10; ++x) foo();
   }
+  #pragma omp target
+  #pragma omp teams reduction(+:i)
+  #pragma omp distribute private(i)
+  for (int x = 0; x < 10; ++x) foo();
 
   return 0;
 }
diff --git a/test/OpenMP/distribute_simd_aligned_messages.cpp b/test/OpenMP/distribute_simd_aligned_messages.cpp
new file mode 100644
index 0000000000000..59e5be271d0bd
--- /dev/null
+++ b/test/OpenMP/distribute_simd_aligned_messages.cpp
@@ -0,0 +1,306 @@
+// RUN: %clang_cc1 -x c++ -std=c++11 -verify -fopenmp %s
+
+struct B {
+  static int ib[20]; // expected-note 0 {{'B::ib' declared here}}
+  static constexpr int bfoo() { return 8; }
+};
+namespace X {
+  B x; // expected-note {{'x' defined here}}
+};
+constexpr int bfoo() { return 4; }
+
+int **z;
+const int C1 = 1;
+const int C2 = 2;
+void test_aligned_colons(int *&rp)
+{
+  int *B = 0;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned(B:bfoo())
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned(B::ib:B:bfoo()) // expected-error {{unexpected ':' in nested name specifier; did you mean '::'}}
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned(B:B::bfoo())
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned(z:B:bfoo()) // expected-error {{unexpected ':' in nested name specifier; did you mean '::'?}}
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned(B:B::bfoo())
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned(X::x : ::z) // expected-error {{integral constant expression must have integral or unscoped enumeration type, not 'int **'}} expected-error {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'B'}}
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned(B,rp,::z: X::x) // expected-error {{integral constant expression must have integral or unscoped enumeration type, not 'B'}}
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned(::z)
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp distribute simd aligned(B::bfoo()) // expected-error {{expected variable name}}
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned(B::ib,B:C1+C2) // expected-warning {{aligned clause will be ignored because the requested alignment is not a power of 2}}
+  for (int i = 0; i < 10; ++i) ;
+}
+
+// expected-note@+1 {{'num' defined here}}
+template<int L, class T, class N> T test_template(T* arr, N num) {
+  N i;
+  T sum = (T)0;
+  T ind2 = - num * L;
+  // Negative number is passed as L.
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned(arr:L) // expected-error {{argument to 'aligned' clause must be a strictly positive integer value}}
+  for (i = 0; i < num; ++i) {
+    T cur = arr[(int)ind2];
+    ind2 += L;
+    sum += cur;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned(num:4) // expected-error {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'int'}}
+  for (i = 0; i < num; ++i);
+
+  return T();
+}
+
+template<int LEN> int test_warn() {
+  int *ind2 = 0;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned(ind2:LEN) // expected-error {{argument to 'aligned' clause must be a strictly positive integer value}}
+  for (int i = 0; i < 100; i++) {
+    ind2 += LEN;
+  }
+  return 0;
+}
+
+struct S1; // expected-note 2 {{declared here}}
+extern S1 a; // expected-note {{'a' declared here}}
+class S2 {
+  mutable int a;
+public:
+  S2():a(0) { }
+};
+const S2 b; // expected-note 1 {{'b' defined here}}
+const S2 ba[5];
+class S3 {
+  int a;
+public:
+  S3():a(0) { }
+};
+const S3 ca[5];
+class S4 {
+  int a;
+  S4();
+public:
+  S4(int v):a(v) { }
+};
+class S5 {
+  int a;
+  S5():a(0) {}
+public:
+  S5(int v):a(v) { }
+};
+
+S3 h; // expected-note 2 {{'h' defined here}}
+#pragma omp threadprivate(h)
+
+template<class I, class C> int foomain(I argc, C **argv) {
+  I e(argc);
+  I g(argc);
+  int i; // expected-note {{declared here}} expected-note {{'i' defined here}}
+  // expected-note@+2 {{declared here}}
+  // expected-note@+1 {{reference to 'i' is not a constant expression}}
+  int &j = i;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned // expected-error {{expected '(' after 'aligned'}}
+  for (I k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (I k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned () // expected-error {{expected expression}}
+  for (I k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (I k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned (argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (I k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned (argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (I k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned (argc : 5) // expected-warning {{aligned clause will be ignored because the requested alignment is not a power of 2}}
+  for (I k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned (S1) // expected-error {{'S1' does not refer to a value}}
+  for (I k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned (argv[1]) // expected-error {{expected variable name}}
+  for (I k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned(e, g)
+  for (I k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned(h) // expected-error {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'S3'}}
+  for (I k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned(i) // expected-error {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'int'}}
+  for (I k = 0; k < argc; ++k) ++k;
+
+  #pragma omp parallel
+  {
+    int *v = 0;
+    I i;
+    #pragma omp target
+    #pragma omp teams
+    #pragma omp distribute simd aligned(v:16)
+      for (I k = 0; k < argc; ++k) { i = k; v += 2; }
+  }
+  float *f;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned(f)
+  for (I k = 0; k < argc; ++k) ++k;
+
+  int v = 0;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned(f:j) // expected-note {{initializer of 'j' is not a constant expression}} expected-error {{expression is not an integral constant expression}}
+
+  for (I k = 0; k < argc; ++k) { ++k; v += j; }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned(f)
+  for (I k = 0; k < argc; ++k) ++k;
+
+  return 0;
+}
+
+// expected-note@+1 2 {{'argc' defined here}}
+int main(int argc, char **argv) {
+  double darr[100];
+  // expected-note@+1 {{in instantiation of function template specialization 'test_template<-4, double, int>' requested here}}
+  test_template<-4>(darr, 4);
+  test_warn<4>(); // ok
+  // expected-note@+1 {{in instantiation of function template specialization 'test_warn<0>' requested here}}
+  test_warn<0>();
+
+  int i;
+  int &j = i;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned // expected-error {{expected '(' after 'aligned'}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned () // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned (argv // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned (argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'int'}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned (argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned (argc) // expected-error {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'int'}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned (S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned (a, b) // expected-error {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'S1'}} expected-error {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'S2'}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned (argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned(h)  // expected-error {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'S3'}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+  int *pargc = &argc;
+  // expected-note@+1 {{in instantiation of function template specialization 'foomain<int *, char>' requested here}}
+  foomain<int*,char>(pargc,argv);
+  return 0;
+}
+
diff --git a/test/OpenMP/distribute_simd_ast_print.cpp b/test/OpenMP/distribute_simd_ast_print.cpp
new file mode 100644
index 0000000000000..04358150b994a
--- /dev/null
+++ b/test/OpenMP/distribute_simd_ast_print.cpp
@@ -0,0 +1,146 @@
+// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+struct S {
+  S(): a(0) {}
+  S(int v) : a(v) {}
+  int a;
+  typedef int type;
+};
+
+template <typename T>
+class S7 : public T {
+protected:
+  T a;
+  S7() : a(0) {}
+
+public:
+  S7(typename T::type v) : a(v) {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(a) private(this->a) private(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S7 &operator=(S7 &s) {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(a) private(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+// CHECK: #pragma omp distribute simd private(this->a) private(this->a) private(this->S::a)
+// CHECK: #pragma omp distribute simd private(this->a) private(this->a) private(T::a)
+// CHECK: #pragma omp distribute simd private(this->a) private(this->a)
+
+class S8 : public S7<S> {
+  S8() {}
+
+public:
+  S8(int v) : S7<S>(v){
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(a) private(this->a) private(S7<S>::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S8 &operator=(S8 &s) {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(a) private(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+// CHECK: #pragma omp distribute simd private(this->a) private(this->a) private(this->S7<S>::a)
+// CHECK: #pragma omp distribute simd private(this->a) private(this->a)
+
+template <class T, int N>
+T tmain(T argc) {
+  T b = argc, c, d, e, f, h;
+  static T a;
+// CHECK: static T a;
+  static T g;
+#pragma omp threadprivate(g)
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd dist_schedule(static, a) firstprivate(a)
+  for (int i = 0; i < 2; ++i)
+    a = 2;
+// CHECK: #pragma omp distribute simd dist_schedule(static, a) firstprivate(a)
+// CHECK-NEXT: for (int i = 0; i < 2; ++i)
+// CHECK-NEXT: a = 2;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(argc, b), firstprivate(c, d), lastprivate(d, f) collapse(N) reduction(+ : h) dist_schedule(static,N)
+  for (int i = 0; i < 2; ++i)
+    for (int j = 0; j < 2; ++j)
+      for (int k = 0; k < 10; ++k)
+        for (int m = 0; m < 10; ++m)
+          for (int n = 0; n < 10; ++n)
+            a++;
+// CHECK: #pragma omp distribute simd private(argc,b) firstprivate(c,d) lastprivate(d,f) collapse(N) reduction(+: h) dist_schedule(static, N)
+// CHECK-NEXT: for (int i = 0; i < 2; ++i)
+// CHECK-NEXT: for (int j = 0; j < 2; ++j)
+// CHECK-NEXT: for (int k = 0; k < 10; ++k)
+// CHECK-NEXT: for (int m = 0; m < 10; ++m)
+// CHECK-NEXT: for (int n = 0; n < 10; ++n)
+// CHECK-NEXT: a++;
+  return T();
+}
+
+int main(int argc, char **argv) {
+  int b = argc, c, d, e, f, h;
+  int x[200];
+  static int a;
+// CHECK: static int a;
+  static float g;
+#pragma omp threadprivate(g)
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd dist_schedule(static, a) private(a)
+  for (int i = 0; i < 2; ++i)
+    a = 2;
+// CHECK: #pragma omp distribute simd  dist_schedule(static, a) private(a)
+// CHECK-NEXT: for (int i = 0; i < 2; ++i)
+// CHECK-NEXT: a = 2;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(argc, b), firstprivate(argv, c), lastprivate(d, f) collapse(2) reduction(+ : h) dist_schedule(static, b)
+  for (int i = 0; i < 10; ++i)
+    for (int j = 0; j < 10; ++j)
+            a++;
+// CHECK: #pragma omp distribute simd private(argc,b) firstprivate(argv,c) lastprivate(d,f) collapse(2) reduction(+: h) dist_schedule(static, b)
+// CHECK-NEXT: for (int i = 0; i < 10; ++i)
+// CHECK-NEXT: for (int j = 0; j < 10; ++j)
+// CHECK-NEXT: a++;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned(x:8) linear(h:2) safelen(8) simdlen(8)
+  for (int i = 0; i < 100; i++)
+    for (int j = 0; j < 200; j++)
+      a += h + x[j];
+// CHECK: #pragma omp distribute simd aligned(x: 8) linear(h: 2) safelen(8) simdlen(8)
+// CHECK-NEXT: for (int i = 0; i < 100; i++)
+// CHECK-NEXT: for (int j = 0; j < 200; j++)
+// CHECK-NEXT: a += h + x[j];
+
+  return (tmain<int, 5>(argc) + tmain<char, 1>(argv[0][0]));
+}
+
+#endif
diff --git a/test/OpenMP/distribute_simd_collapse_messages.cpp b/test/OpenMP/distribute_simd_collapse_messages.cpp
new file mode 100644
index 0000000000000..182a09a92b6c8
--- /dev/null
+++ b/test/OpenMP/distribute_simd_collapse_messages.cpp
@@ -0,0 +1,154 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
+
+void foo() {
+}
+
+#if __cplusplus >= 201103L
+// expected-note@+2 4 {{declared here}}
+#endif
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, typename S, int N, int ST> // expected-note {{declared here}}
+T tmain(T argc, S **argv) { //expected-note 2 {{declared here}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd collapse // expected-error {{expected '(' after 'collapse'}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd collapse ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd collapse () // expected-error {{expected expression}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+  // expected-error@+3 {{expected ')'}} expected-note@+3 {{to match this '('}}
+  // expected-error@+2 2 {{expression is not an integral constant expression}}
+  // expected-note@+1 2 {{read of non-const variable 'argc' is not allowed in a constant expression}}
+#pragma omp distribute simd collapse (argc 
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+  // expected-error@+1 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
+#pragma omp distribute simd collapse (ST // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd collapse (1)) // expected-warning {{extra tokens at the end of '#pragma omp distribute simd' are ignored}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd collapse ((ST > 0) ? 1 + ST : 2) // expected-note 2 {{as specified in 'collapse' clause}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST]; // expected-error 2 {{expected 2 for loops after '#pragma omp distribute simd', but found only 1}}
+  // expected-error@+8 2 {{directive '#pragma omp distribute simd' cannot contain more than one 'collapse' clause}}
+  // expected-error@+7 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
+  // expected-error@+6 2 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+4 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd collapse (foobool(argc)), collapse (true), collapse (-5)
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd collapse (S) // expected-error {{'S' does not refer to a value}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#if __cplusplus <= 199711L
+  // expected-error@+6 2 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+4 2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd collapse (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd collapse (1)
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd collapse (N) // expected-error {{argument to 'collapse' clause must be a strictly positive integer value}}
+  for (T i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd collapse (2) // expected-note {{as specified in 'collapse' clause}}
+  foo(); // expected-error {{expected 2 for loops after '#pragma omp distribute simd'}}
+  return argc;
+}
+
+int main(int argc, char **argv) {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd collapse // expected-error {{expected '(' after 'collapse'}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd collapse ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd collapse () // expected-error {{expected expression}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd collapse (4 // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-note {{as specified in 'collapse' clause}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; // expected-error {{expected 4 for loops after '#pragma omp distribute simd', but found only 1}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd collapse (2+2)) // expected-warning {{extra tokens at the end of '#pragma omp distribute simd' are ignored}}  expected-note {{as specified in 'collapse' clause}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; // expected-error {{expected 4 for loops after '#pragma omp distribute simd', but found only 1}}
+  // expected-error@+6 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd collapse (foobool(1) > 0 ? 1 : 2)
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  // expected-error@+8 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+6{{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+  // expected-error@+4 2 {{directive '#pragma omp distribute simd' cannot contain more than one 'collapse' clause}}
+  // expected-error@+3 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd collapse (foobool(argc)), collapse (true), collapse (-5) 
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd collapse (S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+#if __cplusplus <= 199711L
+  // expected-error@+6 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+4 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd collapse (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  // expected-error@+5 {{statement after '#pragma omp distribute simd' must be a for loop}}
+  // expected-note@+3 {{in instantiation of function template specialization 'tmain<int, char, -1, -2>' requested here}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd collapse(collapse(tmain<int, char, -1, -2>(argc, argv) // expected-error 2 {{expected ')'}} expected-note 2 {{to match this '('}}
+  foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd collapse (2) // expected-note {{as specified in 'collapse' clause}}
+  foo(); // expected-error {{expected 2 for loops after '#pragma omp distribute simd'}}
+  // expected-note@+1 {{in instantiation of function template specialization 'tmain<int, char, 1, 0>' requested here}}
+  return tmain<int, char, 1, 0>(argc, argv);
+}
+
diff --git a/test/OpenMP/distribute_simd_dist_schedule_messages.cpp b/test/OpenMP/distribute_simd_dist_schedule_messages.cpp
new file mode 100644
index 0000000000000..6a8482d8be09b
--- /dev/null
+++ b/test/OpenMP/distribute_simd_dist_schedule_messages.cpp
@@ -0,0 +1,120 @@
+// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify -fopenmp -ferror-limit 100 -o - %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}} expected-note {{declared here}}
+
+template <class T, int N>
+T tmain(T argc) {
+  T b = argc, c, d, e, f, g;
+  char ** argv;
+  static T a;
+// CHECK: static T a;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd dist_schedule // expected-error {{expected '(' after 'dist_schedule'}}
+  for (int i = 0; i < 10; ++i) foo();
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd dist_schedule ( // expected-error {{expected 'static' in OpenMP clause 'dist_schedule'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i) foo();
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd dist_schedule () // expected-error {{expected 'static' in OpenMP clause 'dist_schedule'}}
+  for (int i = 0; i < 10; ++i) foo();
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd dist_schedule (static // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i) foo();
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd dist_schedule (static, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i) foo();
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd dist_schedule (argc)) // expected-error {{expected 'static' in OpenMP clause 'dist_schedule'}} expected-warning {{extra tokens at the end of '#pragma omp distribute simd' are ignored}}
+  for (int i = 0; i < 10; ++i) foo();
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd dist_schedule (static, argc > 0 ? argv[1] : argv[2]) // expected-error2 {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  for (int i = 0; i < 10; ++i) foo();
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd dist_schedule (static), dist_schedule (static, 1) // expected-error {{directive '#pragma omp distribute simd' cannot contain more than one 'dist_schedule' clause}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd dist_schedule (static, S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 0; i < 10; ++i) foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd dist_schedule (static, argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error3 {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  for (int i = 0; i < 10; ++i) foo();
+  return T();
+}
+
+int main(int argc, char **argv) {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd dist_schedule // expected-error {{expected '(' after 'dist_schedule'}}
+  for (int i = 0; i < 10; ++i) foo();
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd dist_schedule ( // expected-error {{expected 'static' in OpenMP clause 'dist_schedule'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i) foo();
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd dist_schedule () // expected-error {{expected 'static' in OpenMP clause 'dist_schedule'}}
+  for (int i = 0; i < 10; ++i) foo();
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd dist_schedule (static // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i) foo();
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd dist_schedule (static, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i) foo();
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd dist_schedule (argc)) // expected-error {{expected 'static' in OpenMP clause 'dist_schedule'}} expected-warning {{extra tokens at the end of '#pragma omp distribute simd' are ignored}}
+  for (int i = 0; i < 10; ++i) foo();
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd dist_schedule (static, argc > 0 ? argv[1] : argv[2]) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  for (int i = 0; i < 10; ++i) foo();
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd dist_schedule (static), dist_schedule (static, 1) // expected-error {{directive '#pragma omp distribute simd' cannot contain more than one 'dist_schedule' clause}}
+  for (int i = 0; i < 10; ++i) foo();
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd dist_schedule (static, S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 0; i < 10; ++i) foo();
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd dist_schedule (static, argv[1]=2) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i) foo();
+  return (tmain<int, 5>(argc) + tmain<char, 1>(argv[0][0])); // expected-note {{in instantiation of function template specialization 'tmain<int, 5>' requested here}} expected-note {{in instantiation of function template specialization 'tmain<char, 1>' requested here}}
+}
diff --git a/test/OpenMP/distribute_simd_firstprivate_messages.cpp b/test/OpenMP/distribute_simd_firstprivate_messages.cpp
new file mode 100644
index 0000000000000..b9267a3037f0b
--- /dev/null
+++ b/test/OpenMP/distribute_simd_firstprivate_messages.cpp
@@ -0,0 +1,359 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note 2 {{declared here}} expected-note 2 {{forward declaration of 'S1'}}
+extern S1 a;
+class S2 {
+  mutable int a;
+
+public:
+  S2() : a(0) {}
+  S2(const S2 &s2) : a(s2.a) {}
+  static float S2s;
+  static const float S2sc;
+};
+const float S2::S2sc = 0;
+const S2 b;
+const S2 ba[5];
+class S3 {
+  int a;
+  S3 &operator=(const S3 &s3);
+
+public:
+  S3() : a(0) {}
+  S3(const S3 &s3) : a(s3.a) {}
+};
+const S3 c;
+const S3 ca[5];
+extern const int f;
+class S4 {
+  int a;
+  S4();
+  S4(const S4 &s4); // expected-note 2 {{implicitly declared private here}}
+
+public:
+  S4(int v) : a(v) {}
+};
+class S5 {
+  int a;
+  S5(const S5 &s5) : a(s5.a) {} // expected-note 4 {{implicitly declared private here}}
+
+public:
+  S5() : a(0) {}
+  S5(int v) : a(v) {}
+};
+class S6 {
+  int a;
+  S6() : a(0) {}
+
+public:
+  S6(const S6 &s6) : a(s6.a) {}
+  S6(int v) : a(v) {}
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class I, class C>
+int foomain(int argc, char **argv) {
+  I e(4);
+  C g(5);
+  int i;
+  int &j = i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate // expected-error {{expected '(' after 'firstprivate'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate() // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(argc)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(a, b) // expected-error {{firstprivate variable with incomplete type 'S1'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(h) // expected-error {{threadprivate or thread local variable cannot be firstprivate}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp parallel
+  {
+    int v = 0;
+    int i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(i)
+    for (int k = 0; k < argc; ++k) {
+      i = k;
+      v += i;
+    }
+  }
+#pragma omp parallel shared(i)
+#pragma omp parallel private(i)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(j)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(i)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(g) firstprivate(g) // expected-error {{calling a private constructor of class 'S5'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp parallel private(i)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(i) // expected-note {{defined as firstprivate}}
+  for (i = 0; i < argc; ++i) // expected-error {{loop iteration variable in the associated loop of 'omp distribute simd' directive may not be firstprivate, predetermined as linear}}
+    foo();
+#pragma omp parallel reduction(+ : i)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(i) // expected-note {{defined as firstprivate}}
+  for (i = 0; i < argc; ++i) // expected-error {{loop iteration variable in the associated loop of 'omp distribute simd' directive may not be firstprivate, predetermined as linear}}
+    foo();
+  return 0;
+}
+
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
+int main(int argc, char **argv) {
+  const int d = 5;
+  const int da[5] = {0};
+  S4 e(4);
+  S5 g(5);
+  S3 m;
+  S6 n(2);
+  int i;
+  int &j = i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate // expected-error {{expected '(' after 'firstprivate'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate() // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(argc)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(a, b, c, d, f) // expected-error {{firstprivate variable with incomplete type 'S1'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(argv[1]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(2 * 2) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(ba) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(ca) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(da) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+  int xa;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(xa) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(S2::S2s) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(S2::S2sc) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd safelen(5) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(m) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be firstprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(xa), firstprivate(xa) // expected-error {{private variable cannot be firstprivate}} expected-note {{defined as private}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(i) // expected-note {{defined as firstprivate}}
+  for (i = 0; i < argc; ++i) // expected-error {{loop iteration variable in the associated loop of 'omp distribute simd' directive may not be firstprivate, predetermined as linear}}
+    foo();
+#pragma omp parallel shared(xa)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(xa) // OK: may be firstprivate
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(j)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(g) firstprivate(g) // expected-error {{calling a private constructor of class 'S5'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(n) firstprivate(n) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp parallel
+  {
+    int v = 0;
+    int i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(i)
+    for (int k = 0; k < argc; ++k) {
+      i = k;
+      v += i;
+    }
+  }
+#pragma omp parallel private(i)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(i) // expected-note {{defined as firstprivate}}
+  for (i = 0; i < argc; ++i) // expected-error {{loop iteration variable in the associated loop of 'omp distribute simd' directive may not be firstprivate, predetermined as linear}}
+    foo();
+#pragma omp parallel reduction(+ : i)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(i) // expected-note {{defined as firstprivate}}
+  for (i = 0; i < argc; ++i) // expected-error {{loop iteration variable in the associated loop of 'omp distribute simd' directive may not be firstprivate, predetermined as linear}}
+    foo();
+  static int si;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(si) // OK
+  for (i = 0; i < argc; ++i)
+    si = i + 1;
+
+  return foomain<S4, S5>(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<S4, S5>' requested here}}
+}
diff --git a/test/OpenMP/distribute_simd_lastprivate_messages.cpp b/test/OpenMP/distribute_simd_lastprivate_messages.cpp
new file mode 100644
index 0000000000000..0f96cb4c3ec82
--- /dev/null
+++ b/test/OpenMP/distribute_simd_lastprivate_messages.cpp
@@ -0,0 +1,333 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note 2 {{declared here}} expected-note 2 {{forward declaration of 'S1'}}
+extern S1 a;
+class S2 {
+  mutable int a;
+
+public:
+  S2() : a(0) {}
+  S2(S2 &s2) : a(s2.a) {}
+  const S2 &operator =(const S2&) const;
+  S2 &operator =(const S2&);
+  static float S2s; // expected-note {{static data member is predetermined as shared}}
+  static const float S2sc;
+};
+const float S2::S2sc = 0; // expected-note {{static data member is predetermined as shared}}
+const S2 b;
+const S2 ba[5];
+class S3 {
+  int a;
+  S3 &operator=(const S3 &s3); // expected-note 2 {{implicitly declared private here}}
+
+public:
+  S3() : a(0) {}
+  S3(S3 &s3) : a(s3.a) {}
+};
+const S3 c;         // expected-note {{global variable is predetermined as shared}}
+const S3 ca[5];     // expected-note {{global variable is predetermined as shared}}
+extern const int f; // expected-note {{global variable is predetermined as shared}}
+class S4 {
+  int a;
+  S4();             // expected-note 3 {{implicitly declared private here}}
+  S4(const S4 &s4);
+
+public:
+  S4(int v) : a(v) {}
+};
+class S5 {
+  int a;
+  S5() : a(0) {} // expected-note {{implicitly declared private here}}
+
+public:
+  S5(const S5 &s5) : a(s5.a) {}
+  S5(int v) : a(v) {}
+};
+class S6 {
+  int a;
+  S6() : a(0) {}
+
+public:
+  S6(const S6 &s6) : a(s6.a) {}
+  S6(int v) : a(v) {}
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class I, class C>
+int foomain(int argc, char **argv) {
+  I e(4);
+  I g(5);
+  int i;
+  int &j = i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate // expected-error {{expected '(' after 'lastprivate'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate() // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(argc)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(a, b) // expected-error {{lastprivate variable with incomplete type 'S1'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(e, g) // expected-error 2 {{calling a private constructor of class 'S4'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(h) // expected-error {{threadprivate or thread local variable cannot be lastprivate}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+
+  int v = 0;
+#pragma omp target
+#pragma omp teams
+  {
+#pragma omp distribute simd lastprivate(i)
+    for (int k = 0; k < argc; ++k) {
+      i = k;
+      v += i;
+    }
+  }
+#pragma omp target
+#pragma omp teams private(i)
+#pragma omp distribute simd lastprivate(j)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(i)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+  return 0;
+}
+
+void bar(S4 a[2]) {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(a)
+  for (int i = 0; i < 2; ++i)
+    foo();
+}
+
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
+int main(int argc, char **argv) {
+  const int d = 5;       // expected-note {{constant variable is predetermined as shared}}
+  const int da[5] = {0}; // expected-note {{constant variable is predetermined as shared}}
+  S4 e(4);
+  S5 g(5);
+  S3 m;
+  S6 n(2);
+  int i;
+  int &j = i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate // expected-error {{expected '(' after 'lastprivate'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate() // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(argc)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(a, b, c, d, f) // expected-error {{lastprivate variable with incomplete type 'S1'}} expected-error 3 {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(argv[1]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(2 * 2) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(ba)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(ca) // expected-error {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(da) // expected-error {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+  int xa;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(xa) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(S2::S2s) // expected-error {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(S2::S2sc) // expected-error {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd safelen(5) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(m) // expected-error {{'operator=' is a private member of 'S3'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(h) // expected-error {{threadprivate or thread local variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(B::x) // expected-error {{threadprivate or thread local variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(xa), lastprivate(xa) // expected-error {{private variable cannot be lastprivate}} expected-note {{defined as private}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(i) // expected-note {{defined as lastprivate}}
+  for (i = 0; i < argc; ++i) // expected-error{{loop iteration variable in the associated loop of 'omp distribute simd' directive may not be lastprivate, predetermined as linear}}
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(xa)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(xa)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(j)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd firstprivate(m) lastprivate(m) // expected-error {{'operator=' is a private member of 'S3'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(n) firstprivate(n) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+  static int si;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(si) // OK
+  for (i = 0; i < argc; ++i)
+    si = i + 1;
+  return foomain<S4, S5>(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<S4, S5>' requested here}}
+}
diff --git a/test/OpenMP/distribute_simd_linear_messages.cpp b/test/OpenMP/distribute_simd_linear_messages.cpp
new file mode 100644
index 0000000000000..c60e0a2470710
--- /dev/null
+++ b/test/OpenMP/distribute_simd_linear_messages.cpp
@@ -0,0 +1,338 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+namespace X {
+  int x;
+};
+
+struct B {
+  static int ib; // expected-note {{'B::ib' declared here}}
+  static int bfoo() { return 8; }
+};
+
+int bfoo() { return 4; }
+
+int z;
+const int C1 = 1;
+const int C2 = 2;
+void test_linear_colons()
+{
+  int B = 0;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear(B:bfoo())
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear(B::ib:B:bfoo()) // expected-error {{unexpected ':' in nested name specifier; did you mean '::'}}
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear(B:ib) // expected-error {{use of undeclared identifier 'ib'; did you mean 'B::ib'}}
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear(z:B:ib) // expected-error {{unexpected ':' in nested name specifier; did you mean '::'?}}
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear(B:B::bfoo())
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear(X::x : ::z)
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear(B,::z, X::x)
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear(::z)
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear(B::bfoo()) // expected-error {{expected variable name}}
+  for (int i = 0; i < 10; ++i) ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear(B::ib,B:C1+C2)
+  for (int i = 0; i < 10; ++i) ;
+}
+
+template<int L, class T, class N> T test_template(T* arr, N num) {
+  N i;
+  T sum = (T)0;
+  T ind2 = - num * L; // expected-note {{'ind2' defined here}}
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear(ind2:L) // expected-error {{argument of a linear clause should be of integral or pointer type}}
+  for (i = 0; i < num; ++i) {
+    T cur = arr[(int)ind2];
+    ind2 += L;
+    sum += cur;
+  }
+  return T();
+}
+
+template<int LEN> int test_warn() {
+  int ind2 = 0;
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp parallel for simd linear(ind2:LEN) // expected-warning {{zero linear step (ind2 should probably be const)}}
+  for (int i = 0; i < 100; i++) {
+    ind2 += LEN;
+  }
+  return ind2;
+}
+
+struct S1; // expected-note 2 {{declared here}} expected-note 2 {{forward declaration of 'S1'}}
+extern S1 a;
+class S2 {
+  mutable int a;
+public:
+  S2():a(0) { }
+};
+const S2 b; // expected-note 2 {{'b' defined here}}
+const S2 ba[5];
+class S3 {
+  int a;
+public:
+  S3():a(0) { }
+};
+const S3 ca[5];
+class S4 {
+  int a;
+  S4();
+public:
+  S4(int v):a(v) { }
+};
+class S5 {
+  int a;
+  S5():a(0) {}
+public:
+  S5(int v):a(v) { }
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template<class I, class C> int foomain(I argc, C **argv) {
+  I e(4);
+  I g(5);
+  int i;
+  int &j = i;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear // expected-error {{expected '(' after 'linear'}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear () // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear (argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear (argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear (argc : 5)
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear (S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear (a, b:B::ib) // expected-error {{linear variable with incomplete type 'S1'}} expected-error {{const-qualified variable cannot be linear}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear (argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear(e, g)
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear(h) // expected-error {{threadprivate or thread local variable cannot be linear}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear(i)
+  for (int k = 0; k < argc; ++k) ++k;
+
+  #pragma omp parallel
+  {
+    int v = 0;
+    int i;
+    #pragma omp target
+    #pragma omp teams
+    #pragma omp distribute simd linear(v:i)
+    for (int k = 0; k < argc; ++k) { i = k; v += i; }
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp parallel for simd linear(j)
+  for (int k = 0; k < argc; ++k) ++k;
+
+  int v = 0;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear(v:j)
+  for (int k = 0; k < argc; ++k) { ++k; v += j; }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear(i)
+  for (int k = 0; k < argc; ++k) ++k;
+  return 0;
+}
+
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace C {
+using A::x;
+}
+
+int main(int argc, char **argv) {
+  double darr[100];
+  // expected-note@+1 {{in instantiation of function template specialization 'test_template<-4, double, int>' requested here}}
+  test_template<-4>(darr, 4);
+  // expected-note@+1 {{in instantiation of function template specialization 'test_warn<0>' requested here}}
+  test_warn<0>();
+
+  S4 e(4); // expected-note {{'e' defined here}}
+  S5 g(5); // expected-note {{'g' defined here}}
+  int i;
+  int &j = i;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear // expected-error {{expected '(' after 'linear'}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear () // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear (argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear (argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear (argc)
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear (S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear (a, b) // expected-error {{linear variable with incomplete type 'S1'}} expected-error {{const-qualified variable cannot be linear}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear (argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear(e, g) // expected-error {{argument of a linear clause should be of integral or pointer type, not 'S4'}} expected-error {{argument of a linear clause should be of integral or pointer type, not 'S5'}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear(h, C::x) // expected-error 2 {{threadprivate or thread local variable cannot be linear}}
+  for (int k = 0; k < argc; ++k) ++k;
+
+  #pragma omp parallel
+  {
+    int i;
+    #pragma omp target
+    #pragma omp teams
+    #pragma omp distribute simd linear(i)
+      for (int k = 0; k < argc; ++k) ++k;
+
+    #pragma omp target
+    #pragma omp teams
+    #pragma omp distribute simd linear(i : 4)
+      for (int k = 0; k < argc; ++k) { ++k; i += 4; }
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear(j)
+  for (int k = 0; k < argc; ++k) ++k;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear(i)
+  for (int k = 0; k < argc; ++k) ++k;
+
+  foomain<int,char>(argc,argv); // expected-note {{in instantiation of function template specialization 'foomain<int, char>' requested here}}
+  return 0;
+}
+
diff --git a/test/OpenMP/distribute_simd_loop_messages.cpp b/test/OpenMP/distribute_simd_loop_messages.cpp
new file mode 100644
index 0000000000000..b69005578f493
--- /dev/null
+++ b/test/OpenMP/distribute_simd_loop_messages.cpp
@@ -0,0 +1,782 @@
+// RUN: %clang_cc1 -fsyntax-only -fopenmp -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify %s
+
+static int sii;
+// expected-note@+1 {{defined as threadprivate or thread local}}
+#pragma omp threadprivate(sii)
+static int globalii;
+
+int test_iteration_spaces() {
+  const int N = 100;
+  float a[N], b[N], c[N];
+  int ii, jj, kk;
+  float fii;
+  double dii;
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute simd
+  for (int i = 0; i < 10; i+=1) {
+    c[i] = a[i] + b[i];
+  }
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute simd
+  for (char i = 0; i < 10; i++) {
+    c[i] = a[i] + b[i];
+  }
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute simd
+  for (char i = 0; i < 10; i+='\1') {
+    c[i] = a[i] + b[i];
+  }
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute simd
+  for (long long i = 0; i < 10; i++) {
+    c[i] = a[i] + b[i];
+  }
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+2 {{expression must have integral or unscoped enumeration type, not 'double'}}
+  #pragma omp distribute simd
+  for (long long i = 0; i < 10; i+=1.5) {
+    c[i] = a[i] + b[i];
+  }
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute simd
+  for (long long i = 0; i < 'z'; i+=1u) {
+    c[i] = a[i] + b[i];
+  }
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+2 {{variable must be of integer or random access iterator type}}
+  #pragma omp distribute simd
+  for (float fi = 0; fi < 10.0; fi++) {
+    c[(int)fi] = a[(int)fi] + b[(int)fi];
+  }
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+2 {{variable must be of integer or random access iterator type}}
+  #pragma omp distribute simd
+  for (double fi = 0; fi < 10.0; fi++) {
+    c[(int)fi] = a[(int)fi] + b[(int)fi];
+  }
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+  #pragma omp distribute simd
+  for (int &ref = ii; ref < 10; ref++) {
+  }
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+  #pragma omp distribute simd
+  for (int i; i < 10; i++)
+    c[i] = a[i];
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+  #pragma omp distribute simd
+  for (int i = 0, j = 0; i < 10; ++i)
+    c[i] = a[i];
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+  #pragma omp distribute simd
+  for (;ii < 10; ++ii)
+    c[ii] = a[ii];
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-warning@+3 {{expression result unused}}
+  // expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+  #pragma omp distribute simd
+  for (ii + 1;ii < 10; ++ii)
+    c[ii] = a[ii];
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+  #pragma omp distribute simd
+  for (c[ii] = 0;ii < 10; ++ii)
+    c[ii] = a[ii];
+
+  // Ok to skip parenthesises.
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute simd
+  for (((ii)) = 0;ii < 10; ++ii)
+    c[ii] = a[ii];
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'i'}}
+  #pragma omp distribute simd
+  for (int i = 0; i; i++)
+    c[i] = a[i];
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+3 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'i'}}
+  // expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'i'}}
+  #pragma omp distribute simd
+  for (int i = 0; jj < kk; ii++)
+    c[i] = a[i];
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'i'}}
+  #pragma omp distribute simd
+  for (int i = 0; !!i; i++)
+    c[i] = a[i];
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'i'}}
+  #pragma omp distribute simd
+  for (int i = 0; i != 1; i++)
+    c[i] = a[i];
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'i'}}
+  #pragma omp distribute simd
+  for (int i = 0; ; i++)
+    c[i] = a[i];
+
+  // Ok.
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute simd
+  for (int i = 11; i > 10; i--)
+    c[i] = a[i];
+
+  // Ok.
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute simd
+  for (int i = 0; i < 10; ++i)
+    c[i] = a[i];
+
+    // Ok.
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute simd
+  for (ii = 0; ii < 10; ++ii)
+    c[ii] = a[ii];
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+  #pragma omp distribute simd
+  for (ii = 0; ii < 10; ++jj)
+    c[ii] = a[jj];
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+  #pragma omp distribute simd
+  for (ii = 0; ii < 10; ++ ++ ii)
+    c[ii] = a[ii];
+
+  // Ok but undefined behavior (in general, cannot check that incr
+  // is really loop-invariant).
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute simd
+  for (ii = 0; ii < 10; ii = ii + ii)
+    c[ii] = a[ii];
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+2 {{expression must have integral or unscoped enumeration type, not 'float'}}
+  #pragma omp distribute simd
+  for (ii = 0; ii < 10; ii = ii + 1.0f)
+    c[ii] = a[ii];
+
+  // Ok - step was converted to integer type.
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute simd
+  for (ii = 0; ii < 10; ii = ii + (int)1.1f)
+    c[ii] = a[ii];
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+  #pragma omp distribute simd
+  for (ii = 0; ii < 10; jj = ii + 2)
+    c[ii] = a[ii];
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-warning@+3 {{relational comparison result unused}}
+  // expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+  #pragma omp distribute simd
+  for (ii = 0; ii < 10; jj > kk + 2)
+    c[ii] = a[ii];
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+  #pragma omp distribute simd
+  for (ii = 0; ii < 10;)
+    c[ii] = a[ii];
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-warning@+3 {{expression result unused}}
+  // expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+  #pragma omp distribute simd
+  for (ii = 0; ii < 10; !ii)
+    c[ii] = a[ii];
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+  #pragma omp distribute simd
+  for (ii = 0; ii < 10; ii ? ++ii : ++jj)
+    c[ii] = a[ii];
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+  #pragma omp distribute simd
+  for (ii = 0; ii < 10; ii = ii < 10)
+    c[ii] = a[ii];
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-note@+3 {{loop step is expected to be positive due to this condition}}
+  // expected-error@+2 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+  #pragma omp distribute simd
+  for (ii = 0; ii < 10; ii = ii + 0)
+    c[ii] = a[ii];
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-note@+3 {{loop step is expected to be positive due to this condition}}
+  // expected-error@+2 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+  #pragma omp distribute simd
+  for (ii = 0; ii < 10; ii = ii + (int)(0.8 - 0.45))
+    c[ii] = a[ii];
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-note@+3 {{loop step is expected to be positive due to this condition}}
+  // expected-error@+2 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+  #pragma omp distribute simd
+  for (ii = 0; (ii) < 10; ii-=25)
+    c[ii] = a[ii];
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-note@+3 {{loop step is expected to be positive due to this condition}}
+  // expected-error@+2 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+  #pragma omp distribute simd
+  for (ii = 0; (ii < 10); ii-=0)
+    c[ii] = a[ii];
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-note@+3 {{loop step is expected to be negative due to this condition}}
+  // expected-error@+2 {{increment expression must cause 'ii' to decrease on each iteration of OpenMP for loop}}
+  #pragma omp distribute simd
+  for (ii = 0; ii > 10; (ii+=0))
+    c[ii] = a[ii];
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-note@+3 {{loop step is expected to be positive due to this condition}}
+  // expected-error@+2 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+  #pragma omp distribute simd
+  for (ii = 0; ii < 10; (ii) = (1-1)+(ii))
+    c[ii] = a[ii];
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-note@+3 {{loop step is expected to be negative due to this condition}}
+  // expected-error@+2 {{increment expression must cause 'ii' to decrease on each iteration of OpenMP for loop}}
+  #pragma omp distribute simd
+  for ((ii = 0); ii > 10; (ii-=0))
+    c[ii] = a[ii];
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-note@+3 {{loop step is expected to be positive due to this condition}}
+  // expected-error@+2 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+  #pragma omp distribute simd
+  for (ii = 0; (ii < 10); (ii-=0))
+    c[ii] = a[ii];
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-note@+2  {{defined as private}}
+  // expected-error@+2 {{loop iteration variable in the associated loop of 'omp distribute simd' directive may not be private, predetermined as linear}}
+  #pragma omp distribute simd private(ii)
+  for (ii = 0; ii < 10; ii++)
+    c[ii] = a[ii];
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+3 {{unexpected OpenMP clause 'shared' in directive '#pragma omp distribute simd'}}
+  // expected-note@+2  {{defined as shared}}
+  // expected-error@+2 {{loop iteration variable in the associated loop of 'omp distribute simd' directive may not be shared, predetermined as linear}}
+  #pragma omp distribute simd shared(ii)
+  for (ii = 0; ii < 10; ii++)
+    c[ii] = a[ii];
+
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute simd linear(ii)
+  for (ii = 0; ii < 10; ii++)
+    c[ii] = a[ii];
+
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute simd lastprivate(ii) linear(jj) collapse(2) // expected-note {{defined as linear}}
+  for (ii = 0; ii < 10; ii++)
+  for (jj = 0; jj < 10; jj++) // expected-error {{loop iteration variable in the associated loop of 'omp distribute simd' directive may not be linear, predetermined as lastprivate}}
+    c[ii] = a[jj];
+
+
+  #pragma omp parallel
+  {
+    #pragma omp target
+    #pragma omp teams
+// expected-error@+2 {{loop iteration variable in the associated loop of 'omp distribute simd' directive may not be threadprivate or thread local, predetermined as linear}}
+    #pragma omp distribute simd
+    for (sii = 0; sii < 10; sii+=1)
+      c[sii] = a[sii];
+  }
+
+  #pragma omp parallel
+  {
+    #pragma omp target
+    #pragma omp teams
+    #pragma omp distribute simd
+    for (globalii = 0; globalii < 10; globalii+=1)
+      c[globalii] = a[globalii];
+  }
+
+  #pragma omp parallel
+  {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd collapse(2)
+    for (ii = 0; ii < 10; ii += 1)
+    for (globalii = 0; globalii < 10; globalii += 1)
+      c[globalii] += a[globalii] + ii;
+  }
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+2 {{statement after '#pragma omp distribute simd' must be a for loop}}
+  #pragma omp distribute simd
+  for (auto &item : a) {
+    item = item + 1;
+  }
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-note@+3 {{loop step is expected to be positive due to this condition}}
+  // expected-error@+2 {{increment expression must cause 'i' to increase on each iteration of OpenMP for loop}}
+  #pragma omp distribute simd
+  for (unsigned i = 9; i < 10; i--) {
+    c[i] = a[i] + b[i];
+  }
+
+  int (*lb)[4] = nullptr;
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute simd
+  for (int (*p)[4] = lb; p < lb + 8; ++p) {
+  }
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+  #pragma omp distribute simd
+  for (int a{0}; a<10; ++a) {
+  }
+
+  return 0;
+}
+
+// Iterators allowed in openmp for-loops.
+namespace std {
+struct random_access_iterator_tag { };
+template <class Iter> struct iterator_traits {
+  typedef typename Iter::difference_type difference_type;
+  typedef typename Iter::iterator_category iterator_category;
+};
+template <class Iter>
+typename iterator_traits<Iter>::difference_type
+distance(Iter first, Iter last) { return first - last; }
+}
+class Iter0 {
+  public:
+    Iter0() { }
+    Iter0(const Iter0 &) { }
+    Iter0 operator ++() { return *this; }
+    Iter0 operator --() { return *this; }
+    Iter0 operator + (int delta) { return *this; }
+    bool operator <(Iter0 a) { return true; }
+};
+// expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'Iter0' for 1st argument}}
+int operator -(Iter0 a, Iter0 b) { return 0; }
+class Iter1 {
+  public:
+    Iter1(float f=0.0f, double d=0.0) { }
+    Iter1(const Iter1 &) { }
+    Iter1 operator ++() { return *this; }
+    Iter1 operator --() { return *this; }
+    bool operator <(Iter1 a) { return true; }
+    bool operator >=(Iter1 a) { return false; }
+};
+class GoodIter {
+  public:
+    GoodIter() { }
+    GoodIter(const GoodIter &) { }
+    GoodIter(int fst, int snd) { }
+    GoodIter &operator =(const GoodIter &that) { return *this; }
+    GoodIter &operator =(const Iter0 &that) { return *this; }
+    GoodIter &operator +=(int x) { return *this; }
+    explicit GoodIter(void *) { }
+    GoodIter operator ++() { return *this; }
+    GoodIter operator --() { return *this; }
+    bool operator !() { return true; }
+    bool operator <(GoodIter a) { return true; }
+    bool operator <=(GoodIter a) { return true; }
+    bool operator >=(GoodIter a) { return false; }
+    typedef int difference_type;
+    typedef std::random_access_iterator_tag iterator_category;
+};
+// expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'GoodIter' for 1st argument}}
+int operator -(GoodIter a, GoodIter b) { return 0; }
+// expected-note@+1 2 {{candidate function not viable: requires single argument 'a', but 2 arguments were provided}}
+GoodIter operator -(GoodIter a) { return a; }
+// expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'GoodIter' for 1st argument}}
+GoodIter operator -(GoodIter a, int v) { return GoodIter(); }
+GoodIter operator +(GoodIter a, int v) { return GoodIter(); }
+// expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'int' for 1st argument}}
+GoodIter operator -(int v, GoodIter a) { return GoodIter(); }
+GoodIter operator +(int v, GoodIter a) { return GoodIter(); }
+
+int test_with_random_access_iterator() {
+  GoodIter begin, end;
+  Iter0 begin0, end0;
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute simd
+  for (GoodIter I = begin; I < end; ++I)
+    ++I;
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+  #pragma omp distribute simd
+  for (GoodIter &I = begin; I < end; ++I)
+    ++I;
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute simd
+  for (GoodIter I = begin; I >= end; --I)
+    ++I;
+  #pragma omp target
+  #pragma omp teams
+  // expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+  #pragma omp distribute simd
+  for (GoodIter I(begin); I < end; ++I)
+    ++I;
+  #pragma omp target
+  #pragma omp teams
+  // expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+  #pragma omp distribute simd
+  for (GoodIter I(nullptr); I < end; ++I)
+    ++I;
+  #pragma omp target
+  #pragma omp teams
+  // expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+  #pragma omp distribute simd
+  for (GoodIter I(0); I < end; ++I)
+    ++I;
+  #pragma omp target
+  #pragma omp teams
+  // expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+  #pragma omp distribute simd
+  for (GoodIter I(1,2); I < end; ++I)
+    ++I;
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute simd
+  for (begin = GoodIter(0); begin < end; ++begin)
+    ++begin;
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute simd
+  for (begin = GoodIter(1,2); begin < end; ++begin)
+    ++begin;
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+  #pragma omp distribute simd
+  for (++begin; begin < end; ++begin)
+    ++begin;
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute simd
+  for (begin = end; begin < end; ++begin)
+    ++begin;
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'I'}}
+  #pragma omp distribute simd
+  for (GoodIter I = begin; I - I; ++I)
+    ++I;
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'I'}}
+  #pragma omp distribute simd
+  for (GoodIter I = begin; begin < end; ++I)
+    ++I;
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'I'}}
+  #pragma omp distribute simd
+  for (GoodIter I = begin; !I; ++I)
+    ++I;
+  #pragma omp target
+  #pragma omp teams
+  // expected-note@+3 {{loop step is expected to be negative due to this condition}}
+  // expected-error@+2 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+  #pragma omp distribute simd
+  for (GoodIter I = begin; I >= end; I = I + 1)
+    ++I;
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute simd
+  for (GoodIter I = begin; I >= end; I = I - 1)
+    ++I;
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'I'}}
+  #pragma omp distribute simd
+  for (GoodIter I = begin; I >= end; I = -I)
+    ++I;
+  #pragma omp target
+  #pragma omp teams
+  // expected-note@+3 {{loop step is expected to be negative due to this condition}}
+  // expected-error@+2 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+  #pragma omp distribute simd
+  for (GoodIter I = begin; I >= end; I = 2 + I)
+    ++I;
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'I'}}
+  #pragma omp distribute simd
+  for (GoodIter I = begin; I >= end; I = 2 - I)
+    ++I;
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute simd
+  for (Iter0 I = begin0; I < end0; ++I)
+    ++I;
+
+  #pragma omp target
+  #pragma omp teams
+  // Initializer is constructor without params.
+  // expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+  #pragma omp distribute simd
+  for (Iter0 I; I < end0; ++I)
+    ++I;
+
+  Iter1 begin1, end1;
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+3 {{invalid operands to binary expression ('Iter1' and 'Iter1')}}
+  // expected-error@+2 {{could not calculate number of iterations calling 'operator-' with upper and lower loop bounds}}
+  #pragma omp distribute simd
+  for (Iter1 I = begin1; I < end1; ++I)
+    ++I;
+  #pragma omp target
+  #pragma omp teams
+  // expected-note@+3 {{loop step is expected to be negative due to this condition}}
+  // expected-error@+2 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+  #pragma omp distribute simd
+  for (Iter1 I = begin1; I >= end1; ++I)
+    ++I;
+
+  // Initializer is constructor with all default params.
+  #pragma omp target
+  #pragma omp teams
+  // expected-error@+4 {{invalid operands to binary expression ('Iter1' and 'float')}}
+  // expected-error@+3 {{could not calculate number of iterations calling 'operator-' with upper and lower loop bounds}}
+  // expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+  #pragma omp distribute simd
+  for (Iter1 I; I < end1; ++I) {
+  }
+
+  return 0;
+}
+
+template <typename IT, int ST> class TC {
+  public:
+    int dotest_lt(IT begin, IT end) {
+      // expected-note@+3 {{loop step is expected to be positive due to this condition}}
+      // expected-error@+2 {{increment expression must cause 'I' to increase on each iteration of OpenMP for loop}}
+      #pragma omp distribute simd
+      for (IT I = begin; I < end; I = I + ST) {
+        ++I;
+      }
+      #pragma omp target
+      #pragma omp teams
+      // expected-note@+3 {{loop step is expected to be positive due to this condition}}
+      // expected-error@+2 {{increment expression must cause 'I' to increase on each iteration of OpenMP for loop}}
+      #pragma omp distribute simd
+      for (IT I = begin; I <= end; I += ST) {
+        ++I;
+      }
+      #pragma omp distribute simd
+      for (IT I = begin; I < end; ++I) {
+        ++I;
+      }
+    }
+
+    static IT step() {
+      return IT(ST);
+    }
+};
+template <typename IT, int ST=0> int dotest_gt(IT begin, IT end) {
+  #pragma omp target
+  #pragma omp teams
+  // expected-note@+3 2 {{loop step is expected to be negative due to this condition}}
+  // expected-error@+2 2 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+  #pragma omp distribute simd
+  for (IT I = begin; I >= end; I = I + ST) {
+    ++I;
+  }
+  #pragma omp target
+  #pragma omp teams
+  // expected-note@+3 2 {{loop step is expected to be negative due to this condition}}
+  // expected-error@+2 2 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+  #pragma omp distribute simd
+  for (IT I = begin; I >= end; I += ST) {
+    ++I;
+  }
+
+  #pragma omp target
+  #pragma omp teams
+  // expected-note@+3 {{loop step is expected to be negative due to this condition}}
+  // expected-error@+2 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+  #pragma omp distribute simd
+  for (IT I = begin; I >= end; ++I) {
+    ++I;
+  }
+
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute simd
+  for (IT I = begin; I < end; I+=TC<int,ST>::step()) {
+    ++I;
+  }
+}
+
+void test_with_template() {
+  GoodIter begin, end;
+  TC<GoodIter, 100> t1;
+  TC<GoodIter, -100> t2;
+  t1.dotest_lt(begin, end);
+  t2.dotest_lt(begin, end); // expected-note {{in instantiation of member function 'TC<GoodIter, -100>::dotest_lt' requested here}}
+  dotest_gt(begin, end); // expected-note {{in instantiation of function template specialization 'dotest_gt<GoodIter, 0>' requested here}}
+  dotest_gt<unsigned, -10>(0, 100); // expected-note {{in instantiation of function template specialization 'dotest_gt<unsigned int, -10>' requested here}}
+}
+
+void test_loop_break() {
+  const int N = 100;
+  float a[N], b[N], c[N];
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute simd
+  for (int i = 0; i < 10; i++) {
+    c[i] = a[i] + b[i];
+    for (int j = 0; j < 10; ++j) {
+      if (a[i] > b[j])
+        break; // OK in nested loop
+    }
+    switch(i) {
+      case 1:
+        b[i]++;
+        break;
+      default:
+        break;
+    }
+    if (c[i] > 10)
+      break; // expected-error {{'break' statement cannot be used in OpenMP for loop}}
+
+    if (c[i] > 11)
+      break; // expected-error {{'break' statement cannot be used in OpenMP for loop}}
+  }
+
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute simd
+  for (int i = 0; i < 10; i++) {
+    for (int j = 0; j < 10; j++) {
+      c[i] = a[i] + b[i];
+      if (c[i] > 10) {
+        if (c[i] < 20) {
+          break; // OK
+        }
+      }
+    }
+  }
+}
+
+void test_loop_eh() {
+  const int N = 100;
+  float a[N], b[N], c[N];
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute simd
+  for (int i = 0; i < 10; i++) {
+    c[i] = a[i] + b[i];
+    try { // expected-error {{'try' statement cannot be used in OpenMP simd region}}
+      for (int j = 0; j < 10; ++j) {
+        if (a[i] > b[j])
+          throw a[i]; // expected-error {{'throw' statement cannot be used in OpenMP simd region}}
+      }
+      throw a[i]; // expected-error {{'throw' statement cannot be used in OpenMP simd region}}
+    }
+    catch (float f) {
+      if (f > 0.1)
+        throw a[i]; // expected-error {{'throw' statement cannot be used in OpenMP simd region}}
+      return; // expected-error {{cannot return from OpenMP region}}
+    }
+    switch(i) {
+      case 1:
+        b[i]++;
+        break;
+      default:
+        break;
+    }
+    for (int j = 0; j < 10; j++) {
+      if (c[i] > 10)
+        throw c[i]; // expected-error {{'throw' statement cannot be used in OpenMP simd region}}
+    }
+  }
+  if (c[9] > 10)
+    throw c[9]; // OK
+
+  #pragma omp target
+  #pragma omp teams
+  #pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+    struct S {
+      void g() { throw 0; }
+    };
+  }
+}
+
diff --git a/test/OpenMP/distribute_simd_misc_messages.c b/test/OpenMP/distribute_simd_misc_messages.c
new file mode 100644
index 0000000000000..5fc2cb64cc62b
--- /dev/null
+++ b/test/OpenMP/distribute_simd_misc_messages.c
@@ -0,0 +1,1108 @@
+// RUN: %clang_cc1 -fsyntax-only -fopenmp -verify %s
+
+// expected-error@+1 {{unexpected OpenMP directive '#pragma omp distribute simd'}}
+#pragma omp distribute simd
+
+// expected-error@+1 {{unexpected OpenMP directive '#pragma omp distribute simd'}}
+#pragma omp distribute simd foo
+
+// expected-error@+1 {{unexpected OpenMP directive '#pragma omp distribute simd'}}
+#pragma omp distribute simd safelen(4)
+
+void test_no_clause() {
+  int i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{statement after '#pragma omp distribute simd' must be a for loop}}
+#pragma omp distribute simd
+  ++i;
+}
+
+void test_branch_protected_scope() {
+  int i = 0;
+L1:
+  ++i;
+
+  int x[24];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (i = 0; i < 16; ++i) {
+    if (i == 5)
+      goto L1; // expected-error {{use of undeclared label 'L1'}}
+    else if (i == 6)
+      return; // expected-error {{cannot return from OpenMP region}}
+    else if (i == 7)
+      goto L2;
+    else if (i == 8) {
+    L2:
+      x[i]++;
+    }
+  }
+
+  if (x[0] == 0)
+    goto L2; // expected-error {{use of undeclared label 'L2'}}
+  else if (x[1] == 1)
+    goto L1;
+}
+
+void test_invalid_clause() {
+  int i;
+#pragma omp target
+#pragma omp teams
+// expected-warning@+1 {{extra tokens at the end of '#pragma omp distribute simd' are ignored}}
+#pragma omp distribute simd foo bar
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+void test_non_identifiers() {
+  int i, x;
+
+#pragma omp target
+#pragma omp teams
+// expected-warning@+1 {{extra tokens at the end of '#pragma omp distribute simd' are ignored}}
+#pragma omp distribute simd;
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+// expected-warning@+1 {{extra tokens at the end of '#pragma omp distribute simd' are ignored}}
+#pragma omp distribute simd private(x);
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+// expected-warning@+1 {{extra tokens at the end of '#pragma omp distribute simd' are ignored}}
+#pragma omp distribute simd, private(x);
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+extern int foo();
+void test_safelen() {
+  int i;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected '('}}
+#pragma omp distribute simd safelen
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd safelen(
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute simd safelen()
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd safelen(,
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}  expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd safelen(, )
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-warning@+2 {{extra tokens at the end of '#pragma omp distribute simd' are ignored}}
+// expected-error@+1 {{expected '('}}
+#pragma omp distribute simd safelen 4)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd safelen(4
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd safelen(4,
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd safelen(4, )
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// xxpected-error@+1 {{expected expression}}
+#pragma omp distribute simd safelen(4)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd safelen(4 4)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd safelen(4, , 4)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd safelen(4)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd safelen(4, 8)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expression is not an integer constant expression}}
+#pragma omp distribute simd safelen(2.5)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expression is not an integer constant expression}}
+#pragma omp distribute simd safelen(foo())
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{argument to 'safelen' clause must be a strictly positive integer value}}
+#pragma omp distribute simd safelen(-5)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{argument to 'safelen' clause must be a strictly positive integer value}}
+#pragma omp distribute simd safelen(0)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{argument to 'safelen' clause must be a strictly positive integer value}}
+#pragma omp distribute simd safelen(5 - 5)
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+void test_simdlen() {
+  int i;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected '('}}
+#pragma omp distribute simd simdlen
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd simdlen(
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute simd simdlen()
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd simdlen(,
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}  expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd simdlen(, )
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-warning@+2 {{extra tokens at the end of '#pragma omp distribute simd' are ignored}}
+// expected-error@+1 {{expected '('}}
+#pragma omp distribute simd simdlen 4)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd simdlen(4
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd simdlen(4,
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd simdlen(4, )
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd simdlen(4)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd simdlen(4 4)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd simdlen(4, , 4)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd simdlen(4)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd simdlen(4, 8)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expression is not an integer constant expression}}
+#pragma omp distribute simd simdlen(2.5)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expression is not an integer constant expression}}
+#pragma omp distribute simd simdlen(foo())
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{argument to 'simdlen' clause must be a strictly positive integer value}}
+#pragma omp distribute simd simdlen(-5)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{argument to 'simdlen' clause must be a strictly positive integer value}}
+#pragma omp distribute simd simdlen(0)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{argument to 'simdlen' clause must be a strictly positive integer value}}
+#pragma omp distribute simd simdlen(5 - 5)
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+void test_safelen_simdlen() {
+  int i;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{the value of 'simdlen' parameter must be less than or equal to the value of the 'safelen' parameter}}
+#pragma omp distribute simd simdlen(6) safelen(5)
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{the value of 'simdlen' parameter must be less than or equal to the value of the 'safelen' parameter}}
+#pragma omp distribute simd safelen(5) simdlen(6)
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+void test_collapse() {
+  int i;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected '('}}
+#pragma omp distribute simd collapse
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd collapse(
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute simd collapse()
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd collapse(,
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}  expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd collapse(, )
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-warning@+2 {{extra tokens at the end of '#pragma omp distribute simd' are ignored}}
+// expected-error@+1 {{expected '('}}
+#pragma omp distribute simd collapse 4)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}} expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp distribute simd collapse(4
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp distribute simd', but found only 1}}
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}} expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp distribute simd collapse(4,
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp distribute simd', but found only 1}}
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}} expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp distribute simd collapse(4, )
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp distribute simd', but found only 1}}
+#pragma omp target
+#pragma omp teams
+// xxpected-error@+1 {{expected expression}} expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp distribute simd collapse(4)
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp distribute simd', but found only 1}}
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}} expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp distribute simd collapse(4 4)
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp distribute simd', but found only 1}}
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}} expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp distribute simd collapse(4, , 4)
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp distribute simd', but found only 1}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd collapse(4)
+  for (int i1 = 0; i1 < 16; ++i1)
+    for (int i2 = 0; i2 < 16; ++i2)
+      for (int i3 = 0; i3 < 16; ++i3)
+        for (int i4 = 0; i4 < 16; ++i4)
+          foo();
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}} expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp distribute simd collapse(4, 8)
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp distribute simd', but found only 1}}
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expression is not an integer constant expression}}
+#pragma omp distribute simd collapse(2.5)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expression is not an integer constant expression}}
+#pragma omp distribute simd collapse(foo())
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{argument to 'collapse' clause must be a strictly positive integer value}}
+#pragma omp distribute simd collapse(-5)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{argument to 'collapse' clause must be a strictly positive integer value}}
+#pragma omp distribute simd collapse(0)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{argument to 'collapse' clause must be a strictly positive integer value}}
+#pragma omp distribute simd collapse(5 - 5)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-note@+3 {{defined as reduction}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd collapse(2) reduction(+ : i)
+  for (i = 0; i < 16; ++i)
+    // expected-note@+1 {{variable with automatic storage duration is predetermined as private; perhaps you forget to enclose 'omp for' directive into a parallel or another task region?}}
+    for (int j = 0; j < 16; ++j)
+// expected-error@+2 2 {{reduction variable must be shared}}
+// expected-error@+1 {{OpenMP constructs may not be nested inside a simd region}}
+#pragma omp for reduction(+ : i, j)
+      for (int k = 0; k < 16; ++k)
+        i += j;
+
+#pragma omp target
+#pragma omp teams
+  for (i = 0; i < 16; ++i)
+    for (int j = 0; j < 16; ++j)
+#pragma omp distribute simd reduction(+ : i, j)
+      for (int k = 0; k < 16; ++k)
+        i += j;
+}
+
+void test_linear() {
+  int i;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd linear(
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected expression}}
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd linear(,
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected expression}}
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute simd linear(, )
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute simd linear()
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute simd linear(int)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected variable name}}
+#pragma omp distribute simd linear(0)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{use of undeclared identifier 'x'}}
+#pragma omp distribute simd linear(x)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{use of undeclared identifier 'x'}}
+// expected-error@+1 {{use of undeclared identifier 'y'}}
+#pragma omp distribute simd linear(x, y)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+3 {{use of undeclared identifier 'x'}}
+// expected-error@+2 {{use of undeclared identifier 'y'}}
+// expected-error@+1 {{use of undeclared identifier 'z'}}
+#pragma omp distribute simd linear(x, y, z)
+  for (i = 0; i < 16; ++i)
+    ;
+
+  int x, y;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute simd linear(x :)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd linear(x :, )
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear(x : 1)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear(x : 2 * 2)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd linear(x : 1, y)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd linear(x : 1, y, z : 1)
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+// expected-note@+2 {{defined as linear}}
+// expected-error@+1 {{linear variable cannot be linear}}
+#pragma omp distribute simd linear(x) linear(x)
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+// expected-note@+2 {{defined as private}}
+// expected-error@+1 {{private variable cannot be linear}}
+#pragma omp distribute simd private(x) linear(x)
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+// expected-note@+2 {{defined as linear}}
+// expected-error@+1 {{linear variable cannot be private}}
+#pragma omp distribute simd linear(x) private(x)
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+// expected-warning@+1 {{zero linear step (x and other variables in clause should probably be const)}}
+#pragma omp distribute simd linear(x, y : 0)
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+// expected-note@+2 {{defined as linear}}
+// expected-error@+1 {{linear variable cannot be lastprivate}}
+#pragma omp distribute simd linear(x) lastprivate(x)
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+// expected-note@+2 {{defined as lastprivate}}
+// expected-error@+1 {{lastprivate variable cannot be linear}}
+#pragma omp distribute simd lastprivate(x) linear(x)
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+void test_aligned() {
+  int i;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd aligned(
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected expression}}
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd aligned(,
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected expression}}
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute simd aligned(, )
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute simd aligned()
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute simd aligned(int)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected variable name}}
+#pragma omp distribute simd aligned(0)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{use of undeclared identifier 'x'}}
+#pragma omp distribute simd aligned(x)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{use of undeclared identifier 'x'}}
+// expected-error@+1 {{use of undeclared identifier 'y'}}
+#pragma omp distribute simd aligned(x, y)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+3 {{use of undeclared identifier 'x'}}
+// expected-error@+2 {{use of undeclared identifier 'y'}}
+// expected-error@+1 {{use of undeclared identifier 'z'}}
+#pragma omp distribute simd aligned(x, y, z)
+  for (i = 0; i < 16; ++i)
+    ;
+
+  int *x, y, z[25]; // expected-note 4 {{'y' defined here}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned(x)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned(z)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute simd aligned(x :)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd aligned(x :, )
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned(x : 1)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd aligned(x : 2 * 2)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd aligned(x : 1, y)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd aligned(x : 1, y, z : 1)
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{argument of aligned clause should be array or pointer, not 'int'}}
+#pragma omp distribute simd aligned(x, y)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{argument of aligned clause should be array or pointer, not 'int'}}
+#pragma omp distribute simd aligned(x, y, z)
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+// expected-note@+2 {{defined as aligned}}
+// expected-error@+1 {{a variable cannot appear in more than one aligned clause}}
+#pragma omp distribute simd aligned(x) aligned(z, x)
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+// expected-note@+3 {{defined as aligned}}
+// expected-error@+2 {{a variable cannot appear in more than one aligned clause}}
+// expected-error@+1 2 {{argument of aligned clause should be array or pointer, not 'int'}}
+#pragma omp distribute simd aligned(x, y, z) aligned(y, z)
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+void test_private() {
+  int i;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected expression}}
+// expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp distribute simd private(
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}} expected-note@+2 {{to match this '('}}
+// expected-error@+1 2 {{expected expression}}
+#pragma omp distribute simd private(,
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 2 {{expected expression}}
+#pragma omp distribute simd private(, )
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute simd private()
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute simd private(int)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected variable name}}
+#pragma omp distribute simd private(0)
+  for (i = 0; i < 16; ++i)
+    ;
+
+  int x, y, z;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(x)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(x, y)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(x, y, z)
+  for (i = 0; i < 16; ++i) {
+    x = y * i + z;
+  }
+}
+
+void test_firstprivate() {
+  int i;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}} expected-note@+2 {{to match this '('}}
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute simd firstprivate(
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+void test_lastprivate() {
+  int i;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}} expected-note@+2 {{to match this '('}}
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute simd lastprivate(
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected ')'}} expected-note@+2 {{to match this '('}}
+// expected-error@+1 2 {{expected expression}}
+#pragma omp distribute simd lastprivate(,
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 2 {{expected expression}}
+#pragma omp distribute simd lastprivate(, )
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute simd lastprivate()
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute simd lastprivate(int)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected variable name}}
+#pragma omp distribute simd lastprivate(0)
+  for (i = 0; i < 16; ++i)
+    ;
+
+  int x, y, z;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(x)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(x, y)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd lastprivate(x, y, z)
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+void test_reduction() {
+  int i, x, y;
+#pragma omp target
+#pragma omp teams
+// expected-error@+3 {{expected ')'}} expected-note@+3 {{to match this '('}}
+// expected-error@+2 {{expected identifier}}
+// expected-warning@+1 {{missing ':' after reduction identifier - ignoring}}
+#pragma omp distribute simd reduction(
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected identifier}}
+// expected-warning@+1 {{missing ':' after reduction identifier - ignoring}}
+#pragma omp distribute simd reduction()
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{expected expression}}
+// expected-warning@+1 {{missing ':' after reduction identifier - ignoring}}
+#pragma omp distribute simd reduction(x)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected identifier}}
+#pragma omp distribute simd reduction( : x)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+3 {{expected ')'}} expected-note@+3 {{to match this '('}}
+// expected-error@+2 {{expected identifier}}
+// expected-warning@+1 {{missing ':' after reduction identifier - ignoring}}
+#pragma omp distribute simd reduction(,
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+3 {{expected ')'}} expected-note@+3 {{to match this '('}}
+// expected-error@+2 {{expected expression}}
+// expected-warning@+1 {{missing ':' after reduction identifier - ignoring}}
+#pragma omp distribute simd reduction(+
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+// expected-error@+3 {{expected ')'}} expected-note@+3 {{to match this '('}}
+//
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute simd reduction(+:
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute simd reduction(+ :)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute simd reduction(+ :, y)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected expression}}
+#pragma omp distribute simd reduction(+ : x, + : y)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected identifier}}
+#pragma omp distribute simd reduction(% : x)
+  for (i = 0; i < 16; ++i)
+    ;
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(+ : x)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(* : x)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(- : x)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(& : x)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(| : x)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(^ : x)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(&& : x)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(|| : x)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(max : x)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(min : x)
+  for (i = 0; i < 16; ++i)
+    ;
+  struct X {
+    int x;
+  };
+  struct X X;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected variable name}}
+#pragma omp distribute simd reduction(+ : X.x)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+// expected-error@+1 {{expected variable name}}
+#pragma omp distribute simd reduction(+ : x + x)
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+void test_loop_messages() {
+  float a[100], b[100], c[100];
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{variable must be of integer or pointer type}}
+#pragma omp distribute simd
+  for (float fi = 0; fi < 10.0; fi++) {
+    c[(int)fi] = a[(int)fi] + b[(int)fi];
+  }
+#pragma omp target
+#pragma omp teams
+// expected-error@+2 {{variable must be of integer or pointer type}}
+#pragma omp distribute simd
+  for (double fi = 0; fi < 10.0; fi++) {
+    c[(int)fi] = a[(int)fi] + b[(int)fi];
+  }
+}
+
+void linear_modifiers(int argc) {
+  int f;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear(f)
+  for (int k = 0; k < argc; ++k) ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear(val(f))
+  for (int k = 0; k < argc; ++k) ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear(uval(f)) // expected-error {{expected 'val' modifier}}
+  for (int k = 0; k < argc; ++k) ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear(ref(f)) // expected-error {{expected 'val' modifier}}
+  for (int k = 0; k < argc; ++k) ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd linear(foo(f)) // expected-error {{expected 'val' modifier}}
+  for (int k = 0; k < argc; ++k) ++k;
+}
+
diff --git a/test/OpenMP/distribute_simd_private_messages.cpp b/test/OpenMP/distribute_simd_private_messages.cpp
new file mode 100644
index 0000000000000..c777c9971d4b5
--- /dev/null
+++ b/test/OpenMP/distribute_simd_private_messages.cpp
@@ -0,0 +1,315 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note 2 {{declared here}} expected-note 2 {{forward declaration of 'S1'}}
+extern S1 a;
+class S2 {
+  mutable int a;
+
+public:
+  S2() : a(0) {}
+};
+const S2 b;
+const S2 ba[5];
+class S3 {
+  int a;
+
+public:
+  S3() : a(0) {}
+};
+const S3 ca[5];
+class S4 {
+  int a;
+  S4(); // expected-note {{implicitly declared private here}}
+
+public:
+  S4(int v) : a(v) {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(a) private(this->a)
+    for (int k = 0; k < v; ++k)
+      ++this->a;
+  }
+};
+class S5 {
+  int a;
+  S5() : a(0) {} // expected-note {{implicitly declared private here}}
+
+public:
+  S5(int v) : a(v) {}
+  S5 &operator=(S5 &s) {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T>
+class S6 {
+public:
+  T a;
+
+  S6() : a(0) {}
+  S6(T v) : a(v) {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(a) private(this->a)
+    for (int k = 0; k < v; ++k)
+      ++this->a;
+  }
+  S6 &operator=(S6 &s) {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T>
+class S7 : public T {
+  T a;
+  S7() : a(0) {}
+
+public:
+  S7(T v) : a(v) {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(a) private(this->a) private(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S7 &operator=(S7 &s) {
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(a) private(this->a) private(s.a) private(s.T::a) // expected-error 2 {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class I, class C>
+int foomain(I argc, C **argv) {
+  I e(4);
+  I g(5);
+  int i;
+  int &j = i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private // expected-error {{expected '(' after 'private'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private() // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(argc)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(a, b) // expected-error {{private variable with incomplete type 'S1'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(e, g)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(h) // expected-error {{threadprivate or thread local variable cannot be private}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd nowait // expected-error {{unexpected OpenMP clause 'nowait' in directive '#pragma omp distribute simd'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp parallel
+  {
+    int v = 0;
+    int i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(i)
+    for (int k = 0; k < argc; ++k) {
+      i = k;
+      v += i;
+    }
+  }
+#pragma omp parallel shared(i)
+#pragma omp parallel private(i)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(j)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(i)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+  return 0;
+}
+
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
+int main(int argc, char **argv) {
+  S4 e(4);
+  S5 g(5);
+  S6<float> s6(0.0) , s6_0(1.0);
+  S7<S6<float> > s7(0.0) , s7_0(1.0);
+  int i;
+  int &j = i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private // expected-error {{expected '(' after 'private'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private() // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(argc)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(a, b) // expected-error {{private variable with incomplete type 'S1'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be private}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd nowait // expected-error {{unexpected OpenMP clause 'nowait' in directive '#pragma omp distribute simd'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp parallel
+  {
+    int i;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(i)
+    for (int k = 0; k < argc; ++k)
+      ++k;
+  }
+#pragma omp parallel shared(i)
+#pragma omp parallel private(i)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(j)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(i)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+  static int m;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(m)
+  for (int k = 0; k < argc; ++k)
+    m = k + 2;
+
+  s6 = s6_0; // expected-note {{in instantiation of member function 'S6<float>::operator=' requested here}}
+  s7 = s7_0; // expected-note {{in instantiation of member function 'S7<S6<float> >::operator=' requested here}}
+  return foomain(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<int, char>' requested here}}
+}
+
diff --git a/test/OpenMP/distribute_simd_reduction_messages.cpp b/test/OpenMP/distribute_simd_reduction_messages.cpp
new file mode 100644
index 0000000000000..e03b852928082
--- /dev/null
+++ b/test/OpenMP/distribute_simd_reduction_messages.cpp
@@ -0,0 +1,441 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 150 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 -ferror-limit 150 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 -ferror-limit 150 -o - %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}} expected-note 4 {{forward declaration of 'S1'}}
+extern S1 a;
+class S2 {
+  mutable int a;
+  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 3 {{implicitly declared private here}}
+
+public:
+  S2() : a(0) {}
+  S2(S2 &s2) : a(s2.a) {}
+  static float S2s; // expected-note 2 {{static data member is predetermined as shared}}
+  static const float S2sc;
+};
+const float S2::S2sc = 0; // expected-note 2 {{'S2sc' defined here}}
+S2 b;                     // expected-note 3 {{'b' defined here}}
+const S2 ba[5];           // expected-note 2 {{'ba' defined here}}
+class S3 {
+  int a;
+
+public:
+  int b;
+  S3() : a(0) {}
+  S3(const S3 &s3) : a(s3.a) {}
+  S3 operator+(const S3 &arg1) { return arg1; }
+};
+int operator+(const S3 &arg1, const S3 &arg2) { return 5; }
+S3 c;               // expected-note 3 {{'c' defined here}}
+const S3 ca[5];     // expected-note 2 {{'ca' defined here}}
+extern const int f; // expected-note 4 {{'f' declared here}}
+class S4 {
+  int a;
+  S4(); // expected-note {{implicitly declared private here}}
+  S4(const S4 &s4);
+  S4 &operator+(const S4 &arg) { return (*this); }
+
+public:
+  S4(int v) : a(v) {}
+};
+S4 &operator&=(S4 &arg1, S4 &arg2) { return arg1; }
+class S5 {
+  int a;
+  S5() : a(0) {} // expected-note {{implicitly declared private here}}
+  S5(const S5 &s5) : a(s5.a) {}
+  S5 &operator+(const S5 &arg);
+
+public:
+  S5(int v) : a(v) {}
+};
+class S6 { // expected-note 3 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
+#if __cplusplus >= 201103L // C++11 or later
+// expected-note@-2 3 {{candidate function (the implicit move assignment operator) not viable}}
+#endif
+  int a;
+
+public:
+  S6() : a(6) {}
+  operator int() { return 6; }
+} o;
+
+S3 h, k;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class T>       // expected-note {{declared here}}
+T tmain(T argc) {
+  const T d = T();       // expected-note 4 {{'d' defined here}}
+  const T da[5] = {T()}; // expected-note 2 {{'da' defined here}}
+  T qa[5] = {T()};
+  T i;
+  T &j = i;                        // expected-note 4 {{'j' defined here}}
+  S3 &p = k;                       // expected-note 2 {{'p' defined here}}
+  const T &r = da[(int)i];         // expected-note 2 {{'r' defined here}}
+  T &q = qa[(int)i];               // expected-note 2 {{'q' defined here}}
+  T fl;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction // expected-error {{expected '(' after 'reduction'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction + // expected-error {{expected '(' after 'reduction'}} expected-warning {{extra tokens at the end of '#pragma omp distribute simd' are ignored}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction( // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(- // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction() // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(*) // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(\) // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(& : argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(| : argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(|| : argc ? i : argc) // expected-error 2 {{expected variable name, array element or array section}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(foo : argc) //expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'float'}} expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'int'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(&& : argc)
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(^ : T) // expected-error {{'T' does not refer to a value}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified list item cannot be reduction}} expected-error 2 {{'operator+' is a private member of 'S2'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 4 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 3 {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(max : h.b) // expected-error {{expected variable name, array element or array section}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}} expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(&& : S2::S2s) // expected-error {{shared variable cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(&& : S2::S2sc) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(+ : h, k) // expected-error {{threadprivate or thread local variable cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(+ : o) // expected-error 2 {{no viable overloaded '='}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(i), reduction(+ : j), reduction(+ : q) // expected-error 4 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel private(k)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(+ : p), reduction(+ : p) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(+ : p), reduction(+ : p) // expected-error 2 {{variable can appear only once in OpenMP 'reduction' clause}} expected-note 2 {{previously referenced here}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(+ : r) // expected-error 2 {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel shared(i)
+#pragma omp parallel reduction(min : i)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(max : j) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel private(fl)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(+ : fl)
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel reduction(* : fl)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(+ : fl)
+  for (int i = 0; i < 10; ++i)
+    foo();
+
+  return T();
+}
+
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
+int main(int argc, char **argv) {
+  const int d = 5;       // expected-note 2 {{'d' defined here}}
+  const int da[5] = {0}; // expected-note {{'da' defined here}}
+  int qa[5] = {0};
+  S4 e(4);
+  S5 g(5);
+  int i;
+  int &j = i;                      // expected-note 2 {{'j' defined here}}
+  S3 &p = k;                       // expected-note 2 {{'p' defined here}}
+  const int &r = da[i];            // expected-note {{'r' defined here}}
+  int &q = qa[i];                  // expected-note {{'q' defined here}}
+  float fl;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction // expected-error {{expected '(' after 'reduction'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction + // expected-error {{expected '(' after 'reduction'}} expected-warning {{extra tokens at the end of '#pragma omp distribute simd' are ignored}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction( // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(- // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction() // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(*) // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(\) // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(foo : argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(| : argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(|| : argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name, array element or array section}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(~ : argc) // expected-error {{expected unqualified-id}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(&& : argc)
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(^ : S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{const-qualified list item cannot be reduction}} expected-error {{'operator+' is a private member of 'S2'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 2 {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(max : h.b) // expected-error {{expected variable name, array element or array section}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(&& : S2::S2s) // expected-error {{shared variable cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(&& : S2::S2sc) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(& : e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{invalid operands to binary expression ('S4' and 'S4')}} expected-error {{calling a private constructor of class 'S5'}} expected-error {{invalid operands to binary expression ('S5' and 'S5')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(+ : h, k, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(+ : o) // expected-error {{no viable overloaded '='}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd private(i), reduction(+ : j), reduction(+ : q) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel private(k)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(+ : p), reduction(+ : p) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(+ : p), reduction(+ : p) // expected-error {{variable can appear only once in OpenMP 'reduction' clause}} expected-note {{previously referenced here}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(+ : r) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel shared(i)
+#pragma omp parallel reduction(min : i)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(max : j) // expected-error {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel private(fl)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(+ : fl)
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel reduction(* : fl)
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(+ : fl)
+  for (int i = 0; i < 10; ++i)
+    foo();
+  static int m;
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd reduction(+ : m) // OK
+  for (int i = 0; i < 10; ++i)
+    m++;
+
+  return tmain(argc) + tmain(fl); // expected-note {{in instantiation of function template specialization 'tmain<int>' requested here}} expected-note {{in instantiation of function template specialization 'tmain<float>' requested here}}
+}
diff --git a/test/OpenMP/distribute_simd_safelen_messages.cpp b/test/OpenMP/distribute_simd_safelen_messages.cpp
new file mode 100644
index 0000000000000..4ae35fb2334da
--- /dev/null
+++ b/test/OpenMP/distribute_simd_safelen_messages.cpp
@@ -0,0 +1,177 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
+
+void foo() {
+}
+
+#if __cplusplus >= 201103L
+// expected-note@+2 4 {{declared here}}
+#endif
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, typename S, int N, int ST> // expected-note {{declared here}}
+T tmain(T argc, S **argv) { //expected-note 2 {{declared here}}
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd safelen // expected-error {{expected '(' after 'safelen'}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd safelen ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd safelen () // expected-error {{expected expression}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd safelen (argc  // expected-note {{to match this '('}} expected-error 2 {{expression is not an integral constant expression}} expected-note 2 {{read of non-const variable 'argc' is not allowed in a constant expression}} expected-error {{expected ')'}}
+  for (int i = ST; i < N; i++) 
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+  
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd safelen (ST // expected-error {{argument to 'safelen' clause must be a strictly positive integer value}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd safelen (1)) // expected-warning {{extra tokens at the end of '#pragma omp distribute simd' are ignored}}
+  for (int i = ST; i < N; i++)
+     argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd safelen ((ST > 0) ? 1 + ST : 2)
+  for (int i = ST; i < N; i++) 
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#if __cplusplus >= 201103L
+  // expected-note@+4 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd safelen (foobool(argc)), safelen (true), safelen (-5) // expected-error 2 {{directive '#pragma omp distribute simd' cannot contain more than one 'safelen' clause}} expected-error 2 {{argument to 'safelen' clause must be a strictly positive integer value}} expected-error 2 {{expression is not an integral constant expression}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd safelen (S) // expected-error {{'S' does not refer to a value}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#if __cplusplus <= 199711L
+  // expected-error@+6 2 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+4 2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd safelen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd safelen (4)
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd safelen (N) // expected-error {{argument to 'safelen' clause must be a strictly positive integer value}}
+  for (T i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+  return argc;
+}
+
+int main(int argc, char **argv) {
+#pragma omp target
+#pragma omp teams
+#pragma omp parallel for simd safelen // expected-error {{expected '(' after 'safelen'}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp parallel for simd safelen ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp parallel for simd safelen () // expected-error {{expected expression}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp parallel for simd safelen (4 // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp parallel for simd safelen (2+2)) // expected-warning {{extra tokens at the end of '#pragma omp parallel for simd' are ignored}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+  
+#if __cplusplus >= 201103L
+  // expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp parallel for simd safelen (foobool(1) > 0 ? 1 : 2) // expected-error {{expression is not an integral constant expression}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#if __cplusplus >= 201103L
+  // expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp parallel for simd safelen (foobool(argc)), safelen (true), safelen (-5) // expected-error 2 {{argument to 'safelen' clause must be a strictly positive integer value}} expected-error 2 {{directive '#pragma omp parallel for simd' cannot contain more than one 'safelen' clause}} expected-error {{expression is not an integral constant expression}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp parallel for simd safelen (S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#if __cplusplus <= 199711L
+  // expected-error@+6 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+4 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd safelen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+  // expected-note@+3 {{in instantiation of function template specialization 'tmain<int, char, -1, -2>' requested here}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd safelen(safelen(tmain<int, char, -1, -2>(argc, argv) // expected-error 2 {{expected ')'}} expected-note 2 {{to match this '('}}
+  foo(); // expected-error {{statement after '#pragma omp distribute simd' must be a for loop}}
+
+  // expected-note@+1 {{in instantiation of function template specialization 'tmain<int, char, 12, 4>' requested here}}
+  return tmain<int, char, 12, 4>(argc, argv);
+}
+
diff --git a/test/OpenMP/distribute_simd_simdlen_messages.cpp b/test/OpenMP/distribute_simd_simdlen_messages.cpp
new file mode 100644
index 0000000000000..4ae35fb2334da
--- /dev/null
+++ b/test/OpenMP/distribute_simd_simdlen_messages.cpp
@@ -0,0 +1,177 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
+
+void foo() {
+}
+
+#if __cplusplus >= 201103L
+// expected-note@+2 4 {{declared here}}
+#endif
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, typename S, int N, int ST> // expected-note {{declared here}}
+T tmain(T argc, S **argv) { //expected-note 2 {{declared here}}
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd safelen // expected-error {{expected '(' after 'safelen'}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd safelen ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd safelen () // expected-error {{expected expression}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd safelen (argc  // expected-note {{to match this '('}} expected-error 2 {{expression is not an integral constant expression}} expected-note 2 {{read of non-const variable 'argc' is not allowed in a constant expression}} expected-error {{expected ')'}}
+  for (int i = ST; i < N; i++) 
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+  
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd safelen (ST // expected-error {{argument to 'safelen' clause must be a strictly positive integer value}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd safelen (1)) // expected-warning {{extra tokens at the end of '#pragma omp distribute simd' are ignored}}
+  for (int i = ST; i < N; i++)
+     argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd safelen ((ST > 0) ? 1 + ST : 2)
+  for (int i = ST; i < N; i++) 
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#if __cplusplus >= 201103L
+  // expected-note@+4 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd safelen (foobool(argc)), safelen (true), safelen (-5) // expected-error 2 {{directive '#pragma omp distribute simd' cannot contain more than one 'safelen' clause}} expected-error 2 {{argument to 'safelen' clause must be a strictly positive integer value}} expected-error 2 {{expression is not an integral constant expression}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd safelen (S) // expected-error {{'S' does not refer to a value}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#if __cplusplus <= 199711L
+  // expected-error@+6 2 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+4 2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd safelen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd safelen (4)
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd safelen (N) // expected-error {{argument to 'safelen' clause must be a strictly positive integer value}}
+  for (T i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+  return argc;
+}
+
+int main(int argc, char **argv) {
+#pragma omp target
+#pragma omp teams
+#pragma omp parallel for simd safelen // expected-error {{expected '(' after 'safelen'}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp parallel for simd safelen ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp parallel for simd safelen () // expected-error {{expected expression}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp parallel for simd safelen (4 // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp parallel for simd safelen (2+2)) // expected-warning {{extra tokens at the end of '#pragma omp parallel for simd' are ignored}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+  
+#if __cplusplus >= 201103L
+  // expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp parallel for simd safelen (foobool(1) > 0 ? 1 : 2) // expected-error {{expression is not an integral constant expression}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#if __cplusplus >= 201103L
+  // expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp parallel for simd safelen (foobool(argc)), safelen (true), safelen (-5) // expected-error 2 {{argument to 'safelen' clause must be a strictly positive integer value}} expected-error 2 {{directive '#pragma omp parallel for simd' cannot contain more than one 'safelen' clause}} expected-error {{expression is not an integral constant expression}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target
+#pragma omp teams
+#pragma omp parallel for simd safelen (S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#if __cplusplus <= 199711L
+  // expected-error@+6 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+4 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd safelen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i-4];
+
+  // expected-note@+3 {{in instantiation of function template specialization 'tmain<int, char, -1, -2>' requested here}}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd safelen(safelen(tmain<int, char, -1, -2>(argc, argv) // expected-error 2 {{expected ')'}} expected-note 2 {{to match this '('}}
+  foo(); // expected-error {{statement after '#pragma omp distribute simd' must be a for loop}}
+
+  // expected-note@+1 {{in instantiation of function template specialization 'tmain<int, char, 12, 4>' requested here}}
+  return tmain<int, char, 12, 4>(argc, argv);
+}
+
diff --git a/test/OpenMP/driver.c b/test/OpenMP/driver.c
index f84541bef8ba0..74aaea5071966 100644
--- a/test/OpenMP/driver.c
+++ b/test/OpenMP/driver.c
@@ -8,3 +8,22 @@
 // CHECK-NO-TLS: -cc1
 // CHECK-NO-TLS-SAME: -fnoopenmp-use-tls
 //
+// RUN: %clang %s -c -E -dM -fopenmp=libomp | FileCheck --check-prefix=CHECK-DEFAULT-VERSION %s
+// RUN: %clang %s -c -E -dM -fopenmp=libomp -fopenmp-version=1 | FileCheck --check-prefix=CHECK-DEFAULT-VERSION %s
+// RUN: %clang %s -c -E -dM -fopenmp=libomp -fopenmp-version=0 | FileCheck --check-prefix=CHECK-DEFAULT-VERSION %s
+// RUN: %clang %s -c -E -dM -fopenmp=libomp -fopenmp-version=100 | FileCheck --check-prefix=CHECK-DEFAULT-VERSION %s
+// RUN: %clang %s -c -E -dM -fopenmp=libomp -fopenmp-version=31 | FileCheck --check-prefix=CHECK-DEFAULT-VERSION %s
+// CHECK-DEFAULT-VERSION: #define _OPENMP 201107
+
+// RUN: %clang %s -c -E -dM -fopenmp=libomp -fopenmp-version=40 | FileCheck --check-prefix=CHECK-40-VERSION %s
+// CHECK-40-VERSION: #define _OPENMP 201307
+
+// RUN: %clang %s -c -E -dM -fopenmp=libomp -fopenmp-version=45 | FileCheck --check-prefix=CHECK-45-VERSION %s
+// CHECK-45-VERSION: #define _OPENMP 201511
+
+// RUN: %clang %s -c -E -dM -fopenmp-version=1 | FileCheck --check-prefix=CHECK-VERSION %s
+// RUN: %clang %s -c -E -dM -fopenmp-version=31 | FileCheck --check-prefix=CHECK-VERSION %s
+// RUN: %clang %s -c -E -dM -fopenmp-version=40 | FileCheck --check-prefix=CHECK-VERSION %s
+// RUN: %clang %s -c -E -dM -fopenmp-version=45 | FileCheck --check-prefix=CHECK-VERSION %s
+// CHECK-VERSION-NOT: #define _OPENMP
+
diff --git a/test/OpenMP/dump.cpp b/test/OpenMP/dump.cpp
new file mode 100644
index 0000000000000..378b53ce5bf19
--- /dev/null
+++ b/test/OpenMP/dump.cpp
@@ -0,0 +1,68 @@
+// RUN: %clang_cc1 -verify -fopenmp -ast-dump %s | FileCheck %s
+// expected-no-diagnostics
+
+int ga, gb;
+#pragma omp threadprivate(ga, gb)
+
+// CHECK:      |-OMPThreadPrivateDecl {{.+}} <col:9> col:9
+// CHECK-NEXT: | |-DeclRefExpr {{.+}} <col:27> 'int' lvalue Var {{.+}} 'ga' 'int'
+// CHECK-NEXT: | `-DeclRefExpr {{.+}} <col:31> 'int' lvalue Var {{.+}} 'gb' 'int'
+
+#pragma omp declare reduction(+ : int, char : omp_out *= omp_in)
+
+#pragma omp declare reduction(fun : float : omp_out += omp_in) initializer(omp_priv = omp_orig + 15)
+
+// CHECK:      |-OMPDeclareReductionDecl {{.+}} <line:11:35> col:35 operator+ 'int' combiner
+// CHECK-NEXT: | |-CompoundAssignOperator {{.+}} <col:47, col:58> 'int' lvalue '*=' ComputeLHSTy='int' ComputeResultTy='int'
+// CHECK-NEXT: | | |-DeclRefExpr {{.+}} <col:47> 'int' lvalue Var {{.+}} 'omp_out' 'int'
+// CHECK-NEXT: | | `-ImplicitCastExpr {{.+}} <col:58> 'int' <LValueToRValue>
+// CHECK-NEXT: | |   `-DeclRefExpr {{.+}} <col:58> 'int' lvalue Var {{.+}} 'omp_in' 'int'
+// CHECK-NEXT: | |-VarDecl {{.+}} <col:35> col:35 implicit used omp_in 'int'
+// CHECK-NEXT: | `-VarDecl {{.+}} <col:35> col:35 implicit used omp_out 'int'
+// CHECK-NEXT: |-OMPDeclareReductionDecl {{.+}} <col:40> col:40 operator+ 'char' combiner
+// CHECK-NEXT: | |-CompoundAssignOperator {{.+}} <col:47, col:58> 'char' lvalue '*=' ComputeLHSTy='int' ComputeResultTy='int'
+// CHECK-NEXT: | | |-DeclRefExpr {{.+}} <col:47> 'char' lvalue Var {{.+}} 'omp_out' 'char'
+// CHECK-NEXT: | | `-ImplicitCastExpr {{.+}} <col:58> 'int' <IntegralCast>
+// CHECK-NEXT: | |   `-ImplicitCastExpr {{.+}} <col:58> 'char' <LValueToRValue>
+// CHECK-NEXT: | |     `-DeclRefExpr {{.+}} <col:58> 'char' lvalue Var {{.+}} 'omp_in' 'char'
+// CHECK-NEXT: | |-VarDecl {{.+}} <col:40> col:40 implicit used omp_in 'char'
+// CHECK-NEXT: | `-VarDecl {{.+}} <col:40> col:40 implicit used omp_out 'char'
+// CHECK-NEXT: |-OMPDeclareReductionDecl {{.+}} <line:13:37> col:37 fun 'float' combiner initializer
+// CHECK-NEXT: | |-CompoundAssignOperator {{.+}} <col:45, col:56> 'float' lvalue '+=' ComputeLHSTy='float' ComputeResultTy='float'
+// CHECK-NEXT: | | |-DeclRefExpr {{.+}} <col:45> 'float' lvalue Var {{.+}} 'omp_out' 'float'
+// CHECK-NEXT: | | `-ImplicitCastExpr {{.+}} <col:56> 'float' <LValueToRValue>
+// CHECK-NEXT: | |   `-DeclRefExpr {{.+}} <col:56> 'float' lvalue Var {{.+}} 'omp_in' 'float'
+
+struct S {
+  int a, b;
+  S() {
+#pragma omp parallel for default(none) private(a) shared(b) schedule(static, a)
+    for (int i = 0; i < 0; ++i)
+      ++a;
+  }
+};
+
+// CHECK:      |     `-OMPParallelForDirective {{.+}} <line:39:9, col:80>
+// CHECK-NEXT: |       |-OMPDefaultClause {{.+}} <col:26, col:40>
+// CHECK-NEXT: |       |-OMPPrivateClause {{.+}} <col:40, col:51>
+// CHECK-NEXT: |       | `-DeclRefExpr {{.+}} <col:48> 'int' lvalue OMPCapturedExpr {{.+}} 'a' 'int &'
+// CHECK-NEXT: |       |-OMPSharedClause {{.+}} <col:51, col:61>
+// CHECK-NEXT: |       | `-MemberExpr {{.+}} <col:58> 'int' lvalue ->b
+// CHECK-NEXT: |       |   `-CXXThisExpr {{.+}} <col:58> 'struct S *' this
+// CHECK-NEXT: |       |-OMPScheduleClause {{.+}} <col:61, col:79>
+// CHECK-NEXT: |       | `-ImplicitCastExpr {{.+}} <col:78> 'int' <LValueToRValue>
+// CHECK-NEXT: |       |   `-DeclRefExpr {{.+}} <col:78> 'int' lvalue OMPCapturedExpr {{.+}} '.capture_expr.' 'int'
+// CHECK-NEXT: |       |-CapturedStmt {{.+}} <line:40:5, <invalid sloc>>
+// CHECK-NEXT: |       | |-CapturedDecl {{.+}} <<invalid sloc>> <invalid sloc>
+// CHECK-NEXT: |       | | |-ForStmt {{.+}} <col:5, <invalid sloc>>
+// CHECK:      |       | | | `-UnaryOperator {{.+}} <line:41:7, <invalid sloc>> 'int' lvalue prefix '++'
+// CHECK-NEXT: |       | | |   `-DeclRefExpr {{.+}} <<invalid sloc>> 'int' lvalue OMPCapturedExpr {{.+}} 'a' 'int &'
+
+#pragma omp declare simd
+#pragma omp declare simd inbranch
+void foo();
+
+// CHECK:      `-FunctionDecl {{.+}} <line:63:1, col:10> col:6 foo 'void (void)'
+// CHECK-NEXT:   |-OMPDeclareSimdDeclAttr {{.+}} <line:62:9, col:34> Implicit BS_Inbranch
+// CHECK:        `-OMPDeclareSimdDeclAttr {{.+}} <line:61:9, col:25> Implicit BS_Undefined
+
diff --git a/test/OpenMP/for_ast_print.cpp b/test/OpenMP/for_ast_print.cpp
index 3510b904b1c76..182b395c13ca1 100644
--- a/test/OpenMP/for_ast_print.cpp
+++ b/test/OpenMP/for_ast_print.cpp
@@ -8,6 +8,94 @@
 
 void foo() {}
 
+struct S {
+  S(): a(0) {}
+  S(int v) : a(v) {}
+  int a;
+  typedef int type;
+};
+
+template <typename T>
+class S7 : public T {
+protected:
+  T a;
+  T &b;
+  typename T::type c:12;
+  typename T::type &d;
+  S7() : a(0), b(a), c(0), d(a.a) {}
+
+public:
+  S7(typename T::type v) : a(v), b(a), c(v), d(a.a) {
+#pragma omp for private(a) private(this->a) private(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+#pragma omp for lastprivate(a) lastprivate(this->a) lastprivate(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+#pragma omp for linear(val(c))
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S7 &operator=(S7 &s) {
+#pragma omp for private(a) private(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+#pragma omp for lastprivate(a) lastprivate(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+#pragma omp for linear(uval(this->b))
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+// CHECK: #pragma omp for private(this->a) private(this->a) private(this->S::a)
+// CHECK: #pragma omp for lastprivate(this->a) lastprivate(this->a) lastprivate(this->S::a)
+// CHECK: #pragma omp for linear(val(this->c))
+// CHECK: #pragma omp for private(this->a) private(this->a) private(T::a)
+// CHECK: #pragma omp for lastprivate(this->a) lastprivate(this->a) lastprivate(T::a)
+// CHECK: #pragma omp for linear(val(this->c))
+// CHECK: #pragma omp for private(this->a) private(this->a)
+// CHECK: #pragma omp for lastprivate(this->a) lastprivate(this->a)
+// CHECK: #pragma omp for linear(uval(this->b))
+
+class S8 : public S7<S> {
+  S8() {}
+
+public:
+  S8(int v) : S7<S>(v){
+#pragma omp for private(a) private(this->a) private(S7<S>::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+#pragma omp for lastprivate(a) lastprivate(this->a) lastprivate(S7<S>::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+#pragma omp for linear(ref(S7<S>::d))
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S8 &operator=(S8 &s) {
+#pragma omp for private(a) private(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+#pragma omp for lastprivate(a) lastprivate(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+#pragma omp for linear(this->c)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+// CHECK: #pragma omp for private(this->a) private(this->a) private(this->S7<S>::a)
+// CHECK: #pragma omp for lastprivate(this->a) lastprivate(this->a) lastprivate(this->S7<S>::a)
+// CHECK: #pragma omp for linear(ref(this->S7<S>::d))
+// CHECK: #pragma omp for private(this->a) private(this->a)
+// CHECK: #pragma omp for lastprivate(this->a) lastprivate(this->a)
+// CHECK: #pragma omp for linear(this->c)
+
 template <class T, int N>
 T tmain(T argc) {
   T b = argc, c, d, e, f, g;
diff --git a/test/OpenMP/for_codegen.cpp b/test/OpenMP/for_codegen.cpp
index 98761f56c7620..1d24403a09fbf 100644
--- a/test/OpenMP/for_codegen.cpp
+++ b/test/OpenMP/for_codegen.cpp
@@ -98,8 +98,8 @@ void static_not_chunked(float *a, float *b, float *c, float *d) {
 // CHECK-LABEL: define {{.*void}} @{{.*}}static_chunked{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}})
 void static_chunked(float *a, float *b, float *c, float *d) {
 // CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[IDENT_T_TY]]* [[DEFAULT_LOC:[@%].+]])
-  #pragma omp for schedule(static, 5)
-// CHECK: call void @__kmpc_for_static_init_4u([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32 33, i32* [[IS_LAST:%[^,]+]], i32* [[OMP_LB:%[^,]+]], i32* [[OMP_UB:%[^,]+]], i32* [[OMP_ST:%[^,]+]], i32 1, i32 5)
+  #pragma omp for schedule(monotonic: static, 5)
+// CHECK: call void @__kmpc_for_static_init_4u([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32 536870945, i32* [[IS_LAST:%[^,]+]], i32* [[OMP_LB:%[^,]+]], i32* [[OMP_UB:%[^,]+]], i32* [[OMP_ST:%[^,]+]], i32 1, i32 5)
 // UB = min(UB, GlobalUB)
 // CHECK: [[UB:%.+]] = load i32, i32* [[OMP_UB]]
 // CHECK-NEXT: [[UBCMP:%.+]] = icmp ugt i32 [[UB]], 16908288
@@ -158,8 +158,8 @@ void static_chunked(float *a, float *b, float *c, float *d) {
 // CHECK-LABEL: define {{.*void}} @{{.*}}dynamic1{{.*}}(float* {{.+}}, float* {{.+}}, float* {{.+}}, float* {{.+}})
 void dynamic1(float *a, float *b, float *c, float *d) {
 // CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[IDENT_T_TY]]* [[DEFAULT_LOC:[@%].+]])
-  #pragma omp for schedule(dynamic)
-// CHECK: call void @__kmpc_dispatch_init_8u([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32 35, i64 0, i64 16908287, i64 1, i64 1)
+  #pragma omp for schedule(nonmonotonic: dynamic)
+// CHECK: call void @__kmpc_dispatch_init_8u([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32 1073741859, i64 0, i64 16908287, i64 1, i64 1)
 //
 // CHECK: [[HASWORK:%.+]] = call i32 @__kmpc_dispatch_next_8u([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]], i32* [[OMP_ISLAST:%[^,]+]], i64* [[OMP_LB:%[^,]+]], i64* [[OMP_UB:%[^,]+]], i64* [[OMP_ST:%[^,]+]])
 // CHECK-NEXT: [[O_CMP:%.+]] = icmp ne i32 [[HASWORK]], 0
@@ -327,12 +327,13 @@ void runtime(float *a, float *b, float *c, float *d) {
 // CHECK-LABEL: test_precond
 void test_precond() {
   // CHECK: [[A_ADDR:%.+]] = alloca i8,
+  // CHECK: [[CAP:%.+]] = alloca i8,
   // CHECK: [[I_ADDR:%.+]] = alloca i8,
   char a = 0;
   // CHECK: store i8 0,
   // CHECK: store i32
   // CHECK: store i8
-  // CHECK: [[A:%.+]] = load i8, i8* [[A_ADDR]],
+  // CHECK: [[A:%.+]] = load i8, i8* [[CAP]],
   // CHECK: [[CONV:%.+]] = sext i8 [[A]] to i32
   // CHECK: [[CMP:%.+]] = icmp slt i32 [[CONV]], 10
   // CHECK: br i1 [[CMP]], label %[[PRECOND_THEN:[^,]+]], label %[[PRECOND_END:[^,]+]]
@@ -491,4 +492,25 @@ void loop_with_stmt_expr() {
 // CHECK: call void @__kmpc_for_static_init_4(
 // CHECK: call void @__kmpc_for_static_fini(
 
+
+// CHECK-LABEL: fint
+// CHECK: call {{.*}}i32 {{.*}}ftemplate
+// CHECK: ret i32
+
+// CHECK: load i16, i16*
+// CHECK: store i16 %
+// CHECK: call void {{.+}}@__kmpc_fork_call(
+// CHECK: call void @__kmpc_for_static_init_4(
+template <typename T>
+T ftemplate() {
+  short aa = 0;
+
+#pragma omp parallel for schedule(static, aa)
+  for (int i = 0; i < 100; i++) {
+  }
+  return T();
+}
+
+int fint(void) { return ftemplate<int>(); }
+
 #endif // HEADER
diff --git a/test/OpenMP/for_collapse_messages.cpp b/test/OpenMP/for_collapse_messages.cpp
index d40c305c6271b..a6fdf0066592c 100644
--- a/test/OpenMP/for_collapse_messages.cpp
+++ b/test/OpenMP/for_collapse_messages.cpp
@@ -1,8 +1,13 @@
 // RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
 
 void foo() {
 }
 
+#if __cplusplus >= 201103L
+// expected-note@+2 4 {{declared here}}
+#endif
 bool foobool(int argc) {
   return argc;
 }
@@ -29,14 +34,21 @@ T tmain(T argc, S **argv) { //expected-note 2 {{declared here}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp for collapse ((ST > 0) ? 1 + ST : 2) // expected-note 2 {{as specified in 'collapse' clause}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST]; // expected-error 2 {{expected 2 for loops after '#pragma omp for', but found only 1}}
-  // expected-error@+3 2 {{directive '#pragma omp for' cannot contain more than one 'collapse' clause}}
-  // expected-error@+2 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+  // expected-error@+6 2 {{directive '#pragma omp for' cannot contain more than one 'collapse' clause}}
+  // expected-error@+5 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   #pragma omp for collapse (foobool(argc)), collapse (true), collapse (-5)
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp for collapse (S) // expected-error {{'S' does not refer to a value}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp for collapse (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp for collapse (1)
@@ -59,16 +71,27 @@ int main(int argc, char **argv) {
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; // expected-error {{expected 4 for loops after '#pragma omp for', but found only 1}}
   #pragma omp for collapse (2+2)) // expected-warning {{extra tokens at the end of '#pragma omp for' are ignored}}  expected-note {{as specified in 'collapse' clause}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; // expected-error {{expected 4 for loops after '#pragma omp for', but found only 1}}
-  #pragma omp for collapse (foobool(1) > 0 ? 1 : 2) // expected-error {{expression is not an integral constant expression}}
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+  #pragma omp for collapse (foobool(1) > 0 ? 1 : 2)
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+3 {{expression is not an integral constant expression}}
+  // expected-error@+6 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   // expected-error@+2 2 {{directive '#pragma omp for' cannot contain more than one 'collapse' clause}}
   // expected-error@+1 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
   #pragma omp for collapse (foobool(argc)), collapse (true), collapse (-5) 
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   #pragma omp for collapse (S1) // expected-error {{'S1' does not refer to a value}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+1 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp for collapse (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   // expected-error@+3 {{statement after '#pragma omp for' must be a for loop}}
diff --git a/test/OpenMP/for_firstprivate_codegen.cpp b/test/OpenMP/for_firstprivate_codegen.cpp
index 01a93559451a0..3c6f37229f22e 100644
--- a/test/OpenMP/for_firstprivate_codegen.cpp
+++ b/test/OpenMP/for_firstprivate_codegen.cpp
@@ -95,7 +95,7 @@ int main() {
     // LAMBDA: [[SIVAR2_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[SIVAR2_PRIVATE_ADDR_REF]]
     // LAMBDA: store i{{[0-9]+}} [[SIVAR2_VAL]], i{{[0-9]+}}* [[SIVAR2_PRIVATE_ADDR]]
 
-    // LAMBDA: call void @__kmpc_barrier(
+    // LAMBDA-NOT: call void @__kmpc_barrier(
     g = 1;
     g1 = 1;
     sivar = 2;
@@ -158,7 +158,7 @@ int main() {
     // BLOCKS: [[SIVAR2_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[SIVAR_REF_ADDRR]]
     // BLOCKS: store i{{[0-9]+}} {{.+}}, i{{[0-9]+}}* [[SIVAR2_PRIVATE_ADDR]]
 
-    // BLOCKS: call void @__kmpc_barrier(
+    // BLOCKS-NOT: call void @__kmpc_barrier(
     g = 1;
     g1 =1;
     sivar = 2;
@@ -246,7 +246,7 @@ int main() {
 // CHECK: store i{{[0-9]+}} [[SIVAR_VAL]], i{{[0-9]+}}* [[SIVAR_PRIV]]
 
 // Synchronization for initialization.
-// CHECK: call void @__kmpc_barrier(%{{.+}}* [[IMPLICIT_BARRIER_LOC]], i{{[0-9]+}} [[GTID]])
+// CHECK-NOT: call void @__kmpc_barrier(
 
 // CHECK: call void @__kmpc_for_static_init_4(
 // CHECK: call void @__kmpc_for_static_fini(
@@ -262,31 +262,38 @@ int main() {
 
 // CHECK: define {{.*}} i{{[0-9]+}} [[TMAIN_INT]]()
 // CHECK: [[TEST:%.+]] = alloca [[S_INT_TY]],
+// CHECK: [[TVAR:%.+]] = alloca i32,
+// CHECK: [[TVAR_CAST:%.+]] = alloca i64,
 // CHECK: call {{.*}} [[S_INT_TY_DEF_CONSTR:@.+]]([[S_INT_TY]]* [[TEST]])
-// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 4, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, i32*, [2 x i32]*, [2 x [[S_INT_TY]]]*, [[S_INT_TY]]*)* [[TMAIN_MICROTASK:@.+]] to void
+// CHECK: [[TVAR_VAL:%.+]] = load i32, i32* [[TVAR]],
+// CHECK: [[TVAR_CONV:%.+]] = bitcast i64* [[TVAR_CAST]] to i32*
+// CHECK: store i32 [[TVAR_VAL]], i32* [[TVAR_CONV]],
+// CHECK: [[PVT_CASTVAL:%[^,]+]] = load i64, i64* [[TVAR_CAST]],
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 4, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, i64, [2 x i32]*, [2 x [[S_INT_TY]]]*, [[S_INT_TY]]*)* [[TMAIN_MICROTASK:@.+]] to void  (i32*, i32*, ...)*), i64 [[PVT_CASTVAL]],
 // CHECK: call {{.*}} [[S_INT_TY_DESTR:@.+]]([[S_INT_TY]]*
 // CHECK: ret
 //
-// CHECK: define internal void [[TMAIN_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, i32* dereferenceable(4) %{{.+}}, [2 x i32]* dereferenceable(8) %{{.+}}, [2 x [[S_INT_TY]]]* dereferenceable(8) %{{.+}}, [[S_INT_TY]]* dereferenceable(4) %{{.+}})
+// CHECK: define internal void [[TMAIN_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, i64 {{.*}}%{{.+}}, [2 x i32]* dereferenceable(8) %{{.+}}, [2 x [[S_INT_TY]]]* dereferenceable(8) %{{.+}}, [[S_INT_TY]]* dereferenceable(4) %{{.+}})
 // Skip temp vars for loop
+// CHECK: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
 // CHECK: alloca i{{[0-9]+}},
 // CHECK: alloca i{{[0-9]+}},
 // CHECK: alloca i{{[0-9]+}},
 // CHECK: alloca i{{[0-9]+}},
 // CHECK: alloca i{{[0-9]+}},
-// CHECK: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
 // CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}],
 // CHECK: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_INT_TY]]],
 // CHECK: [[VAR_PRIV:%.+]] = alloca [[S_INT_TY]],
 // CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
+// CHECK: %{{.+}} = bitcast i64* [[T_VAR_PRIV]] to i32*
 
-// CHECK: [[T_VAR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** %
+// CHECK-NOT: load i{{[0-9]+}}*, i{{[0-9]+}}** %
 // CHECK: [[VEC_REF:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** %
 // CHECK: [[S_ARR:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** %
+// CHECK: [[VAR:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** %
 
 // firstprivate t_var(t_var)
-// CHECK: [[T_VAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR_REF]],
-// CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_PRIV]],
+// CHECK-NOT: load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR_REF]],
 
 // firstprivate vec(vec)
 // CHECK: [[VEC_DEST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
@@ -310,10 +317,8 @@ int main() {
 // CHECK: call {{.*}} [[S_INT_TY_COPY_CONSTR]]([[S_INT_TY]]* [[VAR_PRIV]], [[S_INT_TY]]* {{.*}} [[VAR_REF]], [[ST_TY]]* [[ST_TY_TEMP]])
 // CHECK: call {{.*}} [[ST_TY_DESTR]]([[ST_TY]]* [[ST_TY_TEMP]])
 
-// Synchronization for initialization.
-// CHECK: [[GTID_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[GTID_ADDR_ADDR]]
-// CHECK: [[GTID:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[GTID_REF]]
-// CHECK: call void @__kmpc_barrier(%{{.+}}* [[IMPLICIT_BARRIER_LOC]], i{{[0-9]+}} [[GTID]])
+// No synchronization for initialization.
+// CHECK-NOT: call void @__kmpc_barrier(
 
 // CHECK: call void @__kmpc_for_static_init_4(
 // CHECK: call void @__kmpc_for_static_fini(
diff --git a/test/OpenMP/for_firstprivate_messages.cpp b/test/OpenMP/for_firstprivate_messages.cpp
index 1933de25850e6..60be4f5f2a2b1 100644
--- a/test/OpenMP/for_firstprivate_messages.cpp
+++ b/test/OpenMP/for_firstprivate_messages.cpp
@@ -143,7 +143,7 @@ int foomain(int argc, char **argv) {
     foo();
 #pragma omp parallel reduction(+ : i) // expected-note {{defined as reduction}}
 #pragma omp for firstprivate(i)       // expected-error {{firstprivate variable must be shared}}
-  for (i = 0; i < argc; ++i)
+  for (int k = 0; k < argc; ++k)
     foo();
   return 0;
 }
diff --git a/test/OpenMP/for_lastprivate_codegen.cpp b/test/OpenMP/for_lastprivate_codegen.cpp
index ea559b08eae0c..2b1d6c3cf97f4 100644
--- a/test/OpenMP/for_lastprivate_codegen.cpp
+++ b/test/OpenMP/for_lastprivate_codegen.cpp
@@ -8,6 +8,115 @@
 #ifndef HEADER
 #define HEADER
 
+struct SS {
+  int a;
+  int b : 4;
+  int &c;
+  SS(int &d) : a(0), b(0), c(d) {
+#pragma omp parallel
+#pragma omp for lastprivate(a, b, c)
+    for (int i = 0; i < 2; ++i)
+#ifdef LAMBDA
+      [&]() {
+        ++this->a, --b, (this)->c /= 1;
+#pragma omp parallel
+#pragma omp for lastprivate(a, b, c)
+        for (int i = 0; i < 2; ++i)
+          ++(this)->a, --b, this->c /= 1;
+      }();
+#elif defined(BLOCKS)
+      ^{
+        ++a;
+        --this->b;
+        (this)->c /= 1;
+#pragma omp parallel
+#pragma omp for lastprivate(a, b, c)
+        for (int i = 0; i < 2; ++i)
+          ++(this)->a, --b, this->c /= 1;
+      }();
+#else
+      ++this->a, --b, c /= 1;
+#endif
+#pragma omp for
+    for (a = 0; a < 2; ++a)
+#ifdef LAMBDA
+      [&]() {
+        ++this->a, --b, (this)->c /= 1;
+#pragma omp parallel
+#pragma omp for lastprivate(b)
+        for (b = 0; b < 2; ++b)
+          ++(this)->a, --b, this->c /= 1;
+      }();
+#elif defined(BLOCKS)
+      ^{
+        ++a;
+        --this->b;
+        (this)->c /= 1;
+#pragma omp parallel
+#pragma omp for
+        for (c = 0; c < 2; ++c)
+          ++(this)->a, --b, this->c /= 1;
+      }();
+#else
+      ++this->a, --b, c /= 1;
+#endif
+  }
+};
+
+template <typename T>
+struct SST {
+  T a;
+  SST() : a(T()) {
+#pragma omp parallel
+#pragma omp for lastprivate(a)
+    for (int i = 0; i < 2; ++i)
+#ifdef LAMBDA
+      [&]() {
+        [&]() {
+          ++this->a;
+#pragma omp parallel
+#pragma omp for lastprivate(a)
+          for (int i = 0; i < 2; ++i)
+            ++(this)->a;
+        }();
+      }();
+#elif defined(BLOCKS)
+      ^{
+        ^{
+          ++a;
+#pragma omp parallel
+#pragma omp for lastprivate(a)
+          for (int i = 0; i < 2; ++i)
+            ++(this)->a;
+        }();
+      }();
+#else
+      ++(this)->a;
+#endif
+#pragma omp for
+    for (a = 0; a < 2; ++a)
+#ifdef LAMBDA
+      [&]() {
+        ++this->a;
+#pragma omp parallel
+#pragma omp for
+        for (a = 0; a < 2; ++(this)->a)
+          ++(this)->a;
+      }();
+#elif defined(BLOCKS)
+      ^{
+        ++a;
+#pragma omp parallel
+#pragma omp for
+        for (this->a = 0; a < 2; ++a)
+          ++(this)->a;
+      }();
+#else
+      ++(this)->a;
+#endif
+  }
+};
+
 template <class T>
 struct S {
   T f;
@@ -23,6 +132,9 @@ volatile int &g1 = g;
 float f;
 char cnt;
 
+// CHECK: [[SS_TY:%.+]] = type { i{{[0-9]+}}, i8
+// LAMBDA: [[SS_TY:%.+]] = type { i{{[0-9]+}}, i8
+// BLOCKS: [[SS_TY:%.+]] = type { i{{[0-9]+}}, i8
 // CHECK: [[S_FLOAT_TY:%.+]] = type { float }
 // CHECK: [[S_INT_TY:%.+]] = type { i32 }
 // CHECK-DAG: [[IMPLICIT_BARRIER_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 66, i32 0, i32 0, i8*
@@ -32,6 +144,7 @@ char cnt;
 template <typename T>
 T tmain() {
   S<T> test;
+  SST<T> sst;
   T t_var __attribute__((aligned(128))) = T();
   T vec[] __attribute__((aligned(128))) = {1, 2};
   S<T> s_arr[] __attribute__((aligned(128))) = {1, 2};
@@ -54,17 +167,75 @@ using A::x;
 
 int main() {
   static int sivar;
+  SS ss(sivar);
 #ifdef LAMBDA
   // LAMBDA: [[G:@.+]] = global i{{[0-9]+}} 1212,
   // LAMBDA: [[SIVAR:@.+]] = internal global i{{[0-9]+}} 0,
   // LAMBDA-LABEL: @main
-  // LAMBDA: call void [[OUTER_LAMBDA:@.+]](
+  // LAMBDA: alloca [[SS_TY]],
+  // LAMBDA: alloca [[CAP_TY:%.+]],
+  // LAMBDA: call void [[OUTER_LAMBDA:@.+]]([[CAP_TY]]*
   [&]() {
   // LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
   // LAMBDA: call void {{.+}} @__kmpc_fork_call({{.+}}, i32 1, {{.+}}* [[OMP_REGION:@.+]] to {{.+}}, i32* %{{.+}})
 #pragma omp parallel
 #pragma omp for lastprivate(g, g1, sivar)
   for (int i = 0; i < 2; ++i) {
+    // LAMBDA: define {{.+}} @{{.+}}([[SS_TY]]*
+    // LAMBDA: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 0
+    // LAMBDA: store i{{[0-9]+}} 0, i{{[0-9]+}}* %
+    // LAMBDA: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 1
+    // LAMBDA: store i8
+    // LAMBDA: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 2
+    // LAMBDA: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[SS_TY]]*)* [[SS_MICROTASK:@.+]] to void
+    // LAMBDA: call void @__kmpc_for_static_init_4(
+    // LAMBDA-NOT: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 0
+    // LAMBDA: call void {{.+}} [[SS_LAMBDA:@[^ ]+]]
+    // LAMBDA: call void @__kmpc_for_static_fini(%
+    // LAMBDA: ret
+
+    // LAMBDA: define internal void [[SS_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[SS_TY]]* %{{.+}})
+    // LAMBDA: getelementptr {{.*}}[[SS_TY]], [[SS_TY]]* %{{.*}}, i32 0, i32 0
+    // LAMBDA-NOT: getelementptr {{.*}}[[SS_TY]], [[SS_TY]]* %{{.*}}, i32 0, i32 1
+    // LAMBDA: getelementptr {{.*}}[[SS_TY]], [[SS_TY]]* %{{.*}}, i32 0, i32 2
+    // LAMBDA: call void @__kmpc_for_static_init_4(
+    // LAMBDA-NOT: getelementptr {{.*}}[[SS_TY]], [[SS_TY]]*
+    // LAMBDA: call{{.*}} void
+    // LAMBDA: call void @__kmpc_for_static_fini(
+    // LAMBDA: br i1
+    // LAMBDA: [[B_REF:%.+]] = getelementptr {{.*}}[[SS_TY]], [[SS_TY]]* %{{.*}}, i32 0, i32 1
+    // LAMBDA: store i8 %{{.+}}, i8* [[B_REF]],
+    // LAMBDA: br label
+    // LAMBDA: ret void
+
+    // LAMBDA: define internal void @{{.+}}(i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[SS_TY]]* %{{.+}}, i32* {{.+}}, i32* {{.+}}, i32* {{.+}})
+    // LAMBDA: alloca i{{[0-9]+}},
+    // LAMBDA: alloca i{{[0-9]+}},
+    // LAMBDA: alloca i{{[0-9]+}},
+    // LAMBDA: alloca i{{[0-9]+}},
+    // LAMBDA: alloca i{{[0-9]+}},
+    // LAMBDA: [[A_PRIV:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: [[B_PRIV:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: [[C_PRIV:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: store i{{[0-9]+}}* [[A_PRIV]], i{{[0-9]+}}** [[REFA:%.+]],
+    // LAMBDA: store i{{[0-9]+}}* [[C_PRIV]], i{{[0-9]+}}** [[REFC:%.+]],
+    // LAMBDA: call void @__kmpc_for_static_init_4(
+    // LAMBDA: [[A_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFA]],
+    // LAMBDA-NEXT: [[A_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[A_PRIV]],
+    // LAMBDA-NEXT: [[INC:%.+]] = add nsw i{{[0-9]+}} [[A_VAL]], 1
+    // LAMBDA-NEXT: store i{{[0-9]+}} [[INC]], i{{[0-9]+}}* [[A_PRIV]],
+    // LAMBDA-NEXT: [[B_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[B_PRIV]],
+    // LAMBDA-NEXT: [[DEC:%.+]] = add nsw i{{[0-9]+}} [[B_VAL]], -1
+    // LAMBDA-NEXT: store i{{[0-9]+}} [[DEC]], i{{[0-9]+}}* [[B_PRIV]],
+    // LAMBDA-NEXT: [[C_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFC]],
+    // LAMBDA-NEXT: [[C_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[C_PRIV]],
+    // LAMBDA-NEXT: [[DIV:%.+]] = sdiv i{{[0-9]+}} [[C_VAL]], 1
+    // LAMBDA-NEXT: store i{{[0-9]+}} [[DIV]], i{{[0-9]+}}* [[C_PRIV]],
+    // LAMBDA: call void @__kmpc_for_static_fini(
+    // LAMBDA: br i1
+    // LAMBDA: br label
+    // LAMBDA: ret void
+
     // LAMBDA: define{{.*}} internal{{.*}} void [[OMP_REGION]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i32* dereferenceable(4) [[SIVAR:%.+]])
     // LAMBDA: alloca i{{[0-9]+}},
     // LAMBDA: alloca i{{[0-9]+}},
@@ -128,6 +299,7 @@ int main() {
 #elif defined(BLOCKS)
   // BLOCKS: [[G:@.+]] = global i{{[0-9]+}} 1212,
   // BLOCKS-LABEL: @main
+  // BLOCKS: call
   // BLOCKS: call void {{%.+}}(i8
   ^{
   // BLOCKS: define{{.*}} internal{{.*}} void {{.+}}(i8*
@@ -191,6 +363,60 @@ int main() {
   }
   }();
   return 0;
+// BLOCKS: define {{.+}} @{{.+}}([[SS_TY]]*
+// BLOCKS: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 0
+// BLOCKS: store i{{[0-9]+}} 0, i{{[0-9]+}}* %
+// BLOCKS: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 1
+// BLOCKS: store i8
+// BLOCKS: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 2
+// BLOCKS: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[SS_TY]]*)* [[SS_MICROTASK:@.+]] to void
+// BLOCKS: call void @__kmpc_for_static_init_4(
+// BLOCKS-NOT: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 0
+// BLOCKS: call void
+// BLOCKS: call void @__kmpc_for_static_fini(%
+// BLOCKS: ret
+
+// BLOCKS: define internal void [[SS_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[SS_TY]]* %{{.+}})
+// BLOCKS: getelementptr {{.*}}[[SS_TY]], [[SS_TY]]* %{{.*}}, i32 0, i32 0
+// BLOCKS-NOT: getelementptr {{.*}}[[SS_TY]], [[SS_TY]]* %{{.*}}, i32 0, i32 1
+// BLOCKS: getelementptr {{.*}}[[SS_TY]], [[SS_TY]]* %{{.*}}, i32 0, i32 2
+// BLOCKS: call void @__kmpc_for_static_init_4(
+// BLOCKS-NOT: getelementptr {{.*}}[[SS_TY]], [[SS_TY]]*
+// BLOCKS: call{{.*}} void
+// BLOCKS: call void @__kmpc_for_static_fini(
+// BLOCKS: br i1
+// BLOCKS: [[B_REF:%.+]] = getelementptr {{.*}}[[SS_TY]], [[SS_TY]]* %{{.*}}, i32 0, i32 1
+// BLOCKS: store i8 %{{.+}}, i8* [[B_REF]],
+// BLOCKS: br label
+// BLOCKS: ret void
+
+// BLOCKS: define internal void @{{.+}}(i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[SS_TY]]* %{{.+}}, i32* {{.+}}, i32* {{.+}}, i32* {{.+}})
+// BLOCKS: alloca i{{[0-9]+}},
+// BLOCKS: alloca i{{[0-9]+}},
+// BLOCKS: alloca i{{[0-9]+}},
+// BLOCKS: alloca i{{[0-9]+}},
+// BLOCKS: alloca i{{[0-9]+}},
+// BLOCKS: [[A_PRIV:%.+]] = alloca i{{[0-9]+}},
+// BLOCKS: [[B_PRIV:%.+]] = alloca i{{[0-9]+}},
+// BLOCKS: [[C_PRIV:%.+]] = alloca i{{[0-9]+}},
+// BLOCKS: store i{{[0-9]+}}* [[A_PRIV]], i{{[0-9]+}}** [[REFA:%.+]],
+// BLOCKS: store i{{[0-9]+}}* [[C_PRIV]], i{{[0-9]+}}** [[REFC:%.+]],
+// BLOCKS: call void @__kmpc_for_static_init_4(
+// BLOCKS: [[A_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFA]],
+// BLOCKS-NEXT: [[A_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[A_PRIV]],
+// BLOCKS-NEXT: [[INC:%.+]] = add nsw i{{[0-9]+}} [[A_VAL]], 1
+// BLOCKS-NEXT: store i{{[0-9]+}} [[INC]], i{{[0-9]+}}* [[A_PRIV]],
+// BLOCKS-NEXT: [[B_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[B_PRIV]],
+// BLOCKS-NEXT: [[DEC:%.+]] = add nsw i{{[0-9]+}} [[B_VAL]], -1
+// BLOCKS-NEXT: store i{{[0-9]+}} [[DEC]], i{{[0-9]+}}* [[B_PRIV]],
+// BLOCKS-NEXT: [[C_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFC]],
+// BLOCKS-NEXT: [[C_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[C_PRIV]],
+// BLOCKS-NEXT: [[DIV:%.+]] = sdiv i{{[0-9]+}} [[C_VAL]], 1
+// BLOCKS-NEXT: store i{{[0-9]+}} [[DIV]], i{{[0-9]+}}* [[C_PRIV]],
+// BLOCKS: call void @__kmpc_for_static_fini(
+// BLOCKS: br i1
+// BLOCKS: br label
+// BLOCKS: ret void
 #else
   S<float> test;
   int t_var = 0;
@@ -414,7 +640,52 @@ int main() {
 // CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 4, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, i32*, [2 x i32]*, [2 x [[S_INT_TY]]]*, [[S_INT_TY]]*)* [[TMAIN_MICROTASK:@.+]] to void
 // CHECK: call void [[S_INT_TY_DESTR:@.+]]([[S_INT_TY]]*
 // CHECK: ret
-//
+
+// CHECK: define {{.+}} @{{.+}}([[SS_TY]]*
+// CHECK: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 0
+// CHECK: store i{{[0-9]+}} 0, i{{[0-9]+}}* %
+// CHECK: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 1
+// CHECK: store i8
+// CHECK: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 2
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[SS_TY]]*)* [[SS_MICROTASK:@.+]] to void
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK-NOT: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 0
+// CHECK: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 1
+// CHECK: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 2
+// CHECK: call void @__kmpc_for_static_fini(%
+// CHECK: ret
+
+// CHECK: define internal void [[SS_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[SS_TY]]* %{{.+}})
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: [[A_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[B_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[C_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: store i{{[0-9]+}}* [[A_PRIV]], i{{[0-9]+}}** [[REFA:%.+]],
+// CHECK: store i{{[0-9]+}}* [[C_PRIV]], i{{[0-9]+}}** [[REFC:%.+]],
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: [[A_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFA]],
+// CHECK-NEXT: [[A_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[A_PRIV]],
+// CHECK-NEXT: [[INC:%.+]] = add nsw i{{[0-9]+}} [[A_VAL]], 1
+// CHECK-NEXT: store i{{[0-9]+}} [[INC]], i{{[0-9]+}}* [[A_PRIV]],
+// CHECK-NEXT: [[B_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[B_PRIV]],
+// CHECK-NEXT: [[DEC:%.+]] = add nsw i{{[0-9]+}} [[B_VAL]], -1
+// CHECK-NEXT: store i{{[0-9]+}} [[DEC]], i{{[0-9]+}}* [[B_PRIV]],
+// CHECK-NEXT: [[C_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFC]],
+// CHECK-NEXT: [[C_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[C_PRIV]],
+// CHECK-NEXT: [[DIV:%.+]] = sdiv i{{[0-9]+}} [[C_VAL]], 1
+// CHECK-NEXT: store i{{[0-9]+}} [[DIV]], i{{[0-9]+}}* [[C_PRIV]],
+// CHECK: call void @__kmpc_for_static_fini(
+// CHECK: br i1
+// CHECK: [[B_REF:%.+]] = getelementptr {{.*}}[[SS_TY]], [[SS_TY]]* %{{.*}}, i32 0, i32 1
+// CHECK: store i8 %{{.+}}, i8* [[B_REF]],
+// CHECK: br label
+// CHECK: ret void
+
 // CHECK: define internal void [[TMAIN_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, i32* dereferenceable(4) %{{.+}}, [2 x i32]* dereferenceable(8) %{{.+}}, [2 x [[S_INT_TY]]]* dereferenceable(8) %{{.+}}, [[S_INT_TY]]* dereferenceable(4) %{{.+}})
 // CHECK: alloca i{{[0-9]+}},
 // CHECK: alloca i{{[0-9]+}},
diff --git a/test/OpenMP/for_linear_codegen.cpp b/test/OpenMP/for_linear_codegen.cpp
index db9788361dd0f..0ad45f5b7054f 100644
--- a/test/OpenMP/for_linear_codegen.cpp
+++ b/test/OpenMP/for_linear_codegen.cpp
@@ -23,6 +23,74 @@ volatile int &g1 = g;
 float f;
 char cnt;
 
+struct SS {
+  int a;
+  int b : 4;
+  int &c;
+  SS(int &d) : a(0), b(0), c(d) {
+#pragma omp parallel
+#pragma omp for linear(a, b, c)
+    for (int i = 0; i < 2; ++i)
+#ifdef LAMBDA
+      [&]() {
+        ++this->a, --b, (this)->c /= 1;
+#pragma omp parallel
+#pragma omp for linear(a, b) linear(ref(c))
+        for (int i = 0; i < 2; ++i)
+          ++(this)->a, --b, this->c /= 1;
+      }();
+#elif defined(BLOCKS)
+      ^{
+        ++a;
+        --this->b;
+        (this)->c /= 1;
+#pragma omp parallel
+#pragma omp for linear(a, b) linear(uval(c))
+        for (int i = 0; i < 2; ++i)
+          ++(this)->a, --b, this->c /= 1;
+      }();
+#else
+      ++this->a, --b, c /= 1;
+#endif
+  }
+};
+
+template <typename T>
+struct SST {
+  T a;
+  SST() : a(T()) {
+#pragma omp parallel
+#pragma omp for linear(a)
+    for (int i = 0; i < 2; ++i)
+#ifdef LAMBDA
+      [&]() {
+        [&]() {
+          ++this->a;
+#pragma omp parallel
+#pragma omp for linear(a)
+          for (int i = 0; i < 2; ++i)
+            ++(this)->a;
+        }();
+      }();
+#elif defined(BLOCKS)
+      ^{
+        ^{
+          ++a;
+#pragma omp parallel
+#pragma omp for linear(a)
+          for (int i = 0; i < 2; ++i)
+            ++(this)->a;
+        }();
+      }();
+#else
+      ++(this)->a;
+#endif
+  }
+};
+
+// CHECK: [[SS_TY:%.+]] = type { i{{[0-9]+}}, i8
+// LAMBDA: [[SS_TY:%.+]] = type { i{{[0-9]+}}, i8
+// BLOCKS: [[SS_TY:%.+]] = type { i{{[0-9]+}}, i8
 // CHECK: [[S_FLOAT_TY:%.+]] = type { float }
 // CHECK: [[S_INT_TY:%.+]] = type { i32 }
 // CHECK-DAG: [[IMPLICIT_BARRIER_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 66, i32 0, i32 0, i8*
@@ -31,6 +99,7 @@ char cnt;
 template <typename T>
 T tmain() {
   S<T> test;
+  SST<T> sst;
   T *pvar = &test.f;
   T &lvar = test.f;
 #pragma omp parallel
@@ -42,16 +111,75 @@ T tmain() {
 }
 
 int main() {
+  static int sivar;
+  SS ss(sivar);
 #ifdef LAMBDA
   // LAMBDA: [[G:@.+]] = global i{{[0-9]+}} 1212,
   // LAMBDA-LABEL: @main
-  // LAMBDA: call void [[OUTER_LAMBDA:@.+]](
+  // LAMBDA: alloca [[SS_TY]],
+  // LAMBDA: alloca [[CAP_TY:%.+]],
+  // LAMBDA: call void [[OUTER_LAMBDA:@.+]]([[CAP_TY]]*
   [&]() {
   // LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
   // LAMBDA: call void {{.+}} @__kmpc_fork_call({{.+}}, i32 0, {{.+}}* [[OMP_REGION:@.+]] to {{.+}})
 #pragma omp parallel
 #pragma omp for linear(g, g1:5)
   for (int i = 0; i < 2; ++i) {
+    // LAMBDA: define {{.+}} @{{.+}}([[SS_TY]]*
+    // LAMBDA: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 0
+    // LAMBDA: store i{{[0-9]+}} 0, i{{[0-9]+}}* %
+    // LAMBDA: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 1
+    // LAMBDA: store i8
+    // LAMBDA: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 2
+    // LAMBDA: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[SS_TY]]*)* [[SS_MICROTASK:@.+]] to void
+    // LAMBDA: ret
+
+    // LAMBDA: define internal void [[SS_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[SS_TY]]* %{{.+}})
+    // LAMBDA: getelementptr {{.*}}[[SS_TY]], [[SS_TY]]* %{{.*}}, i32 0, i32 0
+    // LAMBDA-NOT: getelementptr {{.*}}[[SS_TY]], [[SS_TY]]* %{{.*}}, i32 0, i32 1
+    // LAMBDA: getelementptr {{.*}}[[SS_TY]], [[SS_TY]]* %{{.*}}, i32 0, i32 2
+    // LAMBDA: call void @__kmpc_for_static_init_4(
+    // LAMBDA-NOT: getelementptr {{.*}}[[SS_TY]], [[SS_TY]]*
+    // LAMBDA: call{{.*}} void
+    // LAMBDA: call void @__kmpc_for_static_fini(
+    // LAMBDA: br i1
+    // LAMBDA: [[B_REF:%.+]] = getelementptr {{.*}}[[SS_TY]], [[SS_TY]]* %{{.*}}, i32 0, i32 1
+    // LAMBDA: store i8 %{{.+}}, i8* [[B_REF]],
+    // LAMBDA: br label
+    // LAMBDA: ret void
+
+    // LAMBDA: define internal void @{{.+}}(i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[SS_TY]]* %{{.+}}, i32* {{.+}}, i32* {{.+}}, i32* {{.+}})
+    // LAMBDA: alloca i{{[0-9]+}},
+    // LAMBDA: alloca i{{[0-9]+}},
+    // LAMBDA: alloca i{{[0-9]+}},
+    // LAMBDA: alloca i{{[0-9]+}},
+    // LAMBDA: alloca i{{[0-9]+}},
+    // LAMBDA: alloca i{{[0-9]+}},
+    // LAMBDA: alloca i{{[0-9]+}},
+    // LAMBDA: alloca i{{[0-9]+}},
+    // LAMBDA: alloca i{{[0-9]+}},
+    // LAMBDA: [[A_PRIV:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: [[B_PRIV:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: [[C_PRIV:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: store i{{[0-9]+}}* [[A_PRIV]], i{{[0-9]+}}** [[REFA:%.+]],
+    // LAMBDA: store i{{[0-9]+}}* [[C_PRIV]], i{{[0-9]+}}** [[REFC:%.+]],
+    // LAMBDA: call void @__kmpc_for_static_init_4(
+    // LAMBDA: [[A_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFA]],
+    // LAMBDA-NEXT: [[A_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[A_PRIV]],
+    // LAMBDA-NEXT: [[INC:%.+]] = add nsw i{{[0-9]+}} [[A_VAL]], 1
+    // LAMBDA-NEXT: store i{{[0-9]+}} [[INC]], i{{[0-9]+}}* [[A_PRIV]],
+    // LAMBDA-NEXT: [[B_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[B_PRIV]],
+    // LAMBDA-NEXT: [[DEC:%.+]] = add nsw i{{[0-9]+}} [[B_VAL]], -1
+    // LAMBDA-NEXT: store i{{[0-9]+}} [[DEC]], i{{[0-9]+}}* [[B_PRIV]],
+    // LAMBDA-NEXT: [[C_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFC]],
+    // LAMBDA-NEXT: [[C_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[C_PRIV]],
+    // LAMBDA-NEXT: [[DIV:%.+]] = sdiv i{{[0-9]+}} [[C_VAL]], 1
+    // LAMBDA-NEXT: store i{{[0-9]+}} [[DIV]], i{{[0-9]+}}* [[C_PRIV]],
+    // LAMBDA: call void @__kmpc_for_static_fini(
+    // LAMBDA: br i1
+    // LAMBDA: br label
+    // LAMBDA: ret void
+
     // LAMBDA: define{{.*}} internal{{.*}} void [[OMP_REGION]](i32* noalias %{{.+}}, i32* noalias %{{.+}})
     // LAMBDA: alloca i{{[0-9]+}},
     // LAMBDA: [[G_START_ADDR:%.+]] = alloca i{{[0-9]+}},
@@ -96,6 +224,7 @@ int main() {
 #elif defined(BLOCKS)
   // BLOCKS: [[G:@.+]] = global i{{[0-9]+}} 1212,
   // BLOCKS-LABEL: @main
+  // BLOCKS: call
   // BLOCKS: call void {{%.+}}(i8
   ^{
   // BLOCKS: define{{.*}} internal{{.*}} void {{.+}}(i8*
@@ -146,6 +275,60 @@ int main() {
   }
   }();
   return 0;
+// BLOCKS: define {{.+}} @{{.+}}([[SS_TY]]*
+// BLOCKS: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 0
+// BLOCKS: store i{{[0-9]+}} 0, i{{[0-9]+}}* %
+// BLOCKS: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 1
+// BLOCKS: store i8
+// BLOCKS: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 2
+// BLOCKS: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[SS_TY]]*)* [[SS_MICROTASK:@.+]] to void
+// BLOCKS: ret
+
+// BLOCKS: define internal void [[SS_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[SS_TY]]* %{{.+}})
+// BLOCKS: getelementptr {{.*}}[[SS_TY]], [[SS_TY]]* %{{.*}}, i32 0, i32 0
+// BLOCKS-NOT: getelementptr {{.*}}[[SS_TY]], [[SS_TY]]* %{{.*}}, i32 0, i32 1
+// BLOCKS: getelementptr {{.*}}[[SS_TY]], [[SS_TY]]* %{{.*}}, i32 0, i32 2
+// BLOCKS: call void @__kmpc_for_static_init_4(
+// BLOCKS-NOT: getelementptr {{.*}}[[SS_TY]], [[SS_TY]]*
+// BLOCKS: call{{.*}} void
+// BLOCKS: call void @__kmpc_for_static_fini(
+// BLOCKS: br i1
+// BLOCKS: [[B_REF:%.+]] = getelementptr {{.*}}[[SS_TY]], [[SS_TY]]* %{{.*}}, i32 0, i32 1
+// BLOCKS: store i8 %{{.+}}, i8* [[B_REF]],
+// BLOCKS: br label
+// BLOCKS: ret void
+
+// BLOCKS: define internal void @{{.+}}(i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[SS_TY]]* %{{.+}}, i32* {{.+}}, i32* {{.+}}, i32* {{.+}})
+// BLOCKS: alloca i{{[0-9]+}},
+// BLOCKS: alloca i{{[0-9]+}},
+// BLOCKS: alloca i{{[0-9]+}},
+// BLOCKS: alloca i{{[0-9]+}},
+// BLOCKS: alloca i{{[0-9]+}},
+// BLOCKS: alloca i{{[0-9]+}},
+// BLOCKS: alloca i{{[0-9]+}},
+// BLOCKS: alloca i{{[0-9]+}},
+// BLOCKS: alloca i{{[0-9]+}},
+// BLOCKS: [[A_PRIV:%.+]] = alloca i{{[0-9]+}},
+// BLOCKS: [[B_PRIV:%.+]] = alloca i{{[0-9]+}},
+// BLOCKS: [[C_PRIV:%.+]] = alloca i{{[0-9]+}},
+// BLOCKS: store i{{[0-9]+}}* [[A_PRIV]], i{{[0-9]+}}** [[REFA:%.+]],
+// BLOCKS: store i{{[0-9]+}}* [[C_PRIV]], i{{[0-9]+}}** [[REFC:%.+]],
+// BLOCKS: call void @__kmpc_for_static_init_4(
+// BLOCKS: [[A_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFA]],
+// BLOCKS-NEXT: [[A_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[A_PRIV]],
+// BLOCKS-NEXT: [[INC:%.+]] = add nsw i{{[0-9]+}} [[A_VAL]], 1
+// BLOCKS-NEXT: store i{{[0-9]+}} [[INC]], i{{[0-9]+}}* [[A_PRIV]],
+// BLOCKS-NEXT: [[B_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[B_PRIV]],
+// BLOCKS-NEXT: [[DEC:%.+]] = add nsw i{{[0-9]+}} [[B_VAL]], -1
+// BLOCKS-NEXT: store i{{[0-9]+}} [[DEC]], i{{[0-9]+}}* [[B_PRIV]],
+// BLOCKS-NEXT: [[C_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFC]],
+// BLOCKS-NEXT: [[C_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[C_PRIV]],
+// BLOCKS-NEXT: [[DIV:%.+]] = sdiv i{{[0-9]+}} [[C_VAL]], 1
+// BLOCKS-NEXT: store i{{[0-9]+}} [[DIV]], i{{[0-9]+}}* [[C_PRIV]],
+// BLOCKS: call void @__kmpc_for_static_fini(
+// BLOCKS: br i1
+// BLOCKS: br label
+// BLOCKS: ret void
 #else
   S<float> test;
   float *pvar = &test.f;
@@ -216,7 +399,51 @@ int main() {
 // CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 2, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, i32**, i32*)* [[TMAIN_MICROTASK:@.+]] to void
 // CHECK: call void [[S_INT_TY_DESTR:@.+]]([[S_INT_TY]]*
 // CHECK: ret
-//
+
+// CHECK: define {{.+}} @{{.+}}([[SS_TY]]*
+// CHECK: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 0
+// CHECK: store i{{[0-9]+}} 0, i{{[0-9]+}}* %
+// CHECK: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 1
+// CHECK: store i8
+// CHECK: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 2
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[SS_TY]]*)* [[SS_MICROTASK:@.+]] to void
+// CHECK: ret
+
+// CHECK: define internal void [[SS_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[SS_TY]]* %{{.+}})
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: [[A_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[B_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[C_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: store i{{[0-9]+}}* [[A_PRIV]], i{{[0-9]+}}** [[REFA:%.+]],
+// CHECK: store i{{[0-9]+}}* [[C_PRIV]], i{{[0-9]+}}** [[REFC:%.+]],
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK: [[A_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFA]],
+// CHECK-NEXT: [[A_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[A_PRIV]],
+// CHECK-NEXT: [[INC:%.+]] = add nsw i{{[0-9]+}} [[A_VAL]], 1
+// CHECK-NEXT: store i{{[0-9]+}} [[INC]], i{{[0-9]+}}* [[A_PRIV]],
+// CHECK-NEXT: [[B_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[B_PRIV]],
+// CHECK-NEXT: [[DEC:%.+]] = add nsw i{{[0-9]+}} [[B_VAL]], -1
+// CHECK-NEXT: store i{{[0-9]+}} [[DEC]], i{{[0-9]+}}* [[B_PRIV]],
+// CHECK-NEXT: [[C_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFC]],
+// CHECK-NEXT: [[C_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[C_PRIV]],
+// CHECK-NEXT: [[DIV:%.+]] = sdiv i{{[0-9]+}} [[C_VAL]], 1
+// CHECK-NEXT: store i{{[0-9]+}} [[DIV]], i{{[0-9]+}}* [[C_PRIV]],
+// CHECK: call void @__kmpc_for_static_fini(
+// CHECK: br i1
+// CHECK: [[B_REF:%.+]] = getelementptr {{.*}}[[SS_TY]], [[SS_TY]]* %{{.*}}, i32 0, i32 1
+// CHECK: store i8 %{{.+}}, i8* [[B_REF]],
+// CHECK: br label
+// CHECK: ret void
+
 // CHECK: define internal void [[TMAIN_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, i32** dereferenceable(8) %{{.+}}, i32* dereferenceable(4) %{{.+}})
 // CHECK: alloca i{{[0-9]+}},
 // CHECK: [[PVAR_START:%.+]] = alloca i32*,
diff --git a/test/OpenMP/for_linear_messages.cpp b/test/OpenMP/for_linear_messages.cpp
index 39fb21ef7be0a..ab8934979ac30 100644
--- a/test/OpenMP/for_linear_messages.cpp
+++ b/test/OpenMP/for_linear_messages.cpp
@@ -212,7 +212,7 @@ int main(int argc, char **argv) {
   #pragma omp for linear(i) ordered(1) // expected-error {{'linear' clause cannot be specified along with 'ordered' clause with a parameter}}
   for (int k = 0; k < argc; ++k) ++k;
 
-  foomain<int,char>(argc,argv);
+  foomain<int,char>(argc,argv); // expected-note {{n instantiation of function template specialization 'foomain<int, char>' requested here}}
   return 0;
 }
 
diff --git a/test/OpenMP/for_loop_messages.cpp b/test/OpenMP/for_loop_messages.cpp
index 895baf57e9ccc..bb58a77e5adeb 100644
--- a/test/OpenMP/for_loop_messages.cpp
+++ b/test/OpenMP/for_loop_messages.cpp
@@ -426,12 +426,25 @@ public:
   typedef int difference_type;
   typedef std::random_access_iterator_tag iterator_category;
 };
-// expected-note@+2 {{candidate function not viable: no known conversion from 'Iter0' to 'GoodIter' for 2nd argument}}
+class GoodIter1 {
+public:
+  GoodIter1() {}
+  GoodIter1(const GoodIter1 &) {}
+  GoodIter1 &operator++(int) { return *this; }
+  GoodIter1 &operator=(const GoodIter1 &that) { return *this; }
+  GoodIter1 &operator+=(int x) { return *this; }
+  friend long operator-(const GoodIter1 &, const GoodIter1 &);
+  GoodIter1 &operator-(int) { return *this; }
+  bool operator<(GoodIter1 a) { return true; }
+  typedef int difference_type;
+  typedef std::random_access_iterator_tag iterator_category;
+};
+// expected-note@+2 {{candidate function not viable: no known conversion from 'const Iter0' to 'GoodIter' for 2nd argument}}
 // expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'GoodIter' for 1st argument}}
 int operator-(GoodIter a, GoodIter b) { return 0; }
 // expected-note@+1 3 {{candidate function not viable: requires single argument 'a', but 2 arguments were provided}}
 GoodIter operator-(GoodIter a) { return a; }
-// expected-note@+2 {{candidate function not viable: no known conversion from 'Iter0' to 'int' for 2nd argument}}
+// expected-note@+2 {{candidate function not viable: no known conversion from 'const Iter0' to 'int' for 2nd argument}}
 // expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'GoodIter' for 1st argument}}
 GoodIter operator-(GoodIter a, int v) { return GoodIter(); }
 // expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter0' to 'GoodIter' for 1st argument}}
@@ -482,7 +495,7 @@ int test_with_random_access_iterator() {
 #pragma omp for
   for (begin = GoodIter(0); begin < end; ++begin)
     ++begin;
-// expected-error@+4 {{invalid operands to binary expression ('GoodIter' and 'Iter0')}}
+// expected-error@+4 {{invalid operands to binary expression ('GoodIter' and 'const Iter0')}}
 // expected-error@+3 {{could not calculate number of iterations calling 'operator-' with upper and lower loop bounds}}
 #pragma omp parallel
 #pragma omp for
@@ -572,6 +585,10 @@ int test_with_random_access_iterator() {
 #pragma omp for
   for (Iter1 I; I < end1; ++I) {
   }
+  GoodIter1 I1, E1;
+#pragma omp for
+  for (GoodIter1 I = I1; I < E1; I++)
+    ;
   return 0;
 }
 
diff --git a/test/OpenMP/for_ordered_clause.cpp b/test/OpenMP/for_ordered_clause.cpp
index 8af509ab9aa87..3335f4046f571 100644
--- a/test/OpenMP/for_ordered_clause.cpp
+++ b/test/OpenMP/for_ordered_clause.cpp
@@ -1,8 +1,13 @@
 // RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
 
 void foo() {
 }
 
+#if __cplusplus >= 201103L
+// expected-note@+2 4 {{declared here}}
+#endif
 bool foobool(int argc) {
   return argc;
 }
@@ -36,16 +41,23 @@ T tmain(T argc, S **argv) {                   //expected-note 2 {{declared here}
 #pragma omp for ordered((ST > 0) ? 1 + ST : 2) // expected-note 2 {{as specified in 'ordered' clause}}
   for (int i = ST; i < N; i++)
     argv[0][i] = argv[0][i] - argv[0][i - ST]; // expected-error 2 {{expected 2 for loops after '#pragma omp for', but found only 1}}
-// expected-error@+3 2 {{directive '#pragma omp for' cannot contain more than one 'ordered' clause}}
-// expected-error@+2 2 {{argument to 'ordered' clause must be a strictly positive integer value}}
-// expected-error@+1 2 {{expression is not an integral constant expression}}
+// expected-error@+6 2 {{directive '#pragma omp for' cannot contain more than one 'ordered' clause}}
+// expected-error@+5 2 {{argument to 'ordered' clause must be a strictly positive integer value}}
+// expected-error@+4 2 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+// expected-note@+2 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
 #pragma omp for ordered(foobool(argc)), ordered(true), ordered(-5)
   for (int i = ST; i < N; i++)
     argv[0][i] = argv[0][i] - argv[0][i - ST];
 #pragma omp for ordered(S) // expected-error {{'S' does not refer to a value}}
   for (int i = ST; i < N; i++)
     argv[0][i] = argv[0][i] - argv[0][i - ST];
-// expected-error@+1 2 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+// expected-error@+4 2 {{expression is not an integral constant expression}}
+#else
+// expected-error@+2 2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
 #pragma omp for ordered(argv[1] = 2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = ST; i < N; i++)
     argv[0][i] = argv[0][i] - argv[0][i - ST];
@@ -84,10 +96,17 @@ int main(int argc, char **argv) {
 #pragma omp for ordered(2 + 2))              // expected-warning {{extra tokens at the end of '#pragma omp for' are ignored}}  expected-note {{as specified in 'ordered' clause}}
   for (int i = 4; i < 12; i++)
     argv[0][i] = argv[0][i] - argv[0][i - 4];    // expected-error {{expected 4 for loops after '#pragma omp for', but found only 1}}
-#pragma omp for ordered(foobool(1) > 0 ? 1 : 2) // expected-error {{expression is not an integral constant expression}}
+// expected-error@+4 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+// expected-note@+2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+#pragma omp for ordered(foobool(1) > 0 ? 1 : 2)
   for (int i = 4; i < 12; i++)
     argv[0][i] = argv[0][i] - argv[0][i - 4];
-// expected-error@+3 {{expression is not an integral constant expression}}
+// expected-error@+6 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+// expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
 // expected-error@+2 2 {{directive '#pragma omp for' cannot contain more than one 'ordered' clause}}
 // expected-error@+1 2 {{argument to 'ordered' clause must be a strictly positive integer value}}
 #pragma omp for ordered(foobool(argc)), ordered(true), ordered(-5)
@@ -96,7 +115,11 @@ int main(int argc, char **argv) {
 #pragma omp for ordered(S1) // expected-error {{'S1' does not refer to a value}}
   for (int i = 4; i < 12; i++)
     argv[0][i] = argv[0][i] - argv[0][i - 4];
-// expected-error@+1 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+// expected-error@+4 {{expression is not an integral constant expression}}
+#else
+// expected-error@+2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
 #pragma omp for ordered(argv[1] = 2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = 4; i < 12; i++)
     argv[0][i] = argv[0][i] - argv[0][i - 4];
diff --git a/test/OpenMP/for_private_messages.cpp b/test/OpenMP/for_private_messages.cpp
index 3015f819b5447..4045c5b56ff5c 100644
--- a/test/OpenMP/for_private_messages.cpp
+++ b/test/OpenMP/for_private_messages.cpp
@@ -29,7 +29,11 @@ class S4 {
   S4(); // expected-note {{implicitly declared private here}}
 
 public:
-  S4(int v) : a(v) {}
+  S4(int v) : a(v) {
+#pragma omp for private(a) private(this->a)
+    for (int k = 0; k < v; ++k)
+      ++this->a;
+  }
 };
 class S5 {
   int a;
@@ -37,6 +41,50 @@ class S5 {
 
 public:
   S5(int v) : a(v) {}
+  S5 &operator=(S5 &s) {
+#pragma omp for private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T>
+class S6 {
+public:
+  T a;
+
+  S6() : a(0) {}
+  S6(T v) : a(v) {
+#pragma omp for private(a) private(this->a)
+    for (int k = 0; k < v; ++k)
+      ++this->a;
+  }
+  S6 &operator=(S6 &s) {
+#pragma omp for private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T>
+class S7 : public T {
+  T a;
+  S7() : a(0) {}
+
+public:
+  S7(T v) : a(v) {
+#pragma omp for private(a) private(this->a) private(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S7 &operator=(S7 &s) {
+#pragma omp for private(a) private(this->a) private(s.a) private(s.T::a) // expected-error 2 {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
 };
 
 S3 h;
@@ -126,6 +174,8 @@ using A::x;
 int main(int argc, char **argv) {
   S4 e(4);
   S5 g(5);
+  S6<float> s6(0.0) , s6_0(1.0);
+  S7<S6<float> > s7(0.0) , s7_0(1.0);
   int i;
   int &j = i;
 #pragma omp for private // expected-error {{expected '(' after 'private'}}
@@ -190,6 +240,8 @@ int main(int argc, char **argv) {
   for(int k = 0; k < argc; ++k)
     si = k + 1;
 
-  return 0;
+  s6 = s6_0; // expected-note {{in instantiation of member function 'S6<float>::operator=' requested here}}
+  s7 = s7_0; // expected-note {{in instantiation of member function 'S7<S6<float> >::operator=' requested here}}
+  return foomain(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<int, char>' requested here}}
 }
 
diff --git a/test/OpenMP/for_reduction_codegen.cpp b/test/OpenMP/for_reduction_codegen.cpp
index 423ab3ce76d0a..6997d8138842c 100644
--- a/test/OpenMP/for_reduction_codegen.cpp
+++ b/test/OpenMP/for_reduction_codegen.cpp
@@ -52,6 +52,8 @@ T tmain() {
   return T();
 }
 
+extern S<float> **foo();
+
 int main() {
 #ifdef LAMBDA
   // LAMBDA: [[G:@.+]] = global double
@@ -182,6 +184,9 @@ int main() {
   S<float> s_arr[] = {1, 2};
   S<float> &var = test;
   S<float> var1, arrs[10][4];
+  S<float> **var2 = foo();
+  S<float> vvar2[2];
+  S<float> (&var3)[2] = s_arr;
 #pragma omp parallel
 #pragma omp for reduction(+:t_var) reduction(&:var) reduction(&& : var1) reduction(min: t_var1)
   for (int i = 0; i < 2; ++i) {
@@ -192,6 +197,26 @@ int main() {
 #pragma omp parallel for reduction(+:arr[1][:vec[1]]) reduction(&:arrs[1:vec[1]][1:2])
   for (int i = 0; i < 10; ++i)
     ++arr[1][i];
+#pragma omp parallel
+#pragma omp for reduction(+:arr) reduction(&:arrs)
+  for (int i = 0; i < 10; ++i)
+    ++arr[1][i];
+#pragma omp parallel
+#pragma omp for reduction(& : var2[0 : 5][1 : 6])
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp parallel
+#pragma omp for reduction(& : vvar2[0 : 5])
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp parallel
+#pragma omp for reduction(& : var3[1 : 2])
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp parallel
+#pragma omp for reduction(& : var3)
+  for (int i = 0; i < 10; ++i)
+    ;
   return tmain<int>();
 #endif
 }
@@ -201,6 +226,11 @@ int main() {
 // CHECK: call {{.*}} [[S_FLOAT_TY_CONSTR:@.+]]([[S_FLOAT_TY]]* [[TEST]])
 // CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 6, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, float*, [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]*, float*, [2 x i32]*, [2 x [[S_FLOAT_TY]]]*)* [[MAIN_MICROTASK:@.+]] to void
 // CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 5, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, i64, i64, i32*, [2 x i32]*, [10 x [4 x [[S_FLOAT_TY]]]]*)* [[MAIN_MICROTASK1:@.+]] to void
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 4, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, i64, i64, i32*, [10 x [4 x [[S_FLOAT_TY]]]]*)* [[MAIN_MICROTASK2:@.+]] to void
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[S_FLOAT_TY]]***)* [[MAIN_MICROTASK3:@.+]] to void
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [2 x [[S_FLOAT_TY]]]*)* [[MAIN_MICROTASK4:@.+]] to void
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [2 x [[S_FLOAT_TY]]]*)* [[MAIN_MICROTASK5:@.+]] to void
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [2 x [[S_FLOAT_TY]]]*)* [[MAIN_MICROTASK6:@.+]] to void
 // CHECK: = call {{.*}}i{{.+}} [[TMAIN_INT:@.+]]()
 // CHECK: call {{.*}} [[S_FLOAT_TY_DESTR:@.+]]([[S_FLOAT_TY]]*
 // CHECK: ret
@@ -666,6 +696,316 @@ int main() {
 
 // CHECK: ret void
 
+// CHECK: define internal void [[MAIN_MICROTASK2]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, i64 %{{.+}}, i64 %{{.+}}, i32* nonnull %{{.+}}, [10 x [4 x [[S_FLOAT_TY]]]]* dereferenceable(160) %{{.+}})
+
+// CHECK: [[ARRS_PRIV:%.+]] = alloca [10 x [4 x [[S_FLOAT_TY]]]],
+
+// Reduction list for runtime.
+// CHECK: [[RED_LIST:%.+]] = alloca [3 x i8*],
+
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
+
+// CHECK: [[ARR_SIZE:%.+]] = mul nuw i64 %{{.+}}, 4
+// CHECK: call i8* @llvm.stacksave()
+// CHECK: [[ARR_PRIV:%.+]] = alloca i32, i64 [[ARR_SIZE]],
+
+// Check initialization of private copy.
+// CHECK: [[END:%.+]] = getelementptr i32, i32* [[ARR_PRIV]], i64 [[ARR_SIZE]]
+// CHECK: [[ISEMPTY:%.+]] = icmp eq i32* [[ARR_PRIV]], [[END]]
+// CHECK: br i1 [[ISEMPTY]],
+// CHECK: phi i32*
+// CHECK: store i32 0, i32* %
+// CHECK: [[DONE:%.+]] = icmp eq i32* %{{.+}}, [[END]]
+// CHECK: br i1 [[DONE]],
+
+// Check initialization of private copy.
+// CHECK: [[LHS_BEGIN:%.+]] = bitcast [10 x [4 x [[S_FLOAT_TY]]]]* %{{.+}} to [[S_FLOAT_TY]]*
+// CHECK: [[BEGIN:%.+]] = getelementptr inbounds [10 x [4 x [[S_FLOAT_TY]]]], [10 x [4 x [[S_FLOAT_TY]]]]* [[ARRS_PRIV]], i32 0, i32 0, i32 0
+// CHECK: [[END:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[BEGIN]], i64 40
+// CHECK: [[ISEMPTY:%.+]] = icmp eq [[S_FLOAT_TY]]* [[BEGIN]], [[END]]
+// CHECK: br i1 [[ISEMPTY]],
+// CHECK: phi [[S_FLOAT_TY]]*
+// CHECK: call void @_ZN1SIfEC1Ev([[S_FLOAT_TY]]* %
+// CHECK: [[DONE:%.+]] = icmp eq [[S_FLOAT_TY]]* %{{.+}}, [[END]]
+// CHECK: br i1 [[DONE]],
+// CHECK: [[ARRS_PRIV_BEGIN:%.+]] = bitcast [10 x [4 x [[S_FLOAT_TY]]]]* [[ARRS_PRIV]] to [[S_FLOAT_TY]]*
+
+// CHECK: [[GTID_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[GTID_ADDR_ADDR]]
+// CHECK: [[GTID:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[GTID_REF]]
+// CHECK: call void @__kmpc_for_static_init_4(
+// Skip checks for internal operations.
+// CHECK: call void @__kmpc_for_static_fini(
+
+// void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
+
+// CHECK: [[ARR_PRIV_REF:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[RED_LIST]], i64 0, i64 0
+// CHECK: [[BITCAST:%.+]] = bitcast i32* [[ARR_PRIV]] to i8*
+// CHECK: store i8* [[BITCAST]], i8** [[ARR_PRIV_REF]],
+// CHECK: [[ARR_SIZE_REF:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[RED_LIST]], i64 0, i64 1
+// CHECK: [[BITCAST:%.+]] = inttoptr i64 [[ARR_SIZE]] to i8*
+// CHECK: store i8* [[BITCAST]], i8** [[ARR_SIZE_REF]],
+// CHECK: [[ARRS_PRIV_REF:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[RED_LIST]], i64 0, i64 2
+// CHECK: [[BITCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[ARRS_PRIV_BEGIN]] to i8*
+// CHECK: store i8* [[BITCAST]], i8** [[ARRS_PRIV_REF]],
+
+// res = __kmpc_reduce(<loc>, <gtid>, <n>, sizeof(RedList), RedList, reduce_func, &<lock>);
+
+// CHECK: [[GTID_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[GTID_ADDR_ADDR]]
+// CHECK: [[GTID:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[GTID_REF]]
+// CHECK: [[BITCAST:%.+]] = bitcast [3 x i8*]* [[RED_LIST]] to i8*
+// CHECK: [[RES:%.+]] = call i32 @__kmpc_reduce(%{{.+}}* [[REDUCTION_LOC]], i32 [[GTID]], i32 2, i64 24, i8* [[BITCAST]], void (i8*, i8*)* [[REDUCTION_FUNC:@.+]], [8 x i32]* [[REDUCTION_LOCK]])
+
+// switch(res)
+// CHECK: switch i32 [[RES]], label %[[RED_DONE:.+]] [
+// CHECK: i32 1, label %[[CASE1:.+]]
+// CHECK: i32 2, label %[[CASE2:.+]]
+// CHECK: ]
+
+// case 1:
+// CHECK: [[CASE1]]
+
+// arr[:] += arr_reduction[:];
+// CHECK: [[END:%.+]] = getelementptr i32, i32* [[LB1_0:%.+]], i64 [[ARR_SIZE]]
+// CHECK: [[ISEMPTY:%.+]] = icmp eq i32* [[LB1_0]], [[END]]
+// CHECK: br i1 [[ISEMPTY]],
+// CHECK: phi i32*
+// CHECK: [[ADD:%.+]] = add nsw i32 %
+// CHECK: store i32 [[ADD]], i32* %
+// CHECK: [[DONE:%.+]] = icmp eq i32* %{{.+}}, [[END]]
+// CHECK: br i1 [[DONE]],
+
+// arrs[:] = var.operator &(arrs_reduction[:]);
+// CHECK: [[END:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[LHS_BEGIN]], i64 40
+// CHECK: [[ISEMPTY:%.+]] = icmp eq [[S_FLOAT_TY]]* [[LHS_BEGIN]], [[END]]
+// CHECK: br i1 [[ISEMPTY]],
+// CHECK: phi [[S_FLOAT_TY]]*
+// CHECK: [[AND:%.+]] = call dereferenceable(4) [[S_FLOAT_TY]]* @_ZN1SIfEanERKS0_([[S_FLOAT_TY]]* %{{.+}}, [[S_FLOAT_TY]]* dereferenceable(4) %{{.+}})
+// CHECK: [[BITCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[AND]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %{{.+}}, i8* [[BITCAST]], i64 4, i32 4, i1 false)
+// CHECK: [[DONE:%.+]] = icmp eq [[S_FLOAT_TY]]* %{{.+}}, [[END]]
+// CHECK: br i1 [[DONE]],
+
+// __kmpc_end_reduce(<loc>, <gtid>, &<lock>);
+// CHECK: call void @__kmpc_end_reduce(%{{.+}}* [[REDUCTION_LOC]], i32 [[GTID]], [8 x i32]* [[REDUCTION_LOCK]])
+
+// break;
+// CHECK: br label %[[RED_DONE]]
+
+// case 2:
+// CHECK: [[CASE2]]
+
+// arr[:] += arr_reduction[:];
+// CHECK: [[END:%.+]] = getelementptr i32, i32* [[LB1_0]], i64 [[ARR_SIZE]]
+// CHECK: [[ISEMPTY:%.+]] = icmp eq i32* [[LB1_0]], [[END]]
+// CHECK: br i1 [[ISEMPTY]],
+// CHECK: phi i32*
+// CHECK: atomicrmw add i32* %{{.+}}, i32 %{{.+}} monotonic
+// CHECK: [[DONE:%.+]] = icmp eq i32* %{{.+}}, [[END]]
+// CHECK: br i1 [[DONE]],
+
+// arrs[:] = var.operator &(arrs_reduction[:]);
+// CHECK: [[END:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[LHS_BEGIN]], i64 40
+// CHECK: [[ISEMPTY:%.+]] = icmp eq [[S_FLOAT_TY]]* [[LHS_BEGIN]], [[END]]
+// CHECK: br i1 [[ISEMPTY]],
+// CHECK: phi [[S_FLOAT_TY]]*
+// CHECK: call void @__kmpc_critical(
+// CHECK: [[AND:%.+]] = call dereferenceable(4) [[S_FLOAT_TY]]* @_ZN1SIfEanERKS0_([[S_FLOAT_TY]]* %{{.+}}, [[S_FLOAT_TY]]* dereferenceable(4) %{{.+}})
+// CHECK: [[BITCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[AND]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %{{.+}}, i8* [[BITCAST]], i64 4, i32 4, i1 false)
+// CHECK: call void @__kmpc_end_critical(
+// CHECK: [[DONE:%.+]] = icmp eq [[S_FLOAT_TY]]* %{{.+}}, [[END]]
+// CHECK: br i1 [[DONE]],
+
+// break;
+// CHECK: br label %[[RED_DONE]]
+// CHECK: [[RED_DONE]]
+
+// Check destruction of private copy.
+// CHECK: [[BEGIN:%.+]] = getelementptr inbounds [10 x [4 x [[S_FLOAT_TY]]]], [10 x [4 x [[S_FLOAT_TY]]]]* [[ARRS_PRIV]], i32 0, i32 0, i32 0
+// CHECK: [[END:%.+]] = getelementptr inbounds [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[BEGIN]], i64 40
+// CHECK: br
+// CHECK: phi [[S_FLOAT_TY]]*
+// CHECK: call void @_ZN1SIfED1Ev([[S_FLOAT_TY]]* %
+// CHECK: [[DONE:%.+]] = icmp eq [[S_FLOAT_TY]]* %{{.+}}, [[BEGIN]]
+// CHECK: br i1 [[DONE]],
+// CHECK: call void @llvm.stackrestore(i8*
+// CHECK: call void @__kmpc_barrier(
+
+// CHECK: ret void
+
+// void reduce_func(void *lhs[<n>], void *rhs[<n>]) {
+//  *(Type0*)lhs[0] = ReductionOperation0(*(Type0*)lhs[0], *(Type0*)rhs[0]);
+//  ...
+//  *(Type<n>-1*)lhs[<n>-1] = ReductionOperation<n>-1(*(Type<n>-1*)lhs[<n>-1],
+//  *(Type<n>-1*)rhs[<n>-1]);
+// }
+// CHECK: define internal void [[REDUCTION_FUNC]](i8*, i8*)
+// arr_rhs = (int*)rhs[0];
+// CHECK: [[ARR_RHS_REF:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[RED_LIST_RHS:%.+]], i64 0, i64 0
+// CHECK: [[ARR_RHS_VOID:%.+]] = load i8*, i8** [[ARR_RHS_REF]],
+// CHECK: [[ARR_RHS:%.+]] = bitcast i8* [[ARR_RHS_VOID]] to i32*
+// arr_lhs = (int*)lhs[0];
+// CHECK: [[ARR_LHS_REF:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[RED_LIST_LHS:%.+]], i64 0, i64 0
+// CHECK: [[ARR_LHS_VOID:%.+]] = load i8*, i8** [[ARR_LHS_REF]],
+// CHECK: [[ARR_LHS:%.+]] = bitcast i8* [[ARR_LHS_VOID]] to i32*
+
+// arr_size = (size_t)lhs[1];
+// CHECK: [[ARR_SIZE_REF:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[RED_LIST_LHS]], i64 0, i64 1
+// CHECK: [[ARR_SIZE_VOID:%.+]] = load i8*, i8** [[ARR_SIZE_REF]],
+// CHECK: [[ARR_SIZE:%.+]] = ptrtoint i8* [[ARR_SIZE_VOID]] to i64
+
+// arrs_rhs = (S<float>*)rhs[2];
+// CHECK: [[ARRS_RHS_REF:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[RED_LIST_RHS]], i64 0, i64 2
+// CHECK: [[ARRS_RHS_VOID:%.+]] = load i8*, i8** [[ARRS_RHS_REF]],
+// CHECK: [[ARRS_RHS:%.+]] = bitcast i8* [[ARRS_RHS_VOID]] to [[S_FLOAT_TY]]*
+// arrs_lhs = (S<float>*)lhs[2];
+// CHECK: [[ARRS_LHS_REF:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[RED_LIST_LHS]], i64 0, i64 2
+// CHECK: [[ARRS_LHS_VOID:%.+]] = load i8*, i8** [[ARRS_LHS_REF]],
+// CHECK: [[ARRS_LHS:%.+]] = bitcast i8* [[ARRS_LHS_VOID]] to [[S_FLOAT_TY]]*
+
+// arr_lhs[:] += arr_rhs[:];
+// CHECK: [[END:%.+]] = getelementptr i32, i32* [[ARR_LHS]], i64 [[ARR_SIZE]]
+// CHECK: [[ISEMPTY:%.+]] = icmp eq i32* [[ARR_LHS]], [[END]]
+// CHECK: br i1 [[ISEMPTY]],
+// CHECK: phi i32*
+// CHECK: [[ADD:%.+]] = add nsw i32 %
+// CHECK: store i32 [[ADD]], i32* %
+// CHECK: [[DONE:%.+]] = icmp eq i32* %{{.+}}, [[END]]
+// CHECK: br i1 [[DONE]],
+
+// arrs_lhs = arrs_lhs.operator &(arrs_rhs);
+// CHECK: [[END:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[ARRS_LB:%.+]], i64 40
+// CHECK: [[ISEMPTY:%.+]] = icmp eq [[S_FLOAT_TY]]* [[ARRS_LB]], [[END]]
+// CHECK: br i1 [[ISEMPTY]],
+// CHECK: phi [[S_FLOAT_TY]]*
+// CHECK: [[AND:%.+]] = call dereferenceable(4) [[S_FLOAT_TY]]* @_ZN1SIfEanERKS0_([[S_FLOAT_TY]]* %{{.+}}, [[S_FLOAT_TY]]* dereferenceable(4) %{{.+}})
+// CHECK: [[BITCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[AND]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %{{.+}}, i8* [[BITCAST]], i64 4, i32 4, i1 false)
+// CHECK: [[DONE:%.+]] = icmp eq [[S_FLOAT_TY]]* %{{.+}}, [[END]]
+// CHECK: br i1 [[DONE]],
+
+// CHECK: ret void
+
+// CHECK: define internal void [[MAIN_MICROTASK3]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[S_FLOAT_TY]]*** dereferenceable(8) %{{.+}})
+
+// CHECK: [[VAR2_ORIG_ADDR:%.+]] = alloca [[S_FLOAT_TY]]***,
+
+// Reduction list for runtime.
+// CHECK: [[RED_LIST:%.+]] = alloca [2 x i8*],
+
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
+// CHECK: [[VAR2_ORIG:%.+]] = load [[S_FLOAT_TY]]***, [[S_FLOAT_TY]]**** [[VAR2_ORIG_ADDR]],
+
+// CHECK: load [[S_FLOAT_TY]]**, [[S_FLOAT_TY]]*** [[VAR2_ORIG]],
+// CHECK: getelementptr inbounds [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** %{{.+}}, i64 0
+// CHECK: load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** %
+// CHECK: [[LOW:%.+]] = getelementptr inbounds [[S_FLOAT_TY]], [[S_FLOAT_TY]]* %{{.+}}, i64 1
+// CHECK: load [[S_FLOAT_TY]]**, [[S_FLOAT_TY]]*** [[VAR2_ORIG]],
+// CHECK: getelementptr inbounds [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** %{{.+}}, i64 4
+// CHECK: load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** %
+// CHECK: getelementptr inbounds [[S_FLOAT_TY]], [[S_FLOAT_TY]]* %{{.+}}, i64 6
+// CHECK: [[LD:%.+]] = load [[S_FLOAT_TY]]**, [[S_FLOAT_TY]]*** [[VAR2_ORIG]],
+// CHECK: [[ORIG_START:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[LD]],
+// CHECK: [[LAST:%.+]] = ptrtoint [[S_FLOAT_TY]]* %{{.+}} to i64
+// CHECK: [[FIRST:%.+]] = ptrtoint [[S_FLOAT_TY]]* [[LOW]] to i64
+// CHECK: [[BYTE_DIF:%.+]] = sub i64 [[LAST]], [[FIRST]]
+// CHECK: [[DIF:%.+]] = sdiv exact i64 [[BYTE_DIF]], ptrtoint (float* getelementptr (float, float* null, i32 1) to i64)
+// CHECK: [[SIZE:%.+]] = add nuw i64 [[DIF]], 1
+// CHECK: call i8* @llvm.stacksave()
+// CHECK: [[VAR2_PRIV:%.+]] = alloca [[S_FLOAT_TY]], i64 [[SIZE]],
+// CHECK: [[START:%.+]] = ptrtoint [[S_FLOAT_TY]]* [[ORIG_START]] to i64
+// CHECK: [[LOW_BOUND:%.+]] = ptrtoint [[S_FLOAT_TY]]* [[LOW]] to i64
+// CHECK: [[OFFSET_BYTES:%.+]] = sub i64 [[START]], [[LOW_BOUND]]
+// CHECK: [[OFFSET:%.+]] = sdiv exact i64 [[OFFSET_BYTES]], ptrtoint (float* getelementptr (float, float* null, i32 1) to i64)
+// CHECK: [[PSEUDO_VAR2_PRIV:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[VAR2_PRIV]], i64 [[OFFSET]]
+// CHECK: store [[S_FLOAT_TY]]** [[REF:.+]], [[S_FLOAT_TY]]*** %
+// CHECK: store [[S_FLOAT_TY]]* [[PSEUDO_VAR2_PRIV]], [[S_FLOAT_TY]]** [[REF]]
+// CHECK: ret void
+
+// CHECK: define internal void [[MAIN_MICROTASK4]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [2 x [[S_FLOAT_TY]]]* dereferenceable(8) %{{.+}})
+
+// CHECK: [[VVAR2_ORIG_ADDR:%.+]] = alloca [2 x [[S_FLOAT_TY]]]*,
+
+// Reduction list for runtime.
+// CHECK: [[RED_LIST:%.+]] = alloca [2 x i8*],
+
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
+// CHECK: [[VVAR2_ORIG:%.+]] = load [2 x [[S_FLOAT_TY]]]*, [2 x [[S_FLOAT_TY]]]** [[VVAR2_ORIG_ADDR]],
+
+// CHECK: [[LOW:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[VVAR2_ORIG]], i64 0, i64 0
+// CHECK: getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[VVAR2_ORIG]], i64 0, i64 4
+// CHECK: [[ORIG_START:%.+]] = bitcast [2 x [[S_FLOAT_TY]]]* [[VVAR2_ORIG]] to [[S_FLOAT_TY]]*
+// CHECK: [[LAST:%.+]] = ptrtoint [[S_FLOAT_TY]]* %{{.+}} to i64
+// CHECK: [[FIRST:%.+]] = ptrtoint [[S_FLOAT_TY]]* [[LOW]] to i64
+// CHECK: [[BYTE_DIF:%.+]] = sub i64 [[LAST]], [[FIRST]]
+// CHECK: [[DIF:%.+]] = sdiv exact i64 [[BYTE_DIF]], ptrtoint (float* getelementptr (float, float* null, i32 1) to i64)
+// CHECK: [[SIZE:%.+]] = add nuw i64 [[DIF]], 1
+// CHECK: call i8* @llvm.stacksave()
+// CHECK: [[VVAR2_PRIV:%.+]] = alloca [[S_FLOAT_TY]], i64 [[SIZE]],
+// CHECK: [[START:%.+]] = ptrtoint [[S_FLOAT_TY]]* [[ORIG_START]] to i64
+// CHECK: [[LOW_BOUND:%.+]] = ptrtoint [[S_FLOAT_TY]]* [[LOW]] to i64
+// CHECK: [[OFFSET_BYTES:%.+]] = sub i64 [[START]], [[LOW_BOUND]]
+// CHECK: [[OFFSET:%.+]] = sdiv exact i64 [[OFFSET_BYTES]], ptrtoint (float* getelementptr (float, float* null, i32 1) to i64)
+// CHECK: [[PSEUDO_VVAR2_PRIV:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[VVAR2_PRIV]], i64 [[OFFSET]]
+// CHECK: [[VVAR2_PRIV:%.+]] = bitcast [[S_FLOAT_TY]]* [[PSEUDO_VVAR2_PRIV]] to [2 x [[S_FLOAT_TY]]]*
+// CHECK: ret void
+
+// CHECK: define internal void [[MAIN_MICROTASK5]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [2 x [[S_FLOAT_TY]]]* dereferenceable(8) %{{.+}})
+
+// CHECK: [[VAR3_ORIG_ADDR:%.+]] = alloca [2 x [[S_FLOAT_TY]]]*,
+
+// Reduction list for runtime.
+// CHECK: [[RED_LIST:%.+]] = alloca [2 x i8*],
+
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
+
+// CHECK: [[VAR3_ORIG:%.+]] = load [2 x [[S_FLOAT_TY]]]*, [2 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR]],
+// CHECK: store [2 x [[S_FLOAT_TY]]]* [[VAR3_ORIG]], [2 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR:%.+]],
+// CHECK: [[VAR3_ORIG:%.+]] = load [2 x [[S_FLOAT_TY]]]*, [2 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR]],
+// CHECK: [[LOW:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[VAR3_ORIG]], i64 0, i64 1
+// CHECK: [[VAR3_ORIG:%.+]] = load [2 x [[S_FLOAT_TY]]]*, [2 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR]],
+// CHECK: getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[VAR3_ORIG]], i64 0, i64 2
+// CHECK: [[VAR3_ORIG:%.+]] = load [2 x [[S_FLOAT_TY]]]*, [2 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR]],
+// CHECK: [[ORIG_START:%.+]] = bitcast [2 x [[S_FLOAT_TY]]]* [[VAR3_ORIG]] to [[S_FLOAT_TY]]*
+// CHECK: [[LAST:%.+]] = ptrtoint [[S_FLOAT_TY]]* %{{.+}} to i64
+// CHECK: [[FIRST:%.+]] = ptrtoint [[S_FLOAT_TY]]* [[LOW]] to i64
+// CHECK: [[BYTE_DIF:%.+]] = sub i64 [[LAST]], [[FIRST]]
+// CHECK: [[DIF:%.+]] = sdiv exact i64 [[BYTE_DIF]], ptrtoint (float* getelementptr (float, float* null, i32 1) to i64)
+// CHECK: [[SIZE:%.+]] = add nuw i64 [[DIF]], 1
+// CHECK: call i8* @llvm.stacksave()
+// CHECK: [[VAR3_PRIV:%.+]] = alloca [[S_FLOAT_TY]], i64 [[SIZE]],
+// CHECK: [[START:%.+]] = ptrtoint [[S_FLOAT_TY]]* [[ORIG_START]] to i64
+// CHECK: [[LOW_BOUND:%.+]] = ptrtoint [[S_FLOAT_TY]]* [[LOW]] to i64
+// CHECK: [[OFFSET_BYTES:%.+]] = sub i64 [[START]], [[LOW_BOUND]]
+// CHECK: [[OFFSET:%.+]] = sdiv exact i64 [[OFFSET_BYTES]], ptrtoint (float* getelementptr (float, float* null, i32 1) to i64)
+// CHECK: [[PSEUDO_VAR3_PRIV:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[VAR3_PRIV]], i64 [[OFFSET]]
+// CHECK: [[VAR3_PRIV:%.+]] = bitcast [[S_FLOAT_TY]]* [[PSEUDO_VAR3_PRIV]] to [2 x [[S_FLOAT_TY]]]*
+
+// CHECK: store [2 x [[S_FLOAT_TY]]]* [[VAR3_PRIV]], [2 x [[S_FLOAT_TY]]]** %
+
+// CHECK: ret void
+
+// CHECK: define internal void [[MAIN_MICROTASK6]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [2 x [[S_FLOAT_TY]]]* dereferenceable(8) %{{.+}})
+
+// CHECK: [[VAR3_ORIG_ADDR:%.+]] = alloca [2 x [[S_FLOAT_TY]]]*,
+// CHECK: [[VAR3_PRIV:%.+]] = alloca [2 x [[S_FLOAT_TY]]],
+
+// Reduction list for runtime.
+// CHECK: [[RED_LIST:%.+]] = alloca [1 x i8*],
+
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
+
+// CHECK: [[VAR3_ORIG:%.+]] = load [2 x [[S_FLOAT_TY]]]*, [2 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR]],
+// CHECK: store [2 x [[S_FLOAT_TY]]]* [[VAR3_ORIG]], [2 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR:%.+]],
+// CHECK: [[VAR3_ORIG:%.+]] = load [2 x [[S_FLOAT_TY]]]*, [2 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR]],
+// CHECK: bitcast [2 x [[S_FLOAT_TY]]]* [[VAR3_ORIG]] to [[S_FLOAT_TY]]*
+// CHECK: getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[VAR3_PRIV]], i32 0, i32 0
+// CHECK: getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* %{{.+}}, i64 2
+
+// CHECK: store [2 x [[S_FLOAT_TY]]]* [[VAR3_PRIV]], [2 x [[S_FLOAT_TY]]]** %
+
+// CHECK: ret void
+
 // CHECK: define {{.*}} i{{[0-9]+}} [[TMAIN_INT]]()
 // CHECK: [[TEST:%.+]] = alloca [[S_INT_TY]],
 // CHECK: call {{.*}} [[S_INT_TY_CONSTR:@.+]]([[S_INT_TY]]* [[TEST]])
diff --git a/test/OpenMP/for_reduction_codegen_UDR.cpp b/test/OpenMP/for_reduction_codegen_UDR.cpp
new file mode 100644
index 0000000000000..a30df368663db
--- /dev/null
+++ b/test/OpenMP/for_reduction_codegen_UDR.cpp
@@ -0,0 +1,984 @@
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-apple-darwin10 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// expected-no-diagnostics
+// REQUIRES: x86-registered-target
+#ifndef HEADER
+#define HEADER
+
+volatile double g, g_orig;
+volatile double &g1 = g_orig;
+
+struct BaseS {
+  int x;
+};
+struct BaseS1 {
+  float y;
+};
+
+template <class T>
+struct S : public BaseS, public BaseS1 {
+  T f;
+  S(T a) : f(a + g) {}
+  S() : f(g) {}
+  ~S() {}
+};
+void red(BaseS1&, const BaseS1&);
+void red_plus(BaseS1&, const BaseS1&);
+void init(BaseS1&, const BaseS1&);
+void init1(BaseS1&, const BaseS1&);
+void init2(BaseS1&, const BaseS1&);
+void init_plus(BaseS1&, const BaseS1&);
+#pragma omp declare reduction(operator& : BaseS1 : red(omp_out, omp_in)) initializer(init(omp_priv, omp_orig))
+#pragma omp declare reduction(+ : BaseS1 : red_plus(omp_out, omp_in)) initializer(init_plus(omp_priv, omp_orig))
+#pragma omp declare reduction(&& : S<float>, S<int> : omp_out.f *= omp_in.f) initializer(init1(omp_priv, omp_orig))
+
+// CHECK-DAG: [[S_FLOAT_TY:%.+]] = type { %{{[^,]+}}, %{{[^,]+}}, float }
+// CHECK-DAG: [[S_INT_TY:%.+]] = type { %{{[^,]+}}, %{{[^,]+}}, i{{[0-9]+}} }
+// CHECK-DAG: [[ATOMIC_REDUCE_BARRIER_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 18, i32 0, i32 0, i8*
+// CHECK-DAG: [[IMPLICIT_BARRIER_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 66, i32 0, i32 0, i8*
+// CHECK-DAG: [[REDUCTION_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 18, i32 0, i32 0, i8*
+// CHECK-DAG: [[REDUCTION_LOCK:@.+]] = common global [8 x i32] zeroinitializer
+
+#pragma omp declare reduction(operator&& : int : omp_out = 111 & omp_in)
+template <typename T>
+T tmain() {
+  T t;
+  S<T> test;
+  T t_var = T(), t_var1;
+  T vec[] = {1, 2};
+  S<T> s_arr[] = {1, 2};
+  S<T> &var = test;
+  S<T> var1;
+#pragma omp declare reduction(operator& : T : omp_out = 15 + omp_in)
+#pragma omp declare reduction(operator+ : T : omp_out = 1513 + omp_in) initializer(omp_priv = 321)
+#pragma omp declare reduction(min : T : omp_out = 47 - omp_in) initializer(omp_priv = 432 / omp_orig)
+#pragma omp declare reduction(operator&& : S<T> : omp_out.f = 17 * omp_in.f) initializer(init2(omp_priv, omp_orig))
+#pragma omp declare reduction(operator&& : T : omp_out = 17 * omp_in)
+#pragma omp parallel
+#pragma omp for reduction(+ : t_var) reduction(& : var) reduction(&& : var1) reduction(min : t_var1) nowait
+  for (int i = 0; i < 2; ++i) {
+    vec[i] = t_var;
+    s_arr[i] = var;
+  }
+#pragma omp parallel
+#pragma omp for reduction(&& : t_var)
+  for (int i = 0; i < 2; ++i) {
+    vec[i] = t_var;
+    s_arr[i] = var;
+  }
+  return T();
+}
+
+extern S<float> **foo();
+
+#pragma omp declare reduction(operator- : float, double : omp_out = 333 + omp_in)
+#pragma omp declare reduction(min : float, double : omp_out = 555 + omp_in)
+int main() {
+#pragma omp declare reduction(operator+ : float, double : omp_out = 222 - omp_in) initializer(omp_priv = -1)
+  S<float> test;
+  float t_var = 0, t_var1;
+  int vec[] = {1, 2};
+  S<float> s_arr[] = {1, 2};
+  S<float> &var = test;
+  S<float> var1, arrs[10][4];
+  S<float> **var2 = foo();
+  S<float> vvar2[2];
+  S<float>(&var3)[2] = s_arr;
+#pragma omp declare reduction(operator+ : int : omp_out = 555 * omp_in) initializer(omp_priv = 888)
+#pragma omp parallel
+#pragma omp for reduction(+ : t_var) reduction(& : var) reduction(&& : var1) reduction(min : t_var1)
+  for (int i = 0; i < 2; ++i) {
+    vec[i] = t_var;
+    s_arr[i] = var;
+  }
+  int arr[10][vec[1]];
+#pragma omp parallel for reduction(+ : arr[1][ : vec[1]]) reduction(& : arrs[1 : vec[1]][1 : 2])
+  for (int i = 0; i < 10; ++i)
+    ++arr[1][i];
+#pragma omp parallel
+#pragma omp for reduction(+ : arr) reduction(& : arrs)
+  for (int i = 0; i < 10; ++i)
+    ++arr[1][i];
+#pragma omp parallel
+#pragma omp for reduction(& : var2[0 : 5][1 : 6])
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp parallel
+#pragma omp for reduction(& : vvar2[0 : 5])
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp parallel
+#pragma omp for reduction(& : var3[1 : 2])
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp parallel
+#pragma omp for reduction(& : var3)
+  for (int i = 0; i < 10; ++i)
+    ;
+  return tmain<int>();
+}
+
+// CHECK: define {{.*}}i{{[0-9]+}} @main()
+// CHECK: [[TEST:%.+]] = alloca [[S_FLOAT_TY]],
+// CHECK: call {{.*}} [[S_FLOAT_TY_CONSTR:@.+]]([[S_FLOAT_TY]]* [[TEST]])
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 6, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, float*, [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]*, float*, [2 x i32]*, [2 x [[S_FLOAT_TY]]]*)* [[MAIN_MICROTASK:@.+]] to void
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 5, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, i64, i64, i32*, [2 x i32]*, [10 x [4 x [[S_FLOAT_TY]]]]*)* [[MAIN_MICROTASK1:@.+]] to void
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 4, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, i64, i64, i32*, [10 x [4 x [[S_FLOAT_TY]]]]*)* [[MAIN_MICROTASK2:@.+]] to void
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[S_FLOAT_TY]]***)* [[MAIN_MICROTASK3:@.+]] to void
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [2 x [[S_FLOAT_TY]]]*)* [[MAIN_MICROTASK4:@.+]] to void
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [2 x [[S_FLOAT_TY]]]*)* [[MAIN_MICROTASK5:@.+]] to void
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [2 x [[S_FLOAT_TY]]]*)* [[MAIN_MICROTASK6:@.+]] to void
+// CHECK: = call {{.*}}i{{.+}} [[TMAIN_INT:@.+]]()
+// CHECK: call {{.*}} [[S_FLOAT_TY_DESTR:@.+]]([[S_FLOAT_TY]]*
+// CHECK: ret
+//
+// CHECK: define internal void [[MAIN_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, float* dereferenceable(4) %{{.+}}, [[S_FLOAT_TY]]* dereferenceable(12) %{{.+}}, [[S_FLOAT_TY]]* dereferenceable(12) %{{.+}}, float* dereferenceable(4) %{{.+}}, [2 x i32]* dereferenceable(8) %vec, [2 x [[S_FLOAT_TY]]]* dereferenceable(24) %{{.+}})
+// CHECK: [[T_VAR_PRIV:%.+]] = alloca float,
+// CHECK: [[VAR_PRIV:%.+]] = alloca [[S_FLOAT_TY]],
+// CHECK: [[VAR1_PRIV:%.+]] = alloca [[S_FLOAT_TY]],
+// CHECK: [[T_VAR1_PRIV:%.+]] = alloca float,
+
+// Reduction list for runtime.
+// CHECK: [[RED_LIST:%.+]] = alloca [4 x i8*],
+
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
+
+// CHECK: [[T_VAR_REF:%.+]] = load float*, float** %
+// CHECK: [[VAR1_REF:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** %
+// CHECK: [[T_VAR1_REF:%.+]] = load float*, float** %
+
+// For + reduction operation initial value of private variable is -1.
+// CHECK: store float -1.0{{.+}}, float*
+
+// For & reduction operation initial value of private variable is defined by call of 'init()' function.
+// CHECK: call {{.*}}void @_Z4initR6BaseS1RKS_(
+
+// For && reduction operation initial value of private variable is 1.0.
+// CHECK: call {{.*}}void @_Z5init1R6BaseS1RKS_(
+
+// For min reduction operation initial value of private variable is largest repesentable value.
+// CHECK: [[INIT:%.+]] = load float, float* @
+// CHECK: store float [[INIT]], float* [[T_VAR1_PRIV]],
+
+// CHECK: [[GTID_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[GTID_ADDR_ADDR]]
+// CHECK: [[GTID:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[GTID_REF]]
+// CHECK: call void @__kmpc_for_static_init_4(
+// Skip checks for internal operations.
+// CHECK: call void @__kmpc_for_static_fini(
+
+// void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
+
+// CHECK: [[T_VAR_PRIV_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST]], i64 0, i64 0
+// CHECK: [[BITCAST:%.+]] = bitcast float* [[T_VAR_PRIV]] to i8*
+// CHECK: store i8* [[BITCAST]], i8** [[T_VAR_PRIV_REF]],
+// CHECK: [[VAR_PRIV_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST]], i64 0, i64 1
+// CHECK: [[BITCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR_PRIV]] to i8*
+// CHECK: store i8* [[BITCAST]], i8** [[VAR_PRIV_REF]],
+// CHECK: [[VAR1_PRIV_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST]], i64 0, i64 2
+// CHECK: [[BITCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[VAR1_PRIV]] to i8*
+// CHECK: store i8* [[BITCAST]], i8** [[VAR1_PRIV_REF]],
+// CHECK: [[T_VAR1_PRIV_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST]], i64 0, i64 3
+// CHECK: [[BITCAST:%.+]] = bitcast float* [[T_VAR1_PRIV]] to i8*
+// CHECK: store i8* [[BITCAST]], i8** [[T_VAR1_PRIV_REF]],
+
+// res = __kmpc_reduce(<loc>, <gtid>, <n>, sizeof(RedList), RedList, reduce_func, &<lock>);
+
+// CHECK: [[BITCAST:%.+]] = bitcast [4 x i8*]* [[RED_LIST]] to i8*
+// CHECK: [[RES:%.+]] = call i32 @__kmpc_reduce(%{{.+}}* [[REDUCTION_LOC]], i32 [[GTID]], i32 4, i64 32, i8* [[BITCAST]], void (i8*, i8*)* [[REDUCTION_FUNC:@.+]], [8 x i32]* [[REDUCTION_LOCK]])
+
+// switch(res)
+// CHECK: switch i32 [[RES]], label %[[RED_DONE:.+]] [
+// CHECK: i32 1, label %[[CASE1:.+]]
+// CHECK: i32 2, label %[[CASE2:.+]]
+// CHECK: ]
+
+// case 1:
+// t_var += t_var_reduction;
+// CHECK: fsub float 2.220000e+02, %
+
+// var = var.operator &(var_reduction);
+// CHECK: call {{.*}}void @_Z3redR6BaseS1RKS_(
+
+// var1 = var1.operator &&(var1_reduction);
+// CHECK: fmul float
+
+// t_var1 = min(t_var1, t_var1_reduction);
+// CHECK: fadd float 5.550000e+02, %
+
+// __kmpc_end_reduce(<loc>, <gtid>, &<lock>);
+// CHECK: call void @__kmpc_end_reduce(%{{.+}}* [[REDUCTION_LOC]], i32 [[GTID]], [8 x i32]* [[REDUCTION_LOCK]])
+
+// break;
+// CHECK: br label %[[RED_DONE]]
+
+// case 2:
+// t_var += t_var_reduction;
+// CHECK: call void @__kmpc_critical(
+// CHECK: fsub float 2.220000e+02, %
+// CHECK: call void @__kmpc_end_critical(
+
+// var = var.operator &(var_reduction);
+// CHECK: call void @__kmpc_critical(
+// CHECK: call {{.*}}void @_Z3redR6BaseS1RKS_(
+// CHECK: call void @__kmpc_end_critical(
+
+// var1 = var1.operator &&(var1_reduction);
+// CHECK: call void @__kmpc_critical(
+// CHECK: fmul float
+// CHECK: call void @__kmpc_end_critical(
+
+// t_var1 = min(t_var1, t_var1_reduction);
+// CHECK: call void @__kmpc_critical(
+// CHECK: fadd float 5.550000e+02, %
+// CHECK: call void @__kmpc_end_critical(
+
+// __kmpc_end_reduce(<loc>, <gtid>, &<lock>);
+// CHECK: call void @__kmpc_end_reduce(%{{.+}}* [[REDUCTION_LOC]], i32 [[GTID]], [8 x i32]* [[REDUCTION_LOCK]])
+
+// break;
+// CHECK: br label %[[RED_DONE]]
+// CHECK: [[RED_DONE]]
+// CHECK-DAG: call {{.*}} [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]* [[VAR_PRIV]])
+// CHECK-DAG: call {{.*}} [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]*
+// CHECK: call void @__kmpc_barrier(%{{.+}}* [[IMPLICIT_BARRIER_LOC]], i{{[0-9]+}} [[GTID]])
+
+// CHECK: ret void
+
+// void reduce_func(void *lhs[<n>], void *rhs[<n>]) {
+//  *(Type0*)lhs[0] = ReductionOperation0(*(Type0*)lhs[0], *(Type0*)rhs[0]);
+//  ...
+//  *(Type<n>-1*)lhs[<n>-1] = ReductionOperation<n>-1(*(Type<n>-1*)lhs[<n>-1],
+//  *(Type<n>-1*)rhs[<n>-1]);
+// }
+// CHECK: define internal void [[REDUCTION_FUNC]](i8*, i8*)
+// t_var_lhs = (float*)lhs[0];
+// CHECK: [[T_VAR_RHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_RHS:%.+]], i64 0, i64 0
+// CHECK: [[T_VAR_RHS_VOID:%.+]] = load i8*, i8** [[T_VAR_RHS_REF]],
+// CHECK: [[T_VAR_RHS:%.+]] = bitcast i8* [[T_VAR_RHS_VOID]] to float*
+// t_var_rhs = (float*)rhs[0];
+// CHECK: [[T_VAR_LHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_LHS:%.+]], i64 0, i64 0
+// CHECK: [[T_VAR_LHS_VOID:%.+]] = load i8*, i8** [[T_VAR_LHS_REF]],
+// CHECK: [[T_VAR_LHS:%.+]] = bitcast i8* [[T_VAR_LHS_VOID]] to float*
+
+// var_lhs = (S<float>*)lhs[1];
+// CHECK: [[VAR_RHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_RHS]], i64 0, i64 1
+// CHECK: [[VAR_RHS_VOID:%.+]] = load i8*, i8** [[VAR_RHS_REF]],
+// CHECK: [[VAR_RHS:%.+]] = bitcast i8* [[VAR_RHS_VOID]] to [[S_FLOAT_TY]]*
+// var_rhs = (S<float>*)rhs[1];
+// CHECK: [[VAR_LHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_LHS]], i64 0, i64 1
+// CHECK: [[VAR_LHS_VOID:%.+]] = load i8*, i8** [[VAR_LHS_REF]],
+// CHECK: [[VAR_LHS:%.+]] = bitcast i8* [[VAR_LHS_VOID]] to [[S_FLOAT_TY]]*
+
+// var1_lhs = (S<float>*)lhs[2];
+// CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_RHS]], i64 0, i64 2
+// CHECK: [[VAR1_RHS_VOID:%.+]] = load i8*, i8** [[VAR1_RHS_REF]],
+// CHECK: [[VAR1_RHS:%.+]] = bitcast i8* [[VAR1_RHS_VOID]] to [[S_FLOAT_TY]]*
+// var1_rhs = (S<float>*)rhs[2];
+// CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_LHS]], i64 0, i64 2
+// CHECK: [[VAR1_LHS_VOID:%.+]] = load i8*, i8** [[VAR1_LHS_REF]],
+// CHECK: [[VAR1_LHS:%.+]] = bitcast i8* [[VAR1_LHS_VOID]] to [[S_FLOAT_TY]]*
+
+// t_var1_lhs = (float*)lhs[3];
+// CHECK: [[T_VAR1_RHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_RHS]], i64 0, i64 3
+// CHECK: [[T_VAR1_RHS_VOID:%.+]] = load i8*, i8** [[T_VAR1_RHS_REF]],
+// CHECK: [[T_VAR1_RHS:%.+]] = bitcast i8* [[T_VAR1_RHS_VOID]] to float*
+// t_var1_rhs = (float*)rhs[3];
+// CHECK: [[T_VAR1_LHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_LHS]], i64 0, i64 3
+// CHECK: [[T_VAR1_LHS_VOID:%.+]] = load i8*, i8** [[T_VAR1_LHS_REF]],
+// CHECK: [[T_VAR1_LHS:%.+]] = bitcast i8* [[T_VAR1_LHS_VOID]] to float*
+
+// t_var_lhs += t_var_rhs;
+// CHECK: fsub float 2.220000e+02, %
+
+// var_lhs = var_lhs.operator &(var_rhs);
+// CHECK: call {{.*}}void @_Z3redR6BaseS1RKS_(
+
+// var1_lhs = var1_lhs.operator &&(var1_rhs);
+// CHECK: fmul float
+
+// t_var1_lhs = min(t_var1_lhs, t_var1_rhs);
+// CHECK: fadd float 5.550000e+02, %
+// CHECK: ret void
+
+// CHECK: define internal void [[MAIN_MICROTASK1]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, i64 %{{.+}}, i64 %{{.+}}, i32* nonnull %{{.+}}, [2 x i32]* dereferenceable(8) %{{.+}}, [10 x [4 x [[S_FLOAT_TY]]]]* dereferenceable(480) %{{.+}})
+
+// Reduction list for runtime.
+// CHECK: [[RED_LIST:%.+]] = alloca [4 x i8*],
+
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
+
+// CHECK: [[IDX1:%.+]] = mul nsw i64 1, %{{.+}}
+// CHECK: [[LB1:%.+]] = getelementptr inbounds i32, i32* %{{.+}}, i64 [[IDX1]]
+// CHECK: [[LB1_0:%.+]] = getelementptr inbounds i32, i32* [[LB1]], i64 0
+// CHECK: [[IDX1:%.+]] = mul nsw i64 1, %{{.+}}
+// CHECK: [[UB1:%.+]] = getelementptr inbounds i32, i32* %{{.+}}, i64 [[IDX1]]
+// CHECK: [[UB1_UP:%.+]] = getelementptr inbounds i32, i32* [[UB1]], i64 %
+// CHECK: [[UB_CAST:%.+]] = ptrtoint i32* [[UB1_UP]] to i64
+// CHECK: [[LB_CAST:%.+]] = ptrtoint i32* [[LB1_0]] to i64
+// CHECK: [[DIFF:%.+]] = sub i64 [[UB_CAST]], [[LB_CAST]]
+// CHECK: [[SIZE_1:%.+]] = sdiv exact i64 [[DIFF]], ptrtoint (i32* getelementptr (i32, i32* null, i32 1) to i64)
+// CHECK: [[ARR_SIZE:%.+]] = add nuw i64 [[SIZE_1]], 1
+// CHECK: call i8* @llvm.stacksave()
+// CHECK: [[ARR_PRIV:%.+]] = alloca i32, i64 [[ARR_SIZE]],
+
+// Check initialization of private copy.
+// CHECK: [[END:%.+]] = getelementptr i32, i32* [[ARR_PRIV]], i64 [[ARR_SIZE]]
+// CHECK: [[ISEMPTY:%.+]] = icmp eq i32* [[ARR_PRIV]], [[END]]
+// CHECK: br i1 [[ISEMPTY]],
+// CHECK: phi i32*
+// CHECK: store i32 888, i32* %
+// CHECK: [[DONE:%.+]] = icmp eq i32* %{{.+}}, [[END]]
+// CHECK: br i1 [[DONE]],
+
+// CHECK: [[ARRS_PRIV:%.+]] = alloca [[S_FLOAT_TY]], i64 [[ARRS_SIZE:%.+]],
+
+// Check initialization of private copy.
+// CHECK: [[END:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[ARRS_PRIV]], i64 [[ARRS_SIZE]]
+// CHECK: [[ISEMPTY:%.+]] = icmp eq [[S_FLOAT_TY]]* [[ARRS_PRIV]], [[END]]
+// CHECK: br i1 [[ISEMPTY]],
+// CHECK: phi [[S_FLOAT_TY]]*
+// CHECK: call void @_Z4initR6BaseS1RKS_(%
+// CHECK: [[DONE:%.+]] = icmp eq [[S_FLOAT_TY]]* %{{.+}}, [[END]]
+// CHECK: br i1 [[DONE]],
+
+// CHECK: [[GTID_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[GTID_ADDR_ADDR]]
+// CHECK: [[GTID:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[GTID_REF]]
+// CHECK: call void @__kmpc_for_static_init_4(
+// Skip checks for internal operations.
+// CHECK: call void @__kmpc_for_static_fini(
+
+// void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
+
+// CHECK: [[ARR_PRIV_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST]], i64 0, i64 0
+// CHECK: [[BITCAST:%.+]] = bitcast i32* [[ARR_PRIV]] to i8*
+// CHECK: store i8* [[BITCAST]], i8** [[ARR_PRIV_REF]],
+// CHECK: [[ARR_SIZE_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST]], i64 0, i64 1
+// CHECK: [[BITCAST:%.+]] = inttoptr i64 [[ARR_SIZE]] to i8*
+// CHECK: store i8* [[BITCAST]], i8** [[ARR_SIZE_REF]],
+// CHECK: [[ARRS_PRIV_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST]], i64 0, i64 2
+// CHECK: [[BITCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[ARRS_PRIV]] to i8*
+// CHECK: store i8* [[BITCAST]], i8** [[ARRS_PRIV_REF]],
+// CHECK: [[ARRS_SIZE_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST]], i64 0, i64 3
+// CHECK: [[BITCAST:%.+]] = inttoptr i64 [[ARRS_SIZE]] to i8*
+// CHECK: store i8* [[BITCAST]], i8** [[ARRS_SIZE_REF]],
+
+// res = __kmpc_reduce(<loc>, <gtid>, <n>, sizeof(RedList), RedList, reduce_func, &<lock>);
+
+// CHECK: [[GTID_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[GTID_ADDR_ADDR]]
+// CHECK: [[GTID:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[GTID_REF]]
+// CHECK: [[BITCAST:%.+]] = bitcast [4 x i8*]* [[RED_LIST]] to i8*
+// CHECK: [[RES:%.+]] = call i32 @__kmpc_reduce_nowait(%{{.+}}* [[REDUCTION_LOC]], i32 [[GTID]], i32 2, i64 32, i8* [[BITCAST]], void (i8*, i8*)* [[REDUCTION_FUNC:@.+]], [8 x i32]* [[REDUCTION_LOCK]])
+
+// switch(res)
+// CHECK: switch i32 [[RES]], label %[[RED_DONE:.+]] [
+// CHECK: i32 1, label %[[CASE1:.+]]
+// CHECK: i32 2, label %[[CASE2:.+]]
+// CHECK: ]
+
+// case 1:
+// CHECK: [[CASE1]]
+
+// arr[:] += arr_reduction[:];
+// CHECK: [[END:%.+]] = getelementptr i32, i32* [[LB1_0]], i64 [[ARR_SIZE]]
+// CHECK: [[ISEMPTY:%.+]] = icmp eq i32* [[LB1_0]], [[END]]
+// CHECK: br i1 [[ISEMPTY]],
+// CHECK: phi i32*
+// CHECK: [[ADD:%.+]] = mul nsw i32 555, %
+// CHECK: store i32 [[ADD]], i32* %
+// CHECK: [[DONE:%.+]] = icmp eq i32* %{{.+}}, [[END]]
+// CHECK: br i1 [[DONE]],
+
+// arrs[:] = var.operator &(arrs_reduction[:]);
+// CHECK: [[END:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[ARRS_LB:%.+]], i64 [[ARRS_SIZE]]
+// CHECK: [[ISEMPTY:%.+]] = icmp eq [[S_FLOAT_TY]]* [[ARRS_LB]], [[END]]
+// CHECK: br i1 [[ISEMPTY]],
+// CHECK: phi [[S_FLOAT_TY]]*
+// CHECK: call void @_Z3redR6BaseS1RKS_(%
+// CHECK: [[DONE:%.+]] = icmp eq [[S_FLOAT_TY]]* %{{.+}}, [[END]]
+// CHECK: br i1 [[DONE]],
+
+// __kmpc_end_reduce(<loc>, <gtid>, &<lock>);
+// CHECK: call void @__kmpc_end_reduce_nowait(%{{.+}}* [[REDUCTION_LOC]], i32 [[GTID]], [8 x i32]* [[REDUCTION_LOCK]])
+
+// break;
+// CHECK: br label %[[RED_DONE]]
+
+// case 2:
+// CHECK: [[CASE2]]
+
+// arr[:] += arr_reduction[:];
+// CHECK: [[END:%.+]] = getelementptr i32, i32* [[LB1_0]], i64 [[ARR_SIZE]]
+// CHECK: [[ISEMPTY:%.+]] = icmp eq i32* [[LB1_0]], [[END]]
+// CHECK: br i1 [[ISEMPTY]],
+// CHECK: phi i32*
+// CHECK: call void @__kmpc_critical(
+// CHECK: [[ADD:%.+]] = mul nsw i32 555, %
+// CHECK: call void @__kmpc_end_critical(
+// CHECK: [[DONE:%.+]] = icmp eq i32* %{{.+}}, [[END]]
+// CHECK: br i1 [[DONE]],
+
+// arrs[:] = var.operator &(arrs_reduction[:]);
+// CHECK: [[END:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[ARRS_LB:%.+]], i64 [[ARRS_SIZE]]
+// CHECK: [[ISEMPTY:%.+]] = icmp eq [[S_FLOAT_TY]]* [[ARRS_LB]], [[END]]
+// CHECK: br i1 [[ISEMPTY]],
+// CHECK: phi [[S_FLOAT_TY]]*
+// CHECK: call void @__kmpc_critical(
+// CHECK: call void @_Z3redR6BaseS1RKS_(%
+// CHECK: call void @__kmpc_end_critical(
+// CHECK: [[DONE:%.+]] = icmp eq [[S_FLOAT_TY]]* %{{.+}}, [[END]]
+// CHECK: br i1 [[DONE]],
+
+// break;
+// CHECK: br label %[[RED_DONE]]
+// CHECK: [[RED_DONE]]
+
+// Check destruction of private copy.
+// CHECK: [[END:%.+]] = getelementptr inbounds [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[ARRS_PRIV]], i64 [[ARRS_SIZE]]
+// CHECK: [[ISEMPTY:%.+]] = icmp eq [[S_FLOAT_TY]]* [[ARRS_PRIV]], [[END]]
+// CHECK: br i1 [[ISEMPTY]],
+// CHECK: phi [[S_FLOAT_TY]]*
+// CHECK: call void @_ZN1SIfED1Ev([[S_FLOAT_TY]]* %
+// CHECK: [[DONE:%.+]] = icmp eq [[S_FLOAT_TY]]* %{{.+}}, [[ARRS_PRIV]]
+// CHECK: br i1 [[DONE]],
+// CHECK: call void @llvm.stackrestore(i8*
+
+// CHECK: ret void
+
+// void reduce_func(void *lhs[<n>], void *rhs[<n>]) {
+//  *(Type0*)lhs[0] = ReductionOperation0(*(Type0*)lhs[0], *(Type0*)rhs[0]);
+//  ...
+//  *(Type<n>-1*)lhs[<n>-1] = ReductionOperation<n>-1(*(Type<n>-1*)lhs[<n>-1],
+//  *(Type<n>-1*)rhs[<n>-1]);
+// }
+// CHECK: define internal void [[REDUCTION_FUNC]](i8*, i8*)
+// arr_rhs = (int*)rhs[0];
+// CHECK: [[ARR_RHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_RHS:%.+]], i64 0, i64 0
+// CHECK: [[ARR_RHS_VOID:%.+]] = load i8*, i8** [[ARR_RHS_REF]],
+// CHECK: [[ARR_RHS:%.+]] = bitcast i8* [[ARR_RHS_VOID]] to i32*
+// arr_lhs = (int*)lhs[0];
+// CHECK: [[ARR_LHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_LHS:%.+]], i64 0, i64 0
+// CHECK: [[ARR_LHS_VOID:%.+]] = load i8*, i8** [[ARR_LHS_REF]],
+// CHECK: [[ARR_LHS:%.+]] = bitcast i8* [[ARR_LHS_VOID]] to i32*
+
+// arr_size = (size_t)lhs[1];
+// CHECK: [[ARR_SIZE_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_LHS]], i64 0, i64 1
+// CHECK: [[ARR_SIZE_VOID:%.+]] = load i8*, i8** [[ARR_SIZE_REF]],
+// CHECK: [[ARR_SIZE:%.+]] = ptrtoint i8* [[ARR_SIZE_VOID]] to i64
+
+// arrs_rhs = (S<float>*)rhs[2];
+// CHECK: [[ARRS_RHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_RHS]], i64 0, i64 2
+// CHECK: [[ARRS_RHS_VOID:%.+]] = load i8*, i8** [[ARRS_RHS_REF]],
+// CHECK: [[ARRS_RHS:%.+]] = bitcast i8* [[ARRS_RHS_VOID]] to [[S_FLOAT_TY]]*
+// arrs_lhs = (S<float>*)lhs[2];
+// CHECK: [[ARRS_LHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_LHS]], i64 0, i64 2
+// CHECK: [[ARRS_LHS_VOID:%.+]] = load i8*, i8** [[ARRS_LHS_REF]],
+// CHECK: [[ARRS_LHS:%.+]] = bitcast i8* [[ARRS_LHS_VOID]] to [[S_FLOAT_TY]]*
+
+// arrs_size = (size_t)lhs[3];
+// CHECK: [[ARRS_SIZE_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_LHS]], i64 0, i64 3
+// CHECK: [[ARRS_SIZE_VOID:%.+]] = load i8*, i8** [[ARRS_SIZE_REF]],
+// CHECK: [[ARRS_SIZE:%.+]] = ptrtoint i8* [[ARRS_SIZE_VOID]] to i64
+
+// arr_lhs[:] += arr_rhs[:];
+// CHECK: [[END:%.+]] = getelementptr i32, i32* [[ARR_LHS]], i64 [[ARR_SIZE]]
+// CHECK: [[ISEMPTY:%.+]] = icmp eq i32* [[ARR_LHS]], [[END]]
+// CHECK: br i1 [[ISEMPTY]],
+// CHECK: phi i32*
+// CHECK: [[ADD:%.+]] = mul nsw i32 555, %
+// CHECK: [[DONE:%.+]] = icmp eq i32* %{{.+}}, [[END]]
+// CHECK: br i1 [[DONE]],
+
+// arrs_lhs = arrs_lhs.operator &(arrs_rhs);
+// CHECK: [[END:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[ARRS_LB:%.+]], i64 [[ARRS_SIZE]]
+// CHECK: [[ISEMPTY:%.+]] = icmp eq [[S_FLOAT_TY]]* [[ARRS_LB]], [[END]]
+// CHECK: br i1 [[ISEMPTY]],
+// CHECK: phi [[S_FLOAT_TY]]*
+// CHECK: call void @_Z3redR6BaseS1RKS_(%
+// CHECK: [[DONE:%.+]] = icmp eq [[S_FLOAT_TY]]* %{{.+}}, [[END]]
+// CHECK: br i1 [[DONE]],
+
+// CHECK: ret void
+
+// CHECK: define internal void [[MAIN_MICROTASK2]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, i64 %{{.+}}, i64 %{{.+}}, i32* nonnull %{{.+}}, [10 x [4 x [[S_FLOAT_TY]]]]* dereferenceable(480) %{{.+}})
+
+// CHECK: [[ARRS_PRIV:%.+]] = alloca [10 x [4 x [[S_FLOAT_TY]]]],
+
+// Reduction list for runtime.
+// CHECK: [[RED_LIST:%.+]] = alloca [3 x i8*],
+
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
+
+// CHECK: [[ARR_SIZE:%.+]] = mul nuw i64 %{{.+}}, 4
+// CHECK: call i8* @llvm.stacksave()
+// CHECK: [[ARR_PRIV:%.+]] = alloca i32, i64 [[ARR_SIZE]],
+
+// Check initialization of private copy.
+// CHECK: [[END:%.+]] = getelementptr i32, i32* [[ARR_PRIV]], i64 [[ARR_SIZE]]
+// CHECK: [[ISEMPTY:%.+]] = icmp eq i32* [[ARR_PRIV]], [[END]]
+// CHECK: br i1 [[ISEMPTY]],
+// CHECK: phi i32*
+// CHECK: store i32 888, i32* %
+// CHECK: [[DONE:%.+]] = icmp eq i32* %{{.+}}, [[END]]
+// CHECK: br i1 [[DONE]],
+
+// Check initialization of private copy.
+// CHECK: [[LHS_BEGIN:%.+]] = bitcast [10 x [4 x [[S_FLOAT_TY]]]]* %{{.+}} to [[S_FLOAT_TY]]*
+// CHECK: [[BEGIN:%.+]] = getelementptr inbounds [10 x [4 x [[S_FLOAT_TY]]]], [10 x [4 x [[S_FLOAT_TY]]]]* [[ARRS_PRIV]], i32 0, i32 0, i32 0
+// CHECK: [[END:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[BEGIN]], i64 40
+// CHECK: [[ISEMPTY:%.+]] = icmp eq [[S_FLOAT_TY]]* [[BEGIN]], [[END]]
+// CHECK: br i1 [[ISEMPTY]],
+// CHECK: phi [[S_FLOAT_TY]]*
+// CHECK: call void @_Z4initR6BaseS1RKS_(%
+// CHECK: [[DONE:%.+]] = icmp eq [[S_FLOAT_TY]]* %{{.+}}, [[END]]
+// CHECK: br i1 [[DONE]],
+// CHECK: [[ARRS_PRIV_BEGIN:%.+]] = bitcast [10 x [4 x [[S_FLOAT_TY]]]]* [[ARRS_PRIV]] to [[S_FLOAT_TY]]*
+
+// CHECK: [[GTID_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[GTID_ADDR_ADDR]]
+// CHECK: [[GTID:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[GTID_REF]]
+// CHECK: call void @__kmpc_for_static_init_4(
+// Skip checks for internal operations.
+// CHECK: call void @__kmpc_for_static_fini(
+
+// void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
+
+// CHECK: [[ARR_PRIV_REF:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[RED_LIST]], i64 0, i64 0
+// CHECK: [[BITCAST:%.+]] = bitcast i32* [[ARR_PRIV]] to i8*
+// CHECK: store i8* [[BITCAST]], i8** [[ARR_PRIV_REF]],
+// CHECK: [[ARR_SIZE_REF:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[RED_LIST]], i64 0, i64 1
+// CHECK: [[BITCAST:%.+]] = inttoptr i64 [[ARR_SIZE]] to i8*
+// CHECK: store i8* [[BITCAST]], i8** [[ARR_SIZE_REF]],
+// CHECK: [[ARRS_PRIV_REF:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[RED_LIST]], i64 0, i64 2
+// CHECK: [[BITCAST:%.+]] = bitcast [[S_FLOAT_TY]]* [[ARRS_PRIV_BEGIN]] to i8*
+// CHECK: store i8* [[BITCAST]], i8** [[ARRS_PRIV_REF]],
+
+// res = __kmpc_reduce(<loc>, <gtid>, <n>, sizeof(RedList), RedList, reduce_func, &<lock>);
+
+// CHECK: [[GTID_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[GTID_ADDR_ADDR]]
+// CHECK: [[GTID:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[GTID_REF]]
+// CHECK: [[BITCAST:%.+]] = bitcast [3 x i8*]* [[RED_LIST]] to i8*
+// CHECK: [[RES:%.+]] = call i32 @__kmpc_reduce(%{{.+}}* [[REDUCTION_LOC]], i32 [[GTID]], i32 2, i64 24, i8* [[BITCAST]], void (i8*, i8*)* [[REDUCTION_FUNC:@.+]], [8 x i32]* [[REDUCTION_LOCK]])
+
+// switch(res)
+// CHECK: switch i32 [[RES]], label %[[RED_DONE:.+]] [
+// CHECK: i32 1, label %[[CASE1:.+]]
+// CHECK: i32 2, label %[[CASE2:.+]]
+// CHECK: ]
+
+// case 1:
+// CHECK: [[CASE1]]
+
+// arr[:] += arr_reduction[:];
+// CHECK: [[END:%.+]] = getelementptr i32, i32* [[LB1_0:%.+]], i64 [[ARR_SIZE]]
+// CHECK: [[ISEMPTY:%.+]] = icmp eq i32* [[LB1_0]], [[END]]
+// CHECK: br i1 [[ISEMPTY]],
+// CHECK: phi i32*
+// CHECK: [[ADD:%[^ ]+]] = mul nsw i32 555, %
+// CHECK: store i32 [[ADD]], i32* %
+// CHECK: [[DONE:%.+]] = icmp eq i32* %{{.+}}, [[END]]
+// CHECK: br i1 [[DONE]],
+
+// arrs[:] = var.operator &(arrs_reduction[:]);
+// CHECK: [[END:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[LHS_BEGIN]], i64 40
+// CHECK: [[ISEMPTY:%.+]] = icmp eq [[S_FLOAT_TY]]* [[LHS_BEGIN]], [[END]]
+// CHECK: br i1 [[ISEMPTY]],
+// CHECK: phi [[S_FLOAT_TY]]*
+// CHECK: call void @_Z3redR6BaseS1RKS_(%
+// CHECK: [[DONE:%.+]] = icmp eq [[S_FLOAT_TY]]* %{{.+}}, [[END]]
+// CHECK: br i1 [[DONE]],
+
+// __kmpc_end_reduce(<loc>, <gtid>, &<lock>);
+// CHECK: call void @__kmpc_end_reduce(%{{.+}}* [[REDUCTION_LOC]], i32 [[GTID]], [8 x i32]* [[REDUCTION_LOCK]])
+
+// break;
+// CHECK: br label %[[RED_DONE]]
+
+// case 2:
+// CHECK: [[CASE2]]
+
+// arr[:] += arr_reduction[:];
+// CHECK: [[END:%.+]] = getelementptr i32, i32* [[LB1_0]], i64 [[ARR_SIZE]]
+// CHECK: [[ISEMPTY:%.+]] = icmp eq i32* [[LB1_0]], [[END]]
+// CHECK: br i1 [[ISEMPTY]],
+// CHECK: phi i32*
+// CHECK: call void @__kmpc_critical(
+// CHECK: [[ADD:%.+]] = mul nsw i32 555, %
+// CHECK: call void @__kmpc_end_critical(
+// CHECK: [[DONE:%.+]] = icmp eq i32* %{{.+}}, [[END]]
+// CHECK: br i1 [[DONE]],
+
+// arrs[:] = var.operator &(arrs_reduction[:]);
+// CHECK: [[END:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[LHS_BEGIN]], i64 40
+// CHECK: [[ISEMPTY:%.+]] = icmp eq [[S_FLOAT_TY]]* [[LHS_BEGIN]], [[END]]
+// CHECK: br i1 [[ISEMPTY]],
+// CHECK: phi [[S_FLOAT_TY]]*
+// CHECK: call void @__kmpc_critical(
+// CHECK: call void @_Z3redR6BaseS1RKS_(%
+// CHECK: call void @__kmpc_end_critical(
+// CHECK: [[DONE:%.+]] = icmp eq [[S_FLOAT_TY]]* %{{.+}}, [[END]]
+// CHECK: br i1 [[DONE]],
+
+// break;
+// CHECK: br label %[[RED_DONE]]
+// CHECK: [[RED_DONE]]
+
+// Check destruction of private copy.
+// CHECK: [[BEGIN:%.+]] = getelementptr inbounds [10 x [4 x [[S_FLOAT_TY]]]], [10 x [4 x [[S_FLOAT_TY]]]]* [[ARRS_PRIV]], i32 0, i32 0, i32 0
+// CHECK: [[END:%.+]] = getelementptr inbounds [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[BEGIN]], i64 40
+// CHECK: br
+// CHECK: phi [[S_FLOAT_TY]]*
+// CHECK: call void @_ZN1SIfED1Ev([[S_FLOAT_TY]]* %
+// CHECK: [[DONE:%.+]] = icmp eq [[S_FLOAT_TY]]* %{{.+}}, [[BEGIN]]
+// CHECK: br i1 [[DONE]],
+// CHECK: call void @llvm.stackrestore(i8*
+// CHECK: call void @__kmpc_barrier(
+
+// CHECK: ret void
+
+// void reduce_func(void *lhs[<n>], void *rhs[<n>]) {
+//  *(Type0*)lhs[0] = ReductionOperation0(*(Type0*)lhs[0], *(Type0*)rhs[0]);
+//  ...
+//  *(Type<n>-1*)lhs[<n>-1] = ReductionOperation<n>-1(*(Type<n>-1*)lhs[<n>-1],
+//  *(Type<n>-1*)rhs[<n>-1]);
+// }
+// CHECK: define internal void [[REDUCTION_FUNC]](i8*, i8*)
+// arr_rhs = (int*)rhs[0];
+// CHECK: [[ARR_RHS_REF:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[RED_LIST_RHS:%.+]], i64 0, i64 0
+// CHECK: [[ARR_RHS_VOID:%.+]] = load i8*, i8** [[ARR_RHS_REF]],
+// CHECK: [[ARR_RHS:%.+]] = bitcast i8* [[ARR_RHS_VOID]] to i32*
+// arr_lhs = (int*)lhs[0];
+// CHECK: [[ARR_LHS_REF:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[RED_LIST_LHS:%.+]], i64 0, i64 0
+// CHECK: [[ARR_LHS_VOID:%.+]] = load i8*, i8** [[ARR_LHS_REF]],
+// CHECK: [[ARR_LHS:%.+]] = bitcast i8* [[ARR_LHS_VOID]] to i32*
+
+// arr_size = (size_t)lhs[1];
+// CHECK: [[ARR_SIZE_REF:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[RED_LIST_LHS]], i64 0, i64 1
+// CHECK: [[ARR_SIZE_VOID:%.+]] = load i8*, i8** [[ARR_SIZE_REF]],
+// CHECK: [[ARR_SIZE:%.+]] = ptrtoint i8* [[ARR_SIZE_VOID]] to i64
+
+// arrs_rhs = (S<float>*)rhs[2];
+// CHECK: [[ARRS_RHS_REF:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[RED_LIST_RHS]], i64 0, i64 2
+// CHECK: [[ARRS_RHS_VOID:%.+]] = load i8*, i8** [[ARRS_RHS_REF]],
+// CHECK: [[ARRS_RHS:%.+]] = bitcast i8* [[ARRS_RHS_VOID]] to [[S_FLOAT_TY]]*
+// arrs_lhs = (S<float>*)lhs[2];
+// CHECK: [[ARRS_LHS_REF:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[RED_LIST_LHS]], i64 0, i64 2
+// CHECK: [[ARRS_LHS_VOID:%.+]] = load i8*, i8** [[ARRS_LHS_REF]],
+// CHECK: [[ARRS_LHS:%.+]] = bitcast i8* [[ARRS_LHS_VOID]] to [[S_FLOAT_TY]]*
+
+// arr_lhs[:] += arr_rhs[:];
+// CHECK: [[END:%.+]] = getelementptr i32, i32* [[ARR_LHS]], i64 [[ARR_SIZE]]
+// CHECK: [[ISEMPTY:%.+]] = icmp eq i32* [[ARR_LHS]], [[END]]
+// CHECK: br i1 [[ISEMPTY]],
+// CHECK: phi i32*
+// CHECK: [[ADD:%.+]] = mul nsw i32 555, %
+// CHECK: store i32 [[ADD]], i32* %
+// CHECK: [[DONE:%.+]] = icmp eq i32* %{{.+}}, [[END]]
+// CHECK: br i1 [[DONE]],
+
+// arrs_lhs = arrs_lhs.operator &(arrs_rhs);
+// CHECK: [[END:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[ARRS_LB:%.+]], i64 40
+// CHECK: [[ISEMPTY:%.+]] = icmp eq [[S_FLOAT_TY]]* [[ARRS_LB]], [[END]]
+// CHECK: br i1 [[ISEMPTY]],
+// CHECK: phi [[S_FLOAT_TY]]*
+// CHECK: call void @_Z3redR6BaseS1RKS_(%
+// CHECK: [[DONE:%.+]] = icmp eq [[S_FLOAT_TY]]* %{{.+}}, [[END]]
+// CHECK: br i1 [[DONE]],
+
+// CHECK: ret void
+
+// CHECK: define internal void [[MAIN_MICROTASK3]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[S_FLOAT_TY]]*** dereferenceable(8) %{{.+}})
+
+// CHECK: [[VAR2_ORIG_ADDR:%.+]] = alloca [[S_FLOAT_TY]]***,
+
+// Reduction list for runtime.
+// CHECK: [[RED_LIST:%.+]] = alloca [2 x i8*],
+
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
+// CHECK: [[VAR2_ORIG:%.+]] = load [[S_FLOAT_TY]]***, [[S_FLOAT_TY]]**** [[VAR2_ORIG_ADDR]],
+
+// CHECK: load [[S_FLOAT_TY]]**, [[S_FLOAT_TY]]*** [[VAR2_ORIG]],
+// CHECK: getelementptr inbounds [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** %{{.+}}, i64 0
+// CHECK: load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** %
+// CHECK: [[LOW:%.+]] = getelementptr inbounds [[S_FLOAT_TY]], [[S_FLOAT_TY]]* %{{.+}}, i64 1
+// CHECK: load [[S_FLOAT_TY]]**, [[S_FLOAT_TY]]*** [[VAR2_ORIG]],
+// CHECK: getelementptr inbounds [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** %{{.+}}, i64 4
+// CHECK: load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** %
+// CHECK: getelementptr inbounds [[S_FLOAT_TY]], [[S_FLOAT_TY]]* %{{.+}}, i64 6
+// CHECK: [[LD:%.+]] = load [[S_FLOAT_TY]]**, [[S_FLOAT_TY]]*** [[VAR2_ORIG]],
+// CHECK: [[ORIG_START:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** [[LD]],
+// CHECK: [[LAST:%.+]] = ptrtoint [[S_FLOAT_TY]]* %{{.+}} to i64
+// CHECK: [[FIRST:%.+]] = ptrtoint [[S_FLOAT_TY]]* [[LOW]] to i64
+// CHECK: [[BYTE_DIF:%.+]] = sub i64 [[LAST]], [[FIRST]]
+// CHECK: [[DIF:%.+]] = sdiv exact i64 [[BYTE_DIF]], ptrtoint ([[S_FLOAT_TY]]* getelementptr ([[S_FLOAT_TY]], [[S_FLOAT_TY]]* null, i32 1) to i64)
+// CHECK: [[SIZE:%.+]] = add nuw i64 [[DIF]], 1
+// CHECK: call i8* @llvm.stacksave()
+// CHECK: [[VAR2_PRIV:%.+]] = alloca [[S_FLOAT_TY]], i64 [[SIZE]],
+// CHECK: [[START:%.+]] = ptrtoint [[S_FLOAT_TY]]* [[ORIG_START]] to i64
+// CHECK: [[LOW_BOUND:%.+]] = ptrtoint [[S_FLOAT_TY]]* [[LOW]] to i64
+// CHECK: [[OFFSET_BYTES:%.+]] = sub i64 [[START]], [[LOW_BOUND]]
+// CHECK: [[OFFSET:%.+]] = sdiv exact i64 [[OFFSET_BYTES]], ptrtoint ([[S_FLOAT_TY]]* getelementptr ([[S_FLOAT_TY]], [[S_FLOAT_TY]]* null, i32 1) to i64)
+// CHECK: [[PSEUDO_VAR2_PRIV:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[VAR2_PRIV]], i64 [[OFFSET]]
+// CHECK: store [[S_FLOAT_TY]]** [[REF:.+]], [[S_FLOAT_TY]]*** %
+// CHECK: store [[S_FLOAT_TY]]* [[PSEUDO_VAR2_PRIV]], [[S_FLOAT_TY]]** [[REF]]
+// CHECK: ret void
+
+// CHECK: define internal void [[MAIN_MICROTASK4]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [2 x [[S_FLOAT_TY]]]* dereferenceable(24) %{{.+}})
+
+// CHECK: [[VVAR2_ORIG_ADDR:%.+]] = alloca [2 x [[S_FLOAT_TY]]]*,
+
+// Reduction list for runtime.
+// CHECK: [[RED_LIST:%.+]] = alloca [2 x i8*],
+
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
+// CHECK: [[VVAR2_ORIG:%.+]] = load [2 x [[S_FLOAT_TY]]]*, [2 x [[S_FLOAT_TY]]]** [[VVAR2_ORIG_ADDR]],
+
+// CHECK: [[LOW:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[VVAR2_ORIG]], i64 0, i64 0
+// CHECK: getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[VVAR2_ORIG]], i64 0, i64 4
+// CHECK: [[ORIG_START:%.+]] = bitcast [2 x [[S_FLOAT_TY]]]* [[VVAR2_ORIG]] to [[S_FLOAT_TY]]*
+// CHECK: [[LAST:%.+]] = ptrtoint [[S_FLOAT_TY]]* %{{.+}} to i64
+// CHECK: [[FIRST:%.+]] = ptrtoint [[S_FLOAT_TY]]* [[LOW]] to i64
+// CHECK: [[BYTE_DIF:%.+]] = sub i64 [[LAST]], [[FIRST]]
+// CHECK: [[DIF:%.+]] = sdiv exact i64 [[BYTE_DIF]], ptrtoint ([[S_FLOAT_TY]]* getelementptr ([[S_FLOAT_TY]], [[S_FLOAT_TY]]* null, i32 1) to i64)
+// CHECK: [[SIZE:%.+]] = add nuw i64 [[DIF]], 1
+// CHECK: call i8* @llvm.stacksave()
+// CHECK: [[VVAR2_PRIV:%.+]] = alloca [[S_FLOAT_TY]], i64 [[SIZE]],
+// CHECK: [[START:%.+]] = ptrtoint [[S_FLOAT_TY]]* [[ORIG_START]] to i64
+// CHECK: [[LOW_BOUND:%.+]] = ptrtoint [[S_FLOAT_TY]]* [[LOW]] to i64
+// CHECK: [[OFFSET_BYTES:%.+]] = sub i64 [[START]], [[LOW_BOUND]]
+// CHECK: [[OFFSET:%.+]] = sdiv exact i64 [[OFFSET_BYTES]], ptrtoint ([[S_FLOAT_TY]]* getelementptr ([[S_FLOAT_TY]], [[S_FLOAT_TY]]* null, i32 1) to i64)
+// CHECK: [[PSEUDO_VVAR2_PRIV:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[VVAR2_PRIV]], i64 [[OFFSET]]
+// CHECK: [[VVAR2_PRIV:%.+]] = bitcast [[S_FLOAT_TY]]* [[PSEUDO_VVAR2_PRIV]] to [2 x [[S_FLOAT_TY]]]*
+// CHECK: ret void
+
+// CHECK: define internal void [[MAIN_MICROTASK5]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [2 x [[S_FLOAT_TY]]]* dereferenceable(24) %{{.+}})
+
+// CHECK: [[VAR3_ORIG_ADDR:%.+]] = alloca [2 x [[S_FLOAT_TY]]]*,
+
+// Reduction list for runtime.
+// CHECK: [[RED_LIST:%.+]] = alloca [2 x i8*],
+
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
+
+// CHECK: [[VAR3_ORIG:%.+]] = load [2 x [[S_FLOAT_TY]]]*, [2 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR]],
+// CHECK: store [2 x [[S_FLOAT_TY]]]* [[VAR3_ORIG]], [2 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR:%.+]],
+// CHECK: [[VAR3_ORIG:%.+]] = load [2 x [[S_FLOAT_TY]]]*, [2 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR]],
+// CHECK: [[LOW:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[VAR3_ORIG]], i64 0, i64 1
+// CHECK: [[VAR3_ORIG:%.+]] = load [2 x [[S_FLOAT_TY]]]*, [2 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR]],
+// CHECK: getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[VAR3_ORIG]], i64 0, i64 2
+// CHECK: [[VAR3_ORIG:%.+]] = load [2 x [[S_FLOAT_TY]]]*, [2 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR]],
+// CHECK: [[ORIG_START:%.+]] = bitcast [2 x [[S_FLOAT_TY]]]* [[VAR3_ORIG]] to [[S_FLOAT_TY]]*
+// CHECK: [[LAST:%.+]] = ptrtoint [[S_FLOAT_TY]]* %{{.+}} to i64
+// CHECK: [[FIRST:%.+]] = ptrtoint [[S_FLOAT_TY]]* [[LOW]] to i64
+// CHECK: [[BYTE_DIF:%.+]] = sub i64 [[LAST]], [[FIRST]]
+// CHECK: [[DIF:%.+]] = sdiv exact i64 [[BYTE_DIF]], ptrtoint ([[S_FLOAT_TY]]* getelementptr ([[S_FLOAT_TY]], [[S_FLOAT_TY]]* null, i32 1) to i64)
+// CHECK: [[SIZE:%.+]] = add nuw i64 [[DIF]], 1
+// CHECK: call i8* @llvm.stacksave()
+// CHECK: [[VAR3_PRIV:%.+]] = alloca [[S_FLOAT_TY]], i64 [[SIZE]],
+// CHECK: [[START:%.+]] = ptrtoint [[S_FLOAT_TY]]* [[ORIG_START]] to i64
+// CHECK: [[LOW_BOUND:%.+]] = ptrtoint [[S_FLOAT_TY]]* [[LOW]] to i64
+// CHECK: [[OFFSET_BYTES:%.+]] = sub i64 [[START]], [[LOW_BOUND]]
+// CHECK: [[OFFSET:%.+]] = sdiv exact i64 [[OFFSET_BYTES]], ptrtoint ([[S_FLOAT_TY]]* getelementptr ([[S_FLOAT_TY]], [[S_FLOAT_TY]]* null, i32 1) to i64)
+// CHECK: [[PSEUDO_VAR3_PRIV:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[VAR3_PRIV]], i64 [[OFFSET]]
+// CHECK: [[VAR3_PRIV:%.+]] = bitcast [[S_FLOAT_TY]]* [[PSEUDO_VAR3_PRIV]] to [2 x [[S_FLOAT_TY]]]*
+
+// CHECK: store [2 x [[S_FLOAT_TY]]]* [[VAR3_PRIV]], [2 x [[S_FLOAT_TY]]]** %
+
+// CHECK: ret void
+
+// CHECK: define internal void [[MAIN_MICROTASK6]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [2 x [[S_FLOAT_TY]]]* dereferenceable(24) %{{.+}})
+
+// CHECK: [[VAR3_ORIG_ADDR:%.+]] = alloca [2 x [[S_FLOAT_TY]]]*,
+// CHECK: [[VAR3_PRIV:%.+]] = alloca [2 x [[S_FLOAT_TY]]],
+
+// Reduction list for runtime.
+// CHECK: [[RED_LIST:%.+]] = alloca [1 x i8*],
+
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
+
+// CHECK: [[VAR3_ORIG:%.+]] = load [2 x [[S_FLOAT_TY]]]*, [2 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR]],
+// CHECK: store [2 x [[S_FLOAT_TY]]]* [[VAR3_ORIG]], [2 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR:%.+]],
+// CHECK: [[VAR3_ORIG:%.+]] = load [2 x [[S_FLOAT_TY]]]*, [2 x [[S_FLOAT_TY]]]** [[VAR3_ORIG_ADDR]],
+// CHECK: bitcast [2 x [[S_FLOAT_TY]]]* [[VAR3_ORIG]] to [[S_FLOAT_TY]]*
+// CHECK: getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[VAR3_PRIV]], i32 0, i32 0
+// CHECK: getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* %{{.+}}, i64 2
+
+// CHECK: store [2 x [[S_FLOAT_TY]]]* [[VAR3_PRIV]], [2 x [[S_FLOAT_TY]]]** %
+
+// CHECK: ret void
+
+// CHECK: define {{.*}} i{{[0-9]+}} [[TMAIN_INT]]()
+// CHECK: [[TEST:%.+]] = alloca [[S_INT_TY]],
+// CHECK: call {{.*}} [[S_INT_TY_CONSTR:@.+]]([[S_INT_TY]]* [[TEST]])
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 6, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, i32*, [[S_INT_TY]]*, [[S_INT_TY]]*, i32*, [2 x i32]*, [2 x [[S_INT_TY]]]*)* [[TMAIN_MICROTASK:@.+]] to void
+// CHECK: call {{.*}} [[S_INT_TY_DESTR:@.+]]([[S_INT_TY]]*
+// CHECK: ret
+//
+// CHECK: define internal void [[TMAIN_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, i32* dereferenceable(4) %{{.+}}, [[S_INT_TY]]* dereferenceable(12) %{{.+}}, [[S_INT_TY]]* dereferenceable(12) %{{.+}}, i32* dereferenceable(4) %{{.+}}, [2 x i32]* dereferenceable(8) %{{.+}}, [2 x [[S_INT_TY]]]* dereferenceable(24) %{{.+}})
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
+// CHECK: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[VAR_PRIV:%.+]] = alloca [[S_INT_TY]],
+// CHECK: [[VAR1_PRIV:%.+]] = alloca [[S_INT_TY]],
+// CHECK: [[T_VAR1_PRIV:%.+]] = alloca i{{[0-9]+}},
+
+// Reduction list for runtime.
+// CHECK: [[RED_LIST:%.+]] = alloca [4 x i8*],
+
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
+
+// CHECK: [[T_VAR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** %
+// CHECK: [[VAR1_REF:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** %
+// CHECK: [[T_VAR1_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** %
+
+// For + reduction operation initial value of private variable is 0.
+// CHECK: store i32 321, i32* %
+
+// For & reduction operation initial value of private variable is ones in all bits.
+// CHECK: call void @_Z4initR6BaseS1RKS_(
+
+// For && reduction operation initial value of private variable is 1.0.
+// CHECK: call void @_Z5init2R6BaseS1RKS_(
+
+// For min reduction operation initial value of private variable is largest repesentable value.
+// CHECK: sdiv i32 432, %
+
+// CHECK: [[GTID_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[GTID_ADDR_ADDR]]
+// CHECK: [[GTID:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[GTID_REF]]
+// CHECK: call void @__kmpc_for_static_init_4(
+// Skip checks for internal operations.
+// CHECK: call void @__kmpc_for_static_fini(
+
+// void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
+
+// CHECK: [[T_VAR_PRIV_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST]], i64 0, i64 0
+// CHECK: [[BITCAST:%.+]] = bitcast i{{[0-9]+}}* [[T_VAR_PRIV]] to i8*
+// CHECK: store i8* [[BITCAST]], i8** [[T_VAR_PRIV_REF]],
+// CHECK: [[VAR_PRIV_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST]], i64 0, i64 1
+// CHECK: [[BITCAST:%.+]] = bitcast [[S_INT_TY]]* [[VAR_PRIV]] to i8*
+// CHECK: store i8* [[BITCAST]], i8** [[VAR_PRIV_REF]],
+// CHECK: [[VAR1_PRIV_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST]], i64 0, i64 2
+// CHECK: [[BITCAST:%.+]] = bitcast [[S_INT_TY]]* [[VAR1_PRIV]] to i8*
+// CHECK: store i8* [[BITCAST]], i8** [[VAR1_PRIV_REF]],
+// CHECK: [[T_VAR1_PRIV_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST]], i64 0, i64 3
+// CHECK: [[BITCAST:%.+]] = bitcast i{{[0-9]+}}* [[T_VAR1_PRIV]] to i8*
+// CHECK: store i8* [[BITCAST]], i8** [[T_VAR1_PRIV_REF]],
+
+// res = __kmpc_reduce_nowait(<loc>, <gtid>, <n>, sizeof(RedList), RedList, reduce_func, &<lock>);
+
+// CHECK: [[BITCAST:%.+]] = bitcast [4 x i8*]* [[RED_LIST]] to i8*
+// CHECK: [[RES:%.+]] = call i32 @__kmpc_reduce_nowait(%{{.+}}* [[REDUCTION_LOC]], i32 [[GTID]], i32 4, i64 32, i8* [[BITCAST]], void (i8*, i8*)* [[REDUCTION_FUNC:@.+]], [8 x i32]* [[REDUCTION_LOCK]])
+
+// switch(res)
+// CHECK: switch i32 [[RES]], label %[[RED_DONE:.+]] [
+// CHECK: i32 1, label %[[CASE1:.+]]
+// CHECK: i32 2, label %[[CASE2:.+]]
+// CHECK: ]
+
+// case 1:
+// t_var += t_var_reduction;
+// CHECK: add nsw i32 1513, %
+
+// var = var.operator &(var_reduction);
+// CHECK: call void @_Z3redR6BaseS1RKS_(%
+
+// var1 = var1.operator &&(var1_reduction);
+// CHECK: mul nsw i32 17, %
+
+// t_var1 = min(t_var1, t_var1_reduction);
+// CHECK: sub nsw i32 47, %
+
+// __kmpc_end_reduce_nowait(<loc>, <gtid>, &<lock>);
+// CHECK: call void @__kmpc_end_reduce_nowait(%{{.+}}* [[REDUCTION_LOC]], i32 [[GTID]], [8 x i32]* [[REDUCTION_LOCK]])
+
+// break;
+// CHECK: br label %[[RED_DONE]]
+
+// case 2:
+// t_var += t_var_reduction;
+// CHECK: call void @__kmpc_critical(
+// CHECK: add nsw i32 1513, %
+// CHECK: call void @__kmpc_end_critical(
+
+// var = var.operator &(var_reduction);
+// CHECK: call void @__kmpc_critical(
+// CHECK: call void @_Z3redR6BaseS1RKS_(%
+// CHECK: call void @__kmpc_end_critical(
+
+// var1 = var1.operator &&(var1_reduction);
+// CHECK: call void @__kmpc_critical(
+// CHECK: mul nsw i32 17, %
+// CHECK: call void @__kmpc_end_critical(
+
+// t_var1 = min(t_var1, t_var1_reduction);
+// CHECK: call void @__kmpc_critical(
+// CHECK: sub nsw i32 47, %
+// CHECK: call void @__kmpc_end_critical(
+
+// break;
+// CHECK: br label %[[RED_DONE]]
+// CHECK: [[RED_DONE]]
+// CHECK-DAG: call {{.*}} [[S_INT_TY_DESTR]]([[S_INT_TY]]* [[VAR_PRIV]])
+// CHECK-DAG: call {{.*}} [[S_INT_TY_DESTR]]([[S_INT_TY]]*
+// CHECK: ret void
+
+// void reduce_func(void *lhs[<n>], void *rhs[<n>]) {
+//  *(Type0*)lhs[0] = ReductionOperation0(*(Type0*)lhs[0], *(Type0*)rhs[0]);
+//  ...
+//  *(Type<n>-1*)lhs[<n>-1] = ReductionOperation<n>-1(*(Type<n>-1*)lhs[<n>-1],
+//  *(Type<n>-1*)rhs[<n>-1]);
+// }
+// CHECK: define internal void [[REDUCTION_FUNC]](i8*, i8*)
+// t_var_lhs = (i{{[0-9]+}}*)lhs[0];
+// CHECK: [[T_VAR_RHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_RHS:%.+]], i64 0, i64 0
+// CHECK: [[T_VAR_RHS_VOID:%.+]] = load i8*, i8** [[T_VAR_RHS_REF]],
+// CHECK: [[T_VAR_RHS:%.+]] = bitcast i8* [[T_VAR_RHS_VOID]] to i{{[0-9]+}}*
+// t_var_rhs = (i{{[0-9]+}}*)rhs[0];
+// CHECK: [[T_VAR_LHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_LHS:%.+]], i64 0, i64 0
+// CHECK: [[T_VAR_LHS_VOID:%.+]] = load i8*, i8** [[T_VAR_LHS_REF]],
+// CHECK: [[T_VAR_LHS:%.+]] = bitcast i8* [[T_VAR_LHS_VOID]] to i{{[0-9]+}}*
+
+// var_lhs = (S<i{{[0-9]+}}>*)lhs[1];
+// CHECK: [[VAR_RHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_RHS]], i64 0, i64 1
+// CHECK: [[VAR_RHS_VOID:%.+]] = load i8*, i8** [[VAR_RHS_REF]],
+// CHECK: [[VAR_RHS:%.+]] = bitcast i8* [[VAR_RHS_VOID]] to [[S_INT_TY]]*
+// var_rhs = (S<i{{[0-9]+}}>*)rhs[1];
+// CHECK: [[VAR_LHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_LHS]], i64 0, i64 1
+// CHECK: [[VAR_LHS_VOID:%.+]] = load i8*, i8** [[VAR_LHS_REF]],
+// CHECK: [[VAR_LHS:%.+]] = bitcast i8* [[VAR_LHS_VOID]] to [[S_INT_TY]]*
+
+// var1_lhs = (S<i{{[0-9]+}}>*)lhs[2];
+// CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_RHS]], i64 0, i64 2
+// CHECK: [[VAR1_RHS_VOID:%.+]] = load i8*, i8** [[VAR1_RHS_REF]],
+// CHECK: [[VAR1_RHS:%.+]] = bitcast i8* [[VAR1_RHS_VOID]] to [[S_INT_TY]]*
+// var1_rhs = (S<i{{[0-9]+}}>*)rhs[2];
+// CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_LHS]], i64 0, i64 2
+// CHECK: [[VAR1_LHS_VOID:%.+]] = load i8*, i8** [[VAR1_LHS_REF]],
+// CHECK: [[VAR1_LHS:%.+]] = bitcast i8* [[VAR1_LHS_VOID]] to [[S_INT_TY]]*
+
+// t_var1_lhs = (i{{[0-9]+}}*)lhs[3];
+// CHECK: [[T_VAR1_RHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_RHS]], i64 0, i64 3
+// CHECK: [[T_VAR1_RHS_VOID:%.+]] = load i8*, i8** [[T_VAR1_RHS_REF]],
+// CHECK: [[T_VAR1_RHS:%.+]] = bitcast i8* [[T_VAR1_RHS_VOID]] to i{{[0-9]+}}*
+// t_var1_rhs = (i{{[0-9]+}}*)rhs[3];
+// CHECK: [[T_VAR1_LHS_REF:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[RED_LIST_LHS]], i64 0, i64 3
+// CHECK: [[T_VAR1_LHS_VOID:%.+]] = load i8*, i8** [[T_VAR1_LHS_REF]],
+// CHECK: [[T_VAR1_LHS:%.+]] = bitcast i8* [[T_VAR1_LHS_VOID]] to i{{[0-9]+}}*
+
+// t_var_lhs += t_var_rhs;
+// CHECK: add nsw i32 1513, %
+
+// var_lhs = var_lhs.operator &(var_rhs);
+// CHECK: call void @_Z3redR6BaseS1RKS_(%
+
+// var1_lhs = var1_lhs.operator &&(var1_rhs);
+// CHECK: mul nsw i32 17, %
+
+// t_var1_lhs = min(t_var1_lhs, t_var1_rhs);
+// CHECK: sub nsw i32 47, %
+// CHECK: ret void
+
+#endif
+
diff --git a/test/OpenMP/for_reduction_messages.cpp b/test/OpenMP/for_reduction_messages.cpp
index 317f88ce11ba9..45a4681440fc2 100644
--- a/test/OpenMP/for_reduction_messages.cpp
+++ b/test/OpenMP/for_reduction_messages.cpp
@@ -13,7 +13,7 @@ struct S1; // expected-note {{declared here}} expected-note 4 {{forward declarat
 extern S1 a;
 class S2 {
   mutable int a;
-  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 4 {{implicitly declared private here}}
+  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 3 {{implicitly declared private here}}
 
 public:
   S2() : a(0) {}
@@ -22,7 +22,7 @@ public:
   static const float S2sc;
 };
 const float S2::S2sc = 0; // expected-note 2 {{'S2sc' defined here}}
-S2 b;                     // expected-note 2 {{'b' defined here}}
+S2 b;                     // expected-note 3 {{'b' defined here}}
 const S2 ba[5];           // expected-note 2 {{'ba' defined here}}
 class S3 {
   int a;
@@ -33,7 +33,7 @@ public:
   S3 operator+(const S3 &arg1) { return arg1; }
 };
 int operator+(const S3 &arg1, const S3 &arg2) { return 5; }
-S3 c;               // expected-note 2 {{'c' defined here}}
+S3 c;               // expected-note 3 {{'c' defined here}}
 const S3 ca[5];     // expected-note 2 {{'ca' defined here}}
 extern const int f; // expected-note 4 {{'f' declared here}}
 class S4 {
@@ -55,9 +55,9 @@ class S5 {
 public:
   S5(int v) : a(v) {}
 };
-class S6 { // expected-note 2 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
+class S6 { // expected-note 3 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
 #if __cplusplus >= 201103L // C++11 or later
-// expected-note@-2 2 {{candidate function (the implicit move assignment operator) not viable: no known conversion from 'int' to 'S6' for 1st argument}}
+// expected-note@-2 3 {{candidate function (the implicit move assignment operator) not viable: no known conversion from 'int' to 'S6' for 1st argument}}
 #endif
   int a;
 
@@ -123,7 +123,7 @@ T tmain(T argc) {
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for reduction(foo : argc) //expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max'}}
+#pragma omp for reduction(foo : argc) // expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'float'}} expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'int'}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
@@ -135,11 +135,11 @@ T tmain(T argc) {
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified list item cannot be reduction}} expected-error 3 {{'operator+' is a private member of 'S2'}}
+#pragma omp for reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified list item cannot be reduction}} expected-error 2 {{'operator+' is a private member of 'S2'}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 3 {{const-qualified list item cannot be reduction}}
+#pragma omp for reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 4 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 3 {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
@@ -147,15 +147,15 @@ T tmain(T argc) {
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for reduction(+ : ba) // expected-error {{a reduction list item with array type 'const S2 [5]'}}
+#pragma omp for reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for reduction(* : ca) // expected-error {{a reduction list item with array type 'const S3 [5]'}}
+#pragma omp for reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for reduction(- : da) // expected-error {{a reduction list item with array type 'const int [5]'}} expected-error {{a reduction list item with array type 'const float [5]'}}
+#pragma omp for reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}} expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
@@ -175,7 +175,7 @@ T tmain(T argc) {
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for reduction(+ : o) // expected-error {{no viable overloaded '='}}
+#pragma omp for reduction(+ : o) // expected-error 2 {{no viable overloaded '='}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
@@ -187,7 +187,7 @@ T tmain(T argc) {
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for reduction(+ : p), reduction(+ : p) // expected-error 3 {{variable can appear only once in OpenMP 'reduction' clause}} expected-note 3 {{previously referenced here}}
+#pragma omp for reduction(+ : p), reduction(+ : p) // expected-error 2 {{variable can appear only once in OpenMP 'reduction' clause}} expected-note 2 {{previously referenced here}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
@@ -304,15 +304,15 @@ int main(int argc, char **argv) {
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for reduction(+ : ba) // expected-error {{a reduction list item with array type 'const S2 [5]'}}
+#pragma omp for reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for reduction(* : ca) // expected-error {{a reduction list item with array type 'const S3 [5]'}}
+#pragma omp for reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for reduction(- : da) // expected-error {{a reduction list item with array type 'const int [5]'}}
+#pragma omp for reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
diff --git a/test/OpenMP/for_simd_aligned_messages.cpp b/test/OpenMP/for_simd_aligned_messages.cpp
index 1007b3c545cc0..cef83c30e239c 100644
--- a/test/OpenMP/for_simd_aligned_messages.cpp
+++ b/test/OpenMP/for_simd_aligned_messages.cpp
@@ -196,6 +196,7 @@ int main(int argc, char **argv) {
   #pragma omp for simd aligned(h)
   for (int k = 0; k < argc; ++k) ++k;
   int *pargc = &argc;
+  // expected-note@+1 {{in instantiation of function template specialization 'foomain<int *, char>' requested here}}
   foomain<int*,char>(pargc,argv);
   return 0;
 }
diff --git a/test/OpenMP/for_simd_ast_print.cpp b/test/OpenMP/for_simd_ast_print.cpp
index d4b13ba1998c8..54f0d4676134d 100644
--- a/test/OpenMP/for_simd_ast_print.cpp
+++ b/test/OpenMP/for_simd_ast_print.cpp
@@ -6,6 +6,57 @@
 #ifndef HEADER
 #define HEADER
 
+struct S1 {
+  S1(): a(0) {}
+  S1(int v) : a(v) {}
+  int a;
+  typedef int type;
+};
+
+template <typename T>
+class S7 : public T {
+protected:
+  T a;
+  S7() : a(0) {}
+
+public:
+  S7(typename T::type v) : a(v) {
+#pragma omp for simd private(a) private(this->a) private(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S7 &operator=(S7 &s) {
+#pragma omp for simd private(a) private(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+// CHECK: #pragma omp for simd private(this->a) private(this->a) private(this->S1::a)
+// CHECK: #pragma omp for simd private(this->a) private(this->a) private(T::a)
+// CHECK: #pragma omp for simd private(this->a) private(this->a)
+
+class S8 : public S7<S1> {
+  S8() {}
+
+public:
+  S8(int v) : S7<S1>(v){
+#pragma omp for simd private(a) private(this->a) private(S7<S1>::a) 
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S8 &operator=(S8 &s) {
+#pragma omp for simd private(a) private(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+// CHECK: #pragma omp for simd private(this->a) private(this->a) private(this->S7<S1>::a)
+// CHECK: #pragma omp for simd private(this->a) private(this->a)
+
 void foo() {}
 int g_ind = 1;
 template<class T, class N> T reduct(T* arr, N num) {
diff --git a/test/OpenMP/for_simd_codegen.cpp b/test/OpenMP/for_simd_codegen.cpp
index e1aa8926418f8..fb282d0511bb5 100644
--- a/test/OpenMP/for_simd_codegen.cpp
+++ b/test/OpenMP/for_simd_codegen.cpp
@@ -54,13 +54,13 @@ void simple(float *a, float *b, float *c, float *d) {
 
   long long k = get_val();
 
-  #pragma omp for simd linear(k : 3) schedule(dynamic)
+  #pragma omp for simd linear(k : 3) schedule(simd, nonmonotonic: dynamic)
 // CHECK: [[K0:%.+]] = call {{.*}}i64 @{{.*}}get_val
 // CHECK-NEXT: store i64 [[K0]], i64* [[K_VAR:%[^,]+]]
 // CHECK: [[K0LOAD:%.+]] = load i64, i64* [[K_VAR]]
 // CHECK-NEXT: store i64 [[K0LOAD]], i64* [[LIN0:%[^,]+]]
 
-// CHECK: call void @__kmpc_dispatch_init_4(%ident_t* {{.+}}, i32 %{{.+}}, i32 35, i32 0, i32 8, i32 1, i32 1)
+// CHECK: call void @__kmpc_dispatch_init_4(%ident_t* {{.+}}, i32 %{{.+}}, i32 1073741859, i32 0, i32 8, i32 1, i32 1)
 // CHECK: [[NEXT:%.+]] = call i32 @__kmpc_dispatch_next_4(%ident_t* {{.+}}, i32 %{{.+}}, i32* %{{.+}}, i32* [[LB:%.+]], i32* [[UB:%.+]], i32* %{{.+}})
 // CHECK: [[COND:%.+]] = icmp ne i32 [[NEXT]], 0
 // CHECK: br i1 [[COND]], label %[[CONT:.+]], label %[[END:.+]]
@@ -362,7 +362,7 @@ template <class T, unsigned K> T tfoo(T a) { return a + K; }
 
 template <typename T, unsigned N>
 int templ1(T a, T *z) {
-  #pragma omp for simd collapse(N)
+  #pragma omp for simd collapse(N) schedule(simd: static, N)
   for (int i = 0; i < N * 2; i++) {
     for (long long j = 0; j < (N + N + N + N); j += 2) {
       z[i + j] = a + tfoo<T, N>(i + j);
@@ -373,7 +373,7 @@ int templ1(T a, T *z) {
 
 // Instatiation templ1<float,2>
 // CHECK-LABEL: define {{.*i32}} @{{.*}}templ1{{.*}}(float {{.+}}, float* {{.+}})
-// CHECK: call void @__kmpc_for_static_init_8(%ident_t* {{[^,]+}}, i32 %{{[^,]+}}, i32 34, i32* %{{[^,]+}}, i64* [[LB:%[^,]+]], i64* [[UB:%[^,]+]], i64* [[STRIDE:%[^,]+]], i64 1, i64 1)
+// CHECK: call void @__kmpc_for_static_init_8(%ident_t* {{[^,]+}}, i32 %{{[^,]+}}, i32 45, i32* %{{[^,]+}}, i64* [[LB:%[^,]+]], i64* [[UB:%[^,]+]], i64* [[STRIDE:%[^,]+]], i64 1, i64 2)
 // CHECK: [[UB_VAL:%.+]] = load i64, i64* [[UB]],
 // CHECK: [[CMP:%.+]] = icmp sgt i64 [[UB_VAL]], 15
 // CHECK: br i1 [[CMP]], label %[[TRUE:.+]], label %[[FALSE:[^,]+]]
@@ -389,6 +389,7 @@ int templ1(T a, T *z) {
 // CHECK: store i64 [[LB_VAL]], i64* [[T1_OMP_IV:%[^,]+]],
 
 // ...
+// CHECK: icmp sle i64
 // CHECK: [[IV:%.+]] = load i64, i64* [[T1_OMP_IV]]
 // CHECK-NEXT: [[UB_VAL:%.+]] = load i64, i64* [[UB]]
 // CHECK-NEXT: [[CMP1:%.+]] = icmp sle i64 [[IV]], [[UB_VAL]]
@@ -581,9 +582,11 @@ void collapsed(float *a, float *b, float *c, float *d) {
   }
 // i,j,l are updated; k is not updated.
 // CHECK: call void @__kmpc_for_static_fini(%ident_t* {{.+}}, i32 %{{.+}})
-// CHECK-NEXT: store i32 3, i32* [[I:%[^,]+]]
-// CHECK-NEXT: store i32 5, i32* [[I:%[^,]+]]
-// CHECK-NEXT: store i16 9, i16* [[I:%[^,]+]]
+// CHECK: br i1
+// CHECK: store i32 3, i32*
+// CHECK-NEXT: store i32 5,
+// CHECK-NEXT: store i32 7,
+// CHECK-NEXT: store i16 9, i16*
 // CHECK: call void @__kmpc_barrier(%ident_t* {{.+}}, i32 %{{.+}})
 // CHECK: ret void
 }
diff --git a/test/OpenMP/for_simd_collapse_messages.cpp b/test/OpenMP/for_simd_collapse_messages.cpp
index 5c9d058b975c6..2efd49479ea9f 100644
--- a/test/OpenMP/for_simd_collapse_messages.cpp
+++ b/test/OpenMP/for_simd_collapse_messages.cpp
@@ -1,8 +1,13 @@
 // RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
 
 void foo() {
 }
 
+#if __cplusplus >= 201103L
+// expected-note@+2 4 {{declared here}}
+#endif
 bool foobool(int argc) {
   return argc;
 }
@@ -29,14 +34,21 @@ T tmain(T argc, S **argv) { //expected-note 2 {{declared here}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp for simd collapse ((ST > 0) ? 1 + ST : 2) // expected-note 2 {{as specified in 'collapse' clause}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST]; // expected-error 2 {{expected 2 for loops after '#pragma omp for simd', but found only 1}}
-  // expected-error@+3 2 {{directive '#pragma omp for simd' cannot contain more than one 'collapse' clause}}
-  // expected-error@+2 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+  // expected-error@+6 2 {{directive '#pragma omp for simd' cannot contain more than one 'collapse' clause}}
+  // expected-error@+5 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   #pragma omp for simd collapse (foobool(argc)), collapse (true), collapse (-5)
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp for simd collapse (S) // expected-error {{'S' does not refer to a value}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp for simd collapse (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp for simd collapse (1)
@@ -59,16 +71,27 @@ int main(int argc, char **argv) {
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; // expected-error {{expected 4 for loops after '#pragma omp for simd', but found only 1}}
   #pragma omp for simd collapse (2+2)) // expected-warning {{extra tokens at the end of '#pragma omp for simd' are ignored}}  expected-note {{as specified in 'collapse' clause}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; // expected-error {{expected 4 for loops after '#pragma omp for simd', but found only 1}}
-  #pragma omp for simd collapse (foobool(1) > 0 ? 1 : 2) // expected-error {{expression is not an integral constant expression}}
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+  #pragma omp for simd collapse (foobool(1) > 0 ? 1 : 2)
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+3 {{expression is not an integral constant expression}}
+  // expected-error@+6 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   // expected-error@+2 2 {{directive '#pragma omp for simd' cannot contain more than one 'collapse' clause}}
   // expected-error@+1 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
   #pragma omp for simd collapse (foobool(argc)), collapse (true), collapse (-5) 
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   #pragma omp for simd collapse (S1) // expected-error {{'S1' does not refer to a value}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+1 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp for simd collapse (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   // expected-error@+3 {{statement after '#pragma omp for simd' must be a for loop}}
diff --git a/test/OpenMP/for_simd_firstprivate_messages.cpp b/test/OpenMP/for_simd_firstprivate_messages.cpp
index cb74ee081710c..4e96866c1a2c2 100644
--- a/test/OpenMP/for_simd_firstprivate_messages.cpp
+++ b/test/OpenMP/for_simd_firstprivate_messages.cpp
@@ -147,7 +147,7 @@ int foomain(int argc, char **argv) {
     foo();
 #pragma omp parallel reduction(+ : i) // expected-note {{defined as reduction}}
 #pragma omp for simd firstprivate(i)       // expected-error {{firstprivate variable must be shared}}
-  for (i = 0; i < argc; ++i)
+  for (int k = 0; k < argc; ++k)
     foo();
   return 0;
 }
diff --git a/test/OpenMP/for_simd_linear_messages.cpp b/test/OpenMP/for_simd_linear_messages.cpp
index 44370a15664fb..3f93125a38a29 100644
--- a/test/OpenMP/for_simd_linear_messages.cpp
+++ b/test/OpenMP/for_simd_linear_messages.cpp
@@ -208,7 +208,7 @@ int main(int argc, char **argv) {
   #pragma omp for simd linear(i)
   for (int k = 0; k < argc; ++k) ++k;
 
-  foomain<int,char>(argc,argv);
+  foomain<int,char>(argc,argv); // expected-note {{in instantiation of function template specialization 'foomain<int, char>' requested here}}
   return 0;
 }
 
diff --git a/test/OpenMP/for_simd_loop_messages.cpp b/test/OpenMP/for_simd_loop_messages.cpp
index afd7b0bb545e5..e9729a8fc26d8 100644
--- a/test/OpenMP/for_simd_loop_messages.cpp
+++ b/test/OpenMP/for_simd_loop_messages.cpp
@@ -408,12 +408,12 @@ public:
   typedef int difference_type;
   typedef std::random_access_iterator_tag iterator_category;
 };
-// expected-note@+2 {{candidate function not viable: no known conversion from 'Iter0' to 'GoodIter' for 2nd argument}}
+// expected-note@+2 {{candidate function not viable: no known conversion from 'const Iter0' to 'GoodIter' for 2nd argument}}
 // expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'GoodIter' for 1st argument}}
 int operator-(GoodIter a, GoodIter b) { return 0; }
 // expected-note@+1 3 {{candidate function not viable: requires single argument 'a', but 2 arguments were provided}}
 GoodIter operator-(GoodIter a) { return a; }
-// expected-note@+2 {{candidate function not viable: no known conversion from 'Iter0' to 'int' for 2nd argument}}
+// expected-note@+2 {{candidate function not viable: no known conversion from 'const Iter0' to 'int' for 2nd argument}}
 // expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'GoodIter' for 1st argument}}
 GoodIter operator-(GoodIter a, int v) { return GoodIter(); }
 // expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter0' to 'GoodIter' for 1st argument}}
@@ -465,7 +465,7 @@ int test_with_random_access_iterator() {
   for (begin = GoodIter(0); begin < end; ++begin)
     ++begin;
 #pragma omp parallel
-// expected-error@+3 {{invalid operands to binary expression ('GoodIter' and 'Iter0')}}
+// expected-error@+3 {{invalid operands to binary expression ('GoodIter' and 'const Iter0')}}
 // expected-error@+2 {{could not calculate number of iterations calling 'operator-' with upper and lower loop bounds}}
 #pragma omp for simd
   for (begin = begin0; begin < end; ++begin)
diff --git a/test/OpenMP/for_simd_private_messages.cpp b/test/OpenMP/for_simd_private_messages.cpp
index 15e235c2f8f41..ca4c3a33fcaff 100644
--- a/test/OpenMP/for_simd_private_messages.cpp
+++ b/test/OpenMP/for_simd_private_messages.cpp
@@ -29,7 +29,11 @@ class S4 {
   S4(); // expected-note {{implicitly declared private here}}
 
 public:
-  S4(int v) : a(v) {}
+  S4(int v) : a(v) {
+#pragma omp for simd private(a) private(this->a)
+    for (int k = 0; k < v; ++k)
+      ++this->a;
+  }
 };
 class S5 {
   int a;
@@ -37,6 +41,50 @@ class S5 {
 
 public:
   S5(int v) : a(v) {}
+  S5 &operator=(S5 &s) {
+#pragma omp for simd private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T>
+class S6 {
+public:
+  T a;
+
+  S6() : a(0) {}
+  S6(T v) : a(v) {
+#pragma omp for simd private(a) private(this->a)
+    for (int k = 0; k < v; ++k)
+      ++this->a;
+  }
+  S6 &operator=(S6 &s) {
+#pragma omp for simd private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T>
+class S7 : public T {
+  T a;
+  S7() : a(0) {}
+
+public:
+  S7(T v) : a(v) {
+#pragma omp for simd private(a) private(this->a) private(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S7 &operator=(S7 &s) {
+#pragma omp for simd private(a) private(this->a) private(s.a) private(s.T::a) // expected-error 2 {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
 };
 
 S3 h;
@@ -119,6 +167,8 @@ using A::x;
 int main(int argc, char **argv) {
   S4 e(4);
   S5 g(5);
+  S6<float> s6(0.0) , s6_0(1.0);
+  S7<S6<float> > s7(0.0) , s7_0(1.0);
   int i;
   int &j = i;
 #pragma omp for simd private // expected-error {{expected '(' after 'private'}}
@@ -180,6 +230,8 @@ int main(int argc, char **argv) {
   for (int k = 0; k < argc; ++k)
     m = k + 2;
 
-  return 0;
+  s6 = s6_0; // expected-note {{in instantiation of member function 'S6<float>::operator=' requested here}}
+  s7 = s7_0; // expected-note {{in instantiation of member function 'S7<S6<float> >::operator=' requested here}}
+  return foomain(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<int, char>' requested here}}
 }
 
diff --git a/test/OpenMP/for_simd_reduction_messages.cpp b/test/OpenMP/for_simd_reduction_messages.cpp
index 000960fb55b53..2935cec602dd2 100644
--- a/test/OpenMP/for_simd_reduction_messages.cpp
+++ b/test/OpenMP/for_simd_reduction_messages.cpp
@@ -13,7 +13,7 @@ struct S1; // expected-note {{declared here}} expected-note 4 {{forward declarat
 extern S1 a;
 class S2 {
   mutable int a;
-  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 4 {{implicitly declared private here}}
+  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 3 {{implicitly declared private here}}
 
 public:
   S2() : a(0) {}
@@ -22,7 +22,7 @@ public:
   static const float S2sc;
 };
 const float S2::S2sc = 0; // expected-note 2 {{'S2sc' defined here}}
-S2 b;                     // expected-note 2 {{'b' defined here}}
+S2 b;                     // expected-note 3 {{'b' defined here}}
 const S2 ba[5];           // expected-note 2 {{'ba' defined here}}
 class S3 {
   int a;
@@ -34,7 +34,7 @@ public:
   S3 operator+(const S3 &arg1) { return arg1; }
 };
 int operator+(const S3 &arg1, const S3 &arg2) { return 5; }
-S3 c;               // expected-note 2 {{'c' defined here}}
+S3 c;               // expected-note 3 {{'c' defined here}}
 const S3 ca[5];     // expected-note 2 {{'ca' defined here}}
 extern const int f; // expected-note 4 {{'f' declared here}}
 class S4 {
@@ -56,9 +56,9 @@ class S5 {
 public:
   S5(int v) : a(v) {}
 };
-class S6 { // expected-note 2 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
+class S6 { // expected-note 3 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
 #if __cplusplus >= 201103L // C++11 or later
-// expected-note@-2 2 {{candidate function (the implicit move assignment operator) not viable}}
+// expected-note@-2 3 {{candidate function (the implicit move assignment operator) not viable}}
 #endif
   int a;
 
@@ -122,7 +122,7 @@ T tmain(T argc) {
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for simd reduction(foo : argc) //expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max'}}
+#pragma omp for simd reduction(foo : argc) //expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'float'}} expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'int'}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
@@ -134,11 +134,11 @@ T tmain(T argc) {
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for simd reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified list item cannot be reduction}} expected-error 3 {{'operator+' is a private member of 'S2'}}
+#pragma omp for simd reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified list item cannot be reduction}} expected-error 2 {{'operator+' is a private member of 'S2'}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for simd reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 3 {{const-qualified list item cannot be reduction}}
+#pragma omp for simd reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 4 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 3 {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
@@ -146,15 +146,15 @@ T tmain(T argc) {
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for simd reduction(+ : ba) // expected-error {{a reduction list item with array type 'const S2 [5]'}}
+#pragma omp for simd reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for simd reduction(* : ca) // expected-error {{a reduction list item with array type 'const S3 [5]'}}
+#pragma omp for simd reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for simd reduction(- : da) // expected-error {{a reduction list item with array type 'const int [5]'}} expected-error {{a reduction list item with array type 'const float [5]'}}
+#pragma omp for simd reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}} expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
@@ -174,7 +174,7 @@ T tmain(T argc) {
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for simd reduction(+ : o) // expected-error {{no viable overloaded '='}}
+#pragma omp for simd reduction(+ : o) // expected-error 2 {{no viable overloaded '='}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
@@ -186,7 +186,7 @@ T tmain(T argc) {
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for simd reduction(+ : p), reduction(+ : p) // expected-error 3 {{variable can appear only once in OpenMP 'reduction' clause}} expected-note 3 {{previously referenced here}}
+#pragma omp for simd reduction(+ : p), reduction(+ : p) // expected-error 2 {{variable can appear only once in OpenMP 'reduction' clause}} expected-note 2 {{previously referenced here}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
@@ -295,15 +295,15 @@ int main(int argc, char **argv) {
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for simd reduction(+ : ba) // expected-error {{a reduction list item with array type 'const S2 [5]'}}
+#pragma omp for simd reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for simd reduction(* : ca) // expected-error {{a reduction list item with array type 'const S3 [5]'}}
+#pragma omp for simd reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
-#pragma omp for simd reduction(- : da) // expected-error {{a reduction list item with array type 'const int [5]'}}
+#pragma omp for simd reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel
diff --git a/test/OpenMP/for_simd_safelen_messages.cpp b/test/OpenMP/for_simd_safelen_messages.cpp
index d70e90198ad6e..fa1b4442cee49 100644
--- a/test/OpenMP/for_simd_safelen_messages.cpp
+++ b/test/OpenMP/for_simd_safelen_messages.cpp
@@ -1,8 +1,13 @@
 // RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
 
 void foo() {
 }
 
+#if __cplusplus >= 201103L
+// expected-note@+2 4 {{declared here}}
+#endif
 bool foobool(int argc) {
   return argc;
 }
@@ -29,14 +34,21 @@ T tmain(T argc, S **argv) { //expected-note 2 {{declared here}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp for simd safelen ((ST > 0) ? 1 + ST : 2)
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
-  // expected-error@+3 2 {{directive '#pragma omp for simd' cannot contain more than one 'safelen' clause}}
-  // expected-error@+2 2 {{argument to 'safelen' clause must be a strictly positive integer value}}
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+  // expected-error@+6 2 {{directive '#pragma omp for simd' cannot contain more than one 'safelen' clause}}
+  // expected-error@+5 2 {{argument to 'safelen' clause must be a strictly positive integer value}}
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   #pragma omp for simd safelen (foobool(argc)), safelen (true), safelen (-5)
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp for simd safelen (S) // expected-error {{'S' does not refer to a value}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp for simd safelen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp for simd safelen (4)
@@ -57,16 +69,27 @@ int main(int argc, char **argv) {
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   #pragma omp for simd safelen (2+2)) // expected-warning {{extra tokens at the end of '#pragma omp for simd' are ignored}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  #pragma omp for simd safelen (foobool(1) > 0 ? 1 : 2) // expected-error {{expression is not an integral constant expression}}
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+  #pragma omp for simd safelen (foobool(1) > 0 ? 1 : 2)
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+3 {{expression is not an integral constant expression}}
+  // expected-error@+6 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   // expected-error@+2 2 {{directive '#pragma omp for simd' cannot contain more than one 'safelen' clause}}
   // expected-error@+1 2 {{argument to 'safelen' clause must be a strictly positive integer value}}
   #pragma omp for simd safelen (foobool(argc)), safelen (true), safelen (-5) 
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   #pragma omp for simd safelen (S1) // expected-error {{'S1' does not refer to a value}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+1 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp for simd safelen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   // expected-error@+3 {{statement after '#pragma omp for simd' must be a for loop}}
diff --git a/test/OpenMP/for_simd_simdlen_messages.cpp b/test/OpenMP/for_simd_simdlen_messages.cpp
index c72e546811924..8fe197979cc27 100644
--- a/test/OpenMP/for_simd_simdlen_messages.cpp
+++ b/test/OpenMP/for_simd_simdlen_messages.cpp
@@ -1,8 +1,13 @@
 // RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
 
 void foo() {
 }
 
+#if __cplusplus >= 201103L
+// expected-note@+2 4 {{declared here}}
+#endif
 bool foobool(int argc) {
   return argc;
 }
@@ -29,14 +34,21 @@ T tmain(T argc, S **argv) { //expected-note 2 {{declared here}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp for simd simdlen ((ST > 0) ? 1 + ST : 2)
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
-  // expected-error@+3 2 {{directive '#pragma omp for simd' cannot contain more than one 'simdlen' clause}}
-  // expected-error@+2 2 {{argument to 'simdlen' clause must be a strictly positive integer value}}
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+  // expected-error@+6 2 {{directive '#pragma omp for simd' cannot contain more than one 'simdlen' clause}}
+  // expected-error@+5 2 {{argument to 'simdlen' clause must be a strictly positive integer value}}
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   #pragma omp for simd simdlen (foobool(argc)), simdlen (true), simdlen (-5)
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp for simd simdlen (S) // expected-error {{'S' does not refer to a value}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp for simd simdlen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp for simd simdlen (4)
@@ -57,16 +69,27 @@ int main(int argc, char **argv) {
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   #pragma omp for simd simdlen (2+2)) // expected-warning {{extra tokens at the end of '#pragma omp for simd' are ignored}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  #pragma omp for simd simdlen (foobool(1) > 0 ? 1 : 2) // expected-error {{expression is not an integral constant expression}}
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+  #pragma omp for simd simdlen (foobool(1) > 0 ? 1 : 2)
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+3 {{expression is not an integral constant expression}}
+  // expected-error@+6 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   // expected-error@+2 2 {{directive '#pragma omp for simd' cannot contain more than one 'simdlen' clause}}
   // expected-error@+1 2 {{argument to 'simdlen' clause must be a strictly positive integer value}}
   #pragma omp for simd simdlen (foobool(argc)), simdlen (true), simdlen (-5) 
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   #pragma omp for simd simdlen (S1) // expected-error {{'S1' does not refer to a value}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+1 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp for simd simdlen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   // expected-error@+3 {{statement after '#pragma omp for simd' must be a for loop}}
diff --git a/test/OpenMP/loops_explicit_clauses_codegen.cpp b/test/OpenMP/loops_explicit_clauses_codegen.cpp
new file mode 100644
index 0000000000000..dc21fd11af6f9
--- /dev/null
+++ b/test/OpenMP/loops_explicit_clauses_codegen.cpp
@@ -0,0 +1,162 @@
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// expected-no-diagnostics
+
+
+#ifndef HEADER
+#define HEADER
+
+#define N 10
+int foo();
+int bar();
+int k;
+// CHECK-LABEL: @main
+int main(int argc, char **argv) {
+  foo();
+// CHECK: @{{.+}}foo
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK-NOT: @k
+// CHECK: call void @__kmpc_for_static_fini(
+// CHECK-NOT: @k
+#pragma omp for private(k)
+  for (k = 0; k < argc; k++)
+    ;
+  foo();
+// CHECK: @{{.+}}foo
+// CHECK: call void @__kmpc_for_static_init_8(
+// CHECK-NOT: @k
+// CHECK: call void @__kmpc_for_static_fini(
+// CHECK: store i32 %{{.+}}, i32* @k
+#pragma omp for lastprivate(k) collapse(2)
+  for (int i = 0; i < 2; ++i)
+    for (k = 0; k < argc; k++)
+      ;
+  foo();
+// CHECK: @{{.+}}foo
+// CHECK-NOT: @k{{.+}}!llvm.mem.parallel_loop_access
+// CHECK: i32 @{{.+}}bar{{.+}}!llvm.mem.parallel_loop_access
+// CHECK-NOT: @k{{.+}}!llvm.mem.parallel_loop_access
+// CHECK: sdiv i32
+// CHECK: store i32 %{{.+}}, i32* @k,
+#pragma omp simd linear(k : 2)
+  for (k = 0; k < argc; k++)
+    bar();
+// CHECK: @{{.+}}foo
+// CHECK-NOT: @k{{.+}}!llvm.mem.parallel_loop_access
+// CHECK: i32 @{{.+}}bar{{.+}}!llvm.mem.parallel_loop_access
+// CHECK-NOT: @k{{.+}}!llvm.mem.parallel_loop_access
+// CHECK: sdiv i32
+// CHECK: store i32 %{{.+}}, i32* @k,
+  foo();
+#pragma omp simd lastprivate(k) collapse(2)
+  for (int i = 0; i < 2; ++i)
+    for (k = 0; k < argc; k++)
+     bar() ;
+  foo();
+// CHECK: @{{.+}}foo
+// CHECK-NOT: @k{{.+}}!llvm.mem.parallel_loop_access
+// CHECK: i32 @{{.+}}bar{{.+}}!llvm.mem.parallel_loop_access
+// CHECK-NOT: @k{{.+}}!llvm.mem.parallel_loop_access
+// CHECK: sdiv i32
+// CHECK: store i32 %{{.+}}, i32* @k,
+#pragma omp simd
+  for (k = 0; k < argc; k++)
+    bar();
+  foo();
+// CHECK: @{{.+}}foo
+// CHECK-NOT: @k{{.+}}!llvm.mem.parallel_loop_access
+// CHECK: i32 @{{.+}}bar{{.+}}!llvm.mem.parallel_loop_access
+// CHECK-NOT: @k{{.+}}!llvm.mem.parallel_loop_access
+// CHECK: sdiv i32
+// CHECK: store i32 %{{.+}}, i32* @k,
+#pragma omp simd collapse(2)
+  for (int i = 0; i < 2; ++i)
+    for (k = 0; k < argc; k++)
+      bar();
+// CHECK: @{{.+}}foo
+  foo();
+  return 0;
+}
+
+struct S {
+  int k;
+  S(int argc) {
+  foo();
+// CHECK: @{{.+}}foo
+// CHECK: call void @__kmpc_for_static_init_4(
+// CHECK-NOT: getelementptr inbounds %struct.S, %struct.S* %{{.+}}, i32 0, i32 0
+// CHECK: call void @__kmpc_for_static_fini(
+// CHECK-NOT: getelementptr inbounds %struct.S, %struct.S* %{{.+}}, i32 0, i32 0
+#pragma omp for private(k)
+    for (k = 0; k < argc; k++)
+      ;
+  foo();
+// CHECK: @{{.+}}foo
+// CHECK: call void @__kmpc_for_static_init_8(
+// CHECK-NOT: getelementptr inbounds %struct.S, %struct.S* %{{.+}}, i32 0, i32 0
+// CHECK: call void @__kmpc_for_static_fini(
+#pragma omp for lastprivate(k) collapse(2)
+    for (int i = 0; i < 2; ++i)
+      for (k = 0; k < argc; k++)
+        ;
+  foo();
+// CHECK: @{{.+}}foo
+// CHECK: getelementptr inbounds %struct.S, %struct.S* %{{.+}}, i32 0, i32 0
+// CHECK: br i1
+// CHECK-NOT: getelementptr inbounds %struct.S, %struct.S* %{{.+}}, i32 0, i32 0
+// CHECK: i32 @{{.+}}bar{{.+}}!llvm.mem.parallel_loop_access
+// CHECK-NOT: getelementptr inbounds %struct.S, %struct.S* %{{.+}}, i32 0, i32 0
+// CHECK: add nsw i32 %{{.+}}, 1
+// CHECK: br label {{.+}}, !llvm.loop
+// CHECK: getelementptr inbounds %struct.S, %struct.S* %{{.+}}, i32 0, i32 0
+#pragma omp simd linear(k : 2)
+    for (k = 0; k < argc; k++)
+      bar();
+  foo();
+// CHECK: @{{.+}}foo
+// CHECK: getelementptr inbounds %struct.S, %struct.S* %{{.+}}, i32 0, i32 0
+// CHECK: br i1
+// CHECK-NOT: getelementptr inbounds %struct.S, %struct.S* %{{.+}}, i32 0, i32 0
+// CHECK: i32 @{{.+}}bar{{.+}}!llvm.mem.parallel_loop_access
+// CHECK-NOT: getelementptr inbounds %struct.S, %struct.S* %{{.+}}, i32 0, i32 0
+// CHECK: add nsw i64 %{{.+}}, 1
+// CHECK: br label {{.+}}, !llvm.loop
+// CHECK: getelementptr inbounds %struct.S, %struct.S* %{{.+}}, i32 0, i32 0
+#pragma omp simd lastprivate(k) collapse(2)
+    for (int i = 0; i < 2; ++i)
+      for (k = 0; k < argc; k++)
+        bar();
+  foo();
+// CHECK: @{{.+}}foo
+// CHECK-NOT: getelementptr inbounds %struct.S, %struct.S* %{{.+}}, i32 0, i32 0
+// CHECK: br i1
+// CHECK-NOT: getelementptr inbounds %struct.S, %struct.S* %{{.+}}, i32 0, i32 0
+// CHECK: i32 @{{.+}}bar{{.+}}!llvm.mem.parallel_loop_access
+// CHECK-NOT: getelementptr inbounds %struct.S, %struct.S* %{{.+}}, i32 0, i32 0
+// CHECK: add nsw i32 %{{.+}}, 1
+// CHECK: br label {{.+}}, !llvm.loop
+// CHECK: getelementptr inbounds %struct.S, %struct.S* %{{.+}}, i32 0, i32 0
+#pragma omp simd
+    for (k = 0; k < argc; k++)
+      bar();
+  foo();
+// CHECK: @{{.+}}foo
+// CHECK-NOT: getelementptr inbounds %struct.S, %struct.S* %{{.+}}, i32 0, i32 0
+// CHECK: br i1
+// CHECK-NOT: getelementptr inbounds %struct.S, %struct.S* %{{.+}}, i32 0, i32 0
+// CHECK: i32 @{{.+}}bar{{.+}}!llvm.mem.parallel_loop_access
+// CHECK-NOT: getelementptr inbounds %struct.S, %struct.S* %{{.+}}, i32 0, i32 0
+// CHECK: add nsw i64 %{{.+}}, 1
+// CHECK: br label {{.+}}, !llvm.loop
+// CHECK: getelementptr inbounds %struct.S, %struct.S* %{{.+}}, i32 0, i32 0
+#pragma omp simd collapse(2)
+    for (int i = 0; i < 2; ++i)
+      for (k = 0; k < argc; k++)
+        bar();
+// CHECK: @{{.+}}foo
+  foo();
+  }
+} s(N);
+
+#endif // HEADER
diff --git a/test/OpenMP/nesting_of_regions.cpp b/test/OpenMP/nesting_of_regions.cpp
index b2b87db6a158a..b6082e7447ddc 100644
--- a/test/OpenMP/nesting_of_regions.cpp
+++ b/test/OpenMP/nesting_of_regions.cpp
@@ -97,6 +97,27 @@ void foo() {
   }
 #pragma omp parallel
   {
+#pragma omp target parallel
+    ++a;
+  }
+#pragma omp parallel
+  {
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i)
+    ;
+  }
+#pragma omp parallel
+  {
+#pragma omp target enter data map(to: a)
+    ++a;
+  }
+#pragma omp parallel
+  {
+#pragma omp target exit data map(from: a)
+    ++a;
+  }
+#pragma omp parallel
+  {
 #pragma omp teams // expected-error {{region cannot be closely nested inside 'parallel' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
     ++a;
   }
@@ -112,7 +133,29 @@ void foo() {
     for (int i = 0; i < 10; ++i)
       ;
   }
-
+#pragma omp parallel
+  {
+#pragma omp target update to(a)
+  }
+#pragma omp parallel
+  {
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'parallel' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp parallel
+  {
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'parallel' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp parallel
+  {
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'parallel' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+  
 // SIMD DIRECTIVE
 #pragma omp simd
   for (int i = 0; i < 10; ++i) {
@@ -122,7 +165,7 @@ void foo() {
   }
 #pragma omp simd
   for (int i = 0; i < 10; ++i) {
-#pragma omp simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+#pragma omp simd // expected-warning {{OpenMP only allows an ordered construct with the simd clause nested in a simd construct}}
     for (int i = 0; i < 10; ++i)
       ;
   }
@@ -226,6 +269,16 @@ void foo() {
   }
 #pragma omp simd
   for (int i = 0; i < 10; ++i) {
+#pragma omp ordered simd // OK
+    bar();
+  }
+#pragma omp simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp ordered threads // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    bar();
+  }
+#pragma omp simd
+  for (int i = 0; i < 10; ++i) {
 #pragma omp atomic // expected-error {{OpenMP constructs may not be nested inside a simd region}}
     ++a;
   }
@@ -236,6 +289,27 @@ void foo() {
   }
 #pragma omp simd
   for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel for // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target enter data map(to: a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target exit data map(from: a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp simd
+  for (int i = 0; i < 10; ++i) {
 #pragma omp teams // expected-error {{OpenMP constructs may not be nested inside a simd region}}
     ++a;
   }
@@ -251,6 +325,28 @@ void foo() {
     for (int j = 0; j < 10; ++j)
       ;
   }
+#pragma omp simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target update to(a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+  }
+#pragma omp simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for// expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
 
 // FOR DIRECTIVE
 #pragma omp for
@@ -398,6 +494,27 @@ void foo() {
   }
 #pragma omp for
   for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel
+    ++a;
+  }
+#pragma omp for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel for
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target enter data map(to: a)
+    ++a;
+  }
+#pragma omp for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target exit data map(from: a)
+    ++a;
+  }
+#pragma omp for
+  for (int i = 0; i < 10; ++i) {
 #pragma omp teams // expected-error {{region cannot be closely nested inside 'for' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
     ++a;
   }
@@ -413,6 +530,28 @@ void foo() {
     for (int j = 0; j < 10; ++j)
       ;
   }
+#pragma omp for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target update to(a)
+  }
+#pragma omp for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'for' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'for' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'for' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
 
 // FOR SIMD DIRECTIVE
 #pragma omp for simd
@@ -423,7 +562,7 @@ void foo() {
   }
 #pragma omp for simd
   for (int i = 0; i < 10; ++i) {
-#pragma omp simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+#pragma omp simd // expected-warning {{OpenMP only allows an ordered construct with the simd clause nested in a simd construct}}
     for (int i = 0; i < 10; ++i)
       ;
   }
@@ -527,6 +666,16 @@ void foo() {
   }
 #pragma omp for simd
   for (int i = 0; i < 10; ++i) {
+#pragma omp ordered simd // OK
+    bar();
+  }
+#pragma omp for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp ordered threads // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    bar();
+  }
+#pragma omp for simd
+  for (int i = 0; i < 10; ++i) {
 #pragma omp atomic // expected-error {{OpenMP constructs may not be nested inside a simd region}}
     ++a;
   }
@@ -537,6 +686,27 @@ void foo() {
   }
 #pragma omp for simd
   for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel for // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target enter data map(to: a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target exit data map(from: a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp for simd
+  for (int i = 0; i < 10; ++i) {
 #pragma omp teams // expected-error {{OpenMP constructs may not be nested inside a simd region}}
     ++a;
   }
@@ -552,6 +722,29 @@ void foo() {
     for (int j = 0; j < 10; ++j)
       ;
   }
+#pragma omp for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target update to(a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    bar();
+  }
+#pragma omp for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
 
 // SECTIONS DIRECTIVE
 #pragma omp sections
@@ -706,6 +899,25 @@ void foo() {
   }
 #pragma omp sections
   {
+#pragma omp target parallel
+    ++a;
+  }
+#pragma omp sections
+  {
+#pragma omp target parallel for
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp sections
+  {
+#pragma omp target enter data map(to: a)
+  }
+#pragma omp sections
+  {
+#pragma omp target exit data map(from: a)
+  }
+#pragma omp sections
+  {
 #pragma omp teams // expected-error {{region cannot be closely nested inside 'sections' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
     ++a;
   }
@@ -721,6 +933,28 @@ void foo() {
     for (int i = 0; i < 10; ++i)
       ;
   }
+#pragma omp sections
+  {
+#pragma omp target update to(a)
+  }
+#pragma omp sections
+  {
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'sections' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp sections
+  {
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'sections' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp sections
+  {
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'sections' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
 
 // SECTION DIRECTIVE
 #pragma omp section // expected-error {{orphaned 'omp section' directives are prohibited, it must be closely nested to a sections region}}
@@ -911,6 +1145,37 @@ void foo() {
 #pragma omp sections
   {
 #pragma omp section
+#pragma omp target parallel
+    ++a;
+  }
+#pragma omp sections
+  {
+#pragma omp section
+    {
+#pragma omp target parallel for
+      for (int i = 0; i < 10; ++i)
+        ;
+    }
+  }
+#pragma omp sections
+  {
+#pragma omp section
+    {
+#pragma omp target enter data map(to: a)
+      ++a;
+    }
+  }
+#pragma omp sections
+  {
+#pragma omp section
+    {
+#pragma omp target exit data map(from: a)
+      ++a;
+    }
+  }
+#pragma omp sections
+  {
+#pragma omp section
 #pragma omp teams // expected-error {{region cannot be closely nested inside 'section' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
     ++a;
   }
@@ -928,6 +1193,35 @@ void foo() {
     for (int i = 0; i < 10; ++i)
       ;
   }
+#pragma omp sections
+  {
+#pragma omp section
+    {
+      bar();
+#pragma omp target update to(a)
+    }
+  }
+#pragma omp sections
+  {
+#pragma omp section
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'section' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp sections
+  {
+#pragma omp section
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'section' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp sections
+  {
+#pragma omp section
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'section' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
 
 // SINGLE DIRECTIVE
 #pragma omp single
@@ -1065,6 +1359,27 @@ void foo() {
   }
 #pragma omp single
   {
+#pragma omp target parallel
+    ++a;
+  }
+#pragma omp single
+  {
+#pragma omp target parallel for
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp single
+  {
+#pragma omp target enter data map(to: a)
+    ++a;
+  }
+#pragma omp single
+  {
+#pragma omp target exit data map(from: a)
+    ++a;
+  }
+#pragma omp single
+  {
 #pragma omp teams // expected-error {{region cannot be closely nested inside 'single' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
     ++a;
   }
@@ -1080,6 +1395,29 @@ void foo() {
     for (int i = 0; i < 10; ++i)
       ;
   }
+#pragma omp single
+  {
+#pragma omp target update to(a)
+    bar();
+  }
+#pragma omp single
+  {
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'single' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp single
+  {
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'single' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp single
+  {
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'single' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
 
 // MASTER DIRECTIVE
 #pragma omp master
@@ -1217,6 +1555,27 @@ void foo() {
   }
 #pragma omp master
   {
+#pragma omp target parallel
+    ++a;
+  }
+#pragma omp master
+  {
+#pragma omp target parallel for
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp master
+  {
+#pragma omp target enter data map(to: a)
+    ++a;
+  }
+#pragma omp master
+  {
+#pragma omp target exit data map(from: a)
+    ++a;
+  }
+#pragma omp master
+  {
 #pragma omp teams // expected-error {{region cannot be closely nested inside 'master' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
     ++a;
   }
@@ -1232,6 +1591,29 @@ void foo() {
     for (int i = 0; i < 10; ++i)
       ;
   }
+#pragma omp master
+  {
+#pragma omp target update to(a)
+    bar();
+  }
+#pragma omp master
+  {
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'master' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp master
+  {
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'master' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp master
+  {
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'master' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
 
 // CRITICAL DIRECTIVE
 #pragma omp critical
@@ -1383,6 +1765,27 @@ void foo() {
   }
 #pragma omp critical
   {
+#pragma omp target parallel
+    ++a;
+  }
+#pragma omp critical
+  {
+#pragma omp target parallel for
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp critical
+  {
+#pragma omp target enter data map(to: a)
+    ++a;
+  }
+#pragma omp critical
+  {
+#pragma omp target exit data map(from: a)
+    ++a;
+  }
+#pragma omp critical
+  {
 #pragma omp teams // expected-error {{region cannot be closely nested inside 'critical' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
     ++a;
   }
@@ -1398,6 +1801,29 @@ void foo() {
     for (int i = 0; i < 10; ++i)
       ;
   }
+#pragma omp critical
+  {
+#pragma omp target update to(a)
+    bar();
+  }
+#pragma omp critical
+  {
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'critical' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp critical
+  {
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'critical' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp critical
+  {
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'critical' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
 
 // PARALLEL FOR DIRECTIVE
 #pragma omp parallel for
@@ -1550,6 +1976,27 @@ void foo() {
   }
 #pragma omp parallel for
   for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel
+    ++a;
+  }
+#pragma omp parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel for
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target enter data map(to: a)
+    ++a;
+  }
+#pragma omp parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target exit data map(from: a)
+    ++a;
+  }
+#pragma omp parallel for
+  for (int i = 0; i < 10; ++i) {
 #pragma omp teams // expected-error {{region cannot be closely nested inside 'parallel for' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
     ++a;
   }
@@ -1565,6 +2012,28 @@ void foo() {
     for (int j = 0; j < 10; ++j)
       ;
   }
+#pragma omp parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target update to(a)
+  }
+#pragma omp parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'parallel for' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'parallel for' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'parallel for' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
 
 // PARALLEL FOR SIMD DIRECTIVE
 #pragma omp parallel for simd
@@ -1575,7 +2044,7 @@ void foo() {
   }
 #pragma omp parallel for simd
   for (int i = 0; i < 10; ++i) {
-#pragma omp simd// expected-error {{OpenMP constructs may not be nested inside a simd region}}
+#pragma omp simd // expected-warning {{OpenMP only allows an ordered construct with the simd clause nested in a simd construct}}
     for (int i = 0; i < 10; ++i)
       ;
   }
@@ -1717,6 +2186,27 @@ void foo() {
   }
 #pragma omp parallel for simd
   for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel for // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target enter data map(to: a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target exit data map(from: a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp parallel for simd
+  for (int i = 0; i < 10; ++i) {
 #pragma omp teams // expected-error {{OpenMP constructs may not be nested inside a simd region}}
     ++a;
   }
@@ -1732,6 +2222,29 @@ void foo() {
     for (int j = 0; j < 10; ++j)
       ;
   }
+#pragma omp parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target update to(a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    bar();
+  }
+#pragma omp parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
 
 // PARALLEL SECTIONS DIRECTIVE
 #pragma omp parallel sections
@@ -1875,6 +2388,25 @@ void foo() {
   }
 #pragma omp parallel sections
   {
+#pragma omp target parallel
+    ++a;
+  }
+#pragma omp parallel sections
+  {
+#pragma omp target parallel for
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp parallel sections
+  {
+#pragma omp target enter data map(to: a)
+  }
+#pragma omp parallel sections
+  {
+#pragma omp target exit data map(from: a)
+  }
+#pragma omp parallel sections
+  {
 #pragma omp teams // expected-error {{region cannot be closely nested inside 'parallel sections' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
     ++a;
   }
@@ -1890,6 +2422,28 @@ void foo() {
     for (int i = 0; i < 10; ++i)
       ;
   }
+#pragma omp parallel sections
+  {
+#pragma omp target update to(a)
+  }
+#pragma omp parallel sections
+  {
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'parallel sections' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp parallel sections
+  {
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'parallel sections' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp parallel sections
+  {
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'parallel sections' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
 
 // TASK DIRECTIVE
 #pragma omp task
@@ -1979,6 +2533,25 @@ void foo() {
   }
 #pragma omp task
   {
+#pragma omp target parallel
+    ++a;
+  }
+#pragma omp task
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp task
+  {
+#pragma omp target enter data map(to: a)
+    ++a;
+  }
+#pragma omp task
+  {
+#pragma omp target exit data map(from: a)
+    ++a;
+  }
+#pragma omp task
+  {
 #pragma omp teams // expected-error {{region cannot be closely nested inside 'task' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
     ++a;
   }
@@ -1994,6 +2567,29 @@ void foo() {
     for (int i = 0; i < 10; ++i)
       ;
   }
+#pragma omp task
+  {
+#pragma omp target update to(a)
+    bar();
+  }
+#pragma omp task
+  {
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'task' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp task
+  {
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'task' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp task
+  {
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'task' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
 
 // ORDERED DIRECTIVE
 #pragma omp ordered
@@ -2141,6 +2737,37 @@ void foo() {
   }
 #pragma omp ordered
   {
+#pragma omp target parallel
+    ++a;
+  }
+#pragma omp ordered
+  {
+#pragma omp target parallel for ordered
+    for (int j = 0; j < 10; ++j) {
+#pragma omp ordered // OK
+      {
+        bar();
+      }
+    }
+  }
+#pragma omp ordered
+  {
+#pragma omp target parallel for
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp ordered
+  {
+#pragma omp target enter data map(to: a)
+    ++a;
+  }
+#pragma omp ordered
+  {
+#pragma omp target exit data map(from: a)
+    ++a;
+  }
+#pragma omp ordered
+  {
 #pragma omp teams // expected-error {{region cannot be closely nested inside 'ordered' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
     ++a;
   }
@@ -2156,6 +2783,30 @@ void foo() {
     for (int i = 0; i < 10; ++i)
       ;
   }
+#pragma omp ordered
+  {
+    bar();
+#pragma omp target update to(a)
+    bar();
+  }
+#pragma omp ordered
+  {
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'ordered' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp ordered
+  {
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'ordered' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp ordered
+  {
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'ordered' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
 
 // ATOMIC DIRECTIVE
 #pragma omp atomic
@@ -2322,6 +2973,35 @@ void foo() {
   // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
   // expected-note@+1 {{expected an expression statement}}
   {
+#pragma omp target parallel // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
+    ++a;
+  }
+#pragma omp atomic
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
+  {
+#pragma omp target parallel for // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp atomic
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
+  {
+#pragma omp target enter data map(to: a) // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
+    ++a;
+  }
+#pragma omp atomic
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
+  {
+#pragma omp target exit data map(from: a) // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
+    ++a;
+  }
+#pragma omp atomic
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
+  {
 #pragma omp teams // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
     ++a;
   }
@@ -2341,6 +3021,37 @@ void foo() {
     for (int i = 0; i < 10; ++i)
       ;
   }
+#pragma omp atomic
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
+  {
+#pragma omp target update to(a) // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
+    bar();
+  }
+#pragma omp atomic
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
+  {
+#pragma omp distribute parallel for // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp atomic
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
+  {
+#pragma omp distribute parallel for simd // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp atomic
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
+  {
+#pragma omp distribute simd // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
 
 // TARGET DIRECTIVE
 #pragma omp target
@@ -2432,10 +3143,19 @@ void foo() {
   }
 #pragma omp target
   {
+#pragma omp target // expected-error {{region cannot be nested inside 'target' region}}
+    ++a;
+  }
 #pragma omp target
+  {
+#pragma omp target parallel // expected-error {{region cannot be nested inside 'target' region}}
     ++a;
   }
 #pragma omp target
+#pragma omp target parallel for // expected-error {{region cannot be nested inside 'target' region}}
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target
   {
 #pragma omp teams
     ++a;
@@ -2446,6 +3166,12 @@ void foo() {
 #pragma omp teams  // expected-note {{nested teams construct here}}
     ++a;
   }
+#pragma omp target // expected-error {{target construct with nested teams region contains statements outside of the teams construct}}
+  {
+    while (0)      // expected-note {{statement outside teams construct here}}
+#pragma omp teams  // expected-note {{nested teams construct here}}
+    ++a;
+  }
 #pragma omp target
   {
 #pragma omp taskloop
@@ -2458,6 +3184,402 @@ void foo() {
     for (int i = 0; i < 10; ++i)
       ;
   }
+#pragma omp target
+  {
+#pragma omp target enter data map(to: a) // expected-error {{region cannot be nested inside 'target' region}}
+  }
+#pragma omp target
+  {
+#pragma omp target exit data map(from: a) // expected-error {{region cannot be nested inside 'target' region}}
+  }
+#pragma omp target
+  {
+#pragma omp target update to(a) // expected-error {{region cannot be nested inside 'target' region}}
+  }
+#pragma omp target
+  { 
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'target' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+  { 
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'target' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+  { 
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'target' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+
+// TARGET PARALLEL DIRECTIVE
+#pragma omp target parallel
+#pragma omp parallel
+  bar();
+#pragma omp target parallel
+#pragma omp for
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target parallel
+#pragma omp simd
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target parallel
+#pragma omp for simd
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target parallel
+#pragma omp sections
+  {
+    bar();
+  }
+#pragma omp target parallel
+#pragma omp section // expected-error {{'omp section' directive must be closely nested to a sections region, not a target parallel region}}
+  {
+    bar();
+  }
+#pragma omp target parallel
+#pragma omp single
+  bar();
+
+#pragma omp target parallel
+#pragma omp master
+  {
+    bar();
+  }
+#pragma omp target parallel
+#pragma omp critical
+  {
+    bar();
+  }
+#pragma omp target parallel
+#pragma omp parallel for
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target parallel
+#pragma omp parallel for simd
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target parallel
+#pragma omp parallel sections
+  {
+    bar();
+  }
+#pragma omp target parallel
+#pragma omp task
+  {
+    bar();
+  }
+#pragma omp target parallel
+  {
+#pragma omp taskyield
+    bar();
+  }
+#pragma omp target parallel
+  {
+#pragma omp barrier
+    bar();
+  }
+#pragma omp target parallel
+  {
+#pragma omp taskwait
+    bar();
+  }
+#pragma omp target parallel
+  {
+#pragma omp flush
+    bar();
+  }
+#pragma omp target parallel
+  {
+#pragma omp ordered // expected-error {{region cannot be closely nested inside 'target parallel' region; perhaps you forget to enclose 'omp ordered' directive into a for or a parallel for region with 'ordered' clause?}}
+    bar();
+  }
+#pragma omp target parallel
+  {
+#pragma omp atomic
+    ++a;
+  }
+#pragma omp target parallel
+  {
+#pragma omp target // expected-error {{region cannot be nested inside 'target parallel' region}}
+    ++a;
+  }
+#pragma omp target parallel
+  {
+#pragma omp target parallel // expected-error {{region cannot be nested inside 'target parallel' region}}
+    ++a;
+  }
+#pragma omp target parallel
+#pragma omp target parallel for // expected-error {{region cannot be nested inside 'target parallel' region}}
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target parallel
+  {
+#pragma omp teams // expected-error {{region cannot be closely nested inside 'target parallel' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
+    ++a;
+  }
+#pragma omp target parallel
+  {
+    ++a;
+#pragma omp teams  // expected-error {{region cannot be closely nested inside 'target parallel' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
+    ++a;
+  }
+#pragma omp target parallel
+  {
+#pragma omp taskloop
+  for (int i = 0; i < 10; ++i)
+    ++a;
+  }
+#pragma omp target parallel
+  { 
+#pragma omp distribute // expected-error {{region cannot be closely nested inside 'target parallel' region; perhaps you forget to enclose 'omp distribute' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target parallel
+  {
+#pragma omp target enter data map(to: a) // expected-error {{region cannot be nested inside 'target parallel' region}}
+  }
+#pragma omp target parallel
+  {
+#pragma omp target exit data map(from: a) // expected-error {{region cannot be nested inside 'target parallel' region}}
+  }
+#pragma omp target parallel
+  {
+#pragma omp target update to(a) // expected-error {{region cannot be nested inside 'target parallel' region}}
+  }
+#pragma omp target parallel
+  { 
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'target parallel' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target parallel
+  { 
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'target parallel' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target parallel
+  { 
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'target parallel' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+
+// TARGET PARALLEL FOR DIRECTIVE
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp for // expected-error {{region cannot be closely nested inside 'target parallel for' region; perhaps you forget to enclose 'omp for' directive into a parallel region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp simd
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp for simd // expected-error {{region cannot be closely nested inside 'target parallel for' region; perhaps you forget to enclose 'omp for simd' directive into a parallel region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp sections // expected-error {{region cannot be closely nested inside 'target parallel for' region; perhaps you forget to enclose 'omp sections' directive into a parallel region?}}
+    {
+      bar();
+    }
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp section // expected-error {{'omp section' directive must be closely nested to a sections region, not a target parallel for region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp single // expected-error {{region cannot be closely nested inside 'target parallel for' region; perhaps you forget to enclose 'omp single' directive into a parallel region?}}
+    {
+      bar();
+    }
+  }
+
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp master // expected-error {{region cannot be closely nested inside 'target parallel for' region}}
+    {
+      bar();
+    }
+  }
+
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp critical
+    {
+      bar();
+    }
+  }
+
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel
+    {
+#pragma omp single // OK
+      {
+        bar();
+      }
+#pragma omp for // OK
+      for (int i = 0; i < 10; ++i)
+        ;
+#pragma omp for simd // OK
+      for (int i = 0; i < 10; ++i)
+        ;
+#pragma omp sections // OK
+      {
+        bar();
+      }
+    }
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel for
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel for simd
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel sections
+    {
+      bar();
+    }
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp task
+    {
+      bar();
+    }
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp taskyield
+    bar();
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp barrier // expected-error {{region cannot be closely nested inside 'target parallel for' region}}
+    bar();
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp taskwait
+    bar();
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp flush
+    bar();
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp ordered // expected-error {{region cannot be closely nested inside 'target parallel for' region; perhaps you forget to enclose 'omp ordered' directive into a for or a parallel for region with 'ordered' clause?}}
+    bar();
+  }
+#pragma omp target parallel for ordered
+  for (int i = 0; i < 10; ++i) {
+#pragma omp ordered // OK
+    bar();
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp atomic
+    ++a;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target // expected-error {{region cannot be nested inside 'target parallel for' region}}
+    ++a;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel // expected-error {{region cannot be nested inside 'target parallel for' region}}
+    ++a;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel for // expected-error {{region cannot be nested inside 'target parallel for' region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target enter data map(to: a) // expected-error {{region cannot be nested inside 'target parallel for' region}}
+    ++a;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target exit data map(from: a) // expected-error {{region cannot be nested inside 'target parallel for' region}}
+    ++a;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp teams // expected-error {{region cannot be closely nested inside 'target parallel for' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
+    ++a;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp taskloop
+  for (int i = 0; i < 10; ++i)
+    ++a;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute // expected-error {{region cannot be closely nested inside 'target parallel for' region; perhaps you forget to enclose 'omp distribute' directive into a teams region?}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target update to(a) // expected-error {{region cannot be nested inside 'target parallel for' region}}
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'target parallel for' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }    
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'target parallel for' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }    
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'target parallel for' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }    
 
 // TEAMS DIRECTIVE
 #pragma omp target
@@ -2575,6 +3697,29 @@ void foo() {
 #pragma omp target
 #pragma omp teams
   {
+#pragma omp target parallel // expected-error {{region cannot be nested inside 'target' region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp target parallel for // expected-error {{region cannot be nested inside 'target' region}}
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+  {
+#pragma omp target enter data map(to: a) // expected-error {{region cannot be closely nested inside 'teams' region; perhaps you forget to enclose 'omp target enter data' directive into a parallel region?}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+  {
+#pragma omp target exit data map(from: a) // expected-error {{region cannot be closely nested inside 'teams' region; perhaps you forget to enclose 'omp target exit data' directive into a parallel region?}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+  {
 #pragma omp teams // expected-error {{region cannot be closely nested inside 'teams' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
     ++a;
   }
@@ -2598,6 +3743,41 @@ void foo() {
 #pragma omp distribute
   for (int j = 0; j < 10; ++j)
     ;        
+#pragma omp target
+#pragma omp teams
+  {
+#pragma omp target update to(a) // expected-error {{region cannot be closely nested inside 'teams' region; perhaps you forget to enclose 'omp target update' directive into a parallel region?}}
+  }
+#pragma omp target
+#pragma omp teams
+  {
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp distribute parallel for
+  for (int j = 0; j < 10; ++j)
+    ;  
+  }
+#pragma omp target
+#pragma omp teams
+  {
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp distribute parallel for simd
+  for (int j = 0; j < 10; ++j)
+    ;  
+  }
+#pragma omp target
+#pragma omp teams
+  {
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp distribute simd
+  for (int j = 0; j < 10; ++j)
+    ;  
+  }
 
 // TASKLOOP DIRECTIVE
 #pragma omp taskloop
@@ -2740,6 +3920,27 @@ void foo() {
   }
 #pragma omp taskloop
   for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel
+    ++a;
+  }
+#pragma omp taskloop
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel for
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp taskloop
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target enter data map(to: a)
+    ++a;
+  }
+#pragma omp taskloop
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target exit data map(from: a)
+    ++a;
+  }
+#pragma omp taskloop
+  for (int i = 0; i < 10; ++i) {
 #pragma omp teams // expected-error {{region cannot be closely nested inside 'taskloop' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
     ++a;
   }
@@ -2749,6 +3950,36 @@ void foo() {
   for (int i = 0; i < 10; ++i)
     ++a;
   }
+#pragma omp taskloop
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target update to(a)
+    bar();
+  }
+#pragma omp taskloop
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute // expected-error {{region cannot be closely nested inside 'taskloop' region; perhaps you forget to enclose 'omp distribute' directive into a teams region?}}
+  for (int j = 0; j < 10; ++j)
+    ++a;
+  }
+#pragma omp taskloop
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'taskloop' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+  for (int j = 0; j < 10; ++j)
+    ++a;
+  }
+#pragma omp taskloop
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'taskloop' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+  for (int j = 0; j < 10; ++j)
+    ++a;
+  }
+#pragma omp taskloop
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'taskloop' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+  for (int j = 0; j < 10; ++j)
+    ++a;
+  }
+
 // DISTRIBUTE DIRECTIVE
 #pragma omp target
 #pragma omp teams
@@ -2927,7 +4158,36 @@ void foo() {
 #pragma omp teams
 #pragma omp distribute
   for (int i = 0; i < 10; ++i) {
+#pragma omp target // expected-error {{region cannot be nested inside 'target' region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel // expected-error {{region cannot be nested inside 'target' region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel for // expected-error {{region cannot be nested inside 'target' region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target enter data map(to: a) // expected-error {{region cannot be nested inside 'target' region}}
+    ++a;
+  }
 #pragma omp target
+#pragma omp teams
+#pragma omp distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target exit data map(from: a) // expected-error {{region cannot be nested inside 'target' region}}
     ++a;
   }
 #pragma omp target
@@ -2937,6 +4197,495 @@ void foo() {
 #pragma omp teams // expected-error {{region cannot be closely nested inside 'distribute' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
     ++a;
   }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target update to(a) // expected-error {{region cannot be nested inside 'target' region}}
+    ++a;
+  }
+
+// DISTRIBUTE PARALLEL FOR DIRECTIVE
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'distribute parallel for' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute // expected-error {{region cannot be closely nested inside 'distribute parallel for' region; perhaps you forget to enclose 'omp distribute' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp for // expected-error {{region cannot be closely nested inside 'distribute parallel for' region; perhaps you forget to enclose 'omp for' directive into a parallel region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp simd
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp for simd // expected-error {{region cannot be closely nested inside 'distribute parallel for' region; perhaps you forget to enclose 'omp for simd' directive into a parallel region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams  
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp sections // expected-error {{region cannot be closely nested inside 'distribute parallel for' region; perhaps you forget to enclose 'omp sections' directive into a parallel region?}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp section // expected-error {{'omp section' directive must be closely nested to a sections region, not a distribute parallel for region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp single // expected-error {{region cannot be closely nested inside 'distribute parallel for' region; perhaps you forget to enclose 'omp single' directive into a parallel region?}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp master // expected-error {{region cannot be closely nested inside 'distribute parallel for' region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp critical
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel
+    {
+#pragma omp single
+      {
+	bar();
+      }
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel for
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel for simd
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel sections
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp task
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp taskyield
+    bar();
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp barrier // expected-error {{region cannot be closely nested inside 'distribute parallel for' region}}
+    bar();
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp taskwait
+    bar();
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp flush
+    bar();
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp ordered // expected-error {{region cannot be closely nested inside 'distribute parallel for' region; perhaps you forget to enclose 'omp ordered' directive into a for or a parallel for region with 'ordered' clause?}}
+    bar();
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp atomic
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target // expected-error {{region cannot be nested inside 'target' region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel // expected-error {{region cannot be nested inside 'target' region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel for // expected-error {{region cannot be nested inside 'target' region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target enter data map(to: a) // expected-error {{region cannot be nested inside 'target' region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target exit data map(from: a) // expected-error {{region cannot be nested inside 'target' region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp teams // expected-error {{region cannot be closely nested inside 'distribute parallel for' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target update to(a) // expected-error {{region cannot be nested inside 'target' region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'distribute parallel for' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+
+// DISTRIBUTE PARALLEL FOR SIMD DIRECTIVE
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp for // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd 
+  for (int i = 0; i < 10; ++i) {
+#pragma omp simd // expected-warning {{OpenMP only allows an ordered construct with the simd clause nested in a simd construct}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp for simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams  
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp sections // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp section // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp single // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp master // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp critical // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+#pragma omp single
+      {
+	bar();
+      }
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel for // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel for simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel sections // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp task // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp taskyield // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    bar();
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp barrier // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    bar();
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp taskwait // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    bar();
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp flush // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    bar();
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp ordered // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    bar();
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp atomic // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel  // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel for // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target enter data map(to: a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target exit data map(from: a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp teams // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target update to(a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ++a;
+  }
 }
 
 void foo() {
@@ -3033,6 +4782,25 @@ void foo() {
   }
 #pragma omp parallel
   {
+#pragma omp target parallel
+    ++a;
+  }
+#pragma omp parallel
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp parallel
+  {
+#pragma omp target enter data map(to: a)
+    ++a;
+  }
+#pragma omp parallel
+  {
+#pragma omp target exit data map(from: a)
+    ++a;
+  }
+#pragma omp parallel
+  {
 #pragma omp teams // expected-error {{region cannot be closely nested inside 'parallel' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
     ++a;
   }
@@ -3048,6 +4816,29 @@ void foo() {
     for (int i = 0; i < 10; ++i)
       ;
   }
+#pragma omp parallel
+  {
+#pragma omp target update to(a)
+    a++;
+  }
+#pragma omp parallel
+  {
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'parallel' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp parallel
+  {
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'parallel' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp parallel
+  {
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'parallel' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
 
 // SIMD DIRECTIVE
 #pragma omp simd
@@ -3058,7 +4849,7 @@ void foo() {
   }
 #pragma omp simd
   for (int i = 0; i < 10; ++i) {
-#pragma omp simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+#pragma omp simd // expected-warning {{OpenMP only allows an ordered construct with the simd clause nested in a simd construct}}
     for (int i = 0; i < 10; ++i)
       ;
   }
@@ -3165,6 +4956,27 @@ void foo() {
   }
 #pragma omp simd
   for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel for // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target enter data map(to: a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target exit data map(from: a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp simd
+  for (int i = 0; i < 10; ++i) {
 #pragma omp teams // expected-error {{OpenMP constructs may not be nested inside a simd region}}
     ++a;
   }
@@ -3180,6 +4992,29 @@ void foo() {
     for (int j = 0; j < 10; ++j)
       ;
   }
+#pragma omp simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target update to(a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    a++;
+  }
+#pragma omp simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
 
 // FOR DIRECTIVE
 #pragma omp for
@@ -3317,6 +5152,27 @@ void foo() {
   }
 #pragma omp for
   for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel
+    ++a;
+  }
+#pragma omp for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel for
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target enter data map(to: a)
+    ++a;
+  }
+#pragma omp for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target exit data map(from: a)
+    ++a;
+  }
+#pragma omp for
+  for (int i = 0; i < 10; ++i) {
 #pragma omp teams // expected-error {{region cannot be closely nested inside 'for' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
     ++a;
   }
@@ -3332,6 +5188,29 @@ void foo() {
     for (int j = 0; j < 10; ++j)
       ;
   }
+#pragma omp for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target update to(a)
+    ++a;
+  }
+#pragma omp for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'for' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'for' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'for' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
 
 // FOR SIMD DIRECTIVE
 #pragma omp for simd
@@ -3342,7 +5221,7 @@ void foo() {
   }
 #pragma omp for simd
   for (int i = 0; i < 10; ++i) {
-#pragma omp simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+#pragma omp simd // expected-warning {{OpenMP only allows an ordered construct with the simd clause nested in a simd construct}}
     for (int i = 0; i < 10; ++i)
       ;
   }
@@ -3449,6 +5328,27 @@ void foo() {
   }
 #pragma omp for simd
   for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel for // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target enter data map(to: a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target exit data map(from: a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp for simd
+  for (int i = 0; i < 10; ++i) {
 #pragma omp teams // expected-error {{OpenMP constructs may not be nested inside a simd region}}
     ++a;
   }
@@ -3464,6 +5364,29 @@ void foo() {
     for (int j = 0; j < 10; ++j)
       ;
   }
+#pragma omp for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target update to(a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
 
 // SECTIONS DIRECTIVE
 #pragma omp sections
@@ -3593,6 +5516,25 @@ void foo() {
   }
 #pragma omp sections
   {
+#pragma omp target parallel
+    ++a;
+  }
+#pragma omp sections
+  {
+#pragma omp target parallel for
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp sections
+  {
+#pragma omp target enter data map(to: a)
+  }
+#pragma omp sections
+  {
+#pragma omp target exit data map(from: a)
+  }
+#pragma omp sections
+  {
 #pragma omp teams // expected-error {{region cannot be closely nested inside 'sections' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
     ++a;
   }
@@ -3608,6 +5550,28 @@ void foo() {
     for (int i = 0; i < 10; ++i)
       ;
   }
+#pragma omp sections
+  {
+#pragma omp target update to(a)
+  }
+#pragma omp sections
+  {
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'sections' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp sections
+  {
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'sections' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp sections
+  {
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'sections' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
 
 // SECTION DIRECTIVE
 #pragma omp section // expected-error {{orphaned 'omp section' directives are prohibited, it must be closely nested to a sections region}}
@@ -3803,6 +5767,39 @@ void foo() {
   {
 #pragma omp section
     {
+#pragma omp target parallel
+      ++a;
+    }
+  }
+#pragma omp sections
+  {
+#pragma omp section
+    {
+#pragma omp target parallel for
+      for (int i = 0; i < 10; ++i)
+        ;
+    }
+  }
+#pragma omp sections
+  {
+#pragma omp section
+    {
+#pragma omp target enter data map(to: a)
+      ++a;
+    }
+  }
+#pragma omp sections
+  {
+#pragma omp section
+    {
+#pragma omp target exit data map(from: a)
+      ++a;
+    }
+  }
+#pragma omp sections
+  {
+#pragma omp section
+    {
 #pragma omp teams // expected-error {{region cannot be closely nested inside 'section' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
       ++a;
     }
@@ -3823,6 +5820,35 @@ void foo() {
     for (int i = 0; i < 10; ++i)
       ;
   }
+#pragma omp sections
+  {
+#pragma omp section
+    {
+#pragma omp target update to(a)
+      a++;
+    }
+  }
+#pragma omp sections
+  {
+#pragma omp section
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'section' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp sections
+  {
+#pragma omp section
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'section' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp sections
+  {
+#pragma omp section
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'section' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
 
 // SINGLE DIRECTIVE
 #pragma omp single
@@ -3950,6 +5976,27 @@ void foo() {
   }
 #pragma omp single
   {
+#pragma omp target parallel
+    ++a;
+  }
+#pragma omp single
+  {
+#pragma omp target parallel for
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp single
+  {
+#pragma omp target enter data map(to: a)
+    ++a;
+  }
+#pragma omp single
+  {
+#pragma omp target exit data map(from: a)
+    ++a;
+  }
+#pragma omp single
+  {
 #pragma omp teams // expected-error {{region cannot be closely nested inside 'single' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
     ++a;
   }
@@ -3965,6 +6012,29 @@ void foo() {
     for (int i = 0; i < 10; ++i)
       ;
   }
+#pragma omp single
+  {
+#pragma omp target update to(a)
+    a++;
+  }
+#pragma omp single
+  {
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'single' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp single
+  {
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'single' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp single
+  {
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'single' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
 
 // MASTER DIRECTIVE
 #pragma omp master
@@ -4102,6 +6172,27 @@ void foo() {
   }
 #pragma omp master
   {
+#pragma omp target parallel
+    ++a;
+  }
+#pragma omp master
+  {
+#pragma omp target parallel for
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp master
+  {
+#pragma omp target enter data map(to: a)
+    ++a;
+  }
+#pragma omp master
+  {
+#pragma omp target exit data map(from: a)
+    ++a;
+  }
+#pragma omp master
+  {
 #pragma omp teams // expected-error {{region cannot be closely nested inside 'master' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
     ++a;
   }
@@ -4117,6 +6208,29 @@ void foo() {
     for (int i = 0; i < 10; ++i)
       ;
   }
+#pragma omp master
+  {
+#pragma omp target update to(a)
+    ++a;
+  }
+#pragma omp master
+  {
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'master' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp master
+  {
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'master' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp master
+  {
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'master' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
 
 // CRITICAL DIRECTIVE
 #pragma omp critical
@@ -4273,6 +6387,27 @@ void foo() {
   }
 #pragma omp critical
   {
+#pragma omp target parallel
+    ++a;
+  }
+#pragma omp critical
+  {
+#pragma omp target parallel for
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp critical
+  {
+#pragma omp target enter data map(to: a)
+    ++a;
+  }
+#pragma omp critical
+  {
+#pragma omp target exit data map(from: a)
+    ++a;
+  }
+#pragma omp critical
+  {
 #pragma omp teams // expected-error {{region cannot be closely nested inside 'critical' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
     ++a;
   }
@@ -4288,6 +6423,29 @@ void foo() {
     for (int i = 0; i < 10; ++i)
       ;
   }
+#pragma omp critical
+  {
+#pragma omp target update to(a)
+    a++;
+  }
+#pragma omp critical
+  {
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'critical' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp critical
+  {
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'critical' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp critical
+  {
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'critical' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
 
 // PARALLEL FOR DIRECTIVE
 #pragma omp parallel for
@@ -4440,6 +6598,27 @@ void foo() {
   }
 #pragma omp parallel for
   for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel
+    ++a;
+  }
+#pragma omp parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel for
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target enter data map(to: a)
+    ++a;
+  }
+#pragma omp parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target exit data map(from: a)
+    ++a;
+  }
+#pragma omp parallel for
+  for (int i = 0; i < 10; ++i) {
 #pragma omp teams // expected-error {{region cannot be closely nested inside 'parallel for' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
     ++a;
   }
@@ -4455,6 +6634,29 @@ void foo() {
     for (int j = 0; j < 10; ++j)
       ;
   }
+#pragma omp parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target update to(a)
+    a++;
+  }
+#pragma omp parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'parallel for' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'parallel for' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'parallel for' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
 
 // PARALLEL FOR SIMD DIRECTIVE
 #pragma omp parallel for simd
@@ -4465,7 +6667,7 @@ void foo() {
   }
 #pragma omp parallel for simd
   for (int i = 0; i < 10; ++i) {
-#pragma omp simd// expected-error {{OpenMP constructs may not be nested inside a simd region}}
+#pragma omp simd // expected-warning {{OpenMP only allows an ordered construct with the simd clause nested in a simd construct}}
     for (int i = 0; i < 10; ++i)
       ;
   }
@@ -4607,6 +6809,27 @@ void foo() {
   }
 #pragma omp parallel for simd
   for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel for // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target enter data map(to: a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target exit data map(from: a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp parallel for simd
+  for (int i = 0; i < 10; ++i) {
 #pragma omp teams // expected-error {{OpenMP constructs may not be nested inside a simd region}}
     ++a;
   }
@@ -4622,6 +6845,29 @@ void foo() {
     for (int j = 0; j < 10; ++j)
       ;
   }
+#pragma omp parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target update to(a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    a++;
+  }
+#pragma omp parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
 
 // PARALLEL SECTIONS DIRECTIVE
 #pragma omp parallel sections
@@ -4761,6 +7007,25 @@ void foo() {
   }
 #pragma omp parallel sections
   {
+#pragma omp target parallel
+    ++a;
+  }
+#pragma omp parallel sections
+  {
+#pragma omp target parallel for
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp parallel sections
+  {
+#pragma omp target enter data map(to: a)
+  }
+#pragma omp parallel sections
+  {
+#pragma omp target exit data map(from: a)
+  }
+#pragma omp parallel sections
+  {
 #pragma omp teams // expected-error {{region cannot be closely nested inside 'parallel sections' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
     ++a;
   }
@@ -4776,6 +7041,28 @@ void foo() {
     for (int i = 0; i < 10; ++i)
       ;
   }
+#pragma omp parallel sections
+  {
+#pragma omp target update to(a)
+  }
+#pragma omp parallel sections
+  {
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'parallel sections' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp parallel sections
+  {
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'parallel sections' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp parallel sections
+  {
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'parallel sections' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
 
 // TASK DIRECTIVE
 #pragma omp task
@@ -4864,6 +7151,25 @@ void foo() {
   }
 #pragma omp task
   {
+#pragma omp target parallel
+    ++a;
+  }
+#pragma omp task
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp task
+  {
+#pragma omp target enter data map(to: a)
+    ++a;
+  }
+#pragma omp task
+  {
+#pragma omp target exit data map(from: a)
+    ++a;
+  }
+#pragma omp task
+  {
 #pragma omp teams // expected-error {{region cannot be closely nested inside 'task' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
     ++a;
   }
@@ -4879,6 +7185,29 @@ void foo() {
     for (int i = 0; i < 10; ++i)
       ;
   }
+#pragma omp task
+  {
+#pragma omp target update to(a)
+    a++;
+  }
+#pragma omp task
+  {
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'task' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp task
+  {
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'task' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp task
+  {
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'task' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
 
 // ATOMIC DIRECTIVE
 #pragma omp atomic
@@ -5045,6 +7374,35 @@ void foo() {
   // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
   // expected-note@+1 {{expected an expression statement}}
   {
+#pragma omp target parallel // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
+    ++a;
+  }
+#pragma omp atomic
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
+  {
+#pragma omp target parallel for // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp atomic
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
+  {
+#pragma omp target enter data map(to: a) // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
+    ++a;
+  }
+#pragma omp atomic
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
+  {
+#pragma omp target exit data map(from: a) // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
+    ++a;
+  }
+#pragma omp atomic
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
+  {
 #pragma omp teams // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
     ++a;
   }
@@ -5064,6 +7422,36 @@ void foo() {
     for (int i = 0; i < 10; ++i)
       ;
   }
+#pragma omp atomic
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
+  {
+#pragma omp target update // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
+  }
+#pragma omp atomic
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
+  {
+#pragma omp distribute parallel for// expected-error {{OpenMP constructs may not be nested inside an atomic region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp atomic
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
+  {
+#pragma omp distribute parallel for simd // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp atomic
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
+  {
+#pragma omp distribute simd // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
 
 // TARGET DIRECTIVE
 #pragma omp target
@@ -5155,10 +7543,27 @@ void foo() {
   }
 #pragma omp target
   {
+#pragma omp target // expected-error {{region cannot be nested inside 'target' region}}
+    ++a;
+  }
 #pragma omp target
+  {
+#pragma omp target parallel // expected-error {{region cannot be nested inside 'target' region}}
     ++a;
   }
 #pragma omp target
+#pragma omp target parallel for // expected-error {{region cannot be nested inside 'target' region}}
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target
+  {
+#pragma omp target enter data map(to: a) // expected-error {{region cannot be nested inside 'target' region}}
+  }
+#pragma omp target
+  {
+#pragma omp target exit data map(from: a) // expected-error {{region cannot be nested inside 'target' region}}
+  }
+#pragma omp target
   {
 #pragma omp teams
     ++a;
@@ -5181,6 +7586,398 @@ void foo() {
     for (int i = 0; i < 10; ++i)
       ;
   }
+#pragma omp atomic
+  // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}}
+  // expected-note@+1 {{expected an expression statement}}
+  {
+#pragma omp target update to(a) // expected-error {{OpenMP constructs may not be nested inside an atomic region}}
+    a++;
+  }
+#pragma omp target
+  { 
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'target' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+  { 
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'target' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+  { 
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'target' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+
+// TARGET PARALLEL DIRECTIVE
+#pragma omp target parallel
+#pragma omp parallel
+  bar();
+#pragma omp target parallel
+#pragma omp for
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target parallel
+#pragma omp simd
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target parallel
+#pragma omp for simd
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target parallel
+#pragma omp sections
+  {
+    bar();
+  }
+#pragma omp target parallel
+#pragma omp section // expected-error {{'omp section' directive must be closely nested to a sections region, not a target parallel region}}
+  {
+    bar();
+  }
+#pragma omp target parallel
+#pragma omp single
+  bar();
+
+#pragma omp target parallel
+#pragma omp master
+  {
+    bar();
+  }
+#pragma omp target parallel
+#pragma omp critical
+  {
+    bar();
+  }
+#pragma omp target parallel
+#pragma omp parallel for
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target parallel
+#pragma omp parallel for simd
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target parallel
+#pragma omp parallel sections
+  {
+    bar();
+  }
+#pragma omp target parallel
+#pragma omp task
+  {
+    bar();
+  }
+#pragma omp target parallel
+  {
+#pragma omp taskyield
+    bar();
+  }
+#pragma omp target parallel
+  {
+#pragma omp barrier
+    bar();
+  }
+#pragma omp target parallel
+  {
+#pragma omp taskwait
+    bar();
+  }
+#pragma omp target parallel
+  {
+#pragma omp flush
+    bar();
+  }
+#pragma omp target parallel
+  {
+#pragma omp ordered // expected-error {{region cannot be closely nested inside 'target parallel' region; perhaps you forget to enclose 'omp ordered' directive into a for or a parallel for region with 'ordered' clause?}}
+    bar();
+  }
+#pragma omp target parallel
+  {
+#pragma omp atomic
+    ++a;
+  }
+#pragma omp target parallel
+  {
+#pragma omp target // expected-error {{region cannot be nested inside 'target parallel' region}}
+    ++a;
+  }
+#pragma omp target parallel
+  {
+#pragma omp target parallel // expected-error {{region cannot be nested inside 'target parallel' region}}
+    ++a;
+  }
+#pragma omp target parallel
+#pragma omp target parallel for // expected-error {{region cannot be nested inside 'target parallel' region}}
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target parallel
+  {
+#pragma omp teams // expected-error {{region cannot be closely nested inside 'target parallel' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
+    ++a;
+  }
+#pragma omp target parallel
+  {
+    ++a;
+#pragma omp teams  // expected-error {{region cannot be closely nested inside 'target parallel' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
+    ++a;
+  }
+#pragma omp target parallel
+  {
+#pragma omp taskloop
+  for (int i = 0; i < 10; ++i)
+    ++a;
+  }
+#pragma omp target parallel
+  { 
+#pragma omp distribute // expected-error {{region cannot be closely nested inside 'target parallel' region; perhaps you forget to enclose 'omp distribute' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target parallel
+  {
+#pragma omp target enter data map(to: a) // expected-error {{region cannot be nested inside 'target parallel' region}}
+  }
+#pragma omp target parallel
+  {
+#pragma omp target exit data map(from: a) // expected-error {{region cannot be nested inside 'target parallel' region}}
+  }
+#pragma omp target parallel
+  {
+#pragma omp target update to(a) // expected-error {{region cannot be nested inside 'target parallel' region}}
+  }
+#pragma omp target parallel
+  { 
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'target parallel' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target parallel
+  { 
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'target parallel' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target parallel
+  { 
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'target parallel' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+
+// TARGET PARALLEL FOR DIRECTIVE
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp for // expected-error {{region cannot be closely nested inside 'target parallel for' region; perhaps you forget to enclose 'omp for' directive into a parallel region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp simd
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp for simd // expected-error {{region cannot be closely nested inside 'target parallel for' region; perhaps you forget to enclose 'omp for simd' directive into a parallel region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp sections // expected-error {{region cannot be closely nested inside 'target parallel for' region; perhaps you forget to enclose 'omp sections' directive into a parallel region?}}
+    {
+      bar();
+    }
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp section // expected-error {{'omp section' directive must be closely nested to a sections region, not a target parallel for region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp single // expected-error {{region cannot be closely nested inside 'target parallel for' region; perhaps you forget to enclose 'omp single' directive into a parallel region?}}
+    {
+      bar();
+    }
+  }
+
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp master // expected-error {{region cannot be closely nested inside 'target parallel for' region}}
+    {
+      bar();
+    }
+  }
+
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp critical
+    {
+      bar();
+    }
+  }
+
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel
+    {
+#pragma omp single // OK
+      {
+        bar();
+      }
+#pragma omp for // OK
+      for (int i = 0; i < 10; ++i)
+        ;
+#pragma omp for simd // OK
+      for (int i = 0; i < 10; ++i)
+        ;
+#pragma omp sections // OK
+      {
+        bar();
+      }
+    }
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel for
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel for simd
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel sections
+    {
+      bar();
+    }
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp task
+    {
+      bar();
+    }
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp taskyield
+    bar();
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp barrier // expected-error {{region cannot be closely nested inside 'target parallel for' region}}
+    bar();
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp taskwait
+    bar();
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp flush
+    bar();
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp ordered // expected-error {{region cannot be closely nested inside 'target parallel for' region; perhaps you forget to enclose 'omp ordered' directive into a for or a parallel for region with 'ordered' clause?}}
+    bar();
+  }
+#pragma omp target parallel for ordered
+  for (int i = 0; i < 10; ++i) {
+#pragma omp ordered // OK
+    bar();
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp atomic
+    ++a;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target // expected-error {{region cannot be nested inside 'target parallel for' region}}
+    ++a;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel // expected-error {{region cannot be nested inside 'target parallel for' region}}
+    ++a;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel for // expected-error {{region cannot be nested inside 'target parallel for' region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target enter data map(to: a) // expected-error {{region cannot be nested inside 'target parallel for' region}}
+    ++a;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target exit data map(from: a) // expected-error {{region cannot be nested inside 'target parallel for' region}}
+    ++a;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp teams // expected-error {{region cannot be closely nested inside 'target parallel for' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
+    ++a;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp taskloop
+  for (int i = 0; i < 10; ++i)
+    ++a;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute // expected-error {{region cannot be closely nested inside 'target parallel for' region; perhaps you forget to enclose 'omp distribute' directive into a teams region?}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target update to(a) // expected-error {{region cannot be nested inside 'target parallel for' region}}
+    a++;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'target parallel for' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'target parallel for' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'target parallel for' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int j = 0; j < 10; ++j)
+      ;
+  }
 
 // TEAMS DIRECTIVE
 #pragma omp target
@@ -5298,6 +8095,27 @@ void foo() {
 #pragma omp target
 #pragma omp teams
   {
+#pragma omp target parallel // expected-error {{region cannot be nested inside 'target' region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp target parallel for // expected-error {{region cannot be nested inside 'target' region}}
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+  {
+#pragma omp target enter data map(to: a) // expected-error {{region cannot be closely nested inside 'teams' region; perhaps you forget to enclose 'omp target enter data' directive into a parallel region?}}
+  }
+#pragma omp target
+#pragma omp teams
+  {
+#pragma omp target exit data map(from: a) // expected-error {{region cannot be closely nested inside 'teams' region; perhaps you forget to enclose 'omp target exit data' directive into a parallel region?}}
+  }
+#pragma omp target
+#pragma omp teams
+  {
 #pragma omp teams // expected-error {{region cannot be closely nested inside 'teams' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
     ++a;
   }
@@ -5321,6 +8139,47 @@ void foo() {
 #pragma omp distribute
   for (int j = 0; j < 10; ++j)
     ;
+#pragma omp target
+#pragma omp teams
+  {
+#pragma omp target update to(a) // expected-error {{region cannot be closely nested inside 'teams' region; perhaps you forget to enclose 'omp target update' directive into a parallel region?}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target
+#pragma omp teams
+  {
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp distribute parallel for
+  for (int j = 0; j < 10; ++j)
+    ;  
+  }
+#pragma omp target
+#pragma omp teams
+  {
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp distribute parallel for simd
+  for (int j = 0; j < 10; ++j)
+    ;  
+  }
+#pragma omp target
+#pragma omp teams
+  {
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp distribute simd
+  for (int j = 0; j < 10; ++j)
+    ;  
+  }
 
 // TASKLOOP DIRECTIVE
 #pragma omp taskloop
@@ -5463,6 +8322,27 @@ void foo() {
   }
 #pragma omp taskloop
   for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel
+    ++a;
+  }
+#pragma omp taskloop
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel for
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp taskloop
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target enter data map(to: a)
+    ++a;
+  }
+#pragma omp taskloop
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target exit data map(from: a)
+    ++a;
+  }
+#pragma omp taskloop
+  for (int i = 0; i < 10; ++i) {
 #pragma omp teams // expected-error {{region cannot be closely nested inside 'taskloop' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
     ++a;
   }
@@ -5472,6 +8352,35 @@ void foo() {
   for (int i = 0; i < 10; ++i)
     ++a;
   }
+#pragma omp taskloop
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target update to(a)
+    ++a;
+  }
+#pragma omp taskloop
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute // expected-error {{region cannot be closely nested inside 'taskloop' region; perhaps you forget to enclose 'omp distribute' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp taskloop
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'taskloop' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp taskloop
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'taskloop' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp taskloop
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'taskloop' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
 
 // DISTRIBUTE DIRECTIVE
 #pragma omp target
@@ -5651,16 +8560,786 @@ void foo() {
 #pragma omp teams
 #pragma omp distribute
   for (int i = 0; i < 10; ++i) {
+#pragma omp target // expected-error {{region cannot be nested inside 'target' region}}
+    ++a;
+  }
 #pragma omp target
+#pragma omp teams
+#pragma omp distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel // expected-error {{region cannot be nested inside 'target' region}}
     ++a;
   }
 #pragma omp target
 #pragma omp teams
 #pragma omp distribute
   for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel for // expected-error {{region cannot be nested inside 'target' region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute
+  for (int i = 0; i < 10; ++i) {
 #pragma omp teams // expected-error {{region cannot be closely nested inside 'distribute' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
     ++a;
   }
   return foo<int>();
-}
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target enter data map(to: a) // expected-error {{region cannot be nested inside 'target' region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target exit data map(from: a) // expected-error {{region cannot be nested inside 'target' region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target update to(a) // expected-error {{region cannot be nested inside 'target' region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'distribute' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+  for (int i = 0; i < 10; ++i)
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for simd // expected-error {{region cannot be closely nested inside 'distribute' region; perhaps you forget to enclose 'omp distribute parallel for simd' directive into a teams region?}}
+  for (int i = 0; i < 10; ++i)
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'distribute' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+  for (int i = 0; i < 10; ++i)
+    ++a;
+  }
+
+  // DISTRIBUTE PARALLEL FOR DIRECTIVE
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for // expected-error {{region cannot be closely nested inside 'distribute parallel for' region; perhaps you forget to enclose 'omp distribute parallel for' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute // expected-error {{region cannot be closely nested inside 'distribute parallel for' region; perhaps you forget to enclose 'omp distribute' directive into a teams region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp for // expected-error {{region cannot be closely nested inside 'distribute parallel for' region; perhaps you forget to enclose 'omp for' directive into a parallel region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp simd
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp for simd // expected-error {{region cannot be closely nested inside 'distribute parallel for' region; perhaps you forget to enclose 'omp for simd' directive into a parallel region?}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams  
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp sections // expected-error {{region cannot be closely nested inside 'distribute parallel for' region; perhaps you forget to enclose 'omp sections' directive into a parallel region?}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp section // expected-error {{'omp section' directive must be closely nested to a sections region, not a distribute parallel for region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp single // expected-error {{region cannot be closely nested inside 'distribute parallel for' region; perhaps you forget to enclose 'omp single' directive into a parallel region?}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp master // expected-error {{region cannot be closely nested inside 'distribute parallel for' region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp critical
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel
+    {
+#pragma omp single
+      {
+	bar();
+      }
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel for
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel for simd
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel sections
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp task
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp taskyield
+    bar();
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp barrier // expected-error {{region cannot be closely nested inside 'distribute parallel for' region}}
+    bar();
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp taskwait
+    bar();
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp flush
+    bar();
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp ordered // expected-error {{region cannot be closely nested inside 'distribute parallel for' region; perhaps you forget to enclose 'omp ordered' directive into a for or a parallel for region with 'ordered' clause?}}
+    bar();
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp atomic
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target // expected-error {{region cannot be nested inside 'target' region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel // expected-error {{region cannot be nested inside 'target' region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel for // expected-error {{region cannot be nested inside 'target' region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp teams // expected-error {{region cannot be closely nested inside 'distribute parallel for' region; perhaps you forget to enclose 'omp teams' directive into a target region?}}
+    ++a;
+  }
+  return foo<int>();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target enter data map(to: a) // expected-error {{region cannot be nested inside 'target' region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target exit data map(from: a) // expected-error {{region cannot be nested inside 'target' region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target update to(a) // expected-error {{region cannot be nested inside 'target' region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute simd // expected-error {{region cannot be closely nested inside 'distribute parallel for' region; perhaps you forget to enclose 'omp distribute simd' directive into a teams region?}}
+  for (int i = 0; i < 10; ++i)
+    ++a;
+  }
 
+// DISTRIBUTE PARALLEL FOR SIMD DIRECTIVE
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp for // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp simd // expected-warning {{OpenMP only allows an ordered construct with the simd clause nested in a simd construct}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp for simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams  
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp sections // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp section // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp single // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp master // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp critical // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+#pragma omp single
+      {
+	bar();
+      }
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel for // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel for simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel sections // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp task // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp taskyield // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    bar();
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp barrier // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    bar();
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp taskwait // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    bar();
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp flush // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    bar();
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp ordered // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    bar();
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp atomic // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp teams // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+  return foo<int>();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target enter data map(to: a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target exit data map(from: a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target update to(a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute parallel for simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+
+// DISTRIBUTE SIMD DIRECTIVE
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute parallel for simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp for // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp simd // expected-warning {{OpenMP only allows an ordered construct with the simd clause nested in a simd construct}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp for simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams  
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp sections // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp section // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp single // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp master // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp critical // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+#pragma omp single
+      {
+	bar();
+      }
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel for // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel for simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp parallel sections // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp task // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    {
+      bar();
+    }
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp taskyield // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    bar();
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp barrier // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    bar();
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp taskwait // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    bar();
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp flush // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    bar();
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp ordered // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    bar();
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp atomic // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target parallel // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp teams // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+  return foo<int>();
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target enter data map(to: a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target exit data map(from: a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp target update to(a) // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    ++a;
+  }
+#pragma omp target
+#pragma omp teams
+#pragma omp distribute simd
+  for (int i = 0; i < 10; ++i) {
+#pragma omp distribute simd // expected-error {{OpenMP constructs may not be nested inside a simd region}}
+    for (int i = 0; i < 10; ++i)
+      ;
+  }
+}
diff --git a/test/OpenMP/nvptx_target_codegen.cpp b/test/OpenMP/nvptx_target_codegen.cpp
new file mode 100644
index 0000000000000..c4df636367d3b
--- /dev/null
+++ b/test/OpenMP/nvptx_target_codegen.cpp
@@ -0,0 +1,581 @@
+// Test target codegen - host bc file has to be created first.
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+// CHECK-DAG: [[OMP_NT:@.+]] = common addrspace(3) global i32 0
+// CHECK-DAG: [[OMP_WID:@.+]] = common addrspace(3) global i64 0
+
+template<typename tx, typename ty>
+struct TT{
+  tx X;
+  ty Y;
+};
+
+int foo(int n) {
+  int a = 0;
+  short aa = 0;
+  float b[10];
+  float bn[n];
+  double c[5][10];
+  double cn[5][n];
+  TT<long long, char> d;
+
+  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l86}}_worker()
+  // CHECK: br label {{%?}}[[AWAIT_WORK:.+]]
+  //
+  // CHECK: [[AWAIT_WORK]]
+  // CHECK: call void @llvm.nvvm.barrier0()
+  // CHECK: [[WORK:%.+]] = load i64, i64 addrspace(3)* [[OMP_WID]],
+  // CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i64 [[WORK]], 0
+  // CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]]
+  //
+  // CHECK: [[SEL_WORKERS]]
+  // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  // CHECK: [[NT:%.+]] = load i32, i32 addrspace(3)* [[OMP_NT]]
+  // CHECK: [[IS_ACTIVE:%.+]] = icmp slt i32 [[TID]], [[NT]]
+  // CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]]
+  //
+  // CHECK: [[EXEC_PARALLEL]]
+  // CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
+  //
+  // CHECK: [[TERM_PARALLEL]]
+  // CHECK: br label {{%?}}[[BAR_PARALLEL]]
+  //
+  // CHECK: [[BAR_PARALLEL]]
+  // CHECK: call void @llvm.nvvm.barrier0()
+  // CHECK: br label {{%?}}[[AWAIT_WORK]]
+  //
+  // CHECK: [[EXIT]]
+  // CHECK: ret void
+
+  // CHECK: define {{.*}}void [[T1:@__omp_offloading_.+foo.+l86]]()
+  // CHECK: [[NTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+  // CHECK: [[A:%.+]] = sub i32 [[WS]], 1
+  // CHECK: [[B:%.+]] = sub i32 [[NTID]], 1
+  // CHECK: [[MID:%.+]] = and i32 [[B]],
+  // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  // CHECK: [[EXCESS:%.+]] = icmp ugt i32 [[TID]], [[MID]]
+  // CHECK: br i1 [[EXCESS]], label {{%?}}[[EXIT:.+]], label {{%?}}[[CHECK_WORKER:.+]]
+  //
+  // CHECK: [[CHECK_WORKER]]
+  // CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[MID]]
+  // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[MASTER:.+]]
+  //
+  // CHECK: [[WORKER]]
+  // CHECK: call void [[T1]]_worker()
+  // CHECK: br label {{%?}}[[EXIT]]
+  //
+  // CHECK: [[MASTER]]
+  // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  // CHECK: call void @__kmpc_kernel_init(i32 0, i32 [[TID]])
+  // CHECK: br label {{%?}}[[TERM:.+]]
+  //
+  // CHECK: [[TERM]]
+  // CHECK: store i64 0, i64 addrspace(3)* [[OMP_WID]],
+  // CHECK: call void @llvm.nvvm.barrier0()
+  // CHECK: br label {{%?}}[[EXIT]]
+  //
+  // CHECK: [[EXIT]]
+  // CHECK: ret void
+  #pragma omp target
+  {
+  }
+
+  // CHECK-NOT: define {{.*}}void [[T2:@__omp_offloading_.+foo.+]]_worker()
+  #pragma omp target if(0)
+  {
+  }
+
+  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l157}}_worker()
+  // CHECK: br label {{%?}}[[AWAIT_WORK:.+]]
+  //
+  // CHECK: [[AWAIT_WORK]]
+  // CHECK: call void @llvm.nvvm.barrier0()
+  // CHECK: [[WORK:%.+]] = load i64, i64 addrspace(3)* [[OMP_WID]],
+  // CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i64 [[WORK]], 0
+  // CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]]
+  //
+  // CHECK: [[SEL_WORKERS]]
+  // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  // CHECK: [[NT:%.+]] = load i32, i32 addrspace(3)* [[OMP_NT]]
+  // CHECK: [[IS_ACTIVE:%.+]] = icmp slt i32 [[TID]], [[NT]]
+  // CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]]
+  //
+  // CHECK: [[EXEC_PARALLEL]]
+  // CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
+  //
+  // CHECK: [[TERM_PARALLEL]]
+  // CHECK: br label {{%?}}[[BAR_PARALLEL]]
+  //
+  // CHECK: [[BAR_PARALLEL]]
+  // CHECK: call void @llvm.nvvm.barrier0()
+  // CHECK: br label {{%?}}[[AWAIT_WORK]]
+  //
+  // CHECK: [[EXIT]]
+  // CHECK: ret void
+
+  // CHECK: define {{.*}}void [[T3:@__omp_offloading_.+foo.+l157]](i[[SZ:32|64]] [[ARG1:%.+]])
+  // CHECK: [[AA_ADDR:%.+]] = alloca i[[SZ]],
+  // CHECK: store i[[SZ]] [[ARG1]], i[[SZ]]* [[AA_ADDR]],
+  // CHECK: [[AA_CADDR:%.+]] = bitcast i[[SZ]]* [[AA_ADDR]] to i16*
+  // CHECK: [[NTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+  // CHECK: [[A:%.+]] = sub i32 [[WS]], 1
+  // CHECK: [[B:%.+]] = sub i32 [[NTID]], 1
+  // CHECK: [[MID:%.+]] = and i32 [[B]],
+  // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  // CHECK: [[EXCESS:%.+]] = icmp ugt i32 [[TID]], [[MID]]
+  // CHECK: br i1 [[EXCESS]], label {{%?}}[[EXIT:.+]], label {{%?}}[[CHECK_WORKER:.+]]
+  //
+  // CHECK: [[CHECK_WORKER]]
+  // CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[MID]]
+  // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[MASTER:.+]]
+  //
+  // CHECK: [[WORKER]]
+  // CHECK: call void [[T3]]_worker()
+  // CHECK: br label {{%?}}[[EXIT]]
+  //
+  // CHECK: [[MASTER]]
+  // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  // CHECK: call void @__kmpc_kernel_init(i32 0, i32 [[TID]])
+  // CHECK: load i16, i16* [[AA_CADDR]],
+  // CHECK: br label {{%?}}[[TERM:.+]]
+  //
+  // CHECK: [[TERM]]
+  // CHECK: store i64 0, i64 addrspace(3)* [[OMP_WID]],
+  // CHECK: call void @llvm.nvvm.barrier0()
+  // CHECK: br label {{%?}}[[EXIT]]
+  //
+  // CHECK: [[EXIT]]
+  // CHECK: ret void
+  #pragma omp target if(1)
+  {
+    aa += 1;
+  }
+
+  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l260}}_worker()
+  // CHECK: br label {{%?}}[[AWAIT_WORK:.+]]
+  //
+  // CHECK: [[AWAIT_WORK]]
+  // CHECK: call void @llvm.nvvm.barrier0()
+  // CHECK: [[WORK:%.+]] = load i64, i64 addrspace(3)* [[OMP_WID]],
+  // CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i64 [[WORK]], 0
+  // CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]]
+  //
+  // CHECK: [[SEL_WORKERS]]
+  // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  // CHECK: [[NT:%.+]] = load i32, i32 addrspace(3)* [[OMP_NT]]
+  // CHECK: [[IS_ACTIVE:%.+]] = icmp slt i32 [[TID]], [[NT]]
+  // CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]]
+  //
+  // CHECK: [[EXEC_PARALLEL]]
+  // CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
+  //
+  // CHECK: [[TERM_PARALLEL]]
+  // CHECK: br label {{%?}}[[BAR_PARALLEL]]
+  //
+  // CHECK: [[BAR_PARALLEL]]
+  // CHECK: call void @llvm.nvvm.barrier0()
+  // CHECK: br label {{%?}}[[AWAIT_WORK]]
+  //
+  // CHECK: [[EXIT]]
+  // CHECK: ret void
+
+  // CHECK: define {{.*}}void [[T4:@__omp_offloading_.+foo.+l260]](i[[SZ]]
+  // Create local storage for each capture.
+  // CHECK:    [[LOCAL_A:%.+]] = alloca i[[SZ]]
+  // CHECK:    [[LOCAL_B:%.+]] = alloca [10 x float]*
+  // CHECK:    [[LOCAL_VLA1:%.+]] = alloca i[[SZ]]
+  // CHECK:    [[LOCAL_BN:%.+]] = alloca float*
+  // CHECK:    [[LOCAL_C:%.+]] = alloca [5 x [10 x double]]*
+  // CHECK:    [[LOCAL_VLA2:%.+]] = alloca i[[SZ]]
+  // CHECK:    [[LOCAL_VLA3:%.+]] = alloca i[[SZ]]
+  // CHECK:    [[LOCAL_CN:%.+]] = alloca double*
+  // CHECK:    [[LOCAL_D:%.+]] = alloca [[TT:%.+]]*
+  // CHECK-DAG: store i[[SZ]] [[ARG_A:%.+]], i[[SZ]]* [[LOCAL_A]]
+  // CHECK-DAG: store [10 x float]* [[ARG_B:%.+]], [10 x float]** [[LOCAL_B]]
+  // CHECK-DAG: store i[[SZ]] [[ARG_VLA1:%.+]], i[[SZ]]* [[LOCAL_VLA1]]
+  // CHECK-DAG: store float* [[ARG_BN:%.+]], float** [[LOCAL_BN]]
+  // CHECK-DAG: store [5 x [10 x double]]* [[ARG_C:%.+]], [5 x [10 x double]]** [[LOCAL_C]]
+  // CHECK-DAG: store i[[SZ]] [[ARG_VLA2:%.+]], i[[SZ]]* [[LOCAL_VLA2]]
+  // CHECK-DAG: store i[[SZ]] [[ARG_VLA3:%.+]], i[[SZ]]* [[LOCAL_VLA3]]
+  // CHECK-DAG: store double* [[ARG_CN:%.+]], double** [[LOCAL_CN]]
+  // CHECK-DAG: store [[TT]]* [[ARG_D:%.+]], [[TT]]** [[LOCAL_D]]
+  //
+  // CHECK-64-DAG: [[REF_A:%.+]] = bitcast i64* [[LOCAL_A]] to i32*
+  // CHECK-DAG:    [[REF_B:%.+]] = load [10 x float]*, [10 x float]** [[LOCAL_B]],
+  // CHECK-DAG:    [[VAL_VLA1:%.+]] = load i[[SZ]], i[[SZ]]* [[LOCAL_VLA1]],
+  // CHECK-DAG:    [[REF_BN:%.+]] = load float*, float** [[LOCAL_BN]],
+  // CHECK-DAG:    [[REF_C:%.+]] = load [5 x [10 x double]]*, [5 x [10 x double]]** [[LOCAL_C]],
+  // CHECK-DAG:    [[VAL_VLA2:%.+]] = load i[[SZ]], i[[SZ]]* [[LOCAL_VLA2]],
+  // CHECK-DAG:    [[VAL_VLA3:%.+]] = load i[[SZ]], i[[SZ]]* [[LOCAL_VLA3]],
+  // CHECK-DAG:    [[REF_CN:%.+]] = load double*, double** [[LOCAL_CN]],
+  // CHECK-DAG:    [[REF_D:%.+]] = load [[TT]]*, [[TT]]** [[LOCAL_D]],
+  //
+  // CHECK: [[NTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+  // CHECK: [[A:%.+]] = sub i32 [[WS]], 1
+  // CHECK: [[B:%.+]] = sub i32 [[NTID]], 1
+  // CHECK: [[MID:%.+]] = and i32 [[B]],
+  // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  // CHECK: [[EXCESS:%.+]] = icmp ugt i32 [[TID]], [[MID]]
+  // CHECK: br i1 [[EXCESS]], label {{%?}}[[EXIT:.+]], label {{%?}}[[CHECK_WORKER:.+]]
+  //
+  // CHECK: [[CHECK_WORKER]]
+  // CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[MID]]
+  // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[MASTER:.+]]
+  //
+  // CHECK: [[WORKER]]
+  // CHECK: call void [[T4]]_worker()
+  // CHECK: br label {{%?}}[[EXIT]]
+  //
+  // CHECK: [[MASTER]]
+  // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  // CHECK: call void @__kmpc_kernel_init(i32 0, i32 [[TID]])
+  //
+  // Use captures.
+  // CHECK-64-DAG:  load i32, i32* [[REF_A]]
+  // CHECK-32-DAG:  load i32, i32* [[LOCAL_A]]
+  // CHECK-DAG:  getelementptr inbounds [10 x float], [10 x float]* [[REF_B]], i[[SZ]] 0, i[[SZ]] 2
+  // CHECK-DAG:  getelementptr inbounds float, float* [[REF_BN]], i[[SZ]] 3
+  // CHECK-DAG:  getelementptr inbounds [5 x [10 x double]], [5 x [10 x double]]* [[REF_C]], i[[SZ]] 0, i[[SZ]] 1
+  // CHECK-DAG:  getelementptr inbounds double, double* [[REF_CN]], i[[SZ]] %{{.+}}
+  // CHECK-DAG:     getelementptr inbounds [[TT]], [[TT]]* [[REF_D]], i32 0, i32 0
+  //
+  // CHECK: br label {{%?}}[[TERM:.+]]
+  //
+  // CHECK: [[TERM]]
+  // CHECK: store i64 0, i64 addrspace(3)* [[OMP_WID]],
+  // CHECK: call void @llvm.nvvm.barrier0()
+  // CHECK: br label {{%?}}[[EXIT]]
+  //
+  // CHECK: [[EXIT]]
+  // CHECK: ret void
+  #pragma omp target if(n>20)
+  {
+    a += 1;
+    b[2] += 1.0;
+    bn[3] += 1.0;
+    c[1][2] += 1.0;
+    cn[1][3] += 1.0;
+    d.X += 1;
+    d.Y += 1;
+  }
+
+  return a;
+}
+
+template<typename tx>
+tx ftemplate(int n) {
+  tx a = 0;
+  short aa = 0;
+  tx b[10];
+
+  #pragma omp target if(n>40)
+  {
+    a += 1;
+    aa += 1;
+    b[2] += 1;
+  }
+
+  return a;
+}
+
+static
+int fstatic(int n) {
+  int a = 0;
+  short aa = 0;
+  char aaa = 0;
+  int b[10];
+
+  #pragma omp target if(n>50)
+  {
+    a += 1;
+    aa += 1;
+    aaa += 1;
+    b[2] += 1;
+  }
+
+  return a;
+}
+
+struct S1 {
+  double a;
+
+  int r1(int n){
+    int b = n+1;
+    short int c[2][n];
+
+    #pragma omp target if(n>60)
+    {
+      this->a = (double)b + 1.5;
+      c[1][1] = ++a;
+    }
+
+    return c[1][1] + (int)b;
+  }
+};
+
+int bar(int n){
+  int a = 0;
+
+  a += foo(n);
+
+  S1 S;
+  a += S.r1(n);
+
+  a += fstatic(n);
+
+  a += ftemplate<int>(n);
+
+  return a;
+}
+
+  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+static.+l297}}_worker()
+  // CHECK: br label {{%?}}[[AWAIT_WORK:.+]]
+  //
+  // CHECK: [[AWAIT_WORK]]
+  // CHECK: call void @llvm.nvvm.barrier0()
+  // CHECK: [[WORK:%.+]] = load i64, i64 addrspace(3)* [[OMP_WID]],
+  // CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i64 [[WORK]], 0
+  // CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]]
+  //
+  // CHECK: [[SEL_WORKERS]]
+  // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  // CHECK: [[NT:%.+]] = load i32, i32 addrspace(3)* [[OMP_NT]]
+  // CHECK: [[IS_ACTIVE:%.+]] = icmp slt i32 [[TID]], [[NT]]
+  // CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]]
+  //
+  // CHECK: [[EXEC_PARALLEL]]
+  // CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
+  //
+  // CHECK: [[TERM_PARALLEL]]
+  // CHECK: br label {{%?}}[[BAR_PARALLEL]]
+  //
+  // CHECK: [[BAR_PARALLEL]]
+  // CHECK: call void @llvm.nvvm.barrier0()
+  // CHECK: br label {{%?}}[[AWAIT_WORK]]
+  //
+  // CHECK: [[EXIT]]
+  // CHECK: ret void
+
+  // CHECK: define {{.*}}void [[T5:@__omp_offloading_.+static.+l297]](i[[SZ]]
+  // Create local storage for each capture.
+  // CHECK:  [[LOCAL_A:%.+]] = alloca i[[SZ]]
+  // CHECK:  [[LOCAL_AA:%.+]] = alloca i[[SZ]]
+  // CHECK:  [[LOCAL_AAA:%.+]] = alloca i[[SZ]]
+  // CHECK:  [[LOCAL_B:%.+]] = alloca [10 x i32]*
+  // CHECK-DAG:  store i[[SZ]] [[ARG_A:%.+]], i[[SZ]]* [[LOCAL_A]]
+  // CHECK-DAG:  store i[[SZ]] [[ARG_AA:%.+]], i[[SZ]]* [[LOCAL_AA]]
+  // CHECK-DAG:  store i[[SZ]] [[ARG_AAA:%.+]], i[[SZ]]* [[LOCAL_AAA]]
+  // CHECK-DAG:  store [10 x i32]* [[ARG_B:%.+]], [10 x i32]** [[LOCAL_B]]
+  // Store captures in the context.
+  // CHECK-64-DAG:   [[REF_A:%.+]] = bitcast i[[SZ]]* [[LOCAL_A]] to i32*
+  // CHECK-DAG:      [[REF_AA:%.+]] = bitcast i[[SZ]]* [[LOCAL_AA]] to i16*
+  // CHECK-DAG:      [[REF_AAA:%.+]] = bitcast i[[SZ]]* [[LOCAL_AAA]] to i8*
+  // CHECK-DAG:      [[REF_B:%.+]] = load [10 x i32]*, [10 x i32]** [[LOCAL_B]],
+  //
+  // CHECK: [[NTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+  // CHECK: [[A:%.+]] = sub i32 [[WS]], 1
+  // CHECK: [[B:%.+]] = sub i32 [[NTID]], 1
+  // CHECK: [[MID:%.+]] = and i32 [[B]],
+  // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  // CHECK: [[EXCESS:%.+]] = icmp ugt i32 [[TID]], [[MID]]
+  // CHECK: br i1 [[EXCESS]], label {{%?}}[[EXIT:.+]], label {{%?}}[[CHECK_WORKER:.+]]
+  //
+  // CHECK: [[CHECK_WORKER]]
+  // CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[MID]]
+  // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[MASTER:.+]]
+  //
+  // CHECK: [[WORKER]]
+  // CHECK: call void [[T5]]_worker()
+  // CHECK: br label {{%?}}[[EXIT]]
+  //
+  // CHECK: [[MASTER]]
+  // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  // CHECK: call void @__kmpc_kernel_init(i32 0, i32 [[TID]])
+  //
+  // CHECK-64-DAG: load i32, i32* [[REF_A]]
+  // CHECK-32-DAG: load i32, i32* [[LOCAL_A]]
+  // CHECK-DAG:    load i16, i16* [[REF_AA]]
+  // CHECK-DAG:    getelementptr inbounds [10 x i32], [10 x i32]* [[REF_B]], i[[SZ]] 0, i[[SZ]] 2
+  //
+  // CHECK: br label {{%?}}[[TERM:.+]]
+  //
+  // CHECK: [[TERM]]
+  // CHECK: store i64 0, i64 addrspace(3)* [[OMP_WID]],
+  // CHECK: call void @llvm.nvvm.barrier0()
+  // CHECK: br label {{%?}}[[EXIT]]
+  //
+  // CHECK: [[EXIT]]
+  // CHECK: ret void
+
+
+
+  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+S1.+l315}}_worker()
+  // CHECK: br label {{%?}}[[AWAIT_WORK:.+]]
+  //
+  // CHECK: [[AWAIT_WORK]]
+  // CHECK: call void @llvm.nvvm.barrier0()
+  // CHECK: [[WORK:%.+]] = load i64, i64 addrspace(3)* [[OMP_WID]],
+  // CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i64 [[WORK]], 0
+  // CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]]
+  //
+  // CHECK: [[SEL_WORKERS]]
+  // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  // CHECK: [[NT:%.+]] = load i32, i32 addrspace(3)* [[OMP_NT]]
+  // CHECK: [[IS_ACTIVE:%.+]] = icmp slt i32 [[TID]], [[NT]]
+  // CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]]
+  //
+  // CHECK: [[EXEC_PARALLEL]]
+  // CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
+  //
+  // CHECK: [[TERM_PARALLEL]]
+  // CHECK: br label {{%?}}[[BAR_PARALLEL]]
+  //
+  // CHECK: [[BAR_PARALLEL]]
+  // CHECK: call void @llvm.nvvm.barrier0()
+  // CHECK: br label {{%?}}[[AWAIT_WORK]]
+  //
+  // CHECK: [[EXIT]]
+  // CHECK: ret void
+
+  // CHECK: define {{.*}}void [[T6:@__omp_offloading_.+S1.+l315]](
+  // Create local storage for each capture.
+  // CHECK:       [[LOCAL_THIS:%.+]] = alloca [[S1:%struct.*]]*
+  // CHECK:       [[LOCAL_B:%.+]] = alloca i[[SZ]]
+  // CHECK:       [[LOCAL_VLA1:%.+]] = alloca i[[SZ]]
+  // CHECK:       [[LOCAL_VLA2:%.+]] = alloca i[[SZ]]
+  // CHECK:       [[LOCAL_C:%.+]] = alloca i16*
+  // CHECK-DAG:   store [[S1]]* [[ARG_THIS:%.+]], [[S1]]** [[LOCAL_THIS]]
+  // CHECK-DAG:   store i[[SZ]] [[ARG_B:%.+]], i[[SZ]]* [[LOCAL_B]]
+  // CHECK-DAG:   store i[[SZ]] [[ARG_VLA1:%.+]], i[[SZ]]* [[LOCAL_VLA1]]
+  // CHECK-DAG:   store i[[SZ]] [[ARG_VLA2:%.+]], i[[SZ]]* [[LOCAL_VLA2]]
+  // CHECK-DAG:   store i16* [[ARG_C:%.+]], i16** [[LOCAL_C]]
+  // Store captures in the context.
+  // CHECK-DAG:   [[REF_THIS:%.+]] = load [[S1]]*, [[S1]]** [[LOCAL_THIS]],
+  // CHECK-64-DAG:[[REF_B:%.+]] = bitcast i[[SZ]]* [[LOCAL_B]] to i32*
+  // CHECK-DAG:   [[VAL_VLA1:%.+]] = load i[[SZ]], i[[SZ]]* [[LOCAL_VLA1]],
+  // CHECK-DAG:   [[VAL_VLA2:%.+]] = load i[[SZ]], i[[SZ]]* [[LOCAL_VLA2]],
+  // CHECK-DAG:   [[REF_C:%.+]] = load i16*, i16** [[LOCAL_C]],
+  // CHECK: [[NTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+  // CHECK: [[A:%.+]] = sub i32 [[WS]], 1
+  // CHECK: [[B:%.+]] = sub i32 [[NTID]], 1
+  // CHECK: [[MID:%.+]] = and i32 [[B]],
+  // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  // CHECK: [[EXCESS:%.+]] = icmp ugt i32 [[TID]], [[MID]]
+  // CHECK: br i1 [[EXCESS]], label {{%?}}[[EXIT:.+]], label {{%?}}[[CHECK_WORKER:.+]]
+  //
+  // CHECK: [[CHECK_WORKER]]
+  // CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[MID]]
+  // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[MASTER:.+]]
+  //
+  // CHECK: [[WORKER]]
+  // CHECK: call void [[T6]]_worker()
+  // CHECK: br label {{%?}}[[EXIT]]
+  //
+  // CHECK: [[MASTER]]
+  // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  // CHECK: call void @__kmpc_kernel_init(i32 0, i32 [[TID]])
+  // Use captures.
+  // CHECK-DAG:   getelementptr inbounds [[S1]], [[S1]]* [[REF_THIS]], i32 0, i32 0
+  // CHECK-64-DAG:load i32, i32* [[REF_B]]
+  // CHECK-32-DAG:load i32, i32* [[LOCAL_B]]
+  // CHECK-DAG:   getelementptr inbounds i16, i16* [[REF_C]], i[[SZ]] %{{.+}}
+  // CHECK: br label {{%?}}[[TERM:.+]]
+  //
+  // CHECK: [[TERM]]
+  // CHECK: store i64 0, i64 addrspace(3)* [[OMP_WID]],
+  // CHECK: call void @llvm.nvvm.barrier0()
+  // CHECK: br label {{%?}}[[EXIT]]
+  //
+  // CHECK: [[EXIT]]
+  // CHECK: ret void
+
+
+
+  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l280}}_worker()
+  // CHECK: br label {{%?}}[[AWAIT_WORK:.+]]
+  //
+  // CHECK: [[AWAIT_WORK]]
+  // CHECK: call void @llvm.nvvm.barrier0()
+  // CHECK: [[WORK:%.+]] = load i64, i64 addrspace(3)* [[OMP_WID]],
+  // CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i64 [[WORK]], 0
+  // CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]]
+  //
+  // CHECK: [[SEL_WORKERS]]
+  // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  // CHECK: [[NT:%.+]] = load i32, i32 addrspace(3)* [[OMP_NT]]
+  // CHECK: [[IS_ACTIVE:%.+]] = icmp slt i32 [[TID]], [[NT]]
+  // CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]]
+  //
+  // CHECK: [[EXEC_PARALLEL]]
+  // CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
+  //
+  // CHECK: [[TERM_PARALLEL]]
+  // CHECK: br label {{%?}}[[BAR_PARALLEL]]
+  //
+  // CHECK: [[BAR_PARALLEL]]
+  // CHECK: call void @llvm.nvvm.barrier0()
+  // CHECK: br label {{%?}}[[AWAIT_WORK]]
+  //
+  // CHECK: [[EXIT]]
+  // CHECK: ret void
+
+  // CHECK: define {{.*}}void [[T7:@__omp_offloading_.+template.+l280]](i[[SZ]]
+  // Create local storage for each capture.
+  // CHECK:  [[LOCAL_A:%.+]] = alloca i[[SZ]]
+  // CHECK:  [[LOCAL_AA:%.+]] = alloca i[[SZ]]
+  // CHECK:  [[LOCAL_B:%.+]] = alloca [10 x i32]*
+  // CHECK-DAG:  store i[[SZ]] [[ARG_A:%.+]], i[[SZ]]* [[LOCAL_A]]
+  // CHECK-DAG:  store i[[SZ]] [[ARG_AA:%.+]], i[[SZ]]* [[LOCAL_AA]]
+  // CHECK-DAG:   store [10 x i32]* [[ARG_B:%.+]], [10 x i32]** [[LOCAL_B]]
+  // Store captures in the context.
+  // CHECK-64-DAG:[[REF_A:%.+]] = bitcast i[[SZ]]* [[LOCAL_A]] to i32*
+  // CHECK-DAG:   [[REF_AA:%.+]] = bitcast i[[SZ]]* [[LOCAL_AA]] to i16*
+  // CHECK-DAG:   [[REF_B:%.+]] = load [10 x i32]*, [10 x i32]** [[LOCAL_B]],
+  //
+  // CHECK: [[NTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+  // CHECK: [[A:%.+]] = sub i32 [[WS]], 1
+  // CHECK: [[B:%.+]] = sub i32 [[NTID]], 1
+  // CHECK: [[MID:%.+]] = and i32 [[B]],
+  // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  // CHECK: [[EXCESS:%.+]] = icmp ugt i32 [[TID]], [[MID]]
+  // CHECK: br i1 [[EXCESS]], label {{%?}}[[EXIT:.+]], label {{%?}}[[CHECK_WORKER:.+]]
+  //
+  // CHECK: [[CHECK_WORKER]]
+  // CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[MID]]
+  // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[MASTER:.+]]
+  //
+  // CHECK: [[WORKER]]
+  // CHECK: call void [[T7]]_worker()
+  // CHECK: br label {{%?}}[[EXIT]]
+  //
+  // CHECK: [[MASTER]]
+  // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  // CHECK: call void @__kmpc_kernel_init(i32 0, i32 [[TID]])
+  //
+  // CHECK-64-DAG: load i32, i32* [[REF_A]]
+  // CHECK-32-DAG: load i32, i32* [[LOCAL_A]]
+  // CHECK-DAG:    load i16, i16* [[REF_AA]]
+  // CHECK-DAG:    getelementptr inbounds [10 x i32], [10 x i32]* [[REF_B]], i[[SZ]] 0, i[[SZ]] 2
+  //
+  // CHECK: br label {{%?}}[[TERM:.+]]
+  //
+  // CHECK: [[TERM]]
+  // CHECK: store i64 0, i64 addrspace(3)* [[OMP_WID]],
+  // CHECK: call void @llvm.nvvm.barrier0()
+  // CHECK: br label {{%?}}[[EXIT]]
+  //
+  // CHECK: [[EXIT]]
+  // CHECK: ret void
+#endif
diff --git a/test/OpenMP/nvptx_target_firstprivate_codegen.cpp b/test/OpenMP/nvptx_target_firstprivate_codegen.cpp
new file mode 100644
index 0000000000000..5dcff8e548426
--- /dev/null
+++ b/test/OpenMP/nvptx_target_firstprivate_codegen.cpp
@@ -0,0 +1,223 @@
+
+// Test target codegen - host bc file has to be created first.
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+template<typename tx, typename ty>
+struct TT{
+  tx X;
+  ty Y;
+};
+
+// TCHECK:  [[TT:%.+]] = type { i64, i8 }
+// TCHECK:  [[S1:%.+]] = type { double }
+
+int foo(int n, double *ptr) {
+  int a = 0;
+  short aa = 0;
+  float b[10];
+  double c[5][10];
+  TT<long long, char> d;
+  
+  #pragma omp target firstprivate(a)
+  {
+  }
+  
+  // TCHECK:  define void @__omp_offloading_{{.+}}(i{{[0-9]+}} [[A_IN:%.+]])
+  // TCHECK:  [[A_ADDR:%.+]] = alloca i{{[0-9]+}},
+  // TCHECK-NOT:  alloca i{{[0-9]+}},
+  // TCHECK:  store i{{[0-9]+}} [[A_IN]], i{{[0-9]+}}* [[A_ADDR]],
+  // TCHECK:  ret void  
+
+#pragma omp target firstprivate(aa,b,c,d)
+  {
+    aa += 1;
+    b[2] = 1.0;
+    c[1][2] = 1.0;
+    d.X = 1;
+    d.Y = 1;    
+  }
+  
+  // make sure that firstprivate variables are generated in all cases and that we use those instances for operations inside the
+  // target region
+  // TCHECK:  define void @__omp_offloading_{{.+}}(i{{[0-9]+}} [[A2_IN:%.+]], [10 x float]* {{.+}} [[B_IN:%.+]], [5 x [10 x double]]* {{.+}} [[C_IN:%.+]], [[TT]]* {{.+}} [[D_IN:%.+]])
+  // TCHECK:  [[A2_ADDR:%.+]] = alloca i{{[0-9]+}},
+  // TCHECK:  [[B_ADDR:%.+]] = alloca [10 x float]*,
+  // TCHECK:  [[C_ADDR:%.+]] = alloca [5 x [10 x double]]*,
+  // TCHECK:  [[D_ADDR:%.+]] = alloca [[TT]]*,
+  // TCHECK-NOT: alloca i{{[0-9]+}},
+  // TCHECK:  [[B_PRIV:%.+]] = alloca [10 x float],
+  // TCHECK:  [[C_PRIV:%.+]] = alloca [5 x [10 x double]],
+  // TCHECK:  [[D_PRIV:%.+]] = alloca [[TT]],
+  // TCHECK:  store i{{[0-9]+}} [[A2_IN]], i{{[0-9]+}}* [[A2_ADDR]],
+  // TCHECK:  store [10 x float]* [[B_IN]], [10 x float]** [[B_ADDR]],
+  // TCHECK:  store [5 x [10 x double]]* [[C_IN]], [5 x [10 x double]]** [[C_ADDR]],
+  // TCHECK:  store [[TT]]* [[D_IN]], [[TT]]** [[D_ADDR]],
+  // TCHECK:  [[CONV_A2ADDR:%.+]] = bitcast i{{[0-9]+}}* [[A2_ADDR]] to i{{[0-9]+}}*
+  // TCHECK:  [[B_ADDR_REF:%.+]] = load [10 x float]*, [10 x float]** [[B_ADDR]],
+  // TCHECK:  [[C_ADDR_REF:%.+]] = load [5 x [10 x double]]*, [5 x [10 x double]]** [[C_ADDR]],
+  // TCHECK:  [[D_ADDR_REF:%.+]] = load [[TT]]*, [[TT]]** [[D_ADDR]],
+
+  // firstprivate(aa): a_priv = a_in
+
+  //  firstprivate(b): memcpy(b_priv,b_in)
+  // TCHECK:  [[B_PRIV_BCAST:%.+]] = bitcast [10 x float]* [[B_PRIV]] to i8*
+  // TCHECK:  [[B_ADDR_REF_BCAST:%.+]] = bitcast [10 x float]* [[B_ADDR_REF]] to i8*
+  // TCHECK:  call void @llvm.memcpy.{{.+}}(i8* [[B_PRIV_BCAST]], i8* [[B_ADDR_REF_BCAST]], {{.+}})
+
+  // firstprivate(c)
+  // TCHECK:  [[C_PRIV_BCAST:%.+]] = bitcast [5 x [10 x double]]* [[C_PRIV]] to i8*
+  // TCHECK:  [[C_IN_BCAST:%.+]] = bitcast [5 x [10 x double]]* [[C_ADDR_REF]] to i8*
+  // TCHECK:  call void @llvm.memcpy.{{.+}}(i8* [[C_PRIV_BCAST]], i8* [[C_IN_BCAST]],{{.+}})
+  
+  // firstprivate(d)
+  // TCHECK:  [[D_PRIV_BCAST:%.+]] = bitcast [[TT]]* [[D_PRIV]] to i8*
+  // TCHECK:  [[D_IN_BCAST:%.+]] = bitcast [[TT]]* [[D_ADDR_REF]] to i8*
+  // TCHECK:  call void @llvm.memcpy.{{.+}}(i8* [[D_PRIV_BCAST]], i8* [[D_IN_BCAST]],{{.+}})
+
+  // TCHECK: load i16, i16* [[CONV_A2ADDR]],
+
+  
+  #pragma omp target firstprivate(ptr)
+  {
+    ptr[0]++;
+  }
+
+  // TCHECK:  define void @__omp_offloading_{{.+}}(double* [[PTR_IN:%.+]])
+  // TCHECK:  [[PTR_ADDR:%.+]] = alloca double*,
+  // TCHECK-NOT: alloca double*,
+  // TCHECK:  store double* [[PTR_IN]], double** [[PTR_ADDR]],
+  // TCHECK:  [[PTR_IN_REF:%.+]] = load double*, double** [[PTR_ADDR]],
+  // TCHECK-NOT:  store double* [[PTR_IN_REF]], double** [[PTR_PRIV]],
+
+  return a;
+}
+
+
+template<typename tx>
+tx ftemplate(int n) {
+  tx a = 0;
+  tx b[10];
+
+#pragma omp target firstprivate(a,b)
+  {
+    a += 1;
+    b[2] += 1;
+  }
+
+  return a;
+}
+
+static
+int fstatic(int n) {
+  int a = 0;
+  char aaa = 0;
+  int b[10];
+
+#pragma omp target firstprivate(a,aaa,b)
+  {
+    a += 1;
+    aaa += 1;
+    b[2] += 1;
+  }
+
+  return a;
+}
+
+// TCHECK: define void @__omp_offloading_{{.+}}(i{{[0-9]+}} [[A_IN:%.+]], i{{[0-9]+}} [[A3_IN:%.+]], [10 x i{{[0-9]+}}]*{{.+}} [[B_IN:%.+]])
+// TCHECK:  [[A_ADDR:%.+]] = alloca i{{[0-9]+}},
+// TCHECK:  [[A3_ADDR:%.+]] = alloca i{{[0-9]+}},
+// TCHECK:  [[B_ADDR:%.+]] = alloca [10 x i{{[0-9]+}}]*,
+// TCHECK-NOT:  alloca i{{[0-9]+}},
+// TCHECK:  [[B_PRIV:%.+]] = alloca [10 x i{{[0-9]+}}],
+// TCHECK:  store i{{[0-9]+}} [[A_IN]], i{{[0-9]+}}* [[A_ADDR]],
+// TCHECK:  store i{{[0-9]+}} [[A3_IN]], i{{[0-9]+}}* [[A3_ADDR]],
+// TCHECK:  store [10 x i{{[0-9]+}}]* [[B_IN]], [10 x i{{[0-9]+}}]** [[B_ADDR]],
+// TCHECK-64:  [[A_CONV:%.+]] = bitcast i{{[0-9]+}}* [[A_ADDR]] to i{{[0-9]+}}*
+// TCHECK:  [[A3_CONV:%.+]] = bitcast i{{[0-9]+}}* [[A3_ADDR]] to i8*
+// TCHECK:  [[B_ADDR_REF:%.+]] = load [10 x i{{[0-9]+}}]*, [10 x i{{[0-9]+}}]** [[B_ADDR]],
+
+// firstprivate(a): a_priv = a_in
+
+// firstprivate(aaa)
+
+// TCHECK-NOT:  store i{{[0-9]+}} %{{.+}}, i{{[0-9]+}}*
+
+// firstprivate(b)
+// TCHECK:  [[B_PRIV_BCAST:%.+]] = bitcast [10 x i{{[0-9]+}}]* [[B_PRIV]] to i8*
+// TCHECK:  [[B_IN_BCAST:%.+]] = bitcast [10 x i{{[0-9]+}}]* [[B_ADDR_REF]] to i8*
+// TCHECK:  call void @llvm.memcpy.{{.+}}(i8* [[B_PRIV_BCAST]], i8* [[B_IN_BCAST]],{{.+}})
+
+// TCHECK:  ret void
+
+struct S1 {
+  double a;
+
+  int r1(int n){
+    int b = n+1;
+
+#pragma omp target firstprivate(b)
+    {
+      this->a = (double)b + 1.5;
+    }
+
+    return (int)b;
+  }
+
+  // TCHECK: define void @__omp_offloading_{{.+}}([[S1]]* [[TH:%.+]], i{{[0-9]+}} [[B_IN:%.+]])
+  // TCHECK:  [[TH_ADDR:%.+]] = alloca [[S1]]*,
+  // TCHECK:  [[B_ADDR:%.+]] = alloca i{{[0-9]+}},
+  // TCHECK-NOT: alloca i{{[0-9]+}},
+
+  // TCHECK:  store [[S1]]* [[TH]], [[S1]]** [[TH_ADDR]],
+  // TCHECK:  store i{{[0-9]+}} [[B_IN]], i{{[0-9]+}}* [[B_ADDR]],
+  // TCHECK:  [[TH_ADDR_REF:%.+]] = load [[S1]]*, [[S1]]** [[TH_ADDR]],
+  // TCHECK-64:  [[B_ADDR_CONV:%.+]] = bitcast i{{[0-9]+}}* [[B_ADDR]] to i{{[0-9]+}}*
+
+  // firstprivate(b)
+  // TCHECK-NOT:  store i{{[0-9]+}} %{{.+}}, i{{[0-9]+}}*
+
+  // TCHECK: ret void
+};
+
+
+
+int bar(int n, double *ptr){
+  int a = 0;
+  a += foo(n, ptr);
+  S1 S;
+  a += S.r1(n);
+  a += fstatic(n);
+  a += ftemplate<int>(n);
+
+  return a;
+}
+
+// template
+
+// TCHECK: define void @__omp_offloading_{{.+}}(i{{[0-9]+}} [[A_IN:%.+]], [10 x i{{[0-9]+}}]*{{.+}} [[B_IN:%.+]])
+// TCHECK:  [[A_ADDR:%.+]] = alloca i{{[0-9]+}},
+// TCHECK:  [[B_ADDR:%.+]] = alloca [10 x i{{[0-9]+}}]*,
+// TCHECK-NOT: alloca i{{[0-9]+}},
+// TCHECK:  [[B_PRIV:%.+]] = alloca [10 x i{{[0-9]+}}],
+// TCHECK:  store i{{[0-9]+}} [[A_IN]], i{{[0-9]+}}* [[A_ADDR]],
+// TCHECK:  store [10 x i{{[0-9]+}}]* [[B_IN]], [10 x i{{[0-9]+}}]** [[B_ADDR]],
+// TCHECK-64:  [[A_ADDR_CONV:%.+]] = bitcast i{{[0-9]+}}* [[A_ADDR]] to i{{[0-9]+}}*
+// TCHECK:  [[B_ADDR_REF:%.+]] = load [10 x i{{[0-9]+}}]*, [10 x i{{[0-9]+}}]** [[B_ADDR]],
+
+// firstprivate(a)
+// TCHECK-NOT:  store i{{[0-9]+}} %{{.+}}, i{{[0-9]+}}*
+
+// firstprivate(b)
+// TCHECK:  [[B_PRIV_BCAST:%.+]] = bitcast [10 x i{{[0-9]+}}]* [[B_PRIV]] to i8*
+// TCHECK:  [[B_IN_BCAST:%.+]] = bitcast [10 x i{{[0-9]+}}]* [[B_ADDR_REF]] to i8*
+// TCHECK:  call void @llvm.memcpy.{{.+}}(i8* [[B_PRIV_BCAST]], i8* [[B_IN_BCAST]],{{.+}})
+
+// TCHECK: ret void
+
+#endif
diff --git a/test/OpenMP/nvptx_teams_codegen.cpp b/test/OpenMP/nvptx_teams_codegen.cpp
new file mode 100644
index 0000000000000..b26d47c706aca
--- /dev/null
+++ b/test/OpenMP/nvptx_teams_codegen.cpp
@@ -0,0 +1,132 @@
+// Test target codegen - host bc file has to be created first.
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-64
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-32
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+#ifdef CK1
+
+template <typename T>
+int tmain(T argc) {
+#pragma omp target
+#pragma omp teams
+  argc = 0;
+  return 0;
+}
+
+
+int main (int argc, char **argv) {
+#pragma omp target
+#pragma omp teams
+  {
+  argc = 0;
+  }
+  return tmain(argv);
+}
+
+// only nvptx side: do not outline teams region and do not call fork_teams
+// CK1:  define {{.*}}void @{{[^,]+}}(i{{[0-9]+}} [[ARGC:%.+]])
+// CK1:  {{.+}} = alloca i{{[0-9]+}}*,
+// CK1:  {{.+}} = alloca i{{[0-9]+}}*,
+// CK1:  [[ARGCADDR_PTR:%.+]] = alloca i{{[0-9]+}}*,
+// CK1:  [[ARGCADDR:%.+]] = alloca i{{[0-9]+}},
+// CK1:  store {{.+}} 0, {{.+}},
+// CK1:  store i{{[0-9]+}} [[ARGC]], i{{[0-9]+}}* [[ARGCADDR]],
+// CK1-64:  [[CONV:%.+]] = bitcast i{{[0-9]+}}* [[ARGCADDR]] to i{{[0-9]+}}*
+// CK1-64:  store i{{[0-9]+}}* [[CONV]], i{{[0-9]+}}** [[ARGCADDR_PTR]],
+// CK1-32:  store i{{[0-9]+}}* [[ARGCADDR]], i{{[0-9]+}}** [[ARGCADDR_PTR]],
+// CK1:  [[ARGCADDR_PTR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[ARGCADDR_PTR]],
+// CK1:  store i{{[0-9]+}} 0, i{{[0-9]+}}* [[ARGCADDR_PTR_REF]],
+// CK1-NOT: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(
+// CK1:  ret void
+// CK1-NEXT: }
+
+// target region in template
+// CK1: define {{.*}}void @{{[^,]+}}(i{{.+}}** [[ARGC:%.+]])
+// CK1: [[ARGCADDR_PTR:%.+]] = alloca i{{.+}}***,
+// CK1: [[ARGCADDR:%.+]] = alloca i{{.+}}**,
+// CK1: store i{{.+}}** [[ARGC]], i{{.+}}*** [[ARGCADDR]]
+// CK1: store i8*** [[ARGCADDR]], i8**** [[ARGCADDR_PTR]],
+// CK1: [[ARGCADDR_PTR_REF:%.+]] = load i{{.+}}**, i{{.+}}*** [[ARGCADDR_PTR]],
+// CK1: store i{{[0-9]+}}** null, i{{[0-9]+}}*** [[ARGCADDR_PTR_REF]],
+// CK1-NOT: call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(
+// CK1:  ret void
+// CK1-NEXT: }
+
+
+#endif // CK1
+
+// Test target codegen - host bc file has to be created first.
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-64
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-32
+// expected-no-diagnostics
+#ifdef CK2
+
+template <typename T>
+int tmain(T argc) {
+  int a = 10;
+  int b = 5;
+#pragma omp target
+#pragma omp teams num_teams(a) thread_limit(b)
+  {
+  argc = 0;
+  }
+  return 0;
+}
+
+int main (int argc, char **argv) {
+  int a = 20;
+  int b = 5;
+#pragma omp target
+#pragma omp teams num_teams(a) thread_limit(b)
+  {
+  argc = 0;
+  }
+  return tmain(argv);
+}
+
+// CK2: define {{.*}}void @{{[^,]+}}(i{{[0-9]+}} [[A_IN:%.+]], i{{[0-9]+}} [[B_IN:%.+]], i{{[0-9]+}} [[ARGC_IN:.+]])
+// CK2: {{.}} = alloca i{{[0-9]+}}*,
+// CK2: {{.}} = alloca i{{[0-9]+}}*,
+// CK2: [[ARGCADDR_PTR:%.+]] = alloca i{{[0-9]+}}*,
+// CK2: [[AADDR:%.+]] = alloca i{{[0-9]+}},
+// CK2: [[BADDR:%.+]] = alloca i{{[0-9]+}},
+// CK2: [[ARGCADDR:%.+]] = alloca i{{[0-9]+}},
+// CK2-NOT:  {{%.+}} = call i32 @__kmpc_global_thread_num(
+// CK2: store i{{[0-9]+}} [[A_IN]], i{{[0-9]+}}* [[AADDR]],
+// CK2: store i{{[0-9]+}} [[B_IN]], i{{[0-9]+}}* [[BADDR]],
+// CK2: store i{{[0-9]+}} [[ARGC_IN]], i{{[0-9]+}}* [[ARGCADDR]],
+// CK2-64: [[ACONV:%.+]] = bitcast i64* [[AADDR]] to i32*
+// CK2-64: [[BCONV:%.+]] = bitcast i64* [[BADDR]] to i32*
+// CK2-64: [[CONV:%.+]] = bitcast i64* [[ARGCADDR]] to i32*
+// CK2-64:  store i{{[0-9]+}}* [[CONV]], i{{[0-9]+}}** [[ARGCADDR_PTR]],
+// CK2-32:  store i{{[0-9]+}}* [[ARGCADDR]], i{{[0-9]+}}** [[ARGCADDR_PTR]],
+// CK2:  [[ARGCADDR_PTR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[ARGCADDR_PTR]],
+// CK2: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[ARGCADDR_PTR_REF]],
+// CK2-NOT:  {{.+}} = call i32 @__kmpc_push_num_teams(
+// CK2-NOT:  call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(
+// CK2: ret
+
+// CK2: define {{.*}}void @{{[^,]+}}(i{{[0-9]+}} [[A_IN:%.+]], i{{[0-9]+}} [[BP:%.+]], i{{[0-9]+}}** [[ARGC:%.+]])
+// CK2: [[ARGCADDR_PTR:%.+]] = alloca i{{[0-9]+}}***,
+// CK2: [[AADDR:%.+]] = alloca i{{[0-9]+}},
+// CK2: [[BADDR:%.+]] = alloca i{{[0-9]+}},
+// CK2: [[ARGCADDR:%.+]] = alloca i{{[0-9]+}}**,
+// CK2-NOT: {{%.+}} = call i32 @__kmpc_global_thread_num(
+// CK2: store i{{[0-9]+}} [[A_IN]], i{{[0-9]+}}* [[AADDR]],
+// CK2: store i{{[0-9]+}} [[B_IN]], i{{[0-9]+}}* [[BADDR]],
+// CK2: store i{{[0-9]+}}** [[ARGC]], i{{[0-9]+}}*** [[ARGCADDR]],
+// CK2: store i{{[0-9]+}}*** [[ARGCADDR]], i{{[0-9]+}}**** [[ARGCADDR_PTR]],
+// CK2: [[ARGCADDR_PTR_REF:%.+]] = load i{{[0-9]+}}***, i{{[0-9]+}}**** [[ARGCADDR_PTR]],
+// CK2: store i{{[0-9]+}}** null, i{{[0-9]+}}*** [[ARGCADDR_PTR_REF]],
+// CK2-NOT: {{.+}} = call i32 @__kmpc_push_num_teams(
+// CK2-NOT: call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(
+// CK2:  ret void
+
+#endif // CK2
+#endif
diff --git a/test/OpenMP/ordered_doacross_codegen.cpp b/test/OpenMP/ordered_doacross_codegen.cpp
new file mode 100644
index 0000000000000..d1fe99d4b826b
--- /dev/null
+++ b/test/OpenMP/ordered_doacross_codegen.cpp
@@ -0,0 +1,124 @@
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// REQUIRES: x86-registered-target
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+// CHECK: [[KMP_DIM:%.+]] = type { i64, i64, i64 }
+extern int n;
+int a[10], b[10], c[10], d[10];
+void foo();
+
+// CHECK-LABEL: @main()
+int main() {
+  int i;
+// CHECK: [[DIMS:%.+]] = alloca [[KMP_DIM]],
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[IDENT:%.+]])
+// CHECK: icmp
+// CHECK-NEXT: br i1 %
+// CHECK: [[CAST:%.+]] = bitcast [[KMP_DIM]]* [[DIMS]] to i8*
+// CHECK: call void @llvm.memset.p0i8.i64(i8* [[CAST]], i8 0, i64 24, i32 8, i1 false)
+// CHECK: getelementptr inbounds [[KMP_DIM]], [[KMP_DIM]]* [[DIMS]], i32 0, i32 1
+// CHECK: store i64 %{{.+}}, i64* %
+// CHECK: getelementptr inbounds [[KMP_DIM]], [[KMP_DIM]]* [[DIMS]], i32 0, i32 2
+// CHECK: store i64 1, i64* %
+// CHECK: [[CAST:%.+]] = bitcast [[KMP_DIM]]* [[DIMS]] to i8*
+// CHECK: call void @__kmpc_doacross_init([[IDENT]], i32 [[GTID]], i32 1, i8* [[CAST]])
+// CHECK: call void @__kmpc_for_static_init_4(
+#pragma omp for ordered(1)
+  for (i = 0; i < n; ++i) {
+    a[i] = b[i] + 1;
+    foo();
+// CHECK: invoke void [[FOO:.+]](
+// CHECK: load i32, i32* [[CNT:%.+]],
+// CHECK-NEXT: sext i32 %{{.+}} to i64
+// CHECK-NEXT: store i64 %{{.+}}, i64* [[TMP:%.+]],
+// CHECK-NEXT: call void @__kmpc_doacross_post([[IDENT]], i32 [[GTID]], i64* [[TMP]])
+#pragma omp ordered depend(source)
+    c[i] = c[i] + 1;
+    foo();
+// CHECK: invoke void [[FOO]]
+// CHECK: load i32, i32* [[CNT]],
+// CHECK-NEXT: sub nsw i32 %{{.+}}, 2
+// CHECK-NEXT: sext i32 %{{.+}} to i64
+// CHECK-NEXT: store i64 %{{.+}}, i64* [[TMP:%.+]],
+// CHECK-NEXT: call void @__kmpc_doacross_wait([[IDENT]], i32 [[GTID]], i64* [[TMP]])
+#pragma omp ordered depend(sink : i - 2)
+    d[i] = a[i - 2];
+  }
+  // CHECK: landingpad
+  // CHECK: call void @__kmpc_doacross_fini([[IDENT]], i32 [[GTID]])
+  // CHECK: br label %
+
+  // CHECK: call void @__kmpc_for_static_fini(
+  // CHECK: call void @__kmpc_doacross_fini([[IDENT]], i32 [[GTID]])
+  // CHECK: ret i32 0
+  return 0;
+}
+
+// CHECK: define {{.+}}TestStruct
+template <typename T>
+struct TestStruct {
+  static const int M = 10;
+  static const int N = 20;
+  T i;
+  T a[N][M];
+  T b[N][M];
+  T foo(T, T);
+  T bar(T, T, T);
+  void baz(T, T);
+  TestStruct() {
+// CHECK: [[CNT:%.+]] = alloca i64,
+// CHECK: [[DIMS:%.+]] = alloca [[KMP_DIM]],
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[IDENT:%.+]])
+// CHECK: icmp
+// CHECK-NEXT: br i1 %
+// CHECK: [[CAST:%.+]] = bitcast [[KMP_DIM]]* [[DIMS]] to i8*
+// CHECK: call void @llvm.memset.p0i8.i64(i8* [[CAST]], i8 0, i64 24, i32 8, i1 false)
+// CHECK: getelementptr inbounds [[KMP_DIM]], [[KMP_DIM]]* [[DIMS]], i32 0, i32 1
+// CHECK: store i64 %{{.+}}, i64* %
+// CHECK: getelementptr inbounds [[KMP_DIM]], [[KMP_DIM]]* [[DIMS]], i32 0, i32 2
+// CHECK: store i64 1, i64* %
+// CHECK: [[CAST:%.+]] = bitcast [[KMP_DIM]]* [[DIMS]] to i8*
+// CHECK: call void @__kmpc_doacross_init([[IDENT]], i32 [[GTID]], i32 1, i8* [[CAST]])
+// CHECK: call void @__kmpc_for_static_init_8(
+#pragma omp for ordered(2)
+    for (T j = 0; j < M; j++)
+      for (i = 0; i < n; i += 2) {
+        a[i][j] = foo(i, j);
+// CHECK: invoke {{.+TestStruct.+foo}}
+// CHECK: load i64, i64* [[CNT]],
+// CHECK-NEXT: sub nsw i64 %{{.+}}, 1
+// CHECK-NEXT: store i64 %{{.+}}, i64* [[TMP:%.+]],
+// CHECK-NEXT: call void @__kmpc_doacross_wait([[IDENT]], i32 [[GTID]], i64* [[TMP]])
+// CHECK-NEXT: load i64, i64* [[CNT]],
+// CHECK-NEXT: load i32, i32* %
+// CHECK-NEXT: mul nsw i32 1, %
+// CHECK-NEXT: sext i32 %{{.+}} to i64
+// CHECK-NEXT: sub nsw i64 %
+// CHECK-NEXT: store i64 %{{.+}}, i64* [[TMP:%.+]],
+// CHECK-NEXT: call void @__kmpc_doacross_wait([[IDENT]], i32 [[GTID]], i64* [[TMP]])
+#pragma omp ordered depend(sink : j, i - 2) depend(sink : j - 1, i)
+        b[i][j] = bar(a[i][j], b[i - 1][j], b[i][j - 1]);
+// CHECK: invoke {{.+TestStruct.+bar}}
+// CHECK: load i64, i64* [[CNT]],
+// CHECK-NEXT: store i64 %{{.+}}, i64* [[TMP:%.+]],
+// CHECK-NEXT: call void @__kmpc_doacross_post([[IDENT]], i32 [[GTID]], i64* [[TMP]])
+#pragma omp ordered depend(source)
+        baz(a[i][j], b[i][j]);
+      }
+  }
+  // CHECK: landingpad
+  // CHECK: call void @__kmpc_doacross_fini([[IDENT]], i32 [[GTID]])
+  // CHECK: br label %
+
+  // CHECK: call void @__kmpc_for_static_fini(
+  // CHECK: call void @__kmpc_doacross_fini([[IDENT]], i32 [[GTID]])
+  // CHECK: ret
+};
+
+TestStruct<int> s;
+#endif // HEADER
diff --git a/test/OpenMP/parallel_ast_print.cpp b/test/OpenMP/parallel_ast_print.cpp
index 1e46fba48ca31..8a1533959e364 100644
--- a/test/OpenMP/parallel_ast_print.cpp
+++ b/test/OpenMP/parallel_ast_print.cpp
@@ -8,6 +8,113 @@
 
 void foo() {}
 
+struct S1 {
+  S1(): a(0) {}
+  S1(int v) : a(v) {}
+  int a;
+  typedef int type;
+  S1& operator +(const S1&);
+  S1& operator *(const S1&);
+  S1& operator &&(const S1&);
+  S1& operator ^(const S1&);
+};
+
+template <typename T>
+class S7 : public T {
+protected:
+  T a;
+  T b[100];
+  S7() : a(0) {}
+
+public:
+  S7(typename T::type v) : a(v) {
+#pragma omp parallel private(a) private(this->a) private(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+#pragma omp parallel firstprivate(a) firstprivate(this->a) firstprivate(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+#pragma omp parallel shared(a) shared(this->a) shared(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+#pragma omp parallel reduction(+ : a) reduction(*: b[:])
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S7 &operator=(S7 &s) {
+#pragma omp parallel private(a) private(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+#pragma omp parallel firstprivate(a) firstprivate(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+#pragma omp parallel shared(a) shared(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+#pragma omp parallel reduction(&& : this->a) reduction(^: b[s.a.a])
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+// CHECK: #pragma omp parallel private(this->a) private(this->a) private(this->S1::a)
+// CHECK: #pragma omp parallel firstprivate(this->a) firstprivate(this->a) firstprivate(this->S1::a)
+// CHECK: #pragma omp parallel shared(this->a) shared(this->a) shared(this->S1::a)
+// CHECK: #pragma omp parallel reduction(+: this->a) reduction(*: this->b[:])
+// CHECK: #pragma omp parallel private(this->a) private(this->a) private(T::a)
+// CHECK: #pragma omp parallel firstprivate(this->a) firstprivate(this->a) firstprivate(T::a)
+// CHECK: #pragma omp parallel shared(this->a) shared(this->a) shared(T::a)
+// CHECK: #pragma omp parallel reduction(+: this->a) reduction(*: this->b[:])
+// CHECK: #pragma omp parallel private(this->a) private(this->a)
+// CHECK: #pragma omp parallel firstprivate(this->a) firstprivate(this->a)
+// CHECK: #pragma omp parallel shared(this->a) shared(this->a)
+// CHECK: #pragma omp parallel reduction(&&: this->a) reduction(^: this->b[s.a.a])
+
+class S8 : public S7<S1> {
+  S8() {}
+
+public:
+  S8(int v) : S7<S1>(v){
+#pragma omp parallel private(a) private(this->a) private(S7 < S1 > ::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+#pragma omp parallel firstprivate(a) firstprivate(this->a) firstprivate(S7 < S1 > ::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+#pragma omp parallel shared(a) shared(this->a) shared(S7 < S1 > ::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+#pragma omp parallel reduction(^ : S7 < S1 > ::a) reduction(+ : S7 < S1 > ::b[ : S7 < S1 > ::a.a])
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S8 &operator=(S8 &s) {
+#pragma omp parallel private(a) private(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+#pragma omp parallel firstprivate(a) firstprivate(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+#pragma omp parallel shared(a) shared(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+#pragma omp parallel reduction(* : this->a) reduction(&&:this->b[a.a:])
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+// CHECK: #pragma omp parallel private(this->a) private(this->a) private(this->S7<S1>::a)
+// CHECK: #pragma omp parallel firstprivate(this->a) firstprivate(this->a) firstprivate(this->S7<S1>::a)
+// CHECK: #pragma omp parallel shared(this->a) shared(this->a) shared(this->S7<S1>::a)
+// CHECK: #pragma omp parallel reduction(^: this->S7<S1>::a) reduction(+: this->S7<S1>::b[:this->S7<S1>::a.a])
+// CHECK: #pragma omp parallel private(this->a) private(this->a)
+// CHECK: #pragma omp parallel firstprivate(this->a) firstprivate(this->a)
+// CHECK: #pragma omp parallel shared(this->a) shared(this->a)
+// CHECK: #pragma omp parallel reduction(*: this->a) reduction(&&: this->b[this->a.a:])
+
 template <class T>
 struct S {
   operator T() {return T();}
@@ -120,4 +227,7 @@ void foo(const Foo<int> &arg) {
   }
 }
 
+template<typename T>
+T S<T>::TS = 0;
+
 #endif
diff --git a/test/OpenMP/parallel_copyin_codegen.cpp b/test/OpenMP/parallel_copyin_codegen.cpp
index ff76cfe4dd6c8..49e7b3fd618ba 100644
--- a/test/OpenMP/parallel_copyin_codegen.cpp
+++ b/test/OpenMP/parallel_copyin_codegen.cpp
@@ -87,10 +87,6 @@ int main() {
   // TLS-LAMBDA:     [[G_CPY_VAL:%.+]] = call{{( cxx_fast_tlscc)?}} i{{[0-9]+}}* [[G_CTOR:@.+]]()
   // TLS-LAMBDA:     call {{.*}}void {{.+}} @__kmpc_fork_call({{.+}}, i32 1, {{.+}}* [[OMP_REGION:@.+]] to {{.+}}, i32* [[G_CPY_VAL]])
 
-  // TLS-LAMBDA:     define {{.*}}i{{[0-9]+}}* [[G_CTOR]]()
-  // TLS-LAMBDA:     ret i{{[0-9]+}}* [[G]]
-  // TLS-LAMBDA:     }
-
 #pragma omp parallel copyin(g)
   {
     // LAMBDA: define{{.*}} internal{{.*}} void [[OMP_REGION]](i32* noalias %{{.+}}, i32* noalias %{{.+}})
@@ -122,6 +118,11 @@ int main() {
     g = 1;
     // LAMBDA: call{{.*}} void [[INNER_LAMBDA:@.+]](%{{.+}}*
     // TLS-LAMBDA: call{{.*}} void [[INNER_LAMBDA:@.+]](%{{.+}}*
+
+    // TLS-LAMBDA:     define {{.*}}i{{[0-9]+}}* [[G_CTOR]]()
+    // TLS-LAMBDA:     ret i{{[0-9]+}}* [[G]]
+    // TLS-LAMBDA:     }
+
     [&]() {
       // LAMBDA: define {{.+}} void [[INNER_LAMBDA]](%{{.+}}* [[ARG_PTR:%.+]])
       // LAMBDA: store %{{.+}}* [[ARG_PTR]], %{{.+}}** [[ARG_PTR_REF:%.+]],
@@ -149,9 +150,6 @@ int main() {
   // TLS-BLOCKS:     [[G_CPY_VAL:%.+]] = call{{( cxx_fast_tlscc)?}} i{{[0-9]+}}* [[G_CTOR:@.+]]()
   // TLS-BLOCKS:     call {{.*}}void {{.+}} @__kmpc_fork_call({{.+}}, i32 1, {{.+}}* [[OMP_REGION:@.+]] to {{.+}}, i32* [[G_CPY_VAL]])
 
-  // TLS-BLOCKS:     define {{.*}}i{{[0-9]+}}* [[G_CTOR]]()
-  // TLS-BLOCKS:     ret i{{[0-9]+}}* [[G]]
-  // TLS-BLOCKS:     }
 #pragma omp parallel copyin(g)
   {
     // BLOCKS: define{{.*}} internal{{.*}} void [[OMP_REGION]](i32* noalias %{{.+}}, i32* noalias %{{.+}})
@@ -189,6 +187,10 @@ int main() {
     // TLS-BLOCKS: store volatile i{{[0-9]+}} 1, i{{[0-9]+}}* [[G_CAPTURE_DST]]
     // TLS-BLOCKS-NOT: [[G]]{{[[^:word:]]}}
     // TLS-BLOCKS: call {{.*}}void {{%.+}}(i8
+
+    // TLS-BLOCKS:     define {{.*}}i{{[0-9]+}}* [[G_CTOR]]()
+    // TLS-BLOCKS:     ret i{{[0-9]+}}* [[G]]
+    // TLS-BLOCKS:     }
     ^{
       // BLOCKS: define {{.+}} void {{@.+}}(i8*
       // TLS-BLOCKS: define {{.+}} void {{@.+}}(i8*
diff --git a/test/OpenMP/parallel_firstprivate_codegen.cpp b/test/OpenMP/parallel_firstprivate_codegen.cpp
index d0da8cea87ac0..d20f3f5135691 100644
--- a/test/OpenMP/parallel_firstprivate_codegen.cpp
+++ b/test/OpenMP/parallel_firstprivate_codegen.cpp
@@ -1,8 +1,15 @@
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple %itanium_abi_triple -emit-llvm %s -o - | FileCheck %s
-// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple %itanium_abi_triple -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -x c++ -triple %itanium_abi_triple -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -DLAMBDA -triple %itanium_abi_triple -emit-llvm %s -o - | FileCheck -check-prefix=LAMBDA %s
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -fblocks -DBLOCKS -triple %itanium_abi_triple -emit-llvm %s -o - | FileCheck -check-prefix=BLOCKS %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-32
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-32
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -DLAMBDA -triple i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck -check-prefix=LAMBDA -check-prefix=LAMBDA-32 %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -fblocks -DBLOCKS -triple i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck -check-prefix=BLOCKS -check-prefix=BLOCKS-32 %s
+
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-64
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -DLAMBDA -triple x86_64-pc-linux-gnu -emit-llvm %s -o - | FileCheck -check-prefix=LAMBDA -check-prefix=LAMBDA-64 %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -fblocks -DBLOCKS -triple x86_64-pc-linux-gnu -emit-llvm %s -o - | FileCheck -check-prefix=BLOCKS -check-prefix=BLOCKS-64 %s
+
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -DARRAY -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=ARRAY %s
 // expected-no-diagnostics
 #ifndef ARRAY
@@ -18,6 +25,60 @@ struct St {
 
 volatile int g __attribute__((aligned(128))) = 1212;
 
+struct SS {
+  int a;
+  int b : 4;
+  int &c;
+  int e[4];
+  SS(int &d) : a(0), b(0), c(d) {
+#pragma omp parallel firstprivate(a, b, c, e)
+#ifdef LAMBDA
+    [&]() {
+      ++this->a, --b, (this)->c /= 1;
+#pragma omp parallel firstprivate(a, b, c)
+      ++(this)->a, --b, this->c /= 1;
+    }();
+#elif defined(BLOCKS)
+    ^{
+      ++a;
+      --this->b;
+      (this)->c /= 1;
+#pragma omp parallel firstprivate(a, b, c)
+      ++(this)->a, --b, this->c /= 1;
+    }();
+#else
+    ++this->a, --b, c /= 1, e[2] = 1111;
+#endif
+  }
+};
+
+template<typename T>
+struct SST {
+  T a;
+  SST() : a(T()) {
+#pragma omp parallel firstprivate(a)
+#ifdef LAMBDA
+    [&]() {
+      [&]() {
+        ++this->a;
+#pragma omp parallel firstprivate(a)
+        ++(this)->a;
+      }();
+    }();
+#elif defined(BLOCKS)
+    ^{
+      ^{
+        ++a;
+#pragma omp parallel firstprivate(a)
+        ++(this)->a;
+      }();
+    }();
+#else
+    ++(this)->a;
+#endif
+  }
+};
+
 template <class T>
 struct S {
   T f;
@@ -28,14 +89,17 @@ struct S {
   ~S() {}
 };
 
+// CHECK: [[SS_TY:%.+]] = type { i{{[0-9]+}}, i8
+// LAMBDA: [[SS_TY:%.+]] = type { i{{[0-9]+}}, i8
+// BLOCKS: [[SS_TY:%.+]] = type { i{{[0-9]+}}, i8
 // CHECK-DAG: [[S_FLOAT_TY:%.+]] = type { float }
 // CHECK-DAG: [[S_INT_TY:%.+]] = type { i{{[0-9]+}} }
 // CHECK-DAG: [[ST_TY:%.+]] = type { i{{[0-9]+}}, i{{[0-9]+}} }
-// CHECK-DAG: [[IMPLICIT_BARRIER_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 66, i32 0, i32 0, i8*
 
 template <typename T>
 T tmain() {
   S<T> test;
+  SST<T> sst;
   T t_var __attribute__((aligned(128))) = T();
   T vec[] __attribute__((aligned(128))) = {1, 2};
   S<T> s_arr[] __attribute__((aligned(128))) = {1, 2};
@@ -52,33 +116,79 @@ T tmain() {
 
 int main() {
   static int sivar;
+  SS ss(sivar);
 #ifdef LAMBDA
   // LAMBDA: [[G:@.+]] = global i{{[0-9]+}} 1212,
   // LAMBDA-LABEL: @main
-  // LAMBDA: call{{.*}} void [[OUTER_LAMBDA:@.+]](
+  // LAMBDA: alloca [[SS_TY]],
+  // LAMBDA: alloca [[CAP_TY:%.+]],
+  // LAMBDA: call{{.*}} void [[OUTER_LAMBDA:@[^(]+]]([[CAP_TY]]*
   [&]() {
   // LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
   // LAMBDA: call {{.*}}void {{.+}} @__kmpc_fork_call({{.+}}, i32 2, {{.+}}* [[OMP_REGION:@.+]] to {{.+}}, i32* [[G]], {{.+}})
 #pragma omp parallel firstprivate(g, sivar)
   {
-    // LAMBDA: define{{.*}} internal{{.*}} void [[OMP_REGION]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i32* dereferenceable(4) %{{.+}})
-    // LAMBDA: [[G_PRIVATE_ADDR:%.+]] = alloca i{{[0-9]+}}, align 128
+    // LAMBDA: define {{.+}} @{{.+}}([[SS_TY]]*
+    // LAMBDA: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 0
+    // LAMBDA: store i{{[0-9]+}} 0, i{{[0-9]+}}* %
+    // LAMBDA: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 1
+    // LAMBDA: store i8
+    // LAMBDA: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 2
+    // LAMBDA: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 0
+    // LAMBDA: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 1
+    // LAMBDA: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 2
+    // LAMBDA: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 5, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[SS_TY]]*, [[iz:i64|i32]], {{i64|i32}}, {{i64|i32}}, [4 x i{{[0-9]+}}]*)* [[SS_MICROTASK:@.+]] to void
+    // LAMBDA: ret
+
+    // LAMBDA: define internal void [[SS_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[SS_TY]]* %{{.+}}, [[iz]] {{.+}}, [[iz]] {{.+}}, [[iz]] {{.+}}, [4 x i{{[0-9]+}}]* {{.+}})
+    // LAMBDA-NOT: getelementptr {{.*}}[[SS_TY]], [[SS_TY]]* %
+    // LAMBDA: call{{.*}} void
+    // LAMBDA: ret void
+
+    // LAMBDA: define internal void @{{.+}}(i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[SS_TY]]* %{{.+}}, [[iz]] {{.+}}, [[iz]] {{.+}}, [[iz]] {{.+}})
+    // LAMBDA: [[A_PRIV:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: [[B_PRIV:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: [[C_PRIV:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA-64: [[A_CONV:%.+]] = bitcast i64* [[A_PRIV]] to i32*
+    // LAMBDA-64: store i32* [[A_CONV]], i32** [[REFA:%.+]],
+    // LAMBDA-32: store i32* [[A_PRIV]], i32** [[REFA:%.+]],
+    // LAMBDA-64: [[B_CONV:%.+]] = bitcast i64* [[B_PRIV]] to i32*
+    // LAMBDA-64: [[C_CONV:%.+]] = bitcast i64* [[C_PRIV]] to i32*
+    // LAMBDA-64: store i32* [[C_CONV]], i32** [[REFC:%.+]],
+    // LAMBDA-32: store i32* [[C_PRIV]], i32** [[REFC:%.+]],
+    // LAMBDA-NEXT: [[A_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFA]],
+    // LAMBDA-NEXT: [[A_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[A_PRIV]],
+    // LAMBDA-NEXT: [[INC:%.+]] = add nsw i{{[0-9]+}} [[A_VAL]], 1
+    // LAMBDA-NEXT: store i{{[0-9]+}} [[INC]], i{{[0-9]+}}* [[A_PRIV]],
+    // LAMBDA-64-NEXT: [[B_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[B_CONV]],
+    // LAMBDA-32-NEXT: [[B_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[B_PRIV]],
+    // LAMBDA-NEXT: [[DEC:%.+]] = add nsw i{{[0-9]+}} [[B_VAL]], -1
+    // LAMBDA-64-NEXT: store i{{[0-9]+}} [[DEC]], i{{[0-9]+}}* [[B_CONV]],
+    // LAMBDA-32-NEXT: store i{{[0-9]+}} [[DEC]], i{{[0-9]+}}* [[B_PRIV]],
+    // LAMBDA-NEXT: [[C_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFC]],
+    // LAMBDA-NEXT: [[C_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[C_PRIV]],
+    // LAMBDA-NEXT: [[DIV:%.+]] = sdiv i{{[0-9]+}} [[C_VAL]], 1
+    // LAMBDA-NEXT: store i{{[0-9]+}} [[DIV]], i{{[0-9]+}}* [[C_PRIV]],
+    // LAMBDA-NEXT: ret void
+
+    // LAMBDA: define{{.*}} internal{{.*}} void [[OMP_REGION]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i32* dereferenceable(4) %{{.+}}, [[iz]] {{.*}}%{{.+}})
     // LAMBDA: [[SIVAR_PRIVATE_ADDR:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: [[G_PRIVATE_ADDR:%.+]] = alloca i{{[0-9]+}}, align 128
     // LAMBDA: [[G_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[G_REF_ADDR:%.+]]
-    // LAMBDA: [[SIVAR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SIVAR_REF_ADDR:%.+]]
+    // LAMBDA-64: [[SIVAR_PRIVATE_CONV:%.+]] = bitcast i64* [[SIVAR_PRIVATE_ADDR]] to i32*
     // LAMBDA: [[G_VAL:%.+]] = load volatile i{{[0-9]+}}, i{{[0-9]+}}* [[G_REF]], align 128
     // LAMBDA: store i{{[0-9]+}} [[G_VAL]], i{{[0-9]+}}* [[G_PRIVATE_ADDR]], align 128
-    // LAMBDA: [[SIVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[SIVAR_REF]]
-    // LAMBDA: store i{{[0-9]+}} [[SIVAR_VAL]], i{{[0-9]+}}* [[SIVAR_PRIVATE_ADDR]]
-    // LAMBDA: call {{.*}}void @__kmpc_barrier(
+    // LAMBDA-NOT: call {{.*}}void @__kmpc_barrier(
     g = 1;
     sivar = 2;
     // LAMBDA: store i{{[0-9]+}} 1, i{{[0-9]+}}* [[G_PRIVATE_ADDR]],
-    // LAMBDA: store i{{[0-9]+}} 2, i{{[0-9]+}}* [[SIVAR_PRIVATE_ADDR]],
+    // LAMBDA-64: store i{{[0-9]+}} 2, i{{[0-9]+}}* [[SIVAR_PRIVATE_CONV]],
+    // LAMBDA-32: store i{{[0-9]+}} 2, i{{[0-9]+}}* [[SIVAR_PRIVATE_ADDR]],
     // LAMBDA: [[G_PRIVATE_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
     // LAMBDA: store i{{[0-9]+}}* [[G_PRIVATE_ADDR]], i{{[0-9]+}}** [[G_PRIVATE_ADDR_REF]]
     // LAMBDA: [[SIVAR_PRIVATE_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
-    // LAMBDA: store i{{[0-9]+}}* [[SIVAR_PRIVATE_ADDR]], i{{[0-9]+}}** [[SIVAR_PRIVATE_ADDR_REF]]
+    // LAMBDA-64: store i{{[0-9]+}}* [[SIVAR_PRIVATE_CONV]], i{{[0-9]+}}** [[SIVAR_PRIVATE_ADDR_REF]]
+    // LAMBDA-32: store i{{[0-9]+}}* [[SIVAR_PRIVATE_ADDR]], i{{[0-9]+}}** [[SIVAR_PRIVATE_ADDR_REF]]
     // LAMBDA: call{{.*}} void [[INNER_LAMBDA:@.+]](%{{.+}}* [[ARG]])
     [&]() {
       // LAMBDA: define {{.+}} void [[INNER_LAMBDA]](%{{.+}}* [[ARG_PTR:%.+]])
@@ -98,33 +208,32 @@ int main() {
 #elif defined(BLOCKS)
   // BLOCKS: [[G:@.+]] = global i{{[0-9]+}} 1212,
   // BLOCKS-LABEL: @main
+  // BLOCKS: call
   // BLOCKS: call {{.*}}void {{%.+}}(i8
   ^{
   // BLOCKS: define{{.*}} internal{{.*}} void {{.+}}(i8*
   // BLOCKS: call {{.*}}void {{.+}} @__kmpc_fork_call({{.+}}, i32 2, {{.+}}* [[OMP_REGION:@.+]] to {{.+}}, i32* [[G]], {{.+}})
 #pragma omp parallel firstprivate(g, sivar)
   {
-    // BLOCKS: define{{.*}} internal{{.*}} void [[OMP_REGION]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i32* dereferenceable(4) %{{.+}})
-    // BLOCKS: [[G_PRIVATE_ADDR:%.+]] = alloca i{{[0-9]+}}, align 128
+    // BLOCKS: define{{.*}} internal{{.*}} void [[OMP_REGION]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i32* dereferenceable(4) %{{.+}}, [[iz:i64|i32]] {{.*}}%{{.+}})
     // BLOCKS: [[SIVAR_PRIVATE_ADDR:%.+]] = alloca i{{[0-9]+}},
+    // BLOCKS: [[G_PRIVATE_ADDR:%.+]] = alloca i{{[0-9]+}}, align 128
     // BLOCKS: [[G_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[G_REF_ADDR:%.+]]
-    // BLOCKS: [[SIVAR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SIVAR_REF_ADDR:%.+]]
+    // BLOCKS-64: [[SIVAR_PRIVATE_CONV:%.+]] = bitcast i64* [[SIVAR_PRIVATE_ADDR]] to i32*
     // BLOCKS: [[G_VAL:%.+]] = load volatile i{{[0-9]+}}, i{{[0-9]+}}* [[G_REF]], align 128
     // BLOCKS: store i{{[0-9]+}} [[G_VAL]], i{{[0-9]+}}* [[G_PRIVATE_ADDR]], align 128
-    // BLOCK: [[SIVAR_REF_ADDR:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
-    // BLOCK: [[SIVAR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SIVAR_REF_ADDR]]
-    // BLOCKS: [[SIVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[SIVAR_REF]],
-    // BLOCKS: store i{{[0-9]+}} [[SIVAR_VAL]], i{{[0-9]+}}* [[SIVAR_PRIVATE_ADDR]],
-    // BLOCKS: call {{.*}}void @__kmpc_barrier(
+    // BLOCKS-NOT: call {{.*}}void @__kmpc_barrier(
     g = 1;
     sivar = 2;
     // BLOCKS: store i{{[0-9]+}} 1, i{{[0-9]+}}* [[G_PRIVATE_ADDR]],
-    // BLOCKS: store i{{[0-9]+}} 2, i{{[0-9]+}}* [[SIVAR_PRIVATE_ADDR]],
+    // BLOCKS-64: store i{{[0-9]+}} 2, i{{[0-9]+}}* [[SIVAR_PRIVATE_CONV]],
+    // BLOCKS-32: store i{{[0-9]+}} 2, i{{[0-9]+}}* [[SIVAR_PRIVATE_ADDR]],
     // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
     // BLOCKS: i{{[0-9]+}}* [[G_PRIVATE_ADDR]]
     // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
     // BLOCKS-NOT: [[SIVAR]]{{[[^:word:]]}}
-    // BLOCKS: i{{[0-9]+}}* [[SIVAR_PRIVATE_ADDR]]
+    // BLOCKS-64: i{{[0-9]+}}* [[SIVAR_PRIVATE_CONV]]
+    // BLOCKS-32: i{{[0-9]+}}* [[SIVAR_PRIVATE_ADDR]]
     // BLOCKS-NOT: [[SIVAR]]{{[[^:word:]]}}
     // BLOCKS: call {{.*}}void {{%.+}}(i8
     ^{
@@ -142,6 +251,48 @@ int main() {
   }
   }();
   return 0;
+// BLOCKS: define {{.+}} @{{.+}}([[SS_TY]]*
+// BLOCKS: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 0
+// BLOCKS: store i{{[0-9]+}} 0, i{{[0-9]+}}* %
+// BLOCKS: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 1
+// BLOCKS: store i8
+// BLOCKS: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 2
+// BLOCKS: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 0
+// BLOCKS: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 1
+// BLOCKS: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 2
+// BLOCKS: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 5, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[SS_TY]]*, [[iz]], [[iz]], [[iz]], [4 x i{{[0-9]+}}]*)* [[SS_MICROTASK:@.+]] to void
+// BLOCKS: ret
+
+// BLOCKS: define internal void [[SS_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[SS_TY]]* %{{.+}}, [[iz]] {{.+}}, [[iz]] {{.+}}, [[iz]] {{.+}}, [4 x i{{[0-9]+}}]* {{.+}})
+// BLOCKS-NOT: getelementptr {{.*}}[[SS_TY]], [[SS_TY]]* %
+// BLOCKS: call{{.*}} void
+// BLOCKS: ret void
+
+// BLOCKS: define internal void @{{.+}}(i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[SS_TY]]* %{{.+}}, [[iz]] {{.+}}, [[iz]] {{.+}}, [[iz]] {{.+}})
+// BLOCKS: [[A_PRIV:%.+]] = alloca i{{[0-9]+}},
+// BLOCKS: [[B_PRIV:%.+]] = alloca i{{[0-9]+}},
+// BLOCKS: [[C_PRIV:%.+]] = alloca i{{[0-9]+}},
+// BLOCKS-64: [[A_CONV:%.+]] = bitcast i64* [[A_PRIV]] to i32*
+// BLOCKS-64: store i32* [[A_CONV]], i32** [[REFA:%.+]],
+// BLOCKS-32: store i32* [[A_PRIV]], i32** [[REFA:%.+]],
+// BLOCKS-64: [[B_CONV:%.+]] = bitcast i64* [[B_PRIV]] to i32*
+// BLOCKS-64: [[C_CONV:%.+]] = bitcast i64* [[C_PRIV]] to i32*
+// BLOCKS-64: store i32* [[C_CONV]], i32** [[REFC:%.+]],
+// BLOCKS-32: store i32* [[C_PRIV]], i32** [[REFC:%.+]],
+// BLOCKS-NEXT: [[A_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFA]],
+// BLOCKS-NEXT: [[A_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[A_PRIV]],
+// BLOCKS-NEXT: [[INC:%.+]] = add nsw i{{[0-9]+}} [[A_VAL]], 1
+// BLOCKS-NEXT: store i{{[0-9]+}} [[INC]], i{{[0-9]+}}* [[A_PRIV]],
+// BLOCKS-64-NEXT: [[B_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[B_CONV]],
+// BLOCKS-32-NEXT: [[B_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[B_PRIV]],
+// BLOCKS-NEXT: [[DEC:%.+]] = add nsw i{{[0-9]+}} [[B_VAL]], -1
+// BLOCKS-64-NEXT: store i{{[0-9]+}} [[DEC]], i{{[0-9]+}}* [[B_CONV]],
+// BLOCKS-32-NEXT: store i{{[0-9]+}} [[DEC]], i{{[0-9]+}}* [[B_PRIV]],
+// BLOCKS-NEXT: [[C_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFC]],
+// BLOCKS-NEXT: [[C_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[C_PRIV]],
+// BLOCKS-NEXT: [[DIV:%.+]] = sdiv i{{[0-9]+}} [[C_VAL]], 1
+// BLOCKS-NEXT: store i{{[0-9]+}} [[DIV]], i{{[0-9]+}}* [[C_PRIV]],
+// BLOCKS-NEXT: ret void
 #else
   S<float> test;
   int t_var = 0;
@@ -162,27 +313,40 @@ int main() {
 
 // CHECK: define {{.*}}i{{[0-9]+}} @main()
 // CHECK: [[TEST:%.+]] = alloca [[S_FLOAT_TY]],
+// CHECK: [[T_VAR:%.+]] = alloca i32,
+// CHECK: [[T_VARCAST:%.+]] = alloca [[iz:i64|i32]],
+// CHECK: [[SIVARCAST:%.+]] = alloca [[iz]],
 // CHECK: call {{.*}} [[S_FLOAT_TY_DEF_CONSTR:@.+]]([[S_FLOAT_TY]]* [[TEST]])
-// CHECK: call {{.*}}void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 5, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [2 x i32]*, i32*, [2 x [[S_FLOAT_TY]]]*, [[S_FLOAT_TY]]*, i{{[0-9]+}}*)* [[MAIN_MICROTASK:@.+]] to void
+// CHECK: [[T_VARVAL:%.+]] = load i32, i32* [[T_VAR]],
+// CHECK-64: [[T_VARCONV:%.+]] = bitcast i64* [[T_VARCAST]] to i32*
+// CHECK-64: store i32 [[T_VARVAL]], i32* [[T_VARCONV]],
+// CHECK-32: store i32 [[T_VARVAL]], i32* [[T_VARCAST]],
+// CHECK: [[T_VARPVT:%.+]] = load [[iz]], [[iz]]* [[T_VARCAST]],
+// CHECK: [[SIVARVAL:%.+]] = load i32, i32* @{{.+}},
+// CHECK-64: [[SIVARCONV:%.+]] = bitcast i64* [[SIVARCAST]] to i32*
+// CHECK-64: store i32 [[SIVARVAL]], i32* [[SIVARCONV]],
+// CHECK-32: store i32 [[SIVARVAL]], i32* [[SIVARCAST]],
+// CHECK: [[SIVARPVT:%.+]] = load [[iz]], [[iz]]* [[SIVARCAST]],
+// CHECK: call {{.*}}void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 5, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [2 x i32]*, [[iz]], [2 x [[S_FLOAT_TY]]]*, [[S_FLOAT_TY]]*, i{{[0-9]+}})* [[MAIN_MICROTASK:@.+]] to void {{.*}}[[iz]] [[T_VARPVT]],{{.*}}[[iz]] [[SIVARPVT]]
 // CHECK: = call {{.*}}i{{.+}} [[TMAIN_INT:@.+]]()
 // CHECK: call {{.*}} [[S_FLOAT_TY_DESTR:@.+]]([[S_FLOAT_TY]]*
 // CHECK: ret
 //
-// CHECK: define internal {{.*}}void [[MAIN_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [2 x i32]* dereferenceable(8) %{{.+}}, i32* dereferenceable(4) %{{.+}}, [2 x [[S_FLOAT_TY]]]* dereferenceable(8) %{{.+}}, [[S_FLOAT_TY]]* dereferenceable(4) %{{.+}}, i32* dereferenceable(4) [[SIVAR:%.+]])
+// CHECK: define internal {{.*}}void [[MAIN_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [2 x i32]* dereferenceable(8) %{{.+}}, [[iz]] {{.*}}%{{.+}}, [2 x [[S_FLOAT_TY]]]* dereferenceable(8) %{{.+}}, [[S_FLOAT_TY]]* dereferenceable(4) %{{.+}}, [[iz]] {{.*}}[[SIVAR:%.+]])
 // CHECK: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[SIVAR7_PRIV:%.+]] = alloca i{{[0-9]+}},
 // CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}],
 // CHECK: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_FLOAT_TY]]],
 // CHECK: [[VAR_PRIV:%.+]] = alloca [[S_FLOAT_TY]],
-// CHECK: [[SIVAR7_PRIV:%.+]] = alloca i{{[0-9]+}},
 // CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
 
 // CHECK: [[VEC_REF:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** %
-// CHECK: [[T_VAR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** %
+// CHECK-NOT: load i{{[0-9]+}}*, i{{[0-9]+}}** %
+// CHECK-64: [[T_VAR_CONV:%.+]] = bitcast i64* [[T_VAR_PRIV]] to i32*
 // CHECK: [[S_ARR_REF:%.+]] = load [2 x [[S_FLOAT_TY]]]*, [2 x [[S_FLOAT_TY]]]** %
 // CHECK: [[VAR_REF:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** %
-// CHECK: [[SIVAR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** %{{.+}},
-// CHECK: [[T_VAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR_REF]],
-// CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_PRIV]],
+// CHECK-NOT: load i{{[0-9]+}}*, i{{[0-9]+}}** %
+// CHECK-64: [[SIVAR7_CONV:%.+]] = bitcast i64* [[SIVAR7_PRIV]] to i32*
 // CHECK: [[VEC_DEST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
 // CHECK: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_REF]] to i8*
 // CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST]], i8* [[VEC_SRC]],
@@ -200,9 +364,8 @@ int main() {
 // CHECK: call {{.*}} [[S_FLOAT_TY_COPY_CONSTR]]([[S_FLOAT_TY]]* [[VAR_PRIV]], [[S_FLOAT_TY]]* {{.*}} [[VAR_REF]], [[ST_TY]]* [[ST_TY_TEMP]])
 // CHECK: call {{.*}} [[ST_TY_DESTR]]([[ST_TY]]* [[ST_TY_TEMP]])
 
-// CHECK: [[SIVAR_REF_ADDR:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[SIVAR_REF]],
-// CHECK: store i{{[0-9]+}} [[SIVAR_REF_ADDR]], i{{[0-9]+}}* [[SIVAR7_PRIV]],
-// CHECK: store i{{[0-9]+}} 2, i{{[0-9]+}}* [[SIVAR7_PRIV]],
+// CHECK-64: store i{{[0-9]+}} 2, i{{[0-9]+}}* [[SIVAR7_CONV]],
+// CHECK-32: store i{{[0-9]+}} 2, i{{[0-9]+}}* [[SIVAR7_PRIV]],
 
 // CHECK-DAG: call {{.*}} [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]* [[VAR_PRIV]])
 // CHECK-DAG: call {{.*}} [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]*
@@ -214,6 +377,55 @@ int main() {
 // CHECK: call {{.*}} [[S_INT_TY_DESTR:@.+]]([[S_INT_TY]]*
 // CHECK: ret
 //
+// CHECK: define {{.+}} @{{.+}}([[SS_TY]]*
+// CHECK: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 0
+// CHECK: store i{{[0-9]+}} 0, i{{[0-9]+}}* %
+// CHECK: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 1
+// CHECK: store i8
+// CHECK: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 2
+// CHECK: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 0
+// CHECK: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 1
+// CHECK: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 2
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 5, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[SS_TY]]*, [[iz]], [[iz]], [[iz]], [4 x i32]*)* [[SS_MICROTASK:@.+]] to void
+// CHECK: ret
+
+// CHECK: define internal void [[SS_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[SS_TY]]* %{{.+}}, [[iz]] {{.+}}, [[iz]] {{.+}}, [[iz]] {{.+}}, [4 x i{{[0-9]+}}]* {{.+}})
+// CHECK: [[A_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[B_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[C_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[E_PRIV:%.+]] = alloca [4 x i{{[0-9]+}}],
+// CHECK: store i{{[0-9]+}} {{.+}}, i{{[0-9]+}}* [[A_PRIV]]
+// CHECK: store i{{[0-9]+}} {{.+}}, i{{[0-9]+}}* [[B_PRIV]]
+// CHECK: store i{{[0-9]+}} {{.+}}, i{{[0-9]+}}* [[C_PRIV]]
+// CHECK-64: [[A_CONV:%.+]] = bitcast i64* [[A_PRIV:%.+]] to i32*
+// CHECK-64: store i32* [[A_CONV]], i32** [[REFA:%.+]],
+// CHECK-32: store i32* [[A_PRIV]], i32** [[REFA:%.+]],
+// CHECK-64: [[B_CONV:%.+]] = bitcast i64* [[B_PRIV:%.+]] to i32*
+// CHECK-64: [[C_CONV:%.+]] = bitcast i64* [[C_PRIV:%.+]] to i32*
+// CHECK-64: store i32* [[C_CONV]], i32** [[REFC:%.+]],
+// CHECK-32: store i32* [[C_PRIV]], i32** [[REFC:%.+]],
+// CHECK: bitcast [4 x i{{[0-9]+}}]* [[E_PRIV]] to i8*
+// CHECK: bitcast [4 x i{{[0-9]+}}]* %{{.+}} to i8*
+// CHECK: call void @llvm.memcpy
+// CHECK: store [4 x i{{[0-9]+}}]* [[E_PRIV]], [4 x i{{[0-9]+}}]** [[REFE:%.+]],
+// CHECK-NEXT: [[A_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFA]],
+// CHECK-NEXT: [[A_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[A_PRIV]],
+// CHECK-NEXT: [[INC:%.+]] = add nsw i{{[0-9]+}} [[A_VAL]], 1
+// CHECK-NEXT: store i{{[0-9]+}} [[INC]], i{{[0-9]+}}* [[A_PRIV]],
+// CHECK-64-NEXT: [[B_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[B_CONV]],
+// CHECK-32-NEXT: [[B_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[B_PRIV]],
+// CHECK-NEXT: [[DEC:%.+]] = add nsw i{{[0-9]+}} [[B_VAL]], -1
+// CHECK-64-NEXT: store i{{[0-9]+}} [[DEC]], i{{[0-9]+}}* [[B_CONV]],
+// CHECK-32-NEXT: store i{{[0-9]+}} [[DEC]], i{{[0-9]+}}* [[B_PRIV]],
+// CHECK-NEXT: [[C_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFC]],
+// CHECK-NEXT: [[C_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[C_PRIV]],
+// CHECK-NEXT: [[DIV:%.+]] = sdiv i{{[0-9]+}} [[C_VAL]], 1
+// CHECK-NEXT: store i{{[0-9]+}} [[DIV]], i{{[0-9]+}}* [[C_PRIV]],
+// CHECK-NEXT: [[E_PRIV:%.+]] = load [4 x i{{[0-9]+}}]*, [4 x i{{[0-9]+}}]** [[REFE]],
+// CHECK-NEXT: [[E_PRIV_2:%.+]] = getelementptr inbounds [4 x i{{[0-9]+}}], [4 x i{{[0-9]+}}]* [[E_PRIV]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK-NEXT: store i32 1111, i32* [[E_PRIV_2]],
+// CHECK-NEXT: ret void
+
 // CHECK: define internal {{.*}}void [[TMAIN_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [2 x i32]* dereferenceable(8) %{{.+}}, i32* dereferenceable(4) %{{.+}}, [2 x [[S_INT_TY]]]* dereferenceable(8) %{{.+}}, [[S_INT_TY]]* dereferenceable(4) %{{.+}})
 // CHECK: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}}, align 128
 // CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}], align 128
@@ -244,9 +456,7 @@ int main() {
 // CHECK: call {{.*}} [[ST_TY_DEFAULT_CONSTR]]([[ST_TY]]* [[ST_TY_TEMP:%.+]])
 // CHECK: call {{.*}} [[S_INT_TY_COPY_CONSTR]]([[S_INT_TY]]* [[VAR_PRIV]], [[S_INT_TY]]* {{.*}} [[VAR_REF]], [[ST_TY]]* [[ST_TY_TEMP]])
 // CHECK: call {{.*}} [[ST_TY_DESTR]]([[ST_TY]]* [[ST_TY_TEMP]])
-// CHECK: [[GTID_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[GTID_ADDR_ADDR]]
-// CHECK: [[GTID:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[GTID_REF]]
-// CHECK: call {{.*}}void @__kmpc_barrier(%{{.+}}* [[IMPLICIT_BARRIER_LOC]], i{{[0-9]+}} [[GTID]])
+// CHECK-NOT: call {{.*}}void @__kmpc_barrier(
 // CHECK-DAG: call {{.*}} [[S_INT_TY_DESTR]]([[S_INT_TY]]* [[VAR_PRIV]])
 // CHECK-DAG: call {{.*}} [[S_INT_TY_DESTR]]([[S_INT_TY]]*
 // CHECK: ret void
@@ -270,13 +480,13 @@ struct St {
 void array_func(float a[3], St s[2], int n, long double vla1[n]) {
   double vla2[n][n] __attribute__((aligned(128)));
 // ARRAY: @__kmpc_fork_call(
-// ARRAY-DAG: [[PRIV_A:%.+]] = alloca float**,
-// ARRAY-DAG: [[PRIV_S:%.+]] = alloca %struct.St**,
-// ARRAY-DAG: [[PRIV_VLA1:%.+]] = alloca x86_fp80**,
+// ARRAY-DAG: [[PRIV_S:%.+]] = alloca %struct.St*,
+// ARRAY-DAG: [[PRIV_VLA1:%.+]] = alloca x86_fp80*,
+// ARRAY-DAG: [[PRIV_A:%.+]] = alloca float*,
 // ARRAY-DAG: [[PRIV_VLA2:%.+]] = alloca double*,
-// ARRAY-DAG: store float** %{{.+}}, float*** [[PRIV_A]],
-// ARRAY-DAG: store %struct.St** %{{.+}}, %struct.St*** [[PRIV_S]],
-// ARRAY-DAG: store x86_fp80** %{{.+}}, x86_fp80*** [[PRIV_VLA1]],
+// ARRAY-DAG: store %struct.St* %{{.+}}, %struct.St** [[PRIV_S]],
+// ARRAY-DAG: store x86_fp80* %{{.+}}, x86_fp80** [[PRIV_VLA1]],
+// ARRAY-DAG: store float* %{{.+}}, float** [[PRIV_A]],
 // ARRAY-DAG: store double* %{{.+}}, double** [[PRIV_VLA2]],
 // ARRAY: call i8* @llvm.stacksave()
 // ARRAY: [[SIZE:%.+]] = mul nuw i64 %{{.+}}, 8
@@ -288,11 +498,11 @@ void array_func(float a[3], St s[2], int n, long double vla1[n]) {
 
 // ARRAY-LABEL: St_func
 // ARRAY: @__kmpc_fork_call(
-// ARRAY-DAG: [[PRIV_S:%.+]] = alloca %struct.St**,
-// ARRAY-DAG: [[PRIV_VLA1:%.+]] = alloca x86_fp80**,
+// ARRAY-DAG: [[PRIV_VLA1:%.+]] = alloca x86_fp80*,
+// ARRAY-DAG: [[PRIV_S:%.+]] = alloca %struct.St*,
 // ARRAY-DAG: [[PRIV_VLA2:%.+]] = alloca double*,
-// ARRAY-DAG: store %struct.St** %{{.+}}, %struct.St*** [[PRIV_S]],
-// ARRAY-DAG: store x86_fp80** %{{.+}}, x86_fp80*** [[PRIV_VLA1]],
+// ARRAY-DAG: store %struct.St* %{{.+}}, %struct.St** [[PRIV_S]],
+// ARRAY-DAG: store x86_fp80* %{{.+}}, x86_fp80** [[PRIV_VLA1]],
 // ARRAY-DAG: store double* %{{.+}}, double** [[PRIV_VLA2]],
 // ARRAY: call i8* @llvm.stacksave()
 // ARRAY: [[SIZE:%.+]] = mul nuw i64 %{{.+}}, 8
diff --git a/test/OpenMP/parallel_for_ast_print.cpp b/test/OpenMP/parallel_for_ast_print.cpp
index c4be521455df2..2476ee87605aa 100644
--- a/test/OpenMP/parallel_for_ast_print.cpp
+++ b/test/OpenMP/parallel_for_ast_print.cpp
@@ -8,6 +8,57 @@
 
 void foo() {}
 
+struct S {
+  S(): a(0) {}
+  S(int v) : a(v) {}
+  int a;
+  typedef int type;
+};
+
+template <typename T>
+class S7 : public T {
+protected:
+  T a;
+  S7() : a(0) {}
+
+public:
+  S7(typename T::type v) : a(v) {
+#pragma omp parallel for private(a) private(this->a) private(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S7 &operator=(S7 &s) {
+#pragma omp parallel for private(a) private(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+// CHECK: #pragma omp parallel for private(this->a) private(this->a) private(this->S::a)
+// CHECK: #pragma omp parallel for private(this->a) private(this->a) private(T::a)
+// CHECK: #pragma omp parallel for private(this->a) private(this->a)
+
+class S8 : public S7<S> {
+  S8() {}
+
+public:
+  S8(int v) : S7<S>(v){
+#pragma omp parallel for private(a) private(this->a) private(S7<S>::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S8 &operator=(S8 &s) {
+#pragma omp parallel for private(a) private(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+// CHECK: #pragma omp parallel for private(this->a) private(this->a) private(this->S7<S>::a)
+// CHECK: #pragma omp parallel for private(this->a) private(this->a)
+
 template <class T, int N>
 T tmain(T argc) {
   T b = argc, c, d, e, f, h;
diff --git a/test/OpenMP/parallel_for_collapse_messages.cpp b/test/OpenMP/parallel_for_collapse_messages.cpp
index 6e5f71ff16727..4461df8cccdba 100644
--- a/test/OpenMP/parallel_for_collapse_messages.cpp
+++ b/test/OpenMP/parallel_for_collapse_messages.cpp
@@ -1,8 +1,13 @@
 // RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
 
 void foo() {
 }
 
+#if __cplusplus >= 201103L
+// expected-note@+2 4 {{declared here}}
+#endif
 bool foobool(int argc) {
   return argc;
 }
@@ -29,14 +34,21 @@ T tmain(T argc, S **argv) { //expected-note 2 {{declared here}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp parallel for collapse ((ST > 0) ? 1 + ST : 2) // expected-note 2 {{as specified in 'collapse' clause}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST]; // expected-error 2 {{expected 2 for loops after '#pragma omp parallel for', but found only 1}}
-  // expected-error@+3 2 {{directive '#pragma omp parallel for' cannot contain more than one 'collapse' clause}}
-  // expected-error@+2 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+  // expected-error@+6 2 {{directive '#pragma omp parallel for' cannot contain more than one 'collapse' clause}}
+  // expected-error@+5 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   #pragma omp parallel for collapse (foobool(argc)), collapse (true), collapse (-5)
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp parallel for collapse (S) // expected-error {{'S' does not refer to a value}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp parallel for collapse (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp parallel for collapse (1)
@@ -59,16 +71,27 @@ int main(int argc, char **argv) {
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; // expected-error {{expected 4 for loops after '#pragma omp parallel for', but found only 1}}
   #pragma omp parallel for collapse (2+2)) // expected-warning {{extra tokens at the end of '#pragma omp parallel for' are ignored}}  expected-note {{as specified in 'collapse' clause}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; // expected-error {{expected 4 for loops after '#pragma omp parallel for', but found only 1}}
-  #pragma omp parallel for collapse (foobool(1) > 0 ? 1 : 2) // expected-error {{expression is not an integral constant expression}}
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+  #pragma omp parallel for collapse (foobool(1) > 0 ? 1 : 2)
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+3 {{expression is not an integral constant expression}}
+  // expected-error@+6 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+4{{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   // expected-error@+2 2 {{directive '#pragma omp parallel for' cannot contain more than one 'collapse' clause}}
   // expected-error@+1 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
   #pragma omp parallel for collapse (foobool(argc)), collapse (true), collapse (-5) 
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   #pragma omp parallel for collapse (S1) // expected-error {{'S1' does not refer to a value}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+1 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp parallel for collapse (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   // expected-error@+3 {{statement after '#pragma omp parallel for' must be a for loop}}
diff --git a/test/OpenMP/parallel_for_linear_messages.cpp b/test/OpenMP/parallel_for_linear_messages.cpp
index 7272aada92a08..e5f5b61873c89 100644
--- a/test/OpenMP/parallel_for_linear_messages.cpp
+++ b/test/OpenMP/parallel_for_linear_messages.cpp
@@ -263,7 +263,7 @@ int main(int argc, char **argv) {
   for (int k = 0; k < argc; ++k)
     ++k;
 
-  foomain<int, char>(argc, argv);
+  foomain<int, char>(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<int, char>' requested here}}
   return 0;
 }
 
diff --git a/test/OpenMP/parallel_for_loop_messages.cpp b/test/OpenMP/parallel_for_loop_messages.cpp
index 2bb32bdfc1768..7e136e75869cd 100644
--- a/test/OpenMP/parallel_for_loop_messages.cpp
+++ b/test/OpenMP/parallel_for_loop_messages.cpp
@@ -354,12 +354,12 @@ public:
   typedef int difference_type;
   typedef std::random_access_iterator_tag iterator_category;
 };
-// expected-note@+2 {{candidate function not viable: no known conversion from 'Iter0' to 'GoodIter' for 2nd argument}}
+// expected-note@+2 {{candidate function not viable: no known conversion from 'const Iter0' to 'GoodIter' for 2nd argument}}
 // expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'GoodIter' for 1st argument}}
 int operator-(GoodIter a, GoodIter b) { return 0; }
 // expected-note@+1 3 {{candidate function not viable: requires single argument 'a', but 2 arguments were provided}}
 GoodIter operator-(GoodIter a) { return a; }
-// expected-note@+2 {{candidate function not viable: no known conversion from 'Iter0' to 'int' for 2nd argument}}
+// expected-note@+2 {{candidate function not viable: no known conversion from 'const Iter0' to 'int' for 2nd argument}}
 // expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'GoodIter' for 1st argument}}
 GoodIter operator-(GoodIter a, int v) { return GoodIter(); }
 // expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter0' to 'GoodIter' for 1st argument}}
@@ -402,7 +402,7 @@ int test_with_random_access_iterator() {
 #pragma omp parallel for
   for (begin = GoodIter(0); begin < end; ++begin)
     ++begin;
-// expected-error@+3 {{invalid operands to binary expression ('GoodIter' and 'Iter0')}}
+// expected-error@+3 {{invalid operands to binary expression ('GoodIter' and 'const Iter0')}}
 // expected-error@+2 {{could not calculate number of iterations calling 'operator-' with upper and lower loop bounds}}
 #pragma omp parallel for
   for (begin = begin0; begin < end; ++begin)
diff --git a/test/OpenMP/parallel_for_ordered_messages.cpp b/test/OpenMP/parallel_for_ordered_messages.cpp
index 3729eb9a68a35..055fe1bba4237 100644
--- a/test/OpenMP/parallel_for_ordered_messages.cpp
+++ b/test/OpenMP/parallel_for_ordered_messages.cpp
@@ -1,8 +1,13 @@
 // RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
 
 void foo() {
 }
 
+#if __cplusplus >= 201103L
+// expected-note@+2 4 {{declared here}}
+#endif
 bool foobool(int argc) {
   return argc;
 }
@@ -36,16 +41,23 @@ T tmain(T argc, S **argv) {                   //expected-note 2 {{declared here}
 #pragma omp parallel for ordered((ST > 0) ? 1 + ST : 2) // expected-note 2 {{as specified in 'ordered' clause}}
   for (int i = ST; i < N; i++)
     argv[0][i] = argv[0][i] - argv[0][i - ST]; // expected-error 2 {{expected 2 for loops after '#pragma omp parallel for', but found only 1}}
-// expected-error@+3 2 {{directive '#pragma omp parallel for' cannot contain more than one 'ordered' clause}}
-// expected-error@+2 2 {{argument to 'ordered' clause must be a strictly positive integer value}}
-// expected-error@+1 2 {{expression is not an integral constant expression}}
+// expected-error@+6 2 {{directive '#pragma omp parallel for' cannot contain more than one 'ordered' clause}}
+// expected-error@+5 2 {{argument to 'ordered' clause must be a strictly positive integer value}}
+// expected-error@+4 2 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+// expected-note@+2 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
 #pragma omp parallel for ordered(foobool(argc)), ordered(true), ordered(-5)
   for (int i = ST; i < N; i++)
     argv[0][i] = argv[0][i] - argv[0][i - ST];
 #pragma omp parallel for ordered(S) // expected-error {{'S' does not refer to a value}}
   for (int i = ST; i < N; i++)
     argv[0][i] = argv[0][i] - argv[0][i - ST];
-// expected-error@+1 2 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+// expected-error@+4 2 {{expression is not an integral constant expression}}
+#else
+// expected-error@+2 2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
 #pragma omp parallel for ordered(argv[1] = 2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = ST; i < N; i++)
     argv[0][i] = argv[0][i] - argv[0][i - ST];
@@ -76,10 +88,17 @@ int main(int argc, char **argv) {
 #pragma omp parallel for ordered(2 + 2))      // expected-warning {{extra tokens at the end of '#pragma omp parallel for' are ignored}}  expected-note {{as specified in 'ordered' clause}}
   for (int i = 4; i < 12; i++)
     argv[0][i] = argv[0][i] - argv[0][i - 4];            // expected-error {{expected 4 for loops after '#pragma omp parallel for', but found only 1}}
-#pragma omp parallel for ordered(foobool(1) > 0 ? 1 : 2) // expected-error {{expression is not an integral constant expression}}
+// expected-error@+4 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+// expected-note@+2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+#pragma omp parallel for ordered(foobool(1) > 0 ? 1 : 2)
   for (int i = 4; i < 12; i++)
     argv[0][i] = argv[0][i] - argv[0][i - 4];
-// expected-error@+3 {{expression is not an integral constant expression}}
+// expected-error@+6 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+// expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
 // expected-error@+2 2 {{directive '#pragma omp parallel for' cannot contain more than one 'ordered' clause}}
 // expected-error@+1 2 {{argument to 'ordered' clause must be a strictly positive integer value}}
 #pragma omp parallel for ordered(foobool(argc)), ordered(true), ordered(-5)
@@ -88,7 +107,11 @@ int main(int argc, char **argv) {
 #pragma omp parallel for ordered(S1) // expected-error {{'S1' does not refer to a value}}
   for (int i = 4; i < 12; i++)
     argv[0][i] = argv[0][i] - argv[0][i - 4];
-// expected-error@+1 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+// expected-error@+4 {{expression is not an integral constant expression}}
+#else
+// expected-error@+2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
 #pragma omp parallel for ordered(argv[1] = 2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = 4; i < 12; i++)
     argv[0][i] = argv[0][i] - argv[0][i - 4];
diff --git a/test/OpenMP/parallel_for_private_messages.cpp b/test/OpenMP/parallel_for_private_messages.cpp
index efc827b0d5e56..cc1b79f4111b0 100644
--- a/test/OpenMP/parallel_for_private_messages.cpp
+++ b/test/OpenMP/parallel_for_private_messages.cpp
@@ -29,7 +29,11 @@ class S4 {
   S4(); // expected-note {{implicitly declared private here}}
 
 public:
-  S4(int v) : a(v) {}
+  S4(int v) : a(v) {
+#pragma omp parallel for private(a) private(this->a)
+    for (int k = 0; k < v; ++k)
+      ++this->a;
+  }
 };
 class S5 {
   int a;
@@ -37,6 +41,50 @@ class S5 {
 
 public:
   S5(int v) : a(v) {}
+  S5 &operator=(S5 &s) {
+#pragma omp parallel for private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T>
+class S6 {
+public:
+  T a;
+
+  S6() : a(0) {}
+  S6(T v) : a(v) {
+#pragma omp parallel for private(a) private(this->a)
+    for (int k = 0; k < v; ++k)
+      ++this->a;
+  }
+  S6 &operator=(S6 &s) {
+#pragma omp parallel for private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T>
+class S7 : public T {
+  T a;
+  S7() : a(0) {}
+
+public:
+  S7(T v) : a(v) {
+#pragma omp parallel for private(a) private(this->a) private(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S7 &operator=(S7 &s) {
+#pragma omp parallel for private(a) private(this->a) private(s.a) private(s.T::a) // expected-error 2 {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
 };
 
 S3 h;
@@ -119,6 +167,8 @@ using A::x;
 int main(int argc, char **argv) {
   S4 e(4);
   S5 g(5);
+  S6<float> s6(0.0) , s6_0(1.0);
+  S7<S6<float> > s7(0.0) , s7_0(1.0);
   int i;
   int &j = i;
 #pragma omp parallel for private // expected-error {{expected '(' after 'private'}}
@@ -180,6 +230,8 @@ int main(int argc, char **argv) {
   for (int k = 0; k < argc; ++k)
     m = k + 2;
 
-  return 0;
+  s6 = s6_0; // expected-note {{in instantiation of member function 'S6<float>::operator=' requested here}}
+  s7 = s7_0; // expected-note {{in instantiation of member function 'S7<S6<float> >::operator=' requested here}}
+  return foomain(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<int, char>' requested here}}
 }
 
diff --git a/test/OpenMP/parallel_for_reduction_messages.cpp b/test/OpenMP/parallel_for_reduction_messages.cpp
index 22251b446300b..4d5a143bae3a3 100644
--- a/test/OpenMP/parallel_for_reduction_messages.cpp
+++ b/test/OpenMP/parallel_for_reduction_messages.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -o - %s
-// RUN: %clang_cc1 -verify -fopenmp -std=c++98 -ferror-limit 100 -o - %s
-// RUN: %clang_cc1 -verify -fopenmp -std=c++11 -ferror-limit 100 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 150 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 -ferror-limit 150 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 -ferror-limit 150 -o - %s
 
 void foo() {
 }
@@ -13,7 +13,7 @@ struct S1; // expected-note {{declared here}} expected-note 4 {{forward declarat
 extern S1 a;
 class S2 {
   mutable int a;
-  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 4 {{implicitly declared private here}}
+  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 3 {{implicitly declared private here}}
 
 public:
   S2() : a(0) {}
@@ -22,7 +22,7 @@ public:
   static const float S2sc;
 };
 const float S2::S2sc = 0; // expected-note 2 {{'S2sc' defined here}}
-S2 b;                     // expected-note 2 {{'b' defined here}}
+S2 b;                     // expected-note 3 {{'b' defined here}}
 const S2 ba[5];           // expected-note 2 {{'ba' defined here}}
 class S3 {
   int a;
@@ -34,7 +34,7 @@ public:
   S3 operator+(const S3 &arg1) { return arg1; }
 };
 int operator+(const S3 &arg1, const S3 &arg2) { return 5; }
-S3 c;               // expected-note 2 {{'c' defined here}}
+S3 c;               // expected-note 3 {{'c' defined here}}
 const S3 ca[5];     // expected-note 2 {{'ca' defined here}}
 extern const int f; // expected-note 4 {{'f' declared here}}
 class S4 {
@@ -56,9 +56,9 @@ class S5 {
 public:
   S5(int v) : a(v) {}
 };
-class S6 { // expected-note 2 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
+class S6 { // expected-note 3 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
 #if __cplusplus >= 201103L // C++11 or later
-// expected-note@-2 2 {{candidate function (the implicit move assignment operator) not viable}}
+// expected-note@-2 3 {{candidate function (the implicit move assignment operator) not viable}}
 #endif
   int a;
 
@@ -111,7 +111,7 @@ T tmain(T argc) {
 #pragma omp parallel for reduction(|| : argc ? i : argc) // expected-error 2 {{expected variable name, array element or array section}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for reduction(foo : argc) //expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max'}}
+#pragma omp parallel for reduction(foo : argc) //expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'float'}} expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'int'}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel for reduction(&& : argc)
@@ -120,22 +120,22 @@ T tmain(T argc) {
 #pragma omp parallel for reduction(^ : T) // expected-error {{'T' does not refer to a value}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified list item cannot be reduction}} expected-error 3 {{'operator+' is a private member of 'S2'}}
+#pragma omp parallel for reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified list item cannot be reduction}} expected-error 2 {{'operator+' is a private member of 'S2'}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 3 {{const-qualified list item cannot be reduction}}
+#pragma omp parallel for reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 4 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 3 {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel for reduction(max : h.b) // expected-error {{expected variable name, array element or array section}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for reduction(+ : ba) // expected-error {{a reduction list item with array type 'const S2 [5]'}}
+#pragma omp parallel for reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for reduction(* : ca) // expected-error {{a reduction list item with array type 'const S3 [5]'}}
+#pragma omp parallel for reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for reduction(- : da) // expected-error {{a reduction list item with array type 'const int [5]'}} expected-error {{a reduction list item with array type 'const float [5]'}}
+#pragma omp parallel for reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}} expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel for reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
@@ -150,7 +150,7 @@ T tmain(T argc) {
 #pragma omp parallel for reduction(+ : h, k) // expected-error {{threadprivate or thread local variable cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for reduction(+ : o) // expected-error {{no viable overloaded '='}}
+#pragma omp parallel for reduction(+ : o) // expected-error 2 {{no viable overloaded '='}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel for private(i), reduction(+ : j), reduction(+ : q) // expected-error 4 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
@@ -160,7 +160,7 @@ T tmain(T argc) {
 #pragma omp parallel for reduction(+ : p), reduction(+ : p) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for reduction(+ : p), reduction(+ : p) // expected-error 3 {{variable can appear only once in OpenMP 'reduction' clause}} expected-note 3 {{previously referenced here}}
+#pragma omp parallel for reduction(+ : p), reduction(+ : p) // expected-error 2 {{variable can appear only once in OpenMP 'reduction' clause}} expected-note 2 {{previously referenced here}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel for reduction(+ : r) // expected-error 2 {{const-qualified list item cannot be reduction}}
@@ -251,13 +251,13 @@ int main(int argc, char **argv) {
 #pragma omp parallel for reduction(max : h.b) // expected-error {{expected variable name, array element or array section}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for reduction(+ : ba) // expected-error {{a reduction list item with array type 'const S2 [5]'}}
+#pragma omp parallel for reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for reduction(* : ca) // expected-error {{a reduction list item with array type 'const S3 [5]'}}
+#pragma omp parallel for reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for reduction(- : da) // expected-error {{a reduction list item with array type 'const int [5]'}}
+#pragma omp parallel for reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel for reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
diff --git a/test/OpenMP/parallel_for_simd_aligned_messages.cpp b/test/OpenMP/parallel_for_simd_aligned_messages.cpp
index 8bffd21f4655f..2ccdf0697624e 100644
--- a/test/OpenMP/parallel_for_simd_aligned_messages.cpp
+++ b/test/OpenMP/parallel_for_simd_aligned_messages.cpp
@@ -196,6 +196,7 @@ int main(int argc, char **argv) {
   #pragma omp parallel for simd aligned(h)
   for (int k = 0; k < argc; ++k) ++k;
   int *pargc = &argc;
+  // expected-note@+1 {{in instantiation of function template specialization 'foomain<int *, char>' requested here}}
   foomain<int*,char>(pargc,argv);
   return 0;
 }
diff --git a/test/OpenMP/parallel_for_simd_ast_print.cpp b/test/OpenMP/parallel_for_simd_ast_print.cpp
index 1b9415d8dbf30..cdd1b73b59032 100644
--- a/test/OpenMP/parallel_for_simd_ast_print.cpp
+++ b/test/OpenMP/parallel_for_simd_ast_print.cpp
@@ -7,6 +7,58 @@
 #define HEADER
 
 void foo() {}
+
+struct S1 {
+  S1() : a(0) {}
+  S1(int v) : a(v) {}
+  int a;
+  typedef int type;
+};
+
+template <typename T>
+class S7 : public T {
+protected:
+  T a;
+  S7() : a(0) {}
+
+public:
+  S7(typename T::type v) : a(v) {
+#pragma omp parallel for simd private(a) private(this->a) private(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S7 &operator=(S7 &s) {
+#pragma omp parallel for simd private(a) private(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+// CHECK: #pragma omp parallel for simd private(this->a) private(this->a) private(this->S1::a)
+// CHECK: #pragma omp parallel for simd private(this->a) private(this->a) private(T::a)
+// CHECK: #pragma omp parallel for simd private(this->a) private(this->a)
+
+class S8 : public S7<S1> {
+  S8() {}
+
+public:
+  S8(int v) : S7<S1>(v){
+#pragma omp parallel for simd private(a) private(this->a) private(S7<S1>::a) 
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S8 &operator=(S8 &s) {
+#pragma omp parallel for simd private(a) private(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+// CHECK: #pragma omp parallel for simd private(this->a) private(this->a) private(this->S7<S1>::a)
+// CHECK: #pragma omp parallel for simd private(this->a) private(this->a)
+
 int g_ind = 1;
 template<class T, class N> T reduct(T* arr, N num) {
   N i;
diff --git a/test/OpenMP/parallel_for_simd_collapse_messages.cpp b/test/OpenMP/parallel_for_simd_collapse_messages.cpp
index 4f04cca5635eb..c7effbddeb28c 100644
--- a/test/OpenMP/parallel_for_simd_collapse_messages.cpp
+++ b/test/OpenMP/parallel_for_simd_collapse_messages.cpp
@@ -1,8 +1,13 @@
 // RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
 
 void foo() {
 }
 
+#if __cplusplus >= 201103L
+// expected-note@+2 4 {{declared here}}
+#endif
 bool foobool(int argc) {
   return argc;
 }
@@ -29,14 +34,21 @@ T tmain(T argc, S **argv) { //expected-note 2 {{declared here}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp parallel for simd collapse ((ST > 0) ? 1 + ST : 2) // expected-note 2 {{as specified in 'collapse' clause}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST]; // expected-error 2 {{expected 2 for loops after '#pragma omp parallel for simd', but found only 1}}
-  // expected-error@+3 2 {{directive '#pragma omp parallel for simd' cannot contain more than one 'collapse' clause}}
-  // expected-error@+2 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+  // expected-error@+6 2 {{directive '#pragma omp parallel for simd' cannot contain more than one 'collapse' clause}}
+  // expected-error@+5 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   #pragma omp parallel for simd collapse (foobool(argc)), collapse (true), collapse (-5)
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp parallel for simd collapse (S) // expected-error {{'S' does not refer to a value}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp parallel for simd collapse (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp parallel for simd collapse (1)
@@ -59,16 +71,27 @@ int main(int argc, char **argv) {
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; // expected-error {{expected 4 for loops after '#pragma omp parallel for simd', but found only 1}}
   #pragma omp parallel for simd collapse (2+2)) // expected-warning {{extra tokens at the end of '#pragma omp parallel for simd' are ignored}}  expected-note {{as specified in 'collapse' clause}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; // expected-error {{expected 4 for loops after '#pragma omp parallel for simd', but found only 1}}
-  #pragma omp parallel for simd collapse (foobool(1) > 0 ? 1 : 2) // expected-error {{expression is not an integral constant expression}}
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+  #pragma omp parallel for simd collapse (foobool(1) > 0 ? 1 : 2)
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+3 {{expression is not an integral constant expression}}
+  // expected-error@+6 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   // expected-error@+2 2 {{directive '#pragma omp parallel for simd' cannot contain more than one 'collapse' clause}}
   // expected-error@+1 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
   #pragma omp parallel for simd collapse (foobool(argc)), collapse (true), collapse (-5) 
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   #pragma omp parallel for simd collapse (S1) // expected-error {{'S1' does not refer to a value}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+1 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp parallel for simd collapse (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   // expected-error@+3 {{statement after '#pragma omp parallel for simd' must be a for loop}}
diff --git a/test/OpenMP/parallel_for_simd_linear_messages.cpp b/test/OpenMP/parallel_for_simd_linear_messages.cpp
index 858f53f4674da..fc1895a36854a 100644
--- a/test/OpenMP/parallel_for_simd_linear_messages.cpp
+++ b/test/OpenMP/parallel_for_simd_linear_messages.cpp
@@ -208,7 +208,7 @@ int main(int argc, char **argv) {
   #pragma omp parallel for simd linear(i)
   for (int k = 0; k < argc; ++k) ++k;
 
-  foomain<int,char>(argc,argv);
+  foomain<int,char>(argc,argv); // expected-note {{in instantiation of function template specialization 'foomain<int, char>' requested here}}
   return 0;
 }
 
diff --git a/test/OpenMP/parallel_for_simd_loop_messages.cpp b/test/OpenMP/parallel_for_simd_loop_messages.cpp
index e5fd8c04c0cfc..403e951d53c43 100644
--- a/test/OpenMP/parallel_for_simd_loop_messages.cpp
+++ b/test/OpenMP/parallel_for_simd_loop_messages.cpp
@@ -355,12 +355,12 @@ public:
   typedef int difference_type;
   typedef std::random_access_iterator_tag iterator_category;
 };
-// expected-note@+2 {{candidate function not viable: no known conversion from 'Iter0' to 'GoodIter' for 2nd argument}}
+// expected-note@+2 {{candidate function not viable: no known conversion from 'const Iter0' to 'GoodIter' for 2nd argument}}
 // expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'GoodIter' for 1st argument}}
 int operator-(GoodIter a, GoodIter b) { return 0; }
 // expected-note@+1 3 {{candidate function not viable: requires single argument 'a', but 2 arguments were provided}}
 GoodIter operator-(GoodIter a) { return a; }
-// expected-note@+2 {{candidate function not viable: no known conversion from 'Iter0' to 'int' for 2nd argument}}
+// expected-note@+2 {{candidate function not viable: no known conversion from 'const Iter0' to 'int' for 2nd argument}}
 // expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'GoodIter' for 1st argument}}
 GoodIter operator-(GoodIter a, int v) { return GoodIter(); }
 // expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter0' to 'GoodIter' for 1st argument}}
@@ -403,7 +403,7 @@ int test_with_random_access_iterator() {
 #pragma omp parallel for simd
   for (begin = GoodIter(0); begin < end; ++begin)
     ++begin;
-// expected-error@+3 {{invalid operands to binary expression ('GoodIter' and 'Iter0')}}
+// expected-error@+3 {{invalid operands to binary expression ('GoodIter' and 'const Iter0')}}
 // expected-error@+2 {{could not calculate number of iterations calling 'operator-' with upper and lower loop bounds}}
 #pragma omp parallel for simd
   for (begin = begin0; begin < end; ++begin)
diff --git a/test/OpenMP/parallel_for_simd_private_messages.cpp b/test/OpenMP/parallel_for_simd_private_messages.cpp
index a031d407ec1bf..a33b35d57d1ef 100644
--- a/test/OpenMP/parallel_for_simd_private_messages.cpp
+++ b/test/OpenMP/parallel_for_simd_private_messages.cpp
@@ -29,7 +29,11 @@ class S4 {
   S4(); // expected-note {{implicitly declared private here}}
 
 public:
-  S4(int v) : a(v) {}
+  S4(int v) : a(v) {
+#pragma omp parallel for simd private(a) private(this->a)
+    for (int k = 0; k < v; ++k)
+      ++this->a;
+  }
 };
 class S5 {
   int a;
@@ -37,6 +41,50 @@ class S5 {
 
 public:
   S5(int v) : a(v) {}
+  S5 &operator=(S5 &s) {
+#pragma omp parallel for simd private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T>
+class S6 {
+public:
+  T a;
+
+  S6() : a(0) {}
+  S6(T v) : a(v) {
+#pragma omp parallel for simd private(a) private(this->a)
+    for (int k = 0; k < v; ++k)
+      ++this->a;
+  }
+  S6 &operator=(S6 &s) {
+#pragma omp parallel for simd private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T>
+class S7 : public T {
+  T a;
+  S7() : a(0) {}
+
+public:
+  S7(T v) : a(v) {
+#pragma omp parallel for simd private(a) private(this->a) private(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S7 &operator=(S7 &s) {
+#pragma omp parallel for simd private(a) private(this->a) private(s.a) private(s.T::a) // expected-error 2 {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
 };
 
 S3 h;
@@ -119,6 +167,8 @@ using A::x;
 int main(int argc, char **argv) {
   S4 e(4);
   S5 g(5);
+  S6<float> s6(0.0) , s6_0(1.0);
+  S7<S6<float> > s7(0.0) , s7_0(1.0);
   int i;
   int &j = i;
 #pragma omp parallel for simd private // expected-error {{expected '(' after 'private'}}
@@ -180,6 +230,8 @@ int main(int argc, char **argv) {
   for (int k = 0; k < argc; ++k)
     m = k + 3;
 
-  return 0;
+  s6 = s6_0; // expected-note {{in instantiation of member function 'S6<float>::operator=' requested here}}
+  s7 = s7_0; // expected-note {{in instantiation of member function 'S7<S6<float> >::operator=' requested here}}
+  return foomain(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<int, char>' requested here}}
 }
 
diff --git a/test/OpenMP/parallel_for_simd_reduction_messages.cpp b/test/OpenMP/parallel_for_simd_reduction_messages.cpp
index e2e9e1bca38b0..afb0b367c4193 100644
--- a/test/OpenMP/parallel_for_simd_reduction_messages.cpp
+++ b/test/OpenMP/parallel_for_simd_reduction_messages.cpp
@@ -13,7 +13,7 @@ struct S1; // expected-note {{declared here}} expected-note 4 {{forward declarat
 extern S1 a;
 class S2 {
   mutable int a;
-  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 4 {{implicitly declared private here}}
+  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 3 {{implicitly declared private here}}
 
 public:
   S2() : a(0) {}
@@ -22,7 +22,7 @@ public:
   static const float S2sc;
 };
 const float S2::S2sc = 0; // expected-note 2 {{'S2sc' defined here}}
-S2 b;                     // expected-note 2 {{'b' defined here}}
+S2 b;                     // expected-note 3 {{'b' defined here}}
 const S2 ba[5];           // expected-note 2 {{'ba' defined here}}
 class S3 {
   int a;
@@ -34,7 +34,7 @@ public:
   S3 operator+(const S3 &arg1) { return arg1; }
 };
 int operator+(const S3 &arg1, const S3 &arg2) { return 5; }
-S3 c;               // expected-note 2 {{'c' defined here}}
+S3 c;               // expected-note 3 {{'c' defined here}}
 const S3 ca[5];     // expected-note 2 {{'ca' defined here}}
 extern const int f; // expected-note 4 {{'f' declared here}}
 class S4 {
@@ -56,9 +56,9 @@ class S5 {
 public:
   S5(int v) : a(v) {}
 };
-class S6 { // expected-note 2 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
+class S6 { // expected-note 3 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
 #if __cplusplus >= 201103L // C++11 or later
-// expected-note@-2 2 {{candidate function (the implicit move assignment operator) not viable}}
+// expected-note@-2 3 {{candidate function (the implicit move assignment operator) not viable}}
 #endif
   int a;
 
@@ -111,7 +111,7 @@ T tmain(T argc) {
 #pragma omp parallel for simd reduction(|| : argc ? i : argc) // expected-error 2 {{expected variable name, array element or array section}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for simd reduction(foo : argc) //expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max'}}
+#pragma omp parallel for simd reduction(foo : argc) //expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'float'}} expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'int'}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel for simd reduction(&& : argc)
@@ -120,22 +120,22 @@ T tmain(T argc) {
 #pragma omp parallel for simd reduction(^ : T) // expected-error {{'T' does not refer to a value}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for simd reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified list item cannot be reduction}} expected-error 3 {{'operator+' is a private member of 'S2'}}
+#pragma omp parallel for simd reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified list item cannot be reduction}} expected-error 2 {{'operator+' is a private member of 'S2'}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for simd reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 3 {{const-qualified list item cannot be reduction}}
+#pragma omp parallel for simd reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 4 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 3 {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel for simd reduction(max : h.b) // expected-error {{expected variable name, array element or array section}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for simd reduction(+ : ba) // expected-error {{a reduction list item with array type 'const S2 [5]'}}
+#pragma omp parallel for simd reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for simd reduction(* : ca) // expected-error {{a reduction list item with array type 'const S3 [5]'}}
+#pragma omp parallel for simd reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for simd reduction(- : da) // expected-error {{a reduction list item with array type 'const int [5]'}} expected-error {{a reduction list item with array type 'const float [5]'}}
+#pragma omp parallel for simd reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}} expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel for simd reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
@@ -150,7 +150,7 @@ T tmain(T argc) {
 #pragma omp parallel for simd reduction(+ : h, k) // expected-error {{threadprivate or thread local variable cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for simd reduction(+ : o) // expected-error {{no viable overloaded '='}}
+#pragma omp parallel for simd reduction(+ : o) // expected-error 2 {{no viable overloaded '='}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel for simd private(i), reduction(+ : j), reduction(+ : q) // expected-error 4 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
@@ -160,7 +160,7 @@ T tmain(T argc) {
 #pragma omp parallel for simd reduction(+ : p), reduction(+ : p) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for simd reduction(+ : p), reduction(+ : p) // expected-error 3 {{variable can appear only once in OpenMP 'reduction' clause}} expected-note 3 {{previously referenced here}}
+#pragma omp parallel for simd reduction(+ : p), reduction(+ : p) // expected-error 2 {{variable can appear only once in OpenMP 'reduction' clause}} expected-note 2 {{previously referenced here}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel for simd reduction(+ : r) // expected-error 2 {{const-qualified list item cannot be reduction}}
@@ -251,13 +251,13 @@ int main(int argc, char **argv) {
 #pragma omp parallel for simd reduction(max : h.b) // expected-error {{expected variable name, array element or array section}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for simd reduction(+ : ba) // expected-error {{a reduction list item with array type 'const S2 [5]'}}
+#pragma omp parallel for simd reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for simd reduction(* : ca) // expected-error {{a reduction list item with array type 'const S3 [5]'}}
+#pragma omp parallel for simd reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp parallel for simd reduction(- : da) // expected-error {{a reduction list item with array type 'const int [5]'}}
+#pragma omp parallel for simd reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp parallel for simd reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
diff --git a/test/OpenMP/parallel_for_simd_safelen_messages.cpp b/test/OpenMP/parallel_for_simd_safelen_messages.cpp
index 45f2fa2b6277c..3e643c619c6fc 100644
--- a/test/OpenMP/parallel_for_simd_safelen_messages.cpp
+++ b/test/OpenMP/parallel_for_simd_safelen_messages.cpp
@@ -1,8 +1,13 @@
 // RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
 
 void foo() {
 }
 
+#if __cplusplus >= 201103L
+// expected-note@+2 4 {{declared here}}
+#endif
 bool foobool(int argc) {
   return argc;
 }
@@ -29,14 +34,21 @@ T tmain(T argc, S **argv) { //expected-note 2 {{declared here}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp parallel for simd safelen ((ST > 0) ? 1 + ST : 2)
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
-  // expected-error@+3 2 {{directive '#pragma omp parallel for simd' cannot contain more than one 'safelen' clause}}
-  // expected-error@+2 2 {{argument to 'safelen' clause must be a strictly positive integer value}}
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+  // expected-error@+6 2 {{directive '#pragma omp parallel for simd' cannot contain more than one 'safelen' clause}}
+  // expected-error@+5 2 {{argument to 'safelen' clause must be a strictly positive integer value}}
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   #pragma omp parallel for simd safelen (foobool(argc)), safelen (true), safelen (-5)
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp parallel for simd safelen (S) // expected-error {{'S' does not refer to a value}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp parallel for simd safelen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp parallel for simd safelen (4)
@@ -57,16 +69,27 @@ int main(int argc, char **argv) {
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   #pragma omp parallel for simd safelen (2+2)) // expected-warning {{extra tokens at the end of '#pragma omp parallel for simd' are ignored}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  #pragma omp parallel for simd safelen (foobool(1) > 0 ? 1 : 2) // expected-error {{expression is not an integral constant expression}}
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+  #pragma omp parallel for simd safelen (foobool(1) > 0 ? 1 : 2)
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+3 {{expression is not an integral constant expression}}
+  // expected-error@+6 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   // expected-error@+2 2 {{directive '#pragma omp parallel for simd' cannot contain more than one 'safelen' clause}}
   // expected-error@+1 2 {{argument to 'safelen' clause must be a strictly positive integer value}}
   #pragma omp parallel for simd safelen (foobool(argc)), safelen (true), safelen (-5) 
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   #pragma omp parallel for simd safelen (S1) // expected-error {{'S1' does not refer to a value}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+1 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp parallel for simd safelen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   // expected-error@+3 {{statement after '#pragma omp parallel for simd' must be a for loop}}
diff --git a/test/OpenMP/parallel_for_simd_simdlen_messages.cpp b/test/OpenMP/parallel_for_simd_simdlen_messages.cpp
index dd1cf0feaa497..fa9e0d6802a9c 100644
--- a/test/OpenMP/parallel_for_simd_simdlen_messages.cpp
+++ b/test/OpenMP/parallel_for_simd_simdlen_messages.cpp
@@ -1,8 +1,13 @@
 // RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
 
 void foo() {
 }
 
+#if __cplusplus >= 201103L
+// expected-note@+2 4 {{declared here}}
+#endif
 bool foobool(int argc) {
   return argc;
 }
@@ -29,14 +34,21 @@ T tmain(T argc, S **argv) { //expected-note 2 {{declared here}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp parallel for simd simdlen ((ST > 0) ? 1 + ST : 2)
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
-  // expected-error@+3 2 {{directive '#pragma omp parallel for simd' cannot contain more than one 'simdlen' clause}}
-  // expected-error@+2 2 {{argument to 'simdlen' clause must be a strictly positive integer value}}
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+  // expected-error@+6 2 {{directive '#pragma omp parallel for simd' cannot contain more than one 'simdlen' clause}}
+  // expected-error@+5 2 {{argument to 'simdlen' clause must be a strictly positive integer value}}
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   #pragma omp parallel for simd simdlen (foobool(argc)), simdlen (true), simdlen (-5)
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp parallel for simd simdlen (S) // expected-error {{'S' does not refer to a value}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp parallel for simd simdlen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp parallel for simd simdlen (4)
@@ -57,16 +69,27 @@ int main(int argc, char **argv) {
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   #pragma omp parallel for simd simdlen (2+2)) // expected-warning {{extra tokens at the end of '#pragma omp parallel for simd' are ignored}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  #pragma omp parallel for simd simdlen (foobool(1) > 0 ? 1 : 2) // expected-error {{expression is not an integral constant expression}}
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+  #pragma omp parallel for simd simdlen (foobool(1) > 0 ? 1 : 2)
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+3 {{expression is not an integral constant expression}}
+  // expected-error@+6 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   // expected-error@+2 2 {{directive '#pragma omp parallel for simd' cannot contain more than one 'simdlen' clause}}
   // expected-error@+1 2 {{argument to 'simdlen' clause must be a strictly positive integer value}}
   #pragma omp parallel for simd simdlen (foobool(argc)), simdlen (true), simdlen (-5) 
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   #pragma omp parallel for simd simdlen (S1) // expected-error {{'S1' does not refer to a value}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+1 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp parallel for simd simdlen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   // expected-error@+3 {{statement after '#pragma omp parallel for simd' must be a for loop}}
diff --git a/test/OpenMP/parallel_num_threads_codegen.cpp b/test/OpenMP/parallel_num_threads_codegen.cpp
index d744e5e87b748..c5f11bde19aed 100644
--- a/test/OpenMP/parallel_num_threads_codegen.cpp
+++ b/test/OpenMP/parallel_num_threads_codegen.cpp
@@ -72,11 +72,11 @@ int main() {
 // CHECK:       [[GTID:%.+]] = call {{.*}}i32 @__kmpc_global_thread_num([[IDENT_T_TY]]* [[DEF_LOC_2]])
 // CHECK:       call {{.*}}void @__kmpc_push_num_threads([[IDENT_T_TY]]* [[DEF_LOC_2]], i32 [[GTID]], i32 1)
 // CHECK:       call {{.*}}void {{.*}} @__kmpc_fork_call(
-// CHECK:       call {{.*}} [[S_TY_CONSTR]]([[S_TY]]* [[S_TEMP:%.+]], [[INTPTR_T_TY]] [[INTPTR_T_TY_ATTR]]23)
+// CHECK:       {{(invoke|call)}} {{.*}} [[S_TY_CONSTR]]([[S_TY]]* [[S_TEMP:%.+]], [[INTPTR_T_TY]] [[INTPTR_T_TY_ATTR]]23)
 // CHECK:       [[S_CHAR_OP:%.+]] = invoke{{.*}} i8 [[S_TY_CHAR_OP]]([[S_TY]]* [[S_TEMP]])
 // CHECK:       [[RES:%.+]] = sext {{.*}}i8 [[S_CHAR_OP]] to i32
 // CHECK:       call {{.*}}void @__kmpc_push_num_threads([[IDENT_T_TY]]* [[DEF_LOC_2]], i32 [[GTID]], i32 [[RES]])
-// CHECK:       call {{.*}} [[S_TY_DESTR]]([[S_TY]]* [[S_TEMP]])
+// CHECK:       {{(invoke|call)}} {{.*}} [[S_TY_DESTR]]([[S_TY]]* [[S_TEMP]])
 // CHECK:       call {{.*}}void {{.*}} @__kmpc_fork_call(
 // CHECK:       ret [[INT_TY]] 0
 // CHECK:       }
diff --git a/test/OpenMP/parallel_private_codegen.cpp b/test/OpenMP/parallel_private_codegen.cpp
index 1d195be97d6e8..1498d456dfa55 100644
--- a/test/OpenMP/parallel_private_codegen.cpp
+++ b/test/OpenMP/parallel_private_codegen.cpp
@@ -18,11 +18,69 @@ struct S {
 
 volatile int g __attribute__((aligned(128))) = 1212;
 
+struct SS {
+  int a;
+  int b : 4;
+  int &c;
+  SS(int &d) : a(0), b(0), c(d) {
+#pragma omp parallel private(a, b, c)
+#ifdef LAMBDA
+    [&]() {
+      ++this->a, --b, (this)->c /= 1;
+#pragma omp parallel private(a, b, c)
+      ++(this)->a, --b, this->c /= 1;
+    }();
+#elif defined(BLOCKS)
+    ^{
+      ++a;
+      --this->b;
+      (this)->c /= 1;
+#pragma omp parallel private(a, b, c)
+      ++(this)->a, --b, this->c /= 1;
+    }();
+#else
+    ++this->a, --b, c /= 1;
+#endif
+  }
+};
+
+template<typename T>
+struct SST {
+  T a;
+  SST() : a(T()) {
+#pragma omp parallel private(a)
+#ifdef LAMBDA
+    [&]() {
+      [&]() {
+        ++this->a;
+#pragma omp parallel private(a)
+        ++(this)->a;
+      }();
+    }();
+#elif defined(BLOCKS)
+    ^{
+      ^{
+        ++a;
+#pragma omp parallel private(a)
+        ++(this)->a;
+      }();
+    }();
+#else
+    ++(this)->a;
+#endif
+  }
+};
+
+// CHECK: [[SS_TY:%.+]] = type { i{{[0-9]+}}, i8
+// LAMBDA: [[SS_TY:%.+]] = type { i{{[0-9]+}}, i8
+// BLOCKS: [[SS_TY:%.+]] = type { i{{[0-9]+}}, i8
 // CHECK: [[S_FLOAT_TY:%.+]] = type { float }
 // CHECK: [[S_INT_TY:%.+]] = type { i{{[0-9]+}} }
+// CHECK: [[SST_TY:%.+]] = type { i{{[0-9]+}} }
 template <typename T>
 T tmain() {
   S<T> test;
+  SST<T> sst;
   T t_var __attribute__((aligned(128))) = T();
   T vec[] __attribute__((aligned(128))) = {1, 2};
   S<T> s_arr[] __attribute__((aligned(128))) = {1, 2};
@@ -37,16 +95,49 @@ T tmain() {
 
 int main() {
   static int sivar;
+  SS ss(sivar);
 #ifdef LAMBDA
   // LAMBDA: [[G:@.+]] = global i{{[0-9]+}} 1212,
   // LAMBDA-LABEL: @main
-  // LAMBDA: call{{.*}} void [[OUTER_LAMBDA:@.+]](
+  // LAMBDA: alloca [[SS_TY]],
+  // LAMBDA: alloca [[CAP_TY:%.+]],
+  // LAMBDA: call{{.*}} void [[OUTER_LAMBDA:@[^(]+]]([[CAP_TY]]*
   [&]() {
   // LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
   // LAMBDA-NOT: = getelementptr inbounds %{{.+}},
   // LAMBDA: call{{.*}} void {{.+}} @__kmpc_fork_call({{.+}}, i32 0, {{.+}}* [[OMP_REGION:@.+]] to {{.+}})
 #pragma omp parallel private(g, sivar)
   {
+    // LAMBDA: define {{.+}} @{{.+}}([[SS_TY]]*
+    // LAMBDA: store i{{[0-9]+}} 0, i{{[0-9]+}}* %
+    // LAMBDA: store i8
+    // LAMBDA: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[SS_TY]]*)* [[SS_MICROTASK:@.+]] to void
+    // LAMBDA: ret
+
+    // LAMBDA: define internal void [[SS_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[SS_TY]]* %{{.+}})
+    // LAMBDA-NOT: getelementptr {{.*}}[[SS_TY]], [[SS_TY]]* %
+    // LAMBDA: call{{.*}} void
+    // LAMBDA: ret void
+
+    // LAMBDA: define internal void @{{.+}}(i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[SS_TY]]* %{{.+}})
+    // LAMBDA: [[A_PRIV:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: [[B_PRIV:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: [[C_PRIV:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: store i{{[0-9]+}}* [[A_PRIV]], i{{[0-9]+}}** [[REFA:%.+]],
+    // LAMBDA: store i{{[0-9]+}}* [[C_PRIV]], i{{[0-9]+}}** [[REFC:%.+]],
+    // LAMBDA-NEXT: [[A_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFA]],
+    // LAMBDA-NEXT: [[A_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[A_PRIV]],
+    // LAMBDA-NEXT: [[INC:%.+]] = add nsw i{{[0-9]+}} [[A_VAL]], 1
+    // LAMBDA-NEXT: store i{{[0-9]+}} [[INC]], i{{[0-9]+}}* [[A_PRIV]],
+    // LAMBDA-NEXT: [[B_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[B_PRIV]],
+    // LAMBDA-NEXT: [[DEC:%.+]] = add nsw i{{[0-9]+}} [[B_VAL]], -1
+    // LAMBDA-NEXT: store i{{[0-9]+}} [[DEC]], i{{[0-9]+}}* [[B_PRIV]],
+    // LAMBDA-NEXT: [[C_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFC]],
+    // LAMBDA-NEXT: [[C_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[C_PRIV]],
+    // LAMBDA-NEXT: [[DIV:%.+]] = sdiv i{{[0-9]+}} [[C_VAL]], 1
+    // LAMBDA-NEXT: store i{{[0-9]+}} [[DIV]], i{{[0-9]+}}* [[C_PRIV]],
+    // LAMBDA-NEXT: ret void
+
     // LAMBDA: define{{.*}} internal{{.*}} void [[OMP_REGION]](i32* noalias %{{.+}}, i32* noalias %{{.+}})
     // LAMBDA: [[G_PRIVATE_ADDR:%.+]] = alloca i{{[0-9]+}},
     // LAMBDA: [[SIVAR_PRIVATE_ADDR:%.+]] = alloca i{{[0-9]+}},
@@ -80,6 +171,7 @@ int main() {
 #elif defined(BLOCKS)
   // BLOCKS: [[G:@.+]] = global i{{[0-9]+}} 1212,
   // BLOCKS-LABEL: @main
+  // BLOCKS: call
   // BLOCKS: call{{.*}} void {{%.+}}(i8
   ^{
   // BLOCKS: define{{.*}} internal{{.*}} void {{.+}}(i8*
@@ -116,6 +208,35 @@ int main() {
   }
   }();
   return 0;
+// BLOCKS: define {{.+}} @{{.+}}([[SS_TY]]*
+// BLOCKS: store i{{[0-9]+}} 0, i{{[0-9]+}}* %
+// BLOCKS: store i8
+// BLOCKS: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[SS_TY]]*)* [[SS_MICROTASK:@.+]] to void
+// BLOCKS: ret
+
+// BLOCKS: define internal void [[SS_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[SS_TY]]* %{{.+}})
+// BLOCKS-NOT: getelementptr {{.*}}[[SS_TY]], [[SS_TY]]* %
+// BLOCKS: call{{.*}} void
+// BLOCKS: ret void
+
+// BLOCKS: define internal void @{{.+}}(i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[SS_TY]]* %{{.+}})
+// BLOCKS: [[A_PRIV:%.+]] = alloca i{{[0-9]+}},
+// BLOCKS: [[B_PRIV:%.+]] = alloca i{{[0-9]+}},
+// BLOCKS: [[C_PRIV:%.+]] = alloca i{{[0-9]+}},
+// BLOCKS: store i{{[0-9]+}}* [[A_PRIV]], i{{[0-9]+}}** [[REFA:%.+]],
+// BLOCKS: store i{{[0-9]+}}* [[C_PRIV]], i{{[0-9]+}}** [[REFC:%.+]],
+// BLOCKS-NEXT: [[A_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFA]],
+// BLOCKS-NEXT: [[A_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[A_PRIV]],
+// BLOCKS-NEXT: [[INC:%.+]] = add nsw i{{[0-9]+}} [[A_VAL]], 1
+// BLOCKS-NEXT: store i{{[0-9]+}} [[INC]], i{{[0-9]+}}* [[A_PRIV]],
+// BLOCKS-NEXT: [[B_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[B_PRIV]],
+// BLOCKS-NEXT: [[DEC:%.+]] = add nsw i{{[0-9]+}} [[B_VAL]], -1
+// BLOCKS-NEXT: store i{{[0-9]+}} [[DEC]], i{{[0-9]+}}* [[B_PRIV]],
+// BLOCKS-NEXT: [[C_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFC]],
+// BLOCKS-NEXT: [[C_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[C_PRIV]],
+// BLOCKS-NEXT: [[DIV:%.+]] = sdiv i{{[0-9]+}} [[C_VAL]], 1
+// BLOCKS-NEXT: store i{{[0-9]+}} [[DIV]], i{{[0-9]+}}* [[C_PRIV]],
+// BLOCKS-NEXT: ret void
 #else
   S<float> test;
   int t_var = 0;
@@ -166,6 +287,31 @@ int main() {
 // CHECK: call void [[S_INT_TY_DESTR:@.+]]([[S_INT_TY]]*
 // CHECK: ret
 //
+// CHECK: define {{.+}} @{{.+}}([[SS_TY]]*
+// CHECK: store i{{[0-9]+}} 0, i{{[0-9]+}}* %
+// CHECK: store i8
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[SS_TY]]*)* [[SS_MICROTASK:@.+]] to void
+// CHECK: ret
+
+// CHECK: define internal void [[SS_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[SS_TY]]* %{{.+}})
+// CHECK: [[A_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[B_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[C_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: store i{{[0-9]+}}* [[A_PRIV]], i{{[0-9]+}}** [[REFA:%.+]],
+// CHECK: store i{{[0-9]+}}* [[C_PRIV]], i{{[0-9]+}}** [[REFC:%.+]],
+// CHECK-NEXT: [[A_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFA]],
+// CHECK-NEXT: [[A_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[A_PRIV]],
+// CHECK-NEXT: [[INC:%.+]] = add nsw i{{[0-9]+}} [[A_VAL]], 1
+// CHECK-NEXT: store i{{[0-9]+}} [[INC]], i{{[0-9]+}}* [[A_PRIV]],
+// CHECK-NEXT: [[B_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[B_PRIV]],
+// CHECK-NEXT: [[DEC:%.+]] = add nsw i{{[0-9]+}} [[B_VAL]], -1
+// CHECK-NEXT: store i{{[0-9]+}} [[DEC]], i{{[0-9]+}}* [[B_PRIV]],
+// CHECK-NEXT: [[C_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFC]],
+// CHECK-NEXT: [[C_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[C_PRIV]],
+// CHECK-NEXT: [[DIV:%.+]] = sdiv i{{[0-9]+}} [[C_VAL]], 1
+// CHECK-NEXT: store i{{[0-9]+}} [[DIV]], i{{[0-9]+}}* [[C_PRIV]],
+// CHECK-NEXT: ret void
+
 // CHECK: define internal void [[TMAIN_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}})
 // CHECK: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}}, align 128
 // CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}], align 128
@@ -184,5 +330,20 @@ int main() {
 // CHECK-DAG: call void [[S_INT_TY_DESTR]]([[S_INT_TY]]* [[VAR_PRIV]])
 // CHECK-DAG: call void [[S_INT_TY_DESTR]]([[S_INT_TY]]*
 // CHECK: ret void
+
+// CHECK: define {{.+}} @{{.+}}([[SST_TY]]* %
+// CHECK: store i{{[0-9]+}} 0, i{{[0-9]+}}* %
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[SST_TY]]*)* [[SST_MICROTASK:@.+]] to void
+// CHECK: ret
+
+// CHECK: define internal void [[SST_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[SST_TY]]* %{{.+}})
+// CHECK: [[A_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: store i{{[0-9]+}}* [[A_PRIV]], i{{[0-9]+}}** [[REF:%.+]],
+// CHECK-NEXT: [[A_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REF]],
+// CHECK-NEXT: [[A_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[A_PRIV]],
+// CHECK-NEXT: [[INC:%.+]] = add nsw i{{[0-9]+}} [[A_VAL]], 1
+// CHECK-NEXT: store i{{[0-9]+}} [[INC]], i{{[0-9]+}}* [[A_PRIV]],
+// CHECK-NEXT: ret void
+
 #endif
 
diff --git a/test/OpenMP/parallel_reduction_codegen.cpp b/test/OpenMP/parallel_reduction_codegen.cpp
index 05224d0a1391c..703750c2337f7 100644
--- a/test/OpenMP/parallel_reduction_codegen.cpp
+++ b/test/OpenMP/parallel_reduction_codegen.cpp
@@ -20,6 +20,62 @@ struct S {
   ~S() {}
 };
 
+struct SS {
+  int a;
+  int b : 4;
+  int &c;
+  SS(int &d) : a(0), b(0), c(d) {
+#pragma omp parallel reduction(+: a, b, c)
+#ifdef LAMBDA
+    [&]() {
+      ++this->a, --b, (this)->c /= 1;
+#pragma omp parallel reduction(&: a, b, c)
+      ++(this)->a, --b, this->c /= 1;
+    }();
+#elif defined(BLOCKS)
+    ^{
+      ++a;
+      --this->b;
+      (this)->c /= 1;
+#pragma omp parallel reduction(-: a, b, c)
+      ++(this)->a, --b, this->c /= 1;
+    }();
+#else
+    ++this->a, --b, c /= 1;
+#endif
+  }
+};
+
+template<typename T>
+struct SST {
+  T a;
+  SST() : a(T()) {
+#pragma omp parallel reduction(*: a)
+#ifdef LAMBDA
+    [&]() {
+      [&]() {
+        ++this->a;
+#pragma omp parallel reduction(&& :a)
+        ++(this)->a;
+      }();
+    }();
+#elif defined(BLOCKS)
+    ^{
+      ^{
+        ++a;
+#pragma omp parallel reduction(|: a)
+        ++(this)->a;
+      }();
+    }();
+#else
+    ++(this)->a;
+#endif
+  }
+};
+
+// CHECK: [[SS_TY:%.+]] = type { i{{[0-9]+}}, i8
+// LAMBDA: [[SS_TY:%.+]] = type { i{{[0-9]+}}, i8
+// BLOCKS: [[SS_TY:%.+]] = type { i{{[0-9]+}}, i8
 // CHECK-DAG: [[S_FLOAT_TY:%.+]] = type { float }
 // CHECK-DAG: [[S_INT_TY:%.+]] = type { i{{[0-9]+}} }
 // CHECK-DAG: [[REDUCTION_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 18, i32 0, i32 0, i8*
@@ -29,6 +85,7 @@ template <typename T>
 T tmain() {
   T t;
   S<T> test;
+  SST<T> sst;
   T t_var __attribute__((aligned(128))) = T(), t_var1 __attribute__((aligned(128)));
   T vec[] = {1, 2};
   S<T> s_arr[]  = {1, 2};
@@ -41,16 +98,62 @@ T tmain() {
   return T();
 }
 
+int sivar;
 int main() {
+  SS ss(sivar);
 #ifdef LAMBDA
   // LAMBDA: [[G:@.+]] = global i{{[0-9]+}} 1212,
   // LAMBDA-LABEL: @main
-  // LAMBDA: call void [[OUTER_LAMBDA:@.+]](
+  // LAMBDA: alloca [[SS_TY]],
+  // LAMBDA: alloca [[CAP_TY:%.+]],
+  // LAMBDA: call{{.*}} void [[OUTER_LAMBDA:@[^(]+]]([[CAP_TY]]*
   [&]() {
   // LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
   // LAMBDA: call void {{.+}} @__kmpc_fork_call({{.+}}, i32 1, {{.+}}* [[OMP_REGION:@.+]] to {{.+}}, i32* [[G]])
 #pragma omp parallel reduction(+:g)
   {
+    // LAMBDA: define {{.+}} @{{.+}}([[SS_TY]]*
+    // LAMBDA: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 0
+    // LAMBDA: store i{{[0-9]+}} 0, i{{[0-9]+}}* %
+    // LAMBDA: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 1
+    // LAMBDA: store i8
+    // LAMBDA: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 2
+    // LAMBDA: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 0
+    // LAMBDA-NOT: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 1
+    // LAMBDA: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 2
+    // LAMBDA: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 4, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[SS_TY]]*, i32*, i32*, i32*)* [[SS_MICROTASK:@.+]] to void
+    // LAMBDA: [[B_REF:%.+]] = getelementptr {{.*}}[[SS_TY]], [[SS_TY]]* %{{.*}}, i32 0, i32 1
+    // LAMBDA: store i8 %{{.+}}, i8* [[B_REF]],
+    // LAMBDA: ret
+
+    // LAMBDA: define internal void [[SS_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[SS_TY]]* %{{.+}}, i32* {{.+}}, i32* {{.+}}, i32* {{.+}})
+    // LAMBDA-NOT: getelementptr {{.*}}[[SS_TY]], [[SS_TY]]* %
+    // LAMBDA: call{{.*}} void
+    // LAMBDA: ret void
+
+    // LAMBDA: define internal void @{{.+}}(i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[SS_TY]]*
+    // LAMBDA: [[A_PRIV:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: [[B_PRIV:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: [[C_PRIV:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: store i{{[0-9]+}} -1, i{{[0-9]+}}* [[A_PRIV]],
+    // LAMBDA: store i{{[0-9]+}}* [[A_PRIV]], i{{[0-9]+}}** [[REFA:%.+]],
+    // LAMBDA: store i{{[0-9]+}} -1, i{{[0-9]+}}* [[B_PRIV]],
+    // LAMBDA: store i{{[0-9]+}} -1, i{{[0-9]+}}* [[C_PRIV]],
+    // LAMBDA: store i{{[0-9]+}}* [[C_PRIV]], i{{[0-9]+}}** [[REFC:%.+]],
+    // LAMBDA: [[A_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFA]],
+    // LAMBDA-NEXT: [[A_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[A_PRIV]],
+    // LAMBDA-NEXT: [[INC:%.+]] = add nsw i{{[0-9]+}} [[A_VAL]], 1
+    // LAMBDA-NEXT: store i{{[0-9]+}} [[INC]], i{{[0-9]+}}* [[A_PRIV]],
+    // LAMBDA-NEXT: [[B_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[B_PRIV]],
+    // LAMBDA-NEXT: [[DEC:%.+]] = add nsw i{{[0-9]+}} [[B_VAL]], -1
+    // LAMBDA-NEXT: store i{{[0-9]+}} [[DEC]], i{{[0-9]+}}* [[B_PRIV]],
+    // LAMBDA-NEXT: [[C_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFC]],
+    // LAMBDA-NEXT: [[C_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[C_PRIV]],
+    // LAMBDA-NEXT: [[DIV:%.+]] = sdiv i{{[0-9]+}} [[C_VAL]], 1
+    // LAMBDA-NEXT: store i{{[0-9]+}} [[DIV]], i{{[0-9]+}}* [[C_PRIV]],
+    // LAMBDA: call i32 @__kmpc_reduce_nowait(
+    // LAMBDA: ret void
+
     // LAMBDA: define{{.*}} internal{{.*}} void [[OMP_REGION]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i32* dereferenceable(4) %{{.+}})
     // LAMBDA: [[G_PRIVATE_ADDR:%.+]] = alloca i{{[0-9]+}},
 
@@ -100,6 +203,7 @@ int main() {
 #elif defined(BLOCKS)
   // BLOCKS: [[G:@.+]] = global i{{[0-9]+}} 1212,
   // BLOCKS-LABEL: @main
+  // BLOCKS: call
   // BLOCKS: call void {{%.+}}(i8
   ^{
   // BLOCKS: define{{.*}} internal{{.*}} void {{.+}}(i8*
@@ -152,6 +256,47 @@ int main() {
   }
   }();
   return 0;
+// BLOCKS: define {{.+}} @{{.+}}([[SS_TY]]*
+// BLOCKS: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 0
+// BLOCKS: store i{{[0-9]+}} 0, i{{[0-9]+}}* %
+// BLOCKS: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 1
+// BLOCKS: store i8
+// BLOCKS: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 2
+// BLOCKS: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 0
+// BLOCKS-NOT: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 1
+// BLOCKS: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 2
+// BLOCKS: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 4, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[SS_TY]]*, i32*, i32*, i32*)* [[SS_MICROTASK:@.+]] to void
+// BLOCKS: [[B_REF:%.+]] = getelementptr {{.*}}[[SS_TY]], [[SS_TY]]* %{{.*}}, i32 0, i32 1
+// BLOCKS: store i8 %{{.+}}, i8* [[B_REF]],
+// BLOCKS: ret
+
+// BLOCKS: define internal void [[SS_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[SS_TY]]* %{{.+}}, i32* {{.+}}, i32* {{.+}}, i32* {{.+}})
+// BLOCKS-NOT: getelementptr {{.*}}[[SS_TY]], [[SS_TY]]* %
+// BLOCKS: call{{.*}} void
+// BLOCKS: ret void
+
+// BLOCKS: define internal void @{{.+}}(i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[SS_TY]]* %{{.+}}, i32* {{.+}}, i32* {{.+}}, i32* {{.+}})
+// BLOCKS: [[A_PRIV:%.+]] = alloca i{{[0-9]+}},
+// BLOCKS: [[B_PRIV:%.+]] = alloca i{{[0-9]+}},
+// BLOCKS: [[C_PRIV:%.+]] = alloca i{{[0-9]+}},
+// BLOCKS: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[A_PRIV]],
+// BLOCKS: store i{{[0-9]+}}* [[A_PRIV]], i{{[0-9]+}}** [[REFA:%.+]],
+// BLOCKS: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[B_PRIV]],
+// BLOCKS: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[C_PRIV]],
+// BLOCKS: store i{{[0-9]+}}* [[C_PRIV]], i{{[0-9]+}}** [[REFC:%.+]],
+// BLOCKS: [[A_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFA]],
+// BLOCKS-NEXT: [[A_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[A_PRIV]],
+// BLOCKS-NEXT: [[INC:%.+]] = add nsw i{{[0-9]+}} [[A_VAL]], 1
+// BLOCKS-NEXT: store i{{[0-9]+}} [[INC]], i{{[0-9]+}}* [[A_PRIV]],
+// BLOCKS-NEXT: [[B_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[B_PRIV]],
+// BLOCKS-NEXT: [[DEC:%.+]] = add nsw i{{[0-9]+}} [[B_VAL]], -1
+// BLOCKS-NEXT: store i{{[0-9]+}} [[DEC]], i{{[0-9]+}}* [[B_PRIV]],
+// BLOCKS-NEXT: [[C_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFC]],
+// BLOCKS-NEXT: [[C_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[C_PRIV]],
+// BLOCKS-NEXT: [[DIV:%.+]] = sdiv i{{[0-9]+}} [[C_VAL]], 1
+// BLOCKS-NEXT: store i{{[0-9]+}} [[DIV]], i{{[0-9]+}}* [[C_PRIV]],
+// BLOCKS: call i32 @__kmpc_reduce_nowait(
+// BLOCKS: ret void
 #else
   S<float> test;
   float t_var = 0, t_var1;
@@ -472,6 +617,43 @@ int main() {
 // CHECK: call {{.*}} [[S_INT_TY_DESTR:@.+]]([[S_INT_TY]]*
 // CHECK: ret
 //
+// CHECK: define {{.+}} @{{.+}}([[SS_TY]]*
+// CHECK: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 0
+// CHECK: store i{{[0-9]+}} 0, i{{[0-9]+}}* %
+// CHECK: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 1
+// CHECK: store i8
+// CHECK: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 2
+// CHECK: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 0
+// CHECK-NOT: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 1
+// CHECK: getelementptr inbounds [[SS_TY]], [[SS_TY]]* %{{.+}}, i32 0, i32 2
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 4, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[SS_TY]]*, i{{[0-9]+}}*, i{{[0-9]+}}*, i{{[0-9]+}}*)* [[SS_MICROTASK:@.+]] to void
+// CHECK: [[B_REF:%.+]] = getelementptr {{.*}}[[SS_TY]], [[SS_TY]]* %{{.*}}, i32 0, i32 1
+// CHECK: store i8 %{{.+}}, i8* [[B_REF]],
+// CHECK: ret
+
+// CHECK: define internal void [[SS_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [[SS_TY]]*
+// CHECK: [[A_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[B_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[C_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[A_PRIV]],
+// CHECK: store i{{[0-9]+}}* [[A_PRIV]], i{{[0-9]+}}** [[REFA:%.+]],
+// CHECK: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[B_PRIV]],
+// CHECK: store i{{[0-9]+}} 0, i{{[0-9]+}}* [[C_PRIV]],
+// CHECK: store i{{[0-9]+}}* [[C_PRIV]], i{{[0-9]+}}** [[REFC:%.+]],
+// CHECK: [[A_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFA]],
+// CHECK-NEXT: [[A_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[A_PRIV]],
+// CHECK-NEXT: [[INC:%.+]] = add nsw i{{[0-9]+}} [[A_VAL]], 1
+// CHECK-NEXT: store i{{[0-9]+}} [[INC]], i{{[0-9]+}}* [[A_PRIV]],
+// CHECK-NEXT: [[B_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[B_PRIV]],
+// CHECK-NEXT: [[DEC:%.+]] = add nsw i{{[0-9]+}} [[B_VAL]], -1
+// CHECK-NEXT: store i{{[0-9]+}} [[DEC]], i{{[0-9]+}}* [[B_PRIV]],
+// CHECK-NEXT: [[C_PRIV:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[REFC]],
+// CHECK-NEXT: [[C_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[C_PRIV]],
+// CHECK-NEXT: [[DIV:%.+]] = sdiv i{{[0-9]+}} [[C_VAL]], 1
+// CHECK-NEXT: store i{{[0-9]+}} [[DIV]], i{{[0-9]+}}* [[C_PRIV]],
+// CHECK: call i32 @__kmpc_reduce_nowait(
+// CHECK: ret void
+
 // CHECK: define internal void [[TMAIN_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}},
 // CHECK: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}}, align 128
 // CHECK: [[VAR_PRIV:%.+]] = alloca [[S_INT_TY]], align 128
diff --git a/test/OpenMP/parallel_reduction_messages.cpp b/test/OpenMP/parallel_reduction_messages.cpp
index b29f7c98a5744..af1f5ed7bc0d9 100644
--- a/test/OpenMP/parallel_reduction_messages.cpp
+++ b/test/OpenMP/parallel_reduction_messages.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -o - %s
-// RUN: %clang_cc1 -verify -fopenmp -std=c++98 -ferror-limit 100 -o - %s
-// RUN: %clang_cc1 -verify -fopenmp -std=c++11 -ferror-limit 100 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 150 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 -ferror-limit 150 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 -ferror-limit 150 -o - %s
 
 void foo() {
 }
@@ -13,7 +13,7 @@ struct S1; // expected-note {{declared here}} expected-note 4 {{forward declarat
 extern S1 a;
 class S2 {
   mutable int a;
-  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 4 {{implicitly declared private here}}
+  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 3 {{implicitly declared private here}}
 
 public:
   S2() : a(0) {}
@@ -22,7 +22,7 @@ public:
   static const float S2sc;
 };
 const float S2::S2sc = 0; // expected-note 2 {{'S2sc' defined here}}
-S2 b;                     // expected-note 2 {{'b' defined here}}
+S2 b;                     // expected-note 3 {{'b' defined here}}
 const S2 ba[5];           // expected-note 2 {{'ba' defined here}}
 class S3 {
   int a;
@@ -34,7 +34,7 @@ public:
   S3 operator+(const S3 &arg1) { return arg1; }
 };
 int operator+(const S3 &arg1, const S3 &arg2) { return 5; }
-S3 c;               // expected-note 2 {{'c' defined here}}
+S3 c;               // expected-note 3 {{'c' defined here}}
 const S3 ca[5];     // expected-note 2 {{'ca' defined here}}
 extern const int f; // expected-note 4 {{'f' declared here}}
 class S4 {
@@ -56,9 +56,9 @@ class S5 {
 public:
   S5(int v) : a(v) {}
 };
-class S6 { // expected-note 2 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
+class S6 { // expected-note 3 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
 #if __cplusplus >= 201103L // C++11 or later
-// expected-note@-2 2 {{candidate function (the implicit move assignment operator) not viable}}
+// expected-note@-2 3 {{candidate function (the implicit move assignment operator) not viable}}
 #endif
   int a;
 
@@ -101,23 +101,23 @@ T tmain(T argc) {
   foo();
 #pragma omp parallel reduction(|| : argc ? i : argc) // expected-error 2 {{expected variable name, array element or array section}}
   foo();
-#pragma omp parallel reduction(foo : argc) //expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max'}}
+#pragma omp parallel reduction(foo : argc) //expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'float'}} expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'int'}}
   foo();
 #pragma omp parallel reduction(&& : argc)
   foo();
 #pragma omp parallel reduction(^ : T) // expected-error {{'T' does not refer to a value}}
   foo();
-#pragma omp parallel reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified list item cannot be reduction}} expected-error 3 {{'operator+' is a private member of 'S2'}}
+#pragma omp parallel reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified list item cannot be reduction}} expected-error 2 {{'operator+' is a private member of 'S2'}}
   foo();
-#pragma omp parallel reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 3 {{const-qualified list item cannot be reduction}}
+#pragma omp parallel reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 4 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 3 {{const-qualified list item cannot be reduction}}
   foo();
 #pragma omp parallel reduction(max : h.b) // expected-error {{expected variable name, array element or array section}}
   foo();
-#pragma omp parallel reduction(+ : ba) // expected-error {{a reduction list item with array type 'const S2 [5]'}}
+#pragma omp parallel reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
   foo();
-#pragma omp parallel reduction(* : ca) // expected-error {{a reduction list item with array type 'const S3 [5]'}}
+#pragma omp parallel reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
   foo();
-#pragma omp parallel reduction(- : da) // expected-error {{a reduction list item with array type 'const int [5]'}} expected-error {{a reduction list item with array type 'const float [5]'}}
+#pragma omp parallel reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}} expected-error {{const-qualified list item cannot be reduction}}
   foo();
 #pragma omp parallel reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
   foo();
@@ -127,14 +127,14 @@ T tmain(T argc) {
   foo();
 #pragma omp parallel reduction(+ : h, k) // expected-error {{threadprivate or thread local variable cannot be reduction}}
   foo();
-#pragma omp parallel reduction(+ : o) // expected-error {{no viable overloaded '='}}
+#pragma omp parallel reduction(+ : o) // expected-error 2 {{no viable overloaded '='}}
   foo();
 #pragma omp parallel private(i), reduction(+ : j), reduction(+ : q) // expected-error 4 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
   foo();
 #pragma omp parallel private(k)
 #pragma omp parallel reduction(+ : p), reduction(+ : p) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
   foo();
-#pragma omp parallel reduction(+ : p), reduction(+ : p) // expected-error 3 {{variable can appear only once in OpenMP 'reduction' clause}} expected-note 3 {{previously referenced here}}
+#pragma omp parallel reduction(+ : p), reduction(+ : p) // expected-error 2 {{variable can appear only once in OpenMP 'reduction' clause}} expected-note 2 {{previously referenced here}}
   foo();
 #pragma omp parallel reduction(+ : r) // expected-error 2 {{const-qualified list item cannot be reduction}}
   foo();
@@ -208,11 +208,11 @@ int main(int argc, char **argv) {
   foo();
 #pragma omp parallel reduction(max : h.b) // expected-error {{expected variable name, array element or array section}}
   foo();
-#pragma omp parallel reduction(+ : ba) // expected-error {{a reduction list item with array type 'const S2 [5]'}}
+#pragma omp parallel reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
   foo();
-#pragma omp parallel reduction(* : ca) // expected-error {{a reduction list item with array type 'const S3 [5]'}}
+#pragma omp parallel reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
   foo();
-#pragma omp parallel reduction(- : da) // expected-error {{a reduction list item with array type 'const int [5]'}}
+#pragma omp parallel reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}}
   foo();
 #pragma omp parallel reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
   foo();
diff --git a/test/OpenMP/parallel_sections_ast_print.cpp b/test/OpenMP/parallel_sections_ast_print.cpp
index 9f5c1fadbeb82..a66b75ea6d796 100644
--- a/test/OpenMP/parallel_sections_ast_print.cpp
+++ b/test/OpenMP/parallel_sections_ast_print.cpp
@@ -141,4 +141,7 @@ int main(int argc, char **argv) {
   return tmain<int, 5>(b, &b) + tmain<long, 1>(x, &x);
 }
 
+template<typename T>
+T S<T>::TS = 0;
+
 #endif
diff --git a/test/OpenMP/parallel_sections_private_messages.cpp b/test/OpenMP/parallel_sections_private_messages.cpp
index ac9280e74ce10..40b0138b53967 100644
--- a/test/OpenMP/parallel_sections_private_messages.cpp
+++ b/test/OpenMP/parallel_sections_private_messages.cpp
@@ -29,7 +29,13 @@ class S4 {
   S4(); // expected-note {{implicitly declared private here}}
 
 public:
-  S4(int v) : a(v) {}
+  S4(int v) : a(v) {
+#pragma omp parallel sections private(a) private(this->a)
+    {
+      for (int k = 0; k < v; ++k)
+        ++this->a;
+    }
+  }
 };
 class S5 {
   int a;
@@ -37,6 +43,60 @@ class S5 {
 
 public:
   S5(int v) : a(v) {}
+  S5 &operator=(S5 &s) {
+#pragma omp parallel sections private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    {
+      for (int k = 0; k < s.a; ++k)
+        ++s.a;
+    }
+    return *this;
+  }
+};
+
+template <typename T>
+class S6 {
+public:
+  T a;
+
+  S6() : a(0) {}
+  S6(T v) : a(v) {
+#pragma omp parallel sections private(a) private(this->a)
+    {
+      for (int k = 0; k < v; ++k)
+        ++this->a;
+    }
+  }
+  S6 &operator=(S6 &s) {
+#pragma omp parallel sections private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    {
+      for (int k = 0; k < s.a; ++k)
+        ++s.a;
+    }
+    return *this;
+  }
+};
+
+template <typename T>
+class S7 : public T {
+  T a;
+  S7() : a(0) {}
+
+public:
+  S7(T v) : a(v) {
+#pragma omp parallel sections private(a) private(this->a) private(T::a)
+    {
+      for (int k = 0; k < a.a; ++k)
+        ++this->a.a;
+    }
+  }
+  S7 &operator=(S7 &s) {
+#pragma omp parallel sections private(a) private(this->a) private(s.a) private(s.T::a) // expected-error 2 {{expected variable name or data member of current class}}
+    {
+      for (int k = 0; k < s.a.a; ++k)
+        ++s.a.a;
+    }
+    return *this;
+  }
 };
 
 S3 h;
@@ -134,6 +194,8 @@ using A::x;
 int main(int argc, char **argv) {
   S4 e(4);
   S5 g(5);
+  S6<float> s6(0.0) , s6_0(1.0);
+  S7<S6<float> > s7(0.0) , s7_0(1.0);
   int i;
   int &j = i;
 #pragma omp parallel sections private // expected-error {{expected '(' after 'private'}}
@@ -212,6 +274,8 @@ int main(int argc, char **argv) {
     foo();
   }
 
-  return 0;
+  s6 = s6_0; // expected-note {{in instantiation of member function 'S6<float>::operator=' requested here}}
+  s7 = s7_0; // expected-note {{in instantiation of member function 'S7<S6<float> >::operator=' requested here}}
+  return foomain(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<int, char>' requested here}}
 }
 
diff --git a/test/OpenMP/parallel_sections_reduction_messages.cpp b/test/OpenMP/parallel_sections_reduction_messages.cpp
index eff1849d71323..52d4cb9cdcb65 100644
--- a/test/OpenMP/parallel_sections_reduction_messages.cpp
+++ b/test/OpenMP/parallel_sections_reduction_messages.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -o - %s
-// RUN: %clang_cc1 -verify -fopenmp -std=c++98 -ferror-limit 100 -o - %s
-// RUN: %clang_cc1 -verify -fopenmp -std=c++11 -ferror-limit 100 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 150 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 -ferror-limit 150 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 -ferror-limit 150 -o - %s
 
 void foo() {
 }
@@ -13,7 +13,7 @@ struct S1; // expected-note {{declared here}} expected-note 4 {{forward declarat
 extern S1 a;
 class S2 {
   mutable int a;
-  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 4 {{implicitly declared private here}}
+  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 3 {{implicitly declared private here}}
 
 public:
   S2() : a(0) {}
@@ -22,7 +22,7 @@ public:
   static const float S2sc;
 };
 const float S2::S2sc = 0; // expected-note 2 {{'S2sc' defined here}}
-S2 b;                     // expected-note 2 {{'b' defined here}}
+S2 b;                     // expected-note 3 {{'b' defined here}}
 const S2 ba[5];           // expected-note 2 {{'ba' defined here}}
 class S3 {
   int a;
@@ -34,7 +34,7 @@ public:
   S3 operator+(const S3 &arg1) { return arg1; }
 };
 int operator+(const S3 &arg1, const S3 &arg2) { return 5; }
-S3 c;               // expected-note 2 {{'c' defined here}}
+S3 c;               // expected-note 3 {{'c' defined here}}
 const S3 ca[5];     // expected-note 2 {{'ca' defined here}}
 extern const int f; // expected-note 4 {{'f' declared here}}
 class S4 {
@@ -56,9 +56,9 @@ class S5 {
 public:
   S5(int v) : a(v) {}
 };
-class S6 { // expected-note 2 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
+class S6 { // expected-note 3 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
 #if __cplusplus >= 201103L // C++11 or later
-// expected-note@-2 2 {{candidate function (the implicit move assignment operator) not viable}}
+// expected-note@-2 3 {{candidate function (the implicit move assignment operator) not viable}}
 #endif
   int a;
 
@@ -121,7 +121,7 @@ T tmain(T argc) {
   {
     foo();
   }
-#pragma omp parallel sections reduction(foo : argc) //expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max'}}
+#pragma omp parallel sections reduction(foo : argc) //expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'float'}} expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'int'}}
   {
     foo();
   }
@@ -133,11 +133,11 @@ T tmain(T argc) {
   {
     foo();
   }
-#pragma omp parallel sections reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified list item cannot be reduction}} expected-error 3 {{'operator+' is a private member of 'S2'}}
+#pragma omp parallel sections reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified list item cannot be reduction}} expected-error 2 {{'operator+' is a private member of 'S2'}}
   {
     foo();
   }
-#pragma omp parallel sections reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 3 {{const-qualified list item cannot be reduction}}
+#pragma omp parallel sections reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 4 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 3 {{const-qualified list item cannot be reduction}}
   {
     foo();
   }
@@ -145,15 +145,15 @@ T tmain(T argc) {
   {
     foo();
   }
-#pragma omp parallel sections reduction(+ : ba) // expected-error {{a reduction list item with array type 'const S2 [5]'}}
+#pragma omp parallel sections reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
   {
     foo();
   }
-#pragma omp parallel sections reduction(* : ca) // expected-error {{a reduction list item with array type 'const S3 [5]'}}
+#pragma omp parallel sections reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
   {
     foo();
   }
-#pragma omp parallel sections reduction(- : da) // expected-error {{a reduction list item with array type 'const int [5]'}} expected-error {{a reduction list item with array type 'const float [5]'}}
+#pragma omp parallel sections reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}} expected-error {{const-qualified list item cannot be reduction}}
   {
     foo();
   }
@@ -173,7 +173,7 @@ T tmain(T argc) {
   {
     foo();
   }
-#pragma omp parallel sections reduction(+ : o) // expected-error {{no viable overloaded '='}}
+#pragma omp parallel sections reduction(+ : o) // expected-error 2 {{no viable overloaded '='}}
   {
     foo();
   }
@@ -186,7 +186,7 @@ T tmain(T argc) {
   {
     foo();
   }
-#pragma omp parallel sections reduction(+ : p), reduction(+ : p) // expected-error 3 {{variable can appear only once in OpenMP 'reduction' clause}} expected-note 3 {{previously referenced here}}
+#pragma omp parallel sections reduction(+ : p), reduction(+ : p) // expected-error 2 {{variable can appear only once in OpenMP 'reduction' clause}} expected-note 2 {{previously referenced here}}
   {
     foo();
   }
@@ -298,15 +298,15 @@ int main(int argc, char **argv) {
   {
     foo();
   }
-#pragma omp parallel sections reduction(+ : ba) // expected-error {{a reduction list item with array type 'const S2 [5]'}}
+#pragma omp parallel sections reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
   {
     foo();
   }
-#pragma omp parallel sections reduction(* : ca) // expected-error {{a reduction list item with array type 'const S3 [5]'}}
+#pragma omp parallel sections reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
   {
     foo();
   }
-#pragma omp parallel sections reduction(- : da) // expected-error {{a reduction list item with array type 'const int [5]'}}
+#pragma omp parallel sections reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}}
   {
     foo();
   }
diff --git a/test/OpenMP/predefined_macro.c b/test/OpenMP/predefined_macro.c
index 9a961bce552f2..e18c3d26d4c22 100644
--- a/test/OpenMP/predefined_macro.c
+++ b/test/OpenMP/predefined_macro.c
@@ -5,7 +5,7 @@
 // -fopenmp option is specified
 #ifndef _OPENMP
 #error "No _OPENMP macro is defined with -fopenmp option"
-#elsif _OPENMP != 201307
+#elsif _OPENMP != 201107
 #error "_OPENMP has incorrect value"
 #endif //_OPENMP
 #else
diff --git a/test/OpenMP/sections_firstprivate_codegen.cpp b/test/OpenMP/sections_firstprivate_codegen.cpp
index 0e9273f52ca77..51d0c7b61834c 100644
--- a/test/OpenMP/sections_firstprivate_codegen.cpp
+++ b/test/OpenMP/sections_firstprivate_codegen.cpp
@@ -59,7 +59,6 @@ S<float> s_arr[] = {1, 2};
 // CHECK-DAG: [[VAR:@.+]] = global [[S_FLOAT_TY]] zeroinitializer,
 S<float> var(3);
 // CHECK-DAG: [[SIVAR:@.+]] = internal global i{{[0-9]+}} 0,
-// CHECK-DAG: [[IMPLICIT_BARRIER_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 66, i32 0, i32 0, i8*
 // CHECK-DAG: [[SECTIONS_BARRIER_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 194, i32 0, i32 0, i8*
 
 // CHECK: call {{.*}} [[S_FLOAT_TY_DEF_CONSTR:@.+]]([[S_FLOAT_TY]]* [[TEST]])
@@ -94,7 +93,7 @@ int main() {
     // LAMBDA: [[SIVAR1_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[SIVAR1_REF]]
     // LAMBDA: store i{{[0-9]+}} [[SIVAR1_VAL]], i{{[0-9]+}}* [[SIVAR1_PRIVATE_ADDR]]
 
-    // LAMBDA: call void @__kmpc_barrier(
+    // LAMBDA-NOT: call void @__kmpc_barrier(
     {
       g = 1;
       sivar = 10;
@@ -154,7 +153,7 @@ int main() {
 
     // BLOCKS: [[SIVAR1_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[SIVAR1_REF]],
     // BLOCKS: store i{{[0-9]+}} [[SIVAR1_VAL]], i{{[0-9]+}}* [[SIVAR1_PRIVATE_ADDR]],
-    // BLOCKS: call void @__kmpc_barrier(
+    // BLOCKS-NOT: call void @__kmpc_barrier(
     {
       g = 1;
       sivar = 10;
@@ -217,6 +216,7 @@ int main() {
 // firstprivate t_var(t_var)
 // CHECK: [[T_VAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR]],
 // CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_PRIV]],
+
 // firstprivate vec(vec)
 // CHECK: [[VEC_DEST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
 // CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST]], i8* bitcast ([2 x i{{[0-9]+}}]* [[VEC]] to i8*),
@@ -242,7 +242,7 @@ int main() {
 // CHECK: [[SIVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[SIVAR]],
 // CHECK: store i{{[0-9]+}} [[SIVAR_VAL]], i{{[0-9]+}}* [[SIVAR_PRIV]],
 
-// CHECK: call void @__kmpc_barrier(%{{.+}}* [[IMPLICIT_BARRIER_LOC]], i{{[0-9]+}} [[GTID]])
+// CHECK-NOT: call void @__kmpc_barrier(
 // CHECK: call void @__kmpc_for_static_init_4(
 // CHECK: call void @__kmpc_for_static_fini(
 
@@ -257,7 +257,11 @@ int main() {
 // CHECK: define {{.*}} i{{[0-9]+}} [[TMAIN_INT]]()
 // CHECK: [[TEST:%.+]] = alloca [[S_INT_TY]],
 // CHECK: call {{.*}} [[S_INT_TY_DEF_CONSTR:@.+]]([[S_INT_TY]]* [[TEST]])
-// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 4, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, i32*, [2 x i32]*, [2 x [[S_INT_TY]]]*, [[S_INT_TY]]*)* [[TMAIN_MICROTASK:@.+]] to void
+// CHECK: [[T_VARVAL:%.+]] = load i32, i32* %{{.+}},
+// CHECK: [[T_VARCONV:%.+]] = bitcast i64* [[T_VARCAST:%.+]] to i32*
+// CHECK: store i32  [[T_VARVAL]], i32* [[T_VARCONV]],
+// CHECK: [[T_VARPVT:%.+]] = load i64, i64* [[T_VARCAST]],
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 4, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, i64, [2 x i32]*, [2 x [[S_INT_TY]]]*, [[S_INT_TY]]*)* [[TMAIN_MICROTASK:@.+]] to void {{.*}}i64 [[T_VARPVT]],
 // CHECK: call {{.*}} [[S_INT_TY_DESTR:@.+]]([[S_INT_TY]]*
 // CHECK: ret
 //
@@ -268,19 +272,20 @@ int main() {
 // CHECK: alloca i{{[0-9]+}},
 // CHECK: alloca i{{[0-9]+}},
 // CHECK: alloca i{{[0-9]+}},
+// CHECK: alloca i{{[0-9]+}},
 // CHECK: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
 // CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}],
 // CHECK: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_INT_TY]]],
 // CHECK: [[VAR_PRIV:%.+]] = alloca [[S_INT_TY]],
 // CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
 
-// CHECK: [[T_VAR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** %
+// CHECK-NOT: load i{{[0-9]+}}*, i{{[0-9]+}}** %
 // CHECK: [[VEC_REF:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** %
 // CHECK: [[S_ARR:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** %
 // CHECK: [[VAR_REF:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** %
 
 // firstprivate t_var(t_var)
-// CHECK: [[T_VAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR_REF]],
+// CHECK: [[T_VAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}*
 // CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_PRIV]],
 
 // firstprivate vec(vec)
@@ -304,10 +309,8 @@ int main() {
 // CHECK: call {{.*}} [[S_INT_TY_COPY_CONSTR]]([[S_INT_TY]]* [[VAR_PRIV]], [[S_INT_TY]]* {{.*}} [[VAR_REF]], [[ST_TY]]* [[ST_TY_TEMP]])
 // CHECK: call {{.*}} [[ST_TY_DESTR]]([[ST_TY]]* [[ST_TY_TEMP]])
 
-// Synchronization for initialization.
-// CHECK: [[GTID_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[GTID_ADDR_ADDR]]
-// CHECK: [[GTID:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[GTID_REF]]
-// CHECK: call void @__kmpc_barrier(%{{.+}}* [[IMPLICIT_BARRIER_LOC]], i{{[0-9]+}} [[GTID]])
+// No synchronization for initialization.
+// CHECK-NOT: call void @__kmpc_barrier(
 
 // CHECK: call void @__kmpc_for_static_init_4(
 // CHECK: call void @__kmpc_for_static_fini(
diff --git a/test/OpenMP/sections_private_messages.cpp b/test/OpenMP/sections_private_messages.cpp
index f13bbdb012e22..27bb3136e7201 100644
--- a/test/OpenMP/sections_private_messages.cpp
+++ b/test/OpenMP/sections_private_messages.cpp
@@ -29,7 +29,13 @@ class S4 {
   S4(); // expected-note {{implicitly declared private here}}
 
 public:
-  S4(int v) : a(v) {}
+  S4(int v) : a(v) {
+#pragma omp sections private(a) private(this->a)
+    {
+      for (int k = 0; k < v; ++k)
+        ++this->a;
+    }
+  }
 };
 class S5 {
   int a;
@@ -37,6 +43,60 @@ class S5 {
 
 public:
   S5(int v) : a(v) {}
+  S5 &operator=(S5 &s) {
+#pragma omp sections private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    {
+      for (int k = 0; k < s.a; ++k)
+        ++s.a;
+    }
+    return *this;
+  }
+};
+
+template <typename T>
+class S6 {
+public:
+  T a;
+
+  S6() : a(0) {}
+  S6(T v) : a(v) {
+#pragma omp sections private(a) private(this->a)
+    {
+      for (int k = 0; k < v; ++k)
+        ++this->a;
+    }
+  }
+  S6 &operator=(S6 &s) {
+#pragma omp sections private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    {
+      for (int k = 0; k < s.a; ++k)
+        ++s.a;
+    }
+    return *this;
+  }
+};
+
+template <typename T>
+class S7 : public T {
+  T a;
+  S7() : a(0) {}
+
+public:
+  S7(T v) : a(v) {
+#pragma omp sections private(a) private(this->a) private(T::a)
+    {
+      for (int k = 0; k < a.a; ++k)
+        ++this->a.a;
+    }
+  }
+  S7 &operator=(S7 &s) {
+#pragma omp sections private(a) private(this->a) private(s.a) private(s.T::a) // expected-error 2 {{expected variable name or data member of current class}}
+    {
+      for (int k = 0; k < s.a.a; ++k)
+        ++s.a.a;
+    }
+    return *this;
+  }
 };
 
 S3 h;
@@ -134,6 +194,8 @@ using A::x;
 int main(int argc, char **argv) {
   S4 e(4);
   S5 g(5);
+  S6<float> s6(0.0) , s6_0(1.0);
+  S7<S6<float> > s7(0.0) , s7_0(1.0);
   int i;
   int &j = i;
 #pragma omp sections private // expected-error {{expected '(' after 'private'}}
@@ -212,6 +274,8 @@ int main(int argc, char **argv) {
     foo();
   }
 
-  return 0;
+  s6 = s6_0; // expected-note {{in instantiation of member function 'S6<float>::operator=' requested here}}
+  s7 = s7_0; // expected-note {{in instantiation of member function 'S7<S6<float> >::operator=' requested here}}
+  return foomain(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<int, char>' requested here}}
 }
 
diff --git a/test/OpenMP/sections_reduction_messages.cpp b/test/OpenMP/sections_reduction_messages.cpp
index 79473d4e5d289..134bf619c9113 100644
--- a/test/OpenMP/sections_reduction_messages.cpp
+++ b/test/OpenMP/sections_reduction_messages.cpp
@@ -13,7 +13,7 @@ struct S1; // expected-note {{declared here}} expected-note 4 {{forward declarat
 extern S1 a;
 class S2 {
   mutable int a;
-  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 4 {{implicitly declared private here}}
+  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 3 {{implicitly declared private here}}
 
 public:
   S2() : a(0) {}
@@ -22,7 +22,7 @@ public:
   static const float S2sc;
 };
 const float S2::S2sc = 0; // expected-note 2 {{'S2sc' defined here}}
-S2 b;                     // expected-note 2 {{'b' defined here}}
+S2 b;                     // expected-note 3 {{'b' defined here}}
 const S2 ba[5];           // expected-note 2 {{'ba' defined here}}
 class S3 {
   int a;
@@ -34,7 +34,7 @@ public:
   S3 operator+(const S3 &arg1) { return arg1; }
 };
 int operator+(const S3 &arg1, const S3 &arg2) { return 5; }
-S3 c;               // expected-note 2 {{'c' defined here}}
+S3 c;               // expected-note 3 {{'c' defined here}}
 const S3 ca[5];     // expected-note 2 {{'ca' defined here}}
 extern const int f; // expected-note 4 {{'f' declared here}}
 class S4 {
@@ -56,9 +56,9 @@ class S5 {
 public:
   S5(int v) : a(v) {}
 };
-class S6 { // expected-note 2 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
+class S6 { // expected-note 3 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
 #if __cplusplus >= 201103L // C++11 or later
-// expected-note@-2 2 {{candidate function (the implicit move assignment operator) not viable}}
+// expected-note@-2 3 {{candidate function (the implicit move assignment operator) not viable}}
 #endif
   int a;
 
@@ -132,7 +132,7 @@ T tmain(T argc) {
     foo();
   }
 #pragma omp parallel
-#pragma omp sections reduction(foo : argc) //expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max'}}
+#pragma omp sections reduction(foo : argc) //expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'float'}} expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'int'}}
   {
     foo();
   }
@@ -147,12 +147,12 @@ T tmain(T argc) {
     foo();
   }
 #pragma omp parallel
-#pragma omp sections reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified list item cannot be reduction}} expected-error 3 {{'operator+' is a private member of 'S2'}}
+#pragma omp sections reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified list item cannot be reduction}} expected-error 2 {{'operator+' is a private member of 'S2'}}
   {
     foo();
   }
 #pragma omp parallel
-#pragma omp sections reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 3 {{const-qualified list item cannot be reduction}}
+#pragma omp sections reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 4 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 3 {{const-qualified list item cannot be reduction}}
   {
     foo();
   }
@@ -162,17 +162,17 @@ T tmain(T argc) {
     foo();
   }
 #pragma omp parallel
-#pragma omp sections reduction(+ : ba) // expected-error {{a reduction list item with array type 'const S2 [5]'}}
+#pragma omp sections reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
   {
     foo();
   }
 #pragma omp parallel
-#pragma omp sections reduction(* : ca) // expected-error {{a reduction list item with array type 'const S3 [5]'}}
+#pragma omp sections reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
   {
     foo();
   }
 #pragma omp parallel
-#pragma omp sections reduction(- : da) // expected-error {{a reduction list item with array type 'const int [5]'}} expected-error {{a reduction list item with array type 'const float [5]'}}
+#pragma omp sections reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}} expected-error {{const-qualified list item cannot be reduction}}
   {
     foo();
   }
@@ -197,7 +197,7 @@ T tmain(T argc) {
     foo();
   }
 #pragma omp parallel
-#pragma omp sections reduction(+ : o) // expected-error {{no viable overloaded '='}}
+#pragma omp sections reduction(+ : o) // expected-error 2 {{no viable overloaded '='}}
   {
     foo();
   }
@@ -212,7 +212,7 @@ T tmain(T argc) {
     foo();
   }
 #pragma omp parallel
-#pragma omp sections reduction(+ : p), reduction(+ : p) // expected-error 3 {{variable can appear only once in OpenMP 'reduction' clause}} expected-note 3 {{previously referenced here}}
+#pragma omp sections reduction(+ : p), reduction(+ : p) // expected-error 2 {{variable can appear only once in OpenMP 'reduction' clause}} expected-note 2 {{previously referenced here}}
   {
     foo();
   }
@@ -342,17 +342,17 @@ int main(int argc, char **argv) {
     foo();
   }
 #pragma omp parallel
-#pragma omp sections reduction(+ : ba) // expected-error {{a reduction list item with array type 'const S2 [5]'}}
+#pragma omp sections reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
   {
     foo();
   }
 #pragma omp parallel
-#pragma omp sections reduction(* : ca) // expected-error {{a reduction list item with array type 'const S3 [5]'}}
+#pragma omp sections reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
   {
     foo();
   }
 #pragma omp parallel
-#pragma omp sections reduction(- : da) // expected-error {{a reduction list item with array type 'const int [5]'}}
+#pragma omp sections reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}}
   {
     foo();
   }
diff --git a/test/OpenMP/simd_aligned_messages.cpp b/test/OpenMP/simd_aligned_messages.cpp
index 6be7529ad5119..9515a0bca68f1 100644
--- a/test/OpenMP/simd_aligned_messages.cpp
+++ b/test/OpenMP/simd_aligned_messages.cpp
@@ -196,6 +196,7 @@ int main(int argc, char **argv) {
   #pragma omp simd aligned(h)
   for (int k = 0; k < argc; ++k) ++k;
   int *pargc = &argc;
+  // expected-note@+1 {{in instantiation of function template specialization 'foomain<int *, char>' requested here}}
   foomain<int*,char>(pargc,argv);
   return 0;
 }
diff --git a/test/OpenMP/simd_ast_print.cpp b/test/OpenMP/simd_ast_print.cpp
index cabbe338db2a4..99c00c6dca814 100644
--- a/test/OpenMP/simd_ast_print.cpp
+++ b/test/OpenMP/simd_ast_print.cpp
@@ -6,6 +6,58 @@
 #ifndef HEADER
 #define HEADER
 
+struct SS {
+  SS(): a(0) {}
+  SS(int v) : a(v) {}
+  int a;
+  typedef int type;
+};
+
+template <typename T>
+class S7 : public T {
+protected:
+  T *a;
+  T b[2];
+  S7() : a(0) {}
+
+public:
+  S7(typename T::type &v) : a((T*)&v) {
+#pragma omp simd aligned(a)
+    for (int k = 0; k < a->a; ++k)
+      ++this->a->a;
+  }
+  S7 &operator=(S7 &s) {
+#pragma omp simd aligned(this->b : 8)
+    for (int k = 0; k < s.a->a; ++k)
+      ++s.a->a;
+    return *this;
+  }
+};
+
+// CHECK: #pragma omp simd aligned(this->a)
+// CHECK: #pragma omp simd aligned(this->a)
+// CHECK: #pragma omp simd aligned(this->b: 8)
+
+class S8 : public S7<SS> {
+  S8() {}
+
+public:
+  S8(int v) : S7<SS>(v){
+#pragma omp simd aligned(S7<SS>::a)
+    for (int k = 0; k < a->a; ++k)
+      ++this->a->a;
+  }
+  S8 &operator=(S8 &s) {
+#pragma omp simd aligned(this->b: 4)
+    for (int k = 0; k < s.a->a; ++k)
+      ++s.a->a;
+    return *this;
+  }
+};
+
+// CHECK: #pragma omp simd aligned(this->S7<SS>::a)
+// CHECK: #pragma omp simd aligned(this->b: 4)
+
 void foo() {}
 int g_ind = 1;
 template<class T, class N> T reduct(T* arr, N num) {
diff --git a/test/OpenMP/simd_codegen.cpp b/test/OpenMP/simd_codegen.cpp
index 62028339f5142..29828b39173cd 100644
--- a/test/OpenMP/simd_codegen.cpp
+++ b/test/OpenMP/simd_codegen.cpp
@@ -4,8 +4,10 @@
 // RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -fexceptions -fcxx-exceptions -debug-info-kind=line-tables-only -x c++ -emit-llvm %s -o - | FileCheck %s --check-prefix=TERM_DEBUG
 // REQUIRES: x86-registered-target
 // expected-no-diagnostics
-#ifndef HEADER
-#define HEADER
+ #ifndef HEADER
+ #define HEADER
+
+// CHECK: [[SS_TY:%.+]] = type { i32 }
 
 long long get_val() { return 0; }
 double *g_ptr;
@@ -207,6 +209,7 @@ void simple(float *a, float *b, float *c, float *d) {
 // CHECK-NEXT: store i64 [[ADD7_2]], i64* [[OMP_IV7]]{{.*}}!llvm.mem.parallel_loop_access ![[SIMPLE_LOOP7_ID]]
   }
 // CHECK: [[SIMPLE_LOOP7_END]]
+// CHECK-NEXT: store i64 11, i64*
 // CHECK-NEXT: [[A_PRIV_VAL:%.+]] = load i32, i32* [[A_PRIV]],
 // CHECK-NEXT: store i32 [[A_PRIV_VAL]], i32* [[A]],
   int R;
@@ -321,7 +324,6 @@ public:
 // CHECK-LABEL: define {{.*void}} @{{.*}}iter_simple{{.*}}
 void iter_simple(IterDouble ia, IterDouble ib, IterDouble ic) {
 //
-// CHECK: store i32 0, i32* [[IT_OMP_IV:%[^,]+]]
 // Calculate number of iterations before the loop body.
 // CHECK: [[DIFF1:%.+]] = invoke {{.*}}i32 @{{.*}}IterDouble{{.*}}
 // CHECK: [[DIFF2:%.+]] = sub nsw i32 [[DIFF1]], 1
@@ -329,6 +331,7 @@ void iter_simple(IterDouble ia, IterDouble ib, IterDouble ic) {
 // CHECK-NEXT: [[DIFF4:%.+]] = sdiv i32 [[DIFF3]], 1
 // CHECK-NEXT: [[DIFF5:%.+]] = sub nsw i32 [[DIFF4]], 1
 // CHECK-NEXT: store i32 [[DIFF5]], i32* [[OMP_LAST_IT:%[^,]+]]{{.+}}
+// CHECK: store i32 0, i32* [[IT_OMP_IV:%[^,]+]]
   #pragma omp simd
 
 // CHECK: [[IV:%.+]] = load i32, i32* [[IT_OMP_IV]]{{.+}} !llvm.mem.parallel_loop_access ![[ITER_LOOP_ID:[0-9]+]]
@@ -416,9 +419,10 @@ void collapsed(float *a, float *b, float *c, float *d) {
 // CHECK: [[COLL1_END]]
   }
 // i,j,l are updated; k is not updated.
-// CHECK: store i32 3, i32* [[I:%[^,]+]]
-// CHECK-NEXT: store i32 5, i32* [[I:%[^,]+]]
-// CHECK-NEXT: store i16 9, i16* [[I:%[^,]+]]
+// CHECK: store i32 3, i32*
+// CHECK-NEXT: store i32 5, i32*
+// CHECK-NEXT: store i32 7, i32*
+// CHECK-NEXT: store i16 9, i16*
 // CHECK: ret void
 }
 
@@ -490,8 +494,10 @@ void linear(float *a) {
 
   #pragma omp simd linear(k : 3)
 // CHECK: store i64* [[VAL_ADDR]], i64** [[K_ADDR]],
+// CHECK: [[VAL_REF:%.+]] = load i64*, i64** [[K_ADDR]],
+// CHECK: store i64* [[VAL_REF]], i64** [[K_ADDR_REF:%.+]],
 // CHECK: store i32 0, i32* [[OMP_IV:%[^,]+]]
-// CHECK: [[K_REF:%.+]] = load i64*, i64** [[K_ADDR]],
+// CHECK: [[K_REF:%.+]] = load i64*, i64** [[K_ADDR_REF]],
 // CHECK: [[K0LOAD:%.+]] = load i64, i64* [[K_REF]]
 // CHECK-NEXT: store i64 [[K0LOAD]], i64* [[LIN0:%[^,]+]]
 
@@ -524,7 +530,7 @@ void linear(float *a) {
 // CHECK: [[SIMPLE_LOOP_END]]
 //
 // Update linear vars after loop, as the loop was operating on a private version.
-// CHECK: [[K_REF:%.+]] = load i64*, i64** [[K_ADDR]],
+// CHECK: [[K_REF:%.+]] = load i64*, i64** [[K_ADDR_REF]],
 // CHECK: store i64* [[K_REF]], i64** [[K_PRIV_REF:%.+]],
 // CHECK: [[LIN0_2:%.+]] = load i64, i64* [[LIN0]]
 // CHECK-NEXT: [[LIN_ADD2:%.+]] = add nsw i64 [[LIN0_2]], 27
@@ -533,8 +539,10 @@ void linear(float *a) {
 //
 
   #pragma omp simd linear(val(k) : 3)
+// CHECK: [[VAL_REF:%.+]] = load i64*, i64** [[K_ADDR]],
+// CHECK: store i64* [[VAL_REF]], i64** [[K_ADDR_REF:%.+]],
 // CHECK: store i32 0, i32* [[OMP_IV:%[^,]+]]
-// CHECK: [[K_REF:%.+]] = load i64*, i64** [[K_ADDR]],
+// CHECK: [[K_REF:%.+]] = load i64*, i64** [[K_ADDR_REF]],
 // CHECK: [[K0LOAD:%.+]] = load i64, i64* [[K_REF]]
 // CHECK-NEXT: store i64 [[K0LOAD]], i64* [[LIN0:%[^,]+]]
 
@@ -567,7 +575,7 @@ void linear(float *a) {
 // CHECK: [[SIMPLE_LOOP_END]]
 //
 // Update linear vars after loop, as the loop was operating on a private version.
-// CHECK: [[K_REF:%.+]] = load i64*, i64** [[K_ADDR]],
+// CHECK: [[K_REF:%.+]] = load i64*, i64** [[K_ADDR_REF]],
 // CHECK: store i64* [[K_REF]], i64** [[K_PRIV_REF:%.+]],
 // CHECK: [[LIN0_2:%.+]] = load i64, i64* [[LIN0]]
 // CHECK-NEXT: [[LIN_ADD2:%.+]] = add nsw i64 [[LIN0_2]], 27
@@ -632,5 +640,68 @@ void parallel_simd(float *a) {
     a[i] += bar();
 }
 // TERM_DEBUG: !{{[0-9]+}} = !DILocation(line: [[@LINE-11]],
+
+// CHECK-LABEL: S8
+// CHECK: ptrtoint [[SS_TY]]* %{{.+}} to i64
+// CHECK-NEXT: and i64 %{{.+}}, 15
+// CHECK-NEXT: icmp eq i64 %{{.+}}, 0
+// CHECK-NEXT: call void @llvm.assume(i1
+
+// CHECK: ptrtoint [[SS_TY]]* %{{.+}} to i64
+// CHECK-NEXT: and i64 %{{.+}}, 7
+// CHECK-NEXT: icmp eq i64 %{{.+}}, 0
+// CHECK-NEXT: call void @llvm.assume(i1
+
+// CHECK: ptrtoint [[SS_TY]]* %{{.+}} to i64
+// CHECK-NEXT: and i64 %{{.+}}, 15
+// CHECK-NEXT: icmp eq i64 %{{.+}}, 0
+// CHECK-NEXT: call void @llvm.assume(i1
+
+// CHECK: ptrtoint [[SS_TY]]* %{{.+}} to i64
+// CHECK-NEXT: and i64 %{{.+}}, 3
+// CHECK-NEXT: icmp eq i64 %{{.+}}, 0
+// CHECK-NEXT: call void @llvm.assume(i1
+struct SS {
+  SS(): a(0) {}
+  SS(int v) : a(v) {}
+  int a;
+  typedef int type;
+};
+
+template <typename T>
+class S7 : public T {
+protected:
+  T *a;
+  T b[2];
+  S7() : a(0) {}
+
+public:
+  S7(typename T::type &v) : a((T*)&v) {
+#pragma omp simd aligned(a)
+    for (int k = 0; k < a->a; ++k)
+      ++this->a->a;
+#pragma omp simd aligned(this->b : 8)
+    for (int k = 0; k < a->a; ++k)
+      ++a->a;
+  }
+};
+
+class S8 : private IterDouble, public S7<SS> {
+  S8() {}
+
+public:
+  S8(int v) : S7<SS>(v){
+#pragma omp parallel private(a)
+#pragma omp simd aligned(S7<SS>::a)
+    for (int k = 0; k < a->a; ++k)
+      ++this->a->a;
+#pragma omp parallel shared(b)
+#pragma omp simd aligned(this->b: 4)
+    for (int k = 0; k < a->a; ++k)
+      ++a->a;
+  }
+};
+S8 s8(0);
+
 #endif // HEADER
 
diff --git a/test/OpenMP/simd_collapse_messages.cpp b/test/OpenMP/simd_collapse_messages.cpp
index e34f0a15b20c9..5b8802468849b 100644
--- a/test/OpenMP/simd_collapse_messages.cpp
+++ b/test/OpenMP/simd_collapse_messages.cpp
@@ -1,8 +1,13 @@
 // RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
 
 void foo() {
 }
 
+#if __cplusplus >= 201103L
+// expected-note@+2 4 {{declared here}}
+#endif
 bool foobool(int argc) {
   return argc;
 }
@@ -29,14 +34,21 @@ T tmain(T argc, S **argv) { //expected-note 2 {{declared here}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp simd collapse ((ST > 0) ? 1 + ST : 2) // expected-note 2 {{as specified in 'collapse' clause}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST]; // expected-error 2 {{expected 2 for loops after '#pragma omp simd', but found only 1}}
-  // expected-error@+3 2 {{directive '#pragma omp simd' cannot contain more than one 'collapse' clause}}
-  // expected-error@+2 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+  // expected-error@+6 2 {{directive '#pragma omp simd' cannot contain more than one 'collapse' clause}}
+  // expected-error@+5 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   #pragma omp simd collapse (foobool(argc)), collapse (true), collapse (-5)
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp simd collapse (S) // expected-error {{'S' does not refer to a value}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp simd collapse (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp simd collapse (1)
@@ -59,16 +71,27 @@ int main(int argc, char **argv) {
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; // expected-error {{expected 4 for loops after '#pragma omp simd', but found only 1}}
   #pragma omp simd collapse (2+2)) // expected-warning {{extra tokens at the end of '#pragma omp simd' are ignored}} expected-note {{as specified in 'collapse' clause}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; // expected-error {{expected 4 for loops after '#pragma omp simd', but found only 1}}
-  #pragma omp simd collapse (foobool(1) > 0 ? 1 : 2) // expected-error {{expression is not an integral constant expression}}
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+  #pragma omp simd collapse (foobool(1) > 0 ? 1 : 2)
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+3 {{expression is not an integral constant expression}}
+  // expected-error@+6 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   // expected-error@+2 2 {{directive '#pragma omp simd' cannot contain more than one 'collapse' clause}}
   // expected-error@+1 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
   #pragma omp simd collapse (foobool(argc)), collapse (true), collapse (-5) 
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   #pragma omp simd collapse (S1) // expected-error {{'S1' does not refer to a value}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+1 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp simd collapse (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   #pragma omp simd collapse (2) // expected-note {{as specified in 'collapse' clause}}
diff --git a/test/OpenMP/simd_lastprivate_messages.cpp b/test/OpenMP/simd_lastprivate_messages.cpp
index 7cc5ba81f78b4..16223db71ad57 100644
--- a/test/OpenMP/simd_lastprivate_messages.cpp
+++ b/test/OpenMP/simd_lastprivate_messages.cpp
@@ -217,5 +217,5 @@ int main(int argc, char **argv) {
 #pragma omp simd lastprivate(t) // OK
   for (i = 0; i < argc; ++i)
     foo();
-  return 0;
+  return foomain(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<int, char>' requested here}}
 }
diff --git a/test/OpenMP/simd_private_messages.cpp b/test/OpenMP/simd_private_messages.cpp
index 3442d182ed02e..1850101891f7e 100644
--- a/test/OpenMP/simd_private_messages.cpp
+++ b/test/OpenMP/simd_private_messages.cpp
@@ -26,13 +26,61 @@ class S4 {
   int a;
   S4(); // expected-note {{implicitly declared private here}}
 public:
-  S4(int v):a(v) { }
+  S4(int v) : a(v) {
+#pragma omp simd private(a) private(this->a)
+    for (int k = 0; k < v; ++k)
+      ++this->a;
+  }
 };
 class S5 {
   int a;
   S5():a(0) {} // expected-note {{implicitly declared private here}}
 public:
   S5(int v):a(v) { }
+  S5 &operator=(S5 &s) {
+#pragma omp simd private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T>
+class S6 {
+public:
+  T a;
+
+  S6() : a(0) {}
+  S6(T v) : a(v) {
+#pragma omp simd private(a) private(this->a)
+    for (int k = 0; k < v; ++k)
+      ++this->a;
+  }
+  S6 &operator=(S6 &s) {
+#pragma omp simd private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T>
+class S7 : public T {
+  T a;
+  S7() : a(0) {}
+
+public:
+  S7(T v) : a(v) {
+#pragma omp simd private(a) private(this->a) private(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S7 &operator=(S7 &s) {
+#pragma omp simd private(a) private(this->a) private(s.a) private(s.T::a) // expected-error 2 {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
 };
 
 S3 h;
@@ -96,6 +144,8 @@ using A::x;
 int main(int argc, char **argv) {
   S4 e(4);
   S5 g(5);
+  S6<float> s6(0.0) , s6_0(1.0);
+  S7<S6<float> > s7(0.0) , s7_0(1.0);
   int i;
   int &j = i;
   #pragma omp simd private // expected-error {{expected '(' after 'private'}}
@@ -137,6 +187,8 @@ int main(int argc, char **argv) {
   #pragma omp simd private(i)
   for (int k = 0; k < argc; ++k) ++k;
 
-  return 0;
+  s6 = s6_0; // expected-note {{in instantiation of member function 'S6<float>::operator=' requested here}}
+  s7 = s7_0; // expected-note {{in instantiation of member function 'S7<S6<float> >::operator=' requested here}}
+  return foomain(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<int, char>' requested here}}
 }
 
diff --git a/test/OpenMP/simd_reduction_messages.cpp b/test/OpenMP/simd_reduction_messages.cpp
index e082921cf3839..c47d53eb9185e 100644
--- a/test/OpenMP/simd_reduction_messages.cpp
+++ b/test/OpenMP/simd_reduction_messages.cpp
@@ -13,7 +13,7 @@ struct S1; // expected-note {{declared here}} expected-note 4 {{forward declarat
 extern S1 a;
 class S2 {
   mutable int a;
-  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 4 {{implicitly declared private here}}
+  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 3 {{implicitly declared private here}}
 
 public:
   S2() : a(0) {}
@@ -22,7 +22,7 @@ public:
   static const float S2sc;
 };
 const float S2::S2sc = 0; // expected-note 2 {{'S2sc' defined here}}
-S2 b;                     // expected-note 2 {{'b' defined here}}
+S2 b;                     // expected-note 3 {{'b' defined here}}
 const S2 ba[5];           // expected-note 2 {{'ba' defined here}}
 class S3 {
   int a;
@@ -34,7 +34,7 @@ public:
   S3 operator+(const S3 &arg1) { return arg1; }
 };
 int operator+(const S3 &arg1, const S3 &arg2) { return 5; }
-S3 c;               // expected-note 2 {{'c' defined here}}
+S3 c;               // expected-note 3 {{'c' defined here}}
 const S3 ca[5];     // expected-note 2 {{'ca' defined here}}
 extern const int f; // expected-note 4 {{'f' declared here}}
 class S4 {
@@ -56,9 +56,9 @@ class S5 {
 public:
   S5(int v) : a(v) {}
 };
-class S6 { // expected-note 2 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
+class S6 { // expected-note 3 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
 #if __cplusplus >= 201103L // C++11 or later
-// expected-note@-2 2 {{candidate function (the implicit move assignment operator) not viable}}
+// expected-note@-2 3 {{candidate function (the implicit move assignment operator) not viable}}
 #endif
   int a;
 
@@ -111,7 +111,7 @@ T tmain(T argc) {
 #pragma omp simd reduction(|| : argc ? i : argc) // expected-error 2 {{expected variable name, array element or array section}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp simd reduction(foo : argc) //expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max'}}
+#pragma omp simd reduction(foo : argc) //expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'float'}} expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'int'}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp simd reduction(&& : argc)
@@ -120,22 +120,22 @@ T tmain(T argc) {
 #pragma omp simd reduction(^ : T) // expected-error {{'T' does not refer to a value}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp simd reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified list item cannot be reduction}} expected-error 3 {{'operator+' is a private member of 'S2'}}
+#pragma omp simd reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified list item cannot be reduction}} expected-error 2 {{'operator+' is a private member of 'S2'}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp simd reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 3 {{const-qualified list item cannot be reduction}}
+#pragma omp simd reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 4 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 3 {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp simd reduction(max : h.b) // expected-error {{expected variable name, array element or array section}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp simd reduction(+ : ba) // expected-error {{a reduction list item with array type 'const S2 [5]'}}
+#pragma omp simd reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp simd reduction(* : ca) // expected-error {{a reduction list item with array type 'const S3 [5]'}}
+#pragma omp simd reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp simd reduction(- : da) // expected-error {{a reduction list item with array type 'const int [5]'}} expected-error {{a reduction list item with array type 'const float [5]'}}
+#pragma omp simd reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}} expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp simd reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
@@ -150,7 +150,7 @@ T tmain(T argc) {
 #pragma omp simd reduction(+ : h, k) // expected-error {{threadprivate or thread local variable cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp simd reduction(+ : o) // expected-error {{no viable overloaded '='}}
+#pragma omp simd reduction(+ : o) // expected-error 2 {{no viable overloaded '='}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp simd private(i), reduction(+ : j), reduction(+ : q) // expected-error 4 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
@@ -163,7 +163,7 @@ T tmain(T argc) {
 #pragma omp simd reduction(+ : p), reduction(+ : p) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp simd reduction(+ : p), reduction(+ : p) // expected-error 3 {{variable can appear only once in OpenMP 'reduction' clause}} expected-note 3 {{previously referenced here}}
+#pragma omp simd reduction(+ : p), reduction(+ : p) // expected-error 2 {{variable can appear only once in OpenMP 'reduction' clause}} expected-note 2 {{previously referenced here}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp simd reduction(+ : r) // expected-error 2 {{const-qualified list item cannot be reduction}}
@@ -254,13 +254,13 @@ int main(int argc, char **argv) {
 #pragma omp simd reduction(max : h.b) // expected-error {{expected variable name, array element or array section}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp simd reduction(+ : ba) // expected-error {{a reduction list item with array type 'const S2 [5]'}}
+#pragma omp simd reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp simd reduction(* : ca) // expected-error {{a reduction list item with array type 'const S3 [5]'}}
+#pragma omp simd reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
-#pragma omp simd reduction(- : da) // expected-error {{a reduction list item with array type 'const int [5]'}}
+#pragma omp simd reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}}
   for (int i = 0; i < 10; ++i)
     foo();
 #pragma omp simd reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
diff --git a/test/OpenMP/simd_safelen_messages.cpp b/test/OpenMP/simd_safelen_messages.cpp
index aa31b7da9b7fc..56cb868337e0f 100644
--- a/test/OpenMP/simd_safelen_messages.cpp
+++ b/test/OpenMP/simd_safelen_messages.cpp
@@ -1,8 +1,13 @@
 // RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
 
 void foo() {
 }
 
+#if __cplusplus >= 201103L
+// expected-note@+2 4 {{declared here}}
+#endif
 bool foobool(int argc) {
   return argc;
 }
@@ -29,14 +34,21 @@ T tmain(T argc, S **argv) { //expected-note 2 {{declared here}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp simd safelen ((ST > 0) ? 1 + ST : 2)
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
-  // expected-error@+3 2 {{directive '#pragma omp simd' cannot contain more than one 'safelen' clause}}
-  // expected-error@+2 2 {{argument to 'safelen' clause must be a strictly positive integer value}}
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+  // expected-error@+6 2 {{directive '#pragma omp simd' cannot contain more than one 'safelen' clause}}
+  // expected-error@+5 2 {{argument to 'safelen' clause must be a strictly positive integer value}}
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   #pragma omp simd safelen (foobool(argc)), safelen (true), safelen (-5)
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp simd safelen (S) // expected-error {{'S' does not refer to a value}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp simd safelen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp simd safelen (4)
@@ -57,16 +69,27 @@ int main(int argc, char **argv) {
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   #pragma omp simd safelen (2+2)) // expected-warning {{extra tokens at the end of '#pragma omp simd' are ignored}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  #pragma omp simd safelen (foobool(1) > 0 ? 1 : 2) // expected-error {{expression is not an integral constant expression}}
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+  #pragma omp simd safelen (foobool(1) > 0 ? 1 : 2)
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+3 {{expression is not an integral constant expression}}
+  // expected-error@+6 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   // expected-error@+2 2 {{directive '#pragma omp simd' cannot contain more than one 'safelen' clause}}
   // expected-error@+1 2 {{argument to 'safelen' clause must be a strictly positive integer value}}
   #pragma omp simd safelen (foobool(argc)), safelen (true), safelen (-5) 
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   #pragma omp simd safelen (S1) // expected-error {{'S1' does not refer to a value}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+1 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp simd safelen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   // expected-error@+3 {{statement after '#pragma omp simd' must be a for loop}}
diff --git a/test/OpenMP/simd_simdlen_messages.cpp b/test/OpenMP/simd_simdlen_messages.cpp
index 91656f87b583e..426d1878ba4ca 100644
--- a/test/OpenMP/simd_simdlen_messages.cpp
+++ b/test/OpenMP/simd_simdlen_messages.cpp
@@ -1,8 +1,13 @@
 // RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
 
 void foo() {
 }
 
+#if __cplusplus >= 201103L
+// expected-note@+2 4 {{declared here}}
+#endif
 bool foobool(int argc) {
   return argc;
 }
@@ -29,14 +34,21 @@ T tmain(T argc, S **argv) { //expected-note 2 {{declared here}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp simd simdlen ((ST > 0) ? 1 + ST : 2)
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
-  // expected-error@+3 2 {{directive '#pragma omp simd' cannot contain more than one 'simdlen' clause}}
-  // expected-error@+2 2 {{argument to 'simdlen' clause must be a strictly positive integer value}}
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+  // expected-error@+6 2 {{directive '#pragma omp simd' cannot contain more than one 'simdlen' clause}}
+  // expected-error@+5 2 {{argument to 'simdlen' clause must be a strictly positive integer value}}
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   #pragma omp simd simdlen (foobool(argc)), simdlen (true), simdlen (-5)
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp simd simdlen (S) // expected-error {{'S' does not refer to a value}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp simd simdlen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp simd simdlen (4)
@@ -57,16 +69,27 @@ int main(int argc, char **argv) {
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   #pragma omp simd simdlen (2+2)) // expected-warning {{extra tokens at the end of '#pragma omp simd' are ignored}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  #pragma omp simd simdlen (foobool(1) > 0 ? 1 : 2) // expected-error {{expression is not an integral constant expression}}
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+  #pragma omp simd simdlen (foobool(1) > 0 ? 1 : 2)
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+3 {{expression is not an integral constant expression}}
+  // expected-error@+6 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   // expected-error@+2 2 {{directive '#pragma omp simd' cannot contain more than one 'simdlen' clause}}
   // expected-error@+1 2 {{argument to 'simdlen' clause must be a strictly positive integer value}}
   #pragma omp simd simdlen (foobool(argc)), simdlen (true), simdlen (-5) 
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   #pragma omp simd simdlen (S1) // expected-error {{'S1' does not refer to a value}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+1 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp simd simdlen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   // expected-error@+3 {{statement after '#pragma omp simd' must be a for loop}}
diff --git a/test/OpenMP/single_ast_print.cpp b/test/OpenMP/single_ast_print.cpp
index 8eb35174a77f2..d30b7feb02c57 100644
--- a/test/OpenMP/single_ast_print.cpp
+++ b/test/OpenMP/single_ast_print.cpp
@@ -8,15 +8,43 @@
 
 void foo() {}
 
+struct SS {
+  int a;
+  int b : 4;
+  int &c;
+  SS(int &d) : a(0), b(0), c(d) {
+#pragma omp parallel firstprivate(a, b, c)
+#pragma omp single copyprivate(a, this->b, (this)->c)
+// CHECK: #pragma omp parallel firstprivate(this->a,this->b,this->c)
+// CHECK-NEXT: #pragma omp single copyprivate(this->a,this->b,this->c)
+    ++this->a, --b, (this)->c /= 1;
+  }
+};
+
+template<typename T>
+struct SST {
+  T a;
+  SST() : a(T()) {
+// CHECK: #pragma omp parallel firstprivate(this->a)
+// CHECK-NEXT: #pragma omp single copyprivate(this->a)
+// CHECK: #pragma omp parallel firstprivate(this->a)
+// CHECK-NEXT: #pragma omp single copyprivate(this->a)
+#pragma omp parallel firstprivate(a)
+#pragma omp single copyprivate(this->a)
+    ++this->a;
+  }
+};
+
 template <class T, int N>
 T tmain(T argc) {
   T b = argc, c, d, e, f, g;
   static T a;
+  SST<T> sst;
 // CHECK: static T a;
 #pragma omp parallel private(g)
 #pragma omp single private(argc, b), firstprivate(c, d), nowait
   foo();
-  // CHECK-NEXT: #pragma omp parallel private(g)
+  // CHECK: #pragma omp parallel private(g)
   // CHECK-NEXT: #pragma omp single private(argc,b) firstprivate(c,d) nowait
   // CHECK-NEXT: foo();
 #pragma omp parallel private(g)
@@ -31,11 +59,12 @@ T tmain(T argc) {
 int main(int argc, char **argv) {
   int b = argc, c, d, e, f, g;
   static int a;
+  SS ss(a);
 // CHECK: static int a;
 #pragma omp parallel private(g)
 #pragma omp single private(argc, b), firstprivate(argv, c), nowait
   foo();
-  // CHECK-NEXT: #pragma omp parallel private(g)
+  // CHECK: #pragma omp parallel private(g)
   // CHECK-NEXT: #pragma omp single private(argc,b) firstprivate(argv,c) nowait
   // CHECK-NEXT: foo();
 #pragma omp parallel private(g)
diff --git a/test/OpenMP/single_codegen.cpp b/test/OpenMP/single_codegen.cpp
index 61a93a5a1b735..a2140c2eda78b 100644
--- a/test/OpenMP/single_codegen.cpp
+++ b/test/OpenMP/single_codegen.cpp
@@ -1,7 +1,7 @@
-// RUN: %clang_cc1 -verify -fopenmp -fnoopenmp-use-tls -x c++ -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -fnoopenmp-use-tls -x c++ -std=c++11 -triple x86_64-unknown-unknown -emit-llvm %s -fexceptions -fcxx-exceptions -o - | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -fnoopenmp-use-tls -x c++ -std=c++11 -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -emit-pch -o %t %s
 // RUN: %clang_cc1 -fopenmp -fnoopenmp-use-tls -x c++ -triple x86_64-unknown-unknown -fexceptions -fcxx-exceptions -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
-// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -fnoopenmp-use-tls -fexceptions -fcxx-exceptions -debug-info-kind=line-tables-only -x c++ -emit-llvm %s -o - | FileCheck %s --check-prefix=TERM_DEBUG
+// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -std=c++11 -fopenmp -fnoopenmp-use-tls -fexceptions -fcxx-exceptions -debug-info-kind=line-tables-only -x c++ -emit-llvm %s -o - | FileCheck %s --check-prefix=TERM_DEBUG
 // RUN: %clang_cc1 -verify -fopenmp -fnoopenmp-use-tls -x c++ -std=c++11 -DARRAY -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=ARRAY %s
 // expected-no-diagnostics
 // REQUIRES: x86-registered-target
@@ -19,7 +19,9 @@ public:
 };
 
 // CHECK-DAG:   [[TEST_CLASS_TY:%.+]] = type { i{{[0-9]+}} }
-// CHECK:       [[IDENT_T_TY:%.+]] = type { i32, i32, i32, i32, i8* }
+// CHECK-DAG:   [[SST_TY:%.+]] = type { double }
+// CHECK-DAG:   [[SS_TY:%.+]] = type { i32, i8, i32* }
+// CHECK-DAG:   [[IDENT_T_TY:%.+]] = type { i32, i32, i32, i32, i8* }
 // CHECK:       [[IMPLICIT_BARRIER_SINGLE_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 322, i32 0, i32 0, i8*
 
 // CHECK:       define void [[FOO:@.+]]()
@@ -30,6 +32,39 @@ TestClass tc2[2];
 
 void foo() {}
 
+struct SS {
+  int a;
+  int b : 4;
+  int &c;
+  SS(int &d) : a(0), b(0), c(d) {
+#pragma omp parallel firstprivate(a, b, c)
+#pragma omp single copyprivate(a, this->b, (this)->c)
+    [&]() {
+      ++this->a, --b, (this)->c /= 1;
+#pragma omp parallel firstprivate(a, b, c)
+#pragma omp single copyprivate(a, this->b, (this)->c)
+      ++(this)->a, --b, this->c /= 1;
+    }();
+  }
+};
+
+template<typename T>
+struct SST {
+  T a;
+  SST() : a(T()) {
+#pragma omp parallel firstprivate(a)
+#pragma omp single copyprivate(this->a)
+    [&]() {
+      [&]() {
+        ++this->a;
+#pragma omp parallel firstprivate(a)
+#pragma omp single copyprivate((this)->a)
+        ++(this)->a;
+      }();
+    }();
+  }
+};
+
 // CHECK-LABEL: @main
 // TERM_DEBUG-LABEL: @main
 int main() {
@@ -39,6 +74,8 @@ int main() {
   char a;
   char a2[2];
   TestClass &c = tc;
+  SST<double> sst;
+  SS ss(c.a);
 
 // CHECK:       [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[IDENT_T_TY]]* [[DEFAULT_LOC:@.+]])
 // CHECK-DAG:   [[DID_IT:%.+]] = alloca i32,
@@ -74,8 +111,8 @@ int main() {
 // CHECK-NEXT:  invoke void [[FOO]]()
 // CHECK:       to label {{%?}}[[CONT:.+]] unwind
 // CHECK:       [[CONT]]
-// CHECK:       store i32 1, i32* [[DID_IT]]
 // CHECK:       call void @__kmpc_end_single([[IDENT_T_TY]]* [[DEFAULT_LOC]], i32 [[GTID]])
+// CHECK:       store i32 1, i32* [[DID_IT]]
 // CHECK-NEXT:  br label {{%?}}[[EXIT]]
 // CHECK:       [[EXIT]]
 // CHECK:       [[A_PTR_REF:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[COPY_LIST]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
@@ -186,3 +223,210 @@ void array_func(int n, int a[n], St s[2]) {
 // ARRAY: store i32* %{{.+}}, i32** %{{.+}},
 // ARRAY: store %struct.St* %{{.+}}, %struct.St** %{{.+}},
 #endif
+
+// CHECK-LABEL:@_ZN2SSC2ERi(
+// CHECK: call void ([[IDENT_T_TY]]*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call([[IDENT_T_TY]]* @{{.+}}, i32 4, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, [[SS_TY]]*, i64, i64, i64)* [[SS_MICROTASK:@.+]] to void
+// CHECK-NEXT: ret void
+
+// CHECK: define internal void [[SS_MICROTASK]](i32* {{[^,]+}}, i32* {{[^,]+}}, [[SS_TY]]* {{.+}}, i64 {{.+}}, i64 {{.+}}, i64 {{.+}})
+// Private a
+// CHECK: alloca i64,
+// Private b
+// CHECK: alloca i64,
+// Private c
+// CHECK: alloca i64,
+// CHECK: alloca i32*,
+// CHECK: alloca i32*,
+// CHECK: alloca i32*,
+// CHECK: alloca i32*,
+// CHECK: [[DID_IT:%.+]] = alloca i32,
+// CHECK: bitcast i64* %{{.+}} to i32*
+// CHECK: bitcast i64* %{{.+}} to i32*
+// CHECK: bitcast i64* %{{.+}} to i32*
+// CHECK: store i32 0, i32* [[DID_IT]],
+// CHECK: [[RES:%.+]] = call i32 @__kmpc_single([[IDENT_T_TY]]* @{{.+}}, i32 %{{.+}})
+// CHECK-NEXT: icmp ne i32 [[RES]], 0
+// CHECK-NEXT: br i1
+
+// CHECK: getelementptr inbounds [[CAP_TY:%.+]], [[CAP_TY]]* [[CAP:%.+]], i32 0, i32 0
+// CHECK: getelementptr inbounds [[CAP_TY]], [[CAP_TY]]* [[CAP]], i32 0, i32 1
+// CHECK-NEXT: load i32*, i32** %
+// CHECK-NEXT: store i32* %
+// CHECK-NEXT: getelementptr inbounds [[CAP_TY]], [[CAP_TY]]* [[CAP]], i32 0, i32 2
+// CHECK-NEXT: store i32* %
+// CHECK-NEXT: getelementptr inbounds [[CAP_TY]], [[CAP_TY]]* [[CAP]], i32 0, i32 3
+// CHECK-NEXT: load i32*, i32** %
+// CHECK-NEXT: store i32* %
+// CHECK-LABEL: invoke void @_ZZN2SSC1ERiENKUlvE_clEv(
+// CHECK-SAME: [[CAP_TY]]* [[CAP]])
+
+// CHECK: call void @__kmpc_end_single([[IDENT_T_TY]]* @{{.+}}, i32 %{{.+}})
+// CHECK: store i32 1, i32* [[DID_IT]],
+// CHECK: br label
+
+// CHECK: call void @__kmpc_end_single(%{{.+}}* @{{.+}}, i32 %{{.+}})
+// CHECK: br label
+
+// CHECK: getelementptr inbounds [3 x i8*], [3 x i8*]* [[LIST:%.+]], i64 0, i64 0
+// CHECK: load i32*, i32** %
+// CHECK-NEXT: bitcast i32* %
+// CHECK-NEXT: store i8* %
+// CHECK: getelementptr inbounds [3 x i8*], [3 x i8*]* [[LIST]], i64 0, i64 1
+// CHECK-NEXT: bitcast i32* %
+// CHECK-NEXT: store i8* %
+// CHECK: getelementptr inbounds [3 x i8*], [3 x i8*]* [[LIST]], i64 0, i64 2
+// CHECK: load i32*, i32** %
+// CHECK-NEXT: bitcast i32* %
+// CHECK-NEXT: store i8* %
+// CHECK-NEXT: bitcast [3 x i8*]* [[LIST]] to i8*
+// CHECK-NEXT: load i32, i32* [[DID_IT]],
+// CHECK-NEXT: call void @__kmpc_copyprivate([[IDENT_T_TY]]* @{{.+}}, i32 %{{.+}}, i64 24, i8* %{{.+}}, void (i8*, i8*)* [[COPY_FUNC:@[^,]+]], i32 %{{.+}})
+// CHECK-NEXT: ret void
+
+// CHECK-LABEL: @_ZZN2SSC1ERiENKUlvE_clEv(
+// CHECK: getelementptr inbounds [[CAP_TY]], [[CAP_TY]]* [[CAP:%.+]], i32 0, i32 1
+// CHECK-NEXT: load i32*, i32** %
+// CHECK-NEXT: load i32, i32* %
+// CHECK-NEXT: add nsw i32 %{{.+}}, 1
+// CHECK-NEXT: store i32 %
+// CHECK-NEXT: getelementptr inbounds [[CAP_TY]], [[CAP_TY]]* [[CAP]], i32 0, i32 2
+// CHECK-NEXT: load i32*, i32** %
+// CHECK-NEXT: load i32, i32* %
+// CHECK-NEXT: add nsw i32 %{{.+}}, -1
+// CHECK-NEXT: store i32 %
+// CHECK-NEXT: getelementptr inbounds [[CAP_TY]], [[CAP_TY]]* [[CAP]], i32 0, i32 3
+// CHECK-NEXT: load i32*, i32** %
+// CHECK-NEXT: load i32, i32* %
+// CHECK-NEXT: sdiv i32 %{{.+}}, 1
+// CHECK-NEXT: store i32 %
+// CHECK-NEXT: getelementptr inbounds [[CAP_TY]], [[CAP_TY]]* [[CAP]], i32 0, i32 1
+// CHECK-NEXT: load i32*, i32** %
+// CHECK-NEXT: load i32, i32* %
+// CHECK-NEXT: bitcast i64* %
+// CHECK-NEXT: store i32 %{{.+}}, i32* %
+// CHECK-NEXT: load i64, i64* %
+// CHECK-NEXT: getelementptr inbounds [[CAP_TY]], [[CAP_TY]]* [[CAP]], i32 0, i32 2
+// CHECK-NEXT: load i32*, i32** %
+// CHECK-NEXT: load i32, i32* %
+// CHECK-NEXT: bitcast i64* %
+// CHECK-NEXT: store i32 %{{.+}}, i32* %
+// CHECK-NEXT: load i64, i64* %
+// CHECK-NEXT: getelementptr inbounds [[CAP_TY]], [[CAP_TY]]* [[CAP]], i32 0, i32 3
+// CHECK-NEXT: load i32*, i32** %
+// CHECK-NEXT: load i32, i32* %
+// CHECK-NEXT: bitcast i64* %
+// CHECK-NEXT: store i32 %{{.+}}, i32* %
+// CHECK-NEXT: load i64, i64* %
+// CHECK-NEXT: call void ([[IDENT_T_TY]]*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call([[IDENT_T_TY]]* @{{.+}}, i32 4, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, [[SS_TY]]*, i64, i64, i64)* [[SS_MICROTASK1:@.+]] to void
+// CHECK-NEXT: ret void
+
+// CHECK: define internal void [[COPY_FUNC]](i8*, i8*)
+// CHECK: ret void
+
+// CHECK: define internal void [[SS_MICROTASK1]](i32* {{[^,]+}}, i32* {{[^,]+}}, [[SS_TY]]* {{.+}}, i64 {{.+}}, i64 {{.+}}, i64 {{.+}})
+// Private a
+// CHECK: alloca i64,
+// Private b
+// CHECK: alloca i64,
+// Private c
+// CHECK: alloca i64,
+// CHECK: alloca i32*,
+// CHECK: alloca i32*,
+// CHECK: alloca i32*,
+// CHECK: alloca i32*,
+// CHECK: [[DID_IT:%.+]] = alloca i32,
+// CHECK: bitcast i64* %{{.+}} to i32*
+// CHECK: bitcast i64* %{{.+}} to i32*
+// CHECK: bitcast i64* %{{.+}} to i32*
+// CHECK: [[RES:%.+]] = call i32 @__kmpc_single([[IDENT_T_TY]]* @{{.+}}, i32 %{{.+}})
+// CHECK-NEXT: icmp ne i32 [[RES]], 0
+// CHECK-NEXT: br i1
+
+// CHECK-NOT: getelementptr inbounds
+// CHECK: load i32*, i32** %
+// CHECK-NEXT: load i32, i32* %
+// CHECK-NEXT: add nsw i32 %{{.+}}, 1
+// CHECK-NEXT: store i32 %
+// CHECK-NOT: getelementptr inbounds
+// CHECK: load i32, i32* %
+// CHECK-NEXT: add nsw i32 %{{.+}}, -1
+// CHECK-NEXT: store i32 %
+// CHECK-NOT: getelementptr inbounds
+// CHECK: load i32*, i32** %
+// CHECK-NEXT: load i32, i32* %
+// CHECK-NEXT: sdiv i32 %{{.+}}, 1
+// CHECK-NEXT: store i32 %
+// CHECK-NEXT: call void @__kmpc_end_single([[IDENT_T_TY]]* @{{.+}}, i32 %{{.+}})
+// CHECK-NEXT: store i32 1, i32* [[DID_IT]],
+// CHECK-NEXT: br label
+
+// CHECK: getelementptr inbounds [3 x i8*], [3 x i8*]* [[LIST:%.+]], i64 0, i64 0
+// CHECK: load i32*, i32** %
+// CHECK-NEXT: bitcast i32* %
+// CHECK-NEXT: store i8* %
+// CHECK: getelementptr inbounds [3 x i8*], [3 x i8*]* [[LIST]], i64 0, i64 1
+// CHECK-NEXT: bitcast i32* %
+// CHECK-NEXT: store i8* %
+// CHECK: getelementptr inbounds [3 x i8*], [3 x i8*]* [[LIST]], i64 0, i64 2
+// CHECK: load i32*, i32** %
+// CHECK-NEXT: bitcast i32* %
+// CHECK-NEXT: store i8* %
+// CHECK-NEXT: bitcast [3 x i8*]* [[LIST]] to i8*
+// CHECK-NEXT: load i32, i32* [[DID_IT]],
+// CHECK-NEXT: call void @__kmpc_copyprivate([[IDENT_T_TY]]* @{{.+}}, i32 %{{.+}}, i64 24, i8* %{{.+}}, void (i8*, i8*)* [[COPY_FUNC:@[^,]+]], i32 %{{.+}})
+// CHECK-NEXT:  ret void
+
+// CHECK: define internal void [[COPY_FUNC]](i8*, i8*)
+// CHECK: ret void
+
+// CHECK-LABEL: @_ZN3SSTIdEC2Ev
+// CHECK: getelementptr inbounds [[SST_TY]], [[SST_TY]]* %{{.+}}, i32 0, i32 0
+// CHECK-NEXT: store double 0.000000e+00, double* %
+// CHECK-NEXT: getelementptr inbounds [[SST_TY]], [[SST_TY]]* %{{.+}}, i32 0, i32 0
+// CHECK-NEXT: store double* %{{.+}}, double** %
+// CHECK-NEXT: load double*, double** %
+// CHECK-NEXT: load double, double* %
+// CHECK-NEXT: bitcast i64* %{{.+}} to double*
+// CHECK-NEXT: store double %{{.+}}, double* %
+// CHECK-NEXT: load i64, i64* %
+// CHECK-NEXT: call void ([[IDENT_T_TY]]*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call([[IDENT_T_TY]]* @{{.+}}, i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, [[SST_TY]]*, i64)* [[SST_MICROTASK:@.+]] to void
+// CHECK-NEXT: ret void
+
+// CHECK: define internal void [[SST_MICROTASK]](i32* {{[^,]+}}, i32* {{[^,]+}}, [[SST_TY]]* {{.+}}, i64 {{.+}})
+// CHECK: [[RES:%.+]] = call i32 @__kmpc_single([[IDENT_T_TY]]* @{{.+}}, i32 %{{.+}})
+// CHECK-NEXT: icmp ne i32 [[RES]], 0
+// CHECK-NEXT: br i1
+
+// CHECK: getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 1
+// CHECK-NEXT: load double*, double** %
+// CHECK-NEXT: store double* %
+// CHECK-LABEL: invoke void @_ZZN3SSTIdEC1EvENKUlvE_clEv(
+
+// CHECK: call void @__kmpc_end_single([[IDENT_T_TY]]* @{{.+}}, i32 %{{.+}})
+// CHECK-NEXT: store i32 1, i32* [[DID_IT]],
+// CHECK-NEXT: br label
+
+// CHECK: call void @__kmpc_end_single([[IDENT_T_TY]]* @{{.+}}, i32 %{{.+}})
+// CHECK-NEXT: br label
+
+// CHECK: getelementptr inbounds [1 x i8*], [1 x i8*]* [[LIST:%.+]], i64 0, i64 0
+// CHECK: load double*, double** %
+// CHECK-NEXT: bitcast double* %
+// CHECK-NEXT: store i8* %
+// CHECK-NEXT: bitcast [1 x i8*]* [[LIST]] to i8*
+// CHECK-NEXT: load i32, i32* [[DID_IT]],
+// CHECK-NEXT: call void @__kmpc_copyprivate([[IDENT_T_TY]]* @{{.+}}, i32 %{{.+}}, i64 8, i8* %{{.+}}, void (i8*, i8*)* [[COPY_FUNC:@[^,]+]], i32 %{{.+}})
+// CHECK-NEXT:  ret void
+
+// CHECK-LABEL: @_ZZN3SSTIdEC1EvENKUlvE_clEv(
+// CHECK: getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 1
+// CHECK-NEXT: getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 1
+// CHECK-NEXT: load double*, double** %
+// CHECK-NEXT: store double* %
+// CHECK-LABEL: call void @_ZZZN3SSTIdEC1EvENKUlvE_clEvENKUlvE_clEv(
+// CHECK-NEXT: ret void
+
+// CHECK: define internal void [[COPY_FUNC]](i8*, i8*)
+// CHECK: ret void
+
+// CHECK-LABEL: @_ZZZN3SSTIdEC1EvENKUlvE_clEvENKUlvE_clEv(
diff --git a/test/OpenMP/single_firstprivate_codegen.cpp b/test/OpenMP/single_firstprivate_codegen.cpp
index cc72addb5f453..537ae76f12772 100644
--- a/test/OpenMP/single_firstprivate_codegen.cpp
+++ b/test/OpenMP/single_firstprivate_codegen.cpp
@@ -57,7 +57,6 @@ int vec[] = {1, 2};
 S<float> s_arr[] = {1, 2};
 // CHECK-DAG: [[VAR:@.+]] = global [[S_FLOAT_TY]] zeroinitializer,
 S<float> var(3);
-// CHECK-DAG: [[IMPLICIT_BARRIER_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 66, i32 0, i32 0, i8*
 // CHECK-DAG: [[SINGLE_BARRIER_LOC:@.+]] = private unnamed_addr constant %{{.+}} { i32 0, i32 322, i32 0, i32 0, i8*
 
 // CHECK: call {{.*}} [[S_FLOAT_TY_DEF_CONSTR:@.+]]([[S_FLOAT_TY]]* [[TEST]])
@@ -215,7 +214,7 @@ int main() {
 // CHECK-DAG: call {{.*}} [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]*
 // CHECK: call void @__kmpc_end_single(
 
-// CHECK: call void @__kmpc_barrier(%{{.+}}* [[IMPLICIT_BARRIER_LOC]], i{{[0-9]+}} [[GTID]])
+// CHECK-NOT: call void @__kmpc_barrier(
 
 // CHECK: = call {{.*}}i{{.+}} [[TMAIN_INT:@.+]]()
 
@@ -224,18 +223,24 @@ int main() {
 // CHECK: define {{.*}} i{{[0-9]+}} [[TMAIN_INT]]()
 // CHECK: [[TEST:%.+]] = alloca [[S_INT_TY]],
 // CHECK: call {{.*}} [[S_INT_TY_DEF_CONSTR:@.+]]([[S_INT_TY]]* [[TEST]])
-// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 4, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, i32*, [2 x i32]*, [2 x [[S_INT_TY]]]*, [[S_INT_TY]]*)* [[TMAIN_MICROTASK:@.+]] to void
+// CHECK: [[T_VARVAL:%.+]] = load i32, i32* [[T_VAR:%.+]],
+// CHECK: [[T_VARCONV:%.+]] = bitcast i64* [[T_VARCAST:%.+]] to i32*
+// CHECK: store i32 [[T_VARVAL]], i32* [[T_VARCONV]],
+// CHECK: [[T_VARPVT:%.+]] = load i64, i64* [[T_VARCAST]],
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_call(%{{.+}}* @{{.+}}, i{{[0-9]+}} 4, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, i64, [2 x i32]*, [2 x [[S_INT_TY]]]*, [[S_INT_TY]]*)* [[TMAIN_MICROTASK:@.+]] to void {{.*}}i64 [[T_VARPVT:%.+]],
 // CHECK: call {{.*}} [[S_INT_TY_DESTR:@.+]]([[S_INT_TY]]*
 // CHECK: ret
 //
-// CHECK: define internal void [[TMAIN_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, i32* dereferenceable(4) %{{.+}}, [2 x i32]* dereferenceable(8) %{{.+}}, [2 x [[S_INT_TY]]]* dereferenceable(8) %{{.+}}, [[S_INT_TY]]* dereferenceable(4) %{{.+}})
+// CHECK: define internal void [[TMAIN_MICROTASK]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, i64 {{.*}}%{{.+}}, [2 x i32]* dereferenceable(8) %{{.+}}, [2 x [[S_INT_TY]]]* dereferenceable(8) %{{.+}}, [[S_INT_TY]]* dereferenceable(4) %{{.+}})
+// CHECK: [[T_VAR_ARG:%.+]] = alloca i{{[0-9]+}},
 // CHECK: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
 // CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}],
 // CHECK: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_INT_TY]]],
 // CHECK: [[VAR_PRIV:%.+]] = alloca [[S_INT_TY]],
 // CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
 
-// CHECK: [[T_VAR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** %
+// CHECK-NOT: load i{{[0-9]+}}*, i{{[0-9]+}}** %
+// CHECK: [[T_VAR_CONV:%.+]] = bitcast i64* [[T_VAR_ARG]] to i32*
 // CHECK: [[VEC_REF:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** %
 // CHECK: [[S_ARR:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** %
 // CHECK: [[VAR_REF:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** %
@@ -245,7 +250,7 @@ int main() {
 // CHECK: call i32 @__kmpc_single(
 
 // firstprivate t_var(t_var)
-// CHECK: [[T_VAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR_REF]],
+// CHECK: [[T_VAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}*
 // CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_PRIV]],
 
 // firstprivate vec(vec)
diff --git a/test/OpenMP/single_private_messages.cpp b/test/OpenMP/single_private_messages.cpp
index a24cf47cd2a26..0ed0e6cfb0999 100644
--- a/test/OpenMP/single_private_messages.cpp
+++ b/test/OpenMP/single_private_messages.cpp
@@ -29,7 +29,11 @@ class S4 {
   S4(); // expected-note {{implicitly declared private here}}
 
 public:
-  S4(int v) : a(v) {}
+  S4(int v) : a(v) {
+#pragma omp single private(a) private(this->a)
+    for (int k = 0; k < v; ++k)
+      ++this->a;
+  }
 };
 class S5 {
   int a;
@@ -37,6 +41,50 @@ class S5 {
 
 public:
   S5(int v) : a(v) {}
+  S5 &operator=(S5 &s) {
+#pragma omp single private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T>
+class S6 {
+public:
+  T a;
+
+  S6() : a(0) {}
+  S6(T v) : a(v) {
+#pragma omp single private(a) private(this->a)
+    for (int k = 0; k < v; ++k)
+      ++this->a;
+  }
+  S6 &operator=(S6 &s) {
+#pragma omp single private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T>
+class S7 : public T {
+  T a;
+  S7() : a(0) {}
+
+public:
+  S7(T v) : a(v) {
+#pragma omp single private(a) private(this->a) private(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S7 &operator=(S7 &s) {
+#pragma omp single private(a) private(this->a) private(s.a) private(s.T::a) // expected-error 2 {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
 };
 
 S3 h;
@@ -102,6 +150,8 @@ using A::x;
 int main(int argc, char **argv) {
   S4 e(4);
   S5 g(5);
+  S6<float> s6(0.0) , s6_0(1.0);
+  S7<S6<float> > s7(0.0) , s7_0(1.0);
   int i;
   int &j = i;
 #pragma omp single private // expected-error {{expected '(' after 'private'}}
@@ -146,6 +196,8 @@ int main(int argc, char **argv) {
 #pragma omp single private(m) // OK
   foo();
 
-  return 0;
+  s6 = s6_0; // expected-note {{in instantiation of member function 'S6<float>::operator=' requested here}}
+  s7 = s7_0; // expected-note {{in instantiation of member function 'S7<S6<float> >::operator=' requested here}}
+  return foomain(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<int, char>' requested here}}
 }
 
diff --git a/test/OpenMP/target_ast_print.cpp b/test/OpenMP/target_ast_print.cpp
index acf032a221202..e093e29979ba8 100644
--- a/test/OpenMP/target_ast_print.cpp
+++ b/test/OpenMP/target_ast_print.cpp
@@ -25,6 +25,12 @@ T tmain(T argc, T *argv) {
   foo();
 #pragma omp target map(always,alloc: i)
   foo();
+#pragma omp target nowait
+  foo();
+#pragma omp target depend(in : argc, argv[i:argc], a[:])
+  foo();
+#pragma omp target defaultmap(tofrom: scalar)
+  foo();
   return 0;
 }
 
@@ -44,6 +50,12 @@ T tmain(T argc, T *argv) {
 // CHECK-NEXT: foo()
 // CHECK-NEXT: #pragma omp target map(always,alloc: i)
 // CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target nowait
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target depend(in : argc,argv[i:argc],a[:])
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target defaultmap(tofrom: scalar)
+// CHECK-NEXT: foo()
 // CHECK: template <typename T = char, int C = 1> char tmain(char argc, char *argv) {
 // CHECK-NEXT: char i, j, a[20]
 // CHECK-NEXT: #pragma omp target
@@ -60,6 +72,12 @@ T tmain(T argc, T *argv) {
 // CHECK-NEXT: foo()
 // CHECK-NEXT: #pragma omp target map(always,alloc: i)
 // CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target nowait
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target depend(in : argc,argv[i:argc],a[:])
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target defaultmap(tofrom: scalar)
+// CHECK-NEXT: foo()
 // CHECK: template <typename T, int C> T tmain(T argc, T *argv) {
 // CHECK-NEXT: T i, j, a[20]
 // CHECK-NEXT: #pragma omp target
@@ -76,6 +94,12 @@ T tmain(T argc, T *argv) {
 // CHECK-NEXT: foo()
 // CHECK-NEXT: #pragma omp target map(always,alloc: i)
 // CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target nowait
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target depend(in : argc,argv[i:argc],a[:])
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target defaultmap(tofrom: scalar)
+// CHECK-NEXT: foo()
 
 // CHECK-LABEL: int main(int argc, char **argv) {
 int main (int argc, char **argv) {
@@ -115,6 +139,21 @@ int main (int argc, char **argv) {
   foo();
 // CHECK-NEXT: foo();
 
+#pragma omp target nowait
+// CHECK-NEXT: #pragma omp target nowait
+  foo();
+// CHECK-NEXT: foo();
+
+#pragma omp target depend(in : argc, argv[i:argc], a[:])
+// CHECK-NEXT: #pragma omp target depend(in : argc,argv[i:argc],a[:])
+  foo();
+// CHECK-NEXT: foo();
+
+#pragma omp target defaultmap(tofrom: scalar)
+// CHECK-NEXT: #pragma omp target defaultmap(tofrom: scalar)
+  foo();
+// CHECK-NEXT: foo();
+
   return tmain<int, 5>(argc, &argc) + tmain<char, 1>(argv[0][0], argv[0]);
 }
 
diff --git a/test/OpenMP/target_codegen.cpp b/test/OpenMP/target_codegen.cpp
index c2e08d67b843f..f263ebdd2fe34 100644
--- a/test/OpenMP/target_codegen.cpp
+++ b/test/OpenMP/target_codegen.cpp
@@ -1,20 +1,20 @@
 // Test host codegen.
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
-// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
-// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
 
 // Test target codegen - host bc file has to be created first.
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -omp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64
-// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -omp-host-ir-file-path %t-ppc-host.bc -o %t %s
-// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -omp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -omp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32
-// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -omp-host-ir-file-path %t-x86-host.bc -o %t %s
-// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -omp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32
 
 // expected-no-diagnostics
 #ifndef HEADER
@@ -33,15 +33,15 @@
 // sizes.
 
 // CHECK-DAG: [[SIZET2:@.+]] = private unnamed_addr constant [1 x i{{32|64}}] [i[[SZ:32|64]] 2]
-// CHECK-DAG: [[MAPT2:@.+]] = private unnamed_addr constant [1 x i32] [i32 128]
+// CHECK-DAG: [[MAPT2:@.+]] = private unnamed_addr constant [1 x i32] [i32 288]
 // CHECK-DAG: [[SIZET3:@.+]] = private unnamed_addr constant [2 x i[[SZ]]] [i[[SZ]] 4, i[[SZ]] 2]
-// CHECK-DAG: [[MAPT3:@.+]] = private unnamed_addr constant [2 x i32] [i32 128, i32 128]
-// CHECK-DAG: [[MAPT4:@.+]] = private unnamed_addr constant [9 x i32] [i32 128, i32 3, i32 128, i32 3, i32 3, i32 128, i32 128, i32 3, i32 3]
+// CHECK-DAG: [[MAPT3:@.+]] = private unnamed_addr constant [2 x i32] [i32 288, i32 288]
+// CHECK-DAG: [[MAPT4:@.+]] = private unnamed_addr constant [9 x i32] [i32 288, i32 35, i32 288, i32 35, i32 35, i32 288, i32 288, i32 35, i32 35]
 // CHECK-DAG: [[SIZET5:@.+]] = private unnamed_addr constant [3 x i[[SZ]]] [i[[SZ]] 4, i[[SZ]] 2, i[[SZ]] 40]
-// CHECK-DAG: [[MAPT5:@.+]] = private unnamed_addr constant [3 x i32] [i32 128, i32 128, i32 3]
+// CHECK-DAG: [[MAPT5:@.+]] = private unnamed_addr constant [3 x i32] [i32 288, i32 288, i32 35]
 // CHECK-DAG: [[SIZET6:@.+]] = private unnamed_addr constant [4 x i[[SZ]]] [i[[SZ]] 4, i[[SZ]] 2, i[[SZ]] 1, i[[SZ]] 40]
-// CHECK-DAG: [[MAPT6:@.+]] = private unnamed_addr constant [4 x i32] [i32 128, i32 128, i32 128, i32 3]
-// CHECK-DAG: [[MAPT7:@.+]] = private unnamed_addr constant [5 x i32] [i32 3, i32 128, i32 128, i32 128, i32 3]
+// CHECK-DAG: [[MAPT6:@.+]] = private unnamed_addr constant [4 x i32] [i32 288, i32 288, i32 288, i32 35]
+// CHECK-DAG: [[MAPT7:@.+]] = private unnamed_addr constant [5 x i32] [i32 35, i32 288, i32 288, i32 288, i32 35]
 // CHECK-DAG: @{{.*}} = private constant i8 0
 // CHECK-DAG: @{{.*}} = private constant i8 0
 // CHECK-DAG: @{{.*}} = private constant i8 0
diff --git a/test/OpenMP/target_codegen_global_capture.cpp b/test/OpenMP/target_codegen_global_capture.cpp
index 211a3cc884b67..b08bf10f9f29d 100644
--- a/test/OpenMP/target_codegen_global_capture.cpp
+++ b/test/OpenMP/target_codegen_global_capture.cpp
@@ -1,9 +1,9 @@
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
-// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
-// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
 // expected-no-diagnostics
 #ifndef HEADER
 #define HEADER
@@ -21,6 +21,11 @@
 // CHECK-DAG: [[BB:@.+]] = internal global float 1.000000e+01
 // CHECK-DAG: [[BC:@.+]] = internal global float 1.100000e+01
 // CHECK-DAG: [[BD:@.+]] = internal global float 1.200000e+01
+// CHECK-DAG: [[TBA:@.+]] = {{.*}}global float 1.700000e+01
+// CHECK-DAG: [[TBB:@.+]] = {{.*}}global float 1.800000e+01
+// CHECK-DAG: [[TBC:@.+]] = {{.*}}global float 1.900000e+01
+// CHECK-DAG: [[TBD:@.+]] = {{.*}}global float 2.000000e+01
+
 double Ga = 1.0;
 double Gb = 2.0;
 double Gc = 3.0;
@@ -42,14 +47,14 @@ int foo(short a, short b, short c, short d){
   static float Sd = 8.0;
 
   // CHECK-DAG:    [[VALLB:%.+]] = load i16, i16* [[LB]],
-  // CHECK-64-DAG: [[VALGB:%.+]] = load double, double* @Gb,
-  // CHECK-DAG:    [[VALFB:%.+]] = load float, float* @_ZZ3foossssE2Sb,
-  // CHECK-64-DAG: [[VALGC:%.+]] = load double, double* @Gc,
+  // CHECK-64-DAG: [[VALGB:%.+]] = load double, double* [[GB]],
+  // CHECK-DAG:    [[VALFB:%.+]] = load float, float* [[FB]],
+  // CHECK-64-DAG: [[VALGC:%.+]] = load double, double* [[GC]],
   // CHECK-DAG:    [[VALLC:%.+]] = load i16, i16* [[LC]],
-  // CHECK-DAG:    [[VALFC:%.+]] = load float, float* @_ZZ3foossssE2Sc,
+  // CHECK-DAG:    [[VALFC:%.+]] = load float, float* [[FC]],
   // CHECK-DAG:    [[VALLD:%.+]] = load i16, i16* [[LD]],
-  // CHECK-64-DAG: [[VALGD:%.+]] = load double, double* @Gd,
-  // CHECK-DAG:    [[VALFD:%.+]] = load float, float* @_ZZ3foossssE2Sd,
+  // CHECK-64-DAG: [[VALGD:%.+]] = load double, double* [[GD]],
+  // CHECK-DAG:    [[VALFD:%.+]] = load float, float* [[FD]],
 
   // 3 local vars being captured.
 
@@ -178,14 +183,156 @@ int bar(short a, short b, short c, short d){
   #pragma omp parallel
   {
     // CHECK-DAG:    [[VALLB:%.+]] = load i16, i16* [[LLB]],
-    // CHECK-64-DAG: [[VALGB:%.+]] = load double, double* @Gb,
-    // CHECK-DAG:    [[VALFB:%.+]] = load float, float* @_ZZ3barssssE2Sb,
-    // CHECK-64-DAG: [[VALGC:%.+]] = load double, double* @Gc,
+    // CHECK-64-DAG: [[VALGB:%.+]] = load double, double* [[GB]],
+    // CHECK-DAG:    [[VALFB:%.+]] = load float, float* [[BB]],
+    // CHECK-64-DAG: [[VALGC:%.+]] = load double, double* [[GC]],
+    // CHECK-DAG:    [[VALLC:%.+]] = load i16, i16* [[LLC]],
+    // CHECK-DAG:    [[VALFC:%.+]] = load float, float* [[BC]],
+    // CHECK-DAG:    [[VALLD:%.+]] = load i16, i16* [[LLD]],
+    // CHECK-64-DAG: [[VALGD:%.+]] = load double, double* [[GD]],
+    // CHECK-DAG:    [[VALFD:%.+]] = load float, float* [[BD]],
+
+    // 3 local vars being captured.
+
+    // CHECK-DAG: store i16 [[VALLB]], i16* [[CONVLB:%.+]],
+    // CHECK-DAG: [[CONVLB]] = bitcast i[[sz:64|32]]* [[CADDRLB:%.+]] to i16*
+    // CHECK-DAG: [[CVALLB:%.+]] = load i[[sz]], i[[sz]]* [[CADDRLB]],
+    // CHECK-DAG: [[CPTRLB:%.+]] = inttoptr i[[sz]] [[CVALLB]] to i8*
+    // CHECK-DAG: store i8* [[CPTRLB]], i8** [[GEPLB:%.+]],
+    // CHECK-DAG: [[GEPLB]] = getelementptr inbounds [9 x i8*], [9 x i8*]* %{{.+}}, i32 0, i32 {{[0-8]}}
+
+    // CHECK-DAG: store i16 [[VALLC]], i16* [[CONVLC:%.+]],
+    // CHECK-DAG: [[CONVLC]] = bitcast i[[sz]]* [[CADDRLC:%.+]] to i16*
+    // CHECK-DAG: [[CVALLC:%.+]] = load i[[sz]], i[[sz]]* [[CADDRLC]],
+    // CHECK-DAG: [[CPTRLC:%.+]] = inttoptr i[[sz]] [[CVALLC]] to i8*
+    // CHECK-DAG: store i8* [[CPTRLC]], i8** [[GEPLC:%.+]],
+    // CHECK-DAG: [[GEPLC]] = getelementptr inbounds [9 x i8*], [9 x i8*]* %{{.+}}, i32 0, i32 {{[0-8]}}
+
+    // CHECK-DAG: store i16 [[VALLD]], i16* [[CONVLD:%.+]],
+    // CHECK-DAG: [[CONVLD]] = bitcast i[[sz]]* [[CADDRLD:%.+]] to i16*
+    // CHECK-DAG: [[CVALLD:%.+]] = load i[[sz]], i[[sz]]* [[CADDRLD]],
+    // CHECK-DAG: [[CPTRLD:%.+]] = inttoptr i[[sz]] [[CVALLD]] to i8*
+    // CHECK-DAG: store i8* [[CPTRLD]], i8** [[GEPLD:%.+]],
+    // CHECK-DAG: [[GEPLD]] = getelementptr inbounds [9 x i8*], [9 x i8*]* %{{.+}}, i32 0, i32 {{[0-8]}}
+
+    // 3 static vars being captured.
+
+    // CHECK-DAG: store float [[VALFB]], float* [[CONVFB:%.+]],
+    // CHECK-DAG: [[CONVFB]] = bitcast i[[sz]]* [[CADDRFB:%.+]] to float*
+    // CHECK-DAG: [[CVALFB:%.+]] = load i[[sz]], i[[sz]]* [[CADDRFB]],
+    // CHECK-DAG: [[CPTRFB:%.+]] = inttoptr i[[sz]] [[CVALFB]] to i8*
+    // CHECK-DAG: store i8* [[CPTRFB]], i8** [[GEPFB:%.+]],
+    // CHECK-DAG: [[GEPFB]] = getelementptr inbounds [9 x i8*], [9 x i8*]* %{{.+}}, i32 0, i32 {{[0-8]}}
+
+    // CHECK-DAG: store float [[VALFC]], float* [[CONVFC:%.+]],
+    // CHECK-DAG: [[CONVFC]] = bitcast i[[sz]]* [[CADDRFC:%.+]] to float*
+    // CHECK-DAG: [[CVALFC:%.+]] = load i[[sz]], i[[sz]]* [[CADDRFC]],
+    // CHECK-DAG: [[CPTRFC:%.+]] = inttoptr i[[sz]] [[CVALFC]] to i8*
+    // CHECK-DAG: store i8* [[CPTRFC]], i8** [[GEPFC:%.+]],
+    // CHECK-DAG: [[GEPFC]] = getelementptr inbounds [9 x i8*], [9 x i8*]* %{{.+}}, i32 0, i32 {{[0-8]}}
+
+    // CHECK-DAG: store float [[VALFD]], float* [[CONVFD:%.+]],
+    // CHECK-DAG: [[CONVFD]] = bitcast i[[sz]]* [[CADDRFD:%.+]] to float*
+    // CHECK-DAG: [[CVALFD:%.+]] = load i[[sz]], i[[sz]]* [[CADDRFD]],
+    // CHECK-DAG: [[CPTRFD:%.+]] = inttoptr i[[sz]] [[CVALFD]] to i8*
+    // CHECK-DAG: store i8* [[CPTRFD]], i8** [[GEPFD:%.+]],
+    // CHECK-DAG: [[GEPFD]] = getelementptr inbounds [9 x i8*], [9 x i8*]* %{{.+}}, i32 0, i32 {{[0-8]}}
+
+    // 3 static global vars being captured.
+
+    // CHECK-64-DAG: store double [[VALGB]], double* [[CONVGB:%.+]],
+    // CHECK-64-DAG: [[CONVGB]] = bitcast i[[sz]]* [[CADDRGB:%.+]] to double*
+    // CHECK-64-DAG: [[CVALGB:%.+]] = load i[[sz]], i[[sz]]* [[CADDRGB]],
+    // CHECK-64-DAG: [[CPTRGB:%.+]] = inttoptr i[[sz]] [[CVALGB]] to i8*
+    // CHECK-64-DAG: store i8* [[CPTRGB]], i8** [[GEPGB:%.+]],
+    // CHECK-32-DAG: store i8* bitcast (double* @Gb to i8*), i8** [[GEPGB:%.+]],
+    // CHECK-DAG: [[GEPGB]] = getelementptr inbounds [9 x i8*], [9 x i8*]* %{{.+}}, i32 0, i32 {{[0-8]}}
+
+    // CHECK-64-DAG: store double [[VALGC]], double* [[CONVGC:%.+]],
+    // CHECK-64-DAG: [[CONVGC]] = bitcast i[[sz]]* [[CADDRGC:%.+]] to double*
+    // CHECK-64-DAG: [[CVALGC:%.+]] = load i[[sz]], i[[sz]]* [[CADDRGC]],
+    // CHECK-64-DAG: [[CPTRGC:%.+]] = inttoptr i[[sz]] [[CVALGC]] to i8*
+    // CHECK-64-DAG: store i8* [[CPTRGC]], i8** [[GEPGC:%.+]],
+    // CHECK-32-DAG: store i8* bitcast (double* @Gc to i8*), i8** [[GEPGC:%.+]],
+    // CHECK-DAG: [[GEPGC]] = getelementptr inbounds [9 x i8*], [9 x i8*]* %{{.+}}, i32 0, i32 {{[0-8]}}
+
+    // CHECK-64-DAG: store double [[VALGD]], double* [[CONVGD:%.+]],
+    // CHECK-64-DAG: [[CONVGD]] = bitcast i[[sz]]* [[CADDRGD:%.+]] to double*
+    // CHECK-64-DAG: [[CVALGD:%.+]] = load i[[sz]], i[[sz]]* [[CADDRGD]],
+    // CHECK-64-DAG: [[CPTRGD:%.+]] = inttoptr i[[sz]] [[CVALGD]] to i8*
+    // CHECK-64-DAG: store i8* [[CPTRGD]], i8** [[GEPGD:%.+]],
+    // CHECK-32-DAG: store i8* bitcast (double* @Gd to i8*), i8** [[GEPGD:%.+]],
+    // CHECK-DAG: [[GEPGD]] = getelementptr inbounds [9 x i8*], [9 x i8*]* %{{.+}}, i32 0, i32 {{[0-8]}}
+
+    // CHECK: call i32 @__tgt_target
+    // CHECK: call void [[OFFLOADF:@.+]](
+    // Capture b, Gb, Sb, Gc, c, Sc, d, Gd, Sd
+    #pragma omp target if(Ga>0.0 && a>0 && Sa>0.0)
+    {
+      b += 1;
+      Gb += 1.0;
+      Sb += 1.0;
+
+      // CHECK: define internal void [[OFFLOADF]]({{.+}} {{.*}}%{{.+}}, {{.+}} {{.*}}%{{.+}}, {{.+}} {{.*}}%{{.+}}, {{.+}} {{.*}}%{{.+}}, {{.+}} {{.*}}%{{.+}}, {{.+}} {{.*}}%{{.+}}, {{.+}} {{.*}}%{{.+}}, {{.+}} {{.*}}%{{.+}}, {{.+}} {{.*}}%{{.+}})
+      // CHECK: call void {{.*}}@__kmpc_fork_call(%ident_t* {{.+}}, i32 {{.+}}, void (i32*, i32*, ...)* bitcast ({{.*}}[[PARF:@.+]] to {{.*}})
+
+      // CHECK: define internal void [[PARF]](i32* noalias %{{.*}}, i32* noalias %{{.*}}, {{.+}}* dereferenceable({{.+}}) %{{.+}}, {{.+}}* dereferenceable({{.+}}) %{{.+}}, {{.+}}* dereferenceable({{.+}}) %{{.+}})
+      // Capture d, Gd, Sd
+      #pragma omp parallel if(Gc>0.0 && c>0 && Sc>0.0)
+      {
+        d += 1;
+        Gd += 1.0;
+        Sd += 1.0;
+      }
+    }
+  }
+  return a + b + c + d + (int)Sa + (int)Sb + (int)Sc + (int)Sd;
+}
+
+///
+/// Tests with template functions.
+///
+
+// CHECK: define {{.*}} @{{.*}}tbar2{{.*}}(
+
+// CHECK: define {{.*}} @{{.*}}tbar{{.*}}(
+// CHECK-SAME: i16 {{[^,]*}}[[A:%[^,]+]],
+// CHECK-SAME: i16 {{[^,]*}}[[B:%[^,]+]],
+// CHECK-SAME: i16 {{[^,]*}}[[C:%[^,]+]],
+// CHECK-SAME: i16 {{[^,]*}}[[D:%[^,]+]])
+// CHECK: [[LA:%.+]] = alloca i16
+// CHECK: [[LB:%.+]] = alloca i16
+// CHECK: [[LC:%.+]] = alloca i16
+// CHECK: [[LD:%.+]] = alloca i16
+template<typename T>
+int tbar(T a, T b, T c, T d){
+  static float Sa = 17.0;
+  static float Sb = 18.0;
+  static float Sc = 19.0;
+  static float Sd = 20.0;
+
+  // CHECK: call void {{.*}}@__kmpc_fork_call(%ident_t* {{.+}}, i32 {{.+}}, void (i32*, i32*, ...)* bitcast ({{.*}}[[PARF:@.+]] to {{.*}}), i16* %{{.+}}, i16* %{{.+}}, i16* %{{.+}}, i16* %{{.+}})
+  // CHECK: define internal void [[PARF]](i32* noalias %{{.*}}, i32* noalias %{{.*}}, i16* dereferenceable(2) [[A:%.+]], i16* dereferenceable(2) [[B:%.+]], i16* dereferenceable(2) [[C:%.+]], i16* dereferenceable(2) [[D:%.+]])
+  // Capture a, b, c, d
+  // CHECK: [[ALLOCLA:%.+]] = alloca i16
+  // CHECK: [[ALLOCLB:%.+]] = alloca i16
+  // CHECK: [[ALLOCLC:%.+]] = alloca i16
+  // CHECK: [[ALLOCLD:%.+]] = alloca i16
+  // CHECK: [[LLA:%.+]] = load i16*, i16** [[ALLOCLA]],
+  // CHECK: [[LLB:%.+]] = load i16*, i16** [[ALLOCLB]],
+  // CHECK: [[LLC:%.+]] = load i16*, i16** [[ALLOCLC]],
+  // CHECK: [[LLD:%.+]] = load i16*, i16** [[ALLOCLD]],
+  #pragma omp parallel
+  {
+    // CHECK-DAG:    [[VALLB:%.+]] = load i16, i16* [[LLB]],
+    // CHECK-64-DAG: [[VALGB:%.+]] = load double, double* [[GB]],
+    // CHECK-DAG:    [[VALFB:%.+]] = load float, float* [[TBB]],
+    // CHECK-64-DAG: [[VALGC:%.+]] = load double, double* [[GC]],
     // CHECK-DAG:    [[VALLC:%.+]] = load i16, i16* [[LLC]],
-    // CHECK-DAG:    [[VALFC:%.+]] = load float, float* @_ZZ3barssssE2Sc,
+    // CHECK-DAG:    [[VALFC:%.+]] = load float, float* [[TBC]],
     // CHECK-DAG:    [[VALLD:%.+]] = load i16, i16* [[LLD]],
-    // CHECK-64-DAG: [[VALGD:%.+]] = load double, double* @Gd,
-    // CHECK-DAG:    [[VALFD:%.+]] = load float, float* @_ZZ3barssssE2Sd,
+    // CHECK-64-DAG: [[VALGD:%.+]] = load double, double* [[GD]],
+    // CHECK-DAG:    [[VALFD:%.+]] = load float, float* [[TBD]],
 
     // 3 local vars being captured.
 
@@ -284,4 +431,8 @@ int bar(short a, short b, short c, short d){
   return a + b + c + d + (int)Sa + (int)Sb + (int)Sc + (int)Sd;
 }
 
+int tbar2(short a, short b, short c, short d){
+  return tbar(a, b, c, d);
+}
+
 #endif
diff --git a/test/OpenMP/target_codegen_registration.cpp b/test/OpenMP/target_codegen_registration.cpp
index 7d515bb64d907..a440faff9158c 100644
--- a/test/OpenMP/target_codegen_registration.cpp
+++ b/test/OpenMP/target_codegen_registration.cpp
@@ -1,20 +1,20 @@
 // Test host codegen.
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s
-// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s
-// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
 
 // Test target codegen - host bc file has to be created first.
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -omp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s -check-prefix=TCHECK
-// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -omp-host-ir-file-path %t-ppc-host.bc -o %t %s
-// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -omp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefix=TCHECK
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -omp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s -check-prefix=TCHECK
-// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -omp-host-ir-file-path %t-x86-host.bc -o %t %s
-// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -omp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefix=TCHECK
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s -check-prefix=TCHECK
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefix=TCHECK
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s -check-prefix=TCHECK
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefix=TCHECK
 
 // Check that no target code is emmitted if no omptests flag was provided.
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s -check-prefix=CHECK-NTARGET
@@ -61,45 +61,45 @@
 // CHECK-DAG: {{@.+}} = private constant i8 0
 // TCHECK-NOT: {{@.+}} = private constant i8 0
 // CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4]
-// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 128]
+// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 288]
 // CHECK-DAG: {{@.+}} = private constant i8 0
 // CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4]
-// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 128]
+// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 288]
 // CHECK-DAG: {{@.+}} = private constant i8 0
 // CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4]
-// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 128]
+// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 288]
 // CHECK-DAG: {{@.+}} = private constant i8 0
 // CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4]
-// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 128]
+// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 288]
 // CHECK-DAG: {{@.+}} = private constant i8 0
 // CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4]
-// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 128]
+// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 288]
 // CHECK-DAG: {{@.+}} = private constant i8 0
 // CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4]
-// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 128]
+// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 288]
 // CHECK-DAG: {{@.+}} = private constant i8 0
 // CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4]
-// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 128]
+// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 288]
 // CHECK-DAG: {{@.+}} = private constant i8 0
 // CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4]
-// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 128]
+// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 288]
 // CHECK-DAG: {{@.+}} = private constant i8 0
 // CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4]
-// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 128]
+// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 288]
 // CHECK-DAG: {{@.+}} = private constant i8 0
 // CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4]
-// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 128]
+// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 288]
 // CHECK-DAG: {{@.+}} = private constant i8 0
 // CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4]
-// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 128]
+// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 288]
 // CHECK-DAG: {{@.+}} = private constant i8 0
 // CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i[[SZ]]] [i[[SZ]] 4]
-// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 128]
+// CHECK-DAG: {{@.+}} = private unnamed_addr constant [1 x i32] [i32 288]
 
 // CHECK-NTARGET-NOT: private constant i8 0
 // CHECK-NTARGET-NOT: private unnamed_addr constant [1 x i
 
-// CHECK-DAG: [[NAMEPTR1:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME1:\.omp_offloading\.[0-9a-f]+\.[0-9a-f]+\._Z.+\.l[0-9]+\.c[0-9]+]]\00"
+// CHECK-DAG: [[NAMEPTR1:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME1:__omp_offloading_[0-9a-f]+_[0-9a-f]+__Z.+_l[0-9]+]]\00"
 // CHECK-DAG: [[ENTRY1:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR1]], i32 0, i32 0), i[[SZ]] 0 }, section ".omp_offloading.entries", align 1
 // CHECK-DAG: [[NAMEPTR2:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME2:.+]]\00"
 // CHECK-DAG: [[ENTRY2:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR2]], i32 0, i32 0), i[[SZ]] 0 }, section ".omp_offloading.entries", align 1
@@ -124,7 +124,7 @@
 // CHECK-DAG: [[NAMEPTR12:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME12:.+]]\00"
 // CHECK-DAG: [[ENTRY12:@.+]] = constant [[ENTTY]] { i8* @{{.*}}, i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR12]], i32 0, i32 0), i[[SZ]] 0 }, section ".omp_offloading.entries", align 1
 
-// TCHECK-DAG: [[NAMEPTR1:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME1:\.omp_offloading\.[0-9a-f]+\.[0-9a-f]+\._Z.+\.l[0-9]+\.c[0-9]+]]\00"
+// TCHECK-DAG: [[NAMEPTR1:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME1:__omp_offloading_[0-9a-f]+_[0-9a-f]+__Z.+_l[0-9]+]]\00"
 // TCHECK-DAG: [[ENTRY1:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR1]], i32 0, i32 0), i[[SZ]] 0 }, section ".omp_offloading.entries", align 1
 // TCHECK-DAG: [[NAMEPTR2:@.+]] = internal unnamed_addr constant [{{.*}} x i8] c"[[NAME2:.+]]\00"
 // TCHECK-DAG: [[ENTRY2:@.+]] = constant [[ENTTY]] { i8* bitcast (void (i[[SZ]])* @{{.*}} to i8*), i8* getelementptr inbounds ([{{.*}} x i8], [{{.*}} x i8]* [[NAMEPTR2]], i32 0, i32 0), i[[SZ]] 0 }, section ".omp_offloading.entries", align 1
@@ -407,31 +407,31 @@ int bar(int a){
 
 // Check metadata is properly generated:
 // CHECK:     !omp_offload.info = !{!{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}}
-// CHECK-DAG: = !{i32 0, i32 [[DEVID:-?[0-9]+]], i32 [[FILEID:-?[0-9]+]], !"_ZN2SB3fooEv", i32 193, i32 13, i32 {{[0-9]+}}}
-// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SDD1Ev", i32 243, i32 13, i32 {{[0-9]+}}}
-// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SEC1Ev", i32 259, i32 13, i32 {{[0-9]+}}}
-// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SED1Ev", i32 265, i32 13, i32 {{[0-9]+}}}
-// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EE3fooEv", i32 276, i32 13, i32 {{[0-9]+}}}
-// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EEC1Ev", i32 282, i32 13, i32 {{[0-9]+}}}
-// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_Z3bari", i32 402, i32 11, i32 {{[0-9]+}}}
-// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EED1Ev", i32 288, i32 13, i32 {{[0-9]+}}}
-// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EEC1Ev", i32 282, i32 13, i32 {{[0-9]+}}}
-// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EED1Ev", i32 288, i32 13, i32 {{[0-9]+}}}
-// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EE3fooEv", i32 276, i32 13, i32 {{[0-9]+}}}
-// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SCC1Ev", i32 218, i32 13, i32 {{[0-9]+}}}
+// CHECK-DAG: = !{i32 0, i32 [[DEVID:-?[0-9]+]], i32 [[FILEID:-?[0-9]+]], !"_ZN2SB3fooEv", i32 193, i32 {{[0-9]+}}}
+// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SDD1Ev", i32 243, i32 {{[0-9]+}}}
+// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SEC1Ev", i32 259, i32 {{[0-9]+}}}
+// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SED1Ev", i32 265, i32 {{[0-9]+}}}
+// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EE3fooEv", i32 276, i32 {{[0-9]+}}}
+// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EEC1Ev", i32 282, i32 {{[0-9]+}}}
+// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_Z3bari", i32 402, i32 {{[0-9]+}}}
+// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EED1Ev", i32 288, i32 {{[0-9]+}}}
+// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EEC1Ev", i32 282, i32 {{[0-9]+}}}
+// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EED1Ev", i32 288, i32 {{[0-9]+}}}
+// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EE3fooEv", i32 276, i32 {{[0-9]+}}}
+// CHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SCC1Ev", i32 218, i32 {{[0-9]+}}}
 
 // TCHECK:     !omp_offload.info = !{!{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}, !{{[0-9]+}}}
-// TCHECK-DAG: = !{i32 0, i32 [[DEVID:-?[0-9]+]], i32 [[FILEID:-?[0-9]+]], !"_ZN2SB3fooEv", i32 193, i32 13, i32 {{[0-9]+}}}
-// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SDD1Ev", i32 243, i32 13, i32 {{[0-9]+}}}
-// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SEC1Ev", i32 259, i32 13, i32 {{[0-9]+}}}
-// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SED1Ev", i32 265, i32 13, i32 {{[0-9]+}}}
-// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EE3fooEv", i32 276, i32 13, i32 {{[0-9]+}}}
-// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EEC1Ev", i32 282, i32 13, i32 {{[0-9]+}}}
-// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_Z3bari", i32 402, i32 11, i32 {{[0-9]+}}}
-// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EED1Ev", i32 288, i32 13, i32 {{[0-9]+}}}
-// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EEC1Ev", i32 282, i32 13, i32 {{[0-9]+}}}
-// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EED1Ev", i32 288, i32 13, i32 {{[0-9]+}}}
-// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EE3fooEv", i32 276, i32 13, i32 {{[0-9]+}}}
-// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SCC1Ev", i32 218, i32 13, i32 {{[0-9]+}}}
+// TCHECK-DAG: = !{i32 0, i32 [[DEVID:-?[0-9]+]], i32 [[FILEID:-?[0-9]+]], !"_ZN2SB3fooEv", i32 193, i32 {{[0-9]+}}}
+// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SDD1Ev", i32 243, i32 {{[0-9]+}}}
+// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SEC1Ev", i32 259, i32 {{[0-9]+}}}
+// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SED1Ev", i32 265, i32 {{[0-9]+}}}
+// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EE3fooEv", i32 276, i32 {{[0-9]+}}}
+// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EEC1Ev", i32 282, i32 {{[0-9]+}}}
+// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_Z3bari", i32 402, i32 {{[0-9]+}}}
+// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EED1Ev", i32 288, i32 {{[0-9]+}}}
+// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EEC1Ev", i32 282, i32 {{[0-9]+}}}
+// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi1000EED1Ev", i32 288, i32 {{[0-9]+}}}
+// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2STILi100EE3fooEv", i32 276, i32 {{[0-9]+}}}
+// TCHECK-DAG: = !{i32 0, i32 [[DEVID]], i32 [[FILEID]], !"_ZN2SCC1Ev", i32 218, i32 {{[0-9]+}}}
 
 #endif
diff --git a/test/OpenMP/target_codegen_registration_naming.cpp b/test/OpenMP/target_codegen_registration_naming.cpp
index ab7a469aba4d0..ce133eee03e80 100644
--- a/test/OpenMP/target_codegen_registration_naming.cpp
+++ b/test/OpenMP/target_codegen_registration_naming.cpp
@@ -1,20 +1,20 @@
 // Test host codegen.
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s
-// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s
-// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
 
 // Test target codegen - host bc file has to be created first.
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -omp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s -check-prefix=TCHECK
-// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -std=c++11 -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -omp-host-ir-file-path %t-ppc-host.bc -o %t %s
-// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -omptargets=powerpc64le-ibm-linux-gnu -fopenmp-is-device -omp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefix=TCHECK
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -omp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s -check-prefix=TCHECK
-// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -omp-host-ir-file-path %t-x86-host.bc -o %t %s
-// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -omptargets=i386-pc-linux-gnu -fopenmp-is-device -omp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefix=TCHECK
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s -check-prefix=TCHECK
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefix=TCHECK
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s -check-prefix=TCHECK
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -check-prefix=TCHECK
 
 // expected-no-diagnostics
 #ifndef HEADER
@@ -24,7 +24,7 @@
 
 // CHECK: define {{.*}}i32 @[[NNAME:.+]](i32 {{.*}}%{{.+}})
 int nested(int a){
-  // CHECK: call void @.omp_offloading.[[FILEID:[0-9a-f]+\.[0-9a-f]+]].[[NNAME]].l[[T1L:[0-9]+]].c[[T1C:[0-9]+]](
+  // CHECK: call void @__omp_offloading_[[FILEID:[0-9a-f]+_[0-9a-f]+]]_[[NNAME]]_l[[T1L:[0-9]+]](
   #pragma omp target
     ++a;
 
@@ -42,25 +42,25 @@ int nested(int a){
   return a;
 }
 
-// CHECK: define {{.*}}void @.omp_offloading.[[FILEID]].[[NNAME]].l[[T1L]].c[[T1C]](
-// TCHECK: define {{.*}}void @.omp_offloading.[[FILEID:[0-9a-f]+\.[0-9a-f]+]].[[NNAME:.+]].l[[T1L:[0-9]+]].c[[T1C:[0-9]+]](
+// CHECK: define {{.*}}void @__omp_offloading_[[FILEID]]_[[NNAME]]_l[[T1L]](
+// TCHECK: define {{.*}}void @__omp_offloading_[[FILEID:[0-9a-f]+_[0-9a-f]+]]_[[NNAME:.+]]_l[[T1L:[0-9]+]](
 
 // CHECK: define {{.*}}void @"[[LNAME]]"(
 // CHECK: call void {{.*}}@__kmpc_fork_call{{.+}}[[PNAME:@.+]] to
 
 // CHECK: define {{.*}}void [[PNAME]](
-// CHECK: call void @.omp_offloading.[[FILEID]].[[NNAME]].l[[T2L:[0-9]+]].c[[T2C:[0-9]+]](
+// CHECK: call void @__omp_offloading_[[FILEID]]_[[NNAME]]_l[[T2L:[0-9]+]](
 
-// CHECK: define {{.*}}void @.omp_offloading.[[FILEID]].[[NNAME]].l[[T2L]].c[[T2C]](
-// TCHECK: define {{.*}}void @.omp_offloading.[[FILEID]].[[NNAME:.+]].l[[T2L:[0-9]+]].c[[T2C:[0-9]+]](
+// CHECK: define {{.*}}void @__omp_offloading_[[FILEID]]_[[NNAME]]_l[[T2L]](
+// TCHECK: define {{.*}}void @__omp_offloading_[[FILEID]]_[[NNAME:.+]]_l[[T2L:[0-9]+]](
 
 
 // Check metadata is properly generated:
 // CHECK:     !omp_offload.info = !{!{{[0-9]+}}, !{{[0-9]+}}}
-// CHECK-DAG: = !{i32 0, i32 {{-?[0-9]+}}, i32 {{-?[0-9]+}}, !"[[NNAME]]", i32 [[T1L]], i32 [[T1C]], i32 {{[0-9]+}}}
-// CHECK-DAG: = !{i32 0, i32 {{-?[0-9]+}}, i32 {{-?[0-9]+}}, !"[[NNAME]]", i32 [[T2L]], i32 [[T2C]], i32 {{[0-9]+}}}
+// CHECK-DAG: = !{i32 0, i32 {{-?[0-9]+}}, i32 {{-?[0-9]+}}, !"[[NNAME]]", i32 [[T1L]], i32 {{[0-9]+}}}
+// CHECK-DAG: = !{i32 0, i32 {{-?[0-9]+}}, i32 {{-?[0-9]+}}, !"[[NNAME]]", i32 [[T2L]], i32 {{[0-9]+}}}
 
 // TCHECK:     !omp_offload.info = !{!{{[0-9]+}}, !{{[0-9]+}}}
-// TCHECK-DAG: = !{i32 0, i32 {{-?[0-9]+}}, i32 {{-?[0-9]+}}, !"[[NNAME]]", i32 [[T1L]], i32 [[T1C]], i32 {{[0-9]+}}}
-// TCHECK-DAG: = !{i32 0, i32 {{-?[0-9]+}}, i32 {{-?[0-9]+}}, !"[[NNAME]]", i32 [[T2L]], i32 [[T2C]], i32 {{[0-9]+}}}
+// TCHECK-DAG: = !{i32 0, i32 {{-?[0-9]+}}, i32 {{-?[0-9]+}}, !"[[NNAME]]", i32 [[T1L]], i32 {{[0-9]+}}}
+// TCHECK-DAG: = !{i32 0, i32 {{-?[0-9]+}}, i32 {{-?[0-9]+}}, !"[[NNAME]]", i32 [[T2L]], i32 {{[0-9]+}}}
 #endif
diff --git a/test/OpenMP/target_data_ast_print.cpp b/test/OpenMP/target_data_ast_print.cpp
index cdff857e569ad..ed7a96541196f 100644
--- a/test/OpenMP/target_data_ast_print.cpp
+++ b/test/OpenMP/target_data_ast_print.cpp
@@ -12,13 +12,13 @@ template <typename T, int C>
 T tmain(T argc, T *argv) {
   T i, j, b, c, d, e, x[20];
 
-#pragma omp target data
+#pragma omp target data map(to: c)
   i = argc;
 
-#pragma omp target data if (target data: j > 0)
+#pragma omp target data map(to: c) if (target data: j > 0)
   foo();
 
-#pragma omp target data if (b)
+#pragma omp target data map(to: c) if (b)
   foo();
 
 #pragma omp target data map(c)
@@ -48,11 +48,11 @@ T tmain(T argc, T *argv) {
 
 // CHECK: template <typename T = int, int C = 5> int tmain(int argc, int *argv) {
 // CHECK-NEXT: int i, j, b, c, d, e, x[20];
-// CHECK-NEXT: #pragma omp target data
+// CHECK-NEXT: #pragma omp target data map(to: c)
 // CHECK-NEXT: i = argc;
-// CHECK-NEXT: #pragma omp target data if(target data: j > 0)
+// CHECK-NEXT: #pragma omp target data map(to: c) if(target data: j > 0)
 // CHECK-NEXT: foo();
-// CHECK-NEXT: #pragma omp target data if(b)
+// CHECK-NEXT: #pragma omp target data map(to: c) if(b)
 // CHECK-NEXT: foo();
 // CHECK-NEXT: #pragma omp target data map(tofrom: c)
 // CHECK-NEXT: foo();
@@ -70,11 +70,11 @@ T tmain(T argc, T *argv) {
 // CHECK-NEXT: foo();
 // CHECK: template <typename T = char, int C = 1> char tmain(char argc, char *argv) {
 // CHECK-NEXT: char i, j, b, c, d, e, x[20];
-// CHECK-NEXT: #pragma omp target data
+// CHECK-NEXT: #pragma omp target data map(to: c)
 // CHECK-NEXT: i = argc;
-// CHECK-NEXT: #pragma omp target data if(target data: j > 0)
+// CHECK-NEXT: #pragma omp target data map(to: c) if(target data: j > 0)
 // CHECK-NEXT: foo();
-// CHECK-NEXT: #pragma omp target data if(b)
+// CHECK-NEXT: #pragma omp target data map(to: c) if(b)
 // CHECK-NEXT: foo();
 // CHECK-NEXT: #pragma omp target data map(tofrom: c)
 // CHECK-NEXT: foo();
@@ -92,11 +92,11 @@ T tmain(T argc, T *argv) {
 // CHECK-NEXT: foo();
 // CHECK: template <typename T, int C> T tmain(T argc, T *argv) {
 // CHECK-NEXT: T i, j, b, c, d, e, x[20];
-// CHECK-NEXT: #pragma omp target data
+// CHECK-NEXT: #pragma omp target data map(to: c)
 // CHECK-NEXT: i = argc;
-// CHECK-NEXT: #pragma omp target data if(target data: j > 0)
+// CHECK-NEXT: #pragma omp target data map(to: c) if(target data: j > 0)
 // CHECK-NEXT: foo();
-// CHECK-NEXT: #pragma omp target data if(b)
+// CHECK-NEXT: #pragma omp target data map(to: c) if(b)
 // CHECK-NEXT: foo();
 // CHECK-NEXT: #pragma omp target data map(tofrom: c)
 // CHECK-NEXT: foo();
@@ -118,17 +118,17 @@ int main (int argc, char **argv) {
   static int a;
 // CHECK: static int a;
 
-#pragma omp target data
-// CHECK:      #pragma omp target data
+#pragma omp target data map(to: c)
+// CHECK:      #pragma omp target data map(to: c)
   a=2;
 // CHECK-NEXT: a = 2;
-#pragma omp target data if (target data: b)
-// CHECK: #pragma omp target data if(target data: b)
+#pragma omp target data map(to: c) if (target data: b)
+// CHECK: #pragma omp target data map(to: c) if(target data: b)
   foo();
 // CHECK-NEXT: foo();
 
-#pragma omp target data if (b > g)
-// CHECK: #pragma omp target data if(b > g)
+#pragma omp target data map(to: c) if (b > g)
+// CHECK: #pragma omp target data map(to: c) if(b > g)
   foo();
 // CHECK-NEXT: foo();
 
diff --git a/test/OpenMP/target_data_codegen.cpp b/test/OpenMP/target_data_codegen.cpp
new file mode 100644
index 0000000000000..a149ba9332b93
--- /dev/null
+++ b/test/OpenMP/target_data_codegen.cpp
@@ -0,0 +1,248 @@
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+///==========================================================================///
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-64
+// RUN: %clang_cc1 -DCK1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK1 --check-prefix CK1-64
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK1 --check-prefix CK1-32
+// RUN: %clang_cc1 -DCK1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK1 --check-prefix CK1-32
+#ifdef CK1
+
+// CK1: [[ST:%.+]] = type { i32, double* }
+template <typename T>
+struct ST {
+  T a;
+  double *b;
+};
+
+ST<int> gb;
+double gc[100];
+
+// CK1: [[SIZE00:@.+]] = {{.+}}constant [1 x i[[sz:64|32]]] [i{{64|32}} 800]
+// CK1: [[MTYPE00:@.+]] = {{.+}}constant [1 x i32] [i32 34]
+
+// CK1: [[SIZE02:@.+]] = {{.+}}constant [1 x i[[sz]]] [i[[sz]] 4]
+// CK1: [[MTYPE02:@.+]] = {{.+}}constant [1 x i32] [i32 33]
+
+// CK1: [[MTYPE03:@.+]] = {{.+}}constant [1 x i32] [i32 37]
+
+// CK1: [[SIZE04:@.+]] = {{.+}}constant [2 x i[[sz]]] [i[[sz]] {{8|4}}, i[[sz]] 24]
+// CK1: [[MTYPE04:@.+]] = {{.+}}constant [2 x i32] [i32 33, i32 17]
+
+// CK1-LABEL: _Z3fooi
+void foo(int arg) {
+  int la;
+  float lb[arg];
+
+  // Region 00
+  // CK1-DAG: call void @__tgt_target_data_begin(i32 [[DEV:%[^,]+]], i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE00]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE00]]{{.+}})
+  // CK1-DAG: [[DEV]] = load i32, i32* %{{[^,]+}},
+  // CK1-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK1-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK1-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: store i8* bitcast ([100 x double]* @gc to i8*), i8** [[BP0]]
+  // CK1-DAG: store i8* bitcast ([100 x double]* @gc to i8*), i8** [[P0]]
+
+  // CK1: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+
+  // CK1-DAG: call void @__tgt_target_data_end(i32 [[DEV:%[^,]+]], i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE00]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE00]]{{.+}})
+  // CK1-DAG: [[DEV]] = load i32, i32* %{{[^,]+}},
+  // CK1-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP]]
+  // CK1-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P]]
+  #pragma omp target data if(1+3-5) device(arg) map(from: gc)
+  {++arg;}
+
+  // Region 01
+  // CK1: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  #pragma omp target data map(la) if(1+3-4)
+  {++arg;}
+
+  // Region 02
+  // CK1: br i1 %{{[^,]+}}, label %[[IFTHEN:[^,]+]], label %[[IFELSE:[^,]+]]
+  // CK1: [[IFTHEN]]
+  // CK1-DAG: call void @__tgt_target_data_begin(i32 4, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE02]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE02]]{{.+}})
+  // CK1-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK1-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK1-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK1-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK1-DAG: [[CBPVAL0]] = bitcast i32* [[VAR0:%.+]] to i8*
+  // CK1-DAG: [[CPVAL0]] = bitcast i32* [[VAR0]] to i8*
+  // CK1: br label %[[IFEND:[^,]+]]
+
+  // CK1: [[IFELSE]]
+  // CK1: br label %[[IFEND]]
+  // CK1: [[IFEND]]
+  // CK1: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  // CK1: br i1 %{{[^,]+}}, label %[[IFTHEN:[^,]+]], label %[[IFELSE:[^,]+]]
+
+  // CK1: [[IFTHEN]]
+  // CK1-DAG: call void @__tgt_target_data_end(i32 4, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE02]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE02]]{{.+}})
+  // CK1-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP]]
+  // CK1-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P]]
+  // CK1: br label %[[IFEND:[^,]+]]
+  // CK1: [[IFELSE]]
+  // CK1: br label %[[IFEND]]
+  // CK1: [[IFEND]]
+  #pragma omp target data map(to: arg) if(arg) device(4)
+  {++arg;}
+
+  // Region 03
+  // CK1-DAG: call void @__tgt_target_data_begin(i32 -1, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], i[[sz]]* [[GEPS:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE03]]{{.+}})
+  // CK1-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK1-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+  // CK1-DAG: [[GEPS]] = getelementptr inbounds {{.+}}[[S:%[^,]+]]
+
+  // CK1-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: [[S0:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK1-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK1-DAG: store i[[sz]] [[CSVAL0:%[^,]+]], i[[sz]]* [[S0]]
+  // CK1-DAG: [[CBPVAL0]] = bitcast float* [[VAR0:%.+]] to i8*
+  // CK1-DAG: [[CPVAL0]] = bitcast float* [[VAR0]] to i8*
+  // CK1-DAG: [[CSVAL0]] = mul nuw i[[sz]] %{{[^,]+}}, 4
+  // CK1: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+
+  // CK1-DAG: call void @__tgt_target_data_end(i32 -1, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], i[[sz]]* [[GEPS:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE03]]{{.+}})
+  // CK1-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP]]
+  // CK1-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P]]
+  // CK1-DAG: [[GEPS]] = getelementptr inbounds {{.+}}[[S]]
+  #pragma omp target data map(always, to: lb)
+  {++arg;}
+
+  // CK1: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  {++arg;}
+
+  // Region 04
+  // CK1-DAG: call void @__tgt_target_data_begin(i32 -1, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE04]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE04]]{{.+}})
+  // CK1-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK1-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK1-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: store i8* bitcast ([[ST]]* @gb to i8*), i8** [[BP0]]
+  // CK1-DAG: store i8* bitcast (double** getelementptr inbounds ([[ST]], [[ST]]* @gb, i32 0, i32 1) to i8*), i8** [[P0]]
+
+
+  // CK1-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+  // CK1-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+  // CK1-DAG: store i8* bitcast (double** getelementptr inbounds ([[ST]], [[ST]]* @gb, i32 0, i32 1) to i8*), i8** [[BP1]]
+  // CK1-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+  // CK1-DAG: [[CPVAL1]] = bitcast double* [[SEC1:%.+]] to i8*
+  // CK1-DAG: [[SEC1]] = getelementptr inbounds {{.+}}double* [[SEC11:%[^,]+]], i{{.+}} 0
+  // CK1-DAG: [[SEC11]] = load double*, double** getelementptr inbounds ([[ST]], [[ST]]* @gb, i32 0, i32 1),
+
+  // CK1: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+
+  // CK1-DAG: call void @__tgt_target_data_end(i32 -1, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE04]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE04]]{{.+}})
+  // CK1-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP]]
+  // CK1-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P]]
+  #pragma omp target data map(to: gb.b[:3])
+  {++arg;}
+}
+#endif
+///==========================================================================///
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-64
+// RUN: %clang_cc1 -DCK2 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK2 --check-prefix CK2-64
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK2 --check-prefix CK2-32
+// RUN: %clang_cc1 -DCK2 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK2 --check-prefix CK2-32
+#ifdef CK2
+
+// CK2: [[ST:%.+]] = type { i32, double* }
+template <typename T>
+struct ST {
+  T a;
+  double *b;
+
+  T foo(T arg) {
+    // Region 00
+    #pragma omp target data map(always, to: b[1:3]) if(a>123) device(arg)
+    {arg++;}
+    return arg;
+  }
+};
+
+// CK2: [[SIZE00:@.+]] = {{.+}}constant [2 x i[[sz:64|32]]] [i{{64|32}} {{8|4}}, i{{64|32}} 24]
+// CK2: [[MTYPE00:@.+]] = {{.+}}constant [2 x i32] [i32 37, i32 21]
+
+// CK2-LABEL: _Z3bari
+int bar(int arg){
+  ST<int> A;
+  return A.foo(arg);
+}
+
+// Region 00
+// CK2: br i1 %{{[^,]+}}, label %[[IFTHEN:[^,]+]], label %[[IFELSE:[^,]+]]
+// CK2: [[IFTHEN]]
+// CK2-DAG: call void @__tgt_target_data_begin(i32 [[DEV:%[^,]+]], i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE00]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE00]]{{.+}})
+// CK2-DAG: [[DEV]] = load i32, i32* %{{[^,]+}},
+// CK2-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+// CK2-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+// CK2-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+// CK2-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+// CK2-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+// CK2-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+// CK2-DAG: [[CBPVAL0]] = bitcast [[ST]]* [[VAR0:%.+]] to i8*
+// CK2-DAG: [[CPVAL0]] = bitcast double** [[SEC0:%[^,]+]] to i8*
+// CK2-DAG: [[SEC0]] = getelementptr inbounds {{.*}}[[ST]]* [[VAR0]], i32 0, i32 1
+
+
+// CK2-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+// CK2-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+// CK2-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+// CK2-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+// CK2-DAG: [[CBPVAL1]] = bitcast double** [[SEC0]] to i8*
+// CK2-DAG: [[CPVAL1]] = bitcast double* [[SEC1:%[^,]+]] to i8*
+// CK2-DAG: [[SEC1]] = getelementptr inbounds {{.*}}double* [[SEC11:%[^,]+]], i{{.+}} 1
+// CK2-DAG: [[SEC11]] = load double*, double** [[SEC111:%[^,]+]],
+// CK2-DAG: [[SEC111]] = getelementptr inbounds {{.*}}[[ST]]* [[VAR0]], i32 0, i32 1
+
+// CK2: br label %[[IFEND:[^,]+]]
+
+// CK2: [[IFELSE]]
+// CK2: br label %[[IFEND]]
+// CK2: [[IFEND]]
+// CK2: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+// CK2: br i1 %{{[^,]+}}, label %[[IFTHEN:[^,]+]], label %[[IFELSE:[^,]+]]
+
+// CK2: [[IFTHEN]]
+// CK2-DAG: call void @__tgt_target_data_end(i32 [[DEV:%[^,]+]], i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE00]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE00]]{{.+}})
+// CK2-DAG: [[DEV]] = load i32, i32* %{{[^,]+}},
+// CK2-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP]]
+// CK2-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P]]
+// CK2: br label %[[IFEND:[^,]+]]
+// CK2: [[IFELSE]]
+// CK2: br label %[[IFEND]]
+// CK2: [[IFEND]]
+#endif
+///==========================================================================///
+// RUN: %clang_cc1 -DCK3 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK3 --check-prefix CK3-64
+// RUN: %clang_cc1 -DCK3 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK3 --check-prefix CK3-64
+// RUN: %clang_cc1 -DCK3 -verify -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK3 --check-prefix CK3-32
+// RUN: %clang_cc1 -DCK3 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK3 --check-prefix CK3-32
+#ifdef CK3
+
+// CK3-LABEL: no_target_devices
+void no_target_devices(int arg) {
+  // CK3-NOT: tgt_target_data_begin
+  // CK3: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  // CK3-NOT: tgt_target_data_end
+  // CK3: ret
+  #pragma omp target data map(to: arg) if(arg) device(4)
+  {++arg;}
+}
+#endif
+#endif
diff --git a/test/OpenMP/target_data_device_messages.cpp b/test/OpenMP/target_data_device_messages.cpp
index 9e8e31a28f5b6..9ed7a54b4399a 100644
--- a/test/OpenMP/target_data_device_messages.cpp
+++ b/test/OpenMP/target_data_device_messages.cpp
@@ -10,18 +10,19 @@ bool foobool(int argc) {
 struct S1; // expected-note {{declared here}}
 
 int main(int argc, char **argv) {
-  #pragma omp target data device // expected-error {{expected '(' after 'device'}}
-  #pragma omp target data device ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
-  #pragma omp target data device () // expected-error {{expected expression}}
-  #pragma omp target data device (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
-  #pragma omp target data device (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target data' are ignored}}
-#pragma omp target data device (argc > 0 ? argv[1] : argv[2]) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
-  #pragma omp target data device (argc + argc)
-  #pragma omp target data device (argc), device (argc+1) // expected-error {{directive '#pragma omp target data' cannot contain more than one 'device' clause}}
-  #pragma omp target data device (S1) // expected-error {{'S1' does not refer to a value}}
-  #pragma omp target data device (-2) // expected-error {{argument to 'device' clause must be a non-negative integer value}}
-  #pragma omp target device (-10u)
-  #pragma omp target device (3.14) // expected-error {{expression must have integral or unscoped enumeration type, not 'double'}}
+  int a;
+  #pragma omp target data map(to: a) device // expected-error {{expected '(' after 'device'}}
+  #pragma omp target data map(to: a) device ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  #pragma omp target data map(to: a) device () // expected-error {{expected expression}}
+  #pragma omp target data map(to: a) device (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  #pragma omp target data map(to: a) device (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target data' are ignored}}
+#pragma omp target data map(to: a) device (argc > 0 ? argv[1] : argv[2]) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  #pragma omp target data map(to: a) device (argc + argc)
+  #pragma omp target data map(to: a) device (argc), device (argc+1) // expected-error {{directive '#pragma omp target data' cannot contain more than one 'device' clause}}
+  #pragma omp target data map(to: a) device (S1) // expected-error {{'S1' does not refer to a value}}
+  #pragma omp target data map(to: a) device (-2) // expected-error {{argument to 'device' clause must be a non-negative integer value}}
+  #pragma omp target data map(to: a) device (-10u)
+  #pragma omp target data map(to: a) device (3.14) // expected-error {{expression must have integral or unscoped enumeration type, not 'double'}}
   foo();
 
   return 0;
diff --git a/test/OpenMP/target_data_if_messages.cpp b/test/OpenMP/target_data_if_messages.cpp
index 77edefa48b8a4..ec6fe26921538 100644
--- a/test/OpenMP/target_data_if_messages.cpp
+++ b/test/OpenMP/target_data_if_messages.cpp
@@ -10,22 +10,23 @@ bool foobool(int argc) {
 struct S1; // expected-note {{declared here}}
 
 int main(int argc, char **argv) {
-  #pragma omp target data if // expected-error {{expected '(' after 'if'}}
-  #pragma omp target data if ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
-  #pragma omp target data if () // expected-error {{expected expression}}
-  #pragma omp target data if (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
-  #pragma omp target data if (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target data' are ignored}}
-  #pragma omp target data if (argc > 0 ? argv[1] : argv[2])
-  #pragma omp target data if (argc + argc)
-  #pragma omp target data if (foobool(argc)), if (true) // expected-error {{directive '#pragma omp target data' cannot contain more than one 'if' clause}}
-  #pragma omp target data if (S1) // expected-error {{'S1' does not refer to a value}}
-  #pragma omp target data if (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
-  #pragma omp target data if(target data : // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
-  #pragma omp target data if(target data : argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
-  #pragma omp target data if(target data : argc)
-  #pragma omp target data if(target data : argc) if (for:argc) // expected-error {{directive name modifier 'for' is not allowed for '#pragma omp target data'}}
-  #pragma omp target data if(target data : argc) if (target data:argc) // expected-error {{directive '#pragma omp target data' cannot contain more than one 'if' clause with 'target data' name modifier}}
-  #pragma omp target data if(target data : argc) if (argc) // expected-error {{no more 'if' clause is allowed}} expected-note {{previous clause with directive name modifier specified here}}
+  int a;
+  #pragma omp target data map(to: a) if // expected-error {{expected '(' after 'if'}}
+  #pragma omp target data map(to: a) if ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  #pragma omp target data map(to: a) if () // expected-error {{expected expression}}
+  #pragma omp target data map(to: a) if (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  #pragma omp target data map(to: a) if (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target data' are ignored}}
+  #pragma omp target data map(to: a) if (argc > 0 ? argv[1] : argv[2])
+  #pragma omp target data map(to: a) if (argc + argc)
+  #pragma omp target data map(to: a) if (foobool(argc)), if (true) // expected-error {{directive '#pragma omp target data' cannot contain more than one 'if' clause}}
+  #pragma omp target data map(to: a) if (S1) // expected-error {{'S1' does not refer to a value}}
+  #pragma omp target data map(to: a) if (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  #pragma omp target data map(to: a) if(target data : // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  #pragma omp target data map(to: a) if(target data : argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  #pragma omp target data map(to: a) if(target data : argc)
+  #pragma omp target data map(to: a) if(target data : argc) if (for:argc) // expected-error {{directive name modifier 'for' is not allowed for '#pragma omp target data'}}
+  #pragma omp target data map(to: a) if(target data : argc) if (target data:argc) // expected-error {{directive '#pragma omp target data' cannot contain more than one 'if' clause with 'target data' name modifier}}
+  #pragma omp target data map(to: a) if(target data : argc) if (argc) // expected-error {{no more 'if' clause is allowed}} expected-note {{previous clause with directive name modifier specified here}}
   foo();
 
   return 0;
diff --git a/test/OpenMP/target_data_messages.c b/test/OpenMP/target_data_messages.c
index cd60d85a9030b..153b4377290ca 100644
--- a/test/OpenMP/target_data_messages.c
+++ b/test/OpenMP/target_data_messages.c
@@ -3,19 +3,22 @@
 void foo() { }
 
 int main(int argc, char **argv) {
+  int a;
+  #pragma omp target data // expected-error {{expected at least one map clause for '#pragma omp target data'}}
+  {}
   L1:
     foo();
-  #pragma omp target data
+  #pragma omp target data map(a)
   {
     foo();
     goto L1; // expected-error {{use of undeclared label 'L1'}}
   }
   goto L2; // expected-error {{use of undeclared label 'L2'}}
-  #pragma omp target data
+  #pragma omp target data map(a)
   L2:
   foo();
 
-  #pragma omp target data(i) // expected-warning {{extra tokens at the end of '#pragma omp target data' are ignored}}
+  #pragma omp target data map(a)(i) // expected-warning {{extra tokens at the end of '#pragma omp target data' are ignored}}
   {
     foo();
   }
diff --git a/test/OpenMP/target_data_use_device_ptr_ast_print.cpp b/test/OpenMP/target_data_use_device_ptr_ast_print.cpp
new file mode 100644
index 0000000000000..4e3253b2798fd
--- /dev/null
+++ b/test/OpenMP/target_data_use_device_ptr_ast_print.cpp
@@ -0,0 +1,154 @@
+// RxUN: %clang_cc1 -verify -fopenmp -std=c++11 -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+struct ST {
+  int *a;
+};
+struct SA {
+  int i, j;
+  int *k = &j;
+  int *&z = k;
+  void func(int arg) {
+#pragma omp target data map(tofrom: i) use_device_ptr(k)
+    {}
+#pragma omp target data map(tofrom: i) use_device_ptr(z)
+    {}
+  return;
+ }
+};
+// CHECK: struct SA
+// CHECK: void func(
+// CHECK: #pragma omp target data map(tofrom: this->i) use_device_ptr(this->k)
+// CHECK: #pragma omp target data map(tofrom: this->i) use_device_ptr(this->z)
+struct SB {
+  unsigned A;
+  unsigned B;
+  float Arr[100];
+  float *Ptr;
+  float *foo() {
+    return &Arr[0];
+  }
+};
+
+struct SC {
+  unsigned A : 2;
+  unsigned B : 3;
+  unsigned C;
+  unsigned D;
+  float Arr[100];
+  SB S;
+  SB ArrS[100];
+  SB *PtrS;
+  SB *&RPtrS;
+  float *Ptr;
+
+  SC(SB *&_RPtrS) : RPtrS(_RPtrS) {}
+};
+
+union SD {
+  unsigned A;
+  float B;
+};
+
+struct S1;
+extern S1 a;
+class S2 {
+  mutable int a;
+public:
+  S2():a(0) { }
+  S2(S2 &s2):a(s2.a) { }
+  static float S2s;
+  static const float S2sc;
+};
+const float S2::S2sc = 0;
+const S2 b;
+const S2 ba[5];
+class S3 {
+  int a;
+public:
+  S3():a(0) { }
+  S3(S3 &s3):a(s3.a) { }
+};
+const S3 c;
+const S3 ca[5];
+extern const int f;
+class S4 {
+  int a;
+  S4();
+  S4(const S4 &s4);
+public:
+  S4(int v):a(v) { }
+};
+class S5 {
+  int a;
+  S5():a(0) {}
+  S5(const S5 &s5):a(s5.a) { }
+public:
+  S5(int v):a(v) { }
+};
+
+S3 h;
+#pragma omp threadprivate(h)
+
+typedef int from;
+
+template <typename T>
+T tmain(T argc) {
+  T i;
+  T &j = i;
+  T *k = &j;
+  T *&z = k;
+#pragma omp target data map(tofrom: i) use_device_ptr(k)
+  {}
+#pragma omp target data map(tofrom: i) use_device_ptr(z)
+  {}
+  return 0;
+}
+
+// CHECK: template <typename T = int> int tmain(int argc) {
+// CHECK-NEXT: int i;
+// CHECK-NEXT: int &j = i;
+// CHECK-NEXT: int *k = &j;
+// CHECK-NEXT: int *&z = k;
+// CHECK-NEXT: #pragma omp target data map(tofrom: i) use_device_ptr(k)
+// CHECK-NEXT: {
+// CHECK-NEXT: }
+// CHECK-NEXT: #pragma omp target data map(tofrom: i) use_device_ptr(z)
+
+// CHECK: template <typename T = int *> int *tmain(int *argc) {
+// CHECK-NEXT: int *i;
+// CHECK-NEXT: int *&j = i;
+// CHECK-NEXT: int **k = &j;
+// CHECK-NEXT: int **&z = k;
+// CHECK-NEXT: #pragma omp target data map(tofrom: i) use_device_ptr(k)
+// CHECK-NEXT: {
+// CHECK-NEXT: }
+// CHECK-NEXT: #pragma omp target data map(tofrom: i) use_device_ptr(z)
+
+// CHECK-LABEL: int main(int argc, char **argv) {
+int main(int argc, char **argv) {
+  int i;
+  int &j = i;
+  int *k = &j;
+  int *&z = k;
+// CHECK-NEXT: int i;
+// CHECK-NEXT: int &j = i;
+// CHECK-NEXT: int *k = &j;
+// CHECK-NEXT: int *&z = k;
+#pragma omp target data map(tofrom: i) use_device_ptr(k)
+// CHECK-NEXT: #pragma omp target data map(tofrom: i) use_device_ptr(k)
+  {}
+// CHECK-NEXT: {
+// CHECK-NEXT: }
+#pragma omp target data map(tofrom: i) use_device_ptr(z)
+// CHECK-NEXT: #pragma omp target data map(tofrom: i) use_device_ptr(z)
+  {}
+  return tmain<int>(argc) + (*tmain<int*>(&argc));
+}
+
+#endif
diff --git a/test/OpenMP/target_data_use_device_ptr_messages.cpp b/test/OpenMP/target_data_use_device_ptr_messages.cpp
new file mode 100644
index 0000000000000..1d8002c1ff603
--- /dev/null
+++ b/test/OpenMP/target_data_use_device_ptr_messages.cpp
@@ -0,0 +1,206 @@
+// RUN: %clang_cc1 -std=c++11 -verify -fopenmp -ferror-limit 200 %s
+struct ST {
+  int *a;
+};
+struct SA {
+  const int d = 5;
+  const int da[5] = { 0 };
+  ST e;
+  ST g[10];
+  int i;
+  int &j = i;
+  int *k = &j;
+  int *&z = k;
+  int aa[10];
+  void func(int arg) {
+#pragma omp target data map(i) use_device_ptr // expected-error {{expected '(' after 'use_device_ptr'}}
+    {}
+#pragma omp target data map(i) use_device_ptr( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected expression}}
+    {}
+#pragma omp target data map(i) use_device_ptr() // expected-error {{expected expression}}
+    {}
+#pragma omp target data map(i) use_device_ptr(alloc) // expected-error {{use of undeclared identifier 'alloc'}}
+    {}
+#pragma omp target data map(i) use_device_ptr(arg // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+    {}
+#pragma omp target data map(i) use_device_ptr(i) // expected-error {{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+    {}
+#pragma omp target data map(i) use_device_ptr(j) // expected-error {{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+    {}
+#pragma omp target data map(i) use_device_ptr(k) // OK
+    {}
+#pragma omp target data map(i) use_device_ptr(z) // OK
+    {}
+#pragma omp target data map(i) use_device_ptr(aa) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+    {}
+#pragma omp target data map(i) use_device_ptr(e) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+    {}
+#pragma omp target data map(i) use_device_ptr(g) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+    {}
+#pragma omp target data map(i) use_device_ptr(k,i,j) // expected-error2 {{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+    {}
+#pragma omp target data map(i) use_device_ptr(d) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+    {}
+#pragma omp target data map(i) use_device_ptr(da) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+    {}
+  return;
+ }
+};
+struct SB {
+  unsigned A;
+  unsigned B;
+  float Arr[100];
+  float *Ptr;
+  float *foo() {
+    return &Arr[0];
+  }
+};
+
+struct SC {
+  unsigned A : 2;
+  unsigned B : 3;
+  unsigned C;
+  unsigned D;
+  float Arr[100];
+  SB S;
+  SB ArrS[100];
+  SB *PtrS;
+  SB *&RPtrS;
+  float *Ptr;
+
+  SC(SB *&_RPtrS) : RPtrS(_RPtrS) {}
+};
+
+union SD {
+  unsigned A;
+  float B;
+};
+
+struct S1;
+extern S1 a;
+class S2 {
+  mutable int a;
+public:
+  S2():a(0) { }
+  S2(S2 &s2):a(s2.a) { }
+  static float S2s;
+  static const float S2sc;
+};
+const float S2::S2sc = 0;
+const S2 b;
+const S2 ba[5];
+class S3 {
+  int a;
+public:
+  S3():a(0) { }
+  S3(S3 &s3):a(s3.a) { }
+};
+const S3 c;
+const S3 ca[5];
+extern const int f;
+class S4 {
+  int a;
+  S4();
+  S4(const S4 &s4);
+public:
+  S4(int v):a(v) { }
+};
+class S5 {
+  int a;
+  S5():a(0) {}
+  S5(const S5 &s5):a(s5.a) { }
+public:
+  S5(int v):a(v) { }
+};
+
+S3 h;
+#pragma omp threadprivate(h)
+
+typedef int from;
+
+template <typename T, int I>
+T tmain(T argc) {
+  const T d = 5;
+  const T da[5] = { 0 };
+  S4 e(4);
+  S5 g(5);
+  T i;
+  T &j = i;
+  T *k = &j;
+  T *&z = k;
+  T aa[10];
+#pragma omp target data map(i) use_device_ptr // expected-error {{expected '(' after 'use_device_ptr'}}
+  {}
+#pragma omp target data map(i) use_device_ptr( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected expression}}
+  {}
+#pragma omp target data map(i) use_device_ptr() // expected-error {{expected expression}}
+  {}
+#pragma omp target data map(i) use_device_ptr(alloc) // expected-error {{use of undeclared identifier 'alloc'}}
+  {}
+#pragma omp target data map(i) use_device_ptr(argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+  {}
+#pragma omp target data map(i) use_device_ptr(i) // expected-error {{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+  {}
+#pragma omp target data map(i) use_device_ptr(j) // expected-error {{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+  {}
+#pragma omp target data map(i) use_device_ptr(k) // OK
+  {}
+#pragma omp target data map(i) use_device_ptr(z) // OK
+  {}
+#pragma omp target data map(i) use_device_ptr(aa) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+  {}
+#pragma omp target data map(i) use_device_ptr(e) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+  {}
+#pragma omp target data map(i) use_device_ptr(g) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+  {}
+#pragma omp target data map(i) use_device_ptr(k,i,j) // expected-error2 {{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+  {}
+#pragma omp target data map(i) use_device_ptr(d) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+  {}
+#pragma omp target data map(i) use_device_ptr(da) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+  {}
+  return 0;
+}
+
+int main(int argc, char **argv) {
+  const int d = 5;
+  const int da[5] = { 0 };
+  S4 e(4);
+  S5 g(5);
+  int i;
+  int &j = i;
+  int *k = &j;
+  int *&z = k;
+  int aa[10];
+#pragma omp target data map(i) use_device_ptr // expected-error {{expected '(' after 'use_device_ptr'}}
+  {}
+#pragma omp target data map(i) use_device_ptr( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected expression}}
+  {}
+#pragma omp target data map(i) use_device_ptr() // expected-error {{expected expression}}
+  {}
+#pragma omp target data map(i) use_device_ptr(alloc) // expected-error {{use of undeclared identifier 'alloc'}}
+  {}
+#pragma omp target data map(i) use_device_ptr(argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+  {}
+#pragma omp target data map(i) use_device_ptr(i) // expected-error {{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+  {}
+#pragma omp target data map(i) use_device_ptr(j) // expected-error {{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+  {}
+#pragma omp target data map(i) use_device_ptr(k) // OK
+  {}
+#pragma omp target data map(i) use_device_ptr(z) // OK
+  {}
+#pragma omp target data map(i) use_device_ptr(aa) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+  {}
+#pragma omp target data map(i) use_device_ptr(e) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+  {}
+#pragma omp target data map(i) use_device_ptr(g) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+  {}
+#pragma omp target data map(i) use_device_ptr(k,i,j) // expected-error2 {{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+  {}
+#pragma omp target data map(i) use_device_ptr(d) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+  {}
+#pragma omp target data map(i) use_device_ptr(da) // expected-error{{expected pointer or reference to pointer in 'use_device_ptr' clause}}
+  {}
+  return tmain<int, 3>(argc); // expected-note {{in instantiation of function template specialization 'tmain<int, 3>' requested here}}
+}
diff --git a/test/OpenMP/target_defaultmap_messages.cpp b/test/OpenMP/target_defaultmap_messages.cpp
new file mode 100644
index 0000000000000..59348d42c5253
--- /dev/null
+++ b/test/OpenMP/target_defaultmap_messages.cpp
@@ -0,0 +1,56 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() {
+}
+
+template <class T, typename S, int N, int ST>
+T tmain(T argc, S **argv) {
+  #pragma omp target defaultmap // expected-error {{expected '(' after 'defaultmap'}}
+  foo();
+  #pragma omp target defaultmap ( // expected-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target defaultmap () // expected-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+  foo();
+  #pragma omp target defaultmap (tofrom // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  foo();
+  #pragma omp target defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  foo();
+  #pragma omp target defaultmap (tofrom) // expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  foo();
+  #pragma omp target defaultmap (tofrom scalar) // expected-warning {{missing ':' after defaultmap modifier - ignoring}}
+  foo();
+  #pragma omp target defaultmap (tofrom, // expected-error {{expected ')'}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}} expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target defaultmap (scalar: // expected-error {{expected ')'}} expected-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target defaultmap (tofrom, scalar // expected-error {{expected ')'}} expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}}
+  foo();
+
+  return argc;
+}
+
+int main(int argc, char **argv) {
+  #pragma omp target defaultmap // expected-error {{expected '(' after 'defaultmap'}}
+  foo();
+  #pragma omp target defaultmap ( // expected-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target defaultmap () // expected-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+  foo();
+  #pragma omp target defaultmap (tofrom // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  foo();
+  #pragma omp target defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  foo();
+  #pragma omp target defaultmap (tofrom) // expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  foo();
+  #pragma omp target defaultmap (tofrom scalar) // expected-warning {{missing ':' after defaultmap modifier - ignoring}}
+  foo();
+  #pragma omp target defaultmap (tofrom, // expected-error {{expected ')'}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}} expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target defaultmap (scalar: // expected-error {{expected ')'}} expected-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target defaultmap (tofrom, scalar // expected-error {{expected ')'}} expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}}
+  foo();
+
+  return tmain<int, char, 1, 0>(argc, argv);
+}
+
diff --git a/test/OpenMP/target_depend_messages.cpp b/test/OpenMP/target_depend_messages.cpp
new file mode 100644
index 0000000000000..c13a3edaf699a
--- /dev/null
+++ b/test/OpenMP/target_depend_messages.cpp
@@ -0,0 +1,89 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -o - -std=c++11 %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+class vector {
+  public:
+    int operator[](int index) { return 0; }
+};
+
+int main(int argc, char **argv, char *env[]) {
+  vector vec;
+  typedef float V __attribute__((vector_size(16)));
+  V a;
+  auto arr = x; // expected-error {{use of undeclared identifier 'x'}}
+
+  #pragma omp target depend // expected-error {{expected '(' after 'depend'}}
+  foo();
+  #pragma omp target depend ( // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-warning {{missing ':' after dependency type - ignoring}}
+  foo();
+  #pragma omp target depend () // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-warning {{missing ':' after dependency type - ignoring}}
+  foo();
+  #pragma omp target depend (argc // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-warning {{missing ':' after dependency type - ignoring}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target depend (source : argc) // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}}
+  foo();
+  #pragma omp target depend (source) // expected-error {{expected expression}} expected-warning {{missing ':' after dependency type - ignoring}}
+  foo();
+  #pragma omp target depend (in : argc)) // expected-warning {{extra tokens at the end of '#pragma omp target' are ignored}}
+  foo();
+  #pragma omp target depend (out: ) // expected-error {{expected expression}}
+  foo();
+  #pragma omp target depend (inout : foobool(argc)), depend (in, argc) // expected-error {{expected variable name, array element or array section}} expected-warning {{missing ':' after dependency type - ignoring}} expected-error {{expected expression}}
+  foo();
+  #pragma omp target depend (out :S1) // expected-error {{'S1' does not refer to a value}}
+  foo();
+  #pragma omp target depend(in : argv[1][1] = '2') // expected-error {{expected variable name, array element or array section}}
+  foo();
+  #pragma omp target depend (in : vec[1]) // expected-error {{expected variable name, array element or array section}}
+  foo();
+  #pragma omp target depend (in : argv[0])
+  foo();
+  #pragma omp target depend (in : ) // expected-error {{expected expression}}
+  foo();
+  #pragma omp target depend (in : main) // expected-error {{expected variable name, array element or array section}}
+  foo();
+  #pragma omp target depend(in : a[0]) // expected-error{{expected variable name, array element or array section}}
+  foo();
+  #pragma omp target depend (in : vec[1:2]) // expected-error {{ value is not an array or pointer}}
+  foo();
+  #pragma omp target depend (in : argv[ // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target depend (in : argv[: // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target depend (in : argv[:] // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target depend (in : argv[argc: // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target depend (in : argv[argc:argc] // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target depend (in : argv[0:-1]) // expected-error {{section length is evaluated to a negative value -1}}
+  foo();
+  #pragma omp target depend (in : argv[-1:0]) // expected-error {{section lower bound is evaluated to a negative value -1}}
+  foo();
+  #pragma omp target depend (in : argv[:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
+  foo();
+  #pragma omp target depend (in : argv[3:4:1]) // expected-error {{expected ']'}} expected-note {{to match this '['}}
+  foo();
+  #pragma omp target depend(in:a[0:1]) // expected-error {{subscripted value is not an array or pointer}}
+  foo();
+  #pragma omp target depend(in:argv[argv[:2]:1]) // expected-error {{OpenMP array section is not allowed here}}
+  foo();
+  #pragma omp target depend(in:argv[0:][:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
+  foo();
+  #pragma omp target depend(in:env[0:][:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is an array of unknown bound}}
+  foo();
+  #pragma omp target depend(in : argv[ : argc][1 : argc - 1])
+  foo();
+  #pragma omp target depend(in : arr[0])
+  foo();
+
+  return 0;
+}
diff --git a/test/OpenMP/target_device_messages.cpp b/test/OpenMP/target_device_messages.cpp
index fb0f2defa42bc..3befcd690810b 100644
--- a/test/OpenMP/target_device_messages.cpp
+++ b/test/OpenMP/target_device_messages.cpp
@@ -11,16 +11,27 @@ struct S1; // expected-note {{declared here}}
 
 int main(int argc, char **argv) {
   #pragma omp target device // expected-error {{expected '(' after 'device'}}
+  foo();
   #pragma omp target device ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
   #pragma omp target device () // expected-error {{expected expression}}
+  foo();
   #pragma omp target device (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
   #pragma omp target device (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target' are ignored}}
-#pragma omp target device (argc > 0 ? argv[1] : argv[2]) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  foo();
+  #pragma omp target device (argc > 0 ? argv[1] : argv[2]) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  foo();
   #pragma omp target device (argc + argc)
+  foo();
   #pragma omp target device (argc), device (argc+1) // expected-error {{directive '#pragma omp target' cannot contain more than one 'device' clause}}
+  foo();
   #pragma omp target device (S1) // expected-error {{'S1' does not refer to a value}}
+  foo();
   #pragma omp target device (-2) // expected-error {{argument to 'device' clause must be a non-negative integer value}}
+  foo();
   #pragma omp target device (-10u)
+  foo();
   #pragma omp target device (3.14) // expected-error {{expression must have integral or unscoped enumeration type, not 'double'}}
   foo();
 
diff --git a/test/OpenMP/target_enter_data_ast_print.cpp b/test/OpenMP/target_enter_data_ast_print.cpp
new file mode 100644
index 0000000000000..10ec925284998
--- /dev/null
+++ b/test/OpenMP/target_enter_data_ast_print.cpp
@@ -0,0 +1,228 @@
+// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+template <typename T, int C>
+T tmain(T argc, T *argv) {
+  T i, j, b, c, d, e, x[20];
+
+  i = argc;
+#pragma omp target enter data map(to: i)
+
+#pragma omp target enter data map(to: i) if (target enter data: j > 0)
+
+#pragma omp target enter data map(to: i) if (b)
+
+#pragma omp target enter data map(to: c)
+
+#pragma omp target enter data map(to: c) if(b>e)
+
+#pragma omp target enter data map(alloc: x[0:10], c)
+
+#pragma omp target enter data map(to: c) map(alloc: d)
+
+#pragma omp target enter data map(always,alloc: e)
+
+#pragma omp target enter data nowait map(to: i)
+
+#pragma omp target enter data nowait map(to: i) if (target enter data: j > 0)
+
+#pragma omp target enter data map(to: i) if (b) nowait
+
+#pragma omp target enter data map(to: c) nowait
+
+#pragma omp target enter data map(to: c) nowait if(b>e)
+
+#pragma omp target enter data nowait map(alloc: x[0:10], c)
+
+#pragma omp target enter data nowait map(to: c) map(alloc: d)
+
+#pragma omp target enter data nowait map(always,alloc: e)
+
+#pragma omp target enter data nowait depend(in : argc, argv[i:argc], x[:]) map(to: i)
+
+#pragma omp target enter data nowait map(to: i) if (target enter data: j > 0) depend(in : argc, argv[i:argc], x[:])
+
+#pragma omp target enter data depend(in : argc, argv[i:argc], x[:]) map(to: i) if (b) nowait
+
+#pragma omp target enter data map(to: c) depend(in : argc, argv[i:argc], x[:]) nowait
+
+#pragma omp target enter data map(to: c) nowait if(b>e) depend(in : argc, argv[i:argc], x[:])
+
+#pragma omp target enter data nowait map(alloc: x[0:10], c) depend(in : argc, argv[i:argc], x[:])
+
+#pragma omp target enter data nowait depend(in : argc, argv[i:argc], x[:]) map(to: c) map(alloc: d)
+
+#pragma omp target enter data nowait map(always,alloc: e) depend(in : argc, argv[i:argc], x[:])
+
+  return 0;
+}
+
+// CHECK: template <typename T = int, int C = 5> int tmain(int argc, int *argv) {
+// CHECK-NEXT: int i, j, b, c, d, e, x[20];
+// CHECK-NEXT: i = argc;
+// CHECK-NEXT: #pragma omp target enter data map(to: i)
+// CHECK-NEXT: #pragma omp target enter data map(to: i) if(target enter data: j > 0)
+// CHECK-NEXT: #pragma omp target enter data map(to: i) if(b)
+// CHECK-NEXT: #pragma omp target enter data map(to: c)
+// CHECK-NEXT: #pragma omp target enter data map(to: c) if(b > e)
+// CHECK-NEXT: #pragma omp target enter data map(alloc: x[0:10],c)
+// CHECK-NEXT: #pragma omp target enter data map(to: c) map(alloc: d)
+// CHECK-NEXT: #pragma omp target enter data map(always,alloc: e)
+// CHECK-NEXT: #pragma omp target enter data nowait map(to: i)
+// CHECK-NEXT: #pragma omp target enter data nowait map(to: i) if(target enter data: j > 0)
+// CHECK-NEXT: #pragma omp target enter data map(to: i) if(b) nowait
+// CHECK-NEXT: #pragma omp target enter data map(to: c) nowait
+// CHECK-NEXT: #pragma omp target enter data map(to: c) nowait if(b > e)
+// CHECK-NEXT: #pragma omp target enter data nowait map(alloc: x[0:10],c)
+// CHECK-NEXT: #pragma omp target enter data nowait map(to: c) map(alloc: d)
+// CHECK-NEXT: #pragma omp target enter data nowait map(always,alloc: e)
+// CHECK-NEXT: #pragma omp target enter data nowait depend(in : argc,argv[i:argc],x[:]) map(to: i)
+// CHECK-NEXT: #pragma omp target enter data nowait map(to: i) if(target enter data: j > 0) depend(in : argc,argv[i:argc],x[:])
+// CHECK-NEXT: #pragma omp target enter data depend(in : argc,argv[i:argc],x[:]) map(to: i) if(b) nowait
+// CHECK-NEXT: #pragma omp target enter data map(to: c) depend(in : argc,argv[i:argc],x[:]) nowait
+// CHECK-NEXT: #pragma omp target enter data map(to: c) nowait if(b > e) depend(in : argc,argv[i:argc],x[:])
+// CHECK-NEXT: #pragma omp target enter data nowait map(alloc: x[0:10],c) depend(in : argc,argv[i:argc],x[:])
+// CHECK-NEXT: #pragma omp target enter data nowait depend(in : argc,argv[i:argc],x[:]) map(to: c) map(alloc: d)
+// CHECK-NEXT: #pragma omp target enter data nowait map(always,alloc: e) depend(in : argc,argv[i:argc],x[:])
+// CHECK: template <typename T = char, int C = 1> char tmain(char argc, char *argv) {
+// CHECK-NEXT: char i, j, b, c, d, e, x[20];
+// CHECK-NEXT: i = argc;
+// CHECK-NEXT: #pragma omp target enter data map(to: i)
+// CHECK-NEXT: #pragma omp target enter data map(to: i) if(target enter data: j > 0)
+// CHECK-NEXT: #pragma omp target enter data map(to: i) if(b)
+// CHECK-NEXT: #pragma omp target enter data map(to: c)
+// CHECK-NEXT: #pragma omp target enter data map(to: c) if(b > e)
+// CHECK-NEXT: #pragma omp target enter data map(alloc: x[0:10],c)
+// CHECK-NEXT: #pragma omp target enter data map(to: c) map(alloc: d)
+// CHECK-NEXT: #pragma omp target enter data map(always,alloc: e)
+// CHECK-NEXT: #pragma omp target enter data nowait map(to: i)
+// CHECK-NEXT: #pragma omp target enter data nowait map(to: i) if(target enter data: j > 0)
+// CHECK-NEXT: #pragma omp target enter data map(to: i) if(b) nowait
+// CHECK-NEXT: #pragma omp target enter data map(to: c) nowait
+// CHECK-NEXT: #pragma omp target enter data map(to: c) nowait if(b > e)
+// CHECK-NEXT: #pragma omp target enter data nowait map(alloc: x[0:10],c)
+// CHECK-NEXT: #pragma omp target enter data nowait map(to: c) map(alloc: d)
+// CHECK-NEXT: #pragma omp target enter data nowait map(always,alloc: e)
+// CHECK-NEXT: #pragma omp target enter data nowait depend(in : argc,argv[i:argc],x[:]) map(to: i)
+// CHECK-NEXT: #pragma omp target enter data nowait map(to: i) if(target enter data: j > 0) depend(in : argc,argv[i:argc],x[:])
+// CHECK-NEXT: #pragma omp target enter data depend(in : argc,argv[i:argc],x[:]) map(to: i) if(b) nowait
+// CHECK-NEXT: #pragma omp target enter data map(to: c) depend(in : argc,argv[i:argc],x[:]) nowait
+// CHECK-NEXT: #pragma omp target enter data map(to: c) nowait if(b > e) depend(in : argc,argv[i:argc],x[:])
+// CHECK-NEXT: #pragma omp target enter data nowait map(alloc: x[0:10],c) depend(in : argc,argv[i:argc],x[:])
+// CHECK-NEXT: #pragma omp target enter data nowait depend(in : argc,argv[i:argc],x[:]) map(to: c) map(alloc: d)
+// CHECK-NEXT: #pragma omp target enter data nowait map(always,alloc: e) depend(in : argc,argv[i:argc],x[:])
+// CHECK: template <typename T, int C> T tmain(T argc, T *argv) {
+// CHECK-NEXT: T i, j, b, c, d, e, x[20];
+// CHECK-NEXT: i = argc;
+// CHECK-NEXT: #pragma omp target enter data map(to: i)
+// CHECK-NEXT: #pragma omp target enter data map(to: i) if(target enter data: j > 0)
+// CHECK-NEXT: #pragma omp target enter data map(to: i) if(b)
+// CHECK-NEXT: #pragma omp target enter data map(to: c)
+// CHECK-NEXT: #pragma omp target enter data map(to: c) if(b > e)
+// CHECK-NEXT: #pragma omp target enter data map(alloc: x[0:10],c)
+// CHECK-NEXT: #pragma omp target enter data map(to: c) map(alloc: d)
+// CHECK-NEXT: #pragma omp target enter data map(always,alloc: e)
+// CHECK-NEXT: #pragma omp target enter data nowait map(to: i)
+// CHECK-NEXT: #pragma omp target enter data nowait map(to: i) if(target enter data: j > 0)
+// CHECK-NEXT: #pragma omp target enter data map(to: i) if(b) nowait
+// CHECK-NEXT: #pragma omp target enter data map(to: c) nowait
+// CHECK-NEXT: #pragma omp target enter data map(to: c) nowait if(b > e)
+// CHECK-NEXT: #pragma omp target enter data nowait map(alloc: x[0:10],c)
+// CHECK-NEXT: #pragma omp target enter data nowait map(to: c) map(alloc: d)
+// CHECK-NEXT: #pragma omp target enter data nowait map(always,alloc: e)
+// CHECK-NEXT: #pragma omp target enter data nowait depend(in : argc,argv[i:argc],x[:]) map(to: i)
+// CHECK-NEXT: #pragma omp target enter data nowait map(to: i) if(target enter data: j > 0) depend(in : argc,argv[i:argc],x[:])
+// CHECK-NEXT: #pragma omp target enter data depend(in : argc,argv[i:argc],x[:]) map(to: i) if(b) nowait
+// CHECK-NEXT: #pragma omp target enter data map(to: c) depend(in : argc,argv[i:argc],x[:]) nowait
+// CHECK-NEXT: #pragma omp target enter data map(to: c) nowait if(b > e) depend(in : argc,argv[i:argc],x[:])
+// CHECK-NEXT: #pragma omp target enter data nowait map(alloc: x[0:10],c) depend(in : argc,argv[i:argc],x[:])
+// CHECK-NEXT: #pragma omp target enter data nowait depend(in : argc,argv[i:argc],x[:]) map(to: c) map(alloc: d)
+// CHECK-NEXT: #pragma omp target enter data nowait map(always,alloc: e) depend(in : argc,argv[i:argc],x[:])
+
+int main (int argc, char **argv) {
+  int b = argc, i, c, d, e, f, g, x[20];
+  static int a;
+// CHECK: static int a;
+
+#pragma omp target enter data map(to: a)
+// CHECK:      #pragma omp target enter data map(to: a)
+  a=2;
+// CHECK-NEXT: a = 2;
+#pragma omp target enter data map(to: a) if (target enter data: b)
+// CHECK: #pragma omp target enter data map(to: a) if(target enter data: b)
+
+#pragma omp target enter data map(to: a) if (b > g)
+// CHECK: #pragma omp target enter data map(to: a) if(b > g)
+
+#pragma omp target enter data map(to: c)
+// CHECK-NEXT: #pragma omp target enter data map(to: c)
+
+#pragma omp target enter data map(alloc: c) if(b>g)
+// CHECK-NEXT: #pragma omp target enter data map(alloc: c) if(b > g)
+
+#pragma omp target enter data map(to: x[0:10], c)
+// CHECK-NEXT: #pragma omp target enter data map(to: x[0:10],c)
+
+#pragma omp target enter data map(to: c) map(alloc: d)
+// CHECK-NEXT: #pragma omp target enter data map(to: c) map(alloc: d)
+
+#pragma omp target enter data map(always,alloc: e)
+// CHECK-NEXT: #pragma omp target enter data map(always,alloc: e)
+
+#pragma omp target enter data nowait map(to: a)
+// CHECK:      #pragma omp target enter data nowait map(to: a)
+
+#pragma omp target enter data nowait map(to: a) if (target enter data: b)
+// CHECK: #pragma omp target enter data nowait map(to: a) if(target enter data: b)
+
+#pragma omp target enter data map(to: a) if (b > g) nowait
+// CHECK: #pragma omp target enter data map(to: a) if(b > g) nowait
+
+#pragma omp target enter data map(to: c) nowait
+// CHECK-NEXT: #pragma omp target enter data map(to: c) nowait
+
+#pragma omp target enter data map(alloc: c) nowait if(b>g)
+// CHECK-NEXT: #pragma omp target enter data map(alloc: c) nowait if(b > g)
+
+#pragma omp target enter data nowait map(to: x[0:10], c)
+// CHECK-NEXT: #pragma omp target enter data nowait map(to: x[0:10],c)
+
+#pragma omp target enter data nowait map(to: c) map(alloc: d)
+// CHECK-NEXT: #pragma omp target enter data nowait map(to: c) map(alloc: d)
+
+#pragma omp target enter data nowait map(always,alloc: e)
+// CHECK-NEXT: #pragma omp target enter data nowait map(always,alloc: e)
+
+#pragma omp target enter data depend(in : argc, argv[i:argc], x[:]) nowait map(to: a)
+// CHECK:      #pragma omp target enter data depend(in : argc,argv[i:argc],x[:]) nowait map(to: a)
+
+#pragma omp target enter data nowait depend(in : argc, argv[i:argc], x[:]) map(to: a) if (target enter data: b)
+// CHECK: #pragma omp target enter data nowait depend(in : argc,argv[i:argc],x[:]) map(to: a) if(target enter data: b)
+
+#pragma omp target enter data map(to: a) depend(in : argc, argv[i:argc], x[:]) if (b > g) nowait
+// CHECK: #pragma omp target enter data map(to: a) depend(in : argc,argv[i:argc],x[:]) if(b > g) nowait
+
+#pragma omp target enter data map(to: c) nowait depend(in : argc, argv[i:argc], x[:])
+// CHECK-NEXT: #pragma omp target enter data map(to: c) nowait depend(in : argc,argv[i:argc],x[:])
+
+#pragma omp target enter data depend(in : argc, argv[i:argc], x[:]) map(alloc: c) nowait if(b>g)
+// CHECK-NEXT: #pragma omp target enter data depend(in : argc,argv[i:argc],x[:]) map(alloc: c) nowait if(b > g)
+
+#pragma omp target enter data nowait map(to: x[0:10], c) depend(in : argc, argv[i:argc], x[:])
+// CHECK-NEXT: #pragma omp target enter data nowait map(to: x[0:10],c) depend(in : argc,argv[i:argc],x[:])
+
+#pragma omp target enter data nowait map(to: c) depend(in : argc, argv[i:argc], x[:]) map(alloc: d)
+// CHECK-NEXT: #pragma omp target enter data nowait map(to: c) depend(in : argc,argv[i:argc],x[:]) map(alloc: d)
+
+#pragma omp target enter data nowait map(always,alloc: e) depend(in : argc, argv[i:argc], x[:])
+// CHECK-NEXT: #pragma omp target enter data nowait map(always,alloc: e) depend(in : argc,argv[i:argc],x[:])
+
+  return tmain<int, 5>(argc, &argc) + tmain<char, 1>(argv[0][0], argv[0]);
+}
+
+#endif
diff --git a/test/OpenMP/target_enter_data_codegen.cpp b/test/OpenMP/target_enter_data_codegen.cpp
new file mode 100644
index 0000000000000..152cd46b4a2ff
--- /dev/null
+++ b/test/OpenMP/target_enter_data_codegen.cpp
@@ -0,0 +1,249 @@
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+///==========================================================================///
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-64
+// RUN: %clang_cc1 -DCK1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK1 --check-prefix CK1-64
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK1 --check-prefix CK1-32
+// RUN: %clang_cc1 -DCK1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK1 --check-prefix CK1-32
+#ifdef CK1
+
+// CK1: [[ST:%.+]] = type { i32, double* }
+template <typename T>
+struct ST {
+  T a;
+  double *b;
+};
+
+ST<int> gb;
+double gc[100];
+
+// CK1: [[SIZE00:@.+]] = {{.+}}constant [1 x i[[sz:64|32]]] [i{{64|32}} 800]
+// CK1: [[MTYPE00:@.+]] = {{.+}}constant [1 x i32] [i32 32]
+
+// CK1: [[SIZE02:@.+]] = {{.+}}constant [1 x i[[sz]]] [i[[sz]] 4]
+// CK1: [[MTYPE02:@.+]] = {{.+}}constant [1 x i32] [i32 33]
+
+// CK1: [[MTYPE03:@.+]] = {{.+}}constant [1 x i32] [i32 37]
+
+// CK1: [[SIZE04:@.+]] = {{.+}}constant [2 x i[[sz]]] [i[[sz]] {{8|4}}, i[[sz]] 24]
+// CK1: [[MTYPE04:@.+]] = {{.+}}constant [2 x i32] [i32 33, i32 17]
+
+// CK1-LABEL: _Z3fooi
+void foo(int arg) {
+  int la;
+  float lb[arg];
+
+  // Region 00
+  // CK1-DAG: call void @__tgt_target_data_begin(i32 [[DEV:%[^,]+]], i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE00]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE00]]{{.+}})
+  // CK1-DAG: [[DEV]] = load i32, i32* %{{[^,]+}},
+  // CK1-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK1-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK1-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: store i8* bitcast ([100 x double]* @gc to i8*), i8** [[BP0]]
+  // CK1-DAG: store i8* bitcast ([100 x double]* @gc to i8*), i8** [[P0]]
+
+  // CK1: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  // CK1-NOT: __tgt_target_data_end
+  #pragma omp target enter data if(1+3-5) device(arg) map(alloc: gc)
+  {++arg;}
+
+  // Region 01
+  // CK1: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  #pragma omp target enter data map(to: la) if(1+3-4)
+  {++arg;}
+
+  // Region 02
+  // CK1: br i1 %{{[^,]+}}, label %[[IFTHEN:[^,]+]], label %[[IFELSE:[^,]+]]
+  // CK1: [[IFTHEN]]
+  // CK1-DAG: call void @__tgt_target_data_begin(i32 4, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE02]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE02]]{{.+}})
+  // CK1-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK1-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK1-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK1-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK1-DAG: [[CBPVAL0]] = bitcast i32* [[VAR0:%.+]] to i8*
+  // CK1-DAG: [[CPVAL0]] = bitcast i32* [[VAR0]] to i8*
+  // CK1: br label %[[IFEND:[^,]+]]
+
+  // CK1: [[IFELSE]]
+  // CK1: br label %[[IFEND]]
+  // CK1: [[IFEND]]
+  // CK1: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  // CK1-NOT: __tgt_target_data_end
+  #pragma omp target enter data map(to: arg) if(arg) device(4)
+  {++arg;}
+
+  // CK1: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  {++arg;}
+
+  // Region 03
+  // CK1-DAG: call void @__tgt_target_data_begin(i32 -1, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], i[[sz]]* [[GEPS:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE03]]{{.+}})
+  // CK1-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK1-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+  // CK1-DAG: [[GEPS]] = getelementptr inbounds {{.+}}[[S:%[^,]+]]
+
+  // CK1-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: [[S0:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK1-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK1-DAG: store i[[sz]] [[CSVAL0:%[^,]+]], i[[sz]]* [[S0]]
+  // CK1-DAG: [[CBPVAL0]] = bitcast float* [[VAR0:%.+]] to i8*
+  // CK1-DAG: [[CPVAL0]] = bitcast float* [[VAR0]] to i8*
+  // CK1-DAG: [[CSVAL0]] = mul nuw i[[sz]] %{{[^,]+}}, 4
+  // CK1: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  // CK1-NOT: __tgt_target_data_end
+  #pragma omp target enter data map(always, to: lb)
+  {++arg;}
+
+  // CK1: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  {++arg;}
+
+  // Region 04
+  // CK1-DAG: call void @__tgt_target_data_begin(i32 -1, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE04]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE04]]{{.+}})
+  // CK1-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK1-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK1-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: store i8* bitcast ([[ST]]* @gb to i8*), i8** [[BP0]]
+  // CK1-DAG: store i8* bitcast (double** getelementptr inbounds ([[ST]], [[ST]]* @gb, i32 0, i32 1) to i8*), i8** [[P0]]
+
+
+  // CK1-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+  // CK1-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+  // CK1-DAG: store i8* bitcast (double** getelementptr inbounds ([[ST]], [[ST]]* @gb, i32 0, i32 1) to i8*), i8** [[BP1]]
+  // CK1-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+  // CK1-DAG: [[CPVAL1]] = bitcast double* [[SEC1:%.+]] to i8*
+  // CK1-DAG: [[SEC1]] = getelementptr inbounds {{.+}}double* [[SEC11:%[^,]+]], i{{.+}} 0
+  // CK1-DAG: [[SEC11]] = load double*, double** getelementptr inbounds ([[ST]], [[ST]]* @gb, i32 0, i32 1),
+
+  // CK1: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  // CK1-NOT: __tgt_target_data_end
+  #pragma omp target enter data map(to: gb.b[:3])
+  {++arg;}
+}
+#endif
+///==========================================================================///
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-64
+// RUN: %clang_cc1 -DCK2 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK2 --check-prefix CK2-64
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK2 --check-prefix CK2-32
+// RUN: %clang_cc1 -DCK2 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK2 --check-prefix CK2-32
+#ifdef CK2
+
+// CK2: [[ST:%.+]] = type { i32, double* }
+template <typename T>
+struct ST {
+  T a;
+  double *b;
+
+  T foo(T arg) {
+    // Region 00
+    #pragma omp target enter data map(always, to: b[1:3]) if(a>123) device(arg)
+    {arg++;}
+    return arg;
+  }
+};
+
+// CK2: [[SIZE00:@.+]] = {{.+}}constant [2 x i[[sz:64|32]]] [i{{64|32}} {{8|4}}, i{{64|32}} 24]
+// CK2: [[MTYPE00:@.+]] = {{.+}}constant [2 x i32] [i32 37, i32 21]
+
+// CK2-LABEL: _Z3bari
+int bar(int arg){
+  ST<int> A;
+  return A.foo(arg);
+}
+
+// Region 00
+// CK2: br i1 %{{[^,]+}}, label %[[IFTHEN:[^,]+]], label %[[IFELSE:[^,]+]]
+// CK2: [[IFTHEN]]
+// CK2-DAG: call void @__tgt_target_data_begin(i32 [[DEV:%[^,]+]], i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE00]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE00]]{{.+}})
+// CK2-DAG: [[DEV]] = load i32, i32* %{{[^,]+}},
+// CK2-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+// CK2-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+// CK2-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+// CK2-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+// CK2-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+// CK2-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+// CK2-DAG: [[CBPVAL0]] = bitcast [[ST]]* [[VAR0:%.+]] to i8*
+// CK2-DAG: [[CPVAL0]] = bitcast double** [[SEC0:%[^,]+]] to i8*
+// CK2-DAG: [[SEC0]] = getelementptr inbounds {{.*}}[[ST]]* [[VAR0]], i32 0, i32 1
+
+
+// CK2-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+// CK2-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+// CK2-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+// CK2-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+// CK2-DAG: [[CBPVAL1]] = bitcast double** [[SEC0]] to i8*
+// CK2-DAG: [[CPVAL1]] = bitcast double* [[SEC1:%[^,]+]] to i8*
+// CK2-DAG: [[SEC1]] = getelementptr inbounds {{.*}}double* [[SEC11:%[^,]+]], i{{.+}} 1
+// CK2-DAG: [[SEC11]] = load double*, double** [[SEC111:%[^,]+]],
+// CK2-DAG: [[SEC111]] = getelementptr inbounds {{.*}}[[ST]]* [[VAR0]], i32 0, i32 1
+
+// CK2: br label %[[IFEND:[^,]+]]
+
+// CK2: [[IFELSE]]
+// CK2: br label %[[IFEND]]
+// CK2: [[IFEND]]
+// CK2: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+// CK2-NOT: __tgt_target_data_end
+#endif
+///==========================================================================///
+// RUN: %clang_cc1 -DCK3 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK3 --check-prefix CK3-64
+// RUN: %clang_cc1 -DCK3 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK3 --check-prefix CK3-64
+// RUN: %clang_cc1 -DCK3 -verify -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK3 --check-prefix CK3-32
+// RUN: %clang_cc1 -DCK3 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK3 --check-prefix CK3-32
+#ifdef CK3
+
+// CK3-LABEL: no_target_devices
+void no_target_devices(int arg) {
+  // CK3-NOT: tgt_target_data_begin
+  // CK3: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  // CK3-NOT: tgt_target_data_end
+  // CK3: ret
+  #pragma omp target enter data map(to: arg) if(arg) device(4)
+  {++arg;}
+}
+#endif
+///==========================================================================///
+// RUN: %clang_cc1 -DCK4 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK4 --check-prefix CK4-64
+// RUN: %clang_cc1 -DCK4 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK4 --check-prefix CK4-64
+// RUN: %clang_cc1 -DCK4 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK4 --check-prefix CK4-32
+// RUN: %clang_cc1 -DCK4 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK4 --check-prefix CK4-32
+
+// RUN: %clang_cc1 -DCK4 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -DCK4 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix TCK4 --check-prefix TCK4-64
+// RUN: %clang_cc1 -DCK4 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCK4 --check-prefix TCK4-64
+// RUN: %clang_cc1 -DCK4 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -DCK4 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix TCK4 --check-prefix TCK4-32
+// RUN: %clang_cc1 -DCK4 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCK4 --check-prefix TCK4-32
+#ifdef CK4
+
+// CK4-LABEL: device_side_scan
+void device_side_scan(int arg) {
+  // CK4: tgt_target_data_begin
+  // CK4: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  // CK4: ret
+  // TCK4-NOT: tgt_target_data_begin
+  #pragma omp target enter data map(to: arg) if(arg) device(4)
+  {++arg;}
+}
+#endif
+#endif
diff --git a/test/OpenMP/target_enter_data_depend_messages.cpp b/test/OpenMP/target_enter_data_depend_messages.cpp
new file mode 100644
index 0000000000000..84254b5c312a9
--- /dev/null
+++ b/test/OpenMP/target_enter_data_depend_messages.cpp
@@ -0,0 +1,166 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -o - -std=c++11 %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}} expected-note {{declared here}}
+
+class vector {
+  public:
+    int operator[](int index) { return 0; }
+};
+
+template <class T, class S, class R>
+int tmain(T argc, S **argv, R *env[]) {
+  vector vec;
+  typedef float V __attribute__((vector_size(16)));
+  V a;
+  char *arr;
+
+  int i;
+  #pragma omp target enter data map(to: i) depend // expected-error {{expected '(' after 'depend'}}
+  foo();
+  #pragma omp target enter data map(to: i) depend ( // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-warning {{missing ':' after dependency type - ignoring}}
+  foo();
+  #pragma omp target enter data map(to: i) depend () // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-warning {{missing ':' after dependency type - ignoring}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (argc // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-warning {{missing ':' after dependency type - ignoring}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (source : argc) // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (source) // expected-error {{expected expression}} expected-warning {{missing ':' after dependency type - ignoring}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : argc)) // expected-warning {{extra tokens at the end of '#pragma omp target enter data' are ignored}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (out: ) // expected-error {{expected expression}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (inout : foobool(argc)), depend (in, argc) // expected-error {{expected variable name, array element or array section}} expected-warning {{missing ':' after dependency type - ignoring}} expected-error {{expected expression}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (out :S1) // expected-error {{'S1' does not refer to a value}}
+  foo();
+  #pragma omp target enter data map(to: i) depend(in : argv[1][1] = '2') // expected-error {{expected variable name, array element or array section}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : vec[1]) // expected-error {{expected variable name, array element or array section}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : argv[0])
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : ) // expected-error {{expected expression}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : tmain) // expected-error {{expected variable name, array element or array section}}
+  foo();
+  #pragma omp target enter data map(to: i) depend(in : a[0]) // expected-error{{expected variable name, array element or array section}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : vec[1:2]) // expected-error {{ value is not an array or pointer}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : argv[ // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : argv[: // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : argv[:] // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : argv[argc: // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : argv[argc:argc] // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : argv[0:-1]) // expected-error {{section length is evaluated to a negative value -1}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : argv[-1:0]) // expected-error {{section lower bound is evaluated to a negative value -1}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : argv[:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : argv[3:4:1]) // expected-error {{expected ']'}} expected-note {{to match this '['}}
+  foo();
+  #pragma omp target enter data map(to: i) depend(in:a[0:1]) // expected-error {{subscripted value is not an array or pointer}}
+  foo();
+  #pragma omp target enter data map(to: i) depend(in:argv[argv[:2]:1]) // expected-error {{OpenMP array section is not allowed here}}
+  foo();
+  #pragma omp target enter data map(to: i) depend(in:argv[0:][:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
+  foo();
+  #pragma omp target enter data map(to: i) depend(in:env[0:][:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is an array of unknown bound}}
+  foo();
+  #pragma omp target enter data map(to: i) depend(in : argv[ : argc][1 : argc - 1])
+  foo();
+  #pragma omp target enter data map(to: i) depend(in : arr[0])
+  foo();
+
+  return 0;
+}
+
+int main(int argc, char **argv, char *env[]) {
+  vector vec;
+  typedef float V __attribute__((vector_size(16)));
+  V a;
+  char *arr;
+
+  int i;
+  #pragma omp target enter data map(to: i) depend // expected-error {{expected '(' after 'depend'}}
+  foo();
+  #pragma omp target enter data map(to: i) depend ( // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-warning {{missing ':' after dependency type - ignoring}}
+  foo();
+  #pragma omp target enter data map(to: i) depend () // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-warning {{missing ':' after dependency type - ignoring}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (argc // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-warning {{missing ':' after dependency type - ignoring}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (source : argc) // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (source) // expected-error {{expected expression}} expected-warning {{missing ':' after dependency type - ignoring}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : argc)) // expected-warning {{extra tokens at the end of '#pragma omp target enter data' are ignored}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (out: ) // expected-error {{expected expression}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (inout : foobool(argc)), depend (in, argc) // expected-error {{expected variable name, array element or array section}} expected-warning {{missing ':' after dependency type - ignoring}} expected-error {{expected expression}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (out :S1) // expected-error {{'S1' does not refer to a value}}
+  foo();
+  #pragma omp target enter data map(to: i) depend(in : argv[1][1] = '2') // expected-error {{expected variable name, array element or array section}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : vec[1]) // expected-error {{expected variable name, array element or array section}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : argv[0])
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : ) // expected-error {{expected expression}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : main) // expected-error {{expected variable name, array element or array section}}
+  foo();
+  #pragma omp target enter data map(to: i) depend(in : a[0]) // expected-error{{expected variable name, array element or array section}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : vec[1:2]) // expected-error {{ value is not an array or pointer}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : argv[ // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : argv[: // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : argv[:] // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : argv[argc: // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : argv[argc:argc] // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : argv[0:-1]) // expected-error {{section length is evaluated to a negative value -1}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : argv[-1:0]) // expected-error {{section lower bound is evaluated to a negative value -1}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : argv[:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
+  foo();
+  #pragma omp target enter data map(to: i) depend (in : argv[3:4:1]) // expected-error {{expected ']'}} expected-note {{to match this '['}}
+  foo();
+  #pragma omp target enter data map(to: i) depend(in:a[0:1]) // expected-error {{subscripted value is not an array or pointer}}
+  foo();
+  #pragma omp target enter data map(to: i) depend(in:argv[argv[:2]:1]) // expected-error {{OpenMP array section is not allowed here}}
+  foo();
+  #pragma omp target enter data map(to: i) depend(in:argv[0:][:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
+  foo();
+  #pragma omp target enter data map(to: i) depend(in:env[0:][:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is an array of unknown bound}}
+  foo();
+  #pragma omp target enter data map(to: i) depend(in : argv[ : argc][1 : argc - 1])
+  foo();
+  #pragma omp target enter data map(to: i) depend(in : arr[0])
+  foo();
+
+  return tmain(argc, argv, env); // expected-note {{in instantiation of function template specialization 'tmain<int, char, char>' requested here}}
+}
diff --git a/test/OpenMP/target_enter_data_device_messages.cpp b/test/OpenMP/target_enter_data_device_messages.cpp
new file mode 100644
index 0000000000000..d954eca319dd2
--- /dev/null
+++ b/test/OpenMP/target_enter_data_device_messages.cpp
@@ -0,0 +1,29 @@
+// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify -fopenmp -ferror-limit 100 -o - %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+int main(int argc, char **argv) {
+  int i;
+  #pragma omp target enter data map(to: i) device // expected-error {{expected '(' after 'device'}}
+  #pragma omp target enter data map(to: i) device ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  #pragma omp target enter data map(to: i) device () // expected-error {{expected expression}}
+  #pragma omp target enter data map(to: i) device (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  #pragma omp target enter data map(to: i) device (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target enter data' are ignored}}
+#pragma omp target enter data map(to: i) device (argc > 0 ? argv[1] : argv[2]) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  #pragma omp target enter data map(to: i) device (argc + argc)
+  #pragma omp target enter data map(to: i) device (argc), device (argc+1) // expected-error {{directive '#pragma omp target enter data' cannot contain more than one 'device' clause}}
+  #pragma omp target enter data map(to: i) device (S1) // expected-error {{'S1' does not refer to a value}}
+  #pragma omp target enter data map(to: i) device (-2) // expected-error {{argument to 'device' clause must be a non-negative integer value}}
+  #pragma omp target enter data map(to: i) device (-10u)
+  #pragma omp target enter data map(to: i) device (3.14) // expected-error {{expression must have integral or unscoped enumeration type, not 'double'}}
+  foo();
+
+  return 0;
+}
diff --git a/test/OpenMP/target_enter_data_if_messages.cpp b/test/OpenMP/target_enter_data_if_messages.cpp
new file mode 100644
index 0000000000000..0d18af187b19e
--- /dev/null
+++ b/test/OpenMP/target_enter_data_if_messages.cpp
@@ -0,0 +1,35 @@
+// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify -fopenmp -ferror-limit 100 -o - %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+int main(int argc, char **argv) {
+  int i;
+  #pragma omp target enter data map(to: i) if // expected-error {{expected '(' after 'if'}}
+  #pragma omp target enter data map(to: i) if ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  #pragma omp target enter data map(to: i) if () // expected-error {{expected expression}}
+  #pragma omp target enter data map(to: i) if (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  #pragma omp target enter data map(to: i) if (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target enter data' are ignored}}
+  #pragma omp target enter data map(to: i) if (argc > 0 ? argv[1] : argv[2])
+  #pragma omp target enter data map(to: i) if (argc + argc)
+  #pragma omp target enter data map(to: i) if (foobool(argc)), if (true) // expected-error {{directive '#pragma omp target enter data' cannot contain more than one 'if' clause}}
+  #pragma omp target enter data map(to: i) if (S1) // expected-error {{'S1' does not refer to a value}}
+  #pragma omp target enter data map(to: i) if (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  #pragma omp target enter data map(to: i) if(target data : true) // expected-error {{directive name modifier 'target data' is not allowed for '#pragma omp target enter data'}}
+  #pragma omp target enter data map(to: i) if(target enter data : // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  #pragma omp target enter data map(to: i) if(target enter data : // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  #pragma omp target enter data map(to: i) if(target enter data : argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  #pragma omp target enter data map(to: i) if(target enter data : argc)
+  #pragma omp target enter data map(to: i) if(target enter data : argc) if (for:argc) // expected-error {{directive name modifier 'for' is not allowed for '#pragma omp target enter data'}}
+  #pragma omp target enter data map(to: i) if(target enter data : argc) if (target enter data:argc) // expected-error {{directive '#pragma omp target enter data' cannot contain more than one 'if' clause with 'target enter data' name modifier}}
+  #pragma omp target enter data map(to: i) if(target enter data : argc) if (argc) // expected-error {{no more 'if' clause is allowed}} expected-note {{previous clause with directive name modifier specified here}}
+  foo();
+
+  return 0;
+}
diff --git a/test/OpenMP/target_enter_data_map_messages.c b/test/OpenMP/target_enter_data_map_messages.c
new file mode 100644
index 0000000000000..6f5aad1ef6504
--- /dev/null
+++ b/test/OpenMP/target_enter_data_map_messages.c
@@ -0,0 +1,19 @@
+// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify -fopenmp -ferror-limit 100 -o - %s
+// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify -fopenmp -ferror-limit 100 -o - -x c++ %s
+
+int main(int argc, char **argv) {
+
+  int r;
+  #pragma omp target enter data // expected-error {{expected at least one map clause for '#pragma omp target enter data'}}
+
+  #pragma omp target enter data map(r) // expected-error {{map type must be specified for '#pragma omp target enter data'}}
+  #pragma omp target enter data map(tofrom: r) // expected-error {{map type 'tofrom' is not allowed for '#pragma omp target enter data'}}
+
+  #pragma omp target enter data map(always, to: r)
+  #pragma omp target enter data map(always, alloc: r)
+  #pragma omp target enter data map(always, from: r) // expected-error {{map type 'from' is not allowed for '#pragma omp target enter data'}}
+  #pragma omp target enter data map(release: r) // expected-error {{map type 'release' is not allowed for '#pragma omp target enter data'}}
+  #pragma omp target enter data map(delete: r) // expected-error {{map type 'delete' is not allowed for '#pragma omp target enter data'}}
+
+  return 0;
+}
diff --git a/test/OpenMP/target_enter_data_nowait_messages.cpp b/test/OpenMP/target_enter_data_nowait_messages.cpp
new file mode 100644
index 0000000000000..e682e8c47d703
--- /dev/null
+++ b/test/OpenMP/target_enter_data_nowait_messages.cpp
@@ -0,0 +1,17 @@
+// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify -fopenmp -ferror-limit 100 -o - %s
+
+int main(int argc, char **argv) {
+  int i;
+
+  #pragma omp nowait target enter data map(to: i) // expected-error {{expected an OpenMP directive}}
+  #pragma omp target nowait enter data map(to: i) // expected-warning {{extra tokens at the end of '#pragma omp target' are ignored}}
+  #pragma omp target enter nowait data map(to: i) // expected-error {{expected an OpenMP directive}}
+  #pragma omp target enter data nowait() map(to: i) // expected-warning {{extra tokens at the end of '#pragma omp target enter data' are ignored}} expected-error {{expected at least one map clause for '#pragma omp target enter data'}}
+  #pragma omp target enter data map(to: i) nowait( // expected-warning {{extra tokens at the end of '#pragma omp target enter data' are ignored}}
+  #pragma omp target enter data map(to: i) nowait (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target enter data' are ignored}}
+  #pragma omp target enter data map(to: i) nowait device (-10u)
+  #pragma omp target enter data map(to: i) nowait (3.14) device (-10u) // expected-warning {{extra tokens at the end of '#pragma omp target enter data' are ignored}}
+  #pragma omp target enter data map(to: i) nowait nowait // expected-error {{directive '#pragma omp target enter data' cannot contain more than one 'nowait' clause}}
+  #pragma omp target enter data nowait map(to: i) nowait // expected-error {{directive '#pragma omp target enter data' cannot contain more than one 'nowait' clause}}
+  return 0;
+}
diff --git a/test/OpenMP/target_exit_data_ast_print.cpp b/test/OpenMP/target_exit_data_ast_print.cpp
new file mode 100644
index 0000000000000..e2c6d7fd65ceb
--- /dev/null
+++ b/test/OpenMP/target_exit_data_ast_print.cpp
@@ -0,0 +1,244 @@
+// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+template <typename T, int C>
+T tmain(T argc, T *argv) {
+  T i, j, b, c, d, e, x[20];
+
+  i = argc;
+#pragma omp target exit data map(from: i)
+
+#pragma omp target exit data map(from: i) if (target exit data: j > 0)
+
+#pragma omp target exit data map(from: i) if (b)
+
+#pragma omp target exit data map(from: c)
+
+#pragma omp target exit data map(from: c) if(b>e)
+
+#pragma omp target exit data map(release: x[0:10], c)
+
+#pragma omp target exit data map(delete: x[0:10])
+
+#pragma omp target exit data map(always, delete: x[0:10])
+
+#pragma omp target exit data map(from: c) map(release: d)
+
+#pragma omp target exit data map(always,release: e)
+
+#pragma omp target exit data nowait map(from: i)
+
+#pragma omp target exit data nowait map(from: i) if (target exit data: j > 0)
+
+#pragma omp target exit data map(from: i) if (b) nowait
+
+#pragma omp target exit data map(from: c) nowait
+
+#pragma omp target exit data map(from: c) nowait if(b>e)
+
+#pragma omp target exit data nowait map(release: x[0:10], c)
+
+#pragma omp target exit data nowait map(from: c) map(release: d)
+
+#pragma omp target exit data nowait map(always,release: e)
+
+#pragma omp target exit data depend(in : argc, argv[i:argc], x[:]) nowait map(from: i)
+
+#pragma omp target exit data nowait depend(in : argc, argv[i:argc], x[:]) map(from: i) if (target exit data: j > 0)
+
+#pragma omp target exit data map(from: i) depend(in : argc, argv[i:argc], x[:]) if (b) nowait
+
+#pragma omp target exit data map(from: c) depend(in : argc, argv[i:argc], x[:]) nowait
+
+#pragma omp target exit data map(from: c) depend(in : argc, argv[i:argc], x[:]) nowait if(b>e)
+
+#pragma omp target exit data nowait map(release: x[0:10], c) depend(in : argc, argv[i:argc], x[:])
+
+#pragma omp target exit data nowait map(from: c) depend(in : argc, argv[i:argc], x[:]) map(release: d)
+
+#pragma omp target exit data depend(in : argc, argv[i:argc], x[:]) nowait map(always,release: e)
+
+  return 0;
+}
+
+// CHECK: template <typename T = int, int C = 5> int tmain(int argc, int *argv) {
+// CHECK-NEXT: int i, j, b, c, d, e, x[20];
+// CHECK-NEXT: i = argc;
+// CHECK-NEXT: #pragma omp target exit data map(from: i)
+// CHECK-NEXT: #pragma omp target exit data map(from: i) if(target exit data: j > 0)
+// CHECK-NEXT: #pragma omp target exit data map(from: i) if(b)
+// CHECK-NEXT: #pragma omp target exit data map(from: c)
+// CHECK-NEXT: #pragma omp target exit data map(from: c) if(b > e)
+// CHECK-NEXT: #pragma omp target exit data map(release: x[0:10],c)
+// CHECK-NEXT: #pragma omp target exit data map(delete: x[0:10])
+// CHECK-NEXT: #pragma omp target exit data map(always,delete: x[0:10])
+// CHECK-NEXT: #pragma omp target exit data map(from: c) map(release: d)
+// CHECK-NEXT: #pragma omp target exit data map(always,release: e)
+// CHECK-NEXT: #pragma omp target exit data nowait map(from: i)
+// CHECK-NEXT: #pragma omp target exit data nowait map(from: i) if(target exit data: j > 0)
+// CHECK-NEXT: #pragma omp target exit data map(from: i) if(b) nowait
+// CHECK-NEXT: #pragma omp target exit data map(from: c) nowait
+// CHECK-NEXT: #pragma omp target exit data map(from: c) nowait if(b > e)
+// CHECK-NEXT: #pragma omp target exit data nowait map(release: x[0:10],c)
+// CHECK-NEXT: #pragma omp target exit data nowait map(from: c) map(release: d)
+// CHECK-NEXT: #pragma omp target exit data nowait map(always,release: e)
+// CHECK-NEXT: #pragma omp target exit data depend(in : argc,argv[i:argc],x[:]) nowait map(from: i)
+// CHECK-NEXT: #pragma omp target exit data nowait depend(in : argc,argv[i:argc],x[:]) map(from: i) if(target exit data: j > 0)
+// CHECK-NEXT: #pragma omp target exit data map(from: i) depend(in : argc,argv[i:argc],x[:]) if(b) nowait
+// CHECK-NEXT: #pragma omp target exit data map(from: c) depend(in : argc,argv[i:argc],x[:]) nowait
+// CHECK-NEXT: #pragma omp target exit data map(from: c) depend(in : argc,argv[i:argc],x[:]) nowait if(b > e)
+// CHECK-NEXT: #pragma omp target exit data nowait map(release: x[0:10],c) depend(in : argc,argv[i:argc],x[:])
+// CHECK-NEXT: #pragma omp target exit data nowait map(from: c) depend(in : argc,argv[i:argc],x[:]) map(release: d)
+// CHECK-NEXT: #pragma omp target exit data depend(in : argc,argv[i:argc],x[:]) nowait map(always,release: e)
+// CHECK: template <typename T = char, int C = 1> char tmain(char argc, char *argv) {
+// CHECK-NEXT: char i, j, b, c, d, e, x[20];
+// CHECK-NEXT: i = argc;
+// CHECK-NEXT: #pragma omp target exit data map(from: i)
+// CHECK-NEXT: #pragma omp target exit data map(from: i) if(target exit data: j > 0)
+// CHECK-NEXT: #pragma omp target exit data map(from: i) if(b)
+// CHECK-NEXT: #pragma omp target exit data map(from: c)
+// CHECK-NEXT: #pragma omp target exit data map(from: c) if(b > e)
+// CHECK-NEXT: #pragma omp target exit data map(release: x[0:10],c)
+// CHECK-NEXT: #pragma omp target exit data map(delete: x[0:10])
+// CHECK-NEXT: #pragma omp target exit data map(always,delete: x[0:10])
+// CHECK-NEXT: #pragma omp target exit data map(from: c) map(release: d)
+// CHECK-NEXT: #pragma omp target exit data map(always,release: e)
+// CHECK-NEXT: #pragma omp target exit data nowait map(from: i)
+// CHECK-NEXT: #pragma omp target exit data nowait map(from: i) if(target exit data: j > 0)
+// CHECK-NEXT: #pragma omp target exit data map(from: i) if(b) nowait
+// CHECK-NEXT: #pragma omp target exit data map(from: c) nowait
+// CHECK-NEXT: #pragma omp target exit data map(from: c) nowait if(b > e)
+// CHECK-NEXT: #pragma omp target exit data nowait map(release: x[0:10],c)
+// CHECK-NEXT: #pragma omp target exit data nowait map(from: c) map(release: d)
+// CHECK-NEXT: #pragma omp target exit data nowait map(always,release: e)
+// CHECK-NEXT: #pragma omp target exit data depend(in : argc,argv[i:argc],x[:]) nowait map(from: i)
+// CHECK-NEXT: #pragma omp target exit data nowait depend(in : argc,argv[i:argc],x[:]) map(from: i) if(target exit data: j > 0)
+// CHECK-NEXT: #pragma omp target exit data map(from: i) depend(in : argc,argv[i:argc],x[:]) if(b) nowait
+// CHECK-NEXT: #pragma omp target exit data map(from: c) depend(in : argc,argv[i:argc],x[:]) nowait
+// CHECK-NEXT: #pragma omp target exit data map(from: c) depend(in : argc,argv[i:argc],x[:]) nowait if(b > e)
+// CHECK-NEXT: #pragma omp target exit data nowait map(release: x[0:10],c) depend(in : argc,argv[i:argc],x[:])
+// CHECK-NEXT: #pragma omp target exit data nowait map(from: c) depend(in : argc,argv[i:argc],x[:]) map(release: d)
+// CHECK-NEXT: #pragma omp target exit data depend(in : argc,argv[i:argc],x[:]) nowait map(always,release: e)
+// CHECK: template <typename T, int C> T tmain(T argc, T *argv) {
+// CHECK-NEXT: T i, j, b, c, d, e, x[20];
+// CHECK-NEXT: i = argc;
+// CHECK-NEXT: #pragma omp target exit data map(from: i)
+// CHECK-NEXT: #pragma omp target exit data map(from: i) if(target exit data: j > 0)
+// CHECK-NEXT: #pragma omp target exit data map(from: i) if(b)
+// CHECK-NEXT: #pragma omp target exit data map(from: c)
+// CHECK-NEXT: #pragma omp target exit data map(from: c) if(b > e)
+// CHECK-NEXT: #pragma omp target exit data map(release: x[0:10],c)
+// CHECK-NEXT: #pragma omp target exit data map(delete: x[0:10])
+// CHECK-NEXT: #pragma omp target exit data map(always,delete: x[0:10])
+// CHECK-NEXT: #pragma omp target exit data map(from: c) map(release: d)
+// CHECK-NEXT: #pragma omp target exit data map(always,release: e)
+// CHECK-NEXT: #pragma omp target exit data nowait map(from: i)
+// CHECK-NEXT: #pragma omp target exit data nowait map(from: i) if(target exit data: j > 0)
+// CHECK-NEXT: #pragma omp target exit data map(from: i) if(b) nowait
+// CHECK-NEXT: #pragma omp target exit data map(from: c) nowait
+// CHECK-NEXT: #pragma omp target exit data map(from: c) nowait if(b > e)
+// CHECK-NEXT: #pragma omp target exit data nowait map(release: x[0:10],c)
+// CHECK-NEXT: #pragma omp target exit data nowait map(from: c) map(release: d)
+// CHECK-NEXT: #pragma omp target exit data nowait map(always,release: e)
+// CHECK-NEXT: #pragma omp target exit data depend(in : argc,argv[i:argc],x[:]) nowait map(from: i)
+// CHECK-NEXT: #pragma omp target exit data nowait depend(in : argc,argv[i:argc],x[:]) map(from: i) if(target exit data: j > 0)
+// CHECK-NEXT: #pragma omp target exit data map(from: i) depend(in : argc,argv[i:argc],x[:]) if(b) nowait
+// CHECK-NEXT: #pragma omp target exit data map(from: c) depend(in : argc,argv[i:argc],x[:]) nowait
+// CHECK-NEXT: #pragma omp target exit data map(from: c) depend(in : argc,argv[i:argc],x[:]) nowait if(b > e)
+// CHECK-NEXT: #pragma omp target exit data nowait map(release: x[0:10],c) depend(in : argc,argv[i:argc],x[:])
+// CHECK-NEXT: #pragma omp target exit data nowait map(from: c) depend(in : argc,argv[i:argc],x[:]) map(release: d)
+// CHECK-NEXT: #pragma omp target exit data depend(in : argc,argv[i:argc],x[:]) nowait map(always,release: e)
+
+int main (int argc, char **argv) {
+  int b = argc, i, c, d, e, f, g, x[20];
+  static int a;
+// CHECK: static int a;
+
+#pragma omp target exit data map(from: a)
+// CHECK:      #pragma omp target exit data map(from: a)
+  a=2;
+// CHECK-NEXT: a = 2;
+#pragma omp target exit data map(from: a) if (target exit data: b)
+// CHECK: #pragma omp target exit data map(from: a) if(target exit data: b)
+
+#pragma omp target exit data map(from: a) if (b > g)
+// CHECK: #pragma omp target exit data map(from: a) if(b > g)
+
+#pragma omp target exit data map(from: c)
+// CHECK-NEXT: #pragma omp target exit data map(from: c)
+
+#pragma omp target exit data map(release: c) if(b>g)
+// CHECK-NEXT: #pragma omp target exit data map(release: c) if(b > g)
+
+#pragma omp target exit data map(from: x[0:10], c)
+// CHECK-NEXT: #pragma omp target exit data map(from: x[0:10],c)
+
+#pragma omp target exit data map(delete: x[0:10])
+// CHECK-NEXT: #pragma omp target exit data map(delete: x[0:10])
+
+#pragma omp target exit data map(always, delete: x[0:10])
+// CHECK-NEXT: #pragma omp target exit data map(always,delete: x[0:10])
+
+#pragma omp target exit data map(from: c) map(release: d)
+// CHECK-NEXT: #pragma omp target exit data map(from: c) map(release: d)
+
+#pragma omp target exit data map(always,release: e)
+// CHECK-NEXT: #pragma omp target exit data map(always,release: e)
+
+#pragma omp target exit data nowait map(from: a)
+// CHECK:      #pragma omp target exit data nowait map(from: a)
+
+#pragma omp target exit data nowait map(from: a) if (target exit data: b)
+// CHECK: #pragma omp target exit data nowait map(from: a) if(target exit data: b)
+
+#pragma omp target exit data map(from: a) if (b > g) nowait
+// CHECK: #pragma omp target exit data map(from: a) if(b > g) nowait
+
+#pragma omp target exit data map(from: c) nowait
+// CHECK-NEXT: #pragma omp target exit data map(from: c) nowait
+
+#pragma omp target exit data map(release: c) nowait if(b>g)
+// CHECK-NEXT: #pragma omp target exit data map(release: c) nowait if(b > g)
+
+#pragma omp target exit data nowait map(from: x[0:10], c)
+// CHECK-NEXT: #pragma omp target exit data nowait map(from: x[0:10],c)
+
+#pragma omp target exit data nowait map(from: c) map(release: d)
+// CHECK-NEXT: #pragma omp target exit data nowait map(from: c) map(release: d)
+
+#pragma omp target exit data nowait map(always,release: e)
+// CHECK-NEXT: #pragma omp target exit data nowait map(always,release: e)
+
+#pragma omp target exit data nowait depend(in : argc,argv[i:argc],x[:]) map(from: a)
+// CHECK:      #pragma omp target exit data nowait depend(in : argc,argv[i:argc],x[:]) map(from: a)
+
+#pragma omp target exit data nowait map(from: a) depend(in : argc,argv[i:argc],x[:]) if (target exit data: b)
+// CHECK: #pragma omp target exit data nowait map(from: a) depend(in : argc,argv[i:argc],x[:]) if(target exit data: b)
+
+#pragma omp target exit data map(from: a) if (b > g) nowait depend(in : argc,argv[i:argc],x[:])
+// CHECK: #pragma omp target exit data map(from: a) if(b > g) nowait depend(in : argc,argv[i:argc],x[:])
+
+#pragma omp target exit data map(from: c) depend(in : argc,argv[i:argc],x[:]) nowait
+// CHECK-NEXT: #pragma omp target exit data map(from: c) depend(in : argc,argv[i:argc],x[:]) nowait
+
+#pragma omp target exit data depend(in : argc,argv[i:argc],x[:]) map(release: c) nowait if(b>g)
+// CHECK-NEXT: #pragma omp target exit data depend(in : argc,argv[i:argc],x[:]) map(release: c) nowait if(b > g)
+
+#pragma omp target exit data nowait map(from: x[0:10], c) depend(in : argc,argv[i:argc],x[:])
+// CHECK-NEXT: #pragma omp target exit data nowait map(from: x[0:10],c) depend(in : argc,argv[i:argc],x[:])
+
+#pragma omp target exit data nowait map(from: c) depend(in : argc,argv[i:argc],x[:]) map(release: d)
+// CHECK-NEXT: #pragma omp target exit data nowait map(from: c) depend(in : argc,argv[i:argc],x[:]) map(release: d)
+
+#pragma omp target exit data nowait depend(in : argc,argv[i:argc],x[:]) map(always,release: e)
+// CHECK-NEXT: #pragma omp target exit data nowait depend(in : argc,argv[i:argc],x[:]) map(always,release: e)
+
+  return tmain<int, 5>(argc, &argc) + tmain<char, 1>(argv[0][0], argv[0]);
+}
+
+#endif
diff --git a/test/OpenMP/target_exit_data_codegen.cpp b/test/OpenMP/target_exit_data_codegen.cpp
new file mode 100644
index 0000000000000..d3a38592a6103
--- /dev/null
+++ b/test/OpenMP/target_exit_data_codegen.cpp
@@ -0,0 +1,221 @@
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+///==========================================================================///
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-64
+// RUN: %clang_cc1 -DCK1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK1 --check-prefix CK1-64
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK1 --check-prefix CK1-32
+// RUN: %clang_cc1 -DCK1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK1 --check-prefix CK1-32
+#ifdef CK1
+
+// CK1: [[ST:%.+]] = type { i32, double* }
+template <typename T>
+struct ST {
+  T a;
+  double *b;
+};
+
+ST<int> gb;
+double gc[100];
+
+// CK1: [[SIZE00:@.+]] = {{.+}}constant [1 x i[[sz:64|32]]] [i{{64|32}} 800]
+// CK1: [[MTYPE00:@.+]] = {{.+}}constant [1 x i32] [i32 34]
+
+// CK1: [[SIZE02:@.+]] = {{.+}}constant [1 x i[[sz]]] [i[[sz]] 4]
+// CK1: [[MTYPE02:@.+]] = {{.+}}constant [1 x i32] [i32 32]
+
+// CK1: [[MTYPE03:@.+]] = {{.+}}constant [1 x i32] [i32 38]
+
+// CK1: [[SIZE04:@.+]] = {{.+}}constant [2 x i[[sz]]] [i[[sz]] {{8|4}}, i[[sz]] 24]
+// CK1: [[MTYPE04:@.+]] = {{.+}}constant [2 x i32] [i32 32, i32 16]
+
+// CK1-LABEL: _Z3fooi
+void foo(int arg) {
+  int la;
+  float lb[arg];
+
+  // Region 00
+  // CK1-NOT: __tgt_target_data_begin
+  // CK1-DAG: call void @__tgt_target_data_end(i32 [[DEV:%[^,]+]], i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE00]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE00]]{{.+}})
+  // CK1-DAG: [[DEV]] = load i32, i32* %{{[^,]+}},
+  // CK1-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK1-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK1-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: store i8* bitcast ([100 x double]* @gc to i8*), i8** [[BP0]]
+  // CK1-DAG: store i8* bitcast ([100 x double]* @gc to i8*), i8** [[P0]]
+
+  // CK1: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  #pragma omp target exit data if(1+3-5) device(arg) map(from: gc)
+  {++arg;}
+
+  // Region 01
+  // CK1: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  #pragma omp target exit data map(release: la) if(1+3-4)
+  {++arg;}
+
+  // Region 02
+  // CK1-NOT: __tgt_target_data_begin
+  // CK1: br i1 %{{[^,]+}}, label %[[IFTHEN:[^,]+]], label %[[IFELSE:[^,]+]]
+  // CK1: [[IFTHEN]]
+  // CK1-DAG: call void @__tgt_target_data_end(i32 4, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE02]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE02]]{{.+}})
+  // CK1-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK1-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK1-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK1-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK1-DAG: [[CBPVAL0]] = bitcast i32* [[VAR0:%.+]] to i8*
+  // CK1-DAG: [[CPVAL0]] = bitcast i32* [[VAR0]] to i8*
+  // CK1: br label %[[IFEND:[^,]+]]
+
+  // CK1: [[IFELSE]]
+  // CK1: br label %[[IFEND]]
+  // CK1: [[IFEND]]
+  // CK1: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  #pragma omp target exit data map(release: arg) if(arg) device(4)
+  {++arg;}
+
+  // CK1: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  {++arg;}
+
+  // Region 03
+  // CK1-NOT: __tgt_target_data_begin
+  // CK1-DAG: call void @__tgt_target_data_end(i32 -1, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], i[[sz]]* [[GEPS:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE03]]{{.+}})
+  // CK1-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK1-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+  // CK1-DAG: [[GEPS]] = getelementptr inbounds {{.+}}[[S:%[^,]+]]
+
+  // CK1-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: [[S0:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK1-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK1-DAG: store i[[sz]] [[CSVAL0:%[^,]+]], i[[sz]]* [[S0]]
+  // CK1-DAG: [[CBPVAL0]] = bitcast float* [[VAR0:%.+]] to i8*
+  // CK1-DAG: [[CPVAL0]] = bitcast float* [[VAR0]] to i8*
+  // CK1-DAG: [[CSVAL0]] = mul nuw i[[sz]] %{{[^,]+}}, 4
+  // CK1: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  #pragma omp target exit data map(always, from: lb)
+  {++arg;}
+
+  // CK1: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  {++arg;}
+
+  // Region 04
+  // CK1-NOT: __tgt_target_data_begin
+  // CK1-DAG: call void @__tgt_target_data_end(i32 -1, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE04]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE04]]{{.+}})
+  // CK1-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK1-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK1-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: store i8* bitcast ([[ST]]* @gb to i8*), i8** [[BP0]]
+  // CK1-DAG: store i8* bitcast (double** getelementptr inbounds ([[ST]], [[ST]]* @gb, i32 0, i32 1) to i8*), i8** [[P0]]
+
+
+  // CK1-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+  // CK1-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+  // CK1-DAG: store i8* bitcast (double** getelementptr inbounds ([[ST]], [[ST]]* @gb, i32 0, i32 1) to i8*), i8** [[BP1]]
+  // CK1-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+  // CK1-DAG: [[CPVAL1]] = bitcast double* [[SEC1:%.+]] to i8*
+  // CK1-DAG: [[SEC1]] = getelementptr inbounds {{.+}}double* [[SEC11:%[^,]+]], i{{.+}} 0
+  // CK1-DAG: [[SEC11]] = load double*, double** getelementptr inbounds ([[ST]], [[ST]]* @gb, i32 0, i32 1),
+
+  // CK1: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  #pragma omp target exit data map(release: gb.b[:3])
+  {++arg;}
+}
+#endif
+///==========================================================================///
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-64
+// RUN: %clang_cc1 -DCK2 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK2 --check-prefix CK2-64
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK2 --check-prefix CK2-32
+// RUN: %clang_cc1 -DCK2 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK2 --check-prefix CK2-32
+#ifdef CK2
+
+// CK2: [[ST:%.+]] = type { i32, double* }
+template <typename T>
+struct ST {
+  T a;
+  double *b;
+
+  T foo(T arg) {
+    // Region 00
+    #pragma omp target exit data map(always, release: b[1:3]) if(a>123) device(arg)
+    {arg++;}
+    return arg;
+  }
+};
+
+// CK2: [[SIZE00:@.+]] = {{.+}}constant [2 x i[[sz:64|32]]] [i{{64|32}} {{8|4}}, i{{64|32}} 24]
+// CK2: [[MTYPE00:@.+]] = {{.+}}constant [2 x i32] [i32 36, i32 20]
+
+// CK2-LABEL: _Z3bari
+int bar(int arg){
+  ST<int> A;
+  return A.foo(arg);
+}
+
+// Region 00
+// CK2-NOT: __tgt_target_data_begin
+// CK2: br i1 %{{[^,]+}}, label %[[IFTHEN:[^,]+]], label %[[IFELSE:[^,]+]]
+// CK2: [[IFTHEN]]
+// CK2-DAG: call void @__tgt_target_data_end(i32 [[DEV:%[^,]+]], i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE00]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE00]]{{.+}})
+// CK2-DAG: [[DEV]] = load i32, i32* %{{[^,]+}},
+// CK2-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+// CK2-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+// CK2-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+// CK2-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+// CK2-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+// CK2-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+// CK2-DAG: [[CBPVAL0]] = bitcast [[ST]]* [[VAR0:%.+]] to i8*
+// CK2-DAG: [[CPVAL0]] = bitcast double** [[SEC0:%[^,]+]] to i8*
+// CK2-DAG: [[SEC0]] = getelementptr inbounds {{.*}}[[ST]]* [[VAR0]], i32 0, i32 1
+
+
+// CK2-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+// CK2-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+// CK2-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+// CK2-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+// CK2-DAG: [[CBPVAL1]] = bitcast double** [[SEC0]] to i8*
+// CK2-DAG: [[CPVAL1]] = bitcast double* [[SEC1:%[^,]+]] to i8*
+// CK2-DAG: [[SEC1]] = getelementptr inbounds {{.*}}double* [[SEC11:%[^,]+]], i{{.+}} 1
+// CK2-DAG: [[SEC11]] = load double*, double** [[SEC111:%[^,]+]],
+// CK2-DAG: [[SEC111]] = getelementptr inbounds {{.*}}[[ST]]* [[VAR0]], i32 0, i32 1
+
+// CK2: br label %[[IFEND:[^,]+]]
+
+// CK2: [[IFELSE]]
+// CK2: br label %[[IFEND]]
+// CK2: [[IFEND]]
+// CK2: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+#endif
+///==========================================================================///
+// RUN: %clang_cc1 -DCK3 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK3 --check-prefix CK3-64
+// RUN: %clang_cc1 -DCK3 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK3 --check-prefix CK3-64
+// RUN: %clang_cc1 -DCK3 -verify -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK3 --check-prefix CK3-32
+// RUN: %clang_cc1 -DCK3 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK3 --check-prefix CK3-32
+#ifdef CK3
+
+// CK3-LABEL: no_target_devices
+void no_target_devices(int arg) {
+  // CK3-NOT: tgt_target_data_begin
+  // CK3: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  // CK3-NOT: tgt_target_data_end
+  // CK3: ret
+  #pragma omp target exit data map(from: arg) if(arg) device(4)
+  {++arg;}
+}
+#endif
+#endif
diff --git a/test/OpenMP/target_exit_data_depend_messages.cpp b/test/OpenMP/target_exit_data_depend_messages.cpp
new file mode 100644
index 0000000000000..4c59b6205f188
--- /dev/null
+++ b/test/OpenMP/target_exit_data_depend_messages.cpp
@@ -0,0 +1,166 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -o - -std=c++11 %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}} expected-note {{declared here}}
+
+class vector {
+  public:
+    int operator[](int index) { return 0; }
+};
+
+template <class T, class S, class R>
+int tmain(T argc, S **argv, R *env[]) {
+  vector vec;
+  typedef float V __attribute__((vector_size(16)));
+  V a;
+  char *arr;
+
+  int i;
+  #pragma omp target exit data map(from: i) depend // expected-error {{expected '(' after 'depend'}}
+  foo();
+  #pragma omp target exit data map(from: i) depend ( // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-warning {{missing ':' after dependency type - ignoring}}
+  foo();
+  #pragma omp target exit data map(from: i) depend () // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-warning {{missing ':' after dependency type - ignoring}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (argc // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-warning {{missing ':' after dependency type - ignoring}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (source : argc) // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (source) // expected-error {{expected expression}} expected-warning {{missing ':' after dependency type - ignoring}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : argc)) // expected-warning {{extra tokens at the end of '#pragma omp target exit data' are ignored}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (out: ) // expected-error {{expected expression}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (inout : foobool(argc)), depend (in, argc) // expected-error {{expected variable name, array element or array section}} expected-warning {{missing ':' after dependency type - ignoring}} expected-error {{expected expression}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (out :S1) // expected-error {{'S1' does not refer to a value}}
+  foo();
+  #pragma omp target exit data map(from: i) depend(in : argv[1][1] = '2') // expected-error {{expected variable name, array element or array section}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : vec[1]) // expected-error {{expected variable name, array element or array section}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : argv[0])
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : ) // expected-error {{expected expression}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : tmain) // expected-error {{expected variable name, array element or array section}}
+  foo();
+  #pragma omp target exit data map(from: i) depend(in : a[0]) // expected-error{{expected variable name, array element or array section}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : vec[1:2]) // expected-error {{ value is not an array or pointer}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : argv[ // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : argv[: // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : argv[:] // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : argv[argc: // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : argv[argc:argc] // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : argv[0:-1]) // expected-error {{section length is evaluated to a negative value -1}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : argv[-1:0]) // expected-error {{section lower bound is evaluated to a negative value -1}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : argv[:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : argv[3:4:1]) // expected-error {{expected ']'}} expected-note {{to match this '['}}
+  foo();
+  #pragma omp target exit data map(from: i) depend(in:a[0:1]) // expected-error {{subscripted value is not an array or pointer}}
+  foo();
+  #pragma omp target exit data map(from: i) depend(in:argv[argv[:2]:1]) // expected-error {{OpenMP array section is not allowed here}}
+  foo();
+  #pragma omp target exit data map(from: i) depend(in:argv[0:][:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
+  foo();
+  #pragma omp target exit data map(from: i) depend(in:env[0:][:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is an array of unknown bound}}
+  foo();
+  #pragma omp target exit data map(from: i) depend(in : argv[ : argc][1 : argc - 1])
+  foo();
+  #pragma omp target exit data map(from: i) depend(in : arr[0])
+  foo();
+
+  return 0;
+}
+
+int main(int argc, char **argv, char *env[]) {
+  vector vec;
+  typedef float V __attribute__((vector_size(16)));
+  V a;
+  char *arr;
+
+  int i;
+  #pragma omp target exit data map(from: i) depend // expected-error {{expected '(' after 'depend'}}
+  foo();
+  #pragma omp target exit data map(from: i) depend ( // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-warning {{missing ':' after dependency type - ignoring}}
+  foo();
+  #pragma omp target exit data map(from: i) depend () // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-warning {{missing ':' after dependency type - ignoring}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (argc // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-warning {{missing ':' after dependency type - ignoring}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (source : argc) // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (source) // expected-error {{expected expression}} expected-warning {{missing ':' after dependency type - ignoring}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : argc)) // expected-warning {{extra tokens at the end of '#pragma omp target exit data' are ignored}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (out: ) // expected-error {{expected expression}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (inout : foobool(argc)), depend (in, argc) // expected-error {{expected variable name, array element or array section}} expected-warning {{missing ':' after dependency type - ignoring}} expected-error {{expected expression}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (out :S1) // expected-error {{'S1' does not refer to a value}}
+  foo();
+  #pragma omp target exit data map(from: i) depend(in : argv[1][1] = '2') // expected-error {{expected variable name, array element or array section}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : vec[1]) // expected-error {{expected variable name, array element or array section}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : argv[0])
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : ) // expected-error {{expected expression}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : main) // expected-error {{expected variable name, array element or array section}}
+  foo();
+  #pragma omp target exit data map(from: i) depend(in : a[0]) // expected-error{{expected variable name, array element or array section}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : vec[1:2]) // expected-error {{ value is not an array or pointer}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : argv[ // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : argv[: // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : argv[:] // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : argv[argc: // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : argv[argc:argc] // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : argv[0:-1]) // expected-error {{section length is evaluated to a negative value -1}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : argv[-1:0]) // expected-error {{section lower bound is evaluated to a negative value -1}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : argv[:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
+  foo();
+  #pragma omp target exit data map(from: i) depend (in : argv[3:4:1]) // expected-error {{expected ']'}} expected-note {{to match this '['}}
+  foo();
+  #pragma omp target exit data map(from: i) depend(in:a[0:1]) // expected-error {{subscripted value is not an array or pointer}}
+  foo();
+  #pragma omp target exit data map(from: i) depend(in:argv[argv[:2]:1]) // expected-error {{OpenMP array section is not allowed here}}
+  foo();
+  #pragma omp target exit data map(from: i) depend(in:argv[0:][:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
+  foo();
+  #pragma omp target exit data map(from: i) depend(in:env[0:][:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is an array of unknown bound}}
+  foo();
+  #pragma omp target exit data map(from: i) depend(in : argv[ : argc][1 : argc - 1])
+  foo();
+  #pragma omp target exit data map(from: i) depend(in : arr[0])
+  foo();
+
+  return tmain(argc, argv, env); // expected-note {{in instantiation of function template specialization 'tmain<int, char, char>' requested here}}
+}
diff --git a/test/OpenMP/target_exit_data_device_messages.cpp b/test/OpenMP/target_exit_data_device_messages.cpp
new file mode 100644
index 0000000000000..d9ce0ddd7ea00
--- /dev/null
+++ b/test/OpenMP/target_exit_data_device_messages.cpp
@@ -0,0 +1,29 @@
+// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify -fopenmp -ferror-limit 100 -o - %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+int main(int argc, char **argv) {
+  int i;
+  #pragma omp target exit data map(from: i) device // expected-error {{expected '(' after 'device'}}
+  #pragma omp target exit data map(from: i) device ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  #pragma omp target exit data map(from: i) device () // expected-error {{expected expression}}
+  #pragma omp target exit data map(from: i) device (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  #pragma omp target exit data map(from: i) device (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target exit data' are ignored}}
+#pragma omp target exit data map(from: i) device (argc > 0 ? argv[1] : argv[2]) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  #pragma omp target exit data map(from: i) device (argc + argc)
+  #pragma omp target exit data map(from: i) device (argc), device (argc+1) // expected-error {{directive '#pragma omp target exit data' cannot contain more than one 'device' clause}}
+  #pragma omp target exit data map(from: i) device (S1) // expected-error {{'S1' does not refer to a value}}
+  #pragma omp target exit data map(from: i) device (-2) // expected-error {{argument to 'device' clause must be a non-negative integer value}}
+  #pragma omp target exit data map(from: i) device (-10u)
+  #pragma omp target exit data map(from: i) device (3.14) // expected-error {{expression must have integral or unscoped enumeration type, not 'double'}}
+  foo();
+
+  return 0;
+}
diff --git a/test/OpenMP/target_exit_data_if_messages.cpp b/test/OpenMP/target_exit_data_if_messages.cpp
new file mode 100644
index 0000000000000..cc674e6933cc3
--- /dev/null
+++ b/test/OpenMP/target_exit_data_if_messages.cpp
@@ -0,0 +1,35 @@
+// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify -fopenmp -ferror-limit 100 -o - %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+int main(int argc, char **argv) {
+  int i;
+  #pragma omp target exit data map(from: i) if // expected-error {{expected '(' after 'if'}}
+  #pragma omp target exit data map(from: i) if ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  #pragma omp target exit data map(from: i) if () // expected-error {{expected expression}}
+  #pragma omp target exit data map(from: i) if (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  #pragma omp target exit data map(from: i) if (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target exit data' are ignored}}
+  #pragma omp target exit data map(from: i) if (argc > 0 ? argv[1] : argv[2])
+  #pragma omp target exit data map(from: i) if (argc + argc)
+  #pragma omp target exit data map(from: i) if (foobool(argc)), if (true) // expected-error {{directive '#pragma omp target exit data' cannot contain more than one 'if' clause}}
+  #pragma omp target exit data map(from: i) if (S1) // expected-error {{'S1' does not refer to a value}}
+  #pragma omp target exit data map(from: i) if (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  #pragma omp target exit data map(from: i) if(target data : true) // expected-error {{directive name modifier 'target data' is not allowed for '#pragma omp target exit data'}}
+  #pragma omp target exit data map(from: i) if(target exit data : // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  #pragma omp target exit data map(from: i) if(target exit data : // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  #pragma omp target exit data map(from: i) if(target exit data : argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  #pragma omp target exit data map(from: i) if(target exit data : argc)
+  #pragma omp target exit data map(from: i) if(target exit data : argc) if (for:argc) // expected-error {{directive name modifier 'for' is not allowed for '#pragma omp target exit data'}}
+  #pragma omp target exit data map(from: i) if(target exit data : argc) if (target exit data:argc) // expected-error {{directive '#pragma omp target exit data' cannot contain more than one 'if' clause with 'target exit data' name modifier}}
+  #pragma omp target exit data map(from: i) if(target exit data : argc) if (argc) // expected-error {{no more 'if' clause is allowed}} expected-note {{previous clause with directive name modifier specified here}}
+  foo();
+
+  return 0;
+}
diff --git a/test/OpenMP/target_exit_data_map_messages.c b/test/OpenMP/target_exit_data_map_messages.c
new file mode 100644
index 0000000000000..a9953fbbb4c6f
--- /dev/null
+++ b/test/OpenMP/target_exit_data_map_messages.c
@@ -0,0 +1,19 @@
+// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify -fopenmp -ferror-limit 100 -o - %s
+// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify -fopenmp -ferror-limit 100 -o - -x c++ %s
+
+int main(int argc, char **argv) {
+
+  int r;
+  #pragma omp target exit data // expected-error {{expected at least one map clause for '#pragma omp target exit data'}}
+
+  #pragma omp target exit data map(r) // expected-error {{map type must be specified for '#pragma omp target exit data'}}
+  #pragma omp target exit data map(tofrom: r) // expected-error {{map type 'tofrom' is not allowed for '#pragma omp target exit data'}}
+
+  #pragma omp target exit data map(always, from: r)
+  #pragma omp target exit data map(delete: r)
+  #pragma omp target exit data map(release: r)
+  #pragma omp target exit data map(always, alloc: r) // expected-error {{map type 'alloc' is not allowed for '#pragma omp target exit data'}}
+  #pragma omp target exit data map(to: r) // expected-error {{map type 'to' is not allowed for '#pragma omp target exit data'}}
+
+  return 0;
+}
diff --git a/test/OpenMP/target_exit_data_nowait_messages.cpp b/test/OpenMP/target_exit_data_nowait_messages.cpp
new file mode 100644
index 0000000000000..cd743d89278ef
--- /dev/null
+++ b/test/OpenMP/target_exit_data_nowait_messages.cpp
@@ -0,0 +1,17 @@
+// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify -fopenmp -ferror-limit 100 -o - %s
+
+int main(int argc, char **argv) {
+  int i;
+
+  #pragma omp nowait target exit data map(from: i) // expected-error {{expected an OpenMP directive}}
+  #pragma omp target nowait exit data map(from: i) // expected-warning {{extra tokens at the end of '#pragma omp target' are ignored}}
+  #pragma omp target exit nowait data map(from: i) // expected-error {{expected an OpenMP directive}}
+  #pragma omp target exit data nowait() map(from: i) // expected-warning {{extra tokens at the end of '#pragma omp target exit data' are ignored}} expected-error {{expected at least one map clause for '#pragma omp target exit data'}}
+  #pragma omp target exit data map(from: i) nowait( // expected-warning {{extra tokens at the end of '#pragma omp target exit data' are ignored}}
+  #pragma omp target exit data map(from: i) nowait (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target exit data' are ignored}}
+  #pragma omp target exit data map(from: i) nowait device (-10u)
+  #pragma omp target exit data map(from: i) nowait (3.14) device (-10u) // expected-warning {{extra tokens at the end of '#pragma omp target exit data' are ignored}}
+  #pragma omp target exit data map(from: i) nowait nowait // expected-error {{directive '#pragma omp target exit data' cannot contain more than one 'nowait' clause}}
+  #pragma omp target exit data nowait map(from: i) nowait // expected-error {{directive '#pragma omp target exit data' cannot contain more than one 'nowait' clause}}
+  return 0;
+}
diff --git a/test/OpenMP/target_firstprivate_codegen.cpp b/test/OpenMP/target_firstprivate_codegen.cpp
new file mode 100644
index 0000000000000..ca459e02a4256
--- /dev/null
+++ b/test/OpenMP/target_firstprivate_codegen.cpp
@@ -0,0 +1,580 @@
+// Test host codegen.
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+
+// Test target codegen - host bc file has to be created first.
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32
+
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+template<typename tx, typename ty>
+struct TT{
+  tx X;
+  ty Y;
+};
+
+// CHECK:  [[TT:%.+]] = type { i64, i8 }
+// CHECK:  [[S1:%.+]] = type { double }
+
+// TCHECK:  [[TT:%.+]] = type { i64, i8 }
+// TCHECK:  [[S1:%.+]] = type { double }
+
+// CHECK-DAG:  [[SIZET:@.+]] = private unnamed_addr constant [1 x i{{32|64}}] [i[[SZ:32|64]] 4]
+// CHECK:  [[MAPT:@.+]] = private unnamed_addr constant [1 x i32] [i32 288]
+// CHECK-DAG:  [[MAPT2:@.+]] = private unnamed_addr constant [9 x i32] [i32 288, i32 161, i32 288, i32 161, i32 161, i32 288, i32 288, i32 161, i32 161]
+// CHECK-DAG:  [[SIZET3:@.+]] = private unnamed_addr constant [1 x i{{32|64}}] zeroinitializer
+// CHECK-DAG:  [[MAPT3:@.+]] = private unnamed_addr constant [1 x i32] [i32 32]
+// CHECK-DAG:  [[MAPT4:@.+]] = private unnamed_addr constant [5 x i32] [i32 35, i32 288, i32 288, i32 288, i32 161]
+// CHECK-DAG:  [[SIZET5:@.+]] = private unnamed_addr constant [3 x i{{32|64}}] [i[[SZ]] 4, i[[SZ]] 1, i[[SZ]] 40]
+// CHECK-DAG:  [[MAPT5:@.+]] = private unnamed_addr constant [3 x i32] [i32 288, i32 288, i32 161]
+// CHECK-DAG:  [[SIZET6:@.+]] = private unnamed_addr constant [2 x i{{32|64}}] [i[[SZ]] 4, i[[SZ]] 40]
+// CHECK-DAG:  [[MAPT6:@.+]] = private unnamed_addr constant [2 x i32] [i32 288, i32 161]
+
+
+// CHECK: define {{.*}}[[FOO:@.+]](
+int foo(int n, double *ptr) {
+  int a = 0;
+  short aa = 0;
+  float b[10];
+  float bn[n];
+  double c[5][10];
+  double cn[5][n];
+  TT<long long, char> d;
+  
+  #pragma omp target firstprivate(a)
+  {
+  }
+
+  // a is passed by value to tgt_target
+  // CHECK:  [[N_ADDR:%.+]] = alloca i{{[0-9]+}},
+  // CHECK:  [[PTR_ADDR:%.+]] = alloca double*,
+  // CHECK:  [[A:%.+]] = alloca i{{[0-9]+}},
+  // CHECK:  [[A2:%.+]] = alloca i{{[0-9]+}},
+  // CHECK:  [[B:%.+]] = alloca [10 x float],
+  // CHECK:  [[SSTACK:%.+]] = alloca i8*,
+  // CHECK:  [[C:%.+]] = alloca [5 x [10 x double]],
+  // CHECK:  [[D:%.+]] = alloca [[TT]],
+  // CHECK:  [[ACAST:%.+]] = alloca i{{[0-9]+}},
+  // CHECK:  {{.+}} = alloca i{{[0-9]+}},
+  // CHECK:  [[BASE_PTR_ARR:%.+]] = alloca [1 x i8*],
+  // CHECK:  [[PTR_ARR:%.+]] = alloca [1 x i8*],
+  // CHECK:  [[A2CAST:%.+]] = alloca i{{[0-9]+}},
+  // CHECK:  [[BASE_PTR_ARR2:%.+]] = alloca [9 x i8*],
+  // CHECK:  [[PTR_ARR2:%.+]] = alloca [9 x i8*],
+  // CHECK:  [[SIZET2:%.+]] = alloca [9 x i{{[0-9]+}}],
+  // CHECK:  [[BASE_PTR_ARR3:%.+]] = alloca [1 x i8*],
+  // CHECK:  [[PTR_ARR3:%.+]] = alloca [1 x i8*],  
+  // CHECK:  [[N_ADDR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[N_ADDR]],
+  // CHECK-64:  [[N_EXT:%.+]] = zext i{{[0-9]+}} [[N_ADDR_VAL]] to i{{[0-9]+}}
+  // CHECK:  [[SSAVE_RET:%.+]] = call i8* @llvm.stacksave()
+  // CHECK:  store i8* [[SSAVE_RET]], i8** [[SSTACK]],
+  // CHECK-64:  [[BN_VLA:%.+]] = alloca float, i{{[0-9]+}} [[N_EXT]],
+  // CHECK-32:  [[BN_VLA:%.+]] = alloca float, i{{[0-9]+}} [[N_ADDR_VAL]],  
+  // CHECK:  [[N_ADDR_VAL2:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[N_ADDR]],
+  // CHECK-64:  [[N_EXT2:%.+]] = zext i{{[0-9]+}} [[N_ADDR_VAL2]] to i{{[0-9]+}}
+  // CHECK-64:  [[CN_SIZE:%.+]] = mul{{.+}} i{{[0-9]+}} 5, [[N_EXT2]]
+  // CHECK-32:  [[CN_SIZE:%.+]] = mul{{.+}} i{{[0-9]+}} 5, [[N_ADDR_VAL2]]
+  // CHECK:  [[CN_VLA:%.+]] = alloca double, i{{[0-9]+}} [[CN_SIZE]],
+  // CHECK:  [[AVAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[A]],
+  // CHECK-64:  [[CONV:%.+]] = bitcast i{{[0-9]+}}* [[ACAST]] to i{{[0-9]+}}*
+  // CHECK-64:  store i{{[0-9]+}} [[AVAL]], i{{[0-9]+}}* [[CONV]],
+  // CHECK-32:  store i{{[0-9]+}} [[AVAL]], i{{[0-9]+}}* [[ACAST]],
+  // CHECK:  [[ACAST_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[ACAST]],
+  // CHECK:  [[ACAST_TOPTR:%.+]] = inttoptr i{{[0-9]+}} [[ACAST_VAL]] to i8*
+  // CHECK:  [[BASE_PTR_GEP:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[BASE_PTR_ARR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+  // CHECK:  store i8* [[ACAST_TOPTR]], i8** [[BASE_PTR_GEP]],
+  // CHECK:  [[ACAST_TOPTR2:%.+]] = inttoptr i{{[0-9]+}} [[ACAST_VAL]] to i8*
+  // CHECK:  [[PTR_GEP:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[PTR_ARR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+  // CHECK:  store i8* [[ACAST_TOPTR2]], i8** [[PTR_GEP]],
+  // CHECK:  [[BASE_PTR_GEP_ARG:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[BASE_PTR_ARR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+  // CHECK:  [[PTR_GEP_ARG:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[PTR_ARR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+  // CHECK:  {{.+}} = call i32 @__tgt_target(i32 -1, {{.+}}, i32 1, i8** [[BASE_PTR_GEP_ARG]], i8** [[PTR_GEP_ARG]], i[[SZ]]* getelementptr inbounds ([1 x i[[SZ]]], [1 x i[[SZ]]]* [[SIZET]], i32 0, i32 0), i32* getelementptr inbounds ([1 x i32], [1 x i32]* [[MAPT]], i32 0, i32 0))
+  
+  // TCHECK:  define void @__omp_offloading_{{.+}}(i{{[0-9]+}} [[A_IN:%.+]])
+  // TCHECK:  [[A_ADDR:%.+]] = alloca i{{[0-9]+}},
+  // TCHECK-NOT: alloca i{{[0-9]+}},
+  // TCHECK:  store i{{[0-9]+}} [[A_IN]], i{{[0-9]+}}* [[A_ADDR]],
+  // TCHECK-NOT: store i{{[0-9]+}} %
+  // TCHECK:  ret void  
+
+#pragma omp target firstprivate(aa,b,bn,c,cn,d)
+  {
+    aa += 1;
+    b[2] = 1.0;
+    bn[3] = 1.0;
+    c[1][2] = 1.0;
+    cn[1][3] = 1.0;
+    d.X = 1;
+    d.Y = 1;    
+  }
+
+  // CHECK:  [[A2VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[A2]],
+  // CHECK:  [[A2CASTCONV:%.+]] = bitcast i{{[0-9]+}}* [[A2CAST]] to i{{[0-9]+}}*
+  // CHECK:  store i{{[0-9]+}} [[A2VAL]], i{{[0-9]+}}* [[A2CASTCONV]],
+  // CHECK:  [[A2CAST_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[A2CAST]],
+  // CHECK-64:  [[BN_SIZE:%.+]] = mul{{.+}} i{{[0-9]+}} [[N_EXT]], 4
+  // CHECK-32:  [[BN_SIZE:%.+]] = mul{{.+}} i{{[0-9]+}} [[N_ADDR_VAL]], 4  
+  // CHECK-64:  [[CN_SIZE_1:%.+]] = mul{{.+}} i{{[0-9]+}} 5, [[N_EXT2]]
+  // CHECK-32:  [[CN_SIZE_1:%.+]] = mul{{.+}} i{{[0-9]+}} 5, [[N_ADDR_VAL2]]
+  // CHECK:  [[CN_SIZE_2:%.+]] = mul{{.+}} i{{[0-9]+}} [[CN_SIZE_1]], 8
+
+  // firstprivate(aa) --> base_ptr = aa, ptr = aa, size = 2 (short)
+  // CHECK:  [[A2CAST_TO_INT:%.+]] = inttoptr i{{[0-9]+}} [[A2CAST_VAL]] to i8*
+  // CHECK:  [[BASE_PTR_GEP2_0:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BASE_PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+  // CHECK:  store i8* [[A2CAST_TO_INT]], i8** [[BASE_PTR_GEP2_0]],
+  // CHECK:  [[A2CAST_TO_INT_2:%.+]] = inttoptr i{{[0-9]+}} [[A2CAST_VAL]] to i8*
+  // CHECK:  [[PTR_GEP2_0:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+  // CHECK:  store i8* [[A2CAST_TO_INT_2]], i8** [[PTR_GEP2_0]],
+  // CHECK:  [[SIZE_GEPA2:%.+]] = getelementptr inbounds [9 x i{{[0-9]+}}], [9 x i{{[0-9]+}}]* [[SIZET2]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+  // CHECK:  store i{{[0-9]+}} 2, i{{[0-9]+}}* [[SIZE_GEPA2]],
+
+  // firstprivate(b): base_ptr = &b[0], ptr = &b[0], size = 40 (sizeof(float)*10)
+  // CHECK:  [[BCAST:%.+]] = bitcast [10 x float]* [[B]] to i8*
+  // CHECK:  [[BASE_PTR_GEP2_1:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BASE_PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+  // CHECK:  store i8* [[BCAST]], i8** [[BASE_PTR_GEP2_1]],
+  // CHECK:  [[BCAST2:%.+]] = bitcast [10 x float]* [[B]] to i8*
+  // CHECK:  [[PTR_GEP2_1:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+  // CHECK:  store i8* [[BCAST2]], i8** [[PTR_GEP2_1]],
+  // CHECK:  [[SIZE_GEPB:%.+]] = getelementptr inbounds [9 x i{{[0-9]+}}], [9 x i{{[0-9]+}}]* [[SIZET2]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+  // CHECK:  store i{{[0-9]+}} 40, i{{[0-9]+}}* [[SIZE_GEPB]],
+
+  // firstprivate(bn), 2 entries, n and bn: (1) base_ptr = n, ptr = n, size = 8 ; (2) base_ptr = &c[0], ptr = &c[0], size = n*sizeof(float)
+  // CHECK-64:  [[N_EXT3_1:%.+]] = inttoptr i{{[0-9]+}} [[N_EXT]] to i8*
+  // CHECK-32:  [[N_EXT3_1:%.+]] = inttoptr i{{[0-9]+}} [[N_ADDR_VAL]] to i8*
+  // CHECK:  [[BASE_PTR_GEP2_2:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BASE_PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+  // CHECK:  store i8* [[N_EXT3_1]], i8** [[BASE_PTR_GEP2_2]],
+  // CHECK-64:  [[N_EXT3_2:%.+]] = inttoptr i{{[0-9]+}} [[N_EXT]] to i8*
+  // CHECK-32:  [[N_EXT3_2:%.+]] = inttoptr i{{[0-9]+}} [[N_ADDR_VAL]] to i8*
+  // CHECK:  [[PTR_GEP2_2:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+  // CHECK:  store i8* [[N_EXT3_2]], i8** [[PTR_GEP2_2]],
+  // CHECK:  [[SIZE_GEPBN_1:%.+]] = getelementptr inbounds [9 x i{{[0-9]+}}], [9 x i{{[0-9]+}}]* [[SIZET2]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+  // CHECK:  store i{{[0-9]+}} {{[0-9]}}, i{{[0-9]+}}* [[SIZE_GEPBN_1]],
+  // CHECK:  [[VLABN_BCAST:%.+]] = bitcast float* [[BN_VLA]] to i8*
+  // CHECK:  [[BASE_PTR_GEP2_3:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BASE_PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 3
+  // CHECK:  store i8* [[VLABN_BCAST]], i8** [[BASE_PTR_GEP2_3]],
+  // CHECK: [[SIZE_GEPBN_3:%.+]] = getelementptr inbounds [9 x i{{[0-9]+}}], [9 x i{{[0-9]+}}]* [[SIZET2]], i{{[0-9]+}} 0, i{{[0-9]+}} 3
+  // CHECK:  store i{{[0-9]+}} [[BN_SIZE]], i{{[0-9]+}}* [[SIZE_GEPBN_3]]
+  
+  // firstprivate(c): base_ptr = &c[0], ptr = &c[0], size = 400 (5*10*sizeof(double))
+  // CHECK:  [[C_BCAST:%.+]] = bitcast [5 x [10 x double]]* [[C]] to i8*
+  // CHECK:  [[BASE_PTR_GEP2_4:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BASE_PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 4
+  // CHECK:  store i8* [[C_BCAST]], i8** [[BASE_PTR_GEP2_4]],
+  // CHECK:  [[C_BCAST2:%.+]] = bitcast [5 x [10 x double]]* [[C]] to i8*
+  // CHECK:  [[PTR_GEP2_4:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 4
+  // CHECK:  store i8* [[C_BCAST2]], i8** [[PTR_GEP2_4]],
+  // CHECK:  [[SIZE_GEPC_4:%.+]] = getelementptr inbounds [9 x i{{[0-9]+}}], [9 x i{{[0-9]+}}]* [[SIZET2]], i{{[0-9]+}} 0, i{{[0-9]+}} 4
+  // CHECK:  store i{{[0-9]+}} 400, i{{[0-9]+}}* [[SIZE_GEPC_4]],
+  
+  // firstprivate(cn), 3 entries, 5, n, cn: (1) base_ptr = 5, ptr = 5, size = 8; (2) (1) base_ptr = n, ptr = n, size = 8; (3) base_ptr = &cn[0], ptr = &cn[0], size = 5*n*sizeof(double)
+  // CHECK:  [[BASE_PTR_GEP2_5:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BASE_PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 5
+  // CHECK:  store i8* inttoptr (i{{[0-9]+}} 5 to i8*), i8** [[BASE_PTR_GEP2_5]],
+  // CHECK:  [[PTR_GEP2_5:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 5
+  // CHECK:  store i8* inttoptr (i{{[0-9]+}} 5 to i8*), i8** [[PTR_GEP2_5]],
+  // CHECK:  [[SIZE_GEPCN_5:%.+]] = getelementptr inbounds [9 x i{{[0-9]+}}], [9 x i{{[0-9]+}}]* [[SIZET2]], i{{[0-9]+}} 0, i{{[0-9]+}} 5
+  // CHECK:  store i{{[0-9]+}} {{[0-9]}}, i{{[0-9]+}}* [[SIZE_GEPCN_5]],
+  // CHECK-64:  [[CN_SZ_2_1:%.+]] = inttoptr i{{[0-9]+}} [[N_EXT2]] to i8*
+  // CHECK-32:  [[CN_SZ_2_1:%.+]] = inttoptr i{{[0-9]+}} [[N_ADDR_VAL2]] to i8*
+  // CHECK:  [[BASE_PTR_GEP2_6:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BASE_PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 6
+  // CHECK:  store i8* [[CN_SZ_2_1]], i8** [[BASE_PTR_GEP2_6]],
+  // CHECK-64:  [[CN_SZ_2_2:%.+]] = inttoptr i{{[0-9]+}} [[N_EXT2]] to i8*
+  // CHECK-32:  [[CN_SZ_2_2:%.+]] = inttoptr i{{[0-9]+}} [[N_ADDR_VAL2]] to i8*
+  // CHECK:  [[PTR_GEP2_6:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 6
+  // CHECK:  store i8* [[CN_SZ_2_2]], i8** [[PTR_GEP2_6]],
+  // CHECK:  [[SIZE_GEPCN_6:%.+]] = getelementptr inbounds [9 x i{{[0-9]+}}], [9 x i{{[0-9]+}}]* [[SIZET2]], i{{[0-9]+}} 0, i{{[0-9]+}} 6
+  // CHECK:  store i{{[0-9]+}} {{[0-9]}}, i{{[0-9]+}}* [[SIZE_GEPCN_6]],
+  // CHECK:  [[VLA_CN_BCAST:%.+]] = bitcast double* [[CN_VLA]] to i8*
+  // CHECK:  [[BASE_PTR_GEP2_7:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BASE_PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 7
+  // CHECK:  store i8* [[VLA_CN_BCAST]], i8** [[BASE_PTR_GEP2_7]],
+  // CHECK:  [[VLA_CN_BCAST2:%.+]] = bitcast double* [[CN_VLA]] to i8*
+  // CHECK:  [[PTR_GEP2_7:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 7
+  // CHECK:  store i8* [[VLA_CN_BCAST2]], i8** [[PTR_GEP2_7]],
+  // CHECK:  [[SIZE_GEPCN_7:%.+]] = getelementptr inbounds [9 x i{{[0-9]+}}], [9 x i{{[0-9]+}}]* [[SIZET2]], i{{[0-9]+}} 0, i{{[0-9]+}} 7
+  // CHECK:  store i{{[0-9]+}} [[CN_SIZE_2]], i{{[0-9]+}}* [[SIZE_GEPCN_7]],
+  
+  // firstprivate(d): base_ptr = &d, ptr = &d, size = 16 
+  // CHECK:  [[D_REF:%.+]] = bitcast [[TT]]* [[D]] to i8*
+  // CHECK:  [[BASE_PTR_GEP2_8:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BASE_PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 8
+  // CHECK:  store i8* [[D_REF]], i8** [[BASE_PTR_GEP2_8]],
+  // CHECK:  [[D_REF2:%.+]] = bitcast [[TT]]* [[D]] to i8*
+  // CHECK:  [[PTR_GEP2_8:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 8
+  // CHECK:  store i8* [[D_REF2]], i8** [[PTR_GEP2_8]],
+  // CHECK:  [[SIZE_GEPCN_8:%.+]] = getelementptr inbounds [9 x i{{[0-9]+}}], [9 x i{{[0-9]+}}]* [[SIZET2]], i{{[0-9]+}} 0, i{{[0-9]+}} 8
+  // CHECK:  store i{{[0-9]+}} {{[0-9]+}}, i{{[0-9]+}}* [[SIZE_GEPCN_8]],
+  
+  
+  // CHECK:  [[BASE_PTR_GEP_ARG2:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[BASE_PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+  // CHECK:  [[PTR_GEP_ARG2:%.+]] = getelementptr inbounds [9 x i8*], [9 x i8*]* [[PTR_ARR2]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+  // CHECK:  [[SIZES_ARG2:%.+]] = getelementptr inbounds [9 x i[[SZ]]], [9 x i[[SZ]]]* [[SIZET2]],  i{{[0-9]+}} 0, i{{[0-9]+}} 0
+  // CHECK: {{.+}} = call i32 @__tgt_target(i32 -1, {{.+}}, i32 9, i8** [[BASE_PTR_GEP_ARG2]], i8** [[PTR_GEP_ARG2]], i[[SZ]]* [[SIZES_ARG2]], i32* getelementptr inbounds ([9 x i32], [9 x i32]* [[MAPT2]], i32 0, i32 0))
+  
+  // make sure that firstprivate variables are generated in all cases and that we use those instances for operations inside the
+  // target region
+  // TCHECK:  define void @__omp_offloading_{{.+}}(i{{[0-9]+}} [[A2_IN:%.+]], [10 x float]* {{.+}} [[B_IN:%.+]], i{{[0-9]+}} [[BN_SZ:%.+]], float* {{.+}} [[BN_IN:%.+]], [5 x [10 x double]]* {{.+}} [[C_IN:%.+]], i{{[0-9]+}} [[CN_SZ1:%.+]], i{{[0-9]+}} [[CN_SZ2:%.+]], double* {{.+}} [[CN_IN:%.+]], [[TT]]* {{.+}} [[D_IN:%.+]])
+  // TCHECK:  [[A2_ADDR:%.+]] = alloca i{{[0-9]+}},
+  // TCHECK:  [[B_ADDR:%.+]] = alloca [10 x float]*,
+  // TCHECK:  [[VLA_ADDR:%.+]] = alloca i{{[0-9]+}},
+  // TCHECK:  [[BN_ADDR:%.+]] = alloca float*,
+  // TCHECK:  [[C_ADDR:%.+]] = alloca [5 x [10 x double]]*,
+  // TCHECK:  [[VLA_ADDR2:%.+]] = alloca i{{[0-9]+}},
+  // TCHECK:  [[VLA_ADDR4:%.+]] = alloca i{{[0-9]+}},
+  // TCHECK:  [[CN_ADDR:%.+]] = alloca double*,
+  // TCHECK:  [[D_ADDR:%.+]] = alloca [[TT]]*,
+  // TCHECK-NOT: alloca i{{[0-9]+}},
+  // TCHECK:  [[B_PRIV:%.+]] = alloca [10 x float],
+  // TCHECK:  [[SSTACK:%.+]] = alloca i8*,
+  // TCHECK:  [[C_PRIV:%.+]] = alloca [5 x [10 x double]],
+  // TCHECK:  [[D_PRIV:%.+]] = alloca [[TT]],
+  // TCHECK:  store i{{[0-9]+}} [[A2_IN]], i{{[0-9]+}}* [[A2_ADDR]],
+  // TCHECK:  store [10 x float]* [[B_IN]], [10 x float]** [[B_ADDR]],
+  // TCHECK:  store i{{[0-9]+}} [[BN_SZ]], i{{[0-9]+}}* [[VLA_ADDR]],
+  // TCHECK:  store float* [[BN_IN]], float** [[BN_ADDR]],
+  // TCHECK:  store [5 x [10 x double]]* [[C_IN]], [5 x [10 x double]]** [[C_ADDR]],
+  // TCHECK:  store i{{[0-9]+}} [[CN_SZ1]], i{{[0-9]+}}* [[VLA_ADDR2]],
+  // TCHECK:  store i{{[0-9]+}} [[CN_SZ2]], i{{[0-9]+}}* [[VLA_ADDR4]],
+  // TCHECK:  store double* [[CN_IN]], double** [[CN_ADDR]],
+  // TCHECK:  store [[TT]]* [[D_IN]], [[TT]]** [[D_ADDR]],
+  // TCHECK:  [[CONV_A2ADDR:%.+]] = bitcast i{{[0-9]+}}* [[A2_ADDR]] to i{{[0-9]+}}*
+  // TCHECK:  [[B_ADDR_REF:%.+]] = load [10 x float]*, [10 x float]** [[B_ADDR]],
+  // TCHECK:  [[BN_SZ_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[VLA_ADDR]],
+  // TCHECK:  [[BN_ADDR_REF:%.+]] = load float*, float** [[BN_ADDR]],
+  // TCHECK:  [[C_ADDR_REF:%.+]] = load [5 x [10 x double]]*, [5 x [10 x double]]** [[C_ADDR]],
+  // TCHECK:  [[CN_SZ1_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[VLA_ADDR2]],
+  // TCHECK:  [[CN_SZ2_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[VLA_ADDR4]],
+  // TCHECK:  [[CN_ADDR_REF:%.+]] = load double*, double** [[CN_ADDR]],
+  // TCHECK:  [[D_ADDR_REF:%.+]] = load [[TT]]*, [[TT]]** [[D_ADDR]],
+
+  // firstprivate(aa): a_priv = a_in
+  // TCHECK-NOT:  store i{{[0-9]+}} %
+
+  //  firstprivate(b): memcpy(b_priv,b_in)
+  // TCHECK:  [[B_PRIV_BCAST:%.+]] = bitcast [10 x float]* [[B_PRIV]] to i8*
+  // TCHECK:  [[B_ADDR_REF_BCAST:%.+]] = bitcast [10 x float]* [[B_ADDR_REF]] to i8*
+  // TCHECK:  call void @llvm.memcpy.{{.+}}(i8* [[B_PRIV_BCAST]], i8* [[B_ADDR_REF_BCAST]], {{.+}})
+
+  // TCHECK:  [[RET_STACK:%.+]] = call i8* @llvm.stacksave()
+  // TCHECK:  store i8* [[RET_STACK]], i8** [[SSTACK]],
+
+  // firstprivate(bn)
+  // TCHECK:  [[BN_PRIV:%.+]] = alloca float, i{{[0-9]+}} [[BN_SZ_VAL]],
+  // TCHECK:  [[BN_COPY_SZ:%.+]] = mul{{.+}} i{{[0-9]+}} [[BN_SZ_VAL]], 4
+  // TCHECK:  [[BN_PRIV__BCAST:%.+]] = bitcast float* [[BN_PRIV]] to i8*
+  // TCHECK:  [[BN_REF_IN_BCAST:%.+]] = bitcast float* [[BN_ADDR_REF]] to i8*
+  // TCHECK:  call void @llvm.memcpy.{{.+}}(i8* [[BN_PRIV__BCAST]], i8* [[BN_REF_IN_BCAST]], i{{[0-9]+}} [[BN_COPY_SZ]],{{.+}})
+
+  // firstprivate(c)
+  // TCHECK:  [[C_PRIV_BCAST:%.+]] = bitcast [5 x [10 x double]]* [[C_PRIV]] to i8*
+  // TCHECK:  [[C_IN_BCAST:%.+]] = bitcast [5 x [10 x double]]* [[C_ADDR_REF]] to i8*
+  // TCHECK:  call void @llvm.memcpy.{{.+}}(i8* [[C_PRIV_BCAST]], i8* [[C_IN_BCAST]],{{.+}})
+  
+  // firstprivate(cn)
+  // TCHECK:  [[CN_SZ:%.+]] = mul{{.+}} i{{[0-9]+}} [[CN_SZ1_VAL]], [[CN_SZ2_VAL]]
+  // TCHECK:  [[CN_PRIV:%.+]] = alloca double, i{{[0-9]+}} [[CN_SZ]],
+  // TCHECK:  [[CN_SZ2:%.+]] = mul{{.+}} i{{[0-9]+}} [[CN_SZ1_VAL]], [[CN_SZ2_VAL]]
+  // TCHECK:  [[CN_SZ2_CPY:%.+]] = mul{{.+}} i{{[0-9]+}} [[CN_SZ2]], 8
+  // TCHECK:  [[CN_PRIV_BCAST:%.+]] = bitcast double* [[CN_PRIV]] to i8*
+  // TCHECK:  [[CN_IN_BCAST:%.+]] = bitcast double* [[CN_ADDR_REF]] to i8*
+  // TCHECK:  call void @llvm.memcpy.{{.+}}(i8* [[CN_PRIV_BCAST]], i8* [[CN_IN_BCAST]], i{{[0-9]+}} [[CN_SZ2_CPY]],{{.+}})
+  
+  // firstprivate(d)
+  // TCHECK:  [[D_PRIV_BCAST:%.+]] = bitcast [[TT]]* [[D_PRIV]] to i8*
+  // TCHECK:  [[D_IN_BCAST:%.+]] = bitcast [[TT]]* [[D_ADDR_REF]] to i8*
+  // TCHECK:  call void @llvm.memcpy.{{.+}}(i8* [[D_PRIV_BCAST]], i8* [[D_IN_BCAST]],{{.+}})
+
+  
+  #pragma omp target firstprivate(ptr)
+  {
+    ptr[0]++;
+  }
+  // CHECK:  [[PTR_ADDR_REF:%.+]] = load double*, double** [[PTR_ADDR]],
+  // CHECK:  [[PTR_ADDR_BCAST:%.+]] = bitcast double* [[PTR_ADDR_REF]] to i8*
+
+  // CHECK:  [[BASE_PTR_GEP3_0:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[BASE_PTR_ARR3]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+  // CHECK:  store i8* [[PTR_ADDR_BCAST]], i8** [[BASE_PTR_GEP3_0]],
+  // CHECK:  [[PTR_ADDR_BCAST2:%.+]] = bitcast double* [[PTR_ADDR_REF]] to i8*
+  // CHECK:  [[PTR_GEP3_0:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[PTR_ARR3]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+  // CHECK:  store i8* [[PTR_ADDR_BCAST2]], i8** [[PTR_GEP3_0]],
+
+  // CHECK:  [[BASE_PTR_GEP_ARG3:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[BASE_PTR_ARR3]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+  // CHECK:  [[PTR_GEP_ARG3:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[PTR_ARR3]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+  // CHECK: {{.+}} = call i32 @__tgt_target(i32 -1, {{.+}}, i32 1, i8** [[BASE_PTR_GEP_ARG3]], i8** [[PTR_GEP_ARG3]], i[[SZ]]* getelementptr inbounds ([1 x i[[SZ]]], [1 x i[[SZ]]]* [[SIZET3]], i32 0, i32 0), i32* getelementptr inbounds ([1 x i32], [1 x i32]* [[MAPT3]], i32 0, i32 0))
+
+  // TCHECK:  define void @__omp_offloading_{{.+}}(double* [[PTR_IN:%.+]])
+  // TCHECK:  [[PTR_ADDR:%.+]] = alloca double*,
+  // TCHECK-NOT: alloca double*,
+  // TCHECK:  store double* [[PTR_IN]], double** [[PTR_ADDR]],
+  // TCHECK-NOT: store double* %
+
+  return a;
+}
+
+
+template<typename tx>
+tx ftemplate(int n) {
+  tx a = 0;
+  tx b[10];
+
+#pragma omp target firstprivate(a,b)
+  {
+    a += 1;
+    b[2] += 1;
+  }
+
+  return a;
+}
+
+static
+int fstatic(int n) {
+  int a = 0;
+  char aaa = 0;
+  int b[10];
+
+#pragma omp target firstprivate(a,aaa,b)
+  {
+    a += 1;
+    aaa += 1;
+    b[2] += 1;
+  }
+
+  return a;
+}
+
+// TCHECK: define void @__omp_offloading_{{.+}}(i{{[0-9]+}} [[A_IN:%.+]], i{{[0-9]+}} [[A3_IN:%.+]], [10 x i{{[0-9]+}}]*{{.+}} [[B_IN:%.+]])
+// TCHECK:  [[A_ADDR:%.+]] = alloca i{{[0-9]+}},
+// TCHECK:  [[A3_ADDR:%.+]] = alloca i{{[0-9]+}},
+// TCHECK:  [[B_ADDR:%.+]] = alloca [10 x i{{[0-9]+}}]*,
+// TCHECK-NOT: alloca i{{[0-9]+}},
+// TCHECK:  [[B_PRIV:%.+]] = alloca [10 x i{{[0-9]+}}],
+// TCHECK:  store i{{[0-9]+}} [[A_IN]], i{{[0-9]+}}* [[A_ADDR]],
+// TCHECK:  store i{{[0-9]+}} [[A3_IN]], i{{[0-9]+}}* [[A3_ADDR]],
+// TCHECK:  store [10 x i{{[0-9]+}}]* [[B_IN]], [10 x i{{[0-9]+}}]** [[B_ADDR]],
+// TCHECK-64:  [[A_CONV:%.+]] = bitcast i{{[0-9]+}}* [[A_ADDR]] to i{{[0-9]+}}*
+// TCHECK:  [[A3_CONV:%.+]] = bitcast i{{[0-9]+}}* [[A3_ADDR]] to i8*
+// TCHECK:  [[B_ADDR_REF:%.+]] = load [10 x i{{[0-9]+}}]*, [10 x i{{[0-9]+}}]** [[B_ADDR]],
+
+// firstprivate(a): a_priv = a_in
+
+// firstprivate(aaa)
+// TCHECK-NOT:  store i{{[0-9]+}} %
+
+// firstprivate(b)
+// TCHECK:  [[B_PRIV_BCAST:%.+]] = bitcast [10 x i{{[0-9]+}}]* [[B_PRIV]] to i8*
+// TCHECK:  [[B_IN_BCAST:%.+]] = bitcast [10 x i{{[0-9]+}}]* [[B_ADDR_REF]] to i8*
+// TCHECK:  call void @llvm.memcpy.{{.+}}(i8* [[B_PRIV_BCAST]], i8* [[B_IN_BCAST]],{{.+}})
+
+// TCHECK:  ret void
+
+struct S1 {
+  double a;
+
+  int r1(int n){
+    int b = n+1;
+    short int c[2][n];
+
+#pragma omp target firstprivate(b,c)
+    {
+      this->a = (double)b + 1.5;
+      c[1][1] = ++a;
+    }
+
+    return c[1][1] + (int)b;
+  }
+
+  // on the host side, we first generate r1, then the static function and the template above
+  // CHECK:  define{{.+}} i32 {{.+}}([[S1]]* {{.+}}, i{{[0-9]+}} {{.+}})
+  // CHECK:  [[BASE_PTRS4:%.+]] = alloca [5 x i8*],
+  // CHECK:  [[PTRS4:%.+]] = alloca [5 x i8*],
+  // CHECK:  [[SIZET4:%.+]] = alloca [5 x i{{[0-9]+}}],
+
+  // map(this: this ptr is implicitly captured (not firstprivate matter)
+  // CHECK:  {{.+}} = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BASE_PTRS4]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+  // CHECK:  store {{.+}}, {{.+}},
+  // CHECK:  {{.+}} = getelementptr inbounds [5 x i8*], [5 x i8*]* [[PTRS4]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+  // CHECK:  store {{.+}}, {{.+}},
+  // CHECK:  {{.+}} getelementptr inbounds [5 x i{{[0-9]+}}], [5 x i{{[0-9]+}}]* [[SIZET4]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+  // CHECK:  store {{.+}}, {{.+}}
+
+  // firstprivate(b): base_ptr = b, ptr = b, size = 4 (pass by-value)
+  // CHECK:  [[B_CAST_PTR:%.+]] = inttoptr i{{[0-9]+}} [[B_CAST:%.+]] to i8*
+  // CHECK:  [[BASE_PTRS_GEP4_1:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BASE_PTRS4]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+  // CHECK:  store i8* [[B_CAST_PTR]], i8** [[BASE_PTRS_GEP4_1]],
+  // CHECK:  [[B_CAST_PTR2:%.+]] = inttoptr i{{[0-9]+}} [[B_CAST:%.+]] to i8*
+  // CHECK:  [[PTRS_GEP4_1:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[PTRS4]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+  // CHECK:  store i8* [[B_CAST_PTR2]], i8** [[PTRS_GEP4_1]],
+  // CHECK:  [[SIZES_GEP4_1:%.+]] = getelementptr inbounds [5 x i{{[0-9]+}}], [5 x i{{[0-9]+}}]* [[SIZET4]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+  // CHECK:  store i{{[0-9]+}} 4, i{{[0-9]+}}* [[SIZES_GEP4_1]],
+
+  // firstprivate(c), 3 entries: 2, n, c
+  // CHECK:  [[BASE_PTRS_GEP4_2:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BASE_PTRS4]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+  // CHECK:  store i8* inttoptr (i{{[0-9]+}} 2 to i8*), i8** [[BASE_PTRS_GEP4_2]],
+  // CHECK:  [[PTRS_GEP4_2:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[PTRS4]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+  // CHECK:  store i8* inttoptr (i{{[0-9]+}} 2 to i8*), i8** [[PTRS_GEP4_2]],
+  // CHECK:  [[SIZES_GEP4_2:%.+]] = getelementptr inbounds [5 x i{{[0-9]+}}], [5 x i{{[0-9]+}}]* [[SIZET4]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+  // CHECK-64:  store i{{[0-9]+}} 8, i{{[0-9]+}}* [[SIZES_GEP4_2]],
+  // CHECK-32:  store i{{[0-9]+}} 4, i{{[0-9]+}}* [[SIZES_GEP4_2]],
+  // CHECK:  [[N_PTR:%.+]] = inttoptr i{{[0-9]+}} [[N:%.+]] to i8*
+  // CHECK:  [[BASE_PTRS_GEP4_3:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BASE_PTRS4]], i{{[0-9]+}} 0, i{{[0-9]+}} 3
+  // CHECK:  store i8* [[N_PTR]], i8** [[BASE_PTRS_GEP4_3]],
+  // CHECK:  [[N_PTR2:%.+]] = inttoptr i{{[0-9]+}} [[N:%.+]] to i8*
+  // CHECK:  [[PTRS_GEP4_3:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[PTRS4]], i{{[0-9]+}} 0, i{{[0-9]+}} 3
+  // CHECK:  store i8* [[N_PTR2]], i8** [[PTRS_GEP4_3]],
+  // CHECK:  [[SIZES_GEP4_3:%.+]] = getelementptr inbounds [5 x i{{[0-9]+}}], [5 x i{{[0-9]+}}]* [[SIZET4]], i{{[0-9]+}} 0, i{{[0-9]+}} 3
+  // CHECK-64:  store i{{[0-9]+}} 8, i{{[0-9]+}}* [[SIZES_GEP4_3]],
+  // CHECK-32:  store i{{[0-9]+}} 4, i{{[0-9]+}}* [[SIZES_GEP4_3]],
+  // CHECK:  [[B_BCAST:%.+]] = bitcast i{{[0-9]+}}* [[B:%.+]] to i8*
+  // CHECK:  [[BASE_PTRS_GEP4_4:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BASE_PTRS4]], i{{[0-9]+}} 0, i{{[0-9]+}} 4
+  // CHECK:  store i8* [[B_BCAST]], i8** [[BASE_PTRS_GEP4_4]],
+  // CHECK:  [[B_BCAST2:%.+]] = bitcast i{{[0-9]+}}* [[B:%.+]] to i8*
+  // CHECK:  [[PTRS_GEP4_4:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[PTRS4]], i{{[0-9]+}} 0, i{{[0-9]+}} 4
+  // CHECK:  store i8* [[B_BCAST2]], i8** [[PTRS_GEP4_4]],
+  // CHECK:  [[SIZES_GEP4_4:%.+]] = getelementptr inbounds [5 x i{{[0-9]+}}], [5 x i{{[0-9]+}}]* [[SIZET4]], i{{[0-9]+}} 0, i{{[0-9]+}} 4
+  // CHECK:  store i{{[0-9]+}} [[B_SIZE:%.+]], i{{[0-9]+}}* [[SIZES_GEP4_4]],
+
+  // only check that we use the map types stored in the global variable
+  // CHECK:  call i32 @__tgt_target(i32 -1, {{.+}}, i32 5, i8** {{.+}}, i8** {{.+}}, i{{[0-9]+}}* {{.+}}, i32* getelementptr inbounds ([5 x i32], [5 x i32]* [[MAPT4]], i32 0, i32 0))
+  
+  // TCHECK: define void @__omp_offloading_{{.+}}([[S1]]* [[TH:%.+]], i{{[0-9]+}} [[B_IN:%.+]], i{{[0-9]+}} [[VLA:%.+]], i{{[0-9]+}} [[VLA1:%.+]], i{{[0-9]+}}{{.+}} [[C_IN:%.+]])
+  // TCHECK:  [[TH_ADDR:%.+]] = alloca [[S1]]*,
+  // TCHECK:  [[B_ADDR:%.+]] = alloca i{{[0-9]+}},
+  // TCHECK:  [[VLA_ADDR:%.+]] = alloca i{{[0-9]+}},
+  // TCHECK:  [[VLA_ADDR2:%.+]] = alloca i{{[0-9]+}},
+  // TCHECK:  [[C_ADDR:%.+]] = alloca i{{[0-9]+}}*,
+  // TCHECK-NOT: alloca i{{[0-9]+}},
+  // TCHECK:  [[SSTACK:%.+]] = alloca i8*,
+
+  // TCHECK:  store [[S1]]* [[TH]], [[S1]]** [[TH_ADDR]],
+  // TCHECK:  store i{{[0-9]+}} [[B_IN]], i{{[0-9]+}}* [[B_ADDR]],
+  // TCHECK:  store i{{[0-9]+}} [[VLA]], i{{[0-9]+}}* [[VLA_ADDR]],
+  // TCHECK:  store i{{[0-9]+}} [[VLA1]], i{{[0-9]+}}* [[VLA_ADDR2]],
+  // TCHECK:  store i{{[0-9]+}}* [[C_IN]], i{{[0-9]+}}** [[C_ADDR]],
+  // TCHECK:  [[TH_ADDR_REF:%.+]] = load [[S1]]*, [[S1]]** [[TH_ADDR]],
+  // TCHECK-64:  [[B_ADDR_CONV:%.+]] = bitcast i{{[0-9]+}}* [[B_ADDR]] to i{{[0-9]+}}*
+  // TCHECK:  [[VLA_ADDR_REF:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[VLA_ADDR]],
+  // TCHECK:  [[VLA_ADDR_REF2:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[VLA_ADDR2]],
+  // TCHECK:  [[C_ADDR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[C_ADDR]],
+
+  // firstprivate(b)
+  // TCHECK-NOT:  store i{{[0-9]+}} %
+ 
+  // TCHECK:  [[RET_STACK:%.+]] = call i8* @llvm.stacksave()
+  // TCHECK:  store i8* [[RET_STACK:%.+]], i8** [[SSTACK]],
+
+  // firstprivate(c)
+  // TCHECK:  [[C_SZ:%.+]] = mul{{.+}} i{{[0-9]+}} [[VLA_ADDR_REF]], [[VLA_ADDR_REF2]]
+  // TCHECK:  [[C_PRIV:%.+]] = alloca i{{[0-9]+}}, i{{[0-9]+}} [[C_SZ]],
+  // TCHECK:  [[C_SZ2:%.+]] = mul{{.+}} i{{[0-9]+}} [[VLA_ADDR_REF]], [[VLA_ADDR_REF2]]
+  // TCHECK:  [[C_SZ_CPY:%.+]] = mul{{.+}} i{{[0-9]+}} [[C_SZ2]],  2
+  // TCHECK:  [[C_PRIV_BCAST:%.+]] = bitcast i{{[0-9]+}}* [[C_PRIV]] to i8*
+  // TCHECK:  [[C_IN_BCAST:%.+]] = bitcast i{{[0-9]+}}* [[C_ADDR_REF]] to i8*
+  // TCHECK:  call void @llvm.memcpy.{{.+}}(i8* [[C_PRIV_BCAST]], i8* [[C_IN_BCAST]],{{.+}})
+
+  // finish
+  // TCHECK: [[RELOAD_SSTACK:%.+]] = load i8*, i8** [[SSTACK]],
+  // TCHECK: call void @llvm.stackrestore(i8* [[RELOAD_SSTACK]])
+  // TCHECK: ret void
+
+
+  // static host function
+  // CHECK:  define{{.+}} i32 {{.+}}(i{{[0-9]+}} {{.+}})
+  // CHECK:  [[BASE_PTRS5:%.+]] = alloca [3 x i8*],
+  // CHECK:  [[PTRS5:%.+]] = alloca [3 x i8*],
+
+  // firstprivate(a): by value
+  // CHECK:  [[A_CAST_PTR:%.+]] = inttoptr i{{[0-9]+}} [[A_CAST:%.+]] to i8*
+  // CHECK:  [[BASE_PTRS_GEP5_0:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[BASE_PTRS5]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+  // CHECK:  store i8* [[A_CAST_PTR]], i8** [[BASE_PTRS_GEP5_0]],
+  // CHECK:  [[A_CAST_PTR2:%.+]] = inttoptr i{{[0-9]+}} [[A_CAST:%.+]] to i8*
+  // CHECK:  [[PTRS_GEP5_0:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[PTRS5]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+  // CHECK:  store i8* [[A_CAST_PTR2]], i8** [[PTRS_GEP5_0]],
+
+  // firstprivate(aaa): by value
+  // CHECK:  [[A3_CAST_PTR:%.+]] = inttoptr i{{[0-9]+}} [[A3_CAST:%.+]] to i8*
+  // CHECK:  [[BASE_PTRS_GEP5_1:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[BASE_PTRS5]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+  // CHECK:  store i8* [[A3_CAST_PTR]], i8** [[BASE_PTRS_GEP5_1]],
+  // CHECK:  [[A3_CAST_PTR2:%.+]] = inttoptr i{{[0-9]+}} [[A3_CAST:%.+]] to i8*
+  // CHECK:  [[PTRS_GEP5_1:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[PTRS5]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+  // CHECK:  store i8* [[A3_CAST_PTR2]], i8** [[PTRS_GEP5_1]],
+
+  // firstprivate(b): base_ptr = &b[0], ptr= &b[0]
+  // CHECK:  [[B_BCAST:%.+]] = bitcast [10 x i{{[0-9]+}}]* [[B:%.+]] to i8*
+  // CHECK:  [[BASE_PTRS_GEP5_2:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[BASE_PTRS5]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+  // CHECK:  store i8* [[B_BCAST]], i8** [[BASE_PTRS_GEP5_2]],
+  // CHECK:  [[B_BCAST2:%.+]] = bitcast [10 x i{{[0-9]+}}]* [[B:%.+]] to i8*
+  // CHECK:  [[PTRS_GEP5_2:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[PTRS5]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+  // CHECK:  store i8* [[B_BCAST2]], i8** [[PTRS_GEP5_2]],
+
+  // only check that the right sizes and map types are used
+  // CHECK:  call i32 @__tgt_target(i32 -1, {{.+}}, i32 3, i8** {{.+}}, i8** {{.+}}, i[[SZ]]* getelementptr inbounds ([3 x i[[SZ]]], [3 x i[[SZ]]]* [[SIZET5]], i32 0, i32 0), i32* getelementptr inbounds ([3 x i32], [3 x i32]* [[MAPT5]], i32 0, i32 0))
+};
+
+
+
+int bar(int n, double *ptr){
+  int a = 0;
+  a += foo(n, ptr);
+  S1 S;
+  a += S.r1(n);
+  a += fstatic(n);
+  a += ftemplate<int>(n);
+
+  return a;
+}
+
+// template host and device
+
+// CHECK:  define{{.+}} i32 {{.+}}(i{{[0-9]+}} {{.+}})
+// CHECK:  [[BASE_PTRS6:%.+]] = alloca [2 x i8*],
+// CHECK:  [[PTRS6:%.+]] = alloca [2 x i8*],
+
+// firstprivate(a): by value
+// CHECK:  [[AT_CAST_PTR:%.+]] = inttoptr i{{[0-9]+}} [[AT_CAST:%.+]] to i8*
+// CHECK:  [[BASE_PTRS_GEP6_0:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[BASE_PTRS6]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK:  store i8* [[AT_CAST_PTR]], i8** [[BASE_PTRS_GEP6_0]],
+// CHECK:  [[AT_CAST_PTR2:%.+]] = inttoptr i{{[0-9]+}} [[AT_CAST:%.+]] to i8*
+// CHECK:  [[PTRS_GEP6_0:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[PTRS6]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK:  store i8* [[AT_CAST_PTR2]], i8** [[PTRS_GEP6_0]],
+
+// firstprivate(b): pointer
+// CHECK:  [[B_BCAST:%.+]] = bitcast [10 x i{{[0-9]+}}]* [[B:%.+]] to i8*
+// CHECK:  [[BASE_PTRS_GEP6_1:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[BASE_PTRS6]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK:  store i8* [[B_BCAST]], i8** [[BASE_PTRS_GEP6_1]],
+// CHECK:  [[B_BCAST2:%.+]] = bitcast [10 x i{{[0-9]+}}]* [[B:%.+]] to i8*
+// CHECK:  [[PTRS_GEP6_1:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[PTRS6]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK:  store i8* [[B_BCAST2]], i8** [[PTRS_GEP6_1]],
+
+// CHECK:  call i32 @__tgt_target(i32 -1, {{.+}}, i32 2, i8** {{.+}}, i8** {{.+}}, i[[SZ]]* getelementptr inbounds ([2 x i[[SZ]]], [2 x i[[SZ]]]* [[SIZET6]], i32 0, i32 0), i32* getelementptr inbounds ([2 x i32], [2 x i32]* [[MAPT6]], i32 0, i32 0))
+
+
+// TCHECK: define void @__omp_offloading_{{.+}}(i{{[0-9]+}} [[A_IN:%.+]], [10 x i{{[0-9]+}}]*{{.+}} [[B_IN:%.+]])
+// TCHECK:  [[A_ADDR:%.+]] = alloca i{{[0-9]+}},
+// TCHECK:  [[B_ADDR:%.+]] = alloca [10 x i{{[0-9]+}}]*,
+// TCHECK-NOT: alloca i{{[0-9]+}},
+// TCHECK:  [[B_PRIV:%.+]] = alloca [10 x i{{[0-9]+}}],
+// TCHECK:  store i{{[0-9]+}} [[A_IN]], i{{[0-9]+}}* [[A_ADDR]],
+// TCHECK:  store [10 x i{{[0-9]+}}]* [[B_IN]], [10 x i{{[0-9]+}}]** [[B_ADDR]],
+// TCHECK-64:  [[A_ADDR_CONV:%.+]] = bitcast i{{[0-9]+}}* [[A_ADDR]] to i{{[0-9]+}}*
+// TCHECK:  [[B_ADDR_REF:%.+]] = load [10 x i{{[0-9]+}}]*, [10 x i{{[0-9]+}}]** [[B_ADDR]],
+
+// firstprivate(a)
+// TCHECK-NOT:  store i{{[0-9]+}} %
+
+// firstprivate(b)
+// TCHECK:  [[B_PRIV_BCAST:%.+]] = bitcast [10 x i{{[0-9]+}}]* [[B_PRIV]] to i8*
+// TCHECK:  [[B_IN_BCAST:%.+]] = bitcast [10 x i{{[0-9]+}}]* [[B_ADDR_REF]] to i8*
+// TCHECK:  call void @llvm.memcpy.{{.+}}(i8* [[B_PRIV_BCAST]], i8* [[B_IN_BCAST]],{{.+}})
+
+// TCHECK: ret void
+
+#endif
diff --git a/test/OpenMP/target_firstprivate_messages.cpp b/test/OpenMP/target_firstprivate_messages.cpp
new file mode 100644
index 0000000000000..6dbad671de7c6
--- /dev/null
+++ b/test/OpenMP/target_firstprivate_messages.cpp
@@ -0,0 +1,198 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+struct S1; // expected-note 2 {{declared here}} expected-note 2 {{forward declaration of 'S1'}}
+extern S1 a;
+class S2 {
+  mutable int a;
+
+public:
+  S2() : a(0) {}
+};
+const S2 b;
+const S2 ba[5];
+class S3 {
+  int a;
+
+public:
+  S3() : a(0) {}
+};
+const S3 ca[5];
+class S4 {
+  int a;
+  S4();
+
+public:
+  S4(int v) : a(v) {
+#pragma omp target firstprivate(a) firstprivate(this->a)
+    for (int k = 0; k < v; ++k)
+      ++this->a;
+  }
+};
+class S5 {
+  int a;
+  S5() : a(0) {}
+
+public:
+  S5(int v) : a(v) {}
+  S5 &operator=(S5 &s) {
+#pragma omp target firstprivate(a) firstprivate(this->a) firstprivate(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T>
+class S6 {
+public:
+  T a;
+
+  S6() : a(0) {}
+  S6(T v) : a(v) {
+#pragma omp target firstprivate(a) firstprivate(this->a)
+    for (int k = 0; k < v; ++k)
+      ++this->a;
+  }
+  S6 &operator=(S6 &s) {
+#pragma omp target firstprivate(a) firstprivate(this->a) firstprivate(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T>
+class S7 : public T {
+  T a;
+  S7() : a(0) {}
+
+public:
+  S7(T v) : a(v) {
+#pragma omp target firstprivate(a) firstprivate(this->a) firstprivate(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S7 &operator=(S7 &s) {
+#pragma omp target firstprivate(a) firstprivate(this->a) firstprivate(s.a) firstprivate(s.T::a) // expected-error 2 {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class I, class C>
+int foomain(I argc, C **argv) {
+  I e(4);
+  I g(5);
+  int i;
+  int &j = i;
+#pragma omp target firstprivate // expected-error {{expected '(' after 'firstprivate'}}
+{}
+#pragma omp target firstprivate( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+{}
+#pragma omp target firstprivate() // expected-error {{expected expression}}
+{}
+#pragma omp target firstprivate(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+{}
+#pragma omp target firstprivate(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+{}
+#pragma omp target firstprivate(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+{}
+#pragma omp target firstprivate(argc)
+{}
+#pragma omp target firstprivate(S1) // expected-error {{'S1' does not refer to a value}}
+{}
+#pragma omp target firstprivate(a, b) // expected-error {{firstprivate variable with incomplete type 'S1'}}
+{}
+#pragma omp target firstprivate(argv[1]) // expected-error {{expected variable name}}
+{}
+#pragma omp target firstprivate(e, g)
+{}
+#pragma omp target firstprivate(h) // expected-error {{threadprivate or thread local variable cannot be firstprivate}}
+{}
+#pragma omp target shared(i) // expected-error {{unexpected OpenMP clause 'shared' in directive '#pragma omp target'}}
+#pragma omp parallel
+  {
+    int v = 0;
+    int i;
+  }
+#pragma omp parallel shared(i)
+#pragma omp parallel firstprivate(i)
+#pragma omp target firstprivate(j)
+{}
+#pragma omp target firstprivate(i)
+  {}
+  return 0;
+}
+
+void bar(S4 a[2]) {
+#pragma omp parallel
+#pragma omp target firstprivate(a)
+  {}
+}
+
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
+int main(int argc, char **argv) {
+  S4 e(4);
+  S5 g(5);
+  S6<float> s6(0.0) , s6_0(1.0);
+  S7<S6<float> > s7(0.0) , s7_0(1.0);
+  int i;
+  int &j = i;
+#pragma omp target firstprivate // expected-error {{expected '(' after 'firstprivate'}}
+{}
+#pragma omp target firstprivate( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+{}
+#pragma omp target firstprivate() // expected-error {{expected expression}}
+{}
+#pragma omp target firstprivate(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+{}
+#pragma omp target firstprivate(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+{}
+#pragma omp target firstprivate(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+{}
+#pragma omp target firstprivate(argc)
+{}
+#pragma omp target firstprivate(S1) // expected-error {{'S1' does not refer to a value}}
+{}
+#pragma omp target firstprivate(a, b) // expected-error {{firstprivate variable with incomplete type 'S1'}}
+{}
+#pragma omp target firstprivate(argv[1]) // expected-error {{expected variable name}}
+{}
+#pragma omp target firstprivate(e, g)
+{}
+#pragma omp target firstprivate(h) // expected-error {{threadprivate or thread local variable cannot be firstprivate}}
+{}
+#pragma omp target firstprivate(B::x) // expected-error {{threadprivate or thread local variable cannot be firstprivate}}
+{}
+#pragma omp target shared(i) // expected-error {{unexpected OpenMP clause 'shared' in directive '#pragma omp target'}}
+#pragma omp parallel
+  {
+    int i;
+  }
+#pragma omp parallel shared(i)
+#pragma omp parallel firstprivate(i)
+#pragma omp target firstprivate(j)
+{}
+#pragma omp target firstprivate(i)
+  {}
+  static int si;
+#pragma omp target firstprivate(si) // OK
+  {}
+#pragma omp target map(i) firstprivate(i) // expected-error {{firstprivate variable cannot be in a map clause in '#pragma omp target' directive}}
+  {}
+  s6 = s6_0; // expected-note {{in instantiation of member function 'S6<float>::operator=' requested here}}
+  s7 = s7_0; // expected-note {{in instantiation of member function 'S7<S6<float> >::operator=' requested here}}
+  return foomain(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<int, char>' requested here}}
+}
+
diff --git a/test/OpenMP/target_if_messages.cpp b/test/OpenMP/target_if_messages.cpp
index 4ee7302282f6c..189256e60fea2 100644
--- a/test/OpenMP/target_if_messages.cpp
+++ b/test/OpenMP/target_if_messages.cpp
@@ -12,21 +12,37 @@ struct S1; // expected-note {{declared here}}
 template <class T, class S> // expected-note {{declared here}}
 int tmain(T argc, S **argv) {
   #pragma omp target if // expected-error {{expected '(' after 'if'}}
+  foo();
   #pragma omp target if ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
   #pragma omp target if () // expected-error {{expected expression}}
+  foo();
   #pragma omp target if (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
   #pragma omp target if (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target' are ignored}}
+  foo();
   #pragma omp target if (argc > 0 ? argv[1] : argv[2])
+  foo();
   #pragma omp target if (foobool(argc)), if (true) // expected-error {{directive '#pragma omp target' cannot contain more than one 'if' clause}}
+  foo();
   #pragma omp target if (S) // expected-error {{'S' does not refer to a value}}
+  foo();
   #pragma omp target if (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
   #pragma omp target if (argc argc) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
   #pragma omp target if(argc)
+  foo();
   #pragma omp target if(target : // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
   #pragma omp target if(target : argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
   #pragma omp target if(target : argc)
+  foo();
   #pragma omp target if(target : argc) if (for:argc) // expected-error {{directive name modifier 'for' is not allowed for '#pragma omp target'}}
+  foo();
   #pragma omp target if(target : argc) if (target:argc) // expected-error {{directive '#pragma omp target' cannot contain more than one 'if' clause with 'target' name modifier}}
+  foo();
   #pragma omp target if(target : argc) if (argc) // expected-error {{no more 'if' clause is allowed}} expected-note {{previous clause with directive name modifier specified here}}
   foo();
 
@@ -35,22 +51,39 @@ int tmain(T argc, S **argv) {
 
 int main(int argc, char **argv) {
   #pragma omp target if // expected-error {{expected '(' after 'if'}}
+  foo();
   #pragma omp target if ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
   #pragma omp target if () // expected-error {{expected expression}}
+  foo();
   #pragma omp target if (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
   #pragma omp target if (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target' are ignored}}
+  foo();
   #pragma omp target if (argc > 0 ? argv[1] : argv[2])
+  foo();
   #pragma omp target if (foobool(argc)), if (true) // expected-error {{directive '#pragma omp target' cannot contain more than one 'if' clause}}
+  foo();
   #pragma omp target if (S1) // expected-error {{'S1' does not refer to a value}}
+  foo();
   #pragma omp target if (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
   #pragma omp target if (argc argc) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
   #pragma omp target if (1 0) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
   #pragma omp target if(if(tmain(argc, argv) // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
   #pragma omp target if(target : // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
   #pragma omp target if(target : argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
   #pragma omp target if(target : argc)
+  foo();
   #pragma omp target if(target : argc) if (for:argc) // expected-error {{directive name modifier 'for' is not allowed for '#pragma omp target'}}
+  foo();
   #pragma omp target if(target : argc) if (target:argc) // expected-error {{directive '#pragma omp target' cannot contain more than one 'if' clause with 'target' name modifier}}
+  foo();
   #pragma omp target if(target : argc) if (argc) // expected-error {{no more 'if' clause is allowed}} expected-note {{previous clause with directive name modifier specified here}}
   foo();
 
diff --git a/test/OpenMP/target_is_device_ptr_ast_print.cpp b/test/OpenMP/target_is_device_ptr_ast_print.cpp
new file mode 100644
index 0000000000000..f519235a0b365
--- /dev/null
+++ b/test/OpenMP/target_is_device_ptr_ast_print.cpp
@@ -0,0 +1,294 @@
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+struct ST {
+  int *a;
+};
+typedef int arr[10];
+typedef ST STarr[10];
+struct SA {
+  const int da[5] = { 0 };
+  ST g[10];
+  STarr &rg = g;
+  int i;
+  int &j = i;
+  int *k = &j;
+  int *&z = k;
+  int aa[10];
+  arr &raa = aa;
+  void func(int arg) {
+#pragma omp target is_device_ptr(k)
+    {}
+#pragma omp target is_device_ptr(z)
+    {}
+#pragma omp target is_device_ptr(aa) // OK
+    {}
+#pragma omp target is_device_ptr(raa) // OK
+    {}
+#pragma omp target is_device_ptr(g) // OK
+    {}
+#pragma omp target is_device_ptr(rg) // OK
+    {}
+#pragma omp target is_device_ptr(da) // OK
+    {}
+  return;
+ }
+};
+// CHECK: struct SA
+// CHECK-NEXT: const int da[5] = {0};
+// CHECK-NEXT: ST g[10];
+// CHECK-NEXT: STarr &rg = this->g;
+// CHECK-NEXT: int i;
+// CHECK-NEXT: int &j = this->i;
+// CHECK-NEXT: int *k = &this->j;
+// CHECK-NEXT: int *&z = this->k;
+// CHECK-NEXT: int aa[10];
+// CHECK-NEXT: arr &raa = this->aa;
+// CHECK-NEXT: func(
+// CHECK-NEXT: #pragma omp target is_device_ptr(this->k)
+// CHECK-NEXT: {
+// CHECK-NEXT: }
+// CHECK-NEXT: #pragma omp target is_device_ptr(this->z)
+// CHECK-NEXT: {
+// CHECK-NEXT: }
+// CHECK-NEXT: #pragma omp target is_device_ptr(this->aa)
+// CHECK-NEXT: {
+// CHECK-NEXT: }
+// CHECK-NEXT: #pragma omp target is_device_ptr(this->raa)
+// CHECK-NEXT: {
+// CHECK-NEXT: }
+// CHECK-NEXT: #pragma omp target is_device_ptr(this->g)
+// CHECK-NEXT: {
+// CHECK-NEXT: }
+// CHECK-NEXT: #pragma omp target is_device_ptr(this->rg)
+// CHECK-NEXT: {
+// CHECK-NEXT: }
+// CHECK-NEXT: #pragma omp target is_device_ptr(this->da)
+
+struct SB {
+  unsigned A;
+  unsigned B;
+  float Arr[100];
+  float *Ptr;
+  float *foo() {
+    return &Arr[0];
+  }
+};
+
+struct SC {
+  unsigned A : 2;
+  unsigned B : 3;
+  unsigned C;
+  unsigned D;
+  float Arr[100];
+  SB S;
+  SB ArrS[100];
+  SB *PtrS;
+  SB *&RPtrS;
+  float *Ptr;
+
+  SC(SB *&_RPtrS) : RPtrS(_RPtrS) {}
+};
+
+union SD {
+  unsigned A;
+  float B;
+};
+
+struct S1;
+extern S1 a;
+class S2 {
+  mutable int a;
+public:
+  S2():a(0) { }
+  S2(S2 &s2):a(s2.a) { }
+  static float S2s;
+  static const float S2sc;
+};
+const float S2::S2sc = 0;
+const S2 b;
+const S2 ba[5];
+class S3 {
+  int a;
+public:
+  S3():a(0) { }
+  S3(S3 &s3):a(s3.a) { }
+};
+const S3 c;
+const S3 ca[5];
+extern const int f;
+class S4 {
+  int a;
+  S4();
+  S4(const S4 &s4);
+public:
+  S4(int v):a(v) { }
+};
+class S5 {
+  int a;
+  S5():a(0) {}
+  S5(const S5 &s5):a(s5.a) { }
+public:
+  S5(int v):a(v) { }
+};
+
+S3 h;
+#pragma omp threadprivate(h)
+
+typedef struct {
+  int a;
+} S6;
+
+template <typename T>
+T tmain(T argc) {
+  const T da[5] = { 0 };
+  S6 h[10];
+  auto &rh = h;
+  T i;
+  T &j = i;
+  T *k = &j;
+  T *&z = k;
+  T aa[10];
+  auto &raa = aa;
+#pragma omp target is_device_ptr(k)
+  {}
+#pragma omp target is_device_ptr(z)
+  {}
+#pragma omp target is_device_ptr(aa)
+  {}
+#pragma omp target is_device_ptr(raa)
+  {}
+#pragma omp target is_device_ptr(h)
+  {}
+#pragma omp target is_device_ptr(rh)
+  {}
+#pragma omp target is_device_ptr(da)
+  {}
+  return 0;
+}
+
+// CHECK: template <typename T = int> int tmain(int argc) {
+// CHECK-NEXT: const int da[5] = {0};
+// CHECK-NEXT: S6 h[10];
+// CHECK-NEXT: auto &rh = h;
+// CHECK-NEXT: int i;
+// CHECK-NEXT: int &j = i;
+// CHECK-NEXT: int *k = &j;
+// CHECK-NEXT: int *&z = k;
+// CHECK-NEXT: int aa[10];
+// CHECK-NEXT: auto &raa = aa;
+// CHECK-NEXT: #pragma omp target is_device_ptr(k)
+// CHECK-NEXT: {
+// CHECK-NEXT: }
+// CHECK-NEXT: #pragma omp target is_device_ptr(z)
+// CHECK-NEXT: {
+// CHECK-NEXT: }
+// CHECK-NEXT: #pragma omp target is_device_ptr(aa)
+// CHECK-NEXT: {
+// CHECK-NEXT: }
+// CHECK-NEXT: #pragma omp target is_device_ptr(raa)
+// CHECK-NEXT: {
+// CHECK-NEXT: }
+// CHECK-NEXT: #pragma omp target is_device_ptr(h)
+// CHECK-NEXT: {
+// CHECK-NEXT: }
+// CHECK-NEXT: #pragma omp target is_device_ptr(rh)
+// CHECK-NEXT: {
+// CHECK-NEXT: }
+// CHECK-NEXT: #pragma omp target is_device_ptr(da)
+
+// CHECK: template <typename T = int *> int *tmain(int *argc) {
+// CHECK-NEXT: int *const da[5] = {0};
+// CHECK-NEXT: S6 h[10];
+// CHECK-NEXT: auto &rh = h;
+// CHECK-NEXT: int *i;
+// CHECK-NEXT: int *&j = i;
+// CHECK-NEXT: int **k = &j;
+// CHECK-NEXT: int **&z = k;
+// CHECK-NEXT: int *aa[10];
+// CHECK-NEXT: auto &raa = aa;
+// CHECK-NEXT: #pragma omp target is_device_ptr(k)
+// CHECK-NEXT: {
+// CHECK-NEXT: }
+// CHECK-NEXT: #pragma omp target is_device_ptr(z)
+// CHECK-NEXT: {
+// CHECK-NEXT: }
+// CHECK-NEXT: #pragma omp target is_device_ptr(aa)
+// CHECK-NEXT: {
+// CHECK-NEXT: }
+// CHECK-NEXT: #pragma omp target is_device_ptr(raa)
+// CHECK-NEXT: {
+// CHECK-NEXT: }
+// CHECK-NEXT: #pragma omp target is_device_ptr(h)
+// CHECK-NEXT: {
+// CHECK-NEXT: }
+// CHECK-NEXT: #pragma omp target is_device_ptr(rh)
+// CHECK-NEXT: {
+// CHECK-NEXT: }
+// CHECK-NEXT: #pragma omp target is_device_ptr(da)
+
+// CHECK-LABEL: int main(int argc, char **argv) {
+int main(int argc, char **argv) {
+  const int da[5] = { 0 };
+  S6 h[10];
+  auto &rh = h;
+  int i;
+  int &j = i;
+  int *k = &j;
+  int *&z = k;
+  int aa[10];
+  auto &raa = aa;
+// CHECK-NEXT: const int da[5] = {0};
+// CHECK-NEXT: S6 h[10];
+// CHECK-NEXT: auto &rh = h;
+// CHECK-NEXT: int i;
+// CHECK-NEXT: int &j = i;
+// CHECK-NEXT: int *k = &j;
+// CHECK-NEXT: int *&z = k;
+// CHECK-NEXT: int aa[10];
+// CHECK-NEXT: auto &raa = aa;
+#pragma omp target is_device_ptr(k)
+// CHECK-NEXT: #pragma omp target is_device_ptr(k)
+  {}
+// CHECK-NEXT: {
+// CHECK-NEXT: }
+#pragma omp target is_device_ptr(z)
+// CHECK-NEXT: #pragma omp target is_device_ptr(z)
+  {}
+// CHECK-NEXT: {
+// CHECK-NEXT: }
+#pragma omp target is_device_ptr(aa)
+// CHECK-NEXT: #pragma omp target is_device_ptr(aa)
+  {}
+// CHECK-NEXT: {
+// CHECK-NEXT: }
+#pragma omp target is_device_ptr(raa)
+// CHECK-NEXT: #pragma omp target is_device_ptr(raa)
+  {}
+// CHECK-NEXT: {
+// CHECK-NEXT: }
+#pragma omp target is_device_ptr(h)
+// CHECK-NEXT: #pragma omp target is_device_ptr(h)
+  {}
+// CHECK-NEXT: {
+// CHECK-NEXT: }
+#pragma omp target is_device_ptr(rh)
+// CHECK-NEXT: #pragma omp target is_device_ptr(rh)
+  {}
+// CHECK-NEXT: {
+// CHECK-NEXT: }
+#pragma omp target is_device_ptr(da)
+// CHECK-NEXT: #pragma omp target is_device_ptr(da)
+  {}
+// CHECK-NEXT: {
+// CHECK-NEXT: }
+  return tmain<int>(argc) + *tmain<int *>(&argc);
+}
+
+
+#endif
diff --git a/test/OpenMP/target_is_device_ptr_messages.cpp b/test/OpenMP/target_is_device_ptr_messages.cpp
new file mode 100644
index 0000000000000..634e16114e1be
--- /dev/null
+++ b/test/OpenMP/target_is_device_ptr_messages.cpp
@@ -0,0 +1,234 @@
+// RUN: %clang_cc1 -std=c++11 -verify -fopenmp -ferror-limit 200 %s
+struct ST {
+  int *a;
+};
+typedef int arr[10];
+typedef ST STarr[10];
+struct SA {
+  const int d = 5;
+  const int da[5] = { 0 };
+  ST e;
+  ST g[10];
+  STarr &rg = g;
+  int i;
+  int &j = i;
+  int *k = &j;
+  int *&z = k;
+  int aa[10];
+  arr &raa = aa;
+  void func(int arg) {
+#pragma omp target is_device_ptr // expected-error {{expected '(' after 'is_device_ptr'}}
+    {}
+#pragma omp target is_device_ptr( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected expression}}
+    {}
+#pragma omp target is_device_ptr() // expected-error {{expected expression}}
+    {}
+#pragma omp target is_device_ptr(alloc) // expected-error {{use of undeclared identifier 'alloc'}}
+    {}
+#pragma omp target is_device_ptr(arg // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+    {}
+#pragma omp target is_device_ptr(i) // expected-error {{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+    {}
+#pragma omp target is_device_ptr(j) // expected-error {{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+    {}
+#pragma omp target is_device_ptr(k) // OK
+    {}
+#pragma omp target is_device_ptr(z) // OK
+    {}
+#pragma omp target is_device_ptr(aa) // OK
+    {}
+#pragma omp target is_device_ptr(raa) // OK
+    {}    
+#pragma omp target is_device_ptr(e) // expected-error{{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+    {}
+#pragma omp target is_device_ptr(g) // OK
+    {}
+#pragma omp target is_device_ptr(rg) // OK
+    {}
+#pragma omp target is_device_ptr(k,i,j) // expected-error2 {{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+    {}
+#pragma omp target is_device_ptr(d) // expected-error{{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+    {}
+#pragma omp target is_device_ptr(da) // OK
+    {}
+  return;
+ }
+};
+struct SB {
+  unsigned A;
+  unsigned B;
+  float Arr[100];
+  float *Ptr;
+  float *foo() {
+    return &Arr[0];
+  }
+};
+
+struct SC {
+  unsigned A : 2;
+  unsigned B : 3;
+  unsigned C;
+  unsigned D;
+  float Arr[100];
+  SB S;
+  SB ArrS[100];
+  SB *PtrS;
+  SB *&RPtrS;
+  float *Ptr;
+
+  SC(SB *&_RPtrS) : RPtrS(_RPtrS) {}
+};
+
+union SD {
+  unsigned A;
+  float B;
+};
+
+struct S1;
+extern S1 a;
+class S2 {
+  mutable int a;
+public:
+  S2():a(0) { }
+  S2(S2 &s2):a(s2.a) { }
+  static float S2s;
+  static const float S2sc;
+};
+const float S2::S2sc = 0;
+const S2 b;
+const S2 ba[5];
+class S3 {
+  int a;
+public:
+  S3():a(0) { }
+  S3(S3 &s3):a(s3.a) { }
+};
+const S3 c;
+const S3 ca[5];
+extern const int f;
+class S4 {
+  int a;
+  S4();
+  S4(const S4 &s4);
+public:
+  S4(int v):a(v) { }
+};
+class S5 {
+  int a;
+  S5():a(0) {}
+  S5(const S5 &s5):a(s5.a) { }
+public:
+  S5(int v):a(v) { }
+};
+
+S3 h;
+#pragma omp threadprivate(h)
+
+typedef struct {
+  int a;
+} S6;
+
+template <typename T, int I>
+T tmain(T argc) {
+  const T d = 5;
+  const T da[5] = { 0 };
+  S4 e(4);
+  S5 g(5);
+  S6 h[10];
+  auto &rh = h;
+  T i;
+  T &j = i;
+  T *k = &j;
+  T *&z = k;
+  T aa[10];
+  auto &raa = aa;
+#pragma omp target is_device_ptr // expected-error {{expected '(' after 'is_device_ptr'}}
+  {}
+#pragma omp target is_device_ptr( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected expression}}
+  {}
+#pragma omp target is_device_ptr() // expected-error {{expected expression}}
+  {}
+#pragma omp target is_device_ptr(alloc) // expected-error {{use of undeclared identifier 'alloc'}}
+  {}
+#pragma omp target is_device_ptr(argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error{{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+  {}
+#pragma omp target is_device_ptr(i) // expected-error {{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+  {}
+#pragma omp target is_device_ptr(j) // expected-error {{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+  {}
+#pragma omp target is_device_ptr(k) // OK
+  {}
+#pragma omp target is_device_ptr(z) // OK
+  {}
+#pragma omp target is_device_ptr(aa) // OK
+  {}
+#pragma omp target is_device_ptr(raa) // OK
+  {}
+#pragma omp target is_device_ptr(e) // expected-error{{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+  {}
+#pragma omp target is_device_ptr(g) // expected-error{{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+  {}
+#pragma omp target is_device_ptr(h) // OK
+  {}
+#pragma omp target is_device_ptr(rh) // OK
+  {}
+#pragma omp target is_device_ptr(k,i,j) // expected-error2 {{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+  {}
+#pragma omp target is_device_ptr(d) // expected-error{{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+  {}
+#pragma omp target is_device_ptr(da) // OK
+  {}
+  return 0;
+}
+
+int main(int argc, char **argv) {
+  const int d = 5;
+  const int da[5] = { 0 };
+  S4 e(4);
+  S5 g(5);
+  S6 h[10];
+  auto &rh = h;
+  int i;
+  int &j = i;
+  int *k = &j;
+  int *&z = k;
+  int aa[10];
+  auto &raa = aa;
+#pragma omp target is_device_ptr // expected-error {{expected '(' after 'is_device_ptr'}}
+  {}
+#pragma omp target is_device_ptr( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected expression}}
+  {}
+#pragma omp target is_device_ptr() // expected-error {{expected expression}}
+  {}
+#pragma omp target is_device_ptr(alloc) // expected-error {{use of undeclared identifier 'alloc'}}
+  {}
+#pragma omp target is_device_ptr(argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+  {}
+#pragma omp target is_device_ptr(i) // expected-error {{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+  {}
+#pragma omp target is_device_ptr(j) // expected-error {{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+  {}
+#pragma omp target is_device_ptr(k) // OK
+  {}
+#pragma omp target is_device_ptr(z) // OK
+  {}
+#pragma omp target is_device_ptr(aa) // OK
+  {}
+#pragma omp target is_device_ptr(raa) // OK
+  {}
+#pragma omp target is_device_ptr(e) // expected-error{{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+  {}
+#pragma omp target is_device_ptr(g) // expected-error{{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+  {}
+#pragma omp target is_device_ptr(h) // OK
+  {}
+#pragma omp target is_device_ptr(rh) // OK
+  {}
+#pragma omp target is_device_ptr(k,i,j) // expected-error2 {{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+  {}
+#pragma omp target is_device_ptr(d) // expected-error{{expected pointer, array, reference to pointer, or reference to array in 'is_device_ptr clause'}}
+  {}
+#pragma omp target is_device_ptr(da) // OK
+  {}
+  return tmain<int, 3>(argc); // expected-note {{in instantiation of function template specialization 'tmain<int, 3>' requested here}}
+}
diff --git a/test/OpenMP/target_map_codegen.cpp b/test/OpenMP/target_map_codegen.cpp
index 942cc4c09fd30..626f68d5a2a94 100644
--- a/test/OpenMP/target_map_codegen.cpp
+++ b/test/OpenMP/target_map_codegen.cpp
@@ -7,17 +7,17 @@
 ///
 
 ///==========================================================================///
-// RUN: %clang_cc1 -DCK1 -verify -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-64
-// RUN: %clang_cc1 -DCK1 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK1 --check-prefix CK1-64
-// RUN: %clang_cc1 -DCK1 -verify -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK1 --check-prefix CK1-32
-// RUN: %clang_cc1 -DCK1 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK1 --check-prefix CK1-32
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-64
+// RUN: %clang_cc1 -DCK1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK1 --check-prefix CK1-64
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK1 --check-prefix CK1-32
+// RUN: %clang_cc1 -DCK1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK1 --check-prefix CK1-32
 #ifdef CK1
 
 // CK1-DAG: [[SIZES:@.+]] = {{.+}}constant [1 x i[[sz:64|32]]] [i{{64|32}} 4]
-// Map types: OMP_MAP_BYCOPY = 128
-// CK1-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 128]
+// Map types: OMP_MAP_PRIVATE_VAL | OMP_MAP_IS_FIRST = 288
+// CK1-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 288]
 
 // CK1-LABEL: implicit_maps_integer
 void implicit_maps_integer (int a){
@@ -52,17 +52,17 @@ void implicit_maps_integer (int a){
 
 #endif
 ///==========================================================================///
-// RUN: %clang_cc1 -DCK2 -verify -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-64
-// RUN: %clang_cc1 -DCK2 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK2 --check-prefix CK2-64
-// RUN: %clang_cc1 -DCK2 -verify -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK2 --check-prefix CK2-32
-// RUN: %clang_cc1 -DCK2 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK2 --check-prefix CK2-32
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-64
+// RUN: %clang_cc1 -DCK2 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK2 --check-prefix CK2-64
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK2 --check-prefix CK2-32
+// RUN: %clang_cc1 -DCK2 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK2 --check-prefix CK2-32
 #ifdef CK2
 
 // CK2-DAG: [[SIZES:@.+]] = {{.+}}constant [1 x i[[sz:64|32]]] [i{{64|32}} 4]
-// Map types: OMP_MAP_BYCOPY = 128
-// CK2-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 128]
+// Map types: OMP_MAP_PRIVATE_VAL | OMP_MAP_IS_FIRST = 288
+// CK2-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 288]
 
 // CK2-LABEL: implicit_maps_integer_reference
 void implicit_maps_integer_reference (int a){
@@ -101,17 +101,17 @@ void implicit_maps_integer_reference (int a){
 
 #endif
 ///==========================================================================///
-// RUN: %clang_cc1 -DCK3 -verify -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK3 --check-prefix CK3-64
-// RUN: %clang_cc1 -DCK3 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK3 --check-prefix CK3-64
-// RUN: %clang_cc1 -DCK3 -verify -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK3 --check-prefix CK3-32
-// RUN: %clang_cc1 -DCK3 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK3 --check-prefix CK3-32
+// RUN: %clang_cc1 -DCK3 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK3 --check-prefix CK3-64
+// RUN: %clang_cc1 -DCK3 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK3 --check-prefix CK3-64
+// RUN: %clang_cc1 -DCK3 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK3 --check-prefix CK3-32
+// RUN: %clang_cc1 -DCK3 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK3 --check-prefix CK3-32
 #ifdef CK3
 
 // CK3-DAG: [[SIZES:@.+]] = {{.+}}constant [1 x i[[sz:64|32]]] [i{{64|32}} 4]
-// Map types: OMP_MAP_BYCOPY = 128
-// CK3-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 128]
+// Map types: OMP_MAP_PRIVATE_VAL | OMP_MAP_IS_FIRST = 288
+// CK3-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 288]
 
 // CK3-LABEL: implicit_maps_parameter
 void implicit_maps_parameter (int a){
@@ -145,17 +145,17 @@ void implicit_maps_parameter (int a){
 
 #endif
 ///==========================================================================///
-// RUN: %clang_cc1 -DCK4 -verify -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK4 --check-prefix CK4-64
-// RUN: %clang_cc1 -DCK4 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK4 --check-prefix CK4-64
-// RUN: %clang_cc1 -DCK4 -verify -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK4 --check-prefix CK4-32
-// RUN: %clang_cc1 -DCK4 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK4 --check-prefix CK4-32
+// RUN: %clang_cc1 -DCK4 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK4 --check-prefix CK4-64
+// RUN: %clang_cc1 -DCK4 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK4 --check-prefix CK4-64
+// RUN: %clang_cc1 -DCK4 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK4 --check-prefix CK4-32
+// RUN: %clang_cc1 -DCK4 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK4 --check-prefix CK4-32
 #ifdef CK4
 
 // CK4-DAG: [[SIZES:@.+]] = {{.+}}constant [1 x i[[sz:64|32]]] [i{{64|32}} 4]
-// Map types: OMP_MAP_BYCOPY = 128
-// CK4-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 128]
+// Map types: OMP_MAP_PRIVATE_VAL | OMP_MAP_IS_FIRST = 288
+// CK4-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 288]
 
 // CK4-LABEL: implicit_maps_nested_integer
 void implicit_maps_nested_integer (int a){
@@ -201,17 +201,17 @@ void implicit_maps_nested_integer (int a){
 // CK4: define internal void [[KERNELP2]](i32* {{[^,]+}}, i32* {{[^,]+}}, i32* {{[^,]+}})
 #endif
 ///==========================================================================///
-// RUN: %clang_cc1 -DCK5 -verify -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK5 --check-prefix CK5-64
-// RUN: %clang_cc1 -DCK5 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK5 --check-prefix CK5-64
-// RUN: %clang_cc1 -DCK5 -verify -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK5 --check-prefix CK5-32
-// RUN: %clang_cc1 -DCK5 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK5 --check-prefix CK5-32
+// RUN: %clang_cc1 -DCK5 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK5 --check-prefix CK5-64
+// RUN: %clang_cc1 -DCK5 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK5 --check-prefix CK5-64
+// RUN: %clang_cc1 -DCK5 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK5 --check-prefix CK5-32
+// RUN: %clang_cc1 -DCK5 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK5 --check-prefix CK5-32
 #ifdef CK5
 
 // CK5-DAG: [[SIZES:@.+]] = {{.+}}constant [1 x i[[sz:64|32]]] [i{{64|32}} 4]
-// Map types: OMP_MAP_BYCOPY = 128
-// CK5-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 128]
+// Map types: OMP_MAP_PRIVATE_VAL | OMP_MAP_IS_FIRST = 288
+// CK5-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 288]
 
 // CK5-LABEL: implicit_maps_nested_integer_and_enum
 void implicit_maps_nested_integer_and_enum (int a){
@@ -252,17 +252,17 @@ void implicit_maps_nested_integer_and_enum (int a){
 
 #endif
 ///==========================================================================///
-// RUN: %clang_cc1 -DCK6 -verify -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK6 --check-prefix CK6-64
-// RUN: %clang_cc1 -DCK6 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK6 --check-prefix CK6-64
-// RUN: %clang_cc1 -DCK6 -verify -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK6 --check-prefix CK6-32
-// RUN: %clang_cc1 -DCK6 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK6 --check-prefix CK6-32
+// RUN: %clang_cc1 -DCK6 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK6 --check-prefix CK6-64
+// RUN: %clang_cc1 -DCK6 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK6 --check-prefix CK6-64
+// RUN: %clang_cc1 -DCK6 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK6 --check-prefix CK6-32
+// RUN: %clang_cc1 -DCK6 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK6 --check-prefix CK6-32
 #ifdef CK6
 // CK6-DAG: [[GBL:@Gi]] = global i32 0
 // CK6-DAG: [[SIZES:@.+]] = {{.+}}constant [1 x i[[sz:64|32]]] [i{{64|32}} 4]
-// Map types: OMP_MAP_BYCOPY = 128
-// CK6-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 128]
+// Map types: OMP_MAP_PRIVATE_VAL | OMP_MAP_IS_FIRST = 288
+// CK6-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 288]
 
 // CK6-LABEL: implicit_maps_host_global
 int Gi;
@@ -298,22 +298,22 @@ void implicit_maps_host_global (int a){
 
 #endif
 ///==========================================================================///
-// RUN: %clang_cc1 -DCK7 -verify -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK7 --check-prefix CK7-64
-// RUN: %clang_cc1 -DCK7 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK7  --check-prefix CK7-64
-// RUN: %clang_cc1 -DCK7 -verify -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK7  --check-prefix CK7-32
-// RUN: %clang_cc1 -DCK7 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK7  --check-prefix CK7-32
+// RUN: %clang_cc1 -DCK7 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK7 --check-prefix CK7-64
+// RUN: %clang_cc1 -DCK7 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK7  --check-prefix CK7-64
+// RUN: %clang_cc1 -DCK7 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK7  --check-prefix CK7-32
+// RUN: %clang_cc1 -DCK7 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK7  --check-prefix CK7-32
 #ifdef CK7
 
 // For a 32-bit targets, the value doesn't fit the size of the pointer,
 // therefore it is passed by reference with a map 'to' specification.
 
 // CK7-DAG: [[SIZES:@.+]] = {{.+}}constant [1 x i[[sz:64|32]]] [i{{64|32}} 8]
-// Map types: OMP_MAP_BYCOPY = 128
-// CK7-64-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 128]
-// Map types: OMP_MAP_TO = 1
-// CK7-32-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 1]
+// Map types: OMP_MAP_PRIVATE_VAL | OMP_MAP_IS_FIRST = 288
+// CK7-64-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 288]
+// Map types: OMP_MAP_TO  | OMP_MAP_IS_FIRST = 33
+// CK7-32-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 33]
 
 // CK7-LABEL: implicit_maps_double
 void implicit_maps_double (int a){
@@ -360,17 +360,17 @@ void implicit_maps_double (int a){
 
 #endif
 ///==========================================================================///
-// RUN: %clang_cc1 -DCK8 -verify -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK8
-// RUN: %clang_cc1 -DCK8 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK8
-// RUN: %clang_cc1 -DCK8 -verify -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK8
-// RUN: %clang_cc1 -DCK8 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK8
+// RUN: %clang_cc1 -DCK8 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK8
+// RUN: %clang_cc1 -DCK8 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK8
+// RUN: %clang_cc1 -DCK8 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK8
+// RUN: %clang_cc1 -DCK8 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK8
 #ifdef CK8
 
 // CK8-DAG: [[SIZES:@.+]] = {{.+}}constant [1 x i[[sz:64|32]]] [i{{64|32}} 4]
-// Map types: OMP_MAP_BYCOPY = 128
-// CK8-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 128]
+// Map types: OMP_MAP_PRIVATE_VAL | OMP_MAP_IS_FIRST = 288
+// CK8-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 288]
 
 // CK8-LABEL: implicit_maps_float
 void implicit_maps_float (int a){
@@ -404,17 +404,17 @@ void implicit_maps_float (int a){
 
 #endif
 ///==========================================================================///
-// RUN: %clang_cc1 -DCK9 -verify -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK9
-// RUN: %clang_cc1 -DCK9 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK9
-// RUN: %clang_cc1 -DCK9 -verify -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK9
-// RUN: %clang_cc1 -DCK9 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK9
+// RUN: %clang_cc1 -DCK9 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK9
+// RUN: %clang_cc1 -DCK9 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK9
+// RUN: %clang_cc1 -DCK9 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK9
+// RUN: %clang_cc1 -DCK9 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK9
 #ifdef CK9
 
 // CK9-DAG: [[SIZES:@.+]] = {{.+}}constant [1 x i[[sz:64|32]]] [i{{64|32}} 16]
-// Map types: OMP_MAP_TO + OMP_MAP_FROM = 2 + 1
-// CK9-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 3]
+// Map types: OMP_MAP_TO + OMP_MAP_FROM + OMP_MAP_IS_FIRST = 35
+// CK9-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 35]
 
 // CK9-LABEL: implicit_maps_array
 void implicit_maps_array (int a){
@@ -445,17 +445,17 @@ void implicit_maps_array (int a){
 // CK9: {{.+}} = getelementptr inbounds [2 x double], [2 x double]* [[REF]], i[[sz]] 0, i[[sz]] 0
 #endif
 ///==========================================================================///
-// RUN: %clang_cc1 -DCK10 -verify -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK10
-// RUN: %clang_cc1 -DCK10 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK10
-// RUN: %clang_cc1 -DCK10 -verify -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK10
-// RUN: %clang_cc1 -DCK10 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK10
+// RUN: %clang_cc1 -DCK10 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK10
+// RUN: %clang_cc1 -DCK10 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK10
+// RUN: %clang_cc1 -DCK10 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK10
+// RUN: %clang_cc1 -DCK10 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK10
 #ifdef CK10
 
-// CK10-DAG: [[SIZES:@.+]] = {{.+}}constant [1 x i[[sz:64|32]]] [i{{64|32}} {{8|4}}]
-// Map types: OMP_MAP_BYCOPY | OMP_MAP_PTR = 128 + 32
-// CK10-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 160]
+// CK10-DAG: [[SIZES:@.+]] = {{.+}}constant [1 x i[[sz:64|32]]] zeroinitializer
+// Map types: OMP_MAP_IS_FIRST = 32
+// CK10-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 32]
 
 // CK10-LABEL: implicit_maps_pointer
 void implicit_maps_pointer (){
@@ -487,17 +487,17 @@ void implicit_maps_pointer (){
 
 #endif
 ///==========================================================================///
-// RUN: %clang_cc1 -DCK11 -verify -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK11
-// RUN: %clang_cc1 -DCK11 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK11
-// RUN: %clang_cc1 -DCK11 -verify -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK11
-// RUN: %clang_cc1 -DCK11 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK11
+// RUN: %clang_cc1 -DCK11 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK11
+// RUN: %clang_cc1 -DCK11 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK11
+// RUN: %clang_cc1 -DCK11 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK11
+// RUN: %clang_cc1 -DCK11 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK11
 #ifdef CK11
 
 // CK11-DAG: [[SIZES:@.+]] = {{.+}}constant [1 x i[[sz:64|32]]] [i{{64|32}} 16]
-// Map types: OMP_MAP_TO = 1
-// CK11-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 1]
+// Map types: OMP_MAP_TO + OMP_MAP_IS_FIRST = 33
+// CK11-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 33]
 
 // CK11-LABEL: implicit_maps_double_complex
 void implicit_maps_double_complex (int a){
@@ -527,22 +527,22 @@ void implicit_maps_double_complex (int a){
 // CK11: {{.+}} = getelementptr inbounds { double, double }, { double, double }* [[REF]], i32 0, i32 0
 #endif
 ///==========================================================================///
-// RUN: %clang_cc1 -DCK12 -verify -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK12 --check-prefix CK12-64
-// RUN: %clang_cc1 -DCK12 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK12 --check-prefix CK12-64
-// RUN: %clang_cc1 -DCK12 -verify -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK12 --check-prefix CK12-32
-// RUN: %clang_cc1 -DCK12 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK12 --check-prefix CK12-32
+// RUN: %clang_cc1 -DCK12 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK12 --check-prefix CK12-64
+// RUN: %clang_cc1 -DCK12 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK12 --check-prefix CK12-64
+// RUN: %clang_cc1 -DCK12 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK12 --check-prefix CK12-32
+// RUN: %clang_cc1 -DCK12 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK12 --check-prefix CK12-32
 #ifdef CK12
 
 // For a 32-bit targets, the value doesn't fit the size of the pointer,
 // therefore it is passed by reference with a map 'to' specification.
 
 // CK12-DAG: [[SIZES:@.+]] = {{.+}}constant [1 x i[[sz:64|32]]] [i{{64|32}} 8]
-// Map types: OMP_MAP_BYCOPY = 128
-// CK12-64-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 128]
-// Map types: OMP_MAP_TO = 1
-// CK12-32-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 1]
+// Map types: OMP_MAP_PRIVATE_VAL + OMP_MAP_IS_FIRST = 288
+// CK12-64-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 288]
+// Map types: OMP_MAP_TO + OMP_MAP_IS_FIRST = 33
+// CK12-32-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 33]
 
 // CK12-LABEL: implicit_maps_float_complex
 void implicit_maps_float_complex (int a){
@@ -588,20 +588,20 @@ void implicit_maps_float_complex (int a){
 // CK12-32: {{.+}} = getelementptr inbounds { float, float }, { float, float }* [[REF]], i32 0, i32 0
 #endif
 ///==========================================================================///
-// RUN: %clang_cc1 -DCK13 -verify -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK13
-// RUN: %clang_cc1 -DCK13 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK13
-// RUN: %clang_cc1 -DCK13 -verify -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK13
-// RUN: %clang_cc1 -DCK13 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK13
+// RUN: %clang_cc1 -DCK13 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK13
+// RUN: %clang_cc1 -DCK13 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK13
+// RUN: %clang_cc1 -DCK13 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK13
+// RUN: %clang_cc1 -DCK13 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK13
 #ifdef CK13
 
 // We don't have a constant map size for VLAs.
 // Map types:
-//  - OMP_MAP_BYCOPY = 128 (vla size)
-//  - OMP_MAP_BYCOPY = 128 (vla size)
-//  - OMP_MAP_TO + OMP_MAP_FROM = 2 + 1
-// CK13-DAG: [[TYPES:@.+]] = {{.+}}constant [3 x i32] [i32 128, i32 128, i32 3]
+//  - OMP_MAP_PRIVATE_VAL + OMP_MAP_IS_FIRST = 288 (vla size)
+//  - OMP_MAP_PRIVATE_VAL + OMP_MAP_IS_FIRST = 288 (vla size)
+//  - OMP_MAP_TO + OMP_MAP_FROM + OMP_MAP_IS_FIRST = 35
+// CK13-DAG: [[TYPES:@.+]] = {{.+}}constant [3 x i32] [i32 288, i32 288, i32 35]
 
 // CK13-LABEL: implicit_maps_variable_length_array
 void implicit_maps_variable_length_array (int a){
@@ -658,20 +658,20 @@ void implicit_maps_variable_length_array (int a){
 // CK13: {{.+}} = getelementptr inbounds double, double* [[REF]], i[[sz]] %{{.+}}
 #endif
 ///==========================================================================///
-// RUN: %clang_cc1 -DCK14 -verify -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK14 --check-prefix CK14-64
-// RUN: %clang_cc1 -DCK14 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK14 --check-prefix CK14-64
-// RUN: %clang_cc1 -DCK14 -verify -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK14 --check-prefix CK14-32
-// RUN: %clang_cc1 -DCK14 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK14 --check-prefix CK14-32
+// RUN: %clang_cc1 -DCK14 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK14 --check-prefix CK14-64
+// RUN: %clang_cc1 -DCK14 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK14 --check-prefix CK14-64
+// RUN: %clang_cc1 -DCK14 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK14 --check-prefix CK14-32
+// RUN: %clang_cc1 -DCK14 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK14 --check-prefix CK14-32
 #ifdef CK14
 
 // CK14-DAG: [[ST:%.+]] = type { i32, double }
 // CK14-DAG: [[SIZES:@.+]] = {{.+}}constant [2 x i[[sz:64|32]]] [i{{64|32}} {{16|12}}, i{{64|32}} 4]
 // Map types:
-// - OMP_MAP_TO | OMP_MAP_FROM = 1 + 2
-// - OMP_MAP_BYCOPY = 128
-// CK14-DAG: [[TYPES:@.+]] = {{.+}}constant [2 x i32] [i32 3, i32 128]
+// - OMP_MAP_TO + OMP_MAP_FROM + OMP_MAP_IS_FIRST = 35
+// - OMP_MAP_PRIVATE_VAL + OMP_MAP_IS_FIRST = 288
+// CK14-DAG: [[TYPES:@.+]] = {{.+}}constant [2 x i32] [i32 35, i32 288]
 
 class SSS {
 public:
@@ -732,26 +732,26 @@ void implicit_maps_class (int a){
 
 #endif
 ///==========================================================================///
-// RUN: %clang_cc1 -DCK15 -verify -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK15 --check-prefix CK15-64
-// RUN: %clang_cc1 -DCK15 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK15 --check-prefix CK15-64
-// RUN: %clang_cc1 -DCK15 -verify -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK15 --check-prefix CK15-32
-// RUN: %clang_cc1 -DCK15 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK15 --check-prefix CK15-32
+// RUN: %clang_cc1 -DCK15 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK15 --check-prefix CK15-64
+// RUN: %clang_cc1 -DCK15 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK15 --check-prefix CK15-64
+// RUN: %clang_cc1 -DCK15 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK15 --check-prefix CK15-32
+// RUN: %clang_cc1 -DCK15 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK15 --check-prefix CK15-32
 #ifdef CK15
 
 // CK15: [[ST:%.+]] = type { i32, double, i32* }
 // CK15: [[SIZES:@.+]] = {{.+}}constant [2 x i[[sz:64|32]]] [i{{64|32}} {{24|16}}, i{{64|32}} 4]
 // Map types:
-// - OMP_MAP_TO | OMP_MAP_FROM = 1 + 2
-// - OMP_MAP_BYCOPY = 128
-// CK15: [[TYPES:@.+]] = {{.+}}constant [2 x i32] [i32 3, i32 128]
+// - OMP_MAP_TO + OMP_MAP_FROM + OMP_MAP_IS_FIRST = 35
+// - OMP_MAP_PRIVATE_VAL + OMP_MAP_IS_FIRST = 288
+// CK15: [[TYPES:@.+]] = {{.+}}constant [2 x i32] [i32 35, i32 288]
 
 // CK15: [[SIZES2:@.+]] = {{.+}}constant [2 x i[[sz]]] [i{{64|32}} {{24|16}}, i{{64|32}} 4]
 // Map types:
-// - OMP_MAP_TO | OMP_MAP_FROM = 1 + 2
-// - OMP_MAP_BYCOPY = 128
-// CK15: [[TYPES2:@.+]] = {{.+}}constant [2 x i32] [i32 3, i32 128]
+// - OMP_MAP_TO + OMP_MAP_FROM + OMP_MAP_IS_FIRST = 35
+// - OMP_MAP_PRIVATE_VAL + OMP_MAP_IS_FIRST = 288
+// CK15: [[TYPES2:@.+]] = {{.+}}constant [2 x i32] [i32 35, i32 288]
 
 template<int x>
 class SSST {
@@ -860,18 +860,18 @@ void implicit_maps_templated_class (int a){
 
 #endif
 ///==========================================================================///
-// RUN: %clang_cc1 -DCK16 -verify -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK16 --check-prefix CK16-64
-// RUN: %clang_cc1 -DCK16 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK16 --check-prefix CK16-64
-// RUN: %clang_cc1 -DCK16 -verify -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK16 --check-prefix CK16-32
-// RUN: %clang_cc1 -DCK16 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK16 --check-prefix CK16-32
+// RUN: %clang_cc1 -DCK16 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK16 --check-prefix CK16-64
+// RUN: %clang_cc1 -DCK16 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK16 --check-prefix CK16-64
+// RUN: %clang_cc1 -DCK16 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK16 --check-prefix CK16-32
+// RUN: %clang_cc1 -DCK16 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK16 --check-prefix CK16-32
 #ifdef CK16
 
 // CK16-DAG: [[SIZES:@.+]] = {{.+}}constant [1 x i[[sz:64|32]]] [i{{64|32}} 4]
 // Map types:
-// - OMP_MAP_BYCOPY = 128
-// CK16-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 128]
+// - OMP_MAP_PRIVATE_VAL + OMP_MAP_IS_FIRST = 288
+// CK16-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 288]
 
 template<int y>
 int foo(int d) {
@@ -913,18 +913,18 @@ void implicit_maps_templated_function (int a){
 
 #endif
 ///==========================================================================///
-// RUN: %clang_cc1 -DCK17 -verify -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK17
-// RUN: %clang_cc1 -DCK17 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK17
-// RUN: %clang_cc1 -DCK17 -verify -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK17
-// RUN: %clang_cc1 -DCK17 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK17
+// RUN: %clang_cc1 -DCK17 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK17
+// RUN: %clang_cc1 -DCK17 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK17
+// RUN: %clang_cc1 -DCK17 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK17
+// RUN: %clang_cc1 -DCK17 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK17
 #ifdef CK17
 
 // CK17-DAG: [[ST:%.+]] = type { i32, double }
 // CK17-DAG: [[SIZES:@.+]] = {{.+}}constant [1 x i[[sz:64|32]]] [i{{64|32}} {{16|12}}]
-// Map types: OMP_MAP_TO + OMP_MAP_FROM = 2 + 1
-// CK17-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 3]
+// Map types: OMP_MAP_TO + OMP_MAP_FROM + OMP_MAP_IS_FIRST = 35
+// CK17-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 35]
 
 class SSS {
 public:
@@ -961,18 +961,18 @@ void implicit_maps_struct (int a){
 // CK17: {{.+}} = getelementptr inbounds [[ST]], [[ST]]* [[REF]], i32 0, i32 0
 #endif
 ///==========================================================================///
-// RUN: %clang_cc1 -DCK18 -verify -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK18 --check-prefix CK18-64
-// RUN: %clang_cc1 -DCK18 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK18 --check-prefix CK18-64
-// RUN: %clang_cc1 -DCK18 -verify -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK18 --check-prefix CK18-32
-// RUN: %clang_cc1 -DCK18 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -omptargets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK18 --check-prefix CK18-32
+// RUN: %clang_cc1 -DCK18 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK18 --check-prefix CK18-64
+// RUN: %clang_cc1 -DCK18 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK18 --check-prefix CK18-64
+// RUN: %clang_cc1 -DCK18 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK18 --check-prefix CK18-32
+// RUN: %clang_cc1 -DCK18 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK18 --check-prefix CK18-32
 #ifdef CK18
 
 // CK18-DAG: [[SIZES:@.+]] = {{.+}}constant [1 x i[[sz:64|32]]] [i{{64|32}} 4]
 // Map types:
-// - OMP_MAP_BYCOPY = 128
-// CK18-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 128]
+// - OMP_MAP_PRIVATE_VAL + OMP_MAP_IS_FIRST = 288
+// CK18-DAG: [[TYPES:@.+]] = {{.+}}constant [1 x i32] [i32 288]
 
 template<typename T>
 int foo(T d) {
@@ -1012,4 +1012,3463 @@ void implicit_maps_template_type_capture (int a){
 // CK18-32: {{.+}} = load i32, i32* [[ADDR]],
 
 #endif
+///==========================================================================///
+// RUN: %clang_cc1 -DCK19 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK19 --check-prefix CK19-64
+// RUN: %clang_cc1 -DCK19 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK19 --check-prefix CK19-64
+// RUN: %clang_cc1 -DCK19 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK19 --check-prefix CK19-32
+// RUN: %clang_cc1 -DCK19 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK19 --check-prefix CK19-32
+#ifdef CK19
+
+// CK19: [[SIZE00:@.+]] = private {{.*}}constant [1 x i[[Z:64|32]]] [i[[Z:64|32]] 4]
+// CK19: [[MTYPE00:@.+]] = private {{.*}}constant [1 x i32] [i32 32]
+
+// CK19: [[SIZE01:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 400]
+// CK19: [[MTYPE01:@.+]] = private {{.*}}constant [1 x i32] [i32 33]
+
+// CK19: [[SIZE02:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 240]
+// CK19: [[MTYPE02:@.+]] = private {{.*}}constant [1 x i32] [i32 34]
+
+// CK19: [[SIZE03:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 240]
+// CK19: [[MTYPE03:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK19: [[SIZE04:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 400]
+// CK19: [[MTYPE04:@.+]] = private {{.*}}constant [1 x i32] [i32 32]
+
+// CK19: [[SIZE05:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 4]
+// CK19: [[MTYPE05:@.+]] = private {{.*}}constant [1 x i32] [i32 33]
+
+// CK19: [[MTYPE06:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK19: [[MTYPE07:@.+]] = private {{.*}}constant [1 x i32] [i32 32]
+
+// CK19: [[SIZE08:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 4]
+// CK19: [[MTYPE08:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK19: [[SIZE09:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] {{8|4}}]
+// CK19: [[MTYPE09:@.+]] = private {{.*}}constant [1 x i32] [i32 34]
+
+// CK19: [[SIZE10:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 240]
+// CK19: [[MTYPE10:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK19: [[SIZE11:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 240]
+// CK19: [[MTYPE11:@.+]] = private {{.*}}constant [1 x i32] [i32 32]
+
+// CK19: [[SIZE12:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 4]
+// CK19: [[MTYPE12:@.+]] = private {{.*}}constant [1 x i32] [i32 33]
+
+// CK19: [[MTYPE13:@.+]] = private {{.*}}constant [1 x i32] [i32 32]
+
+// CK19: [[MTYPE14:@.+]] = private {{.*}}constant [1 x i32] [i32 33]
+
+// CK19: [[SIZE15:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 4]
+// CK19: [[MTYPE15:@.+]] = private {{.*}}constant [1 x i32] [i32 34]
+
+// CK19: [[MTYPE16:@.+]] = private {{.*}}constant [2 x i32] [i32 288, i32 33]
+
+// CK19: [[SIZE17:@.+]] = private {{.*}}constant [2 x i[[Z]]] [i[[Z]] {{8|4}}, i[[Z]] 240]
+// CK19: [[MTYPE17:@.+]] = private {{.*}}constant [2 x i32] [i32 288, i32 34]
+
+// CK19: [[SIZE18:@.+]] = private {{.*}}constant [2 x i[[Z]]] [i[[Z]] {{8|4}}, i[[Z]] 240]
+// CK19: [[MTYPE18:@.+]] = private {{.*}}constant [2 x i32] [i32 288, i32 35]
+
+// CK19: [[MTYPE19:@.+]] = private {{.*}}constant [2 x i32] [i32 288, i32 32]
+
+// CK19: [[SIZE20:@.+]] = private {{.*}}constant [2 x i[[Z]]] [i[[Z]] {{8|4}}, i[[Z]] 4]
+// CK19: [[MTYPE20:@.+]] = private {{.*}}constant [2 x i32] [i32 288, i32 33]
+
+// CK19: [[MTYPE21:@.+]] = private {{.*}}constant [2 x i32] [i32 288, i32 35]
+
+// CK19: [[SIZE22:@.+]] = private {{.*}}constant [2 x i[[Z]]] [i[[Z]] {{8|4}}, i[[Z]] 4]
+// CK19: [[MTYPE22:@.+]] = private {{.*}}constant [2 x i32] [i32 288, i32 35]
+
+// CK19: [[SIZE23:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 4]
+// CK19: [[MTYPE23:@.+]] = private {{.*}}constant [1 x i32] [i32 39]
+
+// CK19: [[SIZE24:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 480]
+// CK19: [[MTYPE24:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK19: [[SIZE25:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 16]
+// CK19: [[MTYPE25:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK19: [[SIZE26:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 24]
+// CK19: [[MTYPE26:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK19: [[SIZE27:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 4]
+// CK19: [[MTYPE27:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK19: [[SIZE28:@.+]] = private {{.*}}constant [3 x i[[Z]]] [i[[Z]] {{8|4}}, i[[Z]] {{8|4}}, i[[Z]] 16]
+// CK19: [[MTYPE28:@.+]] = private {{.*}}constant [3 x i32] [i32 35, i32 19, i32 19]
+
+// CK19: [[SIZE29:@.+]] = private {{.*}}constant [3 x i[[Z]]] [i[[Z]] {{8|4}}, i[[Z]] {{8|4}}, i[[Z]] 4]
+// CK19: [[MTYPE29:@.+]] = private {{.*}}constant [3 x i32] [i32 35, i32 19, i32 19]
+
+// CK19: [[MTYPE30:@.+]] = private {{.*}}constant [4 x i32] [i32 288, i32 288, i32 288, i32 35]
+
+// CK19: [[SIZE31:@.+]] = private {{.*}}constant [4 x i[[Z]]] [i[[Z]] {{8|4}}, i[[Z]] {{8|4}}, i[[Z]] {{8|4}}, i[[Z]] 40]
+// CK19: [[MTYPE31:@.+]] = private {{.*}}constant [4 x i32] [i32 288, i32 288, i32 288, i32 35]
+
+// CK19: [[SIZE32:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 13728]
+// CK19: [[MTYPE32:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK19: [[SIZE33:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 13728]
+// CK19: [[MTYPE33:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK19: [[SIZE34:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 13728]
+// CK19: [[MTYPE34:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK19: [[MTYPE35:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK19: [[SIZE36:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 208]
+// CK19: [[MTYPE36:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK19: [[MTYPE37:@.+]] = private {{.*}}constant [3 x i32] [i32 288, i32 288, i32 35]
+
+// CK19: [[MTYPE38:@.+]] = private {{.*}}constant [3 x i32] [i32 288, i32 288, i32 35]
+
+// CK19: [[MTYPE39:@.+]] = private {{.*}}constant [3 x i32] [i32 288, i32 288, i32 35]
+
+// CK19: [[MTYPE40:@.+]] = private {{.*}}constant [3 x i32] [i32 288, i32 288, i32 35]
+
+// CK19: [[SIZE41:@.+]] = private {{.*}}constant [3 x i[[Z]]] [i[[Z]] {{8|4}}, i[[Z]] {{8|4}}, i[[Z]] 208]
+// CK19: [[MTYPE41:@.+]] = private {{.*}}constant [3 x i32] [i32 288, i32 288, i32 35]
+
+// CK19: [[SIZE42:@.+]] = private {{.*}}constant [3 x i[[Z]]] [i[[Z]] {{8|4}}, i[[Z]] {{8|4}}, i[[Z]] 104]
+// CK19: [[MTYPE42:@.+]] = private {{.*}}constant [3 x i32] [i32 35, i32 19, i32 19]
+
+// CK19: [[MTYPE43:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK19-LABEL: explicit_maps_single
+void explicit_maps_single (int ii){
+  // Map of a scalar.
+  int a = ii;
+
+  // Region 00
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE00]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE00]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast i32* [[VAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast i32* [[VAR0]] to i8*
+
+  // CK19: call void [[CALL00:@.+]](i32* {{[^,]+}})
+  #pragma omp target map(alloc:a)
+  {
+    ++a;
+  }
+
+  // Map of an array.
+  int arra[100];
+
+  // Region 01
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE01]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE01]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast [100 x i32]* [[VAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast [100 x i32]* [[VAR0]] to i8*
+
+  // CK19: call void [[CALL01:@.+]]([100 x i32]* {{[^,]+}})
+  #pragma omp target map(to:arra)
+  {
+    arra[50]++;
+  }
+
+  // Region 02
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE02]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE02]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast [100 x i32]* [[VAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+  // CK19-DAG: [[SEC0]] = getelementptr {{.*}}[100 x i32]* [[VAR0]], i{{.+}} 0, i{{.+}} 20
+
+  // CK19: call void [[CALL02:@.+]]([100 x i32]* {{[^,]+}})
+  #pragma omp target map(from:arra[20:60])
+  {
+    arra[50]++;
+  }
+
+  // Region 03
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE03]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE03]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast [100 x i32]* [[VAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+  // CK19-DAG: [[SEC0]] = getelementptr {{.*}}[100 x i32]* [[VAR0]], i{{.+}} 0, i{{.+}} 0
+
+  // CK19: call void [[CALL03:@.+]]([100 x i32]* {{[^,]+}})
+  #pragma omp target map(tofrom:arra[:60])
+  {
+    arra[50]++;
+  }
+
+  // Region 04
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE04]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE04]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast [100 x i32]* [[VAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+  // CK19-DAG: [[SEC0]] = getelementptr {{.*}}[100 x i32]* [[VAR0]], i{{.+}} 0, i{{.+}} 0
+
+  // CK19: call void [[CALL04:@.+]]([100 x i32]* {{[^,]+}})
+  #pragma omp target map(alloc:arra[:])
+  {
+    arra[50]++;
+  }
+
+  // Region 05
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE05]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE05]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast [100 x i32]* [[VAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+  // CK19-DAG: [[SEC0]] = getelementptr {{.*}}[100 x i32]* [[VAR0]], i{{.+}} 0, i{{.+}} 15
+
+  // CK19: call void [[CALL05:@.+]]([100 x i32]* {{[^,]+}})
+  #pragma omp target map(to:arra[15])
+  {
+    arra[15]++;
+  }
+
+  // Region 06
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], i[[Z]]* [[GEPS:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE06]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+  // CK19-DAG: [[GEPS]] = getelementptr inbounds {{.+}}[[S:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[S0:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: store i{{.+}} [[CSVAL0:%[^,]+]], i{{.+}}* [[S0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast [100 x i32]* [[VAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+  // CK19-DAG: [[CSVAL0]] = mul nuw i{{.+}} %{{.*}}, 4
+  // CK19-DAG: [[SEC0]] = getelementptr {{.*}}[100 x i32]* [[VAR0]], i{{.+}} 0, i{{.+}} %{{.*}}
+
+  // CK19: call void [[CALL06:@.+]]([100 x i32]* {{[^,]+}})
+  #pragma omp target map(tofrom:arra[ii:ii+23])
+  {
+    arra[50]++;
+  }
+
+  // Region 07
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], i[[Z]]* [[GEPS:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE07]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+  // CK19-DAG: [[GEPS]] = getelementptr inbounds {{.+}}[[S:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[S0:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: store i{{.+}} [[CSVAL0:%[^,]+]], i{{.+}}* [[S0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast [100 x i32]* [[VAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+  // CK19-DAG: [[CSVAL0]] = mul nuw i{{.+}} %{{.*}}, 4
+  // CK19-DAG: [[SEC0]] = getelementptr {{.*}}[100 x i32]* [[VAR0]], i{{.+}} 0, i{{.+}} 0
+
+  // CK19: call void [[CALL07:@.+]]([100 x i32]* {{[^,]+}})
+  #pragma omp target map(alloc:arra[:ii])
+  {
+    arra[50]++;
+  }
+
+  // Region 08
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE08]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE08]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast [100 x i32]* [[VAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+  // CK19-DAG: [[SEC0]] = getelementptr {{.*}}[100 x i32]* [[VAR0]], i{{.+}} 0, i{{.+}} %{{.*}}
+
+  // CK19: call void [[CALL08:@.+]]([100 x i32]* {{[^,]+}})
+  #pragma omp target map(tofrom:arra[ii])
+  {
+    arra[15]++;
+  }
+
+  // Map of a pointer.
+  int *pa;
+
+  // Region 09
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE09]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE09]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast i32** [[VAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast i32** [[VAR0]] to i8*
+
+  // CK19: call void [[CALL09:@.+]](i32** {{[^,]+}})
+  #pragma omp target map(from:pa)
+  {
+    pa[50]++;
+  }
+
+  // Region 10
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE10]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE10]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast i32* [[RVAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+  // CK19-DAG: [[RVAR0]] = load i32*, i32** [[VAR0:%[^,]+]]
+  // CK19-DAG: [[SEC0]] = getelementptr {{.*}}i32* [[RVAR00:%.+]], i{{.+}} 20
+  // CK19-DAG: [[RVAR00]] = load i32*, i32** [[VAR0]]
+
+  // CK19: call void [[CALL10:@.+]](i32* {{[^,]+}})
+  #pragma omp target map(tofrom:pa[20:60])
+  {
+    pa[50]++;
+  }
+
+  // Region 11
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE11]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE11]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast i32* [[RVAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+  // CK19-DAG: [[RVAR0]] = load i32*, i32** [[VAR0:%[^,]+]]
+  // CK19-DAG: [[SEC0]] = getelementptr {{.*}}i32* [[RVAR00:%.+]], i{{.+}} 0
+  // CK19-DAG: [[RVAR00]] = load i32*, i32** [[VAR0]]
+
+  // CK19: call void [[CALL11:@.+]](i32* {{[^,]+}})
+  #pragma omp target map(alloc:pa[:60])
+  {
+    pa[50]++;
+  }
+
+  // Region 12
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE12]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE12]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast i32* [[RVAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+  // CK19-DAG: [[RVAR0]] = load i32*, i32** [[VAR0:%[^,]+]]
+  // CK19-DAG: [[SEC0]] = getelementptr {{.*}}i32* [[RVAR00:%.+]], i{{.+}} 15
+  // CK19-DAG: [[RVAR00]] = load i32*, i32** [[VAR0]]
+
+  // CK19: call void [[CALL12:@.+]](i32* {{[^,]+}})
+  #pragma omp target map(to:pa[15])
+  {
+    pa[15]++;
+  }
+
+  // Region 13
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], i[[Z]]* [[GEPS:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE13]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+  // CK19-DAG: [[GEPS]] = getelementptr inbounds {{.+}}[[S:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[S0:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: store i{{.+}} [[CSVAL0:%[^,]+]], i{{.+}}* [[S0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast i32* [[RVAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+  // CK19-DAG: [[CSVAL0]] = mul nuw i{{.+}} %{{.*}}, 4
+  // CK19-DAG: [[RVAR0]] = load i32*, i32** [[VAR0:%[^,]+]]
+  // CK19-DAG: [[SEC0]] = getelementptr {{.*}}i32* [[RVAR00:%.+]], i{{.+}} %{{.*}}
+  // CK19-DAG: [[RVAR00]] = load i32*, i32** [[VAR0]]
+
+  // CK19: call void [[CALL13:@.+]](i32* {{[^,]+}})
+  #pragma omp target map(alloc:pa[ii-23:ii])
+  {
+    pa[50]++;
+  }
+
+  // Region 14
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], i[[Z]]* [[GEPS:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE14]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+  // CK19-DAG: [[GEPS]] = getelementptr inbounds {{.+}}[[S:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[S0:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: store i{{.+}} [[CSVAL0:%[^,]+]], i{{.+}}* [[S0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast i32* [[RVAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+  // CK19-DAG: [[CSVAL0]] = mul nuw i{{.+}} %{{.*}}, 4
+  // CK19-DAG: [[RVAR0]] = load i32*, i32** [[VAR0:%[^,]+]]
+  // CK19-DAG: [[SEC0]] = getelementptr {{.*}}i32* [[RVAR00:%.+]], i{{.+}} 0
+  // CK19-DAG: [[RVAR00]] = load i32*, i32** [[VAR0]]
+
+  // CK19: call void [[CALL14:@.+]](i32* {{[^,]+}})
+  #pragma omp target map(to:pa[:ii])
+  {
+    pa[50]++;
+  }
+
+  // Region 15
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE15]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE15]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast i32* [[RVAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+  // CK19-DAG: [[RVAR0]] = load i32*, i32** [[VAR0:%[^,]+]]
+  // CK19-DAG: [[SEC0]] = getelementptr {{.*}}i32* [[RVAR00:%.+]], i{{.+}} %{{.*}}
+  // CK19-DAG: [[RVAR00]] = load i32*, i32** [[VAR0]]
+
+  // CK19: call void [[CALL15:@.+]](i32* {{[^,]+}})
+  #pragma omp target map(from:pa[ii+12])
+  {
+    pa[15]++;
+  }
+
+  // Map of a variable-size array.
+  int va[ii];
+
+  // Region 16
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], i[[Z]]* [[GEPS:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE16]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+  // CK19-DAG: [[GEPS]] = getelementptr inbounds {{.+}}[[S:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[S0:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: store i{{.+}} {{8|4}}, i{{.+}}* [[S0]]
+  // CK19-DAG: [[CBPVAL0]] = inttoptr i[[Z]] %{{.+}} to i8*
+  // CK19-DAG: [[CPVAL0]] = inttoptr i[[Z]] %{{.+}}to i8*
+
+  // CK19-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: [[S1:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+  // CK19-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+  // CK19-DAG: store i{{.+}} [[CSVAL1:%[^,]+]], i{{.+}}* [[S1]]
+  // CK19-DAG: [[CBPVAL1]] = bitcast i32* [[VAR1:%.+]] to i8*
+  // CK19-DAG: [[CPVAL1]] = bitcast i32* [[VAR1]] to i8*
+  // CK19-DAG: [[CSVAL1]] = mul nuw i{{.+}} %{{.*}}, 4
+
+  // CK19: call void [[CALL16:@.+]](i{{.+}} {{[^,]+}}, i32* {{[^,]+}})
+  #pragma omp target map(to:va)
+  {
+   va[50]++;
+  }
+
+  // Region 17
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE17]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE17]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: [[CBPVAL0]] = inttoptr i[[Z]] %{{.+}} to i8*
+  // CK19-DAG: [[CPVAL0]] = inttoptr i[[Z]] %{{.+}}to i8*
+
+  // CK19-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+  // CK19-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+  // CK19-DAG: [[CBPVAL1]] = bitcast i32* [[VAR1:%.+]] to i8*
+  // CK19-DAG: [[CPVAL1]] = bitcast i32* [[SEC1:%.+]] to i8*
+  // CK19-DAG: [[SEC1]] = getelementptr {{.*}}i32* [[VAR1]], i{{.+}} 20
+
+  // CK19: call void [[CALL17:@.+]](i{{.+}} {{[^,]+}}, i32* {{[^,]+}})
+  #pragma omp target map(from:va[20:60])
+  {
+   va[50]++;
+  }
+
+  // Region 18
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE18]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE18]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: [[CBPVAL0]] = inttoptr i[[Z]] %{{.+}} to i8*
+  // CK19-DAG: [[CPVAL0]] = inttoptr i[[Z]] %{{.+}}to i8*
+
+  // CK19-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+  // CK19-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+  // CK19-DAG: [[CBPVAL1]] = bitcast i32* [[VAR1:%.+]] to i8*
+  // CK19-DAG: [[CPVAL1]] = bitcast i32* [[SEC1:%.+]] to i8*
+  // CK19-DAG: [[SEC1]] = getelementptr {{.*}}i32* [[VAR1]], i{{.+}} 0
+
+  // CK19: call void [[CALL18:@.+]](i{{.+}} {{[^,]+}}, i32* {{[^,]+}})
+  #pragma omp target map(tofrom:va[:60])
+  {
+   va[50]++;
+  }
+
+  // Region 19
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], i[[Z]]* [[GEPS:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE19]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+  // CK19-DAG: [[GEPS]] = getelementptr inbounds {{.+}}[[S:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[S0:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: store i{{.+}} {{8|4}}, i{{.+}}* [[S0]]
+  // CK19-DAG: [[CBPVAL0]] = inttoptr i[[Z]] %{{.+}} to i8*
+  // CK19-DAG: [[CPVAL0]] = inttoptr i[[Z]] %{{.+}}to i8*
+
+  // CK19-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: [[S1:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+  // CK19-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+  // CK19-DAG: store i{{.+}} [[CSVAL1:%[^,]+]], i{{.+}}* [[S1]]
+  // CK19-DAG: [[CBPVAL1]] = bitcast i32* [[VAR1:%.+]] to i8*
+  // CK19-DAG: [[CPVAL1]] = bitcast i32* [[SEC1:%.+]] to i8*
+  // CK19-DAG: [[CSVAL1]] = mul nuw i{{.+}} %{{.*}}, 4
+  // CK19-DAG: [[SEC1]] = getelementptr {{.*}}i32* [[VAR1]], i{{.+}} 0
+
+  // CK19: call void [[CALL19:@.+]](i{{.+}} {{[^,]+}}, i32* {{[^,]+}})
+  #pragma omp target map(alloc:va[:])
+  {
+   va[50]++;
+  }
+
+  // Region 20
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE20]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE20]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: [[CBPVAL0]] = inttoptr i[[Z]] %{{.+}} to i8*
+  // CK19-DAG: [[CPVAL0]] = inttoptr i[[Z]] %{{.+}}to i8*
+
+  // CK19-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+  // CK19-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+  // CK19-DAG: [[CBPVAL1]] = bitcast i32* [[VAR1:%.+]] to i8*
+  // CK19-DAG: [[CPVAL1]] = bitcast i32* [[SEC1:%.+]] to i8*
+  // CK19-DAG: [[SEC1]] = getelementptr {{.*}}i32* [[VAR1]], i{{.+}} 15
+
+  // CK19: call void [[CALL20:@.+]](i{{.+}} {{[^,]+}}, i32* {{[^,]+}})
+  #pragma omp target map(to:va[15])
+  {
+   va[15]++;
+  }
+
+  // Region 21
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], i[[Z]]* [[GEPS:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE21]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+  // CK19-DAG: [[GEPS]] = getelementptr inbounds {{.+}}[[S:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[S0:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: store i{{.+}} {{8|4}}, i{{.+}}* [[S0]]
+  // CK19-DAG: [[CBPVAL0]] = inttoptr i[[Z]] %{{.+}} to i8*
+  // CK19-DAG: [[CPVAL0]] = inttoptr i[[Z]] %{{.+}}to i8*
+
+  // CK19-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: [[S1:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+  // CK19-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+  // CK19-DAG: store i{{.+}} [[CSVAL1:%[^,]+]], i{{.+}}* [[S1]]
+  // CK19-DAG: [[CBPVAL1]] = bitcast i32* [[VAR1:%.+]] to i8*
+  // CK19-DAG: [[CPVAL1]] = bitcast i32* [[SEC1:%.+]] to i8*
+  // CK19-DAG: [[CSVAL1]] = mul nuw i{{.+}} %{{.*}}, 4
+  // CK19-DAG: [[SEC1]] = getelementptr {{.*}}i32* [[VAR1]], i{{.+}} %{{.+}}
+
+  // CK19: call void [[CALL21:@.+]](i{{.+}} {{[^,]+}}, i32* {{[^,]+}})
+  #pragma omp target map(tofrom:va[ii:ii+23])
+  {
+   va[50]++;
+  }
+
+  // Region 22
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE22]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE22]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: [[CBPVAL0]] = inttoptr i[[Z]] %{{.+}} to i8*
+  // CK19-DAG: [[CPVAL0]] = inttoptr i[[Z]] %{{.+}}to i8*
+
+  // CK19-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+  // CK19-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+  // CK19-DAG: [[CBPVAL1]] = bitcast i32* [[VAR1:%.+]] to i8*
+  // CK19-DAG: [[CPVAL1]] = bitcast i32* [[SEC1:%.+]] to i8*
+  // CK19-DAG: [[SEC1]] = getelementptr {{.*}}i32* [[VAR1]], i{{.+}} %{{.+}}
+
+  // CK19: call void [[CALL22:@.+]](i{{.+}} {{[^,]+}}, i32* {{[^,]+}})
+  #pragma omp target map(tofrom:va[ii])
+  {
+   va[15]++;
+  }
+
+  // Always.
+  // Region 23
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE23]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE23]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast i32* [[VAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast i32* [[VAR0]] to i8*
+
+  // CK19: call void [[CALL23:@.+]](i32* {{[^,]+}})
+  #pragma omp target map(always, tofrom: a)
+  {
+   a++;
+  }
+
+  // Multidimensional arrays.
+  int marr[4][5][6];
+  int ***mptr;
+
+  // Region 24
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE24]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE24]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast [4 x [5 x [6 x i32]]]* [[VAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast [4 x [5 x [6 x i32]]]* [[VAR0]] to i8*
+
+  // CK19: call void [[CALL24:@.+]]([4 x [5 x [6 x i32]]]* {{[^,]+}})
+  #pragma omp target map(tofrom: marr)
+  {
+   marr[1][2][3]++;
+  }
+
+  // Region 25
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE25]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE25]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast [4 x [5 x [6 x i32]]]* [[VAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+  // CK19-DAG: [[SEC0]] = getelementptr {{.*}}[6 x i32]* [[SEC00:[^,]+]], i{{.+}} 0, i{{.+}} 2
+  // CK19-DAG: [[SEC00]] = getelementptr {{.*}}[5 x [6 x i32]]* [[SEC000:[^,]+]], i{{.+}} 0, i{{.+}} 2
+  // CK19-DAG: [[SEC000]] = getelementptr {{.*}}[4 x [5 x [6 x i32]]]* [[VAR0]], i{{.+}} 0, i{{.+}} 1
+
+  // CK19: call void [[CALL25:@.+]]([4 x [5 x [6 x i32]]]* {{[^,]+}})
+  #pragma omp target map(tofrom: marr[1][2][2:4])
+  {
+   marr[1][2][3]++;
+  }
+
+  // Region 26
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE26]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE26]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast [4 x [5 x [6 x i32]]]* [[VAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+  // CK19-DAG: [[SEC0]] = getelementptr {{.*}}[6 x i32]* [[SEC00:[^,]+]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[SEC00]] = getelementptr {{.*}}[5 x [6 x i32]]* [[SEC000:[^,]+]], i{{.+}} 0, i{{.+}} 2
+  // CK19-DAG: [[SEC000]] = getelementptr {{.*}}[4 x [5 x [6 x i32]]]* [[VAR0]], i{{.+}} 0, i{{.+}} 1
+
+  // CK19: call void [[CALL26:@.+]]([4 x [5 x [6 x i32]]]* {{[^,]+}})
+  #pragma omp target map(tofrom: marr[1][2][:])
+  {
+   marr[1][2][3]++;
+  }
+
+  // Region 27
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE27]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE27]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast [4 x [5 x [6 x i32]]]* [[VAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+  // CK19-DAG: [[SEC0]] = getelementptr {{.*}}[6 x i32]* [[SEC00:[^,]+]], i{{.+}} 0, i{{.+}} 3
+  // CK19-DAG: [[SEC00]] = getelementptr {{.*}}[5 x [6 x i32]]* [[SEC000:[^,]+]], i{{.+}} 0, i{{.+}} 2
+  // CK19-DAG: [[SEC000]] = getelementptr {{.*}}[4 x [5 x [6 x i32]]]* [[VAR0]], i{{.+}} 0, i{{.+}} 1
+
+  // CK19: call void [[CALL27:@.+]]([4 x [5 x [6 x i32]]]* {{[^,]+}})
+  #pragma omp target map(tofrom: marr[1][2][3])
+  {
+   marr[1][2][3]++;
+  }
+
+  // Region 28
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 3, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[3 x i{{.+}}]* [[SIZE28]], {{.+}}getelementptr {{.+}}[3 x i{{.+}}]* [[MTYPE28]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast i32*** [[VAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast i32*** [[SEC0:%.+]] to i8*
+  // CK19-DAG: [[VAR0]] = load i32***, i32**** [[PTR:%[^,]+]],
+  // CK19-DAG: [[SEC0]] = getelementptr {{.*}}i32*** [[SEC00:[^,]+]], i{{.+}} 1
+  // CK19-DAG: [[SEC00]] = load i32***, i32**** [[PTR]],
+
+  // CK19-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+  // CK19-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+  // CK19-DAG: [[CBPVAL1]] = bitcast i32*** [[SEC0]] to i8*
+  // CK19-DAG: [[CPVAL1]] = bitcast i32** [[SEC1:%.+]] to i8*
+  // CK19-DAG: [[SEC1]] = getelementptr {{.*}}i32** [[SEC11:[^,]+]], i{{.+}} 2
+  // CK19-DAG: [[SEC11]] = load i32**, i32*** [[SEC111:%[^,]+]],
+  // CK19-DAG: [[SEC111]] = getelementptr {{.*}}i32*** [[SEC1111:[^,]+]], i{{.+}} 1
+  // CK19-DAG: [[SEC1111]] = load i32***, i32**** [[PTR]],
+
+  // CK19-DAG: [[BP2:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 2
+  // CK19-DAG: [[P2:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 2
+  // CK19-DAG: store i8* [[CBPVAL2:%[^,]+]], i8** [[BP2]]
+  // CK19-DAG: store i8* [[CPVAL2:%[^,]+]], i8** [[P2]]
+  // CK19-DAG: [[CBPVAL2]] = bitcast i32** [[SEC1]] to i8*
+  // CK19-DAG: [[CPVAL2]] = bitcast i32* [[SEC2:%.+]] to i8*
+  // CK19-DAG: [[SEC2]] = getelementptr {{.*}}i32* [[SEC22:[^,]+]], i{{.+}} 2
+  // CK19-DAG: [[SEC22]] = load i32*, i32** [[SEC222:%[^,]+]],
+  // CK19-DAG: [[SEC222]] = getelementptr {{.*}}i32** [[SEC2222:[^,]+]], i{{.+}} 2
+  // CK19-DAG: [[SEC2222]] = load i32**, i32*** [[SEC22222:%[^,]+]],
+  // CK19-DAG: [[SEC22222]] = getelementptr {{.*}}i32*** [[SEC222222:[^,]+]], i{{.+}} 1
+  // CK19-DAG: [[SEC222222]] = load i32***, i32**** [[PTR]],
+
+  // CK19: call void [[CALL28:@.+]](i32*** {{[^,]+}})
+  #pragma omp target map(tofrom: mptr[1][2][2:4])
+  {
+    mptr[1][2][3]++;
+  }
+
+  // Region 29
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 3, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[3 x i{{.+}}]* [[SIZE29]], {{.+}}getelementptr {{.+}}[3 x i{{.+}}]* [[MTYPE29]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast i32*** [[VAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast i32*** [[SEC0:%.+]] to i8*
+  // CK19-DAG: [[VAR0]] = load i32***, i32**** [[PTR:%[^,]+]],
+  // CK19-DAG: [[SEC0]] = getelementptr {{.*}}i32*** [[SEC00:[^,]+]], i{{.+}} 1
+  // CK19-DAG: [[SEC00]] = load i32***, i32**** [[PTR]],
+
+  // CK19-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+  // CK19-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+  // CK19-DAG: [[CBPVAL1]] = bitcast i32*** [[SEC0]] to i8*
+  // CK19-DAG: [[CPVAL1]] = bitcast i32** [[SEC1:%.+]] to i8*
+  // CK19-DAG: [[SEC1]] = getelementptr {{.*}}i32** [[SEC11:[^,]+]], i{{.+}} 2
+  // CK19-DAG: [[SEC11]] = load i32**, i32*** [[SEC111:%[^,]+]],
+  // CK19-DAG: [[SEC111]] = getelementptr {{.*}}i32*** [[SEC1111:[^,]+]], i{{.+}} 1
+  // CK19-DAG: [[SEC1111]] = load i32***, i32**** [[PTR]],
+
+  // CK19-DAG: [[BP2:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 2
+  // CK19-DAG: [[P2:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 2
+  // CK19-DAG: store i8* [[CBPVAL2:%[^,]+]], i8** [[BP2]]
+  // CK19-DAG: store i8* [[CPVAL2:%[^,]+]], i8** [[P2]]
+  // CK19-DAG: [[CBPVAL2]] = bitcast i32** [[SEC1]] to i8*
+  // CK19-DAG: [[CPVAL2]] = bitcast i32* [[SEC2:%.+]] to i8*
+  // CK19-DAG: [[SEC2]] = getelementptr {{.*}}i32* [[SEC22:[^,]+]], i{{.+}} 3
+  // CK19-DAG: [[SEC22]] = load i32*, i32** [[SEC222:%[^,]+]],
+  // CK19-DAG: [[SEC222]] = getelementptr {{.*}}i32** [[SEC2222:[^,]+]], i{{.+}} 2
+  // CK19-DAG: [[SEC2222]] = load i32**, i32*** [[SEC22222:%[^,]+]],
+  // CK19-DAG: [[SEC22222]] = getelementptr {{.*}}i32*** [[SEC222222:[^,]+]], i{{.+}} 1
+  // CK19-DAG: [[SEC222222]] = load i32***, i32**** [[PTR]],
+
+  // CK19: call void [[CALL29:@.+]](i32*** {{[^,]+}})
+  #pragma omp target map(tofrom: mptr[1][2][3])
+  {
+    mptr[1][2][3]++;
+  }
+
+  // Multidimensional VLA.
+  double mva[23][ii][ii+5];
+
+  // Region 30
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 4, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], i[[Z]]* [[GEPS:%.+]], {{.+}}getelementptr {{.+}}[4 x i{{.+}}]* [[MTYPE30]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+  // CK19-DAG: [[GEPS]] = getelementptr inbounds {{.+}}[[S:%[^,]+]]
+  //
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[S0:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* inttoptr (i[[Z]] 23 to i8*), i8** [[BP0]]
+  // CK19-DAG: store i8* inttoptr (i[[Z]] 23 to i8*), i8** [[P0]]
+  // CK19-DAG: store i[[Z]] {{8|4}}, i[[Z]]* [[S0]]
+  //
+  // CK19-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: [[S1:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+  // CK19-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+  // CK19-DAG: store i[[Z]] {{8|4}}, i[[Z]]* [[S1]]
+  // CK19-DAG: [[CBPVAL1]] = inttoptr i[[Z]] [[VAR1:%.+]] to i8*
+  // CK19-DAG: [[CPVAL1]] = inttoptr i[[Z]] [[VAR11:%.+]] to i8*
+  // CK19-64-DAG: [[VAR1]] = zext i32 %{{[^,]+}} to i64
+  // CK19-64-DAG: [[VAR11]] = zext i32 %{{[^,]+}} to i64
+  //
+  // CK19-DAG: [[BP2:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 2
+  // CK19-DAG: [[P2:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 2
+  // CK19-DAG: [[S2:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 2
+  // CK19-DAG: store i8* [[CBPVAL2:%[^,]+]], i8** [[BP2]]
+  // CK19-DAG: store i8* [[CPVAL2:%[^,]+]], i8** [[P2]]
+  // CK19-DAG: store i[[Z]] {{8|4}}, i[[Z]]* [[S2]]
+  // CK19-DAG: [[CBPVAL2]] = inttoptr i[[Z]] [[VAR2:%.+]] to i8*
+  // CK19-DAG: [[CPVAL2]] = inttoptr i[[Z]] [[VAR22:%.+]] to i8*
+  // CK19-64-DAG: [[VAR2]] = zext i32 %{{[^,]+}} to i64
+  // CK19-64-DAG: [[VAR22]] = zext i32 %{{[^,]+}} to i64
+  //
+  // CK19-DAG: [[BP3:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 3
+  // CK19-DAG: [[P3:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 3
+  // CK19-DAG: [[S3:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 3
+  // CK19-DAG: store i8* [[CBPVAL3:%[^,]+]], i8** [[BP3]]
+  // CK19-DAG: store i8* [[CPVAL3:%[^,]+]], i8** [[P3]]
+  // CK19-DAG: store i[[Z]] [[CSVAL3:%[^,]+]], i[[Z]]* [[S3]]
+  // CK19-DAG: [[CBPVAL3]] = bitcast double* [[VAR3:%.+]] to i8*
+  // CK19-DAG: [[CPVAL3]] = bitcast double* [[VAR3]] to i8*
+  // CK19-DAG: [[CSVAL3]] = mul nuw i[[Z]] %{{[^,]+}}, {{8|4}}
+
+  // CK19: call void [[CALL30:@.+]](i[[Z]] 23, i[[Z]] %{{[^,]+}}, i[[Z]] %{{[^,]+}}, double* %{{[^,]+}})
+  #pragma omp target map(tofrom: mva)
+  {
+    mva[1][2][3]++;
+  }
+
+  // Region 31
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 4, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[4 x i{{.+}}]* [[SIZE31]], {{.+}}getelementptr {{.+}}[4 x i{{.+}}]* [[MTYPE31]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+  //
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* inttoptr (i[[Z]] 23 to i8*), i8** [[BP0]]
+  // CK19-DAG: store i8* inttoptr (i[[Z]] 23 to i8*), i8** [[P0]]
+  //
+  // CK19-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+  // CK19-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+  // CK19-DAG: [[CBPVAL1]] = inttoptr i[[Z]] [[VAR1:%.+]] to i8*
+  // CK19-DAG: [[CPVAL1]] = inttoptr i[[Z]] [[VAR11:%.+]] to i8*
+  //
+  // CK19-DAG: [[BP2:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 2
+  // CK19-DAG: [[P2:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 2
+  // CK19-DAG: store i8* [[CBPVAL2:%[^,]+]], i8** [[BP2]]
+  // CK19-DAG: store i8* [[CPVAL2:%[^,]+]], i8** [[P2]]
+  // CK19-DAG: [[CBPVAL2]] = inttoptr i[[Z]] [[VAR2:%.+]] to i8*
+  // CK19-DAG: [[CPVAL2]] = inttoptr i[[Z]] [[VAR22:%.+]] to i8*
+  //
+  // CK19-DAG: [[BP3:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 3
+  // CK19-DAG: [[P3:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 3
+  // CK19-DAG: store i8* [[CBPVAL3:%[^,]+]], i8** [[BP3]]
+  // CK19-DAG: store i8* [[CPVAL3:%[^,]+]], i8** [[P3]]
+  // CK19-DAG: [[CBPVAL3]] = bitcast double* [[VAR3:%.+]] to i8*
+  // CK19-DAG: [[CPVAL3]] = bitcast double* [[SEC3:%.+]] to i8*
+  // CK19-DAG: [[SEC3]] = getelementptr {{.*}}double* [[SEC33:%.+]], i[[Z]] 0
+  // CK19-DAG: [[SEC33]] = getelementptr {{.*}}double* [[SEC333:%.+]], i[[Z]] [[IDX3:%.+]]
+  // CK19-DAG: [[IDX3]] = mul nsw i[[Z]] %{{[^,]+}}, %{{[^,]+}}
+  // CK19-DAG: [[SEC333]] = getelementptr {{.*}}double* [[VAR3]], i[[Z]] [[IDX33:%.+]]
+  // CK19-DAG: [[IDX33]] = mul nsw i[[Z]] 1, %{{[^,]+}}
+
+  // CK19: call void [[CALL31:@.+]](i[[Z]] 23, i[[Z]] %{{[^,]+}}, i[[Z]] %{{[^,]+}}, double* %{{[^,]+}})
+  #pragma omp target map(tofrom: mva[1][ii-2][:5])
+  {
+    mva[1][2][3]++;
+  }
+
+  // Multidimensional array sections.
+  double marras[11][12][13];
+  double mvlaas[11][ii][13];
+  double ***mptras;
+
+  // Region 32
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE32]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE32]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast [11 x [12 x [13 x double]]]* [[VAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast [11 x [12 x [13 x double]]]* [[VAR0]] to i8*
+
+  // CK19: call void [[CALL32:@.+]]([11 x [12 x [13 x double]]]* {{[^,]+}})
+  #pragma omp target map(marras)
+  {
+    marras[1][2][3]++;
+  }
+
+  // Region 33
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE33]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE33]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast [11 x [12 x [13 x double]]]* [[VAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast [12 x [13 x double]]* [[SEC0:%.+]] to i8*
+  // CK19-DAG: [[SEC0]] = getelementptr {{.+}}[11 x [12 x [13 x double]]]* [[VAR0]], i[[Z]] 0, i[[Z]] 0
+
+  // CK19: call void [[CALL33:@.+]]([11 x [12 x [13 x double]]]* {{[^,]+}})
+  #pragma omp target map(marras[:])
+  {
+    marras[1][2][3]++;
+  }
+
+  // Region 34
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE34]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE34]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast [11 x [12 x [13 x double]]]* [[VAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast [12 x [13 x double]]* [[SEC0:%.+]] to i8*
+  // CK19-DAG: [[SEC0]] = getelementptr {{.+}}[11 x [12 x [13 x double]]]* [[VAR0]], i[[Z]] 0, i[[Z]] 0
+
+  // CK19: call void [[CALL34:@.+]]([11 x [12 x [13 x double]]]* {{[^,]+}})
+  #pragma omp target map(marras[:][:][:])
+  {
+    marras[1][2][3]++;
+  }
+
+  // Region 35
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], i[[Z]]* [[GEPS:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE35]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+  // CK19-DAG: [[GEPS]] = getelementptr inbounds {{.+}}[[S:%[^,]+]]
+  //
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[S0:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 0
+
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: store i[[Z]] [[CSVAL0:%[^,]+]], i[[Z]]* [[S0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast [11 x [12 x [13 x double]]]* [[VAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast [13 x double]* [[SEC0:%.+]] to i8*
+  // CK19-DAG: [[SEC0]] = getelementptr {{.+}}[12 x [13 x double]]* [[SEC00:%[^,]+]], i[[Z]] 0, i[[Z]] 0
+  // CK19-DAG: [[SEC00]] = getelementptr {{.+}}[11 x [12 x [13 x double]]]* [[VAR0]], i[[Z]] 0, i[[Z]] 1
+  // CK19-DAG: [[CSVAL0]] = mul nuw i[[Z]] %{{[^,]+}}, 104
+
+  // CK19: call void [[CALL35:@.+]]([11 x [12 x [13 x double]]]* {{[^,]+}})
+  #pragma omp target map(marras[1][:ii][:])
+  {
+    marras[1][2][3]++;
+  }
+
+  // Region 36
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE36]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE36]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast [11 x [12 x [13 x double]]]* [[VAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast [13 x double]* [[SEC0:%.+]] to i8*
+  // CK19-DAG: [[SEC0]] = getelementptr {{.+}}[13 x double]* [[SEC00:%[^,]+]], i{{.+}} 0
+  // CK19-DAG: [[SEC00]] = getelementptr {{.+}}[12 x [13 x double]]* [[SEC000:%[^,]+]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[SEC000]] = getelementptr {{.+}}[11 x [12 x [13 x double]]]* [[VAR0]], i{{.+}} 0, i{{.+}} 0
+
+  // CK19: call void [[CALL36:@.+]]([11 x [12 x [13 x double]]]* {{[^,]+}})
+  #pragma omp target map(marras[:1][:2][:13])
+  {
+    marras[1][2][3]++;
+  }
+
+  // Region 37
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 3, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], i[[Z]]* [[GEPS:%.+]], {{.+}}getelementptr {{.+}}[3 x i{{.+}}]* [[MTYPE37]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+  // CK19-DAG: [[GEPS]] = getelementptr inbounds {{.+}}[[S:%[^,]+]]
+  //
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[S0:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* inttoptr (i[[Z]] 11 to i8*), i8** [[BP0]]
+  // CK19-DAG: store i8* inttoptr (i[[Z]] 11 to i8*), i8** [[P0]]
+  // CK19-DAG: store i[[Z]] {{8|4}}, i[[Z]]* [[S0]]
+  //
+  // CK19-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: [[S1:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+  // CK19-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+  // CK19-DAG: store i[[Z]] {{8|4}}, i[[Z]]* [[S1]]
+  // CK19-DAG: [[CBPVAL1]] = inttoptr i[[Z]] [[VAR1:%.+]] to i8*
+  // CK19-DAG: [[CPVAL1]] = inttoptr i[[Z]] [[VAR11:%.+]] to i8*
+  //
+  // CK19-DAG: [[BP2:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 2
+  // CK19-DAG: [[P2:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 2
+  // CK19-DAG: [[S2:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 2
+  // CK19-DAG: store i8* [[CBPVAL2:%[^,]+]], i8** [[BP2]]
+  // CK19-DAG: store i8* [[CPVAL2:%[^,]+]], i8** [[P2]]
+  // CK19-DAG: store i[[Z]] [[CSVAL2:%[^,]+]], i[[Z]]* [[S2]]
+  // CK19-DAG: [[CBPVAL2]] = bitcast [13 x double]* [[VAR2:%.+]] to i8*
+  // CK19-DAG: [[CPVAL2]] = bitcast [13 x double]* [[VAR2]] to i8*
+  // CK19-DAG: [[CSVAL2]] = mul nuw i[[Z]] %{{[^,]+}}, 104
+
+  // CK19: call void [[CALL37:@.+]](i[[Z]] 11, i[[Z]] %{{[^,]+}}, [13 x double]* %{{[^,]+}})
+  #pragma omp target map(mvlaas)
+  {
+    mvlaas[1][2][3]++;
+  }
+
+  // Region 38
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 3, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], i[[Z]]* [[GEPS:%.+]], {{.+}}getelementptr {{.+}}[3 x i{{.+}}]* [[MTYPE38]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+  // CK19-DAG: [[GEPS]] = getelementptr inbounds {{.+}}[[S:%[^,]+]]
+  //
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[S0:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* inttoptr (i[[Z]] 11 to i8*), i8** [[BP0]]
+  // CK19-DAG: store i8* inttoptr (i[[Z]] 11 to i8*), i8** [[P0]]
+  // CK19-DAG: store i[[Z]] {{8|4}}, i[[Z]]* [[S0]]
+  //
+  // CK19-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: [[S1:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+  // CK19-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+  // CK19-DAG: store i[[Z]] {{8|4}}, i[[Z]]* [[S1]]
+  // CK19-DAG: [[CBPVAL1]] = inttoptr i[[Z]] [[VAR1:%.+]] to i8*
+  // CK19-DAG: [[CPVAL1]] = inttoptr i[[Z]] [[VAR11:%.+]] to i8*
+  //
+  // CK19-DAG: [[BP2:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 2
+  // CK19-DAG: [[P2:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 2
+  // CK19-DAG: [[S2:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 2
+  // CK19-DAG: store i8* [[CBPVAL2:%[^,]+]], i8** [[BP2]]
+  // CK19-DAG: store i8* [[CPVAL2:%[^,]+]], i8** [[P2]]
+  // CK19-DAG: store i[[Z]] [[CSVAL2:%[^,]+]], i[[Z]]* [[S2]]
+  // CK19-DAG: [[CBPVAL2]] = bitcast [13 x double]* [[VAR2:%.+]] to i8*
+  // CK19-DAG: [[CPVAL2]] = bitcast [13 x double]* [[SEC2:%.+]] to i8*
+  // CK19-DAG: [[SEC2]] = getelementptr {{.+}}[13 x double]* [[VAR2]], i[[Z]] [[SEC22:%[^,]+]]
+  // CK19-DAG: [[SEC22]] = mul nsw i[[Z]] 0, %{{[^,]+}}
+  // CK19-DAG: [[CSVAL2]] = mul nuw i[[Z]] %{{[^,]+}}, 104
+
+  // CK19: call void [[CALL38:@.+]](i[[Z]] 11, i[[Z]] %{{[^,]+}}, [13 x double]* %{{[^,]+}})
+  #pragma omp target map(mvlaas[:])
+  {
+    mvlaas[1][2][3]++;
+  }
+
+  // Region 39
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 3, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], i[[Z]]* [[GEPS:%.+]], {{.+}}getelementptr {{.+}}[3 x i{{.+}}]* [[MTYPE39]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+  // CK19-DAG: [[GEPS]] = getelementptr inbounds {{.+}}[[S:%[^,]+]]
+  //
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[S0:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* inttoptr (i[[Z]] 11 to i8*), i8** [[BP0]]
+  // CK19-DAG: store i8* inttoptr (i[[Z]] 11 to i8*), i8** [[P0]]
+  // CK19-DAG: store i[[Z]] {{8|4}}, i[[Z]]* [[S0]]
+  //
+  // CK19-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: [[S1:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+  // CK19-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+  // CK19-DAG: store i[[Z]] {{8|4}}, i[[Z]]* [[S1]]
+  // CK19-DAG: [[CBPVAL1]] = inttoptr i[[Z]] [[VAR1:%.+]] to i8*
+  // CK19-DAG: [[CPVAL1]] = inttoptr i[[Z]] [[VAR11:%.+]] to i8*
+  //
+  // CK19-DAG: [[BP2:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 2
+  // CK19-DAG: [[P2:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 2
+  // CK19-DAG: [[S2:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 2
+  // CK19-DAG: store i8* [[CBPVAL2:%[^,]+]], i8** [[BP2]]
+  // CK19-DAG: store i8* [[CPVAL2:%[^,]+]], i8** [[P2]]
+  // CK19-DAG: store i[[Z]] [[CSVAL2:%[^,]+]], i[[Z]]* [[S2]]
+  // CK19-DAG: [[CBPVAL2]] = bitcast [13 x double]* [[VAR2:%.+]] to i8*
+  // CK19-DAG: [[CPVAL2]] = bitcast [13 x double]* [[SEC2:%.+]] to i8*
+  // CK19-DAG: [[SEC2]] = getelementptr {{.+}}[13 x double]* [[VAR2]], i[[Z]] [[SEC22:%[^,]+]]
+  // CK19-DAG: [[SEC22]] = mul nsw i[[Z]] 0, %{{[^,]+}}
+  // CK19-DAG: [[CSVAL2]] = mul nuw i[[Z]] %{{[^,]+}}, 104
+
+  // CK19: call void [[CALL39:@.+]](i[[Z]] 11, i[[Z]] %{{[^,]+}}, [13 x double]* %{{[^,]+}})
+  #pragma omp target map(mvlaas[:][:][:])
+  {
+    mvlaas[1][2][3]++;
+  }
+
+  // Region 40
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 3, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], i[[Z]]* [[GEPS:%.+]], {{.+}}getelementptr {{.+}}[3 x i{{.+}}]* [[MTYPE40]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+  // CK19-DAG: [[GEPS]] = getelementptr inbounds {{.+}}[[S:%[^,]+]]
+  //
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[S0:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* inttoptr (i[[Z]] 11 to i8*), i8** [[BP0]]
+  // CK19-DAG: store i8* inttoptr (i[[Z]] 11 to i8*), i8** [[P0]]
+  // CK19-DAG: store i[[Z]] {{8|4}}, i[[Z]]* [[S0]]
+  //
+  // CK19-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: [[S1:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+  // CK19-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+  // CK19-DAG: store i[[Z]] {{8|4}}, i[[Z]]* [[S1]]
+  // CK19-DAG: [[CBPVAL1]] = inttoptr i[[Z]] [[VAR1:%.+]] to i8*
+  // CK19-DAG: [[CPVAL1]] = inttoptr i[[Z]] [[VAR11:%.+]] to i8*
+  //
+  // CK19-DAG: [[BP2:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 2
+  // CK19-DAG: [[P2:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 2
+  // CK19-DAG: [[S2:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 2
+  // CK19-DAG: store i8* [[CBPVAL2:%[^,]+]], i8** [[BP2]]
+  // CK19-DAG: store i8* [[CPVAL2:%[^,]+]], i8** [[P2]]
+  // CK19-DAG: store i[[Z]] [[CSVAL2:%[^,]+]], i[[Z]]* [[S2]]
+  // CK19-DAG: [[CBPVAL2]] = bitcast [13 x double]* [[VAR2:%.+]] to i8*
+  // CK19-DAG: [[CPVAL2]] = bitcast [13 x double]* [[SEC2:%.+]] to i8*
+  // CK19-DAG: [[SEC2]] = getelementptr {{.+}}[13 x double]* [[SEC22:%[^,]+]], i[[Z]] 0
+  // CK19-DAG: [[SEC22]] = getelementptr {{.+}}[13 x double]* [[VAR2]], i[[Z]] [[SEC222:%[^,]+]]
+  // CK19-DAG: [[SEC222]] = mul nsw i[[Z]] 1, %{{[^,]+}}
+
+  // CK19: call void [[CALL40:@.+]](i[[Z]] 11, i[[Z]] %{{[^,]+}}, [13 x double]* %{{[^,]+}})
+  #pragma omp target map(mvlaas[1][:ii][:])
+  {
+    mvlaas[1][2][3]++;
+  }
+
+  // Region 41
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 3, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[3 x i{{.+}}]* [[SIZE41]], {{.+}}getelementptr {{.+}}[3 x i{{.+}}]* [[MTYPE41]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+  //
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* inttoptr (i[[Z]] 11 to i8*), i8** [[BP0]]
+  // CK19-DAG: store i8* inttoptr (i[[Z]] 11 to i8*), i8** [[P0]]
+  //
+  // CK19-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+  // CK19-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+  // CK19-DAG: [[CBPVAL1]] = inttoptr i[[Z]] [[VAR1:%.+]] to i8*
+  // CK19-DAG: [[CPVAL1]] = inttoptr i[[Z]] [[VAR11:%.+]] to i8*
+  //
+  // CK19-DAG: [[BP2:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 2
+  // CK19-DAG: [[P2:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 2
+  // CK19-DAG: store i8* [[CBPVAL2:%[^,]+]], i8** [[BP2]]
+  // CK19-DAG: store i8* [[CPVAL2:%[^,]+]], i8** [[P2]]
+  // CK19-DAG: [[CBPVAL2]] = bitcast [13 x double]* [[VAR2:%.+]] to i8*
+  // CK19-DAG: [[CPVAL2]] = bitcast [13 x double]* [[SEC2:%.+]] to i8*
+  // CK19-DAG: [[SEC2]] = getelementptr {{.+}}[13 x double]* [[SEC22:%[^,]+]], i[[Z]] 0
+  // CK19-DAG: [[SEC22]] = getelementptr {{.+}}[13 x double]* [[VAR2]], i[[Z]] [[SEC222:%[^,]+]]
+  // CK19-DAG: [[SEC222]] = mul nsw i[[Z]] 0, %{{[^,]+}}
+
+  // CK19: call void [[CALL41:@.+]](i[[Z]] 11, i[[Z]] %{{[^,]+}}, [13 x double]* %{{[^,]+}})
+  #pragma omp target map(mvlaas[:1][:2][:13])
+  {
+    mvlaas[1][2][3]++;
+  }
+
+  // Region 42
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 3, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[3 x i{{.+}}]* [[SIZE42]], {{.+}}getelementptr {{.+}}[3 x i{{.+}}]* [[MTYPE42]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast double*** [[VAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast double*** [[SEC0:%.+]] to i8*
+  // CK19-DAG: [[VAR0]] = load double***, double**** [[PTR:%[^,]+]],
+  // CK19-DAG: [[SEC0]] = getelementptr {{.*}}double*** [[SEC00:[^,]+]], i{{.+}} 0
+  // CK19-DAG: [[SEC00]] = load double***, double**** [[PTR]],
+
+  // CK19-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+  // CK19-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+  // CK19-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+  // CK19-DAG: [[CBPVAL1]] = bitcast double*** [[SEC0]] to i8*
+  // CK19-DAG: [[CPVAL1]] = bitcast double** [[SEC1:%.+]] to i8*
+  // CK19-DAG: [[SEC1]] = getelementptr {{.*}}double** [[SEC11:[^,]+]], i{{.+}} 2
+  // CK19-DAG: [[SEC11]] = load double**, double*** [[SEC111:%[^,]+]],
+  // CK19-DAG: [[SEC111]] = getelementptr {{.*}}double*** [[SEC1111:[^,]+]], i{{.+}} 0
+  // CK19-DAG: [[SEC1111]] = load double***, double**** [[PTR]],
+
+  // CK19-DAG: [[BP2:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 2
+  // CK19-DAG: [[P2:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 2
+  // CK19-DAG: store i8* [[CBPVAL2:%[^,]+]], i8** [[BP2]]
+  // CK19-DAG: store i8* [[CPVAL2:%[^,]+]], i8** [[P2]]
+  // CK19-DAG: [[CBPVAL2]] = bitcast double** [[SEC1]] to i8*
+  // CK19-DAG: [[CPVAL2]] = bitcast double* [[SEC2:%.+]] to i8*
+  // CK19-DAG: [[SEC2]] = getelementptr {{.*}}double* [[SEC22:[^,]+]], i{{.+}} 0
+  // CK19-DAG: [[SEC22]] = load double*, double** [[SEC222:%[^,]+]],
+  // CK19-DAG: [[SEC222]] = getelementptr {{.*}}double** [[SEC2222:[^,]+]], i{{.+}} 2
+  // CK19-DAG: [[SEC2222]] = load double**, double*** [[SEC22222:%[^,]+]],
+  // CK19-DAG: [[SEC22222]] = getelementptr {{.*}}double*** [[SEC222222:[^,]+]], i{{.+}} 0
+  // CK19-DAG: [[SEC222222]] = load double***, double**** [[PTR]],
+
+  // CK19: call void [[CALL42:@.+]](double*** {{[^,]+}})
+  #pragma omp target map(mptras[:1][2][:13])
+  {
+    mptras[1][2][3]++;
+  }
+
+  // Region 43 - the memory is not contiguous for this map - will map the whole last dimension.
+  // CK19-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], i[[Z]]* [[GEPS:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE43]]{{.+}})
+  // CK19-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK19-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+  // CK19-DAG: [[GEPS]] = getelementptr inbounds {{.+}}[[S:%[^,]+]]
+  //
+  // CK19-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK19-DAG: [[S0:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 0
+
+  // CK19-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK19-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK19-DAG: store i[[Z]] [[CSVAL0:%[^,]+]], i[[Z]]* [[S0]]
+  // CK19-DAG: [[CBPVAL0]] = bitcast [11 x [12 x [13 x double]]]* [[VAR0:%.+]] to i8*
+  // CK19-DAG: [[CPVAL0]] = bitcast [13 x double]* [[SEC0:%.+]] to i8*
+  // CK19-DAG: [[SEC0]] = getelementptr {{.+}}[12 x [13 x double]]* [[SEC00:%[^,]+]], i[[Z]] 0, i[[Z]] 0
+  // CK19-DAG: [[SEC00]] = getelementptr {{.+}}[11 x [12 x [13 x double]]]* [[VAR0]], i[[Z]] 0, i[[Z]] 1
+  // CK19-DAG: [[CSVAL0]] = mul nuw i[[Z]] %{{[^,]+}}, 104
+
+  // CK19: call void [[CALL43:@.+]]([11 x [12 x [13 x double]]]* {{[^,]+}})
+  #pragma omp target map(marras[1][:ii][1:])
+  {
+    marras[1][2][3]++;
+  }
+
+}
+
+// CK19: define {{.+}}[[CALL00]]
+// CK19: define {{.+}}[[CALL01]]
+// CK19: define {{.+}}[[CALL02]]
+// CK19: define {{.+}}[[CALL03]]
+// CK19: define {{.+}}[[CALL04]]
+// CK19: define {{.+}}[[CALL05]]
+// CK19: define {{.+}}[[CALL06]]
+// CK19: define {{.+}}[[CALL07]]
+// CK19: define {{.+}}[[CALL08]]
+// CK19: define {{.+}}[[CALL09]]
+// CK19: define {{.+}}[[CALL10]]
+// CK19: define {{.+}}[[CALL11]]
+// CK19: define {{.+}}[[CALL12]]
+// CK19: define {{.+}}[[CALL13]]
+// CK19: define {{.+}}[[CALL14]]
+// CK19: define {{.+}}[[CALL15]]
+// CK19: define {{.+}}[[CALL16]]
+// CK19: define {{.+}}[[CALL17]]
+// CK19: define {{.+}}[[CALL18]]
+// CK19: define {{.+}}[[CALL19]]
+// CK19: define {{.+}}[[CALL20]]
+// CK19: define {{.+}}[[CALL21]]
+// CK19: define {{.+}}[[CALL22]]
+// CK19: define {{.+}}[[CALL23]]
+// CK19: define {{.+}}[[CALL24]]
+// CK19: define {{.+}}[[CALL25]]
+// CK19: define {{.+}}[[CALL26]]
+// CK19: define {{.+}}[[CALL27]]
+// CK19: define {{.+}}[[CALL28]]
+// CK19: define {{.+}}[[CALL29]]
+// CK19: define {{.+}}[[CALL30]]
+// CK19: define {{.+}}[[CALL31]]
+// CK19: define {{.+}}[[CALL32]]
+// CK19: define {{.+}}[[CALL33]]
+// CK19: define {{.+}}[[CALL34]]
+// CK19: define {{.+}}[[CALL35]]
+// CK19: define {{.+}}[[CALL36]]
+// CK19: define {{.+}}[[CALL37]]
+// CK19: define {{.+}}[[CALL38]]
+// CK19: define {{.+}}[[CALL39]]
+// CK19: define {{.+}}[[CALL40]]
+// CK19: define {{.+}}[[CALL41]]
+// CK19: define {{.+}}[[CALL42]]
+// CK19: define {{.+}}[[CALL43]]
+
+#endif
+///==========================================================================///
+// RUN: %clang_cc1 -DCK20 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK20 --check-prefix CK20-64
+// RUN: %clang_cc1 -DCK20 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK20 --check-prefix CK20-64
+// RUN: %clang_cc1 -DCK20 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK20 --check-prefix CK20-32
+// RUN: %clang_cc1 -DCK20 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK20 --check-prefix CK20-32
+#ifdef CK20
+
+// CK20: [[SIZE00:@.+]] = private {{.*}}constant [1 x i[[Z:64|32]]] [i[[Z:64|32]] 4]
+// CK20: [[MTYPE00:@.+]] = private {{.*}}constant [1 x i32] [i32 33]
+
+// CK20: [[SIZE01:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 20]
+// CK20: [[MTYPE01:@.+]] = private {{.*}}constant [1 x i32] [i32 33]
+
+// CK20: [[SIZE02:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 4]
+// CK20: [[MTYPE02:@.+]] = private {{.*}}constant [1 x i32] [i32 34]
+
+// CK20: [[SIZE03:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 12]
+// CK20: [[MTYPE03:@.+]] = private {{.*}}constant [1 x i32] [i32 34]
+
+// CK20-LABEL: explicit_maps_references_and_function_args
+void explicit_maps_references_and_function_args (int a, float b, int (&c)[10], float *d){
+
+  int &aa = a;
+  float &bb = b;
+  int (&cc)[10] = c;
+  float *&dd = d;
+
+  // Region 00
+  // CK20-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE00]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE00]]{{.+}})
+  // CK20-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK20-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK20-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK20-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK20-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK20-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK20-DAG: [[CBPVAL0]] = bitcast i32* [[RVAR0:%.+]] to i8*
+  // CK20-DAG: [[CPVAL0]] = bitcast i32* [[RVAR00:%.+]] to i8*
+  // CK20-DAG: [[RVAR0]] = load i32*, i32** [[VAR0:%[^,]+]]
+  // CK20-DAG: [[RVAR00]] = load i32*, i32** [[VAR0]]
+
+  // CK20: call void [[CALL00:@.+]](i32* {{[^,]+}})
+  #pragma omp target map(to:aa)
+  {
+    aa += 1;
+  }
+
+  // Region 01
+  // CK20-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE01]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE01]]{{.+}})
+  // CK20-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK20-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK20-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK20-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK20-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK20-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK20-DAG: [[CBPVAL0]] = bitcast [10 x i32]* [[RVAR0:%.+]] to i8*
+  // CK20-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+  // CK20-DAG: [[SEC0]] = getelementptr {{.*}}[10 x i32]* [[RVAR00:%.+]], i{{.+}} 0, i{{.+}} 0
+  // CK20-DAG: [[RVAR0]] = load [10 x i32]*, [10 x i32]** [[VAR0:%[^,]+]]
+  // CK20-DAG: [[RVAR00]] = load [10 x i32]*, [10 x i32]** [[VAR0]]
+
+  // CK20: call void [[CALL01:@.+]]([10 x i32]* {{[^,]+}})
+  #pragma omp target map(to:cc[:5])
+  {
+    cc[3] += 1;
+  }
+
+  // Region 02
+  // CK20-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE02]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE02]]{{.+}})
+  // CK20-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK20-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK20-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK20-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK20-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK20-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK20-DAG: [[CBPVAL0]] = bitcast float* [[VAR0:%.+]] to i8*
+  // CK20-DAG: [[CPVAL0]] = bitcast float* [[VAR0]] to i8*
+
+  // CK20: call void [[CALL02:@.+]](float* {{[^,]+}})
+  #pragma omp target map(from:b)
+  {
+    b += 1.0f;
+  }
+
+  // Region 03
+  // CK20-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE03]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE03]]{{.+}})
+  // CK20-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK20-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK20-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK20-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK20-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK20-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK20-DAG: [[CBPVAL0]] = bitcast float* [[RVAR0:%.+]] to i8*
+  // CK20-DAG: [[CPVAL0]] = bitcast float* [[SEC0:%.+]] to i8*
+  // CK20-DAG: [[RVAR0]] = load float*, float** [[VAR0:%[^,]+]]
+  // CK20-DAG: [[SEC0]] = getelementptr {{.*}}float* [[RVAR00:%.+]], i{{.+}} 2
+  // CK20-DAG: [[RVAR00]] = load float*, float** [[VAR0]]
+
+  // CK20: call void [[CALL03:@.+]](float* {{[^,]+}})
+  #pragma omp target map(from:d[2:3])
+  {
+    d[2] += 1.0f;
+  }
+}
+
+// CK20: define {{.+}}[[CALL00]]
+// CK20: define {{.+}}[[CALL01]]
+// CK20: define {{.+}}[[CALL02]]
+// CK20: define {{.+}}[[CALL03]]
+
+#endif
+///==========================================================================///
+// RUN: %clang_cc1 -DCK21 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK21 --check-prefix CK21-64
+// RUN: %clang_cc1 -DCK21 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK21 --check-prefix CK21-64
+// RUN: %clang_cc1 -DCK21 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK21 --check-prefix CK21-32
+// RUN: %clang_cc1 -DCK21 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK21 --check-prefix CK21-32
+#ifdef CK21
+// CK21: [[ST:%.+]] = type { i32, i32, float* }
+
+// CK21: [[SIZE00:@.+]] = private {{.*}}constant [1 x i[[Z:64|32]]] [i[[Z:64|32]] 4]
+// CK21: [[MTYPE00:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK21: [[SIZE01:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 492]
+// CK21: [[MTYPE01:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK21: [[SIZE02:@.+]] = private {{.*}}constant [2 x i[[Z]]] [i[[Z]] {{8|4}}, i[[Z]] 500]
+// CK21: [[MTYPE02:@.+]] = private {{.*}}constant [2 x i32] [i32 34, i32 18]
+
+// CK21: [[SIZE03:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 492]
+// CK21: [[MTYPE03:@.+]] = private {{.*}}constant [1 x i32] [i32 34]
+
+// CK21: [[SIZE04:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 4]
+// CK21: [[MTYPE04:@.+]] = private {{.*}}constant [1 x i32] [i32 34]
+
+// CK21: [[SIZE05:@.+]] = private {{.*}}constant [2 x i[[Z]]] [i[[Z]] 4, i[[Z]] 4]
+// CK21: [[MTYPE05:@.+]] = private {{.*}}constant [2 x i32] [i32 35, i32 3]
+
+// CK21-LABEL: explicit_maps_template_args_and_members
+
+template <int X, typename T>
+struct CC {
+  T A;
+  int A2;
+  float *B;
+
+  int foo(T arg) {
+    float la[X];
+    T *lb;
+
+    // Region 00
+    // CK21-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE00]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE00]]{{.+}})
+    // CK21-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+    // CK21-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+    // CK21-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+    // CK21-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+    // CK21-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+    // CK21-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+    // CK21-DAG: [[CBPVAL0]] = bitcast [[ST]]* [[VAR0:%.+]] to i8*
+    // CK21-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+    // CK21-DAG: [[SEC0]] = getelementptr {{.*}}[[ST]]* [[VAR0:%.+]], i{{.+}} 0, i{{.+}} 0
+
+    // CK21: call void [[CALL00:@.+]]([[ST]]* {{[^,]+}})
+    #pragma omp target map(A)
+    {
+      A += 1;
+    }
+    
+    // Region 01
+    // CK21-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE01]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE01]]{{.+}})
+    // CK21-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+    // CK21-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+    // CK21-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+    // CK21-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+    // CK21-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+    // CK21-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+    // CK21-DAG: [[CBPVAL0]] = bitcast i32* [[RVAR0:%.+]] to i8*
+    // CK21-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+    // CK21-DAG: [[RVAR0]] = load i32*, i32** [[VAR0:%[^,]+]]
+    // CK21-DAG: [[SEC0]] = getelementptr {{.*}}i32* [[RVAR00:%.+]], i{{.+}} 0
+    // CK21-DAG: [[RVAR00]] = load i32*, i32** [[VAR0]]
+
+    // CK21: call void [[CALL01:@.+]](i32* {{[^,]+}})
+    #pragma omp target map(lb[:X])
+    {
+      lb[4] += 1;
+    }
+    
+    // Region 02
+    // CK21-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE02]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE02]]{{.+}})
+    // CK21-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+    // CK21-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+    // CK21-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+    // CK21-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+    // CK21-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+    // CK21-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+    // CK21-DAG: [[CBPVAL0]] = bitcast [[ST]]* [[VAR0:%.+]] to i8*
+    // CK21-DAG: [[CPVAL0]] = bitcast float** [[SEC0:%.+]] to i8*
+    // CK21-DAG: [[SEC0]] = getelementptr {{.*}}[[ST]]* [[VAR0]], i{{.+}} 0, i{{.+}} 2
+    
+    // CK21-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+    // CK21-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+    // CK21-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+    // CK21-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+    // CK21-DAG: [[CBPVAL1]] = bitcast float** [[SEC0]] to i8*
+    // CK21-DAG: [[CPVAL1]] = bitcast float* [[SEC1:%.+]] to i8*
+    // CK21-DAG: [[SEC1]] = getelementptr {{.*}}float* [[RVAR1:%[^,]+]], i{{.+}} 123
+    // CK21-DAG: [[RVAR1]] = load float*, float** [[SEC1_:%[^,]+]]
+    // CK21-DAG: [[SEC1_]] = getelementptr {{.*}}[[ST]]* [[VAR0]], i{{.+}} 0, i{{.+}} 2
+
+    // CK21: call void [[CALL02:@.+]]([[ST]]* {{[^,]+}})
+    #pragma omp target map(from:B[X:X+2])
+    {
+      B[2] += 1.0f;
+    }
+    
+    // Region 03
+    // CK21-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE03]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE03]]{{.+}})
+    // CK21-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+    // CK21-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+    // CK21-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+    // CK21-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+    // CK21-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+    // CK21-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+    // CK21-DAG: [[CBPVAL0]] = bitcast [123 x float]* [[VAR0:%.+]] to i8*
+    // CK21-DAG: [[CPVAL0]] = bitcast [123 x float]* [[VAR0]] to i8*
+
+    // CK21: call void [[CALL03:@.+]]([123 x float]* {{[^,]+}})
+    #pragma omp target map(from:la)
+    {
+      la[3] += 1.0f;
+    }
+    
+    // Region 04
+    // CK21-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE04]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE04]]{{.+}})
+    // CK21-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+    // CK21-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+    // CK21-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+    // CK21-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+    // CK21-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+    // CK21-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+    // CK21-DAG: [[CBPVAL0]] = bitcast i32* [[VAR0:%.+]] to i8*
+    // CK21-DAG: [[CPVAL0]] = bitcast i32* [[VAR0]] to i8*
+
+    // CK21: call void [[CALL04:@.+]](i32* {{[^,]+}})
+    #pragma omp target map(from:arg)
+    {
+      arg +=1;
+    }
+    
+    // Make sure the extra flag is passed to the second map.
+    // Region 05
+    // CK21-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE05]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE05]]{{.+}})
+    // CK21-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+    // CK21-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+    // CK21-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+    // CK21-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+    // CK21-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+    // CK21-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+    // CK21-DAG: [[CBPVAL0]] = bitcast [[ST]]* [[VAR0:%.+]] to i8*
+    // CK21-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+    // CK21-DAG: [[SEC0]] = getelementptr {{.*}}[[ST]]* [[VAR0]], i{{.+}} 0, i{{.+}} 0
+
+    // CK21-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+    // CK21-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+    // CK21-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+    // CK21-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+    // CK21-DAG: [[CBPVAL1]] = bitcast [[ST]]* [[VAR1:%.+]] to i8*
+    // CK21-DAG: [[CPVAL1]] = bitcast i32* [[SEC1:%.+]] to i8*
+    // CK21-DAG: [[SEC1]] = getelementptr {{.*}}[[ST]]* [[VAR0]], i{{.+}} 0, i{{.+}} 1
+
+    // CK21: call void [[CALL05:@.+]]([[ST]]* {{[^,]+}})
+    #pragma omp target map(A, A2)
+    {
+      A += 1;
+      A2 += 1;
+    }
+    return A;
+  }
+};
+
+int explicit_maps_template_args_and_members(int a){
+  CC<123,int> c;
+  return c.foo(a);
+}
+
+// CK21: define {{.+}}[[CALL00]]
+// CK21: define {{.+}}[[CALL01]]
+// CK21: define {{.+}}[[CALL02]]
+// CK21: define {{.+}}[[CALL03]]
+// CK21: define {{.+}}[[CALL04]]
+// CK21: define {{.+}}[[CALL05]]
+#endif
+///==========================================================================///
+// RUN: %clang_cc1 -DCK22 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK22 --check-prefix CK22-64
+// RUN: %clang_cc1 -DCK22 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK22 --check-prefix CK22-64
+// RUN: %clang_cc1 -DCK22 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK22 --check-prefix CK22-32
+// RUN: %clang_cc1 -DCK22 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK22 --check-prefix CK22-32
+#ifdef CK22
+
+// CK22-DAG: [[ST:%.+]] = type { float }
+// CK22-DAG: [[STT:%.+]] = type { i32 }
+
+// CK22: [[SIZE00:@.+]] = private {{.*}}constant [1 x i[[Z:64|32]]] [i[[Z:64|32]] 4]
+// CK22: [[MTYPE00:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK22: [[SIZE01:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 400]
+// CK22: [[MTYPE01:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK22: [[SIZE02:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] {{8|4}}]
+// CK22: [[MTYPE02:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK22: [[SIZE03:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 16]
+// CK22: [[MTYPE03:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK22: [[SIZE04:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 20]
+// CK22: [[MTYPE04:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK22: [[SIZE05:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 4]
+// CK22: [[MTYPE05:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK22: [[SIZE06:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 400]
+// CK22: [[MTYPE06:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK22: [[SIZE07:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] {{8|4}}]
+// CK22: [[MTYPE07:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK22: [[SIZE08:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 16]
+// CK22: [[MTYPE08:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK22: [[SIZE09:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 20]
+// CK22: [[MTYPE09:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK22: [[SIZE10:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 4]
+// CK22: [[MTYPE10:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK22: [[SIZE11:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 400]
+// CK22: [[MTYPE11:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK22: [[SIZE12:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] {{8|4}}]
+// CK22: [[MTYPE12:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK22: [[SIZE13:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 16]
+// CK22: [[MTYPE13:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK22: [[SIZE14:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 20]
+// CK22: [[MTYPE14:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+int a;
+int c[100];
+int *d;
+
+struct ST {
+  float fa;
+};
+
+ST sa ;
+ST sc[100];
+ST *sd;
+
+template<typename T>
+struct STT {
+  T fa;
+};
+
+STT<int> sta ;
+STT<int> stc[100];
+STT<int> *std;
+
+// CK22-LABEL: explicit_maps_globals
+int explicit_maps_globals(void){
+  // Region 00
+  // CK22-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE00]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE00]]{{.+}})
+  // CK22-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK22-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK22-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: store i8* bitcast (i32* @a to i8*), i8** [[BP0]]
+  // CK22-DAG: store i8* bitcast (i32* @a to i8*), i8** [[P0]]
+
+  // CK22: call void [[CALL00:@.+]](i32* {{[^,]+}})
+  #pragma omp target map(a)
+  { a+=1; }
+  
+  // Region 01
+  // CK22-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE01]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE01]]{{.+}})
+  // CK22-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK22-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK22-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: store i8* bitcast ([100 x i32]* @c to i8*), i8** [[BP0]]
+  // CK22-DAG: store i8* bitcast ([100 x i32]* @c to i8*), i8** [[P0]]
+
+  // CK22: call void [[CALL01:@.+]]([100 x i32]* {{[^,]+}})
+  #pragma omp target map(c)
+  { c[3]+=1; }
+  
+  // Region 02
+  // CK22-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE02]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE02]]{{.+}})
+  // CK22-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK22-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK22-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: store i8* bitcast (i32** @d to i8*), i8** [[BP0]]
+  // CK22-DAG: store i8* bitcast (i32** @d to i8*), i8** [[P0]]
+
+  // CK22: call void [[CALL02:@.+]](i32** {{[^,]+}})
+  #pragma omp target map(d)
+  { d[3]+=1; }
+    
+  // Region 03
+  // CK22-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE03]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE03]]{{.+}})
+  // CK22-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK22-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK22-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: store i8* bitcast ([100 x i32]* @c to i8*), i8** [[BP0]]
+  // CK22-DAG: store i8* bitcast (i32* getelementptr inbounds ([100 x i32], [100 x i32]* @c, i{{.+}} 0, i{{.+}} 1) to i8*), i8** [[P0]]
+
+  // CK22: call void [[CALL03:@.+]]([100 x i32]* {{[^,]+}})
+  #pragma omp target map(c[1:4])
+  { c[3]+=1; }
+  
+  // Region 04
+  // CK22-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE04]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE04]]{{.+}})
+  // CK22-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK22-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK22-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK22-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK22-DAG: [[CBPVAL0]] = bitcast i32* [[RVAR0:%.+]] to i8*
+  // CK22-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+  // CK22-DAG: [[RVAR0]] = load i32*, i32** @d
+  // CK22-DAG: [[SEC0]] = getelementptr {{.*}}i32* [[RVAR00:%.+]], i{{.+}} 2
+  // CK22-DAG: [[RVAR00]] = load i32*, i32** @d
+
+  // CK22: call void [[CALL04:@.+]](i32* {{[^,]+}})
+  #pragma omp target map(d[2:5])
+  { d[3]+=1; }
+  
+  // Region 05
+  // CK22-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE05]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE05]]{{.+}})
+  // CK22-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK22-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK22-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: store i8* bitcast ([[ST]]* @sa to i8*), i8** [[BP0]]
+  // CK22-DAG: store i8* bitcast ([[ST]]* @sa to i8*), i8** [[P0]]
+
+  // CK22: call void [[CALL05:@.+]]([[ST]]* {{[^,]+}})
+  #pragma omp target map(sa)
+  { sa.fa+=1; }
+  
+  // Region 06
+  // CK22-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE06]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE06]]{{.+}})
+  // CK22-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK22-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK22-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: store i8* bitcast ([100 x [[ST]]]* @sc to i8*), i8** [[BP0]]
+  // CK22-DAG: store i8* bitcast ([100 x [[ST]]]* @sc to i8*), i8** [[P0]]
+
+  // CK22: call void [[CALL06:@.+]]([100 x [[ST]]]* {{[^,]+}})
+  #pragma omp target map(sc)
+  { sc[3].fa+=1; }
+  
+  // Region 07
+  // CK22-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE07]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE07]]{{.+}})
+  // CK22-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK22-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK22-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: store i8* bitcast ([[ST]]** @sd to i8*), i8** [[BP0]]
+  // CK22-DAG: store i8* bitcast ([[ST]]** @sd to i8*), i8** [[P0]]
+
+  // CK22: call void [[CALL07:@.+]]([[ST]]** {{[^,]+}})
+  #pragma omp target map(sd)
+  { sd[3].fa+=1; }
+  
+  // Region 08
+  // CK22-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE08]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE08]]{{.+}})
+  // CK22-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK22-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK22-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: store i8* bitcast ([100 x [[ST]]]* @sc to i8*), i8** [[BP0]]
+  // CK22-DAG: store i8* bitcast ([[ST]]* getelementptr inbounds ([100 x [[ST]]], [100 x [[ST]]]* @sc, i{{.+}} 0, i{{.+}} 1) to i8*), i8** [[P0]]
+
+  // CK22: call void [[CALL08:@.+]]([100 x [[ST]]]* {{[^,]+}})
+  #pragma omp target map(sc[1:4])
+  { sc[3].fa+=1; }
+  
+  // Region 09
+  // CK22-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE09]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE09]]{{.+}})
+  // CK22-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK22-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK22-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK22-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK22-DAG: [[CBPVAL0]] = bitcast [[ST]]* [[RVAR0:%.+]] to i8*
+  // CK22-DAG: [[CPVAL0]] = bitcast [[ST]]* [[SEC0:%.+]] to i8*
+  // CK22-DAG: [[RVAR0]] = load [[ST]]*, [[ST]]** @sd
+  // CK22-DAG: [[SEC0]] = getelementptr {{.*}}[[ST]]* [[RVAR00:%.+]], i{{.+}} 2
+  // CK22-DAG: [[RVAR00]] = load [[ST]]*, [[ST]]** @sd
+
+  // CK22: call void [[CALL09:@.+]]([[ST]]* {{[^,]+}})
+  #pragma omp target map(sd[2:5])
+  { sd[3].fa+=1; }
+  
+  // Region 10
+  // CK22-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE10]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE10]]{{.+}})
+  // CK22-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK22-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK22-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: store i8* bitcast ([[STT]]* @sta to i8*), i8** [[BP0]]
+  // CK22-DAG: store i8* bitcast ([[STT]]* @sta to i8*), i8** [[P0]]
+
+  // CK22: call void [[CALL10:@.+]]([[STT]]* {{[^,]+}})
+  #pragma omp target map(sta)
+  { sta.fa+=1; }
+  
+  // Region 11
+  // CK22-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE11]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE11]]{{.+}})
+  // CK22-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK22-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK22-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: store i8* bitcast ([100 x [[STT]]]* @stc to i8*), i8** [[BP0]]
+  // CK22-DAG: store i8* bitcast ([100 x [[STT]]]* @stc to i8*), i8** [[P0]]
+
+  // CK22: call void [[CALL11:@.+]]([100 x [[STT]]]* {{[^,]+}})
+  #pragma omp target map(stc)
+  { stc[3].fa+=1; }
+  
+  // Region 12
+  // CK22-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE12]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE12]]{{.+}})
+  // CK22-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK22-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK22-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: store i8* bitcast ([[STT]]** @std to i8*), i8** [[BP0]]
+  // CK22-DAG: store i8* bitcast ([[STT]]** @std to i8*), i8** [[P0]]
+
+  // CK22: call void [[CALL12:@.+]]([[STT]]** {{[^,]+}})
+  #pragma omp target map(std)
+  { std[3].fa+=1; }
+  
+  // Region 13
+  // CK22-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE13]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE13]]{{.+}})
+  // CK22-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK22-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK22-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: store i8* bitcast ([100 x [[STT]]]* @stc to i8*), i8** [[BP0]]
+  // CK22-DAG: store i8* bitcast ([[STT]]* getelementptr inbounds ([100 x [[STT]]], [100 x [[STT]]]* @stc, i{{.+}} 0, i{{.+}} 1) to i8*), i8** [[P0]]
+
+  // CK22: call void [[CALL13:@.+]]([100 x [[STT]]]* {{[^,]+}})
+  #pragma omp target map(stc[1:4])
+  { stc[3].fa+=1; }
+  
+  // Region 14
+  // CK22-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE14]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE14]]{{.+}})
+  // CK22-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK22-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK22-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK22-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK22-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK22-DAG: [[CBPVAL0]] = bitcast [[STT]]* [[RVAR0:%.+]] to i8*
+  // CK22-DAG: [[CPVAL0]] = bitcast [[STT]]* [[SEC0:%.+]] to i8*
+  // CK22-DAG: [[RVAR0]] = load [[STT]]*, [[STT]]** @std
+  // CK22-DAG: [[SEC0]] = getelementptr {{.*}}[[STT]]* [[RVAR00:%.+]], i{{.+}} 2
+  // CK22-DAG: [[RVAR00]] = load [[STT]]*, [[STT]]** @std
+
+  // CK22: call void [[CALL14:@.+]]([[STT]]* {{[^,]+}})
+  #pragma omp target map(std[2:5])
+  { std[3].fa+=1; }
+
+  return 0;
+}
+// CK22: define {{.+}}[[CALL00]]
+// CK22: define {{.+}}[[CALL01]]
+// CK22: define {{.+}}[[CALL02]]
+// CK22: define {{.+}}[[CALL03]]
+// CK22: define {{.+}}[[CALL04]]
+// CK22: define {{.+}}[[CALL05]]
+// CK22: define {{.+}}[[CALL06]]
+// CK22: define {{.+}}[[CALL07]]
+// CK22: define {{.+}}[[CALL08]]
+// CK22: define {{.+}}[[CALL09]]
+// CK22: define {{.+}}[[CALL10]]
+// CK22: define {{.+}}[[CALL11]]
+// CK22: define {{.+}}[[CALL12]]
+// CK22: define {{.+}}[[CALL13]]
+// CK22: define {{.+}}[[CALL14]]
+#endif
+///==========================================================================///
+// RUN: %clang_cc1 -std=c++11 -DCK23 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK23 --check-prefix CK23-64
+// RUN: %clang_cc1 -std=c++11 -DCK23 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -std=c++11 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK23 --check-prefix CK23-64
+// RUN: %clang_cc1 -std=c++11 -DCK23 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK23 --check-prefix CK23-32
+// RUN: %clang_cc1 -std=c++11 -DCK23 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -std=c++11 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK23 --check-prefix CK23-32
+#ifdef CK23
+
+// CK23: [[SIZE00:@.+]] = private {{.*}}constant [1 x i[[Z:64|32]]] [i[[Z:64|32]] 4]
+// CK23: [[MTYPE00:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK23: [[SIZE01:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 4]
+// CK23: [[MTYPE01:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK23: [[SIZE02:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 400]
+// CK23: [[MTYPE02:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK23: [[SIZE03:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] {{8|4}}]
+// CK23: [[MTYPE03:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK23: [[SIZE04:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 16]
+// CK23: [[MTYPE04:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK23: [[SIZE05:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 16]
+// CK23: [[MTYPE05:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK23-LABEL: explicit_maps_inside_captured
+int explicit_maps_inside_captured(int a){
+  float b;
+  float c[100];
+  float *d;
+  
+  // CK23: call void @{{.*}}explicit_maps_inside_captured{{.*}}([[SA:%.+]]* {{.*}}) 
+  // CK23: define {{.*}}explicit_maps_inside_captured{{.*}}
+  [&](void){
+    // Region 00
+    // CK23-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE00]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE00]]{{.+}})
+    // CK23-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+    // CK23-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+    // CK23-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+    // CK23-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+    // CK23-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+    // CK23-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+    // CK23-DAG: [[CBPVAL0]] = bitcast i32* [[VAR0:%.+]] to i8*
+    // CK23-DAG: [[CPVAL0]] = bitcast i32* [[VAR00:%.+]] to i8*
+    // CK23-DAG: [[VAR0]] = load i32*, i32** [[CAP0:%[^,]+]]
+    // CK23-DAG: [[CAP0]] = getelementptr inbounds [[SA]], [[SA]]
+    // CK23-DAG: [[VAR00]] = load i32*, i32** [[CAP00:%[^,]+]]
+    // CK23-DAG: [[CAP00]] = getelementptr inbounds [[SA]], [[SA]]
+
+    // CK23: call void [[CALL00:@.+]](i32* {{[^,]+}})
+    #pragma omp target map(a)
+      { a+=1; }
+    // Region 01
+    // CK23-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE01]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE01]]{{.+}})
+    // CK23-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+    // CK23-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+    // CK23-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+    // CK23-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+    // CK23-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+    // CK23-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+    // CK23-DAG: [[CBPVAL0]] = bitcast float* [[VAR0:%.+]] to i8*
+    // CK23-DAG: [[CPVAL0]] = bitcast float* [[VAR00:%.+]] to i8*
+    // CK23-DAG: [[VAR0]] = load float*, float** [[CAP0:%[^,]+]]
+    // CK23-DAG: [[CAP0]] = getelementptr inbounds [[SA]], [[SA]]
+    // CK23-DAG: [[VAR00]] = load float*, float** [[CAP00:%[^,]+]]
+    // CK23-DAG: [[CAP00]] = getelementptr inbounds [[SA]], [[SA]]
+
+    // CK23: call void [[CALL01:@.+]](float* {{[^,]+}})
+    #pragma omp target map(b)
+      { b+=1; }
+    // Region 02
+    // CK23-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE02]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE02]]{{.+}})
+    // CK23-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+    // CK23-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+    // CK23-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+    // CK23-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+    // CK23-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+    // CK23-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+    // CK23-DAG: [[CBPVAL0]] = bitcast [100 x float]* [[VAR0:%.+]] to i8*
+    // CK23-DAG: [[CPVAL0]] = bitcast [100 x float]* [[VAR00:%.+]] to i8*
+    // CK23-DAG: [[VAR0]] = load [100 x float]*, [100 x float]** [[CAP0:%[^,]+]]
+    // CK23-DAG: [[CAP0]] = getelementptr inbounds [[SA]], [[SA]]
+    // CK23-DAG: [[VAR00]] = load [100 x float]*, [100 x float]** [[CAP00:%[^,]+]]
+    // CK23-DAG: [[CAP00]] = getelementptr inbounds [[SA]], [[SA]]
+
+    // CK23: call void [[CALL02:@.+]]([100 x float]* {{[^,]+}})
+    #pragma omp target map(c)
+      { c[3]+=1; }
+      
+    // Region 03
+    // CK23-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE03]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE03]]{{.+}})
+    // CK23-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+    // CK23-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+    // CK23-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+    // CK23-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+    // CK23-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+    // CK23-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+    // CK23-DAG: [[CBPVAL0]] = bitcast float** [[VAR0:%.+]] to i8*
+    // CK23-DAG: [[CPVAL0]] = bitcast float** [[VAR00:%.+]] to i8*
+    // CK23-DAG: [[VAR0]] = load float**, float*** [[CAP0:%[^,]+]]
+    // CK23-DAG: [[CAP0]] = getelementptr inbounds [[SA]], [[SA]]
+    // CK23-DAG: [[VAR00]] = load float**, float*** [[CAP00:%[^,]+]]
+    // CK23-DAG: [[CAP00]] = getelementptr inbounds [[SA]], [[SA]]
+
+    // CK23: call void [[CALL03:@.+]](float** {{[^,]+}})
+    #pragma omp target map(d)
+      { d[3]+=1; }
+    // Region 04
+    // CK23-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE04]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE04]]{{.+}})
+    // CK23-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+    // CK23-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+    // CK23-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+    // CK23-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+    // CK23-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+    // CK23-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+    // CK23-DAG: [[CBPVAL0]] = bitcast [100 x float]* [[VAR0:%.+]] to i8*
+    // CK23-DAG: [[CPVAL0]] = bitcast float* [[SEC0:%.+]] to i8*
+    // CK23-DAG: [[SEC0]] = getelementptr {{.*}}[100 x float]* [[VAR00:%.+]], i{{.+}} 0, i{{.+}} 2
+    // CK23-DAG: [[VAR0]] = load [100 x float]*, [100 x float]** [[CAP0:%[^,]+]]
+    // CK23-DAG: [[CAP0]] = getelementptr inbounds [[SA]], [[SA]]
+    // CK23-DAG: [[VAR00]] = load [100 x float]*, [100 x float]** [[CAP00:%[^,]+]]
+    // CK23-DAG: [[CAP00]] = getelementptr inbounds [[SA]], [[SA]]
+    
+    // CK23: call void [[CALL04:@.+]]([100 x float]* {{[^,]+}})
+    #pragma omp target map(c[2:4])
+      { c[3]+=1; }
+      
+    // Region 05
+    // CK23-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE05]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE05]]{{.+}})
+    // CK23-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+    // CK23-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+    // CK23-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+    // CK23-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+    // CK23-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+    // CK23-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+    // CK23-DAG: [[CBPVAL0]] = bitcast float* [[RVAR0:%.+]] to i8*
+    // CK23-DAG: [[CPVAL0]] = bitcast float* [[SEC0:%.+]] to i8*
+    // CK23-DAG: [[RVAR0]] = load float*, float** [[VAR0:%[^,]+]]
+    // CK23-DAG: [[SEC0]] = getelementptr {{.*}}float* [[RVAR00:%.+]], i{{.+}} 2
+    // CK23-DAG: [[RVAR00]] = load float*, float** [[VAR00:%[^,]+]]
+    // CK23-DAG: [[VAR0]] = load float**, float*** [[CAP0:%[^,]+]]
+    // CK23-DAG: [[CAP0]] = getelementptr inbounds [[SA]], [[SA]]
+    // CK23-DAG: [[VAR00]] = load float**, float*** [[CAP00:%[^,]+]]
+    // CK23-DAG: [[CAP00]] = getelementptr inbounds [[SA]], [[SA]]
+        
+    // CK23: call void [[CALL05:@.+]](float* {{[^,]+}})
+    #pragma omp target map(d[2:4])
+      { d[3]+=1; }
+  }();
+  return b;
+}
+
+// CK23: define {{.+}}[[CALL00]]
+// CK23: define {{.+}}[[CALL01]]
+// CK23: define {{.+}}[[CALL02]]
+// CK23: define {{.+}}[[CALL03]]
+// CK23: define {{.+}}[[CALL04]]
+// CK23: define {{.+}}[[CALL05]]
+#endif
+///==========================================================================///
+// RUN: %clang_cc1 -DCK24 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK24 --check-prefix CK24-64
+// RUN: %clang_cc1 -DCK24 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK24 --check-prefix CK24-64
+// RUN: %clang_cc1 -DCK24 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK24 --check-prefix CK24-32
+// RUN: %clang_cc1 -DCK24 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK24 --check-prefix CK24-32
+#ifdef CK24
+
+// CK24-DAG: [[SC:%.+]] = type { i32, [[SB:%.+]], [[SB:%.+]]*, [10 x i32] }
+// CK24-DAG: [[SB]] = type { i32, [[SA:%.+]], [10 x [[SA:%.+]]], [10 x [[SA:%.+]]*], [[SA:%.+]]* }
+// CK24-DAG: [[SA]] = type { i32, [[SA]]*, [10 x i32] }
+
+struct SA{
+  int a;
+  struct SA *p;
+  int b[10];
+};
+struct SB{
+  int a;
+  struct SA s;
+  struct SA sa[10];
+  struct SA *sp[10];
+  struct SA *p;
+};
+struct SC{
+  int a;
+  struct SB s;
+  struct SB *p;
+  int b[10];
+};
+
+// CK24: [[SIZE01:@.+]] = private {{.*}}constant [1 x i[[Z:64|32]]] [i[[Z:64|32]] 4]
+// CK24: [[MTYPE01:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK24: [[SIZE02:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] {{56|48}}]
+// CK24: [[MTYPE02:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK24: [[SIZE03:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 4]
+// CK24: [[MTYPE03:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK24: [[SIZE04:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 20]
+// CK24: [[MTYPE04:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK24: [[SIZE05:@.+]] = private {{.*}}constant [2 x i[[Z]]] [i[[Z]] {{8|4}}, i[[Z]] {{3560|2880}}]
+// CK24: [[MTYPE05:@.+]] = private {{.*}}constant [2 x i32] [i32 35, i32 19]
+
+// CK24: [[SIZE06:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 4]
+// CK24: [[MTYPE06:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK24: [[SIZE07:@.+]] = private {{.*}}constant [2 x i[[Z]]] [i[[Z]] {{8|4}}, i[[Z]] 4]
+// CK24: [[MTYPE07:@.+]] = private {{.*}}constant [2 x i32] [i32 35, i32 19]
+
+// CK24: [[SIZE08:@.+]] = private {{.*}}constant [2 x i[[Z]]] [i[[Z]] {{8|4}}, i[[Z]] 4]
+// CK24: [[MTYPE08:@.+]] = private {{.*}}constant [2 x i32] [i32 35, i32 19]
+
+// CK24: [[SIZE09:@.+]] = private {{.*}}constant [2 x i[[Z]]] [i[[Z]] {{8|4}}, i[[Z]] 4]
+// CK24: [[MTYPE09:@.+]] = private {{.*}}constant [2 x i32] [i32 35, i32 19]
+
+// CK24: [[SIZE10:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 8]
+// CK24: [[MTYPE10:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK24: [[SIZE11:@.+]] = private {{.*}}constant [2 x i[[Z]]] [i[[Z]] {{8|4}}, i[[Z]] {{8|4}}]
+// CK24: [[MTYPE11:@.+]] = private {{.*}}constant [2 x i32] [i32 35, i32 19]
+
+// CK24: [[SIZE12:@.+]] = private {{.*}}constant [4 x i[[Z]]] [i[[Z]] {{8|4}}, i[[Z]] {{8|4}}, i[[Z]] {{8|4}}, i[[Z]] 4]
+// CK24: [[MTYPE12:@.+]] = private {{.*}}constant [4 x i32] [i32 35, i32 19, i32 19, i32 19]
+
+// CK24: [[SIZE13:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 4]
+// CK24: [[MTYPE13:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK24: [[SIZE14:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] {{56|48}}]
+// CK24: [[MTYPE14:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK24: [[SIZE15:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 4]
+// CK24: [[MTYPE15:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK24: [[SIZE16:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 20]
+// CK24: [[MTYPE16:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK24: [[SIZE17:@.+]] = private {{.*}}constant [2 x i[[Z]]] [i[[Z]] {{8|4}}, i[[Z]] {{3560|2880}}]
+// CK24: [[MTYPE17:@.+]] = private {{.*}}constant [2 x i32] [i32 35, i32 19]
+
+// CK24: [[SIZE18:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 4]
+// CK24: [[MTYPE18:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK24: [[SIZE19:@.+]] = private {{.*}}constant [2 x i[[Z]]] [i[[Z]] {{8|4}}, i[[Z]] 4]
+// CK24: [[MTYPE19:@.+]] = private {{.*}}constant [2 x i32] [i32 35, i32 19]
+
+// CK24: [[SIZE20:@.+]] = private {{.*}}constant [2 x i[[Z]]] [i[[Z]] {{8|4}}, i[[Z]] 4]
+// CK24: [[MTYPE20:@.+]] = private {{.*}}constant [2 x i32] [i32 35, i32 19]
+
+// CK24: [[SIZE21:@.+]] = private {{.*}}constant [2 x i[[Z]]] [i[[Z]] {{8|4}}, i[[Z]] 4]
+// CK24: [[MTYPE21:@.+]] = private {{.*}}constant [2 x i32] [i32 35, i32 19]
+
+// CK24: [[SIZE22:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] {{8|4}}]
+// CK24: [[MTYPE22:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK24: [[SIZE23:@.+]] = private {{.*}}constant [2 x i[[Z]]] [i[[Z]] {{8|4}}, i[[Z]] {{8|4}}]
+// CK24: [[MTYPE23:@.+]] = private {{.*}}constant [2 x i32] [i32 35, i32 19]
+
+// CK24: [[SIZE24:@.+]] = private {{.*}}constant [4 x i[[Z]]] [i[[Z]] {{8|4}}, i[[Z]] {{8|4}}, i[[Z]] {{8|4}}, i[[Z]] 4]
+// CK24: [[MTYPE24:@.+]] = private {{.*}}constant [4 x i32] [i32 35, i32 19, i32 19, i32 19]
+
+// CK24-LABEL: explicit_maps_struct_fields
+int explicit_maps_struct_fields(int a){
+  SC s;
+  SC *p;
+
+// Region 01
+// CK24-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE01]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE01]]{{.+}})
+// CK24-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+// CK24-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+// CK24-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+// CK24-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+// CK24-DAG: [[CBPVAL0]] = bitcast [[SC]]* [[VAR0:%.+]] to i8*
+// CK24-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+// CK24-DAG: [[SEC0]] = getelementptr {{.*}}[[SC]]* [[VAR0]], i{{.+}} 0, i{{.+}} 0
+
+// CK24: call void [[CALL01:@.+]]([[SC]]* {{[^,]+}})
+#pragma omp target map(s.a)
+  { s.a++; }
+  
+// Region 02
+// CK24-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE02]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE02]]{{.+}})
+// CK24-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+// CK24-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+// CK24-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+// CK24-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+// CK24-DAG: [[CBPVAL0]] = bitcast [[SC]]* [[VAR0:%.+]] to i8*
+// CK24-DAG: [[CPVAL0]] = bitcast [[SA]]* [[SEC0:%.+]] to i8*
+// CK24-DAG: [[SEC0]] = getelementptr {{.*}}[[SB]]* [[SEC00:%[^,]+]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: [[SEC00]] = getelementptr {{.*}}[[SC]]* [[VAR0]], i{{.+}} 0, i{{.+}} 1
+
+// CK24: call void [[CALL02:@.+]]([[SC]]* {{[^,]+}})
+#pragma omp target map(s.s.s)
+  { s.a++; }
+  
+// Region 03
+// CK24-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE03]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE03]]{{.+}})
+// CK24-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+// CK24-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+// CK24-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+// CK24-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+// CK24-DAG: [[CBPVAL0]] = bitcast [[SC]]* [[VAR0:%.+]] to i8*
+// CK24-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+// CK24-DAG: [[SEC0]] = getelementptr {{.*}}[[SA]]* [[SEC00:%[^,]+]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[SEC00]] = getelementptr {{.*}}[[SB]]* [[SEC000:%[^,]+]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: [[SEC000]] = getelementptr {{.*}}[[SC]]* [[VAR0]], i{{.+}} 0, i{{.+}} 1
+
+// CK24: call void [[CALL03:@.+]]([[SC]]* {{[^,]+}})
+#pragma omp target map(s.s.s.a)
+  { s.a++; }
+
+// Region 04
+// CK24-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE04]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE04]]{{.+}})
+// CK24-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+// CK24-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+// CK24-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+// CK24-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+// CK24-DAG: [[CBPVAL0]] = bitcast [[SC]]* [[VAR0:%.+]] to i8*
+// CK24-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+// CK24-DAG: [[SEC0]] = getelementptr {{.*}}[10 x i32]* [[SEC00:%[^,]+]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[SEC00]] = getelementptr {{.*}}[[SC]]* [[VAR0]], i{{.+}} 0, i{{.+}} 3
+
+// CK24: call void [[CALL04:@.+]]([[SC]]* {{[^,]+}})
+#pragma omp target map(s.b[:5])
+  { s.a++; }
+  
+// Region 05
+// CK24-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE05]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE05]]{{.+}})
+// CK24-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+// CK24-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+// CK24-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+// CK24-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+// CK24-DAG: [[CBPVAL0]] = bitcast [[SC]]* [[VAR0:%.+]] to i8*
+// CK24-DAG: [[CPVAL0]] = bitcast [[SB]]** [[SEC0:%.+]] to i8*
+// CK24-DAG: [[SEC0]] = getelementptr {{.*}}[[SC]]* [[VAR0]], i{{.+}} 0, i{{.+}} 2
+
+// CK24-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+// CK24-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+// CK24-DAG: [[CBPVAL1]] = bitcast [[SB]]** [[SEC0]] to i8*
+// CK24-DAG: [[CPVAL1]] = bitcast [[SB]]* [[SEC1:%.+]] to i8*
+// CK24-DAG: [[SEC1]] = getelementptr {{.*}}[[SB]]* [[SEC11:%[^,]+]], i{{.+}} 0
+// CK24-DAG: [[SEC11]] = load [[SB]]*, [[SB]]** [[SEC111:%[^,]+]],
+// CK24-DAG: [[SEC111]] = getelementptr {{.*}}[[SC]]* [[VAR0]], i{{.+}} 0, i{{.+}} 2
+
+// CK24: call void [[CALL05:@.+]]([[SC]]* {{[^,]+}})  
+#pragma omp target map(s.p[:5])
+  { s.a++; }
+
+// Region 06
+// CK24-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE06]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE06]]{{.+}})
+// CK24-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+// CK24-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+// CK24-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+// CK24-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+// CK24-DAG: [[CBPVAL0]] = bitcast [[SC]]* [[VAR0:%.+]] to i8*
+// CK24-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+// CK24-DAG: [[SEC0]] = getelementptr {{.*}}[[SA]]* [[SEC00:%[^,]+]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[SEC00]] = getelementptr {{.*}}[10 x [[SA]]]* [[SEC000:%[^,]+]], i{{.+}} 0, i{{.+}} 3
+// CK24-DAG: [[SEC000]] = getelementptr {{.*}}[[SB]]* [[SEC0000:%[^,]+]], i{{.+}} 0, i{{.+}} 2
+// CK24-DAG: [[SEC0000]] = getelementptr {{.*}}[[SC]]* [[VAR0]], i{{.+}} 0, i{{.+}} 1
+
+// CK24: call void [[CALL06:@.+]]([[SC]]* {{[^,]+}})
+#pragma omp target map(s.s.sa[3].a)
+  { s.a++; }
+  
+// Region 07
+// CK24-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE07]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE07]]{{.+}})
+// CK24-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+// CK24-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+// CK24-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+// CK24-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+// CK24-DAG: [[CBPVAL0]] = bitcast [[SC]]* [[VAR0:%.+]] to i8*
+// CK24-DAG: [[CPVAL0]] = bitcast [[SA]]** [[SEC0:%.+]] to i8*
+// CK24-DAG: [[SEC0]] = getelementptr {{.*}}[10 x [[SA]]*]* [[SEC00:%[^,]+]], i{{.+}} 0, i{{.+}} 3
+// CK24-DAG: [[SEC00]] = getelementptr {{.*}}[[SB]]* [[SEC000:%[^,]+]], i{{.+}} 0, i{{.+}} 3
+// CK24-DAG: [[SEC000]] = getelementptr {{.*}}[[SC]]* [[VAR0]], i{{.+}} 0, i{{.+}} 1
+
+// CK24-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+// CK24-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+// CK24-DAG: [[CBPVAL1]] = bitcast [[SA]]** [[SEC0]] to i8*
+// CK24-DAG: [[CPVAL1]] = bitcast i32* [[SEC1:%.+]] to i8*
+// CK24-DAG: [[SEC1]] = getelementptr {{.*}}[[SA]]* [[SEC11:%[^,]+]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[SEC11]] = load [[SA]]*, [[SA]]** [[SEC111:%[^,]+]],
+// CK24-DAG: [[SEC111]] = getelementptr {{.*}}[10 x [[SA]]*]* [[SEC1111:%[^,]+]], i{{.+}} 0, i{{.+}} 3
+// CK24-DAG: [[SEC1111]] = getelementptr {{.*}}[[SB]]* [[SEC11111:%[^,]+]], i{{.+}} 0, i{{.+}} 3
+// CK24-DAG: [[SEC11111]] = getelementptr {{.*}}[[SC]]* [[VAR0]], i{{.+}} 0, i{{.+}} 1
+
+// CK24: call void [[CALL07:@.+]]([[SC]]* {{[^,]+}}) 
+#pragma omp target map(s.s.sp[3]->a)
+  { s.a++; }
+
+// Region 08
+// CK24-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE08]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE08]]{{.+}})
+// CK24-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+// CK24-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+// CK24-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+// CK24-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+// CK24-DAG: [[CBPVAL0]] = bitcast [[SC]]* [[VAR0:%.+]] to i8*
+// CK24-DAG: [[CPVAL0]] = bitcast [[SB]]** [[SEC0:%.+]] to i8*
+// CK24-DAG: [[SEC0]] = getelementptr {{.*}}[[SC]]* [[VAR0]], i{{.+}} 0, i{{.+}} 2
+
+// CK24-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+// CK24-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+// CK24-DAG: [[CBPVAL1]] = bitcast [[SB]]** [[SEC0]] to i8*
+// CK24-DAG: [[CPVAL1]] = bitcast i32* [[SEC1:%.+]] to i8*
+// CK24-DAG: [[SEC1]] = getelementptr {{.*}}[[SB]]* [[SEC11:%[^,]+]], i{{.+}} 0
+// CK24-DAG: [[SEC11]] = load [[SB]]*, [[SB]]** [[SEC111:%[^,]+]],
+// CK24-DAG: [[SEC111]] = getelementptr {{.*}}[[SC]]* [[VAR0]], i{{.+}} 0, i{{.+}} 2
+
+// CK24: call void [[CALL08:@.+]]([[SC]]* {{[^,]+}}) 
+#pragma omp target map(s.p->a)
+  { s.a++; }
+  
+// Region 09
+// CK24-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE09]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE09]]{{.+}})
+// CK24-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+// CK24-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+// CK24-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+// CK24-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+// CK24-DAG: [[CBPVAL0]] = bitcast [[SC]]* [[VAR0:%.+]] to i8*
+// CK24-DAG: [[CPVAL0]] = bitcast [[SA]]** [[SEC0:%.+]] to i8*
+// CK24-DAG: [[SEC0]] = getelementptr {{.*}}[[SB]]* [[SEC00:[^,]+]], i{{.+}} 0, i{{.+}} 4
+// CK24-DAG: [[SEC00]] = getelementptr {{.*}}[[SC]]* [[VAR0]], i{{.+}} 0, i{{.+}} 1
+
+// CK24-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+// CK24-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+// CK24-DAG: [[CBPVAL1]] = bitcast [[SA]]** [[SEC0]] to i8*
+// CK24-DAG: [[CPVAL1]] = bitcast i32* [[SEC1:%.+]] to i8*
+// CK24-DAG: [[SEC1]] = getelementptr {{.*}}[[SA]]* [[SEC11:%[^,]+]], i{{.+}} 0
+// CK24-DAG: [[SEC11]] = load [[SA]]*, [[SA]]** [[SEC111:%[^,]+]],
+// CK24-DAG: [[SEC111]] = getelementptr {{.*}}[[SB]]* [[SEC1111:[^,]+]], i{{.+}} 0, i{{.+}} 4
+// CK24-DAG: [[SEC1111]] = getelementptr {{.*}}[[SC]]* [[VAR0]], i{{.+}} 0, i{{.+}} 1
+
+// CK24: call void [[CALL09:@.+]]([[SC]]* {{[^,]+}}) 
+#pragma omp target map(s.s.p->a)
+  { s.a++; }
+
+// Region 10
+// CK24-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE10]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE10]]{{.+}})
+// CK24-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+// CK24-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+// CK24-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+// CK24-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+// CK24-DAG: [[CBPVAL0]] = bitcast [[SC]]* [[VAR0:%.+]] to i8*
+// CK24-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+// CK24-DAG: [[SEC0]] = getelementptr {{.*}}[10 x i32]* [[SEC00:%[^,]+]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[SEC00]] = getelementptr {{.*}}[[SA]]* [[SEC000:%[^,]+]], i{{.+}} 0, i{{.+}} 2
+// CK24-DAG: [[SEC000]] = getelementptr {{.*}}[[SB]]* [[SEC0000:%[^,]+]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: [[SEC0000]] = getelementptr {{.*}}[[SC]]* [[VAR0]], i{{.+}} 0, i{{.+}} 1
+
+// CK24: call void [[CALL10:@.+]]([[SC]]* {{[^,]+}})
+#pragma omp target map(s.s.s.b[:2])
+  { s.a++; }
+  
+// Region 11
+// CK24-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE11]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE11]]{{.+}})
+// CK24-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+// CK24-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+// CK24-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+// CK24-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+// CK24-DAG: [[CBPVAL0]] = bitcast [[SC]]* [[VAR0:%.+]] to i8*
+// CK24-DAG: [[CPVAL0]] = bitcast [[SA]]** [[SEC0:%.+]] to i8*
+// CK24-DAG: [[SEC0]] = getelementptr {{.*}}[[SB]]* [[SEC00:%[^,]+]], i{{.+}} 0, i{{.+}} 4
+// CK24-DAG: [[SEC00]] = getelementptr {{.*}}[[SC]]* [[VAR0]], i{{.+}} 0, i{{.+}} 1
+
+// CK24-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+// CK24-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+// CK24-DAG: [[CBPVAL1]] = bitcast [[SA]]** [[SEC0]] to i8*
+// CK24-DAG: [[CPVAL1]] = bitcast i32* [[SEC1:%.+]] to i8*
+// CK24-DAG: [[SEC1]] = getelementptr {{.*}}[10 x i32]* [[SEC11:%[^,]+]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[SEC11]] = getelementptr {{.*}}[[SA]]* [[SEC111:%[^,]+]], i{{.+}} 0, i{{.+}} 2
+// CK24-DAG: [[SEC111]] = load [[SA]]*, [[SA]]** [[SEC1111:%[^,]+]],
+// CK24-DAG: [[SEC1111]] = getelementptr {{.*}}[[SB]]* [[SEC11111:%[^,]+]], i{{.+}} 0, i{{.+}} 4
+// CK24-DAG: [[SEC11111]] = getelementptr {{.*}}[[SC]]* [[VAR0]], i{{.+}} 0, i{{.+}} 1
+
+// CK24: call void [[CALL11:@.+]]([[SC]]* {{[^,]+}}) 
+#pragma omp target map(s.s.p->b[:2])
+  { s.a++; }
+
+// Region 12
+// CK24-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 4, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[4 x i{{.+}}]* [[SIZE12]], {{.+}}getelementptr {{.+}}[4 x i{{.+}}]* [[MTYPE12]]{{.+}})
+// CK24-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+// CK24-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+// CK24-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+// CK24-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+// CK24-DAG: [[CBPVAL0]] = bitcast [[SC]]* [[VAR0:%.+]] to i8*
+// CK24-DAG: [[CPVAL0]] = bitcast [[SB]]** [[SEC0:%.+]] to i8*
+// CK24-DAG: [[SEC0]] = getelementptr {{.*}}[[SC]]* [[VAR0]], i{{.+}} 0, i{{.+}} 2
+
+// CK24-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+// CK24-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+// CK24-DAG: [[CBPVAL1]] = bitcast [[SB]]** [[SEC0]] to i8*
+// CK24-DAG: [[CPVAL1]] = bitcast [[SA]]** [[SEC1:%.+]] to i8*
+// CK24-DAG: [[SEC1]] = getelementptr {{.*}}[[SB]]* [[SEC11:%[^,]+]], i{{.+}} 0, i{{.+}} 4
+// CK24-DAG: [[SEC11]] = load [[SB]]*, [[SB]]** [[SEC111:%[^,]+]],
+// CK24-DAG: [[SEC111]] = getelementptr {{.*}}[[SC]]* [[VAR0]], i{{.+}} 0, i{{.+}} 2
+
+// CK24-DAG: [[BP2:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 2
+// CK24-DAG: [[P2:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 2
+// CK24-DAG: store i8* [[CBPVAL2:%[^,]+]], i8** [[BP2]]
+// CK24-DAG: store i8* [[CPVAL2:%[^,]+]], i8** [[P2]]
+// CK24-DAG: [[CBPVAL2]] = bitcast [[SA]]** [[SEC1]] to i8*
+// CK24-DAG: [[CPVAL2]] = bitcast [[SA]]** [[SEC2:%.+]] to i8*
+// CK24-DAG: [[SEC2]] = getelementptr {{.*}}[[SA]]* [[SEC22:%[^,]+]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: [[SEC22]] = load [[SA]]*, [[SA]]** [[SEC222:%[^,]+]],
+// CK24-DAG: [[SEC222]] = getelementptr {{.*}}[[SB]]* [[SEC2222:%[^,]+]], i{{.+}} 0, i{{.+}} 4
+// CK24-DAG: [[SEC2222]] = load [[SB]]*, [[SB]]** [[SEC22222:%[^,]+]],
+// CK24-DAG: [[SEC22222]] = getelementptr {{.*}}[[SC]]* [[VAR0]], i{{.+}} 0, i{{.+}} 2
+
+// CK24-DAG: [[BP3:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 3
+// CK24-DAG: [[P3:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 3
+// CK24-DAG: store i8* [[CBPVAL3:%[^,]+]], i8** [[BP3]]
+// CK24-DAG: store i8* [[CPVAL3:%[^,]+]], i8** [[P3]]
+// CK24-DAG: [[CBPVAL3]] = bitcast [[SA]]** [[SEC2]] to i8*
+// CK24-DAG: [[CPVAL3]] = bitcast i32* [[SEC3:%.+]] to i8*
+// CK24-DAG: [[SEC3]] = getelementptr {{.*}}[[SA]]* [[SEC33:%[^,]+]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[SEC33]] = load [[SA]]*, [[SA]]** [[SEC333:%[^,]+]],
+// CK24-DAG: [[SEC333]] = getelementptr {{.*}}[[SA]]* [[SEC3333:%[^,]+]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: [[SEC3333]] = load [[SA]]*, [[SA]]** [[SEC33333:%[^,]+]],
+// CK24-DAG: [[SEC33333]] = getelementptr {{.*}}[[SB]]* [[SEC333333:%[^,]+]], i{{.+}} 0, i{{.+}} 4
+// CK24-DAG: [[SEC333333]] = load [[SB]]*, [[SB]]** [[SEC3333333:%[^,]+]],
+// CK24-DAG: [[SEC3333333]] = getelementptr {{.*}}[[SC]]* [[VAR0]], i{{.+}} 0, i{{.+}} 2
+
+// CK24: call void [[CALL12:@.+]]([[SC]]* {{[^,]+}})
+#pragma omp target map(s.p->p->p->a)
+  { s.a++; }
+
+//
+// Same thing but starting from a pointer.
+//
+// Region 13
+// CK24-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE13]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE13]]{{.+}})
+// CK24-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+// CK24-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+// CK24-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+// CK24-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+// CK24-DAG: [[CBPVAL0]] = bitcast [[SC]]* [[VAR0:%.+]] to i8*
+// CK24-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+// CK24-DAG: [[SEC0]] = getelementptr {{.*}}[[SC]]* [[VAR00:%.+]], i{{.+}} 0, i{{.+}} 0
+
+// CK24-DAG: [[VAR0]] = load [[SC]]*, [[SC]]** %{{.+}}
+// CK24-DAG: [[VAR00]] = load [[SC]]*, [[SC]]** %{{.+}}
+
+// CK24: call void [[CALL13:@.+]]([[SC]]* {{[^,]+}})
+#pragma omp target map(p->a)
+  { p->a++; }
+  
+// Region 14
+// CK24-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE14]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE14]]{{.+}})
+// CK24-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+// CK24-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+// CK24-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+// CK24-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+// CK24-DAG: [[CBPVAL0]] = bitcast [[SC]]* [[VAR0:%.+]] to i8*
+// CK24-DAG: [[CPVAL0]] = bitcast [[SA]]* [[SEC0:%.+]] to i8*
+// CK24-DAG: [[SEC0]] = getelementptr {{.*}}[[SB]]* [[SEC00:%[^,]+]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: [[SEC00]] = getelementptr {{.*}}[[SC]]* [[VAR00:%.+]], i{{.+}} 0, i{{.+}} 1
+
+// CK24-DAG: [[VAR0]] = load [[SC]]*, [[SC]]** %{{.+}}
+// CK24-DAG: [[VAR00]] = load [[SC]]*, [[SC]]** %{{.+}}
+
+// CK24: call void [[CALL14:@.+]]([[SC]]* {{[^,]+}})
+#pragma omp target map(p->s.s)
+  { p->a++; }
+  
+// Region 15
+// CK24-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE15]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE15]]{{.+}})
+// CK24-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+// CK24-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+// CK24-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+// CK24-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+// CK24-DAG: [[CBPVAL0]] = bitcast [[SC]]* [[VAR0:%.+]] to i8*
+// CK24-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+// CK24-DAG: [[SEC0]] = getelementptr {{.*}}[[SA]]* [[SEC00:%[^,]+]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[SEC00]] = getelementptr {{.*}}[[SB]]* [[SEC000:%[^,]+]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: [[SEC000]] = getelementptr {{.*}}[[SC]]* [[VAR00:%.+]], i{{.+}} 0, i{{.+}} 1
+
+// CK24-DAG: [[VAR0]] = load [[SC]]*, [[SC]]** %{{.+}}
+// CK24-DAG: [[VAR00]] = load [[SC]]*, [[SC]]** %{{.+}}
+
+// CK24: call void [[CALL15:@.+]]([[SC]]* {{[^,]+}})
+#pragma omp target map(p->s.s.a)
+  { p->a++; }
+
+// Region 16
+// CK24-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE16]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE16]]{{.+}})
+// CK24-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+// CK24-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+// CK24-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+// CK24-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+// CK24-DAG: [[CBPVAL0]] = bitcast [[SC]]* [[VAR0:%.+]] to i8*
+// CK24-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+// CK24-DAG: [[SEC0]] = getelementptr {{.*}}[10 x i32]* [[SEC00:%[^,]+]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[SEC00]] = getelementptr {{.*}}[[SC]]* [[VAR00:%.+]], i{{.+}} 0, i{{.+}} 3
+
+// CK24-DAG: [[VAR0]] = load [[SC]]*, [[SC]]** %{{.+}}
+// CK24-DAG: [[VAR00]] = load [[SC]]*, [[SC]]** %{{.+}}
+
+// CK24: call void [[CALL16:@.+]]([[SC]]* {{[^,]+}})
+#pragma omp target map(p->b[:5])
+  { p->a++; }
+  
+// Region 17
+// CK24-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE17]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE17]]{{.+}})
+// CK24-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+// CK24-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+// CK24-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+// CK24-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+// CK24-DAG: [[CBPVAL0]] = bitcast [[SC]]* [[VAR0:%.+]] to i8*
+// CK24-DAG: [[CPVAL0]] = bitcast [[SB]]** [[SEC0:%.+]] to i8*
+// CK24-DAG: [[SEC0]] = getelementptr {{.*}}[[SC]]* [[VAR00:%.+]], i{{.+}} 0, i{{.+}} 2
+
+// CK24-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+// CK24-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+// CK24-DAG: [[CBPVAL1]] = bitcast [[SB]]** [[SEC0]] to i8*
+// CK24-DAG: [[CPVAL1]] = bitcast [[SB]]* [[SEC1:%.+]] to i8*
+// CK24-DAG: [[SEC1]] = getelementptr {{.*}}[[SB]]* [[SEC11:%[^,]+]], i{{.+}} 0
+// CK24-DAG: [[SEC11]] = load [[SB]]*, [[SB]]** [[SEC111:%[^,]+]],
+// CK24-DAG: [[SEC111]] = getelementptr {{.*}}[[SC]]* [[VAR000:%.+]], i{{.+}} 0, i{{.+}} 2
+
+// CK24-DAG: [[VAR0]] = load [[SC]]*, [[SC]]** %{{.+}}
+// CK24-DAG: [[VAR00]] = load [[SC]]*, [[SC]]** %{{.+}}
+// CK24-DAG: [[VAR000]] = load [[SC]]*, [[SC]]** %{{.+}}
+
+// CK24: call void [[CALL17:@.+]]([[SC]]* {{[^,]+}}) 
+#pragma omp target map(p->p[:5])
+  { p->a++; }
+
+// Region 18
+// CK24-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE18]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE18]]{{.+}})
+// CK24-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+// CK24-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+// CK24-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+// CK24-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+// CK24-DAG: [[CBPVAL0]] = bitcast [[SC]]* [[VAR0:%.+]] to i8*
+// CK24-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+// CK24-DAG: [[SEC0]] = getelementptr {{.*}}[[SA]]* [[SEC00:%[^,]+]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[SEC00]] = getelementptr {{.*}}[10 x [[SA]]]* [[SEC000:%[^,]+]], i{{.+}} 0, i{{.+}} 3
+// CK24-DAG: [[SEC000]] = getelementptr {{.*}}[[SB]]* [[SEC0000:%[^,]+]], i{{.+}} 0, i{{.+}} 2
+// CK24-DAG: [[SEC0000]] = getelementptr {{.*}}[[SC]]* [[VAR00:%.+]], i{{.+}} 0, i{{.+}} 1
+
+// CK24-DAG: [[VAR0]] = load [[SC]]*, [[SC]]** %{{.+}}
+// CK24-DAG: [[VAR00]] = load [[SC]]*, [[SC]]** %{{.+}}
+
+// CK24: call void [[CALL18:@.+]]([[SC]]* {{[^,]+}})
+#pragma omp target map(p->s.sa[3].a)
+  { p->a++; }
+  
+// Region 19
+// CK24-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE19]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE19]]{{.+}})
+// CK24-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+// CK24-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+// CK24-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+// CK24-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+// CK24-DAG: [[CBPVAL0]] = bitcast [[SC]]* [[VAR0:%.+]] to i8*
+// CK24-DAG: [[CPVAL0]] = bitcast [[SA]]** [[SEC0:%.+]] to i8*
+// CK24-DAG: [[SEC0]] = getelementptr {{.*}}[10 x [[SA]]*]* [[SEC00:%[^,]+]], i{{.+}} 0, i{{.+}} 3
+// CK24-DAG: [[SEC00]] = getelementptr {{.*}}[[SB]]* [[SEC000:%[^,]+]], i{{.+}} 0, i{{.+}} 3
+// CK24-DAG: [[SEC000]] = getelementptr {{.*}}[[SC]]* [[VAR00:%.+]], i{{.+}} 0, i{{.+}} 1
+
+// CK24-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+// CK24-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+// CK24-DAG: [[CBPVAL1]] = bitcast [[SA]]** [[SEC0]] to i8*
+// CK24-DAG: [[CPVAL1]] = bitcast i32* [[SEC1:%.+]] to i8*
+// CK24-DAG: [[SEC1]] = getelementptr {{.*}}[[SA]]* [[SEC11:%[^,]+]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[SEC11]] = load [[SA]]*, [[SA]]** [[SEC111:%[^,]+]],
+// CK24-DAG: [[SEC111]] = getelementptr {{.*}}[10 x [[SA]]*]* [[SEC1111:%[^,]+]], i{{.+}} 0, i{{.+}} 3
+// CK24-DAG: [[SEC1111]] = getelementptr {{.*}}[[SB]]* [[SEC11111:%[^,]+]], i{{.+}} 0, i{{.+}} 3
+// CK24-DAG: [[SEC11111]] = getelementptr {{.*}}[[SC]]* [[VAR000:%.+]], i{{.+}} 0, i{{.+}} 1
+
+// CK24-DAG: [[VAR0]] = load [[SC]]*, [[SC]]** %{{.+}}
+// CK24-DAG: [[VAR00]] = load [[SC]]*, [[SC]]** %{{.+}}
+// CK24-DAG: [[VAR000]] = load [[SC]]*, [[SC]]** %{{.+}}
+
+// CK24: call void [[CALL19:@.+]]([[SC]]* {{[^,]+}}) 
+#pragma omp target map(p->s.sp[3]->a)
+  { p->a++; }
+
+// Region 20
+// CK24-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE20]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE20]]{{.+}})
+// CK24-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+// CK24-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+// CK24-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+// CK24-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+// CK24-DAG: [[CBPVAL0]] = bitcast [[SC]]* [[VAR0:%.+]] to i8*
+// CK24-DAG: [[CPVAL0]] = bitcast [[SB]]** [[SEC0:%.+]] to i8*
+// CK24-DAG: [[SEC0]] = getelementptr {{.*}}[[SC]]* [[VAR00:%.+]], i{{.+}} 0, i{{.+}} 2
+
+// CK24-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+// CK24-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+// CK24-DAG: [[CBPVAL1]] = bitcast [[SB]]** [[SEC0]] to i8*
+// CK24-DAG: [[CPVAL1]] = bitcast i32* [[SEC1:%.+]] to i8*
+// CK24-DAG: [[SEC1]] = getelementptr {{.*}}[[SB]]* [[SEC11:%[^,]+]], i{{.+}} 0
+// CK24-DAG: [[SEC11]] = load [[SB]]*, [[SB]]** [[SEC111:%[^,]+]],
+// CK24-DAG: [[SEC111]] = getelementptr {{.*}}[[SC]]* [[VAR000:%.+]], i{{.+}} 0, i{{.+}} 2
+
+// CK24-DAG: [[VAR0]] = load [[SC]]*, [[SC]]** %{{.+}}
+// CK24-DAG: [[VAR00]] = load [[SC]]*, [[SC]]** %{{.+}}
+// CK24-DAG: [[VAR000]] = load [[SC]]*, [[SC]]** %{{.+}}
+
+// CK24: call void [[CALL20:@.+]]([[SC]]* {{[^,]+}})
+#pragma omp target map(p->p->a)
+  { p->a++; }
+  
+// Region 21
+// CK24-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE21]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE21]]{{.+}})
+// CK24-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+// CK24-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+// CK24-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+// CK24-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+// CK24-DAG: [[CBPVAL0]] = bitcast [[SC]]* [[VAR0:%.+]] to i8*
+// CK24-DAG: [[CPVAL0]] = bitcast [[SA]]** [[SEC0:%.+]] to i8*
+// CK24-DAG: [[SEC0]] = getelementptr {{.*}}[[SB]]* [[SEC00:[^,]+]], i{{.+}} 0, i{{.+}} 4
+// CK24-DAG: [[SEC00]] = getelementptr {{.*}}[[SC]]* [[VAR00:%.+]], i{{.+}} 0, i{{.+}} 1
+
+// CK24-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+// CK24-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+// CK24-DAG: [[CBPVAL1]] = bitcast [[SA]]** [[SEC0]] to i8*
+// CK24-DAG: [[CPVAL1]] = bitcast i32* [[SEC1:%.+]] to i8*
+// CK24-DAG: [[SEC1]] = getelementptr {{.*}}[[SA]]* [[SEC11:%[^,]+]], i{{.+}} 0
+// CK24-DAG: [[SEC11]] = load [[SA]]*, [[SA]]** [[SEC111:%[^,]+]],
+// CK24-DAG: [[SEC111]] = getelementptr {{.*}}[[SB]]* [[SEC1111:[^,]+]], i{{.+}} 0, i{{.+}} 4
+// CK24-DAG: [[SEC1111]] = getelementptr {{.*}}[[SC]]* [[VAR000:%.+]], i{{.+}} 0, i{{.+}} 1
+
+// CK24-DAG: [[VAR0]] = load [[SC]]*, [[SC]]** %{{.+}}
+// CK24-DAG: [[VAR00]] = load [[SC]]*, [[SC]]** %{{.+}}
+// CK24-DAG: [[VAR000]] = load [[SC]]*, [[SC]]** %{{.+}}
+
+// CK24: call void [[CALL21:@.+]]([[SC]]* {{[^,]+}}) 
+#pragma omp target map(p->s.p->a)
+  { p->a++; }
+
+// Region 22
+// CK24-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE22]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE22]]{{.+}})
+// CK24-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+// CK24-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+// CK24-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+// CK24-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+// CK24-DAG: [[CBPVAL0]] = bitcast [[SC]]* [[VAR0:%.+]] to i8*
+// CK24-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+// CK24-DAG: [[SEC0]] = getelementptr {{.*}}[10 x i32]* [[SEC00:%[^,]+]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[SEC00]] = getelementptr {{.*}}[[SA]]* [[SEC000:%[^,]+]], i{{.+}} 0, i{{.+}} 2
+// CK24-DAG: [[SEC000]] = getelementptr {{.*}}[[SB]]* [[SEC0000:%[^,]+]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: [[SEC0000]] = getelementptr {{.*}}[[SC]]* [[VAR00:%.+]], i{{.+}} 0, i{{.+}} 1
+
+// CK24-DAG: [[VAR0]] = load [[SC]]*, [[SC]]** %{{.+}}
+// CK24-DAG: [[VAR00]] = load [[SC]]*, [[SC]]** %{{.+}}
+
+// CK24: call void [[CALL22:@.+]]([[SC]]* {{[^,]+}})
+#pragma omp target map(p->s.s.b[:2])
+  { p->a++; }
+  
+// Region 23
+// CK24-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE23]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE23]]{{.+}})
+// CK24-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+// CK24-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+// CK24-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+// CK24-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+// CK24-DAG: [[CBPVAL0]] = bitcast [[SC]]* [[VAR0:%.+]] to i8*
+// CK24-DAG: [[CPVAL0]] = bitcast [[SA]]** [[SEC0:%.+]] to i8*
+// CK24-DAG: [[SEC0]] = getelementptr {{.*}}[[SB]]* [[SEC00:%[^,]+]], i{{.+}} 0, i{{.+}} 4
+// CK24-DAG: [[SEC00]] = getelementptr {{.*}}[[SC]]* [[VAR00:%.+]], i{{.+}} 0, i{{.+}} 1
+
+// CK24-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+// CK24-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+// CK24-DAG: [[CBPVAL1]] = bitcast [[SA]]** [[SEC0]] to i8*
+// CK24-DAG: [[CPVAL1]] = bitcast i32* [[SEC1:%.+]] to i8*
+// CK24-DAG: [[SEC1]] = getelementptr {{.*}}[10 x i32]* [[SEC11:%[^,]+]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[SEC11]] = getelementptr {{.*}}[[SA]]* [[SEC111:%[^,]+]], i{{.+}} 0, i{{.+}} 2
+// CK24-DAG: [[SEC111]] = load [[SA]]*, [[SA]]** [[SEC1111:%[^,]+]],
+// CK24-DAG: [[SEC1111]] = getelementptr {{.*}}[[SB]]* [[SEC11111:%[^,]+]], i{{.+}} 0, i{{.+}} 4
+// CK24-DAG: [[SEC11111]] = getelementptr {{.*}}[[SC]]* [[VAR000:%.+]], i{{.+}} 0, i{{.+}} 1
+
+// CK24-DAG: [[VAR0]] = load [[SC]]*, [[SC]]** %{{.+}}
+// CK24-DAG: [[VAR00]] = load [[SC]]*, [[SC]]** %{{.+}}
+// CK24-DAG: [[VAR000]] = load [[SC]]*, [[SC]]** %{{.+}}
+
+// CK24: call void [[CALL23:@.+]]([[SC]]* {{[^,]+}}) 
+#pragma omp target map(p->s.p->b[:2])
+  { p->a++; }
+
+// Region 24
+// CK24-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 4, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[4 x i{{.+}}]* [[SIZE24]], {{.+}}getelementptr {{.+}}[4 x i{{.+}}]* [[MTYPE24]]{{.+}})
+// CK24-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+// CK24-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+// CK24-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+// CK24-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+// CK24-DAG: [[CBPVAL0]] = bitcast [[SC]]* [[VAR0:%.+]] to i8*
+// CK24-DAG: [[CPVAL0]] = bitcast [[SB]]** [[SEC0:%.+]] to i8*
+// CK24-DAG: [[SEC0]] = getelementptr {{.*}}[[SC]]* [[VAR00:%.+]], i{{.+}} 0, i{{.+}} 2
+
+// CK24-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+// CK24-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+// CK24-DAG: [[CBPVAL1]] = bitcast [[SB]]** [[SEC0]] to i8*
+// CK24-DAG: [[CPVAL1]] = bitcast [[SA]]** [[SEC1:%.+]] to i8*
+// CK24-DAG: [[SEC1]] = getelementptr {{.*}}[[SB]]* [[SEC11:%[^,]+]], i{{.+}} 0, i{{.+}} 4
+// CK24-DAG: [[SEC11]] = load [[SB]]*, [[SB]]** [[SEC111:%[^,]+]],
+// CK24-DAG: [[SEC111]] = getelementptr {{.*}}[[SC]]* [[VAR000:%.+]], i{{.+}} 0, i{{.+}} 2
+
+// CK24-DAG: [[BP2:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 2
+// CK24-DAG: [[P2:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 2
+// CK24-DAG: store i8* [[CBPVAL2:%[^,]+]], i8** [[BP2]]
+// CK24-DAG: store i8* [[CPVAL2:%[^,]+]], i8** [[P2]]
+// CK24-DAG: [[CBPVAL2]] = bitcast [[SA]]** [[SEC1]] to i8*
+// CK24-DAG: [[CPVAL2]] = bitcast [[SA]]** [[SEC2:%.+]] to i8*
+// CK24-DAG: [[SEC2]] = getelementptr {{.*}}[[SA]]* [[SEC22:%[^,]+]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: [[SEC22]] = load [[SA]]*, [[SA]]** [[SEC222:%[^,]+]],
+// CK24-DAG: [[SEC222]] = getelementptr {{.*}}[[SB]]* [[SEC2222:%[^,]+]], i{{.+}} 0, i{{.+}} 4
+// CK24-DAG: [[SEC2222]] = load [[SB]]*, [[SB]]** [[SEC22222:%[^,]+]],
+// CK24-DAG: [[SEC22222]] = getelementptr {{.*}}[[SC]]* [[VAR0000:%.+]], i{{.+}} 0, i{{.+}} 2
+
+// CK24-DAG: [[BP3:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 3
+// CK24-DAG: [[P3:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 3
+// CK24-DAG: store i8* [[CBPVAL3:%[^,]+]], i8** [[BP3]]
+// CK24-DAG: store i8* [[CPVAL3:%[^,]+]], i8** [[P3]]
+// CK24-DAG: [[CBPVAL3]] = bitcast [[SA]]** [[SEC2]] to i8*
+// CK24-DAG: [[CPVAL3]] = bitcast i32* [[SEC3:%.+]] to i8*
+// CK24-DAG: [[SEC3]] = getelementptr {{.*}}[[SA]]* [[SEC33:%[^,]+]], i{{.+}} 0, i{{.+}} 0
+// CK24-DAG: [[SEC33]] = load [[SA]]*, [[SA]]** [[SEC333:%[^,]+]],
+// CK24-DAG: [[SEC333]] = getelementptr {{.*}}[[SA]]* [[SEC3333:%[^,]+]], i{{.+}} 0, i{{.+}} 1
+// CK24-DAG: [[SEC3333]] = load [[SA]]*, [[SA]]** [[SEC33333:%[^,]+]],
+// CK24-DAG: [[SEC33333]] = getelementptr {{.*}}[[SB]]* [[SEC333333:%[^,]+]], i{{.+}} 0, i{{.+}} 4
+// CK24-DAG: [[SEC333333]] = load [[SB]]*, [[SB]]** [[SEC3333333:%[^,]+]],
+// CK24-DAG: [[SEC3333333]] = getelementptr {{.*}}[[SC]]* [[VAR00000:%.+]], i{{.+}} 0, i{{.+}} 2
+
+// CK24-DAG: [[VAR0]] = load [[SC]]*, [[SC]]** %{{.+}}
+// CK24-DAG: [[VAR00]] = load [[SC]]*, [[SC]]** %{{.+}}
+// CK24-DAG: [[VAR000]] = load [[SC]]*, [[SC]]** %{{.+}}
+// CK24-DAG: [[VAR0000]] = load [[SC]]*, [[SC]]** %{{.+}}
+// CK24-DAG: [[VAR00000]] = load [[SC]]*, [[SC]]** %{{.+}}
+
+// CK24: call void [[CALL24:@.+]]([[SC]]* {{[^,]+}})
+#pragma omp target map(p->p->p->p->a)
+  { p->a++; }
+
+  return s.a;
+}
+
+// CK24: define {{.+}}[[CALL01]]
+// CK24: define {{.+}}[[CALL02]]
+// CK24: define {{.+}}[[CALL03]]
+// CK24: define {{.+}}[[CALL04]]
+// CK24: define {{.+}}[[CALL05]]
+// CK24: define {{.+}}[[CALL06]]
+// CK24: define {{.+}}[[CALL07]]
+// CK24: define {{.+}}[[CALL08]]
+// CK24: define {{.+}}[[CALL09]]
+// CK24: define {{.+}}[[CALL10]]
+// CK24: define {{.+}}[[CALL11]]
+// CK24: define {{.+}}[[CALL12]]
+// CK24: define {{.+}}[[CALL13]]
+// CK24: define {{.+}}[[CALL14]]
+// CK24: define {{.+}}[[CALL15]]
+// CK24: define {{.+}}[[CALL16]]
+// CK24: define {{.+}}[[CALL17]]
+// CK24: define {{.+}}[[CALL18]]
+// CK24: define {{.+}}[[CALL19]]
+// CK24: define {{.+}}[[CALL20]]
+// CK24: define {{.+}}[[CALL21]]
+// CK24: define {{.+}}[[CALL22]]
+// CK24: define {{.+}}[[CALL23]]
+// CK24: define {{.+}}[[CALL24]]
+#endif
+///==========================================================================///
+// RUN: %clang_cc1 -DCK25 -std=c++11 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK25 --check-prefix CK25-64
+// RUN: %clang_cc1 -DCK25 -std=c++11 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK25 --check-prefix CK25-64
+// RUN: %clang_cc1 -DCK25 -std=c++11 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK25 --check-prefix CK25-32
+// RUN: %clang_cc1 -DCK25 -std=c++11 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK25 --check-prefix CK25-32
+#ifdef CK25
+// CK25: [[ST:%.+]] = type { i32, float }
+// CK25: [[CA00:%.+]] = type { [[ST]]* }
+// CK25: [[CA01:%.+]] = type { i32* }
+
+// CK25: [[SIZE00:@.+]] = private {{.*}}constant [1 x i[[Z:64|32]]] [i[[Z:64|32]] 4]
+// CK25: [[MTYPE00:@.+]] = private {{.*}}constant [1 x i32] [i32 33]
+
+// CK25: [[SIZE01:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 4]
+// CK25: [[MTYPE01:@.+]] = private {{.*}}constant [1 x i32] [i32 33]
+
+// CK25-LABEL: explicit_maps_with_inner_lambda
+
+template <int X, typename T>
+struct CC {
+  T A;
+  float B;
+
+  int foo(T arg) {
+    // Region 00
+    // CK25-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE00]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE00]]{{.+}})
+    // CK25-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+    // CK25-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+    // CK25-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+    // CK25-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+    // CK25-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+    // CK25-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+    // CK25-DAG: [[CBPVAL0]] = bitcast [[ST]]* [[VAR0:%.+]] to i8*
+    // CK25-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+    // CK25-DAG: [[SEC0]] = getelementptr {{.*}}[[ST]]* [[VAR0:%.+]], i{{.+}} 0, i{{.+}} 0
+
+    // CK25: call void [[CALL00:@.+]]([[ST]]* {{[^,]+}})
+    #pragma omp target map(to:A)
+    {
+      [&]() {
+        A += 1;
+      }();
+    }
+
+    // Region 01
+    // CK25-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE01]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE01]]{{.+}})
+    // CK25-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+    // CK25-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+    // CK25-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+    // CK25-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+    // CK25-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+    // CK25-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+    // CK25-DAG: [[CBPVAL0]] = bitcast i32* [[VAR0:%.+]] to i8*
+    // CK25-DAG: [[CPVAL0]] = bitcast i32* [[VAR0]] to i8*
+
+    // CK25: call void [[CALL01:@.+]](i32* {{[^,]+}})
+    #pragma omp target map(to:arg)
+    {
+      [&]() {
+        arg += 1;
+      }();
+    }
+
+    return A+arg;
+  }
+};
+
+int explicit_maps_with_inner_lambda(int a){
+  CC<123,int> c;
+  return c.foo(a);
+}
+
+// CK25: define {{.+}}[[CALL00]]([[ST]]* [[VAL:%.+]])
+// CK25: store [[ST]]* [[VAL]], [[ST]]** [[VALADDR:%[^,]+]],
+// CK25: [[VAL1:%.+]] = load [[ST]]*, [[ST]]** [[VALADDR]],
+// CK25: [[VALADDR1:%.+]] = getelementptr inbounds [[CA00]], [[CA00]]* [[CA:%[^,]+]], i32 0, i32 0
+// CK25: store [[ST]]* [[VAL1]], [[ST]]** [[VALADDR1]],
+// CK25: call void {{.*}}[[LAMBDA:@.+]]{{.*}}([[CA00]]* [[CA]])
+
+// CK25: define {{.+}}[[LAMBDA]]
+
+// CK25: define {{.+}}[[CALL01]](i32* {{.*}}[[VAL:%.+]])
+// CK25: store i32* [[VAL]], i32** [[VALADDR:%[^,]+]],
+// CK25: [[VAL1:%.+]] = load i32*, i32** [[VALADDR]],
+// CK25: [[VALADDR1:%.+]] = getelementptr inbounds [[CA01]], [[CA01]]* [[CA:%[^,]+]], i32 0, i32 0
+// CK25: store i32* [[VAL1]], i32** [[VALADDR1]],
+// CK25: call void {{.*}}[[LAMBDA]]{{.*}}([[CA01]]* [[CA]])
+#endif
+///==========================================================================///
+// RUN: %clang_cc1 -DCK26 -std=c++11 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK26 --check-prefix CK26-64
+// RUN: %clang_cc1 -DCK26 -std=c++11 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK26 --check-prefix CK26-64
+// RUN: %clang_cc1 -DCK26 -std=c++11 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK26 --check-prefix CK26-32
+// RUN: %clang_cc1 -DCK26 -std=c++11 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK26 --check-prefix CK26-32
+#ifdef CK26
+// CK26: [[ST:%.+]] = type { i32, float*, i32, float* }
+
+// CK26: [[SIZE00:@.+]] = private {{.*}}constant [2 x i[[Z:64|32]]] [i[[Z:64|32]] {{32|16}}, i[[Z:64|32]] 4]
+// CK26: [[MTYPE00:@.+]] = private {{.*}}constant [2 x i32] [i32 35, i32 35]
+
+// CK26: [[SIZE01:@.+]] = private {{.*}}constant [2 x i[[Z]]] [i[[Z]] {{32|16}}, i[[Z]] 4]
+// CK26: [[MTYPE01:@.+]] = private {{.*}}constant [2 x i32] [i32 35, i32 35]
+
+// CK26: [[SIZE02:@.+]] = private {{.*}}constant [2 x i[[Z]]] [i[[Z]] {{32|16}}, i[[Z]] 4]
+// CK26: [[MTYPE02:@.+]] = private {{.*}}constant [2 x i32] [i32 35, i32 35]
+
+// CK26: [[SIZE03:@.+]] = private {{.*}}constant [2 x i[[Z]]] [i[[Z]] {{32|16}}, i[[Z]] 4]
+// CK26: [[MTYPE03:@.+]] = private {{.*}}constant [2 x i32] [i32 35, i32 35]
+
+// CK26-LABEL: explicit_maps_with_private_class_members
+
+struct CC {
+  int fA;
+  float &fB;
+  int pA;
+  float &pB;
+
+  CC(float &B) : fB(B), pB(B) {
+
+    // CK26: call {{.*}}@__kmpc_fork_call{{.*}} [[OUTCALL:@.+]] to void (i32*, i32*, ...)*
+    // define {{.*}}void [[OUTCALL]]
+    #pragma omp parallel firstprivate(fA,fB) private(pA,pB)
+    {
+      // Region 00
+      // CK26-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE00]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE00]]{{.+}})
+      // CK26-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+      // CK26-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+      // CK26-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+      // CK26-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+      // CK26-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+      // CK26-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+      // CK26-DAG: [[CBPVAL0]] = bitcast [[ST]]* [[VAR0:%.+]] to i8*
+      // CK26-DAG: [[CPVAL0]] = bitcast [[ST]]* [[VAR0]] to i8*
+
+      // CK26-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+      // CK26-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+      // CK26-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+      // CK26-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+      // CK26-DAG: [[CBPVAL1]] = bitcast i32* [[VAR1:%.+]] to i8*
+      // CK26-DAG: [[CPVAL1]] = bitcast i32* [[SEC1:%.+]] to i8*
+      // CK26-DAG: [[VAR1]] = load i32*, i32** [[PVT:%.+]],
+      // CK26-DAG: [[SEC1]] = load i32*, i32** [[PVT]],
+
+      // CK26: call void [[CALL00:@.+]]([[ST]]* {{[^,]+}}, i32* {{[^,]+}})
+      #pragma omp target map(fA)
+      {
+        ++fA;
+      }
+
+      // Region 01
+      // CK26-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE01]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE01]]{{.+}})
+      // CK26-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+      // CK26-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+      // CK26-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+      // CK26-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+      // CK26-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+      // CK26-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+      // CK26-DAG: [[CBPVAL0]] = bitcast [[ST]]* [[VAR0:%.+]] to i8*
+      // CK26-DAG: [[CPVAL0]] = bitcast [[ST]]* [[VAR0]] to i8*
+
+      // CK26-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+      // CK26-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+      // CK26-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+      // CK26-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+      // CK26-DAG: [[CBPVAL1]] = bitcast float* [[VAR1:%.+]] to i8*
+      // CK26-DAG: [[CPVAL1]] = bitcast float* [[SEC1:%.+]] to i8*
+      // CK26-DAG: [[VAR1]] = load float*, float** [[PVT:%.+]],
+      // CK26-DAG: [[SEC1]] = load float*, float** [[PVT]],
+
+      // CK26: call void [[CALL01:@.+]]([[ST]]* {{[^,]+}}, float* {{[^,]+}})
+      #pragma omp target map(fB)
+      {
+        fB += 1.0;
+      }
+
+      // Region 02
+      // CK26-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE02]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE02]]{{.+}})
+      // CK26-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+      // CK26-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+      // CK26-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+      // CK26-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+      // CK26-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+      // CK26-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+      // CK26-DAG: [[CBPVAL0]] = bitcast [[ST]]* [[VAR0:%.+]] to i8*
+      // CK26-DAG: [[CPVAL0]] = bitcast [[ST]]* [[VAR0]] to i8*
+
+      // CK26-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+      // CK26-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+      // CK26-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+      // CK26-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+      // CK26-DAG: [[CBPVAL1]] = bitcast i32* [[VAR1:%.+]] to i8*
+      // CK26-DAG: [[CPVAL1]] = bitcast i32* [[SEC1:%.+]] to i8*
+      // CK26-DAG: [[VAR1]] = load i32*, i32** [[PVT:%.+]],
+      // CK26-DAG: [[SEC1]] = load i32*, i32** [[PVT]],
+
+      // CK26: call void [[CALL02:@.+]]([[ST]]* {{[^,]+}}, i32* {{[^,]+}})
+      #pragma omp target map(pA)
+      {
+        ++pA;
+      }
+
+      // Region 01
+      // CK26-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE03]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE03]]{{.+}})
+      // CK26-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+      // CK26-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+      // CK26-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+      // CK26-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+      // CK26-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+      // CK26-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+      // CK26-DAG: [[CBPVAL0]] = bitcast [[ST]]* [[VAR0:%.+]] to i8*
+      // CK26-DAG: [[CPVAL0]] = bitcast [[ST]]* [[VAR0]] to i8*
+
+      // CK26-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+      // CK26-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+      // CK26-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+      // CK26-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+      // CK26-DAG: [[CBPVAL1]] = bitcast float* [[VAR1:%.+]] to i8*
+      // CK26-DAG: [[CPVAL1]] = bitcast float* [[SEC1:%.+]] to i8*
+      // CK26-DAG: [[VAR1]] = load float*, float** [[PVT:%.+]],
+      // CK26-DAG: [[SEC1]] = load float*, float** [[PVT]],
+
+      // CK26: call void [[CALL03:@.+]]([[ST]]* {{[^,]+}}, float* {{[^,]+}})
+      #pragma omp target map(pB)
+      {
+        pB += 1.0;
+      }
+    }
+  }
+
+  int foo() {
+    return fA + pA;
+  }
+};
+
+// Make sure the private instance is used in all target regions.
+// CK26: define {{.+}}[[CALL00]]({{.*}}i32*{{.*}}[[PVTARG:%.+]])
+// CK26: store i32* [[PVTARG]], i32** [[PVTADDR:%.+]],
+// CK26: [[ADDR:%.+]] = load i32*, i32** [[PVTADDR]],
+// CK26: [[VAL:%.+]] = load i32, i32* [[ADDR]],
+// CK26: add nsw i32 [[VAL]], 1
+
+// CK26: define {{.+}}[[CALL01]]({{.*}}float*{{.*}}[[PVTARG:%.+]])
+// CK26: store float* [[PVTARG]], float** [[PVTADDR:%.+]],
+// CK26: [[ADDR:%.+]] = load float*, float** [[PVTADDR]],
+// CK26: [[VAL:%.+]] = load float, float* [[ADDR]],
+// CK26: [[EXT:%.+]] = fpext float [[VAL]] to double
+// CK26: fadd double [[EXT]], 1.000000e+00
+
+// CK26: define {{.+}}[[CALL02]]({{.*}}i32*{{.*}}[[PVTARG:%.+]])
+// CK26: store i32* [[PVTARG]], i32** [[PVTADDR:%.+]],
+// CK26: [[ADDR:%.+]] = load i32*, i32** [[PVTADDR]],
+// CK26: [[VAL:%.+]] = load i32, i32* [[ADDR]],
+// CK26: add nsw i32 [[VAL]], 1
+
+// CK26: define {{.+}}[[CALL03]]({{.*}}float*{{.*}}[[PVTARG:%.+]])
+// CK26: store float* [[PVTARG]], float** [[PVTADDR:%.+]],
+// CK26: [[ADDR:%.+]] = load float*, float** [[PVTADDR]],
+// CK26: [[VAL:%.+]] = load float, float* [[ADDR]],
+// CK26: [[EXT:%.+]] = fpext float [[VAL]] to double
+// CK26: fadd double [[EXT]], 1.000000e+00
+
+int explicit_maps_with_private_class_members(){
+  float B;
+  CC c(B);
+  return c.foo();
+}
+#endif
+///==========================================================================///
+// RUN: %clang_cc1 -DCK27 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK27 --check-prefix CK27-64
+// RUN: %clang_cc1 -DCK27 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK27 --check-prefix CK27-64
+// RUN: %clang_cc1 -DCK27 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK27 --check-prefix CK27-32
+// RUN: %clang_cc1 -DCK27 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK27 --check-prefix CK27-32
+#ifdef CK27
+
+// CK27: [[SIZE00:@.+]] = private {{.*}}constant [1 x i[[Z:64|32]]] zeroinitializer
+// CK27: [[MTYPE00:@.+]] = private {{.*}}constant [1 x i32] [i32 32]
+
+// CK27: [[SIZE01:@.+]] = private {{.*}}constant [1 x i[[Z]]] zeroinitializer
+// CK27: [[MTYPE01:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK27: [[SIZE02:@.+]] = private {{.*}}constant [1 x i[[Z]]] zeroinitializer
+// CK27: [[MTYPE02:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK27: [[SIZE03:@.+]] = private {{.*}}constant [1 x i[[Z]]] zeroinitializer
+// CK27: [[MTYPE03:@.+]] = private {{.*}}constant [1 x i32] [i32 35]
+
+// CK27: [[SIZE05:@.+]] = private {{.*}}constant [1 x i[[Z]]] zeroinitializer
+// CK27: [[MTYPE05:@.+]] = private {{.*}}constant [1 x i32] [i32 32]
+
+// CK27: [[SIZE07:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 4]
+// CK27: [[MTYPE07:@.+]] = private {{.*}}constant [1 x i32] [i32 288]
+
+// CK27: [[SIZE09:@.+]] = private {{.*}}constant [1 x i[[Z]]] [i[[Z]] 40]
+// CK27: [[MTYPE09:@.+]] = private {{.*}}constant [1 x i32] [i32 161]
+
+// CK27-LABEL: zero_size_section_and_private_maps
+void zero_size_section_and_private_maps (int ii){
+
+  // Map of a pointer.
+  int *pa;
+
+  // Region 00
+  // CK27-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE00]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE00]]{{.+}})
+  // CK27-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK27-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK27-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK27-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK27-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK27-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK27-DAG: [[CBPVAL0]] = bitcast i32* [[VAR0:%.+]] to i8*
+  // CK27-DAG: [[CPVAL0]] = bitcast i32* [[VAR0]] to i8*
+
+  // CK27: call void [[CALL00:@.+]](i32* {{[^,]+}})
+  #pragma omp target
+  {
+    pa[50]++;
+  }
+
+  // Region 01
+  // CK27-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE01]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE01]]{{.+}})
+  // CK27-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK27-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK27-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK27-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK27-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK27-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK27-DAG: [[CBPVAL0]] = bitcast i32* [[RVAR0:%.+]] to i8*
+  // CK27-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+  // CK27-DAG: [[RVAR0]] = load i32*, i32** [[VAR0:%[^,]+]]
+  // CK27-DAG: [[SEC0]] = getelementptr {{.*}}i32* [[RVAR00:%.+]], i{{.+}} 0
+  // CK27-DAG: [[RVAR00]] = load i32*, i32** [[VAR0]]
+
+  // CK27: call void [[CALL01:@.+]](i32* {{[^,]+}})
+  #pragma omp target map(pa[:0])
+  {
+    pa[50]++;
+  }
+
+  // Region 02
+  // CK27-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE02]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE02]]{{.+}})
+  // CK27-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK27-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK27-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK27-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK27-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK27-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK27-DAG: [[CBPVAL0]] = bitcast i32* [[RVAR0:%.+]] to i8*
+  // CK27-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+  // CK27-DAG: [[RVAR0]] = load i32*, i32** [[VAR0:%[^,]+]]
+  // CK27-DAG: [[SEC0]] = getelementptr {{.*}}i32* [[RVAR00:%.+]], i{{.+}} 0
+  // CK27-DAG: [[RVAR00]] = load i32*, i32** [[VAR0]]
+
+  // CK27: call void [[CALL02:@.+]](i32* {{[^,]+}})
+  #pragma omp target map(pa[0:0])
+  {
+    pa[50]++;
+  }
+
+  // Region 03
+  // CK27-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE03]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE03]]{{.+}})
+  // CK27-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK27-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK27-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK27-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK27-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK27-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK27-DAG: [[CBPVAL0]] = bitcast i32* [[RVAR0:%.+]] to i8*
+  // CK27-DAG: [[CPVAL0]] = bitcast i32* [[SEC0:%.+]] to i8*
+  // CK27-DAG: [[RVAR0]] = load i32*, i32** [[VAR0:%[^,]+]]
+  // CK27-DAG: [[SEC0]] = getelementptr {{.*}}i32* [[RVAR00:%.+]], i{{.+}} %{{.+}}
+  // CK27-DAG: [[RVAR00]] = load i32*, i32** [[VAR0]]
+
+  // CK27: call void [[CALL03:@.+]](i32* {{[^,]+}})
+  #pragma omp target map(pa[ii:0])
+  {
+    pa[50]++;
+  }
+
+  int *pvtPtr;
+  int pvtScl;
+  int pvtArr[10];
+
+  // Region 04
+  // CK27: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 0, i8** null, i8** null, i{{64|32}}* null, i32* null)
+  // CK27: call void [[CALL04:@.+]]()
+  #pragma omp target private(pvtPtr)
+  {
+    pvtPtr[5]++;
+  }
+
+  // Region 05
+  // CK27-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE05]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE05]]{{.+}})
+  // CK27-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK27-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK27-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK27-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK27-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK27-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK27-DAG: [[CBPVAL0]] = bitcast i32* [[VAR0:%.+]] to i8*
+  // CK27-DAG: [[CPVAL0]] = bitcast i32* [[VAR0]] to i8*
+
+  // CK27: call void [[CALL05:@.+]](i32* {{[^,]+}})
+  #pragma omp target firstprivate(pvtPtr)
+  {
+    pvtPtr[5]++;
+  }
+
+  // Region 06
+  // CK27: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 0, i8** null, i8** null, i{{64|32}}* null, i32* null)
+  // CK27: call void [[CALL06:@.+]]()
+  #pragma omp target private(pvtScl)
+  {
+    pvtScl++;
+  }
+
+  // Region 07
+  // CK27-DAG: call i32 @__tgt_target(i32 {{.+}}, i8* {{.+}}, i32 1, i8** [[BPGEP:%[0-9]+]], i8** [[PGEP:%[0-9]+]], {{.+}}[[SIZE07]]{{.+}}, {{.+}}[[MTYPE07]]{{.+}})
+  // CK27-DAG: [[BPGEP]] = getelementptr inbounds {{.+}}[[BPS:%[^,]+]], i32 0, i32 0
+  // CK27-DAG: [[PGEP]] = getelementptr inbounds {{.+}}[[PS:%[^,]+]], i32 0, i32 0
+  // CK27-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BPS]], i32 0, i32 0
+  // CK27-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[PS]], i32 0, i32 0
+  // CK27-DAG: store i8* [[VALBP:%.+]], i8** [[BP1]],
+  // CK27-DAG: store i8* [[VALP:%.+]], i8** [[P1]],
+  // CK27-DAG: [[VALBP]] = inttoptr i[[Z]] [[VAL:%.+]] to i8*
+  // CK27-DAG: [[VALP]] = inttoptr i[[Z]] [[VAL:%.+]] to i8*
+  // CK27-DAG: [[VAL]] = load i[[Z]], i[[Z]]* [[ADDR:%.+]],
+  // CK27-64-DAG: [[CADDR:%.+]] = bitcast i[[Z]]* [[ADDR]] to i32*
+  // CK27-64-DAG: store i32 {{.+}}, i32* [[CADDR]],
+
+  // CK27: call void [[CALL07:@.+]](i[[Z]] [[VAL]])
+  #pragma omp target firstprivate(pvtScl)
+  {
+    pvtScl++;
+  }
+
+  // Region 08
+  // CK27: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 0, i8** null, i8** null, i{{64|32}}* null, i32* null)
+  // CK27: call void [[CALL08:@.+]]()
+  #pragma omp target private(pvtArr)
+  {
+    pvtArr[5]++;
+  }
+
+  // Region 09
+  // CK27-DAG: call i32 @__tgt_target(i32 {{[^,]+}}, i8* {{[^,]+}}, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE09]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE09]]{{.+}})
+  // CK27-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK27-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK27-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK27-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK27-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK27-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK27-DAG: [[CBPVAL0]] = bitcast [10 x i32]* [[VAR0:%.+]] to i8*
+  // CK27-DAG: [[CPVAL0]] = bitcast [10 x i32]* [[VAR0]] to i8*
+
+  // CK27: call void [[CALL09:@.+]]([10 x i32]* {{[^,]+}})
+  #pragma omp target firstprivate(pvtArr)
+  {
+    pvtArr[5]++;
+  }
+}
+
+// CK27: define {{.+}}[[CALL00]]
+// CK27: define {{.+}}[[CALL01]]
+// CK27: define {{.+}}[[CALL02]]
+// CK27: define {{.+}}[[CALL03]]
+// CK27: define {{.+}}[[CALL04]]
+// CK27: define {{.+}}[[CALL05]]
+// CK27: define {{.+}}[[CALL06]]
+// CK27: define {{.+}}[[CALL07]]
+#endif
 #endif
diff --git a/test/OpenMP/target_map_messages.cpp b/test/OpenMP/target_map_messages.cpp
index d61e766c6b4b2..543f47f8216a8 100644
--- a/test/OpenMP/target_map_messages.cpp
+++ b/test/OpenMP/target_map_messages.cpp
@@ -1,5 +1,299 @@
-// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 200 %s
+// RUN: %clang_cc1 -DCCODE -verify -fopenmp -ferror-limit 200 -x c %s
+#ifdef CCODE
+void foo(int arg) {
+  const int n = 0;
 
+  double marr[10][10][10];
+
+  #pragma omp target map(marr[2][0:2][0:2]) // expected-error {{array section does not specify contiguous storage}}
+  {}
+  #pragma omp target map(marr[:][0:][:])
+  {}
+  #pragma omp target map(marr[:][1:][:]) // expected-error {{array section does not specify contiguous storage}}
+  {}
+  #pragma omp target map(marr[:][n:][:])
+  {}
+}
+#else
+template <typename T, int I>
+struct SA {
+  static int ss;
+  #pragma omp threadprivate(ss) // expected-note {{defined as threadprivate or thread local}}
+  float a;
+  int b[12];
+  float *c;
+  T d;
+  float e[I];
+  T *f;
+  void func(int arg) {
+    #pragma omp target map(arg,a,d)
+    {}
+    #pragma omp target map(arg[2:2],a,d) // expected-error {{subscripted value is not an array or pointer}}
+    {}
+    #pragma omp target map(arg,a*2) // expected-error {{expected expression containing only member accesses and/or array sections based on named variables}}
+    {}
+    #pragma omp target map(arg,(c+1)[2]) // expected-error {{expected expression containing only member accesses and/or array sections based on named variables}}
+    {}
+    #pragma omp target map(arg,a[:2],d) // expected-error {{subscripted value is not an array or pointer}}
+    {}
+    #pragma omp target map(arg,a,d[:2]) // expected-error {{subscripted value is not an array or pointer}}
+    {}
+
+    #pragma omp target map(to:ss) // expected-error {{threadprivate variables are not allowed in 'map' clause}}
+    {}
+
+    #pragma omp target map(to:b,e)
+    {}
+    #pragma omp target map(to:b,e) map(to:b) // expected-error {{variable already marked as mapped in current construct}} expected-note {{used here}}
+    {}
+    #pragma omp target map(to:b[:2],e)
+    {}
+    #pragma omp target map(to:b,e[:])
+    {}
+
+    #pragma omp target map(always, tofrom: c,f)
+    {}
+    #pragma omp target map(always, tofrom: c[1:2],f)
+    {}
+    #pragma omp target map(always, tofrom: c,f[1:2])
+    {}
+    #pragma omp target map(always, tofrom: c[:],f)   // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
+    {}
+    #pragma omp target map(always, tofrom: c,f[:])   // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
+    {}
+    return;
+  }
+};
+
+struct SB {
+  unsigned A;
+  unsigned B;
+  float Arr[100];
+  float *Ptr;
+  float *foo() {
+    return &Arr[0];
+  }
+};
+
+struct SC {
+  unsigned A : 2;
+  unsigned B : 3;
+  unsigned C;
+  unsigned D;
+  float Arr[100];
+  SB S;
+  SB ArrS[100];
+  SB *PtrS;
+  SB *&RPtrS;
+  float *Ptr;
+
+  SC(SB *&_RPtrS) : RPtrS(_RPtrS) {}
+};
+
+union SD {
+  unsigned A;
+  float B;
+};
+
+void SAclient(int arg) {
+  SA<int,123> s;
+  s.func(arg); // expected-note {{in instantiation of member function}}
+  double marr[10][10][10];
+  double marr2[5][10][1];
+  double mvla[5][arg][10];
+  double ***mptr;
+  const int n = 0;
+  const int m = 1;
+  double mvla2[5][arg][m+n+10];
+
+  SB *p;
+
+  SD u;
+  SC r(p),t(p);
+  #pragma omp target map(r)
+  {}
+  #pragma omp target map(marr[2][0:2][0:2]) // expected-error {{array section does not specify contiguous storage}}
+  {}
+  #pragma omp target map(marr[:][0:2][0:2]) // expected-error {{array section does not specify contiguous storage}}
+  {}
+  #pragma omp target map(marr[2][3][0:2])
+  {}
+  #pragma omp target map(marr[:][:][:])
+  {}
+  #pragma omp target map(marr[:2][:][:])
+  {}
+  #pragma omp target map(marr[arg:][:][:])
+  {}
+  #pragma omp target map(marr[arg:])
+  {}
+  #pragma omp target map(marr[arg:][:arg][:]) // correct if arg is the size of dimension 2
+  {}
+  #pragma omp target map(marr[:arg][:])
+  {}
+  #pragma omp target map(marr[:arg][n:])
+  {}
+  #pragma omp target map(marr[:][:arg][n:]) // correct if arg is the size of  dimension 2
+  {}
+  #pragma omp target map(marr[:][:m][n:]) // expected-error {{array section does not specify contiguous storage}}
+  {}
+  #pragma omp target map(marr[n:m][:arg][n:])
+  {}
+  #pragma omp target map(marr[:2][:1][:]) // expected-error {{array section does not specify contiguous storage}}
+  {}
+  #pragma omp target map(marr[:2][1:][:]) // expected-error {{array section does not specify contiguous storage}}
+  {}
+  #pragma omp target map(marr[:2][:][:1]) // expected-error {{array section does not specify contiguous storage}}
+  {}
+  #pragma omp target map(marr[:2][:][1:]) // expected-error {{array section does not specify contiguous storage}}
+  {}
+  #pragma omp target map(marr[:1][:2][:])
+  {}
+  #pragma omp target map(marr[:1][0][:])
+  {}
+  #pragma omp target map(marr[:arg][:2][:]) // correct if arg is 1
+  {}
+  #pragma omp target map(marr[:1][3:1][:2])
+  {}
+  #pragma omp target map(marr[:1][3:arg][:2]) // correct if arg is 1
+  {}
+  #pragma omp target map(marr[:1][3:2][:2]) // expected-error {{array section does not specify contiguous storage}}
+  {}
+  #pragma omp target map(marr[:2][:10][:])
+  {}
+  #pragma omp target map(marr[:2][:][:5+5])
+  {}
+  #pragma omp target map(marr[:2][2+2-4:][0:5+5])
+  {}
+
+  #pragma omp target map(marr[:1][:2][0]) // expected-error {{array section does not specify contiguous storage}}
+  {}
+  #pragma omp target map(marr2[:1][:2][0])
+  {}
+
+  #pragma omp target map(mvla[:1][:][0]) // correct if the size of dimension 2 is 1.
+  {}
+  #pragma omp target map(mvla[:2][:arg][:]) // correct if arg is the size of dimension 2.
+  {}
+  #pragma omp target map(mvla[:1][:2][0]) // expected-error {{array section does not specify contiguous storage}}
+   {}
+  #pragma omp target map(mvla[1][2:arg][:])
+  {}
+  #pragma omp target map(mvla[:1][:][:])
+  {}
+  #pragma omp target map(mvla2[:1][:2][:11])
+  {}
+  #pragma omp target map(mvla2[:1][:2][:10]) // expected-error {{array section does not specify contiguous storage}}
+  {}
+
+  #pragma omp target map(mptr[:2][2+2-4:1][0:5+5]) // expected-error {{array section does not specify contiguous storage}}
+  {}
+  #pragma omp target map(mptr[:1][:2-1][2:4-3])
+  {}
+  #pragma omp target map(mptr[:1][:arg][2:4-3]) // correct if arg is 1.
+  {}
+  #pragma omp target map(mptr[:1][:2-1][0:2])
+  {}
+  #pragma omp target map(mptr[:1][:2][0:2]) // expected-error {{array section does not specify contiguous storage}}
+  {}
+  #pragma omp target map(mptr[:1][:][0:2]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
+  {}
+  #pragma omp target map(mptr[:2][:1][0:2]) // expected-error {{array section does not specify contiguous storage}}
+  {}
+
+  #pragma omp target map(r.ArrS[0].B)
+  {}
+  #pragma omp target map(r.ArrS[:1].B) // expected-error {{OpenMP array section is not allowed here}}
+  {}
+  #pragma omp target map(r.ArrS[:arg].B) // expected-error {{OpenMP array section is not allowed here}}
+  {}
+  #pragma omp target map(r.ArrS[0].Arr[1:23])
+  {}
+  #pragma omp target map(r.ArrS[0].Arr[1:arg])
+  {}
+  #pragma omp target map(r.ArrS[0].Arr[arg:23])
+  {}
+  #pragma omp target map(r.ArrS[0].Error) // expected-error {{no member named 'Error' in 'SB'}}
+  {}
+  #pragma omp target map(r.ArrS[0].A, r.ArrS[1].A) // expected-error {{multiple array elements associated with the same variable are not allowed in map clauses of the same construct}} expected-note {{used here}}
+  {}
+  #pragma omp target map(r.ArrS[0].A, t.ArrS[1].A)
+  {}
+  #pragma omp target map(r.PtrS[0], r.PtrS->B) // expected-error {{same pointer derreferenced in multiple different ways in map clause expressions}} expected-note {{used here}}
+  {}
+  #pragma omp target map(r.RPtrS[0], r.RPtrS->B) // expected-error {{same pointer derreferenced in multiple different ways in map clause expressions}} expected-note {{used here}}
+  {}
+  #pragma omp target map(r.S.Arr[:12])
+  {}
+  #pragma omp target map(r.S.foo()[:12]) // expected-error {{expected expression containing only member accesses and/or array sections based on named variables}}
+  {}
+  #pragma omp target map(r.C, r.D)
+  {}
+  #pragma omp target map(r.C, r.C) // expected-error {{variable already marked as mapped in current construct}} expected-note {{used here}}
+  {}
+  #pragma omp target map(r.C) map(r.C) // expected-error {{variable already marked as mapped in current construct}} expected-note {{used here}}
+  {}
+  #pragma omp target map(r.C, r.S)  // this would be an error only caught at runtime - Sema would have to make sure there is not way for the missing data between fields to be mapped somewhere else.
+  {}
+  #pragma omp target map(r, r.S)  // expected-error {{variable already marked as mapped in current construct}} expected-note {{used here}}
+  {}
+  #pragma omp target map(r.C, t.C)
+  {}
+  #pragma omp target map(r.A)   // expected-error {{bit fields cannot be used to specify storage in a 'map' clause}}
+  {}
+  #pragma omp target map(r.Arr)
+  {}
+  #pragma omp target map(r.Arr[3:5])
+  {}
+  #pragma omp target map(r.Ptr[3:5])
+  {}
+  #pragma omp target map(r.ArrS[3:5].A)   // expected-error {{OpenMP array section is not allowed here}}
+  {}
+  #pragma omp target map(r.ArrS[3:5].Arr[6:7])   // expected-error {{OpenMP array section is not allowed here}}
+  {}
+  #pragma omp target map(r.ArrS[3].Arr[6:7])
+  {}
+  #pragma omp target map(r.S.Arr[4:5])
+  {}
+  #pragma omp target map(r.S.Ptr[4:5])
+  {}
+  #pragma omp target map(r.S.Ptr[:])  // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
+  {}
+  #pragma omp target map((p+1)->A)  // expected-error {{expected expression containing only member accesses and/or array sections based on named variables}}
+  {}
+  #pragma omp target map(u.B)  // expected-error {{mapped storage cannot be derived from a union}}
+  {}
+
+  #pragma omp target data map(to: r.C) //expected-note {{used here}}
+  {
+    #pragma omp target map(r.D)  // expected-error {{original storage of expression in data environment is shared but data environment do not fully contain mapped expression storage}}
+    {}
+  }
+
+  #pragma omp target data map(to: t.Ptr) //expected-note {{used here}}
+  {
+    #pragma omp target map(t.Ptr[:23])  // expected-error {{pointer cannot be mapped along with a section derived from itself}}
+    {}
+  }
+
+  #pragma omp target data map(to: t.C, t.D)
+  {
+  #pragma omp target data map(to: t.C)
+  {
+    #pragma omp target map(t.D)
+    {}
+  }
+  }
+
+  #pragma omp target data map(to: t)
+  {
+  #pragma omp target data map(to: t.C)
+  {
+    #pragma omp target map(t.D)
+    {}
+  }
+  }
+}
 void foo() {
 }
 
@@ -62,25 +356,31 @@ T tmain(T argc) {
   T y;
   T to, tofrom, always;
   const T (&l)[5] = da;
-
-
 #pragma omp target map // expected-error {{expected '(' after 'map'}}
+  {}
 #pragma omp target map( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected expression}}
+  {}
 #pragma omp target map() // expected-error {{expected expression}}
+  {}
 #pragma omp target map(alloc) // expected-error {{use of undeclared identifier 'alloc'}}
+  {}
 #pragma omp target map(to argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected ',' or ')' in 'map' clause}}
+  {}
 #pragma omp target map(to:) // expected-error {{expected expression}}
+  {}
 #pragma omp target map(from: argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  {}
 #pragma omp target map(x: y) // expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}}
+  {}
 #pragma omp target map(x)
   foo();
 #pragma omp target map(tofrom: t[:I])
   foo();
-#pragma omp target map(T: a) // expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}}
+#pragma omp target map(T: a) // expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}} expected-error {{incomplete type 'S1' where a complete type is required}}
   foo();
 #pragma omp target map(T) // expected-error {{'T' does not refer to a value}}
   foo();
-#pragma omp target map(I) // expected-error 2 {{expected variable name, array element or array section}}
+#pragma omp target map(I) // expected-error 2 {{expected expression containing only member accesses and/or array sections based on named variables}}
   foo();
 #pragma omp target map(S2::S2s)
   foo();
@@ -96,42 +396,41 @@ T tmain(T argc) {
   foo();
 #pragma omp target map(to, x)
   foo();
-#pragma omp target map(to x) // expected-error {{expected ',' or ')' in 'map' clause}}
-#pragma omp target map(tofrom: argc > 0 ? x : y) // expected-error 2 {{expected variable name, array element or array section}} 
-#pragma omp target map(argc)
-#pragma omp target map(S1) // expected-error {{'S1' does not refer to a value}}
-#pragma omp target map(a, b, c, d, f) // expected-error {{incomplete type 'S1' where a complete type is required}} expected-error 2 {{type 'S2' is not mappable to target}}
-#pragma omp target map(ba) // expected-error 2 {{type 'S2' is not mappable to target}}
-#pragma omp target map(ca)
-#pragma omp target map(da)
-#pragma omp target map(S2::S2s)
-#pragma omp target map(S2::S2sc)
-#pragma omp target map(e, g)
-#pragma omp target map(h) // expected-error {{threadprivate variables are not allowed in map clause}}
-#pragma omp target map(k), map(k) // expected-error 2 {{variable already marked as mapped in current construct}} expected-note 2 {{used here}}
-#pragma omp target map(k), map(k[:5]) // expected-error 2 {{variable already marked as mapped in current construct}} expected-note 2 {{used here}}
+#pragma omp target data map(to x) // expected-error {{expected ',' or ')' in 'map' clause}}
+#pragma omp target data map(tofrom: argc > 0 ? x : y) // expected-error 2 {{expected expression containing only member accesses and/or array sections based on named variables}}
+#pragma omp target data map(argc)
+#pragma omp target data map(S1) // expected-error {{'S1' does not refer to a value}}
+#pragma omp target data map(a, b, c, d, f) // expected-error {{incomplete type 'S1' where a complete type is required}} expected-error 2 {{type 'S2' is not mappable to target}}
+#pragma omp target data map(ba) // expected-error 2 {{type 'S2' is not mappable to target}}
+#pragma omp target data map(ca)
+#pragma omp target data map(da)
+#pragma omp target data map(S2::S2s)
+#pragma omp target data map(S2::S2sc)
+#pragma omp target data map(e, g)
+#pragma omp target data map(h) // expected-error {{threadprivate variables are not allowed in 'map' clause}}
+#pragma omp target data map(k) map(k) // expected-error 2 {{variable already marked as mapped in current construct}} expected-note 2 {{used here}}
+#pragma omp target map(k), map(k[:5]) // expected-error 2 {{pointer cannot be mapped along with a section derived from itself}} expected-note 2 {{used here}}
   foo();
-#pragma omp target map(da)
+#pragma omp target data map(da)
 #pragma omp target map(da[:4])
   foo();
-#pragma omp target map(k, j, l) // expected-note 4 {{used here}}
-#pragma omp target map(k[:4]) // expected-error 2 {{variable already marked as mapped in current construct}}
-#pragma omp target map(j)
-#pragma omp target map(l[:5]) // expected-error 2 {{variable already marked as mapped in current construct}}
+#pragma omp target data map(k, j, l) // expected-note 2 {{used here}}
+#pragma omp target data map(k[:4]) // expected-error 2 {{pointer cannot be mapped along with a section derived from itself}}
+#pragma omp target data map(j)
+#pragma omp target map(l) map(l[:5]) // expected-error 2 {{variable already marked as mapped in current construct}} expected-note 2 {{used here}}
   foo();
-#pragma omp target map(k[:4], j, l[:5]) // expected-note 4 {{used here}}
-#pragma omp target map(k) // expected-error 2 {{variable already marked as mapped in current construct}}
-#pragma omp target map(j)
-#pragma omp target map(l) // expected-error 2 {{variable already marked as mapped in current construct}}
+#pragma omp target data map(k[:4], j, l[:5]) // expected-note 4 {{used here}}
+#pragma omp target data map(k) // expected-error 2 {{pointer cannot be mapped along with a section derived from itself}}
+#pragma omp target data map(j)
+#pragma omp target map(l) // expected-error 2 {{original storage of expression in data environment is shared but data environment do not fully contain mapped expression storage}}
   foo();
 
-#pragma omp target map(always, tofrom: x)
-#pragma omp target map(always: x) // expected-error {{missing map type}}
-#pragma omp target map(tofrom, always: x) // expected-error {{incorrect map type modifier, expected 'always'}} expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}}
-#pragma omp target map(always, tofrom: always, tofrom, x)
+#pragma omp target data map(always, tofrom: x)
+#pragma omp target data map(always: x) // expected-error {{missing map type}}
+#pragma omp target data map(tofrom, always: x) // expected-error {{incorrect map type modifier, expected 'always'}} expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}}
+#pragma omp target data map(always, tofrom: always, tofrom, x)
 #pragma omp target map(tofrom j) // expected-error {{expected ',' or ')' in 'map' clause}}
   foo();
-
   return 0;
 }
 
@@ -147,14 +446,14 @@ int main(int argc, char **argv) {
   int y;
   int to, tofrom, always;
   const int (&l)[5] = da;
-#pragma omp target map // expected-error {{expected '(' after 'map'}}
-#pragma omp target map( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected expression}}
-#pragma omp target map() // expected-error {{expected expression}}
-#pragma omp target map(alloc) // expected-error {{use of undeclared identifier 'alloc'}}
-#pragma omp target map(to argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected ',' or ')' in 'map' clause}}
-#pragma omp target map(to:) // expected-error {{expected expression}}
-#pragma omp target map(from: argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
-#pragma omp target map(x: y) // expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}}
+#pragma omp target data map // expected-error {{expected '(' after 'map'}} expected-error {{expected at least one map clause for '#pragma omp target data'}}
+#pragma omp target data map( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected expression}}
+#pragma omp target data map() // expected-error {{expected expression}}
+#pragma omp target data map(alloc) // expected-error {{use of undeclared identifier 'alloc'}}
+#pragma omp target data map(to argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected ',' or ')' in 'map' clause}}
+#pragma omp target data map(to:) // expected-error {{expected expression}}
+#pragma omp target data map(from: argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+#pragma omp target data map(x: y) // expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}}
 #pragma omp target map(x)
   foo();
 #pragma omp target map(to: x)
@@ -165,43 +464,46 @@ int main(int argc, char **argv) {
   foo();
 #pragma omp target map(to, x)
   foo();
-#pragma omp target map(to x) // expected-error {{expected ',' or ')' in 'map' clause}}
-#pragma omp target map(tofrom: argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name, array element or array section}}
-#pragma omp target map(argc)
-#pragma omp target map(S1) // expected-error {{'S1' does not refer to a value}}
-#pragma omp target map(a, b, c, d, f) // expected-error {{incomplete type 'S1' where a complete type is required}} expected-error 2 {{type 'S2' is not mappable to target}}
-#pragma omp target map(argv[1])
-#pragma omp target map(ba) // expected-error 2 {{type 'S2' is not mappable to target}}
-#pragma omp target map(ca)
-#pragma omp target map(da)
-#pragma omp target map(S2::S2s)
-#pragma omp target map(S2::S2sc)
-#pragma omp target map(e, g)
-#pragma omp target map(h) // expected-error {{threadprivate variables are not allowed in map clause}}
-#pragma omp target map(k), map(k) // expected-error {{variable already marked as mapped in current construct}} expected-note {{used here}}
-#pragma omp target map(k), map(k[:5]) // expected-error {{variable already marked as mapped in current construct}} expected-note {{used here}}
+#pragma omp target data map(to x) // expected-error {{expected ',' or ')' in 'map' clause}}
+#pragma omp target data map(tofrom: argc > 0 ? argv[1] : argv[2]) // expected-error {{xpected expression containing only member accesses and/or array sections based on named variables}}
+#pragma omp target data map(argc)
+#pragma omp target data map(S1) // expected-error {{'S1' does not refer to a value}}
+#pragma omp target data map(a, b, c, d, f) // expected-error {{incomplete type 'S1' where a complete type is required}} expected-error 2 {{type 'S2' is not mappable to target}}
+#pragma omp target data map(argv[1])
+#pragma omp target data map(ba) // expected-error 2 {{type 'S2' is not mappable to target}}
+#pragma omp target data map(ca)
+#pragma omp target data map(da)
+#pragma omp target data map(S2::S2s)
+#pragma omp target data map(S2::S2sc)
+#pragma omp target data map(e, g)
+#pragma omp target data map(h) // expected-error {{threadprivate variables are not allowed in 'map' clause}}
+#pragma omp target data map(k), map(k) // expected-error {{variable already marked as mapped in current construct}} expected-note {{used here}}
+#pragma omp target map(k), map(k[:5]) // expected-error {{pointer cannot be mapped along with a section derived from itself}} expected-note {{used here}}
   foo();
-#pragma omp target map(da)
+#pragma omp target data map(da)
 #pragma omp target map(da[:4])
   foo();
-#pragma omp target map(k, j, l) // expected-note 2 {{used here}}
-#pragma omp target map(k[:4]) // expected-error {{variable already marked as mapped in current construct}}
-#pragma omp target map(j)
-#pragma omp target map(l[:5]) // expected-error {{variable already marked as mapped in current construct}}
+#pragma omp target data map(k, j, l) // expected-note {{used here}}
+#pragma omp target data map(k[:4]) // expected-error {{pointer cannot be mapped along with a section derived from itself}}
+#pragma omp target data map(j)
+#pragma omp target map(l) map(l[:5]) // expected-error {{variable already marked as mapped in current construct}} expected-note {{used here}}
   foo();
-#pragma omp target map(k[:4], j, l[:5]) // expected-note 2 {{used here}}
-#pragma omp target map(k) // expected-error {{variable already marked as mapped in current construct}}
-#pragma omp target map(j)
-#pragma omp target map(l) // expected-error {{variable already marked as mapped in current construct}}
+#pragma omp target data map(k[:4], j, l[:5]) // expected-note 2 {{used here}}
+#pragma omp target data map(k) // expected-error {{pointer cannot be mapped along with a section derived from itself}}
+#pragma omp target data map(j)
+#pragma omp target map(l) // expected-error {{original storage of expression in data environment is shared but data environment do not fully contain mapped expression storage}}
   foo();
 
-#pragma omp target map(always, tofrom: x)
-#pragma omp target map(always: x) // expected-error {{missing map type}}
-#pragma omp target map(tofrom, always: x) // expected-error {{incorrect map type modifier, expected 'always'}} expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}}
-#pragma omp target map(always, tofrom: always, tofrom, x)
+#pragma omp target data map(always, tofrom: x)
+#pragma omp target data map(always: x) // expected-error {{missing map type}}
+#pragma omp target data map(tofrom, always: x) // expected-error {{incorrect map type modifier, expected 'always'}} expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}}
+#pragma omp target data map(always, tofrom: always, tofrom, x)
 #pragma omp target map(tofrom j) // expected-error {{expected ',' or ')' in 'map' clause}}
   foo();
-
+#pragma omp target private(j) map(j) // expected-error {{private variable cannot be in a map clause in '#pragma omp target' directive}}  expected-note {{defined as private}}
+  {}
+#pragma omp target firstprivate(j) map(j)  // expected-error {{firstprivate variable cannot be in a map clause in '#pragma omp target' directive}} expected-note {{defined as firstprivate}}
+  {}
   return tmain<int, 3>(argc)+tmain<from, 4>(argc); // expected-note {{in instantiation of function template specialization 'tmain<int, 3>' requested here}} expected-note {{in instantiation of function template specialization 'tmain<int, 4>' requested here}}
 }
-
+#endif
diff --git a/test/OpenMP/target_messages.cpp b/test/OpenMP/target_messages.cpp
index 86a91838ce021..6f79f44627fa4 100644
--- a/test/OpenMP/target_messages.cpp
+++ b/test/OpenMP/target_messages.cpp
@@ -1,6 +1,9 @@
 // RUN: %clang_cc1 -verify -fopenmp -std=c++11 -o - %s
-// RUN: not %clang_cc1 -fopenmp -std=c++11 -omptargets=aaa-bbb-ccc-ddd -o - %s 2>&1 | FileCheck %s
+// RUN: not %clang_cc1 -fopenmp -std=c++11 -fopenmp-targets=aaa-bbb-ccc-ddd -o - %s 2>&1 | FileCheck %s
 // CHECK: error: OpenMP target is invalid: 'aaa-bbb-ccc-ddd'
+// RUN: not %clang_cc1 -fopenmp -std=c++11 -triple nvptx64-nvidia-cuda -o - %s 2>&1 | FileCheck --check-prefix CHECK-UNSUPPORTED-HOST-TARGET %s
+// RUN: not %clang_cc1 -fopenmp -std=c++11 -triple nvptx-nvidia-cuda -o - %s 2>&1 | FileCheck --check-prefix CHECK-UNSUPPORTED-HOST-TARGET %s
+// CHECK-UNSUPPORTED-HOST-TARGET: error: The target '{{nvptx64-nvidia-cuda|nvptx-nvidia-cuda}}' is not a supported OpenMP host target.
 
 void foo() {
 }
@@ -21,6 +24,7 @@ int main(int argc, char **argv) {
   #pragma omp target } // expected-warning {{extra tokens at the end of '#pragma omp target' are ignored}}
   foo();
   #pragma omp target
+  foo();
   // expected-warning@+1 {{extra tokens at the end of '#pragma omp target' are ignored}}
   #pragma omp target unknown()
   foo();
diff --git a/test/OpenMP/target_nowait_messages.cpp b/test/OpenMP/target_nowait_messages.cpp
new file mode 100644
index 0000000000000..7531c81c0f6ae
--- /dev/null
+++ b/test/OpenMP/target_nowait_messages.cpp
@@ -0,0 +1,17 @@
+// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify -fopenmp -ferror-limit 100 -o - %s
+
+void foo() {
+}
+
+int main(int argc, char **argv) {
+  #pragma omp target nowait( // expected-warning {{extra tokens at the end of '#pragma omp target' are ignored}}
+  foo();
+  #pragma omp target nowait (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target' are ignored}}
+  foo();
+  #pragma omp target nowait device (-10u)
+  foo();
+  #pragma omp target nowait (3.14) device (-10u) // expected-warning {{extra tokens at the end of '#pragma omp target' are ignored}}
+  foo();
+
+  return 0;
+}
diff --git a/test/OpenMP/target_parallel_ast_print.cpp b/test/OpenMP/target_parallel_ast_print.cpp
new file mode 100644
index 0000000000000..1c0fca5ccfc06
--- /dev/null
+++ b/test/OpenMP/target_parallel_ast_print.cpp
@@ -0,0 +1,233 @@
+// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+void foo() {}
+
+template <class T>
+struct S {
+  operator T() {return T();}
+  static T TS;
+  #pragma omp threadprivate(TS)
+};
+
+// CHECK:      template <class T = int> struct S {
+// CHECK:        static int TS;
+// CHECK-NEXT:   #pragma omp threadprivate(S<int>::TS)
+// CHECK-NEXT: }
+// CHECK:      template <class T = char> struct S {
+// CHECK:        static char TS;
+// CHECK-NEXT:   #pragma omp threadprivate(S<char>::TS)
+// CHECK-NEXT: }
+// CHECK:      template <class T> struct S {
+// CHECK:        static T TS;
+// CHECK-NEXT:   #pragma omp threadprivate(S::TS)
+// CHECK:      };
+
+template <typename T, int C>
+T tmain(T argc, T *argv) {
+  T b = argc, c, d, e, f, g;
+  static T h;
+  S<T> s;
+  T arr[C][10], arr1[C];
+  T i, j, a[20];
+#pragma omp target parallel
+  h=2;
+#pragma omp target parallel default(none), private(argc,b) firstprivate(argv) shared (d) if (parallel:argc > 0) num_threads(C) proc_bind(master) reduction(+:c, arr1[argc]) reduction(max:e, arr[:C][0:10])
+  foo();
+#pragma omp target parallel if (C) num_threads(s) proc_bind(close) reduction(^:e, f, arr[0:C][:argc]) reduction(&& : g)
+  foo();
+#pragma omp target parallel if (target:argc > 0)
+  foo();
+#pragma omp target parallel if (parallel:argc > 0)
+  foo();
+#pragma omp target parallel if (C)
+  foo();
+#pragma omp target parallel map(i)
+  foo();
+#pragma omp target parallel map(a[0:10], i)
+  foo();
+#pragma omp target parallel map(to: i) map(from: j)
+  foo();
+#pragma omp target parallel map(always,alloc: i)
+  foo();
+#pragma omp target parallel nowait
+  foo();
+#pragma omp target parallel depend(in : argc, argv[i:argc], a[:])
+  foo();
+#pragma omp target parallel defaultmap(tofrom: scalar)
+  foo();
+  return 0;
+}
+
+// CHECK: template <typename T = int, int C = 5> int tmain(int argc, int *argv) {
+// CHECK-NEXT: int b = argc, c, d, e, f, g;
+// CHECK-NEXT: static int h;
+// CHECK-NEXT: S<int> s;
+// CHECK-NEXT: int arr[5][10], arr1[5];
+// CHECK-NEXT: int i, j, a[20]
+// CHECK-NEXT: #pragma omp target parallel
+// CHECK-NEXT: h = 2;
+// CHECK-NEXT: #pragma omp target parallel default(none) private(argc,b) firstprivate(argv) shared(d) if(parallel: argc > 0) num_threads(5) proc_bind(master) reduction(+: c,arr1[argc]) reduction(max: e,arr[:5][0:10])
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel if(5) num_threads(s) proc_bind(close) reduction(^: e,f,arr[0:5][:argc]) reduction(&&: g)
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel if(target: argc > 0)
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel if(parallel: argc > 0)
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel if(5)
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel map(tofrom: i)
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel map(tofrom: a[0:10],i)
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel map(to: i) map(from: j)
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel map(always,alloc: i)
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel nowait
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel depend(in : argc,argv[i:argc],a[:])
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel defaultmap(tofrom: scalar)
+// CHECK-NEXT: foo()
+// CHECK: template <typename T = char, int C = 1> char tmain(char argc, char *argv) {
+// CHECK-NEXT: char b = argc, c, d, e, f, g;
+// CHECK-NEXT: static char h;
+// CHECK-NEXT: S<char> s;
+// CHECK-NEXT: char arr[1][10], arr1[1];
+// CHECK-NEXT: char i, j, a[20]
+// CHECK-NEXT: #pragma omp target parallel
+// CHECK-NEXT: h = 2;
+// CHECK-NEXT: #pragma omp target parallel default(none) private(argc,b) firstprivate(argv) shared(d) if(parallel: argc > 0) num_threads(1) proc_bind(master) reduction(+: c,arr1[argc]) reduction(max: e,arr[:1][0:10])
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel if(1) num_threads(s) proc_bind(close) reduction(^: e,f,arr[0:1][:argc]) reduction(&&: g)
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel if(target: argc > 0)
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel if(parallel: argc > 0)
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel if(1)
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel map(tofrom: i)
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel map(tofrom: a[0:10],i)
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel map(to: i) map(from: j)
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel map(always,alloc: i)
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel nowait
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel depend(in : argc,argv[i:argc],a[:])
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel defaultmap(tofrom: scalar)
+// CHECK-NEXT: foo()
+// CHECK: template <typename T, int C> T tmain(T argc, T *argv) {
+// CHECK-NEXT: T b = argc, c, d, e, f, g;
+// CHECK-NEXT: static T h;
+// CHECK-NEXT: S<T> s;
+// CHECK-NEXT: T arr[C][10], arr1[C];
+// CHECK-NEXT: T i, j, a[20]
+// CHECK-NEXT: #pragma omp target parallel
+// CHECK-NEXT: h = 2;
+// CHECK-NEXT: #pragma omp target parallel default(none) private(argc,b) firstprivate(argv) shared(d) if(parallel: argc > 0) num_threads(C) proc_bind(master) reduction(+: c,arr1[argc]) reduction(max: e,arr[:C][0:10])
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel if(C) num_threads(s) proc_bind(close) reduction(^: e,f,arr[0:C][:argc]) reduction(&&: g)
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel if(target: argc > 0)
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel if(parallel: argc > 0)
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel if(C)
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel map(tofrom: i)
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel map(tofrom: a[0:10],i)
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel map(to: i) map(from: j)
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel map(always,alloc: i)
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel nowait
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel depend(in : argc,argv[i:argc],a[:])
+// CHECK-NEXT: foo()
+// CHECK-NEXT: #pragma omp target parallel defaultmap(tofrom: scalar)
+// CHECK-NEXT: foo()
+
+// CHECK-LABEL: int main(int argc, char **argv) {
+int main (int argc, char **argv) {
+  int i, j, a[20];
+// CHECK-NEXT: int i, j, a[20]
+#pragma omp target parallel
+// CHECK-NEXT: #pragma omp target parallel
+  foo();
+// CHECK-NEXT: foo();
+#pragma omp target parallel if (argc > 0)
+// CHECK-NEXT: #pragma omp target parallel if(argc > 0)
+  foo();
+// CHECK-NEXT: foo();
+
+#pragma omp target parallel if (target: argc > 0)
+// CHECK-NEXT: #pragma omp target parallel if(target: argc > 0)
+  foo();
+// CHECK-NEXT: foo();
+
+#pragma omp target parallel if (parallel: argc > 0)
+// CHECK-NEXT: #pragma omp target parallel if(parallel: argc > 0)
+  foo();
+// CHECK-NEXT: foo();
+
+#pragma omp target parallel map(i) if(argc>0)
+// CHECK-NEXT: #pragma omp target parallel map(tofrom: i) if(argc > 0)
+  foo();
+// CHECK-NEXT: foo();
+
+#pragma omp target parallel map(i)
+// CHECK-NEXT: #pragma omp target parallel map(tofrom: i)
+  foo();
+// CHECK-NEXT: foo();
+
+#pragma omp target parallel map(a[0:10], i)
+// CHECK-NEXT: #pragma omp target parallel map(tofrom: a[0:10],i)
+  foo();
+// CHECK-NEXT: foo();
+
+#pragma omp target parallel map(to: i) map(from: j)
+// CHECK-NEXT: #pragma omp target parallel map(to: i) map(from: j)
+  foo();
+// CHECK-NEXT: foo();
+
+#pragma omp target parallel map(always,alloc: i)
+// CHECK-NEXT: #pragma omp target parallel map(always,alloc: i)
+  foo();
+// CHECK-NEXT: foo();
+
+#pragma omp target parallel nowait
+// CHECK-NEXT: #pragma omp target parallel nowait
+  foo();
+// CHECK-NEXT: foo();
+
+#pragma omp target parallel depend(in : argc, argv[i:argc], a[:])
+// CHECK-NEXT: #pragma omp target parallel depend(in : argc,argv[i:argc],a[:])
+  foo();
+// CHECK-NEXT: foo();
+
+#pragma omp target parallel defaultmap(tofrom: scalar)
+// CHECK-NEXT: #pragma omp target parallel defaultmap(tofrom: scalar)
+  foo();
+// CHECK-NEXT: foo();
+
+  return tmain<int, 5>(argc, &argc) + tmain<char, 1>(argv[0][0], argv[0]);
+}
+
+extern template int S<int>::TS;
+extern template char S<char>::TS;
+
+#endif
diff --git a/test/OpenMP/target_parallel_default_messages.cpp b/test/OpenMP/target_parallel_default_messages.cpp
new file mode 100644
index 0000000000000..40f31b84848ae
--- /dev/null
+++ b/test/OpenMP/target_parallel_default_messages.cpp
@@ -0,0 +1,27 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -o - %s
+
+void foo();
+
+int main(int argc, char **argv) {
+  #pragma omp target parallel default // expected-error {{expected '(' after 'default'}}
+  foo();
+  #pragma omp target parallel default ( // expected-error {{expected 'none' or 'shared' in OpenMP clause 'default'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel default () // expected-error {{expected 'none' or 'shared' in OpenMP clause 'default'}}
+  foo();
+  #pragma omp target parallel default (none // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel default (shared), default(shared) // expected-error {{directive '#pragma omp target parallel' cannot contain more than one 'default' clause}}
+  foo();
+  #pragma omp target parallel default (x) // expected-error {{expected 'none' or 'shared' in OpenMP clause 'default'}}
+  foo();
+
+  #pragma omp target parallel default(none)
+  ++argc; // expected-error {{variable 'argc' must have explicitly specified data sharing attributes}}
+
+  #pragma omp target parallel default(none)
+  foo();
+  #pragma omp target parallel default(shared)
+  ++argc;
+  return 0;
+}
diff --git a/test/OpenMP/target_parallel_defaultmap_messages.cpp b/test/OpenMP/target_parallel_defaultmap_messages.cpp
new file mode 100644
index 0000000000000..49e7c30a7d15f
--- /dev/null
+++ b/test/OpenMP/target_parallel_defaultmap_messages.cpp
@@ -0,0 +1,56 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() {
+}
+
+template <class T, typename S, int N, int ST>
+T tmain(T argc, S **argv) {
+  #pragma omp target parallel defaultmap // expected-error {{expected '(' after 'defaultmap'}}
+  foo();
+  #pragma omp target parallel defaultmap ( // expected-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel defaultmap () // expected-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+  foo();
+  #pragma omp target parallel defaultmap (tofrom // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  foo();
+  #pragma omp target parallel defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  foo();
+  #pragma omp target parallel defaultmap (tofrom) // expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  foo();
+  #pragma omp target parallel defaultmap (tofrom scalar) // expected-warning {{missing ':' after defaultmap modifier - ignoring}}
+  foo();
+  #pragma omp target parallel defaultmap (tofrom, // expected-error {{expected ')'}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}} expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel defaultmap (scalar: // expected-error {{expected ')'}} expected-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel defaultmap (tofrom, scalar // expected-error {{expected ')'}} expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}}
+  foo();
+
+  return argc;
+}
+
+int main(int argc, char **argv) {
+  #pragma omp target parallel defaultmap // expected-error {{expected '(' after 'defaultmap'}}
+  foo();
+  #pragma omp target parallel defaultmap ( // expected-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel defaultmap () // expected-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+  foo();
+  #pragma omp target parallel defaultmap (tofrom // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  foo();
+  #pragma omp target parallel defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  foo();
+  #pragma omp target parallel defaultmap (tofrom) // expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  foo();
+  #pragma omp target parallel defaultmap (tofrom scalar) // expected-warning {{missing ':' after defaultmap modifier - ignoring}}
+  foo();
+  #pragma omp target parallel defaultmap (tofrom, // expected-error {{expected ')'}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}} expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel defaultmap (scalar: // expected-error {{expected ')'}} expected-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel defaultmap (tofrom, scalar // expected-error {{expected ')'}} expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}}
+  foo();
+
+  return tmain<int, char, 1, 0>(argc, argv);
+}
+
diff --git a/test/OpenMP/target_parallel_depend_messages.cpp b/test/OpenMP/target_parallel_depend_messages.cpp
new file mode 100644
index 0000000000000..a9300ca6adb2b
--- /dev/null
+++ b/test/OpenMP/target_parallel_depend_messages.cpp
@@ -0,0 +1,89 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -o - -std=c++11 %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+class vector {
+  public:
+    int operator[](int index) { return 0; }
+};
+
+int main(int argc, char **argv, char *env[]) {
+  vector vec;
+  typedef float V __attribute__((vector_size(16)));
+  V a;
+  auto arr = x; // expected-error {{use of undeclared identifier 'x'}}
+
+  #pragma omp target parallel depend // expected-error {{expected '(' after 'depend'}}
+  foo();
+  #pragma omp target parallel depend ( // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-warning {{missing ':' after dependency type - ignoring}}
+  foo();
+  #pragma omp target parallel depend () // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-warning {{missing ':' after dependency type - ignoring}}
+  foo();
+  #pragma omp target parallel depend (argc // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-warning {{missing ':' after dependency type - ignoring}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel depend (source : argc) // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}}
+  foo();
+  #pragma omp target parallel depend (source) // expected-error {{expected expression}} expected-warning {{missing ':' after dependency type - ignoring}}
+  foo();
+  #pragma omp target parallel depend (in : argc)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel' are ignored}}
+  foo();
+  #pragma omp target parallel depend (out: ) // expected-error {{expected expression}}
+  foo();
+  #pragma omp target parallel depend (inout : foobool(argc)), depend (in, argc) // expected-error {{expected variable name, array element or array section}} expected-warning {{missing ':' after dependency type - ignoring}} expected-error {{expected expression}}
+  foo();
+  #pragma omp target parallel depend (out :S1) // expected-error {{'S1' does not refer to a value}}
+  foo();
+  #pragma omp target parallel depend(in : argv[1][1] = '2') // expected-error {{expected variable name, array element or array section}}
+  foo();
+  #pragma omp target parallel depend (in : vec[1]) // expected-error {{expected variable name, array element or array section}}
+  foo();
+  #pragma omp target parallel depend (in : argv[0])
+  foo();
+  #pragma omp target parallel depend (in : ) // expected-error {{expected expression}}
+  foo();
+  #pragma omp target parallel depend (in : main) // expected-error {{expected variable name, array element or array section}}
+  foo();
+  #pragma omp target parallel depend(in : a[0]) // expected-error{{expected variable name, array element or array section}}
+  foo();
+  #pragma omp target parallel depend (in : vec[1:2]) // expected-error {{ value is not an array or pointer}}
+  foo();
+  #pragma omp target parallel depend (in : argv[ // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel depend (in : argv[: // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel depend (in : argv[:] // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel depend (in : argv[argc: // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel depend (in : argv[argc:argc] // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel depend (in : argv[0:-1]) // expected-error {{section length is evaluated to a negative value -1}}
+  foo();
+  #pragma omp target parallel depend (in : argv[-1:0]) // expected-error {{section lower bound is evaluated to a negative value -1}}
+  foo();
+  #pragma omp target parallel depend (in : argv[:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
+  foo();
+  #pragma omp target parallel depend (in : argv[3:4:1]) // expected-error {{expected ']'}} expected-note {{to match this '['}}
+  foo();
+  #pragma omp target parallel depend(in:a[0:1]) // expected-error {{subscripted value is not an array or pointer}}
+  foo();
+  #pragma omp target parallel depend(in:argv[argv[:2]:1]) // expected-error {{OpenMP array section is not allowed here}}
+  foo();
+  #pragma omp target parallel depend(in:argv[0:][:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
+  foo();
+  #pragma omp target parallel depend(in:env[0:][:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is an array of unknown bound}}
+  foo();
+  #pragma omp target parallel depend(in : argv[ : argc][1 : argc - 1])
+  foo();
+  #pragma omp target parallel depend(in : arr[0])
+  foo();
+
+  return 0;
+}
diff --git a/test/OpenMP/target_parallel_device_messages.cpp b/test/OpenMP/target_parallel_device_messages.cpp
new file mode 100644
index 0000000000000..6c8d4c2e43ab2
--- /dev/null
+++ b/test/OpenMP/target_parallel_device_messages.cpp
@@ -0,0 +1,39 @@
+// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify -fopenmp -ferror-limit 100 -o - %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+int main(int argc, char **argv) {
+  #pragma omp target parallel device // expected-error {{expected '(' after 'device'}}
+  foo();
+  #pragma omp target parallel device ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel device () // expected-error {{expected expression}}
+  foo();
+  #pragma omp target parallel device (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel device (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel' are ignored}}
+  foo();
+  #pragma omp target parallel device (argc > 0 ? argv[1] : argv[2]) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  foo();
+  #pragma omp target parallel device (argc + argc)
+  foo();
+  #pragma omp target parallel device (argc), device (argc+1) // expected-error {{directive '#pragma omp target parallel' cannot contain more than one 'device' clause}}
+  foo();
+  #pragma omp target parallel device (S1) // expected-error {{'S1' does not refer to a value}}
+  foo();
+  #pragma omp target parallel device (-2) // expected-error {{argument to 'device' clause must be a non-negative integer value}}
+  foo();
+  #pragma omp target parallel device (-10u)
+  foo();
+  #pragma omp target parallel device (3.14) // expected-error {{expression must have integral or unscoped enumeration type, not 'double'}}
+  foo();
+
+  return 0;
+}
diff --git a/test/OpenMP/target_parallel_firstprivate_messages.cpp b/test/OpenMP/target_parallel_firstprivate_messages.cpp
new file mode 100644
index 0000000000000..dd6825aa251f9
--- /dev/null
+++ b/test/OpenMP/target_parallel_firstprivate_messages.cpp
@@ -0,0 +1,112 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}} expected-note{{forward declaration of 'S1'}}
+extern S1 a;
+class S2 {
+  mutable int a;
+public:
+  S2():a(0) { }
+  S2(const S2 &s2):a(s2.a) { }
+  static float S2s;
+  static const float S2sc;
+};
+const float S2::S2sc = 0;
+const S2 b;
+const S2 ba[5];
+class S3 {
+  int a;
+public:
+  S3():a(0) { }
+  S3(const S3 &s3):a(s3.a) { }
+};
+const S3 c;
+const S3 ca[5];
+extern const int f;
+class S4 {
+  int a;
+  S4();
+  S4(const S4 &s4); // expected-note {{implicitly declared private here}}
+public:
+  S4(int v):a(v) { }
+};
+class S5 {
+  int a;
+  S5():a(0) {}
+  S5(const S5 &s5):a(s5.a) { } // expected-note {{implicitly declared private here}}
+public:
+  S5(int v):a(v) { }
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note {{defined as threadprivate or thread local}}
+
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
+int main(int argc, char **argv) {
+  const int d = 5;
+  const int da[5] = { 0 };
+  S4 e(4);
+  S5 g(5);
+  int i;
+  int &j = i;
+  static int m;
+  #pragma omp target parallel firstprivate // expected-error {{expected '(' after 'firstprivate'}}
+  foo();
+  #pragma omp target parallel firstprivate ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel firstprivate () // expected-error {{expected expression}}
+  foo();
+  #pragma omp target parallel firstprivate (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel firstprivate (argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel firstprivate (argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  foo();
+  #pragma omp target parallel firstprivate (argc)
+  foo();
+  #pragma omp target parallel firstprivate (S1) // expected-error {{'S1' does not refer to a value}}
+  foo();
+  #pragma omp target parallel firstprivate (a, b, c, d, f) // expected-error {{firstprivate variable with incomplete type 'S1'}}
+  foo();
+  #pragma omp target parallel firstprivate (argv[1]) // expected-error {{expected variable name}}
+  foo();
+  #pragma omp target parallel firstprivate(ba)
+  foo();
+  #pragma omp target parallel firstprivate(ca)
+  foo();
+  #pragma omp target parallel firstprivate(da)
+  foo();
+  #pragma omp target parallel firstprivate(S2::S2s)
+  foo();
+  #pragma omp target parallel firstprivate(S2::S2sc)
+  foo();
+  #pragma omp target parallel firstprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
+  foo();
+  #pragma omp target parallel firstprivate(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be firstprivate}}
+  foo();
+  #pragma omp target parallel private(i), firstprivate(i) // expected-error {{private variable cannot be firstprivate}} expected-note{{defined as private}}
+  foo();
+  #pragma omp target parallel shared(i)
+  foo();
+  #pragma omp target parallel firstprivate(i)
+  foo();
+  #pragma omp target parallel firstprivate(j)
+  foo();
+  #pragma omp target parallel firstprivate(m)
+  foo();
+
+  return 0;
+}
diff --git a/test/OpenMP/target_parallel_for_ast_print.cpp b/test/OpenMP/target_parallel_for_ast_print.cpp
new file mode 100644
index 0000000000000..6c551cbb9abfe
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_ast_print.cpp
@@ -0,0 +1,252 @@
+// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+void foo() {}
+
+struct S {
+  S(): a(0) {}
+  S(int v) : a(v) {}
+  int a;
+  typedef int type;
+};
+
+template <typename T>
+class S7 : public T {
+protected:
+  T a;
+  S7() : a(0) {}
+
+public:
+  S7(typename T::type v) : a(v) {
+#pragma omp target parallel for private(a) private(this->a) private(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S7 &operator=(S7 &s) {
+#pragma omp target parallel for private(a) private(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+// CHECK: #pragma omp target parallel for private(this->a) private(this->a) private(this->S::a)
+// CHECK: #pragma omp target parallel for private(this->a) private(this->a) private(T::a)
+// CHECK: #pragma omp target parallel for private(this->a) private(this->a)
+
+class S8 : public S7<S> {
+  S8() {}
+
+public:
+  S8(int v) : S7<S>(v){
+#pragma omp target parallel for private(a) private(this->a) private(S7<S>::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S8 &operator=(S8 &s) {
+#pragma omp target parallel for private(a) private(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+// CHECK: #pragma omp target parallel for private(this->a) private(this->a) private(this->S7<S>::a)
+// CHECK: #pragma omp target parallel for private(this->a) private(this->a)
+
+template <class T, int N>
+T tmain(T argc, T *argv) {
+  T b = argc, c, d, e, f, h;
+  T arr[N][10], arr1[N];
+  T i, j;
+  T s;
+  static T a;
+// CHECK: static T a;
+  static T g;
+#pragma omp threadprivate(g)
+#pragma omp target parallel for schedule(dynamic) default(none) linear(a)
+  // CHECK: #pragma omp target parallel for schedule(dynamic) default(none) linear(a)
+  for (int i = 0; i < 2; ++i)
+    a = 2;
+// CHECK-NEXT: for (int i = 0; i < 2; ++i)
+// CHECK-NEXT: a = 2;
+#pragma omp target parallel for private(argc, b), firstprivate(c, d), lastprivate(d, f) collapse(N) schedule(static, N) ordered(N) if (parallel :argc) num_threads(N) default(shared) shared(e) reduction(+ : h)
+  for (int i = 0; i < 2; ++i)
+    for (int j = 0; j < 2; ++j)
+      for (int j = 0; j < 2; ++j)
+        for (int j = 0; j < 2; ++j)
+          for (int j = 0; j < 2; ++j)
+  for (int i = 0; i < 2; ++i)
+    for (int j = 0; j < 2; ++j)
+      for (int j = 0; j < 2; ++j)
+        for (int j = 0; j < 2; ++j)
+          for (int j = 0; j < 2; ++j)
+            foo();
+  // CHECK-NEXT: #pragma omp target parallel for private(argc,b) firstprivate(c,d) lastprivate(d,f) collapse(N) schedule(static, N) ordered(N) if(parallel: argc) num_threads(N) default(shared) shared(e) reduction(+: h)
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: foo();
+#pragma omp target parallel for default(none), private(argc,b) firstprivate(argv) shared (d) if (parallel:argc > 0) num_threads(N) proc_bind(master) reduction(+:c, arr1[argc]) reduction(max:e, arr[:N][0:10])
+  for (int i = 0; i < 2; ++i) {}
+// CHECK-NEXT: #pragma omp target parallel for default(none) private(argc,b) firstprivate(argv) shared(d) if(parallel: argc > 0) num_threads(N) proc_bind(master) reduction(+: c,arr1[argc]) reduction(max: e,arr[:N][0:10])
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+#pragma omp target parallel for if (N) num_threads(s) proc_bind(close) reduction(^:e, f, arr[0:N][:argc]) reduction(&& : h)
+// CHECK-NEXT: #pragma omp target parallel for if(N) num_threads(s) proc_bind(close) reduction(^: e,f,arr[0:N][:argc]) reduction(&&: h)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+#pragma omp target parallel for if (target:argc > 0)
+// CHECK-NEXT: #pragma omp target parallel for if(target: argc > 0)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+#pragma omp target parallel for if (parallel:argc > 0)
+// CHECK-NEXT: #pragma omp target parallel for if(parallel: argc > 0)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+#pragma omp target parallel for if (N)
+// CHECK-NEXT: #pragma omp target parallel for if(N)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+#pragma omp target parallel for map(i)
+// CHECK-NEXT: #pragma omp target parallel for map(tofrom: i)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+#pragma omp target parallel for map(arr1[0:10], i)
+// CHECK-NEXT: #pragma omp target parallel for map(tofrom: arr1[0:10],i)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+#pragma omp target parallel for map(to: i) map(from: j)
+// CHECK-NEXT: #pragma omp target parallel for map(to: i) map(from: j)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+#pragma omp target parallel for map(always,alloc: i)
+// CHECK-NEXT: #pragma omp target parallel for map(always,alloc: i)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+#pragma omp target parallel for nowait
+// CHECK-NEXT: #pragma omp target parallel for nowait
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+#pragma omp target parallel for depend(in : argc, arr[i:argc], arr1[:])
+// CHECK-NEXT: #pragma omp target parallel for depend(in : argc,arr[i:argc],arr1[:])
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+#pragma omp target parallel for defaultmap(tofrom: scalar)
+// CHECK-NEXT: #pragma omp target parallel for defaultmap(tofrom: scalar)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+  return T();
+}
+
+int main(int argc, char **argv) {
+  int b = argc, c, d, e, f, h;
+  int arr[5][10], arr1[5];
+  int i, j;
+  int s;
+  static int a;
+// CHECK: static int a;
+  static float g;
+#pragma omp threadprivate(g)
+#pragma omp target parallel for schedule(guided, argc) default(none) linear(a)
+  // CHECK: #pragma omp target parallel for schedule(guided, argc) default(none) linear(a)
+  for (int i = 0; i < 2; ++i)
+    a = 2;
+// CHECK-NEXT: for (int i = 0; i < 2; ++i)
+// CHECK-NEXT: a = 2;
+#pragma omp target parallel for private(argc, b), firstprivate(argv, c), lastprivate(d, f) collapse(2) schedule(auto) ordered if (target: argc) num_threads(a) default(shared) shared(e) reduction(+ : h) linear(a:-5)
+  for (int i = 0; i < 10; ++i)
+    for (int j = 0; j < 10; ++j)
+      foo();
+  // CHECK-NEXT: #pragma omp target parallel for private(argc,b) firstprivate(argv,c) lastprivate(d,f) collapse(2) schedule(auto) ordered if(target: argc) num_threads(a) default(shared) shared(e) reduction(+: h) linear(a: -5)
+ // CHECK-NEXT: for (int i = 0; i < 10; ++i)
+  // CHECK-NEXT: for (int j = 0; j < 10; ++j)
+  // CHECK-NEXT: foo();
+#pragma omp target parallel for default(none), private(argc,b) firstprivate(argv) shared (d) if (parallel:argc > 0) num_threads(5) proc_bind(master) reduction(+:c, arr1[argc]) reduction(max:e, arr[:5][0:10])
+  for (int i = 0; i < 2; ++i) {}
+// CHECK-NEXT: #pragma omp target parallel for default(none) private(argc,b) firstprivate(argv) shared(d) if(parallel: argc > 0) num_threads(5) proc_bind(master) reduction(+: c,arr1[argc]) reduction(max: e,arr[:5][0:10])
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+#pragma omp target parallel for if (5) num_threads(s) proc_bind(close) reduction(^:e, f, arr[0:5][:argc]) reduction(&& : h)
+// CHECK-NEXT: #pragma omp target parallel for if(5) num_threads(s) proc_bind(close) reduction(^: e,f,arr[0:5][:argc]) reduction(&&: h)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+#pragma omp target parallel for if (target:argc > 0)
+// CHECK-NEXT: #pragma omp target parallel for if(target: argc > 0)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+#pragma omp target parallel for if (parallel:argc > 0)
+// CHECK-NEXT: #pragma omp target parallel for if(parallel: argc > 0)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+#pragma omp target parallel for if (5)
+// CHECK-NEXT: #pragma omp target parallel for if(5)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+#pragma omp target parallel for map(i)
+// CHECK-NEXT: #pragma omp target parallel for map(tofrom: i)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+#pragma omp target parallel for map(arr1[0:10], i)
+// CHECK-NEXT: #pragma omp target parallel for map(tofrom: arr1[0:10],i)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+#pragma omp target parallel for map(to: i) map(from: j)
+// CHECK-NEXT: #pragma omp target parallel for map(to: i) map(from: j)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+#pragma omp target parallel for map(always,alloc: i)
+// CHECK-NEXT: #pragma omp target parallel for map(always,alloc: i)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+#pragma omp target parallel for nowait
+// CHECK-NEXT: #pragma omp target parallel for nowait
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+#pragma omp target parallel for depend(in : argc, arr[i:argc], arr1[:])
+// CHECK-NEXT: #pragma omp target parallel for depend(in : argc,arr[i:argc],arr1[:])
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+#pragma omp target parallel for defaultmap(tofrom: scalar)
+// CHECK-NEXT: #pragma omp target parallel for defaultmap(tofrom: scalar)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+  return (tmain<int, 5>(argc, &argc) + tmain<char, 1>(argv[0][0], argv[0]));
+}
+
+#endif
diff --git a/test/OpenMP/target_parallel_for_collapse_messages.cpp b/test/OpenMP/target_parallel_for_collapse_messages.cpp
new file mode 100644
index 0000000000000..8cf502be0d9cc
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_collapse_messages.cpp
@@ -0,0 +1,83 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, typename S, int N, int ST> // expected-note {{declared here}}
+T tmain(T argc, S **argv) { //expected-note 2 {{declared here}}
+  #pragma omp target parallel for collapse // expected-error {{expected '(' after 'collapse'}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for collapse ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for collapse () // expected-error {{expected expression}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  // expected-error@+3 {{expected ')'}} expected-note@+3 {{to match this '('}}
+  // expected-error@+2 2 {{expression is not an integral constant expression}}
+  // expected-note@+1 2 {{read of non-const variable 'argc' is not allowed in a constant expression}}
+  #pragma omp target parallel for collapse (argc 
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  // expected-error@+1 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
+  #pragma omp target parallel for collapse (ST // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for collapse (1)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for' are ignored}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for collapse ((ST > 0) ? 1 + ST : 2) // expected-note 2 {{as specified in 'collapse' clause}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST]; // expected-error 2 {{expected 2 for loops after '#pragma omp target parallel for', but found only 1}}
+  // expected-error@+3 2 {{directive '#pragma omp target parallel for' cannot contain more than one 'collapse' clause}}
+  // expected-error@+2 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
+  // expected-error@+1 2 {{expression is not an integral constant expression}}
+  #pragma omp target parallel for collapse (foobool(argc)), collapse (true), collapse (-5)
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for collapse (S) // expected-error {{'S' does not refer to a value}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  // expected-error@+1 2 {{expression is not an integral constant expression}}
+  #pragma omp target parallel for collapse (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for collapse (1)
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for collapse (N) // expected-error {{argument to 'collapse' clause must be a strictly positive integer value}}
+  for (T i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for collapse (2) // expected-note {{as specified in 'collapse' clause}}
+  foo(); // expected-error {{expected 2 for loops after '#pragma omp target parallel for'}}
+  return argc;
+}
+
+int main(int argc, char **argv) {
+  #pragma omp target parallel for collapse // expected-error {{expected '(' after 'collapse'}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for collapse ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for collapse () // expected-error {{expected expression}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for collapse (4 // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-note {{as specified in 'collapse' clause}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; // expected-error {{expected 4 for loops after '#pragma omp target parallel for', but found only 1}}
+  #pragma omp target parallel for collapse (2+2)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for' are ignored}}  expected-note {{as specified in 'collapse' clause}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; // expected-error {{expected 4 for loops after '#pragma omp target parallel for', but found only 1}}
+  #pragma omp target parallel for collapse (foobool(1) > 0 ? 1 : 2) // expected-error {{expression is not an integral constant expression}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  // expected-error@+3 {{expression is not an integral constant expression}}
+  // expected-error@+2 2 {{directive '#pragma omp target parallel for' cannot contain more than one 'collapse' clause}}
+  // expected-error@+1 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
+  #pragma omp target parallel for collapse (foobool(argc)), collapse (true), collapse (-5) 
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for collapse (S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  // expected-error@+1 {{expression is not an integral constant expression}}
+  #pragma omp target parallel for collapse (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  // expected-error@+3 {{statement after '#pragma omp target parallel for' must be a for loop}}
+  // expected-note@+1 {{in instantiation of function template specialization 'tmain<int, char, -1, -2>' requested here}}
+  #pragma omp target parallel for collapse(collapse(tmain<int, char, -1, -2>(argc, argv) // expected-error 2 {{expected ')'}} expected-note 2 {{to match this '('}}
+  foo();
+  #pragma omp target parallel for collapse (2) // expected-note {{as specified in 'collapse' clause}}
+  foo(); // expected-error {{expected 2 for loops after '#pragma omp target parallel for'}}
+  // expected-note@+1 {{in instantiation of function template specialization 'tmain<int, char, 1, 0>' requested here}}
+  return tmain<int, char, 1, 0>(argc, argv);
+}
+
diff --git a/test/OpenMP/target_parallel_for_default_messages.cpp b/test/OpenMP/target_parallel_for_default_messages.cpp
new file mode 100644
index 0000000000000..c1f04f4f6e409
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_default_messages.cpp
@@ -0,0 +1,36 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -o - %s
+
+void foo();
+
+int main(int argc, char **argv) {
+  int i;
+#pragma omp target parallel for default // expected-error {{expected '(' after 'default'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for default( // expected-error {{expected 'none' or 'shared' in OpenMP clause 'default'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for default() // expected-error {{expected 'none' or 'shared' in OpenMP clause 'default'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for default(none // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) // expected-error {{variable 'argc' must have explicitly specified data sharing attributes}}
+    foo();
+#pragma omp target parallel for default(shared), default(shared) // expected-error {{directive '#pragma omp target parallel for' cannot contain more than one 'default' clause}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for default(x) // expected-error {{expected 'none' or 'shared' in OpenMP clause 'default'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+
+#pragma omp target parallel for default(none)
+  for (i = 0; i < argc; ++i)  // expected-error {{variable 'argc' must have explicitly specified data sharing attributes}}
+    foo();
+
+#pragma omp parallel default(none)
+#pragma omp target parallel for default(shared)
+  for (i = 0; i < argc; ++i)
+    foo();
+
+  return 0;
+}
diff --git a/test/OpenMP/target_parallel_for_defaultmap_messages.cpp b/test/OpenMP/target_parallel_for_defaultmap_messages.cpp
new file mode 100644
index 0000000000000..24973edd2bda8
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_defaultmap_messages.cpp
@@ -0,0 +1,58 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() {
+}
+
+template <class T, typename S, int N, int ST>
+T tmain(T argc, S **argv) {
+  int i;
+  #pragma omp target parallel for defaultmap // expected-error {{expected '(' after 'defaultmap'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for defaultmap ( // expected-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for defaultmap () // expected-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for defaultmap (tofrom // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for defaultmap (tofrom) // expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for defaultmap (tofrom scalar) // expected-warning {{missing ':' after defaultmap modifier - ignoring}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for defaultmap (tofrom, // expected-error {{expected ')'}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}} expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for defaultmap (scalar: // expected-error {{expected ')'}} expected-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for defaultmap (tofrom, scalar // expected-error {{expected ')'}} expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return argc;
+}
+
+int main(int argc, char **argv) {
+  int i;
+  #pragma omp target parallel for defaultmap // expected-error {{expected '(' after 'defaultmap'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for defaultmap ( // expected-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for defaultmap () // expected-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for defaultmap (tofrom // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for defaultmap (tofrom) // expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for defaultmap (tofrom scalar) // expected-warning {{missing ':' after defaultmap modifier - ignoring}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for defaultmap (tofrom, // expected-error {{expected ')'}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}} expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for defaultmap (scalar: // expected-error {{expected ')'}} expected-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for defaultmap (tofrom, scalar // expected-error {{expected ')'}} expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return tmain<int, char, 1, 0>(argc, argv);
+}
+
diff --git a/test/OpenMP/target_parallel_for_depend_messages.cpp b/test/OpenMP/target_parallel_for_depend_messages.cpp
new file mode 100644
index 0000000000000..d0e74f9cc6295
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_depend_messages.cpp
@@ -0,0 +1,90 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -o - -std=c++11 %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+class vector {
+  public:
+    int operator[](int index) { return 0; }
+};
+
+int main(int argc, char **argv, char *env[]) {
+  vector vec;
+  typedef float V __attribute__((vector_size(16)));
+  V a;
+  auto arr = x; // expected-error {{use of undeclared identifier 'x'}}
+  int i;
+
+  #pragma omp target parallel for depend // expected-error {{expected '(' after 'depend'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend ( // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-warning {{missing ':' after dependency type - ignoring}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend () // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-warning {{missing ':' after dependency type - ignoring}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend (argc // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-warning {{missing ':' after dependency type - ignoring}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend (source : argc) // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend (source) // expected-error {{expected expression}} expected-warning {{missing ':' after dependency type - ignoring}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend (in : argc)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for' are ignored}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend (out: ) // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend (inout : foobool(argc)), depend (in, argc) // expected-error {{expected variable name, array element or array section}} expected-warning {{missing ':' after dependency type - ignoring}} expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend (out :S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend(in : argv[1][1] = '2') // expected-error {{expected variable name, array element or array section}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend (in : vec[1]) // expected-error {{expected variable name, array element or array section}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend (in : argv[0])
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend (in : ) // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend (in : main) // expected-error {{expected variable name, array element or array section}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend(in : a[0]) // expected-error{{expected variable name, array element or array section}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend (in : vec[1:2]) // expected-error {{ value is not an array or pointer}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend (in : argv[ // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend (in : argv[: // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend (in : argv[:] // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend (in : argv[argc: // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend (in : argv[argc:argc] // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend (in : argv[0:-1]) // expected-error {{section length is evaluated to a negative value -1}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend (in : argv[-1:0]) // expected-error {{section lower bound is evaluated to a negative value -1}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend (in : argv[:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend (in : argv[3:4:1]) // expected-error {{expected ']'}} expected-note {{to match this '['}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend(in:a[0:1]) // expected-error {{subscripted value is not an array or pointer}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend(in:argv[argv[:2]:1]) // expected-error {{OpenMP array section is not allowed here}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend(in:argv[0:][:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend(in:env[0:][:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is an array of unknown bound}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend(in : argv[ : argc][1 : argc - 1])
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for depend(in : arr[0])
+  for (i = 0; i < argc; ++i) foo();
+
+  return 0;
+}
diff --git a/test/OpenMP/target_parallel_for_device_messages.cpp b/test/OpenMP/target_parallel_for_device_messages.cpp
new file mode 100644
index 0000000000000..16a21ba4d662d
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_device_messages.cpp
@@ -0,0 +1,40 @@
+// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify -fopenmp -ferror-limit 100 -o - %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+int main(int argc, char **argv) {
+  int i;
+  #pragma omp target parallel for device // expected-error {{expected '(' after 'device'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for device ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for device () // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for device (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for device (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for' are ignored}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for device (argc > 0 ? argv[1] : argv[2]) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for device (argc + argc)
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for device (argc), device (argc+1) // expected-error {{directive '#pragma omp target parallel for' cannot contain more than one 'device' clause}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for device (S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for device (-2) // expected-error {{argument to 'device' clause must be a non-negative integer value}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for device (-10u)
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for device (3.14) // expected-error {{expression must have integral or unscoped enumeration type, not 'double'}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return 0;
+}
diff --git a/test/OpenMP/target_parallel_for_firstprivate_messages.cpp b/test/OpenMP/target_parallel_for_firstprivate_messages.cpp
new file mode 100644
index 0000000000000..36bfe25e94682
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_firstprivate_messages.cpp
@@ -0,0 +1,261 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note 2 {{declared here}} expected-note 2 {{forward declaration of 'S1'}}
+extern S1 a;
+class S2 {
+  mutable int a;
+
+public:
+  S2() : a(0) {}
+  S2(const S2 &s2) : a(s2.a) {}
+  static float S2s;
+  static const float S2sc;
+};
+const float S2::S2sc = 0;
+const S2 b;
+const S2 ba[5];
+class S3 {
+  int a;
+  S3 &operator=(const S3 &s3);
+
+public:
+  S3() : a(0) {}
+  S3(const S3 &s3) : a(s3.a) {}
+};
+const S3 c;
+const S3 ca[5];
+extern const int f;
+class S4 {
+  int a;
+  S4();
+  S4(const S4 &s4); // expected-note 2 {{implicitly declared private here}}
+
+public:
+  S4(int v) : a(v) {}
+};
+class S5 {
+  int a;
+  S5(const S5 &s5) : a(s5.a) {} // expected-note 4 {{implicitly declared private here}}
+
+public:
+  S5() : a(0) {}
+  S5(int v) : a(v) {}
+};
+class S6 {
+  int a;
+  S6() : a(0) {}
+
+public:
+  S6(const S6 &s6) : a(s6.a) {}
+  S6(int v) : a(v) {}
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class I, class C>
+int foomain(int argc, char **argv) {
+  I e(4);
+  C g(5);
+  int i;
+  int &j = i;
+#pragma omp target parallel for firstprivate // expected-error {{expected '(' after 'firstprivate'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for firstprivate( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for firstprivate() // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for firstprivate(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for firstprivate(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for firstprivate(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for firstprivate(argc)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for firstprivate(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for firstprivate(a, b) // expected-error {{firstprivate variable with incomplete type 'S1'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for firstprivate(argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for firstprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for firstprivate(h) // expected-error {{threadprivate or thread local variable cannot be firstprivate}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp parallel
+  {
+    int v = 0;
+    int i;
+#pragma omp target parallel for firstprivate(i)
+    for (int k = 0; k < argc; ++k) {
+      i = k;
+      v += i;
+    }
+  }
+#pragma omp parallel shared(i)
+#pragma omp parallel private(i)
+#pragma omp target parallel for firstprivate(j)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for firstprivate(i)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for lastprivate(g) firstprivate(g) // expected-error {{calling a private constructor of class 'S5'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp parallel private(i)
+#pragma omp target parallel for firstprivate(i) // expected-note {{defined as firstprivate}}
+  for (i = 0; i < argc; ++i) // expected-error {{loop iteration variable in the associated loop of 'omp target parallel for' directive may not be firstprivate, predetermined as private}}
+    foo();
+#pragma omp parallel reduction(+ : i)
+#pragma omp target parallel for firstprivate(i) // expected-note {{defined as firstprivate}}
+  for (i = 0; i < argc; ++i) // expected-error {{loop iteration variable in the associated loop of 'omp target parallel for' directive may not be firstprivate, predetermined as private}}
+    foo();
+  return 0;
+}
+
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
+int main(int argc, char **argv) {
+  const int d = 5;
+  const int da[5] = {0};
+  S4 e(4);
+  S5 g(5);
+  S3 m;
+  S6 n(2);
+  int i;
+  int &j = i;
+#pragma omp target parallel for firstprivate // expected-error {{expected '(' after 'firstprivate'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for firstprivate( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for firstprivate() // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for firstprivate(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for firstprivate(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for firstprivate(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for firstprivate(argc)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for firstprivate(S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for firstprivate(a, b, c, d, f) // expected-error {{firstprivate variable with incomplete type 'S1'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for firstprivate(argv[1]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for firstprivate(2 * 2) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for firstprivate(ba) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for firstprivate(ca) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for firstprivate(da) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+  int xa;
+#pragma omp target parallel for firstprivate(xa) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for firstprivate(S2::S2s) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for firstprivate(S2::S2sc) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for safelen(5) // expected-error {{unexpected OpenMP clause 'safelen' in directive '#pragma omp target parallel for'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for firstprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for firstprivate(m) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for firstprivate(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be firstprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for private(xa), firstprivate(xa) // expected-error {{private variable cannot be firstprivate}} expected-note {{defined as private}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for firstprivate(i) // expected-note {{defined as firstprivate}}
+  for (i = 0; i < argc; ++i)    // expected-error {{loop iteration variable in the associated loop of 'omp target parallel for' directive may not be firstprivate, predetermined as private}}
+    foo();
+#pragma omp parallel shared(xa)
+#pragma omp target parallel for firstprivate(xa) // OK: may be firstprivate
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for firstprivate(j)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for lastprivate(g) firstprivate(g) // expected-error {{calling a private constructor of class 'S5'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for lastprivate(n) firstprivate(n) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp parallel
+  {
+    int v = 0;
+    int i;
+#pragma omp target parallel for firstprivate(i)
+    for (int k = 0; k < argc; ++k) {
+      i = k;
+      v += i;
+    }
+  }
+#pragma omp parallel private(i)
+#pragma omp target parallel for firstprivate(i) // expected-note {{defined as firstprivate}}
+  for (i = 0; i < argc; ++i) // expected-error {{loop iteration variable in the associated loop of 'omp target parallel for' directive may not be firstprivate, predetermined as private}}
+    foo();
+#pragma omp parallel reduction(+ : i)
+#pragma omp target parallel for firstprivate(i) // expected-note {{defined as firstprivate}}
+  for (i = 0; i < argc; ++i) // expected-error {{loop iteration variable in the associated loop of 'omp target parallel for' directive may not be firstprivate, predetermined as private}}
+    foo();
+  static int si;
+#pragma omp target parallel for firstprivate(si) // OK
+  for (i = 0; i < argc; ++i)
+    si = i + 1;
+
+  return foomain<S4, S5>(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<S4, S5>' requested here}}
+}
diff --git a/test/OpenMP/target_parallel_for_if_messages.cpp b/test/OpenMP/target_parallel_for_if_messages.cpp
new file mode 100644
index 0000000000000..01173c159d57f
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_if_messages.cpp
@@ -0,0 +1,105 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, class S> // expected-note {{declared here}}
+int tmain(T argc, S **argv) {
+  int i;
+  #pragma omp target parallel for if // expected-error {{expected '(' after 'if'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if () // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for' are ignored}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if (argc > 0 ? argv[1] : argv[2])
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if (foobool(argc)), if (true) // expected-error {{directive '#pragma omp target parallel for' cannot contain more than one 'if' clause}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if (S) // expected-error {{'S' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if (argc argc) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if(argc)
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if(target : // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if(parallel : argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if(target : argc)
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if(parallel : argc)
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if(target : argc) if(parallel : argc)
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if(parallel : argc) if (for:argc) // expected-error {{directive name modifier 'for' is not allowed for '#pragma omp target parallel for'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if(target : argc) if (target :argc) // expected-error {{directive '#pragma omp target parallel for' cannot contain more than one 'if' clause with 'target' name modifier}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if(parallel : argc) if (parallel :argc) // expected-error {{directive '#pragma omp target parallel for' cannot contain more than one 'if' clause with 'parallel' name modifier}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if(target : argc) if (argc) // expected-error {{expected  'parallel' directive name modifier}} expected-note {{previous clause with directive name modifier specified here}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if(target : argc) if(parallel : argc) if (argc) // expected-error {{no more 'if' clause is allowed}} expected-note {{previous clause with directive name modifier specified here}} expected-note {{previous clause with directive name modifier specified here}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return 0;
+}
+
+int main(int argc, char **argv) {
+  int i;
+  #pragma omp target parallel for if // expected-error {{expected '(' after 'if'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if () // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for' are ignored}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if (argc > 0 ? argv[1] : argv[2])
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if (foobool(argc)), if (true) // expected-error {{directive '#pragma omp target parallel for' cannot contain more than one 'if' clause}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if (S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if (argc argc) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if (1 0) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if(if(tmain(argc, argv) // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if(target : // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if(parallel : argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if(parallel : argc)
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if(target : argc) if (for:argc) // expected-error {{directive name modifier 'for' is not allowed for '#pragma omp target parallel for'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if(target : argc) if (target :argc) // expected-error {{directive '#pragma omp target parallel for' cannot contain more than one 'if' clause with 'target' name modifier}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if(parallel : argc) if (parallel :argc) // expected-error {{directive '#pragma omp target parallel for' cannot contain more than one 'if' clause with 'parallel' name modifier}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if(target : argc) if (argc) // expected-error {{expected  'parallel' directive name modifier}} expected-note {{previous clause with directive name modifier specified here}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for if(target : argc) if(parallel : argc) if (argc) // expected-error {{no more 'if' clause is allowed}} expected-note {{previous clause with directive name modifier specified here}} expected-note {{previous clause with directive name modifier specified here}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return tmain(argc, argv);
+}
diff --git a/test/OpenMP/target_parallel_for_lastprivate_messages.cpp b/test/OpenMP/target_parallel_for_lastprivate_messages.cpp
new file mode 100644
index 0000000000000..c001b7f0d1685
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_lastprivate_messages.cpp
@@ -0,0 +1,238 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note 2 {{declared here}} expected-note 2 {{forward declaration of 'S1'}}
+extern S1 a;
+class S2 {
+  mutable int a;
+
+public:
+  S2() : a(0) {}
+  S2(S2 &s2) : a(s2.a) {}
+  S2 &operator=(const S2 &);
+  const S2 &operator=(const S2 &) const;
+  static float S2s; // expected-note {{static data member is predetermined as shared}}
+  static const float S2sc;
+};
+const float S2::S2sc = 0; // expected-note {{static data member is predetermined as shared}}
+const S2 b;
+const S2 ba[5];
+class S3 {
+  int a;
+  S3 &operator=(const S3 &s3); // expected-note 2 {{implicitly declared private here}}
+
+public:
+  S3() : a(0) {}
+  S3(S3 &s3) : a(s3.a) {}
+};
+const S3 c;         // expected-note {{global variable is predetermined as shared}}
+const S3 ca[5];     // expected-note {{global variable is predetermined as shared}}
+extern const int f; // expected-note {{global variable is predetermined as shared}}
+class S4 {
+  int a;
+  S4();             // expected-note 3 {{implicitly declared private here}}
+  S4(const S4 &s4);
+
+public:
+  S4(int v) : a(v) {}
+};
+class S5 {
+  int a;
+  S5() : a(0) {} // expected-note {{implicitly declared private here}}
+
+public:
+  S5(const S5 &s5) : a(s5.a) {}
+  S5(int v) : a(v) {}
+};
+class S6 {
+  int a;
+  S6() : a(0) {}
+
+public:
+  S6(const S6 &s6) : a(s6.a) {}
+  S6(int v) : a(v) {}
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class I, class C>
+int foomain(int argc, char **argv) {
+  I e(4);
+  I g(5);
+  int i;
+  int &j = i;
+#pragma omp target parallel for lastprivate // expected-error {{expected '(' after 'lastprivate'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for lastprivate( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for lastprivate() // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for lastprivate(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for lastprivate(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for lastprivate(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for lastprivate(argc)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for lastprivate(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for lastprivate(a, b) // expected-error {{lastprivate variable with incomplete type 'S1'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for lastprivate(argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for lastprivate(e, g) // expected-error 2 {{calling a private constructor of class 'S4'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for lastprivate(h) // expected-error {{threadprivate or thread local variable cannot be lastprivate}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp parallel
+  {
+    int v = 0;
+    int i;
+#pragma omp target parallel for lastprivate(i)
+    for (int k = 0; k < argc; ++k) {
+      i = k;
+      v += i;
+    }
+  }
+#pragma omp parallel shared(i)
+#pragma omp parallel private(i)
+#pragma omp target parallel for lastprivate(j)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for lastprivate(i)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+  return 0;
+}
+
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
+int main(int argc, char **argv) {
+  const int d = 5;       // expected-note {{constant variable is predetermined as shared}}
+  const int da[5] = {0}; // expected-note {{constant variable is predetermined as shared}}
+  S4 e(4);
+  S5 g(5);
+  S3 m;
+  S6 n(2);
+  int i;
+  int &j = i;
+#pragma omp target parallel for lastprivate // expected-error {{expected '(' after 'lastprivate'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for lastprivate( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for lastprivate() // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for lastprivate(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for lastprivate(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for lastprivate(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for lastprivate(argc)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for lastprivate(S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for lastprivate(a, b, c, d, f) // expected-error {{lastprivate variable with incomplete type 'S1'}} expected-error 3 {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for lastprivate(argv[1]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for lastprivate(2 * 2) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for lastprivate(ba)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for lastprivate(ca) // expected-error {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for lastprivate(da) // expected-error {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+  int xa;
+#pragma omp target parallel for lastprivate(xa) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for lastprivate(S2::S2s) // expected-error {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for lastprivate(S2::S2sc) // expected-error {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for safelen(5) // expected-error {{unexpected OpenMP clause 'safelen' in directive '#pragma omp target parallel for'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for lastprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for lastprivate(m) // expected-error {{'operator=' is a private member of 'S3'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for lastprivate(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for private(xa), lastprivate(xa) // expected-error {{private variable cannot be lastprivate}} expected-note {{defined as private}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for lastprivate(i)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp parallel private(xa)
+#pragma omp target parallel for lastprivate(xa)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp parallel reduction(+ : xa)
+#pragma omp target parallel for lastprivate(xa)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for lastprivate(j)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for firstprivate(m) lastprivate(m) // expected-error {{'operator=' is a private member of 'S3'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for lastprivate(n) firstprivate(n) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+  static int si;
+#pragma omp target parallel for lastprivate(si) // OK
+  for (i = 0; i < argc; ++i)
+    si = i + 2;
+
+  return foomain<S4, S5>(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<S4, S5>' requested here}}
+}
diff --git a/test/OpenMP/target_parallel_for_linear_messages.cpp b/test/OpenMP/target_parallel_for_linear_messages.cpp
new file mode 100644
index 0000000000000..36e897d5946e6
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_linear_messages.cpp
@@ -0,0 +1,269 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+namespace X {
+int x;
+};
+
+struct B {
+  static int ib; // expected-note {{'B::ib' declared here}}
+  static int bfoo() { return 8; }
+};
+
+int bfoo() { return 4; }
+
+int z;
+const int C1 = 1;
+const int C2 = 2;
+void test_linear_colons() {
+  int B = 0;
+#pragma omp target parallel for linear(B : bfoo())
+  for (int i = 0; i < 10; ++i)
+    ;
+// expected-error@+1 {{unexpected ':' in nested name specifier; did you mean '::'}}
+#pragma omp target parallel for linear(B::ib : B : bfoo())
+  for (int i = 0; i < 10; ++i)
+    ;
+// expected-error@+1 {{use of undeclared identifier 'ib'; did you mean 'B::ib'}}
+#pragma omp target parallel for linear(B : ib)
+  for (int i = 0; i < 10; ++i)
+    ;
+// expected-error@+1 {{unexpected ':' in nested name specifier; did you mean '::'?}}
+#pragma omp target parallel for linear(z : B : ib)
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target parallel for linear(B : B::bfoo())
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target parallel for linear(X::x : ::z)
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target parallel for linear(B, ::z, X::x)
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target parallel for linear(::z)
+  for (int i = 0; i < 10; ++i)
+    ;
+// expected-error@+1 {{expected variable name}}
+#pragma omp target parallel for linear(B::bfoo())
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target parallel for linear(B::ib, B : C1 + C2)
+  for (int i = 0; i < 10; ++i)
+    ;
+}
+
+template <int L, class T, class N>
+T test_template(T *arr, N num) {
+  N i;
+  T sum = (T)0;
+  T ind2 = -num * L; // expected-note {{'ind2' defined here}}
+// expected-error@+1 {{argument of a linear clause should be of integral or pointer type}}
+#pragma omp target parallel for linear(ind2 : L)
+  for (i = 0; i < num; ++i) {
+    T cur = arr[(int)ind2];
+    ind2 += L;
+    sum += cur;
+  }
+  return T();
+}
+
+template <int LEN>
+int test_warn() {
+  int ind2 = 0;
+// expected-warning@+1 {{zero linear step (ind2 should probably be const)}}
+#pragma omp target parallel for linear(ind2 : LEN)
+  for (int i = 0; i < 100; i++) {
+    ind2 += LEN;
+  }
+  return ind2;
+}
+
+struct S1; // expected-note 2 {{declared here}} expected-note 2 {{forward declaration of 'S1'}}
+extern S1 a;
+class S2 {
+  mutable int a;
+
+public:
+  S2() : a(0) {}
+};
+const S2 b; // expected-note 2 {{'b' defined here}}
+const S2 ba[5];
+class S3 {
+  int a;
+
+public:
+  S3() : a(0) {}
+};
+const S3 ca[5];
+class S4 {
+  int a;
+  S4();
+
+public:
+  S4(int v) : a(v) {}
+};
+class S5 {
+  int a;
+  S5() : a(0) {}
+
+public:
+  S5(int v) : a(v) {}
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class I, class C>
+int foomain(I argc, C **argv) {
+  I e(4);
+  I g(5);
+  int i;
+  int &j = i;
+#pragma omp target parallel for linear // expected-error {{expected '(' after 'linear'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for linear( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for linear() // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for linear(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for linear(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for linear(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for linear(argc : 5)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for linear(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+// expected-error@+2 {{linear variable with incomplete type 'S1'}}
+// expected-error@+1 {{const-qualified variable cannot be linear}}
+#pragma omp target parallel for linear(a, b : B::ib)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for linear(argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for linear(e, g)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for linear(h) // expected-error {{threadprivate or thread local variable cannot be linear}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for linear(i)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp parallel
+  {
+    int v = 0;
+    int i;
+#pragma omp target parallel for linear(v : i)
+    for (int k = 0; k < argc; ++k) {
+      i = k;
+      v += i;
+    }
+  }
+#pragma omp target parallel for linear(j)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+  int v = 0;
+#pragma omp target parallel for linear(v : j)
+  for (int k = 0; k < argc; ++k) {
+    ++k;
+    v += j;
+  }
+#pragma omp target parallel for linear(i)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+  return 0;
+}
+
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace C {
+using A::x;
+}
+
+int main(int argc, char **argv) {
+  double darr[100];
+  // expected-note@+1 {{in instantiation of function template specialization 'test_template<-4, double, int>' requested here}}
+  test_template<-4>(darr, 4);
+  // expected-note@+1 {{in instantiation of function template specialization 'test_warn<0>' requested here}}
+  test_warn<0>();
+
+  S4 e(4); // expected-note {{'e' defined here}}
+  S5 g(5); // expected-note {{'g' defined here}}
+  int i;
+  int &j = i;
+#pragma omp target parallel for linear // expected-error {{expected '(' after 'linear'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for linear( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for linear() // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for linear(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for linear(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for linear(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for linear(argc)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for linear(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+// expected-error@+2 {{linear variable with incomplete type 'S1'}}
+// expected-error@+1 {{const-qualified variable cannot be linear}}
+#pragma omp target parallel for linear(a, b)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for linear(argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+// expected-error@+2 {{argument of a linear clause should be of integral or pointer type, not 'S4'}}
+// expected-error@+1 {{argument of a linear clause should be of integral or pointer type, not 'S5'}}
+#pragma omp target parallel for linear(e, g)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for linear(h, C::x) // expected-error 2 {{threadprivate or thread local variable cannot be linear}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp parallel
+  {
+    int i;
+#pragma omp target parallel for linear(i)
+    for (int k = 0; k < argc; ++k)
+      ++k;
+#pragma omp target parallel for linear(i : 4)
+    for (int k = 0; k < argc; ++k) {
+      ++k;
+      i += 4;
+    }
+  }
+#pragma omp target parallel for linear(j)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for linear(i)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+
+  foomain<int, char>(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<int, char>' requested here}}
+  return 0;
+}
+
diff --git a/test/OpenMP/target_parallel_for_loop_messages.cpp b/test/OpenMP/target_parallel_for_loop_messages.cpp
new file mode 100644
index 0000000000000..0e8eab10b5cb5
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_loop_messages.cpp
@@ -0,0 +1,627 @@
+// RUN: %clang_cc1 -fsyntax-only -fopenmp -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify %s
+
+class S {
+  int a;
+  S() : a(0) {}
+
+public:
+  S(int v) : a(v) {}
+  S(const S &s) : a(s.a) {}
+};
+
+static int sii;
+// expected-note@+1 {{defined as threadprivate or thread local}}
+#pragma omp threadprivate(sii)
+static int globalii;
+
+int test_iteration_spaces() {
+  const int N = 100;
+  float a[N], b[N], c[N];
+  int ii, jj, kk;
+  float fii;
+  double dii;
+#pragma omp target parallel for
+  for (int i = 0; i < 10; i += 1) {
+    c[i] = a[i] + b[i];
+  }
+#pragma omp target parallel for
+  for (char i = 0; i < 10; i++) {
+    c[i] = a[i] + b[i];
+  }
+#pragma omp target parallel for
+  for (char i = 0; i < 10; i += '\1') {
+    c[i] = a[i] + b[i];
+  }
+#pragma omp target parallel for
+  for (long long i = 0; i < 10; i++) {
+    c[i] = a[i] + b[i];
+  }
+// expected-error@+2 {{expression must have integral or unscoped enumeration type, not 'double'}}
+#pragma omp target parallel for
+  for (long long i = 0; i < 10; i += 1.5) {
+    c[i] = a[i] + b[i];
+  }
+#pragma omp target parallel for
+  for (long long i = 0; i < 'z'; i += 1u) {
+    c[i] = a[i] + b[i];
+  }
+// expected-error@+2 {{variable must be of integer or random access iterator type}}
+#pragma omp target parallel for
+  for (float fi = 0; fi < 10.0; fi++) {
+    c[(int)fi] = a[(int)fi] + b[(int)fi];
+  }
+// expected-error@+2 {{variable must be of integer or random access iterator type}}
+#pragma omp target parallel for
+  for (double fi = 0; fi < 10.0; fi++) {
+    c[(int)fi] = a[(int)fi] + b[(int)fi];
+  }
+// expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for
+  for (int &ref = ii; ref < 10; ref++) {
+  }
+// expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for
+  for (int i; i < 10; i++)
+    c[i] = a[i];
+
+// expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for
+  for (int i = 0, j = 0; i < 10; ++i)
+    c[i] = a[i];
+
+// expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for
+  for (; ii < 10; ++ii)
+    c[ii] = a[ii];
+
+// expected-warning@+3 {{expression result unused}}
+// expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for
+  for (ii + 1; ii < 10; ++ii)
+    c[ii] = a[ii];
+
+// expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for
+  for (c[ii] = 0; ii < 10; ++ii)
+    c[ii] = a[ii];
+
+// Ok to skip parenthesises.
+#pragma omp target parallel for
+  for (((ii)) = 0; ii < 10; ++ii)
+    c[ii] = a[ii];
+
+// expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'i'}}
+#pragma omp target parallel for
+  for (int i = 0; i; i++)
+    c[i] = a[i];
+
+// expected-error@+3 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'i'}}
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'i'}}
+#pragma omp target parallel for
+  for (int i = 0; jj < kk; ii++)
+    c[i] = a[i];
+
+// expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'i'}}
+#pragma omp target parallel for
+  for (int i = 0; !!i; i++)
+    c[i] = a[i];
+
+// expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'i'}}
+#pragma omp target parallel for
+  for (int i = 0; i != 1; i++)
+    c[i] = a[i];
+
+// expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'i'}}
+#pragma omp target parallel for
+  for (int i = 0;; i++)
+    c[i] = a[i];
+
+// Ok.
+#pragma omp target parallel for
+  for (int i = 11; i > 10; i--)
+    c[i] = a[i];
+
+// Ok.
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i)
+    c[i] = a[i];
+
+// Ok.
+#pragma omp target parallel for
+  for (ii = 0; ii < 10; ++ii)
+    c[ii] = a[ii];
+
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+#pragma omp target parallel for
+  for (ii = 0; ii < 10; ++jj)
+    c[ii] = a[jj];
+
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+#pragma omp target parallel for
+  for (ii = 0; ii < 10; ++++ii)
+    c[ii] = a[ii];
+
+// Ok but undefined behavior (in general, cannot check that incr
+// is really loop-invariant).
+#pragma omp target parallel for
+  for (ii = 0; ii < 10; ii = ii + ii)
+    c[ii] = a[ii];
+
+// expected-error@+2 {{expression must have integral or unscoped enumeration type, not 'float'}}
+#pragma omp target parallel for
+  for (ii = 0; ii < 10; ii = ii + 1.0f)
+    c[ii] = a[ii];
+
+// Ok - step was converted to integer type.
+#pragma omp target parallel for
+  for (ii = 0; ii < 10; ii = ii + (int)1.1f)
+    c[ii] = a[ii];
+
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+#pragma omp target parallel for
+  for (ii = 0; ii < 10; jj = ii + 2)
+    c[ii] = a[ii];
+
+// expected-warning@+3 {{relational comparison result unused}}
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+#pragma omp target parallel for
+  for (ii = 0; ii<10; jj> kk + 2)
+    c[ii] = a[ii];
+
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+#pragma omp target parallel for
+  for (ii = 0; ii < 10;)
+    c[ii] = a[ii];
+
+// expected-warning@+3 {{expression result unused}}
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+#pragma omp target parallel for
+  for (ii = 0; ii < 10; !ii)
+    c[ii] = a[ii];
+
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+#pragma omp target parallel for
+  for (ii = 0; ii < 10; ii ? ++ii : ++jj)
+    c[ii] = a[ii];
+
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+#pragma omp target parallel for
+  for (ii = 0; ii < 10; ii = ii < 10)
+    c[ii] = a[ii];
+
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+#pragma omp target parallel for
+  for (ii = 0; ii < 10; ii = ii + 0)
+    c[ii] = a[ii];
+
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+#pragma omp target parallel for
+  for (ii = 0; ii < 10; ii = ii + (int)(0.8 - 0.45))
+    c[ii] = a[ii];
+
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+#pragma omp target parallel for
+  for (ii = 0; (ii) < 10; ii -= 25)
+    c[ii] = a[ii];
+
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+#pragma omp target parallel for
+  for (ii = 0; (ii < 10); ii -= 0)
+    c[ii] = a[ii];
+
+// expected-note@+3 {{loop step is expected to be negative due to this condition}}
+// expected-error@+2 {{increment expression must cause 'ii' to decrease on each iteration of OpenMP for loop}}
+#pragma omp target parallel for
+  for (ii = 0; ii > 10; (ii += 0))
+    c[ii] = a[ii];
+
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+#pragma omp target parallel for
+  for (ii = 0; ii < 10; (ii) = (1 - 1) + (ii))
+    c[ii] = a[ii];
+
+// expected-note@+3 {{loop step is expected to be negative due to this condition}}
+// expected-error@+2 {{increment expression must cause 'ii' to decrease on each iteration of OpenMP for loop}}
+#pragma omp target parallel for
+  for ((ii = 0); ii > 10; (ii -= 0))
+    c[ii] = a[ii];
+
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+#pragma omp target parallel for
+  for (ii = 0; (ii < 10); (ii -= 0))
+    c[ii] = a[ii];
+
+// expected-note@+2  {{defined as firstprivate}}
+// expected-error@+2 {{loop iteration variable in the associated loop of 'omp target parallel for' directive may not be firstprivate, predetermined as private}}
+#pragma omp target parallel for firstprivate(ii)
+  for (ii = 0; ii < 10; ii++)
+    c[ii] = a[ii];
+
+// expected-note@+2  {{defined as linear}}
+// expected-error@+2 {{loop iteration variable in the associated loop of 'omp target parallel for' directive may not be linear, predetermined as private}}
+#pragma omp target parallel for linear(ii)
+  for (ii = 0; ii < 10; ii++)
+    c[ii] = a[ii];
+
+#pragma omp target parallel for private(ii)
+  for (ii = 0; ii < 10; ii++)
+    c[ii] = a[ii];
+
+#pragma omp target parallel for lastprivate(ii)
+  for (ii = 0; ii < 10; ii++)
+    c[ii] = a[ii];
+
+  {
+// expected-error@+2 {{loop iteration variable in the associated loop of 'omp target parallel for' directive may not be threadprivate or thread local, predetermined as private}}
+#pragma omp target parallel for
+    for (sii = 0; sii < 10; sii += 1)
+      c[sii] = a[sii];
+  }
+
+  {
+#pragma omp target parallel for
+    for (globalii = 0; globalii < 10; globalii += 1)
+      c[globalii] = a[globalii];
+  }
+
+  {
+#pragma omp target parallel for collapse(2)
+    for (ii = 0; ii < 10; ii += 1)
+    for (globalii = 0; globalii < 10; globalii += 1)
+      c[globalii] += a[globalii] + ii;
+  }
+
+// expected-error@+2 {{statement after '#pragma omp target parallel for' must be a for loop}}
+#pragma omp target parallel for
+  for (auto &item : a) {
+    item = item + 1;
+  }
+
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'i' to increase on each iteration of OpenMP for loop}}
+#pragma omp target parallel for
+  for (unsigned i = 9; i < 10; i--) {
+    c[i] = a[i] + b[i];
+  }
+
+  int(*lb)[4] = nullptr;
+#pragma omp target parallel for
+  for (int(*p)[4] = lb; p < lb + 8; ++p) {
+  }
+
+// expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for
+  for (int a{0}; a < 10; ++a) {
+  }
+
+  return 0;
+}
+
+// Iterators allowed in openmp for-loops.
+namespace std {
+struct random_access_iterator_tag {};
+template <class Iter>
+struct iterator_traits {
+  typedef typename Iter::difference_type difference_type;
+  typedef typename Iter::iterator_category iterator_category;
+};
+template <class Iter>
+typename iterator_traits<Iter>::difference_type
+distance(Iter first, Iter last) { return first - last; }
+}
+class Iter0 {
+public:
+  Iter0() {}
+  Iter0(const Iter0 &) {}
+  Iter0 operator++() { return *this; }
+  Iter0 operator--() { return *this; }
+  bool operator<(Iter0 a) { return true; }
+};
+// expected-note@+2 {{candidate function not viable: no known conversion from 'GoodIter' to 'Iter0' for 1st argument}}
+// expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'Iter0' for 1st argument}}
+int operator-(Iter0 a, Iter0 b) { return 0; }
+class Iter1 {
+public:
+  Iter1(float f = 0.0f, double d = 0.0) {}
+  Iter1(const Iter1 &) {}
+  Iter1 operator++() { return *this; }
+  Iter1 operator--() { return *this; }
+  bool operator<(Iter1 a) { return true; }
+  bool operator>=(Iter1 a) { return false; }
+};
+class GoodIter {
+public:
+  GoodIter() {}
+  GoodIter(const GoodIter &) {}
+  GoodIter(int fst, int snd) {}
+  GoodIter &operator=(const GoodIter &that) { return *this; }
+  GoodIter &operator=(const Iter0 &that) { return *this; }
+  GoodIter &operator+=(int x) { return *this; }
+  GoodIter &operator-=(int x) { return *this; }
+  explicit GoodIter(void *) {}
+  GoodIter operator++() { return *this; }
+  GoodIter operator--() { return *this; }
+  bool operator!() { return true; }
+  bool operator<(GoodIter a) { return true; }
+  bool operator<=(GoodIter a) { return true; }
+  bool operator>=(GoodIter a) { return false; }
+  typedef int difference_type;
+  typedef std::random_access_iterator_tag iterator_category;
+};
+// expected-note@+2 {{candidate function not viable: no known conversion from 'const Iter0' to 'GoodIter' for 2nd argument}}
+// expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'GoodIter' for 1st argument}}
+int operator-(GoodIter a, GoodIter b) { return 0; }
+// expected-note@+1 3 {{candidate function not viable: requires single argument 'a', but 2 arguments were provided}}
+GoodIter operator-(GoodIter a) { return a; }
+// expected-note@+2 {{candidate function not viable: no known conversion from 'const Iter0' to 'int' for 2nd argument}}
+// expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'GoodIter' for 1st argument}}
+GoodIter operator-(GoodIter a, int v) { return GoodIter(); }
+// expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter0' to 'GoodIter' for 1st argument}}
+GoodIter operator+(GoodIter a, int v) { return GoodIter(); }
+// expected-note@+2 {{candidate function not viable: no known conversion from 'GoodIter' to 'int' for 1st argument}}
+// expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'int' for 1st argument}}
+GoodIter operator-(int v, GoodIter a) { return GoodIter(); }
+// expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter0' to 'int' for 1st argument}}
+GoodIter operator+(int v, GoodIter a) { return GoodIter(); }
+
+int test_with_random_access_iterator() {
+  GoodIter begin, end;
+  Iter0 begin0, end0;
+#pragma omp target parallel for
+  for (GoodIter I = begin; I < end; ++I)
+    ++I;
+// expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for
+  for (GoodIter &I = begin; I < end; ++I)
+    ++I;
+#pragma omp target parallel for
+  for (GoodIter I = begin; I >= end; --I)
+    ++I;
+// expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for
+  for (GoodIter I(begin); I < end; ++I)
+    ++I;
+// expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for
+  for (GoodIter I(nullptr); I < end; ++I)
+    ++I;
+// expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for
+  for (GoodIter I(0); I < end; ++I)
+    ++I;
+// expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for
+  for (GoodIter I(1, 2); I < end; ++I)
+    ++I;
+#pragma omp target parallel for
+  for (begin = GoodIter(0); begin < end; ++begin)
+    ++begin;
+// expected-error@+3 {{invalid operands to binary expression ('GoodIter' and 'const Iter0')}}
+// expected-error@+2 {{could not calculate number of iterations calling 'operator-' with upper and lower loop bounds}}
+#pragma omp target parallel for
+  for (begin = begin0; begin < end; ++begin)
+    ++begin;
+// expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for
+  for (++begin; begin < end; ++begin)
+    ++begin;
+#pragma omp target parallel for
+  for (begin = end; begin < end; ++begin)
+    ++begin;
+// expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'I'}}
+#pragma omp target parallel for
+  for (GoodIter I = begin; I - I; ++I)
+    ++I;
+// expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'I'}}
+#pragma omp target parallel for
+  for (GoodIter I = begin; begin < end; ++I)
+    ++I;
+// expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'I'}}
+#pragma omp target parallel for
+  for (GoodIter I = begin; !I; ++I)
+    ++I;
+// expected-note@+3 {{loop step is expected to be negative due to this condition}}
+// expected-error@+2 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+#pragma omp target parallel for
+  for (GoodIter I = begin; I >= end; I = I + 1)
+    ++I;
+#pragma omp target parallel for
+  for (GoodIter I = begin; I >= end; I = I - 1)
+    ++I;
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'I'}}
+#pragma omp target parallel for
+  for (GoodIter I = begin; I >= end; I = -I)
+    ++I;
+// expected-note@+3 {{loop step is expected to be negative due to this condition}}
+// expected-error@+2 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+#pragma omp target parallel for
+  for (GoodIter I = begin; I >= end; I = 2 + I)
+    ++I;
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'I'}}
+#pragma omp target parallel for
+  for (GoodIter I = begin; I >= end; I = 2 - I)
+    ++I;
+// expected-error@+2 {{invalid operands to binary expression ('Iter0' and 'int')}}
+#pragma omp target parallel for
+  for (Iter0 I = begin0; I < end0; ++I)
+    ++I;
+// Initializer is constructor without params.
+// expected-error@+3 {{invalid operands to binary expression ('Iter0' and 'int')}}
+// expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for
+  for (Iter0 I; I < end0; ++I)
+    ++I;
+  Iter1 begin1, end1;
+// expected-error@+3 {{invalid operands to binary expression ('Iter1' and 'Iter1')}}
+// expected-error@+2 {{could not calculate number of iterations calling 'operator-' with upper and lower loop bounds}}
+#pragma omp target parallel for
+  for (Iter1 I = begin1; I < end1; ++I)
+    ++I;
+// expected-note@+3 {{loop step is expected to be negative due to this condition}}
+// expected-error@+2 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+#pragma omp target parallel for
+  for (Iter1 I = begin1; I >= end1; ++I)
+    ++I;
+// expected-error@+5 {{invalid operands to binary expression ('Iter1' and 'float')}}
+// expected-error@+4 {{could not calculate number of iterations calling 'operator-' with upper and lower loop bounds}}
+// Initializer is constructor with all default params.
+// expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for
+  for (Iter1 I; I < end1; ++I) {
+  }
+  return 0;
+}
+
+template <typename IT, int ST>
+class TC {
+public:
+  int dotest_lt(IT begin, IT end) {
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'I' to increase on each iteration of OpenMP for loop}}
+#pragma omp target parallel for
+    for (IT I = begin; I < end; I = I + ST) {
+      ++I;
+    }
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'I' to increase on each iteration of OpenMP for loop}}
+#pragma omp target parallel for
+    for (IT I = begin; I <= end; I += ST) {
+      ++I;
+    }
+#pragma omp target parallel for
+    for (IT I = begin; I < end; ++I) {
+      ++I;
+    }
+  }
+
+  static IT step() {
+    return IT(ST);
+  }
+};
+template <typename IT, int ST = 0>
+int dotest_gt(IT begin, IT end) {
+// expected-note@+3 2 {{loop step is expected to be negative due to this condition}}
+// expected-error@+2 2 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+#pragma omp target parallel for
+  for (IT I = begin; I >= end; I = I + ST) {
+    ++I;
+  }
+// expected-note@+3 2 {{loop step is expected to be negative due to this condition}}
+// expected-error@+2 2 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+#pragma omp target parallel for
+  for (IT I = begin; I >= end; I += ST) {
+    ++I;
+  }
+
+// expected-note@+3 {{loop step is expected to be negative due to this condition}}
+// expected-error@+2 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+#pragma omp target parallel for
+  for (IT I = begin; I >= end; ++I) {
+    ++I;
+  }
+
+#pragma omp target parallel for
+  for (IT I = begin; I < end; I += TC<int, ST>::step()) {
+    ++I;
+  }
+}
+
+void test_with_template() {
+  GoodIter begin, end;
+  TC<GoodIter, 100> t1;
+  TC<GoodIter, -100> t2;
+  t1.dotest_lt(begin, end);
+  t2.dotest_lt(begin, end);         // expected-note {{in instantiation of member function 'TC<GoodIter, -100>::dotest_lt' requested here}}
+  dotest_gt(begin, end);            // expected-note {{in instantiation of function template specialization 'dotest_gt<GoodIter, 0>' requested here}}
+  dotest_gt<unsigned, -10>(0, 100); // expected-note {{in instantiation of function template specialization 'dotest_gt<unsigned int, -10>' requested here}}
+}
+
+void test_loop_break() {
+  const int N = 100;
+  float a[N], b[N], c[N];
+#pragma omp target parallel for
+  for (int i = 0; i < 10; i++) {
+    c[i] = a[i] + b[i];
+    for (int j = 0; j < 10; ++j) {
+      if (a[i] > b[j])
+        break; // OK in nested loop
+    }
+    switch (i) {
+    case 1:
+      b[i]++;
+      break;
+    default:
+      break;
+    }
+    if (c[i] > 10)
+      break; // expected-error {{'break' statement cannot be used in OpenMP for loop}}
+
+    if (c[i] > 11)
+      break; // expected-error {{'break' statement cannot be used in OpenMP for loop}}
+  }
+
+#pragma omp target parallel for
+  for (int i = 0; i < 10; i++) {
+    for (int j = 0; j < 10; j++) {
+      c[i] = a[i] + b[i];
+      if (c[i] > 10) {
+        if (c[i] < 20) {
+          break; // OK
+        }
+      }
+    }
+  }
+}
+
+void test_loop_eh() {
+  const int N = 100;
+  float a[N], b[N], c[N];
+#pragma omp target parallel for
+  for (int i = 0; i < 10; i++) {
+    c[i] = a[i] + b[i];
+    try {
+      for (int j = 0; j < 10; ++j) {
+        if (a[i] > b[j])
+          throw a[i];
+      }
+      throw a[i];
+    } catch (float f) {
+      if (f > 0.1)
+        throw a[i];
+      return; // expected-error {{cannot return from OpenMP region}}
+    }
+    switch (i) {
+    case 1:
+      b[i]++;
+      break;
+    default:
+      break;
+    }
+    for (int j = 0; j < 10; j++) {
+      if (c[i] > 10)
+        throw c[i];
+    }
+  }
+  if (c[9] > 10)
+    throw c[9]; // OK
+
+#pragma omp target parallel for
+  for (int i = 0; i < 10; ++i) {
+    struct S {
+      void g() { throw 0; }
+    };
+  }
+}
+
+void test_loop_firstprivate_lastprivate() {
+  S s(4);
+#pragma omp target parallel for lastprivate(s) firstprivate(s)
+  for (int i = 0; i < 16; ++i)
+    ;
+}
diff --git a/test/OpenMP/target_parallel_for_map_messages.cpp b/test/OpenMP/target_parallel_for_map_messages.cpp
new file mode 100644
index 0000000000000..5223a2cc78ea8
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_map_messages.cpp
@@ -0,0 +1,273 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note 2 {{declared here}}
+extern S1 a;
+class S2 {
+  mutable int a;
+public:
+  S2():a(0) { }
+  S2(S2 &s2):a(s2.a) { }
+  static float S2s; // expected-note 4 {{mappable type cannot contain static members}}
+  static const float S2sc; // expected-note 4 {{mappable type cannot contain static members}}
+};
+const float S2::S2sc = 0;
+const S2 b;
+const S2 ba[5];
+class S3 {
+  int a;
+public:
+  S3():a(0) { }
+  S3(S3 &s3):a(s3.a) { }
+};
+const S3 c;
+const S3 ca[5];
+extern const int f;
+class S4 {
+  int a;
+  S4();
+  S4(const S4 &s4);
+public:
+  S4(int v):a(v) { }
+};
+class S5 {
+  int a;
+  S5():a(0) {}
+  S5(const S5 &s5):a(s5.a) { }
+public:
+  S5(int v):a(v) { }
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+typedef int from;
+
+template <typename T, int I> // expected-note {{declared here}}
+T tmain(T argc) {
+  const T d = 5;
+  const T da[5] = { 0 };
+  S4 e(4);
+  S5 g(5);
+  T i, t[20];
+  T &j = i;
+  T *k = &j;
+  T x;
+  T y;
+  T to, tofrom, always;
+  const T (&l)[5] = da;
+
+
+#pragma omp target parallel for map // expected-error {{expected '(' after 'map'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map() // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(alloc) // expected-error {{use of undeclared identifier 'alloc'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(to argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected ',' or ')' in 'map' clause}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(to:) // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(from: argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(x: y) // expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(x)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(tofrom: t[:I])
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(T: a) // expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}} expected-error {{incomplete type 'S1' where a complete type is required}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(T) // expected-error {{'T' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(I) // expected-error 2 {{expected expression containing only member accesses and/or array sections based on named variables}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(S2::S2s)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(S2::S2sc)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(x)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(to: x)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(to: to)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(to)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(to, x)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(to x) // expected-error {{expected ',' or ')' in 'map' clause}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(tofrom: argc > 0 ? x : y) // expected-error 2 {{expected expression containing only member accesses and/or array sections based on named variables}} 
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(argc)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(a, b, c, d, f) // expected-error {{incomplete type 'S1' where a complete type is required}} expected-error 2 {{type 'S2' is not mappable to target}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(ba) // expected-error 2 {{type 'S2' is not mappable to target}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(ca)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(da)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(S2::S2s)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(S2::S2sc)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(e, g)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(h) // expected-error {{threadprivate variables are not allowed in 'map' clause}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(k), map(k) // expected-error 2 {{variable already marked as mapped in current construct}} expected-note 2 {{used here}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(k), map(k[:5]) // expected-error 2 {{pointer cannot be mapped along with a section derived from itself}} expected-note 2 {{used here}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(da)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(da[:4])
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target data map(k, j, l) // expected-note 2 {{used here}}
+#pragma omp target parallel for map(k[:4]) // expected-error 2 {{pointer cannot be mapped along with a section derived from itself}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(j)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(l) map(l[:5]) // expected-error 2 {{variable already marked as mapped in current construct}} expected-note 2 {{used here}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target data map(k[:4], j, l[:5]) // expected-note 4 {{used here}}
+{
+#pragma omp target parallel for map(k) // expected-error 2 {{pointer cannot be mapped along with a section derived from itself}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(j)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(l) // expected-error 2 {{original storage of expression in data environment is shared but data environment do not fully contain mapped expression storage}}
+  for (i = 0; i < argc; ++i) foo();
+}
+
+#pragma omp target parallel for map(always, tofrom: x)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(always: x) // expected-error {{missing map type}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(tofrom, always: x) // expected-error {{incorrect map type modifier, expected 'always'}} expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(always, tofrom: always, tofrom, x)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(tofrom j) // expected-error {{expected ',' or ')' in 'map' clause}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return 0;
+}
+
+int main(int argc, char **argv) {
+  const int d = 5;
+  const int da[5] = { 0 };
+  S4 e(4);
+  S5 g(5);
+  int i;
+  int &j = i;
+  int *k = &j;
+  int x;
+  int y;
+  int to, tofrom, always;
+  const int (&l)[5] = da;
+
+#pragma omp target parallel for map // expected-error {{expected '(' after 'map'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map() // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(alloc) // expected-error {{use of undeclared identifier 'alloc'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(to argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected ',' or ')' in 'map' clause}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(to:) // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(from: argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(x: y) // expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(x)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(to: x)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(to: to)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(to)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(to, x)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(to x) // expected-error {{expected ',' or ')' in 'map' clause}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(tofrom: argc > 0 ? argv[1] : argv[2]) // expected-error {{expected expression containing only member accesses and/or array sections based on named variables}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(argc)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(a, b, c, d, f) // expected-error {{incomplete type 'S1' where a complete type is required}} expected-error 2 {{type 'S2' is not mappable to target}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(argv[1])
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(ba) // expected-error 2 {{type 'S2' is not mappable to target}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(ca)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(da)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(S2::S2s)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(S2::S2sc)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(e, g)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(h) // expected-error {{threadprivate variables are not allowed in 'map' clause}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(k), map(k) // expected-error {{variable already marked as mapped in current construct}} expected-note {{used here}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(k), map(k[:5]) // expected-error {{pointer cannot be mapped along with a section derived from itself}} expected-note {{used here}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(da)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(da[:4])
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target data map(k, j, l) // expected-note {{used here}}
+#pragma omp target parallel for map(k[:4]) // expected-error {{pointer cannot be mapped along with a section derived from itself}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(j)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(l) map(l[:5]) // expected-error 1 {{variable already marked as mapped in current construct}} expected-note 1 {{used here}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target data map(k[:4], j, l[:5]) // expected-note 2 {{used here}}
+{
+#pragma omp target parallel for map(k) // expected-error {{pointer cannot be mapped along with a section derived from itself}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(j)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(l) // expected-error {{original storage of expression in data environment is shared but data environment do not fully contain mapped expression storage}}
+  for (i = 0; i < argc; ++i) foo();
+}
+
+#pragma omp target parallel for map(always, tofrom: x)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(always: x) // expected-error {{missing map type}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(tofrom, always: x) // expected-error {{incorrect map type modifier, expected 'always'}} expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(always, tofrom: always, tofrom, x)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for map(tofrom j) // expected-error {{expected ',' or ')' in 'map' clause}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return tmain<int, 3>(argc)+tmain<from, 4>(argc); // expected-note {{in instantiation of function template specialization 'tmain<int, 3>' requested here}} expected-note {{in instantiation of function template specialization 'tmain<int, 4>' requested here}}
+}
+
diff --git a/test/OpenMP/target_parallel_for_messages.cpp b/test/OpenMP/target_parallel_for_messages.cpp
new file mode 100644
index 0000000000000..173025c01ce76
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_messages.cpp
@@ -0,0 +1,92 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -std=c++11 -o - %s
+
+void foo() {
+}
+
+static int pvt;
+#pragma omp threadprivate(pvt)
+
+#pragma omp target parallel for // expected-error {{unexpected OpenMP directive '#pragma omp target parallel for'}}
+
+int main(int argc, char **argv) {
+#pragma omp target parallel for { // expected-warning {{extra tokens at the end of '#pragma omp target parallel for' are ignored}}
+  for (int i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for ( // expected-warning {{extra tokens at the end of '#pragma omp target parallel for' are ignored}}
+  for (int i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for[ // expected-warning {{extra tokens at the end of '#pragma omp target parallel for' are ignored}}
+  for (int i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for] // expected-warning {{extra tokens at the end of '#pragma omp target parallel for' are ignored}}
+  for (int i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for' are ignored}}
+  for (int i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for } // expected-warning {{extra tokens at the end of '#pragma omp target parallel for' are ignored}}
+  for (int i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for
+  for (int i = 0; i < argc; ++i)
+    foo();
+// expected-warning@+1 {{extra tokens at the end of '#pragma omp target parallel for' are ignored}}
+#pragma omp target parallel for unknown()
+  for (int i = 0; i < argc; ++i)
+    foo();
+L1:
+  for (int i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for
+  for (int i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for
+  for (int i = 0; i < argc; ++i) {
+    goto L1; // expected-error {{use of undeclared label 'L1'}}
+    argc++;
+  }
+
+  for (int i = 0; i < 10; ++i) {
+    switch (argc) {
+    case (0):
+#pragma omp target parallel for
+      for (int i = 0; i < argc; ++i) {
+        foo();
+        break; // expected-error {{'break' statement cannot be used in OpenMP for loop}}
+        continue;
+      }
+    default:
+      break;
+    }
+  }
+#pragma omp target parallel for default(none)
+  for (int i = 0; i < 10; ++i)
+    ++argc; // expected-error {{variable 'argc' must have explicitly specified data sharing attributes}}
+
+  goto L2; // expected-error {{use of undeclared label 'L2'}}
+#pragma omp target parallel for
+  for (int i = 0; i < argc; ++i)
+  L2:
+  foo();
+#pragma omp target parallel for
+  for (int i = 0; i < argc; ++i) {
+    return 1; // expected-error {{cannot return from OpenMP region}}
+  }
+
+  [[]] // expected-error {{an attribute list cannot appear here}}
+#pragma omp target parallel for
+      for (int n = 0; n < 100; ++n) {
+  }
+
+#pragma omp target parallel for copyin(pvt) // expected-error {{unexpected OpenMP clause 'copyin' in directive '#pragma omp target parallel for'}}
+  for (int n = 0; n < 100; ++n) {}
+
+  return 0;
+}
+
+void test_ordered() {
+#pragma omp target parallel for ordered ordered // expected-error {{directive '#pragma omp target parallel for' cannot contain more than one 'ordered' clause}}
+  for (int i = 0; i < 16; ++i)
+    ;
+}
+
diff --git a/test/OpenMP/target_parallel_for_misc_messages.c b/test/OpenMP/target_parallel_for_misc_messages.c
new file mode 100644
index 0000000000000..cfe83f1301075
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_misc_messages.c
@@ -0,0 +1,314 @@
+// RUN: %clang_cc1 -fsyntax-only -fopenmp -verify %s
+
+// expected-error@+1 {{unexpected OpenMP directive '#pragma omp target parallel for'}}
+#pragma omp target parallel for
+
+// expected-error@+1 {{unexpected OpenMP directive '#pragma omp target parallel for'}}
+#pragma omp target parallel for foo
+
+void test_no_clause() {
+  int i;
+#pragma omp target parallel for
+  for (i = 0; i < 16; ++i)
+    ;
+
+// expected-error@+2 {{statement after '#pragma omp target parallel for' must be a for loop}}
+#pragma omp target parallel for
+  ++i;
+}
+
+void test_branch_protected_scope() {
+  int i = 0;
+L1:
+  ++i;
+
+  int x[24];
+
+#pragma omp target parallel for
+  for (i = 0; i < 16; ++i) {
+    if (i == 5)
+      goto L1; // expected-error {{use of undeclared label 'L1'}}
+    else if (i == 6)
+      return; // expected-error {{cannot return from OpenMP region}}
+    else if (i == 7)
+      goto L2;
+    else if (i == 8) {
+    L2:
+      x[i]++;
+    }
+  }
+
+  if (x[0] == 0)
+    goto L2; // expected-error {{use of undeclared label 'L2'}}
+  else if (x[1] == 1)
+    goto L1;
+}
+
+void test_invalid_clause() {
+  int i;
+// expected-warning@+1 {{extra tokens at the end of '#pragma omp target parallel for' are ignored}}
+#pragma omp target parallel for foo bar
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+void test_non_identifiers() {
+  int i, x;
+
+// expected-warning@+1 {{extra tokens at the end of '#pragma omp target parallel for' are ignored}}
+#pragma omp target parallel for;
+  for (i = 0; i < 16; ++i)
+    ;
+
+// expected-warning@+1 {{extra tokens at the end of '#pragma omp target parallel for' are ignored}}
+#pragma omp target parallel for private(x);
+  for (i = 0; i < 16; ++i)
+    ;
+
+// expected-warning@+1 {{extra tokens at the end of '#pragma omp target parallel for' are ignored}}
+#pragma omp target parallel for, private(x);
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+extern int foo();
+
+void test_collapse() {
+  int i;
+// expected-error@+1 {{expected '('}}
+#pragma omp target parallel for collapse
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp target parallel for collapse(
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}}
+#pragma omp target parallel for collapse()
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp target parallel for collapse(,
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}}  expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp target parallel for collapse(, )
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-warning@+2 {{extra tokens at the end of '#pragma omp target parallel for' are ignored}}
+// expected-error@+1 {{expected '('}}
+#pragma omp target parallel for collapse 4)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}} expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp target parallel for collapse(4
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp target parallel for', but found only 1}}
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}} expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp target parallel for collapse(4,
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp target parallel for', but found only 1}}
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}} expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp target parallel for collapse(4, )
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp target parallel for', but found only 1}}
+// expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp target parallel for collapse(4)
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp target parallel for', but found only 1}}
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}} expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp target parallel for collapse(4 4)
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp target parallel for', but found only 1}}
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}} expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp target parallel for collapse(4, , 4)
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp target parallel for', but found only 1}}
+#pragma omp target parallel for collapse(4)
+  for (int i1 = 0; i1 < 16; ++i1)
+    for (int i2 = 0; i2 < 16; ++i2)
+      for (int i3 = 0; i3 < 16; ++i3)
+        for (int i4 = 0; i4 < 16; ++i4)
+          foo();
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}} expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp target parallel for collapse(4, 8)
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp target parallel for', but found only 1}}
+// expected-error@+1 {{expression is not an integer constant expression}}
+#pragma omp target parallel for collapse(2.5)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expression is not an integer constant expression}}
+#pragma omp target parallel for collapse(foo())
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{argument to 'collapse' clause must be a strictly positive integer value}}
+#pragma omp target parallel for collapse(-5)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{argument to 'collapse' clause must be a strictly positive integer value}}
+#pragma omp target parallel for collapse(0)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{argument to 'collapse' clause must be a strictly positive integer value}}
+#pragma omp target parallel for collapse(5 - 5)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-note@+1 {{defined as firstprivate}}
+#pragma omp target parallel for collapse(2) firstprivate(i)
+  for (i = 0; i < 16; ++i)
+// expected-note@+1 {{variable with automatic storage duration is predetermined as private; perhaps you forget to enclose 'omp for' directive into a parallel or another task region?}}
+    for (int j = 0; j < 16; ++j)
+// expected-error@+2 2 {{reduction variable must be shared}}
+// expected-error@+1 {{region cannot be closely nested inside 'target parallel for' region; perhaps you forget to enclose 'omp for' directive into a parallel region?}}
+#pragma omp for reduction(+ : i, j)
+      for (int k = 0; k < 16; ++k)
+        i += j;
+}
+
+void test_private() {
+  int i;
+// expected-error@+2 {{expected expression}}
+// expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp target parallel for private(
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+2 {{expected ')'}} expected-note@+2 {{to match this '('}}
+// expected-error@+1 2 {{expected expression}}
+#pragma omp target parallel for private(,
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 2 {{expected expression}}
+#pragma omp target parallel for private(, )
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}}
+#pragma omp target parallel for private()
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}}
+#pragma omp target parallel for private(int)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected variable name}}
+#pragma omp target parallel for private(0)
+  for (i = 0; i < 16; ++i)
+    ;
+
+  int x, y, z;
+#pragma omp target parallel for private(x)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target parallel for private(x, y)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target parallel for private(x, y, z)
+  for (i = 0; i < 16; ++i) {
+    x = y * i + z;
+  }
+}
+
+void test_lastprivate() {
+  int i;
+// expected-error@+2 {{expected ')'}} expected-note@+2 {{to match this '('}}
+// expected-error@+1 {{expected expression}}
+#pragma omp target parallel for lastprivate(
+  for (i = 0; i < 16; ++i)
+    ;
+
+// expected-error@+2 {{expected ')'}} expected-note@+2 {{to match this '('}}
+// expected-error@+1 2 {{expected expression}}
+#pragma omp target parallel for lastprivate(,
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 2 {{expected expression}}
+#pragma omp target parallel for lastprivate(, )
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}}
+#pragma omp target parallel for lastprivate()
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}}
+#pragma omp target parallel for lastprivate(int)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected variable name}}
+#pragma omp target parallel for lastprivate(0)
+  for (i = 0; i < 16; ++i)
+    ;
+
+  int x, y, z;
+#pragma omp target parallel for lastprivate(x)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target parallel for lastprivate(x, y)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target parallel for lastprivate(x, y, z)
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+void test_firstprivate() {
+  int i;
+// expected-error@+2 {{expected ')'}} expected-note@+2 {{to match this '('}}
+// expected-error@+1 {{expected expression}}
+#pragma omp target parallel for firstprivate(
+  for (i = 0; i < 16; ++i)
+    ;
+
+// expected-error@+2 {{expected ')'}} expected-note@+2 {{to match this '('}}
+// expected-error@+1 2 {{expected expression}}
+#pragma omp target parallel for firstprivate(,
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 2 {{expected expression}}
+#pragma omp target parallel for firstprivate(, )
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}}
+#pragma omp target parallel for firstprivate()
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}}
+#pragma omp target parallel for firstprivate(int)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected variable name}}
+#pragma omp target parallel for firstprivate(0)
+  for (i = 0; i < 16; ++i)
+    ;
+
+  int x, y, z;
+#pragma omp target parallel for lastprivate(x) firstprivate(x)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target parallel for lastprivate(x, y) firstprivate(x, y)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target parallel for lastprivate(x, y, z) firstprivate(x, y, z)
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+void test_loop_messages() {
+  float a[100], b[100], c[100];
+// expected-error@+2 {{variable must be of integer or pointer type}}
+#pragma omp target parallel for
+  for (float fi = 0; fi < 10.0; fi++) {
+    c[(int)fi] = a[(int)fi] + b[(int)fi];
+  }
+// expected-error@+2 {{variable must be of integer or pointer type}}
+#pragma omp target parallel for
+  for (double fi = 0; fi < 10.0; fi++) {
+    c[(int)fi] = a[(int)fi] + b[(int)fi];
+  }
+}
+
diff --git a/test/OpenMP/target_parallel_for_nowait_messages.cpp b/test/OpenMP/target_parallel_for_nowait_messages.cpp
new file mode 100644
index 0000000000000..06d22969c4a4c
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_nowait_messages.cpp
@@ -0,0 +1,18 @@
+// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify -fopenmp -ferror-limit 100 -o - %s
+
+void foo() {
+}
+
+int main(int argc, char **argv) {
+  int i;
+  #pragma omp target parallel for nowait( // expected-warning {{extra tokens at the end of '#pragma omp target parallel for' are ignored}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for nowait (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for' are ignored}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for nowait device (-10u)
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for nowait (3.14) device (-10u) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for' are ignored}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return 0;
+}
diff --git a/test/OpenMP/target_parallel_for_num_threads_messages.cpp b/test/OpenMP/target_parallel_for_num_threads_messages.cpp
new file mode 100644
index 0000000000000..915cfb1dfc9e0
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_num_threads_messages.cpp
@@ -0,0 +1,65 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, typename S, int N> // expected-note {{declared here}}
+T tmain(T argc, S **argv) {
+  T i;
+  #pragma omp target parallel for num_threads // expected-error {{expected '(' after 'num_threads'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for num_threads ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for num_threads () // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for num_threads (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for num_threads (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for' are ignored}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for num_threads ((argc > 0) ? argv[1] : argv[2]) // expected-error 2 {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for num_threads (foobool(argc)), num_threads (true), num_threads (-5) // expected-error 2 {{directive '#pragma omp target parallel for' cannot contain more than one 'num_threads' clause}} expected-error {{argument to 'num_threads' clause must be a strictly positive integer value}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for num_threads (S) // expected-error {{'S' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for num_threads (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error 2 {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for num_threads (argc)
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for num_threads (N) // expected-error {{argument to 'num_threads' clause must be a strictly positive integer value}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return argc;
+}
+
+int main(int argc, char **argv) {
+  int i;
+  #pragma omp target parallel for num_threads // expected-error {{expected '(' after 'num_threads'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for num_threads ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for num_threads () // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for num_threads (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for num_threads (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for' are ignored}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for num_threads (argc > 0 ? argv[1] : argv[2]) // expected-error {{integral }}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for num_threads (foobool(argc)), num_threads (true), num_threads (-5) // expected-error 2 {{directive '#pragma omp target parallel for' cannot contain more than one 'num_threads' clause}} expected-error {{argument to 'num_threads' clause must be a strictly positive integer value}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for num_threads (S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for num_threads (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for num_threads (num_threads(tmain<int, char, -1>(argc, argv) // expected-error 2 {{expected ')'}} expected-note 2 {{to match this '('}} expected-note {{in instantiation of function template specialization 'tmain<int, char, -1>' requested here}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return tmain<int, char, 3>(argc, argv); // expected-note {{in instantiation of function template specialization 'tmain<int, char, 3>' requested here}}
+}
diff --git a/test/OpenMP/target_parallel_for_ordered_messages.cpp b/test/OpenMP/target_parallel_for_ordered_messages.cpp
new file mode 100644
index 0000000000000..36eb8371e0252
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_ordered_messages.cpp
@@ -0,0 +1,104 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, typename S, int N, int ST> // expected-note {{declared here}}
+T tmain(T argc, S **argv) {                   //expected-note 2 {{declared here}}
+#pragma omp target parallel for ordered
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - ST];
+#pragma omp target parallel for ordered( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - ST];
+#pragma omp target parallel for ordered() // expected-error {{expected expression}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - ST];
+// expected-error@+3 {{expected ')'}} expected-note@+3 {{to match this '('}}
+// expected-error@+2 2 {{expression is not an integral constant expression}}
+// expected-note@+1 2 {{read of non-const variable 'argc' is not allowed in a constant expression}}
+#pragma omp target parallel for ordered(argc
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - ST];
+// expected-error@+1 2 {{argument to 'ordered' clause must be a strictly positive integer value}}
+#pragma omp target parallel for ordered(ST // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - ST];
+#pragma omp target parallel for ordered(1)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for' are ignored}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - ST];
+#pragma omp target parallel for ordered((ST > 0) ? 1 + ST : 2) // expected-note 2 {{as specified in 'ordered' clause}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - ST]; // expected-error 2 {{expected 2 for loops after '#pragma omp target parallel for', but found only 1}}
+// expected-error@+3 2 {{directive '#pragma omp target parallel for' cannot contain more than one 'ordered' clause}}
+// expected-error@+2 2 {{argument to 'ordered' clause must be a strictly positive integer value}}
+// expected-error@+1 2 {{expression is not an integral constant expression}}
+#pragma omp target parallel for ordered(foobool(argc)), ordered(true), ordered(-5)
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - ST];
+#pragma omp target parallel for ordered(S) // expected-error {{'S' does not refer to a value}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - ST];
+// expected-error@+1 2 {{expression is not an integral constant expression}}
+#pragma omp target parallel for ordered(argv[1] = 2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - ST];
+#pragma omp target parallel for ordered(1)
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - ST];
+#pragma omp target parallel for ordered(N) // expected-error {{argument to 'ordered' clause must be a strictly positive integer value}}
+  for (T i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - ST];
+#pragma omp target parallel for ordered(2) // expected-note {{as specified in 'ordered' clause}}
+  foo();                            // expected-error {{expected 2 for loops after '#pragma omp target parallel for'}}
+  return argc;
+}
+
+int main(int argc, char **argv) {
+#pragma omp target parallel for ordered
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - 4];
+#pragma omp target parallel for ordered( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - 4];
+#pragma omp target parallel for ordered() // expected-error {{expected expression}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - 4];
+#pragma omp target parallel for ordered(4 // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-note {{as specified in 'ordered' clause}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - 4]; // expected-error {{expected 4 for loops after '#pragma omp target parallel for', but found only 1}}
+#pragma omp target parallel for ordered(2 + 2))      // expected-warning {{extra tokens at the end of '#pragma omp target parallel for' are ignored}}  expected-note {{as specified in 'ordered' clause}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - 4];            // expected-error {{expected 4 for loops after '#pragma omp target parallel for', but found only 1}}
+#pragma omp target parallel for ordered(foobool(1) > 0 ? 1 : 2) // expected-error {{expression is not an integral constant expression}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - 4];
+// expected-error@+3 {{expression is not an integral constant expression}}
+// expected-error@+2 2 {{directive '#pragma omp target parallel for' cannot contain more than one 'ordered' clause}}
+// expected-error@+1 2 {{argument to 'ordered' clause must be a strictly positive integer value}}
+#pragma omp target parallel for ordered(foobool(argc)), ordered(true), ordered(-5)
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - 4];
+#pragma omp target parallel for ordered(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - 4];
+// expected-error@+1 {{expression is not an integral constant expression}}
+#pragma omp target parallel for ordered(argv[1] = 2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - 4];
+// expected-error@+3 {{statement after '#pragma omp target parallel for' must be a for loop}}
+// expected-note@+1 {{in instantiation of function template specialization 'tmain<int, char, -1, -2>' requested here}}
+#pragma omp target parallel for ordered(ordered(tmain < int, char, -1, -2 > (argc, argv) // expected-error 2 {{expected ')'}} expected-note 2 {{to match this '('}}
+  foo();
+#pragma omp target parallel for ordered(2) // expected-note {{as specified in 'ordered' clause}}
+  foo();                            // expected-error {{expected 2 for loops after '#pragma omp target parallel for'}}
+  // expected-note@+1 {{in instantiation of function template specialization 'tmain<int, char, 1, 0>' requested here}}
+  return tmain<int, char, 1, 0>(argc, argv);
+}
+
diff --git a/test/OpenMP/target_parallel_for_private_messages.cpp b/test/OpenMP/target_parallel_for_private_messages.cpp
new file mode 100644
index 0000000000000..1d6381a1d5d3e
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_private_messages.cpp
@@ -0,0 +1,231 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note 2 {{declared here}} expected-note 2 {{forward declaration of 'S1'}}
+extern S1 a;
+class S2 {
+  mutable int a;
+
+public:
+  S2() : a(0) {}
+};
+const S2 b;
+const S2 ba[5];
+class S3 {
+  int a;
+
+public:
+  S3() : a(0) {}
+};
+const S3 ca[5];
+class S4 {
+  int a;
+  S4(); // expected-note {{implicitly declared private here}}
+
+public:
+  S4(int v) : a(v) {
+#pragma omp target parallel for private(a) private(this->a)
+    for (int k = 0; k < v; ++k)
+      ++this->a;
+  }
+};
+class S5 {
+  int a;
+  S5() : a(0) {} // expected-note {{implicitly declared private here}}
+
+public:
+  S5(int v) : a(v) {}
+  S5 &operator=(S5 &s) {
+#pragma omp target parallel for private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T>
+class S6 {
+public:
+  T a;
+
+  S6() : a(0) {}
+  S6(T v) : a(v) {
+#pragma omp target parallel for private(a) private(this->a)
+    for (int k = 0; k < v; ++k)
+      ++this->a;
+  }
+  S6 &operator=(S6 &s) {
+#pragma omp target parallel for private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T>
+class S7 : public T {
+  T a;
+  S7() : a(0) {}
+
+public:
+  S7(T v) : a(v) {
+#pragma omp target parallel for private(a) private(this->a) private(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S7 &operator=(S7 &s) {
+#pragma omp target parallel for private(a) private(this->a) private(s.a) private(s.T::a) // expected-error 2 {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class I, class C>
+int foomain(I argc, C **argv) {
+  I e(4);
+  I g(5);
+  int i;
+  int &j = i;
+#pragma omp target parallel for private // expected-error {{expected '(' after 'private'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for private( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for private() // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for private(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for private(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for private(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for private(argc)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for private(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for private(a, b) // expected-error {{private variable with incomplete type 'S1'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for private(argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for private(e, g)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for private(h) // expected-error {{threadprivate or thread local variable cannot be private}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp parallel
+  {
+    int v = 0;
+    int i;
+#pragma omp target parallel for private(i)
+    for (int k = 0; k < argc; ++k) {
+      i = k;
+      v += i;
+    }
+  }
+#pragma omp parallel shared(i)
+#pragma omp parallel private(i)
+#pragma omp target parallel for private(j)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for private(i)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+  return 0;
+}
+
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
+int main(int argc, char **argv) {
+  S4 e(4);
+  S5 g(5);
+  S6<float> s6(0.0) , s6_0(1.0);
+  S7<S6<float> > s7(0.0) , s7_0(1.0);
+  int i;
+  int &j = i;
+#pragma omp target parallel for private // expected-error {{expected '(' after 'private'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for private( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for private() // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for private(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for private(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for private(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for private(argc)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for private(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for private(a, b) // expected-error {{private variable with incomplete type 'S1'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for private(argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for private(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for private(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be private}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp parallel
+  {
+    int i;
+#pragma omp target parallel for private(i)
+    for (int k = 0; k < argc; ++k)
+      ++k;
+  }
+#pragma omp parallel shared(i)
+#pragma omp parallel private(i)
+#pragma omp target parallel for private(j)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for private(i)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+  static int m;
+#pragma omp target parallel for private(m)
+  for (int k = 0; k < argc; ++k)
+    m = k + 2;
+
+  s6 = s6_0; // expected-note {{in instantiation of member function 'S6<float>::operator=' requested here}}
+  s7 = s7_0; // expected-note {{in instantiation of member function 'S7<S6<float> >::operator=' requested here}}
+  return foomain(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<int, char>' requested here}}
+}
+
diff --git a/test/OpenMP/target_parallel_for_proc_bind_messages.cpp b/test/OpenMP/target_parallel_for_proc_bind_messages.cpp
new file mode 100644
index 0000000000000..eeb232aeca811
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_proc_bind_messages.cpp
@@ -0,0 +1,35 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -o - %s
+
+void foo();
+
+int main(int argc, char **argv) {
+  int i;
+#pragma omp target parallel for proc_bind // expected-error {{expected '(' after 'proc_bind'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for proc_bind( // expected-error {{expected 'master', 'close' or 'spread' in OpenMP clause 'proc_bind'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for proc_bind() // expected-error {{expected 'master', 'close' or 'spread' in OpenMP clause 'proc_bind'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for proc_bind(master // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for proc_bind(close), proc_bind(spread) // expected-error {{directive '#pragma omp target parallel for' cannot contain more than one 'proc_bind' clause}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for proc_bind(x) // expected-error {{expected 'master', 'close' or 'spread' in OpenMP clause 'proc_bind'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+
+#pragma omp target parallel for proc_bind(master)
+  for (i = 0; i < argc; ++i)
+    foo();
+
+#pragma omp parallel proc_bind(close)
+#pragma omp target parallel for proc_bind(spread)
+  for (i = 0; i < argc; ++i)
+    foo();
+  return 0;
+}
diff --git a/test/OpenMP/target_parallel_for_reduction_messages.cpp b/test/OpenMP/target_parallel_for_reduction_messages.cpp
new file mode 100644
index 0000000000000..16697a98733b4
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_reduction_messages.cpp
@@ -0,0 +1,313 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 150 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 -ferror-limit 150 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 -ferror-limit 150 -o - %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}} expected-note 4 {{forward declaration of 'S1'}}
+extern S1 a;
+class S2 {
+  mutable int a;
+  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 3 {{implicitly declared private here}}
+
+public:
+  S2() : a(0) {}
+  S2(S2 &s2) : a(s2.a) {}
+  static float S2s; // expected-note 2 {{static data member is predetermined as shared}}
+  static const float S2sc;
+};
+const float S2::S2sc = 0; // expected-note 2 {{'S2sc' defined here}}
+S2 b;                     // expected-note 3 {{'b' defined here}}
+const S2 ba[5];           // expected-note 2 {{'ba' defined here}}
+class S3 {
+  int a;
+
+public:
+  int b;
+  S3() : a(0) {}
+  S3(const S3 &s3) : a(s3.a) {}
+  S3 operator+(const S3 &arg1) { return arg1; }
+};
+int operator+(const S3 &arg1, const S3 &arg2) { return 5; }
+S3 c;               // expected-note 3 {{'c' defined here}}
+const S3 ca[5];     // expected-note 2 {{'ca' defined here}}
+extern const int f; // expected-note 4 {{'f' declared here}}
+class S4 {
+  int a;
+  S4(); // expected-note {{implicitly declared private here}}
+  S4(const S4 &s4);
+  S4 &operator+(const S4 &arg) { return (*this); }
+
+public:
+  S4(int v) : a(v) {}
+};
+S4 &operator&=(S4 &arg1, S4 &arg2) { return arg1; }
+class S5 {
+  int a;
+  S5() : a(0) {} // expected-note {{implicitly declared private here}}
+  S5(const S5 &s5) : a(s5.a) {}
+  S5 &operator+(const S5 &arg);
+
+public:
+  S5(int v) : a(v) {}
+};
+class S6 { // expected-note 3 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
+#if __cplusplus >= 201103L // C++11 or later
+// expected-note@-2 3 {{candidate function (the implicit move assignment operator) not viable}}
+#endif
+  int a;
+
+public:
+  S6() : a(6) {}
+  operator int() { return 6; }
+} o;
+
+S3 h, k;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class T>       // expected-note {{declared here}}
+T tmain(T argc) {
+  const T d = T();       // expected-note 4 {{'d' defined here}}
+  const T da[5] = {T()}; // expected-note 2 {{'da' defined here}}
+  T qa[5] = {T()};
+  T i;
+  T &j = i;                        // expected-note 4 {{'j' defined here}}
+  S3 &p = k;                       // expected-note 2 {{'p' defined here}}
+  const T &r = da[(int)i];         // expected-note 2 {{'r' defined here}}
+  T &q = qa[(int)i];               // expected-note 2 {{'q' defined here}}
+  T fl;
+#pragma omp target parallel for reduction // expected-error {{expected '(' after 'reduction'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction + // expected-error {{expected '(' after 'reduction'}} expected-warning {{extra tokens at the end of '#pragma omp target parallel for' are ignored}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction( // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(- // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction() // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(*) // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(\) // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(& : argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(| : argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(|| : argc ? i : argc) // expected-error 2 {{expected variable name, array element or array section}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(foo : argc) //expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'float'}} expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'int'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(&& : argc)
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(^ : T) // expected-error {{'T' does not refer to a value}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified list item cannot be reduction}} expected-error 2 {{'operator+' is a private member of 'S2'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 4 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 3 {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(max : h.b) // expected-error {{expected variable name, array element or array section}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}} expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(&& : S2::S2s) // expected-error {{shared variable cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(&& : S2::S2sc) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(+ : h, k) // expected-error {{threadprivate or thread local variable cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(+ : o) // expected-error 2 {{no viable overloaded '='}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for private(i), reduction(+ : j), reduction(+ : q) // expected-error 4 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel private(k)
+#pragma omp target parallel for reduction(+ : p), reduction(+ : p) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(+ : p), reduction(+ : p) // expected-error 2 {{variable can appear only once in OpenMP 'reduction' clause}} expected-note 2 {{previously referenced here}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(+ : r) // expected-error 2 {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel shared(i)
+#pragma omp parallel reduction(min : i)
+#pragma omp target parallel for reduction(max : j) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel private(fl)
+#pragma omp target parallel for reduction(+ : fl)
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel reduction(* : fl)
+#pragma omp target parallel for reduction(+ : fl)
+  for (int i = 0; i < 10; ++i)
+    foo();
+
+  return T();
+}
+
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
+int main(int argc, char **argv) {
+  const int d = 5;       // expected-note 2 {{'d' defined here}}
+  const int da[5] = {0}; // expected-note {{'da' defined here}}
+  int qa[5] = {0};
+  S4 e(4);
+  S5 g(5);
+  int i;
+  int &j = i;                      // expected-note 2 {{'j' defined here}}
+  S3 &p = k;                       // expected-note 2 {{'p' defined here}}
+  const int &r = da[i];            // expected-note {{'r' defined here}}
+  int &q = qa[i];                  // expected-note {{'q' defined here}}
+  float fl;
+#pragma omp target parallel for reduction // expected-error {{expected '(' after 'reduction'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction + // expected-error {{expected '(' after 'reduction'}} expected-warning {{extra tokens at the end of '#pragma omp target parallel for' are ignored}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction( // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(- // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction() // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(*) // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(\) // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(foo : argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(| : argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(|| : argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name, array element or array section}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(~ : argc) // expected-error {{expected unqualified-id}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(&& : argc)
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(^ : S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{const-qualified list item cannot be reduction}} expected-error {{'operator+' is a private member of 'S2'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 2 {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(max : h.b) // expected-error {{expected variable name, array element or array section}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(&& : S2::S2s) // expected-error {{shared variable cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(&& : S2::S2sc) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(& : e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{invalid operands to binary expression ('S4' and 'S4')}} expected-error {{calling a private constructor of class 'S5'}} expected-error {{invalid operands to binary expression ('S5' and 'S5')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(+ : h, k, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(+ : o) // expected-error {{no viable overloaded '='}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for private(i), reduction(+ : j), reduction(+ : q) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel private(k)
+#pragma omp target parallel for reduction(+ : p), reduction(+ : p) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(+ : p), reduction(+ : p) // expected-error {{variable can appear only once in OpenMP 'reduction' clause}} expected-note {{previously referenced here}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for reduction(+ : r) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel shared(i)
+#pragma omp parallel reduction(min : i)
+#pragma omp target parallel for reduction(max : j) // expected-error {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel private(fl)
+#pragma omp target parallel for reduction(+ : fl)
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel reduction(* : fl)
+#pragma omp target parallel for reduction(+ : fl)
+  for (int i = 0; i < 10; ++i)
+    foo();
+  static int m;
+#pragma omp target parallel for reduction(+ : m) // OK
+  for (int i = 0; i < 10; ++i)
+    m++;
+
+  return tmain(argc) + tmain(fl); // expected-note {{in instantiation of function template specialization 'tmain<int>' requested here}} expected-note {{in instantiation of function template specialization 'tmain<float>' requested here}}
+}
diff --git a/test/OpenMP/target_parallel_for_schedule_messages.cpp b/test/OpenMP/target_parallel_for_schedule_messages.cpp
new file mode 100644
index 0000000000000..075e1df7827c5
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_schedule_messages.cpp
@@ -0,0 +1,91 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, typename S, int N, int ST> // expected-note {{declared here}}
+T tmain(T argc, S **argv) {
+  #pragma omp target parallel for schedule // expected-error {{expected '(' after 'schedule'}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for schedule ( // expected-error {{expected 'static', 'dynamic', 'guided', 'auto', 'runtime', 'monotonic', 'nonmonotonic' or 'simd' in OpenMP clause 'schedule'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for schedule () // expected-error {{expected 'static', 'dynamic', 'guided', 'auto', 'runtime', 'monotonic', 'nonmonotonic' or 'simd' in OpenMP clause 'schedule'}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for schedule (auto // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for schedule (auto_dynamic // expected-error {{expected 'static', 'dynamic', 'guided', 'auto', 'runtime', 'monotonic', 'nonmonotonic' or 'simd' in OpenMP clause 'schedule'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for schedule (auto,  // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for schedule (runtime, 3)  // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  // expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+  #pragma omp target parallel for schedule (guided argc
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  // expected-error@+1 2 {{argument to 'schedule' clause must be a strictly positive integer value}}
+  #pragma omp target parallel for schedule (static, ST // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for schedule (dynamic, 1)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for' are ignored}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for schedule (guided, (ST > 0) ? 1 + ST : 2)
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  // expected-error@+2 2 {{directive '#pragma omp target parallel for' cannot contain more than one 'schedule' clause}}
+  // expected-error@+1 {{argument to 'schedule' clause must be a strictly positive integer value}}
+  #pragma omp target parallel for schedule (static, foobool(argc)), schedule (dynamic, true), schedule (guided, -5)
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for schedule (static, S) // expected-error {{'S' does not refer to a value}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  // expected-error@+1 2 {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  #pragma omp target parallel for schedule (guided, argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for schedule (dynamic, 1)
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for schedule (static, N) // expected-error {{argument to 'schedule' clause must be a strictly positive integer value}}
+  for (T i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  return argc;
+}
+
+int main(int argc, char **argv) {
+  #pragma omp target parallel for schedule // expected-error {{expected '(' after 'schedule'}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for schedule ( // expected-error {{expected 'static', 'dynamic', 'guided', 'auto', 'runtime', 'monotonic', 'nonmonotonic' or 'simd' in OpenMP clause 'schedule'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for schedule () // expected-error {{expected 'static', 'dynamic', 'guided', 'auto', 'runtime', 'monotonic', 'nonmonotonic' or 'simd' in OpenMP clause 'schedule'}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for schedule (auto // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for schedule (auto_dynamic // expected-error {{expected 'static', 'dynamic', 'guided', 'auto', 'runtime', 'monotonic', 'nonmonotonic' or 'simd' in OpenMP clause 'schedule'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for schedule (auto,  // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for schedule (runtime, 3)  // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for schedule (guided, 4 // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for schedule (static, 2+2)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for' are ignored}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for schedule (dynamic, foobool(1) > 0 ? 1 : 2)
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  // expected-error@+2 2 {{directive '#pragma omp target parallel for' cannot contain more than one 'schedule' clause}}
+  // expected-error@+1 {{argument to 'schedule' clause must be a strictly positive integer value}}
+  #pragma omp target parallel for schedule (guided, foobool(argc)), schedule (static, true), schedule (dynamic, -5)
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for schedule (guided, S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  // expected-error@+1 {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  #pragma omp target parallel for schedule (static, argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  // expected-error@+3 {{statement after '#pragma omp target parallel for' must be a for loop}}
+  // expected-note@+1 {{in instantiation of function template specialization 'tmain<int, char, -1, -2>' requested here}}
+  #pragma omp target parallel for schedule(dynamic, schedule(tmain<int, char, -1, -2>(argc, argv) // expected-error 2 {{expected ')'}} expected-note 2 {{to match this '('}}
+  foo();
+  // expected-note@+1 {{in instantiation of function template specialization 'tmain<int, char, 1, 0>' requested here}}
+  return tmain<int, char, 1, 0>(argc, argv);
+}
+
diff --git a/test/OpenMP/target_parallel_for_simd_aligned_messages.cpp b/test/OpenMP/target_parallel_for_simd_aligned_messages.cpp
new file mode 100644
index 0000000000000..669cafeae1ea3
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_simd_aligned_messages.cpp
@@ -0,0 +1,203 @@
+// RUN: %clang_cc1 -x c++ -std=c++11 -verify -fopenmp %s
+
+struct B {
+  static int ib[20]; // expected-note 0 {{'B::ib' declared here}}
+  static constexpr int bfoo() { return 8; }
+};
+namespace X {
+  B x; // expected-note {{'x' defined here}}
+};
+constexpr int bfoo() { return 4; }
+
+int **z;
+const int C1 = 1;
+const int C2 = 2;
+void test_aligned_colons(int *&rp)
+{
+  int *B = 0;
+  #pragma omp target parallel for simd aligned(B:bfoo())
+  for (int i = 0; i < 10; ++i) ;
+  // expected-error@+1 {{unexpected ':' in nested name specifier; did you mean '::'}}
+  #pragma omp target parallel for simd aligned(B::ib:B:bfoo())
+  for (int i = 0; i < 10; ++i) ;
+  #pragma omp target parallel for simd aligned(B:B::bfoo())
+  for (int i = 0; i < 10; ++i) ;
+  // expected-error@+1 {{unexpected ':' in nested name specifier; did you mean '::'?}}
+  #pragma omp target parallel for simd aligned(z:B:bfoo())
+  for (int i = 0; i < 10; ++i) ;
+  #pragma omp target parallel for simd aligned(B:B::bfoo())
+  for (int i = 0; i < 10; ++i) ;
+  // expected-error@+2 {{integral constant expression must have integral or unscoped enumeration type, not 'int **'}}
+  // expected-error@+1 {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'B'}}
+  #pragma omp target parallel for simd aligned(X::x : ::z)
+  for (int i = 0; i < 10; ++i) ;
+  // expected-error@+1 {{integral constant expression must have integral or unscoped enumeration type, not 'B'}}
+  #pragma omp target parallel for simd aligned(B,rp,::z: X::x)
+  for (int i = 0; i < 10; ++i) ;
+  #pragma omp target parallel for simd aligned(::z)
+  for (int i = 0; i < 10; ++i) ;
+  // expected-error@+1 {{expected variable name}}
+  #pragma omp target parallel for simd aligned(B::bfoo())
+  for (int i = 0; i < 10; ++i) ;
+  // expected-warning@+1 {{aligned clause will be ignored because the requested alignment is not a power of 2}}
+  #pragma omp target parallel for simd aligned(B::ib,B:C1+C2)
+  for (int i = 0; i < 10; ++i) ;
+}
+
+// expected-note@+1 {{'num' defined here}}
+template<int L, class T, class N> T test_template(T* arr, N num) {
+  N i;
+  T sum = (T)0;
+  T ind2 = - num * L;
+  // Negative number is passed as L.
+  // expected-error@+1 {{argument to 'aligned' clause must be a strictly positive integer value}}
+  #pragma omp target parallel for simd aligned(arr:L)
+  for (i = 0; i < num; ++i) {
+    T cur = arr[(int)ind2];
+    ind2 += L;
+    sum += cur;
+  }
+  // expected-error@+1 {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'int'}}
+  #pragma omp target parallel for simd aligned(num:4)
+  for (i = 0; i < num; ++i);
+  return T();
+}
+
+template<int LEN> int test_warn() {
+  int *ind2 = 0;
+  // expected-error@+1 {{argument to 'aligned' clause must be a strictly positive integer value}}
+  #pragma omp target parallel for simd aligned(ind2:LEN)
+  for (int i = 0; i < 100; i++) {
+    ind2 += LEN;
+  }
+  return 0;
+}
+
+struct S1; // expected-note 2 {{declared here}}
+extern S1 a; // expected-note {{'a' declared here}}
+class S2 {
+  mutable int a;
+public:
+  S2():a(0) { }
+};
+const S2 b; // expected-note 1 {{'b' defined here}}
+const S2 ba[5];
+class S3 {
+  int a;
+public:
+  S3():a(0) { }
+};
+const S3 ca[5];
+class S4 {
+  int a;
+  S4();
+public:
+  S4(int v):a(v) { }
+};
+class S5 {
+  int a;
+  S5():a(0) {}
+public:
+  S5(int v):a(v) { }
+};
+
+S3 h; // expected-note 2 {{'h' defined here}}
+#pragma omp threadprivate(h)
+
+template<class I, class C> int foomain(I argc, C **argv) {
+  I e(argc);
+  I g(argc);
+  int i; // expected-note {{declared here}} expected-note {{'i' defined here}}
+  // expected-note@+2 {{declared here}}
+  // expected-note@+1 {{reference to 'i' is not a constant expression}}
+  int &j = i;
+  #pragma omp target parallel for simd aligned // expected-error {{expected '(' after 'aligned'}}
+  for (I k = 0; k < argc; ++k) ++k;
+  #pragma omp target parallel for simd aligned ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (I k = 0; k < argc; ++k) ++k;
+  #pragma omp target parallel for simd aligned () // expected-error {{expected expression}}
+  for (I k = 0; k < argc; ++k) ++k;
+  #pragma omp target parallel for simd aligned (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (I k = 0; k < argc; ++k) ++k;
+  #pragma omp target parallel for simd aligned (argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (I k = 0; k < argc; ++k) ++k;
+  #pragma omp target parallel for simd aligned (argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (I k = 0; k < argc; ++k) ++k;
+  #pragma omp target parallel for simd aligned (argc : 5) // expected-warning {{aligned clause will be ignored because the requested alignment is not a power of 2}}
+  for (I k = 0; k < argc; ++k) ++k;
+  #pragma omp target parallel for simd aligned (S1) // expected-error {{'S1' does not refer to a value}}
+  for (I k = 0; k < argc; ++k) ++k;
+  #pragma omp target parallel for simd aligned (argv[1]) // expected-error {{expected variable name}}
+  for (I k = 0; k < argc; ++k) ++k;
+  #pragma omp target parallel for simd aligned(e, g)
+  for (I k = 0; k < argc; ++k) ++k;
+  // expected-error@+1 {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'S3'}}
+  #pragma omp target parallel for simd aligned(h)
+  for (I k = 0; k < argc; ++k) ++k;
+  // expected-error@+1 {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'int'}}
+  #pragma omp target parallel for simd aligned(i)
+  for (I k = 0; k < argc; ++k) ++k;
+  #pragma omp parallel
+  {
+    int *v = 0;
+    I i;
+    #pragma omp target parallel for simd aligned(v:16)
+    for (I k = 0; k < argc; ++k) { i = k; v += 2; }
+  }
+  float *f;
+  #pragma omp target parallel for simd aligned(f)
+  for (I k = 0; k < argc; ++k) ++k;
+  int v = 0;
+  // expected-note@+2 {{initializer of 'j' is not a constant expression}}
+  // expected-error@+1 {{expression is not an integral constant expression}}
+  #pragma omp target parallel for simd aligned(f:j)
+  for (I k = 0; k < argc; ++k) { ++k; v += j; }
+  #pragma omp target parallel for simd aligned(f)
+  for (I k = 0; k < argc; ++k) ++k;
+  return 0;
+}
+
+// expected-note@+1 2 {{'argc' defined here}}
+int main(int argc, char **argv) {
+  double darr[100];
+  // expected-note@+1 {{in instantiation of function template specialization 'test_template<-4, double, int>' requested here}}
+  test_template<-4>(darr, 4);
+  test_warn<4>(); // ok
+  // expected-note@+1 {{in instantiation of function template specialization 'test_warn<0>' requested here}}
+  test_warn<0>();
+
+  int i;
+  int &j = i;
+  #pragma omp target parallel for simd aligned // expected-error {{expected '(' after 'aligned'}}
+  for (int k = 0; k < argc; ++k) ++k;
+  #pragma omp target parallel for simd aligned ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k) ++k;
+  #pragma omp target parallel for simd aligned () // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k) ++k;
+  #pragma omp target parallel for simd aligned (argv // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k) ++k;
+  // expected-error@+1 {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'int'}}
+  #pragma omp target parallel for simd aligned (argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k) ++k;
+  #pragma omp target parallel for simd aligned (argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k) ++k;
+  // expected-error@+1 {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'int'}}
+  #pragma omp target parallel for simd aligned (argc)
+  for (int k = 0; k < argc; ++k) ++k;
+  #pragma omp target parallel for simd aligned (S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k) ++k;
+  // expected-error@+2 {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'S1'}}
+  // expected-error@+1 {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'S2'}}
+  #pragma omp target parallel for simd aligned (a, b) 
+  for (int k = 0; k < argc; ++k) ++k;
+  #pragma omp target parallel for simd aligned (argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k) ++k;
+  // expected-error@+1 {{argument of aligned clause should be array, pointer, reference to array or reference to pointer, not 'S3'}}
+  #pragma omp target parallel for simd aligned(h)
+  for (int k = 0; k < argc; ++k) ++k;
+  int *pargc = &argc;
+  // expected-note@+1 {{in instantiation of function template specialization 'foomain<int *, char>' requested here}}
+  foomain<int*,char>(pargc,argv);
+  return 0;
+}
+
diff --git a/test/OpenMP/target_parallel_for_simd_ast_print.cpp b/test/OpenMP/target_parallel_for_simd_ast_print.cpp
new file mode 100644
index 0000000000000..e25f93fbaefe9
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_simd_ast_print.cpp
@@ -0,0 +1,308 @@
+// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+void foo() {}
+
+struct S {
+  S(): a(0) {}
+  S(int v) : a(v) {}
+  int a;
+  typedef int type;
+};
+
+template <typename T>
+class S7 : public T {
+protected:
+  T a;
+  S7() : a(0) {}
+
+public:
+  S7(typename T::type v) : a(v) {
+#pragma omp target parallel for simd private(a) private(this->a) private(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S7 &operator=(S7 &s) {
+#pragma omp target parallel for simd private(a) private(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+// CHECK: #pragma omp target parallel for simd private(this->a) private(this->a) private(this->S::a)
+// CHECK: #pragma omp target parallel for simd private(this->a) private(this->a) private(T::a)
+// CHECK: #pragma omp target parallel for simd private(this->a) private(this->a)
+
+class S8 : public S7<S> {
+  S8() {}
+
+public:
+  S8(int v) : S7<S>(v){
+#pragma omp target parallel for simd private(a) private(this->a) private(S7<S>::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S8 &operator=(S8 &s) {
+#pragma omp target parallel for simd private(a) private(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+// CHECK: #pragma omp target parallel for simd private(this->a) private(this->a) private(this->S7<S>::a)
+// CHECK: #pragma omp target parallel for simd private(this->a) private(this->a)
+
+template <class T, int N>
+T tmain(T argc, T *argv) {
+  T b = argc, c, d, e, f, h;
+  T arr[N][10], arr1[N];
+  T i, j;
+  T s;
+  static T a;
+// CHECK: static T a;
+  static T g;
+  const T clen = 5;
+// CHECK: T clen = 5;
+#pragma omp threadprivate(g)
+#pragma omp target parallel for simd schedule(dynamic) default(none) linear(a)
+  // CHECK: #pragma omp target parallel for simd schedule(dynamic) default(none) linear(a)
+  for (T i = 0; i < 2; ++i)
+    a = 2;
+// CHECK-NEXT: for (T i = 0; i < 2; ++i)
+// CHECK-NEXT: a = 2;
+#pragma omp target parallel for simd private(argc, b), firstprivate(c, d), lastprivate(d, f) collapse(N) schedule(static, N) ordered(N) if (parallel :argc) num_threads(N) default(shared) shared(e) reduction(+ : h)
+  for (int i = 0; i < 2; ++i)
+    for (int j = 0; j < 2; ++j)
+      for (int j = 0; j < 2; ++j)
+        for (int j = 0; j < 2; ++j)
+          for (int j = 0; j < 2; ++j)
+            foo();
+  // CHECK-NEXT: #pragma omp target parallel for simd private(argc,b) firstprivate(c,d) lastprivate(d,f) collapse(N) schedule(static, N) ordered(N) if(parallel: argc) num_threads(N) default(shared) shared(e) reduction(+: h)
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: for (int j = 0; j < 2; ++j)
+  // CHECK-NEXT: foo();
+
+#pragma omp target parallel for simd default(none), private(argc,b) firstprivate(argv) shared (d) if(parallel:argc > 0) num_threads(N) proc_bind(master) reduction(+:c, arr1[argc]) reduction(max:e, arr[:N][0:10])
+  for (T i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target parallel for simd default(none) private(argc,b) firstprivate(argv) shared(d) if(parallel: argc > 0) num_threads(N) proc_bind(master) reduction(+: c,arr1[argc]) reduction(max: e,arr[:N][0:10])
+  // CHECK-NEXT: for (T i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target parallel for simd if(N) num_threads(s) proc_bind(close) reduction(^:e, f, arr[0:N][:argc]) reduction(&& : h)
+  for (T i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target parallel for simd if(N) num_threads(s) proc_bind(close) reduction(^: e,f,arr[0:N][:argc]) reduction(&&: h)
+  // CHECK-NEXT: for (T i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target parallel for simd if(target:argc > 0)
+  for (T i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target parallel for simd if(target: argc > 0)
+  // CHECK-NEXT: for (T i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target parallel for simd if(parallel:argc > 0)
+  for (T i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target parallel for simd if(parallel: argc > 0)
+  // CHECK-NEXT: for (T i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target parallel for simd if(N)
+  for (T i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target parallel for simd if(N)
+  // CHECK-NEXT: for (T i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target parallel for simd map(i)
+  for (T i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target parallel for simd map(tofrom: i)
+  // CHECK-NEXT: for (T i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target parallel for simd map(arr1[0:10], i)
+  for (T i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target parallel for simd map(tofrom: arr1[0:10],i)
+  // CHECK-NEXT: for (T i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target parallel for simd map(to: i) map(from: j)
+  for (T i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target parallel for simd map(to: i) map(from: j)
+  // CHECK-NEXT: for (T i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target parallel for simd map(always,alloc: i)
+  for (T i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target parallel for simd map(always,alloc: i)
+  // CHECK-NEXT: for (T i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target parallel for simd nowait
+  for (T i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target parallel for simd nowait
+  // CHECK-NEXT: for (T i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target parallel for simd depend(in : argc, arr[i:argc], arr1[:])
+  for (T i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target parallel for simd depend(in : argc,arr[i:argc],arr1[:])
+  // CHECK-NEXT: for (T i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target parallel for simd defaultmap(tofrom: scalar)
+  for (T i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target parallel for simd defaultmap(tofrom: scalar)
+  // CHECK-NEXT: for (T i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target parallel for simd safelen(clen-1)
+  for (T i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target parallel for simd safelen(clen - 1)
+  // CHECK-NEXT: for (T i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target parallel for simd simdlen(clen-1)
+  for (T i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target parallel for simd simdlen(clen - 1)
+  // CHECK-NEXT: for (T i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target parallel for simd aligned(arr1:N-1)
+  for (T i = 0; i < N; ++i) {}
+  // CHECK: #pragma omp target parallel for simd aligned(arr1: N - 1)
+  // CHECK-NEXT: for (T i = 0; i < N; ++i) {
+  // CHECK-NEXT: }
+
+  return T();
+}
+
+int main(int argc, char **argv) {
+  int b = argc, c, d, e, f, h;
+  int arr[5][10], arr1[5];
+  int i, j;
+  int s;
+  static int a;
+// CHECK: static int a;
+  const int clen = 5;
+// CHECK: int clen = 5;
+  static float g;
+#pragma omp threadprivate(g)
+#pragma omp target parallel for simd schedule(guided, argc) default(none) linear(a)
+  // CHECK: #pragma omp target parallel for simd schedule(guided, argc) default(none) linear(a)
+  for (int i = 0; i < 2; ++i)
+    a = 2;
+// CHECK-NEXT: for (int i = 0; i < 2; ++i)
+// CHECK-NEXT: a = 2;
+
+#pragma omp target parallel for simd private(argc, b), firstprivate(argv, c), lastprivate(d, f) collapse(2) schedule(auto) ordered if (target: argc) num_threads(a) default(shared) shared(e) reduction(+ : h) linear(a:-5)
+  for (int i = 0; i < 10; ++i)
+    for (int j = 0; j < 10; ++j)
+      foo();
+  // CHECK: #pragma omp target parallel for simd private(argc,b) firstprivate(argv,c) lastprivate(d,f) collapse(2) schedule(auto) ordered if(target: argc) num_threads(a) default(shared) shared(e) reduction(+: h) linear(a: -5)
+  // CHECK-NEXT: for (int i = 0; i < 10; ++i)
+  // CHECK-NEXT: for (int j = 0; j < 10; ++j)
+  // CHECK-NEXT: foo();
+
+#pragma omp target parallel for simd default(none), private(argc,b) firstprivate(argv) shared (d) if (parallel:argc > 0) num_threads(5) proc_bind(master) reduction(+:c, arr1[argc]) reduction(max:e, arr[:5][0:10])
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target parallel for simd default(none) private(argc,b) firstprivate(argv) shared(d) if(parallel: argc > 0) num_threads(5) proc_bind(master) reduction(+: c,arr1[argc]) reduction(max: e,arr[:5][0:10])
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target parallel for simd if (5) num_threads(s) proc_bind(close) reduction(^:e, f, arr[0:5][:argc]) reduction(&& : h)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target parallel for simd if(5) num_threads(s) proc_bind(close) reduction(^: e,f,arr[0:5][:argc]) reduction(&&: h)
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target parallel for simd if (target:argc > 0)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target parallel for simd if(target: argc > 0)
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target parallel for simd if (parallel:argc > 0)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target parallel for simd if(parallel: argc > 0)
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target parallel for simd if (5)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target parallel for simd if(5)
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target parallel for simd map(i)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target parallel for simd  map(tofrom: i)
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target parallel for simd map(arr1[0:10], i)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target parallel for simd map(tofrom: arr1[0:10],i)
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target parallel for simd map(to: i) map(from: j)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target parallel for simd map(to: i) map(from: j)
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target parallel for simd map(always,alloc: i)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target parallel for simd map(always,alloc: i)
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target parallel for simd nowait
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target parallel for simd nowait
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target parallel for simd depend(in : argc, arr[i:argc], arr1[:])
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target parallel for simd depend(in : argc,arr[i:argc],arr1[:])
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target parallel for simd defaultmap(tofrom: scalar)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target parallel for simd defaultmap(tofrom: scalar)
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target parallel for simd safelen(clen-1)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target parallel for simd safelen(clen - 1)
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target parallel for simd simdlen(clen-1)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target parallel for simd simdlen(clen - 1)
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+#pragma omp target parallel for simd aligned(arr1:4)
+  for (int i = 0; i < 2; ++i) {}
+  // CHECK: #pragma omp target parallel for simd aligned(arr1: 4)
+  // CHECK-NEXT: for (int i = 0; i < 2; ++i) {
+  // CHECK-NEXT: }
+
+  return (tmain<int, 5>(argc, &argc));
+}
+
+#endif
diff --git a/test/OpenMP/target_parallel_for_simd_collapse_messages.cpp b/test/OpenMP/target_parallel_for_simd_collapse_messages.cpp
new file mode 100644
index 0000000000000..ecf2d6e7889e9
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_simd_collapse_messages.cpp
@@ -0,0 +1,101 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
+
+void foo() {
+}
+
+#if __cplusplus >= 201103L
+ // expected-note@+2 4 {{declared here}}
+#endif
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, typename S, int N, int ST> // expected-note {{declared here}}
+T tmain(T argc, S **argv) { //expected-note 2 {{declared here}}
+  int j; // expected-note {{declared here}}
+  #pragma omp target parallel for simd collapse // expected-error {{expected '(' after 'collapse'}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for simd collapse ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for simd collapse () // expected-error {{expected expression}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  // expected-error@+3 {{expected ')'}} expected-note@+3 {{to match this '('}}
+  // expected-error@+2 2 {{expression is not an integral constant expression}}
+  // expected-note@+1 2 {{read of non-const variable 'argc' is not allowed in a constant expression}}
+  #pragma omp target parallel for simd collapse (argc 
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  // expected-error@+1 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
+  #pragma omp target parallel for simd collapse (ST // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for simd collapse (1)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for simd collapse ((ST > 0) ? 1 + ST : 2) // expected-note 2 {{as specified in 'collapse' clause}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST]; // expected-error 2 {{expected 2 for loops after '#pragma omp target parallel for simd', but found only 1}}
+#if __cplusplus >= 201103L
+  // expected-note@+5 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+  // expected-error@+3 2 {{directive '#pragma omp target parallel for simd' cannot contain more than one 'collapse' clause}}
+  // expected-error@+2 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
+  // expected-error@+1 2 {{expression is not an integral constant expression}}
+  #pragma omp target parallel for simd collapse (foobool(argc)), collapse (true), collapse (-5)
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for simd collapse (S) // expected-error {{'S' does not refer to a value}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  // expected-note@+2 {{read of non-const variable 'j' is not allowed in a constant expression}}
+  // expected-error@+1 {{expression is not an integral constant expression}}
+  #pragma omp target parallel for simd collapse (j=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for simd collapse (1)
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for simd collapse (N) // expected-error {{argument to 'collapse' clause must be a strictly positive integer value}}
+  for (T i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for simd collapse (2) // expected-note {{as specified in 'collapse' clause}}
+  foo(); // expected-error {{expected 2 for loops after '#pragma omp target parallel for simd'}}
+  return argc;
+}
+
+int main(int argc, char **argv) {
+  int j; // expected-note {{declared here}}
+  #pragma omp target parallel for simd collapse // expected-error {{expected '(' after 'collapse'}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for simd collapse ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for simd collapse () // expected-error {{expected expression}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for simd collapse (4 // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-note {{as specified in 'collapse' clause}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; // expected-error {{expected 4 for loops after '#pragma omp target parallel for simd', but found only 1}}
+  #pragma omp target parallel for simd collapse (2+2)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}  expected-note {{as specified in 'collapse' clause}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; // expected-error {{expected 4 for loops after '#pragma omp target parallel for simd', but found only 1}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif 
+  #pragma omp target parallel for simd collapse (foobool(1) > 0 ? 1 : 2) // expected-error {{expression is not an integral constant expression}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+#if __cplusplus >= 201103L
+  // expected-note@+5 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+  // expected-error@+3 {{expression is not an integral constant expression}}
+  // expected-error@+2 2 {{directive '#pragma omp target parallel for simd' cannot contain more than one 'collapse' clause}}
+  // expected-error@+1 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
+  #pragma omp target parallel for simd collapse (foobool(argc)), collapse (true), collapse (-5) 
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for simd collapse (S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  // expected-note@+2 {{read of non-const variable 'j' is not allowed in a constant expression}}
+  // expected-error@+1 {{expression is not an integral constant expression}}
+  #pragma omp target parallel for simd collapse (j=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  // expected-error@+3 {{statement after '#pragma omp target parallel for simd' must be a for loop}}
+  // expected-note@+1 {{in instantiation of function template specialization 'tmain<int, char, -1, -2>' requested here}}
+  #pragma omp target parallel for simd collapse(collapse(tmain<int, char, -1, -2>(argc, argv) // expected-error 2 {{expected ')'}} expected-note 2 {{to match this '('}}
+  foo();
+  #pragma omp target parallel for simd collapse (2) // expected-note {{as specified in 'collapse' clause}}
+  foo(); // expected-error {{expected 2 for loops after '#pragma omp target parallel for simd'}}
+  // expected-note@+1 {{in instantiation of function template specialization 'tmain<int, char, 1, 0>' requested here}}
+  return tmain<int, char, 1, 0>(argc, argv);
+}
+
diff --git a/test/OpenMP/target_parallel_for_simd_default_messages.cpp b/test/OpenMP/target_parallel_for_simd_default_messages.cpp
new file mode 100644
index 0000000000000..5d41cbcb457e1
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_simd_default_messages.cpp
@@ -0,0 +1,36 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -o - %s
+
+void foo();
+
+int main(int argc, char **argv) {
+  int i;
+#pragma omp target parallel for simd default // expected-error {{expected '(' after 'default'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd default( // expected-error {{expected 'none' or 'shared' in OpenMP clause 'default'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd default() // expected-error {{expected 'none' or 'shared' in OpenMP clause 'default'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd default(none // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) // expected-error {{variable 'argc' must have explicitly specified data sharing attributes}}
+    foo();
+#pragma omp target parallel for simd default(shared), default(shared) // expected-error {{directive '#pragma omp target parallel for simd' cannot contain more than one 'default' clause}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd default(x) // expected-error {{expected 'none' or 'shared' in OpenMP clause 'default'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+
+#pragma omp target parallel for simd default(none)
+  for (i = 0; i < argc; ++i)  // expected-error {{variable 'argc' must have explicitly specified data sharing attributes}}
+    foo();
+
+#pragma omp parallel default(none)
+#pragma omp target parallel for simd default(shared)
+  for (i = 0; i < argc; ++i)
+    foo();
+
+  return 0;
+}
diff --git a/test/OpenMP/target_parallel_for_simd_defaultmap_messages.cpp b/test/OpenMP/target_parallel_for_simd_defaultmap_messages.cpp
new file mode 100644
index 0000000000000..e922e0a000b45
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_simd_defaultmap_messages.cpp
@@ -0,0 +1,58 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() {
+}
+
+template <class T, typename S, int N, int ST>
+T tmain(T argc, S **argv) {
+  int i;
+  #pragma omp target parallel for simd defaultmap // expected-error {{expected '(' after 'defaultmap'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd defaultmap ( // expected-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd defaultmap () // expected-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd defaultmap (tofrom // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd defaultmap (tofrom) // expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd defaultmap (tofrom scalar) // expected-warning {{missing ':' after defaultmap modifier - ignoring}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd defaultmap (tofrom, // expected-error {{expected ')'}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}} expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd defaultmap (scalar: // expected-error {{expected ')'}} expected-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd defaultmap (tofrom, scalar // expected-error {{expected ')'}} expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return argc;
+}
+
+int main(int argc, char **argv) {
+  int i;
+  #pragma omp target parallel for simd defaultmap // expected-error {{expected '(' after 'defaultmap'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd defaultmap ( // expected-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd defaultmap () // expected-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd defaultmap (tofrom // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd defaultmap (tofrom: // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd defaultmap (tofrom) // expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd defaultmap (tofrom scalar) // expected-warning {{missing ':' after defaultmap modifier - ignoring}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd defaultmap (tofrom, // expected-error {{expected ')'}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}} expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd defaultmap (scalar: // expected-error {{expected ')'}} expected-error {{expected 'tofrom' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd defaultmap (tofrom, scalar // expected-error {{expected ')'}} expected-warning {{missing ':' after defaultmap modifier - ignoring}} expected-error {{expected 'scalar' in OpenMP clause 'defaultmap'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return tmain<int, char, 1, 0>(argc, argv);
+}
+
diff --git a/test/OpenMP/target_parallel_for_simd_depend_messages.cpp b/test/OpenMP/target_parallel_for_simd_depend_messages.cpp
new file mode 100644
index 0000000000000..0504cabd0f017
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_simd_depend_messages.cpp
@@ -0,0 +1,90 @@
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+class vector {
+  public:
+    int operator[](int index) { return 0; }
+};
+
+int main(int argc, char **argv, char *env[]) {
+  vector vec;
+  typedef float V __attribute__((vector_size(16)));
+  V a;
+  auto arr = x; // expected-error {{use of undeclared identifier 'x'}}
+  int i;
+
+  #pragma omp target parallel for simd depend // expected-error {{expected '(' after 'depend'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend ( // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-warning {{missing ':' after dependency type - ignoring}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend () // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-warning {{missing ':' after dependency type - ignoring}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend (argc // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-warning {{missing ':' after dependency type - ignoring}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend (source : argc) // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend (source) // expected-error {{expected expression}} expected-warning {{missing ':' after dependency type - ignoring}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend (in : argc)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend (out: ) // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend (inout : foobool(argc)), depend (in, argc) // expected-error {{expected variable name, array element or array section}} expected-warning {{missing ':' after dependency type - ignoring}} expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend (out :S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend(in : argv[1][1] = '2') // expected-error {{expected variable name, array element or array section}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend (in : vec[1]) // expected-error {{expected variable name, array element or array section}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend (in : argv[0])
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend (in : ) // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend (in : main) // expected-error {{expected variable name, array element or array section}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend(in : a[0]) // expected-error{{expected variable name, array element or array section}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend (in : vec[1:2]) // expected-error {{ value is not an array or pointer}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend (in : argv[ // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend (in : argv[: // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend (in : argv[:] // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend (in : argv[argc: // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend (in : argv[argc:argc] // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend (in : argv[0:-1]) // expected-error {{section length is evaluated to a negative value -1}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend (in : argv[-1:0]) // expected-error {{section lower bound is evaluated to a negative value -1}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend (in : argv[:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend (in : argv[3:4:1]) // expected-error {{expected ']'}} expected-note {{to match this '['}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend(in:a[0:1]) // expected-error {{subscripted value is not an array or pointer}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend(in:argv[argv[:2]:1]) // expected-error {{OpenMP array section is not allowed here}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend(in:argv[0:][:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend(in:env[0:][:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is an array of unknown bound}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend(in : argv[ : argc][1 : argc - 1])
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd depend(in : arr[0])
+  for (i = 0; i < argc; ++i) foo();
+
+  return 0;
+}
diff --git a/test/OpenMP/target_parallel_for_simd_device_messages.cpp b/test/OpenMP/target_parallel_for_simd_device_messages.cpp
new file mode 100644
index 0000000000000..2c9d43f07b257
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_simd_device_messages.cpp
@@ -0,0 +1,40 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+int main(int argc, char **argv) {
+  int i;
+  #pragma omp target parallel for simd device // expected-error {{expected '(' after 'device'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd device ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd device () // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd device (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd device (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd device (argc > 0 ? argv[1] : argv[2]) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd device (argc + argc)
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd device (argc), device (argc+1) // expected-error {{directive '#pragma omp target parallel for simd' cannot contain more than one 'device' clause}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd device (S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd device (-2) // expected-error {{argument to 'device' clause must be a non-negative integer value}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd device (-10u)
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd device (3.14) // expected-error {{expression must have integral or unscoped enumeration type, not 'double'}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return 0;
+}
diff --git a/test/OpenMP/target_parallel_for_simd_firstprivate_messages.cpp b/test/OpenMP/target_parallel_for_simd_firstprivate_messages.cpp
new file mode 100644
index 0000000000000..9397314d87cc6
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_simd_firstprivate_messages.cpp
@@ -0,0 +1,261 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note 2 {{declared here}} expected-note 2 {{forward declaration of 'S1'}}
+extern S1 a;
+class S2 {
+  mutable int a;
+
+public:
+  S2() : a(0) {}
+  S2(const S2 &s2) : a(s2.a) {}
+  static float S2s;
+  static const float S2sc;
+};
+const float S2::S2sc = 0;
+const S2 b;
+const S2 ba[5];
+class S3 {
+  int a;
+  S3 &operator=(const S3 &s3);
+
+public:
+  S3() : a(0) {}
+  S3(const S3 &s3) : a(s3.a) {}
+};
+const S3 c;
+const S3 ca[5];
+extern const int f;
+class S4 {
+  int a;
+  S4();
+  S4(const S4 &s4); // expected-note 2 {{implicitly declared private here}}
+
+public:
+  S4(int v) : a(v) {}
+};
+class S5 {
+  int a;
+  S5(const S5 &s5) : a(s5.a) {} // expected-note 4 {{implicitly declared private here}}
+
+public:
+  S5() : a(0) {}
+  S5(int v) : a(v) {}
+};
+class S6 {
+  int a;
+  S6() : a(0) {}
+
+public:
+  S6(const S6 &s6) : a(s6.a) {}
+  S6(int v) : a(v) {}
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class I, class C>
+int foomain(int argc, char **argv) {
+  I e(4);
+  C g(5);
+  int i;
+  int &j = i;
+#pragma omp target parallel for simd firstprivate // expected-error {{expected '(' after 'firstprivate'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd firstprivate( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd firstprivate() // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd firstprivate(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd firstprivate(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd firstprivate(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd firstprivate(argc)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd firstprivate(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd firstprivate(a, b) // expected-error {{firstprivate variable with incomplete type 'S1'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd firstprivate(argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd firstprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd firstprivate(h) // expected-error {{threadprivate or thread local variable cannot be firstprivate}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp parallel
+  {
+    int v = 0;
+    int i;
+#pragma omp target parallel for simd firstprivate(i)
+    for (int k = 0; k < argc; ++k) {
+      i = k;
+      v += i;
+    }
+  }
+#pragma omp parallel shared(i)
+#pragma omp parallel private(i)
+#pragma omp target parallel for simd firstprivate(j)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd firstprivate(i)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd lastprivate(g) firstprivate(g) // expected-error {{calling a private constructor of class 'S5'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp parallel private(i)
+#pragma omp target parallel for simd firstprivate(i) // expected-note {{defined as firstprivate}}
+  for (i = 0; i < argc; ++i) // expected-error {{loop iteration variable in the associated loop of 'omp target parallel for simd' directive may not be firstprivate, predetermined as private}}
+    foo();
+#pragma omp parallel reduction(+ : i)
+#pragma omp target parallel for simd firstprivate(i) // expected-note {{defined as firstprivate}}
+  for (i = 0; i < argc; ++i) // expected-error {{loop iteration variable in the associated loop of 'omp target parallel for simd' directive may not be firstprivate, predetermined as private}}
+    foo();
+  return 0;
+}
+
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
+int main(int argc, char **argv) {
+  const int d = 5;
+  const int da[5] = {0};
+  S4 e(4);
+  S5 g(5);
+  S3 m;
+  S6 n(2);
+  int i;
+  int &j = i;
+#pragma omp target parallel for simd firstprivate // expected-error {{expected '(' after 'firstprivate'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd firstprivate( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd firstprivate() // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd firstprivate(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd firstprivate(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd firstprivate(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd firstprivate(argc)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd firstprivate(S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd firstprivate(a, b, c, d, f) // expected-error {{firstprivate variable with incomplete type 'S1'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd firstprivate(argv[1]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd firstprivate(2 * 2) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd firstprivate(ba) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd firstprivate(ca) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd firstprivate(da) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+  int xa;
+#pragma omp target parallel for simd firstprivate(xa) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd firstprivate(S2::S2s) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd firstprivate(S2::S2sc) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd safelen(5) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd firstprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd firstprivate(m) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd firstprivate(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be firstprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd private(xa), firstprivate(xa) // expected-error {{private variable cannot be firstprivate}} expected-note {{defined as private}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd firstprivate(i) // expected-note {{defined as firstprivate}}
+  for (i = 0; i < argc; ++i)    // expected-error {{loop iteration variable in the associated loop of 'omp target parallel for simd' directive may not be firstprivate, predetermined as private}}
+    foo();
+#pragma omp parallel shared(xa)
+#pragma omp target parallel for simd firstprivate(xa) // OK: may be firstprivate
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd firstprivate(j)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd lastprivate(g) firstprivate(g) // expected-error {{calling a private constructor of class 'S5'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd lastprivate(n) firstprivate(n) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp parallel
+  {
+    int v = 0;
+    int i;
+#pragma omp target parallel for simd firstprivate(i)
+    for (int k = 0; k < argc; ++k) {
+      i = k;
+      v += i;
+    }
+  }
+#pragma omp parallel private(i)
+#pragma omp target parallel for simd firstprivate(i) // expected-note {{defined as firstprivate}}
+  for (i = 0; i < argc; ++i) // expected-error {{loop iteration variable in the associated loop of 'omp target parallel for simd' directive may not be firstprivate, predetermined as private}}
+    foo();
+#pragma omp parallel reduction(+ : i)
+#pragma omp target parallel for simd firstprivate(i) // expected-note {{defined as firstprivate}}
+  for (i = 0; i < argc; ++i) // expected-error {{loop iteration variable in the associated loop of 'omp target parallel for simd' directive may not be firstprivate, predetermined as private}}
+    foo();
+  static int si;
+#pragma omp target parallel for simd firstprivate(si) // OK
+  for (i = 0; i < argc; ++i)
+    si = i + 1;
+
+  return foomain<S4, S5>(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<S4, S5>' requested here}}
+}
diff --git a/test/OpenMP/target_parallel_for_simd_if_messages.cpp b/test/OpenMP/target_parallel_for_simd_if_messages.cpp
new file mode 100644
index 0000000000000..b9e2891575b24
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_simd_if_messages.cpp
@@ -0,0 +1,105 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, class S> // expected-note {{declared here}}
+int tmain(T argc, S **argv) {
+  int i;
+  #pragma omp target parallel for simd if // expected-error {{expected '(' after 'if'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if () // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if (argc > 0 ? argv[1] : argv[2])
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if (foobool(argc)), if (true) // expected-error {{directive '#pragma omp target parallel for simd' cannot contain more than one 'if' clause}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if (S) // expected-error {{'S' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if (argc argc) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if(argc)
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if(target : // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if(parallel : argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if(target : argc)
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if(parallel : argc)
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if(target : argc) if(parallel : argc)
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if(parallel : argc) if (for:argc) // expected-error {{directive name modifier 'for' is not allowed for '#pragma omp target parallel for simd'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if(target : argc) if (target :argc) // expected-error {{directive '#pragma omp target parallel for simd' cannot contain more than one 'if' clause with 'target' name modifier}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if(parallel : argc) if (parallel :argc) // expected-error {{directive '#pragma omp target parallel for simd' cannot contain more than one 'if' clause with 'parallel' name modifier}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if(target : argc) if (argc) // expected-error {{expected  'parallel' directive name modifier}} expected-note {{previous clause with directive name modifier specified here}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if(target : argc) if(parallel : argc) if (argc) // expected-error {{no more 'if' clause is allowed}} expected-note {{previous clause with directive name modifier specified here}} expected-note {{previous clause with directive name modifier specified here}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return 0;
+}
+
+int main(int argc, char **argv) {
+  int i;
+  #pragma omp target parallel for simd if // expected-error {{expected '(' after 'if'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if () // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if (argc > 0 ? argv[1] : argv[2])
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if (foobool(argc)), if (true) // expected-error {{directive '#pragma omp target parallel for simd' cannot contain more than one 'if' clause}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if (S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if (argc argc) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if (1 0) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if(if(tmain(argc, argv) // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if(target : // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if(parallel : argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if(parallel : argc)
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if(target : argc) if (for:argc) // expected-error {{directive name modifier 'for' is not allowed for '#pragma omp target parallel for simd'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if(target : argc) if (target :argc) // expected-error {{directive '#pragma omp target parallel for simd' cannot contain more than one 'if' clause with 'target' name modifier}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if(parallel : argc) if (parallel :argc) // expected-error {{directive '#pragma omp target parallel for simd' cannot contain more than one 'if' clause with 'parallel' name modifier}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if(target : argc) if (argc) // expected-error {{expected  'parallel' directive name modifier}} expected-note {{previous clause with directive name modifier specified here}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd if(target : argc) if(parallel : argc) if (argc) // expected-error {{no more 'if' clause is allowed}} expected-note {{previous clause with directive name modifier specified here}} expected-note {{previous clause with directive name modifier specified here}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return tmain(argc, argv);
+}
diff --git a/test/OpenMP/target_parallel_for_simd_lastprivate_messages.cpp b/test/OpenMP/target_parallel_for_simd_lastprivate_messages.cpp
new file mode 100644
index 0000000000000..51fc72464c9ab
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_simd_lastprivate_messages.cpp
@@ -0,0 +1,238 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note 2 {{declared here}} expected-note 2 {{forward declaration of 'S1'}}
+extern S1 a;
+class S2 {
+  mutable int a;
+
+public:
+  S2() : a(0) {}
+  S2(S2 &s2) : a(s2.a) {}
+  S2 &operator=(const S2 &);
+  const S2 &operator=(const S2 &) const;
+  static float S2s; // expected-note {{static data member is predetermined as shared}}
+  static const float S2sc;
+};
+const float S2::S2sc = 0; // expected-note {{static data member is predetermined as shared}}
+const S2 b;
+const S2 ba[5];
+class S3 {
+  int a;
+  S3 &operator=(const S3 &s3); // expected-note 2 {{implicitly declared private here}}
+
+public:
+  S3() : a(0) {}
+  S3(S3 &s3) : a(s3.a) {}
+};
+const S3 c;         // expected-note {{global variable is predetermined as shared}}
+const S3 ca[5];     // expected-note {{global variable is predetermined as shared}}
+extern const int f; // expected-note {{global variable is predetermined as shared}}
+class S4 {
+  int a;
+  S4();             // expected-note 3 {{implicitly declared private here}}
+  S4(const S4 &s4);
+
+public:
+  S4(int v) : a(v) {}
+};
+class S5 {
+  int a;
+  S5() : a(0) {} // expected-note {{implicitly declared private here}}
+
+public:
+  S5(const S5 &s5) : a(s5.a) {}
+  S5(int v) : a(v) {}
+};
+class S6 {
+  int a;
+  S6() : a(0) {}
+
+public:
+  S6(const S6 &s6) : a(s6.a) {}
+  S6(int v) : a(v) {}
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class I, class C>
+int foomain(int argc, char **argv) {
+  I e(4);
+  I g(5);
+  int i;
+  int &j = i;
+#pragma omp target parallel for simd lastprivate // expected-error {{expected '(' after 'lastprivate'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd lastprivate( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd lastprivate() // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd lastprivate(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd lastprivate(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd lastprivate(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd lastprivate(argc)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd lastprivate(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd lastprivate(a, b) // expected-error {{lastprivate variable with incomplete type 'S1'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd lastprivate(argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd lastprivate(e, g) // expected-error 2 {{calling a private constructor of class 'S4'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd lastprivate(h) // expected-error {{threadprivate or thread local variable cannot be lastprivate}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp parallel
+  {
+    int v = 0;
+    int i;
+#pragma omp target parallel for simd lastprivate(i)
+    for (int k = 0; k < argc; ++k) {
+      i = k;
+      v += i;
+    }
+  }
+#pragma omp parallel shared(i)
+#pragma omp parallel private(i)
+#pragma omp target parallel for simd lastprivate(j)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd lastprivate(i)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+  return 0;
+}
+
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
+int main(int argc, char **argv) {
+  const int d = 5;       // expected-note {{constant variable is predetermined as shared}}
+  const int da[5] = {0}; // expected-note {{constant variable is predetermined as shared}}
+  S4 e(4);
+  S5 g(5);
+  S3 m;
+  S6 n(2);
+  int i;
+  int &j = i;
+#pragma omp target parallel for simd lastprivate // expected-error {{expected '(' after 'lastprivate'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd lastprivate( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd lastprivate() // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd lastprivate(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd lastprivate(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd lastprivate(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd lastprivate(argc)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd lastprivate(S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd lastprivate(a, b, c, d, f) // expected-error {{lastprivate variable with incomplete type 'S1'}} expected-error 3 {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd lastprivate(argv[1]) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd lastprivate(2 * 2) // expected-error {{expected variable name}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd lastprivate(ba)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd lastprivate(ca) // expected-error {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd lastprivate(da) // expected-error {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+  int xa;
+#pragma omp target parallel for simd lastprivate(xa) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd lastprivate(S2::S2s) // expected-error {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd lastprivate(S2::S2sc) // expected-error {{shared variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd safelen(5) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd lastprivate(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd lastprivate(m) // expected-error {{'operator=' is a private member of 'S3'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd lastprivate(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be lastprivate}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd private(xa), lastprivate(xa) // expected-error {{private variable cannot be lastprivate}} expected-note {{defined as private}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd lastprivate(i)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp parallel private(xa)
+#pragma omp target parallel for simd lastprivate(xa)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp parallel reduction(+ : xa)
+#pragma omp target parallel for simd lastprivate(xa)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd lastprivate(j)
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd firstprivate(m) lastprivate(m) // expected-error {{'operator=' is a private member of 'S3'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd lastprivate(n) firstprivate(n) // OK
+  for (i = 0; i < argc; ++i)
+    foo();
+  static int si;
+#pragma omp target parallel for simd lastprivate(si) // OK
+  for (i = 0; i < argc; ++i)
+    si = i + 2;
+
+  return foomain<S4, S5>(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<S4, S5>' requested here}}
+}
diff --git a/test/OpenMP/target_parallel_for_simd_linear_messages.cpp b/test/OpenMP/target_parallel_for_simd_linear_messages.cpp
new file mode 100644
index 0000000000000..e17f15550e653
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_simd_linear_messages.cpp
@@ -0,0 +1,269 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+namespace X {
+int x;
+};
+
+struct B {
+  static int ib; // expected-note {{'B::ib' declared here}}
+  static int bfoo() { return 8; }
+};
+
+int bfoo() { return 4; }
+
+int z;
+const int C1 = 1;
+const int C2 = 2;
+void test_linear_colons() {
+  int B = 0;
+#pragma omp target parallel for simd linear(B : bfoo())
+  for (int i = 0; i < 10; ++i)
+    ;
+// expected-error@+1 {{unexpected ':' in nested name specifier; did you mean '::'}}
+#pragma omp target parallel for simd linear(B::ib : B : bfoo())
+  for (int i = 0; i < 10; ++i)
+    ;
+// expected-error@+1 {{use of undeclared identifier 'ib'; did you mean 'B::ib'}}
+#pragma omp target parallel for simd linear(B : ib)
+  for (int i = 0; i < 10; ++i)
+    ;
+// expected-error@+1 {{unexpected ':' in nested name specifier; did you mean '::'?}}
+#pragma omp target parallel for simd linear(z : B : ib)
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target parallel for simd linear(B : B::bfoo())
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target parallel for simd linear(X::x : ::z)
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target parallel for simd linear(B, ::z, X::x)
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target parallel for simd linear(::z)
+  for (int i = 0; i < 10; ++i)
+    ;
+// expected-error@+1 {{expected variable name}}
+#pragma omp target parallel for simd linear(B::bfoo())
+  for (int i = 0; i < 10; ++i)
+    ;
+#pragma omp target parallel for simd linear(B::ib, B : C1 + C2)
+  for (int i = 0; i < 10; ++i)
+    ;
+}
+
+template <int L, class T, class N>
+T test_template(T *arr, N num) {
+  N i;
+  T sum = (T)0;
+  T ind2 = -num * L; // expected-note {{'ind2' defined here}}
+// expected-error@+1 {{argument of a linear clause should be of integral or pointer type}}
+#pragma omp target parallel for simd linear(ind2 : L)
+  for (i = 0; i < num; ++i) {
+    T cur = arr[(int)ind2];
+    ind2 += L;
+    sum += cur;
+  }
+  return T();
+}
+
+template <int LEN>
+int test_warn() {
+  int ind2 = 0;
+// expected-warning@+1 {{zero linear step (ind2 should probably be const)}}
+#pragma omp target parallel for simd linear(ind2 : LEN)
+  for (int i = 0; i < 100; i++) {
+    ind2 += LEN;
+  }
+  return ind2;
+}
+
+struct S1; // expected-note 2 {{declared here}} expected-note 2 {{forward declaration of 'S1'}}
+extern S1 a;
+class S2 {
+  mutable int a;
+
+public:
+  S2() : a(0) {}
+};
+const S2 b; // expected-note 2 {{'b' defined here}}
+const S2 ba[5];
+class S3 {
+  int a;
+
+public:
+  S3() : a(0) {}
+};
+const S3 ca[5];
+class S4 {
+  int a;
+  S4();
+
+public:
+  S4(int v) : a(v) {}
+};
+class S5 {
+  int a;
+  S5() : a(0) {}
+
+public:
+  S5(int v) : a(v) {}
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class I, class C>
+int foomain(I argc, C **argv) {
+  I e(4);
+  I g(5);
+  int i;
+  int &j = i;
+#pragma omp target parallel for simd linear // expected-error {{expected '(' after 'linear'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd linear( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd linear() // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd linear(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd linear(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd linear(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd linear(argc : 5)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd linear(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+// expected-error@+2 {{linear variable with incomplete type 'S1'}}
+// expected-error@+1 {{const-qualified variable cannot be linear}}
+#pragma omp target parallel for simd linear(a, b : B::ib)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd linear(argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd linear(e, g)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd linear(h) // expected-error {{threadprivate or thread local variable cannot be linear}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd linear(i)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp parallel
+  {
+    int v = 0;
+    int i;
+#pragma omp target parallel for simd linear(v : i)
+    for (int k = 0; k < argc; ++k) {
+      i = k;
+      v += i;
+    }
+  }
+#pragma omp target parallel for simd linear(j)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+  int v = 0;
+#pragma omp target parallel for simd linear(v : j)
+  for (int k = 0; k < argc; ++k) {
+    ++k;
+    v += j;
+  }
+#pragma omp target parallel for simd linear(i)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+  return 0;
+}
+
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace C {
+using A::x;
+}
+
+int main(int argc, char **argv) {
+  double darr[100];
+  // expected-note@+1 {{in instantiation of function template specialization 'test_template<-4, double, int>' requested here}}
+  test_template<-4>(darr, 4);
+  // expected-note@+1 {{in instantiation of function template specialization 'test_warn<0>' requested here}}
+  test_warn<0>();
+
+  S4 e(4); // expected-note {{'e' defined here}}
+  S5 g(5); // expected-note {{'g' defined here}}
+  int i;
+  int &j = i;
+#pragma omp target parallel for simd linear // expected-error {{expected '(' after 'linear'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd linear( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd linear() // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd linear(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd linear(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd linear(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd linear(argc)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd linear(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+// expected-error@+2 {{linear variable with incomplete type 'S1'}}
+// expected-error@+1 {{const-qualified variable cannot be linear}}
+#pragma omp target parallel for simd linear(a, b)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd linear(argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+// expected-error@+2 {{argument of a linear clause should be of integral or pointer type, not 'S4'}}
+// expected-error@+1 {{argument of a linear clause should be of integral or pointer type, not 'S5'}}
+#pragma omp target parallel for simd linear(e, g)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd linear(h, C::x) // expected-error 2 {{threadprivate or thread local variable cannot be linear}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp parallel
+  {
+    int i;
+#pragma omp target parallel for simd linear(i)
+    for (int k = 0; k < argc; ++k)
+      ++k;
+#pragma omp target parallel for simd linear(i : 4)
+    for (int k = 0; k < argc; ++k) {
+      ++k;
+      i += 4;
+    }
+  }
+#pragma omp target parallel for simd linear(j)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd linear(i)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+
+  foomain<int, char>(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<int, char>' requested here}}
+  return 0;
+}
+
diff --git a/test/OpenMP/target_parallel_for_simd_loop_messages.cpp b/test/OpenMP/target_parallel_for_simd_loop_messages.cpp
new file mode 100644
index 0000000000000..c0dceeded4540
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_simd_loop_messages.cpp
@@ -0,0 +1,627 @@
+// RUN: %clang_cc1 -fsyntax-only -fopenmp -x c++ -std=c++11 -fexceptions -fcxx-exceptions -verify %s
+
+class S {
+  int a;
+  S() : a(0) {}
+
+public:
+  S(int v) : a(v) {}
+  S(const S &s) : a(s.a) {}
+};
+
+static int sii;
+// expected-note@+1 {{defined as threadprivate or thread local}}
+#pragma omp threadprivate(sii)
+static int globalii;
+
+int test_iteration_spaces() {
+  const int N = 100;
+  float a[N], b[N], c[N];
+  int ii, jj, kk;
+  float fii;
+  double dii;
+#pragma omp target parallel for simd
+  for (int i = 0; i < 10; i += 1) {
+    c[i] = a[i] + b[i];
+  }
+#pragma omp target parallel for simd
+  for (char i = 0; i < 10; i++) {
+    c[i] = a[i] + b[i];
+  }
+#pragma omp target parallel for simd
+  for (char i = 0; i < 10; i += '\1') {
+    c[i] = a[i] + b[i];
+  }
+#pragma omp target parallel for simd
+  for (long long i = 0; i < 10; i++) {
+    c[i] = a[i] + b[i];
+  }
+// expected-error@+2 {{expression must have integral or unscoped enumeration type, not 'double'}}
+#pragma omp target parallel for simd
+  for (long long i = 0; i < 10; i += 1.5) {
+    c[i] = a[i] + b[i];
+  }
+#pragma omp target parallel for simd
+  for (long long i = 0; i < 'z'; i += 1u) {
+    c[i] = a[i] + b[i];
+  }
+// expected-error@+2 {{variable must be of integer or random access iterator type}}
+#pragma omp target parallel for simd
+  for (float fi = 0; fi < 10.0; fi++) {
+    c[(int)fi] = a[(int)fi] + b[(int)fi];
+  }
+// expected-error@+2 {{variable must be of integer or random access iterator type}}
+#pragma omp target parallel for simd
+  for (double fi = 0; fi < 10.0; fi++) {
+    c[(int)fi] = a[(int)fi] + b[(int)fi];
+  }
+// expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for simd
+  for (int &ref = ii; ref < 10; ref++) {
+  }
+// expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for simd
+  for (int i; i < 10; i++)
+    c[i] = a[i];
+
+// expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for simd
+  for (int i = 0, j = 0; i < 10; ++i)
+    c[i] = a[i];
+
+// expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for simd
+  for (; ii < 10; ++ii)
+    c[ii] = a[ii];
+
+// expected-warning@+3 {{expression result unused}}
+// expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for simd
+  for (ii + 1; ii < 10; ++ii)
+    c[ii] = a[ii];
+
+// expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for simd
+  for (c[ii] = 0; ii < 10; ++ii)
+    c[ii] = a[ii];
+
+// Ok to skip parenthesises.
+#pragma omp target parallel for simd
+  for (((ii)) = 0; ii < 10; ++ii)
+    c[ii] = a[ii];
+
+// expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'i'}}
+#pragma omp target parallel for simd
+  for (int i = 0; i; i++)
+    c[i] = a[i];
+
+// expected-error@+3 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'i'}}
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'i'}}
+#pragma omp target parallel for simd
+  for (int i = 0; jj < kk; ii++)
+    c[i] = a[i];
+
+// expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'i'}}
+#pragma omp target parallel for simd
+  for (int i = 0; !!i; i++)
+    c[i] = a[i];
+
+// expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'i'}}
+#pragma omp target parallel for simd
+  for (int i = 0; i != 1; i++)
+    c[i] = a[i];
+
+// expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'i'}}
+#pragma omp target parallel for simd
+  for (int i = 0;; i++)
+    c[i] = a[i];
+
+// Ok.
+#pragma omp target parallel for simd
+  for (int i = 11; i > 10; i--)
+    c[i] = a[i];
+
+// Ok.
+#pragma omp target parallel for simd
+  for (int i = 0; i < 10; ++i)
+    c[i] = a[i];
+
+// Ok.
+#pragma omp target parallel for simd
+  for (ii = 0; ii < 10; ++ii)
+    c[ii] = a[ii];
+
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+#pragma omp target parallel for simd
+  for (ii = 0; ii < 10; ++jj)
+    c[ii] = a[jj];
+
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+#pragma omp target parallel for simd
+  for (ii = 0; ii < 10; ++++ii)
+    c[ii] = a[ii];
+
+// Ok but undefined behavior (in general, cannot check that incr
+// is really loop-invariant).
+#pragma omp target parallel for simd
+  for (ii = 0; ii < 10; ii = ii + ii)
+    c[ii] = a[ii];
+
+// expected-error@+2 {{expression must have integral or unscoped enumeration type, not 'float'}}
+#pragma omp target parallel for simd
+  for (ii = 0; ii < 10; ii = ii + 1.0f)
+    c[ii] = a[ii];
+
+// Ok - step was converted to integer type.
+#pragma omp target parallel for simd
+  for (ii = 0; ii < 10; ii = ii + (int)1.1f)
+    c[ii] = a[ii];
+
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+#pragma omp target parallel for simd
+  for (ii = 0; ii < 10; jj = ii + 2)
+    c[ii] = a[ii];
+
+// expected-warning@+3 {{relational comparison result unused}}
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+#pragma omp target parallel for simd
+  for (ii = 0; ii<10; jj> kk + 2)
+    c[ii] = a[ii];
+
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+#pragma omp target parallel for simd
+  for (ii = 0; ii < 10;)
+    c[ii] = a[ii];
+
+// expected-warning@+3 {{expression result unused}}
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+#pragma omp target parallel for simd
+  for (ii = 0; ii < 10; !ii)
+    c[ii] = a[ii];
+
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+#pragma omp target parallel for simd
+  for (ii = 0; ii < 10; ii ? ++ii : ++jj)
+    c[ii] = a[ii];
+
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'ii'}}
+#pragma omp target parallel for simd
+  for (ii = 0; ii < 10; ii = ii < 10)
+    c[ii] = a[ii];
+
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+#pragma omp target parallel for simd
+  for (ii = 0; ii < 10; ii = ii + 0)
+    c[ii] = a[ii];
+
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+#pragma omp target parallel for simd
+  for (ii = 0; ii < 10; ii = ii + (int)(0.8 - 0.45))
+    c[ii] = a[ii];
+
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+#pragma omp target parallel for simd
+  for (ii = 0; (ii) < 10; ii -= 25)
+    c[ii] = a[ii];
+
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+#pragma omp target parallel for simd
+  for (ii = 0; (ii < 10); ii -= 0)
+    c[ii] = a[ii];
+
+// expected-note@+3 {{loop step is expected to be negative due to this condition}}
+// expected-error@+2 {{increment expression must cause 'ii' to decrease on each iteration of OpenMP for loop}}
+#pragma omp target parallel for simd
+  for (ii = 0; ii > 10; (ii += 0))
+    c[ii] = a[ii];
+
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+#pragma omp target parallel for simd
+  for (ii = 0; ii < 10; (ii) = (1 - 1) + (ii))
+    c[ii] = a[ii];
+
+// expected-note@+3 {{loop step is expected to be negative due to this condition}}
+// expected-error@+2 {{increment expression must cause 'ii' to decrease on each iteration of OpenMP for loop}}
+#pragma omp target parallel for simd
+  for ((ii = 0); ii > 10; (ii -= 0))
+    c[ii] = a[ii];
+
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'ii' to increase on each iteration of OpenMP for loop}}
+#pragma omp target parallel for simd
+  for (ii = 0; (ii < 10); (ii -= 0))
+    c[ii] = a[ii];
+
+// expected-note@+2  {{defined as firstprivate}}
+// expected-error@+2 {{loop iteration variable in the associated loop of 'omp target parallel for simd' directive may not be firstprivate, predetermined as private}}
+#pragma omp target parallel for simd firstprivate(ii)
+  for (ii = 0; ii < 10; ii++)
+    c[ii] = a[ii];
+
+// expected-note@+2  {{defined as linear}}
+// expected-error@+2 {{loop iteration variable in the associated loop of 'omp target parallel for simd' directive may not be linear, predetermined as private}}
+#pragma omp target parallel for simd linear(ii)
+  for (ii = 0; ii < 10; ii++)
+    c[ii] = a[ii];
+
+#pragma omp target parallel for simd private(ii)
+  for (ii = 0; ii < 10; ii++)
+    c[ii] = a[ii];
+
+#pragma omp target parallel for simd lastprivate(ii)
+  for (ii = 0; ii < 10; ii++)
+    c[ii] = a[ii];
+
+  {
+// expected-error@+2 {{loop iteration variable in the associated loop of 'omp target parallel for simd' directive may not be threadprivate or thread local, predetermined as private}}
+#pragma omp target parallel for simd
+    for (sii = 0; sii < 10; sii += 1)
+      c[sii] = a[sii];
+  }
+
+  {
+#pragma omp target parallel for simd
+    for (globalii = 0; globalii < 10; globalii += 1)
+      c[globalii] = a[globalii];
+  }
+
+  {
+#pragma omp target parallel for simd collapse(2)
+    for (ii = 0; ii < 10; ii += 1)
+    for (globalii = 0; globalii < 10; globalii += 1)
+      c[globalii] += a[globalii] + ii;
+  }
+
+// expected-error@+2 {{statement after '#pragma omp target parallel for simd' must be a for loop}}
+#pragma omp target parallel for simd
+  for (auto &item : a) {
+    item = item + 1;
+  }
+
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'i' to increase on each iteration of OpenMP for loop}}
+#pragma omp target parallel for simd
+  for (unsigned i = 9; i < 10; i--) {
+    c[i] = a[i] + b[i];
+  }
+
+  int(*lb)[4] = nullptr;
+#pragma omp target parallel for simd
+  for (int(*p)[4] = lb; p < lb + 8; ++p) {
+  }
+
+// expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for simd
+  for (int a{0}; a < 10; ++a) {
+  }
+
+  return 0;
+}
+
+// Iterators allowed in openmp for-loops.
+namespace std {
+struct random_access_iterator_tag {};
+template <class Iter>
+struct iterator_traits {
+  typedef typename Iter::difference_type difference_type;
+  typedef typename Iter::iterator_category iterator_category;
+};
+template <class Iter>
+typename iterator_traits<Iter>::difference_type
+distance(Iter first, Iter last) { return first - last; }
+}
+class Iter0 {
+public:
+  Iter0() {}
+  Iter0(const Iter0 &) {}
+  Iter0 operator++() { return *this; }
+  Iter0 operator--() { return *this; }
+  bool operator<(Iter0 a) { return true; }
+};
+// expected-note@+2 {{candidate function not viable: no known conversion from 'GoodIter' to 'Iter0' for 1st argument}}
+// expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'Iter0' for 1st argument}}
+int operator-(Iter0 a, Iter0 b) { return 0; }
+class Iter1 {
+public:
+  Iter1(float f = 0.0f, double d = 0.0) {}
+  Iter1(const Iter1 &) {}
+  Iter1 operator++() { return *this; }
+  Iter1 operator--() { return *this; }
+  bool operator<(Iter1 a) { return true; }
+  bool operator>=(Iter1 a) { return false; }
+};
+class GoodIter {
+public:
+  GoodIter() {}
+  GoodIter(const GoodIter &) {}
+  GoodIter(int fst, int snd) {}
+  GoodIter &operator=(const GoodIter &that) { return *this; }
+  GoodIter &operator=(const Iter0 &that) { return *this; }
+  GoodIter &operator+=(int x) { return *this; }
+  GoodIter &operator-=(int x) { return *this; }
+  explicit GoodIter(void *) {}
+  GoodIter operator++() { return *this; }
+  GoodIter operator--() { return *this; }
+  bool operator!() { return true; }
+  bool operator<(GoodIter a) { return true; }
+  bool operator<=(GoodIter a) { return true; }
+  bool operator>=(GoodIter a) { return false; }
+  typedef int difference_type;
+  typedef std::random_access_iterator_tag iterator_category;
+};
+// expected-note@+2 {{candidate function not viable: no known conversion from 'const Iter0' to 'GoodIter' for 2nd argument}}
+// expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'GoodIter' for 1st argument}}
+int operator-(GoodIter a, GoodIter b) { return 0; }
+// expected-note@+1 3 {{candidate function not viable: requires single argument 'a', but 2 arguments were provided}}
+GoodIter operator-(GoodIter a) { return a; }
+// expected-note@+2 {{candidate function not viable: no known conversion from 'const Iter0' to 'int' for 2nd argument}}
+// expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'GoodIter' for 1st argument}}
+GoodIter operator-(GoodIter a, int v) { return GoodIter(); }
+// expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter0' to 'GoodIter' for 1st argument}}
+GoodIter operator+(GoodIter a, int v) { return GoodIter(); }
+// expected-note@+2 {{candidate function not viable: no known conversion from 'GoodIter' to 'int' for 1st argument}}
+// expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'int' for 1st argument}}
+GoodIter operator-(int v, GoodIter a) { return GoodIter(); }
+// expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter0' to 'int' for 1st argument}}
+GoodIter operator+(int v, GoodIter a) { return GoodIter(); }
+
+int test_with_random_access_iterator() {
+  GoodIter begin, end;
+  Iter0 begin0, end0;
+#pragma omp target parallel for simd
+  for (GoodIter I = begin; I < end; ++I)
+    ++I;
+// expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for simd
+  for (GoodIter &I = begin; I < end; ++I)
+    ++I;
+#pragma omp target parallel for simd
+  for (GoodIter I = begin; I >= end; --I)
+    ++I;
+// expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for simd
+  for (GoodIter I(begin); I < end; ++I)
+    ++I;
+// expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for simd
+  for (GoodIter I(nullptr); I < end; ++I)
+    ++I;
+// expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for simd
+  for (GoodIter I(0); I < end; ++I)
+    ++I;
+// expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for simd
+  for (GoodIter I(1, 2); I < end; ++I)
+    ++I;
+#pragma omp target parallel for simd
+  for (begin = GoodIter(0); begin < end; ++begin)
+    ++begin;
+// expected-error@+3 {{invalid operands to binary expression ('GoodIter' and 'const Iter0')}}
+// expected-error@+2 {{could not calculate number of iterations calling 'operator-' with upper and lower loop bounds}}
+#pragma omp target parallel for simd
+  for (begin = begin0; begin < end; ++begin)
+    ++begin;
+// expected-error@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for simd
+  for (++begin; begin < end; ++begin)
+    ++begin;
+#pragma omp target parallel for simd
+  for (begin = end; begin < end; ++begin)
+    ++begin;
+// expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'I'}}
+#pragma omp target parallel for simd
+  for (GoodIter I = begin; I - I; ++I)
+    ++I;
+// expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'I'}}
+#pragma omp target parallel for simd
+  for (GoodIter I = begin; begin < end; ++I)
+    ++I;
+// expected-error@+2 {{condition of OpenMP for loop must be a relational comparison ('<', '<=', '>', or '>=') of loop variable 'I'}}
+#pragma omp target parallel for simd
+  for (GoodIter I = begin; !I; ++I)
+    ++I;
+// expected-note@+3 {{loop step is expected to be negative due to this condition}}
+// expected-error@+2 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+#pragma omp target parallel for simd
+  for (GoodIter I = begin; I >= end; I = I + 1)
+    ++I;
+#pragma omp target parallel for simd
+  for (GoodIter I = begin; I >= end; I = I - 1)
+    ++I;
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'I'}}
+#pragma omp target parallel for simd
+  for (GoodIter I = begin; I >= end; I = -I)
+    ++I;
+// expected-note@+3 {{loop step is expected to be negative due to this condition}}
+// expected-error@+2 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+#pragma omp target parallel for simd
+  for (GoodIter I = begin; I >= end; I = 2 + I)
+    ++I;
+// expected-error@+2 {{increment clause of OpenMP for loop must perform simple addition or subtraction on loop variable 'I'}}
+#pragma omp target parallel for simd
+  for (GoodIter I = begin; I >= end; I = 2 - I)
+    ++I;
+// expected-error@+2 {{invalid operands to binary expression ('Iter0' and 'int')}}
+#pragma omp target parallel for simd
+  for (Iter0 I = begin0; I < end0; ++I)
+    ++I;
+// Initializer is constructor without params.
+// expected-error@+3 {{invalid operands to binary expression ('Iter0' and 'int')}}
+// expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for simd
+  for (Iter0 I; I < end0; ++I)
+    ++I;
+  Iter1 begin1, end1;
+// expected-error@+3 {{invalid operands to binary expression ('Iter1' and 'Iter1')}}
+// expected-error@+2 {{could not calculate number of iterations calling 'operator-' with upper and lower loop bounds}}
+#pragma omp target parallel for simd
+  for (Iter1 I = begin1; I < end1; ++I)
+    ++I;
+// expected-note@+3 {{loop step is expected to be negative due to this condition}}
+// expected-error@+2 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+#pragma omp target parallel for simd
+  for (Iter1 I = begin1; I >= end1; ++I)
+    ++I;
+// expected-error@+5 {{invalid operands to binary expression ('Iter1' and 'float')}}
+// expected-error@+4 {{could not calculate number of iterations calling 'operator-' with upper and lower loop bounds}}
+// Initializer is constructor with all default params.
+// expected-warning@+2 {{initialization clause of OpenMP for loop is not in canonical form ('var = init' or 'T var = init')}}
+#pragma omp target parallel for simd
+  for (Iter1 I; I < end1; ++I) {
+  }
+  return 0;
+}
+
+template <typename IT, int ST>
+class TC {
+public:
+  int dotest_lt(IT begin, IT end) {
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'I' to increase on each iteration of OpenMP for loop}}
+#pragma omp target parallel for simd
+    for (IT I = begin; I < end; I = I + ST) {
+      ++I;
+    }
+// expected-note@+3 {{loop step is expected to be positive due to this condition}}
+// expected-error@+2 {{increment expression must cause 'I' to increase on each iteration of OpenMP for loop}}
+#pragma omp target parallel for simd
+    for (IT I = begin; I <= end; I += ST) {
+      ++I;
+    }
+#pragma omp target parallel for simd
+    for (IT I = begin; I < end; ++I) {
+      ++I;
+    }
+  }
+
+  static IT step() {
+    return IT(ST);
+  }
+};
+template <typename IT, int ST = 0>
+int dotest_gt(IT begin, IT end) {
+// expected-note@+3 2 {{loop step is expected to be negative due to this condition}}
+// expected-error@+2 2 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+#pragma omp target parallel for simd
+  for (IT I = begin; I >= end; I = I + ST) {
+    ++I;
+  }
+// expected-note@+3 2 {{loop step is expected to be negative due to this condition}}
+// expected-error@+2 2 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+#pragma omp target parallel for simd
+  for (IT I = begin; I >= end; I += ST) {
+    ++I;
+  }
+
+// expected-note@+3 {{loop step is expected to be negative due to this condition}}
+// expected-error@+2 {{increment expression must cause 'I' to decrease on each iteration of OpenMP for loop}}
+#pragma omp target parallel for simd
+  for (IT I = begin; I >= end; ++I) {
+    ++I;
+  }
+
+#pragma omp target parallel for simd
+  for (IT I = begin; I < end; I += TC<int, ST>::step()) {
+    ++I;
+  }
+}
+
+void test_with_template() {
+  GoodIter begin, end;
+  TC<GoodIter, 100> t1;
+  TC<GoodIter, -100> t2;
+  t1.dotest_lt(begin, end);
+  t2.dotest_lt(begin, end);         // expected-note {{in instantiation of member function 'TC<GoodIter, -100>::dotest_lt' requested here}}
+  dotest_gt(begin, end);            // expected-note {{in instantiation of function template specialization 'dotest_gt<GoodIter, 0>' requested here}}
+  dotest_gt<unsigned, -10>(0, 100); // expected-note {{in instantiation of function template specialization 'dotest_gt<unsigned int, -10>' requested here}}
+}
+
+void test_loop_break() {
+  const int N = 100;
+  float a[N], b[N], c[N];
+#pragma omp target parallel for simd
+  for (int i = 0; i < 10; i++) {
+    c[i] = a[i] + b[i];
+    for (int j = 0; j < 10; ++j) {
+      if (a[i] > b[j])
+        break; // OK in nested loop
+    }
+    switch (i) {
+    case 1:
+      b[i]++;
+      break;
+    default:
+      break;
+    }
+    if (c[i] > 10)
+      break; // expected-error {{'break' statement cannot be used in OpenMP for loop}}
+
+    if (c[i] > 11)
+      break; // expected-error {{'break' statement cannot be used in OpenMP for loop}}
+  }
+
+#pragma omp target parallel for simd
+  for (int i = 0; i < 10; i++) {
+    for (int j = 0; j < 10; j++) {
+      c[i] = a[i] + b[i];
+      if (c[i] > 10) {
+        if (c[i] < 20) {
+          break; // OK
+        }
+      }
+    }
+  }
+}
+
+void test_loop_eh() {
+  const int N = 100;
+  float a[N], b[N], c[N];
+#pragma omp target parallel for simd
+  for (int i = 0; i < 10; i++) {
+    c[i] = a[i] + b[i];
+    try {
+      for (int j = 0; j < 10; ++j) {
+        if (a[i] > b[j])
+          throw a[i];
+      }
+      throw a[i];
+    } catch (float f) {
+      if (f > 0.1)
+        throw a[i];
+      return; // expected-error {{cannot return from OpenMP region}}
+    }
+    switch (i) {
+    case 1:
+      b[i]++;
+      break;
+    default:
+      break;
+    }
+    for (int j = 0; j < 10; j++) {
+      if (c[i] > 10)
+        throw c[i];
+    }
+  }
+  if (c[9] > 10)
+    throw c[9]; // OK
+
+#pragma omp target parallel for simd
+  for (int i = 0; i < 10; ++i) {
+    struct S {
+      void g() { throw 0; }
+    };
+  }
+}
+
+void test_loop_firstprivate_lastprivate() {
+  S s(4);
+#pragma omp target parallel for simd lastprivate(s) firstprivate(s)
+  for (int i = 0; i < 16; ++i)
+    ;
+}
diff --git a/test/OpenMP/target_parallel_for_simd_map_messages.cpp b/test/OpenMP/target_parallel_for_simd_map_messages.cpp
new file mode 100644
index 0000000000000..f44639e86cab2
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_simd_map_messages.cpp
@@ -0,0 +1,273 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note 2 {{declared here}}
+extern S1 a;
+class S2 {
+  mutable int a;
+public:
+  S2():a(0) { }
+  S2(S2 &s2):a(s2.a) { }
+  static float S2s; // expected-note 4 {{mappable type cannot contain static members}}
+  static const float S2sc; // expected-note 4 {{mappable type cannot contain static members}}
+};
+const float S2::S2sc = 0;
+const S2 b;
+const S2 ba[5];
+class S3 {
+  int a;
+public:
+  S3():a(0) { }
+  S3(S3 &s3):a(s3.a) { }
+};
+const S3 c;
+const S3 ca[5];
+extern const int f;
+class S4 {
+  int a;
+  S4();
+  S4(const S4 &s4);
+public:
+  S4(int v):a(v) { }
+};
+class S5 {
+  int a;
+  S5():a(0) {}
+  S5(const S5 &s5):a(s5.a) { }
+public:
+  S5(int v):a(v) { }
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+typedef int from;
+
+template <typename T, int I> // expected-note {{declared here}}
+T tmain(T argc) {
+  const T d = 5;
+  const T da[5] = { 0 };
+  S4 e(4);
+  S5 g(5);
+  T i, t[20];
+  T &j = i;
+  T *k = &j;
+  T x;
+  T y;
+  T to, tofrom, always;
+  const T (&l)[5] = da;
+
+
+#pragma omp target parallel for simd map // expected-error {{expected '(' after 'map'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map() // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(alloc) // expected-error {{use of undeclared identifier 'alloc'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(to argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected ',' or ')' in 'map' clause}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(to:) // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(from: argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(x: y) // expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(x)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(tofrom: t[:I])
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(T: a) // expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}} expected-error {{incomplete type 'S1' where a complete type is required}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(T) // expected-error {{'T' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(I) // expected-error 2 {{expected expression containing only member accesses and/or array sections based on named variables}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(S2::S2s)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(S2::S2sc)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(x)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(to: x)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(to: to)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(to)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(to, x)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(to x) // expected-error {{expected ',' or ')' in 'map' clause}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(tofrom: argc > 0 ? x : y) // expected-error 2 {{expected expression containing only member accesses and/or array sections based on named variables}} 
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(argc)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(a, b, c, d, f) // expected-error {{incomplete type 'S1' where a complete type is required}} expected-error 2 {{type 'S2' is not mappable to target}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(ba) // expected-error 2 {{type 'S2' is not mappable to target}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(ca)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(da)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(S2::S2s)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(S2::S2sc)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(e, g)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(h) // expected-error {{threadprivate variables are not allowed in 'map' clause}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(k), map(k) // expected-error 2 {{variable already marked as mapped in current construct}} expected-note 2 {{used here}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(k), map(k[:5]) // expected-error 2 {{pointer cannot be mapped along with a section derived from itself}} expected-note 2 {{used here}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(da)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(da[:4])
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target data map(k, j, l) // expected-note 2 {{used here}}
+#pragma omp target parallel for simd map(k[:4]) // expected-error 2 {{pointer cannot be mapped along with a section derived from itself}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(j)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(l) map(l[:5]) // expected-error 2 {{variable already marked as mapped in current construct}} expected-note 2 {{used here}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target data map(k[:4], j, l[:5]) // expected-note 4 {{used here}}
+{
+#pragma omp target parallel for simd map(k) // expected-error 2 {{pointer cannot be mapped along with a section derived from itself}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(j)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(l) // expected-error 2 {{original storage of expression in data environment is shared but data environment do not fully contain mapped expression storage}}
+  for (i = 0; i < argc; ++i) foo();
+}
+
+#pragma omp target parallel for simd map(always, tofrom: x)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(always: x) // expected-error {{missing map type}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(tofrom, always: x) // expected-error {{incorrect map type modifier, expected 'always'}} expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(always, tofrom: always, tofrom, x)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(tofrom j) // expected-error {{expected ',' or ')' in 'map' clause}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return 0;
+}
+
+int main(int argc, char **argv) {
+  const int d = 5;
+  const int da[5] = { 0 };
+  S4 e(4);
+  S5 g(5);
+  int i;
+  int &j = i;
+  int *k = &j;
+  int x;
+  int y;
+  int to, tofrom, always;
+  const int (&l)[5] = da;
+
+#pragma omp target parallel for simd map // expected-error {{expected '(' after 'map'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map() // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(alloc) // expected-error {{use of undeclared identifier 'alloc'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(to argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected ',' or ')' in 'map' clause}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(to:) // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(from: argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(x: y) // expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(x)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(to: x)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(to: to)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(to)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(to, x)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(to x) // expected-error {{expected ',' or ')' in 'map' clause}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(tofrom: argc > 0 ? argv[1] : argv[2]) // expected-error {{expected expression containing only member accesses and/or array sections based on named variables}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(argc)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(a, b, c, d, f) // expected-error {{incomplete type 'S1' where a complete type is required}} expected-error 2 {{type 'S2' is not mappable to target}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(argv[1])
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(ba) // expected-error 2 {{type 'S2' is not mappable to target}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(ca)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(da)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(S2::S2s)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(S2::S2sc)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(e, g)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(h) // expected-error {{threadprivate variables are not allowed in 'map' clause}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(k), map(k) // expected-error {{variable already marked as mapped in current construct}} expected-note {{used here}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(k), map(k[:5]) // expected-error {{pointer cannot be mapped along with a section derived from itself}} expected-note {{used here}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(da)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(da[:4])
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target data map(k, j, l) // expected-note {{used here}}
+#pragma omp target parallel for simd map(k[:4]) // expected-error {{pointer cannot be mapped along with a section derived from itself}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(j)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(l) map(l[:5]) // expected-error 1 {{variable already marked as mapped in current construct}} expected-note 1 {{used here}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target data map(k[:4], j, l[:5]) // expected-note 2 {{used here}}
+{
+#pragma omp target parallel for simd map(k) // expected-error {{pointer cannot be mapped along with a section derived from itself}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(j)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(l) // expected-error {{original storage of expression in data environment is shared but data environment do not fully contain mapped expression storage}}
+  for (i = 0; i < argc; ++i) foo();
+}
+
+#pragma omp target parallel for simd map(always, tofrom: x)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(always: x) // expected-error {{missing map type}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(tofrom, always: x) // expected-error {{incorrect map type modifier, expected 'always'}} expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}}
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(always, tofrom: always, tofrom, x)
+  for (i = 0; i < argc; ++i) foo();
+#pragma omp target parallel for simd map(tofrom j) // expected-error {{expected ',' or ')' in 'map' clause}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return tmain<int, 3>(argc)+tmain<from, 4>(argc); // expected-note {{in instantiation of function template specialization 'tmain<int, 3>' requested here}} expected-note {{in instantiation of function template specialization 'tmain<int, 4>' requested here}}
+}
+
diff --git a/test/OpenMP/target_parallel_for_simd_messages.cpp b/test/OpenMP/target_parallel_for_simd_messages.cpp
new file mode 100644
index 0000000000000..0e1a0fea7e619
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_simd_messages.cpp
@@ -0,0 +1,92 @@
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
+
+void foo() {
+}
+
+static int pvt;
+#pragma omp threadprivate(pvt)
+
+#pragma omp target parallel for simd // expected-error {{unexpected OpenMP directive '#pragma omp target parallel for simd'}}
+
+int main(int argc, char **argv) {
+#pragma omp target parallel for simd { // expected-warning {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+  for (int i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd ( // expected-warning {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+  for (int i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd[ // expected-warning {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+  for (int i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd] // expected-warning {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+  for (int i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+  for (int i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd } // expected-warning {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+  for (int i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd
+  for (int i = 0; i < argc; ++i)
+    foo();
+// expected-warning@+1 {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+#pragma omp target parallel for simd unknown()
+  for (int i = 0; i < argc; ++i)
+    foo();
+L1:
+  for (int i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd
+  for (int i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd
+  for (int i = 0; i < argc; ++i) {
+    goto L1; // expected-error {{use of undeclared label 'L1'}}
+    argc++;
+  }
+
+  for (int i = 0; i < 10; ++i) {
+    switch (argc) {
+    case (0):
+#pragma omp target parallel for simd
+      for (int i = 0; i < argc; ++i) {
+        foo();
+        break; // expected-error {{'break' statement cannot be used in OpenMP for loop}}
+        continue;
+      }
+    default:
+      break;
+    }
+  }
+#pragma omp target parallel for simd default(none)
+  for (int i = 0; i < 10; ++i)
+    ++argc; // expected-error {{variable 'argc' must have explicitly specified data sharing attributes}}
+
+  goto L2; // expected-error {{use of undeclared label 'L2'}}
+#pragma omp target parallel for simd
+  for (int i = 0; i < argc; ++i)
+  L2:
+  foo();
+#pragma omp target parallel for simd
+  for (int i = 0; i < argc; ++i) {
+    return 1; // expected-error {{cannot return from OpenMP region}}
+  }
+
+  [[]] // expected-error {{an attribute list cannot appear here}}
+#pragma omp target parallel for simd
+      for (int n = 0; n < 100; ++n) {
+  }
+
+#pragma omp target parallel for simd copyin(pvt) // expected-error {{unexpected OpenMP clause 'copyin' in directive '#pragma omp target parallel for simd'}}
+  for (int n = 0; n < 100; ++n) {}
+
+  return 0;
+}
+
+void test_ordered() {
+#pragma omp target parallel for simd ordered ordered // expected-error {{directive '#pragma omp target parallel for simd' cannot contain more than one 'ordered' clause}}
+  for (int i = 0; i < 16; ++i)
+    ;
+}
+
diff --git a/test/OpenMP/target_parallel_for_simd_misc_messages.c b/test/OpenMP/target_parallel_for_simd_misc_messages.c
new file mode 100644
index 0000000000000..2adc6f85116e6
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_simd_misc_messages.c
@@ -0,0 +1,495 @@
+// RUN: %clang_cc1 -fsyntax-only -fopenmp -verify %s
+
+// expected-error@+1 {{unexpected OpenMP directive '#pragma omp target parallel for simd'}}
+#pragma omp target parallel for simd
+
+// expected-error@+1 {{unexpected OpenMP directive '#pragma omp target parallel for simd'}}
+#pragma omp target parallel for simd foo
+
+void test_no_clause() {
+  int i;
+#pragma omp target parallel for simd
+  for (i = 0; i < 16; ++i)
+    ;
+
+// expected-error@+2 {{statement after '#pragma omp target parallel for simd' must be a for loop}}
+#pragma omp target parallel for simd
+  ++i;
+}
+
+void test_branch_protected_scope() {
+  int i = 0;
+L1:
+  ++i;
+
+  int x[24];
+
+#pragma omp target parallel for simd
+  for (i = 0; i < 16; ++i) {
+    if (i == 5)
+      goto L1; // expected-error {{use of undeclared label 'L1'}}
+    else if (i == 6)
+      return; // expected-error {{cannot return from OpenMP region}}
+    else if (i == 7)
+      goto L2;
+    else if (i == 8) {
+    L2:
+      x[i]++;
+    }
+  }
+
+  if (x[0] == 0)
+    goto L2; // expected-error {{use of undeclared label 'L2'}}
+  else if (x[1] == 1)
+    goto L1;
+}
+
+void test_invalid_clause() {
+  int i;
+// expected-warning@+1 {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+#pragma omp target parallel for simd foo bar
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+void test_non_identifiers() {
+  int i, x;
+
+// expected-warning@+1 {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+#pragma omp target parallel for simd;
+  for (i = 0; i < 16; ++i)
+    ;
+
+// expected-warning@+1 {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+#pragma omp target parallel for simd private(x);
+  for (i = 0; i < 16; ++i)
+    ;
+
+// expected-warning@+1 {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+#pragma omp target parallel for simd, private(x);
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+extern int foo();
+
+void test_collapse() {
+  int i;
+// expected-error@+1 {{expected '('}}
+#pragma omp target parallel for simd collapse
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp target parallel for simd collapse(
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}}
+#pragma omp target parallel for simd collapse()
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp target parallel for simd collapse(,
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}}  expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp target parallel for simd collapse(, )
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-warning@+2 {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+// expected-error@+1 {{expected '('}}
+#pragma omp target parallel for simd collapse 4)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}} expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp target parallel for simd collapse(4
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp target parallel for simd', but found only 1}}
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}} expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp target parallel for simd collapse(4,
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp target parallel for simd', but found only 1}}
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}} expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp target parallel for simd collapse(4, )
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp target parallel for simd', but found only 1}}
+// expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp target parallel for simd collapse(4)
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp target parallel for simd', but found only 1}}
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}} expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp target parallel for simd collapse(4 4)
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp target parallel for simd', but found only 1}}
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}} expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp target parallel for simd collapse(4, , 4)
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp target parallel for simd', but found only 1}}
+#pragma omp target parallel for simd collapse(4)
+  for (int i1 = 0; i1 < 16; ++i1)
+    for (int i2 = 0; i2 < 16; ++i2)
+      for (int i3 = 0; i3 < 16; ++i3)
+        for (int i4 = 0; i4 < 16; ++i4)
+          foo();
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}} expected-note@+1 {{as specified in 'collapse' clause}}
+#pragma omp target parallel for simd collapse(4, 8)
+  for (i = 0; i < 16; ++i)
+    ; // expected-error {{expected 4 for loops after '#pragma omp target parallel for simd', but found only 1}}
+// expected-error@+1 {{expression is not an integer constant expression}}
+#pragma omp target parallel for simd collapse(2.5)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expression is not an integer constant expression}}
+#pragma omp target parallel for simd collapse(foo())
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{argument to 'collapse' clause must be a strictly positive integer value}}
+#pragma omp target parallel for simd collapse(-5)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{argument to 'collapse' clause must be a strictly positive integer value}}
+#pragma omp target parallel for simd collapse(0)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{argument to 'collapse' clause must be a strictly positive integer value}}
+#pragma omp target parallel for simd collapse(5 - 5)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-note@+1 {{defined as firstprivate}}
+#pragma omp target parallel for simd collapse(2) firstprivate(i)
+  for (i = 0; i < 16; ++i)
+// expected-note@+1 {{variable with automatic storage duration is predetermined as private; perhaps you forget to enclose 'omp for' directive into a parallel or another task region?}}
+    for (int j = 0; j < 16; ++j)
+// expected-error@+2 2 {{reduction variable must be shared}}
+// expected-error@+1 {{region cannot be closely nested inside 'target parallel for simd' region; perhaps you forget to enclose 'omp for' directive into a parallel region?}}
+#pragma omp for reduction(+ : i, j)
+      for (int k = 0; k < 16; ++k)
+        i += j;
+}
+
+void test_private() {
+  int i;
+// expected-error@+2 {{expected expression}}
+// expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp target parallel for simd private(
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+2 {{expected ')'}} expected-note@+2 {{to match this '('}}
+// expected-error@+1 2 {{expected expression}}
+#pragma omp target parallel for simd private(,
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 2 {{expected expression}}
+#pragma omp target parallel for simd private(, )
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}}
+#pragma omp target parallel for simd private()
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}}
+#pragma omp target parallel for simd private(int)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected variable name}}
+#pragma omp target parallel for simd private(0)
+  for (i = 0; i < 16; ++i)
+    ;
+
+  int x, y, z;
+#pragma omp target parallel for simd private(x)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target parallel for simd private(x, y)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target parallel for simd private(x, y, z)
+  for (i = 0; i < 16; ++i) {
+    x = y * i + z;
+  }
+}
+
+void test_lastprivate() {
+  int i;
+// expected-error@+2 {{expected ')'}} expected-note@+2 {{to match this '('}}
+// expected-error@+1 {{expected expression}}
+#pragma omp target parallel for simd lastprivate(
+  for (i = 0; i < 16; ++i)
+    ;
+
+// expected-error@+2 {{expected ')'}} expected-note@+2 {{to match this '('}}
+// expected-error@+1 2 {{expected expression}}
+#pragma omp target parallel for simd lastprivate(,
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 2 {{expected expression}}
+#pragma omp target parallel for simd lastprivate(, )
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}}
+#pragma omp target parallel for simd lastprivate()
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}}
+#pragma omp target parallel for simd lastprivate(int)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected variable name}}
+#pragma omp target parallel for simd lastprivate(0)
+  for (i = 0; i < 16; ++i)
+    ;
+
+  int x, y, z;
+#pragma omp target parallel for simd lastprivate(x)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target parallel for simd lastprivate(x, y)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target parallel for simd lastprivate(x, y, z)
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+void test_firstprivate() {
+  int i;
+// expected-error@+2 {{expected ')'}} expected-note@+2 {{to match this '('}}
+// expected-error@+1 {{expected expression}}
+#pragma omp target parallel for simd firstprivate(
+  for (i = 0; i < 16; ++i)
+    ;
+
+// expected-error@+2 {{expected ')'}} expected-note@+2 {{to match this '('}}
+// expected-error@+1 2 {{expected expression}}
+#pragma omp target parallel for simd firstprivate(,
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 2 {{expected expression}}
+#pragma omp target parallel for simd firstprivate(, )
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}}
+#pragma omp target parallel for simd firstprivate()
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}}
+#pragma omp target parallel for simd firstprivate(int)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected variable name}}
+#pragma omp target parallel for simd firstprivate(0)
+  for (i = 0; i < 16; ++i)
+    ;
+
+  int x, y, z;
+#pragma omp target parallel for simd lastprivate(x) firstprivate(x)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target parallel for simd lastprivate(x, y) firstprivate(x, y)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target parallel for simd lastprivate(x, y, z) firstprivate(x, y, z)
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+void test_loop_messages() {
+  float a[100], b[100], c[100];
+// expected-error@+2 {{variable must be of integer or pointer type}}
+#pragma omp target parallel for simd
+  for (float fi = 0; fi < 10.0; fi++) {
+    c[(int)fi] = a[(int)fi] + b[(int)fi];
+  }
+// expected-error@+2 {{variable must be of integer or pointer type}}
+#pragma omp target parallel for simd
+  for (double fi = 0; fi < 10.0; fi++) {
+    c[(int)fi] = a[(int)fi] + b[(int)fi];
+  }
+}
+
+void test_safelen() {
+  int i;
+// expected-error@+1 {{expected '('}}
+#pragma omp target parallel for simd safelen
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp target parallel for simd safelen(
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}}
+#pragma omp target parallel for simd safelen()
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp target parallel for simd safelen(,
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}}  expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp target parallel for simd safelen(, )
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-warning@+2 {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+// expected-error@+1 {{expected '('}}
+#pragma omp target parallel for simd safelen 4)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp target parallel for simd safelen(4
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp target parallel for simd safelen(4,
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp target parallel for simd safelen(4, )
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target parallel for simd safelen(4)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp target parallel for simd safelen(4 4)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp target parallel for simd safelen(4, , 4)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target parallel for simd safelen(4)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp target parallel for simd safelen(4, 8)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expression is not an integer constant expression}}
+#pragma omp target parallel for simd safelen(2.5)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expression is not an integer constant expression}}
+#pragma omp target parallel for simd safelen(foo())
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{argument to 'safelen' clause must be a strictly positive integer value}}
+#pragma omp target parallel for simd safelen(-5)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{argument to 'safelen' clause must be a strictly positive integer value}}
+#pragma omp target parallel for simd safelen(0)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{argument to 'safelen' clause must be a strictly positive integer value}}
+#pragma omp target parallel for simd safelen(5 - 5)
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+void test_simdlen() {
+  int i;
+// expected-error@+1 {{expected '('}}
+#pragma omp target parallel for simd simdlen
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp target parallel for simd simdlen(
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}}
+#pragma omp target parallel for simd simdlen()
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}} expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp target parallel for simd simdlen(,
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expected expression}}  expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+#pragma omp target parallel for simd simdlen(, )
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-warning@+2 {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+// expected-error@+1 {{expected '('}}
+#pragma omp target parallel for simd simdlen 4)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp target parallel for simd simdlen(4
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp target parallel for simd simdlen(4,
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp target parallel for simd simdlen(4, )
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target parallel for simd simdlen(4)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp target parallel for simd simdlen(4 4)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp target parallel for simd simdlen(4, , 4)
+  for (i = 0; i < 16; ++i)
+    ;
+#pragma omp target parallel for simd simdlen(4)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+2 {{expected ')'}}
+// expected-note@+1 {{to match this '('}}
+#pragma omp target parallel for simd simdlen(4, 8)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expression is not an integer constant expression}}
+#pragma omp target parallel for simd simdlen(2.5)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{expression is not an integer constant expression}}
+#pragma omp target parallel for simd simdlen(foo())
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{argument to 'simdlen' clause must be a strictly positive integer value}}
+#pragma omp target parallel for simd simdlen(-5)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{argument to 'simdlen' clause must be a strictly positive integer value}}
+#pragma omp target parallel for simd simdlen(0)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{argument to 'simdlen' clause must be a strictly positive integer value}}
+#pragma omp target parallel for simd simdlen(5 - 5)
+  for (i = 0; i < 16; ++i)
+    ;
+}
+
+void test_safelen_simdlen() {
+  int i;
+// expected-error@+1 {{the value of 'simdlen' parameter must be less than or equal to the value of the 'safelen' parameter}}
+#pragma omp target parallel for simd simdlen(6) safelen(5)
+  for (i = 0; i < 16; ++i)
+    ;
+// expected-error@+1 {{the value of 'simdlen' parameter must be less than or equal to the value of the 'safelen' parameter}}
+#pragma omp target parallel for simd safelen(5) simdlen(6)
+  for (i = 0; i < 16; ++i)
+    ;
+}
diff --git a/test/OpenMP/target_parallel_for_simd_nowait_messages.cpp b/test/OpenMP/target_parallel_for_simd_nowait_messages.cpp
new file mode 100644
index 0000000000000..3c4b512048177
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_simd_nowait_messages.cpp
@@ -0,0 +1,18 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() {
+}
+
+int main(int argc, char **argv) {
+  int i;
+  #pragma omp target parallel for simd nowait( // expected-warning {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd nowait (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd nowait device (-10u)
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd nowait (3.14) device (-10u) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return 0;
+}
diff --git a/test/OpenMP/target_parallel_for_simd_num_threads_messages.cpp b/test/OpenMP/target_parallel_for_simd_num_threads_messages.cpp
new file mode 100644
index 0000000000000..1076b86ea5e67
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_simd_num_threads_messages.cpp
@@ -0,0 +1,65 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, typename S, int N> // expected-note {{declared here}}
+T tmain(T argc, S **argv) {
+  T i;
+  #pragma omp target parallel for simd num_threads // expected-error {{expected '(' after 'num_threads'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd num_threads ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd num_threads () // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd num_threads (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd num_threads (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd num_threads ((argc > 0) ? argv[1] : argv[2]) // expected-error 2 {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd num_threads (foobool(argc)), num_threads (true), num_threads (-5) // expected-error 2 {{directive '#pragma omp target parallel for simd' cannot contain more than one 'num_threads' clause}} expected-error {{argument to 'num_threads' clause must be a strictly positive integer value}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd num_threads (S) // expected-error {{'S' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd num_threads (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error 2 {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd num_threads (argc)
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd num_threads (N) // expected-error {{argument to 'num_threads' clause must be a strictly positive integer value}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return argc;
+}
+
+int main(int argc, char **argv) {
+  int i;
+  #pragma omp target parallel for simd num_threads // expected-error {{expected '(' after 'num_threads'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd num_threads ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd num_threads () // expected-error {{expected expression}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd num_threads (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd num_threads (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd num_threads (argc > 0 ? argv[1] : argv[2]) // expected-error {{integral }}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd num_threads (foobool(argc)), num_threads (true), num_threads (-5) // expected-error 2 {{directive '#pragma omp target parallel for simd' cannot contain more than one 'num_threads' clause}} expected-error {{argument to 'num_threads' clause must be a strictly positive integer value}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd num_threads (S1) // expected-error {{'S1' does not refer to a value}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd num_threads (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  for (i = 0; i < argc; ++i) foo();
+  #pragma omp target parallel for simd num_threads (num_threads(tmain<int, char, -1>(argc, argv) // expected-error 2 {{expected ')'}} expected-note 2 {{to match this '('}} expected-note {{in instantiation of function template specialization 'tmain<int, char, -1>' requested here}}
+  for (i = 0; i < argc; ++i) foo();
+
+  return tmain<int, char, 3>(argc, argv); // expected-note {{in instantiation of function template specialization 'tmain<int, char, 3>' requested here}}
+}
diff --git a/test/OpenMP/target_parallel_for_simd_ordered_messages.cpp b/test/OpenMP/target_parallel_for_simd_ordered_messages.cpp
new file mode 100644
index 0000000000000..70a3b4ef62454
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_simd_ordered_messages.cpp
@@ -0,0 +1,122 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
+
+void foo() {
+}
+
+#if __cplusplus >= 201103L
+ // expected-note@+2 4 {{declared here}}
+#endif
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, typename S, int N, int ST> // expected-note {{declared here}}
+T tmain(T argc, S **argv) {                   //expected-note 2 {{declared here}}
+  int j; // expected-note {{declared here}}
+#pragma omp target parallel for simd ordered
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - ST];
+#pragma omp target parallel for simd ordered( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - ST];
+#pragma omp target parallel for simd ordered() // expected-error {{expected expression}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - ST];
+// expected-error@+3 {{expected ')'}} expected-note@+3 {{to match this '('}}
+// expected-error@+2 2 {{expression is not an integral constant expression}}
+// expected-note@+1 2 {{read of non-const variable 'argc' is not allowed in a constant expression}}
+#pragma omp target parallel for simd ordered(argc
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - ST];
+// expected-error@+1 2 {{argument to 'ordered' clause must be a strictly positive integer value}}
+#pragma omp target parallel for simd ordered(ST // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - ST];
+#pragma omp target parallel for simd ordered(1)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - ST];
+#pragma omp target parallel for simd ordered((ST > 0) ? 1 + ST : 2) // expected-note 2 {{as specified in 'ordered' clause}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - ST]; // expected-error 2 {{expected 2 for loops after '#pragma omp target parallel for simd', but found only 1}}
+#if __cplusplus >= 201103L
+  // expected-note@+5 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+// expected-error@+3 2 {{directive '#pragma omp target parallel for simd' cannot contain more than one 'ordered' clause}}
+// expected-error@+2 2 {{argument to 'ordered' clause must be a strictly positive integer value}}
+// expected-error@+1 2 {{expression is not an integral constant expression}}
+#pragma omp target parallel for simd ordered(foobool(argc)), ordered(true), ordered(-5)
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - ST];
+#pragma omp target parallel for simd ordered(S) // expected-error {{'S' does not refer to a value}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - ST];
+// expected-note@+2 {{read of non-const variable 'j' is not allowed in a constant expression}}
+// expected-error@+1 {{expression is not an integral constant expression}}
+#pragma omp target parallel for simd ordered(j = 2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - ST];
+#pragma omp target parallel for simd ordered(1)
+  for (int i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - ST];
+#pragma omp target parallel for simd ordered(N) // expected-error {{argument to 'ordered' clause must be a strictly positive integer value}}
+  for (T i = ST; i < N; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - ST];
+#pragma omp target parallel for simd ordered(2) // expected-note {{as specified in 'ordered' clause}}
+  foo();                            // expected-error {{expected 2 for loops after '#pragma omp target parallel for simd'}}
+  return argc;
+}
+
+int main(int argc, char **argv) {
+  int j; // expected-note {{declared here}}
+#pragma omp target parallel for simd ordered
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - 4];
+#pragma omp target parallel for simd ordered( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - 4];
+#pragma omp target parallel for simd ordered() // expected-error {{expected expression}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - 4];
+#pragma omp target parallel for simd ordered(4 // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-note {{as specified in 'ordered' clause}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - 4]; // expected-error {{expected 4 for loops after '#pragma omp target parallel for simd', but found only 1}}
+#pragma omp target parallel for simd ordered(2 + 2))      // expected-warning {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}  expected-note {{as specified in 'ordered' clause}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - 4];            // expected-error {{expected 4 for loops after '#pragma omp target parallel for simd', but found only 1}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+#pragma omp target parallel for simd ordered(foobool(1) > 0 ? 1 : 2) // expected-error {{expression is not an integral constant expression}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - 4];
+#if __cplusplus >= 201103L
+  // expected-note@+5 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+// expected-error@+3 {{expression is not an integral constant expression}}
+// expected-error@+2 2 {{directive '#pragma omp target parallel for simd' cannot contain more than one 'ordered' clause}}
+// expected-error@+1 2 {{argument to 'ordered' clause must be a strictly positive integer value}}
+#pragma omp target parallel for simd ordered(foobool(argc)), ordered(true), ordered(-5)
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - 4];
+#pragma omp target parallel for simd ordered(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - 4];
+// expected-note@+2 {{read of non-const variable 'j' is not allowed in a constant expression}}
+// expected-error@+1 {{expression is not an integral constant expression}}
+#pragma omp target parallel for simd ordered(j = 2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++)
+    argv[0][i] = argv[0][i] - argv[0][i - 4];
+// expected-error@+3 {{statement after '#pragma omp target parallel for simd' must be a for loop}}
+// expected-note@+1 {{in instantiation of function template specialization 'tmain<int, char, -1, -2>' requested here}}
+#pragma omp target parallel for simd ordered(ordered(tmain < int, char, -1, -2 > (argc, argv) // expected-error 2 {{expected ')'}} expected-note 2 {{to match this '('}}
+  foo();
+#pragma omp target parallel for simd ordered(2) // expected-note {{as specified in 'ordered' clause}}
+  foo();                            // expected-error {{expected 2 for loops after '#pragma omp target parallel for simd'}}
+  // expected-note@+1 {{in instantiation of function template specialization 'tmain<int, char, 1, 0>' requested here}}
+  return tmain<int, char, 1, 0>(argc, argv);
+}
+
diff --git a/test/OpenMP/target_parallel_for_simd_private_messages.cpp b/test/OpenMP/target_parallel_for_simd_private_messages.cpp
new file mode 100644
index 0000000000000..57262a58598c5
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_simd_private_messages.cpp
@@ -0,0 +1,231 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note 2 {{declared here}} expected-note 2 {{forward declaration of 'S1'}}
+extern S1 a;
+class S2 {
+  mutable int a;
+
+public:
+  S2() : a(0) {}
+};
+const S2 b;
+const S2 ba[5];
+class S3 {
+  int a;
+
+public:
+  S3() : a(0) {}
+};
+const S3 ca[5];
+class S4 {
+  int a;
+  S4(); // expected-note {{implicitly declared private here}}
+
+public:
+  S4(int v) : a(v) {
+#pragma omp target parallel for simd private(a) private(this->a)
+    for (int k = 0; k < v; ++k)
+      ++this->a;
+  }
+};
+class S5 {
+  int a;
+  S5() : a(0) {} // expected-note {{implicitly declared private here}}
+
+public:
+  S5(int v) : a(v) {}
+  S5 &operator=(S5 &s) {
+#pragma omp target parallel for simd private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T>
+class S6 {
+public:
+  T a;
+
+  S6() : a(0) {}
+  S6(T v) : a(v) {
+#pragma omp target parallel for simd private(a) private(this->a)
+    for (int k = 0; k < v; ++k)
+      ++this->a;
+  }
+  S6 &operator=(S6 &s) {
+#pragma omp target parallel for simd private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T>
+class S7 : public T {
+  T a;
+  S7() : a(0) {}
+
+public:
+  S7(T v) : a(v) {
+#pragma omp target parallel for simd private(a) private(this->a) private(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S7 &operator=(S7 &s) {
+#pragma omp target parallel for simd private(a) private(this->a) private(s.a) private(s.T::a) // expected-error 2 {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class I, class C>
+int foomain(I argc, C **argv) {
+  I e(4);
+  I g(5);
+  int i;
+  int &j = i;
+#pragma omp target parallel for simd private // expected-error {{expected '(' after 'private'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd private( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd private() // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd private(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd private(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd private(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd private(argc)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd private(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd private(a, b) // expected-error {{private variable with incomplete type 'S1'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd private(argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd private(e, g)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd private(h) // expected-error {{threadprivate or thread local variable cannot be private}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp parallel
+  {
+    int v = 0;
+    int i;
+#pragma omp target parallel for simd private(i)
+    for (int k = 0; k < argc; ++k) {
+      i = k;
+      v += i;
+    }
+  }
+#pragma omp parallel shared(i)
+#pragma omp parallel private(i)
+#pragma omp target parallel for simd private(j)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd private(i)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+  return 0;
+}
+
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
+int main(int argc, char **argv) {
+  S4 e(4);
+  S5 g(5);
+  S6<float> s6(0.0) , s6_0(1.0);
+  S7<S6<float> > s7(0.0) , s7_0(1.0);
+  int i;
+  int &j = i;
+#pragma omp target parallel for simd private // expected-error {{expected '(' after 'private'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd private( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd private() // expected-error {{expected expression}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd private(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd private(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd private(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd private(argc)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd private(S1) // expected-error {{'S1' does not refer to a value}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd private(a, b) // expected-error {{private variable with incomplete type 'S1'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd private(argv[1]) // expected-error {{expected variable name}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd private(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd private(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be private}}
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp parallel
+  {
+    int i;
+#pragma omp target parallel for simd private(i)
+    for (int k = 0; k < argc; ++k)
+      ++k;
+  }
+#pragma omp parallel shared(i)
+#pragma omp parallel private(i)
+#pragma omp target parallel for simd private(j)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+#pragma omp target parallel for simd private(i)
+  for (int k = 0; k < argc; ++k)
+    ++k;
+  static int m;
+#pragma omp target parallel for simd private(m)
+  for (int k = 0; k < argc; ++k)
+    m = k + 2;
+
+  s6 = s6_0; // expected-note {{in instantiation of member function 'S6<float>::operator=' requested here}}
+  s7 = s7_0; // expected-note {{in instantiation of member function 'S7<S6<float> >::operator=' requested here}}
+  return foomain(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<int, char>' requested here}}
+}
+
diff --git a/test/OpenMP/target_parallel_for_simd_proc_bind_messages.cpp b/test/OpenMP/target_parallel_for_simd_proc_bind_messages.cpp
new file mode 100644
index 0000000000000..5bb6d92cd7e9e
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_simd_proc_bind_messages.cpp
@@ -0,0 +1,35 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -o - %s
+
+void foo();
+
+int main(int argc, char **argv) {
+  int i;
+#pragma omp target parallel for simd proc_bind // expected-error {{expected '(' after 'proc_bind'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd proc_bind( // expected-error {{expected 'master', 'close' or 'spread' in OpenMP clause 'proc_bind'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd proc_bind() // expected-error {{expected 'master', 'close' or 'spread' in OpenMP clause 'proc_bind'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd proc_bind(master // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd proc_bind(close), proc_bind(spread) // expected-error {{directive '#pragma omp target parallel for simd' cannot contain more than one 'proc_bind' clause}}
+  for (i = 0; i < argc; ++i)
+    foo();
+#pragma omp target parallel for simd proc_bind(x) // expected-error {{expected 'master', 'close' or 'spread' in OpenMP clause 'proc_bind'}}
+  for (i = 0; i < argc; ++i)
+    foo();
+
+#pragma omp target parallel for simd proc_bind(master)
+  for (i = 0; i < argc; ++i)
+    foo();
+
+#pragma omp parallel proc_bind(close)
+#pragma omp target parallel for simd proc_bind(spread)
+  for (i = 0; i < argc; ++i)
+    foo();
+  return 0;
+}
diff --git a/test/OpenMP/target_parallel_for_simd_reduction_messages.cpp b/test/OpenMP/target_parallel_for_simd_reduction_messages.cpp
new file mode 100644
index 0000000000000..3999d38162843
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_simd_reduction_messages.cpp
@@ -0,0 +1,313 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 150 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 -ferror-limit 150 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 -ferror-limit 150 -o - %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}} expected-note 4 {{forward declaration of 'S1'}}
+extern S1 a;
+class S2 {
+  mutable int a;
+  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 3 {{implicitly declared private here}}
+
+public:
+  S2() : a(0) {}
+  S2(S2 &s2) : a(s2.a) {}
+  static float S2s; // expected-note 2 {{static data member is predetermined as shared}}
+  static const float S2sc;
+};
+const float S2::S2sc = 0; // expected-note 2 {{'S2sc' defined here}}
+S2 b;                     // expected-note 3 {{'b' defined here}}
+const S2 ba[5];           // expected-note 2 {{'ba' defined here}}
+class S3 {
+  int a;
+
+public:
+  int b;
+  S3() : a(0) {}
+  S3(const S3 &s3) : a(s3.a) {}
+  S3 operator+(const S3 &arg1) { return arg1; }
+};
+int operator+(const S3 &arg1, const S3 &arg2) { return 5; }
+S3 c;               // expected-note 3 {{'c' defined here}}
+const S3 ca[5];     // expected-note 2 {{'ca' defined here}}
+extern const int f; // expected-note 4 {{'f' declared here}}
+class S4 {
+  int a;
+  S4(); // expected-note {{implicitly declared private here}}
+  S4(const S4 &s4);
+  S4 &operator+(const S4 &arg) { return (*this); }
+
+public:
+  S4(int v) : a(v) {}
+};
+S4 &operator&=(S4 &arg1, S4 &arg2) { return arg1; }
+class S5 {
+  int a;
+  S5() : a(0) {} // expected-note {{implicitly declared private here}}
+  S5(const S5 &s5) : a(s5.a) {}
+  S5 &operator+(const S5 &arg);
+
+public:
+  S5(int v) : a(v) {}
+};
+class S6 { // expected-note 3 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
+#if __cplusplus >= 201103L // C++11 or later
+// expected-note@-2 3 {{candidate function (the implicit move assignment operator) not viable}}
+#endif
+  int a;
+
+public:
+  S6() : a(6) {}
+  operator int() { return 6; }
+} o;
+
+S3 h, k;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class T>       // expected-note {{declared here}}
+T tmain(T argc) {
+  const T d = T();       // expected-note 4 {{'d' defined here}}
+  const T da[5] = {T()}; // expected-note 2 {{'da' defined here}}
+  T qa[5] = {T()};
+  T i;
+  T &j = i;                        // expected-note 4 {{'j' defined here}}
+  S3 &p = k;                       // expected-note 2 {{'p' defined here}}
+  const T &r = da[(int)i];         // expected-note 2 {{'r' defined here}}
+  T &q = qa[(int)i];               // expected-note 2 {{'q' defined here}}
+  T fl;
+#pragma omp target parallel for simd reduction // expected-error {{expected '(' after 'reduction'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction + // expected-error {{expected '(' after 'reduction'}} expected-warning {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction( // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(- // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction() // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(*) // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(\) // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(& : argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(| : argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(|| : argc ? i : argc) // expected-error 2 {{expected variable name, array element or array section}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(foo : argc) //expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'float'}} expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'int'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(&& : argc)
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(^ : T) // expected-error {{'T' does not refer to a value}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified list item cannot be reduction}} expected-error 2 {{'operator+' is a private member of 'S2'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 4 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 3 {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(max : h.b) // expected-error {{expected variable name, array element or array section}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}} expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(&& : S2::S2s) // expected-error {{shared variable cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(&& : S2::S2sc) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(+ : h, k) // expected-error {{threadprivate or thread local variable cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(+ : o) // expected-error 2 {{no viable overloaded '='}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd private(i), reduction(+ : j), reduction(+ : q) // expected-error 4 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel private(k)
+#pragma omp target parallel for simd reduction(+ : p), reduction(+ : p) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(+ : p), reduction(+ : p) // expected-error 2 {{variable can appear only once in OpenMP 'reduction' clause}} expected-note 2 {{previously referenced here}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(+ : r) // expected-error 2 {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel shared(i)
+#pragma omp parallel reduction(min : i)
+#pragma omp target parallel for simd reduction(max : j) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel private(fl)
+#pragma omp target parallel for simd reduction(+ : fl)
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel reduction(* : fl)
+#pragma omp target parallel for simd reduction(+ : fl)
+  for (int i = 0; i < 10; ++i)
+    foo();
+
+  return T();
+}
+
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
+int main(int argc, char **argv) {
+  const int d = 5;       // expected-note 2 {{'d' defined here}}
+  const int da[5] = {0}; // expected-note {{'da' defined here}}
+  int qa[5] = {0};
+  S4 e(4);
+  S5 g(5);
+  int i;
+  int &j = i;                      // expected-note 2 {{'j' defined here}}
+  S3 &p = k;                       // expected-note 2 {{'p' defined here}}
+  const int &r = da[i];            // expected-note {{'r' defined here}}
+  int &q = qa[i];                  // expected-note {{'q' defined here}}
+  float fl;
+#pragma omp target parallel for simd reduction // expected-error {{expected '(' after 'reduction'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction + // expected-error {{expected '(' after 'reduction'}} expected-warning {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction( // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(- // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction() // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(*) // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(\) // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(foo : argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(| : argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(|| : argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name, array element or array section}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(~ : argc) // expected-error {{expected unqualified-id}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(&& : argc)
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(^ : S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{const-qualified list item cannot be reduction}} expected-error {{'operator+' is a private member of 'S2'}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 2 {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(max : h.b) // expected-error {{expected variable name, array element or array section}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(&& : S2::S2s) // expected-error {{shared variable cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(&& : S2::S2sc) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(& : e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{invalid operands to binary expression ('S4' and 'S4')}} expected-error {{calling a private constructor of class 'S5'}} expected-error {{invalid operands to binary expression ('S5' and 'S5')}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(+ : h, k, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(+ : o) // expected-error {{no viable overloaded '='}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd private(i), reduction(+ : j), reduction(+ : q) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel private(k)
+#pragma omp target parallel for simd reduction(+ : p), reduction(+ : p) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(+ : p), reduction(+ : p) // expected-error {{variable can appear only once in OpenMP 'reduction' clause}} expected-note {{previously referenced here}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp target parallel for simd reduction(+ : r) // expected-error {{const-qualified list item cannot be reduction}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel shared(i)
+#pragma omp parallel reduction(min : i)
+#pragma omp target parallel for simd reduction(max : j) // expected-error {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel private(fl)
+#pragma omp target parallel for simd reduction(+ : fl)
+  for (int i = 0; i < 10; ++i)
+    foo();
+#pragma omp parallel reduction(* : fl)
+#pragma omp target parallel for simd reduction(+ : fl)
+  for (int i = 0; i < 10; ++i)
+    foo();
+  static int m;
+#pragma omp target parallel for simd reduction(+ : m) // OK
+  for (int i = 0; i < 10; ++i)
+    m++;
+
+  return tmain(argc) + tmain(fl); // expected-note {{in instantiation of function template specialization 'tmain<int>' requested here}} expected-note {{in instantiation of function template specialization 'tmain<float>' requested here}}
+}
diff --git a/test/OpenMP/target_parallel_for_simd_safelen_messages.cpp b/test/OpenMP/target_parallel_for_simd_safelen_messages.cpp
new file mode 100644
index 0000000000000..f99010179dc30
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_simd_safelen_messages.cpp
@@ -0,0 +1,102 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
+
+void foo() {
+}
+
+#if __cplusplus >= 201103L
+// expected-note@+2 4 {{declared here}}
+#endif
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, typename S, int N, int ST> // expected-note {{declared here}}
+T tmain(T argc, S **argv) { //expected-note 2 {{declared here}}
+  #pragma omp target parallel for simd safelen // expected-error {{expected '(' after 'safelen'}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for simd safelen ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for simd safelen () // expected-error {{expected expression}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  // expected-error@+3 {{expected ')'}} expected-note@+3 {{to match this '('}}
+  // expected-error@+2 2 {{expression is not an integral constant expression}}
+  // expected-note@+1 2 {{read of non-const variable 'argc' is not allowed in a constant expression}}
+  #pragma omp target parallel for simd safelen (argc 
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  // expected-error@+1 {{argument to 'safelen' clause must be a strictly positive integer value}}
+  #pragma omp target parallel for simd safelen (ST // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for simd safelen (1)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for simd safelen ((ST > 0) ? 1 + ST : 2)
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  // expected-error@+6 2 {{directive '#pragma omp target parallel for simd' cannot contain more than one 'safelen' clause}}
+  // expected-error@+5 2 {{argument to 'safelen' clause must be a strictly positive integer value}}
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+  #pragma omp target parallel for simd safelen (foobool(argc)), safelen (true), safelen (-5)
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for simd safelen (S) // expected-error {{'S' does not refer to a value}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+#if __cplusplus <= 199711L
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
+  #pragma omp target parallel for simd safelen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for simd safelen (4)
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for simd safelen (N) // expected-error {{argument to 'safelen' clause must be a strictly positive integer value}}
+  for (T i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  return argc;
+}
+
+int main(int argc, char **argv) {
+  #pragma omp target parallel for simd safelen // expected-error {{expected '(' after 'safelen'}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for simd safelen ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for simd safelen () // expected-error {{expected expression}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for simd safelen (4 // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for simd safelen (2+2)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+  #pragma omp target parallel for simd safelen (foobool(1) > 0 ? 1 : 2)
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  // expected-error@+6 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+  // expected-error@+2 2 {{directive '#pragma omp target parallel for simd' cannot contain more than one 'safelen' clause}}
+  // expected-error@+1 2 {{argument to 'safelen' clause must be a strictly positive integer value}}
+  #pragma omp target parallel for simd safelen (foobool(argc)), safelen (true), safelen (-5) 
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for simd safelen (S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+#if __cplusplus <= 199711L
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
+  #pragma omp target parallel for simd safelen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  // expected-error@+3 {{statement after '#pragma omp target parallel for simd' must be a for loop}}
+  // expected-note@+1 {{in instantiation of function template specialization 'tmain<int, char, -1, -2>' requested here}}
+  #pragma omp target parallel for simd safelen(safelen(tmain<int, char, -1, -2>(argc, argv) // expected-error 2 {{expected ')'}} expected-note 2 {{to match this '('}}
+  foo();
+  // expected-note@+1 {{in instantiation of function template specialization 'tmain<int, char, 12, 4>' requested here}}
+  return tmain<int, char, 12, 4>(argc, argv);
+}
+
diff --git a/test/OpenMP/target_parallel_for_simd_schedule_messages.cpp b/test/OpenMP/target_parallel_for_simd_schedule_messages.cpp
new file mode 100644
index 0000000000000..f0d86e952de6a
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_simd_schedule_messages.cpp
@@ -0,0 +1,91 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, typename S, int N, int ST> // expected-note {{declared here}}
+T tmain(T argc, S **argv) {
+  #pragma omp target parallel for simd schedule // expected-error {{expected '(' after 'schedule'}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for simd schedule ( // expected-error {{expected 'static', 'dynamic', 'guided', 'auto', 'runtime', 'monotonic', 'nonmonotonic' or 'simd' in OpenMP clause 'schedule'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for simd schedule () // expected-error {{expected 'static', 'dynamic', 'guided', 'auto', 'runtime', 'monotonic', 'nonmonotonic' or 'simd' in OpenMP clause 'schedule'}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for simd schedule (auto // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for simd schedule (auto_dynamic // expected-error {{expected 'static', 'dynamic', 'guided', 'auto', 'runtime', 'monotonic', 'nonmonotonic' or 'simd' in OpenMP clause 'schedule'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for simd schedule (auto,  // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for simd schedule (runtime, 3)  // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  // expected-error@+1 {{expected ')'}} expected-note@+1 {{to match this '('}}
+  #pragma omp target parallel for simd schedule (guided argc
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  // expected-error@+1 2 {{argument to 'schedule' clause must be a strictly positive integer value}}
+  #pragma omp target parallel for simd schedule (static, ST // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for simd schedule (dynamic, 1)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for simd schedule (guided, (ST > 0) ? 1 + ST : 2)
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  // expected-error@+2 2 {{directive '#pragma omp target parallel for simd' cannot contain more than one 'schedule' clause}}
+  // expected-error@+1 {{argument to 'schedule' clause must be a strictly positive integer value}}
+  #pragma omp target parallel for simd schedule (static, foobool(argc)), schedule (dynamic, true), schedule (guided, -5)
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for simd schedule (static, S) // expected-error {{'S' does not refer to a value}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  // expected-error@+1 2 {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  #pragma omp target parallel for simd schedule (guided, argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for simd schedule (dynamic, 1)
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  #pragma omp target parallel for simd schedule (static, N) // expected-error {{argument to 'schedule' clause must be a strictly positive integer value}}
+  for (T i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+  return argc;
+}
+
+int main(int argc, char **argv) {
+  #pragma omp target parallel for simd schedule // expected-error {{expected '(' after 'schedule'}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for simd schedule ( // expected-error {{expected 'static', 'dynamic', 'guided', 'auto', 'runtime', 'monotonic', 'nonmonotonic' or 'simd' in OpenMP clause 'schedule'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for simd schedule () // expected-error {{expected 'static', 'dynamic', 'guided', 'auto', 'runtime', 'monotonic', 'nonmonotonic' or 'simd' in OpenMP clause 'schedule'}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for simd schedule (auto // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for simd schedule (auto_dynamic // expected-error {{expected 'static', 'dynamic', 'guided', 'auto', 'runtime', 'monotonic', 'nonmonotonic' or 'simd' in OpenMP clause 'schedule'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for simd schedule (auto,  // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for simd schedule (runtime, 3)  // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for simd schedule (guided, 4 // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for simd schedule (static, 2+2)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for simd schedule (dynamic, foobool(1) > 0 ? 1 : 2)
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  // expected-error@+2 2 {{directive '#pragma omp target parallel for simd' cannot contain more than one 'schedule' clause}}
+  // expected-error@+1 {{argument to 'schedule' clause must be a strictly positive integer value}}
+  #pragma omp target parallel for simd schedule (guided, foobool(argc)), schedule (static, true), schedule (dynamic, -5)
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  #pragma omp target parallel for simd schedule (guided, S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  // expected-error@+1 {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  #pragma omp target parallel for simd schedule (static, argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+  // expected-error@+3 {{statement after '#pragma omp target parallel for simd' must be a for loop}}
+  // expected-note@+1 {{in instantiation of function template specialization 'tmain<int, char, -1, -2>' requested here}}
+  #pragma omp target parallel for simd schedule(dynamic, schedule(tmain<int, char, -1, -2>(argc, argv) // expected-error 2 {{expected ')'}} expected-note 2 {{to match this '('}}
+  foo();
+  // expected-note@+1 {{in instantiation of function template specialization 'tmain<int, char, 1, 0>' requested here}}
+  return tmain<int, char, 1, 0>(argc, argv);
+}
+
diff --git a/test/OpenMP/target_parallel_for_simd_simdlen_messages.cpp b/test/OpenMP/target_parallel_for_simd_simdlen_messages.cpp
new file mode 100644
index 0000000000000..e51e67b6b45f2
--- /dev/null
+++ b/test/OpenMP/target_parallel_for_simd_simdlen_messages.cpp
@@ -0,0 +1,142 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
+
+void foo() {
+}
+
+#if __cplusplus >= 201103L
+// expected-note@+2 4 {{declared here}}
+#endif
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, typename S, int N, int ST> // expected-note {{declared here}}
+T tmain(T argc, S **argv) { //expected-note 2 {{declared here}}
+#pragma omp target parallel for simd simdlen // expected-error {{expected '(' after 'simdlen'}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target parallel for simd simdlen ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target parallel for simd simdlen () // expected-error {{expected expression}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+// expected-error@+3 {{expected ')'}} expected-note@+3 {{to match this '('}}
+// expected-error@+2 2 {{expression is not an integral constant expression}}
+// expected-note@+1 2 {{read of non-const variable 'argc' is not allowed in a constant expression}}
+#pragma omp target parallel for simd simdlen (argc
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+// expected-error@+1 {{argument to 'simdlen' clause must be a strictly positive integer value}}
+#pragma omp target parallel for simd simdlen (ST // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target parallel for simd simdlen (1)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target parallel for simd simdlen ((ST > 0) ? 1 + ST : 2)
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+  // expected-error@+6 2 {{directive '#pragma omp target parallel for simd' cannot contain more than one 'simdlen' clause}}
+  // expected-error@+5 2 {{argument to 'simdlen' clause must be a strictly positive integer value}}
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+#pragma omp target parallel for simd simdlen (foobool(argc)), simdlen (true), simdlen (-5)
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target parallel for simd simdlen (S) // expected-error {{'S' does not refer to a value}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#if __cplusplus <= 199711L
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
+#pragma omp target parallel for simd simdlen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+ #pragma omp target parallel for simd simdlen (4)
+  for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+  #pragma omp target parallel for simd simdlen (N) // expected-error {{argument to 'simdlen' clause must be a strictly positive integer value}}
+  for (T i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target parallel for simd simdlen (2), safelen (4) // OK
+  for (T i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target parallel for simd simdlen (4), safelen (4) // OK
+  for (T i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+#pragma omp target parallel for simd simdlen (8), safelen (4) // expected-error{{the value of 'simdlen' parameter must be less than or equal to the value of the 'safelen' parameter}}
+  for (T i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
+
+  return argc;
+}
+
+int main(int argc, char **argv) {
+#pragma omp target parallel for simd simdlen // expected-error {{expected '(' after 'simdlen'}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target parallel for simd simdlen ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target parallel for simd simdlen () // expected-error {{expected expression}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target parallel for simd simdlen (4 // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target parallel for simd simdlen (2+2)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel for simd' are ignored}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+#pragma omp target parallel for simd simdlen (foobool(1) > 0 ? 1 : 2)
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+
+  // expected-error@+6 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+// expected-error@+2 2 {{directive '#pragma omp target parallel for simd' cannot contain more than one 'simdlen' clause}}
+// expected-error@+1 2 {{argument to 'simdlen' clause must be a strictly positive integer value}}
+#pragma omp target parallel for simd simdlen (foobool(argc)), simdlen (true), simdlen (-5) 
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target parallel for simd simdlen (S1) // expected-error {{'S1' does not refer to a value}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#if __cplusplus <= 199711L
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
+#pragma omp target parallel for simd simdlen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+
+// expected-error@+3 {{statement after '#pragma omp target parallel for simd' must be a for loop}}
+// expected-note@+1 {{in instantiation of function template specialization 'tmain<int, char, -1, -2>' requested here}}
+#pragma omp target parallel for simd simdlen(simdlen(tmain<int, char, -1, -2>(argc, argv) // expected-error 2 {{expected ')'}} expected-note 2 {{to match this '('}}
+  foo();
+
+#pragma omp target parallel for simd simdlen (2), safelen (4) // OK
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target parallel for simd simdlen (4), safelen (4) // OK
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+
+#pragma omp target parallel for simd simdlen (8), safelen (4) // expected-error{{the value of 'simdlen' parameter must be less than or equal to the value of the 'safelen' parameter}}
+  for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
+
+  // expected-note@+1 {{in instantiation of function template specialization 'tmain<int, char, 12, 4>' requested here}}
+  return tmain<int, char, 12, 4>(argc, argv);
+}
+
diff --git a/test/OpenMP/target_parallel_if_messages.cpp b/test/OpenMP/target_parallel_if_messages.cpp
new file mode 100644
index 0000000000000..e22eb8150af65
--- /dev/null
+++ b/test/OpenMP/target_parallel_if_messages.cpp
@@ -0,0 +1,103 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, class S> // expected-note {{declared here}}
+int tmain(T argc, S **argv) {
+  #pragma omp target parallel if // expected-error {{expected '(' after 'if'}}
+  foo();
+  #pragma omp target parallel if ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel if () // expected-error {{expected expression}}
+  foo();
+  #pragma omp target parallel if (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel if (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel' are ignored}}
+  foo();
+  #pragma omp target parallel if (argc > 0 ? argv[1] : argv[2])
+  foo();
+  #pragma omp target parallel if (foobool(argc)), if (true) // expected-error {{directive '#pragma omp target parallel' cannot contain more than one 'if' clause}}
+  foo();
+  #pragma omp target parallel if (S) // expected-error {{'S' does not refer to a value}}
+  foo();
+  #pragma omp target parallel if (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel if (argc argc) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel if(argc)
+  foo();
+  #pragma omp target parallel if(target : // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel if(parallel : argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel if(target : argc)
+  foo();
+  #pragma omp target parallel if(parallel : argc)
+  foo();
+  #pragma omp target parallel if(target : argc) if(parallel : argc)
+  foo();
+  #pragma omp target parallel if(parallel : argc) if (for:argc) // expected-error {{directive name modifier 'for' is not allowed for '#pragma omp target parallel'}}
+  foo();
+  #pragma omp target parallel if(target : argc) if (target :argc) // expected-error {{directive '#pragma omp target parallel' cannot contain more than one 'if' clause with 'target' name modifier}}
+  foo();
+  #pragma omp target parallel if(parallel : argc) if (parallel :argc) // expected-error {{directive '#pragma omp target parallel' cannot contain more than one 'if' clause with 'parallel' name modifier}}
+  foo();
+  #pragma omp target parallel if(target : argc) if (argc) // expected-error {{expected  'parallel' directive name modifier}} expected-note {{previous clause with directive name modifier specified here}}
+  foo();
+  #pragma omp target parallel if(target : argc) if(parallel : argc) if (argc) // expected-error {{no more 'if' clause is allowed}} expected-note {{previous clause with directive name modifier specified here}} expected-note {{previous clause with directive name modifier specified here}}
+  foo();
+
+  return 0;
+}
+
+int main(int argc, char **argv) {
+  #pragma omp target parallel if // expected-error {{expected '(' after 'if'}}
+  foo();
+  #pragma omp target parallel if ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel if () // expected-error {{expected expression}}
+  foo();
+  #pragma omp target parallel if (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel if (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel' are ignored}}
+  foo();
+  #pragma omp target parallel if (argc > 0 ? argv[1] : argv[2])
+  foo();
+  #pragma omp target parallel if (foobool(argc)), if (true) // expected-error {{directive '#pragma omp target parallel' cannot contain more than one 'if' clause}}
+  foo();
+  #pragma omp target parallel if (S1) // expected-error {{'S1' does not refer to a value}}
+  foo();
+  #pragma omp target parallel if (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel if (argc argc) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel if (1 0) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel if(if(tmain(argc, argv) // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel if(target : // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel if(parallel : argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel if(parallel : argc)
+  foo();
+  #pragma omp target parallel if(target : argc) if (for:argc) // expected-error {{directive name modifier 'for' is not allowed for '#pragma omp target parallel'}}
+  foo();
+  #pragma omp target parallel if(target : argc) if (target :argc) // expected-error {{directive '#pragma omp target parallel' cannot contain more than one 'if' clause with 'target' name modifier}}
+  foo();
+  #pragma omp target parallel if(parallel : argc) if (parallel :argc) // expected-error {{directive '#pragma omp target parallel' cannot contain more than one 'if' clause with 'parallel' name modifier}}
+  foo();
+  #pragma omp target parallel if(target : argc) if (argc) // expected-error {{expected  'parallel' directive name modifier}} expected-note {{previous clause with directive name modifier specified here}}
+  foo();
+  #pragma omp target parallel if(target : argc) if(parallel : argc) if (argc) // expected-error {{no more 'if' clause is allowed}} expected-note {{previous clause with directive name modifier specified here}} expected-note {{previous clause with directive name modifier specified here}}
+  foo();
+
+  return tmain(argc, argv);
+}
diff --git a/test/OpenMP/target_parallel_map_messages.cpp b/test/OpenMP/target_parallel_map_messages.cpp
new file mode 100644
index 0000000000000..ff20567185bf5
--- /dev/null
+++ b/test/OpenMP/target_parallel_map_messages.cpp
@@ -0,0 +1,272 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note 2 {{declared here}}
+extern S1 a;
+class S2 {
+  mutable int a;
+public:
+  S2():a(0) { }
+  S2(S2 &s2):a(s2.a) { }
+  static float S2s; // expected-note 4 {{mappable type cannot contain static members}}
+  static const float S2sc; // expected-note 4 {{mappable type cannot contain static members}}
+};
+const float S2::S2sc = 0;
+const S2 b;
+const S2 ba[5];
+class S3 {
+  int a;
+public:
+  S3():a(0) { }
+  S3(S3 &s3):a(s3.a) { }
+};
+const S3 c;
+const S3 ca[5];
+extern const int f;
+class S4 {
+  int a;
+  S4();
+  S4(const S4 &s4);
+public:
+  S4(int v):a(v) { }
+};
+class S5 {
+  int a;
+  S5():a(0) {}
+  S5(const S5 &s5):a(s5.a) { }
+public:
+  S5(int v):a(v) { }
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+typedef int from;
+
+template <typename T, int I> // expected-note {{declared here}}
+T tmain(T argc) {
+  const T d = 5;
+  const T da[5] = { 0 };
+  S4 e(4);
+  S5 g(5);
+  T i, t[20];
+  T &j = i;
+  T *k = &j;
+  T x;
+  T y;
+  T to, tofrom, always;
+  const T (&l)[5] = da;
+
+
+#pragma omp target parallel map // expected-error {{expected '(' after 'map'}}
+  foo();
+#pragma omp target parallel map( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected expression}}
+  foo();
+#pragma omp target parallel map() // expected-error {{expected expression}}
+  foo();
+#pragma omp target parallel map(alloc) // expected-error {{use of undeclared identifier 'alloc'}}
+  foo();
+#pragma omp target parallel map(to argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected ',' or ')' in 'map' clause}}
+  foo();
+#pragma omp target parallel map(to:) // expected-error {{expected expression}}
+  foo();
+#pragma omp target parallel map(from: argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+#pragma omp target parallel map(x: y) // expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}}
+  foo();
+#pragma omp target parallel map(x)
+  foo();
+#pragma omp target parallel map(tofrom: t[:I])
+  foo();
+#pragma omp target parallel map(T: a) // expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}} expected-error {{incomplete type 'S1' where a complete type is required}}
+  foo();
+#pragma omp target parallel map(T) // expected-error {{'T' does not refer to a value}}
+  foo();
+#pragma omp target parallel map(I) // expected-error 2 {{expected expression containing only member accesses and/or array sections based on named variables}}
+  foo();
+#pragma omp target parallel map(S2::S2s)
+  foo();
+#pragma omp target parallel map(S2::S2sc)
+  foo();
+#pragma omp target parallel map(x)
+  foo();
+#pragma omp target parallel map(to: x)
+  foo();
+#pragma omp target parallel map(to: to)
+  foo();
+#pragma omp target parallel map(to)
+  foo();
+#pragma omp target parallel map(to, x)
+  foo();
+#pragma omp target parallel map(to x) // expected-error {{expected ',' or ')' in 'map' clause}}
+  foo();
+#pragma omp target parallel map(tofrom: argc > 0 ? x : y) // expected-error 2 {{expected expression containing only member accesses and/or array sections based on named variables}} 
+  foo();
+#pragma omp target parallel map(argc)
+  foo();
+#pragma omp target parallel map(S1) // expected-error {{'S1' does not refer to a value}}
+  foo();
+#pragma omp target parallel map(a, b, c, d, f) // expected-error {{incomplete type 'S1' where a complete type is required}} expected-error 2 {{type 'S2' is not mappable to target}}
+  foo();
+#pragma omp target parallel map(ba) // expected-error 2 {{type 'S2' is not mappable to target}}
+  foo();
+#pragma omp target parallel map(ca)
+  foo();
+#pragma omp target parallel map(da)
+  foo();
+#pragma omp target parallel map(S2::S2s)
+  foo();
+#pragma omp target parallel map(S2::S2sc)
+  foo();
+#pragma omp target parallel map(e, g)
+  foo();
+#pragma omp target parallel map(h) // expected-error {{threadprivate variables are not allowed in 'map' clause}}
+  foo();
+#pragma omp target parallel map(k), map(k) // expected-error 2 {{variable already marked as mapped in current construct}} expected-note 2 {{used here}}
+  foo();
+#pragma omp target parallel map(k), map(k[:5]) // expected-error 2 {{pointer cannot be mapped along with a section derived from itself}} expected-note 2 {{used here}}
+  foo();
+#pragma omp target parallel map(da)
+  foo();
+#pragma omp target parallel map(da[:4])
+  foo();
+#pragma omp target data map(k, j, l) // expected-note 2 {{used here}}
+#pragma omp target parallel map(k[:4]) // expected-error 2 {{pointer cannot be mapped along with a section derived from itself}}
+  foo();
+#pragma omp target parallel map(j)
+  foo();
+#pragma omp target parallel map(l) map(l[:5]) // expected-error 2 {{variable already marked as mapped in current construct}} expected-note 2 {{used here}}
+  foo();
+#pragma omp target data map(k[:4], j, l[:5]) // expected-note 4 {{used here}}
+{
+#pragma omp target parallel map(k) // expected-error 2 {{pointer cannot be mapped along with a section derived from itself}}
+  foo();
+#pragma omp target parallel map(j)
+  foo();
+#pragma omp target parallel map(l) // expected-error 2 {{original storage of expression in data environment is shared but data environment do not fully contain mapped expression storage}}
+  foo();
+}
+
+#pragma omp target parallel map(always, tofrom: x)
+  foo();
+#pragma omp target parallel map(always: x) // expected-error {{missing map type}}
+  foo();
+#pragma omp target parallel map(tofrom, always: x) // expected-error {{incorrect map type modifier, expected 'always'}} expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}}
+  foo();
+#pragma omp target parallel map(always, tofrom: always, tofrom, x)
+  foo();
+#pragma omp target parallel map(tofrom j) // expected-error {{expected ',' or ')' in 'map' clause}}
+  foo();
+
+  return 0;
+}
+
+int main(int argc, char **argv) {
+  const int d = 5;
+  const int da[5] = { 0 };
+  S4 e(4);
+  S5 g(5);
+  int i;
+  int &j = i;
+  int *k = &j;
+  int x;
+  int y;
+  int to, tofrom, always;
+  const int (&l)[5] = da;
+#pragma omp target parallel map // expected-error {{expected '(' after 'map'}}
+  foo();
+#pragma omp target parallel map( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected expression}}
+  foo();
+#pragma omp target parallel map() // expected-error {{expected expression}}
+  foo();
+#pragma omp target parallel map(alloc) // expected-error {{use of undeclared identifier 'alloc'}}
+  foo();
+#pragma omp target parallel map(to argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected ',' or ')' in 'map' clause}}
+  foo();
+#pragma omp target parallel map(to:) // expected-error {{expected expression}}
+  foo();
+#pragma omp target parallel map(from: argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+#pragma omp target parallel map(x: y) // expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}}
+  foo();
+#pragma omp target parallel map(x)
+  foo();
+#pragma omp target parallel map(to: x)
+  foo();
+#pragma omp target parallel map(to: to)
+  foo();
+#pragma omp target parallel map(to)
+  foo();
+#pragma omp target parallel map(to, x)
+  foo();
+#pragma omp target parallel map(to x) // expected-error {{expected ',' or ')' in 'map' clause}}
+  foo();
+#pragma omp target parallel map(tofrom: argc > 0 ? argv[1] : argv[2]) // expected-error {{expected expression containing only member accesses and/or array sections based on named variables}}
+  foo();
+#pragma omp target parallel map(argc)
+  foo();
+#pragma omp target parallel map(S1) // expected-error {{'S1' does not refer to a value}}
+  foo();
+#pragma omp target parallel map(a, b, c, d, f) // expected-error {{incomplete type 'S1' where a complete type is required}} expected-error 2 {{type 'S2' is not mappable to target}}
+  foo();
+#pragma omp target parallel map(argv[1])
+  foo();
+#pragma omp target parallel map(ba) // expected-error 2 {{type 'S2' is not mappable to target}}
+  foo();
+#pragma omp target parallel map(ca)
+  foo();
+#pragma omp target parallel map(da)
+  foo();
+#pragma omp target parallel map(S2::S2s)
+  foo();
+#pragma omp target parallel map(S2::S2sc)
+  foo();
+#pragma omp target parallel map(e, g)
+  foo();
+#pragma omp target parallel map(h) // expected-error {{threadprivate variables are not allowed in 'map' clause}}
+  foo();
+#pragma omp target parallel map(k), map(k) // expected-error {{variable already marked as mapped in current construct}} expected-note {{used here}}
+  foo();
+#pragma omp target parallel map(k), map(k[:5]) // expected-error {{pointer cannot be mapped along with a section derived from itself}} expected-note {{used here}}
+  foo();
+#pragma omp target parallel map(da)
+  foo();
+#pragma omp target parallel map(da[:4])
+  foo();
+#pragma omp target data map(k, j, l) // expected-note {{used here}}
+#pragma omp target parallel map(k[:4]) // expected-error {{pointer cannot be mapped along with a section derived from itself}}
+  foo();
+#pragma omp target parallel map(j)
+  foo();
+#pragma omp target parallel map(l) map(l[:5]) // expected-error 1 {{variable already marked as mapped in current construct}} expected-note 1 {{used here}}
+  foo();
+#pragma omp target data map(k[:4], j, l[:5]) // expected-note 2 {{used here}}
+{
+#pragma omp target parallel map(k) // expected-error {{pointer cannot be mapped along with a section derived from itself}}
+  foo();
+#pragma omp target parallel map(j)
+  foo();
+#pragma omp target parallel map(l) // expected-error {{original storage of expression in data environment is shared but data environment do not fully contain mapped expression storage}}
+  foo();
+}
+
+#pragma omp target parallel map(always, tofrom: x)
+  foo();
+#pragma omp target parallel map(always: x) // expected-error {{missing map type}}
+  foo();
+#pragma omp target parallel map(tofrom, always: x) // expected-error {{incorrect map type modifier, expected 'always'}} expected-error {{incorrect map type, expected one of 'to', 'from', 'tofrom', 'alloc', 'release', or 'delete'}}
+  foo();
+#pragma omp target parallel map(always, tofrom: always, tofrom, x)
+  foo();
+#pragma omp target parallel map(tofrom j) // expected-error {{expected ',' or ')' in 'map' clause}}
+  foo();
+
+  return tmain<int, 3>(argc)+tmain<from, 4>(argc); // expected-note {{in instantiation of function template specialization 'tmain<int, 3>' requested here}} expected-note {{in instantiation of function template specialization 'tmain<int, 4>' requested here}}
+}
+
diff --git a/test/OpenMP/target_parallel_messages.cpp b/test/OpenMP/target_parallel_messages.cpp
new file mode 100644
index 0000000000000..b6763d8f09e03
--- /dev/null
+++ b/test/OpenMP/target_parallel_messages.cpp
@@ -0,0 +1,73 @@
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 -o - %s
+// RUN: not %clang_cc1 -fopenmp -std=c++11 -fopenmp-targets=aaa-bbb-ccc-ddd -o - %s 2>&1 | FileCheck %s
+// CHECK: error: OpenMP target is invalid: 'aaa-bbb-ccc-ddd'
+
+void foo() {
+}
+
+static int pvt;
+#pragma omp threadprivate(pvt)
+
+#pragma omp target parallel // expected-error {{unexpected OpenMP directive '#pragma omp target parallel'}}
+
+int main(int argc, char **argv) {
+  #pragma omp target parallel { // expected-warning {{extra tokens at the end of '#pragma omp target parallel' are ignored}}
+  foo();
+  #pragma omp target parallel ( // expected-warning {{extra tokens at the end of '#pragma omp target parallel' are ignored}}
+  foo();
+  #pragma omp target parallel [ // expected-warning {{extra tokens at the end of '#pragma omp target parallel' are ignored}}
+  foo();
+  #pragma omp target parallel ] // expected-warning {{extra tokens at the end of '#pragma omp target parallel' are ignored}}
+  foo();
+  #pragma omp target parallel ) // expected-warning {{extra tokens at the end of '#pragma omp target parallel' are ignored}}
+  foo();
+  #pragma omp target parallel } // expected-warning {{extra tokens at the end of '#pragma omp target parallel' are ignored}}
+  foo();
+  #pragma omp target parallel
+  foo();
+  // expected-warning@+1 {{extra tokens at the end of '#pragma omp target parallel' are ignored}}
+  #pragma omp target parallel unknown()
+  foo();
+  L1:
+    foo();
+  #pragma omp target parallel
+  ;
+  #pragma omp target parallel
+  {
+    goto L1; // expected-error {{use of undeclared label 'L1'}}
+    argc++;
+  }
+
+  for (int i = 0; i < 10; ++i) {
+    switch(argc) {
+     case (0):
+      #pragma omp target parallel
+      {
+        foo();
+        break; // expected-error {{'break' statement not in loop or switch statement}}
+        continue; // expected-error {{'continue' statement not in loop statement}}
+      }
+      default:
+       break;
+    }
+  }
+
+  goto L2; // expected-error {{use of undeclared label 'L2'}}
+  #pragma omp target parallel
+  L2:
+  foo();
+  #pragma omp target parallel
+  {
+    return 1; // expected-error {{cannot return from OpenMP region}}
+  }
+
+  [[]] // expected-error {{an attribute list cannot appear here}}
+  #pragma omp target parallel
+  for (int n = 0; n < 100; ++n) {}
+
+  #pragma omp target parallel copyin(pvt) // expected-error {{unexpected OpenMP clause 'copyin' in directive '#pragma omp target parallel'}}
+  foo();
+
+  return 0;
+}
+
diff --git a/test/OpenMP/target_parallel_nowait_messages.cpp b/test/OpenMP/target_parallel_nowait_messages.cpp
new file mode 100644
index 0000000000000..91e26f20819f6
--- /dev/null
+++ b/test/OpenMP/target_parallel_nowait_messages.cpp
@@ -0,0 +1,17 @@
+// RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -verify -fopenmp -ferror-limit 100 -o - %s
+
+void foo() {
+}
+
+int main(int argc, char **argv) {
+  #pragma omp target parallel nowait( // expected-warning {{extra tokens at the end of '#pragma omp target parallel' are ignored}}
+  foo();
+  #pragma omp target parallel nowait (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel' are ignored}}
+  foo();
+  #pragma omp target parallel nowait device (-10u)
+  foo();
+  #pragma omp target parallel nowait (3.14) device (-10u) // expected-warning {{extra tokens at the end of '#pragma omp target parallel' are ignored}}
+  foo();
+
+  return 0;
+}
diff --git a/test/OpenMP/target_parallel_num_threads_messages.cpp b/test/OpenMP/target_parallel_num_threads_messages.cpp
new file mode 100644
index 0000000000000..95797caf29622
--- /dev/null
+++ b/test/OpenMP/target_parallel_num_threads_messages.cpp
@@ -0,0 +1,69 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+#define redef_num_threads(a, b) num_threads(a)
+
+template <class T, typename S, int N> // expected-note {{declared here}}
+T tmain(T argc, S **argv) {
+  #pragma omp target parallel num_threads // expected-error {{expected '(' after 'num_threads'}}
+  foo();
+  #pragma omp target parallel num_threads ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel num_threads () // expected-error {{expected expression}}
+  foo();
+  #pragma omp target parallel num_threads (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel num_threads (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel' are ignored}}
+  foo();
+  #pragma omp target parallel num_threads ((argc > 0) ? argv[1] : argv[2]) // expected-error 2 {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  foo();
+  #pragma omp target parallel num_threads (foobool(argc)), num_threads (true), num_threads (-5) // expected-error 2 {{directive '#pragma omp target parallel' cannot contain more than one 'num_threads' clause}} expected-error {{argument to 'num_threads' clause must be a strictly positive integer value}}
+  foo();
+  #pragma omp target parallel num_threads (S) // expected-error {{'S' does not refer to a value}}
+  foo();
+  #pragma omp target parallel num_threads (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error 2 {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  foo();
+  #pragma omp target parallel num_threads (argc)
+  foo();
+  #pragma omp target parallel num_threads (N) // expected-error {{argument to 'num_threads' clause must be a strictly positive integer value}}
+  foo();
+  #pragma omp target parallel redef_num_threads (argc, argc)
+  foo();
+
+  return argc;
+}
+
+int main(int argc, char **argv) {
+  #pragma omp target parallel num_threads // expected-error {{expected '(' after 'num_threads'}}
+  foo();
+  #pragma omp target parallel num_threads ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel num_threads () // expected-error {{expected expression}}
+  foo();
+  #pragma omp target parallel num_threads (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel num_threads (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target parallel' are ignored}}
+  foo();
+  #pragma omp target parallel num_threads (argc > 0 ? argv[1] : argv[2]) // expected-error {{integral }}
+  foo();
+  #pragma omp target parallel num_threads (foobool(argc)), num_threads (true), num_threads (-5) // expected-error 2 {{directive '#pragma omp target parallel' cannot contain more than one 'num_threads' clause}} expected-error {{argument to 'num_threads' clause must be a strictly positive integer value}}
+  foo();
+  #pragma omp target parallel num_threads (S1) // expected-error {{'S1' does not refer to a value}}
+  foo();
+  #pragma omp target parallel num_threads (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
+  foo();
+  #pragma omp target parallel num_threads (num_threads(tmain<int, char, -1>(argc, argv) // expected-error 2 {{expected ')'}} expected-note 2 {{to match this '('}} expected-note {{in instantiation of function template specialization 'tmain<int, char, -1>' requested here}}
+  foo();
+  #pragma omp target parallel redef_num_threads (argc, argc)
+  foo();
+
+  return tmain<int, char, 3>(argc, argv); // expected-note {{in instantiation of function template specialization 'tmain<int, char, 3>' requested here}}
+}
diff --git a/test/OpenMP/target_parallel_private_messages.cpp b/test/OpenMP/target_parallel_private_messages.cpp
new file mode 100644
index 0000000000000..fabd37d4b3950
--- /dev/null
+++ b/test/OpenMP/target_parallel_private_messages.cpp
@@ -0,0 +1,222 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+void foo() {
+}
+
+struct S1; // expected-note 2 {{declared here}} expected-note 2 {{forward declaration of 'S1'}} expected-note 1 {{forward declaration of 'S1'}} expected-note {{forward declaration of 'S1'}}
+extern S1 a;
+class S2 {
+  mutable int a;
+
+public:
+  S2() : a(0) {}
+  static float S2s; // expected-note {{static data member is predetermined as shared}} expected-note 1 {{static data member is predetermined as shared}}
+};
+const S2 b;
+const S2 ba[5];
+class S3 {
+  int a;
+
+public:
+  S3() : a(0) {}
+};
+const S3 c; // expected-note {{global variable is predetermined as shared}} expected-note 1 {{global variable is predetermined as shared}}
+const S3 ca[5]; // expected-note {{global variable is predetermined as shared}} expected-note 1 {{global variable is predetermined as shared}}
+extern const int f; // expected-note {{global variable is predetermined as shared}} expected-note 1 {{global variable is predetermined as shared}} 
+
+int threadvar;
+#pragma omp threadprivate(threadvar) // expected-note {{defined as threadprivate or thread local}} expected-note 1 {{defined as threadprivate or thread local}}
+
+class S4 {
+  int a;
+  S4(); // expected-note {{implicitly declared private here}} expected-note 1 {{implicitly declared private here}}
+
+public:
+  S4(int v) : a(v) {}
+};
+class S5 {
+  int a;
+  S5() : a(0) {} // expected-note {{implicitly declared private here}} expected-note 1 {{implicitly declared private here}}
+
+public:
+  S5(int v) : a(v) {}
+};
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}} expected-note 1 {{defined as threadprivate or thread local}} expected-note 2 {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class I, class C, class D, class E>
+int foomain(I argc, C **argv) {
+  const I d = 5; // expected-note {{constant variable is predetermined as shared}}
+  const I da[5] = { 0 }; // expected-note {{constant variable is predetermined as shared}}
+  D e(4);
+  E g[] = {5, 6};
+  I i;
+  I &j = i;
+#pragma omp target parallel private // expected-error {{expected '(' after 'private'}}
+{}
+#pragma omp target parallel private( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+{}
+#pragma omp target parallel private() // expected-error {{expected expression}}
+{}
+#pragma omp target parallel private(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+{}
+#pragma omp target parallel private(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+{}
+#pragma omp target parallel private(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+{}
+#pragma omp target parallel private(argc argv) // expected-error {{expected ',' or ')' in 'private' clause}}
+{}
+#pragma omp target parallel private(argc)
+{}
+#pragma omp target parallel private(S1) // expected-error {{'S1' does not refer to a value}}
+{}
+#pragma omp target parallel private(a, b) // expected-error {{private variable with incomplete type 'S1'}}
+{}
+#pragma omp target parallel private (a, b, c, d, f) // expected-error {{a private variable with incomplete type 'S1'}} expected-error 3 {{shared variable cannot be private}}
+{}
+#pragma omp target parallel private(argv[1]) // expected-error {{expected variable name}}
+{}
+#pragma omp target parallel private(ba)
+{}
+#pragma omp target parallel private(ca) // expected-error {{shared variable cannot be private}}
+{}
+#pragma omp target parallel private(da) // expected-error {{shared variable cannot be private}}
+{}
+#pragma omp target parallel private(S2::S2s) // expected-error {{shared variable cannot be private}}
+{}
+#pragma omp target parallel private(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
+{}
+  #pragma omp target parallel private(threadvar, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be private}}
+  {}
+  #pragma omp target parallel shared(i), private(i) // expected-error {{shared variable cannot be private}} expected-note {{defined as shared}}
+  foo();
+  #pragma omp target parallel firstprivate(i) private(i) // expected-error {{firstprivate variable cannot be private}} expected-note {{defined as firstprivate}}
+  foo();
+  #pragma omp target parallel private(i)
+  {}
+  #pragma omp target parallel private(j)
+  foo();
+  #pragma omp parallel firstprivate(i)
+  for (int k = 0; k < 10; ++k) {
+    #pragma omp target parallel private(i)
+    foo();
+  }
+  static int m;
+  #pragma omp target parallel private(m) // OK
+  foo();
+#pragma omp target parallel private(h) // expected-error {{threadprivate or thread local variable cannot be private}}
+{}
+#pragma omp target parallel private(B::x) // expected-error {{threadprivate or thread local variable cannot be private}}
+#pragma omp parallel
+  {
+    int v = 0;
+    int i;
+  }
+#pragma omp target parallel shared(i)
+{}
+#pragma omp target parallel private(i)
+{}
+#pragma omp target parallel private(j)
+{}
+#pragma omp target parallel private(i)
+{}
+  static int si;
+#pragma omp target parallel private(si) // OK
+  {}
+  return 0;
+}
+
+void bar(S4 a[2]) {
+#pragma omp parallel
+#pragma omp target parallel private(a)
+  {}
+}
+
+int main(int argc, char **argv) {
+  const int d = 5; // expected-note {{constant variable is predetermined as shared}}
+  const int da[5] = { 0 }; // expected-note {{constant variable is predetermined as shared}}
+  S4 e(4);
+  S5 g[] = {5, 6};
+  int i;
+  int &j = i;
+#pragma omp target parallel private // expected-error {{expected '(' after 'private'}}
+{}
+#pragma omp target parallel private( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+{}
+#pragma omp target parallel private() // expected-error {{expected expression}}
+{}
+#pragma omp target parallel private(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+{}
+#pragma omp target parallel private(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+{}
+#pragma omp target parallel private(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+{}
+#pragma omp target parallel private(argc argv) // expected-error {{expected ',' or ')' in 'private' clause}}
+{}
+#pragma omp target parallel private(argc)
+{}
+#pragma omp target parallel private(S1) // expected-error {{'S1' does not refer to a value}}
+{}
+#pragma omp target parallel private(a, b) // expected-error {{private variable with incomplete type 'S1'}}
+{}
+#pragma omp target parallel private (a, b, c, d, f) // expected-error {{a private variable with incomplete type 'S1'}} expected-error 3 {{shared variable cannot be private}}
+{}
+#pragma omp target parallel private(argv[1]) // expected-error {{expected variable name}}
+{}
+#pragma omp target parallel private(ba)
+{}
+#pragma omp target parallel private(ca) // expected-error {{shared variable cannot be private}}
+{}
+#pragma omp target parallel private(da) // expected-error {{shared variable cannot be private}}
+{}
+#pragma omp target parallel private(S2::S2s) // expected-error {{shared variable cannot be private}}
+{}
+#pragma omp target parallel private(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
+{}
+  #pragma omp target parallel private(threadvar, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be private}}
+  {}
+  #pragma omp target parallel shared(i), private(i) // expected-error {{shared variable cannot be private}} expected-note {{defined as shared}}
+  foo();
+  #pragma omp target parallel firstprivate(i) private(i) // expected-error {{firstprivate variable cannot be private}} expected-note {{defined as firstprivate}}
+  foo();
+  #pragma omp target parallel private(i)
+  {}
+  #pragma omp target parallel private(j)
+  foo();
+  #pragma omp parallel firstprivate(i)
+  for (int k = 0; k < 10; ++k) {
+    #pragma omp target parallel private(i)
+    foo();
+  }
+  static int m;
+  #pragma omp target parallel private(m) // OK
+  foo();
+#pragma omp target parallel private(h) // expected-error {{threadprivate or thread local variable cannot be private}}
+{}
+#pragma omp target parallel private(B::x) // expected-error {{threadprivate or thread local variable cannot be private}}
+#pragma omp parallel
+  {
+    int i;
+  }
+#pragma omp target parallel shared(i)
+{}
+#pragma omp target parallel private(i)
+{}
+#pragma omp target parallel private(j)
+{}
+#pragma omp target parallel private(i)
+{}
+  static int si;
+#pragma omp target parallel private(si) // OK
+  {}
+  return foomain<int, char, S4, S5>(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<int, char, S4, S5>' requested here}}
+}
+
diff --git a/test/OpenMP/target_parallel_proc_bind_messages.cpp b/test/OpenMP/target_parallel_proc_bind_messages.cpp
new file mode 100644
index 0000000000000..56292ad437032
--- /dev/null
+++ b/test/OpenMP/target_parallel_proc_bind_messages.cpp
@@ -0,0 +1,27 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -o - %s
+
+void foo();
+
+int main(int argc, char **argv) {
+  #pragma omp target parallel proc_bind // expected-error {{expected '(' after 'proc_bind'}}
+  foo();
+  #pragma omp target parallel proc_bind ( // expected-error {{expected 'master', 'close' or 'spread' in OpenMP clause 'proc_bind'}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel proc_bind () // expected-error {{expected 'master', 'close' or 'spread' in OpenMP clause 'proc_bind'}}
+  foo();
+  #pragma omp target parallel proc_bind (master // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel proc_bind (close), proc_bind(spread) // expected-error {{directive '#pragma omp target parallel' cannot contain more than one 'proc_bind' clause}}
+  foo();
+  #pragma omp target parallel proc_bind (x) // expected-error {{expected 'master', 'close' or 'spread' in OpenMP clause 'proc_bind'}}
+  foo();
+
+  #pragma omp target parallel proc_bind(master)
+  ++argc;
+
+  #pragma omp target parallel proc_bind(close)
+  foo();
+  #pragma omp target parallel proc_bind(spread)
+  ++argc;
+  return 0;
+}
diff --git a/test/OpenMP/target_parallel_reduction_messages.cpp b/test/OpenMP/target_parallel_reduction_messages.cpp
new file mode 100644
index 0000000000000..c9434e76245b9
--- /dev/null
+++ b/test/OpenMP/target_parallel_reduction_messages.cpp
@@ -0,0 +1,263 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 150 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 -ferror-limit 150 -o - %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 -ferror-limit 150 -o - %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}} expected-note 4 {{forward declaration of 'S1'}}
+extern S1 a;
+class S2 {
+  mutable int a;
+  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 3 {{implicitly declared private here}}
+
+public:
+  S2() : a(0) {}
+  S2(S2 &s2) : a(s2.a) {}
+  static float S2s; // expected-note 2 {{static data member is predetermined as shared}}
+  static const float S2sc;
+};
+const float S2::S2sc = 0; // expected-note 2 {{'S2sc' defined here}}
+S2 b;                     // expected-note 3 {{'b' defined here}}
+const S2 ba[5];           // expected-note 2 {{'ba' defined here}}
+class S3 {
+  int a;
+
+public:
+  int b;
+  S3() : a(0) {}
+  S3(const S3 &s3) : a(s3.a) {}
+  S3 operator+(const S3 &arg1) { return arg1; }
+};
+int operator+(const S3 &arg1, const S3 &arg2) { return 5; }
+S3 c;               // expected-note 3 {{'c' defined here}}
+const S3 ca[5];     // expected-note 2 {{'ca' defined here}}
+extern const int f; // expected-note 4 {{'f' declared here}}
+class S4 {
+  int a;
+  S4(); // expected-note {{implicitly declared private here}}
+  S4(const S4 &s4);
+  S4 &operator+(const S4 &arg) { return (*this); }
+
+public:
+  S4(int v) : a(v) {}
+};
+S4 &operator&=(S4 &arg1, S4 &arg2) { return arg1; }
+class S5 {
+  int a;
+  S5() : a(0) {} // expected-note {{implicitly declared private here}}
+  S5(const S5 &s5) : a(s5.a) {}
+  S5 &operator+(const S5 &arg);
+
+public:
+  S5(int v) : a(v) {}
+};
+class S6 { // expected-note 3 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
+#if __cplusplus >= 201103L // C++11 or later
+// expected-note@-2 3 {{candidate function (the implicit move assignment operator) not viable}}
+#endif
+  int a;
+
+public:
+  S6() : a(6) {}
+  operator int() { return 6; }
+} o;
+
+S3 h, k;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class T>       // expected-note {{declared here}}
+T tmain(T argc) {
+  const T d = T();       // expected-note 4 {{'d' defined here}}
+  const T da[5] = {T()}; // expected-note 2 {{'da' defined here}}
+  T qa[5] = {T()};
+  T i;
+  T &j = i;                    // expected-note 4 {{'j' defined here}}
+  S3 &p = k;                   // expected-note 2 {{'p' defined here}}
+  const T &r = da[(int)i];     // expected-note 2 {{'r' defined here}}
+  T &q = qa[(int)i];           // expected-note 2 {{'q' defined here}}
+  T fl;
+#pragma omp target parallel reduction // expected-error {{expected '(' after 'reduction'}}
+  foo();
+#pragma omp target parallel reduction + // expected-error {{expected '(' after 'reduction'}} expected-warning {{extra tokens at the end of '#pragma omp target parallel' are ignored}}
+  foo();
+#pragma omp target parallel reduction( // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+#pragma omp target parallel reduction(- // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+#pragma omp target parallel reduction() // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  foo();
+#pragma omp target parallel reduction(*) // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}}
+  foo();
+#pragma omp target parallel reduction(\) // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  foo();
+#pragma omp target parallel reduction(& : argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  foo();
+#pragma omp target parallel reduction(| : argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  foo();
+#pragma omp target parallel reduction(|| : argc ? i : argc) // expected-error 2 {{expected variable name, array element or array section}}
+  foo();
+#pragma omp target parallel reduction(foo : argc) //expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'float'}} expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'int'}}
+  foo();
+#pragma omp target parallel reduction(&& : argc)
+  foo();
+#pragma omp target parallel reduction(^ : T) // expected-error {{'T' does not refer to a value}}
+  foo();
+#pragma omp target parallel reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified list item cannot be reduction}} expected-error 2 {{'operator+' is a private member of 'S2'}}
+  foo();
+#pragma omp target parallel reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 4 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 3 {{const-qualified list item cannot be reduction}}
+  foo();
+#pragma omp target parallel reduction(max : h.b) // expected-error {{expected variable name, array element or array section}}
+  foo();
+#pragma omp target parallel reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
+  foo();
+#pragma omp target parallel reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
+  foo();
+#pragma omp target parallel reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}} expected-error {{const-qualified list item cannot be reduction}}
+  foo();
+#pragma omp target parallel reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  foo();
+#pragma omp target parallel reduction(&& : S2::S2s) // expected-error {{shared variable cannot be reduction}}
+  foo();
+#pragma omp target parallel reduction(&& : S2::S2sc) // expected-error {{const-qualified list item cannot be reduction}}
+  foo();
+#pragma omp target parallel reduction(+ : h, k) // expected-error {{threadprivate or thread local variable cannot be reduction}}
+  foo();
+#pragma omp target parallel reduction(+ : o) // expected-error 2 {{no viable overloaded '='}}
+  foo();
+#pragma omp target parallel private(i), reduction(+ : j), reduction(+ : q) // expected-error 4 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  foo();
+#pragma omp parallel private(k)
+#pragma omp target parallel reduction(+ : p), reduction(+ : p) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  foo();
+#pragma omp target parallel reduction(+ : p), reduction(+ : p) // expected-error 2 {{variable can appear only once in OpenMP 'reduction' clause}} expected-note 2 {{previously referenced here}}
+  foo();
+#pragma omp target parallel reduction(+ : r) // expected-error 2 {{const-qualified list item cannot be reduction}}
+  foo();
+#pragma omp target parallel shared(i)
+  foo();
+#pragma omp parallel reduction(min : i)
+#pragma omp target parallel reduction(max : j) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  foo();
+#pragma omp target parallel
+#pragma omp for private(fl)
+  for (int i = 0; i < 10; ++i)
+  {}
+#pragma omp target parallel reduction(+ : fl)
+    foo();
+#pragma omp target parallel
+#pragma omp for reduction(- : fl)
+  for (int i = 0; i < 10; ++i)
+  {}
+#pragma omp target parallel reduction(+ : fl)
+    foo();
+
+  return T();
+}
+
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
+int main(int argc, char **argv) {
+  const int d = 5;       // expected-note 2 {{'d' defined here}}
+  const int da[5] = {0}; // expected-note {{'da' defined here}}
+  int qa[5] = {0};
+  S4 e(4);
+  S5 g(5);
+  int i;
+  int &j = i;                  // expected-note 2 {{'j' defined here}}
+  S3 &p = k;                   // expected-note 2 {{'p' defined here}}
+  const int &r = da[i];        // expected-note {{'r' defined here}}
+  int &q = qa[i];              // expected-note {{'q' defined here}}
+  float fl;
+#pragma omp target parallel reduction // expected-error {{expected '(' after 'reduction'}}
+  foo();
+#pragma omp target parallel reduction + // expected-error {{expected '(' after 'reduction'}} expected-warning {{extra tokens at the end of '#pragma omp target parallel' are ignored}}
+  foo();
+#pragma omp target parallel reduction( // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+#pragma omp target parallel reduction(- // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+#pragma omp target parallel reduction() // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  foo();
+#pragma omp target parallel reduction(*) // expected-warning {{missing ':' after reduction identifier - ignoring}} expected-error {{expected expression}}
+  foo();
+#pragma omp target parallel reduction(\) // expected-error {{expected unqualified-id}} expected-warning {{missing ':' after reduction identifier - ignoring}}
+  foo();
+#pragma omp target parallel reduction(foo : argc // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max'}}
+  foo();
+#pragma omp target parallel reduction(| : argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+#pragma omp target parallel reduction(|| : argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name, array element or array section}}
+  foo();
+#pragma omp target parallel reduction(~ : argc) // expected-error {{expected unqualified-id}}
+  foo();
+#pragma omp target parallel reduction(&& : argc)
+  foo();
+#pragma omp target parallel reduction(^ : S1) // expected-error {{'S1' does not refer to a value}}
+  foo();
+#pragma omp target parallel reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{const-qualified list item cannot be reduction}} expected-error {{'operator+' is a private member of 'S2'}}
+  foo();
+#pragma omp target parallel reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 2 {{const-qualified list item cannot be reduction}}
+  foo();
+#pragma omp target parallel reduction(max : h.b) // expected-error {{expected variable name, array element or array section}}
+  foo();
+#pragma omp target parallel reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
+  foo();
+#pragma omp target parallel reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
+  foo();
+#pragma omp target parallel reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}}
+  foo();
+#pragma omp target parallel reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
+  foo();
+#pragma omp target parallel reduction(&& : S2::S2s) // expected-error {{shared variable cannot be reduction}}
+  foo();
+#pragma omp target parallel reduction(&& : S2::S2sc) // expected-error {{const-qualified list item cannot be reduction}}
+  foo();
+#pragma omp target parallel reduction(& : e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{nvalid operands to binary expression ('S4' and 'S4')}} expected-error {{calling a private constructor of class 'S5'}} expected-error {{invalid operands to binary expression ('S5' and 'S5')}}
+  foo();
+#pragma omp target parallel reduction(+ : h, k, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be reduction}}
+  foo();
+#pragma omp target parallel reduction(+ : o) // expected-error {{no viable overloaded '='}}
+  foo();
+#pragma omp target parallel private(i), reduction(+ : j), reduction(+ : q) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  foo();
+#pragma omp parallel private(k)
+#pragma omp target parallel reduction(+ : p), reduction(+ : p) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  foo();
+#pragma omp target parallel reduction(+ : p), reduction(+ : p) // expected-error {{variable can appear only once in OpenMP 'reduction' clause}} expected-note {{previously referenced here}}
+  foo();
+#pragma omp target parallel reduction(+ : r) // expected-error {{const-qualified list item cannot be reduction}}
+  foo();
+#pragma omp target parallel shared(i)
+  foo();
+#pragma omp parallel reduction(min : i)
+#pragma omp target parallel reduction(max : j) // expected-error {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
+  foo();
+#pragma omp target parallel
+#pragma omp for private(fl)
+  for (int i = 0; i < 10; ++i)
+  {}
+#pragma omp target parallel reduction(+ : fl)
+    foo();
+#pragma omp target parallel
+#pragma omp for reduction(- : fl)
+  for (int i = 0; i < 10; ++i)
+  {}
+#pragma omp target parallel reduction(+ : fl)
+    foo();
+  static int m;
+#pragma omp target parallel reduction(+ : m) // OK
+  m++;
+
+  return tmain(argc) + tmain(fl); // expected-note {{in instantiation of function template specialization 'tmain<int>' requested here}} expected-note {{in instantiation of function template specialization 'tmain<float>' requested here}}
+}
diff --git a/test/OpenMP/target_parallel_shared_messages.cpp b/test/OpenMP/target_parallel_shared_messages.cpp
new file mode 100644
index 0000000000000..302a09239d341
--- /dev/null
+++ b/test/OpenMP/target_parallel_shared_messages.cpp
@@ -0,0 +1,110 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+extern S1 a;
+class S2 {
+  mutable int a;
+public:
+  S2():a(0) { }
+  S2(S2 &s2):a(s2.a) { }
+};
+const S2 b;
+const S2 ba[5];
+class S3 {
+  int a;
+public:
+  S3():a(0) { }
+  S3(S3 &s3):a(s3.a) { }
+};
+const S3 c;
+const S3 ca[5];
+extern const int f;
+class S4 {
+  int a;
+  S4();
+  S4(const S4 &s4);
+public:
+  S4(int v):a(v) { }
+};
+class S5 {
+  int a;
+  S5():a(0) {}
+  S5(const S5 &s5):a(s5.a) { }
+public:
+  S5(int v):a(v) { }
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note {{defined as threadprivate or thread local}}
+
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
+int main(int argc, char **argv) {
+  const int d = 5;
+  const int da[5] = { 0 };
+  S4 e(4);
+  S5 g(5);
+  int i;
+  int &j = i;
+  #pragma omp target parallel shared // expected-error {{expected '(' after 'shared'}}
+  foo();
+  #pragma omp target parallel shared ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel shared () // expected-error {{expected expression}}
+  foo();
+  #pragma omp target parallel shared (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel shared (argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  foo();
+  #pragma omp target parallel shared (argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+  foo();
+  #pragma omp target parallel shared (argc)
+  foo();
+  #pragma omp target parallel shared (S1) // expected-error {{'S1' does not refer to a value}}
+  foo();
+  #pragma omp target parallel shared (a, b, c, d, f)
+  foo();
+  #pragma omp target parallel shared (argv[1]) // expected-error {{expected variable name}}
+  foo();
+  #pragma omp target parallel shared(ba)
+  foo();
+  #pragma omp target parallel shared(ca)
+  foo();
+  #pragma omp target parallel shared(da)
+  foo();
+  #pragma omp target parallel shared(e, g)
+  foo();
+  #pragma omp target parallel shared(h, B::x) // expected-error 2 {{threadprivate or thread local variable cannot be shared}}
+  foo();
+  #pragma omp target parallel private(i), shared(i) // expected-error {{private variable cannot be shared}} expected-note {{defined as private}}
+  foo();
+  #pragma omp target parallel firstprivate(i), shared(i) // expected-error {{firstprivate variable cannot be shared}} expected-note {{defined as firstprivate}}
+  foo();
+  #pragma omp target parallel private(i)
+  foo();
+  #pragma omp target parallel shared(i)
+  foo();
+  #pragma omp target parallel shared(j)
+  foo();
+  #pragma omp target parallel firstprivate(i)
+  foo();
+  #pragma omp target parallel shared(i)
+  foo();
+  #pragma omp target parallel shared(j)
+  foo();
+
+  return 0;
+}
diff --git a/test/OpenMP/target_private_codegen.cpp b/test/OpenMP/target_private_codegen.cpp
new file mode 100644
index 0000000000000..5c738ee5930f9
--- /dev/null
+++ b/test/OpenMP/target_private_codegen.cpp
@@ -0,0 +1,264 @@
+// Only test codegen on target side, as private clause does not require any action on the host side
+// Test target codegen - host bc file has to be created first.
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCHECK --check-prefix TCHECK-32
+
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+template<typename tx, typename ty>
+struct TT{
+  tx X;
+  ty Y;
+};
+
+// TCHECK: [[TT:%.+]] = type { i64, i8 }
+// TCHECK: [[S1:%.+]] = type { double }
+
+int foo(int n) {
+  int a = 0;
+  short aa = 0;
+  float b[10];
+  float bn[n];
+  double c[5][10];
+  double cn[5][n];
+  TT<long long, char> d;
+
+  #pragma omp target private(a)
+  {
+  }
+
+  // TCHECK:  define void @__omp_offloading_{{.+}}()
+  // TCHECK:  [[A:%.+]] = alloca i{{[0-9]+}},
+  // TCHECK-NOT: store {{.+}}, {{.+}} [[A]],
+  // TCHECK:  ret void  
+
+#pragma omp target private(a)
+  {
+    a = 1;
+  }
+
+  // TCHECK:  define void @__omp_offloading_{{.+}}()
+  // TCHECK:  [[A:%.+]] = alloca i{{[0-9]+}},
+  // TCHECK:  store i{{[0-9]+}} 1, i{{[0-9]+}}* [[A]],
+  // TCHECK:  ret void
+  
+  #pragma omp target private(a, aa)
+  {
+    a = 1;
+    aa = 1;
+  }
+
+  // TCHECK:  define void @__omp_offloading_{{.+}}()
+  // TCHECK:  [[A:%.+]] = alloca i{{[0-9]+}},
+  // TCHECK:  [[A2:%.+]] = alloca i{{[0-9]+}},
+  // TCHECK:  store i{{[0-9]+}} 1, i{{[0-9]+}}* [[A]],
+  // TCHECK:  store i{{[0-9]+}} 1, i{{[0-9]+}}* [[A2]],
+  // TCHECK:  ret void
+
+  #pragma omp target private(a, b, bn, c, cn, d)
+  {
+    a = 1;
+    b[2] = 1.0;
+    bn[3] = 1.0;
+    c[1][2] = 1.0;
+    cn[1][3] = 1.0;
+    d.X = 1;
+    d.Y = 1;
+  }
+  // make sure that private variables are generated in all cases and that we use those instances for operations inside the
+  // target region
+  // TCHECK:  define void @__omp_offloading_{{.+}}(i{{[0-9]+}} [[VLA:%.+]], i{{[0-9]+}} [[VLA1:%.+]], i{{[0-9]+}} [[VLA3:%.+]])
+  // TCHECK:  [[VLA_ADDR:%.+]] = alloca i{{[0-9]+}},
+  // TCHECK:  [[VLA_ADDR2:%.+]] = alloca i{{[0-9]+}},
+  // TCHECK:  [[VLA_ADDR4:%.+]] = alloca i{{[0-9]+}},
+  // TCHECK:  [[A:%.+]] = alloca i{{[0-9]+}},
+  // TCHECK:  [[B:%.+]] = alloca [10 x float],
+  // TCHECK:  [[SSTACK:%.+]] = alloca i8*,
+  // TCHECK:  [[C:%.+]] = alloca [5 x [10 x double]],
+  // TCHECK:  [[D:%.+]] = alloca [[TT]],
+  // TCHECK:  store i{{[0-9]+}} [[VLA]], i{{[0-9]+}}* [[VLA_ADDR]],
+  // TCHECK:  store i{{[0-9]+}} [[VLA1]], i{{[0-9]+}}* [[VLA_ADDR2]],
+  // TCHECK:  store i{{[0-9]+}} [[VLA3]], i{{[0-9]+}}* [[VLA_ADDR4]],
+  // TCHECK:  [[VLA_ADDR_REF:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[VLA_ADDR]],
+  // TCHECK:  [[VLA_ADDR_REF2:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[VLA_ADDR2]],
+  // TCHECK:  [[VLA_ADDR_REF4:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[VLA_ADDR4]],
+  // TCHECK:  [[RET_STACK:%.+]] = call i8* @llvm.stacksave()
+  // TCHECK:  store i8* [[RET_STACK]], i8** [[SSTACK]],
+  // TCHECK:  [[VLA5:%.+]] = alloca float, i{{[0-9]+}} [[VLA_ADDR_REF]],
+  // TCHECK:  [[VLA6_SIZE:%.+]] = mul{{.+}} i{{[0-9]+}} [[VLA_ADDR_REF2]], [[VLA_ADDR_REF4]]
+  // TCHECK:  [[VLA6:%.+]] = alloca double, i{{[0-9]+}} [[VLA6_SIZE]],
+
+  // a = 1
+  // TCHECK:  store i{{[0-9]+}} 1, i{{[0-9]+}}* [[A]],
+
+  // b[2] = 1.0
+  // TCHECK:  [[B_GEP:%.+]] = getelementptr inbounds [10 x float], [10 x float]* [[B]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+  // TCHECK:  store float 1.0{{.*}}, float* [[B_GEP]],
+  
+  // bn[3] = 1.0
+  // TCHECK:  [[BN_GEP:%.+]] = getelementptr inbounds float, float* [[VLA5]], i{{[0-9]+}} 3
+  // TCHECK:  store float 1.0{{.*}}, float* [[BN_GEP]],
+
+  // c[1][2] = 1.0
+  // TCHECK:  [[C_GEP1:%.+]] = getelementptr inbounds [5 x [10 x double]], [5 x [10 x double]]* [[C]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+  // TCHECK:  [[C_GEP2:%.+]] = getelementptr inbounds [10 x double], [10 x double]* [[C_GEP1]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+  // TCHECK:  store double 1.0{{.*}}, double* [[C_GEP2]],
+
+  // cn[1][3] = 1.0
+  // TCHECK:  [[CN_IND:%.+]] = mul{{.+}} i{{[0-9]+}} 1, [[VLA_ADDR_REF4]]
+  // TCHECK:  [[CN_GEP_IND:%.+]] = getelementptr inbounds double, double* [[VLA6]], i{{[0-9]+}} [[CN_IND]]
+  // TCHECK:  [[CN_GEP_3:%.+]] = getelementptr inbounds double, double* [[CN_GEP_IND]], i{{[0-9]+}} 3
+  // TCHECK:  store double 1.0{{.*}}, double* [[CN_GEP_3]],
+
+  // d.X = 1
+  // [[X_FIELD:%.+]] = getelementptr inbounds [[TT]] [[TT]]* [[D]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+  // store i{{[0-9]+}} 1, i{{[0-9]+}}* [[X_FIELD]],
+  
+  // d.Y = 1
+  // [[Y_FIELD:%.+]] = getelementptr inbounds [[TT]] [[TT]]* [[D]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+  // store i{{[0-9]+}} 1, i{{[0-9]+}}* [[Y_FIELD]],
+
+  // finish
+  // [[RELOAD_SSTACK:%.+]] = load i8*, i8** [[SSTACK]],
+  // call ovid @llvm.stackrestore(i8* [[RELOAD_SSTACK]])
+  // ret void
+
+  return a;
+}
+
+
+template<typename tx>
+tx ftemplate(int n) {
+  tx a = 0;
+  short aa = 0;
+  tx b[10];
+
+#pragma omp target private(a,aa,b)
+  {
+    a = 1;
+    aa = 1;
+    b[2] = 1;
+  }
+
+  return a;
+}
+
+static
+int fstatic(int n) {
+  int a = 0;
+  short aa = 0;
+  char aaa = 0;
+  int b[10];
+
+#pragma omp target private(a,aa,aaa,b)
+  {
+    a = 1;
+    aa = 1;
+    aaa = 1;
+    b[2] = 1;
+  }
+
+  return a;
+}
+
+// TCHECK: define void @__omp_offloading_{{.+}}()
+// TCHECK:  [[A:%.+]] = alloca i{{[0-9]+}},
+// TCHECK:  [[A2:%.+]] = alloca i{{[0-9]+}},
+// TCHECK:  [[A3:%.+]] = alloca i{{[0-9]+}},
+// TCHECK:  [[B:%.+]] = alloca [10 x i{{[0-9]+}}],
+// TCHECK:  store i{{[0-9]+}} 1, i{{[0-9]+}}* [[A]],
+// TCHECK:  store i{{[0-9]+}} 1, i{{[0-9]+}}* [[A2]],
+// TCHECK:  store i{{[0-9]+}} 1, i{{[0-9]+}}* [[A3]],
+// TCHECK:  [[B_GEP:%.+]] = getelementptr inbounds [10 x i{{[0-9]+}}], [10 x i{{[0-9]+}}]* [[B]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// TCHECK:  store i{{[0-9]+}} 1, i{{[0-9]+}}* [[B_GEP]],
+// TCHECK:  ret void
+
+struct S1 {
+  double a;
+
+  int r1(int n){
+    int b = n+1;
+    short int c[2][n];
+
+#pragma omp target private(b,c)
+    {
+      this->a = (double)b + 1.5;
+      c[1][1] = ++a;
+    }
+
+    return c[1][1] + (int)b;
+  }
+
+  // TCHECK: define void @__omp_offloading_{{.+}}([[S1]]* [[TH:%.+]], i{{[0-9]+}} [[VLA:%.+]], i{{[0-9]+}} [[VLA1:%.+]])
+  // TCHECK: [[TH_ADDR:%.+]] = alloca [[S1]]*,
+  // TCHECK: [[VLA_ADDR:%.+]] = alloca i{{[0-9]+}},
+  // TCHECK: [[VLA_ADDR2:%.+]] = alloca i{{[0-9]+}},
+  // TCHECK: [[B:%.+]] = alloca i{{[0-9]+}},
+  // TCHECK: [[SSTACK:%.+]] = alloca i8*,
+  // TCHECK: store [[S1]]* [[TH]], [[S1]]** [[TH_ADDR]],
+  // TCHECK: store i{{[0-9]+}} [[VLA]], i{{[0-9]+}}* [[VLA_ADDR]],
+  // TCHECK: store i{{[0-9]+}} [[VLA1]], i{{[0-9]+}}* [[VLA_ADDR2]],
+  // TCHECK: [[TH_ADDR_REF:%.+]] = load [[S1]]*, [[S1]]** [[TH_ADDR]],
+  // TCHECK: [[VLA_ADDR_REF:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[VLA_ADDR]],
+  // TCHECK: [[VLA_ADDR_REF2:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[VLA_ADDR2]],
+  // TCHECK: [[RET_STACK:%.+]] = call i8* @llvm.stacksave()
+  // TCHECK: store i8* [[RET_STACK:%.+]], i8** [[SSTACK]],
+
+  // this->a = (double)b + 1.5;
+  // TCHECK: [[VLA_IND:%.+]] = mul{{.+}} i{{[0-9]+}} [[VLA_ADDR_REF]], [[VLA_ADDR_REF2]]
+  // TCHECK: [[VLA3:%.+]] = alloca i{{[0-9]+}}, i{{[0-9]+}} [[VLA_IND]],
+  // TCHECK: [[B_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[B]],
+  // TCHECK: [[B_CONV:%.+]] = sitofp i{{[0-9]+}} [[B_VAL]] to double
+  // TCHECK: [[NEW_A_VAL:%.+]] = fadd double [[B_CONV]], 1.5{{.+}}+00
+  // TCHECK: [[A_FIELD:%.+]] = getelementptr inbounds [[S1]], [[S1]]* [[TH_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+  // TCHECK: store double [[NEW_A_VAL]], double* [[A_FIELD]],
+
+  // c[1][1] = ++a;
+  // TCHECK: [[A_FIELD4:%.+]] = getelementptr inbounds [[S1]], [[S1]]* [[TH_ADDR_REF]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+  // TCHECK: [[A_FIELD4_VAL:%.+]] = load double, double* [[A_FIELD4]],
+  // TCHECK: [[A_FIELD_INC:%.+]] = fadd double [[A_FIELD4_VAL]], 1.0{{.+}}+00
+  // TCHECK: store double [[A_FIELD_INC]], double* [[A_FIELD4]],
+  // TCHECK: [[A_FIELD_INC_CONV:%.+]] = fptosi double [[A_FIELD_INC]] to i{{[0-9]+}}
+  // TCHECK: [[C_IND:%.+]] = mul{{.+}} i{{[0-9]+}} 1, [[VLA_ADDR_REF2]]
+  // TCHECK: [[C_1_REF:%.+]] = getelementptr inbounds i{{[0-9]+}}, i{{[0-9]+}}* [[VLA3]], i{{[0-9]+}} [[C_IND]]
+  // TCHECK: [[C_1_1_REF:%.+]] = getelementptr inbounds i{{[0-9]+}}, i{{[0-9]+}}* [[C_1_REF]], i{{[0-9]+}} 1
+  // TCHECK: store i{{[0-9]+}} [[A_FIELD_INC_CONV]], i{{[0-9]+}}* [[C_1_1_REF]],
+
+  // finish
+  // TCHECK: [[RELOAD_SSTACK:%.+]] = load i8*, i8** [[SSTACK]],
+  // TCHECK: call void @llvm.stackrestore(i8* [[RELOAD_SSTACK]])
+  // TCHECK: ret void
+};
+
+
+int bar(int n){
+  int a = 0;
+  a += foo(n);
+  S1 S;
+  a += S.r1(n);
+  a += fstatic(n);
+  a += ftemplate<int>(n);
+
+  return a;
+}
+
+// template
+// TCHECK: define void @__omp_offloading_{{.+}}()
+// TCHECK: [[A:%.+]] = alloca i{{[0-9]+}},
+// TCHECK: [[A2:%.+]] = alloca i{{[0-9]+}},
+// TCHECK: [[B:%.+]] = alloca [10 x i{{[0-9]+}}],
+// TCHECK: store i{{[0-9]+}} 1, i{{[0-9]+}}* [[A]],
+// TCHECK: store i{{[0-9]+}} 1, i{{[0-9]+}}* [[A2]],
+// TCHECK: [[B_GEP:%.+]] = getelementptr inbounds [10 x i{{[0-9]+}}], [10 x i{{[0-9]+}}]* [[B]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// TCHECK: store i{{[0-9]+}} 1, i{{[0-9]+}}* [[B_GEP]],
+// TCHECK: ret void
+
+#endif
diff --git a/test/OpenMP/target_private_messages.cpp b/test/OpenMP/target_private_messages.cpp
new file mode 100644
index 0000000000000..a093a87056272
--- /dev/null
+++ b/test/OpenMP/target_private_messages.cpp
@@ -0,0 +1,198 @@
+// RUN: %clang_cc1 -verify -fopenmp %s
+
+struct S1; // expected-note 2 {{declared here}} expected-note 2 {{forward declaration of 'S1'}}
+extern S1 a;
+class S2 {
+  mutable int a;
+
+public:
+  S2() : a(0) {}
+};
+const S2 b;
+const S2 ba[5];
+class S3 {
+  int a;
+
+public:
+  S3() : a(0) {}
+};
+const S3 ca[5];
+class S4 {
+  int a;
+  S4(); // expected-note {{implicitly declared private here}}
+
+public:
+  S4(int v) : a(v) {
+#pragma omp target private(a) private(this->a)
+    for (int k = 0; k < v; ++k)
+      ++this->a;
+  }
+};
+class S5 {
+  int a;
+  S5() : a(0) {} // expected-note {{implicitly declared private here}}
+
+public:
+  S5(int v) : a(v) {}
+  S5 &operator=(S5 &s) {
+#pragma omp target private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T>
+class S6 {
+public:
+  T a;
+
+  S6() : a(0) {}
+  S6(T v) : a(v) {
+#pragma omp target private(a) private(this->a)
+    for (int k = 0; k < v; ++k)
+      ++this->a;
+  }
+  S6 &operator=(S6 &s) {
+#pragma omp target private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T>
+class S7 : public T {
+  T a;
+  S7() : a(0) {}
+
+public:
+  S7(T v) : a(v) {
+#pragma omp target private(a) private(this->a) private(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S7 &operator=(S7 &s) {
+#pragma omp target private(a) private(this->a) private(s.a) private(s.T::a) // expected-error 2 {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+template <class I, class C>
+int foomain(I argc, C **argv) {
+  I e(4);
+  I g(5);
+  int i;
+  int &j = i;
+#pragma omp target private // expected-error {{expected '(' after 'private'}}
+{}
+#pragma omp target private( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+{}
+#pragma omp target private() // expected-error {{expected expression}}
+{}
+#pragma omp target private(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+{}
+#pragma omp target private(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+{}
+#pragma omp target private(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+{}
+#pragma omp target private(argc)
+{}
+#pragma omp target private(S1) // expected-error {{'S1' does not refer to a value}}
+{}
+#pragma omp target private(a, b) // expected-error {{private variable with incomplete type 'S1'}}
+{}
+#pragma omp target private(argv[1]) // expected-error {{expected variable name}}
+{}
+#pragma omp target private(e, g)
+{}
+#pragma omp target private(h) // expected-error {{threadprivate or thread local variable cannot be private}}
+{}
+#pragma omp target shared(i) // expected-error {{unexpected OpenMP clause 'shared' in directive '#pragma omp target'}}
+#pragma omp parallel
+  {
+    int v = 0;
+    int i;
+  }
+#pragma omp parallel shared(i)
+#pragma omp parallel private(i)
+#pragma omp target private(j)
+{}
+#pragma omp target private(i)
+  {}
+  return 0;
+}
+
+void bar(S4 a[2]) {
+#pragma omp parallel
+#pragma omp target private(a)
+  {}
+}
+
+namespace A {
+double x;
+#pragma omp threadprivate(x) // expected-note {{defined as threadprivate or thread local}}
+}
+namespace B {
+using A::x;
+}
+
+int main(int argc, char **argv) {
+  S4 e(4);
+  S5 g(5);
+  S6<float> s6(0.0) , s6_0(1.0);
+  S7<S6<float> > s7(0.0) , s7_0(1.0);
+  int i;
+  int &j = i;
+#pragma omp target private // expected-error {{expected '(' after 'private'}}
+{}
+#pragma omp target private( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+{}
+#pragma omp target private() // expected-error {{expected expression}}
+{}
+#pragma omp target private(argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+{}
+#pragma omp target private(argc, // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+{}
+#pragma omp target private(argc > 0 ? argv[1] : argv[2]) // expected-error {{expected variable name}}
+{}
+#pragma omp target private(argc)
+{}
+#pragma omp target private(S1) // expected-error {{'S1' does not refer to a value}}
+{}
+#pragma omp target private(a, b) // expected-error {{private variable with incomplete type 'S1'}}
+{}
+#pragma omp target private(argv[1]) // expected-error {{expected variable name}}
+{}
+#pragma omp target private(e, g) // expected-error {{calling a private constructor of class 'S4'}} expected-error {{calling a private constructor of class 'S5'}}
+{}
+#pragma omp target private(h) // expected-error {{threadprivate or thread local variable cannot be private}}
+{}
+#pragma omp target private(B::x) // expected-error {{threadprivate or thread local variable cannot be private}}
+{}
+#pragma omp target shared(i) // expected-error {{unexpected OpenMP clause 'shared' in directive '#pragma omp target'}}
+#pragma omp parallel
+  {
+    int i;
+  }
+#pragma omp parallel shared(i)
+#pragma omp parallel private(i)
+#pragma omp target private(j)
+{}
+#pragma omp target private(i)
+  {}
+  static int si;
+#pragma omp target private(si) // OK
+  {}
+#pragma omp target map(i) private(i) // expected-error {{private variable cannot be in a map clause in '#pragma omp target' directive}}
+  {}
+  s6 = s6_0; // expected-note {{in instantiation of member function 'S6<float>::operator=' requested here}}
+  s7 = s7_0; // expected-note {{in instantiation of member function 'S7<S6<float> >::operator=' requested here}}
+  return foomain(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<int, char>' requested here}}
+}
+
diff --git a/test/OpenMP/target_update_ast_print.cpp b/test/OpenMP/target_update_ast_print.cpp
new file mode 100644
index 0000000000000..3a98f54a0770f
--- /dev/null
+++ b/test/OpenMP/target_update_ast_print.cpp
@@ -0,0 +1,52 @@
+// RUN: %clang_cc1 -verify -fopenmp -ast-print %s | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+void foo() {}
+
+template <class T, class U>
+T foo(T targ, U uarg) {
+  static T a;
+  U b;
+  int l;
+#pragma omp target update to(a) if(l>5) device(l) nowait depend(inout:l)
+
+#pragma omp target update from(b) if(l<5) device(l-1) nowait depend(inout:l)
+  return a + targ + (T)b;
+}
+// CHECK:      static int a;
+// CHECK-NEXT: float b;
+// CHECK-NEXT: int l;
+// CHECK-NEXT: #pragma omp target update to(a) if(l > 5) device(l) nowait depend(inout : l)
+// CHECK-NEXT: #pragma omp target update from(b) if(l < 5) device(l - 1) nowait depend(inout : l)
+// CHECK:      static char a;
+// CHECK-NEXT: float b;
+// CHECK-NEXT: int l;
+// CHECK-NEXT: #pragma omp target update to(a) if(l > 5) device(l) nowait depend(inout : l)
+// CHECK-NEXT: #pragma omp target update from(b) if(l < 5) device(l - 1) nowait depend(inout : l)
+// CHECK:      static T a;
+// CHECK-NEXT: U b;
+// CHECK-NEXT: int l;
+// CHECK-NEXT: #pragma omp target update to(a) if(l > 5) device(l) nowait depend(inout : l)
+// CHECK-NEXT: #pragma omp target update from(b) if(l < 5) device(l - 1) nowait depend(inout : l)
+
+int main(int argc, char **argv) {
+  static int a;
+  int n;
+  float f;
+
+// CHECK:      static int a;
+// CHECK-NEXT: int n;
+// CHECK-NEXT: float f;
+#pragma omp target update to(a) if(f>0.0) device(n) nowait depend(in:n)
+// CHECK-NEXT: #pragma omp target update to(a) if(f > 0.) device(n) nowait depend(in : n)
+#pragma omp target update from(f) if(f<0.0) device(n+1) nowait depend(in:n)
+// CHECK-NEXT: #pragma omp target update from(f) if(f < 0.) device(n + 1) nowait depend(in : n)
+  return foo(argc, f) + foo(argv[0][0], f) + a;
+}
+
+#endif
diff --git a/test/OpenMP/target_update_codegen.cpp b/test/OpenMP/target_update_codegen.cpp
new file mode 100644
index 0000000000000..f74ed49975374
--- /dev/null
+++ b/test/OpenMP/target_update_codegen.cpp
@@ -0,0 +1,245 @@
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+///==========================================================================///
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-64
+// RUN: %clang_cc1 -DCK1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK1 --check-prefix CK1-64
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK1 --check-prefix CK1-32
+// RUN: %clang_cc1 -DCK1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK1 --check-prefix CK1-32
+#ifdef CK1
+
+// CK1: [[ST:%.+]] = type { i32, double* }
+template <typename T>
+struct ST {
+  T a;
+  double *b;
+};
+
+ST<int> gb;
+double gc[100];
+
+// CK1: [[SIZE00:@.+]] = {{.+}}constant [1 x i[[sz:64|32]]] [i{{64|32}} 800]
+// CK1: [[MTYPE00:@.+]] = {{.+}}constant [1 x i32] [i32 34]
+
+// CK1: [[SIZE02:@.+]] = {{.+}}constant [1 x i[[sz]]] [i[[sz]] 4]
+// CK1: [[MTYPE02:@.+]] = {{.+}}constant [1 x i32] [i32 33]
+
+// CK1: [[MTYPE03:@.+]] = {{.+}}constant [1 x i32] [i32 34]
+
+// CK1: [[SIZE04:@.+]] = {{.+}}constant [2 x i[[sz]]] [i[[sz]] {{8|4}}, i[[sz]] 24]
+// CK1: [[MTYPE04:@.+]] = {{.+}}constant [2 x i32] [i32 33, i32 17]
+
+// CK1-LABEL: _Z3fooi
+void foo(int arg) {
+  int la;
+  float lb[arg];
+
+  // Region 00
+  // CK1-DAG: call void @__tgt_target_data_update(i32 [[DEV:%[^,]+]], i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE00]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE00]]{{.+}})
+  // CK1-DAG: [[DEV]] = load i32, i32* %{{[^,]+}},
+  // CK1-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK1-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK1-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: store i8* bitcast ([100 x double]* @gc to i8*), i8** [[BP0]]
+  // CK1-DAG: store i8* bitcast ([100 x double]* @gc to i8*), i8** [[P0]]
+
+  // CK1: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  #pragma omp target update if(1+3-5) device(arg) from(gc)
+  {++arg;}
+
+  // Region 01
+  // CK1: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  #pragma omp target update to(la) if(1+3-4)
+  {++arg;}
+
+  // Region 02
+  // CK1: br i1 %{{[^,]+}}, label %[[IFTHEN:[^,]+]], label %[[IFELSE:[^,]+]]
+  // CK1: [[IFTHEN]]
+  // CK1-DAG: call void @__tgt_target_data_update(i32 4, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[SIZE02]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE02]]{{.+}})
+  // CK1-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK1-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK1-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK1-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK1-DAG: [[CBPVAL0]] = bitcast i32* [[VAR0:%.+]] to i8*
+  // CK1-DAG: [[CPVAL0]] = bitcast i32* [[VAR0]] to i8*
+  // CK1: br label %[[IFEND:[^,]+]]
+
+  // CK1: [[IFELSE]]
+  // CK1: br label %[[IFEND]]
+  // CK1: [[IFEND]]
+  // CK1: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  #pragma omp target update to(arg) if(arg) device(4)
+  {++arg;}
+
+  // CK1: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  {++arg;}
+
+  // Region 03
+  // CK1-DAG: call void @__tgt_target_data_update(i32 -1, i32 1, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], i[[sz]]* [[GEPS:%.+]], {{.+}}getelementptr {{.+}}[1 x i{{.+}}]* [[MTYPE03]]{{.+}})
+  // CK1-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK1-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+  // CK1-DAG: [[GEPS]] = getelementptr inbounds {{.+}}[[S:%[^,]+]]
+
+  // CK1-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: [[S0:%.+]] = getelementptr inbounds {{.+}}[[S]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+  // CK1-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+  // CK1-DAG: store i[[sz]] [[CSVAL0:%[^,]+]], i[[sz]]* [[S0]]
+  // CK1-DAG: [[CBPVAL0]] = bitcast float* [[VAR0:%.+]] to i8*
+  // CK1-DAG: [[CPVAL0]] = bitcast float* [[VAR0]] to i8*
+  // CK1-DAG: [[CSVAL0]] = mul nuw i[[sz]] %{{[^,]+}}, 4
+  // CK1: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  // CK1-NOT: __tgt_target_data_end
+  #pragma omp target update from(lb)
+  {++arg;}
+
+  // CK1: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  {++arg;}
+
+  // Region 04
+  // CK1-DAG: call void @__tgt_target_data_update(i32 -1, i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE04]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE04]]{{.+}})
+  // CK1-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+  // CK1-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+  // CK1-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+  // CK1-DAG: store i8* bitcast ([[ST]]* @gb to i8*), i8** [[BP0]]
+  // CK1-DAG: store i8* bitcast (double** getelementptr inbounds ([[ST]], [[ST]]* @gb, i32 0, i32 1) to i8*), i8** [[P0]]
+
+
+  // CK1-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+  // CK1-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+  // CK1-DAG: store i8* bitcast (double** getelementptr inbounds ([[ST]], [[ST]]* @gb, i32 0, i32 1) to i8*), i8** [[BP1]]
+  // CK1-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+  // CK1-DAG: [[CPVAL1]] = bitcast double* [[SEC1:%.+]] to i8*
+  // CK1-DAG: [[SEC1]] = getelementptr inbounds {{.+}}double* [[SEC11:%[^,]+]], i{{.+}} 0
+  // CK1-DAG: [[SEC11]] = load double*, double** getelementptr inbounds ([[ST]], [[ST]]* @gb, i32 0, i32 1),
+
+  // CK1: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  // CK1-NOT: __tgt_target_data_end
+  #pragma omp target update to(gb.b[:3])
+  {++arg;}
+}
+#endif
+///==========================================================================///
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-64
+// RUN: %clang_cc1 -DCK2 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK2 --check-prefix CK2-64
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK2 --check-prefix CK2-32
+// RUN: %clang_cc1 -DCK2 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK2 --check-prefix CK2-32
+#ifdef CK2
+
+// CK2: [[ST:%.+]] = type { i32, double* }
+template <typename T>
+struct ST {
+  T a;
+  double *b;
+
+  T foo(T arg) {
+    // Region 00
+    #pragma omp target update from(b[1:3]) if(a>123) device(arg)
+    {arg++;}
+    return arg;
+  }
+};
+
+// CK2: [[SIZE00:@.+]] = {{.+}}constant [2 x i[[sz:64|32]]] [i{{64|32}} {{8|4}}, i{{64|32}} 24]
+// CK2: [[MTYPE00:@.+]] = {{.+}}constant [2 x i32] [i32 34, i32 18]
+
+// CK2-LABEL: _Z3bari
+int bar(int arg){
+  ST<int> A;
+  return A.foo(arg);
+}
+
+// Region 00
+// CK2: br i1 %{{[^,]+}}, label %[[IFTHEN:[^,]+]], label %[[IFELSE:[^,]+]]
+// CK2: [[IFTHEN]]
+// CK2-DAG: call void @__tgt_target_data_update(i32 [[DEV:%[^,]+]], i32 2, i8** [[GEPBP:%.+]], i8** [[GEPP:%.+]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[SIZE00]], {{.+}}getelementptr {{.+}}[2 x i{{.+}}]* [[MTYPE00]]{{.+}})
+// CK2-DAG: [[DEV]] = load i32, i32* %{{[^,]+}},
+// CK2-DAG: [[GEPBP]] = getelementptr inbounds {{.+}}[[BP:%[^,]+]]
+// CK2-DAG: [[GEPP]] = getelementptr inbounds {{.+}}[[P:%[^,]+]]
+
+// CK2-DAG: [[BP0:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 0
+// CK2-DAG: [[P0:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 0
+// CK2-DAG: store i8* [[CBPVAL0:%[^,]+]], i8** [[BP0]]
+// CK2-DAG: store i8* [[CPVAL0:%[^,]+]], i8** [[P0]]
+// CK2-DAG: [[CBPVAL0]] = bitcast [[ST]]* [[VAR0:%.+]] to i8*
+// CK2-DAG: [[CPVAL0]] = bitcast double** [[SEC0:%[^,]+]] to i8*
+// CK2-DAG: [[SEC0]] = getelementptr inbounds {{.*}}[[ST]]* [[VAR0]], i32 0, i32 1
+
+
+// CK2-DAG: [[BP1:%.+]] = getelementptr inbounds {{.+}}[[BP]], i{{.+}} 0, i{{.+}} 1
+// CK2-DAG: [[P1:%.+]] = getelementptr inbounds {{.+}}[[P]], i{{.+}} 0, i{{.+}} 1
+// CK2-DAG: store i8* [[CBPVAL1:%[^,]+]], i8** [[BP1]]
+// CK2-DAG: store i8* [[CPVAL1:%[^,]+]], i8** [[P1]]
+// CK2-DAG: [[CBPVAL1]] = bitcast double** [[SEC0]] to i8*
+// CK2-DAG: [[CPVAL1]] = bitcast double* [[SEC1:%[^,]+]] to i8*
+// CK2-DAG: [[SEC1]] = getelementptr inbounds {{.*}}double* [[SEC11:%[^,]+]], i{{.+}} 1
+// CK2-DAG: [[SEC11]] = load double*, double** [[SEC111:%[^,]+]],
+// CK2-DAG: [[SEC111]] = getelementptr inbounds {{.*}}[[ST]]* [[VAR0]], i32 0, i32 1
+
+// CK2: br label %[[IFEND:[^,]+]]
+
+// CK2: [[IFELSE]]
+// CK2: br label %[[IFEND]]
+// CK2: [[IFEND]]
+// CK2: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+#endif
+///==========================================================================///
+// RUN: %clang_cc1 -DCK3 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK3 --check-prefix CK3-64
+// RUN: %clang_cc1 -DCK3 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK3 --check-prefix CK3-64
+// RUN: %clang_cc1 -DCK3 -verify -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK3 --check-prefix CK3-32
+// RUN: %clang_cc1 -DCK3 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK3 --check-prefix CK3-32
+#ifdef CK3
+
+// CK3-LABEL: no_target_devices
+void no_target_devices(int arg) {
+  // CK3-NOT: tgt_target_data_update
+  // CK3: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  // CK3: ret
+  #pragma omp target update to(arg) if(arg) device(4)
+  {++arg;}
+}
+#endif
+///==========================================================================///
+// RUN: %clang_cc1 -DCK4 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CK4 --check-prefix CK4-64
+// RUN: %clang_cc1 -DCK4 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK4 --check-prefix CK4-64
+// RUN: %clang_cc1 -DCK4 -verify -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s  --check-prefix CK4 --check-prefix CK4-32
+// RUN: %clang_cc1 -DCK4 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-targets=i386-pc-linux-gnu -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s  --check-prefix CK4 --check-prefix CK4-32
+
+// RUN: %clang_cc1 -DCK4 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -DCK4 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix TCK4 --check-prefix TCK4-64
+// RUN: %clang_cc1 -DCK4 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCK4 --check-prefix TCK4-64
+// RUN: %clang_cc1 -DCK4 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -DCK4 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix TCK4 --check-prefix TCK4-32
+// RUN: %clang_cc1 -DCK4 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix TCK4 --check-prefix TCK4-32
+#ifdef CK4
+
+// CK4-LABEL: device_side_scan
+void device_side_scan(int arg) {
+  // CK4: tgt_target_data_update
+  // CK4: %{{.+}} = add nsw i32 %{{[^,]+}}, 1
+  // CK4: ret
+  // TCK4-NOT: tgt_target_data_update
+  #pragma omp target update from(arg) if(arg) device(4)
+  {++arg;}
+}
+#endif
+#endif
diff --git a/test/OpenMP/target_update_depend_messages.cpp b/test/OpenMP/target_update_depend_messages.cpp
new file mode 100644
index 0000000000000..fe3325d80fe47
--- /dev/null
+++ b/test/OpenMP/target_update_depend_messages.cpp
@@ -0,0 +1,112 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 -o - -std=c++11 %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note 2 {{declared here}}
+
+class vector {
+  public:
+    int operator[](int index) { return 0; }
+};
+
+template <class T, class S, class R>
+int tmain(T argc, S **argv, R *env[]) {
+  vector vec;
+  typedef float V __attribute__((vector_size(16)));
+  V a;
+  char *arr;
+  int i, z;
+
+  #pragma omp depend target update to(z) // expected-error{{expected an OpenMP directive}}
+  #pragma omp depend(out:argc) target update to(z) // expected-error{{expected an OpenMP directive}}
+  #pragma omp target depend(in:argc) update to(z) // expected-error{{unexpected OpenMP clause 'update' in directive '#pragma omp target'}} expected-error{{unexpected OpenMP clause 'to' in directive '#pragma omp target'}}
+  {}
+
+  #pragma omp target update to(z) depend // expected-error {{expected '(' after 'depend'}}
+  #pragma omp target update to(z) depend( // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-warning {{missing ':' after dependency type - ignoring}}
+  #pragma omp target update to(z) depend() // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-warning {{missing ':' after dependency type - ignoring}}
+  #pragma omp target update to(z) depend(argc // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-warning {{missing ':' after dependency type - ignoring}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  #pragma omp target update to(z) depend(source : argc) // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}}
+  #pragma omp target update to(z) depend(source) // expected-error {{expected expression}} expected-warning {{missing ':' after dependency type - ignoring}}
+  #pragma omp target update to(z) depend(in : argc)) // expected-warning {{extra tokens at the end of '#pragma omp target update' are ignored}}
+  #pragma omp target update to(z) depend(out: ) // expected-error {{expected expression}}
+  #pragma omp target update to(z) depend(inout : foobool(argc)), depend (in, argc) // expected-error {{expected variable name, array element or array section}} expected-warning {{missing ':' after dependency type - ignoring}} expected-error {{expected expression}}
+  #pragma omp target update to(z) depend(out :S1) // expected-error {{'S1' does not refer to a value}}
+  #pragma omp target update to(z) depend(in : argv[1][1] = '2') // expected-error {{expected variable name, array element or array section}}
+  #pragma omp target update to(z) depend(in : vec[1]) // expected-error {{expected variable name, array element or array section}}
+  #pragma omp target update to(z) depend(in : argv[0])
+  #pragma omp target update to(z) depend(in : ) // expected-error {{expected expression}}
+  #pragma omp target update to(z) depend(in : tmain) // expected-error {{expected variable name, array element or array section}}
+  #pragma omp target update to(z) depend(in : a[0]) // expected-error{{expected variable name, array element or array section}}
+  #pragma omp target update to(z) depend(in : vec[1:2]) // expected-error {{ value is not an array or pointer}}
+  #pragma omp target update to(z) depend(in : argv[ // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  #pragma omp target update to(z) depend(in : argv[: // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  #pragma omp target update to(z) depend(in : argv[:] // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  #pragma omp target update to(z) depend(in : argv[argc: // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  #pragma omp target update to(z) depend(in : argv[argc:argc] // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  #pragma omp target update to(z) depend(in : argv[0:-1]) // expected-error {{section length is evaluated to a negative value -1}}
+  #pragma omp target update to(z) depend(in : argv[-1:0]) // expected-error {{section lower bound is evaluated to a negative value -1}}
+  #pragma omp target update to(z) depend(in : argv[:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
+  #pragma omp target update to(z) depend(in : argv[3:4:1]) // expected-error {{expected ']'}} expected-note {{to match this '['}}
+  #pragma omp target update to(z) depend(in:a[0:1]) // expected-error {{subscripted value is not an array or pointer}}
+  #pragma omp target update to(z) depend(in:argv[argv[:2]:1]) // expected-error {{OpenMP array section is not allowed here}}
+  #pragma omp target update to(z) depend(in:argv[0:][:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
+  #pragma omp target update to(z) depend(in:env[0:][:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is an array of unknown bound}}
+  #pragma omp target update to(z) depend(in : argv[ : argc][1 : argc - 1])
+  #pragma omp target update to(z) depend(in : arr[0])
+
+  return 0;
+}
+
+int main(int argc, char **argv, char *env[]) {
+  vector vec;
+  typedef float V __attribute__((vector_size(16)));
+  V a;
+  auto arr = x; // expected-error {{use of undeclared identifier 'x'}}
+  int z;
+
+  #pragma omp depend target update to(z) // expected-error{{expected an OpenMP directive}}
+  #pragma omp depend(out:argc) target update to(z) // expected-error{{expected an OpenMP directive}}
+  #pragma omp target depend(in:argc) update to(z) // expected-error{{unexpected OpenMP clause 'update' in directive '#pragma omp target'}} expected-error{{unexpected OpenMP clause 'to' in directive '#pragma omp target'}}
+  {}
+
+  #pragma omp target update to(z) depend // expected-error {{expected '(' after 'depend'}}
+  #pragma omp target update to(z) depend( // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-error {{expected ')'}} expected-note {{to match this '('}} expected-warning {{missing ':' after dependency type - ignoring}}
+  #pragma omp target update to(z) depend() // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-warning {{missing ':' after dependency type - ignoring}}
+  #pragma omp target update to(z) depend(argc // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}} expected-warning {{missing ':' after dependency type - ignoring}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  #pragma omp target update to(z) depend(source : argc) // expected-error {{expected 'in', 'out' or 'inout' in OpenMP clause 'depend'}}
+  #pragma omp target update to(z) depend(source) // expected-error {{expected expression}} expected-warning {{missing ':' after dependency type - ignoring}}
+  #pragma omp target update to(z) depend(in : argc)) // expected-warning {{extra tokens at the end of '#pragma omp target update' are ignored}}
+  #pragma omp target update to(z) depend(out: ) // expected-error {{expected expression}}
+  #pragma omp target update to(z) depend(inout : foobool(argc)), depend (in, argc) // expected-error {{expected variable name, array element or array section}} expected-warning {{missing ':' after dependency type - ignoring}} expected-error {{expected expression}}
+  #pragma omp target update to(z) depend(out :S1) // expected-error {{'S1' does not refer to a value}}
+  #pragma omp target update to(z) depend(in : argv[1][1] = '2') // expected-error {{expected variable name, array element or array section}}
+  #pragma omp target update to(z) depend(in : vec[1]) // expected-error {{expected variable name, array element or array section}}
+  #pragma omp target update to(z) depend(in : argv[0])
+  #pragma omp target update to(z) depend(in : ) // expected-error {{expected expression}}
+  #pragma omp target update to(z) depend(in : main) // expected-error {{expected variable name, array element or array section}}
+  #pragma omp target update to(z) depend(in : a[0]) // expected-error{{expected variable name, array element or array section}}
+  #pragma omp target update to(z) depend(in : vec[1:2]) // expected-error {{ value is not an array or pointer}}
+  #pragma omp target update to(z) depend(in : argv[ // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  #pragma omp target update to(z) depend(in : argv[: // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  #pragma omp target update to(z) depend(in : argv[:] // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+  #pragma omp target update to(z) depend(in : argv[argc: // expected-error {{expected expression}} expected-error {{expected ']'}} expected-error {{expected ')'}} expected-note {{to match this '['}} expected-note {{to match this '('}}
+  #pragma omp target update to(z) depend(in : argv[argc:argc] // expected-error {{expected ')'}} expected-note {{to match this '('}}
+  #pragma omp target update to(z) depend(in : argv[0:-1]) // expected-error {{section length is evaluated to a negative value -1}}
+  #pragma omp target update to(z) depend(in : argv[-1:0]) // expected-error {{section lower bound is evaluated to a negative value -1}}
+  #pragma omp target update to(z) depend(in : argv[:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
+  #pragma omp target update to(z) depend(in : argv[3:4:1]) // expected-error {{expected ']'}} expected-note {{to match this '['}}
+  #pragma omp target update to(z) depend(in:a[0:1]) // expected-error {{subscripted value is not an array or pointer}}
+  #pragma omp target update to(z) depend(in:argv[argv[:2]:1]) // expected-error {{OpenMP array section is not allowed here}}
+  #pragma omp target update to(z) depend(in:argv[0:][:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
+  #pragma omp target update to(z) depend(in:env[0:][:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is an array of unknown bound}}
+  #pragma omp target update to(z) depend(in : argv[ : argc][1 : argc - 1])
+  #pragma omp target update to(z) depend(in : arr[0])
+
+  return tmain(argc, argv, env); // expected-note {{in instantiation of function template specialization 'tmain<int, char, char>' requested here}}
+}
diff --git a/test/OpenMP/target_update_device_messages.cpp b/test/OpenMP/target_update_device_messages.cpp
new file mode 100644
index 0000000000000..37112755805cc
--- /dev/null
+++ b/test/OpenMP/target_update_device_messages.cpp
@@ -0,0 +1,43 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note 2 {{declared here}}
+
+template <class T, class S>
+int tmain(T argc, S **argv) {
+  int i;
+#pragma omp target update to(i) device // expected-error {{expected '(' after 'device'}}
+#pragma omp target update to(i) device ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+#pragma omp target update to(i) device () // expected-error {{expected expression}}
+#pragma omp target update to(i) device (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+#pragma omp target update to(i) device (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target update' are ignored}}
+#pragma omp target update from(i) device (argc > 0 ? argv[1] : argv[2]) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
+#pragma omp target update from(i) device (argc + argc)
+#pragma omp target update from(i) device (argc), device (argc+1) // expected-error {{directive '#pragma omp target update' cannot contain more than one 'device' clause}}
+#pragma omp target update from(i) device (S1) // expected-error {{'S1' does not refer to a value}}
+#pragma omp target update from(i) device (3.14) // expected-error 2 {{expression must have integral or unscoped enumeration type, not 'double'}}
+#pragma omp target update from(i) device (-2) // expected-error {{argument to 'device' clause must be a non-negative integer value}}
+}
+
+int main(int argc, char **argv) {
+  int j;
+#pragma omp target update to(j) device // expected-error {{expected '(' after 'device'}}
+#pragma omp target update from(j) device ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+#pragma omp target update to(j) device () // expected-error {{expected expression}}
+#pragma omp target update from(j) device (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+#pragma omp target update to(j) device (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target update' are ignored}}
+#pragma omp target update from(j) device (argc > 0 ? argv[1] : argv[2]) // expected-error {{expression must have integral or unscoped enumeration type, not 'char *'}}
+#pragma omp target update to(j) device (argc + argc)
+#pragma omp target update from(j) device (argc), device (argc+1) // expected-error {{directive '#pragma omp target update' cannot contain more than one 'device' clause}}
+#pragma omp target update to(j) device (S1) // expected-error {{'S1' does not refer to a value}}
+#pragma omp target update from(j) device (-2) // expected-error {{argument to 'device' clause must be a non-negative integer value}}
+#pragma omp target update to(j) device (3.14) // expected-error {{expression must have integral or unscoped enumeration type, not 'double'}}
+
+  return tmain(argc, argv); // expected-note {{in instantiation of function template specialization 'tmain<int, char>' requested here}}
+}
diff --git a/test/OpenMP/target_update_from_messages.cpp b/test/OpenMP/target_update_from_messages.cpp
new file mode 100644
index 0000000000000..6aff083b6a487
--- /dev/null
+++ b/test/OpenMP/target_update_from_messages.cpp
@@ -0,0 +1,176 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note 2 {{declared here}}
+extern S1 a;
+class S2 {
+  mutable int a;
+public:
+  S2():a(0) { }
+  S2(S2 &s2):a(s2.a) { }
+  static float S2s; // expected-note 4 {{mappable type cannot contain static members}}
+  static const float S2sc; // expected-note 4 {{mappable type cannot contain static members}}
+};
+const float S2::S2sc = 0;
+const S2 b;
+const S2 ba[5];
+class S3 {
+  int a;
+public:
+  S3():a(0) { }
+  S3(S3 &s3):a(s3.a) { }
+};
+const S3 c;
+const S3 ca[5];
+extern const int f;
+class S4 {
+  int a;
+  S4();
+  S4(const S4 &s4);
+public:
+  S4(int v):a(v) { }
+};
+class S5 {
+  int a;
+  S5():a(0) {}
+  S5(const S5 &s5):a(s5.a) { }
+public:
+  S5(int v):a(v) { }
+};
+struct S6 {
+  int ii;
+  int aa[30];
+  float xx;
+  double *pp;
+};
+struct S7 {
+  int i;
+  int a[50];
+  float x;
+  S6 s6[5];
+  double *p;
+  unsigned bfa : 4;
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+typedef int to;
+
+template <typename T, int I> // expected-note {{declared here}}
+T tmain(T argc) {
+  const T d = 5;
+  const T da[5] = { 0 };
+  S4 e(4);
+  S5 g(5);
+  T i, t[20];
+  T &j = i;
+  T *k = &j;
+  T x;
+  T y;
+  T from;
+  const T (&l)[5] = da;
+  T *m;
+  S7 s7;
+
+#pragma omp target update from // expected-error {{expected '(' after 'from'}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update from( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected expression}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update from() // expected-error {{expected expression}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update() // expected-warning {{extra tokens at the end of '#pragma omp target update' are ignored}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update from(alloc) // expected-error {{use of undeclared identifier 'alloc'}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update from(x)
+#pragma omp target update from(t[:I])
+#pragma omp target update from(T) // expected-error {{'T' does not refer to a value}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update from(I) // expected-error 2 {{expected expression containing only member accesses and/or array sections based on named variables}}
+#pragma omp target update from(S2::S2s)
+#pragma omp target update from(S2::S2sc)
+#pragma omp target update from(from)
+#pragma omp target update from(y x) // expected-error {{expected ',' or ')' in 'from' clause}}
+#pragma omp target update from(argc > 0 ? x : y) // expected-error 2 {{expected expression containing only member accesses and/or array sections based on named variables}} 
+#pragma omp target update from(S1) // expected-error {{'S1' does not refer to a value}}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update from(a, b, c, d, f) // expected-error {{incomplete type 'S1' where a complete type is required}} expected-error 2 {{type 'S2' is not mappable to target}}
+#pragma omp target update from(ba) // expected-error 2 {{type 'S2' is not mappable to target}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update from(h) // expected-error {{threadprivate variables are not allowed in 'from' clause}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update from(k), to(k) // expected-error 2 {{variable can appear only once in OpenMP 'target update' construct}} expected-note 2 {{used here}}
+#pragma omp target update from(t), from(t[:5]) // expected-error 2 {{variable can appear only once in OpenMP 'target update' construct}} expected-note 2 {{used here}}
+#pragma omp target update from(da)
+#pragma omp target update from(da[:4])
+
+#pragma omp target update from(x, a[:2]) // expected-error {{subscripted value is not an array or pointer}}
+#pragma omp target update from(x, c[:]) // expected-error {{subscripted value is not an array or pointer}}
+#pragma omp target update from(x, (m+1)[2]) // expected-error 2 {{expected expression containing only member accesses and/or array sections based on named variables}}
+#pragma omp target update from(s7.i, s7.a[:3])
+#pragma omp target update from(s7.s6[1].aa[0:5])
+#pragma omp target update from(x, s7.s6[:5].aa[6]) // expected-error {{OpenMP array section is not allowed here}}
+#pragma omp target update from(x, s7.s6[:5].aa[:6]) // expected-error {{OpenMP array section is not allowed here}}
+#pragma omp target update from(s7.p[:10])
+#pragma omp target update from(x, s7.bfa) // expected-error {{bit fields cannot be used to specify storage in a 'from' clause}}
+#pragma omp target update from(x, s7.p[:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
+#pragma omp target data map(to: s7.i)
+  {
+#pragma omp target update from(s7.x)
+  }
+
+  return 0;
+}
+
+int main(int argc, char **argv) {
+  const int d = 5;
+  const int da[5] = { 0 };
+  S4 e(4);
+  S5 g(5);
+  int i, t[20];
+  int &j = i;
+  int *k = &j;
+  int x;
+  int y;
+  int from;
+  const int (&l)[5] = da;
+  int *m;
+  S7 s7;
+
+#pragma omp target update from // expected-error {{expected '(' after 'from'}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update from( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected expression}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update from() // expected-error {{expected expression}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update() // expected-warning {{extra tokens at the end of '#pragma omp target update' are ignored}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update from(alloc) // expected-error {{use of undeclared identifier 'alloc'}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update from(x)
+#pragma omp target update from(t[:i])
+#pragma omp target update from(S2::S2s)
+#pragma omp target update from(S2::S2sc)
+#pragma omp target update from(from)
+#pragma omp target update from(y x) // expected-error {{expected ',' or ')' in 'from' clause}}
+#pragma omp target update from(argc > 0 ? x : y) // expected-error {{expected expression containing only member accesses and/or array sections based on named variables}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update from(S1) // expected-error {{'S1' does not refer to a value}}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update from(a, b, c, d, f) // expected-error {{incomplete type 'S1' where a complete type is required}} expected-error 2 {{type 'S2' is not mappable to target}}
+#pragma omp target update from(ba) // expected-error 2 {{type 'S2' is not mappable to target}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update from(h) // expected-error {{threadprivate variables are not allowed in 'from' clause}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update from(k), to(k) // expected-error {{variable can appear only once in OpenMP 'target update' construct}} expected-note {{used here}}
+#pragma omp target update from(t), from(t[:5]) // expected-error {{variable can appear only once in OpenMP 'target update' construct}} expected-note {{used here}}
+#pragma omp target update from(da)
+#pragma omp target update from(da[:4])
+
+#pragma omp target update from(x, a[:2]) // expected-error {{subscripted value is not an array or pointer}}
+#pragma omp target update from(x, c[:]) // expected-error {{subscripted value is not an array or pointer}}
+#pragma omp target update from(x, (m+1)[2]) // expected-error {{expected expression containing only member accesses and/or array sections based on named variables}}
+#pragma omp target update from(s7.i, s7.a[:3])
+#pragma omp target update from(s7.s6[1].aa[0:5])
+#pragma omp target update from(x, s7.s6[:5].aa[6]) // expected-error {{OpenMP array section is not allowed here}}
+#pragma omp target update from(x, s7.s6[:5].aa[:6]) // expected-error {{OpenMP array section is not allowed here}}
+#pragma omp target update from(s7.p[:10])
+#pragma omp target update from(x, s7.bfa) // expected-error {{bit fields cannot be used to specify storage in a 'from' clause}}
+#pragma omp target update from(x, s7.p[:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
+#pragma omp target data map(to: s7.i)
+  {
+#pragma omp target update from(s7.x)
+  }
+
+  return tmain<int, 3>(argc)+tmain<to, 4>(argc); // expected-note {{in instantiation of function template specialization 'tmain<int, 3>' requested here}} expected-note {{in instantiation of function template specialization 'tmain<int, 4>' requested here}}
+}
+
diff --git a/test/OpenMP/target_update_if_messages.cpp b/test/OpenMP/target_update_if_messages.cpp
new file mode 100644
index 0000000000000..97715e072422a
--- /dev/null
+++ b/test/OpenMP/target_update_if_messages.cpp
@@ -0,0 +1,58 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note {{declared here}}
+
+template <class T, class S> // expected-note {{declared here}}
+int tmain(T argc, S **argv) {
+  int n;
+#pragma omp target update to(n) if // expected-error {{expected '(' after 'if'}}
+#pragma omp target update from(n) if ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+#pragma omp target update to(n) if () // expected-error {{expected expression}}
+#pragma omp target update from(n) if (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+#pragma omp target update to(n) if (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target update' are ignored}}
+#pragma omp target update from(n) if (argc > 0 ? argv[1] : argv[2])
+#pragma omp target update to(n) if (foobool(argc)), if (true) // expected-error {{directive '#pragma omp target update' cannot contain more than one 'if' clause}}
+#pragma omp target update from(n) if (S) // expected-error {{'S' does not refer to a value}}
+#pragma omp target update to(n) if (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+#pragma omp target update from(n) if (argc argc) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+#pragma omp target update to(n) if(argc)
+#pragma omp target update from(n) if(target update // expected-warning {{missing ':' after directive name modifier - ignoring}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+#pragma omp target update to(n) if(target update : // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+#pragma omp target update from(n) if(target update : argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+#pragma omp target update to(n) if(target update : argc)
+#pragma omp target update from(n) if(target update : argc) if (for:argc) // expected-error {{directive name modifier 'for' is not allowed for '#pragma omp target update'}}
+#pragma omp target update to(n) if(target update : argc) if (target update:argc) // expected-error {{directive '#pragma omp target update' cannot contain more than one 'if' clause with 'target update' name modifier}}
+#pragma omp target update from(n) if(target update : argc) if (argc) // expected-error {{no more 'if' clause is allowed}} expected-note {{previous clause with directive name modifier specified here}}
+  return 0;
+}
+
+int main(int argc, char **argv) {
+  int m;
+#pragma omp target update to(m) if // expected-error {{expected '(' after 'if'}}
+#pragma omp target update from(m) if ( // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+#pragma omp target update to(m) if () // expected-error {{expected expression}}
+#pragma omp target update from(m) if (argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+#pragma omp target update to(m) if (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target update' are ignored}}
+#pragma omp target update from(m) if (argc > 0 ? argv[1] : argv[2])
+#pragma omp target update to(m) if (foobool(argc)), if (true) // expected-error {{directive '#pragma omp target update' cannot contain more than one 'if' clause}}
+#pragma omp target update from(m) if (S1) // expected-error {{'S1' does not refer to a value}}
+#pragma omp target update to(m) if (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+#pragma omp target update from(m) if (argc argc) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+#pragma omp target update to(m) if (1 0) // expected-error {{expected ')'}} expected-note {{to match this '('}}
+#pragma omp target update from(m) if(if(tmain(argc, argv) // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+#pragma omp target update to(m) if(target update // expected-warning {{missing ':' after directive name modifier - ignoring}} expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+#pragma omp target update from(m) if(target update : // expected-error {{expected expression}} expected-error {{expected ')'}} expected-note {{to match this '('}}
+#pragma omp target update to(m) if(target update : argc // expected-error {{expected ')'}} expected-note {{to match this '('}}
+#pragma omp target update from(m) if(target update : argc)
+#pragma omp target update to(m) if(target update : argc) if (for:argc) // expected-error {{directive name modifier 'for' is not allowed for '#pragma omp target update'}}
+#pragma omp target update from(m) if(target update : argc) if (target update:argc)  // expected-error {{directive '#pragma omp target update' cannot contain more than one 'if' clause with 'target update' name modifier}}
+#pragma omp target update to(m) if(target update : argc) if (argc) // expected-error {{no more 'if' clause is allowed}} expected-note {{previous clause with directive name modifier specified here}}
+  return tmain(argc, argv);
+}
diff --git a/test/OpenMP/target_update_messages.cpp b/test/OpenMP/target_update_messages.cpp
new file mode 100644
index 0000000000000..73f1eeca8efec
--- /dev/null
+++ b/test/OpenMP/target_update_messages.cpp
@@ -0,0 +1,32 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // Aexpected-note {{declared here}}
+
+template <class T, class S> // Aexpected-note {{declared here}}
+int tmain(T argc, S **argv) {
+  int n;
+  return 0;
+}
+
+int main(int argc, char **argv) {
+  int m;
+  #pragma omp target update // expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+  #pragma omp target update to(m) { // expected-warning {{extra tokens at the end of '#pragma omp target update' are ignored}}
+  #pragma omp target update to(m) ( // expected-warning {{extra tokens at the end of '#pragma omp target update' are ignored}}
+  #pragma omp target update to(m) [ // expected-warning {{extra tokens at the end of '#pragma omp target update' are ignored}}
+  #pragma omp target update to(m) ] // expected-warning {{extra tokens at the end of '#pragma omp target update' are ignored}}
+  #pragma omp target update to(m) ) // expected-warning {{extra tokens at the end of '#pragma omp target update' are ignored}}
+
+  #pragma omp target update from(m) // OK
+  {
+    foo();
+  }
+  return tmain(argc, argv);
+}
diff --git a/test/OpenMP/target_update_nowait_messages.cpp b/test/OpenMP/target_update_nowait_messages.cpp
new file mode 100644
index 0000000000000..19bc58ea467e6
--- /dev/null
+++ b/test/OpenMP/target_update_nowait_messages.cpp
@@ -0,0 +1,17 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
+
+int main(int argc, char **argv) {
+  int i;
+
+  #pragma omp nowait target update to(i) // expected-error {{expected an OpenMP directive}}
+  #pragma omp target nowait update to(i) // expected-error {{unexpected OpenMP clause 'update' in directive '#pragma omp target'}} expected-error {{unexpected OpenMP clause 'to' in directive '#pragma omp target'}}
+  {}
+  #pragma omp target update nowait() to(i) // expected-warning {{extra tokens at the end of '#pragma omp target update' are ignored}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+  #pragma omp target update to(i) nowait( // expected-warning {{extra tokens at the end of '#pragma omp target update' are ignored}}
+  #pragma omp target update to(i) nowait (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target update' are ignored}}
+  #pragma omp target update to(i) nowait device (-10u)
+  #pragma omp target update to(i) nowait (3.14) device (-10u) // expected-warning {{extra tokens at the end of '#pragma omp target update' are ignored}}
+  #pragma omp target update to(i) nowait nowait // expected-error {{directive '#pragma omp target update' cannot contain more than one 'nowait' clause}}
+  #pragma omp target update nowait to(i) nowait // expected-error {{directive '#pragma omp target update' cannot contain more than one 'nowait' clause}}
+  return 0;
+}
diff --git a/test/OpenMP/target_update_to_messages.cpp b/test/OpenMP/target_update_to_messages.cpp
new file mode 100644
index 0000000000000..641d0bd949a2c
--- /dev/null
+++ b/test/OpenMP/target_update_to_messages.cpp
@@ -0,0 +1,175 @@
+// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 100 %s
+
+void foo() {
+}
+
+bool foobool(int argc) {
+  return argc;
+}
+
+struct S1; // expected-note 2 {{declared here}}
+extern S1 a;
+class S2 {
+  mutable int a;
+public:
+  S2():a(0) { }
+  S2(S2 &s2):a(s2.a) { }
+  static float S2s; // expected-note 4 {{mappable type cannot contain static members}}
+  static const float S2sc; // expected-note 4 {{mappable type cannot contain static members}}
+};
+const float S2::S2sc = 0;
+const S2 b;
+const S2 ba[5];
+class S3 {
+  int a;
+public:
+  S3():a(0) { }
+  S3(S3 &s3):a(s3.a) { }
+};
+const S3 c;
+const S3 ca[5];
+extern const int f;
+class S4 {
+  int a;
+  S4();
+  S4(const S4 &s4);
+public:
+  S4(int v):a(v) { }
+};
+class S5 {
+  int a;
+  S5():a(0) {}
+  S5(const S5 &s5):a(s5.a) { }
+public:
+  S5(int v):a(v) { }
+};
+struct S6 {
+  int ii;
+  int aa[30];
+  float xx;
+  double *pp;
+};
+struct S7 {
+  int i;
+  int a[50];
+  float x;
+  S6 s6[5];
+  double *p;
+  unsigned bfa : 4;
+};
+
+S3 h;
+#pragma omp threadprivate(h) // expected-note 2 {{defined as threadprivate or thread local}}
+
+typedef int from;
+
+template <typename T, int I> // expected-note {{declared here}}
+T tmain(T argc) {
+  const T d = 5;
+  const T da[5] = { 0 };
+  S4 e(4);
+  S5 g(5);
+  T *m;
+  T i, t[20];
+  T &j = i;
+  T *k = &j;
+  T x;
+  T y;
+  T to;
+  const T (&l)[5] = da;
+  S7 s7;
+
+#pragma omp target update to // expected-error {{expected '(' after 'to'}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update to( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected expression}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update to() // expected-error {{expected expression}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update() // expected-warning {{extra tokens at the end of '#pragma omp target update' are ignored}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update to(alloc) // expected-error {{use of undeclared identifier 'alloc'}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update to(x)
+#pragma omp target update to(t[:I])
+#pragma omp target update to(T) // expected-error {{'T' does not refer to a value}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update to(I) // expected-error 2 {{expected expression containing only member accesses and/or array sections based on named variables}}
+#pragma omp target update to(S2::S2s)
+#pragma omp target update to(S2::S2sc)
+#pragma omp target update to(to)
+#pragma omp target update to(y x) // expected-error {{expected ',' or ')' in 'to' clause}}
+#pragma omp target update to(argc > 0 ? x : y) // expected-error 2 {{expected expression containing only member accesses and/or array sections based on named variables}} 
+#pragma omp target update to(S1) // expected-error {{'S1' does not refer to a value}}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update to(a, b, c, d, f) // expected-error {{incomplete type 'S1' where a complete type is required}} expected-error 2 {{type 'S2' is not mappable to target}}
+#pragma omp target update to(ba) // expected-error 2 {{type 'S2' is not mappable to target}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update to(h) // expected-error {{threadprivate variables are not allowed in 'to' clause}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update to(k), from(k) // expected-error 2 {{variable can appear only once in OpenMP 'target update' construct}} expected-note 2 {{used here}}
+#pragma omp target update to(t), to(t[:5]) // expected-error 2 {{variable can appear only once in OpenMP 'target update' construct}} expected-note 2 {{used here}}
+#pragma omp target update to(da)
+#pragma omp target update to(da[:4])
+
+#pragma omp target update to(x, a[:2]) // expected-error {{subscripted value is not an array or pointer}}
+#pragma omp target update to(x, c[:]) // expected-error {{subscripted value is not an array or pointer}}
+#pragma omp target update to(x, (m+1)[2]) // expected-error 2 {{expected expression containing only member accesses and/or array sections based on named variables}}
+#pragma omp target update to(s7.i, s7.a[:3])
+#pragma omp target update to(s7.s6[1].aa[0:5])
+#pragma omp target update to(x, s7.s6[:5].aa[6]) // expected-error {{OpenMP array section is not allowed here}}
+#pragma omp target update to(x, s7.s6[:5].aa[:6]) // expected-error {{OpenMP array section is not allowed here}}
+#pragma omp target update to(s7.p[:10])
+#pragma omp target update to(x, s7.bfa) // expected-error {{bit fields cannot be used to specify storage in a 'to' clause}}
+#pragma omp target update to(x, s7.p[:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
+#pragma omp target data map(to: s7.i)
+  {
+#pragma omp target update to(s7.x)
+  }
+  return 0;
+}
+
+int main(int argc, char **argv) {
+  const int d = 5;
+  const int da[5] = { 0 };
+  S4 e(4);
+  S5 g(5);
+  int i, t[20];
+  int &j = i;
+  int *k = &j;
+  int x;
+  int y;
+  int to;
+  const int (&l)[5] = da;
+  S7 s7;
+  int *m;
+
+#pragma omp target update to // expected-error {{expected '(' after 'to'}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update to( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected expression}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update to() // expected-error {{expected expression}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update() // expected-warning {{extra tokens at the end of '#pragma omp target update' are ignored}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update to(alloc) // expected-error {{use of undeclared identifier 'alloc'}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update to(x)
+#pragma omp target update to(t[:i])
+#pragma omp target update to(S2::S2s)
+#pragma omp target update to(S2::S2sc)
+#pragma omp target update to(to)
+#pragma omp target update to(y x) // expected-error {{expected ',' or ')' in 'to' clause}}
+#pragma omp target update to(argc > 0 ? x : y) // expected-error {{expected expression containing only member accesses and/or array sections based on named variables}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update to(S1) // expected-error {{'S1' does not refer to a value}}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update to(a, b, c, d, f) // expected-error {{incomplete type 'S1' where a complete type is required}} expected-error 2 {{type 'S2' is not mappable to target}}
+#pragma omp target update to(ba) // expected-error 2 {{type 'S2' is not mappable to target}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update to(h) // expected-error {{threadprivate variables are not allowed in 'to' clause}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}}
+#pragma omp target update to(k), from(k) // expected-error {{variable can appear only once in OpenMP 'target update' construct}} expected-note {{used here}}
+#pragma omp target update to(t), to(t[:5]) // expected-error {{variable can appear only once in OpenMP 'target update' construct}} expected-note {{used here}}
+#pragma omp target update to(da)
+#pragma omp target update to(da[:4])
+
+#pragma omp target update to(x, a[:2]) // expected-error {{subscripted value is not an array or pointer}}
+#pragma omp target update to(x, c[:]) // expected-error {{subscripted value is not an array or pointer}}
+#pragma omp target update to(x, (m+1)[2]) // expected-error {{expected expression containing only member accesses and/or array sections based on named variables}}
+#pragma omp target update to(s7.i, s7.a[:3])
+#pragma omp target update to(s7.s6[1].aa[0:5])
+#pragma omp target update to(x, s7.s6[:5].aa[6]) // expected-error {{OpenMP array section is not allowed here}}
+#pragma omp target update to(x, s7.s6[:5].aa[:6]) // expected-error {{OpenMP array section is not allowed here}}
+#pragma omp target update to(s7.p[:10])
+#pragma omp target update to(x, s7.bfa) // expected-error {{bit fields cannot be used to specify storage in a 'to' clause}}
+#pragma omp target update to(x, s7.p[:]) // expected-error {{section length is unspecified and cannot be inferred because subscripted value is not an array}}
+#pragma omp target data map(to: s7.i)
+  {
+#pragma omp target update to(s7.x)
+  }
+
+  return tmain<int, 3>(argc)+tmain<from, 4>(argc); // expected-note {{in instantiation of function template specialization 'tmain<int, 3>' requested here}} expected-note {{in instantiation of function template specialization 'tmain<int, 4>' requested here}}
+}
+
diff --git a/test/OpenMP/task_ast_print.cpp b/test/OpenMP/task_ast_print.cpp
index 723139b081832..37e5833dec5e5 100644
--- a/test/OpenMP/task_ast_print.cpp
+++ b/test/OpenMP/task_ast_print.cpp
@@ -8,6 +8,57 @@
 
 void foo() {}
 
+struct S1 {
+  S1(): a(0) {}
+  S1(int v) : a(v) {}
+  int a;
+  typedef int type;
+};
+
+template <typename T>
+class S7 : public T {
+protected:
+  T a;
+  S7() : a(0) {}
+
+public:
+  S7(typename T::type v) : a(v) {
+#pragma omp task private(a) private(this->a) private(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S7 &operator=(S7 &s) {
+#pragma omp task private(a) private(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+// CHECK: #pragma omp task private(this->a) private(this->a) private(this->S1::a)
+// CHECK: #pragma omp task private(this->a) private(this->a) private(T::a)
+// CHECK: #pragma omp task private(this->a) private(this->a)
+
+class S8 : public S7<S1> {
+  S8() {}
+
+public:
+  S8(int v) : S7<S1>(v){
+#pragma omp task private(a) private(this->a) private(S7<S1>::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S8 &operator=(S8 &s) {
+#pragma omp task private(a) private(this->a)
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
+};
+
+// CHECK: #pragma omp task private(this->a) private(this->a) private(this->S7<S1>::a)
+// CHECK: #pragma omp task private(this->a) private(this->a)
+
 template <class T>
 struct S {
   operator T() { return T(); }
@@ -98,4 +149,7 @@ int main(int argc, char **argv) {
   return tmain<int, 5>(b, &b) + tmain<long, 1>(x, &x);
 }
 
+extern template int S<int>::TS;
+extern template long S<long>::TS;
+
 #endif
diff --git a/test/OpenMP/task_codegen.cpp b/test/OpenMP/task_codegen.cpp
index 23dc014aad3b1..08c9ce33d8e8e 100644
--- a/test/OpenMP/task_codegen.cpp
+++ b/test/OpenMP/task_codegen.cpp
@@ -9,7 +9,7 @@
 // CHECK-DAG: [[IDENT_T:%.+]] = type { i32, i32, i32, i32, i8* }
 // CHECK-DAG: [[STRUCT_SHAREDS:%.+]] = type { i8*, [2 x [[STRUCT_S:%.+]]]* }
 // CHECK-DAG: [[STRUCT_SHAREDS1:%.+]] = type { [2 x [[STRUCT_S:%.+]]]* }
-// CHECK-DAG: [[KMP_TASK_T:%.+]] = type { i8*, i32 (i32, i8*)*, i32, i32 (i32, i8*)* }
+// CHECK-DAG: [[KMP_TASK_T:%.+]] = type { i8*, i32 (i32, i8*)*, i32, %union{{.+}}, %union{{.+}} }
 // CHECK-DAG: [[KMP_DEPEND_INFO:%.+]] = type { i64, i64, i8 }
 struct S {
   int a;
@@ -30,15 +30,16 @@ int main() {
 // CHECK: store i8* [[B]], i8** [[B_REF]]
 // CHECK: [[S_REF:%.+]] = getelementptr inbounds [[STRUCT_SHAREDS]], [[STRUCT_SHAREDS]]* [[CAPTURES]], i32 0, i32 1
 // CHECK: store [2 x [[STRUCT_S]]]* [[S]], [2 x [[STRUCT_S]]]** [[S_REF]]
-// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i32 1, i64 32, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_T]]{{.*}}*)* [[TASK_ENTRY1:@.+]] to i32 (i32, i8*)*))
+// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i32 33, i64 40, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_T]]{{.*}}*)* [[TASK_ENTRY1:@.+]] to i32 (i32, i8*)*))
 // CHECK: [[SHAREDS_REF_PTR:%.+]] = getelementptr inbounds [[KMP_TASK_T]], [[KMP_TASK_T]]* [[TASK_PTR:%.+]], i32 0, i32 0
 // CHECK: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS_REF_PTR]]
 // CHECK: [[BITCAST:%.+]] = bitcast [[STRUCT_SHAREDS]]* [[CAPTURES]] to i8*
 // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[SHAREDS_REF]], i8* [[BITCAST]], i64 16, i32 8, i1 false)
-// CHECK: [[DESTRUCTORS_REF_PTR:%.+]] = getelementptr inbounds [[KMP_TASK_T]], [[KMP_TASK_T]]* [[TASK_PTR]], i32 0, i32 3
-// CHECK: store i32 (i32, i8*)* null, i32 (i32, i8*)** [[DESTRUCTORS_REF_PTR]]
+// CHECK: [[PRIORITY_REF_PTR:%.+]] = getelementptr inbounds [[KMP_TASK_T]], [[KMP_TASK_T]]* [[TASK_PTR]], i32 0, i32 4
+// CHECK: [[PRIORITY:%.+]] = bitcast %union{{.+}}* [[PRIORITY_REF_PTR]] to i32*
+// CHECK: store i32 {{.+}}, i32* [[PRIORITY]]
 // CHECK: call i32 @__kmpc_omp_task([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i8* [[ORIG_TASK_PTR]])
-#pragma omp task shared(a, b, s)
+#pragma omp task shared(a, b, s) priority(b)
   {
     a = 15;
     b = a;
@@ -46,13 +47,11 @@ int main() {
   }
 // CHECK: [[S_REF:%.+]] = getelementptr inbounds [[STRUCT_SHAREDS1]], [[STRUCT_SHAREDS1]]* [[CAPTURES:%.+]], i32 0, i32 0
 // CHECK: store [2 x [[STRUCT_S]]]* [[S]], [2 x [[STRUCT_S]]]** [[S_REF]]
-// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{[^,]+}}, i32 [[GTID]], i32 1, i64 32, i64 8,
+// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{[^,]+}}, i32 [[GTID]], i32 1, i64 40, i64 8,
 // CHECK: [[SHAREDS_REF_PTR:%.+]] = getelementptr inbounds [[KMP_TASK_T]], [[KMP_TASK_T]]* [[TASK_PTR:%.+]], i32 0, i32 0
 // CHECK: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS_REF_PTR]]
 // CHECK: [[BITCAST:%.+]] = bitcast [[STRUCT_SHAREDS1]]* [[CAPTURES]] to i8*
 // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[SHAREDS_REF]], i8* [[BITCAST]], i64 8, i32 8, i1 false)
-// CHECK: [[DESTRUCTORS_REF_PTR:%.+]] = getelementptr inbounds [[KMP_TASK_T]], [[KMP_TASK_T]]* [[TASK_PTR]], i32 0, i32 3
-// CHECK: store i32 (i32, i8*)* null, i32 (i32, i8*)** [[DESTRUCTORS_REF_PTR]]
 // CHECK: [[DEP:%.*]] = getelementptr inbounds [4 x [[KMP_DEPEND_INFO]]], [4 x [[KMP_DEPEND_INFO]]]* [[DEPENDENCIES:%.*]], i64 0, i64 0
 // CHECK: [[T0:%.*]] = getelementptr inbounds [[KMP_DEPEND_INFO]], [[KMP_DEPEND_INFO]]* [[DEP]], i32 0, i32 0
 // CHECK: store i64 ptrtoint (i32* @{{.+}} to i64), i64* [[T0]]
@@ -100,17 +99,13 @@ int main() {
     a = 15;
     s[1].a = 10;
   }
-// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i32 0, i64 32, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_T]]{{.*}}*)* [[TASK_ENTRY2:@.+]] to i32 (i32, i8*)*))
-// CHECK: [[DESTRUCTORS_REF_PTR:%.+]] = getelementptr inbounds [[KMP_TASK_T]]{{.*}}* {{%.+}}, i32 0, i32 3
-// CHECK: store i32 (i32, i8*)* null, i32 (i32, i8*)** [[DESTRUCTORS_REF_PTR]]
+// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i32 0, i64 40, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_T]]{{.*}}*)* [[TASK_ENTRY2:@.+]] to i32 (i32, i8*)*))
 // CHECK: call i32 @__kmpc_omp_task([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i8* [[ORIG_TASK_PTR]])
 #pragma omp task untied
   {
     a = 1;
   }
-// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i32 0, i64 32, i64 1,
-// CHECK: [[DESTRUCTORS_REF_PTR:%.+]] = getelementptr inbounds [[KMP_TASK_T]]{{.*}}* {{%.+}}, i32 0, i32 3
-// CHECK: store i32 (i32, i8*)* null, i32 (i32, i8*)** [[DESTRUCTORS_REF_PTR]]
+// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i32 0, i64 40, i64 1,
 // CHECK: getelementptr inbounds [2 x [[STRUCT_S]]], [2 x [[STRUCT_S]]]* [[S]], i64 0, i64 0
 // CHECK: getelementptr inbounds [2 x [[KMP_DEPEND_INFO]]], [2 x [[KMP_DEPEND_INFO]]]* %{{[^,]+}}, i64 0, i64 0
 // CHECK: getelementptr inbounds [[KMP_DEPEND_INFO]], [[KMP_DEPEND_INFO]]* %{{[^,]+}}, i32 0, i32 0
@@ -120,15 +115,15 @@ int main() {
 // CHECK: store i64 4, i64*
 // CHECK: getelementptr inbounds [[KMP_DEPEND_INFO]], [[KMP_DEPEND_INFO]]* %{{[^,]+}}, i32 0, i32 2
 // CHECK: store i8 3, i8*
+// CHECK: [[B_VAL:%.+]] = load i8, i8* [[B]]
+// CHECK: [[IDX2:%.+]] = sext i8 [[B_VAL]] to i64
 // CHECK: [[IDX1:%.+]] = mul nsw i64 4, [[A_VAL]]
 // CHECK: [[START:%.+]] = getelementptr inbounds i32, i32* %{{.+}}, i64 [[IDX1]]
+// CHECK: [[START1:%.+]] = getelementptr inbounds i32, i32* [[START]], i64 [[IDX2]]
 // CHECK: [[B_VAL:%.+]] = load i8, i8* [[B]]
 // CHECK: [[IDX2:%.+]] = sext i8 [[B_VAL]] to i64
-// CHECK: [[START1:%.+]] = getelementptr inbounds i32, i32* [[START]], i64 [[IDX2]]
 // CHECK: [[IDX1:%.+]] = mul nsw i64 9, [[A_VAL]]
 // CHECK: [[END:%.+]] = getelementptr inbounds i32, i32* %{{.+}}, i64 [[IDX1]]
-// CHECK: [[B_VAL:%.+]] = load i8, i8* [[B]]
-// CHECK: [[IDX2:%.+]] = sext i8 [[B_VAL]] to i64
 // CHECK: [[END1:%.+]] = getelementptr inbounds i32, i32* [[END]], i64 [[IDX2]]
 // CHECK: [[END2:%.+]] = getelementptr i32, i32* [[END1]], i32 1
 // CHECK: [[START_INT:%.+]] = ptrtoint i32* [[START1]] to i64
@@ -149,9 +144,7 @@ int main() {
   {
     a = 1;
   }
-// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i32 3, i64 32, i64 1,
-// CHECK: [[DESTRUCTORS_REF_PTR:%.+]] = getelementptr inbounds [[KMP_TASK_T]]{{.*}}* {{%.+}}, i32 0, i32 3
-// CHECK: store i32 (i32, i8*)* null, i32 (i32, i8*)** [[DESTRUCTORS_REF_PTR]]
+// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i32 3, i64 40, i64 1,
 // CHECK: getelementptr inbounds [3 x [[KMP_DEPEND_INFO]]], [3 x [[KMP_DEPEND_INFO]]]* %{{[^,]+}}, i64 0, i64 0
 // CHECK: getelementptr inbounds [[KMP_DEPEND_INFO]], [[KMP_DEPEND_INFO]]* %{{[^,]+}}, i32 0, i32 0
 // CHECK: store i64 ptrtoint (i32* @{{.+}} to i64), i64*
@@ -173,12 +166,12 @@ int main() {
 // CHECK: [[START1:%.+]] = getelementptr inbounds i32, i32* [[START]], i64 3
 // CHECK: [[NEW_A_VAL:%.+]] = load i32, i32* @{{.+}},
 // CHECK: [[NEW_A_VAL_I64:%.+]] = sext i32 [[NEW_A_VAL]] to i64
+// CHECK: [[IDX2:%.+]] = sub nsw i64 [[NEW_A_VAL_I64]], 1
+// CHECK: [[NEW_A_VAL:%.+]] = load i32, i32* @{{.+}},
+// CHECK: [[NEW_A_VAL_I64:%.+]] = sext i32 [[NEW_A_VAL]] to i64
 // CHECK: [[SUB:%.+]] = add nsw i64 -1, [[NEW_A_VAL_I64]]
 // CHECK: [[IDX1:%.+]] = mul nsw i64 [[SUB]], [[A_VAL]]
 // CHECK: [[END:%.+]] = getelementptr inbounds i32, i32* %{{.+}}, i64 [[IDX1]]
-// CHECK: [[NEW_A_VAL:%.+]] = load i32, i32* @{{.+}},
-// CHECK: [[NEW_A_VAL_I64:%.+]] = sext i32 [[NEW_A_VAL]] to i64
-// CHECK: [[IDX2:%.+]] = sub nsw i64 [[NEW_A_VAL_I64]], 1
 // CHECK: [[END1:%.+]] = getelementptr inbounds i32, i32* [[END]], i64 [[IDX2]]
 // CHECK: [[END2:%.+]] = getelementptr i32, i32* [[END1]], i32 1
 // CHECK: [[START_INT:%.+]] = ptrtoint i32* [[START1]] to i64
@@ -199,17 +192,13 @@ int main() {
   {
     a = 2;
   }
-// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i32 3, i64 32, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_T]]{{.*}}*)* [[TASK_ENTRY3:@.+]] to i32 (i32, i8*)*))
-// CHECK: [[DESTRUCTORS_REF_PTR:%.+]] = getelementptr inbounds [[KMP_TASK_T]]{{.*}}* {{%.+}}, i32 0, i32 3
-// CHECK: store i32 (i32, i8*)* null, i32 (i32, i8*)** [[DESTRUCTORS_REF_PTR]]
+// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i32 3, i64 40, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_T]]{{.*}}*)* [[TASK_ENTRY3:@.+]] to i32 (i32, i8*)*))
 // CHECK: call i32 @__kmpc_omp_task([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i8* [[ORIG_TASK_PTR]])
 #pragma omp task final(true)
   {
     a = 2;
   }
-// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i32 1, i64 32, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_T]]{{.*}}*)* [[TASK_ENTRY4:@.+]] to i32 (i32, i8*)*))
-// CHECK: [[DESTRUCTORS_REF_PTR:%.+]] = getelementptr inbounds [[KMP_TASK_T]]{{.*}}* {{%.*}}, i32 0, i32 3
-// CHECK: store i32 (i32, i8*)* null, i32 (i32, i8*)** [[DESTRUCTORS_REF_PTR]]
+// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i32 1, i64 40, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_T]]{{.*}}*)* [[TASK_ENTRY4:@.+]] to i32 (i32, i8*)*))
 // CHECK: call i32 @__kmpc_omp_task([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i8* [[ORIG_TASK_PTR]])
   const bool flag = false;
 #pragma omp task final(flag)
@@ -220,9 +209,7 @@ int main() {
 // CHECK: [[CMP:%.+]] = icmp ne i8 [[B_VAL]], 0
 // CHECK: [[FINAL:%.+]] = select i1 [[CMP]], i32 2, i32 0
 // CHECK: [[FLAGS:%.+]] = or i32 [[FINAL]], 1
-// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i32 [[FLAGS]], i64 32, i64 8, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_T]]{{.*}}*)* [[TASK_ENTRY5:@.+]] to i32 (i32, i8*)*))
-// CHECK: [[DESTRUCTORS_REF_PTR:%.+]] = getelementptr inbounds [[KMP_TASK_T]]{{.*}}* {{%.+}}, i32 0, i32 3
-// CHECK: store i32 (i32, i8*)* null, i32 (i32, i8*)** [[DESTRUCTORS_REF_PTR]]
+// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i32 [[FLAGS]], i64 40, i64 8, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_T]]{{.*}}*)* [[TASK_ENTRY5:@.+]] to i32 (i32, i8*)*))
 // CHECK: call i32 @__kmpc_omp_task([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i8* [[ORIG_TASK_PTR]])
   int c __attribute__((aligned(128)));
 #pragma omp task final(b) shared(c)
@@ -230,6 +217,17 @@ int main() {
     a = 4;
     c = 5;
   }
+// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i32 0, i64 40, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_T]]{{.*}}*)* [[TASK_ENTRY6:@.+]] to i32 (i32, i8*)*))
+// CHECK: call i32 @__kmpc_omp_task([[IDENT_T]]* @{{.+}}, i32 [[GTID]], i8* [[ORIG_TASK_PTR]])
+#pragma omp task untied
+  {
+    S s1;
+#pragma omp task
+    a = 4;
+#pragma omp taskyield
+    s1 = S();
+#pragma omp taskwait
+  }
   return a;
 }
 // CHECK: define internal i32 [[TASK_ENTRY1]](i32, [[KMP_TASK_T]]{{.*}}* noalias)
@@ -240,16 +238,41 @@ int main() {
 // CHECK: store i32 10, i32* %{{.+}}
 
 // CHECK: define internal i32 [[TASK_ENTRY2]](i32, [[KMP_TASK_T]]{{.*}}* noalias)
-// CHECK: store i32 1, i32* [[A_PTR:@.+]]
+// CHECK: store i32 1, i32* [[A_PTR]]
 
 // CHECK: define internal i32 [[TASK_ENTRY3]](i32, [[KMP_TASK_T]]{{.*}}* noalias)
-// CHECK: store i32 2, i32* [[A_PTR:@.+]]
+// CHECK: store i32 2, i32* [[A_PTR]]
 
 // CHECK: define internal i32 [[TASK_ENTRY4]](i32, [[KMP_TASK_T]]{{.*}}* noalias)
-// CHECK: store i32 3, i32* [[A_PTR:@.+]]
+// CHECK: store i32 3, i32* [[A_PTR]]
 
 // CHECK: define internal i32 [[TASK_ENTRY5]](i32, [[KMP_TASK_T]]{{.*}}* noalias)
-// CHECK: store i32 4, i32* [[A_PTR:@.+]]
+// CHECK: store i32 4, i32* [[A_PTR]]
 // CHECK: store i32 5, i32* [[C_PTR:%.+]], align 128
+
+// CHECK: define internal i32
+// CHECK: store i32 4, i32* [[A_PTR]]
+
+// CHECK: define internal i32 [[TASK_ENTRY6]](i32, [[KMP_TASK_T]]{{.*}}* noalias)
+// CHECK: switch i32 %{{.+}}, label
+// CHECK: load i32*, i32** %
+// CHECK: store i32 1, i32* %
+// CHECK: call i32 @__kmpc_omp_task(%
+
+// CHECK: call i8* @__kmpc_omp_task_alloc(
+// CHECK: call i32 @__kmpc_omp_task(%
+// CHECK: load i32*, i32** %
+// CHECK: store i32 2, i32* %
+// CHECK: call i32 @__kmpc_omp_task(%
+
+// CHECK: call i32 @__kmpc_omp_taskyield(%
+// CHECK: load i32*, i32** %
+// CHECK: store i32 3, i32* %
+// CHECK: call i32 @__kmpc_omp_task(%
+
+// CHECK: call i32 @__kmpc_omp_taskwait(%
+// CHECK: load i32*, i32** %
+// CHECK: store i32 4, i32* %
+// CHECK: call i32 @__kmpc_omp_task(%
 #endif
 
diff --git a/test/OpenMP/task_firstprivate_codegen.cpp b/test/OpenMP/task_firstprivate_codegen.cpp
index e2244140d1e14..0d8e1c4afe4e5 100644
--- a/test/OpenMP/task_firstprivate_codegen.cpp
+++ b/test/OpenMP/task_firstprivate_codegen.cpp
@@ -24,10 +24,10 @@ struct S {
 
 volatile double g;
 
-// CHECK-DAG: [[KMP_TASK_T_TY:%.+]] = type { i8*, i32 (i32, i8*)*, i32, i32 (i32, i8*)* }
+// CHECK-DAG: [[KMP_TASK_T_TY:%.+]] = type { i8*, i32 (i32, i8*)*, i32, %union{{.+}}, %union{{.+}} }
 // CHECK-DAG: [[S_DOUBLE_TY:%.+]] = type { double }
 // CHECK-DAG: [[PRIVATES_MAIN_TY:%.+]] = type {{.?}}{ [2 x [[S_DOUBLE_TY]]], [[S_DOUBLE_TY]], i32, [2 x i32]
-// CHECK-DAG: [[CAP_MAIN_TY:%.+]] = type { [2 x i32]*, i32*, [2 x [[S_DOUBLE_TY]]]*, [[S_DOUBLE_TY]]*, i{{[0-9]+}}* }
+// CHECK-DAG: [[CAP_MAIN_TY:%.+]] = type {{.*}}{ [2 x i32]*, i32, {{.*}}[2 x [[S_DOUBLE_TY]]]*, [[S_DOUBLE_TY]]*, i{{[0-9]+}}
 // CHECK-DAG: [[KMP_TASK_MAIN_TY:%.+]] = type { [[KMP_TASK_T_TY]], [[PRIVATES_MAIN_TY]] }
 // CHECK-DAG: [[S_INT_TY:%.+]] = type { i32 }
 // CHECK-DAG: [[CAP_TMAIN_TY:%.+]] = type { [2 x i32]*, i32*, [2 x [[S_INT_TY]]]*, [[S_INT_TY]]* }
@@ -58,18 +58,16 @@ int main() {
   // LAMBDA: call{{( x86_thiscallcc)?}} void [[OUTER_LAMBDA:@.+]](
   [&]() {
   // LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
-  // LAMBDA: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 48, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+  // LAMBDA: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 56, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
 // LAMBDA: [[PRIVATES:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1
 // LAMBDA: [[G_PRIVATE_ADDR:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[PRIVATES]], i{{.+}} 0, i{{.+}} 0
 // LAMBDA: [[G_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 0
-// LAMBDA: [[G_REF:%.+]] = load double*, double** [[G_ADDR_REF]]
-// LAMBDA: [[G_VAL:%.+]] = load volatile double, double* [[G_REF]]
+// LAMBDA: [[G_VAL:%.+]] = load volatile double, double* [[G_ADDR_REF]]
 // LAMBDA: store volatile double [[G_VAL]], double* [[G_PRIVATE_ADDR]]
 
 // LAMBDA: [[SIVAR_PRIVATE_ADDR:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[PRIVATES]], i{{.+}} 0, i{{.+}} 1
 // LAMBDA: [[SIVAR_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1
-// LAMBDA: [[SIVAR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SIVAR_ADDR_REF]]
-// LAMBDA: [[SIVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[SIVAR_REF]]
+// LAMBDA: [[SIVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[SIVAR_ADDR_REF]]
 // LAMBDA: store i{{[0-9]+}} [[SIVAR_VAL]], i{{[0-9]+}}* [[SIVAR_PRIVATE_ADDR]]
 
 // LAMBDA: call i32 @__kmpc_omp_task(%{{.+}}* @{{.+}}, i32 %{{.+}}, i8* [[RES]])
@@ -104,18 +102,16 @@ int main() {
   // BLOCKS: call void {{%.+}}(i8
   ^{
   // BLOCKS: define{{.*}} internal{{.*}} void {{.+}}(i8*
-  // BLOCKS: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 48, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+  // BLOCKS: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 56, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
   // BLOCKS: [[PRIVATES:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1
   // BLOCKS: [[G_PRIVATE_ADDR:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[PRIVATES]], i{{.+}} 0, i{{.+}} 0
   // BLOCKS: [[G_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 0
-  // BLOCKS: [[G_REF:%.+]] = load double*, double** [[G_ADDR_REF]]
-  // BLOCKS: [[G_VAL:%.+]] = load volatile double, double* [[G_REF]]
+  // BLOCKS: [[G_VAL:%.+]] = load volatile double, double* [[G_ADDR_REF]]
   // BLOCKS: store volatile double [[G_VAL]], double* [[G_PRIVATE_ADDR]]
 
   // BLOCKS: [[SIVAR_PRIVATE_ADDR:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[PRIVATES]], i{{.+}} 0, i{{.+}} 1
   // BLOCKS: [[SIVAR_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1
-  // BLOCKS: [[SIVAR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SIVAR_ADDR_REF]]
-  // BLOCKS: [[SIVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[SIVAR_REF]]
+  // BLOCKS: [[SIVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[SIVAR_ADDR_REF]]
   // BLOCKS: store i{{[0-9]+}} [[SIVAR_VAL]], i{{[0-9]+}}* [[SIVAR_PRIVATE_ADDR]]
   // BLOCKS: call i32 @__kmpc_omp_task(%{{.+}}* @{{.+}}, i32 %{{.+}}, i8* [[RES]])
   // BLOCKS: ret
@@ -180,20 +176,22 @@ int main() {
 // CHECK: [[VEC_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
 // CHECK: store [2 x i32]* [[VEC_ADDR]], [2 x i32]** [[VEC_REF]],
 // CHECK: [[T_VAR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 1
-// CHECK: store i32* [[T_VAR_ADDR]], i32** [[T_VAR_REF]],
-// CHECK: [[S_ARR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK: [[T_VAR:%.+]] = load i32, i32* [[T_VAR_ADDR]],
+// CHECK: store i32 [[T_VAR]], i32* [[T_VAR_REF]],
+// CHECK: [[S_ARR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 3
 // CHECK: store [2 x [[S_DOUBLE_TY]]]* [[S_ARR_ADDR]], [2 x [[S_DOUBLE_TY]]]** [[S_ARR_REF]],
-// CHECK: [[VAR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 3
+// CHECK: [[VAR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 4
 // CHECK: store [[S_DOUBLE_TY]]* [[VAR_ADDR]], [[S_DOUBLE_TY]]** [[VAR_REF]],
-// CHECK: [[SIVAR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 4
-// CHECK: store i{{[0-9]+}}* [[SIVAR]], i{{[0-9]+}}** [[SIVAR_REF]],
+// CHECK: [[SIVAR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 5
+// CHECK: [[SIVAR_VAL:%.+]] = load i32, i32* [[SIVAR]],
+// CHECK: store i{{[0-9]+}} [[SIVAR_VAL]], i{{[0-9]+}}* [[SIVAR_REF]],
 
 // Allocate task.
 // Returns struct kmp_task_t {
 //         [[KMP_TASK_T]] task_data;
 //         [[KMP_TASK_MAIN_TY]] privates;
 //       };
-// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc([[LOC]], i32 [[GTID]], i32 1, i64 72, i64 40, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc([[LOC]], i32 [[GTID]], i32 9, i64 80, i64 40, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
 // CHECK: [[RES_KMP_TASK:%.+]] = bitcast i8* [[RES]] to [[KMP_TASK_MAIN_TY]]*
 
 // Fill kmp_task_t->shareds by copying from original capture argument.
@@ -211,7 +209,7 @@ int main() {
 // Constructors for s_arr and var.
 // s_arr;
 // CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
-// CHECK: [[S_ARR_ADDR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 2
+// CHECK: [[S_ARR_ADDR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 3
 // CHECK: load [2 x [[S_DOUBLE_TY]]]*, [2 x [[S_DOUBLE_TY]]]** [[S_ARR_ADDR_REF]],
 // CHECK: call void [[S_DOUBLE_TY_COPY_CONSTR]]([[S_DOUBLE_TY]]* [[S_ARR_CUR:%[^,]+]],
 // CHECK: getelementptr [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* [[S_ARR_CUR]], i{{.+}} 1
@@ -221,14 +219,13 @@ int main() {
 
 // var;
 // CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 1
-// CHECK: [[VAR_ADDR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 3
+// CHECK: [[VAR_ADDR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 4
 // CHECK: [[VAR_REF:%.+]] = load [[S_DOUBLE_TY]]*, [[S_DOUBLE_TY]]** [[VAR_ADDR_REF]],
 // CHECK: call void [[S_DOUBLE_TY_COPY_CONSTR]]([[S_DOUBLE_TY]]* [[PRIVATE_VAR_REF]], [[S_DOUBLE_TY]]* {{.*}}[[VAR_REF]],
 
 // t_var;
 // CHECK: [[PRIVATE_T_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
-// CHECK: [[T_VAR_ADDR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 1
-// CHECK: [[T_VAR_REF:%.+]] = load i{{.+}}*, i{{.+}}** [[T_VAR_ADDR_REF]],
+// CHECK: [[T_VAR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 1
 // CHECK: [[T_VAR:%.+]] = load i{{.+}}, i{{.+}}* [[T_VAR_REF]],
 // CHECK: store i32 [[T_VAR]], i32* [[PRIVATE_T_VAR_REF]],
 
@@ -239,14 +236,14 @@ int main() {
 
 // sivar;
 // CHECK: [[PRIVATE_SIVAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 4
-// CHECK: [[SIVAR_ADDR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 4
-// CHECK: [[SIVAR_REF:%.+]] = load i{{.+}}*, i{{.+}}** [[SIVAR_ADDR_REF]],
+// CHECK: [[SIVAR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 5
 // CHECK: [[SIVAR:%.+]] = load i{{.+}}, i{{.+}}* [[SIVAR_REF]],
 // CHECK: store i32 [[SIVAR]], i32* [[PRIVATE_SIVAR_REF]],
 
 // Provide pointer to destructor function, which will destroy private variables at the end of the task.
 // CHECK: [[DESTRUCTORS_REF:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{.+}} 0, i{{.+}} 3
-// CHECK: store i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[DESTRUCTORS:@.+]] to i32 (i32, i8*)*), i32 (i32, i8*)** [[DESTRUCTORS_REF]],
+// CHECK: [[DESTRUCTORS_PTR:%.+]] = bitcast %union{{.+}}* [[DESTRUCTORS_REF]] to i32 (i32, i8*)**
+// CHECK: store i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[DESTRUCTORS:@.+]] to i32 (i32, i8*)*), i32 (i32, i8*)** [[DESTRUCTORS_PTR]],
 
 // Start task.
 // CHECK: call i32 @__kmpc_omp_task([[LOC]], i32 [[GTID]], i8* [[RES]])
@@ -347,7 +344,7 @@ int main() {
 //         [[KMP_TASK_T_TY]] task_data;
 //         [[KMP_TASK_TMAIN_TY]] privates;
 //       };
-// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc([[LOC]], i32 [[GTID]], i32 1, i64 256, i64 32, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_TMAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc([[LOC]], i32 [[GTID]], i32 9, i64 256, i64 32, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_TMAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
 // CHECK: [[RES_KMP_TASK:%.+]] = bitcast i8* [[RES]] to [[KMP_TASK_TMAIN_TY]]*
 
 // Fill kmp_task_t->shareds by copying from original capture argument.
@@ -391,7 +388,8 @@ int main() {
 
 // Provide pointer to destructor function, which will destroy private variables at the end of the task.
 // CHECK: [[DESTRUCTORS_REF:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{.+}} 0, i{{.+}} 3
-// CHECK: store i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_TMAIN_TY]]*)* [[DESTRUCTORS:@.+]] to i32 (i32, i8*)*), i32 (i32, i8*)** [[DESTRUCTORS_REF]],
+// CHECK: [[DESTRUCTORS_PTR:%.+]] = bitcast %union{{.+}}* [[DESTRUCTORS_REF]] to i32 (i32, i8*)**
+// CHECK: store i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_TMAIN_TY]]*)* [[DESTRUCTORS:@.+]] to i32 (i32, i8*)*), i32 (i32, i8*)** [[DESTRUCTORS_PTR]],
 
 // Start task.
 // CHECK: call i32 @__kmpc_omp_task([[LOC]], i32 [[GTID]], i8* [[RES]])
@@ -422,11 +420,11 @@ int main() {
 // CHECK: ret void
 
 // CHECK: define internal i32 [[TASK_ENTRY]](i32, [[KMP_TASK_TMAIN_TY]]* noalias)
-
-// CHECK: [[PRIV_T_VAR_ADDR:%.+]] = alloca i32*,
-// CHECK: [[PRIV_VEC_ADDR:%.+]] = alloca [2 x i32]*,
-// CHECK: [[PRIV_S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]]*,
-// CHECK: [[PRIV_VAR_ADDR:%.+]] = alloca [[S_INT_TY]]*,
+// CHECK: alloca i32*,
+// CHECK-DAG: [[PRIV_T_VAR_ADDR:%.+]] = alloca i32*,
+// CHECK-DAG: [[PRIV_VEC_ADDR:%.+]] = alloca [2 x i32]*,
+// CHECK-DAG: [[PRIV_S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]]*,
+// CHECK-DAG: [[PRIV_VAR_ADDR:%.+]] = alloca [[S_INT_TY]]*,
 // CHECK: store void (i8*, ...)* bitcast (void ([[PRIVATES_TMAIN_TY]]*, i32**, [2 x i32]**, [2 x [[S_INT_TY]]]**, [[S_INT_TY]]**)* [[PRIVATES_MAP_FN]] to void (i8*, ...)*), void (i8*, ...)** [[MAP_FN_ADDR:%.+]],
 // CHECK: [[MAP_FN:%.+]] = load void (i8*, ...)*, void (i8*, ...)** [[MAP_FN_ADDR]],
 // CHECK: call void (i8*, ...) [[MAP_FN]](i8* %{{.+}}, i32** [[PRIV_T_VAR_ADDR]], [2 x i32]** [[PRIV_VEC_ADDR]], [2 x [[S_INT_TY]]]** [[PRIV_S_ARR_ADDR]], [[S_INT_TY]]** [[PRIV_VAR_ADDR]])
diff --git a/test/OpenMP/task_firstprivate_messages.cpp b/test/OpenMP/task_firstprivate_messages.cpp
index ef5f3856430a3..11d8c5789879a 100644
--- a/test/OpenMP/task_firstprivate_messages.cpp
+++ b/test/OpenMP/task_firstprivate_messages.cpp
@@ -7,6 +7,17 @@ bool foobool(int argc) {
   return argc;
 }
 
+template <typename T>
+struct S {
+  T b;
+  S(T a, T c) {
+#pragma omp task default(none) firstprivate(a, b)
+    a = b = c; // expected-error {{variable 'c' must have explicitly specified data sharing attributes}}
+  }
+};
+
+S<int> s(3, 4); // expected-note {{in instantiation of member function 'S<int>::S' requested here}}
+
 struct S1; // expected-note {{declared here}} expected-note{{forward declaration of 'S1'}}
 extern S1 a;
 class S2 {
diff --git a/test/OpenMP/task_if_codegen.cpp b/test/OpenMP/task_if_codegen.cpp
index 5992be02a84a9..4226dceda86e9 100644
--- a/test/OpenMP/task_if_codegen.cpp
+++ b/test/OpenMP/task_if_codegen.cpp
@@ -1,6 +1,6 @@
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck %s
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-apple-darwin10 -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix=CHECK %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
 // REQUIRES: x86-registered-target
 // expected-no-diagnostics
 #ifndef HEADER
@@ -63,11 +63,11 @@ int tmain(T Arg) {
 // CHECK-LABEL: @main
 int main() {
 // CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(
-// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc({{[^,]+}}, i32 [[GTID]], i32 1, i64 32, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[CAP_FN7:[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc({{[^,]+}}, i32 [[GTID]], i32 1, i64 40, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[CAP_FN7:[^ ]+]] to i32 (i32, i8*)*))
 // CHECK: call i32 @__kmpc_omp_task(%{{.+}}* @{{.+}}, i32 [[GTID]], i8* [[ORIG_TASK_PTR]])
 #pragma omp task if (true)
   fn7();
-// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc({{[^,]+}}, i32 [[GTID]], i32 1, i64 32, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[CAP_FN8:[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc({{[^,]+}}, i32 [[GTID]], i32 1, i64 40, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[CAP_FN8:[^ ]+]] to i32 (i32, i8*)*))
 // CHECK: [[TASK_PTR:%.+]] = bitcast i8* [[ORIG_TASK_PTR]] to
 // CHECK: call void @__kmpc_omp_task_begin_if0(%{{.+}}* @{{.+}}, i{{.+}} [[GTID]], i8* [[ORIG_TASK_PTR]])
 // CHECK: call i32 [[CAP_FN8]](i32 [[GTID]], %{{.+}}* [[TASK_PTR]])
@@ -75,7 +75,7 @@ int main() {
 #pragma omp task if (false)
   fn8();
 
-// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc({{[^,]+}}, i32 [[GTID]], i32 1, i64 32, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[CAP_FN9:[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc({{[^,]+}}, i32 [[GTID]], i32 1, i64 40, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[CAP_FN9:[^ ]+]] to i32 (i32, i8*)*))
 // CHECK: [[TASK_PTR:%.+]] = bitcast i8* [[ORIG_TASK_PTR]] to
 // CHECK: br i1 %{{.+}}, label %[[OMP_THEN:.+]], label %[[OMP_ELSE:.+]]
 // CHECK: [[OMP_THEN]]
@@ -89,7 +89,7 @@ int main() {
 // CHECK: [[OMP_END]]
 #pragma omp task if (Arg)
   fn9();
-// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc({{[^,]+}}, i32 [[GTID]], i32 1, i64 32, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[CAP_FN10:[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc({{[^,]+}}, i32 [[GTID]], i32 1, i64 40, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[CAP_FN10:[^ ]+]] to i32 (i32, i8*)*))
 // CHECK: [[TASK_PTR:%.+]] = bitcast i8* [[ORIG_TASK_PTR]] to
 // CHECK: br i1 %{{.+}}, label %[[OMP_THEN:.+]], label %[[OMP_ELSE:.+]]
 // CHECK: [[OMP_THEN]]
@@ -126,7 +126,7 @@ int main() {
 
 // CHECK-LABEL: define {{.+}} @{{.+}}tmain
 // CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(
-// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^,]+}}, i32 [[GTID]], i32 1, i64 32, i64 1, i32 (i32, i8*)* bitcast (i32 (i32,  %{{[^*]+}}*)* [[CAP_FN1:[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^,]+}}, i32 [[GTID]], i32 1, i64 40, i64 1, i32 (i32, i8*)* bitcast (i32 (i32,  %{{[^*]+}}*)* [[CAP_FN1:[^ ]+]] to i32 (i32, i8*)*))
 // CHECK: call i32 @__kmpc_omp_task(%{{.+}}* @{{.+}}, i32 [[GTID]], i8* [[ORIG_TASK_PTR]])
 
 // CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc(
@@ -135,7 +135,7 @@ int main() {
 // CHECK: call i32 [[CAP_FN2:@.+]](i32 [[GTID]], %{{.+}}* [[TASK_PTR]])
 // CHECK: call void @__kmpc_omp_task_complete_if0(%{{.+}}* @{{.+}}, i{{.+}} [[GTID]], i8* [[ORIG_TASK_PTR]])
 
-// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^,]+}}, i32 [[GTID]], i32 1, i64 32, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[CAP_FN3:[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^,]+}}, i32 [[GTID]], i32 1, i64 40, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[CAP_FN3:[^ ]+]] to i32 (i32, i8*)*))
 // CHECK: [[TASK_PTR:%.+]] = bitcast i8* [[ORIG_TASK_PTR]] to
 // CHECK: br i1 %{{.+}}, label %[[OMP_THEN:.+]], label %[[OMP_ELSE:.+]]
 // CHECK: [[OMP_THEN]]
@@ -148,7 +148,7 @@ int main() {
 // CHECK: br label %[[OMP_END]]
 // CHECK: [[OMP_END]]
 
-// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^,]+}}, i32 [[GTID]], i32 1, i64 32, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[CAP_FN4:[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^,]+}}, i32 [[GTID]], i32 1, i64 40, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[CAP_FN4:[^ ]+]] to i32 (i32, i8*)*))
 // CHECK: [[TASK_PTR:%.+]] = bitcast i8* [[ORIG_TASK_PTR]] to
 // CHECK: br i1 %{{.+}}, label %[[OMP_THEN:.+]], label %[[OMP_ELSE:.+]]
 // CHECK: [[OMP_THEN]]
@@ -162,7 +162,7 @@ int main() {
 // CHECK: br label %[[OMP_END]]
 // CHECK: [[OMP_END]]
 
-// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^,]+}}, i32 [[GTID]], i32 1, i64 32, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[CAP_FN5:[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^,]+}}, i32 [[GTID]], i32 1, i64 40, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[CAP_FN5:[^ ]+]] to i32 (i32, i8*)*))
 // CHECK: [[TASK_PTR:%.+]] = bitcast i8* [[ORIG_TASK_PTR]] to
 // CHECK: br i1 %{{.+}}, label %[[OMP_THEN:.+]], label %[[OMP_ELSE:.+]]
 // CHECK: [[OMP_THEN]]
@@ -176,7 +176,7 @@ int main() {
 // CHECK: br label %[[OMP_END]]
 // CHECK: [[OMP_END]]
 
-// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^,]+}}, i32 [[GTID]], i32 1, i64 32, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[CAP_FN6:[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[ORIG_TASK_PTR:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^,]+}}, i32 [[GTID]], i32 1, i64 40, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[CAP_FN6:[^ ]+]] to i32 (i32, i8*)*))
 // CHECK: [[TASK_PTR:%.+]] = bitcast i8* [[ORIG_TASK_PTR]] to
 // CHECK: br i1 %{{.+}}, label %[[OMP_THEN:.+]], label %[[OMP_ELSE:.+]]
 // CHECK: [[OMP_THEN]]
diff --git a/test/OpenMP/task_messages.cpp b/test/OpenMP/task_messages.cpp
index 64bf8a40f025d..f42a37ae1daad 100644
--- a/test/OpenMP/task_messages.cpp
+++ b/test/OpenMP/task_messages.cpp
@@ -60,11 +60,10 @@ int foo() {
 // expected-error@+1 2 {{calling a private constructor of class 'S'}}
 #pragma omp parallel shared(a, b)
   ++a, ++b;
-// expected-note@+1 3 {{defined as reduction}}
+// expected-note@+1 2 {{defined as reduction}}
 #pragma omp parallel reduction(+ : r)
-// expected-error@+1 {{argument of a reduction clause of a parallel construct must not appear in a firstprivate clause on a task construct}}
+// expected-error@+1 2 {{argument of a reduction clause of a parallel construct must not appear in a firstprivate clause on a task construct}}
 #pragma omp task firstprivate(r)
-  // expected-error@+1 2 {{reduction variables may not be accessed in an explicit task}}
   ++r;
 // expected-note@+1 2 {{defined as reduction}}
 #pragma omp parallel reduction(+ : r)
@@ -77,12 +76,11 @@ int foo() {
   // expected-error@+1 2 {{reduction variables may not be accessed in an explicit task}}
   ++r;
 #pragma omp parallel
-// expected-note@+1 3 {{defined as reduction}}
+// expected-note@+1 2 {{defined as reduction}}
 #pragma omp for reduction(+ : r)
   for (int i = 0; i < 10; ++i)
-// expected-error@+1 {{argument of a reduction clause of a for construct must not appear in a firstprivate clause on a task construct}}
+// expected-error@+1 2 {{argument of a reduction clause of a for construct must not appear in a firstprivate clause on a task construct}}
 #pragma omp task firstprivate(r)
-    // expected-error@+1 2 {{reduction variables may not be accessed in an explicit task}}
     ++r;
 #pragma omp parallel
 // expected-note@+1 2 {{defined as reduction}}
diff --git a/test/OpenMP/task_private_codegen.cpp b/test/OpenMP/task_private_codegen.cpp
index 1455fd11a91d9..97155a73f1bd0 100644
--- a/test/OpenMP/task_private_codegen.cpp
+++ b/test/OpenMP/task_private_codegen.cpp
@@ -24,7 +24,7 @@ struct S {
 
 volatile double g;
 
-// CHECK-DAG: [[KMP_TASK_T_TY:%.+]] = type { i8*, i32 (i32, i8*)*, i32, i32 (i32, i8*)* }
+// CHECK-DAG: [[KMP_TASK_T_TY:%.+]] = type { i8*, i32 (i32, i8*)*, i32, %union{{.+}}, %union{{.+}} }
 // CHECK-DAG: [[S_DOUBLE_TY:%.+]] = type { double }
 // CHECK-DAG: [[CAP_MAIN_TY:%.+]] = type { i8 }
 // CHECK-DAG: [[PRIVATES_MAIN_TY:%.+]] = type {{.?}}{ [2 x [[S_DOUBLE_TY]]], [[S_DOUBLE_TY]], i32, [2 x i32]
@@ -56,10 +56,8 @@ int main() {
   // LAMBDA: call{{( x86_thiscallcc)?}} void [[OUTER_LAMBDA:@.+]](
   [&]() {
   // LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
-  // LAMBDA: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 48, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+  // LAMBDA: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 56, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
 // LAMBDA: [[PRIVATES:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1
-// LAMBDA: [[G_PRIVATE_ADDR:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[PRIVATES]], i{{.+}} 0, i{{.+}} 0
-// LAMBDA: [[SIVAR_PRIVATE_ADDR:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[PRIVATES]], i{{.+}} 0, i{{.+}} 1
 // LAMBDA: call i32 @__kmpc_omp_task(%{{.+}}* @{{.+}}, i32 %{{.+}}, i8* [[RES]])
 // LAMBDA: ret
 #pragma omp task private(g, sivar)
@@ -94,10 +92,8 @@ int main() {
   // BLOCKS: call void {{%.+}}(i8
   ^{
   // BLOCKS: define{{.*}} internal{{.*}} void {{.+}}(i8*
-  // BLOCKS: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 48, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+  // BLOCKS: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 56, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
   // BLOCKS: [[PRIVATES:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1
-  // BLOCKS: [[G_PRIVATE_ADDR:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[PRIVATES]], i{{.+}} 0, i{{.+}} 0
-  // BLOCKS: [[SIVAR_PRIVATE_ADDR:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[PRIVATES]], i{{.+}} 0, i{{.+}} 1
   // BLOCKS: call i32 @__kmpc_omp_task(%{{.+}}* @{{.+}}, i32 %{{.+}}, i8* [[RES]])
   // BLOCKS: ret
 #pragma omp task private(g, sivar)
@@ -162,7 +158,7 @@ int main() {
 //         [[KMP_TASK_T_TY]] task_data;
 //         [[KMP_TASK_MAIN_TY]] privates;
 //       };
-// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc([[LOC]], i32 [[GTID]], i32 1, i64 72, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc([[LOC]], i32 [[GTID]], i32 9, i64 80, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
 // CHECK: [[RES_KMP_TASK:%.+]] = bitcast i8* [[RES]] to [[KMP_TASK_MAIN_TY]]*
 
 // CHECK: [[TASK:%.+]] = getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
@@ -186,7 +182,8 @@ int main() {
 
 // Provide pointer to destructor function, which will destroy private variables at the end of the task.
 // CHECK: [[DESTRUCTORS_REF:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{.+}} 0, i{{.+}} 3
-// CHECK: store i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[DESTRUCTORS:@.+]] to i32 (i32, i8*)*), i32 (i32, i8*)** [[DESTRUCTORS_REF]],
+// CHECK: [[DESTRUCTORS_PTR:%.+]] = bitcast %union{{.+}}* [[DESTRUCTORS_REF]] to i32 (i32, i8*)**
+// CHECK: store i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[DESTRUCTORS:@.+]] to i32 (i32, i8*)*), i32 (i32, i8*)** [[DESTRUCTORS_PTR]],
 
 // Start task.
 // CHECK: call i32 @__kmpc_omp_task([[LOC]], i32 [[GTID]], i8* [[RES]])
@@ -275,7 +272,7 @@ int main() {
 //         [[KMP_TASK_T_TY]] task_data;
 //         [[KMP_TASK_TMAIN_TY]] privates;
 //       };
-// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc([[LOC]], i32 [[GTID]], i32 1, i64 256, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_TMAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc([[LOC]], i32 [[GTID]], i32 9, i64 256, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_TMAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
 // CHECK: [[RES_KMP_TASK:%.+]] = bitcast i8* [[RES]] to [[KMP_TASK_TMAIN_TY]]*
 
 // CHECK: [[TASK:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
@@ -299,7 +296,8 @@ int main() {
 
 // Provide pointer to destructor function, which will destroy private variables at the end of the task.
 // CHECK: [[DESTRUCTORS_REF:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{.+}} 0, i{{.+}} 3
-// CHECK: store i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_TMAIN_TY]]*)* [[DESTRUCTORS:@.+]] to i32 (i32, i8*)*), i32 (i32, i8*)** [[DESTRUCTORS_REF]],
+// CHECK: [[DESTRUCTORS_PTR:%.+]] = bitcast %union{{.+}}* [[DESTRUCTORS_REF]] to i32 (i32, i8*)**
+// CHECK: store i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_TMAIN_TY]]*)* [[DESTRUCTORS:@.+]] to i32 (i32, i8*)*), i32 (i32, i8*)** [[DESTRUCTORS_PTR]],
 
 // Start task.
 // CHECK: call i32 @__kmpc_omp_task([[LOC]], i32 [[GTID]], i8* [[RES]])
@@ -331,10 +329,11 @@ int main() {
 
 // CHECK: define internal i32 [[TASK_ENTRY]](i32, [[KMP_TASK_TMAIN_TY]]* noalias)
 
-// CHECK: [[PRIV_T_VAR_ADDR:%.+]] = alloca i32*,
-// CHECK: [[PRIV_VEC_ADDR:%.+]] = alloca [2 x i32]*,
-// CHECK: [[PRIV_S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]]*,
-// CHECK: [[PRIV_VAR_ADDR:%.+]] = alloca [[S_INT_TY]]*,
+// CHECK: alloca i32*,
+// CHECK-DAG: [[PRIV_T_VAR_ADDR:%.+]] = alloca i32*,
+// CHECK-DAG: [[PRIV_VEC_ADDR:%.+]] = alloca [2 x i32]*,
+// CHECK-DAG: [[PRIV_S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]]*,
+// CHECK-DAG: [[PRIV_VAR_ADDR:%.+]] = alloca [[S_INT_TY]]*,
 // CHECK: store void (i8*, ...)* bitcast (void ([[PRIVATES_TMAIN_TY]]*, i32**, [2 x i32]**, [2 x [[S_INT_TY]]]**, [[S_INT_TY]]**)* [[PRIVATES_MAP_FN]] to void (i8*, ...)*), void (i8*, ...)** [[MAP_FN_ADDR:%.+]],
 // CHECK: [[MAP_FN:%.+]] = load void (i8*, ...)*, void (i8*, ...)** [[MAP_FN_ADDR]],
 // CHECK: call void (i8*, ...) [[MAP_FN]](i8* %{{.+}}, i32** [[PRIV_T_VAR_ADDR]], [2 x i32]** [[PRIV_VEC_ADDR]], [2 x [[S_INT_TY]]]** [[PRIV_S_ARR_ADDR]], [[S_INT_TY]]** [[PRIV_VAR_ADDR]])
diff --git a/test/OpenMP/taskgroup_codegen.cpp b/test/OpenMP/taskgroup_codegen.cpp
index d1bc2aafc7b4b..0f6e81b3ba0b7 100644
--- a/test/OpenMP/taskgroup_codegen.cpp
+++ b/test/OpenMP/taskgroup_codegen.cpp
@@ -32,6 +32,7 @@ int main() {
   foo();
 // CHECK-NOT:   call {{.*}}void @__kmpc_taskgroup
 // CHECK-NOT:   call {{.*}}void @__kmpc_end_taskgroup
+// CHECK:       ret
   return a;
 }
 
diff --git a/test/OpenMP/taskloop_codegen.cpp b/test/OpenMP/taskloop_codegen.cpp
new file mode 100644
index 0000000000000..e585fcec10d5d
--- /dev/null
+++ b/test/OpenMP/taskloop_codegen.cpp
@@ -0,0 +1,192 @@
+// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -x c++ -emit-llvm %s -o - -femit-all-decls | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - -femit-all-decls | FileCheck %s
+// expected-no-diagnostics
+// REQUIRES: x86-registered-target
+#ifndef HEADER
+#define HEADER
+
+// CHECK-LABEL: @main
+int main(int argc, char **argv) {
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%ident_t* [[DEFLOC:@.+]])
+// CHECK: [[TASKV:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[DEFLOC]], i32 [[GTID]], i32 33, i64 72, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[TDP_TY:%.+]]*)* [[TASK1:@.+]] to i32 (i32, i8*)*))
+// CHECK: [[TASK:%.+]] = bitcast i8* [[TASKV]] to [[TDP_TY]]*
+// CHECK: [[TASK_DATA:%.+]] = getelementptr inbounds [[TDP_TY]], [[TDP_TY]]* [[TASK]], i32 0, i32 0
+// CHECK: [[DOWN:%.+]] = getelementptr inbounds [[TD_TY:%.+]], [[TD_TY]]* [[TASK_DATA]], i32 0, i32 5
+// CHECK: store i64 0, i64* [[DOWN]],
+// CHECK: [[UP:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* [[TASK_DATA]], i32 0, i32 6
+// CHECK: store i64 9, i64* [[UP]],
+// CHECK: [[ST:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* [[TASK_DATA]], i32 0, i32 7
+// CHECK: store i64 1, i64* [[ST]],
+// CHECK: [[ST_VAL:%.+]] = load i64, i64* [[ST]],
+// CHECK: call void @__kmpc_taskloop(%ident_t* [[DEFLOC]], i32 [[GTID]], i8* [[TASKV]], i32 1, i64* [[DOWN]], i64* [[UP]], i64 [[ST_VAL]], i32 0, i32 0, i64 0, i8* null)
+#pragma omp taskloop priority(argc)
+  for (int i = 0; i < 10; ++i)
+    ;
+// CHECK: [[TASKV:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[DEFLOC]], i32 [[GTID]], i32 1, i64 72, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[TDP_TY:%.+]]*)* [[TASK2:@.+]] to i32 (i32, i8*)*))
+// CHECK: [[TASK:%.+]] = bitcast i8* [[TASKV]] to [[TDP_TY]]*
+// CHECK: [[TASK_DATA:%.+]] = getelementptr inbounds [[TDP_TY]], [[TDP_TY]]* [[TASK]], i32 0, i32 0
+// CHECK: [[DOWN:%.+]] = getelementptr inbounds [[TD_TY:%.+]], [[TD_TY]]* [[TASK_DATA]], i32 0, i32 5
+// CHECK: store i64 0, i64* [[DOWN]],
+// CHECK: [[UP:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* [[TASK_DATA]], i32 0, i32 6
+// CHECK: store i64 9, i64* [[UP]],
+// CHECK: [[ST:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* [[TASK_DATA]], i32 0, i32 7
+// CHECK: store i64 1, i64* [[ST]],
+// CHECK: [[ST_VAL:%.+]] = load i64, i64* [[ST]],
+// CHECK: [[GRAINSIZE:%.+]] = zext i32 %{{.+}} to i64
+// CHECK: call void @__kmpc_taskloop(%ident_t* [[DEFLOC]], i32 [[GTID]], i8* [[TASKV]], i32 1, i64* [[DOWN]], i64* [[UP]], i64 [[ST_VAL]], i32 1, i32 1, i64 [[GRAINSIZE]], i8* null)
+#pragma omp taskloop nogroup grainsize(argc)
+  for (int i = 0; i < 10; ++i)
+    ;
+// CHECK: [[TASKV:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[DEFLOC]], i32 [[GTID]], i32 1, i64 72, i64 24, i32 (i32, i8*)* bitcast (i32 (i32, [[TDP_TY:%.+]]*)* [[TASK3:@.+]] to i32 (i32, i8*)*))
+// CHECK: [[TASK:%.+]] = bitcast i8* [[TASKV]] to [[TDP_TY]]*
+// CHECK: [[TASK_DATA:%.+]] = getelementptr inbounds [[TDP_TY]], [[TDP_TY]]* [[TASK]], i32 0, i32 0
+// CHECK: [[IF:%.+]] = icmp ne i32 %{{.+}}, 0
+// CHECK: [[IF_INT:%.+]] = sext i1 [[IF]] to i32
+// CHECK: [[DOWN:%.+]] = getelementptr inbounds [[TD_TY:%.+]], [[TD_TY]]* [[TASK_DATA]], i32 0, i32 5
+// CHECK: store i64 0, i64* [[DOWN]],
+// CHECK: [[UP:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* [[TASK_DATA]], i32 0, i32 6
+// CHECK: store i64 %{{.+}}, i64* [[UP]],
+// CHECK: [[ST:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* [[TASK_DATA]], i32 0, i32 7
+// CHECK: store i64 1, i64* [[ST]],
+// CHECK: [[ST_VAL:%.+]] = load i64, i64* [[ST]],
+// CHECK: call void @__kmpc_taskloop(%ident_t* [[DEFLOC]], i32 [[GTID]], i8* [[TASKV]], i32 [[IF_INT]], i64* [[DOWN]], i64* [[UP]], i64 [[ST_VAL]], i32 0, i32 2, i64 4, i8* null)
+  int i;
+#pragma omp taskloop if(argc) shared(argc, argv) collapse(2) num_tasks(4)
+  for (i = 0; i < argc; ++i)
+  for (int j = argc; j < argv[argc][argc]; ++j)
+    ;
+}
+
+// CHECK: define internal i32 [[TASK1]](
+// CHECK: [[DOWN:%.+]] = getelementptr inbounds [[TD_TY:%.+]], [[TD_TY]]* %{{.+}}, i32 0, i32 5
+// CHECK: [[DOWN_VAL:%.+]] = load i64, i64* [[DOWN]],
+// CHECK: [[UP:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* %{{.+}}, i32 0, i32 6
+// CHECK: [[UP_VAL:%.+]] = load i64, i64* [[UP]],
+// CHECK: [[ST:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* %{{.+}}, i32 0, i32 7
+// CHECK: [[ST_VAL:%.+]] = load i64, i64* [[ST]],
+// CHECK: [[LITER:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* %{{.+}}, i32 0, i32 8
+// CHECK: [[LITER_VAL:%.+]] = load i32, i32* [[LITER]],
+// CHECK: store i64 [[DOWN_VAL]], i64* [[LB:%[^,]+]],
+// CHECK: store i64 [[UP_VAL]], i64* [[UB:%[^,]+]],
+// CHECK: store i64 [[ST_VAL]], i64* [[ST:%[^,]+]],
+// CHECK: store i32 [[LITER_VAL]], i32* [[LITER:%[^,]+]],
+// CHECK: [[LB_VAL:%.+]] = load i64, i64* [[LB]],
+// CHECK: [[LB_I32:%.+]] = trunc i64 [[LB_VAL]] to i32
+// CHECK: store i32 [[LB_I32]], i32* [[CNT:%.+]],
+// CHECK: br label
+// CHECK: [[VAL:%.+]] = load i32, i32* [[CNT]],
+// CHECK: [[VAL_I64:%.+]] = sext i32 [[VAL]] to i64
+// CHECK: [[UB_VAL:%.+]] = load i64, i64* [[UB]],
+// CHECK: [[CMP:%.+]] = icmp ule i64 [[VAL_I64]], [[UB_VAL]]
+// CHECK: br i1 [[CMP]], label %{{.+}}, label %{{.+}}
+// CHECK: load i32, i32* %
+// CHECK: store i32 %
+// CHECK: load i32, i32* %
+// CHECK: add nsw i32 %{{.+}}, 1
+// CHECK: store i32 %{{.+}}, i32* %
+// CHECK: br label %
+// CHECK: ret i32 0
+
+// CHECK: define internal i32 [[TASK2]](
+// CHECK: [[DOWN:%.+]] = getelementptr inbounds [[TD_TY:%.+]], [[TD_TY]]* %{{.+}}, i32 0, i32 5
+// CHECK: [[DOWN_VAL:%.+]] = load i64, i64* [[DOWN]],
+// CHECK: [[UP:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* %{{.+}}, i32 0, i32 6
+// CHECK: [[UP_VAL:%.+]] = load i64, i64* [[UP]],
+// CHECK: [[ST:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* %{{.+}}, i32 0, i32 7
+// CHECK: [[ST_VAL:%.+]] = load i64, i64* [[ST]],
+// CHECK: [[LITER:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* %{{.+}}, i32 0, i32 8
+// CHECK: [[LITER_VAL:%.+]] = load i32, i32* [[LITER]],
+// CHECK: store i64 [[DOWN_VAL]], i64* [[LB:%[^,]+]],
+// CHECK: store i64 [[UP_VAL]], i64* [[UB:%[^,]+]],
+// CHECK: store i64 [[ST_VAL]], i64* [[ST:%[^,]+]],
+// CHECK: store i32 [[LITER_VAL]], i32* [[LITER:%[^,]+]],
+// CHECK: [[LB_VAL:%.+]] = load i64, i64* [[LB]],
+// CHECK: [[LB_I32:%.+]] = trunc i64 [[LB_VAL]] to i32
+// CHECK: store i32 [[LB_I32]], i32* [[CNT:%.+]],
+// CHECK: br label
+// CHECK: [[VAL:%.+]] = load i32, i32* [[CNT]],
+// CHECK: [[VAL_I64:%.+]] = sext i32 [[VAL]] to i64
+// CHECK: [[UB_VAL:%.+]] = load i64, i64* [[UB]],
+// CHECK: [[CMP:%.+]] = icmp ule i64 [[VAL_I64]], [[UB_VAL]]
+// CHECK: br i1 [[CMP]], label %{{.+}}, label %{{.+}}
+// CHECK: load i32, i32* %
+// CHECK: store i32 %
+// CHECK: load i32, i32* %
+// CHECK: add nsw i32 %{{.+}}, 1
+// CHECK: store i32 %{{.+}}, i32* %
+// CHECK: br label %
+// CHECK: ret i32 0
+
+// CHECK: define internal i32 [[TASK3]](
+// CHECK: [[DOWN:%.+]] = getelementptr inbounds [[TD_TY:%.+]], [[TD_TY]]* %{{.+}}, i32 0, i32 5
+// CHECK: [[DOWN_VAL:%.+]] = load i64, i64* [[DOWN]],
+// CHECK: [[UP:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* %{{.+}}, i32 0, i32 6
+// CHECK: [[UP_VAL:%.+]] = load i64, i64* [[UP]],
+// CHECK: [[ST:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* %{{.+}}, i32 0, i32 7
+// CHECK: [[ST_VAL:%.+]] = load i64, i64* [[ST]],
+// CHECK: [[LITER:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* %{{.+}}, i32 0, i32 8
+// CHECK: [[LITER_VAL:%.+]] = load i32, i32* [[LITER]],
+// CHECK: store i64 [[DOWN_VAL]], i64* [[LB:%[^,]+]],
+// CHECK: store i64 [[UP_VAL]], i64* [[UB:%[^,]+]],
+// CHECK: store i64 [[ST_VAL]], i64* [[ST:%[^,]+]],
+// CHECK: store i32 [[LITER_VAL]], i32* [[LITER:%[^,]+]],
+// CHECK: [[LB_VAL:%.+]] = load i64, i64* [[LB]],
+// CHECK: store i64 [[LB_VAL]], i64* [[CNT:%.+]],
+// CHECK: br label
+// CHECK: ret i32 0
+
+// CHECK-LABEL: @_ZN1SC2Ei
+struct S {
+  int a;
+  S(int c) {
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%ident_t* [[DEFLOC:@.+]])
+// CHECK: [[TASKV:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[DEFLOC]], i32 [[GTID]], i32 1, i64 72, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, [[TDP_TY:%.+]]*)* [[TASK4:@.+]] to i32 (i32, i8*)*))
+// CHECK: [[TASK:%.+]] = bitcast i8* [[TASKV]] to [[TDP_TY]]*
+// CHECK: [[TASK_DATA:%.+]] = getelementptr inbounds [[TDP_TY]], [[TDP_TY]]* [[TASK]], i32 0, i32 0
+// CHECK: [[DOWN:%.+]] = getelementptr inbounds [[TD_TY:%.+]], [[TD_TY]]* [[TASK_DATA]], i32 0, i32 5
+// CHECK: store i64 0, i64* [[DOWN]],
+// CHECK: [[UP:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* [[TASK_DATA]], i32 0, i32 6
+// CHECK: store i64 %{{.+}}, i64* [[UP]],
+// CHECK: [[ST:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* [[TASK_DATA]], i32 0, i32 7
+// CHECK: store i64 1, i64* [[ST]],
+// CHECK: [[ST_VAL:%.+]] = load i64, i64* [[ST]],
+// CHECK: [[NUM_TASKS:%.+]] = zext i32 %{{.+}} to i64
+// CHECK: call void @__kmpc_taskloop(%ident_t* [[DEFLOC]], i32 [[GTID]], i8* [[TASKV]], i32 1, i64* [[DOWN]], i64* [[UP]], i64 [[ST_VAL]], i32 0, i32 2, i64 [[NUM_TASKS]], i8* null)
+#pragma omp taskloop shared(c) num_tasks(a)
+    for (a = 0; a < c; ++a)
+      ;
+  }
+} s(1);
+
+// CHECK: define internal i32 [[TASK4]](
+// CHECK: [[DOWN:%.+]] = getelementptr inbounds [[TD_TY:%.+]], [[TD_TY]]* %{{.+}}, i32 0, i32 5
+// CHECK: [[DOWN_VAL:%.+]] = load i64, i64* [[DOWN]],
+// CHECK: [[UP:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* %{{.+}}, i32 0, i32 6
+// CHECK: [[UP_VAL:%.+]] = load i64, i64* [[UP]],
+// CHECK: [[ST:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* %{{.+}}, i32 0, i32 7
+// CHECK: [[ST_VAL:%.+]] = load i64, i64* [[ST]],
+// CHECK: [[LITER:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* %{{.+}}, i32 0, i32 8
+// CHECK: [[LITER_VAL:%.+]] = load i32, i32* [[LITER]],
+// CHECK: store i64 [[DOWN_VAL]], i64* [[LB:%[^,]+]],
+// CHECK: store i64 [[UP_VAL]], i64* [[UB:%[^,]+]],
+// CHECK: store i64 [[ST_VAL]], i64* [[ST:%[^,]+]],
+// CHECK: store i32 [[LITER_VAL]], i32* [[LITER:%[^,]+]],
+// CHECK: [[LB_VAL:%.+]] = load i64, i64* [[LB]],
+// CHECK: [[LB_I32:%.+]] = trunc i64 [[LB_VAL]] to i32
+// CHECK: store i32 [[LB_I32]], i32* [[CNT:%.+]],
+// CHECK: br label
+// CHECK: [[VAL:%.+]] = load i32, i32* [[CNT]],
+// CHECK: [[VAL_I64:%.+]] = sext i32 [[VAL]] to i64
+// CHECK: [[UB_VAL:%.+]] = load i64, i64* [[UB]],
+// CHECK: [[CMP:%.+]] = icmp ule i64 [[VAL_I64]], [[UB_VAL]]
+// CHECK: br i1 [[CMP]], label %{{.+}}, label %{{.+}}
+// CHECK: load i32, i32* %
+// CHECK: store i32 %
+// CHECK: load i32, i32* %
+// CHECK: add nsw i32 %{{.+}}, 1
+// CHECK: store i32 %{{.+}}, i32* %
+// CHECK: br label %
+// CHECK: ret i32 0
+
+#endif
diff --git a/test/OpenMP/taskloop_collapse_messages.cpp b/test/OpenMP/taskloop_collapse_messages.cpp
index f33da11f5ed96..1a5620e77db0a 100644
--- a/test/OpenMP/taskloop_collapse_messages.cpp
+++ b/test/OpenMP/taskloop_collapse_messages.cpp
@@ -1,8 +1,13 @@
 // RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
 
 void foo() {
 }
 
+#if __cplusplus >= 201103L
+// expected-note@+2 4 {{declared here}}
+#endif
 bool foobool(int argc) {
   return argc;
 }
@@ -29,14 +34,21 @@ T tmain(T argc, S **argv) { //expected-note 2 {{declared here}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp taskloop collapse ((ST > 0) ? 1 + ST : 2) // expected-note 2 {{as specified in 'collapse' clause}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST]; // expected-error 2 {{expected 2 for loops after '#pragma omp taskloop', but found only 1}}
-  // expected-error@+3 2 {{directive '#pragma omp taskloop' cannot contain more than one 'collapse' clause}}
-  // expected-error@+2 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+  // expected-error@+6 2 {{directive '#pragma omp taskloop' cannot contain more than one 'collapse' clause}}
+  // expected-error@+5 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   #pragma omp taskloop collapse (foobool(argc)), collapse (true), collapse (-5)
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp taskloop collapse (S) // expected-error {{'S' does not refer to a value}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp taskloop collapse (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp taskloop collapse (1)
@@ -59,16 +71,27 @@ int main(int argc, char **argv) {
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; // expected-error {{expected 4 for loops after '#pragma omp taskloop', but found only 1}}
   #pragma omp taskloop collapse (2+2)) // expected-warning {{extra tokens at the end of '#pragma omp taskloop' are ignored}}  expected-note {{as specified in 'collapse' clause}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; // expected-error {{expected 4 for loops after '#pragma omp taskloop', but found only 1}}
-  #pragma omp taskloop collapse (foobool(1) > 0 ? 1 : 2) // expected-error {{expression is not an integral constant expression}}
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+  #pragma omp taskloop collapse (foobool(1) > 0 ? 1 : 2)
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+3 {{expression is not an integral constant expression}}
+  // expected-error@+6 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   // expected-error@+2 2 {{directive '#pragma omp taskloop' cannot contain more than one 'collapse' clause}}
   // expected-error@+1 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
   #pragma omp taskloop collapse (foobool(argc)), collapse (true), collapse (-5) 
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   #pragma omp taskloop collapse (S1) // expected-error {{'S1' does not refer to a value}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+1 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp taskloop collapse (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   // expected-error@+3 {{statement after '#pragma omp taskloop' must be a for loop}}
diff --git a/test/OpenMP/taskloop_firstprivate_codegen.cpp b/test/OpenMP/taskloop_firstprivate_codegen.cpp
new file mode 100644
index 0000000000000..822a5c687dc76
--- /dev/null
+++ b/test/OpenMP/taskloop_firstprivate_codegen.cpp
@@ -0,0 +1,511 @@
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-apple-darwin10 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -DLAMBDA -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=LAMBDA %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -fblocks -DBLOCKS -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=BLOCKS %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -DARRAY -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=ARRAY %s
+// expected-no-diagnostics
+// REQUIRES: x86-registered-target
+// It doesn't pass on win32.
+// REQUIRES: shell
+#ifndef ARRAY
+#ifndef HEADER
+#define HEADER
+
+template <class T>
+struct S {
+  T f;
+  S(T a) : f(a) {}
+  S() : f() {}
+  S(const S &s, T t = T()) : f(s.f + t) {}
+  operator T() { return T(); }
+  ~S() {}
+};
+
+volatile double g;
+
+// CHECK-DAG: [[KMP_TASK_T_TY:%.+]] = type { i8*, i32 (i32, i8*)*, i32, %union{{.+}}, %union{{.+}}, i64, i64, i64, i32 }
+// CHECK-DAG: [[S_DOUBLE_TY:%.+]] = type { double }
+// CHECK-DAG: [[PRIVATES_MAIN_TY:%.+]] = type {{.?}}{ [2 x [[S_DOUBLE_TY]]], [[S_DOUBLE_TY]], i32, [2 x i32]
+// CHECK-DAG: [[CAP_MAIN_TY:%.+]] = type {{.*}}{ [2 x i32]*, i32, {{.*}}[2 x [[S_DOUBLE_TY]]]*, [[S_DOUBLE_TY]]*, i{{[0-9]+}}
+// CHECK-DAG: [[KMP_TASK_MAIN_TY:%.+]] = type { [[KMP_TASK_T_TY]], [[PRIVATES_MAIN_TY]] }
+// CHECK-DAG: [[S_INT_TY:%.+]] = type { i32 }
+// CHECK-DAG: [[CAP_TMAIN_TY:%.+]] = type { [2 x i32]*, i32*, [2 x [[S_INT_TY]]]*, [[S_INT_TY]]* }
+// CHECK-DAG: [[PRIVATES_TMAIN_TY:%.+]] = type { i32, [2 x i32], [2 x [[S_INT_TY]]], [[S_INT_TY]], [104 x i8] }
+// CHECK-DAG: [[KMP_TASK_TMAIN_TY:%.+]] = type { [[KMP_TASK_T_TY]], [{{[0-9]+}} x i8], [[PRIVATES_TMAIN_TY]] }
+template <typename T>
+T tmain() {
+  S<T> ttt;
+  S<T> test(ttt);
+  T t_var __attribute__((aligned(128))) = T();
+  T vec[] = {1, 2};
+  S<T> s_arr[] = {1, 2};
+  S<T> var(3);
+#pragma omp taskloop firstprivate(t_var, vec, s_arr, s_arr, var, var)
+  for (int i = 0; i < 10; ++i) {
+    vec[0] = t_var;
+    s_arr[0] = var;
+  }
+  return T();
+}
+
+int main() {
+  static int sivar;
+#ifdef LAMBDA
+  // LAMBDA: [[G:@.+]] = global double
+  // LAMBDA: [[SIVAR:@.+]] = internal global i{{[0-9]+}} 0,
+  // LAMBDA-LABEL: @main
+  // LAMBDA: call{{( x86_thiscallcc)?}} void [[OUTER_LAMBDA:@.+]](
+  [&]() {
+  // LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
+  // LAMBDA: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 88, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+// LAMBDA: [[PRIVATES:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1
+// LAMBDA: [[G_PRIVATE_ADDR:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[PRIVATES]], i{{.+}} 0, i{{.+}} 0
+// LAMBDA: [[G_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 0
+// LAMBDA: [[G_VAL:%.+]] = load volatile double, double* [[G_ADDR_REF]]
+// LAMBDA: store volatile double [[G_VAL]], double* [[G_PRIVATE_ADDR]]
+
+// LAMBDA: [[SIVAR_PRIVATE_ADDR:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[PRIVATES]], i{{.+}} 0, i{{.+}} 1
+// LAMBDA: [[SIVAR_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1
+// LAMBDA: [[SIVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[SIVAR_ADDR_REF]]
+// LAMBDA: store i{{[0-9]+}} [[SIVAR_VAL]], i{{[0-9]+}}* [[SIVAR_PRIVATE_ADDR]]
+
+// LAMBDA: call void @__kmpc_taskloop(%{{.+}}* @{{.+}}, i32 %{{.+}}, i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 0, i32 0, i64 0, i8* null)
+// LAMBDA: ret
+#pragma omp taskloop firstprivate(g, sivar)
+  for (int i = 0; i < 10; ++i) {
+    // LAMBDA: define {{.+}} void [[INNER_LAMBDA:@.+]](%{{.+}}* [[ARG_PTR:%.+]])
+    // LAMBDA: store %{{.+}}* [[ARG_PTR]], %{{.+}}** [[ARG_PTR_REF:%.+]],
+    // LAMBDA: [[ARG_PTR:%.+]] = load %{{.+}}*, %{{.+}}** [[ARG_PTR_REF]]
+    // LAMBDA: [[G_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+    // LAMBDA: [[G_REF:%.+]] = load double*, double** [[G_PTR_REF]]
+    // LAMBDA: store double 2.0{{.+}}, double* [[G_REF]]
+
+    // LAMBDA: store double* %{{.+}}, double** %{{.+}},
+    // LAMBDA: define internal i32 [[TASK_ENTRY]](i32, %{{.+}}* noalias)
+    g = 1;
+    sivar = 11;
+    // LAMBDA: store double 1.0{{.+}}, double* %{{.+}},
+    // LAMBDA: store i{{[0-9]+}} 11, i{{[0-9]+}}* %{{.+}},
+    // LAMBDA: call void [[INNER_LAMBDA]](%
+    // LAMBDA: ret
+    [&]() {
+      g = 2;
+      sivar = 22;
+    }();
+  }
+  }();
+  return 0;
+#elif defined(BLOCKS)
+  // BLOCKS: [[G:@.+]] = global double
+  // BLOCKS-LABEL: @main
+  // BLOCKS: call void {{%.+}}(i8
+  ^{
+  // BLOCKS: define{{.*}} internal{{.*}} void {{.+}}(i8*
+  // BLOCKS: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 88, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+  // BLOCKS: [[PRIVATES:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1
+  // BLOCKS: [[G_PRIVATE_ADDR:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[PRIVATES]], i{{.+}} 0, i{{.+}} 0
+  // BLOCKS: [[G_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 0
+  // BLOCKS: [[G_VAL:%.+]] = load volatile double, double* [[G_ADDR_REF]]
+  // BLOCKS: store volatile double [[G_VAL]], double* [[G_PRIVATE_ADDR]]
+
+  // BLOCKS: [[SIVAR_PRIVATE_ADDR:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[PRIVATES]], i{{.+}} 0, i{{.+}} 1
+  // BLOCKS: [[SIVAR_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1
+  // BLOCKS: [[SIVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[SIVAR_ADDR_REF]]
+  // BLOCKS: store i{{[0-9]+}} [[SIVAR_VAL]], i{{[0-9]+}}* [[SIVAR_PRIVATE_ADDR]]
+  // BLOCKS: call void @__kmpc_taskloop(%{{.+}}* @{{.+}}, i32 %{{.+}}, i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 0, i32 0, i64 0, i8* null)
+  // BLOCKS: ret
+#pragma omp taskloop firstprivate(g, sivar)
+  for (int i = 0; i < 10; ++i) {
+    // BLOCKS: define {{.+}} void {{@.+}}(i8*
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS: store double 2.0{{.+}}, double*
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS-NOT: [[ISVAR]]{{[[^:word:]]}}
+    // BLOCKS: store i{{[0-9]+}} 22, i{{[0-9]+}}*
+    // BLOCKS-NOT: [[SIVAR]]{{[[^:word:]]}}
+    // BLOCKS: ret
+
+    // BLOCKS: store double* %{{.+}}, double** %{{.+}},
+    // BLOCKS: store i{{[0-9]+}}* %{{.+}}, i{{[0-9]+}}** %{{.+}},
+    // BLOCKS: define internal i32 [[TASK_ENTRY]](i32, %{{.+}}* noalias)
+    g = 1;
+    sivar = 11;
+    // BLOCKS: store double 1.0{{.+}}, double* %{{.+}},
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS: store i{{[0-9]+}} 11, i{{[0-9]+}}* %{{.+}},
+    // BLOCKS-NOT: [[SIVAR]]{{[[^:word:]]}}
+    // BLOCKS: call void {{%.+}}(i8
+    ^{
+      g = 2;
+      sivar = 22;
+    }();
+  }
+  }();
+  return 0;
+#else
+  S<double> ttt;
+  S<double> test(ttt);
+  int t_var = 0;
+  int vec[] = {1, 2};
+  S<double> s_arr[] = {1, 2};
+  S<double> var(3);
+#pragma omp taskloop firstprivate(var, t_var, s_arr, vec, s_arr, var, sivar)
+  for (int i = 0; i < 10; ++i) {
+    vec[0] = t_var;
+    s_arr[0] = var;
+    sivar = 33;
+  }
+  return tmain<int>();
+#endif
+}
+
+// CHECK: [[SIVAR:.+]] = internal global i{{[0-9]+}} 0,
+// CHECK: define i{{[0-9]+}} @main()
+// CHECK: alloca [[S_DOUBLE_TY]],
+// CHECK: [[TEST:%.+]] = alloca [[S_DOUBLE_TY]],
+// CHECK: [[T_VAR_ADDR:%.+]] = alloca i32,
+// CHECK: [[VEC_ADDR:%.+]] = alloca [2 x i32],
+// CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_DOUBLE_TY]]],
+// CHECK: [[VAR_ADDR:%.+]] = alloca [[S_DOUBLE_TY]],
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[LOC:%.+]])
+
+// CHECK: call {{.*}} [[S_DOUBLE_TY_COPY_CONSTR:@.+]]([[S_DOUBLE_TY]]* [[TEST]],
+
+// Store original variables in capture struct.
+// CHECK: [[VEC_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: store [2 x i32]* [[VEC_ADDR]], [2 x i32]** [[VEC_REF]],
+// CHECK: [[T_VAR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK: [[T_VAR_VAL:%.+]] = load i32, i32* [[T_VAR_ADDR]],
+// CHECK: store i32 [[T_VAR_VAL]], i32* [[T_VAR_REF]],
+// CHECK: [[S_ARR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 3
+// CHECK: store [2 x [[S_DOUBLE_TY]]]* [[S_ARR_ADDR]], [2 x [[S_DOUBLE_TY]]]** [[S_ARR_REF]],
+// CHECK: [[VAR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 4
+// CHECK: store [[S_DOUBLE_TY]]* [[VAR_ADDR]], [[S_DOUBLE_TY]]** [[VAR_REF]],
+// CHECK: [[SIVAR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 5
+// CHECK: [[SIVAR_VAL:%.+]] = load i32, i32* [[SIVAR]],
+// CHECK: store i{{[0-9]+}} [[SIVAR_VAL]], i{{[0-9]+}}* [[SIVAR_REF]],
+
+// Allocate task.
+// Returns struct kmp_task_t {
+//         [[KMP_TASK_T]] task_data;
+//         [[KMP_TASK_MAIN_TY]] privates;
+//       };
+// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc([[LOC]], i32 [[GTID]], i32 9, i64 112, i64 40, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[RES_KMP_TASK:%.+]] = bitcast i8* [[RES]] to [[KMP_TASK_MAIN_TY]]*
+
+// Fill kmp_task_t->shareds by copying from original capture argument.
+// CHECK: [[TASK:%.+]] = getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[SHAREDS_REF_ADDR:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS_REF_ADDR]],
+// CHECK: [[CAPTURES_ADDR:%.+]] = bitcast [[CAP_MAIN_TY]]* %{{.+}} to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[SHAREDS_REF]], i8* [[CAPTURES_ADDR]], i64 40, i32 8, i1 false)
+
+// Initialize kmp_task_t->privates with default values (no init for simple types, default constructors for classes).
+// Also copy address of private copy to the corresponding shareds reference.
+// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK: [[SHAREDS:%.+]] = bitcast i8* [[SHAREDS_REF]] to [[CAP_MAIN_TY]]*
+
+// Constructors for s_arr and var.
+// s_arr;
+// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[S_ARR_ADDR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 3
+// CHECK: load [2 x [[S_DOUBLE_TY]]]*, [2 x [[S_DOUBLE_TY]]]** [[S_ARR_ADDR_REF]],
+// CHECK: call void [[S_DOUBLE_TY_COPY_CONSTR]]([[S_DOUBLE_TY]]* [[S_ARR_CUR:%[^,]+]],
+// CHECK: getelementptr [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* [[S_ARR_CUR]], i{{.+}} 1
+// CHECK: getelementptr [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i{{.+}} 1
+// CHECK: icmp eq
+// CHECK: br i1
+
+// var;
+// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 1
+// CHECK: [[VAR_ADDR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 4
+// CHECK: [[VAR_REF:%.+]] = load [[S_DOUBLE_TY]]*, [[S_DOUBLE_TY]]** [[VAR_ADDR_REF]],
+// CHECK: call void [[S_DOUBLE_TY_COPY_CONSTR]]([[S_DOUBLE_TY]]* [[PRIVATE_VAR_REF]], [[S_DOUBLE_TY]]* {{.*}}[[VAR_REF]],
+
+// t_var;
+// CHECK: [[PRIVATE_T_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK: [[T_VAR_ADDR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 1
+// CHECK: [[T_VAR:%.+]] = load i{{.+}}, i{{.+}}* [[T_VAR_ADDR_REF]],
+// CHECK: store i32 [[T_VAR]], i32* [[PRIVATE_T_VAR_REF]],
+
+// vec;
+// CHECK: [[PRIVATE_VEC_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: [[VEC_ADDR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 0
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(
+
+// sivar;
+// CHECK: [[PRIVATE_SIVAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 4
+// CHECK: [[SIVAR_ADDR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 5
+// CHECK: [[SIVAR:%.+]] = load i{{.+}}, i{{.+}}* [[SIVAR_ADDR_REF]],
+// CHECK: store i32 [[SIVAR]], i32* [[PRIVATE_SIVAR_REF]],
+
+// Provide pointer to destructor function, which will destroy private variables at the end of the task.
+// CHECK: [[DESTRUCTORS_REF:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{.+}} 0, i{{.+}} 3
+// CHECK: [[DESTRUCTORS_PTR:%.+]] = bitcast %union{{.+}}* [[DESTRUCTORS_REF]] to i32 (i32, i8*)**
+// CHECK: store i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[DESTRUCTORS:@.+]] to i32 (i32, i8*)*), i32 (i32, i8*)** [[DESTRUCTORS_PTR]],
+
+// Start task.
+// CHECK: call void @__kmpc_taskloop([[LOC]], i32 [[GTID]], i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 0, i32 0, i64 0, i8* bitcast (void ([[KMP_TASK_MAIN_TY]]*, [[KMP_TASK_MAIN_TY]]*, i32)* [[MAIN_DUP:@.+]] to i8*))
+
+// CHECK: = call i{{.+}} [[TMAIN_INT:@.+]]()
+
+// No destructors must be called for private copies of s_arr and var.
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: call void [[S_DOUBLE_TY_DESTR:@.+]]([[S_DOUBLE_TY]]*
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: ret
+//
+
+// CHECK: define internal void [[PRIVATES_MAP_FN:@.+]]([[PRIVATES_MAIN_TY]]* noalias, [[S_DOUBLE_TY]]** noalias, i32** noalias, [2 x [[S_DOUBLE_TY]]]** noalias, [2 x i32]** noalias, i32** noalias)
+// CHECK: [[PRIVATES:%.+]] = load [[PRIVATES_MAIN_TY]]*, [[PRIVATES_MAIN_TY]]**
+// CHECK: [[PRIV_S_VAR:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 0
+// CHECK: [[ARG3:%.+]] = load [2 x [[S_DOUBLE_TY]]]**, [2 x [[S_DOUBLE_TY]]]*** %{{.+}},
+// CHECK: store [2 x [[S_DOUBLE_TY]]]* [[PRIV_S_VAR]], [2 x [[S_DOUBLE_TY]]]** [[ARG3]],
+// CHECK: [[PRIV_VAR:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 1
+// CHECK: [[ARG1:%.+]] = load [[S_DOUBLE_TY]]**, [[S_DOUBLE_TY]]*** {{.+}},
+// CHECK: store [[S_DOUBLE_TY]]* [[PRIV_VAR]], [[S_DOUBLE_TY]]** [[ARG1]],
+// CHECK: [[PRIV_T_VAR:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 2
+// CHECK: [[ARG2:%.+]] = load i32**, i32*** %{{.+}},
+// CHECK: store i32* [[PRIV_T_VAR]], i32** [[ARG2]],
+// CHECK: [[PRIV_VEC:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 3
+// CHECK: [[ARG4:%.+]] = load [2 x i32]**, [2 x i32]*** %{{.+}},
+// CHECK: store [2 x i32]* [[PRIV_VEC]], [2 x i32]** [[ARG4]],
+// CHECK: [[PRIV_SIVAR:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 4
+// CHECK: [[ARG5:%.+]] = load i{{[0-9]+}}**, i{{[0-9]+}}*** %{{.+}},
+// CHECK: store i{{[0-9]+}}* [[PRIV_SIVAR]], i{{[0-9]+}}** [[ARG5]],
+// CHECK: ret void
+
+// CHECK: define internal i32 [[TASK_ENTRY]](i32, [[KMP_TASK_MAIN_TY]]* noalias)
+
+// CHECK: [[PRIV_VAR_ADDR:%.+]] = alloca [[S_DOUBLE_TY]]*,
+// CHECK: [[PRIV_T_VAR_ADDR:%.+]] = alloca i32*,
+// CHECK: [[PRIV_S_ARR_ADDR:%.+]] = alloca [2 x [[S_DOUBLE_TY]]]*,
+// CHECK: [[PRIV_VEC_ADDR:%.+]] = alloca [2 x i32]*,
+// CHECK: [[PRIV_SIVAR_ADDR:%.+]] = alloca i32*,
+// CHECK: store void (i8*, ...)* bitcast (void ([[PRIVATES_MAIN_TY]]*, [[S_DOUBLE_TY]]**, i32**, [2 x [[S_DOUBLE_TY]]]**, [2 x i32]**, i32**)* [[PRIVATES_MAP_FN]] to void (i8*, ...)*), void (i8*, ...)** [[MAP_FN_ADDR:%.+]],
+// CHECK: [[MAP_FN:%.+]] = load void (i8*, ...)*, void (i8*, ...)** [[MAP_FN_ADDR]],
+
+// CHECK: call void (i8*, ...) [[MAP_FN]](i8* %{{.+}}, [[S_DOUBLE_TY]]** [[PRIV_VAR_ADDR]], i32** [[PRIV_T_VAR_ADDR]], [2 x [[S_DOUBLE_TY]]]** [[PRIV_S_ARR_ADDR]], [2 x i32]** [[PRIV_VEC_ADDR]], i32** [[PRIV_SIVAR_ADDR]])
+
+// CHECK: [[PRIV_VAR:%.+]] = load [[S_DOUBLE_TY]]*, [[S_DOUBLE_TY]]** [[PRIV_VAR_ADDR]],
+// CHECK: [[PRIV_T_VAR:%.+]] = load i32*, i32** [[PRIV_T_VAR_ADDR]],
+// CHECK: [[PRIV_S_ARR:%.+]] = load [2 x [[S_DOUBLE_TY]]]*, [2 x [[S_DOUBLE_TY]]]** [[PRIV_S_ARR_ADDR]],
+// CHECK: [[PRIV_VEC:%.+]] = load [2 x i32]*, [2 x i32]** [[PRIV_VEC_ADDR]],
+// CHECK: [[PRIV_SIVAR:%.+]] = load i32*, i32** [[PRIV_SIVAR_ADDR]],
+
+// Privates actually are used.
+// CHECK-DAG: [[PRIV_VAR]]
+// CHECK-DAG: [[PRIV_T_VAR]]
+// CHECK-DAG: [[PRIV_S_ARR]]
+// CHECK-DAG: [[PRIV_VEC]]
+// CHECK-DAG: [[PRIV_SIVAR]]
+
+// CHECK: ret
+
+// CHECK: define internal void [[MAIN_DUP]]([[KMP_TASK_MAIN_TY]]*, [[KMP_TASK_MAIN_TY]]*, i32)
+// CHECK: getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* %{{.+}}, i32 0, i32 1
+// CHECK: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* %{{.+}}, i32 0, i32 0
+// CHECK: getelementptr inbounds [2 x [[S_DOUBLE_TY]]], [2 x [[S_DOUBLE_TY]]]* %{{.+}}, i32 0, i32 0
+// CHECK: getelementptr [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i64 2
+// CHECK: br i1 %
+
+// CHECK: phi [[S_DOUBLE_TY]]*
+// CHECK: call {{.*}} [[S_DOUBLE_TY_COPY_CONSTR]]([[S_DOUBLE_TY]]*
+// CHECK: getelementptr [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i32 1
+// CHECK: icmp eq [[S_DOUBLE_TY]]* %
+// CHECK: br i1 %
+
+// CHECK: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* %{{.+}}, i32 0, i32 1
+// CHECK: call {{.*}} [[S_DOUBLE_TY_COPY_CONSTR]]([[S_DOUBLE_TY]]*
+// CHECK: ret void
+
+// CHECK: define internal i32 [[DESTRUCTORS]](i32, [[KMP_TASK_MAIN_TY]]* noalias)
+// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* [[RES_KMP_TASK:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 0
+// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 1
+// CHECK: call void [[S_DOUBLE_TY_DESTR]]([[S_DOUBLE_TY]]* [[PRIVATE_VAR_REF]])
+// CHECK: getelementptr inbounds [2 x [[S_DOUBLE_TY]]], [2 x [[S_DOUBLE_TY]]]* [[PRIVATE_S_ARR_REF]], i{{.+}} 0, i{{.+}} 0
+// CHECK: getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i{{.+}} 2
+// CHECK: [[PRIVATE_S_ARR_ELEM_REF:%.+]] = getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i{{.+}} -1
+// CHECK: call void [[S_DOUBLE_TY_DESTR]]([[S_DOUBLE_TY]]* [[PRIVATE_S_ARR_ELEM_REF]])
+// CHECK: icmp eq
+// CHECK: br i1
+// CHECK: ret i32
+
+// CHECK: define {{.*}} i{{[0-9]+}} [[TMAIN_INT]]()
+// CHECK: alloca [[S_INT_TY]],
+// CHECK: [[TEST:%.+]] = alloca [[S_INT_TY]],
+// CHECK: [[T_VAR_ADDR:%.+]] = alloca i32, align 128
+// CHECK: [[VEC_ADDR:%.+]] = alloca [2 x i32],
+// CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]],
+// CHECK: [[VAR_ADDR:%.+]] = alloca [[S_INT_TY]],
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[LOC:%.+]])
+
+// CHECK: call {{.*}} [[S_INT_TY_COPY_CONSTR:@.+]]([[S_INT_TY]]* [[TEST]],
+
+// Store original variables in capture struct.
+// CHECK: [[VEC_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: store [2 x i32]* [[VEC_ADDR]], [2 x i32]** [[VEC_REF]],
+// CHECK: [[T_VAR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK: store i32* [[T_VAR_ADDR]], i32** [[T_VAR_REF]],
+// CHECK: [[S_ARR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK: store [2 x [[S_INT_TY]]]* [[S_ARR_ADDR]], [2 x [[S_INT_TY]]]** [[S_ARR_REF]],
+// CHECK: [[VAR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 3
+// CHECK: store [[S_INT_TY]]* [[VAR_ADDR]], [[S_INT_TY]]** [[VAR_REF]],
+
+// Allocate task.
+// Returns struct kmp_task_t {
+//         [[KMP_TASK_T_TY]] task_data;
+//         [[KMP_TASK_TMAIN_TY]] privates;
+//       };
+// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc([[LOC]], i32 [[GTID]], i32 9, i64 256, i64 32, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_TMAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[RES_KMP_TASK:%.+]] = bitcast i8* [[RES]] to [[KMP_TASK_TMAIN_TY]]*
+
+// Fill kmp_task_t->shareds by copying from original capture argument.
+// CHECK: [[TASK:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[SHAREDS_REF_ADDR:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS_REF_ADDR]],
+// CHECK: [[CAPTURES_ADDR:%.+]] = bitcast [[CAP_TMAIN_TY]]* %{{.+}} to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[SHAREDS_REF]], i8* [[CAPTURES_ADDR]], i64 32, i32 8, i1 false)
+
+// Initialize kmp_task_t->privates with default values (no init for simple types, default constructors for classes).
+// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK: [[SHAREDS:%.+]] = bitcast i8* [[SHAREDS_REF]] to [[CAP_TMAIN_TY]]*
+
+// t_var;
+// CHECK: [[PRIVATE_T_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 0
+// CHECK: [[T_VAR_ADDR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 1
+// CHECK: [[T_VAR_REF:%.+]] = load i{{.+}}*, i{{.+}}** [[T_VAR_ADDR_REF]],
+// CHECK: [[T_VAR:%.+]] = load i{{.+}}, i{{.+}}* [[T_VAR_REF]], align 128
+// CHECK: store i32 [[T_VAR]], i32* [[PRIVATE_T_VAR_REF]], align 128
+
+// vec;
+// CHECK: [[PRIVATE_VEC_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 1
+// CHECK: [[VEC_ADDR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 0
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(
+
+// Constructors for s_arr and var.
+// a_arr;
+// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK: [[S_ARR_ADDR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 2
+// CHECK: getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[PRIVATE_S_ARR_REF]], i{{.+}} 0, i{{.+}} 0
+// CHECK: getelementptr [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i{{.+}} 2
+// CHECK: call void [[S_INT_TY_COPY_CONSTR]]([[S_INT_TY]]* [[S_ARR_CUR:%[^,]+]],
+// CHECK: getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_CUR]], i{{.+}} 1
+// CHECK: icmp eq
+// CHECK: br i1
+
+// var;
+// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: [[VAR_ADDR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 3
+// CHECK: call void [[S_INT_TY_COPY_CONSTR]]([[S_INT_TY]]* [[PRIVATE_VAR_REF]],
+
+// Provide pointer to destructor function, which will destroy private variables at the end of the task.
+// CHECK: [[DESTRUCTORS_REF:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{.+}} 0, i{{.+}} 3
+// CHECK: [[DESTRUCTORS_PTR:%.+]] = bitcast %union{{.+}}* [[DESTRUCTORS_REF]] to i32 (i32, i8*)**
+// CHECK: store i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_TMAIN_TY]]*)* [[DESTRUCTORS:@.+]] to i32 (i32, i8*)*), i32 (i32, i8*)** [[DESTRUCTORS_PTR]],
+
+// Start task.
+// CHECK: call void @__kmpc_taskloop([[LOC]], i32 [[GTID]], i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 0, i32 0, i64 0, i8* bitcast (void ([[KMP_TASK_TMAIN_TY]]*, [[KMP_TASK_TMAIN_TY]]*, i32)* [[TMAIN_DUP:@.+]] to i8*))
+
+// No destructors must be called for private copies of s_arr and var.
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: call void [[S_INT_TY_DESTR:@.+]]([[S_INT_TY]]*
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: ret
+//
+
+// CHECK: define internal void [[PRIVATES_MAP_FN:@.+]]([[PRIVATES_TMAIN_TY]]* noalias, i32** noalias, [2 x i32]** noalias, [2 x [[S_INT_TY]]]** noalias, [[S_INT_TY]]** noalias)
+// CHECK: [[PRIVATES:%.+]] = load [[PRIVATES_TMAIN_TY]]*, [[PRIVATES_TMAIN_TY]]**
+// CHECK: [[PRIV_T_VAR:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 0
+// CHECK: [[ARG1:%.+]] = load i32**, i32*** %{{.+}},
+// CHECK: store i32* [[PRIV_T_VAR]], i32** [[ARG1]],
+// CHECK: [[PRIV_VEC:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 1
+// CHECK: [[ARG2:%.+]] = load [2 x i32]**, [2 x i32]*** %{{.+}},
+// CHECK: store [2 x i32]* [[PRIV_VEC]], [2 x i32]** [[ARG2]],
+// CHECK: [[PRIV_S_VAR:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 2
+// CHECK: [[ARG3:%.+]] = load [2 x [[S_INT_TY]]]**, [2 x [[S_INT_TY]]]*** %{{.+}},
+// CHECK: store [2 x [[S_INT_TY]]]* [[PRIV_S_VAR]], [2 x [[S_INT_TY]]]** [[ARG3]],
+// CHECK: [[PRIV_VAR:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 3
+// CHECK: [[ARG4:%.+]] = load [[S_INT_TY]]**, [[S_INT_TY]]*** {{.+}},
+// CHECK: store [[S_INT_TY]]* [[PRIV_VAR]], [[S_INT_TY]]** [[ARG4]],
+// CHECK: ret void
+
+// CHECK: define internal i32 [[TASK_ENTRY]](i32, [[KMP_TASK_TMAIN_TY]]* noalias)
+// CHECK: alloca i32*,
+// CHECK-DAG: [[PRIV_T_VAR_ADDR:%.+]] = alloca i32*,
+// CHECK-DAG: [[PRIV_VEC_ADDR:%.+]] = alloca [2 x i32]*,
+// CHECK-DAG: [[PRIV_S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]]*,
+// CHECK-DAG: [[PRIV_VAR_ADDR:%.+]] = alloca [[S_INT_TY]]*,
+// CHECK: store void (i8*, ...)* bitcast (void ([[PRIVATES_TMAIN_TY]]*, i32**, [2 x i32]**, [2 x [[S_INT_TY]]]**, [[S_INT_TY]]**)* [[PRIVATES_MAP_FN]] to void (i8*, ...)*), void (i8*, ...)** [[MAP_FN_ADDR:%.+]],
+// CHECK: [[MAP_FN:%.+]] = load void (i8*, ...)*, void (i8*, ...)** [[MAP_FN_ADDR]],
+// CHECK: call void (i8*, ...) [[MAP_FN]](i8* %{{.+}}, i32** [[PRIV_T_VAR_ADDR]], [2 x i32]** [[PRIV_VEC_ADDR]], [2 x [[S_INT_TY]]]** [[PRIV_S_ARR_ADDR]], [[S_INT_TY]]** [[PRIV_VAR_ADDR]])
+// CHECK: [[PRIV_T_VAR:%.+]] = load i32*, i32** [[PRIV_T_VAR_ADDR]],
+// CHECK: [[PRIV_VEC:%.+]] = load [2 x i32]*, [2 x i32]** [[PRIV_VEC_ADDR]],
+// CHECK: [[PRIV_S_ARR:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[PRIV_S_ARR_ADDR]],
+// CHECK: [[PRIV_VAR:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[PRIV_VAR_ADDR]],
+
+// Privates actually are used.
+// CHECK-DAG: [[PRIV_VAR]]
+// CHECK-DAG: [[PRIV_T_VAR]]
+// CHECK-DAG: [[PRIV_S_ARR]]
+// CHECK-DAG: [[PRIV_VEC]]
+
+// CHECK: ret
+
+// CHECK: define internal void [[TMAIN_DUP]]([[KMP_TASK_TMAIN_TY]]*, [[KMP_TASK_TMAIN_TY]]*, i32)
+// CHECK: getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* %{{.+}}, i32 0, i32 2
+// CHECK: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* %{{.+}}, i32 0, i32 2
+// CHECK: getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* %{{.+}}, i32 0, i32 0
+// CHECK: getelementptr [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i64 2
+// CHECK: br i1 %
+
+// CHECK: phi [[S_INT_TY]]*
+// CHECK: call {{.*}} [[S_INT_TY_COPY_CONSTR]]([[S_INT_TY]]*
+// CHECK: getelementptr [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i32 1
+// CHECK: icmp eq [[S_INT_TY]]* %
+// CHECK: br i1 %
+
+// CHECK: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* %{{.+}}, i32 0, i32 3
+// CHECK: call {{.*}} [[S_INT_TY_COPY_CONSTR]]([[S_INT_TY]]*
+// CHECK: ret void
+
+// CHECK: define internal i32 [[DESTRUCTORS]](i32, [[KMP_TASK_TMAIN_TY]]* noalias)
+// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: call void [[S_INT_TY_DESTR]]([[S_INT_TY]]* [[PRIVATE_VAR_REF]])
+// CHECK: getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[PRIVATE_S_ARR_REF]], i{{.+}} 0, i{{.+}} 0
+// CHECK: getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i{{.+}} 2
+// CHECK: [[PRIVATE_S_ARR_ELEM_REF:%.+]] = getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i{{.+}} -1
+// CHECK: call void [[S_INT_TY_DESTR]]([[S_INT_TY]]* [[PRIVATE_S_ARR_ELEM_REF]])
+// CHECK: icmp eq
+// CHECK: br i1
+// CHECK: ret i32
+
+#endif
+#else
+// ARRAY-LABEL: array_func
+struct St {
+  int a, b;
+  St() : a(0), b(0) {}
+  St(const St &) {}
+  ~St() {}
+};
+
+void array_func(int n, float a[n], St s[2]) {
+// ARRAY: call i8* @__kmpc_omp_task_alloc(
+// ARRAY: call void @__kmpc_taskloop(
+// ARRAY: store float** %{{.+}}, float*** %{{.+}},
+// ARRAY: store %struct.St** %{{.+}}, %struct.St*** %{{.+}},
+#pragma omp taskloop firstprivate(a, s)
+  for (int i = 0; i < 10; ++i)
+    ;
+}
+#endif
+
diff --git a/test/OpenMP/taskloop_firstprivate_messages.cpp b/test/OpenMP/taskloop_firstprivate_messages.cpp
index e2e87e4697a5d..fe22311f650cd 100644
--- a/test/OpenMP/taskloop_firstprivate_messages.cpp
+++ b/test/OpenMP/taskloop_firstprivate_messages.cpp
@@ -295,9 +295,9 @@ int main(int argc, char **argv) {
 #pragma omp taskloop firstprivate(i) // expected-note {{defined as firstprivate}}
   for (i = 0; i < argc; ++i) // expected-error {{loop iteration variable in the associated loop of 'omp taskloop' directive may not be firstprivate, predetermined as private}}
     foo();
-#pragma omp parallel reduction(+ : i)
-#pragma omp taskloop firstprivate(i) // expected-note {{defined as firstprivate}}
-  for (i = 0; i < argc; ++i) // expected-error {{loop iteration variable in the associated loop of 'omp taskloop' directive may not be firstprivate, predetermined as private}}
+#pragma omp parallel reduction(+ : i) // expected-note 4 {{defined as reduction}}
+#pragma omp taskloop firstprivate(i) //expected-error {{argument of a reduction clause of a parallel construct must not appear in a firstprivate clause on a task construct}}
+  for (i = 0; i < argc; ++i) // expected-error 3 {{reduction variables may not be accessed in an explicit task}}
     foo();
 #pragma omp parallel
 #pragma omp taskloop firstprivate(B::x) // expected-error {{threadprivate or thread local variable cannot be firstprivate}}
diff --git a/test/OpenMP/taskloop_lastprivate_codegen.cpp b/test/OpenMP/taskloop_lastprivate_codegen.cpp
new file mode 100644
index 0000000000000..8414b6f963f98
--- /dev/null
+++ b/test/OpenMP/taskloop_lastprivate_codegen.cpp
@@ -0,0 +1,519 @@
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-apple-darwin10 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -DLAMBDA -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=LAMBDA %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -fblocks -DBLOCKS -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=BLOCKS %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -DARRAY -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=ARRAY %s
+// expected-no-diagnostics
+// REQUIRES: x86-registered-target
+// It doesn't pass on win32.
+// REQUIRES: shell
+#ifndef ARRAY
+#ifndef HEADER
+#define HEADER
+
+template <class T>
+struct S {
+  T f;
+  S(T a) : f(a) {}
+  S() : f() {}
+  S(const S &s, T t = T()) : f(s.f + t) {}
+  operator T() { return T(); }
+  ~S() {}
+};
+
+volatile double g;
+
+// CHECK-DAG: [[KMP_TASK_T_TY:%.+]] = type { i8*, i32 (i32, i8*)*, i32, %union{{.+}}, %union{{.+}}, i64, i64, i64, i32 }
+// CHECK-DAG: [[S_DOUBLE_TY:%.+]] = type { double }
+// CHECK-DAG: [[PRIVATES_MAIN_TY:%.+]] = type {{.?}}{ [2 x [[S_DOUBLE_TY]]], [[S_DOUBLE_TY]], i32, [2 x i32]
+// CHECK-DAG: [[CAP_MAIN_TY:%.+]] = type { [2 x i32]*, i32*, [2 x [[S_DOUBLE_TY]]]*, [[S_DOUBLE_TY]]*, i{{[0-9]+}}* }
+// CHECK-DAG: [[KMP_TASK_MAIN_TY:%.+]] = type { [[KMP_TASK_T_TY]], [[PRIVATES_MAIN_TY]] }
+// CHECK-DAG: [[S_INT_TY:%.+]] = type { i32 }
+// CHECK-DAG: [[CAP_TMAIN_TY:%.+]] = type { [2 x i32]*, i32*, [2 x [[S_INT_TY]]]*, [[S_INT_TY]]* }
+// CHECK-DAG: [[PRIVATES_TMAIN_TY:%.+]] = type { i32, [2 x i32], [2 x [[S_INT_TY]]], [[S_INT_TY]], [104 x i8] }
+// CHECK-DAG: [[KMP_TASK_TMAIN_TY:%.+]] = type { [[KMP_TASK_T_TY]], [{{[0-9]+}} x i8], [[PRIVATES_TMAIN_TY]] }
+template <typename T>
+T tmain() {
+  S<T> ttt;
+  S<T> test;
+  T t_var __attribute__((aligned(128))) = T();
+  T vec[] = {1, 2};
+  S<T> s_arr[] = {1, 2};
+  S<T> var(3);
+#pragma omp taskloop lastprivate(t_var, vec, s_arr, s_arr, var, var)
+  for (int i = 0; i < 10; ++i) {
+    vec[0] = t_var;
+    s_arr[0] = var;
+  }
+  return T();
+}
+
+int main() {
+  static int sivar;
+#ifdef LAMBDA
+  // LAMBDA: [[G:@.+]] = global double
+  // LAMBDA: [[SIVAR:@.+]] = internal global i{{[0-9]+}} 0,
+  // LAMBDA-LABEL: @main
+  // LAMBDA: call{{( x86_thiscallcc)?}} void [[OUTER_LAMBDA:@.+]](
+  [&]() {
+  // LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
+  // LAMBDA: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 88, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+// LAMBDA: [[PRIVATES:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1
+
+// LAMBDA: call void @__kmpc_taskloop(%{{.+}}* @{{.+}}, i32 %{{.+}}, i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 0, i32 0, i64 0, i8* bitcast (void ([[KMP_TASK_MAIN_TY:%[^*]+]]*, [[KMP_TASK_MAIN_TY]]*, i32)* [[MAIN_DUP:@.+]] to i8*))
+// LAMBDA: ret
+#pragma omp taskloop lastprivate(g, sivar)
+  for (int i = 0; i < 10; ++i) {
+    // LAMBDA: define {{.+}} void [[INNER_LAMBDA:@.+]](%{{.+}}* [[ARG_PTR:%.+]])
+    // LAMBDA: store %{{.+}}* [[ARG_PTR]], %{{.+}}** [[ARG_PTR_REF:%.+]],
+    // LAMBDA: [[ARG_PTR:%.+]] = load %{{.+}}*, %{{.+}}** [[ARG_PTR_REF]]
+    // LAMBDA: [[G_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+    // LAMBDA: [[G_REF:%.+]] = load double*, double** [[G_PTR_REF]]
+    // LAMBDA: store double 2.0{{.+}}, double* [[G_REF]]
+
+    // LAMBDA: store double* %{{.+}}, double** %{{.+}},
+    // LAMBDA: define internal i32 [[TASK_ENTRY]](i32, %{{.+}}* noalias)
+    g = 1;
+    sivar = 11;
+    // LAMBDA: store double 1.0{{.+}}, double* %{{.+}},
+    // LAMBDA: store i{{[0-9]+}} 11, i{{[0-9]+}}* %{{.+}},
+    // LAMBDA: call void [[INNER_LAMBDA]](%
+    // LAMBDA: icmp ne i32 %{{.+}}, 0
+    // LAMBDA: br i1
+    // LAMBDA: load double, double* %
+    // LAMBDA: store volatile double %
+    // LAMBDA: load i32, i32* %
+    // LAMBDA: store i32 %
+    // LAMBDA: ret
+    [&]() {
+      g = 2;
+      sivar = 22;
+    }();
+  }
+  }();
+  return 0;
+#elif defined(BLOCKS)
+  // BLOCKS: [[G:@.+]] = global double
+  // BLOCKS-LABEL: @main
+  // BLOCKS: call void {{%.+}}(i8
+  ^{
+  // BLOCKS: define{{.*}} internal{{.*}} void {{.+}}(i8*
+  // BLOCKS: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 88, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+  // BLOCKS: [[PRIVATES:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1
+  // BLOCKS: call void @__kmpc_taskloop(%{{.+}}* @{{.+}}, i32 %{{.+}}, i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 0, i32 0, i64 0, i8* bitcast (void ([[KMP_TASK_MAIN_TY:%[^*]+]]*, [[KMP_TASK_MAIN_TY]]*, i32)* [[MAIN_DUP:@.+]] to i8*))
+  // BLOCKS: ret
+#pragma omp taskloop lastprivate(g, sivar)
+  for (int i = 0; i < 10; ++i) {
+    // BLOCKS: define {{.+}} void {{@.+}}(i8*
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS: store double 2.0{{.+}}, double*
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS-NOT: [[ISVAR]]{{[[^:word:]]}}
+    // BLOCKS: store i{{[0-9]+}} 22, i{{[0-9]+}}*
+    // BLOCKS-NOT: [[SIVAR]]{{[[^:word:]]}}
+    // BLOCKS: ret
+
+    // BLOCKS: store double* %{{.+}}, double** %{{.+}},
+    // BLOCKS: store i{{[0-9]+}}* %{{.+}}, i{{[0-9]+}}** %{{.+}},
+    // BLOCKS: define internal i32 [[TASK_ENTRY]](i32, %{{.+}}* noalias)
+    g = 1;
+    sivar = 11;
+    // BLOCKS: store double 1.0{{.+}}, double* %{{.+}},
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS: store i{{[0-9]+}} 11, i{{[0-9]+}}* %{{.+}},
+    // BLOCKS-NOT: [[SIVAR]]{{[[^:word:]]}}
+    // BLOCKS: call void {{%.+}}(i8
+    // BLOCKS: icmp ne i32 %{{.+}}, 0
+    // BLOCKS: br i1
+    // BLOCKS: load double, double* %
+    // BLOCKS: store volatile double %
+    // BLOCKS: load i32, i32* %
+    // BLOCKS: store i32 %
+    ^{
+      g = 2;
+      sivar = 22;
+    }();
+  }
+  }();
+  return 0;
+#else
+  S<double> ttt;
+  S<double> test;
+  int t_var = 0;
+  int vec[] = {1, 2};
+  S<double> s_arr[] = {1, 2};
+  S<double> var(3);
+#pragma omp taskloop lastprivate(var, t_var, s_arr, vec, s_arr, var, sivar)
+  for (int i = 0; i < 10; ++i) {
+    vec[0] = t_var;
+    s_arr[0] = var;
+    sivar = 33;
+  }
+  return tmain<int>();
+#endif
+}
+
+// CHECK: [[SIVAR:.+]] = internal global i{{[0-9]+}} 0,
+// CHECK: define i{{[0-9]+}} @main()
+// CHECK: alloca [[S_DOUBLE_TY]],
+// CHECK: [[TEST:%.+]] = alloca [[S_DOUBLE_TY]],
+// CHECK: [[T_VAR_ADDR:%.+]] = alloca i32,
+// CHECK: [[VEC_ADDR:%.+]] = alloca [2 x i32],
+// CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_DOUBLE_TY]]],
+// CHECK: [[VAR_ADDR:%.+]] = alloca [[S_DOUBLE_TY]],
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[LOC:%.+]])
+
+// CHECK: call {{.*}} [[S_DOUBLE_TY_CONSTR:@.+]]([[S_DOUBLE_TY]]* [[TEST]])
+
+// Store original variables in capture struct.
+// CHECK: [[VEC_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: store [2 x i32]* [[VEC_ADDR]], [2 x i32]** [[VEC_REF]],
+// CHECK: [[T_VAR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK: store i32* [[T_VAR_ADDR]], i32** [[T_VAR_REF]],
+// CHECK: [[S_ARR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK: store [2 x [[S_DOUBLE_TY]]]* [[S_ARR_ADDR]], [2 x [[S_DOUBLE_TY]]]** [[S_ARR_REF]],
+// CHECK: [[VAR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 3
+// CHECK: store [[S_DOUBLE_TY]]* [[VAR_ADDR]], [[S_DOUBLE_TY]]** [[VAR_REF]],
+// CHECK: [[SIVAR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 4
+// CHECK: store i{{[0-9]+}}* [[SIVAR]], i{{[0-9]+}}** [[SIVAR_REF]],
+
+// Allocate task.
+// Returns struct kmp_task_t {
+//         [[KMP_TASK_T]] task_data;
+//         [[KMP_TASK_MAIN_TY]] privates;
+//       };
+// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc([[LOC]], i32 [[GTID]], i32 9, i64 112, i64 40, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[RES_KMP_TASK:%.+]] = bitcast i8* [[RES]] to [[KMP_TASK_MAIN_TY]]*
+
+// Fill kmp_task_t->shareds by copying from original capture argument.
+// CHECK: [[TASK:%.+]] = getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[SHAREDS_REF_ADDR:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS_REF_ADDR]],
+// CHECK: [[CAPTURES_ADDR:%.+]] = bitcast [[CAP_MAIN_TY]]* %{{.+}} to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[SHAREDS_REF]], i8* [[CAPTURES_ADDR]], i64 40, i32 8, i1 false)
+
+// Initialize kmp_task_t->privates with default values (no init for simple types, default constructors for classes).
+// Also copy address of private copy to the corresponding shareds reference.
+// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+
+// Constructors for s_arr and var.
+// s_arr;
+// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: call {{.*}} [[S_DOUBLE_TY_CONSTR]]([[S_DOUBLE_TY]]* [[S_ARR_CUR:%[^,]+]])
+// CHECK: getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* [[S_ARR_CUR]], i{{.+}} 1
+// CHECK: icmp eq
+// CHECK: br i1
+
+// var;
+// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 1
+// CHECK: call {{.*}} [[S_DOUBLE_TY_CONSTR]]([[S_DOUBLE_TY]]* [[PRIVATE_VAR_REF]])
+
+// t_var;
+// vec;
+// sivar;
+
+// Provide pointer to destructor function, which will destroy private variables at the end of the task.
+// CHECK: [[DESTRUCTORS_REF:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{.+}} 0, i{{.+}} 3
+// CHECK: [[DESTRUCTORS_PTR:%.+]] = bitcast %union{{.+}}* [[DESTRUCTORS_REF]] to i32 (i32, i8*)**
+// CHECK: store i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[DESTRUCTORS:@.+]] to i32 (i32, i8*)*), i32 (i32, i8*)** [[DESTRUCTORS_PTR]],
+
+// Start task.
+// CHECK: call void @__kmpc_taskloop([[LOC]], i32 [[GTID]], i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 0, i32 0, i64 0, i8* bitcast (void ([[KMP_TASK_MAIN_TY]]*, [[KMP_TASK_MAIN_TY]]*, i32)* [[MAIN_DUP:@.+]] to i8*))
+
+// CHECK: = call i{{.+}} [[TMAIN_INT:@.+]]()
+
+// No destructors must be called for private copies of s_arr and var.
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: call void [[S_DOUBLE_TY_DESTR:@.+]]([[S_DOUBLE_TY]]*
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: ret
+//
+
+// CHECK: define internal void [[PRIVATES_MAP_FN:@.+]]([[PRIVATES_MAIN_TY]]* noalias, [[S_DOUBLE_TY]]** noalias, i32** noalias, [2 x [[S_DOUBLE_TY]]]** noalias, [2 x i32]** noalias, i32** noalias)
+// CHECK: [[PRIVATES:%.+]] = load [[PRIVATES_MAIN_TY]]*, [[PRIVATES_MAIN_TY]]**
+// CHECK: [[PRIV_S_VAR:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 0
+// CHECK: [[ARG3:%.+]] = load [2 x [[S_DOUBLE_TY]]]**, [2 x [[S_DOUBLE_TY]]]*** %{{.+}},
+// CHECK: store [2 x [[S_DOUBLE_TY]]]* [[PRIV_S_VAR]], [2 x [[S_DOUBLE_TY]]]** [[ARG3]],
+// CHECK: [[PRIV_VAR:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 1
+// CHECK: [[ARG1:%.+]] = load [[S_DOUBLE_TY]]**, [[S_DOUBLE_TY]]*** {{.+}},
+// CHECK: store [[S_DOUBLE_TY]]* [[PRIV_VAR]], [[S_DOUBLE_TY]]** [[ARG1]],
+// CHECK: [[PRIV_T_VAR:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 2
+// CHECK: [[ARG2:%.+]] = load i32**, i32*** %{{.+}},
+// CHECK: store i32* [[PRIV_T_VAR]], i32** [[ARG2]],
+// CHECK: [[PRIV_VEC:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 3
+// CHECK: [[ARG4:%.+]] = load [2 x i32]**, [2 x i32]*** %{{.+}},
+// CHECK: store [2 x i32]* [[PRIV_VEC]], [2 x i32]** [[ARG4]],
+// CHECK: [[PRIV_SIVAR:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 4
+// CHECK: [[ARG5:%.+]] = load i{{[0-9]+}}**, i{{[0-9]+}}*** %{{.+}},
+// CHECK: store i{{[0-9]+}}* [[PRIV_SIVAR]], i{{[0-9]+}}** [[ARG5]],
+// CHECK: ret void
+
+// CHECK: define internal i32 [[TASK_ENTRY]](i32, [[KMP_TASK_MAIN_TY]]* noalias)
+
+// CHECK: [[PRIV_VAR_ADDR:%.+]] = alloca [[S_DOUBLE_TY]]*,
+// CHECK: [[PRIV_T_VAR_ADDR:%.+]] = alloca i32*,
+// CHECK: [[PRIV_S_ARR_ADDR:%.+]] = alloca [2 x [[S_DOUBLE_TY]]]*,
+// CHECK: [[PRIV_VEC_ADDR:%.+]] = alloca [2 x i32]*,
+// CHECK: [[PRIV_SIVAR_ADDR:%.+]] = alloca i32*,
+// CHECK: store void (i8*, ...)* bitcast (void ([[PRIVATES_MAIN_TY]]*, [[S_DOUBLE_TY]]**, i32**, [2 x [[S_DOUBLE_TY]]]**, [2 x i32]**, i32**)* [[PRIVATES_MAP_FN]] to void (i8*, ...)*), void (i8*, ...)** [[MAP_FN_ADDR:%.+]],
+// CHECK: [[MAP_FN:%.+]] = load void (i8*, ...)*, void (i8*, ...)** [[MAP_FN_ADDR]],
+
+// CHECK: call void (i8*, ...) [[MAP_FN]](i8* %{{.+}}, [[S_DOUBLE_TY]]** [[PRIV_VAR_ADDR]], i32** [[PRIV_T_VAR_ADDR]], [2 x [[S_DOUBLE_TY]]]** [[PRIV_S_ARR_ADDR]], [2 x i32]** [[PRIV_VEC_ADDR]], i32** [[PRIV_SIVAR_ADDR]])
+
+// CHECK: [[PRIV_VAR:%.+]] = load [[S_DOUBLE_TY]]*, [[S_DOUBLE_TY]]** [[PRIV_VAR_ADDR]],
+// CHECK: [[PRIV_T_VAR:%.+]] = load i32*, i32** [[PRIV_T_VAR_ADDR]],
+// CHECK: [[PRIV_S_ARR:%.+]] = load [2 x [[S_DOUBLE_TY]]]*, [2 x [[S_DOUBLE_TY]]]** [[PRIV_S_ARR_ADDR]],
+// CHECK: [[PRIV_VEC:%.+]] = load [2 x i32]*, [2 x i32]** [[PRIV_VEC_ADDR]],
+// CHECK: [[PRIV_SIVAR:%.+]] = load i32*, i32** [[PRIV_SIVAR_ADDR]],
+
+// Privates actually are used.
+// CHECK-DAG: [[PRIV_VAR]]
+// CHECK-DAG: [[PRIV_T_VAR]]
+// CHECK-DAG: [[PRIV_S_ARR]]
+// CHECK-DAG: [[PRIV_VEC]]
+// CHECK-DAG: [[PRIV_SIVAR]]
+
+// CHECK:     icmp ne i32 %{{.+}}, 0
+// CHECK-NEXT: br i1
+// CHECK: bitcast [[S_DOUBLE_TY]]* %{{.+}} to i8*
+// CHECK: bitcast [[S_DOUBLE_TY]]* %{{.+}} to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %
+// CHECK: load i32, i32* %
+// CHECK: store i32 %{{.+}}, i32* %
+// CHECK: getelementptr inbounds [2 x [[S_DOUBLE_TY]]], [2 x [[S_DOUBLE_TY]]]* %
+// CHECK: phi [[S_DOUBLE_TY]]*
+// CHECK: phi [[S_DOUBLE_TY]]*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %
+// CHECK: icmp eq [[S_DOUBLE_TY]]* %
+// CHECK-NEXT: br i1
+// CHECK: bitcast [2 x i32]* %{{.+}} to i8*
+// CHECK: bitcast [2 x i32]* %{{.+}} to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %
+// CHECK: load i32, i32* %
+// CHECK: store i32 %{{.+}}, i32* %
+// CHECK: br label
+// CHECK: ret
+
+// CHECK: define internal void [[MAIN_DUP]]([[KMP_TASK_MAIN_TY]]*, [[KMP_TASK_MAIN_TY]]*, i32)
+// CHECK: getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* %{{.+}}, i32 0, i32 0
+// CHECK: getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* %{{.+}}, i32 0, i32 8
+// CHECK: load i32, i32* %
+// CHECK: store i32 %{{.+}}, i32* %
+// CHECK: getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* %{{.+}}, i32 0, i32 1
+// CHECK: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* %{{.+}}, i32 0, i32 0
+// CHECK: getelementptr inbounds [2 x [[S_DOUBLE_TY]]], [2 x [[S_DOUBLE_TY]]]* %{{.+}}, i32 0, i32 0
+// CHECK: getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i64 2
+// CHECK: br label %
+
+// CHECK: phi [[S_DOUBLE_TY]]*
+// CHECK: call {{.*}} [[S_DOUBLE_TY_CONSTR]]([[S_DOUBLE_TY]]*
+// CHECK: getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i64 1
+// CHECK: icmp eq [[S_DOUBLE_TY]]* %
+// CHECK: br i1 %
+
+// CHECK: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* %{{.+}}, i32 0, i32 1
+// CHECK: call {{.*}} [[S_DOUBLE_TY_CONSTR]]([[S_DOUBLE_TY]]*
+// CHECK: ret void
+
+// CHECK: define internal i32 [[DESTRUCTORS]](i32, [[KMP_TASK_MAIN_TY]]* noalias)
+// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* [[RES_KMP_TASK:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 0
+// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 1
+// CHECK: call {{.*}} [[S_DOUBLE_TY_DESTR]]([[S_DOUBLE_TY]]* [[PRIVATE_VAR_REF]])
+// CHECK: getelementptr inbounds [2 x [[S_DOUBLE_TY]]], [2 x [[S_DOUBLE_TY]]]* [[PRIVATE_S_ARR_REF]], i{{.+}} 0, i{{.+}} 0
+// CHECK: getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i{{.+}} 2
+// CHECK: [[PRIVATE_S_ARR_ELEM_REF:%.+]] = getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i{{.+}} -1
+// CHECK: call {{.*}} [[S_DOUBLE_TY_DESTR]]([[S_DOUBLE_TY]]* [[PRIVATE_S_ARR_ELEM_REF]])
+// CHECK: icmp eq
+// CHECK: br i1
+// CHECK: ret i32
+
+// CHECK: define {{.*}} i{{[0-9]+}} [[TMAIN_INT]]()
+// CHECK: alloca [[S_INT_TY]],
+// CHECK: [[TEST:%.+]] = alloca [[S_INT_TY]],
+// CHECK: [[T_VAR_ADDR:%.+]] = alloca i32, align 128
+// CHECK: [[VEC_ADDR:%.+]] = alloca [2 x i32],
+// CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]],
+// CHECK: [[VAR_ADDR:%.+]] = alloca [[S_INT_TY]],
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[LOC:%.+]])
+
+// CHECK: call {{.*}} [[S_INT_TY_CONSTR:@.+]]([[S_INT_TY]]* [[TEST]])
+
+// Store original variables in capture struct.
+// CHECK: [[VEC_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: store [2 x i32]* [[VEC_ADDR]], [2 x i32]** [[VEC_REF]],
+// CHECK: [[T_VAR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK: store i32* [[T_VAR_ADDR]], i32** [[T_VAR_REF]],
+// CHECK: [[S_ARR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK: store [2 x [[S_INT_TY]]]* [[S_ARR_ADDR]], [2 x [[S_INT_TY]]]** [[S_ARR_REF]],
+// CHECK: [[VAR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 3
+// CHECK: store [[S_INT_TY]]* [[VAR_ADDR]], [[S_INT_TY]]** [[VAR_REF]],
+
+// Allocate task.
+// Returns struct kmp_task_t {
+//         [[KMP_TASK_T_TY]] task_data;
+//         [[KMP_TASK_TMAIN_TY]] privates;
+//       };
+// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc([[LOC]], i32 [[GTID]], i32 9, i64 256, i64 32, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_TMAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[RES_KMP_TASK:%.+]] = bitcast i8* [[RES]] to [[KMP_TASK_TMAIN_TY]]*
+
+// Fill kmp_task_t->shareds by copying from original capture argument.
+// CHECK: [[TASK:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[SHAREDS_REF_ADDR:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS_REF_ADDR]],
+// CHECK: [[CAPTURES_ADDR:%.+]] = bitcast [[CAP_TMAIN_TY]]* %{{.+}} to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[SHAREDS_REF]], i8* [[CAPTURES_ADDR]], i64 32, i32 8, i1 false)
+
+// Initialize kmp_task_t->privates with default values (no init for simple types, default constructors for classes).
+// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+
+// t_var;
+// vec;
+
+// Constructors for s_arr and var.
+// a_arr;
+// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK: getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[PRIVATE_S_ARR_REF]], i{{.+}} 0, i{{.+}} 0
+// CHECK: getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i{{.+}} 2
+// CHECK: call {{.*}} [[S_INT_TY_CONSTR]]([[S_INT_TY]]* [[S_ARR_CUR:%[^,]+]])
+// CHECK: getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_CUR]], i{{.+}} 1
+// CHECK: icmp eq
+// CHECK: br i1
+
+// var;
+// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: call {{.*}} [[S_INT_TY_CONSTR]]([[S_INT_TY]]* [[PRIVATE_VAR_REF]])
+
+// Provide pointer to destructor function, which will destroy private variables at the end of the task.
+// CHECK: [[DESTRUCTORS_REF:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{.+}} 0, i{{.+}} 3
+// CHECK: [[DESTRUCTORS_PTR:%.+]] = bitcast %union{{.+}}* [[DESTRUCTORS_REF]] to i32 (i32, i8*)**
+// CHECK: store i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_TMAIN_TY]]*)* [[DESTRUCTORS:@.+]] to i32 (i32, i8*)*), i32 (i32, i8*)** [[DESTRUCTORS_PTR]],
+
+// Start task.
+// CHECK: call void @__kmpc_taskloop([[LOC]], i32 [[GTID]], i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 0, i32 0, i64 0, i8* bitcast (void ([[KMP_TASK_TMAIN_TY]]*, [[KMP_TASK_TMAIN_TY]]*, i32)* [[TMAIN_DUP:@.+]] to i8*))
+
+// No destructors must be called for private copies of s_arr and var.
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: call void [[S_INT_TY_DESTR:@.+]]([[S_INT_TY]]*
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: ret
+//
+
+// CHECK: define internal void [[PRIVATES_MAP_FN:@.+]]([[PRIVATES_TMAIN_TY]]* noalias, i32** noalias, [2 x i32]** noalias, [2 x [[S_INT_TY]]]** noalias, [[S_INT_TY]]** noalias)
+// CHECK: [[PRIVATES:%.+]] = load [[PRIVATES_TMAIN_TY]]*, [[PRIVATES_TMAIN_TY]]**
+// CHECK: [[PRIV_T_VAR:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 0
+// CHECK: [[ARG1:%.+]] = load i32**, i32*** %{{.+}},
+// CHECK: store i32* [[PRIV_T_VAR]], i32** [[ARG1]],
+// CHECK: [[PRIV_VEC:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 1
+// CHECK: [[ARG2:%.+]] = load [2 x i32]**, [2 x i32]*** %{{.+}},
+// CHECK: store [2 x i32]* [[PRIV_VEC]], [2 x i32]** [[ARG2]],
+// CHECK: [[PRIV_S_VAR:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 2
+// CHECK: [[ARG3:%.+]] = load [2 x [[S_INT_TY]]]**, [2 x [[S_INT_TY]]]*** %{{.+}},
+// CHECK: store [2 x [[S_INT_TY]]]* [[PRIV_S_VAR]], [2 x [[S_INT_TY]]]** [[ARG3]],
+// CHECK: [[PRIV_VAR:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 3
+// CHECK: [[ARG4:%.+]] = load [[S_INT_TY]]**, [[S_INT_TY]]*** {{.+}},
+// CHECK: store [[S_INT_TY]]* [[PRIV_VAR]], [[S_INT_TY]]** [[ARG4]],
+// CHECK: ret void
+
+// CHECK: define internal i32 [[TASK_ENTRY]](i32, [[KMP_TASK_TMAIN_TY]]* noalias)
+// CHECK: alloca i32*,
+// CHECK-DAG: [[PRIV_T_VAR_ADDR:%.+]] = alloca i32*,
+// CHECK-DAG: [[PRIV_VEC_ADDR:%.+]] = alloca [2 x i32]*,
+// CHECK-DAG: [[PRIV_S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]]*,
+// CHECK-DAG: [[PRIV_VAR_ADDR:%.+]] = alloca [[S_INT_TY]]*,
+// CHECK: store void (i8*, ...)* bitcast (void ([[PRIVATES_TMAIN_TY]]*, i32**, [2 x i32]**, [2 x [[S_INT_TY]]]**, [[S_INT_TY]]**)* [[PRIVATES_MAP_FN]] to void (i8*, ...)*), void (i8*, ...)** [[MAP_FN_ADDR:%.+]],
+// CHECK: [[MAP_FN:%.+]] = load void (i8*, ...)*, void (i8*, ...)** [[MAP_FN_ADDR]],
+// CHECK: call void (i8*, ...) [[MAP_FN]](i8* %{{.+}}, i32** [[PRIV_T_VAR_ADDR]], [2 x i32]** [[PRIV_VEC_ADDR]], [2 x [[S_INT_TY]]]** [[PRIV_S_ARR_ADDR]], [[S_INT_TY]]** [[PRIV_VAR_ADDR]])
+// CHECK: [[PRIV_T_VAR:%.+]] = load i32*, i32** [[PRIV_T_VAR_ADDR]],
+// CHECK: [[PRIV_VEC:%.+]] = load [2 x i32]*, [2 x i32]** [[PRIV_VEC_ADDR]],
+// CHECK: [[PRIV_S_ARR:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[PRIV_S_ARR_ADDR]],
+// CHECK: [[PRIV_VAR:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[PRIV_VAR_ADDR]],
+
+// Privates actually are used.
+// CHECK-DAG: [[PRIV_VAR]]
+// CHECK-DAG: [[PRIV_T_VAR]]
+// CHECK-DAG: [[PRIV_S_ARR]]
+// CHECK-DAG: [[PRIV_VEC]]
+
+// CHECK:     icmp ne i32 %{{.+}}, 0
+// CHECK-NEXT: br i1
+// CHECK: load i32, i32* %
+// CHECK: store i32 %{{.+}}, i32* %
+// CHECK: bitcast [2 x i32]* %{{.+}} to i8*
+// CHECK: bitcast [2 x i32]* %{{.+}} to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %
+// CHECK: getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* %
+// CHECK: phi [[S_INT_TY]]*
+// CHECK: phi [[S_INT_TY]]*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %
+// CHECK: icmp eq [[S_INT_TY]]* %
+// CHECK-NEXT: br i1
+// CHECK: bitcast [[S_INT_TY]]* %{{.+}} to i8*
+// CHECK: bitcast [[S_INT_TY]]* %{{.+}} to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %
+// CHECK: br label
+// CHECK: ret
+
+// CHECK: define internal void [[TMAIN_DUP]]([[KMP_TASK_TMAIN_TY]]*, [[KMP_TASK_TMAIN_TY]]*, i32)
+// CHECK: getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* %{{.+}}, i32 0, i32 0
+// CHECK: getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* %{{.+}}, i32 0, i32 8
+// CHECK: load i32, i32* %
+// CHECK: store i32 %{{.+}}, i32* %
+// CHECK: getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* %{{.+}}, i32 0, i32 2
+// CHECK: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* %{{.+}}, i32 0, i32 2
+// CHECK: getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* %{{.+}}, i32 0, i32 0
+// CHECK: getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i64 2
+// CHECK: br label %
+
+// CHECK: phi [[S_INT_TY]]*
+// CHECK: call {{.*}} [[S_INT_TY_CONSTR]]([[S_INT_TY]]*
+// CHECK: getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i64 1
+// CHECK: icmp eq [[S_INT_TY]]* %
+// CHECK: br i1 %
+
+// CHECK: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* %{{.+}}, i32 0, i32 3
+// CHECK: call {{.*}} [[S_INT_TY_CONSTR]]([[S_INT_TY]]*
+// CHECK: ret void
+
+// CHECK: define internal i32 [[DESTRUCTORS]](i32, [[KMP_TASK_TMAIN_TY]]* noalias)
+// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: call void [[S_INT_TY_DESTR]]([[S_INT_TY]]* [[PRIVATE_VAR_REF]])
+// CHECK: getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[PRIVATE_S_ARR_REF]], i{{.+}} 0, i{{.+}} 0
+// CHECK: getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i{{.+}} 2
+// CHECK: [[PRIVATE_S_ARR_ELEM_REF:%.+]] = getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i{{.+}} -1
+// CHECK: call void [[S_INT_TY_DESTR]]([[S_INT_TY]]* [[PRIVATE_S_ARR_ELEM_REF]])
+// CHECK: icmp eq
+// CHECK: br i1
+// CHECK: ret i32
+
+#endif
+#else
+// ARRAY-LABEL: array_func
+struct St {
+  int a, b;
+  St() : a(0), b(0) {}
+  St(const St &) {}
+  ~St() {}
+};
+
+void array_func(int n, float a[n], St s[2]) {
+// ARRAY: call i8* @__kmpc_omp_task_alloc(
+// ARRAY: call void @__kmpc_taskloop(
+// ARRAY: store float** %{{.+}}, float*** %{{.+}},
+// ARRAY: store %struct.St** %{{.+}}, %struct.St*** %{{.+}},
+// ARRAY: icmp ne i32 %{{.+}}, 0
+// ARRAY: store float* %{{.+}}, float** %{{.+}},
+// ARRAY: store %struct.St* %{{.+}}, %struct.St** %{{.+}},
+#pragma omp taskloop lastprivate(a, s)
+  for (int i = 0; i < 10; ++i)
+    ;
+}
+#endif
+
diff --git a/test/OpenMP/taskloop_loop_messages.cpp b/test/OpenMP/taskloop_loop_messages.cpp
index 02518e572f3af..291cbdbe718c9 100644
--- a/test/OpenMP/taskloop_loop_messages.cpp
+++ b/test/OpenMP/taskloop_loop_messages.cpp
@@ -427,12 +427,12 @@ public:
   typedef int difference_type;
   typedef std::random_access_iterator_tag iterator_category;
 };
-// expected-note@+2 {{candidate function not viable: no known conversion from 'Iter0' to 'GoodIter' for 2nd argument}}
+// expected-note@+2 {{candidate function not viable: no known conversion from 'const Iter0' to 'GoodIter' for 2nd argument}}
 // expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'GoodIter' for 1st argument}}
 int operator-(GoodIter a, GoodIter b) { return 0; }
 // expected-note@+1 3 {{candidate function not viable: requires single argument 'a', but 2 arguments were provided}}
 GoodIter operator-(GoodIter a) { return a; }
-// expected-note@+2 {{candidate function not viable: no known conversion from 'Iter0' to 'int' for 2nd argument}}
+// expected-note@+2 {{candidate function not viable: no known conversion from 'const Iter0' to 'int' for 2nd argument}}
 // expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'GoodIter' for 1st argument}}
 GoodIter operator-(GoodIter a, int v) { return GoodIter(); }
 // expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter0' to 'GoodIter' for 1st argument}}
@@ -483,7 +483,7 @@ int test_with_random_access_iterator() {
 #pragma omp taskloop
   for (begin = GoodIter(0); begin < end; ++begin)
     ++begin;
-// expected-error@+4 {{invalid operands to binary expression ('GoodIter' and 'Iter0')}}
+// expected-error@+4 {{invalid operands to binary expression ('GoodIter' and 'const Iter0')}}
 // expected-error@+3 {{could not calculate number of iterations calling 'operator-' with upper and lower loop bounds}}
 #pragma omp parallel
 #pragma omp taskloop
diff --git a/test/OpenMP/taskloop_private_codegen.cpp b/test/OpenMP/taskloop_private_codegen.cpp
new file mode 100644
index 0000000000000..38b20c579393b
--- /dev/null
+++ b/test/OpenMP/taskloop_private_codegen.cpp
@@ -0,0 +1,420 @@
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-apple-darwin10 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -DLAMBDA -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=LAMBDA %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -fblocks -DBLOCKS -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=BLOCKS %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -DARRAY -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=ARRAY %s
+// expected-no-diagnostics
+// REQUIRES: x86-registered-target
+// It doesn't pass on win32. Investigating.
+// REQUIRES: shell
+
+#ifndef ARRAY
+#ifndef HEADER
+#define HEADER
+
+template <class T>
+struct S {
+  T f;
+  S(T a) : f(a) {}
+  S() : f() {}
+  operator T() { return T(); }
+  ~S() {}
+};
+
+volatile double g;
+
+// CHECK-DAG: [[KMP_TASK_T_TY:%.+]] = type { i8*, i32 (i32, i8*)*, i32, %union{{.+}}, %union{{.+}}, i64, i64, i64, i32 }
+// CHECK-DAG: [[S_DOUBLE_TY:%.+]] = type { double }
+// CHECK-DAG: [[CAP_MAIN_TY:%.+]] = type { i8 }
+// CHECK-DAG: [[PRIVATES_MAIN_TY:%.+]] = type {{.?}}{ [2 x [[S_DOUBLE_TY]]], [[S_DOUBLE_TY]], i32, [2 x i32]
+// CHECK-DAG: [[KMP_TASK_MAIN_TY:%.+]] = type { [[KMP_TASK_T_TY]], [[PRIVATES_MAIN_TY]] }
+// CHECK-DAG: [[S_INT_TY:%.+]] = type { i32 }
+// CHECK-DAG: [[CAP_TMAIN_TY:%.+]] = type { i8 }
+// CHECK-DAG: [[PRIVATES_TMAIN_TY:%.+]] = type { i32, [2 x i32], [2 x [[S_INT_TY]]], [[S_INT_TY]], [104 x i8] }
+// CHECK-DAG: [[KMP_TASK_TMAIN_TY:%.+]] = type { [[KMP_TASK_T_TY]], [{{[0-9]+}} x i8], [[PRIVATES_TMAIN_TY]] }
+template <typename T>
+T tmain() {
+  S<T> test;
+  T t_var __attribute__((aligned(128))) = T();
+  T vec[] = {1, 2};
+  S<T> s_arr[] = {1, 2};
+  S<T> var(3);
+#pragma omp taskloop private(t_var, vec, s_arr, s_arr, var, var)
+  for (int i = 0; i < 10; ++i) {
+    vec[0] = t_var;
+    s_arr[0] = var;
+  }
+  return T();
+}
+
+int main() {
+  static int sivar;
+#ifdef LAMBDA
+  // LAMBDA: [[G:@.+]] = global double
+  // LAMBDA-LABEL: @main
+  // LAMBDA: call{{( x86_thiscallcc)?}} void [[OUTER_LAMBDA:@.+]](
+  [&]() {
+  // LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
+  // LAMBDA: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 88, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+// LAMBDA: [[PRIVATES:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1
+// LAMBDA: call void @__kmpc_taskloop(%{{.+}}* @{{.+}}, i32 %{{.+}}, i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 0, i32 0, i64 0, i8* null)
+// LAMBDA: ret
+#pragma omp taskloop private(g, sivar)
+  for (int i = 0; i < 10; ++i) {
+    // LAMBDA: define {{.+}} void [[INNER_LAMBDA:@.+]](%{{.+}}* [[ARG_PTR:%.+]])
+    // LAMBDA: store %{{.+}}* [[ARG_PTR]], %{{.+}}** [[ARG_PTR_REF:%.+]],
+    // LAMBDA: [[ARG_PTR:%.+]] = load %{{.+}}*, %{{.+}}** [[ARG_PTR_REF]]
+    // LAMBDA: [[G_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+    // LAMBDA: [[G_REF:%.+]] = load double*, double** [[G_PTR_REF]]
+    // LAMBDA: store double 2.0{{.+}}, double* [[G_REF]]
+    // LAMBDA: [[SIVAR_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+    // LAMBDA: [[SIVAR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SIVAR_PTR_REF]]
+    // LAMBDA: store i{{[0-9]+}} 3, i{{[0-9]+}}* [[SIVAR_REF]]
+
+    // LAMBDA: define internal i32 [[TASK_ENTRY]](i32, %{{.+}}* noalias)
+    g = 1;
+    sivar = 2;
+    // LAMBDA: store double 1.0{{.+}}, double* %{{.+}},
+    // LAMBDA: store i{{[0-9]+}} 2, i{{[0-9]+}}* %{{.+}},
+    // LAMBDA: call void [[INNER_LAMBDA]](%
+    // LAMBDA: ret
+    [&]() {
+      g = 2;
+      sivar = 3;
+    }();
+  }
+  }();
+  return 0;
+#elif defined(BLOCKS)
+  // BLOCKS: [[G:@.+]] = global double
+  // BLOCKS-LABEL: @main
+  // BLOCKS: call void {{%.+}}(i8
+  ^{
+  // BLOCKS: define{{.*}} internal{{.*}} void {{.+}}(i8*
+  // BLOCKS: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 88, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+  // BLOCKS: [[PRIVATES:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1
+  // BLOCKS: call void @__kmpc_taskloop(%{{.+}}* @{{.+}}, i32 %{{.+}}, i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 0, i32 0, i64 0, i8* null)
+  // BLOCKS: ret
+#pragma omp taskloop private(g, sivar)
+  for (int i = 0; i < 10; ++i) {
+    // BLOCKS: define {{.+}} void {{@.+}}(i8*
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS: store double 2.0{{.+}}, double*
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS-NOT: [[SIVAR]]{{[[^:word:]]}}
+    // BLOCKS: store i{{[0-9]+}} 4, i{{[0-9]+}}*
+    // BLOCKS-NOT: [[SIVAR]]{{[[^:word:]]}}
+    // BLOCKS: ret
+
+    // BLOCKS: define internal i32 [[TASK_ENTRY]](i32, %{{.+}}* noalias)
+    g = 1;
+    sivar = 3;
+    // BLOCKS: store double 1.0{{.+}}, double* %{{.+}},
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS: store i{{[0-9]+}} 3, i{{[0-9]+}}* %{{.+}},
+    // BLOCKS-NOT: [[SIVAR]]{{[[^:word:]]}}
+    // BLOCKS: call void {{%.+}}(i8
+    ^{
+      g = 2;
+      sivar = 4;
+    }();
+  }
+  }();
+  return 0;
+#else
+  S<double> test;
+  int t_var = 0;
+  int vec[] = {1, 2};
+  S<double> s_arr[] = {1, 2};
+  S<double> var(3);
+#pragma omp taskloop private(var, t_var, s_arr, vec, s_arr, var, sivar)
+  for (int i = 0; i < 10; ++i) {
+    vec[0] = t_var;
+    s_arr[0] = var;
+    sivar = 8;
+  }
+#pragma omp task
+  g+=1;
+  return tmain<int>();
+#endif
+}
+
+// CHECK: define i{{[0-9]+}} @main()
+// CHECK: [[TEST:%.+]] = alloca [[S_DOUBLE_TY]],
+// CHECK: [[T_VAR_ADDR:%.+]] = alloca i32,
+// CHECK: [[VEC_ADDR:%.+]] = alloca [2 x i32],
+// CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_DOUBLE_TY]]],
+// CHECK: [[VAR_ADDR:%.+]] = alloca [[S_DOUBLE_TY]],
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[LOC:%.+]])
+
+// CHECK: call {{.*}} [[S_DOUBLE_TY_DEF_CONSTR:@.+]]([[S_DOUBLE_TY]]* [[TEST]])
+
+// Do not store original variables in capture struct.
+// CHECK-NOT: getelementptr inbounds [[CAP_MAIN_TY]],
+
+// Allocate task.
+// Returns struct kmp_task_t {
+//         [[KMP_TASK_T_TY]] task_data;
+//         [[KMP_TASK_MAIN_TY]] privates;
+//       };
+// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc([[LOC]], i32 [[GTID]], i32 9, i64 112, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[RES_KMP_TASK:%.+]] = bitcast i8* [[RES]] to [[KMP_TASK_MAIN_TY]]*
+
+// CHECK: [[TASK:%.+]] = getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// Initialize kmp_task_t->privates with default values (no init for simple types, default constructors for classes).
+// Also copy address of private copy to the corresponding shareds reference.
+// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+
+// Constructors for s_arr and var.
+// a_arr;
+// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: getelementptr inbounds [2 x [[S_DOUBLE_TY]]], [2 x [[S_DOUBLE_TY]]]* [[PRIVATE_S_ARR_REF]], i{{.+}} 0, i{{.+}} 0
+// CHECK: getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i{{.+}} 2
+// CHECK: call void [[S_DOUBLE_TY_DEF_CONSTR]]([[S_DOUBLE_TY]]* [[S_ARR_CUR:%.+]])
+// CHECK: getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* [[S_ARR_CUR]], i{{.+}} 1
+// CHECK: icmp eq
+// CHECK: br i1
+
+// var;
+// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 1
+// CHECK: call void [[S_DOUBLE_TY_DEF_CONSTR]]([[S_DOUBLE_TY]]* [[PRIVATE_VAR_REF:%.+]])
+
+// Provide pointer to destructor function, which will destroy private variables at the end of the task.
+// CHECK: [[DESTRUCTORS_REF:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{.+}} 0, i{{.+}} 3
+// CHECK: [[DESTRUCTORS_PTR:%.+]] = bitcast %union{{.+}}* [[DESTRUCTORS_REF]] to i32 (i32, i8*)**
+// CHECK: store i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[DESTRUCTORS:@.+]] to i32 (i32, i8*)*), i32 (i32, i8*)** [[DESTRUCTORS_PTR]],
+
+// Start task.
+// CHECK: call void @__kmpc_taskloop([[LOC]], i32 [[GTID]], i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 0, i32 0, i64 0, i8* bitcast (void ([[KMP_TASK_MAIN_TY]]*, [[KMP_TASK_MAIN_TY]]*, i32)* [[MAIN_DUP:@.+]] to i8*))
+// CHECK: call i32 @__kmpc_omp_task([[LOC]], i32 [[GTID]], i8*
+
+// CHECK: = call i{{.+}} [[TMAIN_INT:@.+]]()
+
+// No destructors must be called for private copies of s_arr and var.
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: call void [[S_DOUBLE_TY_DESTR:@.+]]([[S_DOUBLE_TY]]*
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: ret
+//
+
+// CHECK: define internal void [[PRIVATES_MAP_FN:@.+]]([[PRIVATES_MAIN_TY]]* noalias, [[S_DOUBLE_TY]]** noalias, i32** noalias, [2 x [[S_DOUBLE_TY]]]** noalias, [2 x i32]** noalias, i32** noalias)
+// CHECK: [[PRIVATES:%.+]] = load [[PRIVATES_MAIN_TY]]*, [[PRIVATES_MAIN_TY]]**
+// CHECK: [[PRIV_S_VAR:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 0
+// CHECK: [[ARG3:%.+]] = load [2 x [[S_DOUBLE_TY]]]**, [2 x [[S_DOUBLE_TY]]]*** %{{.+}},
+// CHECK: store [2 x [[S_DOUBLE_TY]]]* [[PRIV_S_VAR]], [2 x [[S_DOUBLE_TY]]]** [[ARG3]],
+// CHECK: [[PRIV_VAR:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 1
+// CHECK: [[ARG1:%.+]] = load [[S_DOUBLE_TY]]**, [[S_DOUBLE_TY]]*** {{.+}},
+// CHECK: store [[S_DOUBLE_TY]]* [[PRIV_VAR]], [[S_DOUBLE_TY]]** [[ARG1]],
+// CHECK: [[PRIV_T_VAR:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 2
+// CHECK: [[ARG2:%.+]] = load i32**, i32*** %{{.+}},
+// CHECK: store i32* [[PRIV_T_VAR]], i32** [[ARG2]],
+// CHECK: [[PRIV_VEC:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 3
+// CHECK: [[ARG4:%.+]] = load [2 x i32]**, [2 x i32]*** %{{.+}},
+// CHECK: store [2 x i32]* [[PRIV_VEC]], [2 x i32]** [[ARG4]],
+// CHECK: ret void
+
+// CHECK: define internal i32 [[TASK_ENTRY]](i32, [[KMP_TASK_MAIN_TY]]* noalias)
+
+// CHECK: [[PRIV_VAR_ADDR:%.+]] = alloca [[S_DOUBLE_TY]]*,
+// CHECK: [[PRIV_T_VAR_ADDR:%.+]] = alloca i32*,
+// CHECK: [[PRIV_S_ARR_ADDR:%.+]] = alloca [2 x [[S_DOUBLE_TY]]]*,
+// CHECK: [[PRIV_VEC_ADDR:%.+]] = alloca [2 x i32]*,
+// CHECK: [[PRIV_SIVAR_ADDR:%.+]] = alloca i32*,
+// CHECK: store void (i8*, ...)* bitcast (void ([[PRIVATES_MAIN_TY]]*, [[S_DOUBLE_TY]]**, i32**, [2 x [[S_DOUBLE_TY]]]**, [2 x i32]**, i32**)* [[PRIVATES_MAP_FN]] to void (i8*, ...)*), void (i8*, ...)** [[MAP_FN_ADDR:%.+]],
+// CHECK: [[MAP_FN:%.+]] = load void (i8*, ...)*, void (i8*, ...)** [[MAP_FN_ADDR]],
+// CHECK: call void (i8*, ...) [[MAP_FN]](i8* %{{.+}}, [[S_DOUBLE_TY]]** [[PRIV_VAR_ADDR]], i32** [[PRIV_T_VAR_ADDR]], [2 x [[S_DOUBLE_TY]]]** [[PRIV_S_ARR_ADDR]], [2 x i32]** [[PRIV_VEC_ADDR]], i32** [[PRIV_SIVAR_ADDR]])
+// CHECK: [[PRIV_VAR:%.+]] = load [[S_DOUBLE_TY]]*, [[S_DOUBLE_TY]]** [[PRIV_VAR_ADDR]],
+// CHECK: [[PRIV_T_VAR:%.+]] = load i32*, i32** [[PRIV_T_VAR_ADDR]],
+// CHECK: [[PRIV_S_ARR:%.+]] = load [2 x [[S_DOUBLE_TY]]]*, [2 x [[S_DOUBLE_TY]]]** [[PRIV_S_ARR_ADDR]],
+// CHECK: [[PRIV_VEC:%.+]] = load [2 x i32]*, [2 x i32]** [[PRIV_VEC_ADDR]],
+// CHECK: [[PRIV_SIVAR:%.+]] = load i32*, i32** [[PRIV_SIVAR_ADDR]],
+
+// Privates actually are used.
+// CHECK-DAG: [[PRIV_VAR]]
+// CHECK-DAG: [[PRIV_T_VAR]]
+// CHECK-DAG: [[PRIV_S_ARR]]
+// CHECK-DAG: [[PRIV_VEC]]
+// CHECK_DAG: [[PRIV_SIVAR]]
+
+// CHECK: ret
+
+// CHECK: define internal void [[MAIN_DUP]]([[KMP_TASK_MAIN_TY]]*, [[KMP_TASK_MAIN_TY]]*, i32)
+// CHECK: getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* %{{.+}}, i32 0, i32 1
+// CHECK: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* %{{.+}}, i32 0, i32 0
+// CHECK: getelementptr inbounds [2 x [[S_DOUBLE_TY]]], [2 x [[S_DOUBLE_TY]]]* %{{.+}}, i32 0, i32 0
+// CHECK: getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i64 2
+// CHECK: br label %
+
+// CHECK: phi [[S_DOUBLE_TY]]*
+// CHECK: call {{.*}} [[S_DOUBLE_TY_DEF_CONSTR]]([[S_DOUBLE_TY]]*
+// CHECK: getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i64 1
+// CHECK: icmp eq [[S_DOUBLE_TY]]* %
+// CHECK: br i1 %
+
+// CHECK: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* %{{.+}}, i32 0, i32 1
+// CHECK: call {{.*}} [[S_DOUBLE_TY_DEF_CONSTR]]([[S_DOUBLE_TY]]*
+// CHECK: ret void
+
+// CHECK: define internal i32 [[DESTRUCTORS]](i32, [[KMP_TASK_MAIN_TY]]* noalias)
+// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* [[RES_KMP_TASK:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 0
+// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 1
+// CHECK: call void [[S_DOUBLE_TY_DESTR]]([[S_DOUBLE_TY]]* [[PRIVATE_VAR_REF]])
+// CHECK: getelementptr inbounds [2 x [[S_DOUBLE_TY]]], [2 x [[S_DOUBLE_TY]]]* [[PRIVATE_S_ARR_REF]], i{{.+}} 0, i{{.+}} 0
+// CHECK: getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i{{.+}} 2
+// CHECK: [[PRIVATE_S_ARR_ELEM_REF:%.+]] = getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i{{.+}} -1
+// CHECK: call void [[S_DOUBLE_TY_DESTR]]([[S_DOUBLE_TY]]* [[PRIVATE_S_ARR_ELEM_REF]])
+// CHECK: icmp eq
+// CHECK: br i1
+// CHECK: ret i32
+
+// CHECK: define {{.*}} i{{[0-9]+}} [[TMAIN_INT]]()
+// CHECK: [[TEST:%.+]] = alloca [[S_INT_TY]],
+// CHECK: [[T_VAR_ADDR:%.+]] = alloca i32,
+// CHECK: [[VEC_ADDR:%.+]] = alloca [2 x i32],
+// CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]],
+// CHECK: [[VAR_ADDR:%.+]] = alloca [[S_INT_TY]],
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[LOC:%.+]])
+
+// CHECK: call {{.*}} [[S_INT_TY_DEF_CONSTR:@.+]]([[S_INT_TY]]* [[TEST]])
+
+// Do not store original variables in capture struct.
+// CHECK-NOT: getelementptr inbounds [[CAP_TMAIN_TY]],
+
+// Allocate task.
+// Returns struct kmp_task_t {
+//         [[KMP_TASK_T_TY]] task_data;
+//         [[KMP_TASK_TMAIN_TY]] privates;
+//       };
+// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc([[LOC]], i32 [[GTID]], i32 9, i64 256, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_TMAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[RES_KMP_TASK:%.+]] = bitcast i8* [[RES]] to [[KMP_TASK_TMAIN_TY]]*
+
+// CHECK: [[TASK:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+
+// Initialize kmp_task_t->privates with default values (no init for simple types, default constructors for classes).
+// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+
+// Constructors for s_arr and var.
+// a_arr;
+// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK: getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[PRIVATE_S_ARR_REF]], i{{.+}} 0, i{{.+}} 0
+// CHECK: getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i{{.+}} 2
+// CHECK: call void [[S_INT_TY_DEF_CONSTR]]([[S_INT_TY]]* [[S_ARR_CUR:%.+]])
+// CHECK: getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_CUR]], i{{.+}} 1
+// CHECK: icmp eq
+// CHECK: br i1
+
+// var;
+// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: call void [[S_INT_TY_DEF_CONSTR]]([[S_INT_TY]]* [[PRIVATE_VAR_REF:%.+]])
+
+// Provide pointer to destructor function, which will destroy private variables at the end of the task.
+// CHECK: [[DESTRUCTORS_REF:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{.+}} 0, i{{.+}} 3
+// CHECK: [[DESTRUCTORS_PTR:%.+]] = bitcast %union{{.+}}* [[DESTRUCTORS_REF]] to i32 (i32, i8*)**
+// CHECK: store i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_TMAIN_TY]]*)* [[DESTRUCTORS:@.+]] to i32 (i32, i8*)*), i32 (i32, i8*)** [[DESTRUCTORS_PTR]],
+
+// Start task.
+// CHECK: call void @__kmpc_taskloop([[LOC]], i32 [[GTID]], i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 0, i32 0, i64 0, i8* bitcast (void ([[KMP_TASK_TMAIN_TY]]*, [[KMP_TASK_TMAIN_TY]]*, i32)* [[TMAIN_DUP:@.+]] to i8*))
+
+// No destructors must be called for private copies of s_arr and var.
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: call void [[S_INT_TY_DESTR:@.+]]([[S_INT_TY]]*
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: ret
+//
+
+// CHECK: define internal void [[PRIVATES_MAP_FN:@.+]]([[PRIVATES_TMAIN_TY]]* noalias, i32** noalias, [2 x i32]** noalias, [2 x [[S_INT_TY]]]** noalias, [[S_INT_TY]]** noalias)
+// CHECK: [[PRIVATES:%.+]] = load [[PRIVATES_TMAIN_TY]]*, [[PRIVATES_TMAIN_TY]]**
+// CHECK: [[PRIV_T_VAR:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 0
+// CHECK: [[ARG1:%.+]] = load i32**, i32*** %{{.+}},
+// CHECK: store i32* [[PRIV_T_VAR]], i32** [[ARG1]],
+// CHECK: [[PRIV_VEC:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 1
+// CHECK: [[ARG2:%.+]] = load [2 x i32]**, [2 x i32]*** %{{.+}},
+// CHECK: store [2 x i32]* [[PRIV_VEC]], [2 x i32]** [[ARG2]],
+// CHECK: [[PRIV_S_VAR:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 2
+// CHECK: [[ARG3:%.+]] = load [2 x [[S_INT_TY]]]**, [2 x [[S_INT_TY]]]*** %{{.+}},
+// CHECK: store [2 x [[S_INT_TY]]]* [[PRIV_S_VAR]], [2 x [[S_INT_TY]]]** [[ARG3]],
+// CHECK: [[PRIV_VAR:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 3
+// CHECK: [[ARG4:%.+]] = load [[S_INT_TY]]**, [[S_INT_TY]]*** {{.+}},
+// CHECK: store [[S_INT_TY]]* [[PRIV_VAR]], [[S_INT_TY]]** [[ARG4]],
+// CHECK: ret void
+
+// CHECK: define internal i32 [[TASK_ENTRY]](i32, [[KMP_TASK_TMAIN_TY]]* noalias)
+
+// CHECK: alloca i32*,
+// CHECK-DAG: [[PRIV_T_VAR_ADDR:%.+]] = alloca i32*,
+// CHECK-DAG: [[PRIV_VEC_ADDR:%.+]] = alloca [2 x i32]*,
+// CHECK-DAG: [[PRIV_S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]]*,
+// CHECK-DAG: [[PRIV_VAR_ADDR:%.+]] = alloca [[S_INT_TY]]*,
+// CHECK: store void (i8*, ...)* bitcast (void ([[PRIVATES_TMAIN_TY]]*, i32**, [2 x i32]**, [2 x [[S_INT_TY]]]**, [[S_INT_TY]]**)* [[PRIVATES_MAP_FN]] to void (i8*, ...)*), void (i8*, ...)** [[MAP_FN_ADDR:%.+]],
+// CHECK: [[MAP_FN:%.+]] = load void (i8*, ...)*, void (i8*, ...)** [[MAP_FN_ADDR]],
+// CHECK: call void (i8*, ...) [[MAP_FN]](i8* %{{.+}}, i32** [[PRIV_T_VAR_ADDR]], [2 x i32]** [[PRIV_VEC_ADDR]], [2 x [[S_INT_TY]]]** [[PRIV_S_ARR_ADDR]], [[S_INT_TY]]** [[PRIV_VAR_ADDR]])
+// CHECK: [[PRIV_T_VAR:%.+]] = load i32*, i32** [[PRIV_T_VAR_ADDR]],
+// CHECK: [[PRIV_VEC:%.+]] = load [2 x i32]*, [2 x i32]** [[PRIV_VEC_ADDR]],
+// CHECK: [[PRIV_S_ARR:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[PRIV_S_ARR_ADDR]],
+// CHECK: [[PRIV_VAR:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[PRIV_VAR_ADDR]],
+
+// Privates actually are used.
+// CHECK-DAG: [[PRIV_VAR]]
+// CHECK-DAG: [[PRIV_T_VAR]]
+// CHECK-DAG: [[PRIV_S_ARR]]
+// CHECK-DAG: [[PRIV_VEC]]
+
+// CHECK: ret
+
+// CHECK: define internal void [[TMAIN_DUP]]([[KMP_TASK_TMAIN_TY]]*, [[KMP_TASK_TMAIN_TY]]*, i32)
+// CHECK: getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* %{{.+}}, i32 0, i32 2
+// CHECK: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* %{{.+}}, i32 0, i32 2
+// CHECK: getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* %{{.+}}, i32 0, i32 0
+// CHECK: getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i64 2
+// CHECK: br label %
+
+// CHECK: phi [[S_INT_TY]]*
+// CHECK: call {{.*}} [[S_INT_TY_DEF_CONSTR]]([[S_INT_TY]]*
+// CHECK: getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i64 1
+// CHECK: icmp eq [[S_INT_TY]]* %
+// CHECK: br i1 %
+
+// CHECK: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* %{{.+}}, i32 0, i32 3
+// CHECK: call {{.*}} [[S_INT_TY_DEF_CONSTR]]([[S_INT_TY]]*
+// CHECK: ret void
+
+// CHECK: define internal i32 [[DESTRUCTORS]](i32, [[KMP_TASK_TMAIN_TY]]* noalias)
+// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: call void [[S_INT_TY_DESTR]]([[S_INT_TY]]* [[PRIVATE_VAR_REF]])
+// CHECK: getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[PRIVATE_S_ARR_REF]], i{{.+}} 0, i{{.+}} 0
+// CHECK: getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i{{.+}} 2
+// CHECK: [[PRIVATE_S_ARR_ELEM_REF:%.+]] = getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i{{.+}} -1
+// CHECK: call void [[S_INT_TY_DESTR]]([[S_INT_TY]]* [[PRIVATE_S_ARR_ELEM_REF]])
+// CHECK: icmp eq
+// CHECK: br i1
+// CHECK: ret i32
+
+#endif
+#else
+// ARRAY-LABEL: array_func
+struct St {
+  int a, b;
+  St() : a(0), b(0) {}
+  St &operator=(const St &) { return *this; };
+  ~St() {}
+};
+
+void array_func(int n, float a[n], St s[2]) {
+// ARRAY: call i8* @__kmpc_omp_task_alloc(
+// ARRAY: call void @__kmpc_taskloop(
+// ARRAY: store float** %{{.+}}, float*** %{{.+}},
+// ARRAY: store %struct.St** %{{.+}}, %struct.St*** %{{.+}},
+#pragma omp taskloop private(a, s)
+  for (int i = 0; i < 10; ++i)
+    ;
+}
+#endif
+
diff --git a/test/OpenMP/taskloop_private_messages.cpp b/test/OpenMP/taskloop_private_messages.cpp
index 3d00d3f252b38..367d59da69597 100644
--- a/test/OpenMP/taskloop_private_messages.cpp
+++ b/test/OpenMP/taskloop_private_messages.cpp
@@ -29,7 +29,11 @@ class S4 {
   S4(); // expected-note {{implicitly declared private here}}
 
 public:
-  S4(int v) : a(v) {}
+  S4(int v) : a(v) {
+#pragma omp taskloop private(a) private(this->a)
+    for (int k = 0; k < v; ++k)
+      ++this->a;
+  }
 };
 class S5 {
   int a;
@@ -37,6 +41,50 @@ class S5 {
 
 public:
   S5(int v) : a(v) {}
+  S5 &operator=(S5 &s) {
+#pragma omp taskloop private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T>
+class S6 {
+public:
+  T a;
+
+  S6() : a(0) {}
+  S6(T v) : a(v) {
+#pragma omp taskloop private(a) private(this->a)
+    for (int k = 0; k < v; ++k)
+      ++this->a;
+  }
+  S6 &operator=(S6 &s) {
+#pragma omp taskloop private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T>
+class S7 : public T {
+  T a;
+  S7() : a(0) {}
+
+public:
+  S7(T v) : a(v) {
+#pragma omp taskloop private(a) private(this->a) private(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S7 &operator=(S7 &s) {
+#pragma omp taskloop private(a) private(this->a) private(s.a) private(s.T::a) // expected-error 2 {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
 };
 
 S3 h;
@@ -126,6 +174,8 @@ using A::x;
 int main(int argc, char **argv) {
   S4 e(4);
   S5 g(5);
+  S6<float> s6(0.0) , s6_0(1.0);
+  S7<S6<float> > s7(0.0) , s7_0(1.0);
   int i;
   int &j = i;
 #pragma omp taskloop private // expected-error {{expected '(' after 'private'}}
@@ -190,6 +240,8 @@ int main(int argc, char **argv) {
   for(int k = 0; k < argc; ++k)
     si = k + 1;
 
-  return 0;
+  s6 = s6_0; // expected-note {{in instantiation of member function 'S6<float>::operator=' requested here}}
+  s7 = s7_0; // expected-note {{in instantiation of member function 'S7<S6<float> >::operator=' requested here}}
+  return foomain(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<int, char>' requested here}}
 }
 
diff --git a/test/OpenMP/taskloop_simd_aligned_messages.cpp b/test/OpenMP/taskloop_simd_aligned_messages.cpp
index b62af044c60f1..b45f44fe1cf8b 100644
--- a/test/OpenMP/taskloop_simd_aligned_messages.cpp
+++ b/test/OpenMP/taskloop_simd_aligned_messages.cpp
@@ -196,6 +196,7 @@ int main(int argc, char **argv) {
   #pragma omp taskloop simd aligned(h)
   for (int k = 0; k < argc; ++k) ++k;
   int *pargc = &argc;
+  // expected-note@+1 {{in instantiation of function template specialization 'foomain<int *, char>' requested here}}
   foomain<int*,char>(pargc,argv);
   return 0;
 }
diff --git a/test/OpenMP/taskloop_simd_codegen.cpp b/test/OpenMP/taskloop_simd_codegen.cpp
new file mode 100644
index 0000000000000..dc60009ff2bd1
--- /dev/null
+++ b/test/OpenMP/taskloop_simd_codegen.cpp
@@ -0,0 +1,203 @@
+// RUN: %clang_cc1 -verify -triple x86_64-apple-darwin10 -fopenmp -x c++ -emit-llvm %s -o - -femit-all-decls | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -include-pch %t -verify %s -emit-llvm -o - -femit-all-decls | FileCheck %s
+// expected-no-diagnostics
+// REQUIRES: x86-registered-target
+#ifndef HEADER
+#define HEADER
+
+// CHECK-LABEL: @main
+int main(int argc, char **argv) {
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%ident_t* [[DEFLOC:@.+]])
+// CHECK: [[TASKV:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[DEFLOC]], i32 [[GTID]], i32 33, i64 72, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[TDP_TY:%.+]]*)* [[TASK1:@.+]] to i32 (i32, i8*)*))
+// CHECK: [[TASK:%.+]] = bitcast i8* [[TASKV]] to [[TDP_TY]]*
+// CHECK: [[TASK_DATA:%.+]] = getelementptr inbounds [[TDP_TY]], [[TDP_TY]]* [[TASK]], i32 0, i32 0
+// CHECK: [[DOWN:%.+]] = getelementptr inbounds [[TD_TY:%.+]], [[TD_TY]]* [[TASK_DATA]], i32 0, i32 5
+// CHECK: store i64 0, i64* [[DOWN]],
+// CHECK: [[UP:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* [[TASK_DATA]], i32 0, i32 6
+// CHECK: store i64 9, i64* [[UP]],
+// CHECK: [[ST:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* [[TASK_DATA]], i32 0, i32 7
+// CHECK: store i64 1, i64* [[ST]],
+// CHECK: [[ST_VAL:%.+]] = load i64, i64* [[ST]],
+// CHECK: call void @__kmpc_taskloop(%ident_t* [[DEFLOC]], i32 [[GTID]], i8* [[TASKV]], i32 1, i64* [[DOWN]], i64* [[UP]], i64 [[ST_VAL]], i32 0, i32 0, i64 0, i8* null)
+#pragma omp taskloop simd priority(argc)
+  for (int i = 0; i < 10; ++i)
+    ;
+// CHECK: [[TASKV:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[DEFLOC]], i32 [[GTID]], i32 1, i64 72, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[TDP_TY:%.+]]*)* [[TASK2:@.+]] to i32 (i32, i8*)*))
+// CHECK: [[TASK:%.+]] = bitcast i8* [[TASKV]] to [[TDP_TY]]*
+// CHECK: [[TASK_DATA:%.+]] = getelementptr inbounds [[TDP_TY]], [[TDP_TY]]* [[TASK]], i32 0, i32 0
+// CHECK: [[DOWN:%.+]] = getelementptr inbounds [[TD_TY:%.+]], [[TD_TY]]* [[TASK_DATA]], i32 0, i32 5
+// CHECK: store i64 0, i64* [[DOWN]],
+// CHECK: [[UP:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* [[TASK_DATA]], i32 0, i32 6
+// CHECK: store i64 9, i64* [[UP]],
+// CHECK: [[ST:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* [[TASK_DATA]], i32 0, i32 7
+// CHECK: store i64 1, i64* [[ST]],
+// CHECK: [[ST_VAL:%.+]] = load i64, i64* [[ST]],
+// CHECK: [[GRAINSIZE:%.+]] = zext i32 %{{.+}} to i64
+// CHECK: call void @__kmpc_taskloop(%ident_t* [[DEFLOC]], i32 [[GTID]], i8* [[TASKV]], i32 1, i64* [[DOWN]], i64* [[UP]], i64 [[ST_VAL]], i32 1, i32 1, i64 [[GRAINSIZE]], i8* null)
+#pragma omp taskloop simd nogroup grainsize(argc) simdlen(4)
+  for (int i = 0; i < 10; ++i)
+    ;
+// CHECK: [[TASKV:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[DEFLOC]], i32 [[GTID]], i32 1, i64 72, i64 24, i32 (i32, i8*)* bitcast (i32 (i32, [[TDP_TY:%.+]]*)* [[TASK3:@.+]] to i32 (i32, i8*)*))
+// CHECK: [[TASK:%.+]] = bitcast i8* [[TASKV]] to [[TDP_TY]]*
+// CHECK: [[TASK_DATA:%.+]] = getelementptr inbounds [[TDP_TY]], [[TDP_TY]]* [[TASK]], i32 0, i32 0
+// CHECK: [[IF:%.+]] = icmp ne i32 %{{.+}}, 0
+// CHECK: [[IF_INT:%.+]] = sext i1 [[IF]] to i32
+// CHECK: [[DOWN:%.+]] = getelementptr inbounds [[TD_TY:%.+]], [[TD_TY]]* [[TASK_DATA]], i32 0, i32 5
+// CHECK: store i64 0, i64* [[DOWN]],
+// CHECK: [[UP:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* [[TASK_DATA]], i32 0, i32 6
+// CHECK: store i64 %{{.+}}, i64* [[UP]],
+// CHECK: [[ST:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* [[TASK_DATA]], i32 0, i32 7
+// CHECK: store i64 1, i64* [[ST]],
+// CHECK: [[ST_VAL:%.+]] = load i64, i64* [[ST]],
+// CHECK: call void @__kmpc_taskloop(%ident_t* [[DEFLOC]], i32 [[GTID]], i8* [[TASKV]], i32 [[IF_INT]], i64* [[DOWN]], i64* [[UP]], i64 [[ST_VAL]], i32 0, i32 2, i64 4, i8* null)
+  int i;
+#pragma omp taskloop simd if(argc) shared(argc, argv) collapse(2) num_tasks(4) safelen(32)
+  for (i = 0; i < argc; ++i)
+  for (int j = argc; j < argv[argc][argc]; ++j)
+    ;
+}
+
+// CHECK: define internal i32 [[TASK1]](
+// CHECK: [[DOWN:%.+]] = getelementptr inbounds [[TD_TY:%.+]], [[TD_TY]]* %{{.+}}, i32 0, i32 5
+// CHECK: [[DOWN_VAL:%.+]] = load i64, i64* [[DOWN]],
+// CHECK: [[UP:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* %{{.+}}, i32 0, i32 6
+// CHECK: [[UP_VAL:%.+]] = load i64, i64* [[UP]],
+// CHECK: [[ST:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* %{{.+}}, i32 0, i32 7
+// CHECK: [[ST_VAL:%.+]] = load i64, i64* [[ST]],
+// CHECK: [[LITER:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* %{{.+}}, i32 0, i32 8
+// CHECK: [[LITER_VAL:%.+]] = load i32, i32* [[LITER]],
+// CHECK: store i64 [[DOWN_VAL]], i64* [[LB:%[^,]+]],
+// CHECK: store i64 [[UP_VAL]], i64* [[UB:%[^,]+]],
+// CHECK: store i64 [[ST_VAL]], i64* [[ST:%[^,]+]],
+// CHECK: store i32 [[LITER_VAL]], i32* [[LITER:%[^,]+]],
+// CHECK: [[LB_VAL:%.+]] = load i64, i64* [[LB]],
+// CHECK: [[LB_I32:%.+]] = trunc i64 [[LB_VAL]] to i32
+// CHECK: store i32 [[LB_I32]], i32* [[CNT:%.+]],
+// CHECK: br label
+// CHECK: [[VAL:%.+]] = load i32, i32* [[CNT]],{{.*}}!llvm.mem.parallel_loop_access [[LOOP1:!.+]]
+// CHECK: [[VAL_I64:%.+]] = sext i32 [[VAL]] to i64
+// CHECK: [[UB_VAL:%.+]] = load i64, i64* [[UB]],{{.*}}!llvm.mem.parallel_loop_access [[LOOP1]]
+// CHECK: [[CMP:%.+]] = icmp ule i64 [[VAL_I64]], [[UB_VAL]]
+// CHECK: br i1 [[CMP]], label %{{.+}}, label %{{.+}}
+// CHECK: load i32, i32* %{{.*}}!llvm.mem.parallel_loop_access [[LOOP1]]
+// CHECK: store i32 %{{.*}}!llvm.mem.parallel_loop_access [[LOOP1]]
+// CHECK: load i32, i32* %{{.*}}!llvm.mem.parallel_loop_access [[LOOP1]]
+// CHECK: add nsw i32 %{{.+}}, 1
+// CHECK: store i32 %{{.+}}, i32* %{{.*}}!llvm.mem.parallel_loop_access [[LOOP1]]
+// CHECK: br label %{{.*}}!llvm.loop [[LOOP1]]
+// CHECK: ret i32 0
+
+// CHECK: define internal i32 [[TASK2]](
+// CHECK: [[DOWN:%.+]] = getelementptr inbounds [[TD_TY:%.+]], [[TD_TY]]* %{{.+}}, i32 0, i32 5
+// CHECK: [[DOWN_VAL:%.+]] = load i64, i64* [[DOWN]],
+// CHECK: [[UP:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* %{{.+}}, i32 0, i32 6
+// CHECK: [[UP_VAL:%.+]] = load i64, i64* [[UP]],
+// CHECK: [[ST:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* %{{.+}}, i32 0, i32 7
+// CHECK: [[ST_VAL:%.+]] = load i64, i64* [[ST]],
+// CHECK: [[LITER:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* %{{.+}}, i32 0, i32 8
+// CHECK: [[LITER_VAL:%.+]] = load i32, i32* [[LITER]],
+// CHECK: store i64 [[DOWN_VAL]], i64* [[LB:%[^,]+]],
+// CHECK: store i64 [[UP_VAL]], i64* [[UB:%[^,]+]],
+// CHECK: store i64 [[ST_VAL]], i64* [[ST:%[^,]+]],
+// CHECK: store i32 [[LITER_VAL]], i32* [[LITER:%[^,]+]],
+// CHECK: [[LB_VAL:%.+]] = load i64, i64* [[LB]],
+// CHECK: [[LB_I32:%.+]] = trunc i64 [[LB_VAL]] to i32
+// CHECK: store i32 [[LB_I32]], i32* [[CNT:%.+]],
+// CHECK: br label
+// CHECK: [[VAL:%.+]] = load i32, i32* [[CNT]],{{.*}}!llvm.mem.parallel_loop_access [[LOOP2:!.+]]
+// CHECK: [[VAL_I64:%.+]] = sext i32 [[VAL]] to i64
+// CHECK: [[UB_VAL:%.+]] = load i64, i64* [[UB]],{{.*}}!llvm.mem.parallel_loop_access [[LOOP2]]
+// CHECK: [[CMP:%.+]] = icmp ule i64 [[VAL_I64]], [[UB_VAL]]
+// CHECK: br i1 [[CMP]], label %{{.+}}, label %{{.+}}
+// CHECK: load i32, i32* %{{.*}}!llvm.mem.parallel_loop_access [[LOOP2]]
+// CHECK: store i32 %{{.*}}!llvm.mem.parallel_loop_access [[LOOP2]]
+// CHECK: load i32, i32* %{{.*}}!llvm.mem.parallel_loop_access [[LOOP2]]
+// CHECK: add nsw i32 %{{.+}}, 1
+// CHECK: store i32 %{{.+}}, i32* %{{.*}}!llvm.mem.parallel_loop_access [[LOOP2]]
+// CHECK: br label %{{.*}}!llvm.loop [[LOOP2]]
+// CHECK: ret i32 0
+
+// CHECK: define internal i32 [[TASK3]](
+// CHECK: [[DOWN:%.+]] = getelementptr inbounds [[TD_TY:%.+]], [[TD_TY]]* %{{.+}}, i32 0, i32 5
+// CHECK: [[DOWN_VAL:%.+]] = load i64, i64* [[DOWN]],
+// CHECK: [[UP:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* %{{.+}}, i32 0, i32 6
+// CHECK: [[UP_VAL:%.+]] = load i64, i64* [[UP]],
+// CHECK: [[ST:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* %{{.+}}, i32 0, i32 7
+// CHECK: [[ST_VAL:%.+]] = load i64, i64* [[ST]],
+// CHECK: [[LITER:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* %{{.+}}, i32 0, i32 8
+// CHECK: [[LITER_VAL:%.+]] = load i32, i32* [[LITER]],
+// CHECK: store i64 [[DOWN_VAL]], i64* [[LB:%[^,]+]],
+// CHECK: store i64 [[UP_VAL]], i64* [[UB:%[^,]+]],
+// CHECK: store i64 [[ST_VAL]], i64* [[ST:%[^,]+]],
+// CHECK: store i32 [[LITER_VAL]], i32* [[LITER:%[^,]+]],
+// CHECK: [[LB_VAL:%.+]] = load i64, i64* [[LB]],
+// CHECK: store i64 [[LB_VAL]], i64* [[CNT:%.+]],
+// CHECK: br label
+// CHECK-NOT: !llvm.mem.parallel_loop_access
+// CHECK: br label %{{.*}}!llvm.loop
+// CHECK: ret i32 0
+
+// CHECK-LABEL: @_ZN1SC2Ei
+struct S {
+  int a;
+  S(int c) {
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%ident_t* [[DEFLOC:@.+]])
+// CHECK: [[TASKV:%.+]] = call i8* @__kmpc_omp_task_alloc(%ident_t* [[DEFLOC]], i32 [[GTID]], i32 1, i64 72, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, [[TDP_TY:%.+]]*)* [[TASK4:@.+]] to i32 (i32, i8*)*))
+// CHECK: [[TASK:%.+]] = bitcast i8* [[TASKV]] to [[TDP_TY]]*
+// CHECK: [[TASK_DATA:%.+]] = getelementptr inbounds [[TDP_TY]], [[TDP_TY]]* [[TASK]], i32 0, i32 0
+// CHECK: [[DOWN:%.+]] = getelementptr inbounds [[TD_TY:%.+]], [[TD_TY]]* [[TASK_DATA]], i32 0, i32 5
+// CHECK: store i64 0, i64* [[DOWN]],
+// CHECK: [[UP:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* [[TASK_DATA]], i32 0, i32 6
+// CHECK: store i64 %{{.+}}, i64* [[UP]],
+// CHECK: [[ST:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* [[TASK_DATA]], i32 0, i32 7
+// CHECK: store i64 1, i64* [[ST]],
+// CHECK: [[ST_VAL:%.+]] = load i64, i64* [[ST]],
+// CHECK: [[NUM_TASKS:%.+]] = zext i32 %{{.+}} to i64
+// CHECK: call void @__kmpc_taskloop(%ident_t* [[DEFLOC]], i32 [[GTID]], i8* [[TASKV]], i32 1, i64* [[DOWN]], i64* [[UP]], i64 [[ST_VAL]], i32 0, i32 2, i64 [[NUM_TASKS]], i8* null)
+#pragma omp taskloop simd shared(c) num_tasks(a) simdlen(64) safelen(8)
+    for (a = 0; a < c; ++a)
+      ;
+  }
+} s(1);
+
+// CHECK: define internal i32 [[TASK4]](
+// CHECK: [[DOWN:%.+]] = getelementptr inbounds [[TD_TY:%.+]], [[TD_TY]]* %{{.+}}, i32 0, i32 5
+// CHECK: [[DOWN_VAL:%.+]] = load i64, i64* [[DOWN]],
+// CHECK: [[UP:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* %{{.+}}, i32 0, i32 6
+// CHECK: [[UP_VAL:%.+]] = load i64, i64* [[UP]],
+// CHECK: [[ST:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* %{{.+}}, i32 0, i32 7
+// CHECK: [[ST_VAL:%.+]] = load i64, i64* [[ST]],
+// CHECK: [[LITER:%.+]] = getelementptr inbounds [[TD_TY]], [[TD_TY]]* %{{.+}}, i32 0, i32 8
+// CHECK: [[LITER_VAL:%.+]] = load i32, i32* [[LITER]],
+// CHECK: store i64 [[DOWN_VAL]], i64* [[LB:%[^,]+]],
+// CHECK: store i64 [[UP_VAL]], i64* [[UB:%[^,]+]],
+// CHECK: store i64 [[ST_VAL]], i64* [[ST:%[^,]+]],
+// CHECK: store i32 [[LITER_VAL]], i32* [[LITER:%[^,]+]],
+// CHECK: [[LB_VAL:%.+]] = load i64, i64* [[LB]],
+// CHECK: [[LB_I32:%.+]] = trunc i64 [[LB_VAL]] to i32
+// CHECK: store i32 [[LB_I32]], i32* [[CNT:%.+]],
+// CHECK: br label
+// CHECK: [[VAL:%.+]] = load i32, i32* [[CNT]],
+// CHECK: [[VAL_I64:%.+]] = sext i32 [[VAL]] to i64
+// CHECK: [[UB_VAL:%.+]] = load i64, i64* [[UB]],
+// CHECK: [[CMP:%.+]] = icmp ule i64 [[VAL_I64]], [[UB_VAL]]
+// CHECK: br i1 [[CMP]], label %{{.+}}, label %{{.+}}
+// CHECK: load i32, i32* %
+// CHECK-NOT: !llvm.mem.parallel_loop_access
+// CHECK: store i32 %
+// CHECK-NOT: !llvm.mem.parallel_loop_access
+// CHECK: load i32, i32* %
+// CHECK-NOT: !llvm.mem.parallel_loop_access
+// CHECK: add nsw i32 %{{.+}}, 1
+// CHECK: store i32 %{{.+}}, i32* %
+// CHECK-NOT: !llvm.mem.parallel_loop_access
+// CHECK: br label %{{.*}}!llvm.loop
+// CHECK: ret i32 0
+
+// CHECK: !{!"llvm.loop.vectorize.enable", i1 true}
+// CHECK: !{!"llvm.loop.vectorize.width", i32 4}
+// CHECK: !{!"llvm.loop.vectorize.width", i32 32}
+// CHECK: !{!"llvm.loop.vectorize.width", i32 64}
+
+#endif
diff --git a/test/OpenMP/taskloop_simd_collapse_messages.cpp b/test/OpenMP/taskloop_simd_collapse_messages.cpp
index d178c0834d626..e4ce0c1df41ab 100644
--- a/test/OpenMP/taskloop_simd_collapse_messages.cpp
+++ b/test/OpenMP/taskloop_simd_collapse_messages.cpp
@@ -1,8 +1,13 @@
 // RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
 
 void foo() {
 }
 
+#if __cplusplus >= 201103L
+// expected-note@+2 4 {{declared here}}
+#endif
 bool foobool(int argc) {
   return argc;
 }
@@ -29,14 +34,21 @@ T tmain(T argc, S **argv) { //expected-note 2 {{declared here}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp taskloop simd collapse ((ST > 0) ? 1 + ST : 2) // expected-note 2 {{as specified in 'collapse' clause}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST]; // expected-error 2 {{expected 2 for loops after '#pragma omp taskloop simd', but found only 1}}
-  // expected-error@+3 2 {{directive '#pragma omp taskloop simd' cannot contain more than one 'collapse' clause}}
-  // expected-error@+2 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+  // expected-error@+6 2 {{directive '#pragma omp taskloop simd' cannot contain more than one 'collapse' clause}}
+  // expected-error@+5 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   #pragma omp taskloop simd collapse (foobool(argc)), collapse (true), collapse (-5)
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp taskloop simd collapse (S) // expected-error {{'S' does not refer to a value}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp taskloop simd collapse (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp taskloop simd collapse (1)
@@ -59,16 +71,27 @@ int main(int argc, char **argv) {
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; // expected-error {{expected 4 for loops after '#pragma omp taskloop simd', but found only 1}}
   #pragma omp taskloop simd collapse (2+2)) // expected-warning {{extra tokens at the end of '#pragma omp taskloop simd' are ignored}}  expected-note {{as specified in 'collapse' clause}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4]; // expected-error {{expected 4 for loops after '#pragma omp taskloop simd', but found only 1}}
-  #pragma omp taskloop simd collapse (foobool(1) > 0 ? 1 : 2) // expected-error {{expression is not an integral constant expression}}
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+  #pragma omp taskloop simd collapse (foobool(1) > 0 ? 1 : 2)
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+3 {{expression is not an integral constant expression}}
+  // expected-error@+6 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   // expected-error@+2 2 {{directive '#pragma omp taskloop simd' cannot contain more than one 'collapse' clause}}
   // expected-error@+1 2 {{argument to 'collapse' clause must be a strictly positive integer value}}
   #pragma omp taskloop simd collapse (foobool(argc)), collapse (true), collapse (-5) 
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   #pragma omp taskloop simd collapse (S1) // expected-error {{'S1' does not refer to a value}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+1 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp taskloop simd collapse (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   // expected-error@+3 {{statement after '#pragma omp taskloop simd' must be a for loop}}
diff --git a/test/OpenMP/taskloop_simd_firstprivate_codegen.cpp b/test/OpenMP/taskloop_simd_firstprivate_codegen.cpp
new file mode 100644
index 0000000000000..0b87ddd9acb12
--- /dev/null
+++ b/test/OpenMP/taskloop_simd_firstprivate_codegen.cpp
@@ -0,0 +1,511 @@
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-apple-darwin10 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -DLAMBDA -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=LAMBDA %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -fblocks -DBLOCKS -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=BLOCKS %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -DARRAY -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=ARRAY %s
+// expected-no-diagnostics
+// REQUIRES: x86-registered-target
+// It doesn't pass on win32.
+// REQUIRES: shell
+#ifndef ARRAY
+#ifndef HEADER
+#define HEADER
+
+template <class T>
+struct S {
+  T f;
+  S(T a) : f(a) {}
+  S() : f() {}
+  S(const S &s, T t = T()) : f(s.f + t) {}
+  operator T() { return T(); }
+  ~S() {}
+};
+
+volatile double g;
+
+// CHECK-DAG: [[KMP_TASK_T_TY:%.+]] = type { i8*, i32 (i32, i8*)*, i32, %union{{.+}}, %union{{.+}}, i64, i64, i64, i32 }
+// CHECK-DAG: [[S_DOUBLE_TY:%.+]] = type { double }
+// CHECK-DAG: [[PRIVATES_MAIN_TY:%.+]] = type {{.?}}{ [2 x [[S_DOUBLE_TY]]], [[S_DOUBLE_TY]], i32, [2 x i32]
+// CHECK-DAG: [[CAP_MAIN_TY:%.+]] = type {{.*}}{ [2 x i32]*, i32, {{.*}}[2 x [[S_DOUBLE_TY]]]*, [[S_DOUBLE_TY]]*, i{{[0-9]+}}
+// CHECK-DAG: [[KMP_TASK_MAIN_TY:%.+]] = type { [[KMP_TASK_T_TY]], [[PRIVATES_MAIN_TY]] }
+// CHECK-DAG: [[S_INT_TY:%.+]] = type { i32 }
+// CHECK-DAG: [[CAP_TMAIN_TY:%.+]] = type { [2 x i32]*, i32*, [2 x [[S_INT_TY]]]*, [[S_INT_TY]]* }
+// CHECK-DAG: [[PRIVATES_TMAIN_TY:%.+]] = type { i32, [2 x i32], [2 x [[S_INT_TY]]], [[S_INT_TY]], [104 x i8] }
+// CHECK-DAG: [[KMP_TASK_TMAIN_TY:%.+]] = type { [[KMP_TASK_T_TY]], [{{[0-9]+}} x i8], [[PRIVATES_TMAIN_TY]] }
+template <typename T>
+T tmain() {
+  S<T> ttt;
+  S<T> test(ttt);
+  T t_var __attribute__((aligned(128))) = T();
+  T vec[] = {1, 2};
+  S<T> s_arr[] = {1, 2};
+  S<T> var(3);
+#pragma omp taskloop simd firstprivate(t_var, vec, s_arr, s_arr, var, var)
+  for (int i = 0; i < 10; ++i) {
+    vec[0] = t_var;
+    s_arr[0] = var;
+  }
+  return T();
+}
+
+int main() {
+  static int sivar;
+#ifdef LAMBDA
+  // LAMBDA: [[G:@.+]] = global double
+  // LAMBDA: [[SIVAR:@.+]] = internal global i{{[0-9]+}} 0,
+  // LAMBDA-LABEL: @main
+  // LAMBDA: call{{( x86_thiscallcc)?}} void [[OUTER_LAMBDA:@.+]](
+  [&]() {
+  // LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
+  // LAMBDA: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 88, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+// LAMBDA: [[PRIVATES:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1
+// LAMBDA: [[G_PRIVATE_ADDR:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[PRIVATES]], i{{.+}} 0, i{{.+}} 0
+// LAMBDA: [[G_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 0
+// LAMBDA: [[G_VAL:%.+]] = load volatile double, double* [[G_ADDR_REF]]
+// LAMBDA: store volatile double [[G_VAL]], double* [[G_PRIVATE_ADDR]]
+
+// LAMBDA: [[SIVAR_PRIVATE_ADDR:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[PRIVATES]], i{{.+}} 0, i{{.+}} 1
+// LAMBDA: [[SIVAR_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1
+// LAMBDA: [[SIVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[SIVAR_ADDR_REF]]
+// LAMBDA: store i{{[0-9]+}} [[SIVAR_VAL]], i{{[0-9]+}}* [[SIVAR_PRIVATE_ADDR]]
+
+// LAMBDA: call void @__kmpc_taskloop(%{{.+}}* @{{.+}}, i32 %{{.+}}, i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 0, i32 0, i64 0, i8* null)
+// LAMBDA: ret
+#pragma omp taskloop simd firstprivate(g, sivar)
+  for (int i = 0; i < 10; ++i) {
+    // LAMBDA: define {{.+}} void [[INNER_LAMBDA:@.+]](%{{.+}}* [[ARG_PTR:%.+]])
+    // LAMBDA: store %{{.+}}* [[ARG_PTR]], %{{.+}}** [[ARG_PTR_REF:%.+]],
+    // LAMBDA: [[ARG_PTR:%.+]] = load %{{.+}}*, %{{.+}}** [[ARG_PTR_REF]]
+    // LAMBDA: [[G_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+    // LAMBDA: [[G_REF:%.+]] = load double*, double** [[G_PTR_REF]]
+    // LAMBDA: store double 2.0{{.+}}, double* [[G_REF]]
+
+    // LAMBDA: store double* %{{.+}}, double** %{{.+}},
+    // LAMBDA: define internal i32 [[TASK_ENTRY]](i32, %{{.+}}* noalias)
+    g = 1;
+    sivar = 11;
+    // LAMBDA: store double 1.0{{.+}}, double* %{{.+}},
+    // LAMBDA: store i{{[0-9]+}} 11, i{{[0-9]+}}* %{{.+}},
+    // LAMBDA: call void [[INNER_LAMBDA]](%
+    // LAMBDA: ret
+    [&]() {
+      g = 2;
+      sivar = 22;
+    }();
+  }
+  }();
+  return 0;
+#elif defined(BLOCKS)
+  // BLOCKS: [[G:@.+]] = global double
+  // BLOCKS-LABEL: @main
+  // BLOCKS: call void {{%.+}}(i8
+  ^{
+  // BLOCKS: define{{.*}} internal{{.*}} void {{.+}}(i8*
+  // BLOCKS: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 88, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+  // BLOCKS: [[PRIVATES:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1
+  // BLOCKS: [[G_PRIVATE_ADDR:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[PRIVATES]], i{{.+}} 0, i{{.+}} 0
+  // BLOCKS: [[G_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 0
+  // BLOCKS: [[G_VAL:%.+]] = load volatile double, double* [[G_ADDR_REF]]
+  // BLOCKS: store volatile double [[G_VAL]], double* [[G_PRIVATE_ADDR]]
+
+  // BLOCKS: [[SIVAR_PRIVATE_ADDR:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[PRIVATES]], i{{.+}} 0, i{{.+}} 1
+  // BLOCKS: [[SIVAR_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1
+  // BLOCKS: [[SIVAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[SIVAR_ADDR_REF]]
+  // BLOCKS: store i{{[0-9]+}} [[SIVAR_VAL]], i{{[0-9]+}}* [[SIVAR_PRIVATE_ADDR]]
+  // BLOCKS: call void @__kmpc_taskloop(%{{.+}}* @{{.+}}, i32 %{{.+}}, i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 0, i32 0, i64 0, i8* null)
+  // BLOCKS: ret
+#pragma omp taskloop simd firstprivate(g, sivar)
+  for (int i = 0; i < 10; ++i) {
+    // BLOCKS: define {{.+}} void {{@.+}}(i8*
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS: store double 2.0{{.+}}, double*
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS-NOT: [[ISVAR]]{{[[^:word:]]}}
+    // BLOCKS: store i{{[0-9]+}} 22, i{{[0-9]+}}*
+    // BLOCKS-NOT: [[SIVAR]]{{[[^:word:]]}}
+    // BLOCKS: ret
+
+    // BLOCKS: store double* %{{.+}}, double** %{{.+}},
+    // BLOCKS: store i{{[0-9]+}}* %{{.+}}, i{{[0-9]+}}** %{{.+}},
+    // BLOCKS: define internal i32 [[TASK_ENTRY]](i32, %{{.+}}* noalias)
+    g = 1;
+    sivar = 11;
+    // BLOCKS: store double 1.0{{.+}}, double* %{{.+}},
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS: store i{{[0-9]+}} 11, i{{[0-9]+}}* %{{.+}},
+    // BLOCKS-NOT: [[SIVAR]]{{[[^:word:]]}}
+    // BLOCKS: call void {{%.+}}(i8
+    ^{
+      g = 2;
+      sivar = 22;
+    }();
+  }
+  }();
+  return 0;
+#else
+  S<double> ttt;
+  S<double> test(ttt);
+  int t_var = 0;
+  int vec[] = {1, 2};
+  S<double> s_arr[] = {1, 2};
+  S<double> var(3);
+#pragma omp taskloop simd firstprivate(var, t_var, s_arr, vec, s_arr, var, sivar)
+  for (int i = 0; i < 10; ++i) {
+    vec[0] = t_var;
+    s_arr[0] = var;
+    sivar = 33;
+  }
+  return tmain<int>();
+#endif
+}
+
+// CHECK: [[SIVAR:.+]] = internal global i{{[0-9]+}} 0,
+// CHECK: define i{{[0-9]+}} @main()
+// CHECK: alloca [[S_DOUBLE_TY]],
+// CHECK: [[TEST:%.+]] = alloca [[S_DOUBLE_TY]],
+// CHECK: [[T_VAR_ADDR:%.+]] = alloca i32,
+// CHECK: [[VEC_ADDR:%.+]] = alloca [2 x i32],
+// CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_DOUBLE_TY]]],
+// CHECK: [[VAR_ADDR:%.+]] = alloca [[S_DOUBLE_TY]],
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[LOC:%.+]])
+
+// CHECK: call {{.*}} [[S_DOUBLE_TY_COPY_CONSTR:@.+]]([[S_DOUBLE_TY]]* [[TEST]],
+
+// Store original variables in capture struct.
+// CHECK: [[VEC_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: store [2 x i32]* [[VEC_ADDR]], [2 x i32]** [[VEC_REF]],
+// CHECK: [[T_VAR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK: [[T_VAR_VAL:%.+]] = load i32, i32* [[T_VAR_ADDR]],
+// CHECK: store i32 [[T_VAR_VAL]], i32* [[T_VAR_REF]],
+// CHECK: [[S_ARR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 3
+// CHECK: store [2 x [[S_DOUBLE_TY]]]* [[S_ARR_ADDR]], [2 x [[S_DOUBLE_TY]]]** [[S_ARR_REF]],
+// CHECK: [[VAR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 4
+// CHECK: store [[S_DOUBLE_TY]]* [[VAR_ADDR]], [[S_DOUBLE_TY]]** [[VAR_REF]],
+// CHECK: [[SIVAR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 5
+// CHECK: [[SIVAR_VAL:%.+]] = load i32, i32* [[SIVAR]],
+// CHECK: store i{{[0-9]+}} [[SIVAR_VAL]], i{{[0-9]+}}* [[SIVAR_REF]],
+
+// Allocate task.
+// Returns struct kmp_task_t {
+//         [[KMP_TASK_T]] task_data;
+//         [[KMP_TASK_MAIN_TY]] privates;
+//       };
+// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc([[LOC]], i32 [[GTID]], i32 9, i64 112, i64 40, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[RES_KMP_TASK:%.+]] = bitcast i8* [[RES]] to [[KMP_TASK_MAIN_TY]]*
+
+// Fill kmp_task_t->shareds by copying from original capture argument.
+// CHECK: [[TASK:%.+]] = getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[SHAREDS_REF_ADDR:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS_REF_ADDR]],
+// CHECK: [[CAPTURES_ADDR:%.+]] = bitcast [[CAP_MAIN_TY]]* %{{.+}} to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[SHAREDS_REF]], i8* [[CAPTURES_ADDR]], i64 40, i32 8, i1 false)
+
+// Initialize kmp_task_t->privates with default values (no init for simple types, default constructors for classes).
+// Also copy address of private copy to the corresponding shareds reference.
+// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK: [[SHAREDS:%.+]] = bitcast i8* [[SHAREDS_REF]] to [[CAP_MAIN_TY]]*
+
+// Constructors for s_arr and var.
+// s_arr;
+// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[S_ARR_ADDR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 3
+// CHECK: load [2 x [[S_DOUBLE_TY]]]*, [2 x [[S_DOUBLE_TY]]]** [[S_ARR_ADDR_REF]],
+// CHECK: call void [[S_DOUBLE_TY_COPY_CONSTR]]([[S_DOUBLE_TY]]* [[S_ARR_CUR:%[^,]+]],
+// CHECK: getelementptr [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* [[S_ARR_CUR]], i{{.+}} 1
+// CHECK: getelementptr [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i{{.+}} 1
+// CHECK: icmp eq
+// CHECK: br i1
+
+// var;
+// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 1
+// CHECK: [[VAR_ADDR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 4
+// CHECK: [[VAR_REF:%.+]] = load [[S_DOUBLE_TY]]*, [[S_DOUBLE_TY]]** [[VAR_ADDR_REF]],
+// CHECK: call void [[S_DOUBLE_TY_COPY_CONSTR]]([[S_DOUBLE_TY]]* [[PRIVATE_VAR_REF]], [[S_DOUBLE_TY]]* {{.*}}[[VAR_REF]],
+
+// t_var;
+// CHECK: [[PRIVATE_T_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK: [[T_VAR_ADDR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 1
+// CHECK: [[T_VAR:%.+]] = load i{{.+}}, i{{.+}}* [[T_VAR_ADDR_REF]],
+// CHECK: store i32 [[T_VAR]], i32* [[PRIVATE_T_VAR_REF]],
+
+// vec;
+// CHECK: [[PRIVATE_VEC_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: [[VEC_ADDR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 0
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(
+
+// sivar;
+// CHECK: [[PRIVATE_SIVAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 4
+// CHECK: [[SIVAR_ADDR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 5
+// CHECK: [[SIVAR:%.+]] = load i{{.+}}, i{{.+}}* [[SIVAR_ADDR_REF]],
+// CHECK: store i32 [[SIVAR]], i32* [[PRIVATE_SIVAR_REF]],
+
+// Provide pointer to destructor function, which will destroy private variables at the end of the task.
+// CHECK: [[DESTRUCTORS_REF:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{.+}} 0, i{{.+}} 3
+// CHECK: [[DESTRUCTORS_PTR:%.+]] = bitcast %union{{.+}}* [[DESTRUCTORS_REF]] to i32 (i32, i8*)**
+// CHECK: store i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[DESTRUCTORS:@.+]] to i32 (i32, i8*)*), i32 (i32, i8*)** [[DESTRUCTORS_PTR]],
+
+// Start task.
+// CHECK: call void @__kmpc_taskloop([[LOC]], i32 [[GTID]], i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 0, i32 0, i64 0, i8* bitcast (void ([[KMP_TASK_MAIN_TY]]*, [[KMP_TASK_MAIN_TY]]*, i32)* [[MAIN_DUP:@.+]] to i8*))
+
+// CHECK: = call i{{.+}} [[TMAIN_INT:@.+]]()
+
+// No destructors must be called for private copies of s_arr and var.
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: call void [[S_DOUBLE_TY_DESTR:@.+]]([[S_DOUBLE_TY]]*
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: ret
+//
+
+// CHECK: define internal void [[PRIVATES_MAP_FN:@.+]]([[PRIVATES_MAIN_TY]]* noalias, [[S_DOUBLE_TY]]** noalias, i32** noalias, [2 x [[S_DOUBLE_TY]]]** noalias, [2 x i32]** noalias, i32** noalias)
+// CHECK: [[PRIVATES:%.+]] = load [[PRIVATES_MAIN_TY]]*, [[PRIVATES_MAIN_TY]]**
+// CHECK: [[PRIV_S_VAR:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 0
+// CHECK: [[ARG3:%.+]] = load [2 x [[S_DOUBLE_TY]]]**, [2 x [[S_DOUBLE_TY]]]*** %{{.+}},
+// CHECK: store [2 x [[S_DOUBLE_TY]]]* [[PRIV_S_VAR]], [2 x [[S_DOUBLE_TY]]]** [[ARG3]],
+// CHECK: [[PRIV_VAR:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 1
+// CHECK: [[ARG1:%.+]] = load [[S_DOUBLE_TY]]**, [[S_DOUBLE_TY]]*** {{.+}},
+// CHECK: store [[S_DOUBLE_TY]]* [[PRIV_VAR]], [[S_DOUBLE_TY]]** [[ARG1]],
+// CHECK: [[PRIV_T_VAR:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 2
+// CHECK: [[ARG2:%.+]] = load i32**, i32*** %{{.+}},
+// CHECK: store i32* [[PRIV_T_VAR]], i32** [[ARG2]],
+// CHECK: [[PRIV_VEC:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 3
+// CHECK: [[ARG4:%.+]] = load [2 x i32]**, [2 x i32]*** %{{.+}},
+// CHECK: store [2 x i32]* [[PRIV_VEC]], [2 x i32]** [[ARG4]],
+// CHECK: [[PRIV_SIVAR:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 4
+// CHECK: [[ARG5:%.+]] = load i{{[0-9]+}}**, i{{[0-9]+}}*** %{{.+}},
+// CHECK: store i{{[0-9]+}}* [[PRIV_SIVAR]], i{{[0-9]+}}** [[ARG5]],
+// CHECK: ret void
+
+// CHECK: define internal i32 [[TASK_ENTRY]](i32, [[KMP_TASK_MAIN_TY]]* noalias)
+
+// CHECK: [[PRIV_VAR_ADDR:%.+]] = alloca [[S_DOUBLE_TY]]*,
+// CHECK: [[PRIV_T_VAR_ADDR:%.+]] = alloca i32*,
+// CHECK: [[PRIV_S_ARR_ADDR:%.+]] = alloca [2 x [[S_DOUBLE_TY]]]*,
+// CHECK: [[PRIV_VEC_ADDR:%.+]] = alloca [2 x i32]*,
+// CHECK: [[PRIV_SIVAR_ADDR:%.+]] = alloca i32*,
+// CHECK: store void (i8*, ...)* bitcast (void ([[PRIVATES_MAIN_TY]]*, [[S_DOUBLE_TY]]**, i32**, [2 x [[S_DOUBLE_TY]]]**, [2 x i32]**, i32**)* [[PRIVATES_MAP_FN]] to void (i8*, ...)*), void (i8*, ...)** [[MAP_FN_ADDR:%.+]],
+// CHECK: [[MAP_FN:%.+]] = load void (i8*, ...)*, void (i8*, ...)** [[MAP_FN_ADDR]],
+
+// CHECK: call void (i8*, ...) [[MAP_FN]](i8* %{{.+}}, [[S_DOUBLE_TY]]** [[PRIV_VAR_ADDR]], i32** [[PRIV_T_VAR_ADDR]], [2 x [[S_DOUBLE_TY]]]** [[PRIV_S_ARR_ADDR]], [2 x i32]** [[PRIV_VEC_ADDR]], i32** [[PRIV_SIVAR_ADDR]])
+
+// CHECK: [[PRIV_VAR:%.+]] = load [[S_DOUBLE_TY]]*, [[S_DOUBLE_TY]]** [[PRIV_VAR_ADDR]],
+// CHECK: [[PRIV_T_VAR:%.+]] = load i32*, i32** [[PRIV_T_VAR_ADDR]],
+// CHECK: [[PRIV_S_ARR:%.+]] = load [2 x [[S_DOUBLE_TY]]]*, [2 x [[S_DOUBLE_TY]]]** [[PRIV_S_ARR_ADDR]],
+// CHECK: [[PRIV_VEC:%.+]] = load [2 x i32]*, [2 x i32]** [[PRIV_VEC_ADDR]],
+// CHECK: [[PRIV_SIVAR:%.+]] = load i32*, i32** [[PRIV_SIVAR_ADDR]],
+
+// Privates actually are used.
+// CHECK-DAG: [[PRIV_VAR]]
+// CHECK-DAG: [[PRIV_T_VAR]]
+// CHECK-DAG: [[PRIV_S_ARR]]
+// CHECK-DAG: [[PRIV_VEC]]
+// CHECK-DAG: [[PRIV_SIVAR]]
+
+// CHECK: ret
+
+// CHECK: define internal void [[MAIN_DUP]]([[KMP_TASK_MAIN_TY]]*, [[KMP_TASK_MAIN_TY]]*, i32)
+// CHECK: getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* %{{.+}}, i32 0, i32 1
+// CHECK: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* %{{.+}}, i32 0, i32 0
+// CHECK: getelementptr inbounds [2 x [[S_DOUBLE_TY]]], [2 x [[S_DOUBLE_TY]]]* %{{.+}}, i32 0, i32 0
+// CHECK: getelementptr [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i64 2
+// CHECK: br i1 %
+
+// CHECK: phi [[S_DOUBLE_TY]]*
+// CHECK: call {{.*}} [[S_DOUBLE_TY_COPY_CONSTR]]([[S_DOUBLE_TY]]*
+// CHECK: getelementptr [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i32 1
+// CHECK: icmp eq [[S_DOUBLE_TY]]* %
+// CHECK: br i1 %
+
+// CHECK: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* %{{.+}}, i32 0, i32 1
+// CHECK: call {{.*}} [[S_DOUBLE_TY_COPY_CONSTR]]([[S_DOUBLE_TY]]*
+// CHECK: ret void
+
+// CHECK: define internal i32 [[DESTRUCTORS]](i32, [[KMP_TASK_MAIN_TY]]* noalias)
+// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* [[RES_KMP_TASK:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 0
+// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 1
+// CHECK: call void [[S_DOUBLE_TY_DESTR]]([[S_DOUBLE_TY]]* [[PRIVATE_VAR_REF]])
+// CHECK: getelementptr inbounds [2 x [[S_DOUBLE_TY]]], [2 x [[S_DOUBLE_TY]]]* [[PRIVATE_S_ARR_REF]], i{{.+}} 0, i{{.+}} 0
+// CHECK: getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i{{.+}} 2
+// CHECK: [[PRIVATE_S_ARR_ELEM_REF:%.+]] = getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i{{.+}} -1
+// CHECK: call void [[S_DOUBLE_TY_DESTR]]([[S_DOUBLE_TY]]* [[PRIVATE_S_ARR_ELEM_REF]])
+// CHECK: icmp eq
+// CHECK: br i1
+// CHECK: ret i32
+
+// CHECK: define {{.*}} i{{[0-9]+}} [[TMAIN_INT]]()
+// CHECK: alloca [[S_INT_TY]],
+// CHECK: [[TEST:%.+]] = alloca [[S_INT_TY]],
+// CHECK: [[T_VAR_ADDR:%.+]] = alloca i32, align 128
+// CHECK: [[VEC_ADDR:%.+]] = alloca [2 x i32],
+// CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]],
+// CHECK: [[VAR_ADDR:%.+]] = alloca [[S_INT_TY]],
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[LOC:%.+]])
+
+// CHECK: call {{.*}} [[S_INT_TY_COPY_CONSTR:@.+]]([[S_INT_TY]]* [[TEST]],
+
+// Store original variables in capture struct.
+// CHECK: [[VEC_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: store [2 x i32]* [[VEC_ADDR]], [2 x i32]** [[VEC_REF]],
+// CHECK: [[T_VAR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK: store i32* [[T_VAR_ADDR]], i32** [[T_VAR_REF]],
+// CHECK: [[S_ARR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK: store [2 x [[S_INT_TY]]]* [[S_ARR_ADDR]], [2 x [[S_INT_TY]]]** [[S_ARR_REF]],
+// CHECK: [[VAR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 3
+// CHECK: store [[S_INT_TY]]* [[VAR_ADDR]], [[S_INT_TY]]** [[VAR_REF]],
+
+// Allocate task.
+// Returns struct kmp_task_t {
+//         [[KMP_TASK_T_TY]] task_data;
+//         [[KMP_TASK_TMAIN_TY]] privates;
+//       };
+// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc([[LOC]], i32 [[GTID]], i32 9, i64 256, i64 32, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_TMAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[RES_KMP_TASK:%.+]] = bitcast i8* [[RES]] to [[KMP_TASK_TMAIN_TY]]*
+
+// Fill kmp_task_t->shareds by copying from original capture argument.
+// CHECK: [[TASK:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[SHAREDS_REF_ADDR:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS_REF_ADDR]],
+// CHECK: [[CAPTURES_ADDR:%.+]] = bitcast [[CAP_TMAIN_TY]]* %{{.+}} to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[SHAREDS_REF]], i8* [[CAPTURES_ADDR]], i64 32, i32 8, i1 false)
+
+// Initialize kmp_task_t->privates with default values (no init for simple types, default constructors for classes).
+// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK: [[SHAREDS:%.+]] = bitcast i8* [[SHAREDS_REF]] to [[CAP_TMAIN_TY]]*
+
+// t_var;
+// CHECK: [[PRIVATE_T_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 0
+// CHECK: [[T_VAR_ADDR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 1
+// CHECK: [[T_VAR_REF:%.+]] = load i{{.+}}*, i{{.+}}** [[T_VAR_ADDR_REF]],
+// CHECK: [[T_VAR:%.+]] = load i{{.+}}, i{{.+}}* [[T_VAR_REF]], align 128
+// CHECK: store i32 [[T_VAR]], i32* [[PRIVATE_T_VAR_REF]], align 128
+
+// vec;
+// CHECK: [[PRIVATE_VEC_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 1
+// CHECK: [[VEC_ADDR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 0
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(
+
+// Constructors for s_arr and var.
+// a_arr;
+// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK: [[S_ARR_ADDR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 2
+// CHECK: getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[PRIVATE_S_ARR_REF]], i{{.+}} 0, i{{.+}} 0
+// CHECK: getelementptr [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i{{.+}} 2
+// CHECK: call void [[S_INT_TY_COPY_CONSTR]]([[S_INT_TY]]* [[S_ARR_CUR:%[^,]+]],
+// CHECK: getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_CUR]], i{{.+}} 1
+// CHECK: icmp eq
+// CHECK: br i1
+
+// var;
+// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: [[VAR_ADDR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* [[SHAREDS]], i{{.+}} 0, i{{.+}} 3
+// CHECK: call void [[S_INT_TY_COPY_CONSTR]]([[S_INT_TY]]* [[PRIVATE_VAR_REF]],
+
+// Provide pointer to destructor function, which will destroy private variables at the end of the task.
+// CHECK: [[DESTRUCTORS_REF:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{.+}} 0, i{{.+}} 3
+// CHECK: [[DESTRUCTORS_PTR:%.+]] = bitcast %union{{.+}}* [[DESTRUCTORS_REF]] to i32 (i32, i8*)**
+// CHECK: store i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_TMAIN_TY]]*)* [[DESTRUCTORS:@.+]] to i32 (i32, i8*)*), i32 (i32, i8*)** [[DESTRUCTORS_PTR]],
+
+// Start task.
+// CHECK: call void @__kmpc_taskloop([[LOC]], i32 [[GTID]], i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 0, i32 0, i64 0, i8* bitcast (void ([[KMP_TASK_TMAIN_TY]]*, [[KMP_TASK_TMAIN_TY]]*, i32)* [[TMAIN_DUP:@.+]] to i8*))
+
+// No destructors must be called for private copies of s_arr and var.
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: call void [[S_INT_TY_DESTR:@.+]]([[S_INT_TY]]*
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: ret
+//
+
+// CHECK: define internal void [[PRIVATES_MAP_FN:@.+]]([[PRIVATES_TMAIN_TY]]* noalias, i32** noalias, [2 x i32]** noalias, [2 x [[S_INT_TY]]]** noalias, [[S_INT_TY]]** noalias)
+// CHECK: [[PRIVATES:%.+]] = load [[PRIVATES_TMAIN_TY]]*, [[PRIVATES_TMAIN_TY]]**
+// CHECK: [[PRIV_T_VAR:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 0
+// CHECK: [[ARG1:%.+]] = load i32**, i32*** %{{.+}},
+// CHECK: store i32* [[PRIV_T_VAR]], i32** [[ARG1]],
+// CHECK: [[PRIV_VEC:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 1
+// CHECK: [[ARG2:%.+]] = load [2 x i32]**, [2 x i32]*** %{{.+}},
+// CHECK: store [2 x i32]* [[PRIV_VEC]], [2 x i32]** [[ARG2]],
+// CHECK: [[PRIV_S_VAR:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 2
+// CHECK: [[ARG3:%.+]] = load [2 x [[S_INT_TY]]]**, [2 x [[S_INT_TY]]]*** %{{.+}},
+// CHECK: store [2 x [[S_INT_TY]]]* [[PRIV_S_VAR]], [2 x [[S_INT_TY]]]** [[ARG3]],
+// CHECK: [[PRIV_VAR:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 3
+// CHECK: [[ARG4:%.+]] = load [[S_INT_TY]]**, [[S_INT_TY]]*** {{.+}},
+// CHECK: store [[S_INT_TY]]* [[PRIV_VAR]], [[S_INT_TY]]** [[ARG4]],
+// CHECK: ret void
+
+// CHECK: define internal i32 [[TASK_ENTRY]](i32, [[KMP_TASK_TMAIN_TY]]* noalias)
+// CHECK: alloca i32*,
+// CHECK-DAG: [[PRIV_T_VAR_ADDR:%.+]] = alloca i32*,
+// CHECK-DAG: [[PRIV_VEC_ADDR:%.+]] = alloca [2 x i32]*,
+// CHECK-DAG: [[PRIV_S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]]*,
+// CHECK-DAG: [[PRIV_VAR_ADDR:%.+]] = alloca [[S_INT_TY]]*,
+// CHECK: store void (i8*, ...)* bitcast (void ([[PRIVATES_TMAIN_TY]]*, i32**, [2 x i32]**, [2 x [[S_INT_TY]]]**, [[S_INT_TY]]**)* [[PRIVATES_MAP_FN]] to void (i8*, ...)*), void (i8*, ...)** [[MAP_FN_ADDR:%.+]],
+// CHECK: [[MAP_FN:%.+]] = load void (i8*, ...)*, void (i8*, ...)** [[MAP_FN_ADDR]],
+// CHECK: call void (i8*, ...) [[MAP_FN]](i8* %{{.+}}, i32** [[PRIV_T_VAR_ADDR]], [2 x i32]** [[PRIV_VEC_ADDR]], [2 x [[S_INT_TY]]]** [[PRIV_S_ARR_ADDR]], [[S_INT_TY]]** [[PRIV_VAR_ADDR]])
+// CHECK: [[PRIV_T_VAR:%.+]] = load i32*, i32** [[PRIV_T_VAR_ADDR]],
+// CHECK: [[PRIV_VEC:%.+]] = load [2 x i32]*, [2 x i32]** [[PRIV_VEC_ADDR]],
+// CHECK: [[PRIV_S_ARR:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[PRIV_S_ARR_ADDR]],
+// CHECK: [[PRIV_VAR:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[PRIV_VAR_ADDR]],
+
+// Privates actually are used.
+// CHECK-DAG: [[PRIV_VAR]]
+// CHECK-DAG: [[PRIV_T_VAR]]
+// CHECK-DAG: [[PRIV_S_ARR]]
+// CHECK-DAG: [[PRIV_VEC]]
+
+// CHECK: ret
+
+// CHECK: define internal void [[TMAIN_DUP]]([[KMP_TASK_TMAIN_TY]]*, [[KMP_TASK_TMAIN_TY]]*, i32)
+// CHECK: getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* %{{.+}}, i32 0, i32 2
+// CHECK: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* %{{.+}}, i32 0, i32 2
+// CHECK: getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* %{{.+}}, i32 0, i32 0
+// CHECK: getelementptr [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i64 2
+// CHECK: br i1 %
+
+// CHECK: phi [[S_INT_TY]]*
+// CHECK: call {{.*}} [[S_INT_TY_COPY_CONSTR]]([[S_INT_TY]]*
+// CHECK: getelementptr [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i32 1
+// CHECK: icmp eq [[S_INT_TY]]* %
+// CHECK: br i1 %
+
+// CHECK: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* %{{.+}}, i32 0, i32 3
+// CHECK: call {{.*}} [[S_INT_TY_COPY_CONSTR]]([[S_INT_TY]]*
+// CHECK: ret void
+
+// CHECK: define internal i32 [[DESTRUCTORS]](i32, [[KMP_TASK_TMAIN_TY]]* noalias)
+// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: call void [[S_INT_TY_DESTR]]([[S_INT_TY]]* [[PRIVATE_VAR_REF]])
+// CHECK: getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[PRIVATE_S_ARR_REF]], i{{.+}} 0, i{{.+}} 0
+// CHECK: getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i{{.+}} 2
+// CHECK: [[PRIVATE_S_ARR_ELEM_REF:%.+]] = getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i{{.+}} -1
+// CHECK: call void [[S_INT_TY_DESTR]]([[S_INT_TY]]* [[PRIVATE_S_ARR_ELEM_REF]])
+// CHECK: icmp eq
+// CHECK: br i1
+// CHECK: ret i32
+
+#endif
+#else
+// ARRAY-LABEL: array_func
+struct St {
+  int a, b;
+  St() : a(0), b(0) {}
+  St(const St &) {}
+  ~St() {}
+};
+
+void array_func(int n, float a[n], St s[2]) {
+// ARRAY: call i8* @__kmpc_omp_task_alloc(
+// ARRAY: call void @__kmpc_taskloop(
+// ARRAY: store float** %{{.+}}, float*** %{{.+}},
+// ARRAY: store %struct.St** %{{.+}}, %struct.St*** %{{.+}},
+#pragma omp taskloop simd firstprivate(a, s)
+  for (int i = 0; i < 10; ++i)
+    ;
+}
+#endif
+
diff --git a/test/OpenMP/taskloop_simd_firstprivate_messages.cpp b/test/OpenMP/taskloop_simd_firstprivate_messages.cpp
index 83946695204b8..18cefc1beeb25 100644
--- a/test/OpenMP/taskloop_simd_firstprivate_messages.cpp
+++ b/test/OpenMP/taskloop_simd_firstprivate_messages.cpp
@@ -295,9 +295,9 @@ int main(int argc, char **argv) {
 #pragma omp taskloop simd firstprivate(i) // expected-note {{defined as firstprivate}}
   for (i = 0; i < argc; ++i) // expected-error {{loop iteration variable in the associated loop of 'omp taskloop simd' directive may not be firstprivate, predetermined as linear}}
     foo();
-#pragma omp parallel reduction(+ : i)
-#pragma omp taskloop simd firstprivate(i) // expected-note {{defined as firstprivate}}
-  for (i = 0; i < argc; ++i) // expected-error {{loop iteration variable in the associated loop of 'omp taskloop simd' directive may not be firstprivate, predetermined as linear}}
+#pragma omp parallel reduction(+ : i) // expected-note 4 {{defined as reduction}}
+#pragma omp taskloop simd firstprivate(i) // expected-error {{argument of a reduction clause of a parallel construct must not appear in a firstprivate clause on a task construct}}
+  for (i = 0; i < argc; ++i) // expected-error 3 {{reduction variables may not be accessed in an explicit task}}
     foo();
 #pragma omp parallel
 #pragma omp taskloop simd firstprivate(B::x) // expected-error {{threadprivate or thread local variable cannot be firstprivate}}
diff --git a/test/OpenMP/taskloop_simd_lastprivate_codegen.cpp b/test/OpenMP/taskloop_simd_lastprivate_codegen.cpp
new file mode 100644
index 0000000000000..e3562a8f0c62d
--- /dev/null
+++ b/test/OpenMP/taskloop_simd_lastprivate_codegen.cpp
@@ -0,0 +1,519 @@
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-apple-darwin10 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -DLAMBDA -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=LAMBDA %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -fblocks -DBLOCKS -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=BLOCKS %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -DARRAY -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=ARRAY %s
+// expected-no-diagnostics
+// REQUIRES: x86-registered-target
+// It doesn't pass on win32.
+// REQUIRES: shell
+#ifndef ARRAY
+#ifndef HEADER
+#define HEADER
+
+template <class T>
+struct S {
+  T f;
+  S(T a) : f(a) {}
+  S() : f() {}
+  S(const S &s, T t = T()) : f(s.f + t) {}
+  operator T() { return T(); }
+  ~S() {}
+};
+
+volatile double g;
+
+// CHECK-DAG: [[KMP_TASK_T_TY:%.+]] = type { i8*, i32 (i32, i8*)*, i32, %union{{.+}}, %union{{.+}}, i64, i64, i64, i32 }
+// CHECK-DAG: [[S_DOUBLE_TY:%.+]] = type { double }
+// CHECK-DAG: [[PRIVATES_MAIN_TY:%.+]] = type {{.?}}{ [2 x [[S_DOUBLE_TY]]], [[S_DOUBLE_TY]], i32, [2 x i32]
+// CHECK-DAG: [[CAP_MAIN_TY:%.+]] = type { [2 x i32]*, i32*, [2 x [[S_DOUBLE_TY]]]*, [[S_DOUBLE_TY]]*, i{{[0-9]+}}* }
+// CHECK-DAG: [[KMP_TASK_MAIN_TY:%.+]] = type { [[KMP_TASK_T_TY]], [[PRIVATES_MAIN_TY]] }
+// CHECK-DAG: [[S_INT_TY:%.+]] = type { i32 }
+// CHECK-DAG: [[CAP_TMAIN_TY:%.+]] = type { [2 x i32]*, i32*, [2 x [[S_INT_TY]]]*, [[S_INT_TY]]* }
+// CHECK-DAG: [[PRIVATES_TMAIN_TY:%.+]] = type { i32, [2 x i32], [2 x [[S_INT_TY]]], [[S_INT_TY]], [104 x i8] }
+// CHECK-DAG: [[KMP_TASK_TMAIN_TY:%.+]] = type { [[KMP_TASK_T_TY]], [{{[0-9]+}} x i8], [[PRIVATES_TMAIN_TY]] }
+template <typename T>
+T tmain() {
+  S<T> ttt;
+  S<T> test;
+  T t_var __attribute__((aligned(128))) = T();
+  T vec[] = {1, 2};
+  S<T> s_arr[] = {1, 2};
+  S<T> var(3);
+#pragma omp taskloop simd lastprivate(t_var, vec, s_arr, s_arr, var, var)
+  for (int i = 0; i < 10; ++i) {
+    vec[0] = t_var;
+    s_arr[0] = var;
+  }
+  return T();
+}
+
+int main() {
+  static int sivar;
+#ifdef LAMBDA
+  // LAMBDA: [[G:@.+]] = global double
+  // LAMBDA: [[SIVAR:@.+]] = internal global i{{[0-9]+}} 0,
+  // LAMBDA-LABEL: @main
+  // LAMBDA: call{{( x86_thiscallcc)?}} void [[OUTER_LAMBDA:@.+]](
+  [&]() {
+  // LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
+  // LAMBDA: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 88, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+// LAMBDA: [[PRIVATES:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1
+
+// LAMBDA: call void @__kmpc_taskloop(%{{.+}}* @{{.+}}, i32 %{{.+}}, i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 0, i32 0, i64 0, i8* bitcast (void ([[KMP_TASK_MAIN_TY:%[^*]+]]*, [[KMP_TASK_MAIN_TY]]*, i32)* [[MAIN_DUP:@.+]] to i8*))
+// LAMBDA: ret
+#pragma omp taskloop simd lastprivate(g, sivar)
+  for (int i = 0; i < 10; ++i) {
+    // LAMBDA: define {{.+}} void [[INNER_LAMBDA:@.+]](%{{.+}}* [[ARG_PTR:%.+]])
+    // LAMBDA: store %{{.+}}* [[ARG_PTR]], %{{.+}}** [[ARG_PTR_REF:%.+]],
+    // LAMBDA: [[ARG_PTR:%.+]] = load %{{.+}}*, %{{.+}}** [[ARG_PTR_REF]]
+    // LAMBDA: [[G_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+    // LAMBDA: [[G_REF:%.+]] = load double*, double** [[G_PTR_REF]]
+    // LAMBDA: store double 2.0{{.+}}, double* [[G_REF]]
+
+    // LAMBDA: store double* %{{.+}}, double** %{{.+}},
+    // LAMBDA: define internal i32 [[TASK_ENTRY]](i32, %{{.+}}* noalias)
+    g = 1;
+    sivar = 11;
+    // LAMBDA: store double 1.0{{.+}}, double* %{{.+}},
+    // LAMBDA: store i{{[0-9]+}} 11, i{{[0-9]+}}* %{{.+}},
+    // LAMBDA: call void [[INNER_LAMBDA]](%
+    // LAMBDA: icmp ne i32 %{{.+}}, 0
+    // LAMBDA: br i1
+    // LAMBDA: load double, double* %
+    // LAMBDA: store volatile double %
+    // LAMBDA: load i32, i32* %
+    // LAMBDA: store i32 %
+    // LAMBDA: ret
+    [&]() {
+      g = 2;
+      sivar = 22;
+    }();
+  }
+  }();
+  return 0;
+#elif defined(BLOCKS)
+  // BLOCKS: [[G:@.+]] = global double
+  // BLOCKS-LABEL: @main
+  // BLOCKS: call void {{%.+}}(i8
+  ^{
+  // BLOCKS: define{{.*}} internal{{.*}} void {{.+}}(i8*
+  // BLOCKS: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 88, i64 16, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+  // BLOCKS: [[PRIVATES:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1
+  // BLOCKS: call void @__kmpc_taskloop(%{{.+}}* @{{.+}}, i32 %{{.+}}, i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 0, i32 0, i64 0, i8* bitcast (void ([[KMP_TASK_MAIN_TY:%[^*]+]]*, [[KMP_TASK_MAIN_TY]]*, i32)* [[MAIN_DUP:@.+]] to i8*))
+  // BLOCKS: ret
+#pragma omp taskloop simd lastprivate(g, sivar)
+  for (int i = 0; i < 10; ++i) {
+    // BLOCKS: define {{.+}} void {{@.+}}(i8*
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS: store double 2.0{{.+}}, double*
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS-NOT: [[ISVAR]]{{[[^:word:]]}}
+    // BLOCKS: store i{{[0-9]+}} 22, i{{[0-9]+}}*
+    // BLOCKS-NOT: [[SIVAR]]{{[[^:word:]]}}
+    // BLOCKS: ret
+
+    // BLOCKS: store double* %{{.+}}, double** %{{.+}},
+    // BLOCKS: store i{{[0-9]+}}* %{{.+}}, i{{[0-9]+}}** %{{.+}},
+    // BLOCKS: define internal i32 [[TASK_ENTRY]](i32, %{{.+}}* noalias)
+    g = 1;
+    sivar = 11;
+    // BLOCKS: store double 1.0{{.+}}, double* %{{.+}},
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS: store i{{[0-9]+}} 11, i{{[0-9]+}}* %{{.+}},
+    // BLOCKS-NOT: [[SIVAR]]{{[[^:word:]]}}
+    // BLOCKS: call void {{%.+}}(i8
+    // BLOCKS: icmp ne i32 %{{.+}}, 0
+    // BLOCKS: br i1
+    // BLOCKS: load double, double* %
+    // BLOCKS: store volatile double %
+    // BLOCKS: load i32, i32* %
+    // BLOCKS: store i32 %
+    ^{
+      g = 2;
+      sivar = 22;
+    }();
+  }
+  }();
+  return 0;
+#else
+  S<double> ttt;
+  S<double> test;
+  int t_var = 0;
+  int vec[] = {1, 2};
+  S<double> s_arr[] = {1, 2};
+  S<double> var(3);
+#pragma omp taskloop simd lastprivate(var, t_var, s_arr, vec, s_arr, var, sivar)
+  for (int i = 0; i < 10; ++i) {
+    vec[0] = t_var;
+    s_arr[0] = var;
+    sivar = 33;
+  }
+  return tmain<int>();
+#endif
+}
+
+// CHECK: [[SIVAR:.+]] = internal global i{{[0-9]+}} 0,
+// CHECK: define i{{[0-9]+}} @main()
+// CHECK: alloca [[S_DOUBLE_TY]],
+// CHECK: [[TEST:%.+]] = alloca [[S_DOUBLE_TY]],
+// CHECK: [[T_VAR_ADDR:%.+]] = alloca i32,
+// CHECK: [[VEC_ADDR:%.+]] = alloca [2 x i32],
+// CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_DOUBLE_TY]]],
+// CHECK: [[VAR_ADDR:%.+]] = alloca [[S_DOUBLE_TY]],
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[LOC:%.+]])
+
+// CHECK: call {{.*}} [[S_DOUBLE_TY_CONSTR:@.+]]([[S_DOUBLE_TY]]* [[TEST]])
+
+// Store original variables in capture struct.
+// CHECK: [[VEC_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: store [2 x i32]* [[VEC_ADDR]], [2 x i32]** [[VEC_REF]],
+// CHECK: [[T_VAR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK: store i32* [[T_VAR_ADDR]], i32** [[T_VAR_REF]],
+// CHECK: [[S_ARR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK: store [2 x [[S_DOUBLE_TY]]]* [[S_ARR_ADDR]], [2 x [[S_DOUBLE_TY]]]** [[S_ARR_REF]],
+// CHECK: [[VAR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 3
+// CHECK: store [[S_DOUBLE_TY]]* [[VAR_ADDR]], [[S_DOUBLE_TY]]** [[VAR_REF]],
+// CHECK: [[SIVAR_REF:%.+]] = getelementptr inbounds [[CAP_MAIN_TY]], [[CAP_MAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 4
+// CHECK: store i{{[0-9]+}}* [[SIVAR]], i{{[0-9]+}}** [[SIVAR_REF]],
+
+// Allocate task.
+// Returns struct kmp_task_t {
+//         [[KMP_TASK_T]] task_data;
+//         [[KMP_TASK_MAIN_TY]] privates;
+//       };
+// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc([[LOC]], i32 [[GTID]], i32 9, i64 112, i64 40, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[RES_KMP_TASK:%.+]] = bitcast i8* [[RES]] to [[KMP_TASK_MAIN_TY]]*
+
+// Fill kmp_task_t->shareds by copying from original capture argument.
+// CHECK: [[TASK:%.+]] = getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[SHAREDS_REF_ADDR:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS_REF_ADDR]],
+// CHECK: [[CAPTURES_ADDR:%.+]] = bitcast [[CAP_MAIN_TY]]* %{{.+}} to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[SHAREDS_REF]], i8* [[CAPTURES_ADDR]], i64 40, i32 8, i1 false)
+
+// Initialize kmp_task_t->privates with default values (no init for simple types, default constructors for classes).
+// Also copy address of private copy to the corresponding shareds reference.
+// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+
+// Constructors for s_arr and var.
+// s_arr;
+// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: call {{.*}} [[S_DOUBLE_TY_CONSTR]]([[S_DOUBLE_TY]]* [[S_ARR_CUR:%[^,]+]])
+// CHECK: getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* [[S_ARR_CUR]], i{{.+}} 1
+// CHECK: icmp eq
+// CHECK: br i1
+
+// var;
+// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 1
+// CHECK: call {{.*}} [[S_DOUBLE_TY_CONSTR]]([[S_DOUBLE_TY]]* [[PRIVATE_VAR_REF]])
+
+// t_var;
+// vec;
+// sivar;
+
+// Provide pointer to destructor function, which will destroy private variables at the end of the task.
+// CHECK: [[DESTRUCTORS_REF:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{.+}} 0, i{{.+}} 3
+// CHECK: [[DESTRUCTORS_PTR:%.+]] = bitcast %union{{.+}}* [[DESTRUCTORS_REF]] to i32 (i32, i8*)**
+// CHECK: store i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[DESTRUCTORS:@.+]] to i32 (i32, i8*)*), i32 (i32, i8*)** [[DESTRUCTORS_PTR]],
+
+// Start task.
+// CHECK: call void @__kmpc_taskloop([[LOC]], i32 [[GTID]], i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 0, i32 0, i64 0, i8* bitcast (void ([[KMP_TASK_MAIN_TY]]*, [[KMP_TASK_MAIN_TY]]*, i32)* [[MAIN_DUP:@.+]] to i8*))
+
+// CHECK: = call i{{.+}} [[TMAIN_INT:@.+]]()
+
+// No destructors must be called for private copies of s_arr and var.
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: call void [[S_DOUBLE_TY_DESTR:@.+]]([[S_DOUBLE_TY]]*
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: ret
+//
+
+// CHECK: define internal void [[PRIVATES_MAP_FN:@.+]]([[PRIVATES_MAIN_TY]]* noalias, [[S_DOUBLE_TY]]** noalias, i32** noalias, [2 x [[S_DOUBLE_TY]]]** noalias, [2 x i32]** noalias, i32** noalias)
+// CHECK: [[PRIVATES:%.+]] = load [[PRIVATES_MAIN_TY]]*, [[PRIVATES_MAIN_TY]]**
+// CHECK: [[PRIV_S_VAR:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 0
+// CHECK: [[ARG3:%.+]] = load [2 x [[S_DOUBLE_TY]]]**, [2 x [[S_DOUBLE_TY]]]*** %{{.+}},
+// CHECK: store [2 x [[S_DOUBLE_TY]]]* [[PRIV_S_VAR]], [2 x [[S_DOUBLE_TY]]]** [[ARG3]],
+// CHECK: [[PRIV_VAR:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 1
+// CHECK: [[ARG1:%.+]] = load [[S_DOUBLE_TY]]**, [[S_DOUBLE_TY]]*** {{.+}},
+// CHECK: store [[S_DOUBLE_TY]]* [[PRIV_VAR]], [[S_DOUBLE_TY]]** [[ARG1]],
+// CHECK: [[PRIV_T_VAR:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 2
+// CHECK: [[ARG2:%.+]] = load i32**, i32*** %{{.+}},
+// CHECK: store i32* [[PRIV_T_VAR]], i32** [[ARG2]],
+// CHECK: [[PRIV_VEC:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 3
+// CHECK: [[ARG4:%.+]] = load [2 x i32]**, [2 x i32]*** %{{.+}},
+// CHECK: store [2 x i32]* [[PRIV_VEC]], [2 x i32]** [[ARG4]],
+// CHECK: [[PRIV_SIVAR:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 4
+// CHECK: [[ARG5:%.+]] = load i{{[0-9]+}}**, i{{[0-9]+}}*** %{{.+}},
+// CHECK: store i{{[0-9]+}}* [[PRIV_SIVAR]], i{{[0-9]+}}** [[ARG5]],
+// CHECK: ret void
+
+// CHECK: define internal i32 [[TASK_ENTRY]](i32, [[KMP_TASK_MAIN_TY]]* noalias)
+
+// CHECK: [[PRIV_VAR_ADDR:%.+]] = alloca [[S_DOUBLE_TY]]*,
+// CHECK: [[PRIV_T_VAR_ADDR:%.+]] = alloca i32*,
+// CHECK: [[PRIV_S_ARR_ADDR:%.+]] = alloca [2 x [[S_DOUBLE_TY]]]*,
+// CHECK: [[PRIV_VEC_ADDR:%.+]] = alloca [2 x i32]*,
+// CHECK: [[PRIV_SIVAR_ADDR:%.+]] = alloca i32*,
+// CHECK: store void (i8*, ...)* bitcast (void ([[PRIVATES_MAIN_TY]]*, [[S_DOUBLE_TY]]**, i32**, [2 x [[S_DOUBLE_TY]]]**, [2 x i32]**, i32**)* [[PRIVATES_MAP_FN]] to void (i8*, ...)*), void (i8*, ...)** [[MAP_FN_ADDR:%.+]],
+// CHECK: [[MAP_FN:%.+]] = load void (i8*, ...)*, void (i8*, ...)** [[MAP_FN_ADDR]],
+
+// CHECK: call void (i8*, ...) [[MAP_FN]](i8* %{{.+}}, [[S_DOUBLE_TY]]** [[PRIV_VAR_ADDR]], i32** [[PRIV_T_VAR_ADDR]], [2 x [[S_DOUBLE_TY]]]** [[PRIV_S_ARR_ADDR]], [2 x i32]** [[PRIV_VEC_ADDR]], i32** [[PRIV_SIVAR_ADDR]])
+
+// CHECK: [[PRIV_VAR:%.+]] = load [[S_DOUBLE_TY]]*, [[S_DOUBLE_TY]]** [[PRIV_VAR_ADDR]],
+// CHECK: [[PRIV_T_VAR:%.+]] = load i32*, i32** [[PRIV_T_VAR_ADDR]],
+// CHECK: [[PRIV_S_ARR:%.+]] = load [2 x [[S_DOUBLE_TY]]]*, [2 x [[S_DOUBLE_TY]]]** [[PRIV_S_ARR_ADDR]],
+// CHECK: [[PRIV_VEC:%.+]] = load [2 x i32]*, [2 x i32]** [[PRIV_VEC_ADDR]],
+// CHECK: [[PRIV_SIVAR:%.+]] = load i32*, i32** [[PRIV_SIVAR_ADDR]],
+
+// Privates actually are used.
+// CHECK-DAG: [[PRIV_VAR]]
+// CHECK-DAG: [[PRIV_T_VAR]]
+// CHECK-DAG: [[PRIV_S_ARR]]
+// CHECK-DAG: [[PRIV_VEC]]
+// CHECK-DAG: [[PRIV_SIVAR]]
+
+// CHECK:     icmp ne i32 %{{.+}}, 0
+// CHECK-NEXT: br i1
+// CHECK: bitcast [[S_DOUBLE_TY]]* %{{.+}} to i8*
+// CHECK: bitcast [[S_DOUBLE_TY]]* %{{.+}} to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %
+// CHECK: load i32, i32* %
+// CHECK: store i32 %{{.+}}, i32* %
+// CHECK: getelementptr inbounds [2 x [[S_DOUBLE_TY]]], [2 x [[S_DOUBLE_TY]]]* %
+// CHECK: phi [[S_DOUBLE_TY]]*
+// CHECK: phi [[S_DOUBLE_TY]]*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %
+// CHECK: icmp eq [[S_DOUBLE_TY]]* %
+// CHECK-NEXT: br i1
+// CHECK: bitcast [2 x i32]* %{{.+}} to i8*
+// CHECK: bitcast [2 x i32]* %{{.+}} to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %
+// CHECK: load i32, i32* %
+// CHECK: store i32 %{{.+}}, i32* %
+// CHECK: br label
+// CHECK: ret
+
+// CHECK: define internal void [[MAIN_DUP]]([[KMP_TASK_MAIN_TY]]*, [[KMP_TASK_MAIN_TY]]*, i32)
+// CHECK: getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* %{{.+}}, i32 0, i32 0
+// CHECK: getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* %{{.+}}, i32 0, i32 8
+// CHECK: load i32, i32* %
+// CHECK: store i32 %{{.+}}, i32* %
+// CHECK: getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* %{{.+}}, i32 0, i32 1
+// CHECK: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* %{{.+}}, i32 0, i32 0
+// CHECK: getelementptr inbounds [2 x [[S_DOUBLE_TY]]], [2 x [[S_DOUBLE_TY]]]* %{{.+}}, i32 0, i32 0
+// CHECK: getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i64 2
+// CHECK: br label %
+
+// CHECK: phi [[S_DOUBLE_TY]]*
+// CHECK: call {{.*}} [[S_DOUBLE_TY_CONSTR]]([[S_DOUBLE_TY]]*
+// CHECK: getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i64 1
+// CHECK: icmp eq [[S_DOUBLE_TY]]* %
+// CHECK: br i1 %
+
+// CHECK: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* %{{.+}}, i32 0, i32 1
+// CHECK: call {{.*}} [[S_DOUBLE_TY_CONSTR]]([[S_DOUBLE_TY]]*
+// CHECK: ret void
+
+// CHECK: define internal i32 [[DESTRUCTORS]](i32, [[KMP_TASK_MAIN_TY]]* noalias)
+// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* [[RES_KMP_TASK:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 0
+// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 1
+// CHECK: call {{.*}} [[S_DOUBLE_TY_DESTR]]([[S_DOUBLE_TY]]* [[PRIVATE_VAR_REF]])
+// CHECK: getelementptr inbounds [2 x [[S_DOUBLE_TY]]], [2 x [[S_DOUBLE_TY]]]* [[PRIVATE_S_ARR_REF]], i{{.+}} 0, i{{.+}} 0
+// CHECK: getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i{{.+}} 2
+// CHECK: [[PRIVATE_S_ARR_ELEM_REF:%.+]] = getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i{{.+}} -1
+// CHECK: call {{.*}} [[S_DOUBLE_TY_DESTR]]([[S_DOUBLE_TY]]* [[PRIVATE_S_ARR_ELEM_REF]])
+// CHECK: icmp eq
+// CHECK: br i1
+// CHECK: ret i32
+
+// CHECK: define {{.*}} i{{[0-9]+}} [[TMAIN_INT]]()
+// CHECK: alloca [[S_INT_TY]],
+// CHECK: [[TEST:%.+]] = alloca [[S_INT_TY]],
+// CHECK: [[T_VAR_ADDR:%.+]] = alloca i32, align 128
+// CHECK: [[VEC_ADDR:%.+]] = alloca [2 x i32],
+// CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]],
+// CHECK: [[VAR_ADDR:%.+]] = alloca [[S_INT_TY]],
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[LOC:%.+]])
+
+// CHECK: call {{.*}} [[S_INT_TY_CONSTR:@.+]]([[S_INT_TY]]* [[TEST]])
+
+// Store original variables in capture struct.
+// CHECK: [[VEC_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: store [2 x i32]* [[VEC_ADDR]], [2 x i32]** [[VEC_REF]],
+// CHECK: [[T_VAR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK: store i32* [[T_VAR_ADDR]], i32** [[T_VAR_REF]],
+// CHECK: [[S_ARR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK: store [2 x [[S_INT_TY]]]* [[S_ARR_ADDR]], [2 x [[S_INT_TY]]]** [[S_ARR_REF]],
+// CHECK: [[VAR_REF:%.+]] = getelementptr inbounds [[CAP_TMAIN_TY]], [[CAP_TMAIN_TY]]* %{{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 3
+// CHECK: store [[S_INT_TY]]* [[VAR_ADDR]], [[S_INT_TY]]** [[VAR_REF]],
+
+// Allocate task.
+// Returns struct kmp_task_t {
+//         [[KMP_TASK_T_TY]] task_data;
+//         [[KMP_TASK_TMAIN_TY]] privates;
+//       };
+// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc([[LOC]], i32 [[GTID]], i32 9, i64 256, i64 32, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_TMAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[RES_KMP_TASK:%.+]] = bitcast i8* [[RES]] to [[KMP_TASK_TMAIN_TY]]*
+
+// Fill kmp_task_t->shareds by copying from original capture argument.
+// CHECK: [[TASK:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[SHAREDS_REF_ADDR:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[SHAREDS_REF:%.+]] = load i8*, i8** [[SHAREDS_REF_ADDR]],
+// CHECK: [[CAPTURES_ADDR:%.+]] = bitcast [[CAP_TMAIN_TY]]* %{{.+}} to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[SHAREDS_REF]], i8* [[CAPTURES_ADDR]], i64 32, i32 8, i1 false)
+
+// Initialize kmp_task_t->privates with default values (no init for simple types, default constructors for classes).
+// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+
+// t_var;
+// vec;
+
+// Constructors for s_arr and var.
+// a_arr;
+// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK: getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[PRIVATE_S_ARR_REF]], i{{.+}} 0, i{{.+}} 0
+// CHECK: getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i{{.+}} 2
+// CHECK: call {{.*}} [[S_INT_TY_CONSTR]]([[S_INT_TY]]* [[S_ARR_CUR:%[^,]+]])
+// CHECK: getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_CUR]], i{{.+}} 1
+// CHECK: icmp eq
+// CHECK: br i1
+
+// var;
+// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: call {{.*}} [[S_INT_TY_CONSTR]]([[S_INT_TY]]* [[PRIVATE_VAR_REF]])
+
+// Provide pointer to destructor function, which will destroy private variables at the end of the task.
+// CHECK: [[DESTRUCTORS_REF:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{.+}} 0, i{{.+}} 3
+// CHECK: [[DESTRUCTORS_PTR:%.+]] = bitcast %union{{.+}}* [[DESTRUCTORS_REF]] to i32 (i32, i8*)**
+// CHECK: store i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_TMAIN_TY]]*)* [[DESTRUCTORS:@.+]] to i32 (i32, i8*)*), i32 (i32, i8*)** [[DESTRUCTORS_PTR]],
+
+// Start task.
+// CHECK: call void @__kmpc_taskloop([[LOC]], i32 [[GTID]], i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 0, i32 0, i64 0, i8* bitcast (void ([[KMP_TASK_TMAIN_TY]]*, [[KMP_TASK_TMAIN_TY]]*, i32)* [[TMAIN_DUP:@.+]] to i8*))
+
+// No destructors must be called for private copies of s_arr and var.
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: call void [[S_INT_TY_DESTR:@.+]]([[S_INT_TY]]*
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: ret
+//
+
+// CHECK: define internal void [[PRIVATES_MAP_FN:@.+]]([[PRIVATES_TMAIN_TY]]* noalias, i32** noalias, [2 x i32]** noalias, [2 x [[S_INT_TY]]]** noalias, [[S_INT_TY]]** noalias)
+// CHECK: [[PRIVATES:%.+]] = load [[PRIVATES_TMAIN_TY]]*, [[PRIVATES_TMAIN_TY]]**
+// CHECK: [[PRIV_T_VAR:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 0
+// CHECK: [[ARG1:%.+]] = load i32**, i32*** %{{.+}},
+// CHECK: store i32* [[PRIV_T_VAR]], i32** [[ARG1]],
+// CHECK: [[PRIV_VEC:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 1
+// CHECK: [[ARG2:%.+]] = load [2 x i32]**, [2 x i32]*** %{{.+}},
+// CHECK: store [2 x i32]* [[PRIV_VEC]], [2 x i32]** [[ARG2]],
+// CHECK: [[PRIV_S_VAR:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 2
+// CHECK: [[ARG3:%.+]] = load [2 x [[S_INT_TY]]]**, [2 x [[S_INT_TY]]]*** %{{.+}},
+// CHECK: store [2 x [[S_INT_TY]]]* [[PRIV_S_VAR]], [2 x [[S_INT_TY]]]** [[ARG3]],
+// CHECK: [[PRIV_VAR:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 3
+// CHECK: [[ARG4:%.+]] = load [[S_INT_TY]]**, [[S_INT_TY]]*** {{.+}},
+// CHECK: store [[S_INT_TY]]* [[PRIV_VAR]], [[S_INT_TY]]** [[ARG4]],
+// CHECK: ret void
+
+// CHECK: define internal i32 [[TASK_ENTRY]](i32, [[KMP_TASK_TMAIN_TY]]* noalias)
+// CHECK: alloca i32*,
+// CHECK-DAG: [[PRIV_T_VAR_ADDR:%.+]] = alloca i32*,
+// CHECK-DAG: [[PRIV_VEC_ADDR:%.+]] = alloca [2 x i32]*,
+// CHECK-DAG: [[PRIV_S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]]*,
+// CHECK-DAG: [[PRIV_VAR_ADDR:%.+]] = alloca [[S_INT_TY]]*,
+// CHECK: store void (i8*, ...)* bitcast (void ([[PRIVATES_TMAIN_TY]]*, i32**, [2 x i32]**, [2 x [[S_INT_TY]]]**, [[S_INT_TY]]**)* [[PRIVATES_MAP_FN]] to void (i8*, ...)*), void (i8*, ...)** [[MAP_FN_ADDR:%.+]],
+// CHECK: [[MAP_FN:%.+]] = load void (i8*, ...)*, void (i8*, ...)** [[MAP_FN_ADDR]],
+// CHECK: call void (i8*, ...) [[MAP_FN]](i8* %{{.+}}, i32** [[PRIV_T_VAR_ADDR]], [2 x i32]** [[PRIV_VEC_ADDR]], [2 x [[S_INT_TY]]]** [[PRIV_S_ARR_ADDR]], [[S_INT_TY]]** [[PRIV_VAR_ADDR]])
+// CHECK: [[PRIV_T_VAR:%.+]] = load i32*, i32** [[PRIV_T_VAR_ADDR]],
+// CHECK: [[PRIV_VEC:%.+]] = load [2 x i32]*, [2 x i32]** [[PRIV_VEC_ADDR]],
+// CHECK: [[PRIV_S_ARR:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[PRIV_S_ARR_ADDR]],
+// CHECK: [[PRIV_VAR:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[PRIV_VAR_ADDR]],
+
+// Privates actually are used.
+// CHECK-DAG: [[PRIV_VAR]]
+// CHECK-DAG: [[PRIV_T_VAR]]
+// CHECK-DAG: [[PRIV_S_ARR]]
+// CHECK-DAG: [[PRIV_VEC]]
+
+// CHECK:     icmp ne i32 %{{.+}}, 0
+// CHECK-NEXT: br i1
+// CHECK: load i32, i32* %
+// CHECK: store i32 %{{.+}}, i32* %
+// CHECK: bitcast [2 x i32]* %{{.+}} to i8*
+// CHECK: bitcast [2 x i32]* %{{.+}} to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %
+// CHECK: getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* %
+// CHECK: phi [[S_INT_TY]]*
+// CHECK: phi [[S_INT_TY]]*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %
+// CHECK: icmp eq [[S_INT_TY]]* %
+// CHECK-NEXT: br i1
+// CHECK: bitcast [[S_INT_TY]]* %{{.+}} to i8*
+// CHECK: bitcast [[S_INT_TY]]* %{{.+}} to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %
+// CHECK: br label
+// CHECK: ret
+
+// CHECK: define internal void [[TMAIN_DUP]]([[KMP_TASK_TMAIN_TY]]*, [[KMP_TASK_TMAIN_TY]]*, i32)
+// CHECK: getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* %{{.+}}, i32 0, i32 0
+// CHECK: getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* %{{.+}}, i32 0, i32 8
+// CHECK: load i32, i32* %
+// CHECK: store i32 %{{.+}}, i32* %
+// CHECK: getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* %{{.+}}, i32 0, i32 2
+// CHECK: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* %{{.+}}, i32 0, i32 2
+// CHECK: getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* %{{.+}}, i32 0, i32 0
+// CHECK: getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i64 2
+// CHECK: br label %
+
+// CHECK: phi [[S_INT_TY]]*
+// CHECK: call {{.*}} [[S_INT_TY_CONSTR]]([[S_INT_TY]]*
+// CHECK: getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i64 1
+// CHECK: icmp eq [[S_INT_TY]]* %
+// CHECK: br i1 %
+
+// CHECK: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* %{{.+}}, i32 0, i32 3
+// CHECK: call {{.*}} [[S_INT_TY_CONSTR]]([[S_INT_TY]]*
+// CHECK: ret void
+
+// CHECK: define internal i32 [[DESTRUCTORS]](i32, [[KMP_TASK_TMAIN_TY]]* noalias)
+// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: call void [[S_INT_TY_DESTR]]([[S_INT_TY]]* [[PRIVATE_VAR_REF]])
+// CHECK: getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[PRIVATE_S_ARR_REF]], i{{.+}} 0, i{{.+}} 0
+// CHECK: getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i{{.+}} 2
+// CHECK: [[PRIVATE_S_ARR_ELEM_REF:%.+]] = getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i{{.+}} -1
+// CHECK: call void [[S_INT_TY_DESTR]]([[S_INT_TY]]* [[PRIVATE_S_ARR_ELEM_REF]])
+// CHECK: icmp eq
+// CHECK: br i1
+// CHECK: ret i32
+
+#endif
+#else
+// ARRAY-LABEL: array_func
+struct St {
+  int a, b;
+  St() : a(0), b(0) {}
+  St(const St &) {}
+  ~St() {}
+};
+
+void array_func(int n, float a[n], St s[2]) {
+// ARRAY: call i8* @__kmpc_omp_task_alloc(
+// ARRAY: call void @__kmpc_taskloop(
+// ARRAY: store float** %{{.+}}, float*** %{{.+}},
+// ARRAY: store %struct.St** %{{.+}}, %struct.St*** %{{.+}},
+// ARRAY: icmp ne i32 %{{.+}}, 0
+// ARRAY: store float* %{{.+}}, float** %{{.+}},
+// ARRAY: store %struct.St* %{{.+}}, %struct.St** %{{.+}},
+#pragma omp taskloop simd lastprivate(a, s)
+  for (int i = 0; i < 10; ++i)
+    ;
+}
+#endif
+
diff --git a/test/OpenMP/taskloop_simd_loop_messages.cpp b/test/OpenMP/taskloop_simd_loop_messages.cpp
index 47318721102fd..3326e6ff61cd8 100644
--- a/test/OpenMP/taskloop_simd_loop_messages.cpp
+++ b/test/OpenMP/taskloop_simd_loop_messages.cpp
@@ -428,12 +428,12 @@ public:
   typedef int difference_type;
   typedef std::random_access_iterator_tag iterator_category;
 };
-// expected-note@+2 {{candidate function not viable: no known conversion from 'Iter0' to 'GoodIter' for 2nd argument}}
+// expected-note@+2 {{candidate function not viable: no known conversion from 'const Iter0' to 'GoodIter' for 2nd argument}}
 // expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'GoodIter' for 1st argument}}
 int operator-(GoodIter a, GoodIter b) { return 0; }
 // expected-note@+1 3 {{candidate function not viable: requires single argument 'a', but 2 arguments were provided}}
 GoodIter operator-(GoodIter a) { return a; }
-// expected-note@+2 {{candidate function not viable: no known conversion from 'Iter0' to 'int' for 2nd argument}}
+// expected-note@+2 {{candidate function not viable: no known conversion from 'const Iter0' to 'int' for 2nd argument}}
 // expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter1' to 'GoodIter' for 1st argument}}
 GoodIter operator-(GoodIter a, int v) { return GoodIter(); }
 // expected-note@+1 2 {{candidate function not viable: no known conversion from 'Iter0' to 'GoodIter' for 1st argument}}
@@ -484,7 +484,7 @@ int test_with_random_access_iterator() {
 #pragma omp taskloop simd
   for (begin = GoodIter(0); begin < end; ++begin)
     ++begin;
-// expected-error@+4 {{invalid operands to binary expression ('GoodIter' and 'Iter0')}}
+// expected-error@+4 {{invalid operands to binary expression ('GoodIter' and 'const Iter0')}}
 // expected-error@+3 {{could not calculate number of iterations calling 'operator-' with upper and lower loop bounds}}
 #pragma omp parallel
 #pragma omp taskloop simd
diff --git a/test/OpenMP/taskloop_simd_private_codegen.cpp b/test/OpenMP/taskloop_simd_private_codegen.cpp
new file mode 100644
index 0000000000000..557601e9c93e4
--- /dev/null
+++ b/test/OpenMP/taskloop_simd_private_codegen.cpp
@@ -0,0 +1,420 @@
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-apple-darwin10 -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -DLAMBDA -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=LAMBDA %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -fblocks -DBLOCKS -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=BLOCKS %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -DARRAY -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=ARRAY %s
+// expected-no-diagnostics
+// REQUIRES: x86-registered-target
+// It doesn't pass on win32. Investigating.
+// REQUIRES: shell
+
+#ifndef ARRAY
+#ifndef HEADER
+#define HEADER
+
+template <class T>
+struct S {
+  T f;
+  S(T a) : f(a) {}
+  S() : f() {}
+  operator T() { return T(); }
+  ~S() {}
+};
+
+volatile double g;
+
+// CHECK-DAG: [[KMP_TASK_T_TY:%.+]] = type { i8*, i32 (i32, i8*)*, i32, %union{{.+}}, %union{{.+}}, i64, i64, i64, i32 }
+// CHECK-DAG: [[S_DOUBLE_TY:%.+]] = type { double }
+// CHECK-DAG: [[CAP_MAIN_TY:%.+]] = type { i8 }
+// CHECK-DAG: [[PRIVATES_MAIN_TY:%.+]] = type {{.?}}{ [2 x [[S_DOUBLE_TY]]], [[S_DOUBLE_TY]], i32, [2 x i32]
+// CHECK-DAG: [[KMP_TASK_MAIN_TY:%.+]] = type { [[KMP_TASK_T_TY]], [[PRIVATES_MAIN_TY]] }
+// CHECK-DAG: [[S_INT_TY:%.+]] = type { i32 }
+// CHECK-DAG: [[CAP_TMAIN_TY:%.+]] = type { i8 }
+// CHECK-DAG: [[PRIVATES_TMAIN_TY:%.+]] = type { i32, [2 x i32], [2 x [[S_INT_TY]]], [[S_INT_TY]], [104 x i8] }
+// CHECK-DAG: [[KMP_TASK_TMAIN_TY:%.+]] = type { [[KMP_TASK_T_TY]], [{{[0-9]+}} x i8], [[PRIVATES_TMAIN_TY]] }
+template <typename T>
+T tmain() {
+  S<T> test;
+  T t_var __attribute__((aligned(128))) = T();
+  T vec[] = {1, 2};
+  S<T> s_arr[] = {1, 2};
+  S<T> var(3);
+#pragma omp taskloop simd private(t_var, vec, s_arr, s_arr, var, var)
+  for (int i = 0; i < 10; ++i) {
+    vec[0] = t_var;
+    s_arr[0] = var;
+  }
+  return T();
+}
+
+int main() {
+  static int sivar;
+#ifdef LAMBDA
+  // LAMBDA: [[G:@.+]] = global double
+  // LAMBDA-LABEL: @main
+  // LAMBDA: call{{( x86_thiscallcc)?}} void [[OUTER_LAMBDA:@.+]](
+  [&]() {
+  // LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
+  // LAMBDA: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 88, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+// LAMBDA: [[PRIVATES:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1
+// LAMBDA: call void @__kmpc_taskloop(%{{.+}}* @{{.+}}, i32 %{{.+}}, i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 0, i32 0, i64 0, i8* null)
+// LAMBDA: ret
+#pragma omp taskloop simd private(g, sivar)
+  for (int i = 0; i < 10; ++i) {
+    // LAMBDA: define {{.+}} void [[INNER_LAMBDA:@.+]](%{{.+}}* [[ARG_PTR:%.+]])
+    // LAMBDA: store %{{.+}}* [[ARG_PTR]], %{{.+}}** [[ARG_PTR_REF:%.+]],
+    // LAMBDA: [[ARG_PTR:%.+]] = load %{{.+}}*, %{{.+}}** [[ARG_PTR_REF]]
+    // LAMBDA: [[G_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+    // LAMBDA: [[G_REF:%.+]] = load double*, double** [[G_PTR_REF]]
+    // LAMBDA: store double 2.0{{.+}}, double* [[G_REF]]
+    // LAMBDA: [[SIVAR_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+    // LAMBDA: [[SIVAR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SIVAR_PTR_REF]]
+    // LAMBDA: store i{{[0-9]+}} 3, i{{[0-9]+}}* [[SIVAR_REF]]
+
+    // LAMBDA: define internal i32 [[TASK_ENTRY]](i32, %{{.+}}* noalias)
+    g = 1;
+    sivar = 2;
+    // LAMBDA: store double 1.0{{.+}}, double* %{{.+}},
+    // LAMBDA: store i{{[0-9]+}} 2, i{{[0-9]+}}* %{{.+}},
+    // LAMBDA: call void [[INNER_LAMBDA]](%
+    // LAMBDA: ret
+    [&]() {
+      g = 2;
+      sivar = 3;
+    }();
+  }
+  }();
+  return 0;
+#elif defined(BLOCKS)
+  // BLOCKS: [[G:@.+]] = global double
+  // BLOCKS-LABEL: @main
+  // BLOCKS: call void {{%.+}}(i8
+  ^{
+  // BLOCKS: define{{.*}} internal{{.*}} void {{.+}}(i8*
+  // BLOCKS: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc(%{{[^ ]+}} @{{[^,]+}}, i32 %{{[^,]+}}, i32 1, i64 88, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, %{{[^*]+}}*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+  // BLOCKS: [[PRIVATES:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i{{.+}} 0, i{{.+}} 1
+  // BLOCKS: call void @__kmpc_taskloop(%{{.+}}* @{{.+}}, i32 %{{.+}}, i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 0, i32 0, i64 0, i8* null)
+  // BLOCKS: ret
+#pragma omp taskloop simd private(g, sivar)
+  for (int i = 0; i < 10; ++i) {
+    // BLOCKS: define {{.+}} void {{@.+}}(i8*
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS: store double 2.0{{.+}}, double*
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS-NOT: [[SIVAR]]{{[[^:word:]]}}
+    // BLOCKS: store i{{[0-9]+}} 4, i{{[0-9]+}}*
+    // BLOCKS-NOT: [[SIVAR]]{{[[^:word:]]}}
+    // BLOCKS: ret
+
+    // BLOCKS: define internal i32 [[TASK_ENTRY]](i32, %{{.+}}* noalias)
+    g = 1;
+    sivar = 3;
+    // BLOCKS: store double 1.0{{.+}}, double* %{{.+}},
+    // BLOCKS-NOT: [[G]]{{[[^:word:]]}}
+    // BLOCKS: store i{{[0-9]+}} 3, i{{[0-9]+}}* %{{.+}},
+    // BLOCKS-NOT: [[SIVAR]]{{[[^:word:]]}}
+    // BLOCKS: call void {{%.+}}(i8
+    ^{
+      g = 2;
+      sivar = 4;
+    }();
+  }
+  }();
+  return 0;
+#else
+  S<double> test;
+  int t_var = 0;
+  int vec[] = {1, 2};
+  S<double> s_arr[] = {1, 2};
+  S<double> var(3);
+#pragma omp taskloop simd private(var, t_var, s_arr, vec, s_arr, var, sivar)
+  for (int i = 0; i < 10; ++i) {
+    vec[0] = t_var;
+    s_arr[0] = var;
+    sivar = 8;
+  }
+#pragma omp task
+  g+=1;
+  return tmain<int>();
+#endif
+}
+
+// CHECK: define i{{[0-9]+}} @main()
+// CHECK: [[TEST:%.+]] = alloca [[S_DOUBLE_TY]],
+// CHECK: [[T_VAR_ADDR:%.+]] = alloca i32,
+// CHECK: [[VEC_ADDR:%.+]] = alloca [2 x i32],
+// CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_DOUBLE_TY]]],
+// CHECK: [[VAR_ADDR:%.+]] = alloca [[S_DOUBLE_TY]],
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[LOC:%.+]])
+
+// CHECK: call {{.*}} [[S_DOUBLE_TY_DEF_CONSTR:@.+]]([[S_DOUBLE_TY]]* [[TEST]])
+
+// Do not store original variables in capture struct.
+// CHECK-NOT: getelementptr inbounds [[CAP_MAIN_TY]],
+
+// Allocate task.
+// Returns struct kmp_task_t {
+//         [[KMP_TASK_T_TY]] task_data;
+//         [[KMP_TASK_MAIN_TY]] privates;
+//       };
+// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc([[LOC]], i32 [[GTID]], i32 9, i64 112, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[RES_KMP_TASK:%.+]] = bitcast i8* [[RES]] to [[KMP_TASK_MAIN_TY]]*
+
+// CHECK: [[TASK:%.+]] = getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// Initialize kmp_task_t->privates with default values (no init for simple types, default constructors for classes).
+// Also copy address of private copy to the corresponding shareds reference.
+// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+
+// Constructors for s_arr and var.
+// a_arr;
+// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: getelementptr inbounds [2 x [[S_DOUBLE_TY]]], [2 x [[S_DOUBLE_TY]]]* [[PRIVATE_S_ARR_REF]], i{{.+}} 0, i{{.+}} 0
+// CHECK: getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i{{.+}} 2
+// CHECK: call void [[S_DOUBLE_TY_DEF_CONSTR]]([[S_DOUBLE_TY]]* [[S_ARR_CUR:%.+]])
+// CHECK: getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* [[S_ARR_CUR]], i{{.+}} 1
+// CHECK: icmp eq
+// CHECK: br i1
+
+// var;
+// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 1
+// CHECK: call void [[S_DOUBLE_TY_DEF_CONSTR]]([[S_DOUBLE_TY]]* [[PRIVATE_VAR_REF:%.+]])
+
+// Provide pointer to destructor function, which will destroy private variables at the end of the task.
+// CHECK: [[DESTRUCTORS_REF:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{.+}} 0, i{{.+}} 3
+// CHECK: [[DESTRUCTORS_PTR:%.+]] = bitcast %union{{.+}}* [[DESTRUCTORS_REF]] to i32 (i32, i8*)**
+// CHECK: store i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_MAIN_TY]]*)* [[DESTRUCTORS:@.+]] to i32 (i32, i8*)*), i32 (i32, i8*)** [[DESTRUCTORS_PTR]],
+
+// Start task.
+// CHECK: call void @__kmpc_taskloop([[LOC]], i32 [[GTID]], i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 0, i32 0, i64 0, i8* bitcast (void ([[KMP_TASK_MAIN_TY]]*, [[KMP_TASK_MAIN_TY]]*, i32)* [[MAIN_DUP:@.+]] to i8*))
+// CHECK: call i32 @__kmpc_omp_task([[LOC]], i32 [[GTID]], i8*
+
+// CHECK: = call i{{.+}} [[TMAIN_INT:@.+]]()
+
+// No destructors must be called for private copies of s_arr and var.
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: call void [[S_DOUBLE_TY_DESTR:@.+]]([[S_DOUBLE_TY]]*
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: ret
+//
+
+// CHECK: define internal void [[PRIVATES_MAP_FN:@.+]]([[PRIVATES_MAIN_TY]]* noalias, [[S_DOUBLE_TY]]** noalias, i32** noalias, [2 x [[S_DOUBLE_TY]]]** noalias, [2 x i32]** noalias, i32** noalias)
+// CHECK: [[PRIVATES:%.+]] = load [[PRIVATES_MAIN_TY]]*, [[PRIVATES_MAIN_TY]]**
+// CHECK: [[PRIV_S_VAR:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 0
+// CHECK: [[ARG3:%.+]] = load [2 x [[S_DOUBLE_TY]]]**, [2 x [[S_DOUBLE_TY]]]*** %{{.+}},
+// CHECK: store [2 x [[S_DOUBLE_TY]]]* [[PRIV_S_VAR]], [2 x [[S_DOUBLE_TY]]]** [[ARG3]],
+// CHECK: [[PRIV_VAR:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 1
+// CHECK: [[ARG1:%.+]] = load [[S_DOUBLE_TY]]**, [[S_DOUBLE_TY]]*** {{.+}},
+// CHECK: store [[S_DOUBLE_TY]]* [[PRIV_VAR]], [[S_DOUBLE_TY]]** [[ARG1]],
+// CHECK: [[PRIV_T_VAR:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 2
+// CHECK: [[ARG2:%.+]] = load i32**, i32*** %{{.+}},
+// CHECK: store i32* [[PRIV_T_VAR]], i32** [[ARG2]],
+// CHECK: [[PRIV_VEC:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i32 0, i32 3
+// CHECK: [[ARG4:%.+]] = load [2 x i32]**, [2 x i32]*** %{{.+}},
+// CHECK: store [2 x i32]* [[PRIV_VEC]], [2 x i32]** [[ARG4]],
+// CHECK: ret void
+
+// CHECK: define internal i32 [[TASK_ENTRY]](i32, [[KMP_TASK_MAIN_TY]]* noalias)
+
+// CHECK: [[PRIV_VAR_ADDR:%.+]] = alloca [[S_DOUBLE_TY]]*,
+// CHECK: [[PRIV_T_VAR_ADDR:%.+]] = alloca i32*,
+// CHECK: [[PRIV_S_ARR_ADDR:%.+]] = alloca [2 x [[S_DOUBLE_TY]]]*,
+// CHECK: [[PRIV_VEC_ADDR:%.+]] = alloca [2 x i32]*,
+// CHECK: [[PRIV_SIVAR_ADDR:%.+]] = alloca i32*,
+// CHECK: store void (i8*, ...)* bitcast (void ([[PRIVATES_MAIN_TY]]*, [[S_DOUBLE_TY]]**, i32**, [2 x [[S_DOUBLE_TY]]]**, [2 x i32]**, i32**)* [[PRIVATES_MAP_FN]] to void (i8*, ...)*), void (i8*, ...)** [[MAP_FN_ADDR:%.+]],
+// CHECK: [[MAP_FN:%.+]] = load void (i8*, ...)*, void (i8*, ...)** [[MAP_FN_ADDR]],
+// CHECK: call void (i8*, ...) [[MAP_FN]](i8* %{{.+}}, [[S_DOUBLE_TY]]** [[PRIV_VAR_ADDR]], i32** [[PRIV_T_VAR_ADDR]], [2 x [[S_DOUBLE_TY]]]** [[PRIV_S_ARR_ADDR]], [2 x i32]** [[PRIV_VEC_ADDR]], i32** [[PRIV_SIVAR_ADDR]])
+// CHECK: [[PRIV_VAR:%.+]] = load [[S_DOUBLE_TY]]*, [[S_DOUBLE_TY]]** [[PRIV_VAR_ADDR]],
+// CHECK: [[PRIV_T_VAR:%.+]] = load i32*, i32** [[PRIV_T_VAR_ADDR]],
+// CHECK: [[PRIV_S_ARR:%.+]] = load [2 x [[S_DOUBLE_TY]]]*, [2 x [[S_DOUBLE_TY]]]** [[PRIV_S_ARR_ADDR]],
+// CHECK: [[PRIV_VEC:%.+]] = load [2 x i32]*, [2 x i32]** [[PRIV_VEC_ADDR]],
+// CHECK: [[PRIV_SIVAR:%.+]] = load i32*, i32** [[PRIV_SIVAR_ADDR]],
+
+// Privates actually are used.
+// CHECK-DAG: [[PRIV_VAR]]
+// CHECK-DAG: [[PRIV_T_VAR]]
+// CHECK-DAG: [[PRIV_S_ARR]]
+// CHECK-DAG: [[PRIV_VEC]]
+// CHECK_DAG: [[PRIV_SIVAR]]
+
+// CHECK: ret
+
+// CHECK: define internal void [[MAIN_DUP]]([[KMP_TASK_MAIN_TY]]*, [[KMP_TASK_MAIN_TY]]*, i32)
+// CHECK: getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* %{{.+}}, i32 0, i32 1
+// CHECK: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* %{{.+}}, i32 0, i32 0
+// CHECK: getelementptr inbounds [2 x [[S_DOUBLE_TY]]], [2 x [[S_DOUBLE_TY]]]* %{{.+}}, i32 0, i32 0
+// CHECK: getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i64 2
+// CHECK: br label %
+
+// CHECK: phi [[S_DOUBLE_TY]]*
+// CHECK: call {{.*}} [[S_DOUBLE_TY_DEF_CONSTR]]([[S_DOUBLE_TY]]*
+// CHECK: getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i64 1
+// CHECK: icmp eq [[S_DOUBLE_TY]]* %
+// CHECK: br i1 %
+
+// CHECK: getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* %{{.+}}, i32 0, i32 1
+// CHECK: call {{.*}} [[S_DOUBLE_TY_DEF_CONSTR]]([[S_DOUBLE_TY]]*
+// CHECK: ret void
+
+// CHECK: define internal i32 [[DESTRUCTORS]](i32, [[KMP_TASK_MAIN_TY]]* noalias)
+// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_MAIN_TY]], [[KMP_TASK_MAIN_TY]]* [[RES_KMP_TASK:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 0
+// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_MAIN_TY]], [[PRIVATES_MAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 1
+// CHECK: call void [[S_DOUBLE_TY_DESTR]]([[S_DOUBLE_TY]]* [[PRIVATE_VAR_REF]])
+// CHECK: getelementptr inbounds [2 x [[S_DOUBLE_TY]]], [2 x [[S_DOUBLE_TY]]]* [[PRIVATE_S_ARR_REF]], i{{.+}} 0, i{{.+}} 0
+// CHECK: getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i{{.+}} 2
+// CHECK: [[PRIVATE_S_ARR_ELEM_REF:%.+]] = getelementptr inbounds [[S_DOUBLE_TY]], [[S_DOUBLE_TY]]* %{{.+}}, i{{.+}} -1
+// CHECK: call void [[S_DOUBLE_TY_DESTR]]([[S_DOUBLE_TY]]* [[PRIVATE_S_ARR_ELEM_REF]])
+// CHECK: icmp eq
+// CHECK: br i1
+// CHECK: ret i32
+
+// CHECK: define {{.*}} i{{[0-9]+}} [[TMAIN_INT]]()
+// CHECK: [[TEST:%.+]] = alloca [[S_INT_TY]],
+// CHECK: [[T_VAR_ADDR:%.+]] = alloca i32,
+// CHECK: [[VEC_ADDR:%.+]] = alloca [2 x i32],
+// CHECK: [[S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]],
+// CHECK: [[VAR_ADDR:%.+]] = alloca [[S_INT_TY]],
+// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num([[LOC:%.+]])
+
+// CHECK: call {{.*}} [[S_INT_TY_DEF_CONSTR:@.+]]([[S_INT_TY]]* [[TEST]])
+
+// Do not store original variables in capture struct.
+// CHECK-NOT: getelementptr inbounds [[CAP_TMAIN_TY]],
+
+// Allocate task.
+// Returns struct kmp_task_t {
+//         [[KMP_TASK_T_TY]] task_data;
+//         [[KMP_TASK_TMAIN_TY]] privates;
+//       };
+// CHECK: [[RES:%.+]] = call i8* @__kmpc_omp_task_alloc([[LOC]], i32 [[GTID]], i32 9, i64 256, i64 1, i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_TMAIN_TY]]*)* [[TASK_ENTRY:@[^ ]+]] to i32 (i32, i8*)*))
+// CHECK: [[RES_KMP_TASK:%.+]] = bitcast i8* [[RES]] to [[KMP_TASK_TMAIN_TY]]*
+
+// CHECK: [[TASK:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+
+// Initialize kmp_task_t->privates with default values (no init for simple types, default constructors for classes).
+// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+
+// Constructors for s_arr and var.
+// a_arr;
+// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK: getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[PRIVATE_S_ARR_REF]], i{{.+}} 0, i{{.+}} 0
+// CHECK: getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i{{.+}} 2
+// CHECK: call void [[S_INT_TY_DEF_CONSTR]]([[S_INT_TY]]* [[S_ARR_CUR:%.+]])
+// CHECK: getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_CUR]], i{{.+}} 1
+// CHECK: icmp eq
+// CHECK: br i1
+
+// var;
+// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: call void [[S_INT_TY_DEF_CONSTR]]([[S_INT_TY]]* [[PRIVATE_VAR_REF:%.+]])
+
+// Provide pointer to destructor function, which will destroy private variables at the end of the task.
+// CHECK: [[DESTRUCTORS_REF:%.+]] = getelementptr inbounds [[KMP_TASK_T_TY]], [[KMP_TASK_T_TY]]* [[TASK]], i{{.+}} 0, i{{.+}} 3
+// CHECK: [[DESTRUCTORS_PTR:%.+]] = bitcast %union{{.+}}* [[DESTRUCTORS_REF]] to i32 (i32, i8*)**
+// CHECK: store i32 (i32, i8*)* bitcast (i32 (i32, [[KMP_TASK_TMAIN_TY]]*)* [[DESTRUCTORS:@.+]] to i32 (i32, i8*)*), i32 (i32, i8*)** [[DESTRUCTORS_PTR]],
+
+// Start task.
+// CHECK: call void @__kmpc_taskloop([[LOC]], i32 [[GTID]], i8* [[RES]], i32 1, i64* %{{.+}}, i64* %{{.+}}, i64 %{{.+}}, i32 0, i32 0, i64 0, i8* bitcast (void ([[KMP_TASK_TMAIN_TY]]*, [[KMP_TASK_TMAIN_TY]]*, i32)* [[TMAIN_DUP:@.+]] to i8*))
+
+// No destructors must be called for private copies of s_arr and var.
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: call void [[S_INT_TY_DESTR:@.+]]([[S_INT_TY]]*
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK-NOT: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: ret
+//
+
+// CHECK: define internal void [[PRIVATES_MAP_FN:@.+]]([[PRIVATES_TMAIN_TY]]* noalias, i32** noalias, [2 x i32]** noalias, [2 x [[S_INT_TY]]]** noalias, [[S_INT_TY]]** noalias)
+// CHECK: [[PRIVATES:%.+]] = load [[PRIVATES_TMAIN_TY]]*, [[PRIVATES_TMAIN_TY]]**
+// CHECK: [[PRIV_T_VAR:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 0
+// CHECK: [[ARG1:%.+]] = load i32**, i32*** %{{.+}},
+// CHECK: store i32* [[PRIV_T_VAR]], i32** [[ARG1]],
+// CHECK: [[PRIV_VEC:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 1
+// CHECK: [[ARG2:%.+]] = load [2 x i32]**, [2 x i32]*** %{{.+}},
+// CHECK: store [2 x i32]* [[PRIV_VEC]], [2 x i32]** [[ARG2]],
+// CHECK: [[PRIV_S_VAR:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 2
+// CHECK: [[ARG3:%.+]] = load [2 x [[S_INT_TY]]]**, [2 x [[S_INT_TY]]]*** %{{.+}},
+// CHECK: store [2 x [[S_INT_TY]]]* [[PRIV_S_VAR]], [2 x [[S_INT_TY]]]** [[ARG3]],
+// CHECK: [[PRIV_VAR:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i32 0, i32 3
+// CHECK: [[ARG4:%.+]] = load [[S_INT_TY]]**, [[S_INT_TY]]*** {{.+}},
+// CHECK: store [[S_INT_TY]]* [[PRIV_VAR]], [[S_INT_TY]]** [[ARG4]],
+// CHECK: ret void
+
+// CHECK: define internal i32 [[TASK_ENTRY]](i32, [[KMP_TASK_TMAIN_TY]]* noalias)
+
+// CHECK: alloca i32*,
+// CHECK-DAG: [[PRIV_T_VAR_ADDR:%.+]] = alloca i32*,
+// CHECK-DAG: [[PRIV_VEC_ADDR:%.+]] = alloca [2 x i32]*,
+// CHECK-DAG: [[PRIV_S_ARR_ADDR:%.+]] = alloca [2 x [[S_INT_TY]]]*,
+// CHECK-DAG: [[PRIV_VAR_ADDR:%.+]] = alloca [[S_INT_TY]]*,
+// CHECK: store void (i8*, ...)* bitcast (void ([[PRIVATES_TMAIN_TY]]*, i32**, [2 x i32]**, [2 x [[S_INT_TY]]]**, [[S_INT_TY]]**)* [[PRIVATES_MAP_FN]] to void (i8*, ...)*), void (i8*, ...)** [[MAP_FN_ADDR:%.+]],
+// CHECK: [[MAP_FN:%.+]] = load void (i8*, ...)*, void (i8*, ...)** [[MAP_FN_ADDR]],
+// CHECK: call void (i8*, ...) [[MAP_FN]](i8* %{{.+}}, i32** [[PRIV_T_VAR_ADDR]], [2 x i32]** [[PRIV_VEC_ADDR]], [2 x [[S_INT_TY]]]** [[PRIV_S_ARR_ADDR]], [[S_INT_TY]]** [[PRIV_VAR_ADDR]])
+// CHECK: [[PRIV_T_VAR:%.+]] = load i32*, i32** [[PRIV_T_VAR_ADDR]],
+// CHECK: [[PRIV_VEC:%.+]] = load [2 x i32]*, [2 x i32]** [[PRIV_VEC_ADDR]],
+// CHECK: [[PRIV_S_ARR:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** [[PRIV_S_ARR_ADDR]],
+// CHECK: [[PRIV_VAR:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** [[PRIV_VAR_ADDR]],
+
+// Privates actually are used.
+// CHECK-DAG: [[PRIV_VAR]]
+// CHECK-DAG: [[PRIV_T_VAR]]
+// CHECK-DAG: [[PRIV_S_ARR]]
+// CHECK-DAG: [[PRIV_VEC]]
+
+// CHECK: ret
+
+// CHECK: define internal void [[TMAIN_DUP]]([[KMP_TASK_TMAIN_TY]]*, [[KMP_TASK_TMAIN_TY]]*, i32)
+// CHECK: getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* %{{.+}}, i32 0, i32 2
+// CHECK: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* %{{.+}}, i32 0, i32 2
+// CHECK: getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* %{{.+}}, i32 0, i32 0
+// CHECK: getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i64 2
+// CHECK: br label %
+
+// CHECK: phi [[S_INT_TY]]*
+// CHECK: call {{.*}} [[S_INT_TY_DEF_CONSTR]]([[S_INT_TY]]*
+// CHECK: getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i64 1
+// CHECK: icmp eq [[S_INT_TY]]* %
+// CHECK: br i1 %
+
+// CHECK: getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* %{{.+}}, i32 0, i32 3
+// CHECK: call {{.*}} [[S_INT_TY_DEF_CONSTR]]([[S_INT_TY]]*
+// CHECK: ret void
+
+// CHECK: define internal i32 [[DESTRUCTORS]](i32, [[KMP_TASK_TMAIN_TY]]* noalias)
+// CHECK: [[PRIVATES:%.+]] = getelementptr inbounds [[KMP_TASK_TMAIN_TY]], [[KMP_TASK_TMAIN_TY]]* [[RES_KMP_TASK:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 2
+// CHECK: [[PRIVATE_S_ARR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 2
+// CHECK: [[PRIVATE_VAR_REF:%.+]] = getelementptr inbounds [[PRIVATES_TMAIN_TY]], [[PRIVATES_TMAIN_TY]]* [[PRIVATES]], i{{.+}} 0, i{{.+}} 3
+// CHECK: call void [[S_INT_TY_DESTR]]([[S_INT_TY]]* [[PRIVATE_VAR_REF]])
+// CHECK: getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[PRIVATE_S_ARR_REF]], i{{.+}} 0, i{{.+}} 0
+// CHECK: getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i{{.+}} 2
+// CHECK: [[PRIVATE_S_ARR_ELEM_REF:%.+]] = getelementptr inbounds [[S_INT_TY]], [[S_INT_TY]]* %{{.+}}, i{{.+}} -1
+// CHECK: call void [[S_INT_TY_DESTR]]([[S_INT_TY]]* [[PRIVATE_S_ARR_ELEM_REF]])
+// CHECK: icmp eq
+// CHECK: br i1
+// CHECK: ret i32
+
+#endif
+#else
+// ARRAY-LABEL: array_func
+struct St {
+  int a, b;
+  St() : a(0), b(0) {}
+  St &operator=(const St &) { return *this; };
+  ~St() {}
+};
+
+void array_func(int n, float a[n], St s[2]) {
+// ARRAY: call i8* @__kmpc_omp_task_alloc(
+// ARRAY: call void @__kmpc_taskloop(
+// ARRAY: store float** %{{.+}}, float*** %{{.+}},
+// ARRAY: store %struct.St** %{{.+}}, %struct.St*** %{{.+}},
+#pragma omp taskloop simd private(a, s)
+  for (int i = 0; i < 10; ++i)
+    ;
+}
+#endif
+
diff --git a/test/OpenMP/taskloop_simd_private_messages.cpp b/test/OpenMP/taskloop_simd_private_messages.cpp
index 4a9b08a7e9e29..ba9e8da17e205 100644
--- a/test/OpenMP/taskloop_simd_private_messages.cpp
+++ b/test/OpenMP/taskloop_simd_private_messages.cpp
@@ -29,7 +29,11 @@ class S4 {
   S4(); // expected-note {{implicitly declared private here}}
 
 public:
-  S4(int v) : a(v) {}
+  S4(int v) : a(v) {
+#pragma omp taskloop simd private(a) private(this->a)
+    for (int k = 0; k < v; ++k)
+      ++this->a;
+  }
 };
 class S5 {
   int a;
@@ -37,6 +41,50 @@ class S5 {
 
 public:
   S5(int v) : a(v) {}
+  S5 &operator=(S5 &s) {
+#pragma omp taskloop simd private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T>
+class S6 {
+public:
+  T a;
+
+  S6() : a(0) {}
+  S6(T v) : a(v) {
+#pragma omp taskloop simd private(a) private(this->a)
+    for (int k = 0; k < v; ++k)
+      ++this->a;
+  }
+  S6 &operator=(S6 &s) {
+#pragma omp taskloop simd private(a) private(this->a) private(s.a) // expected-error {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a; ++k)
+      ++s.a;
+    return *this;
+  }
+};
+
+template <typename T>
+class S7 : public T {
+  T a;
+  S7() : a(0) {}
+
+public:
+  S7(T v) : a(v) {
+#pragma omp taskloop simd private(a) private(this->a) private(T::a)
+    for (int k = 0; k < a.a; ++k)
+      ++this->a.a;
+  }
+  S7 &operator=(S7 &s) {
+#pragma omp taskloop simd private(a) private(this->a) private(s.a) private(s.T::a) // expected-error 2 {{expected variable name or data member of current class}}
+    for (int k = 0; k < s.a.a; ++k)
+      ++s.a.a;
+    return *this;
+  }
 };
 
 S3 h;
@@ -126,6 +174,8 @@ using A::x;
 int main(int argc, char **argv) {
   S4 e(4);
   S5 g(5);
+  S6<float> s6(0.0) , s6_0(1.0);
+  S7<S6<float> > s7(0.0) , s7_0(1.0);
   int i;
   int &j = i;
 #pragma omp taskloop simd private // expected-error {{expected '(' after 'private'}}
@@ -190,6 +240,8 @@ int main(int argc, char **argv) {
   for(int k = 0; k < argc; ++k)
     si = k + 1;
 
-  return 0;
+  s6 = s6_0; // expected-note {{in instantiation of member function 'S6<float>::operator=' requested here}}
+  s7 = s7_0; // expected-note {{in instantiation of member function 'S7<S6<float> >::operator=' requested here}}
+  return foomain(argc, argv); // expected-note {{in instantiation of function template specialization 'foomain<int, char>' requested here}}
 }
 
diff --git a/test/OpenMP/taskloop_simd_safelen_messages.cpp b/test/OpenMP/taskloop_simd_safelen_messages.cpp
index 3182c8af80fa3..729f31407bfaa 100644
--- a/test/OpenMP/taskloop_simd_safelen_messages.cpp
+++ b/test/OpenMP/taskloop_simd_safelen_messages.cpp
@@ -1,8 +1,13 @@
 // RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
 
 void foo() {
 }
 
+#if __cplusplus >= 201103L
+// expected-note@+2 4 {{declared here}}
+#endif
 bool foobool(int argc) {
   return argc;
 }
@@ -29,14 +34,21 @@ T tmain(T argc, S **argv) { //expected-note 2 {{declared here}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp taskloop simd safelen ((ST > 0) ? 1 + ST : 2)
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
-  // expected-error@+3 2 {{directive '#pragma omp taskloop simd' cannot contain more than one 'safelen' clause}}
-  // expected-error@+2 2 {{argument to 'safelen' clause must be a strictly positive integer value}}
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+  // expected-error@+6 2 {{directive '#pragma omp taskloop simd' cannot contain more than one 'safelen' clause}}
+  // expected-error@+5 2 {{argument to 'safelen' clause must be a strictly positive integer value}}
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   #pragma omp taskloop simd safelen (foobool(argc)), safelen (true), safelen (-5)
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp taskloop simd safelen (S) // expected-error {{'S' does not refer to a value}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp taskloop simd safelen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp taskloop simd safelen (4)
@@ -57,16 +69,27 @@ int main(int argc, char **argv) {
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   #pragma omp taskloop simd safelen (2+2)) // expected-warning {{extra tokens at the end of '#pragma omp taskloop simd' are ignored}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  #pragma omp taskloop simd safelen (foobool(1) > 0 ? 1 : 2) // expected-error {{expression is not an integral constant expression}}
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+  #pragma omp taskloop simd safelen (foobool(1) > 0 ? 1 : 2)
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+3 {{expression is not an integral constant expression}}
+  // expected-error@+6 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   // expected-error@+2 2 {{directive '#pragma omp taskloop simd' cannot contain more than one 'safelen' clause}}
   // expected-error@+1 2 {{argument to 'safelen' clause must be a strictly positive integer value}}
   #pragma omp taskloop simd safelen (foobool(argc)), safelen (true), safelen (-5) 
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   #pragma omp taskloop simd safelen (S1) // expected-error {{'S1' does not refer to a value}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+1 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp taskloop simd safelen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   // expected-error@+3 {{statement after '#pragma omp taskloop simd' must be a for loop}}
diff --git a/test/OpenMP/taskloop_simd_simdlen_messages.cpp b/test/OpenMP/taskloop_simd_simdlen_messages.cpp
index ba3f20e23cfe1..79655ba9c275b 100644
--- a/test/OpenMP/taskloop_simd_simdlen_messages.cpp
+++ b/test/OpenMP/taskloop_simd_simdlen_messages.cpp
@@ -1,8 +1,13 @@
 // RUN: %clang_cc1 -verify -fopenmp %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++98 %s
+// RUN: %clang_cc1 -verify -fopenmp -std=c++11 %s
 
 void foo() {
 }
 
+#if __cplusplus >= 201103L
+// expected-note@+2 4 {{declared here}}
+#endif
 bool foobool(int argc) {
   return argc;
 }
@@ -29,14 +34,21 @@ T tmain(T argc, S **argv) { //expected-note 2 {{declared here}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp taskloop simd simdlen ((ST > 0) ? 1 + ST : 2)
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
-  // expected-error@+3 2 {{directive '#pragma omp taskloop simd' cannot contain more than one 'simdlen' clause}}
-  // expected-error@+2 2 {{argument to 'simdlen' clause must be a strictly positive integer value}}
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+  // expected-error@+6 2 {{directive '#pragma omp taskloop simd' cannot contain more than one 'simdlen' clause}}
+  // expected-error@+5 2 {{argument to 'simdlen' clause must be a strictly positive integer value}}
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   #pragma omp taskloop simd simdlen (foobool(argc)), simdlen (true), simdlen (-5)
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp taskloop simd simdlen (S) // expected-error {{'S' does not refer to a value}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
-  // expected-error@+1 2 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 2 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp taskloop simd simdlen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = ST; i < N; i++) argv[0][i] = argv[0][i] - argv[0][i-ST];
   #pragma omp taskloop simd simdlen (4)
@@ -57,16 +69,27 @@ int main(int argc, char **argv) {
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   #pragma omp taskloop simd simdlen (2+2)) // expected-warning {{extra tokens at the end of '#pragma omp taskloop simd' are ignored}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  #pragma omp taskloop simd simdlen (foobool(1) > 0 ? 1 : 2) // expected-error {{expression is not an integral constant expression}}
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+2 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
+  #pragma omp taskloop simd simdlen (foobool(1) > 0 ? 1 : 2)
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+3 {{expression is not an integral constant expression}}
+  // expected-error@+6 {{expression is not an integral constant expression}}
+#if __cplusplus >= 201103L
+  // expected-note@+4 {{non-constexpr function 'foobool' cannot be used in a constant expression}}
+#endif
   // expected-error@+2 2 {{directive '#pragma omp taskloop simd' cannot contain more than one 'simdlen' clause}}
   // expected-error@+1 2 {{argument to 'simdlen' clause must be a strictly positive integer value}}
   #pragma omp taskloop simd simdlen (foobool(argc)), simdlen (true), simdlen (-5) 
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   #pragma omp taskloop simd simdlen (S1) // expected-error {{'S1' does not refer to a value}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
-  // expected-error@+1 {{expression is not an integral constant expression}}
+#if __cplusplus <= 199711L
+  // expected-error@+4 {{expression is not an integral constant expression}}
+#else
+  // expected-error@+2 {{integral constant expression must have integral or unscoped enumeration type, not 'char *'}}
+#endif
   #pragma omp taskloop simd simdlen (argv[1]=2) // expected-error {{expected ')'}} expected-note {{to match this '('}}
   for (int i = 4; i < 12; i++) argv[0][i] = argv[0][i] - argv[0][i-4];
   // expected-error@+3 {{statement after '#pragma omp taskloop simd' must be a for loop}}
diff --git a/test/OpenMP/teams_ast_print.cpp b/test/OpenMP/teams_ast_print.cpp
index 292586ae530d9..f3d577cafcd35 100644
--- a/test/OpenMP/teams_ast_print.cpp
+++ b/test/OpenMP/teams_ast_print.cpp
@@ -109,4 +109,6 @@ int main (int argc, char **argv) {
   return tmain<int, 5>(b, &b) + tmain<long, 1>(x, &x);
 }
 
+extern template int S<int>::TS;
+extern template long S<long>::TS;
 #endif
diff --git a/test/OpenMP/teams_codegen.cpp b/test/OpenMP/teams_codegen.cpp
new file mode 100644
index 0000000000000..8aaa206abc3b4
--- /dev/null
+++ b/test/OpenMP/teams_codegen.cpp
@@ -0,0 +1,353 @@
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+// Test host codegen.
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-64
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-64
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-32
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK1 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-32
+#ifdef CK1
+
+int Gbla;
+long long Gblb;
+int &Gblc = Gbla;
+
+// CK1-LABEL: teams_argument_global_local
+int teams_argument_global_local(int a){
+  int comp = 1;
+
+  int la = 23;
+  float lc = 25.0;
+
+  // CK1: call i32 @__tgt_target_teams(i32 -1, i8* @{{[^,]+}}, i32 1, i8** %{{[^,]+}}, i8** %{{[^,]+}}, i{{64|32}}* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32 0, i32 0)
+  // CK1: call void @{{.+}}(i{{64|32}} %{{.+}})
+  #pragma omp target
+  #pragma omp teams
+  {
+    ++comp;
+  }
+
+  // CK1: call i32 @__tgt_target_teams(i32 -1, i8* @{{[^,]+}}, i32 1, i8** %{{[^,]+}}, i8** %{{[^,]+}}, i{{64|32}}* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32 0, i32 0)
+  // CK1: call void @{{.+}}(i{{64|32}} %{{.+}})
+  #pragma omp target
+  {{{
+    #pragma omp teams
+    {
+      ++comp;
+    }
+  }}}
+
+  // CK1-DAG: call i32 @__tgt_target_teams(i32 -1, i8* @{{[^,]+}}, i32 2, i8** %{{[^,]+}}, i8** %{{[^,]+}}, i{{64|32}}* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32 [[NT:%[^,]+]], i32 0)
+  // CK1-DAG: [[NT]] = load i32, i32* [[NTA:%[^,]+]],
+
+  // CK1: call void @{{.+}}(i{{64|32}} %{{.+}})
+  #pragma omp target
+  #pragma omp teams num_teams(la)
+  {
+    ++comp;
+  }
+
+  // CK1-DAG: call i32 @__tgt_target_teams(i32 -1, i8* @{{[^,]+}}, i32 2, i8** %{{[^,]+}}, i8** %{{[^,]+}}, i{{64|32}}* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32 0, i32 [[NT:%[^,]+]])
+  // CK1-DAG: [[NT]] = load i32, i32* [[NTA:%[^,]+]],
+
+  // CK1: call void @{{.+}}(i{{64|32}} %{{.+}})
+  #pragma omp target
+  #pragma omp teams thread_limit(la)
+  {
+    ++comp;
+  }
+
+  // CK1-DAG: call i32 @__tgt_target_teams(i32 -1, i8* @{{[^,]+}}, i32 5, i8** %{{[^,]+}}, i8** %{{[^,]+}}, i{{64|32}}* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32 [[NT:%[^,]+]], i32 [[TL:%[^,]+]])
+
+  // CK1-DAG: [[NT]] = add nsw i32 [[NTA:%[^,]+]], [[NTB:%[^,]+]]
+  // CK1-DAG: [[NTA]] = load i32, i32* @Gbla,
+  // CK1-DAG: [[NTB]] = load i32, i32* %{{.+}},
+
+  // CK1-DAG: [[TL]] = trunc i64 [[TLA:%[^,]+]] to i32
+  // CK1-DAG: [[TLA]] = add nsw i64 [[TLB:%[^,]+]], [[TLC:%[^,]+]]
+  // CK1-DAG: [[TLC]] = fptosi float [[TLD:%[^,]+]] to i64
+  // CK1-DAG: [[TLD]] = load float, float* %{{.+}},
+  // CK1-DAG: [[TLB]] = load i64, i64* @Gblb,
+
+  // CK1: call void @{{.+}}(i{{.+}} {{.+}}, i{{.+}} {{.+}}, i{{.+}} {{.+}}, i{{.+}} {{.+}}, i{{.+}} {{.+}})
+  #pragma omp target
+  #pragma omp teams num_teams(Gbla+a) thread_limit(Gblb+(long long)lc)
+  {
+    ++comp;
+  }
+
+  // CK1-DAG: call i32 @__tgt_target_teams(i32 -1, i8* @{{[^,]+}}, i32 {{.+}}, i8** %{{[^,]+}}, i8** %{{[^,]+}}, i{{64|32}}* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32 [[NT:%[^,]+]], i32 [[TL:%[^,]+]])
+
+  // CK1-DAG: [[NT]] = add nsw i32 [[NTA:%[^,]+]], 1
+  // CK1-DAG: [[NTA]] = load i32, i32* @Gbla,
+
+  // CK1-DAG: [[TL]] = add nsw i32 [[TLA:%[^,]+]], 2
+  // CK1-DAG: [[TLA]] = load i32, i32* @Gbla,
+
+  // CK1: call void @{{.+}}(i{{.+}} {{.+}}
+  #pragma omp target
+  #pragma omp teams num_teams(Gblc+1) thread_limit(Gblc+2)
+  {
+    comp += Gblc;
+  }
+
+  return comp;
+}
+
+#endif // CK1
+
+// Test host codegen.
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-64
+// RUN: %clang_cc1 -DCK2 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK2 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-64
+// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-32
+// RUN: %clang_cc1 -DCK2 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK2 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-32
+#ifdef CK2
+
+// CK2-DAG: [[SSI:%.+]] = type { i32, float }
+// CK2-DAG: [[SSL:%.+]] = type { i64, float }
+template <typename T>
+struct SS{
+  T a;
+  float b;
+};
+
+SS<int> Gbla;
+SS<long long> Gblb;
+
+// CK2-LABEL: teams_template_arg
+int teams_template_arg(void) {
+  int comp = 1;
+
+  SS<int> la;
+  SS<long long> lb;
+
+  // CK2-DAG: call i32 @__tgt_target_teams(i32 -1, i8* @{{[^,]+}}, i32 3, i8** %{{[^,]+}}, i8** %{{[^,]+}}, i{{64|32}}* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32 [[NT:%[^,]+]], i32 [[TL:%[^,]+]])
+
+  // CK2-DAG: [[NT]] = load i32, i32* getelementptr inbounds ([[SSI]], [[SSI]]* @Gbla, i32 0, i32 0)
+
+  // CK2-DAG: [[TL]] = trunc i64 [[TLA:%[^,]+]] to i32
+  // CK2-DAG: [[TLA]] = fptosi float [[TLB:%[^,]+]] to i64
+  // CK2-DAG: [[TLB]] = load float, float* [[TLC:%[^,]+]],
+  // CK2-DAG: [[TLC]] = getelementptr inbounds [[SSI]], [[SSI]]* %{{.+}}, i32 0, i32 1
+
+  // CK2: call void @{{.+}}({{.+}} {{.+}}, {{.+}} {{.+}}, {{.+}} {{.+}})
+  #pragma omp target
+  #pragma omp teams num_teams(Gbla.a) thread_limit((long long)la.b)
+  {
+    ++comp;
+  }
+
+  // CK2-DAG: call i32 @__tgt_target_teams(i32 -1, i8* @{{[^,]+}}, i32 3, i8** %{{[^,]+}}, i8** %{{[^,]+}}, i{{64|32}}* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32 [[NT:%[^,]+]], i32 [[TL:%[^,]+]])
+
+  // CK2-DAG: [[TL]] = trunc i64 [[TLD:%[^,]+]] to i32
+  // CK2-DAG: [[TLD]] = load i64, i64* getelementptr inbounds ([[SSL]], [[SSL]]* @Gblb, i32 0, i32 0),
+
+  // CK2-DAG: [[NT]] = trunc i64 [[NTA:%[^,]+]] to i32
+  // CK2-DAG: [[NTA]] = fptosi float [[NTB:%[^,]+]] to i64
+  // CK2-DAG: [[NTB]] = load float, float* [[NTC:%[^,]+]],
+  // CK2-DAG: [[NTC]] = getelementptr inbounds [[SSL]], [[SSL]]* %{{.+}}, i32 0, i32 1
+
+  // CK2: call void @{{.+}}({{.+}} {{.+}}, {{.+}} {{.+}}, {{.+}} {{.+}})
+  #pragma omp target
+  #pragma omp teams num_teams((long long)lb.b) thread_limit(Gblb.a)
+  {
+    ++comp;
+  }
+  return comp;
+}
+#endif // CK2
+
+// Test host codegen.
+// RUN: %clang_cc1 -DCK3 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CK3 --check-prefix CK3-64
+// RUN: %clang_cc1 -DCK3 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK3 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK3 --check-prefix CK3-64
+// RUN: %clang_cc1 -DCK3 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CK3 --check-prefix CK3-32
+// RUN: %clang_cc1 -DCK3 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DCK3 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK3 --check-prefix CK3-32
+#ifdef CK3
+
+// CK3: [[SSI:%.+]] = type { i32, float }
+// CK3-LABEL: teams_template_struct
+
+template <typename T, int X, long long Y>
+struct SS{
+  T a;
+  float b;
+
+  int foo(void) {
+    int comp = 1;
+
+    // CK3-DAG: call i32 @__tgt_target_teams(i32 -1, i8* @{{[^,]+}}, i32 2, i8** %{{[^,]+}}, i8** %{{[^,]+}}, i{{64|32}}* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32 [[NT:%[^,]+]], i32 123)
+
+    // CK3-DAG: [[NT]] = load i32, i32* [[NTA:%[^,]+]],
+    // CK3-DAG: [[NTA]] = getelementptr inbounds [[SSI]], [[SSI]]* [[NTB:%[^,]+]], i32 0, i32 0
+    // CK3-DAG: [[NTB]] = load [[SSI]]*, [[SSI]]** %{{.+}},
+
+    // CK3: call void @{{.+}}({{.+}} {{.+}}, {{.+}} {{.+}})
+    #pragma omp target
+    #pragma omp teams num_teams(a) thread_limit(X)
+    {
+      ++comp;
+    }
+
+    // CK3-DAG: call i32 @__tgt_target_teams(i32 -1, i8* @{{[^,]+}}, i32 2, i8** %{{[^,]+}}, i8** %{{[^,]+}}, i{{64|32}}* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32* {{.+}}@{{[^,]+}}, i32 0, i32 0), i32 456, i32 [[TL:%[^,]+]])
+
+    // CK3-DAG: [[TL]] = add nsw i32 [[TLA:%[^,]+]], 123
+    // CK3-DAG: [[TLA]] = fptosi float [[TLB:%[^,]+]] to i32
+    // CK3-DAG: [[TLB]] = load float, float* [[TLC:%[^,]+]],
+    // CK3-DAG: [[TLC]] = getelementptr inbounds [[SSI]], [[SSI]]* [[THIS:%[^,]+]], i32 0, i32 1
+
+    // CK3: call void @{{.+}}({{.+}} {{.+}}, {{.+}} {{.+}})
+    #pragma omp target
+    #pragma omp teams num_teams(Y) thread_limit((int)b+X)
+    {
+      ++comp;
+    }
+    return comp;
+  }
+};
+
+int teams_template_struct(void) {
+  SS<int, 123, 456> V;
+  return V.foo();
+
+}
+#endif // CK3
+
+// Test target codegen - host bc file has to be created first.
+// RUN: %clang_cc1 -DCK4 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -DCK4 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CK4 --check-prefix CK4-64
+// RUN: %clang_cc1 -DCK4 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -DCK4 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK4 --check-prefix CK4-64
+// RUN: %clang_cc1 -DCK4 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -DCK4 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CK4 --check-prefix CK4-32
+// RUN: %clang_cc1 -DCK4 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -DCK4 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK4 --check-prefix CK4-32
+
+#ifdef CK4
+
+// CK4-DAG: %ident_t = type { i32, i32, i32, i32, i8* }
+// CK4-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00"
+// CK4-DAG: [[DEF_LOC_0:@.+]] = private unnamed_addr constant %ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
+// CK4-DEBUG-DAG: [[LOC1:@.+]] = private unnamed_addr constant [{{.+}} x i8] c";{{.*}}teams_codegen.cpp;main;[[@LINE+14]];9;;\00"
+// CK4-DEBUG-DAG: [[LOC2:@.+]] = private unnamed_addr constant [{{.+}} x i8] c";{{.*}}teams_codegen.cpp;tmain;[[@LINE+7]];9;;\00"
+
+template <typename T>
+int tmain(T argc) {
+#pragma omp target
+#pragma omp teams
+  argc = 0;
+  return 0;
+}
+
+int main (int argc, char **argv) {
+#pragma omp target
+#pragma omp teams
+  argc = 0;
+  return tmain(argv);
+}
+
+// CK4:  define {{.*}}void @{{[^,]+}}(i{{.+}} %[[ARGC:.+]])
+// CK4:  [[ARGCADDR:%.+]] = alloca i{{.+}}
+// CK4:  store i{{.+}} %[[ARGC]], i{{.+}}* [[ARGCADDR]]
+// CK4-64:  [[CONV:%.+]] = bitcast i64* [[ARGCADDR]] to i32*
+// CK4-64:  call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%ident_t* [[DEF_LOC_0]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* {{.+}} to void (i32*, i32*, ...)*), i32* [[CONV]])
+// CK4-32:  call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%ident_t* [[DEF_LOC_0]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* {{.+}} to void (i32*, i32*, ...)*), i32* [[ARGCADDR]])
+// CK4:  ret void
+// CK4-NEXT: }
+
+// CK4:  define {{.*}}void @{{[^,]+}}(i8** [[ARGC1:%.+]])
+// CK4:  [[ARGCADDR1:%.+]] = alloca i8**
+// CK4:  store i8** [[ARGC1]], i8*** [[ARGCADDR1]]
+// CK4:  call {{.*}}void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%ident_t* [[DEF_LOC_0]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i8***)* {{.+}} to void (i32*, i32*, ...)*), i8*** [[ARGCADDR1]])
+
+
+#endif // CK4
+
+// Test target codegen - host bc file has to be created first.
+// RUN: %clang_cc1 -DCK5 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -DCK5 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CK5 --check-prefix CK5-64
+// RUN: %clang_cc1 -DCK5 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t %s
+// RUN: %clang_cc1 -DCK5 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK5 --check-prefix CK5-64
+// RUN: %clang_cc1 -DCK5 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -DCK5 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CK5 --check-prefix CK5-32
+// RUN: %clang_cc1 -DCK5 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o %t %s
+// RUN: %clang_cc1 -DCK5 -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CK5 --check-prefix CK5-32
+
+// expected-no-diagnostics
+#ifdef CK5
+
+// CK5-DAG: %ident_t = type { i32, i32, i32, i32, i8* }
+// CK5-DAG: [[STR:@.+]] = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00"
+// CK5-DAG: [[DEF_LOC_0:@.+]] = private unnamed_addr constant %ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* [[STR]], i32 0, i32 0) }
+// CK5-DEBUG-DAG: [[LOC1:@.+]] = private unnamed_addr constant [{{.+}} x i8] c";{{.*}}teams_codegen.cpp;main;[[@LINE+14]];9;;\00"
+// CK5-DEBUG-DAG: [[LOC2:@.+]] = private unnamed_addr constant [{{.+}} x i8] c";{{.*}}teams_codegen.cpp;tmain;[[@LINE+7]];9;;\00"
+
+template <typename T>
+int tmain(T argc) {
+  int a = 10;
+  int b = 5;
+#pragma omp target
+#pragma omp teams num_teams(a) thread_limit(b)
+  {
+  argc = 0;
+  }
+  return 0;
+}
+
+int main (int argc, char **argv) {
+  int a = 20;
+  int b = 5;
+#pragma omp target
+#pragma omp teams num_teams(a) thread_limit(b)
+  {
+  argc = 0;
+  }
+  return tmain(argv);
+}
+
+// CK5:  define {{.*}}void @{{[^,]+}}(i{{.+}} [[AP:%.+]], i{{.+}} [[BP:%.+]], i{{.+}} [[ARGC:.+]])
+// CK5:  [[AADDR:%.+]] = alloca i{{.+}}
+// CK5:  [[BADDR:%.+]] = alloca i{{.+}}
+// CK5:  [[ARGCADDR:%.+]] = alloca i{{.+}}
+// CK5:  [[GBL_TH_NUM:%.+]] = call i32 @__kmpc_global_thread_num(%ident_t* [[DEF_LOC_0]])
+// CK5:  store i{{.+}} [[AP]], i{{.+}}* [[AADDR]]
+// CK5:  store i{{.+}} [[BP]], i{{.+}}* [[BADDR]]
+// CK5:  store i{{.+}} [[ARGC]], i{{.+}}* [[ARGCADDR]]
+// CK5-64:  [[ACONV:%.+]] = bitcast i64* [[AADDR]] to i32*
+// CK5-64:  [[BCONV:%.+]] = bitcast i64* [[BADDR]] to i32*
+// CK5-64:  [[CONV:%.+]] = bitcast i64* [[ARGCADDR]] to i32*
+// CK5-64:  [[ACONVVAL:%.+]] = load i32, i32* [[ACONV]]
+// CK5-64:  [[BCONVVAL:%.+]] = load i32, i32* [[BCONV]]
+// CK5-32:  [[ACONVVAL:%.+]] = load i32, i32* [[AADDR]]
+// CK5-32:  [[BCONVVAL:%.+]] = load i32, i32* [[BADDR]]
+// CK5:  {{.+}} = call i32 @__kmpc_push_num_teams(%ident_t* [[DEF_LOC_0]], i32 [[GBL_TH_NUM]], i32 [[ACONVVAL]], i32 [[BCONVVAL]])
+// CK5-64:  call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%ident_t* [[DEF_LOC_0]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined. to void (i32*, i32*, ...)*), i32* [[CONV]])
+// CK5-32:  call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%ident_t* [[DEF_LOC_0]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined. to void (i32*, i32*, ...)*), i32* [[ARGCADDR]])
+
+// CK5:  define {{.*}}void @{{[^,]+}}(i{{.+}} [[AP:%.+]], i{{.+}} [[BP:%.+]], i{{.+}}** [[ARGC:%.+]])
+// CK5:  [[AADDR:%.+]] = alloca i{{.+}}
+// CK5:  [[BADDR:%.+]] = alloca i{{.+}}
+// CK5:  [[ARGCADDR:%.+]] = alloca i{{.+}}**
+// CK5:  [[GBL_TH_NUM:%.+]] = call i32 @__kmpc_global_thread_num(%ident_t* [[DEF_LOC_0]])
+// CK5:  store i{{.+}} [[AP]], i{{.+}}* [[AADDR]]
+// CK5:  store i{{.+}} [[BP]], i{{.+}}* [[BADDR]]
+// CK5:  store i{{.+}}** [[ARGC]], i{{.+}}*** [[ARGCADDR]]
+// CK5-64:  [[ACONV:%.+]] = bitcast i64* [[AADDR]] to i32*
+// CK5-64:  [[BCONV:%.+]] = bitcast i64* [[BADDR]] to i32*
+// CK5-64:  [[ACONVVAL:%.+]] = load i32, i32* [[ACONV]]
+// CK5-64:  [[BCONVVAL:%.+]] = load i32, i32* [[BCONV]]
+// CK5-64:  {{.+}} = call i32 @__kmpc_push_num_teams(%ident_t* [[DEF_LOC_0]], i32 [[GBL_TH_NUM]], i32 [[ACONVVAL]], i32 [[BCONVVAL]])
+// CK5-32:  [[A_VAL:%.+]] = load i32, i32* [[AADDR]]
+// CK5-32:  [[B_VAL:%.+]] = load i32, i32* [[BADDR]]
+// CK5-32:  {{.+}} = call i32 @__kmpc_push_num_teams(%ident_t* [[DEF_LOC_0]], i32 [[GBL_TH_NUM]], i32 [[A_VAL]], i32 [[B_VAL]])
+// CK5:  call void (%ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%ident_t* [[DEF_LOC_0]], i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i{{.+}})* @.omp_outlined.{{.+}} to void (i32*, i32*, ...)*), i{{.+}}*** [[ARGCADDR]])
+// CK5:  ret void
+// CK5-NEXT: }
+
+#endif // CK5
+#endif
diff --git a/test/OpenMP/teams_firstprivate_codegen.cpp b/test/OpenMP/teams_firstprivate_codegen.cpp
new file mode 100644
index 0000000000000..3248bfeebd91f
--- /dev/null
+++ b/test/OpenMP/teams_firstprivate_codegen.cpp
@@ -0,0 +1,283 @@
+// Test host codegen.
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-64
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-64
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-32
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-32
+
+// RUN: %clang_cc1  -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1  -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1  -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1  -verify -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// RUN: %clang_cc1  -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1  -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+
+// RUN: %clang_cc1 -DARRAY  -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix ARRAY --check-prefix ARRAY-64
+// RUN: %clang_cc1 -DARRAY  -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DARRAY  -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix ARRAY --check-prefix ARRAY-64
+// RUN: %clang_cc1 -DARRAY  -verify -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix ARRAY --check-prefix ARRAY-32
+// RUN: %clang_cc1 -DARRAY  -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DARRAY  -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix ARRAY --check-prefix ARRAY-32
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+#ifndef ARRAY
+struct St {
+  int a, b;
+  St() : a(0), b(0) {}
+  St(const St &st) : a(st.a + st.b), b(0) {}
+  ~St() {}
+};
+
+volatile int g __attribute__((aligned(128))) = 1212;
+
+template <class T>
+struct S {
+  T f;
+  S(T a) : f(a + g) {}
+  S() : f(g) {}
+  S(const S &s, St t = St()) : f(s.f + t.a) {}
+  operator T() { return T(); }
+  ~S() {}
+};
+
+// CHECK-DAG: [[S_FLOAT_TY:%.+]] = type { float }
+// CHECK-DAG: [[S_INT_TY:%.+]] = type { i{{[0-9]+}} }
+// CHECK-DAG: [[ST_TY:%.+]] = type { i{{[0-9]+}}, i{{[0-9]+}} }
+
+template <typename T>
+T tmain() {
+  S<T> test;
+  T t_var __attribute__((aligned(128))) = T();
+  T vec[] __attribute__((aligned(128))) = {1, 2};
+  S<T> s_arr[] __attribute__((aligned(128))) = {1, 2};
+  S<T> var __attribute__((aligned(128))) (3);
+  #pragma omp target
+  #pragma omp teams firstprivate(t_var, vec, s_arr, var)
+  {
+    vec[0] = t_var;
+    s_arr[0] = var;
+  }
+#pragma omp target
+#pragma omp teams firstprivate(t_var)
+  {}
+  return T();
+}
+
+int main() {
+  static int sivar;
+#ifdef LAMBDA
+  // LAMBDA-LABEL: @main
+  // LAMBDA: call{{.*}} void [[OUTER_LAMBDA:@.+]](
+  [&]() {    
+  // LAMBDA: define{{.*}} internal{{.*}} void [[OUTER_LAMBDA]](
+  // LAMBDA: call {{.*}}void {{.+}} @__kmpc_fork_teams({{.+}}, i32 2, {{.+}}* [[OMP_REGION:@.+]] to {{.+}}, i32* {{.+}}, {{.+}})    
+  #pragma omp target
+  #pragma omp teams firstprivate(g, sivar)
+  {
+    // LAMBDA: define{{.*}} internal{{.*}} void [[OMP_REGION]](i32* noalias %{{.+}}, i32* noalias %{{.+}}, i32* dereferenceable(4) [[G_IN:%.+]], i{{64|32}} {{.*}}[[SIVAR_IN:%.+]])
+    // LAMBDA: store i{{[0-9]+}}* [[G_IN]], i{{[0-9]+}}** [[G_ADDR:%.+]],
+    // LAMBDA: store i{{[0-9]+}} [[SIVAR_IN]], i{{[0-9]+}}* [[SIVAR_ADDR:%.+]],
+    // LAMBDA: [[G_ADDR_VAL:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[G_ADDR]],
+    // LAMBDA-64: [[SIVAR_CONV:%.+]] = bitcast i64*  [[SIVAR_ADDR]] to i32*
+    // LAMBDA: [[G_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[G_ADDR_VAL]],
+    // LAMBDA: store i{{[0-9]+}} [[G_VAL]], i{{[0-9]+}}* [[G_LOCAL:%.+]],
+    g = 1;
+    sivar = 2;
+    // LAMBDA: store i{{[0-9]+}} 1, i{{[0-9]+}}* [[G_LOCAL]],
+    // LAMBDA-64: store i{{[0-9]+}} 2, i{{[0-9]+}}* [[SIVAR_CONV]],
+    // LAMBDA-32: store i{{[0-9]+}} 2, i{{[0-9]+}}* [[SIVAR_ADDR]],
+    // LAMBDA: [[G_PRIVATE_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+    // LAMBDA: store i{{[0-9]+}}* [[G_LOCAL]], i{{[0-9]+}}** [[G_PRIVATE_ADDR_REF]]
+    // LAMBDA: [[SIVAR_PRIVATE_ADDR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG:%.+]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+    // LAMBDA-64: store i{{[0-9]+}}* [[SIVAR_CONV]], i{{[0-9]+}}** [[SIVAR_PRIVATE_ADDR_REF]]
+    // LAMBDA-32: store i{{[0-9]+}}* [[SIVAR_ADDR]], i{{[0-9]+}}** [[SIVAR_PRIVATE_ADDR_REF]]
+    // LAMBDA: call{{.*}} void [[INNER_LAMBDA:@.+]](%{{.+}}* [[ARG]])
+    [&]() {
+      // LAMBDA: define {{.+}} void [[INNER_LAMBDA]](%{{.+}}* [[ARG_PTR:%.+]])
+      // LAMBDA: store %{{.+}}* [[ARG_PTR]], %{{.+}}** [[ARG_PTR_REF:%.+]],
+      g = 2;
+      sivar = 4;
+      // LAMBDA: [[ARG_PTR:%.+]] = load %{{.+}}*, %{{.+}}** [[ARG_PTR_REF]]
+      // LAMBDA: [[G_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+      // LAMBDA: [[G_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[G_PTR_REF]]
+      // LAMBDA: [[SIVAR_PTR_REF:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* [[ARG_PTR]], i{{[0-9]+}} 0, i{{[0-9]+}} 1
+      // LAMBDA: [[SIVAR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[SIVAR_PTR_REF]]
+      // LAMBDA: store i{{[0-9]+}} 4, i{{[0-9]+}}* [[SIVAR_REF]]
+    }();
+  }
+  }();
+  return 0;
+#else
+  S<float> test;
+  int t_var = 0;
+  int vec[] = {1, 2};
+  S<float> s_arr[] = {1, 2};
+  S<float> var(3);
+  #pragma omp target
+  #pragma omp teams firstprivate(t_var, vec, s_arr, var, sivar)
+  {
+    vec[0] = t_var;
+    s_arr[0] = var;
+    sivar = 2;
+  }
+  #pragma omp target
+  #pragma omp teams firstprivate(t_var)
+  {}
+  return tmain<int>();
+#endif
+}
+
+// CHECK: define internal {{.*}}void [[OMP_OFFLOADING:@.+]](
+// CHECK: call {{.*}}void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_teams(%{{.+}}* @{{.+}}, i{{[0-9]+}} 5, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [2 x i32]*, i{{32|64}}, [2 x [[S_FLOAT_TY]]]*, [[S_FLOAT_TY]]*, i{{[0-9]+}})* [[OMP_OUTLINED:@.+]] to void
+// CHECK: ret
+//
+// CHECK: define internal {{.*}}void [[OMP_OUTLINED]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [2 x i32]* dereferenceable(8) %{{.+}}, i{{32|64}} {{.*}}%{{.+}}, [2 x [[S_FLOAT_TY]]]* dereferenceable(8) %{{.+}}, [[S_FLOAT_TY]]* dereferenceable(4) %{{.+}}, i{{32|64}} {{.*}}[[SIVAR:%.+]])
+// CHECK: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[SIVAR7_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}],
+// CHECK: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_FLOAT_TY]]],
+// CHECK: [[VAR_PRIV:%.+]] = alloca [[S_FLOAT_TY]],
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
+
+// CHECK: [[VEC_REF:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** %
+// CHECK-64: [[T_VAR_CONV:%.+]] = bitcast i64* [[T_VAR_PRIV]] to i32*
+// CHECK: [[S_ARR_REF:%.+]] = load [2 x [[S_FLOAT_TY]]]*, [2 x [[S_FLOAT_TY]]]** %
+// CHECK: [[VAR_REF:%.+]] = load [[S_FLOAT_TY]]*, [[S_FLOAT_TY]]** %
+// CHECK-64: [[SIVAR7_CONV:%.+]] = bitcast i64* [[SIVAR7_PRIV]] to i32*
+// CHECK: [[VEC_DEST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
+// CHECK: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_REF]] to i8*
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST]], i8* [[VEC_SRC]],
+// CHECK: [[S_ARR_PRIV_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_FLOAT_TY]]], [2 x [[S_FLOAT_TY]]]* [[S_ARR_PRIV]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[S_ARR_BEGIN:%.+]] = bitcast [2 x [[S_FLOAT_TY]]]* [[S_ARR_REF]] to [[S_FLOAT_TY]]*
+// CHECK: [[S_ARR_PRIV_END:%.+]] = getelementptr [[S_FLOAT_TY]], [[S_FLOAT_TY]]* [[S_ARR_PRIV_BEGIN]], i{{[0-9]+}} 2
+// CHECK: [[IS_EMPTY:%.+]] = icmp eq [[S_FLOAT_TY]]* [[S_ARR_PRIV_BEGIN]], [[S_ARR_PRIV_END]]
+// CHECK: br i1 [[IS_EMPTY]], label %[[S_ARR_BODY_DONE:.+]], label %[[S_ARR_BODY:.+]]
+// CHECK: [[S_ARR_BODY]]
+// CHECK: call {{.*}} [[ST_TY_DEFAULT_CONSTR:@.+]]([[ST_TY]]* [[ST_TY_TEMP:%.+]])
+// CHECK: call {{.*}} [[S_FLOAT_TY_COPY_CONSTR:@.+]]([[S_FLOAT_TY]]* {{.+}}, [[S_FLOAT_TY]]* {{.+}}, [[ST_TY]]* [[ST_TY_TEMP]])
+// CHECK: call {{.*}} [[ST_TY_DESTR:@.+]]([[ST_TY]]* [[ST_TY_TEMP]])
+// CHECK: br i1 {{.+}}, label %{{.+}}, label %[[S_ARR_BODY]]
+// CHECK: call {{.*}} [[ST_TY_DEFAULT_CONSTR]]([[ST_TY]]* [[ST_TY_TEMP:%.+]])
+// CHECK: call {{.*}} [[S_FLOAT_TY_COPY_CONSTR]]([[S_FLOAT_TY]]* [[VAR_PRIV]], [[S_FLOAT_TY]]* {{.*}} [[VAR_REF]], [[ST_TY]]* [[ST_TY_TEMP]])
+// CHECK: call {{.*}} [[ST_TY_DESTR]]([[ST_TY]]* [[ST_TY_TEMP]])
+
+// CHECK-64: store i{{[0-9]+}} 2, i{{[0-9]+}}* [[SIVAR7_CONV]],
+// CHECK-32: store i{{[0-9]+}} 2, i{{[0-9]+}}* [[SIVAR7_PRIV]],
+
+// CHECK-DAG: call {{.*}} [[S_FLOAT_TY_DESTR:@.+]]([[S_FLOAT_TY]]* [[VAR_PRIV]])
+// CHECK-DAG: call {{.*}} [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]*
+// CHECK: ret void
+
+// CHECK: define internal {{.*}}void [[OMP_OFFLOADING_1:@.+]](
+// CHECK: call {{.*}}void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_teams(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, i{{[0-9]+}})* [[OMP_OUTLINED_1:@.+]] to void
+// CHECK: ret
+
+// CHECK: define internal {{.*}}void [[OMP_OUTLINED_1]](i{{[0-9]+}}* noalias {{%.+}}, i{{[0-9]+}}* noalias {{%.+}}, i{{32|64}} {{.*}}[[T_VAR:%.+]])
+// CHECK: [[T_VAR_LOC:%.+]] = alloca i{{[0-9]+}},
+// CHECK: store i{{[0-9]+}} [[T_VAR]], i{{[0-9]+}}* [[T_VAR_LOC]],
+// CHECK: ret
+
+// CHECK: define internal {{.*}}void [[OMP_OFFLOADING_2:@.+]](i{{[0-9]+}}* {{.+}} {{%.+}}, [2 x i32]* {{.+}} {{%.+}}, [2 x [[S_INT_TY]]]* {{.+}} {{%.+}}, [[S_INT_TY]]* {{.+}} {{%.+}})
+// CHECK: call {{.*}}void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_teams(%{{.+}}* @{{.+}}, i{{[0-9]+}} 4, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [2 x i32]*, i32*, [2 x [[S_INT_TY]]]*, [[S_INT_TY]]*)* [[OMP_OUTLINED_2:@.+]] to void
+// CHECK: ret
+
+//
+// CHECK: define internal {{.*}}void [[OMP_OUTLINED_2]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, [2 x i32]* dereferenceable(8) %{{.+}}, i32* dereferenceable(4) %{{.+}}, [2 x [[S_INT_TY]]]* dereferenceable(8) %{{.+}}, [[S_INT_TY]]* dereferenceable(4) %{{.+}})
+// CHECK: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}}, align 128
+// CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}], align 128
+// CHECK: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_INT_TY]]], align 128
+// CHECK: [[VAR_PRIV:%.+]] = alloca [[S_INT_TY]], align 128
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_ADDR:%.+]],
+
+// CHECK: [[VEC_REF:%.+]] = load [2 x i{{[0-9]+}}]*, [2 x i{{[0-9]+}}]** %
+// CHECK: [[T_VAR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** %
+// CHECK: [[S_ARR_REF:%.+]] = load [2 x [[S_INT_TY]]]*, [2 x [[S_INT_TY]]]** %
+// CHECK: [[VAR_REF:%.+]] = load [[S_INT_TY]]*, [[S_INT_TY]]** %
+
+// CHECK: [[T_VAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR_REF]], align 128
+// CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_PRIV]], align 128
+// CHECK: [[VEC_DEST:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_PRIV]] to i8*
+// CHECK: [[VEC_SRC:%.+]] = bitcast [2 x i{{[0-9]+}}]* [[VEC_REF]] to i8*
+// CHECK: call void @llvm.memcpy.{{.+}}(i8* [[VEC_DEST]], i8* [[VEC_SRC]], i{{[0-9]+}} {{[0-9]+}}, i{{[0-9]+}} 128,
+// CHECK: [[S_ARR_PRIV_BEGIN:%.+]] = getelementptr inbounds [2 x [[S_INT_TY]]], [2 x [[S_INT_TY]]]* [[S_ARR_PRIV]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
+// CHECK: [[S_ARR_BEGIN:%.+]] = bitcast [2 x [[S_INT_TY]]]* [[S_ARR_REF]] to [[S_INT_TY]]*
+// CHECK: [[S_ARR_PRIV_END:%.+]] = getelementptr [[S_INT_TY]], [[S_INT_TY]]* [[S_ARR_PRIV_BEGIN]], i{{[0-9]+}} 2
+// CHECK: [[IS_EMPTY:%.+]] = icmp eq [[S_INT_TY]]* [[S_ARR_PRIV_BEGIN]], [[S_ARR_PRIV_END]]
+// CHECK: br i1 [[IS_EMPTY]], label %[[S_ARR_BODY_DONE:.+]], label %[[S_ARR_BODY:.+]]
+// CHECK: [[S_ARR_BODY]]
+// CHECK: call {{.*}} [[ST_TY_DEFAULT_CONSTR]]([[ST_TY]]* [[ST_TY_TEMP:%.+]])
+// CHECK: call {{.*}} [[S_INT_TY_COPY_CONSTR:@.+]]([[S_INT_TY]]* {{.+}}, [[S_INT_TY]]* {{.+}}, [[ST_TY]]* [[ST_TY_TEMP]])
+// CHECK: call {{.*}} [[ST_TY_DESTR:@.+]]([[ST_TY]]* [[ST_TY_TEMP]])
+// CHECK: br i1 {{.+}}, label %{{.+}}, label %[[S_ARR_BODY]]
+// CHECK: call {{.*}} [[ST_TY_DEFAULT_CONSTR]]([[ST_TY]]* [[ST_TY_TEMP:%.+]])
+// CHECK: call {{.*}} [[S_INT_TY_COPY_CONSTR]]([[S_INT_TY]]* [[VAR_PRIV]], [[S_INT_TY]]* {{.*}} [[VAR_REF]], [[ST_TY]]* [[ST_TY_TEMP]])
+// CHECK: call {{.*}} [[ST_TY_DESTR]]([[ST_TY]]* [[ST_TY_TEMP]])
+// CHECK-DAG: call {{.*}} [[S_INT_TY_DESTR:@.+]]([[S_INT_TY]]* [[VAR_PRIV]])
+// CHECK-DAG: call {{.*}} [[S_INT_TY_DESTR]]([[S_INT_TY]]*
+// CHECK: ret void
+
+// CHECK: define internal {{.*}}void [[OMP_OFFLOADING_3:@.+]](
+// CHECK: call {{.*}}void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_teams(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, i{{[0-9]+}}*)* [[OMP_OUTLINED_3:@.+]] to void
+// CHECK: ret
+
+// CHECK: define internal {{.*}}void [[OMP_OUTLINED_3]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}}, i32* dereferenceable(4) [[T_VAR:%.+]])
+// CHECK: [[T_VAR_LOC:%.+]] = alloca i{{[0-9]+}},
+// CHECK: store i{{[0-9]+}}* [[T_VAR]], i{{[0-9]+}}** [[T_VAR_ADDR:%.+]],
+// CHECK: [[T_VAR_REF:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[T_VAR_ADDR]],
+// CHECK: [[T_VAR_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[T_VAR_REF]],
+// CHECK: store i{{[0-9]+}} [[T_VAR_VAL]], i{{[0-9]+}}* [[T_VAR_LOC]],
+// CHECK: ret
+
+#else
+struct St {
+  int a, b;
+  St() : a(0), b(0) {}
+  St(const St &) { }
+  ~St() {}
+  void St_func(St s[2], int n, long double vla1[n]) {
+    double vla2[n][n] __attribute__((aligned(128)));
+    a = b;
+    #pragma omp target
+    #pragma omp teams firstprivate(s, vla1, vla2)
+    vla1[b] = vla2[1][n - 1] = a = b;
+  }
+};
+
+void array_func(float a[3], St s[2], int n, long double vla1[n]) {
+  double vla2[n][n] __attribute__((aligned(128)));
+// ARRAY: call {{.+}} @__kmpc_fork_teams(
+// ARRAY-DAG: [[PRIV_S:%.+]] = alloca %struct.St*,
+// ARRAY-64-DAG: [[PRIV_VLA1:%.+]] = alloca ppc_fp128*,
+// ARRAY-32-DAG: [[PRIV_VLA1:%.+]] = alloca x86_fp80*,
+// ARRAY-DAG: [[PRIV_A:%.+]] = alloca float*,
+// ARRAY-DAG: [[PRIV_VLA2:%.+]] = alloca double*,
+// ARRAY-DAG: store float* %{{.+}}, float** [[PRIV_A]],
+// ARRAY-DAG: store %struct.St* %{{.+}}, %struct.St** [[PRIV_S]],
+// ARRAY-64-DAG: store ppc_fp128* %{{.+}}, ppc_fp128** [[PRIV_VLA1]],
+// ARRAY-32-DAG: store x86_fp80* %{{.+}}, x86_fp80** [[PRIV_VLA1]],
+// ARRAY-DAG: store double* %{{.+}}, double** [[PRIV_VLA2]],
+// ARRAY: call i8* @llvm.stacksave()
+// ARRAY: [[SIZE:%.+]] = mul nuw i{{[0-9]+}} %{{.+}}, 8
+// ARRAY: call void @llvm.memcpy.p0i8.p0i8.i{{[0-9]+}}(i8* %{{.+}}, i8* %{{.+}}, i{{[0-9]+}} [[SIZE]], i32 128, i1 false)
+  #pragma omp target
+  #pragma omp teams firstprivate(a, s, vla1, vla2)
+  s[0].St_func(s, n, vla1);
+  ;
+}
+
+// ARRAY: @__kmpc_fork_teams(
+// ARRAY-DAG: [[PRIV_S:%.+]] = alloca %struct.St*,
+// ARRAY-64-DAG: [[PRIV_VLA1:%.+]] = alloca ppc_fp128*,
+// ARRAY-32-DAG: [[PRIV_VLA1:%.+]] = alloca x86_fp80*,
+// ARRAY-DAG: [[PRIV_VLA2:%.+]] = alloca double*,
+// ARRAY-DAG: store %struct.St* %{{.+}}, %struct.St** [[PRIV_S]],
+// ARRAY-64-DAG: store ppc_fp128* %{{.+}}, ppc_fp128** [[PRIV_VLA1]],
+// ARRAY-32-DAG: store x86_fp80* %{{.+}}, x86_fp80** [[PRIV_VLA1]],
+// ARRAY-DAG: store double* %{{.+}}, double** [[PRIV_VLA2]],
+// ARRAY: call i8* @llvm.stacksave()
+// ARRAY: [[SIZE:%.+]] = mul nuw i{{[0-9]+}} %{{.+}}, 8
+// ARRAY: call void @llvm.memcpy.p0i8.p0i8.i{{[0-9]+}}(i8* %{{.+}}, i8* %{{.+}}, i{{[0-9]+}} [[SIZE]], i32 128, i1 false)
+#endif
+#endif
diff --git a/test/OpenMP/teams_private_codegen.cpp b/test/OpenMP/teams_private_codegen.cpp
new file mode 100644
index 0000000000000..1ba010fa5c3cc
--- /dev/null
+++ b/test/OpenMP/teams_private_codegen.cpp
@@ -0,0 +1,298 @@
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-64
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-64
+// RUN: %clang_cc1 -DLAMBDA -verify -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-32
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1 -DLAMBDA -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix LAMBDA --check-prefix LAMBDA-32
+
+// RUN: %clang_cc1  -verify -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1  -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1  -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1  -verify -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// RUN: %clang_cc1  -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -emit-pch -o %t %s
+// RUN: %clang_cc1  -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -fopenmp-targets=i386-pc-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+template <class T>
+struct S {
+  T f;
+  S(T a) : f(a) {}
+  S() : f() {}
+  operator T() { return T(); }
+  ~S() {}
+};
+
+volatile int g __attribute__((aligned(128))) = 1212;
+
+struct SS {
+  int a;
+  int b : 4;
+  int &c;
+  SS(int &d) : a(0), b(0), c(d) {
+#pragma omp target
+#pragma omp teams private(a, b, c)
+#ifdef LAMBDA
+    [&]() {
+      ++this->a, --b, (this)->c /= 1;
+    }();
+#else
+    ++this->a, --b, c /= 1;
+#endif
+  }
+};
+
+template<typename T>
+struct SST {
+  T a;
+  SST() : a(T()) {
+#pragma omp target
+#pragma omp teams private(a)
+#ifdef LAMBDA
+    [&]() {
+      [&]() {
+        ++this->a;
+      }();
+    }();
+#else
+    ++(this)->a;
+#endif
+  }
+};
+
+// CHECK: [[SS_TY:%.+]] = type { i{{[0-9]+}}, i8
+// LAMBDA: [[SS_TY:%.+]] = type { i{{[0-9]+}}, i8
+// LAMBDA: [[CAP_0_TY:%.+]] = type { [[SS_TY]]*, i{{[0-9]+}}*,
+// LAMBDA: [[CAP_1_TY:%.+]] = type { i{{[0-9]+}}*, i{{[0-9]+}}* }
+// CHECK: [[S_FLOAT_TY:%.+]] = type { float }
+// CHECK: [[S_INT_TY:%.+]] = type { i{{[0-9]+}} }
+// CHECK: [[SST_TY:%.+]] = type { i{{[0-9]+}} }
+template <typename T>
+T tmain() {
+  S<T> test;
+  SST<T> sst;
+  T t_var __attribute__((aligned(128))) = T();
+  T vec[] __attribute__((aligned(128))) = {1, 2};
+  S<T> s_arr[] __attribute__((aligned(128))) = {1, 2};
+  S<T> var __attribute__((aligned(128))) (3);
+#pragma omp target
+#pragma omp teams private(t_var, vec, s_arr, var)
+  {
+    vec[0] = t_var;
+    s_arr[0] = var;
+  }
+  return T();
+}
+
+int main() {
+  static int sivar;
+  SS ss(sivar);
+#ifdef LAMBDA
+  // LAMBDA: [[G:@.+]] = global i{{[0-9]+}} 1212,
+  // LAMBDA: define {{.+}} @main()
+  // LAMBDA: alloca [[SS_TY]],
+  // LAMBDA: alloca [[CAP_TY:%.+]],
+
+  // LAMBDA: call{{.*}} [[ST_CONSTR_INIT:@.+]]([[SS_TY]]*
+  // LAMBDA: call{{.*}} void [[OUTER_LAMBDA:@[^(]+]]([[CAP_TY]]*
+
+  // lambda and target region in main
+  // LAMBDA: define {{.+}} [[OUTER_LAMBDA]]([[CAP_TY]]* {{.+}})
+  // LAMBDA: call void @[[OMP_OFFLOADING:.+]](i{{[0-9]+}}* {{.+}}, i{{[0-9]+}} {{.+}}
+
+  // target region in struct constructor
+  // LAMBDA: define{{.*}} void [[ST_CONSTR:@.+]]([[SS_TY]]* %this,
+  // LAMBDA: call void [[OMP_OFFLOADING_1:@.+]]([[SS_TY]]
+
+  // offloading function in struct constructor
+  // LAMBDA: define{{.*}} void [[OMP_OFFLOADING_1]]([[SS_TY]]
+  // LAMBDA: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_teams(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[SS_TY]]*)* [[OMP_OUTLINED:@.+]] to void
+
+  // outlined teams region in struct constructor
+  // LAMBDA: define{{.*}} void [[OMP_OUTLINED]](i{{[0-9]+}}* {{.+}}, i{{[0-9]+}}* {{.+}}, [[SS_TY]]*
+  // LAMBDA: [[THIS_ADDR:%.+]] = alloca [[SS_TY]]*,
+  // LAMBDA: [[A_PRIV:%.+]] = alloca i{{[0-9]+}},
+  // LAMBDA: [[B_PRIV:%.+]] = alloca i{{[0-9]+}},
+  // LAMBDA: [[C_PRIV:%.+]] = alloca i{{[0-9]+}},
+  // LAMBDA: [[THIS_REF:%.+]] = load [[SS_TY]]*, [[SS_TY]]** [[THIS_ADDR]],
+  // LAMBDA: store i{{[0-9]+}}* [[A_PRIV]], i{{[0-9]+}}** [[A_TMP_REF:%.+]],
+  // LAMBDA: store i{{[0-9]+}}* [[C_PRIV]], i{{[0-9]+}}** [[C_TMP_REF:%.+]],
+  // LAMBDA: [[CAP_THIS_REF:%.+]] = getelementptr {{.+}} [[CAP_0_TY]], [[CAP_0_TY]]* {{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
+  // LAMBDA: store [[SS_TY]]* [[THIS_REF]], [[SS_TY]]** [[CAP_THIS_REF]],
+  // LAMBDA: [[CAP_A_REF:%.+]] = getelementptr {{.+}} [[CAP_0_TY]], [[CAP_0_TY]]* {{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 1
+  // LAMBDA: [[A_TMP_VAL:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[A_TMP_REF]],
+  // LAMBDA: store i{{[0-9]+}}* [[A_TMP_VAL]], i{{[0-9]+}}** [[CAP_A_REF]],
+  // LAMBDA: [[CAP_B_REF:%.+]] = getelementptr {{.+}} [[CAP_0_TY]], [[CAP_0_TY]]* {{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 2
+  // LAMBDA: store i{{[0-9]+}}* [[B_PRIV]], i{{[0-9]+}}** [[CAP_B_REF]],
+  // LAMBDA: [[CAP_C_REF:%.+]] = getelementptr {{.+}} [[CAP_0_TY]], [[CAP_0_TY]]* {{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 3
+  // LAMBDA: [[C_TMP_VAL:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[C_TMP_REF]],
+  // LAMBDA: store i{{[0-9]+}}* [[C_TMP_VAL]], i{{[0-9]+}}** [[CAP_C_REF]],
+  // call void [[INNER_LAMBDA_CONSTR:@.+]]([[CAP_0_TY]]*
+  
+  // inner lambda in struct constructor
+  // define{{.*}} void [[INNER_LAMBDA_CONSTR]]([[CAP_0_TY]]*
+  // LAMBDA: [[CAP_A_REF_1:%.+]] = getelementptr {{.+}} [[CAP_0_TY]], [[CAP_0_TY]]* {{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 1
+  // LAMBDA: [[A_REF_FROM_CAP:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[CAP_A_REF_1]],
+  // LAMBDA: [[A_VAL_FROM_CAP:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[A_REF_FROM_CAP]],
+  // LAMBDA: [[A_INC_VAL:%.+]] = add {{.+}} i{{[0-9]+}} [[A_VAL_FROM_CAP]], 1
+  // LAMBDA: store i{{[0-9]+}} [[A_INC_VAL]], i{{[0-9]+}}* [[A_REF_FROM_CAP]],
+
+  // LAMBDA: [[CAP_B_REF_1:%.+]] = getelementptr {{.+}} [[CAP_0_TY]], [[CAP_0_TY]]* {{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 2
+  // LAMBDA: [[B_REF_FROM_CAP:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[CAP_B_REF_1]],
+  // LAMBDA: [[B_VAL_FROM_CAP:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[B_REF_FROM_CAP]],
+  // LAMBDA: [[B_DEC_VAL:%.+]] = add {{.+}} i{{[0-9]+}} [[B_VAL_FROM_CAP]], -1
+  // LAMBDA: store i{{[0-9]+}} [[B_DEC_VAL]], i{{[0-9]+}}* [[B_REF_FROM_CAP]],
+
+  // LAMBDA: [[CAP_C_REF_1:%.+]] = getelementptr {{.+}} [[CAP_0_TY]], [[CAP_0_TY]]* {{.+}}, i{{[0-9]+}} 0, i{{[0-9]+}} 3
+  // LAMBDA: [[C_REF_FROM_CAP:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[CAP_C_REF_1]],
+  // LAMBDA: [[C_VAL_FROM_CAP:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[C_REF_FROM_CAP]],
+  // LAMBDA: [[C_DEC_VAL:%.+]] = sdiv{{.*}} i{{[0-9]+}} [[C_VAL_FROM_CAP]], 1
+  // LAMBDA: store i{{[0-9]+}} [[C_DEC_VAL]], i{{[0-9]+}}* [[C_REF_FROM_CAP]],
+  // ret
+    
+  [&]() {    
+#pragma omp target
+#pragma omp teams private(g, sivar)
+  {
+    // LAMBDA: define{{.+}} @[[OMP_OFFLOADING]](i{{[0-9]+}}* {{.+}} [[G_IN:%.+]], i{{[0-9]+}} [[SIVAR_IN:%.+]]
+    // LAMBDA: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_teams(%{{.+}}* @{{.+}}, i{{[0-9]+}} 0, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*)* [[OMP_OUTLINED_1:@.+]] to void
+    
+    // LAMBDA: define {{.+}} [[OMP_OUTLINED_1]](i{{[0-9]+}}* {{.+}}, i{{[0-9]+}}* {{.+}}
+    // LAMBDA: [[G_LOC_OUTER:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: [[SIVAR_LOC_OUTER:%.+]] = alloca i{{[0-9]+}},
+    // LAMBDA: store i{{[0-9]+}} 1, i{{[0-9]+}}* [[G_LOC_OUTER]]
+    // LAMBDA: store i{{[0-9]+}} 2, i{{[0-9]+}}* [[SIVAR_LOC_OUTER]]
+    // LAMBDA: call{{.*}} void [[INNER_LAMBDA:@[^(]+]]([[CAP_1_TY]]*
+    // LAMBDA: ret
+    g = 1;
+    sivar = 2;
+    [&]() {
+      // LAMBDA: define {{.+}} [[INNER_LAMBDA]]([[CAP_1_TY]]* {{.+}})
+      g = 2;
+      sivar = 4;
+      // LAMBDA: store i{{[0-9]+}} 2, i{{[0-9]+}}*
+      // LAMBDA: store i{{[0-9]+}} 4, i{{[0-9]+}}*
+    }();
+  }
+  }();
+  return 0;
+#else
+  S<float> test;
+  int t_var = 0;
+  int vec[] = {1, 2};
+  S<float> s_arr[] = {1, 2};
+  S<float> var(3);
+#pragma omp target
+#pragma omp teams private(t_var, vec, s_arr, var, sivar)
+  {
+    vec[0] = t_var;
+    s_arr[0] = var;
+    sivar = 3;
+  }
+  return tmain<int>();
+#endif
+}
+
+// CHECK: define{{.*}} i{{[0-9]+}} @main()
+// CHECK: [[TEST:%.+]] = alloca [[S_FLOAT_TY]],
+// CHECK: call {{.*}} [[S_FLOAT_TY_DEF_CONSTR:@.+]]([[S_FLOAT_TY]]* [[TEST]])
+// CHECK: [[OFF_IN:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* {{%.+}},
+// CHECK: call void @[[OMP_OFFLOADING:.+]](i{{[0-9]+}} [[OFF_IN]]
+// CHECK: = call{{.*}} i{{.+}} [[TMAIN_INT:@.+]]()
+// CHECK: call void [[S_FLOAT_TY_DESTR:@.+]]([[S_FLOAT_TY]]*
+// CHECK: ret
+
+// target region in main function
+// CHECK: define{{.+}} @[[OMP_OFFLOADING]](i{{[0-9]+}}
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_teams(%{{.+}}* @{{.+}}, i{{[0-9]+}} 0, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*)* [[OMP_OUTLINED:@.+]] to void
+
+// CHECK: define internal void [[OMP_OUTLINED]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}})
+// CHECK: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}],
+// CHECK: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_FLOAT_TY]]],
+// CHECK: [[VAR_PRIV:%.+]] = alloca [[S_FLOAT_TY]],
+// CHECK: [[SIVAR_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_REF:%.+]]
+// CHECK-NOT: [[T_VAR_PRIV]]
+// CHECK-NOT: [[VEC_PRIV]]
+// CHECK: {{.+}}:
+// CHECK: [[S_ARR_PRIV_ITEM:%.+]] = phi [[S_FLOAT_TY]]*
+// CHECK: call {{.*}} [[S_FLOAT_TY_DEF_CONSTR]]([[S_FLOAT_TY]]* [[S_ARR_PRIV_ITEM]])
+// CHECK-NOT: [[T_VAR_PRIV]]
+// CHECK-NOT: [[VEC_PRIV]]
+// CHECK: call {{.*}} [[S_FLOAT_TY_DEF_CONSTR]]([[S_FLOAT_TY]]* [[VAR_PRIV]])
+// CHECK-DAG: call void [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]* [[VAR_PRIV]])
+// CHECK-DAG: call void [[S_FLOAT_TY_DESTR]]([[S_FLOAT_TY]]*
+// CHECK: ret void
+
+// template tmain
+// CHECK: define{{.*}} i{{[0-9]+}} [[TMAIN_INT]]()
+// CHECK: [[TEST:%.+]] = alloca [[S_INT_TY]],
+// CHECK: call {{.*}} [[S_INT_TY_DEF_CONSTR:@.+]]([[S_INT_TY]]* [[TEST]])
+// CHECK: call void [[S_INT_TY_CONSTR:@.+]]([[S_INT_TY]]* {{.+}}, i{{[0-9]+}}{{.*}} 3)
+// CHECK: call void [[OMP_OFFLOADING_TMAIN:@.+]]()
+
+// target in SS constructor
+// CHECK: define{{.+}} [[OMP_OFFLOADING_SS:@.+]]([[SS_TY]]*
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_teams(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[SS_TY]]*)* [[OMP_OUTLINED_SS:@.+]] to void
+
+// CHECK: define{{.*}} void [[OMP_OUTLINED_SS]](i{{[0-9]+}}* {{.+}}, i{{[0-9]+}}* {{.+}}, [[SS_TY]]*
+// CHECK: [[A_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[B_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: [[C_PRIV:%.+]] = alloca i{{[0-9]+}},
+// CHECK: store i{{[0-9]+}}* [[A_PRIV]], i{{[0-9]+}}** [[A_REF:%.+]],
+// CHECK: store i{{[0-9]+}}* [[C_PRIV]], i{{[0-9]+}}** [[C_REF:%.+]],
+// CHECK: [[A_REF_VAL:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[A_REF]]
+// CHECK: [[A_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[A_REF_VAL]]
+// CHECK: [[A_INC:%.+]] = add{{.*}} i{{[0-9]+}} [[A_VAL]], 1
+// CHECK: store i{{[0-9]+}} [[A_INC]], i{{[0-9]+}}* [[A_REF_VAL]],
+// CHECK: [[B_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[B_PRIV]]
+// CHECK: [[B_DEC:%.+]] = add{{.*}} i{{[0-9]+}} [[B_VAL]], -1
+// CHECK: store i{{[0-9]+}} [[B_DEC]], i{{[0-9]+}}* [[B_PRIV]],
+// CHECK: [[C_REF_VAL:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[C_REF]]
+// CHECK: [[C_VAL:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[C_REF_VAL]]
+// CHECK: [[C_DIV:%.+]] = sdiv i{{[0-9]+}} [[C_VAL]], 1
+// CHECK: store i{{[0-9]+}} [[C_DIV]], i{{[0-9]+}}* [[C_REF_VAL]],
+// CHECK: ret
+
+// target in tmain template
+// CHECK: define{{.+}} [[OMP_OFFLOADING_TMAIN]]()
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_teams(%{{.+}}* @{{.+}}, i{{[0-9]+}} 0, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*)* [[OMP_OUTLINED_TMAIN:@.+]] to void
+
+// CHECK: define{{.*}} void [[OMP_OUTLINED_TMAIN]](i{{[0-9]+}}* noalias [[GTID_ADDR:%.+]], i{{[0-9]+}}* noalias %{{.+}})
+// CHECK: [[T_VAR_PRIV:%.+]] = alloca i{{[0-9]+}}, align 128
+// CHECK: [[VEC_PRIV:%.+]] = alloca [2 x i{{[0-9]+}}], align 128
+// CHECK: [[S_ARR_PRIV:%.+]] = alloca [2 x [[S_INT_TY]]], align 128
+// CHECK: [[VAR_PRIV:%.+]] = alloca [[S_INT_TY]], align 128
+// CHECK: store i{{[0-9]+}}* [[GTID_ADDR]], i{{[0-9]+}}** [[GTID_ADDR_REF:%.+]]
+// CHECK-NOT: [[T_VAR_PRIV]]
+// CHECK-NOT: [[VEC_PRIV]]
+// CHECK: {{.+}}:
+// CHECK: [[S_ARR_PRIV_ITEM:%.+]] = phi [[S_INT_TY]]*
+// CHECK: call {{.*}} [[S_INT_TY_DEF_CONSTR]]([[S_INT_TY]]* [[S_ARR_PRIV_ITEM]])
+// CHECK-NOT: [[T_VAR_PRIV]]
+// CHECK-NOT: [[VEC_PRIV]]
+// CHECK: call {{.*}} [[S_INT_TY_DEF_CONSTR]]([[S_INT_TY]]* [[VAR_PRIV]])
+// CHECK-DAG: call void [[S_INT_TY_DESTR:@.+]]([[S_INT_TY]]* [[VAR_PRIV]])
+// CHECK-DAG: call void [[S_INT_TY_DESTR]]([[S_INT_TY]]*
+// CHECK: ret
+
+// SST constructor
+// CHECK: define{{.+}} [[SST_CONST:@.+]]([[SST_TY]]* {{.+}})
+// CHECK: call void [[OMP_OFFLOADING_SST:@.+]]([[SST_TY]]* {{.+}})
+
+// target in SST constructor
+// CHECK: define{{.+}} [[OMP_OFFLOADING_SST]]([[SST_TY]]* {{.+}})
+// CHECK: call void (%{{.+}}*, i{{[0-9]+}}, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)*, ...) @__kmpc_fork_teams(%{{.+}}* @{{.+}}, i{{[0-9]+}} 1, void (i{{[0-9]+}}*, i{{[0-9]+}}*, ...)* bitcast (void (i{{[0-9]+}}*, i{{[0-9]+}}*, [[SST_TY]]*)* [[OMP_OUTLINED_SST:@.+]] to void
+
+// CHECK: define{{.+}} [[OMP_OUTLINED_SST]](i{{[0-9]+}}* {{.+}}, i{{[0-9]+}}* noalias %{{.+}}, [[SST_TY]]* {{.+}})
+// CHECK: [[A_PRIV_1:%.+]] = alloca i{{[0-9]+}},
+// CHECK: store i{{[0-9]+}}* [[A_PRIV_1]], i{{[0-9]+}}** [[A_REF_1:%.+]],
+// CHECK: [[A_REF_VAL_1:%.+]] = load i{{[0-9]+}}*, i{{[0-9]+}}** [[A_REF_1]]
+// CHECK: [[A_VAL_1:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[A_REF_VAL_1]]
+// CHECK: [[A_INC_1:%.+]] = add{{.*}} i{{[0-9]+}} [[A_VAL_1]], 1
+// CHECK: store i{{[0-9]+}} [[A_INC_1]], i{{[0-9]+}}* [[A_REF_VAL_1]],
+// CHECK: ret
+
+#endif
+
diff --git a/test/OpenMP/teams_reduction_messages.cpp b/test/OpenMP/teams_reduction_messages.cpp
index 87d03485c17b1..0420b010bb65b 100644
--- a/test/OpenMP/teams_reduction_messages.cpp
+++ b/test/OpenMP/teams_reduction_messages.cpp
@@ -13,7 +13,7 @@ struct S1; // expected-note {{declared here}} expected-note 4 {{forward declarat
 extern S1 a;
 class S2 {
   mutable int a;
-  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 4 {{implicitly declared private here}}
+  S2 &operator+(const S2 &arg) { return (*this); } // expected-note 3 {{implicitly declared private here}}
 
 public:
   S2() : a(0) {}
@@ -22,7 +22,7 @@ public:
   static const float S2sc;
 };
 const float S2::S2sc = 0; // expected-note 2 {{'S2sc' defined here}}
-S2 b;                     // expected-note 2 {{'b' defined here}}
+S2 b;                     // expected-note 3 {{'b' defined here}}
 const S2 ba[5];           // expected-note 2 {{'ba' defined here}}
 class S3 {
   int a;
@@ -34,7 +34,7 @@ public:
   S3 operator+(const S3 &arg1) { return arg1; }
 };
 int operator+(const S3 &arg1, const S3 &arg2) { return 5; }
-S3 c;               // expected-note 2 {{'c' defined here}}
+S3 c;               // expected-note 3 {{'c' defined here}}
 const S3 ca[5];     // expected-note 2 {{'ca' defined here}}
 extern const int f; // expected-note 4 {{'f' declared here}}
 class S4 {
@@ -56,9 +56,9 @@ class S5 {
 public:
   S5(int v) : a(v) {}
 };
-class S6 { // expected-note 2 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
+class S6 { // expected-note 3 {{candidate function (the implicit copy assignment operator) not viable: no known conversion from 'int' to 'const S6' for 1st argument}}
 #if __cplusplus >= 201103L // C++11 or later
-// expected-note@-2 2 {{candidate function (the implicit move assignment operator) not viable}}
+// expected-note@-2 3 {{candidate function (the implicit move assignment operator) not viable}}
 #endif
   int a;
 
@@ -112,7 +112,7 @@ T tmain(T argc) {
 #pragma omp teams reduction(|| : argc ? i : argc) // expected-error 2 {{expected variable name, array element or array section}}
   foo();
 #pragma omp target
-#pragma omp teams reduction(foo : argc) //expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max'}}
+#pragma omp teams reduction(foo : argc) //expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'float'}} expected-error {{incorrect reduction identifier, expected one of '+', '-', '*', '&', '|', '^', '&&', '||', 'min' or 'max' or declare reduction for type 'int'}}
   foo();
 #pragma omp target
 #pragma omp teams reduction(&& : argc)
@@ -121,22 +121,22 @@ T tmain(T argc) {
 #pragma omp teams reduction(^ : T) // expected-error {{'T' does not refer to a value}}
   foo();
 #pragma omp target
-#pragma omp teams reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified list item cannot be reduction}} expected-error 3 {{'operator+' is a private member of 'S2'}}
+#pragma omp teams reduction(+ : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 3 {{const-qualified list item cannot be reduction}} expected-error 2 {{'operator+' is a private member of 'S2'}}
   foo();
 #pragma omp target
-#pragma omp teams reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 2 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 3 {{const-qualified list item cannot be reduction}}
+#pragma omp teams reduction(min : a, b, c, d, f) // expected-error {{a reduction list item with incomplete type 'S1'}} expected-error 4 {{arguments of OpenMP clause 'reduction' for 'min' or 'max' must be of arithmetic type}} expected-error 3 {{const-qualified list item cannot be reduction}}
   foo();
 #pragma omp target
 #pragma omp teams reduction(max : h.b) // expected-error {{expected variable name, array element or array section}}
   foo();
 #pragma omp target
-#pragma omp teams reduction(+ : ba) // expected-error {{a reduction list item with array type 'const S2 [5]'}}
+#pragma omp teams reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
   foo();
 #pragma omp target
-#pragma omp teams reduction(* : ca) // expected-error {{a reduction list item with array type 'const S3 [5]'}}
+#pragma omp teams reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
   foo();
 #pragma omp target
-#pragma omp teams reduction(- : da) // expected-error {{a reduction list item with array type 'const int [5]'}} expected-error {{a reduction list item with array type 'const float [5]'}}
+#pragma omp teams reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}} expected-error {{const-qualified list item cannot be reduction}}
   foo();
 #pragma omp target
 #pragma omp teams reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
@@ -151,7 +151,7 @@ T tmain(T argc) {
 #pragma omp teams reduction(+ : h, k) // expected-error {{threadprivate or thread local variable cannot be reduction}}
   foo();
 #pragma omp target
-#pragma omp teams reduction(+ : o) // expected-error {{no viable overloaded '='}}
+#pragma omp teams reduction(+ : o) // expected-error 2 {{no viable overloaded '='}}
   foo();
 #pragma omp target
 #pragma omp teams private(i), reduction(+ : j), reduction(+ : q) // expected-error 4 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
@@ -161,7 +161,7 @@ T tmain(T argc) {
 #pragma omp teams reduction(+ : p), reduction(+ : p) // expected-error 2 {{argument of OpenMP clause 'reduction' must reference the same object in all threads}}
   foo();
 #pragma omp target
-#pragma omp teams reduction(+ : p), reduction(+ : p) // expected-error 3 {{variable can appear only once in OpenMP 'reduction' clause}} expected-note 3 {{previously referenced here}}
+#pragma omp teams reduction(+ : p), reduction(+ : p) // expected-error 2 {{variable can appear only once in OpenMP 'reduction' clause}} expected-note 2 {{previously referenced here}}
   foo();
 #pragma omp target
 #pragma omp teams reduction(+ : r) // expected-error 2 {{const-qualified list item cannot be reduction}}
@@ -175,6 +175,7 @@ T tmain(T argc) {
 #pragma omp teams
 #pragma omp parallel for private(fl)
   for (int i = 0; i < 10; ++i)
+  {}
 #pragma omp target
 #pragma omp teams reduction(+ : fl)
     foo();
@@ -182,6 +183,7 @@ T tmain(T argc) {
 #pragma omp teams
 #pragma omp parallel for reduction(- : fl)
   for (int i = 0; i < 10; ++i)
+  {}
 #pragma omp target
 #pragma omp teams reduction(+ : fl)
     foo();
@@ -258,13 +260,13 @@ int main(int argc, char **argv) {
 #pragma omp teams reduction(max : h.b) // expected-error {{expected variable name, array element or array section}}
   foo();
 #pragma omp target
-#pragma omp teams reduction(+ : ba) // expected-error {{a reduction list item with array type 'const S2 [5]'}}
+#pragma omp teams reduction(+ : ba) // expected-error {{const-qualified list item cannot be reduction}}
   foo();
 #pragma omp target
-#pragma omp teams reduction(* : ca) // expected-error {{a reduction list item with array type 'const S3 [5]'}}
+#pragma omp teams reduction(* : ca) // expected-error {{const-qualified list item cannot be reduction}}
   foo();
 #pragma omp target
-#pragma omp teams reduction(- : da) // expected-error {{a reduction list item with array type 'const int [5]'}}
+#pragma omp teams reduction(- : da) // expected-error {{const-qualified list item cannot be reduction}}
   foo();
 #pragma omp target
 #pragma omp teams reduction(^ : fl) // expected-error {{invalid operands to binary expression ('float' and 'float')}}
@@ -306,6 +308,7 @@ int main(int argc, char **argv) {
 #pragma omp teams
 #pragma omp parallel for private(fl)
   for (int i = 0; i < 10; ++i)
+  {}
 #pragma omp target
 #pragma omp teams reduction(+ : fl)
     foo();
@@ -313,6 +316,7 @@ int main(int argc, char **argv) {
 #pragma omp teams
 #pragma omp parallel for reduction(- : fl)
   for (int i = 0; i < 10; ++i)
+  {}
 #pragma omp target
 #pragma omp teams reduction(+ : fl)
     foo();
diff --git a/test/OpenMP/threadprivate_ast_print.cpp b/test/OpenMP/threadprivate_ast_print.cpp
index 2d876c1909fa9..f789c2f277d34 100644
--- a/test/OpenMP/threadprivate_ast_print.cpp
+++ b/test/OpenMP/threadprivate_ast_print.cpp
@@ -69,4 +69,5 @@ int main () {
   return (foo<int>());
 }
 
+extern template int ST<int>::m;
 #endif
diff --git a/test/OpenMP/threadprivate_codegen.cpp b/test/OpenMP/threadprivate_codegen.cpp
index 53d7ef7083850..273f0488220ce 100644
--- a/test/OpenMP/threadprivate_codegen.cpp
+++ b/test/OpenMP/threadprivate_codegen.cpp
@@ -221,7 +221,7 @@ static S1 gs1(5);
 // CHECK-DEBUG:      store i8* %0, i8** [[ARG_ADDR:%.*]],
 // CHECK-DEBUG:      [[ARG:%.+]] = load i8*, i8** [[ARG_ADDR]]
 // CHECK-DEBUG:      [[RES:%.*]] = bitcast i8* [[ARG]] to [[S1]]*
-// CHECK-DEBUG-NEXT: call {{.*}} [[S1_CTOR:@.+]]([[S1]]* [[RES]], {{.*}} 5)
+// CHECK-DEBUG-NEXT: call {{.*}} [[S1_CTOR:@.+]]([[S1]]* [[RES]], {{.*}} 5){{.*}}, !dbg
 // CHECK-DEBUG:      [[ARG:%.+]] = load i8*, i8** [[ARG_ADDR]]
 // CHECK-DEBUG:      ret i8* [[ARG]]
 // CHECK-DEBUG-NEXT: }
@@ -230,7 +230,7 @@ static S1 gs1(5);
 // CHECK-DEBUG:      store i8* %0, i8** [[ARG_ADDR:%.*]],
 // CHECK-DEBUG:      [[ARG:%.+]] = load i8*, i8** [[ARG_ADDR]]
 // CHECK-DEBUG:      [[RES:%.*]] = bitcast i8* [[ARG]] to [[S1]]*
-// CHECK-DEBUG-NEXT: call {{.*}} [[S1_DTOR:@.+]]([[S1]]* [[RES]])
+// CHECK-DEBUG-NEXT: call {{.*}} [[S1_DTOR:@.+]]([[S1]]* [[RES]]){{.*}}, !dbg
 // CHECK-DEBUG-NEXT: ret void
 // CHECK-DEBUG-NEXT: }
 // CHECK-DEBUG:      define {{.*}} [[S1_DTOR]]([[S1]]* {{.*}})
@@ -617,7 +617,7 @@ int main() {
 // CHECK-DEBUG:      call {{.*}} [[SMAIN_DTOR:@.+]]([[SMAIN]]*
 // CHECK-DEBUG:      }
 // CHECK-DEBUG:      define {{.*}} [[SMAIN_DTOR]]([[SMAIN]]* {{.*}})
-// CHECK-TLS:      define internal [[S1]]* [[GS1_TLS_INITD]] {
+// CHECK-TLS:      define internal [[S1]]* [[GS1_TLS_INITD]] {{#[0-9]+}} {
 // CHECK-TLS-NEXT: call void [[GS1_TLS_INIT]]
 // CHECK-TLS-NEXT: ret [[S1]]* [[GS1]]
 // CHECK-TLS-NEXT: }
@@ -639,15 +639,15 @@ int main() {
 // CHECK-TLS:   call void [[ARR_X_TLS_INIT]]
 // CHECK-TLS:   ret [2 x [3 x [[S1]]]]* [[ARR_X]]
 // CHECK-TLS: }
-// CHECK-TLS: define {{.*}} i32* [[ST_INT_ST_TLS_INITD]] {
+// CHECK-TLS: define {{.*}} i32* [[ST_INT_ST_TLS_INITD]] {{#[0-9]+}} {
 // CHECK-TLS:   call void [[ST_INT_ST_TLS_INIT]]
 // CHECK-TLS:   ret i32* [[ST_INT_ST]]
 // CHECK-TLS: }
-// CHECK-TLS: define {{.*}} float* [[ST_FLOAT_ST_TLS_INITD]] {
+// CHECK-TLS: define {{.*}} float* [[ST_FLOAT_ST_TLS_INITD]] {{#[0-9]+}} {
 // CHECK-TLS:   call void [[ST_FLOAT_ST_TLS_INIT]]
 // CHECK-TLS:   ret float* [[ST_FLOAT_ST]]
 // CHECK-TLS: }
-// CHECK-TLS: define {{.*}} [[S4]]* [[ST_S4_ST_TLS_INITD]] {
+// CHECK-TLS: define {{.*}} [[S4]]* [[ST_S4_ST_TLS_INITD]] {{#[0-9]+}} {
 // CHECK-TLS:   call void [[ST_S4_ST_TLS_INIT]]
 // CHECK-TLS:   ret [[S4]]* [[ST_S4_ST]]
 // CHECK-TLS: }
@@ -948,5 +948,5 @@ int foobar() {
 // CHECK-TLS:      call void [[ST_S4_ST_CXX_INIT]]
 // CHECK-TLS:      [[DONE_LABEL]]
 
-// CHECK-TLS:      declare {{.*}} void [[GS3_TLS_INIT]]
-// CHECK-TLS:      declare {{.*}} void [[STATIC_S_TLS_INIT]]
+// CHECK-TLS-DAG:      declare {{.*}} void [[GS3_TLS_INIT]]
+// CHECK-TLS-DAG:      declare {{.*}} void [[STATIC_S_TLS_INIT]]
diff --git a/test/OpenMP/threadprivate_messages.cpp b/test/OpenMP/threadprivate_messages.cpp
index 8c442f47ad5ee..9775bfa458f49 100644
--- a/test/OpenMP/threadprivate_messages.cpp
+++ b/test/OpenMP/threadprivate_messages.cpp
@@ -70,7 +70,7 @@ class TestClass {
 
 namespace ns {
   int m;
-#pragma omp threadprivate (m)
+#pragma omp threadprivate (m, m)
 }
 #pragma omp threadprivate (m) // expected-error {{use of undeclared identifier 'm'}}
 #pragma omp threadprivate (ns::m)
diff --git a/test/PCH/Inputs/__va_list_tag-typedef.h b/test/PCH/Inputs/__va_list_tag-typedef.h
new file mode 100644
index 0000000000000..33dc6ad484256
--- /dev/null
+++ b/test/PCH/Inputs/__va_list_tag-typedef.h
@@ -0,0 +1,4 @@
+// Header for PCH test __va_list_tag-typedef.c
+
+#include <stdarg.h>
+typedef va_list va_list_1;
diff --git a/test/PCH/Inputs/cxx11-statement-attributes.h b/test/PCH/Inputs/cxx11-statement-attributes.h
index f4d0619a3f98a..3f85e1fa2b062 100644
--- a/test/PCH/Inputs/cxx11-statement-attributes.h
+++ b/test/PCH/Inputs/cxx11-statement-attributes.h
@@ -7,7 +7,8 @@ int f(int n) {
       [[clang::fallthrough]];  // This shouldn't generate a warning.
     case 1:
       n += 20;
-      [[clang::fallthrough]];  // This should generate a warning: "fallthrough annotation does not directly precede switch label".
+    case 2:  // This should generate a warning: "unannotated fallthrough"
+      n += 35;
       break;
   }
   return n;
diff --git a/test/PCH/libroot/usr/include/reloc.h b/test/PCH/Inputs/libroot/usr/include/reloc.h
index 04eeacba8ff55..04eeacba8ff55 100644
--- a/test/PCH/libroot/usr/include/reloc.h
+++ b/test/PCH/Inputs/libroot/usr/include/reloc.h
diff --git a/test/PCH/libroot/usr/include/reloc2.h b/test/PCH/Inputs/libroot/usr/include/reloc2.h
index 995415ce95bc2..995415ce95bc2 100644
--- a/test/PCH/libroot/usr/include/reloc2.h
+++ b/test/PCH/Inputs/libroot/usr/include/reloc2.h
diff --git a/test/PCH/Inputs/pr27445.h b/test/PCH/Inputs/pr27445.h
new file mode 100644
index 0000000000000..f78a1bcf859f8
--- /dev/null
+++ b/test/PCH/Inputs/pr27445.h
@@ -0,0 +1,4 @@
+struct Info {
+  virtual ~Info();
+  void hash() {}
+};
diff --git a/test/PCH/Inputs/pragma-once2-pch.h b/test/PCH/Inputs/pragma-once2-pch.h
new file mode 100644
index 0000000000000..642b50f15aa7f
--- /dev/null
+++ b/test/PCH/Inputs/pragma-once2-pch.h
@@ -0,0 +1 @@
+#include "pragma-once2.h"
diff --git a/test/PCH/Inputs/pragma-once2.h b/test/PCH/Inputs/pragma-once2.h
new file mode 100644
index 0000000000000..7ed4a95520e20
--- /dev/null
+++ b/test/PCH/Inputs/pragma-once2.h
@@ -0,0 +1,3 @@
+#pragma once
+
+inline void f() {}
diff --git a/test/PCH/__va_list_tag-typedef.c b/test/PCH/__va_list_tag-typedef.c
new file mode 100644
index 0000000000000..c3745ca2cb8c8
--- /dev/null
+++ b/test/PCH/__va_list_tag-typedef.c
@@ -0,0 +1,14 @@
+// This test checks the patch for the compilation error / crash described in D18557.
+
+// Test as a C source
+// RUN: %clang_cc1 -emit-pch -x c-header -o %t %S/Inputs/__va_list_tag-typedef.h
+// RUN: %clang_cc1 -fsyntax-only -include-pch %t %s
+
+// Test as a C++ source
+// RUN: %clang_cc1 -emit-pch -x c++-header -o %t %S/Inputs/__va_list_tag-typedef.h
+// RUN: %clang_cc1 -x c++ -fsyntax-only -include-pch %t %s
+
+// expected-no-diagnostics
+
+typedef __builtin_va_list va_list_2;
+void test(const char* format, ...) { va_list args; va_start( args, format ); }
diff --git a/test/PCH/attrs.c b/test/PCH/attrs.c
index 6a4b8f667cbc8..3f34d4d00908e 100644
--- a/test/PCH/attrs.c
+++ b/test/PCH/attrs.c
@@ -9,10 +9,12 @@
 #define HEADER
 
 int f(int) __attribute__((visibility("default"), overloadable));
+int g(int) __attribute__((abi_tag("foo", "bar", "baz"), no_sanitize("address", "memory")));
 
 #else
 
 double f(double); // expected-error{{overloadable}}
                   // expected-note@11{{previous overload}}
+void h() { g(0); }
 
 #endif
diff --git a/test/PCH/case-insensitive-include.c b/test/PCH/case-insensitive-include.c
index 707de702f15dc..1dcda273c2f23 100644
--- a/test/PCH/case-insensitive-include.c
+++ b/test/PCH/case-insensitive-include.c
@@ -2,7 +2,7 @@
 
 // Test this without pch.
 // RUN: cp %S/Inputs/case-insensitive-include.h %T
-// RUN: %clang_cc1 -fsyntax-only %s -include %s -I %T -verify
+// RUN: %clang_cc1 -Wno-nonportable-include-path -fsyntax-only %s -include %s -I %T -verify
 
 // Test with pch.
 // RUN: %clang_cc1 -emit-pch -o %t.pch %s -I %T
diff --git a/test/PCH/chain-invalid-code.cpp b/test/PCH/chain-invalid-code.cpp
new file mode 100644
index 0000000000000..9de88f0cee77b
--- /dev/null
+++ b/test/PCH/chain-invalid-code.cpp
@@ -0,0 +1,28 @@
+// RUN: %clang_cc1 -fsyntax-only %s -chain-include %s -Wuninitialized -Wunused -verify
+
+// Make sure there is no crash.
+
+#ifndef HEADER
+#define HEADER
+
+#include "non-existent-header.h"
+
+class A {
+public:
+  ~A();
+};
+
+class ForwardCls;
+struct B {
+  ForwardCls f;
+  A a;
+};
+
+#else
+
+static void test() {
+  int x; // expected-warning {{unused}}
+  B b;
+}
+
+#endif
diff --git a/test/PCH/cxx-traits.cpp b/test/PCH/cxx-traits.cpp
index fc3e1335dd60c..b0f1d9d2c3deb 100644
--- a/test/PCH/cxx-traits.cpp
+++ b/test/PCH/cxx-traits.cpp
@@ -18,6 +18,7 @@ bool copy_construct_int = n::is_trivially_constructible<int, const int&>::value;
 bool _is_abstract_result = __is_abstract(int);
 bool _is_arithmetic_result = __is_arithmetic(int);
 bool _is_array_result = __is_array(int);
+bool _is_assignable_result = __is_assignable(int, int);
 bool _is_base_of_result = __is_base_of(int, int);
 bool _is_class_result = __is_class(int);
 bool _is_complete_type_result = __is_complete_type(int);
diff --git a/test/PCH/cxx-traits.h b/test/PCH/cxx-traits.h
index 21324768186ee..1d7d40450fe2d 100644
--- a/test/PCH/cxx-traits.h
+++ b/test/PCH/cxx-traits.h
@@ -20,6 +20,7 @@ struct is_trivially_constructible {
 struct __is_abstract {};  // expected-warning {{made available}}
 struct __is_arithmetic {};  // expected-warning {{made available}}
 struct __is_array {};  // expected-warning {{made available}}
+struct __is_assignable {};  // expected-warning {{made available}}
 struct __is_base_of {};  // expected-warning {{made available}}
 struct __is_class {};  // expected-warning {{made available}}
 struct __is_complete_type {};  // expected-warning {{made available}}
diff --git a/test/PCH/cxx-variadic-templates-with-default-params.cpp b/test/PCH/cxx-variadic-templates-with-default-params.cpp
new file mode 100644
index 0000000000000..2c1482091db35
--- /dev/null
+++ b/test/PCH/cxx-variadic-templates-with-default-params.cpp
@@ -0,0 +1,26 @@
+// Test this without pch.
+// RUN: %clang_cc1 -std=c++11 -include %s -fsyntax-only -verify %s
+
+// Test with pch.
+// RUN: %clang_cc1 -std=c++11 -x c++-header -emit-pch -o %t %s
+// RUN: %clang_cc1 -std=c++11 -include-pch %t -fsyntax-only -verify %s
+
+// expected-no-diagnostics
+
+// PR25271: Ensure that default template arguments prior to a parameter pack
+// successfully round-trip.
+#ifndef HEADER
+#define HEADER
+template<unsigned T=123, unsigned... U>
+class dummy;
+
+template<unsigned T, unsigned... U>
+class dummy {
+    int field[T];
+};
+#else
+void f() {
+    dummy<> x;
+    (void)x;
+}
+#endif
diff --git a/test/PCH/cxx11-inheriting-ctors.cpp b/test/PCH/cxx11-inheriting-ctors.cpp
index 79f78ba6c1262..bf9a2b7609dfd 100644
--- a/test/PCH/cxx11-inheriting-ctors.cpp
+++ b/test/PCH/cxx11-inheriting-ctors.cpp
@@ -1,10 +1,19 @@
-// RUN: %clang_cc1 -std=c++11 -emit-pch -o %t %s
-// RUN: %clang_cc1 -std=c++11 -include-pch %t -verify %s
+// RUN: %clang_cc1 -std=c++11 -include %s -include %s -verify %s
+//
+// Emit with definitions in the declaration:
+// RxN: %clang_cc1 -std=c++11 -emit-pch -o %t.12 -include %s %s
+// RxN: %clang_cc1 -std=c++11 -include-pch %t.12 -verify %s
+//
+// Emit with definitions in update records:
+// RxN: %clang_cc1 -std=c++11 -emit-pch -o %t.1 %s
+// RxN: %clang_cc1 -std=c++11 -include-pch %t.1 -emit-pch -o %t.2 -verify %s
+// RxN: %clang_cc1 -std=c++11 -include-pch %t.1 -include-pch %t.2 -verify %s
+
 
 // expected-no-diagnostics
 
-#ifndef HEADER_INCLUDED
-#define HEADER_INCLUDED
+#ifndef HEADER1
+#define HEADER1
 
 struct Base {
   Base(int) {}
@@ -27,7 +36,8 @@ struct Test3 : B {
   using B::B;
 };
 
-#else
+#elif !defined(HEADER2)
+#define HEADER2
 
 Test test1a(42);
 Test test1b(nullptr);
@@ -36,4 +46,16 @@ Test2<int> test2b(nullptr);
 Test3<Base> test3a(42);
 Test3<Base> test3b(nullptr);
 
-#endif // HEADER_INCLUDED
+#pragma clang __debug dump Test
+#pragma clang __debug dump Test2
+
+#else
+
+Test retest1a(42);
+Test retest1b(nullptr);
+Test2<int> retest2a(42);
+Test2<int> retest2b(nullptr);
+Test3<Base> retest3a(42);
+Test3<Base> retest3b(nullptr);
+
+#endif
diff --git a/test/PCH/cxx11-statement-attributes.cpp b/test/PCH/cxx11-statement-attributes.cpp
index 722ca6e9ffa32..b5dfc6cd4c0fc 100644
--- a/test/PCH/cxx11-statement-attributes.cpp
+++ b/test/PCH/cxx11-statement-attributes.cpp
@@ -1,10 +1,15 @@
 // Sanity check.
 // RUN: %clang_cc1 -include %S/Inputs/cxx11-statement-attributes.h -std=c++11 -Wimplicit-fallthrough -fsyntax-only %s -o - -verify
+// RUN: %clang_cc1 -include %S/Inputs/cxx11-statement-attributes.h -std=c++1z -Wimplicit-fallthrough -fsyntax-only %s -o - -verify
 // Run the same tests, this time with the attributes loaded from the PCH file.
 // RUN: %clang_cc1 -x c++-header -emit-pch -std=c++11 -o %t %S/Inputs/cxx11-statement-attributes.h
 // RUN: %clang_cc1 -include-pch %t -std=c++11 -Wimplicit-fallthrough -fsyntax-only %s -o - -verify
+// RUN: %clang_cc1 -x c++-header -emit-pch -std=c++1z -o %t %S/Inputs/cxx11-statement-attributes.h
+// RUN: %clang_cc1 -include-pch %t -std=c++1z -Wimplicit-fallthrough -fsyntax-only %s -o - -verify
 
-// expected-warning@Inputs/cxx11-statement-attributes.h:10 {{fallthrough annotation does not directly precede switch label}}
+// expected-warning@Inputs/cxx11-statement-attributes.h:10 {{unannotated fall-through}}
+// expected-note-re@Inputs/cxx11-statement-attributes.h:10 {{insert '[[{{(clang::)?}}fallthrough]];'}}
+// expected-note@Inputs/cxx11-statement-attributes.h:10 {{insert 'break;'}}
 
 void g(int n) {
   f<1>(n);  // expected-note {{in instantiation of function template specialization 'f<1>' requested here}}
diff --git a/test/PCH/cxx1z-init-statement.cpp b/test/PCH/cxx1z-init-statement.cpp
new file mode 100644
index 0000000000000..d08fb7c56b712
--- /dev/null
+++ b/test/PCH/cxx1z-init-statement.cpp
@@ -0,0 +1,17 @@
+// Test this without pch.
+// RUN: %clang_cc1 -std=c++1z -include %S/cxx1z-init-statement.h -fsyntax-only -emit-llvm -o - %s
+
+// Test with pch.
+// RUN: %clang_cc1 -x c++ -std=c++1z -emit-pch -o %t %S/cxx1z-init-statement.h
+// RUN: %clang_cc1 -std=c++1z -include-pch %t -fsyntax-only -emit-llvm -o - %s 
+
+void g0(void) {
+  static_assert(test_if(-1) == -1, "");
+  static_assert(test_if(0) == 0, "");
+}
+
+void g1(void) {
+  static_assert(test_switch(-1) == -1, "");
+  static_assert(test_switch(0) == 0, "");
+  static_assert(test_switch(1) == 1, "");
+}
diff --git a/test/PCH/cxx1z-init-statement.h b/test/PCH/cxx1z-init-statement.h
new file mode 100644
index 0000000000000..16bd569c848c4
--- /dev/null
+++ b/test/PCH/cxx1z-init-statement.h
@@ -0,0 +1,22 @@
+// Header for PCH test cxx1z-init-statement.cpp
+
+constexpr int test_if(int x) { 
+  if (int a = ++x; a == 0) {
+    return -1;
+  } else if (++a; a == 2) {
+    return 0;
+  }
+  return 2;
+}
+
+constexpr int test_switch(int x) {
+  switch (int a = ++x; a) {
+    case 0:
+      return -1;
+    case 1:
+      return 0;
+    case 2:
+      return 1;
+  }
+  return 2;
+}
diff --git a/test/PCH/debug-info-pch-path.c b/test/PCH/debug-info-pch-path.c
new file mode 100644
index 0000000000000..402d44da918d3
--- /dev/null
+++ b/test/PCH/debug-info-pch-path.c
@@ -0,0 +1,76 @@
+// REQUIRES: shell
+//
+// RUN: rm -rf %t
+// RUN: mkdir %t
+// RUN: cd %t
+//
+// ---------------------------------------------------------------------
+// Relative PCH, same directory.
+// ---------------------------------------------------------------------
+//
+// RUN: %clang_cc1 -fmodule-format=obj -emit-pch                \
+// RUN:     -triple %itanium_abi_triple                         \
+// RUN:     -o prefix.pch %S/debug-info-limited-struct.h
+//
+// RUN: %clang_cc1 -debug-info-kind=standalone                  \
+// RUN:     -dwarf-ext-refs -fmodule-format=obj                 \
+// RUN:     -triple %itanium_abi_triple                         \
+// RUN:     -include-pch prefix.pch %s -emit-llvm -o %t.nodir.ll %s
+// RUN: cat %t.nodir.ll | FileCheck %s --check-prefix=CHECK-REL-NODIR
+//
+//
+// CHECK-REL-NODIR: !DICompileUnit
+// CHECK-REL-NODIR-SAME:           file: ![[C:[0-9]+]]
+// CHECK-REL-NODIR-NOT: dwoId
+// CHECK-REL-NODIR: ![[C]] = !DIFile({{.*}}directory: "[[DIR:.*]]"
+// CHECK-REL-NODIR: !DICompileUnit(
+// CHECK-REL-NODIR-SAME:           file: ![[PCH:[0-9]+]]
+// CHECK-REL-NODIR-SAME:           splitDebugFilename: "prefix.pch"
+// CHECK-REL-NODIR: ![[PCH]] = !DIFile({{.*}}directory: "[[DIR]]"
+
+// ---------------------------------------------------------------------
+// Relative PCH in a subdirectory.
+// ---------------------------------------------------------------------
+//
+// RUN: mkdir pchdir
+// RUN: %clang_cc1 -fmodule-format=obj -emit-pch                \
+// RUN:     -triple %itanium_abi_triple                         \
+// RUN:     -o pchdir/prefix.pch %S/debug-info-limited-struct.h
+//
+// RUN: %clang_cc1 -debug-info-kind=standalone                  \
+// RUN:     -dwarf-ext-refs -fmodule-format=obj                 \
+// RUN:     -triple %itanium_abi_triple                         \
+// RUN:     -include-pch pchdir/prefix.pch %s -emit-llvm -o %t.rel.ll %s
+// RUN: cat %t.rel.ll | FileCheck %s --check-prefix=CHECK-REL
+
+// CHECK-REL: !DICompileUnit
+// CHECK-REL-SAME:           file: ![[C:[0-9]+]]
+// CHECK-REL-NOT: dwoId
+// CHECK-REL: ![[C]] = !DIFile({{.*}}directory: "[[DIR:.*]]"
+// CHECK-REL: !DICompileUnit(
+// CHECK-REL-SAME:           file: ![[PCH:[0-9]+]]
+// CHECK-REL-SAME:           splitDebugFilename: "prefix.pch"
+// CHECK-REL: ![[PCH]] = !DIFile({{.*}}directory: "[[DIR]]{{.*}}pchdir"
+
+// ---------------------------------------------------------------------
+// Absolute PCH.
+// ---------------------------------------------------------------------
+//
+// RUN: %clang_cc1 -fmodule-format=obj -emit-pch                \
+// RUN:     -triple %itanium_abi_triple                         \
+// RUN:     -o %t/prefix.pch %S/debug-info-limited-struct.h
+//
+// RUN: %clang_cc1 -debug-info-kind=standalone                  \
+// RUN:     -dwarf-ext-refs -fmodule-format=obj                 \
+// RUN:     -triple %itanium_abi_triple                         \
+// RUN:     -include-pch %t/prefix.pch %s -emit-llvm -o %t.abs.ll %s
+// RUN: cat %t.abs.ll | FileCheck %s --check-prefix=CHECK-ABS
+
+// CHECK-ABS: !DICompileUnit
+// CHECK-ABS-SAME:           file: ![[C:[0-9]+]]
+// CHECK-ABS-NOT: dwoId
+// CHECK-ABS: ![[C]] = !DIFile({{.*}}directory: "[[DIR:.*]]"
+// CHECK-ABS: !DICompileUnit(
+// CHECK-ABS-SAME:           file: ![[PCH:[0-9]+]]
+// CHECK-ABS-SAME:           splitDebugFilename: "prefix.pch"
+// CHECK-ABS: ![[PCH]] = !DIFile({{.*}}directory: "[[DIR]]"
diff --git a/test/PCH/include-timestamp.cpp b/test/PCH/include-timestamp.cpp
new file mode 100644
index 0000000000000..d7d0fab13ac53
--- /dev/null
+++ b/test/PCH/include-timestamp.cpp
@@ -0,0 +1,32 @@
+// Test that the timestamp is not included in the produced pch file with
+// -fno-pch-timestamp.
+
+// Copying files allow for read-only checkouts to run this test.
+// RUN: cp %S/Inputs/pragma-once2-pch.h %T
+// RUN: cp %S/Inputs/pragma-once2.h %T
+// RUN: cp %s %t1.cpp
+
+// Check timestamp is included by default.
+// RUN: %clang_cc1 -x c++-header -emit-pch -o %t %T/pragma-once2-pch.h
+// RUN: touch -m -a -t 201008011501 %T/pragma-once2.h
+// RUN: not %clang_cc1 -include-pch %t %t1.cpp 2>&1 | FileCheck -check-prefix=CHECK-TIMESTAMP %s
+
+// Check bitcode output as well.
+// RUN: llvm-bcanalyzer -dump %t | FileCheck -check-prefix=CHECK-BITCODE-TIMESTAMP-ON %s
+
+// Check timestamp inclusion is disabled by -fno-pch-timestamp.
+// RUN: %clang_cc1 -x c++-header -emit-pch -o %t %T/pragma-once2-pch.h -fno-pch-timestamp
+// RUN: touch -m -a -t 201008011502 %T/pragma-once2.h
+// RUN: %clang_cc1 -include-pch %t %t1.cpp 2>&1
+
+// Check bitcode output as well.
+// RUN: llvm-bcanalyzer -dump %t | FileCheck -check-prefix=CHECK-BITCODE-TIMESTAMP-OFF %s
+
+#include "pragma-once2.h"
+
+void g() { f(); }
+
+// CHECK-BITCODE-TIMESTAMP-ON: <INPUT_FILE abbrevid={{.*}} op0={{.*}} op1={{.*}} op2={{[^0]}}
+// CHECK-BITCODE-TIMESTAMP-OFF: <INPUT_FILE abbrevid={{.*}} op0={{.*}} op1={{.*}} op2={{[0]}}
+
+// CHECK-TIMESTAMP: fatal error: file {{.*}} has been modified since the precompiled header {{.*}} was built
diff --git a/test/PCH/missing-file.cpp b/test/PCH/missing-file.cpp
index 502a9db65799e..8bdb08d6373de 100644
--- a/test/PCH/missing-file.cpp
+++ b/test/PCH/missing-file.cpp
@@ -4,16 +4,15 @@
 // RUN: echo 'struct S{char c; int i; }; void foo() {}' > %t.h
 // RUN: echo 'template <typename T> void tf() { T::foo(); }' >> %t.h
 // RUN: %clang_cc1 -x c++ -emit-pch -o %t.h.pch %t.h
-
-// %t.h might be touched by scanners as a hot file on Windows,
-// to fail to remove %.h with single run.
-// FIXME: Do we really want to work around bugs in virus checkers here?
-// RUN: rm %t.h || rm %t.h || rm %t.h
+// RUN: rm %t.h
 
 // Check diagnostic with location in original source:
 // RUN: not %clang_cc1 -include-pch %t.h.pch -emit-obj -o %t.o %s 2> %t.stderr
 // RUN: grep 'could not find file' %t.stderr
 
+// Oftentimes on Windows there are open handles, and deletion will fail.
+// REQUIRES: can-remove-opened-file
+
 void qq(S*) {}
 
 #ifdef REDECL
diff --git a/test/PCH/opencl-extensions.cl b/test/PCH/opencl-extensions.cl
index a22b007f9d540..d6d541658a18e 100644
--- a/test/PCH/opencl-extensions.cl
+++ b/test/PCH/opencl-extensions.cl
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -emit-pch -o %t %s
-// RUN: %clang_cc1 -include-pch %t -fsyntax-only %s 
+// RUN: %clang_cc1 -emit-pch -o %t %s -triple spir-unknown-unknown
+// RUN: %clang_cc1 -include-pch %t -fsyntax-only %s  -triple spir-unknown-unknown
 
 #ifndef HEADER
 #define HEADER
diff --git a/test/PCH/pr27445.cpp b/test/PCH/pr27445.cpp
new file mode 100644
index 0000000000000..2a4af5e827616
--- /dev/null
+++ b/test/PCH/pr27445.cpp
@@ -0,0 +1,14 @@
+// RUN: %clang_cc1 -triple x86_64-windows-msvc -fms-extensions -x c++ %S/Inputs/pr27445.h -emit-pch -o %t.pch
+// RUN: %clang_cc1 -triple x86_64-windows-msvc -fms-extensions %s -include-pch %t.pch -emit-llvm -o - | FileCheck %s
+
+class A;
+void fn1(A &) {}
+
+class __declspec(dllexport) A {
+  int operator=(A) { return field_; }
+  void (*on_arena_allocation_)(Info);
+  int field_;
+};
+
+// CHECK: %class.A = type { void (%struct.Info*)*, i32 }
+// CHECK: %struct.Info = type { i32 (...)** }
diff --git a/test/PCH/pragma-comment.c b/test/PCH/pragma-comment.c
new file mode 100644
index 0000000000000..07c3d40d3b1aa
--- /dev/null
+++ b/test/PCH/pragma-comment.c
@@ -0,0 +1,25 @@
+// Test this without pch.
+// RUN: %clang_cc1 %s -Wunknown-pragmas -Werror -triple thumbv7-windows -fms-extensions -emit-llvm -include %s -o - | FileCheck %s
+// RUN: %clang_cc1 %s -Wunknown-pragmas -Werror -triple x86_64-pc-win32 -fms-extensions -emit-llvm -include %s -o - | FileCheck %s
+
+// Test with pch.
+// RUN: %clang_cc1 %s -Wunknown-pragmas -Werror -triple thumbv7-windows -fms-extensions -emit-pch -o %t
+// RUN: %clang_cc1 %s -Wunknown-pragmas -Werror -triple thumbv7-windows -fms-extensions -emit-llvm -include-pch %t -o - | FileCheck %s
+// RUN: %clang_cc1 %s -Wunknown-pragmas -Werror -triple x86_64-pc-win32 -fms-extensions -emit-pch -o %t
+// RUN: %clang_cc1 %s -Wunknown-pragmas -Werror -triple x86_64-pc-win32 -fms-extensions -emit-llvm -include-pch %t -o - | FileCheck %s
+
+// The first run line creates a pch, and since at that point HEADER is not
+// defined, the only thing contained in the pch is the pragma. The second line
+// then includes that pch, so HEADER is defined and the actual code is compiled.
+// The check then makes sure that the pragma is in effect in the file that
+// includes the pch.
+
+#ifndef HEADER
+#define HEADER
+#pragma comment(lib, "foo.lib")
+
+#else
+
+// CHECK: "/DEFAULTLIB:foo.lib"
+
+#endif
diff --git a/test/PCH/pragma-detect_mismatch.c b/test/PCH/pragma-detect_mismatch.c
new file mode 100644
index 0000000000000..ced4cf9d159b9
--- /dev/null
+++ b/test/PCH/pragma-detect_mismatch.c
@@ -0,0 +1,25 @@
+// Test this without pch.
+// RUN: %clang_cc1 %s -Wunknown-pragmas -Werror -triple thumbv7-windows -fms-extensions -emit-llvm -include %s -o - | FileCheck %s
+// RUN: %clang_cc1 %s -Wunknown-pragmas -Werror -triple x86_64-pc-win32 -fms-extensions -emit-llvm -include %s -o - | FileCheck %s
+
+// Test with pch.
+// RUN: %clang_cc1 %s -Wunknown-pragmas -Werror -triple thumbv7-windows -fms-extensions -emit-pch -o %t
+// RUN: %clang_cc1 %s -Wunknown-pragmas -Werror -triple thumbv7-windows -fms-extensions -emit-llvm -include-pch %t -o - | FileCheck %s
+// RUN: %clang_cc1 %s -Wunknown-pragmas -Werror -triple x86_64-pc-win32 -fms-extensions -emit-pch -o %t
+// RUN: %clang_cc1 %s -Wunknown-pragmas -Werror -triple x86_64-pc-win32 -fms-extensions -emit-llvm -include-pch %t -o - | FileCheck %s
+
+// The first run line creates a pch, and since at that point HEADER is not
+// defined, the only thing contained in the pch is the pragma. The second line
+// then includes that pch, so HEADER is defined and the actual code is compiled.
+// The check then makes sure that the pragma is in effect in the file that
+// includes the pch.
+
+#ifndef HEADER
+#define HEADER
+#pragma detect_mismatch("FruitKind", "Jaboticaba")
+
+#else
+
+// CHECK: "/FAILIFMISMATCH:\22FruitKind=Jaboticaba\22"
+
+#endif
diff --git a/test/PCH/pragma-loop.cpp b/test/PCH/pragma-loop.cpp
index 2640020f00077..5975816f69a61 100644
--- a/test/PCH/pragma-loop.cpp
+++ b/test/PCH/pragma-loop.cpp
@@ -7,9 +7,11 @@
 // CHECK: #pragma clang loop unroll_count(16)
 // CHECK: #pragma clang loop interleave_count(8)
 // CHECK: #pragma clang loop vectorize_width(4)
+// CHECK: #pragma clang loop distribute(enable)
 // CHECK: #pragma clang loop unroll(disable)
 // CHECK: #pragma clang loop interleave(disable)
 // CHECK: #pragma clang loop vectorize(enable)
+// CHECK: #pragma clang loop distribute(disable)
 // CHECK: #pragma clang loop unroll(full)
 // CHECK: #pragma clang loop interleave(enable)
 // CHECK: #pragma clang loop vectorize(disable)
@@ -40,6 +42,7 @@ public:
 #pragma clang loop vectorize(enable)
 #pragma clang loop interleave(disable)
 #pragma clang loop unroll(disable)
+#pragma clang loop distribute(enable)
     while (i - 1 < Length) {
       List[i] = i;
       i++;
@@ -51,6 +54,7 @@ public:
 #pragma clang loop vectorize(disable)
 #pragma clang loop interleave(enable)
 #pragma clang loop unroll(full)
+#pragma clang loop distribute(disable)
     while (i - 3 < Length) {
       List[i] = i;
       i++;
diff --git a/test/PCH/pragma-ms_struct.cpp b/test/PCH/pragma-ms_struct.cpp
new file mode 100644
index 0000000000000..ac2a1e87b23d1
--- /dev/null
+++ b/test/PCH/pragma-ms_struct.cpp
@@ -0,0 +1,41 @@
+// Test this without pch.
+// RUN: %clang_cc1 %s -Wunknown-pragmas -Werror -triple i386-apple-darwin9 -fsyntax-only -include %s -verify -std=c++11
+
+// Test with pch.
+// RUN: %clang_cc1 %s -Wunknown-pragmas -Werror -triple i386-apple-darwin9 -emit-pch -o %t -std=c++11
+// RUN: %clang_cc1 %s -Wunknown-pragmas -Werror -triple i386-apple-darwin9 -fsyntax-only -include-pch %t -verify -std=c++11
+
+// The first run line creates a pch, and since at that point HEADER is not
+// defined, the only thing contained in the pch is the pragma. The second line
+// then includes that pch, so HEADER is defined and the actual code is compiled.
+// The check then makes sure that the pragma is in effect in the file that
+// includes the pch.
+
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+struct SOffH {
+  short m : 9;
+  int q : 12;
+};
+
+#pragma ms_struct on
+
+struct SOnH {
+  short m : 9;
+  int q : 12;
+};
+
+#else
+
+struct SOnC {
+  short m : 9;
+  int q : 12;
+};
+
+static_assert(sizeof(SOffH) == 4, "");
+static_assert(sizeof(SOnH) == 8, "");
+static_assert(sizeof(SOnC) == 8, "");
+
+#endif
diff --git a/test/PCH/pragma-pointers_to_members.cpp b/test/PCH/pragma-pointers_to_members.cpp
new file mode 100644
index 0000000000000..53edd6b085d9a
--- /dev/null
+++ b/test/PCH/pragma-pointers_to_members.cpp
@@ -0,0 +1,34 @@
+// Test this without pch.
+// RUN: %clang_cc1 %s -Wunknown-pragmas -Werror -triple i386-pc-win32 -fms-extensions -fsyntax-only -include %s -verify -std=c++11
+
+// Test with pch.
+// RUN: %clang_cc1 %s -Wunknown-pragmas -Werror -triple i386-pc-win32  -fms-extensions -emit-pch -o %t -std=c++11
+// RUN: %clang_cc1 %s -Wunknown-pragmas -Werror -triple i386-pc-win32  -fms-extensions -fsyntax-only -include-pch %t -verify -std=c++11
+
+// The first run line creates a pch, and since at that point HEADER is not
+// defined, the only thing contained in the pch is the pragma. The second line
+// then includes that pch, so HEADER is defined and the actual code is compiled.
+// The check then makes sure that the pragma is in effect in the file that
+// includes the pch.
+
+// expected-no-diagnostics
+
+#ifndef HEADER
+#define HEADER
+
+struct S0;
+static_assert(sizeof(int S0::*) == 12, "");
+
+struct S1;
+struct S2;
+
+#pragma pointers_to_members(full_generality, single_inheritance)
+
+static_assert(sizeof(int S1::*) == 4, "");
+
+#else
+
+static_assert(sizeof(int S2::*) == 4, "");
+static_assert(sizeof(int S0::*) == 12, "");
+
+#endif
diff --git a/test/PCH/reloc.c b/test/PCH/reloc.c
index bf70ab6f69222..14788f019f137 100644
--- a/test/PCH/reloc.c
+++ b/test/PCH/reloc.c
@@ -1,7 +1,7 @@
 // RUN: %clang -target x86_64-apple-darwin10 --relocatable-pch -o %t \
-// RUN:   -isysroot %S/libroot %S/libroot/usr/include/reloc.h
+// RUN:   -isysroot %S/Inputs/libroot %S/Inputs/libroot/usr/include/reloc.h
 // RUN: %clang -target x86_64-apple-darwin10 -fsyntax-only \
-// RUN:   -include-pch %t -isysroot %S/libroot %s -Xclang -verify
+// RUN:   -include-pch %t -isysroot %S/Inputs/libroot %s -Xclang -verify
 // RUN: not %clang -target x86_64-apple-darwin10 -include-pch %t %s
 // REQUIRES: x86-registered-target
 
@@ -11,5 +11,5 @@ int x = 2; // expected-error{{redefinition}}
 int y = 5; // expected-error{{redefinition}}
 
 
-// expected-note@libroot/usr/include/reloc.h:13{{previous definition}}
-// expected-note@libroot/usr/include/reloc2.h:14{{previous definition}}
+// expected-note@Inputs/libroot/usr/include/reloc.h:13{{previous definition}}
+// expected-note@Inputs/libroot/usr/include/reloc2.h:14{{previous definition}}
diff --git a/test/PCH/type_pack_element.cpp b/test/PCH/type_pack_element.cpp
new file mode 100644
index 0000000000000..c4ed6c81e2e9c
--- /dev/null
+++ b/test/PCH/type_pack_element.cpp
@@ -0,0 +1,16 @@
+// RUN: %clang_cc1 -std=c++14 -x c++-header %s -emit-pch -o %t.pch
+// RUN: %clang_cc1 -std=c++14 -x c++ /dev/null -include-pch %t.pch
+
+template <int i>
+struct X { };
+
+using SizeT = decltype(sizeof(int));
+
+template <SizeT i, typename ...T>
+using TypePackElement = __type_pack_element<i, T...>;
+
+void fn1() {
+  X<0> x0 = TypePackElement<0, X<0>, X<1>, X<2>>{};
+  X<1> x1 = TypePackElement<1, X<0>, X<1>, X<2>>{};
+  X<2> x2 = TypePackElement<2, X<0>, X<1>, X<2>>{};
+}
diff --git a/test/PCH/uuidof.cpp b/test/PCH/uuidof.cpp
new file mode 100644
index 0000000000000..207a8dafee3b1
--- /dev/null
+++ b/test/PCH/uuidof.cpp
@@ -0,0 +1,9 @@
+// RUN: %clang_cc1 -fms-extensions -x c++-header -emit-pch -o %t %s
+// RUN: %clang_cc1 -fms-extensions -include-pch %t -fsyntax-only %s -emit-llvm -o - | FileCheck %s
+
+#ifndef HEADER
+#define HEADER
+struct _GUID {};
+const _GUID &x = __uuidof(0);
+// CHECK-DAG: @_GUID_00000000_0000_0000_0000_000000000000
+#endif
diff --git a/test/Parser/MicrosoftExtensions.c b/test/Parser/MicrosoftExtensions.c
index e58a7455c083f..39ab51f31fa67 100644
--- a/test/Parser/MicrosoftExtensions.c
+++ b/test/Parser/MicrosoftExtensions.c
@@ -35,6 +35,9 @@ void test_ms_alignof_alias(void) {
 /* Charify extension. */
 #define FOO(x) #@x
 char x = FOO(a);
+#define HASHAT #@
+#define MISSING_ARG(x) #@
+/* expected-error@-1 {{'#@' is not followed by a macro parameter}} */
 
 typedef enum E { e1 };
 
diff --git a/test/Parser/cxx-altivec.cpp b/test/Parser/cxx-altivec.cpp
index ac20de288f172..5b0da6c5e6fd9 100644
--- a/test/Parser/cxx-altivec.cpp
+++ b/test/Parser/cxx-altivec.cpp
@@ -1,6 +1,7 @@
 // RUN: %clang_cc1 -triple=powerpc-apple-darwin8 -faltivec -fsyntax-only -verify -std=c++11 %s
 // RUN: %clang_cc1 -triple=powerpc64-unknown-linux-gnu -faltivec -fsyntax-only -verify -std=c++11 %s
 // RUN: %clang_cc1 -triple=powerpc64le-unknown-linux-gnu -faltivec -fsyntax-only -verify -std=c++11 %s
+#include <altivec.h>
 
 __vector char vv_c;
 __vector signed char vv_sc;
diff --git a/test/Parser/cxx-ambig-paren-expr-asan.cpp b/test/Parser/cxx-ambig-paren-expr-asan.cpp
new file mode 100644
index 0000000000000..ec9d6b9da39d0
--- /dev/null
+++ b/test/Parser/cxx-ambig-paren-expr-asan.cpp
@@ -0,0 +1,9 @@
+// RUN: %clang_cc1 -fsyntax-only -pedantic -verify %s
+
+// This syntax error used to cause use-after free due to token local buffer
+// in ParseCXXAmbiguousParenExpression.
+int H((int()[)]);
+// expected-error@-1 {{expected expression}}
+// expected-error@-2 {{expected ']'}}
+// expected-note@-3 {{to match this '['}}
+// expected-error@-4 {{expected ';' after top level declarator}}
diff --git a/test/Parser/cxx-ambig-paren-expr.cpp b/test/Parser/cxx-ambig-paren-expr.cpp
index 398820567237d..cc509f7b059f3 100644
--- a/test/Parser/cxx-ambig-paren-expr.cpp
+++ b/test/Parser/cxx-ambig-paren-expr.cpp
@@ -21,8 +21,14 @@ void f() {
   struct S{int operator()();};
   (S())();
 
-  // FIXME: Special case: "++" is postfix here, not prefix
-  // (S())++;
+  // Special case: "++" is postfix here, not prefix
+  (S())++; // expected-error {{cannot increment value of type 'S'}}
+
+  struct X { int &operator++(int); X operator[](int); int &operator++(); };
+  int &postfix_incr = (X()[3])++;
+  (X())++ ++; // ok, not a C-style cast
+  (X())++ ++X(); // expected-error {{C-style cast from 'int' to 'X ()'}}
+  int q = (int)++(x);
 }
 
 // Make sure we do tentative parsing correctly in conditions.
diff --git a/test/Parser/cxx-casting.cpp b/test/Parser/cxx-casting.cpp
index 43885bff27a5f..b1ae591865eb1 100644
--- a/test/Parser/cxx-casting.cpp
+++ b/test/Parser/cxx-casting.cpp
@@ -37,7 +37,7 @@ char postfix_expr_test()
 // This was being incorrectly tentatively parsed.
 namespace test1 {
   template <class T> class A {}; // expected-note 2{{here}}
-  void foo() { A<int>(*(A<int>*)0); }
+  void foo() { A<int>(*(A<int>*)0); } // expected-warning {{binding dereferenced null pointer to reference has undefined behavior}}
 }
 
 typedef char* c;
diff --git a/test/Parser/cxx-class.cpp b/test/Parser/cxx-class.cpp
index 9e907f1b1c12f..3cc006af23dc0 100644
--- a/test/Parser/cxx-class.cpp
+++ b/test/Parser/cxx-class.cpp
@@ -1,4 +1,7 @@
 // RUN: %clang_cc1 -fsyntax-only -verify -pedantic -fcxx-exceptions %s
+// RUN: %clang_cc1 -fsyntax-only -verify -pedantic -fcxx-exceptions -std=c++98 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -pedantic -fcxx-exceptions -std=c++11 %s
+
 class C;
 class C {
 public:
@@ -69,11 +72,30 @@ public; // expected-error{{expected ':'}}
 };
 
 class F {
-    int F1 { return 1; } // expected-error{{function definition does not declare parameters}}
-    void F2 {} // expected-error{{function definition does not declare parameters}}
+    int F1 { return 1; }
+#if __cplusplus <= 199711L
+    // expected-error@-2 {{function definition does not declare parameters}}
+#else
+    // expected-error@-4 {{expected expression}}
+    // expected-error@-5 {{expected}}
+    // expected-note@-6 {{to match this '{'}}
+    // expected-error@-7 {{expected ';' after class}}
+#endif
+
+    void F2 {}
+#if __cplusplus <= 199711L
+    // expected-error@-2 {{function definition does not declare parameters}}
+#else
+    // expected-error@-4 {{variable has incomplete type 'void'}}
+    // expected-error@-5 {{expected ';' after top level declarator}}
+#endif
+
     typedef int F3() { return 0; } // expected-error{{function definition declared 'typedef'}}
     typedef void F4() {} // expected-error{{function definition declared 'typedef'}}
 };
+#if __cplusplus >= 201103L
+// expected-error@-2 {{extraneous closing brace}}
+#endif
 
 namespace ctor_error {
   class Foo {};
@@ -203,14 +225,38 @@ namespace BadFriend {
 }
 
 class PR20760_a {
-  int a = ); // expected-warning {{extension}} expected-error {{expected expression}}
-  int b = }; // expected-warning {{extension}} expected-error {{expected expression}}
-  int c = ]; // expected-warning {{extension}} expected-error {{expected expression}}
+  int a = ); // expected-error {{expected expression}}
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{in-class initialization of non-static data member is a C++11 extension}}
+#endif
+
+  int b = }; // expected-error {{expected expression}}
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{in-class initialization of non-static data member is a C++11 extension}}
+#endif
+
+  int c = ]; // expected-error {{expected expression}}
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{in-class initialization of non-static data member is a C++11 extension}}
+#endif
+
 };
 class PR20760_b {
-  int d = d); // expected-warning {{extension}} expected-error {{expected ';'}}
-  int e = d]; // expected-warning {{extension}} expected-error {{expected ';'}}
-  int f = d // expected-warning {{extension}} expected-error {{expected ';'}}
+  int d = d); // expected-error {{expected ';'}}
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{in-class initialization of non-static data member is a C++11 extension}}
+#endif
+
+  int e = d]; // expected-error {{expected ';'}}
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{in-class initialization of non-static data member is a C++11 extension}}
+#endif
+
+  int f = d // expected-error {{expected ';'}}
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{in-class initialization of non-static data member is a C++11 extension}}
+#endif
+
 };
 
 namespace PR20887 {
diff --git a/test/Parser/cxx-decl.cpp b/test/Parser/cxx-decl.cpp
index be79eb433fd03..8a7a388605392 100644
--- a/test/Parser/cxx-decl.cpp
+++ b/test/Parser/cxx-decl.cpp
@@ -1,4 +1,6 @@
 // RUN: %clang_cc1 -verify -fsyntax-only -triple i386-linux -pedantic-errors -fcxx-exceptions -fexceptions %s
+// RUN: %clang_cc1 -verify -fsyntax-only -triple i386-linux -pedantic-errors -fcxx-exceptions -fexceptions -std=c++98 %s
+// RUN: %clang_cc1 -verify -fsyntax-only -triple i386-linux -pedantic-errors -fcxx-exceptions -fexceptions -std=c++11 %s
 
 const char const *x10; // expected-error {{duplicate 'const' declaration specifier}}
 
@@ -46,7 +48,10 @@ class asm_class_test {
   void foo() __asm__("baz");
 };
 
-enum { fooenum = 1, }; // expected-error {{commas at the end of enumerator lists are a C++11 extension}}
+enum { fooenum = 1, };
+#if __cplusplus <= 199711L
+// expected-error@-2 {{commas at the end of enumerator lists are a C++11 extension}}
+#endif
 
 struct a {
   int Type : fooenum;
@@ -81,7 +86,11 @@ namespace Commas {
   (global5),
   *global6,
   &global7 = global1,
-  &&global8 = static_cast<int&&>(global1), // expected-error 2{{rvalue reference}}
+  &&global8 = static_cast<int&&>(global1),
+#if __cplusplus <= 199711L
+  // expected-error@-2 2{{rvalue references are a C++11 extension}}
+#endif
+
   S::a,
   global9,
   global10 = 0,
@@ -185,7 +194,13 @@ namespace PR15017 {
 }
 
 // Ensure we produce at least some diagnostic for attributes in C++98.
-[[]] struct S; // expected-error 2{{}}
+[[]] struct S;
+#if __cplusplus <= 199711L
+// expected-error@-2 {{expected expression}}
+// expected-error@-3 {{expected unqualified-id}}
+#else
+// expected-error@-5 {{an attribute list cannot appear here}}
+#endif
 
 namespace test7 {
   struct Foo {
@@ -212,14 +227,20 @@ namespace PR5066 {
   template<typename T> struct X {};
   X<int N> x; // expected-error {{type-id cannot have a name}}
 
-  using T = int (*T)(); // expected-error {{type-id cannot have a name}} expected-error {{C++11}}
+  using T = int (*T)(); // expected-error {{type-id cannot have a name}}
+#if __cplusplus <= 199711L
+  // expected-error@-2 {{alias declarations are a C++11 extensio}}
+#endif
+
 }
 
 namespace PR17255 {
 void foo() {
-  typename A::template B<>; // expected-error {{use of undeclared identifier 'A'}} \
-                            // expected-error {{expected a qualified name after 'typename'}} \
-                            // expected-error {{'template' keyword outside of a template}}
+  typename A::template B<>; // expected-error {{use of undeclared identifier 'A'}}
+#if __cplusplus <= 199711L
+  // expected-error@-2 {{'template' keyword outside of a template}}
+#endif
+  // expected-error@-4 {{expected a qualified name after 'typename'}}
 }
 }
 
@@ -236,12 +257,25 @@ namespace DuplicateFriend {
   struct A {
     friend void friend f(); // expected-warning {{duplicate 'friend' declaration specifier}}
     friend struct B friend; // expected-warning {{duplicate 'friend' declaration specifier}}
+#if __cplusplus >= 201103L
+    // expected-error@-2 {{'friend' must appear first in a non-function declaration}}
+#endif
   };
 }
 
 // PR8380
 extern ""      // expected-error {{unknown linkage language}}
-test6a { ;// expected-error {{C++ requires a type specifier for all declarations}} \
-     // expected-error {{expected ';' after top level declarator}}
+test6a { ;// expected-error {{C++ requires a type specifier for all declarations}}
+#if __cplusplus <= 199711L
+// expected-error@-2 {{expected ';' after top level declarator}}
+#else
+// expected-error@-4 {{expected expression}}
+// expected-note@-5 {{to match this}}
+#endif
   
   int test6b;
+#if __cplusplus >= 201103L
+// expected-error@+3 {{expected}}
+// expected-error@-3 {{expected ';' after top level declarator}}
+#endif
+
diff --git a/test/Parser/cxx-friend.cpp b/test/Parser/cxx-friend.cpp
index ace0ff26e2d9a..a4492ba1a7ab9 100644
--- a/test/Parser/cxx-friend.cpp
+++ b/test/Parser/cxx-friend.cpp
@@ -1,4 +1,6 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++98 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
 
 class C {
   friend class D;
@@ -21,9 +23,20 @@ class B {
   // 'A' here should refer to the declaration above.  
   friend class A;
 
-  friend C; // expected-warning {{specify 'class' to befriend}}
-  friend U; // expected-warning {{specify 'union' to befriend}}
-  friend int; // expected-warning {{non-class friend type 'int'}}
+  friend C;
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{unelaborated friend declaration is a C++11 extension; specify 'class' to befriend 'C'}}
+#endif
+
+  friend U;
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{unelaborated friend declaration is a C++11 extension; specify 'union' to befriend 'U'}}
+#endif
+
+  friend int;
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{non-class friend type 'int' is a C++11 extension}}
+#endif
 
   friend void myfunc();
 
diff --git a/test/Parser/cxx-invalid-for-range.cpp b/test/Parser/cxx-invalid-for-range.cpp
new file mode 100644
index 0000000000000..557c1da209a9a
--- /dev/null
+++ b/test/Parser/cxx-invalid-for-range.cpp
@@ -0,0 +1,18 @@
+// RUN: %clang_cc1 -std=c++11 -fsyntax-only -verify %s
+
+// From PR23057 comment #18 (https://llvm.org/bugs/show_bug.cgi?id=23057#c18).
+
+namespace N {
+  int X[10]; // expected-note{{declared here}}}}
+}
+
+void f1() {
+  for (auto operator new : X); // expected-error{{'operator new' cannot be the name of a variable or data member}}
+                               // expected-error@-1{{use of undeclared identifier 'X'; did you mean 'N::X'?}}
+}
+
+void f2() {
+  for (a operator== :) // expected-error{{'operator==' cannot be the name of a variable or data member}}
+                       // expected-error@-1{{expected expression}}
+                       // expected-error@-2{{unknown type name 'a'}}
+} // expected-error{{expected statement}}
diff --git a/test/Parser/cxx-invalid-function-decl.cpp b/test/Parser/cxx-invalid-function-decl.cpp
new file mode 100644
index 0000000000000..2db27516eefab
--- /dev/null
+++ b/test/Parser/cxx-invalid-function-decl.cpp
@@ -0,0 +1,42 @@
+// RUN: %clang_cc1 -fsyntax-only -verify %s
+
+// Check that "::new" and "::delete" in member initializer list are diagnosed
+// correctly and don't lead to infinite loop on parsing.
+
+// Error: X() (initializer on non-constructor), "::new" is skipped.
+void f1() : X() ::new{}; // expected-error{{only constructors take base initializers}}
+
+// Errors: first "::delete" and initializer on non-constructor, others skipped.
+void f2() : ::delete, ::new, X() ::new ::delete{} // expected-error{{expected class member or base class name}}
+                                                  // expected-error@-1{{only constructors take base initializers}}
+
+// Errors: the '::' token, "::delete" and initializer on non-constructor, others skipped.
+void f3() : ::, ::delete X(), ::new {}; // expected-error2{{expected class member or base class name}}
+                                        // expected-error@-1{{only constructors take base initializers}}
+
+template <class T>
+struct Base1 {
+  T x1;
+  Base1(T a1) : x1(a1) {}
+};
+
+template <class T>
+struct Base2 {
+  T x2;
+  Base2(T a2) : x2(a2) {}
+};
+
+struct S : public Base1<int>, public Base2<float> {
+  int x;
+
+  // 1-st initializer is correct (just missing ','), 2-nd incorrect, skip other.
+  S() : ::Base1<int>(0) ::new, ::Base2<float>(1.0) ::delete x(2) {} // expected-error{{expected class member or base class name}}
+                                                                    // expected-error@-1{{missing ',' between base or member initializers}}
+
+  // 1-st and 2-nd are correct, errors: '::' and "::new", others skipped.
+  S(int a) : Base1<int>(a), ::Base2<float>(1.0), ::, // expected-error{{expected class member or base class name}}
+             ::new, ! ::delete, ::Base2<() x(3) {}   // expected-error{{expected class member or base class name}}
+
+  // All initializers are correct, nothing to skip, diagnose 2 missing commas.
+  S(const S &) : Base1<int>(0) ::Base2<float>(1.0) x(2) {} // expected-error2{{missing ',' between base or member initializers}}
+};
diff --git a/test/Parser/cxx0x-attributes.cpp b/test/Parser/cxx0x-attributes.cpp
index 7eec5761ea055..906d72b087cb5 100644
--- a/test/Parser/cxx0x-attributes.cpp
+++ b/test/Parser/cxx0x-attributes.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fcxx-exceptions -fexceptions -fsyntax-only -verify -std=c++11 -Wc++14-compat %s
+// RUN: %clang_cc1 -fcxx-exceptions -fexceptions -fsyntax-only -verify -std=c++11 -Wc++14-compat -Wc++14-extensions -Wc++1z-extensions %s
 
 // Need std::initializer_list
 namespace std {
@@ -336,7 +336,6 @@ namespace {
   // expected-warning@-1 {{use of the 'deprecated' attribute is a C++14 extension}}
   [[deprecated()]] void foo();
   // expected-error@-1 {{parentheses must be omitted if 'deprecated' attribute's argument list is empty}}
-  // expected-warning@-2 {{use of the 'deprecated' attribute is a C++14 extension}}
   [[gnu::deprecated()]] void quux();
 }
 
@@ -347,6 +346,18 @@ deprecated
 ]] void bad();
 }
 
+int fallthru(int n) {
+  switch (n) {
+  case 0:
+    n += 5;
+    [[fallthrough]]; // expected-warning {{use of the 'fallthrough' attribute is a C++1z extension}}
+  case 1:
+    n *= 2;
+    break;
+  }
+  return n;
+}
+
 #define attr_name bitand
 #define attr_name_2(x) x
 #define attr_name_3(x, y) x##y
diff --git a/test/Parser/cxx0x-condition.cpp b/test/Parser/cxx0x-condition.cpp
index 8b64bcf1273d5..071e09e4158f0 100644
--- a/test/Parser/cxx0x-condition.cpp
+++ b/test/Parser/cxx0x-condition.cpp
@@ -23,9 +23,9 @@ void f() {
 
   if (S b(a)) {} // expected-error {{variable declaration in condition cannot have a parenthesized initializer}}
 
-  if (S b(n)) {} // expected-error {{a function type is not allowed here}} expected-error {{must have an initializer}}
+  if (S b(n)) {} // expected-error {{a function type is not allowed here}}
   if (S b(n) = 0) {} // expected-error {{a function type is not allowed here}}
-  if (S b(n) == 0) {} // expected-error {{a function type is not allowed here}} expected-error {{did you mean '='?}}
+  if (S b(n) == 0) {} // expected-error {{a function type is not allowed here}}
 
   S s(a);
   if (S{s}) {} // ok
diff --git a/test/Parser/cxx0x-decl.cpp b/test/Parser/cxx0x-decl.cpp
index 23f46a1847888..c4f03566029ad 100644
--- a/test/Parser/cxx0x-decl.cpp
+++ b/test/Parser/cxx0x-decl.cpp
@@ -17,6 +17,8 @@ auto g() -> enum E {
   return E();
 }
 
+int decltype(f())::*ptr_mem_decltype;
+
 class ExtraSemiAfterMemFn {
   // Due to a peculiarity in the C++11 grammar, a deleted or defaulted function
   // is permitted to be followed by either one or two semicolons.
diff --git a/test/Parser/cxx1z-constexpr-lambdas.cpp b/test/Parser/cxx1z-constexpr-lambdas.cpp
new file mode 100644
index 0000000000000..ea000e361ccb6
--- /dev/null
+++ b/test/Parser/cxx1z-constexpr-lambdas.cpp
@@ -0,0 +1,31 @@
+// RUN: %clang_cc1 -std=c++1z %s -verify 
+// RUN: %clang_cc1 -std=c++14 %s -verify 
+// RUN: %clang_cc1 -std=c++11 %s -verify 
+
+
+auto XL0 = [] constexpr { }; //expected-error{{requires '()'}} expected-error{{expected body}}
+auto XL1 = [] () mutable 
+                 mutable     //expected-error{{cannot appear multiple times}}
+                 mutable { }; //expected-error{{cannot appear multiple times}}
+
+#if __cplusplus > 201402L
+auto XL2 = [] () constexpr mutable constexpr { }; //expected-error{{cannot appear multiple times}}
+auto L = []() mutable constexpr { };
+auto L2 = []() constexpr { };
+auto L4 = []() constexpr mutable { }; 
+auto XL16 = [] () constexpr
+                  mutable
+                  constexpr   //expected-error{{cannot appear multiple times}}
+                  mutable     //expected-error{{cannot appear multiple times}}
+                  mutable     //expected-error{{cannot appear multiple times}}
+                  constexpr   //expected-error{{cannot appear multiple times}}
+                  constexpr   //expected-error{{cannot appear multiple times}}
+                  { };
+
+#else
+auto L = []() mutable constexpr {return 0; }; //expected-warning{{is a C++1z extension}}
+auto L2 = []() constexpr { return 0;};//expected-warning{{is a C++1z extension}}
+auto L4 = []() constexpr mutable { return 0; }; //expected-warning{{is a C++1z extension}}
+#endif
+
+
diff --git a/test/Parser/cxx1z-init-statement.cpp b/test/Parser/cxx1z-init-statement.cpp
new file mode 100644
index 0000000000000..3d119ef8e709c
--- /dev/null
+++ b/test/Parser/cxx1z-init-statement.cpp
@@ -0,0 +1,51 @@
+// RUN: %clang_cc1 -std=c++1z -verify %s -Wno-vexing-parse
+
+int g, h;
+typedef int T;
+int f() {
+  // init-statement declarations
+  if (T n = 0; n != 0) {}
+  if (T f(); f()) {}
+  if (T(f()); f()) {}
+  if (T(f()), g, h; f()) {}
+  if (T f(); f()) {}
+  if (T f(), g, h; f()) {}
+  if (T(n) = 0; n) {}
+
+  // init-statement expressions
+  if (T{f()}; f()) {}
+  if (T{f()}, g, h; f()) {} // expected-warning 2{{unused}}
+  if (T(f()), g, h + 1; f()) {} // expected-warning 2{{unused}}
+
+  // condition declarations
+  if (T(n){g}) {}
+  if (T f()) {} // expected-error {{function type}}
+  if (T f(), g, h) {} // expected-error {{function type}}
+  if (T(n) = 0) {}
+
+  // condition expressions
+  if (T(f())) {}
+  if (T{f()}) {}
+  if (T(f()), g, h) {} // expected-warning 2{{unused}}
+  if (T{f()}, g, h) {} // expected-warning 2{{unused}}
+
+  // none of the above, disambiguated as expression (can't be a declaration)
+  if (T(n)(g)) {} // expected-error {{undeclared identifier 'n'}}
+  if (T(n)(int())) {} // expected-error {{undeclared identifier 'n'}}
+
+  // Likewise for 'switch'
+  switch (int n; n) {}
+  switch (g; int g = 5) {}
+
+  if (int a, b; int c = a) { // expected-note 6{{previous}}
+    int a; // expected-error {{redefinition}}
+    int b; // expected-error {{redefinition}}
+    int c; // expected-error {{redefinition}}
+  } else {
+    int a; // expected-error {{redefinition}}
+    int b; // expected-error {{redefinition}}
+    int c; // expected-error {{redefinition}}
+  }
+
+  return 0;
+}
diff --git a/test/Parser/extra-semi.cpp b/test/Parser/extra-semi.cpp
index 1a44dae411e2e..7287f856d8c97 100644
--- a/test/Parser/extra-semi.cpp
+++ b/test/Parser/extra-semi.cpp
@@ -5,7 +5,6 @@
 
 void test1(int a;) { // expected-error{{unexpected ';' before ')'}}
   while (a > 5;) {} // expected-error{{unexpected ';' before ')'}}
-  if (int b = 10;) {} // expected-error{{unexpected ';' before ')'}}
   for (int c  = 0; c < 21; ++c;) {} // expected-error{{unexpected ';' before ')'}}
   int d = int(3 + 4;); // expected-error{{unexpected ';' before ')'}}
   int e[5;]; // expected-error{{unexpected ';' before ']'}}
diff --git a/test/Parser/ms-anachronism.c b/test/Parser/ms-anachronism.c
new file mode 100644
index 0000000000000..37676397873cc
--- /dev/null
+++ b/test/Parser/ms-anachronism.c
@@ -0,0 +1,3 @@
+// RUN: %clang_cc1 -triple i686-windows-msvc -fms-extensions -fsyntax-only -verify %s
+
+struct {} __cdecl s; // expected-warning {{'__cdecl' only applies to function types; type here is 'struct}}
diff --git a/test/Parser/ms-inline-asm.c b/test/Parser/ms-inline-asm.c
index 6dde5f5c0d69b..e3c392406957f 100644
--- a/test/Parser/ms-inline-asm.c
+++ b/test/Parser/ms-inline-asm.c
@@ -53,6 +53,10 @@ void t11() {
 void t12() {
   __asm jmp label // expected-error {{use of undeclared label 'label'}}
 }
+void t13() {
+  __asm m{o}v eax, ebx // expected-error {{expected identifier}} expected-error {{use of undeclared label '{o}v eax, ebx'}}
+}
+
 int t_fail() { // expected-note {{to match this}}
   __asm 
-  __asm { // expected-error 2 {{expected}} expected-note {{to match this}}
+  __asm { // expected-error 3 {{expected}} expected-note {{to match this}}
diff --git a/test/Parser/objc-available.m b/test/Parser/objc-available.m
new file mode 100644
index 0000000000000..ca30753144359
--- /dev/null
+++ b/test/Parser/objc-available.m
@@ -0,0 +1,24 @@
+// RUN: %clang_cc1 -fsyntax-only -Wunguarded-availability -triple x86_64-apple-macosx10.10.0 -verify %s
+
+void f() {
+
+  if (@available(macos 10.12, *)) {}
+  else if (@available(macos 10.11, *)) {}
+  else {}
+
+  (void)__builtin_available(ios 8, macos 10.10, *);
+
+  (void)@available(macos 10.11); // expected-error{{must handle potential future platforms with '*'}}
+  (void)@available(macos 10.11, macos 10.11, *); // expected-error{{version for 'macos' already specified}}
+
+  (void)@available(erik_os 10.11, *); // expected-error{{unrecognized platform name erik_os}}
+
+  (void)@available(erik_os 10.10, hat_os 1.0, *); // expected-error 2 {{unrecognized platform name}}
+
+  (void)@available(ios 8, *); // expected-warning{{using '*' case here, platform macos is not accounted for}}
+
+  (void)@available(); // expected-error{{expected a platform name here}}
+  (void)@available(macos 10.10,); // expected-error{{expected a platform name here}}
+  (void)@available(macos); // expected-error{{expected a version}}
+  (void)@available; // expected-error{{expected '('}}
+}
diff --git a/test/Parser/objc-class-property.m b/test/Parser/objc-class-property.m
new file mode 100644
index 0000000000000..e4c3b0766b4a9
--- /dev/null
+++ b/test/Parser/objc-class-property.m
@@ -0,0 +1,29 @@
+// RUN: %clang_cc1 -fsyntax-only -verify %s
+// expected-no-diagnostics
+
+@interface Root
+-(id) alloc;
+-(id) init;
+@end
+
+@interface A : Root {
+  int x;
+  int z;
+}
+@property int x;
+@property int y;
+@property int z;
+@property(readonly) int ro, ro2;
+@property (class) int c;
+@end
+
+@implementation A
+@dynamic x;
+@synthesize z;
+@dynamic c;
+@end
+
+int test() {
+  A *a = [[A alloc] init];
+  return a.x;
+}
diff --git a/test/Parser/objc-default-ctor-init.mm b/test/Parser/objc-default-ctor-init.mm
new file mode 100644
index 0000000000000..ea4c064d77914
--- /dev/null
+++ b/test/Parser/objc-default-ctor-init.mm
@@ -0,0 +1,21 @@
+// RUN: %clang_cc1 -triple x86_64-apple-macosx10.10 -std=c++11 -ast-dump %s | FileCheck %s
+// CHECK: CXXCtorInitializer Field {{.*}} 'ptr' 'void *'
+// CHECK: CXXCtorInitializer Field {{.*}} 'q' 'struct Q'
+
+@interface NSObject
+@end
+
+@interface I : NSObject
+@end
+
+struct Q { Q(); };
+
+struct S {
+  S();
+  void *ptr = nullptr;
+  Q q;
+};
+
+@implementation I
+S::S() {}
+@end
diff --git a/test/Parser/objcxx11-messaging-and-lambda.mm b/test/Parser/objcxx11-messaging-and-lambda.mm
new file mode 100644
index 0000000000000..002f3e9c710aa
--- /dev/null
+++ b/test/Parser/objcxx11-messaging-and-lambda.mm
@@ -0,0 +1,11 @@
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
+
+#define OBJCLASS(name) // expected-note {{macro 'OBJCLASS' defined here}}
+
+class NSMutableData;
+
+NSMutableData *test() { // expected-note {{to match this '{'}}
+  NSMutableData *data = [[[OBJCLASS(NSMutableDataOBJCLASS( alloc] init] autorelease]; // expected-error {{unterminated function-like macro invocation}} \
+  // expected-error {{expected ';' at end of declaration}}
+  return data;
+} // expected-error {{expected expression}} expected-error {{expected '}'}}
diff --git a/test/Parser/objcxx11-protocol-in-template.mm b/test/Parser/objcxx11-protocol-in-template.mm
index c5c3b6c75a473..5c80ae9759791 100644
--- a/test/Parser/objcxx11-protocol-in-template.mm
+++ b/test/Parser/objcxx11-protocol-in-template.mm
@@ -8,3 +8,11 @@ template<class T> class vector {};
 
 vector<id<P>> v;
 vector<vector<id<P>>> v2;
+
+@protocol PA;
+@protocol PB;
+
+@class NSArray<ObjectType>;
+typedef int some_t;
+
+id<PA> FA(NSArray<id<PB>> *h, some_t group);
diff --git a/test/Parser/objcxx14-protocol-in-template.mm b/test/Parser/objcxx14-protocol-in-template.mm
new file mode 100644
index 0000000000000..36da92e251f4c
--- /dev/null
+++ b/test/Parser/objcxx14-protocol-in-template.mm
@@ -0,0 +1,15 @@
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++14 %s
+
+template<class T> class vector {};
+@protocol P @end
+
+// expected-no-diagnostics
+
+template <typename Functor> void F(Functor functor) {}
+
+// Test protocol in template within lambda capture initializer context.
+void z() {
+  id<P> x = 0;
+  (void)x;
+  F( [ x = vector<id<P>>{} ] {} );
+}
diff --git a/test/Parser/opencl-astype.cl b/test/Parser/opencl-astype.cl
index 72f98a4ace9ff..903c42ee8c14b 100644
--- a/test/Parser/opencl-astype.cl
+++ b/test/Parser/opencl-astype.cl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fsyntax-only -verify %s -triple spir-unknown-unknown
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 
 void test_astype() {
diff --git a/test/Parser/opencl-atomics-cl20.cl b/test/Parser/opencl-atomics-cl20.cl
index cb2f59721acc0..cd37757b97f68 100644
--- a/test/Parser/opencl-atomics-cl20.cl
+++ b/test/Parser/opencl-atomics-cl20.cl
@@ -1,11 +1,14 @@
-// RUN: %clang_cc1 %s -verify -pedantic -fsyntax-only
-// RUN: %clang_cc1 %s -verify  -fsyntax-only -cl-std=CL2.0 -DCL20
-// RUN: %clang_cc1 %s -verify  -fsyntax-only -cl-std=CL2.0 -DCL20 -DEXT
+// RUN: %clang_cc1 %s -triple spir-unknown-unknown -verify -pedantic -fsyntax-only
+// RUN: %clang_cc1 %s -triple spir-unknown-unknown -verify -fsyntax-only -cl-std=CL2.0 -DCL20
+// RUN: %clang_cc1 %s -triple spir-unknown-unknown -verify -fsyntax-only -cl-std=CL2.0 -DCL20 -DEXT -Wpedantic-core-features
 
 #ifdef EXT
 #pragma OPENCL EXTENSION cl_khr_int64_base_atomics:enable
 #pragma OPENCL EXTENSION cl_khr_int64_extended_atomics:enable
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#if __OPENCL_C_VERSION__ >= CL_VERSION_1_2
+// expected-warning@-2{{OpenCL extension 'cl_khr_fp64' is core feature or supported optional core feature - ignoring}}
+#endif
 #endif
 
 void atomic_types_test() {
@@ -44,15 +47,14 @@ void atomic_types_test() {
 // expected-error@-28 {{use of type 'atomic_ulong' (aka '_Atomic(unsigned long)') requires cl_khr_int64_extended_atomics extension to be enabled}}
 // expected-error@-27 {{use of type 'atomic_double' (aka '_Atomic(double)') requires cl_khr_int64_base_atomics extension to be enabled}}
 // expected-error@-28 {{use of type 'atomic_double' (aka '_Atomic(double)') requires cl_khr_int64_extended_atomics extension to be enabled}}
-// expected-error@-29 {{use of type 'atomic_double' (aka '_Atomic(double)') requires cl_khr_fp64 extension to be enabled}}
-// expected-error-re@-28 {{use of type 'atomic_intptr_t' (aka '_Atomic({{.+}})') requires cl_khr_int64_base_atomics extension to be enabled}}
-// expected-error-re@-29 {{use of type 'atomic_intptr_t' (aka '_Atomic({{.+}})') requires cl_khr_int64_extended_atomics extension to be enabled}}
-// expected-error-re@-29 {{use of type 'atomic_uintptr_t' (aka '_Atomic({{.+}})') requires cl_khr_int64_base_atomics extension to be enabled}}
-// expected-error-re@-30 {{use of type 'atomic_uintptr_t' (aka '_Atomic({{.+}})') requires cl_khr_int64_extended_atomics extension to be enabled}}
-// expected-error-re@-30 {{use of type 'atomic_size_t' (aka '_Atomic({{.+}})') requires cl_khr_int64_base_atomics extension to be enabled}}
-// expected-error-re@-31 {{use of type 'atomic_size_t' (aka '_Atomic({{.+}})') requires cl_khr_int64_extended_atomics extension to be enabled}}
-// expected-error-re@-31 {{use of type 'atomic_ptrdiff_t' (aka '_Atomic({{.+}})') requires cl_khr_int64_base_atomics extension to be enabled}}
-// expected-error-re@-32 {{use of type 'atomic_ptrdiff_t' (aka '_Atomic({{.+}})') requires cl_khr_int64_extended_atomics extension to be enabled}}
+// expected-error-re@-27 {{use of type 'atomic_intptr_t' (aka '_Atomic({{.+}})') requires cl_khr_int64_base_atomics extension to be enabled}}
+// expected-error-re@-28 {{use of type 'atomic_intptr_t' (aka '_Atomic({{.+}})') requires cl_khr_int64_extended_atomics extension to be enabled}}
+// expected-error-re@-28 {{use of type 'atomic_uintptr_t' (aka '_Atomic({{.+}})') requires cl_khr_int64_base_atomics extension to be enabled}}
+// expected-error-re@-29 {{use of type 'atomic_uintptr_t' (aka '_Atomic({{.+}})') requires cl_khr_int64_extended_atomics extension to be enabled}}
+// expected-error-re@-29 {{use of type 'atomic_size_t' (aka '_Atomic({{.+}})') requires cl_khr_int64_base_atomics extension to be enabled}}
+// expected-error-re@-30 {{use of type 'atomic_size_t' (aka '_Atomic({{.+}})') requires cl_khr_int64_extended_atomics extension to be enabled}}
+// expected-error-re@-30 {{use of type 'atomic_ptrdiff_t' (aka '_Atomic({{.+}})') requires cl_khr_int64_base_atomics extension to be enabled}}
+// expected-error-re@-31 {{use of type 'atomic_ptrdiff_t' (aka '_Atomic({{.+}})') requires cl_khr_int64_extended_atomics extension to be enabled}}
 #endif
 
 #ifdef CL20
diff --git a/test/Parser/opencl-cl20.cl b/test/Parser/opencl-cl20.cl
index b71869919ba9d..b14ad10153df7 100644
--- a/test/Parser/opencl-cl20.cl
+++ b/test/Parser/opencl-cl20.cl
@@ -10,9 +10,9 @@ __generic int * __generic_test(__generic int *arg) {
   return var;  
 }
 #ifndef CL20
-// expected-error@-5 {{OpenCL does not support the '__generic' type qualifier}}
-// expected-error@-6 {{OpenCL does not support the '__generic' type qualifier}}
-// expected-error@-6 {{OpenCL does not support the '__generic' type qualifier}}
+// expected-error@-5 {{OpenCL version 1.0 does not support the '__generic' type qualifier}}
+// expected-error@-6 {{OpenCL version 1.0 does not support the '__generic' type qualifier}}
+// expected-error@-6 {{OpenCL version 1.0 does not support the '__generic' type qualifier}}
 #endif
 
 generic int * generic_test(generic int *arg) {
@@ -20,7 +20,7 @@ generic int * generic_test(generic int *arg) {
   return var;  
 }
 #ifndef CL20
-// expected-error@-5 {{OpenCL does not support the 'generic' type qualifier}}
-// expected-error@-6 {{OpenCL does not support the 'generic' type qualifier}}
-// expected-error@-6 {{OpenCL does not support the 'generic' type qualifier}}
+// expected-error@-5 {{OpenCL version 1.0 does not support the 'generic' type qualifier}}
+// expected-error@-6 {{OpenCL version 1.0 does not support the 'generic' type qualifier}}
+// expected-error@-6 {{OpenCL version 1.0 does not support the 'generic' type qualifier}}
 #endif
diff --git a/test/Parser/opencl-image-access.cl b/test/Parser/opencl-image-access.cl
index e08d129214397..99ced8e32bb73 100644
--- a/test/Parser/opencl-image-access.cl
+++ b/test/Parser/opencl-image-access.cl
@@ -1,14 +1,19 @@
-// RUN: %clang_cc1 %s -fsyntax-only
+// RUN: %clang_cc1 %s -fsyntax-only -verify
+// RUN: %clang_cc1 %s -fsyntax-only -verify -cl-std=CL2.0 -DCL20
+// expected-no-diagnostics
 
 __kernel void f__ro(__read_only image2d_t a) { }
 
 __kernel void f__wo(__write_only image2d_t a) { }
 
+#if CL20
 __kernel void f__rw(__read_write image2d_t a) { }
-
+#endif
 
 __kernel void fro(read_only image2d_t a) { }
 
 __kernel void fwo(write_only image2d_t a) { }
 
+#if CL20
 __kernel void frw(read_write image2d_t a) { }
+#endif
diff --git a/test/Parser/opencl-pragma.cl b/test/Parser/opencl-pragma.cl
index 4c48b2a496f78..b002b0854a4d0 100644
--- a/test/Parser/opencl-pragma.cl
+++ b/test/Parser/opencl-pragma.cl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 %s -verify -pedantic -Wno-empty-translation-unit -fsyntax-only
+// RUN: %clang_cc1 %s -verify -pedantic -Wno-empty-translation-unit -fsyntax-only -triple spir-unknown-unknown
 
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 
diff --git a/test/Parser/opencl-storage-class.cl b/test/Parser/opencl-storage-class.cl
index 3d9aef59f0bbc..a8ebc1af39993 100644
--- a/test/Parser/opencl-storage-class.cl
+++ b/test/Parser/opencl-storage-class.cl
@@ -1,15 +1,15 @@
-// RUN: %clang_cc1 %s -verify -fsyntax-only
+// RUN: %clang_cc1 %s -verify -fsyntax-only -triple spir-unknown-unknown
 
 void test_storage_class_specs()
 {
-  static int a;    // expected-error {{OpenCL does not support the 'static' storage class specifier}}
-  register int b;  // expected-error {{OpenCL does not support the 'register' storage class specifier}}
-  extern int c;    // expected-error {{OpenCL does not support the 'extern' storage class specifier}}
-  auto int d;      // expected-error {{OpenCL does not support the 'auto' storage class specifier}}
+  static int a;    // expected-error {{OpenCL version 1.0 does not support the 'static' storage class specifier}}
+  register int b;  // expected-error {{OpenCL version 1.0 does not support the 'register' storage class specifier}}
+  extern int c;    // expected-error {{OpenCL version 1.0 does not support the 'extern' storage class specifier}}
+  auto int d;      // expected-error {{OpenCL version 1.0 does not support the 'auto' storage class specifier}}
 
 #pragma OPENCL EXTENSION cl_clang_storage_class_specifiers : enable
-  static int e; // expected-error {{program scope variable must reside in constant address space}}
+  static int e; // expected-error {{static local variable must reside in constant address space}}
   register int f;
-  extern int g;
+  extern int g; // expected-error {{extern variable must reside in constant address space}}
   auto int h;
 }
diff --git a/test/Parser/opencl-unroll-hint.cl b/test/Parser/opencl-unroll-hint.cl
new file mode 100644
index 0000000000000..5742dcde2195b
--- /dev/null
+++ b/test/Parser/opencl-unroll-hint.cl
@@ -0,0 +1,8 @@
+//RUN: %clang_cc1 -O0 -cl-std=CL2.0 -fsyntax-only -verify %s
+
+kernel void B (global int *x) {
+  __attribute__((opencl_unroll_hint(42)))
+  if (x[0])                             // expected-error {{OpenCL only supports 'opencl_unroll_hint' attribute on for, while, and do statements}}
+    x[0] = 15;
+}
+
diff --git a/test/Parser/pragma-loop-safety.cpp b/test/Parser/pragma-loop-safety.cpp
index 0776000e51219..ab87dcdcb63e6 100644
--- a/test/Parser/pragma-loop-safety.cpp
+++ b/test/Parser/pragma-loop-safety.cpp
@@ -16,6 +16,7 @@ void test(int *List, int Length) {
 /* expected-error {{expected ')'}} */ #pragma clang loop interleave(assume_safety
 
 /* expected-error {{invalid argument; expected 'enable', 'full' or 'disable'}} */ #pragma clang loop unroll(assume_safety)
+/* expected-error {{invalid argument; expected 'enable' or 'disable'}} */ #pragma clang loop distribute(assume_safety)
 
 /* expected-error {{invalid argument; expected 'enable', 'assume_safety' or 'disable'}} */ #pragma clang loop vectorize(badidentifier)
 /* expected-error {{invalid argument; expected 'enable', 'assume_safety' or 'disable'}} */ #pragma clang loop interleave(badidentifier)
diff --git a/test/Parser/pragma-loop.cpp b/test/Parser/pragma-loop.cpp
index b9b5b41efbad7..f42d196ce8bd3 100644
--- a/test/Parser/pragma-loop.cpp
+++ b/test/Parser/pragma-loop.cpp
@@ -116,15 +116,27 @@ void test(int *List, int Length) {
     VList[j] = List[j];
   }
 
+#pragma clang loop distribute(enable)
+  for (int j : VList) {
+    VList[j] = List[j];
+  }
+
+#pragma clang loop distribute(disable)
+  for (int j : VList) {
+    VList[j] = List[j];
+  }
+
   test_nontype_template_param<4, 8>(List, Length);
 
 /* expected-error {{expected '('}} */ #pragma clang loop vectorize
 /* expected-error {{expected '('}} */ #pragma clang loop interleave
 /* expected-error {{expected '('}} */ #pragma clang loop unroll
+/* expected-error {{expected '('}} */ #pragma clang loop distribute
 
 /* expected-error {{expected ')'}} */ #pragma clang loop vectorize(enable
 /* expected-error {{expected ')'}} */ #pragma clang loop interleave(enable
 /* expected-error {{expected ')'}} */ #pragma clang loop unroll(full
+/* expected-error {{expected ')'}} */ #pragma clang loop distribute(enable
 
 /* expected-error {{expected ')'}} */ #pragma clang loop vectorize_width(4
 /* expected-error {{expected ')'}} */ #pragma clang loop interleave_count(4
@@ -133,8 +145,9 @@ void test(int *List, int Length) {
 /* expected-error {{missing argument; expected 'enable', 'assume_safety' or 'disable'}} */ #pragma clang loop vectorize()
 /* expected-error {{missing argument; expected an integer value}} */ #pragma clang loop interleave_count()
 /* expected-error {{missing argument; expected 'enable', 'full' or 'disable'}} */ #pragma clang loop unroll()
+/* expected-error {{missing argument; expected 'enable' or 'disable'}} */ #pragma clang loop distribute()
 
-/* expected-error {{missing option; expected vectorize, vectorize_width, interleave, interleave_count, unroll, or unroll_count}} */ #pragma clang loop
+/* expected-error {{missing option; expected vectorize, vectorize_width, interleave, interleave_count, unroll, unroll_count, or distribute}} */ #pragma clang loop
 /* expected-error {{invalid option 'badkeyword'}} */ #pragma clang loop badkeyword
 /* expected-error {{invalid option 'badkeyword'}} */ #pragma clang loop badkeyword(enable)
 /* expected-error {{invalid option 'badkeyword'}} */ #pragma clang loop vectorize(enable) badkeyword(4)
@@ -187,6 +200,7 @@ const int VV = 4;
 /* expected-error {{invalid argument; expected 'enable', 'assume_safety' or 'disable'}} */ #pragma clang loop vectorize(badidentifier)
 /* expected-error {{invalid argument; expected 'enable', 'assume_safety' or 'disable'}} */ #pragma clang loop interleave(badidentifier)
 /* expected-error {{invalid argument; expected 'enable', 'full' or 'disable'}} */ #pragma clang loop unroll(badidentifier)
+/* expected-error {{invalid argument; expected 'enable' or 'disable'}} */ #pragma clang loop distribute(badidentifier)
   while (i-7 < Length) {
     List[i] = i;
   }
@@ -196,6 +210,7 @@ const int VV = 4;
 /* expected-error {{expected ')'}} */ #pragma clang loop vectorize(()
 /* expected-error {{invalid argument; expected 'enable', 'assume_safety' or 'disable'}} */ #pragma clang loop interleave(*)
 /* expected-error {{invalid argument; expected 'enable', 'full' or 'disable'}} */ #pragma clang loop unroll(=)
+/* expected-error {{invalid argument; expected 'enable' or 'disable'}} */ #pragma clang loop distribute(+)
 /* expected-error {{type name requires a specifier or qualifier}} expected-error {{expected expression}} */ #pragma clang loop vectorize_width(^)
 /* expected-error {{expected expression}} expected-error {{expected expression}} */ #pragma clang loop interleave_count(/)
 /* expected-error {{expected expression}} expected-error {{expected expression}} */ #pragma clang loop unroll_count(==)
@@ -232,6 +247,8 @@ const int VV = 4;
 #pragma clang loop interleave(disable)
 /* expected-error {{duplicate directives 'unroll(disable)' and 'unroll(full)'}} */ #pragma clang loop unroll(full)
 #pragma clang loop unroll(disable)
+/* expected-error {{duplicate directives 'distribute(disable)' and 'distribute(enable)'}} */ #pragma clang loop distribute(enable)
+#pragma clang loop distribute(disable)
   while (i-9 < Length) {
     List[i] = i;
   }
diff --git a/test/Parser/pragma-pack.c b/test/Parser/pragma-pack.c
index 172a332510a7e..0859f4157ce3b 100644
--- a/test/Parser/pragma-pack.c
+++ b/test/Parser/pragma-pack.c
@@ -44,3 +44,7 @@ struct S
 #pragma pack()
   int e;
 };
+
+_Pragma("pack(push, 1)") struct PR28094 {
+  int a;
+} _Pragma("pack(pop)");
diff --git a/test/Parser/skip-function-bodies.mm b/test/Parser/skip-function-bodies.mm
index 8462f69f3ab74..e5b7b2adf8389 100644
--- a/test/Parser/skip-function-bodies.mm
+++ b/test/Parser/skip-function-bodies.mm
@@ -30,7 +30,7 @@ void J() {
 // CHECK: skip-function-bodies.mm:3:7: ClassDecl=A:3:7 (Definition) Extent=[3:1 - 14:2]
 // CHECK: skip-function-bodies.mm:4:9: ClassDecl=B:4:9 (Definition) Extent=[4:3 - 4:13]
 // CHECK: skip-function-bodies.mm:6:1: CXXAccessSpecifier=:6:1 (Definition) Extent=[6:1 - 6:8]
-// CHECK: skip-function-bodies.mm:7:3: CXXConstructor=A:7:3 Extent=[7:3 - 7:6]
+// CHECK: skip-function-bodies.mm:7:3: CXXConstructor=A:7:3 (default constructor) Extent=[7:3 - 7:6]
 // CHECK-NOT: skip-function-bodies.mm:8:12: StructDecl=C:8:12 (Definition) Extent=[8:5 - 10:6]
 // CHECK-NOT: skip-function-bodies.mm:9:12: CXXMethod=d:9:12 (Definition) Extent=[9:7 - 9:18]
 // CHECK: skip-function-bodies.mm:13:13: TypedefDecl=E:13:13 (Definition) Extent=[13:3 - 13:14]
diff --git a/test/Preprocessor/Weverything_pragma.c b/test/Preprocessor/Weverything_pragma.c
new file mode 100644
index 0000000000000..142543175458f
--- /dev/null
+++ b/test/Preprocessor/Weverything_pragma.c
@@ -0,0 +1,29 @@
+// RUN: %clang_cc1 -Weverything   -fsyntax-only -verify %s
+
+// Test that the pragma overrides command line option -Weverythings,
+
+// a diagnostic with DefaultIgnore. This is part of a group 'unused-macro'
+// but -Weverything forces it
+#define UNUSED_MACRO1 1 // expected-warning{{macro is not used}}
+
+void foo() // expected-warning {{no previous prototype for function}}
+{
+ // A diagnostic without DefaultIgnore, and not part of a group.
+ (void) L'ab'; // expected-warning {{extraneous characters in character constant ignored}}
+
+#pragma clang diagnostic warning "-Weverything" // Should not change anyhting.
+#define UNUSED_MACRO2 1 // expected-warning{{macro is not used}}
+ (void) L'cd'; // expected-warning {{extraneous characters in character constant ignored}}
+
+#pragma clang diagnostic ignored "-Weverything" // Ignore warnings now.
+#define UNUSED_MACRO2 1 // no warning
+ (void) L'ef'; // no warning here
+
+#pragma clang diagnostic warning "-Weverything" // Revert back to warnings.
+#define UNUSED_MACRO3 1 // expected-warning{{macro is not used}}
+ (void) L'gh'; // expected-warning {{extraneous characters in character constant ignored}}
+
+#pragma clang diagnostic error "-Weverything"  // Give errors now.
+#define UNUSED_MACRO4 1 // expected-error{{macro is not used}}
+ (void) L'ij'; // expected-error {{extraneous characters in character constant ignored}}
+}
diff --git a/test/Preprocessor/aarch64-target-features.c b/test/Preprocessor/aarch64-target-features.c
index dbc29cff292b8..5ec9bc685d767 100644
--- a/test/Preprocessor/aarch64-target-features.c
+++ b/test/Preprocessor/aarch64-target-features.c
@@ -92,16 +92,22 @@
 // RUN: %clang -target aarch64 -mcpu=cortex-a53 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MCPU-A53 %s
 // RUN: %clang -target aarch64 -mcpu=cortex-a57 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MCPU-A57 %s
 // RUN: %clang -target aarch64 -mcpu=cortex-a72 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MCPU-A72 %s
+// RUN: %clang -target aarch64 -mcpu=cortex-a73 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MCPU-CORTEX-A73 %s
 // RUN: %clang -target aarch64 -mcpu=exynos-m1 -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MCPU-M1 %s
-// CHECK-MCPU-CYCLONE: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+neon" "-target-feature" "+crc" "-target-feature" "+crypto" "-target-feature" "+zcm" "-target-feature" "+zcz"
+// RUN: %clang -target aarch64 -mcpu=kryo -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MCPU-KRYO %s
+// RUN: %clang -target aarch64 -mcpu=vulcan -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MCPU-VULCAN %s
+// CHECK-MCPU-CYCLONE: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+neon" "-target-feature" "+crypto" "-target-feature" "+zcm" "-target-feature" "+zcz"
 // CHECK-MCPU-A35: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+neon" "-target-feature" "+crc" "-target-feature" "+crypto"
 // CHECK-MCPU-A53: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+neon" "-target-feature" "+crc" "-target-feature" "+crypto"
 // CHECK-MCPU-A57: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+neon" "-target-feature" "+crc" "-target-feature" "+crypto"
 // CHECK-MCPU-A72: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+neon" "-target-feature" "+crc" "-target-feature" "+crypto"
+// CHECK-MCPU-CORTEX-A73: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+neon" "-target-feature" "+crc" "-target-feature" "+crypto"
 // CHECK-MCPU-M1: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+neon" "-target-feature" "+crc" "-target-feature" "+crypto"
+// CHECK-MCPU-KRYO: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+neon" "-target-feature" "+crc" "-target-feature" "+crypto"
+// CHECK-MCPU-VULCAN: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+neon" "-target-feature" "+crc" "-target-feature" "+crypto"
 
 // RUN: %clang -target x86_64-apple-macosx -arch arm64 -### -c %s 2>&1 | FileCheck --check-prefix=CHECK-ARCH-ARM64 %s
-// CHECK-ARCH-ARM64: "-target-cpu" "cyclone" "-target-feature" "+neon" "-target-feature" "+crc" "-target-feature" "+crypto" "-target-feature" "+zcm" "-target-feature" "+zcz"
+// CHECK-ARCH-ARM64: "-target-cpu" "cyclone" "-target-feature" "+neon" "-target-feature" "+crypto" "-target-feature" "+zcm" "-target-feature" "+zcz"
 
 // RUN: %clang -target aarch64 -march=armv8-a+fp+simd+crc+crypto -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MARCH-1 %s
 // RUN: %clang -target aarch64 -march=armv8-a+nofp+nosimd+nocrc+nocrypto+fp+simd+crc+crypto -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MARCH-1 %s
@@ -123,7 +129,7 @@
 // RUN: %clang -target aarch64 -mcpu=generic+Crc -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MCPU-2 %s
 // RUN: %clang -target aarch64 -mcpu=GENERIC+nocrc+CRC -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MCPU-2 %s
 // RUN: %clang -target aarch64 -mcpu=cortex-a53+noSIMD -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MCPU-3 %s
-// CHECK-MCPU-1: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+neon" "-target-feature" "+crc" "-target-feature" "-crypto" "-target-feature" "+zcm" "-target-feature" "+zcz"
+// CHECK-MCPU-1: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+neon" "-target-feature" "-crypto" "-target-feature" "+zcm" "-target-feature" "+zcz"
 // CHECK-MCPU-2: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "+neon" "-target-feature" "+crc"
 // CHECK-MCPU-3: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-feature" "-neon"
 
diff --git a/test/Preprocessor/arm-acle-6.4.c b/test/Preprocessor/arm-acle-6.4.c
index 148ce6df090c2..11be2c172f3ba 100644
--- a/test/Preprocessor/arm-acle-6.4.c
+++ b/test/Preprocessor/arm-acle-6.4.c
@@ -140,6 +140,7 @@
 
 // RUN: %clang -target arm-none-linux-eabi -mcpu=cortex-r5 -x c -E -dM %s -o - | FileCheck %s -check-prefix CHECK-V7R-IDIV
 // RUN: %clang -target arm-none-linux-eabi -mcpu=cortex-r7 -x c -E -dM %s -o - | FileCheck %s -check-prefix CHECK-V7R-IDIV
+// RUN: %clang -target arm-none-linux-eabi -mcpu=cortex-r8 -x c -E -dM %s -o - | FileCheck %s -check-prefix CHECK-V7R-IDIV
 
 // CHECK-V7R-IDIV: __ARM_FEATURE_IDIV 1
 
diff --git a/test/Preprocessor/arm-acle-6.5.c b/test/Preprocessor/arm-acle-6.5.c
index 95adad9faa8e0..cc158c82cd8ab 100644
--- a/test/Preprocessor/arm-acle-6.5.c
+++ b/test/Preprocessor/arm-acle-6.5.c
@@ -49,10 +49,13 @@
 
 // CHECK-NO-FMA-NOT: __ARM_FEATURE_FMA
 
-// RUN: %clang -target armv7a-eabi -x c -E -dM %s -o - | FileCheck %s -check-prefix CHECK-FMA
-// RUN: %clang -target armv7r-eabi -x c -E -dM %s -o - | FileCheck %s -check-prefix CHECK-FMA
+// RUN: %clang -target armv7a-eabi -x c -E -dM %s -o - | FileCheck %s -check-prefix CHECK-NO-FMA
+// RUN: %clang -target armv7a-eabi -mfpu=vfpv4 -x c -E -dM %s -o - | FileCheck %s -check-prefix CHECK-FMA
+// RUN: %clang -target armv7r-eabi -x c -E -dM %s -o - | FileCheck %s -check-prefix CHECK-NO-FMA
+// RUN: %clang -target armv7r-eabi -mfpu=vfpv4 -x c -E -dM %s -o - | FileCheck %s -check-prefix CHECK-FMA
 // RUN: %clang -target armv7em-eabi -x c -E -dM %s -o - | FileCheck %s -check-prefix CHECK-FMA
-// RUN: %clang -target armv8-eabi -x c -E -dM %s -o - | FileCheck %s -check-prefix CHECK-FMA
+// RUN: %clang -target armv8-eabi -x c -E -dM %s -o - | FileCheck %s -check-prefix CHECK-NO-FMA
+// RUN: %clang -target armv8-eabi -mfpu=vfpv4 -x c -E -dM %s -o - | FileCheck %s -check-prefix CHECK-FMA
 
 // CHECK-FMA: __ARM_FEATURE_FMA 1
 
diff --git a/test/Preprocessor/arm-target-features.c b/test/Preprocessor/arm-target-features.c
index ae27aa0a08b50..be235606e25a7 100644
--- a/test/Preprocessor/arm-target-features.c
+++ b/test/Preprocessor/arm-target-features.c
@@ -1,411 +1,402 @@
-// RUN: %clang -target armv8a-none-linux-gnu -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-V8A %s
-// CHECK-V8A: __ARMEL__ 1
-// CHECK-V8A: __ARM_ARCH 8
-// CHECK-V8A: __ARM_ARCH_8A__ 1
-// CHECK-V8A: __ARM_FEATURE_CRC32 1
-// CHECK-V8A: __ARM_FEATURE_DIRECTED_ROUNDING 1
-// CHECK-V8A: __ARM_FEATURE_NUMERIC_MAXMIN 1
-// CHECK-V8A: __ARM_FP 0xE
-// CHECK-V8A: __ARM_FP16_ARGS 1
-// CHECK-V8A: __ARM_FP16_FORMAT_IEEE 1
-
-// RUN: %clang -target armv7a-none-linux-gnu -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-V7 %s
-// CHECK-V7: __ARMEL__ 1
-// CHECK-V7: __ARM_ARCH 7
-// CHECK-V7: __ARM_ARCH_7A__ 1
+// RUN: %clang -target armv8a-none-linux-gnu -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-V8A %s
+// CHECK-V8A: #define __ARMEL__ 1
+// CHECK-V8A: #define __ARM_ARCH 8
+// CHECK-V8A: #define __ARM_ARCH_8A__ 1
+// CHECK-V8A: #define __ARM_FEATURE_CRC32 1
+// CHECK-V8A: #define __ARM_FEATURE_DIRECTED_ROUNDING 1
+// CHECK-V8A: #define __ARM_FEATURE_NUMERIC_MAXMIN 1
+// CHECK-V8A: #define __ARM_FP 0xE
+// CHECK-V8A: #define __ARM_FP16_ARGS 1
+// CHECK-V8A: #define __ARM_FP16_FORMAT_IEEE 1
+
+// RUN: %clang -target armv7a-none-linux-gnu -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-V7 %s
+// CHECK-V7: #define __ARMEL__ 1
+// CHECK-V7: #define __ARM_ARCH 7
+// CHECK-V7: #define __ARM_ARCH_7A__ 1
 // CHECK-V7-NOT: __ARM_FEATURE_CRC32
-// CHECK-V7-NOT: __ARM_FEATURE_NUMERIC_MAXMIN                                   
+// CHECK-V7-NOT: __ARM_FEATURE_NUMERIC_MAXMIN
 // CHECK-V7-NOT: __ARM_FEATURE_DIRECTED_ROUNDING
-// CHECK-V7: __ARM_FP 0xC
+// CHECK-V7: #define __ARM_FP 0xC
 
-// RUN: %clang -target x86_64-apple-macosx10.10 -arch armv7s -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-V7S %s
-// CHECK-V7S: __ARMEL__ 1
-// CHECK-V7S: __ARM_ARCH 7
-// CHECK-V7S: __ARM_ARCH_7S__ 1
+// RUN: %clang -target x86_64-apple-macosx10.10 -arch armv7s -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-V7S %s
+// CHECK-V7S: #define __ARMEL__ 1
+// CHECK-V7S: #define __ARM_ARCH 7
+// CHECK-V7S: #define __ARM_ARCH_7S__ 1
 // CHECK-V7S-NOT: __ARM_FEATURE_CRC32
 // CHECK-V7S-NOT: __ARM_FEATURE_NUMERIC_MAXMIN
 // CHECK-V7S-NOT: __ARM_FEATURE_DIRECTED_ROUNDING
-// CHECK-V7S: __ARM_FP 0xE
-
-// RUN: %clang -target armv8a -mfloat-abi=hard -x c -E -dM %s | FileCheck --check-prefix=CHECK-V8-BAREHF %s
-// CHECK-V8-BAREHF: __ARMEL__ 1
-// CHECK-V8-BAREHF: __ARM_ARCH 8
-// CHECK-V8-BAREHF: __ARM_ARCH_8A__ 1
-// CHECK-V8-BAREHF: __ARM_FEATURE_CRC32 1
-// CHECK-V8-BAREHF: __ARM_FEATURE_DIRECTED_ROUNDING 1
-// CHECK-V8-BAREHF: __ARM_FEATURE_NUMERIC_MAXMIN 1
-// CHECK-V8-BAREHP: __ARM_FP 0xE
-// CHECK-V8-BAREHF: __ARM_NEON__ 1
-// CHECK-V8-BAREHF: __ARM_PCS_VFP 1
-// CHECK-V8-BAREHF: __VFP_FP__ 1
-
-// RUN: %clang -target armv8a -mfloat-abi=hard -mfpu=fp-armv8 -x c -E -dM %s | FileCheck --check-prefix=CHECK-V8-BAREHF-FP %s
+// CHECK-V7S: #define __ARM_FP 0xE
+
+// RUN: %clang -target armv8a -mfloat-abi=hard -x c -E -dM %s | FileCheck -match-full-lines --check-prefix=CHECK-V8-BAREHF %s
+// CHECK-V8-BAREHF: #define __ARMEL__ 1
+// CHECK-V8-BAREHF: #define __ARM_ARCH 8
+// CHECK-V8-BAREHF: #define __ARM_ARCH_8A__ 1
+// CHECK-V8-BAREHF: #define __ARM_FEATURE_CRC32 1
+// CHECK-V8-BAREHF: #define __ARM_FEATURE_DIRECTED_ROUNDING 1
+// CHECK-V8-BAREHF: #define __ARM_FEATURE_NUMERIC_MAXMIN 1
+// CHECK-V8-BAREHP: #define __ARM_FP 0xE
+// CHECK-V8-BAREHF: #define __ARM_NEON__ 1
+// CHECK-V8-BAREHF: #define __ARM_PCS_VFP 1
+// CHECK-V8-BAREHF: #define __VFP_FP__ 1
+
+// RUN: %clang -target armv8a -mfloat-abi=hard -mfpu=fp-armv8 -x c -E -dM %s | FileCheck -match-full-lines --check-prefix=CHECK-V8-BAREHF-FP %s
 // CHECK-V8-BAREHF-FP-NOT: __ARM_NEON__ 1
-// CHECK-V8-BAREHP-FP: __ARM_FP 0xE
-// CHECK-V8-BAREHF-FP: __VFP_FP__ 1
+// CHECK-V8-BAREHP-FP: #define __ARM_FP 0xE
+// CHECK-V8-BAREHF-FP: #define __VFP_FP__ 1
 
-// RUN: %clang -target armv8a -mfloat-abi=hard -mfpu=neon-fp-armv8 -x c -E -dM %s | FileCheck --check-prefix=CHECK-V8-BAREHF-NEON-FP %s
-// RUN: %clang -target armv8a -mfloat-abi=hard -mfpu=crypto-neon-fp-armv8 -x c -E -dM %s | FileCheck --check-prefix=CHECK-V8-BAREHF-NEON-FP %s
-// CHECK-V8-BAREHP-NEON-FP: __ARM_FP 0xE
-// CHECK-V8-BAREHF-NEON-FP: __ARM_NEON__ 1
-// CHECK-V8-BAREHF-NEON-FP: __VFP_FP__ 1
+// RUN: %clang -target armv8a -mfloat-abi=hard -mfpu=neon-fp-armv8 -x c -E -dM %s | FileCheck -match-full-lines --check-prefix=CHECK-V8-BAREHF-NEON-FP %s
+// RUN: %clang -target armv8a -mfloat-abi=hard -mfpu=crypto-neon-fp-armv8 -x c -E -dM %s | FileCheck -match-full-lines --check-prefix=CHECK-V8-BAREHF-NEON-FP %s
+// CHECK-V8-BAREHP-NEON-FP: #define __ARM_FP 0xE
+// CHECK-V8-BAREHF-NEON-FP: #define __ARM_NEON__ 1
+// CHECK-V8-BAREHF-NEON-FP: #define __VFP_FP__ 1
 
-// RUN: %clang -target armv8a -mnocrc -x c -E -dM %s | FileCheck --check-prefix=CHECK-V8-NOCRC %s
+// RUN: %clang -target armv8a -mnocrc -x c -E -dM %s | FileCheck -match-full-lines --check-prefix=CHECK-V8-NOCRC %s
 // CHECK-V8-NOCRC-NOT: __ARM_FEATURE_CRC32 1
 
 // Check that -mhwdiv works properly for armv8/thumbv8 (enabled by default).
 
-// RUN: %clang -target armv8 -x c -E -dM %s -o - | FileCheck --check-prefix=ARMV8 %s
-// ARMV8:#define __ARM_ARCH_EXT_IDIV__ 1
-
-// RUN: %clang -target armv8 -mthumb -x c -E -dM %s -o - | FileCheck --check-prefix=THUMBV8 %s
-// THUMBV8:#define __ARM_ARCH_EXT_IDIV__ 1
-
-// RUN: %clang -target armv8-eabi -x c -E -dM %s -o - | FileCheck --check-prefix=ARMV8-EABI %s
-// ARMV8-EABI:#define __ARM_ARCH_EXT_IDIV__ 1
-
-// RUN: %clang -target armv8-eabi -mthumb -x c -E -dM %s -o - | FileCheck --check-prefix=THUMBV8-EABI %s
-// THUMBV8-EABI:#define __ARM_ARCH_EXT_IDIV__ 1
-
-// RUN: %clang -target armv8 -mhwdiv=none -x c -E -dM %s -o - | FileCheck --check-prefix=NONEHWDIV-ARMV8 %s
-// NONEHWDIV-ARMV8-NOT:#define __ARM_ARCH_EXT_IDIV__
-
-// RUN: %clang -target armv8 -mthumb -mhwdiv=none -x c -E -dM %s -o - | FileCheck --check-prefix=NONEHWDIV-THUMBV8 %s
-// NONEHWDIV-THUMBV8-NOT:#define __ARM_ARCH_EXT_IDIV__
-
-// RUN: %clang -target armv8 -mhwdiv=thumb -x c -E -dM %s -o - | FileCheck --check-prefix=THUMBHWDIV-ARMV8 %s
-// THUMBHWDIV-ARMV8-NOT:#define __ARM_ARCH_EXT_IDIV__
-
-// RUN: %clang -target armv8 -mthumb -mhwdiv=arm -x c -E -dM %s -o - | FileCheck --check-prefix=ARMHWDIV-THUMBV8 %s
-// ARMHWDIV-THUMBV8-NOT:#define __ARM_ARCH_EXT_IDIV__
-
-// RUN: %clang -target armv8a -x c -E -dM %s -o - | FileCheck --check-prefix=ARMV8A %s
-// ARMV8A:#define __ARM_ARCH_EXT_IDIV__ 1
-// ARMV8A: #define __ARM_FP 0xE
-
-// RUN: %clang -target armv8a -mthumb -x c -E -dM %s -o - | FileCheck --check-prefix=THUMBV8A %s
-// THUMBV8A:#define __ARM_ARCH_EXT_IDIV__ 1
-// THUMBV8A: #define __ARM_FP 0xE
-
-// RUN: %clang -target armv8a-eabi -x c -E -dM %s -o - | FileCheck --check-prefix=ARMV8A-EABI %s
-// ARMV8A-EABI:#define __ARM_ARCH_EXT_IDIV__ 1
-// ARMV8A-EABI: #define __ARM_FP 0xE
-
-// RUN: %clang -target armv8a-eabi -x c -E -dM %s -o - | FileCheck --check-prefix=THUMBV8A-EABI %s
-// THUMBV8A-EABI:#define __ARM_ARCH_EXT_IDIV__ 1
-// THUMBV8A-EABI: #define __ARM_FP 0xE
-
-// RUN: %clang -target arm-none-linux-gnu -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-DEFS %s
+// RUN: %clang -target armv8 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=V8 %s
+// RUN: %clang -target armv8 -mthumb -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=V8 %s
+// RUN: %clang -target armv8-eabi -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=V8 %s
+// RUN: %clang -target armv8-eabi -mthumb -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=V8 %s
+// V8:#define __ARM_ARCH_EXT_IDIV__ 1
+
+// RUN: %clang -target armv8 -mhwdiv=none -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=NOHWDIV-V8 %s
+// RUN: %clang -target armv8 -mthumb -mhwdiv=none -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=NOHWDIV-V8 %s
+// RUN: %clang -target armv8 -mhwdiv=thumb -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=NOHWDIV-V8 %s
+// RUN: %clang -target armv8 -mthumb -mhwdiv=arm -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=NOHWDIV-V8 %s
+// NOHWDIV-V8-NOT:#define __ARM_ARCH_EXT_IDIV__
+
+// RUN: %clang -target armv8a -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=V8A %s
+// RUN: %clang -target armv8a -mthumb -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=V8A %s
+// RUN: %clang -target armv8a-eabi -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=V8A %s
+// RUN: %clang -target armv8a-eabi -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=V8A %s
+// V8A:#define __ARM_ARCH_EXT_IDIV__ 1
+// V8A:#define __ARM_FP 0xE
+
+// RUN: %clang -target armv8m.base-none-linux-gnu -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=V8M_BASELINE %s
+// V8M_BASELINE: #define __ARM_ARCH 8
+// V8M_BASELINE: #define __ARM_ARCH_8M_BASE__ 1
+// V8M_BASELINE: #define __ARM_ARCH_EXT_IDIV__ 1
+// V8M_BASELINE-NOT: __ARM_ARCH_ISA_ARM
+// V8M_BASELINE: #define __ARM_ARCH_ISA_THUMB 1
+// V8M_BASELINE: #define __ARM_ARCH_PROFILE 'M'
+// V8M_BASELINE-NOT: __ARM_FEATURE_CRC32
+// V8M_BASELINE-NOT: __ARM_FEATURE_DSP
+// V8M_BASELINE-NOT: __ARM_FP 0x{{.*}}
+// V8M_BASELINE-NOT: __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1
+
+// RUN: %clang -target armv8m.main-none-linux-gnu -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=V8M_MAINLINE %s
+// V8M_MAINLINE: #define __ARM_ARCH 8
+// V8M_MAINLINE: #define __ARM_ARCH_8M_MAIN__ 1
+// V8M_MAINLINE: #define __ARM_ARCH_EXT_IDIV__ 1
+// V8M_MAINLINE-NOT: __ARM_ARCH_ISA_ARM
+// V8M_MAINLINE: #define __ARM_ARCH_ISA_THUMB 2
+// V8M_MAINLINE: #define __ARM_ARCH_PROFILE 'M'
+// V8M_MAINLINE-NOT: __ARM_FEATURE_CRC32
+// V8M_MAINLINE-NOT: __ARM_FEATURE_DSP
+// V8M_MAINLINE: #define __ARM_FP 0xE
+// V8M_MAINLINE: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1 1
+
+// RUN: %clang -target arm-none-linux-gnu -march=armv8-m.main+dsp -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=V8M_MAINLINE_DSP %s
+// V8M_MAINLINE_DSP: #define __ARM_ARCH 8
+// V8M_MAINLINE_DSP: #define __ARM_ARCH_8M_MAIN__ 1
+// V8M_MAINLINE_DSP: #define __ARM_ARCH_EXT_IDIV__ 1
+// V8M_MAINLINE_DSP-NOT: __ARM_ARCH_ISA_ARM
+// V8M_MAINLINE_DSP: #define __ARM_ARCH_ISA_THUMB 2
+// V8M_MAINLINE_DSP: #define __ARM_ARCH_PROFILE 'M'
+// V8M_MAINLINE_DSP-NOT: __ARM_FEATURE_CRC32
+// V8M_MAINLINE_DSP: #define __ARM_FEATURE_DSP 1
+// V8M_MAINLINE_DSP: #define __ARM_FP 0xE
+// V8M_MAINLINE_DSP: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1 1
+
+// RUN: %clang -target arm-none-linux-gnu -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-DEFS %s
 // CHECK-DEFS:#define __ARM_PCS 1
 // CHECK-DEFS:#define __ARM_SIZEOF_MINIMAL_ENUM 4
 // CHECK-DEFS:#define __ARM_SIZEOF_WCHAR_T 4
 
 // RUN: %clang -target arm-none-linux-gnu -fno-math-errno -fno-signed-zeros\
 // RUN:        -fno-trapping-math -fassociative-math -freciprocal-math\
-// RUN:        -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-FASTMATH %s
+// RUN:        -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FASTMATH %s
 // RUN: %clang -target arm-none-linux-gnu -ffast-math -x c -E -dM %s -o -\
-// RUN:        | FileCheck --check-prefix=CHECK-FASTMATH %s
-// CHECK-FASTMATH: __ARM_FP_FAST 1
+// RUN:        | FileCheck -match-full-lines --check-prefix=CHECK-FASTMATH %s
+// CHECK-FASTMATH: #define __ARM_FP_FAST 1
 
-// RUN: %clang -target arm-none-linux-gnu -fshort-wchar -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SHORTWCHAR %s
+// RUN: %clang -target arm-none-linux-gnu -fshort-wchar -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-SHORTWCHAR %s
 // CHECK-SHORTWCHAR:#define __ARM_SIZEOF_WCHAR_T 2
 
-// RUN: %clang -target arm-none-linux-gnu -fshort-enums -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SHORTENUMS %s
+// RUN: %clang -target arm-none-linux-gnu -fshort-enums -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-SHORTENUMS %s
 // CHECK-SHORTENUMS:#define __ARM_SIZEOF_MINIMAL_ENUM 1
 
 // Test that -mhwdiv has the right effect for a target CPU which has hwdiv enabled by default.
-// RUN: %clang -target armv7 -mcpu=cortex-a15 -x c -E -dM %s -o - | FileCheck --check-prefix=DEFAULTHWDIV-ARM %s
-// DEFAULTHWDIV-ARM:#define __ARM_ARCH_EXT_IDIV__ 1
-
-// RUN: %clang -target armv7 -mthumb -mcpu=cortex-a15 -x c -E -dM %s -o - | FileCheck --check-prefix=DEFAULTHWDIV-THUMB %s
-// DEFAULTHWDIV-THUMB:#define __ARM_ARCH_EXT_IDIV__ 1
-
-// RUN: %clang -target armv7 -mcpu=cortex-a15 -mhwdiv=arm -x c -E -dM %s -o - | FileCheck --check-prefix=ARMHWDIV-ARM %s
-// ARMHWDIV-ARM:#define __ARM_ARCH_EXT_IDIV__ 1
+// RUN: %clang -target armv7 -mcpu=cortex-a15 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=HWDIV %s
+// RUN: %clang -target armv7 -mthumb -mcpu=cortex-a15 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=HWDIV %s
+// RUN: %clang -target armv7 -mcpu=cortex-a15 -mhwdiv=arm -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=HWDIV %s
+// RUN: %clang -target armv7 -mthumb -mcpu=cortex-a15 -mhwdiv=thumb -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=HWDIV %s
+// HWDIV:#define __ARM_ARCH_EXT_IDIV__ 1
 
-// RUN: %clang -target armv7 -mthumb -mcpu=cortex-a15 -mhwdiv=thumb -x c -E -dM %s -o - | FileCheck --check-prefix=THUMBHWDIV-THUMB %s
-// THUMBHWDIV-THUMB:#define __ARM_ARCH_EXT_IDIV__ 1
-
-// RUN: %clang -target arm -mcpu=cortex-a15 -mhwdiv=thumb -x c -E -dM %s -o - | FileCheck --check-prefix=DEFAULTHWDIV-THUMBHWDIV-ARM %s
-// DEFAULTHWDIV-THUMBHWDIV-ARM-NOT:#define __ARM_ARCH_EXT_IDIV__
-
-// RUN: %clang -target arm -mthumb -mcpu=cortex-a15 -mhwdiv=arm -x c -E -dM %s -o - | FileCheck --check-prefix=DEFAULTHWDIV-ARMHWDIV-THUMB %s
-// DEFAULTHWDIV-ARMHWDIV-THUMB-NOT:#define __ARM_ARCH_EXT_IDIV__
-
-// RUN: %clang -target arm -mcpu=cortex-a15 -mhwdiv=none -x c -E -dM %s -o - | FileCheck --check-prefix=DEFAULTHWDIV-NONEHWDIV-ARM %s
-// DEFAULTHWDIV-NONEHWDIV-ARM-NOT:#define __ARM_ARCH_EXT_IDIV__
-
-// RUN: %clang -target arm -mthumb -mcpu=cortex-a15 -mhwdiv=none -x c -E -dM %s -o - | FileCheck --check-prefix=DEFAULTHWDIV-NONEHWDIV-THUMB %s
-// DEFAULTHWDIV-NONEHWDIV-THUMB-NOT:#define __ARM_ARCH_EXT_IDIV__
+// RUN: %clang -target arm -mcpu=cortex-a15 -mhwdiv=thumb -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=NOHWDIV %s
+// RUN: %clang -target arm -mthumb -mcpu=cortex-a15 -mhwdiv=arm -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=NOHWDIV %s
+// RUN: %clang -target arm -mcpu=cortex-a15 -mhwdiv=none -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=NOHWDIV %s
+// RUN: %clang -target arm -mthumb -mcpu=cortex-a15 -mhwdiv=none -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=NOHWDIV %s
+// NOHWDIV-NOT:#define __ARM_ARCH_EXT_IDIV__
 
 
 // Check that -mfpu works properly for Cortex-A7 (enabled by default).
-// RUN: %clang -target armv7-none-linux-gnueabi -mcpu=cortex-a7 -x c -E -dM %s -o - | FileCheck --check-prefix=DEFAULTFPU-A7 %s
-// RUN: %clang -target armv7-none-linux-gnueabi -mthumb -mcpu=cortex-a7 -x c -E -dM %s -o - | FileCheck --check-prefix=DEFAULTFPU-A7 %s
+// RUN: %clang -target armv7-none-linux-gnueabi -mcpu=cortex-a7 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=DEFAULTFPU-A7 %s
+// RUN: %clang -target armv7-none-linux-gnueabi -mthumb -mcpu=cortex-a7 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=DEFAULTFPU-A7 %s
 // DEFAULTFPU-A7:#define __ARM_FP 0xE
 // DEFAULTFPU-A7:#define __ARM_NEON__ 1
 // DEFAULTFPU-A7:#define __ARM_VFPV4__ 1
 
-// RUN: %clang -target armv7-none-linux-gnueabi -mcpu=cortex-a7 -mfpu=none -x c -E -dM %s -o - | FileCheck --check-prefix=FPUNONE-A7 %s
-// RUN: %clang -target armv7-none-linux-gnueabi -mthumb -mcpu=cortex-a7 -mfpu=none -x c -E -dM %s -o - | FileCheck --check-prefix=FPUNONE-A7 %s
+// RUN: %clang -target armv7-none-linux-gnueabi -mcpu=cortex-a7 -mfpu=none -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=FPUNONE-A7 %s
+// RUN: %clang -target armv7-none-linux-gnueabi -mthumb -mcpu=cortex-a7 -mfpu=none -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=FPUNONE-A7 %s
 // FPUNONE-A7-NOT:#define __ARM_FP 0x{{.*}}
 // FPUNONE-A7-NOT:#define __ARM_NEON__ 1
 // FPUNONE-A7-NOT:#define __ARM_VFPV4__ 1
 
-// RUN: %clang -target armv7-none-linux-gnueabi -mcpu=cortex-a7 -mfpu=vfp4 -x c -E -dM %s -o - | FileCheck --check-prefix=NONEON-A7 %s
-// RUN: %clang -target armv7-none-linux-gnueabi -mthumb -mcpu=cortex-a7 -mfpu=vfp4 -x c -E -dM %s -o - | FileCheck --check-prefix=NONEON-A7 %s
+// RUN: %clang -target armv7-none-linux-gnueabi -mcpu=cortex-a7 -mfpu=vfp4 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=NONEON-A7 %s
+// RUN: %clang -target armv7-none-linux-gnueabi -mthumb -mcpu=cortex-a7 -mfpu=vfp4 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=NONEON-A7 %s
 // NONEON-A7:#define __ARM_FP 0xE
 // NONEON-A7-NOT:#define __ARM_NEON__ 1
 // NONEON-A7:#define __ARM_VFPV4__ 1
 
 // Check that -mfpu works properly for Cortex-A5 (enabled by default).
-// RUN: %clang -target armv7-none-linux-gnueabi -mcpu=cortex-a5 -x c -E -dM %s -o - | FileCheck --check-prefix=DEFAULTFPU-A5 %s
-// RUN: %clang -target armv7-none-linux-gnueabi -mthumb -mcpu=cortex-a5 -x c -E -dM %s -o - | FileCheck --check-prefix=DEFAULTFPU-A5 %s
+// RUN: %clang -target armv7-none-linux-gnueabi -mcpu=cortex-a5 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=DEFAULTFPU-A5 %s
+// RUN: %clang -target armv7-none-linux-gnueabi -mthumb -mcpu=cortex-a5 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=DEFAULTFPU-A5 %s
 // DEFAULTFPU-A5:#define __ARM_FP 0xE
 // DEFAULTFPU-A5:#define __ARM_NEON__ 1
 // DEFAULTFPU-A5:#define __ARM_VFPV4__ 1
 
-// RUN: %clang -target armv7-none-linux-gnueabi -mcpu=cortex-a5 -mfpu=none -x c -E -dM %s -o - | FileCheck --check-prefix=FPUNONE-A5 %s
-// RUN: %clang -target armv7-none-linux-gnueabi -mthumb -mcpu=cortex-a5 -mfpu=none -x c -E -dM %s -o - | FileCheck --check-prefix=FPUNONE-A5 %s
+// RUN: %clang -target armv7-none-linux-gnueabi -mcpu=cortex-a5 -mfpu=none -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=FPUNONE-A5 %s
+// RUN: %clang -target armv7-none-linux-gnueabi -mthumb -mcpu=cortex-a5 -mfpu=none -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=FPUNONE-A5 %s
 // FPUNONE-A5-NOT:#define __ARM_FP 0x{{.*}}
 // FPUNONE-A5-NOT:#define __ARM_NEON__ 1
 // FPUNONE-A5-NOT:#define __ARM_VFPV4__ 1
 
-// RUN: %clang -target armv7-none-linux-gnueabi -mcpu=cortex-a5 -mfpu=vfp4-d16 -x c -E -dM %s -o - | FileCheck --check-prefix=NONEON-A5 %s
-// RUN: %clang -target armv7-none-linux-gnueabi -mthumb -mcpu=cortex-a5 -mfpu=vfp4-d16 -x c -E -dM %s -o - | FileCheck --check-prefix=NONEON-A5 %s
+// RUN: %clang -target armv7-none-linux-gnueabi -mcpu=cortex-a5 -mfpu=vfp4-d16 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=NONEON-A5 %s
+// RUN: %clang -target armv7-none-linux-gnueabi -mthumb -mcpu=cortex-a5 -mfpu=vfp4-d16 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=NONEON-A5 %s
 // NONEON-A5:#define __ARM_FP 0xE
 // NONEON-A5-NOT:#define __ARM_NEON__ 1
 // NONEON-A5:#define __ARM_VFPV4__ 1
 
 // FIXME: add check for further predefines
 // Test whether predefines are as expected when targeting ep9312.
-// RUN: %clang -target armv4t -mcpu=ep9312 -x c -E -dM %s -o - | FileCheck --check-prefix=A4T %s
+// RUN: %clang -target armv4t -mcpu=ep9312 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=A4T %s
 // A4T-NOT:#define __ARM_FEATURE_DSP
 // A4T-NOT:#define __ARM_FP 0x{{.*}}
 
 // Test whether predefines are as expected when targeting arm10tdmi.
-// RUN: %clang -target armv5 -mcpu=arm10tdmi -x c -E -dM %s -o - | FileCheck --check-prefix=A5T %s
+// RUN: %clang -target armv5 -mcpu=arm10tdmi -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=A5T %s
 // A5T-NOT:#define __ARM_FEATURE_DSP
 // A5T-NOT:#define __ARM_FP 0x{{.*}}
 
 // Test whether predefines are as expected when targeting cortex-a5.
-// RUN: %clang -target armv7 -mcpu=cortex-a5 -x c -E -dM %s -o - | FileCheck --check-prefix=A5-ARM %s
-// A5-ARM-NOT:#define __ARM_ARCH_EXT_IDIV__
-// A5-ARM:#define __ARM_FEATURE_DSP
-// A5-ARM:#define __ARM_FP 0xE
-
-// RUN: %clang -target armv7 -mthumb -mcpu=cortex-a5 -x c -E -dM %s -o - | FileCheck --check-prefix=A5-THUMB %s
-// A5-THUMB-NOT:#define __ARM_ARCH_EXT_IDIV__
-// A5-THUMB:#define __ARM_FEATURE_DSP
-// A5-THUMB:#define __ARM_FP 0xE
-
-// RUN: %clang -target armv7 -mcpu=cortex-a5 -x c -E -dM %s -o - | FileCheck --check-prefix=A5 %s
-// RUN: %clang -target armv7 -mthumb -mcpu=cortex-a5 -x c -E -dM %s -o - | FileCheck --check-prefix=A5 %s
+// RUN: %clang -target armv7 -mcpu=cortex-a5 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=A5 %s
+// RUN: %clang -target armv7 -mthumb -mcpu=cortex-a5 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=A5 %s
 // A5:#define __ARM_ARCH 7
 // A5:#define __ARM_ARCH_7A__ 1
+// A5-NOT:#define __ARM_ARCH_EXT_IDIV__
 // A5:#define __ARM_ARCH_PROFILE 'A'
-// A5-NOT: #define __ARM_FEATURE_NUMERIC_MAXMIN
 // A5-NOT: #define __ARM_FEATURE_DIRECTED_ROUNDING
-// A5:#define __ARM_FEATURE_DSP
+// A5:#define __ARM_FEATURE_DSP 1
+// A5-NOT: #define __ARM_FEATURE_NUMERIC_MAXMIN
 // A5:#define __ARM_FP 0xE
 
 // Test whether predefines are as expected when targeting cortex-a7.
-// RUN: %clang -target armv7k -mcpu=cortex-a7 -x c -E -dM %s -o - | FileCheck --check-prefix=A7 %s
-// RUN: %clang -target armv7k -mthumb -mcpu=cortex-a7 -x c -E -dM %s -o - | FileCheck --check-prefix=A7 %s
+// RUN: %clang -target armv7k -mcpu=cortex-a7 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=A7 %s
+// RUN: %clang -target armv7k -mthumb -mcpu=cortex-a7 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=A7 %s
 // A7:#define __ARM_ARCH 7
 // A7:#define __ARM_ARCH_EXT_IDIV__ 1
 // A7:#define __ARM_ARCH_PROFILE 'A'
-// A7:#define __ARM_FEATURE_DSP
+// A7:#define __ARM_FEATURE_DSP 1
 // A7:#define __ARM_FP 0xE
 
-// Test whether predefines are as expected when targeting cortex-a8.
-// RUN: %clang -target armv7 -mcpu=cortex-a8 -x c -E -dM %s -o - | FileCheck --check-prefix=A8-ARM %s
-// A8-ARM-NOT:#define __ARM_ARCH_EXT_IDIV__
-// A8-ARM:#define __ARM_FEATURE_DSP
-// A8-ARM:#define __ARM_FP 0xC
+// Test whether predefines are as expected when targeting cortex-a7.
+// RUN: %clang -target x86_64-apple-darwin -arch armv7k -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=ARMV7K %s
+// ARMV7K:#define __ARM_ARCH 7
+// ARMV7K:#define __ARM_ARCH_EXT_IDIV__ 1
+// ARMV7K:#define __ARM_ARCH_PROFILE 'A'
+// ARMV7K:#define __ARM_DWARF_EH__ 1
+// ARMV7K:#define __ARM_FEATURE_DSP 1
+// ARMV7K:#define __ARM_FP 0xE
+// ARMV7K:#define __ARM_PCS_VFP 1
 
-// RUN: %clang -target armv7 -mthumb -mcpu=cortex-a8 -x c -E -dM %s -o - | FileCheck --check-prefix=A8-THUMB %s
-// A8-THUMB-NOT:#define __ARM_ARCH_EXT_IDIV__
-// A8-THUMB:#define __ARM_FEATURE_DSP
-// A8-THUMB:#define __ARM_FP 0xC
 
-// Test whether predefines are as expected when targeting cortex-a9.
-// RUN: %clang -target armv7 -mcpu=cortex-a9 -x c -E -dM %s -o - | FileCheck --check-prefix=A9-ARM %s
-// A9-ARM-NOT:#define __ARM_ARCH_EXT_IDIV__
-// A9-ARM:#define __ARM_FEATURE_DSP
-// A9-ARM:#define __ARM_FP 0xE
+// Test whether predefines are as expected when targeting cortex-a8.
+// RUN: %clang -target armv7 -mcpu=cortex-a8 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=A8 %s
+// RUN: %clang -target armv7 -mthumb -mcpu=cortex-a8 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=A8 %s
+// A8-NOT:#define __ARM_ARCH_EXT_IDIV__
+// A8:#define __ARM_FEATURE_DSP 1
+// A8:#define __ARM_FP 0xC
 
-// RUN: %clang -target armv7 -mthumb -mcpu=cortex-a9 -x c -E -dM %s -o - | FileCheck --check-prefix=A9-THUMB %s
-// A9-THUMB-NOT:#define __ARM_ARCH_EXT_IDIV__
-// A9-THUMB:#define __ARM_FEATURE_DSP
-// A9-THUMB:#define __ARM_FP 0xE
+// Test whether predefines are as expected when targeting cortex-a9.
+// RUN: %clang -target armv7 -mcpu=cortex-a9 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=A9 %s
+// RUN: %clang -target armv7 -mthumb -mcpu=cortex-a9 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=A9 %s
+// A9-NOT:#define __ARM_ARCH_EXT_IDIV__
+// A9:#define __ARM_FEATURE_DSP 1
+// A9:#define __ARM_FP 0xE
 
 
 // Check that -mfpu works properly for Cortex-A12 (enabled by default).
-// RUN: %clang -target armv7-none-linux-gnueabi -mcpu=cortex-a12 -x c -E -dM %s -o - | FileCheck --check-prefix=DEFAULTFPU-A12 %s
-// RUN: %clang -target armv7-none-linux-gnueabi -mthumb -mcpu=cortex-a12 -x c -E -dM %s -o - | FileCheck --check-prefix=DEFAULTFPU-A12 %s
+// RUN: %clang -target armv7-none-linux-gnueabi -mcpu=cortex-a12 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=DEFAULTFPU-A12 %s
+// RUN: %clang -target armv7-none-linux-gnueabi -mthumb -mcpu=cortex-a12 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=DEFAULTFPU-A12 %s
 // DEFAULTFPU-A12:#define __ARM_FP 0xE
 // DEFAULTFPU-A12:#define __ARM_NEON__ 1
 // DEFAULTFPU-A12:#define __ARM_VFPV4__ 1
 
-// RUN: %clang -target armv7-none-linux-gnueabi -mcpu=cortex-a12 -mfpu=none -x c -E -dM %s -o - | FileCheck --check-prefix=FPUNONE-A12 %s
-// RUN: %clang -target armv7-none-linux-gnueabi -mthumb -mcpu=cortex-a12 -mfpu=none -x c -E -dM %s -o - | FileCheck --check-prefix=FPUNONE-A12 %s
+// RUN: %clang -target armv7-none-linux-gnueabi -mcpu=cortex-a12 -mfpu=none -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=FPUNONE-A12 %s
+// RUN: %clang -target armv7-none-linux-gnueabi -mthumb -mcpu=cortex-a12 -mfpu=none -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=FPUNONE-A12 %s
 // FPUNONE-A12-NOT:#define __ARM_FP 0x{{.*}}
 // FPUNONE-A12-NOT:#define __ARM_NEON__ 1
 // FPUNONE-A12-NOT:#define __ARM_VFPV4__ 1
 
 // Test whether predefines are as expected when targeting cortex-a12.
-// RUN: %clang -target armv7 -mcpu=cortex-a12 -x c -E -dM %s -o - | FileCheck --check-prefix=A12 %s
-// RUN: %clang -target armv7 -mthumb -mcpu=cortex-a12 -x c -E -dM %s -o - | FileCheck --check-prefix=A12 %s
+// RUN: %clang -target armv7 -mcpu=cortex-a12 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=A12 %s
+// RUN: %clang -target armv7 -mthumb -mcpu=cortex-a12 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=A12 %s
 // A12:#define __ARM_ARCH 7
 // A12:#define __ARM_ARCH_7A__ 1
 // A12:#define __ARM_ARCH_EXT_IDIV__ 1
 // A12:#define __ARM_ARCH_PROFILE 'A'
-// A12:#define __ARM_FEATURE_DSP
+// A12:#define __ARM_FEATURE_DSP 1
 // A12:#define __ARM_FP 0xE
 
 // Test whether predefines are as expected when targeting cortex-a15.
-// RUN: %clang -target armv7 -mcpu=cortex-a15 -x c -E -dM %s -o - | FileCheck --check-prefix=A15-ARM %s
-// A15-ARM:#define __ARM_ARCH_EXT_IDIV__ 1
-// A15-ARM:#define __ARM_FEATURE_DSP
-// A15-ARM:#define __ARM_FP 0xE
-
-// RUN: %clang -target armv7 -mthumb -mcpu=cortex-a15 -x c -E -dM %s -o - | FileCheck --check-prefix=A15-THUMB %s
-// A15-THUMB:#define __ARM_ARCH_EXT_IDIV__ 1
-// A15-THUMB:#define __ARM_FEATURE_DSP
-// A15-THUMB:#define __ARM_FP 0xE
+// RUN: %clang -target armv7 -mcpu=cortex-a15 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=A15 %s
+// RUN: %clang -target armv7 -mthumb -mcpu=cortex-a15 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=A15 %s
+// A15:#define __ARM_ARCH_EXT_IDIV__ 1
+// A15:#define __ARM_FEATURE_DSP 1
+// A15:#define __ARM_FP 0xE
 
 // Check that -mfpu works properly for Cortex-A17 (enabled by default).
-// RUN: %clang -target armv7-none-linux-gnueabi -mcpu=cortex-a17 -x c -E -dM %s -o - | FileCheck --check-prefix=DEFAULTFPU-A17 %s
-// RUN: %clang -target armv7-none-linux-gnueabi -mthumb -mcpu=cortex-a17 -x c -E -dM %s -o - | FileCheck --check-prefix=DEFAULTFPU-A17 %s
+// RUN: %clang -target armv7-none-linux-gnueabi -mcpu=cortex-a17 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=DEFAULTFPU-A17 %s
+// RUN: %clang -target armv7-none-linux-gnueabi -mthumb -mcpu=cortex-a17 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=DEFAULTFPU-A17 %s
 // DEFAULTFPU-A17:#define __ARM_FP 0xE
 // DEFAULTFPU-A17:#define __ARM_NEON__ 1
 // DEFAULTFPU-A17:#define __ARM_VFPV4__ 1
 
-// RUN: %clang -target armv7-none-linux-gnueabi -mcpu=cortex-a17 -mfpu=none -x c -E -dM %s -o - | FileCheck --check-prefix=FPUNONE-A17 %s
-// RUN: %clang -target armv7-none-linux-gnueabi -mthumb -mcpu=cortex-a17 -mfpu=none -x c -E -dM %s -o - | FileCheck --check-prefix=FPUNONE-A17 %s
+// RUN: %clang -target armv7-none-linux-gnueabi -mcpu=cortex-a17 -mfpu=none -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=FPUNONE-A17 %s
+// RUN: %clang -target armv7-none-linux-gnueabi -mthumb -mcpu=cortex-a17 -mfpu=none -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=FPUNONE-A17 %s
 // FPUNONE-A17-NOT:#define __ARM_FP 0x{{.*}}
 // FPUNONE-A17-NOT:#define __ARM_NEON__ 1
 // FPUNONE-A17-NOT:#define __ARM_VFPV4__ 1
 
 // Test whether predefines are as expected when targeting cortex-a17.
-// RUN: %clang -target armv7 -mcpu=cortex-a17 -x c -E -dM %s -o - | FileCheck --check-prefix=A17 %s
-// RUN: %clang -target armv7 -mthumb -mcpu=cortex-a17 -x c -E -dM %s -o - | FileCheck --check-prefix=A17 %s
+// RUN: %clang -target armv7 -mcpu=cortex-a17 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=A17 %s
+// RUN: %clang -target armv7 -mthumb -mcpu=cortex-a17 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=A17 %s
 // A17:#define __ARM_ARCH 7
 // A17:#define __ARM_ARCH_7A__ 1
 // A17:#define __ARM_ARCH_EXT_IDIV__ 1
 // A17:#define __ARM_ARCH_PROFILE 'A'
-// A17:#define __ARM_FEATURE_DSP
+// A17:#define __ARM_FEATURE_DSP 1
 // A17:#define __ARM_FP 0xE
 
 // Test whether predefines are as expected when targeting swift.
-// RUN: %clang -target armv7s -mcpu=swift -x c -E -dM %s -o - | FileCheck --check-prefix=SWIFT-ARM %s
-// SWIFT-ARM:#define __ARM_ARCH_EXT_IDIV__ 1
-// SWIFT-ARM:#define __ARM_FEATURE_DSP
-// SWIFT-ARM:#define __ARM_FP 0xE
-
-// RUN: %clang -target armv7s -mthumb -mcpu=swift -x c -E -dM %s -o - | FileCheck --check-prefix=SWIFT-THUMB %s
-// SWIFT-THUMB:#define __ARM_ARCH_EXT_IDIV__ 1
-// SWIFT-THUMB:#define __ARM_FEATURE_DSP
-// SWIFT-THUMB:#define __ARM_FP 0xE
-
-// Test whether predefines are as expected when targeting cortex-a53.
-// RUN: %clang -target armv8 -mcpu=cortex-a53 -x c -E -dM %s -o - | FileCheck --check-prefix=A53-ARM %s
-// A53-ARM:#define __ARM_ARCH_EXT_IDIV__ 1
-// A53-ARM:#define __ARM_FEATURE_DSP
-// A53-ARM:#define __ARM_FP 0xE
-
-// RUN: %clang -target armv8 -mthumb -mcpu=cortex-a53 -x c -E -dM %s -o - | FileCheck --check-prefix=A53-THUMB %s
-// A53-THUMB:#define __ARM_ARCH_EXT_IDIV__ 1
-// A53-THUMB:#define __ARM_FEATURE_DSP
-// A53-THUMB:#define __ARM_FP 0xE
+// RUN: %clang -target armv7s -mcpu=swift -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=SWIFT %s
+// RUN: %clang -target armv7s -mthumb -mcpu=swift -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=SWIFT %s
+// SWIFT:#define __ARM_ARCH_EXT_IDIV__ 1
+// SWIFT:#define __ARM_FEATURE_DSP 1
+// SWIFT:#define __ARM_FP 0xE
+
+// Test whether predefines are as expected when targeting ARMv8-A Cortex implementations
+// RUN: %clang -target armv8 -mcpu=cortex-a32 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=ARMV8 %s
+// RUN: %clang -target armv8 -mthumb -mcpu=cortex-a32 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=ARMV8 %s
+// RUN: %clang -target armv8 -mcpu=cortex-a35 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=ARMV8 %s
+// RUN: %clang -target armv8 -mthumb -mcpu=cortex-a35 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=ARMV8 %s
+// RUN: %clang -target armv8 -mcpu=cortex-a53 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=ARMV8 %s
+// RUN: %clang -target armv8 -mthumb -mcpu=cortex-a53 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=ARMV8 %s
+// RUN: %clang -target armv8 -mcpu=cortex-a57 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=ARMV8 %s
+// RUN: %clang -target armv8 -mthumb -mcpu=cortex-a57 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=ARMV8 %s
+// RUN: %clang -target armv8 -mcpu=cortex-a72 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=ARMV8 %s
+// RUN: %clang -target armv8 -mthumb -mcpu=cortex-a72 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=ARMV8 %s
+// RUN: %clang -target armv8 -mcpu=cortex-a73 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=ARMV8 %s
+// RUN: %clang -target armv8 -mthumb -mcpu=cortex-a73 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=ARMV8 %s
+// ARMV8:#define __ARM_ARCH_EXT_IDIV__ 1
+// ARMV8:#define __ARM_FEATURE_DSP 1
+// ARMV8:#define __ARM_FP 0xE
 
 // Test whether predefines are as expected when targeting cortex-r4.
-// RUN: %clang -target armv7 -mcpu=cortex-r4 -x c -E -dM %s -o - | FileCheck --check-prefix=R4-ARM %s
+// RUN: %clang -target armv7 -mcpu=cortex-r4 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=R4-ARM %s
 // R4-ARM-NOT:#define __ARM_ARCH_EXT_IDIV__
-// R4-ARM:#define __ARM_FEATURE_DSP
+// R4-ARM:#define __ARM_FEATURE_DSP 1
 // R4-ARM-NOT:#define __ARM_FP 0x{{.*}}
 
-// RUN: %clang -target armv7 -mthumb -mcpu=cortex-r4 -x c -E -dM %s -o - | FileCheck --check-prefix=R4-THUMB %s
+// RUN: %clang -target armv7 -mthumb -mcpu=cortex-r4 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=R4-THUMB %s
 // R4-THUMB:#define __ARM_ARCH_EXT_IDIV__ 1
-// R4-THUMB:#define __ARM_FEATURE_DSP
+// R4-THUMB:#define __ARM_FEATURE_DSP 1
 // R4-THUMB-NOT:#define __ARM_FP 0x{{.*}}
 
 // Test whether predefines are as expected when targeting cortex-r4f.
-// RUN: %clang -target armv7 -mcpu=cortex-r4f -x c -E -dM %s -o - | FileCheck --check-prefix=R4F-ARM %s
+// RUN: %clang -target armv7 -mcpu=cortex-r4f -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=R4F-ARM %s
 // R4F-ARM-NOT:#define __ARM_ARCH_EXT_IDIV__
-// R4F-ARM:#define __ARM_FEATURE_DSP
+// R4F-ARM:#define __ARM_FEATURE_DSP 1
 // R4F-ARM:#define __ARM_FP 0xC
 
-// RUN: %clang -target armv7 -mthumb -mcpu=cortex-r4f -x c -E -dM %s -o - | FileCheck --check-prefix=R4F-THUMB %s
-// R4F-THUMBT:#define __ARM_ARCH_EXT_IDIV__ 1
-// R4F-THUMB:#define __ARM_FEATURE_DSP
+// RUN: %clang -target armv7 -mthumb -mcpu=cortex-r4f -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=R4F-THUMB %s
+// R4F-THUMB:#define __ARM_ARCH_EXT_IDIV__ 1
+// R4F-THUMB:#define __ARM_FEATURE_DSP 1
 // R4F-THUMB:#define __ARM_FP 0xC
 
 // Test whether predefines are as expected when targeting cortex-r5.
-// RUN: %clang -target armv7 -mcpu=cortex-r5 -x c -E -dM %s -o - | FileCheck --check-prefix=R5-ARM %s
-// R5-ARM:#define __ARM_ARCH_EXT_IDIV__ 1
-// R5-ARM:#define __ARM_FEATURE_DSP
-// R5-ARM:#define __ARM_FP 0xC
-
-// RUN: %clang -target armv7 -mthumb -mcpu=cortex-r5 -x c -E -dM %s -o - | FileCheck --check-prefix=R5-THUMB %s
-// R5-THUMB:#define __ARM_ARCH_EXT_IDIV__ 1
-// R5-THUMB:#define __ARM_FEATURE_DSP
-// R5-THUMB:#define __ARM_FP 0xC
-
-// Test whether predefines are as expected when targeting cortex-r7.
-// RUN: %clang -target armv7 -mcpu=cortex-r7 -x c -E -dM %s -o - | FileCheck --check-prefix=R7-ARM %s
-// R7-ARM:#define __ARM_ARCH_EXT_IDIV__ 1
-// R7-ARM:#define __ARM_FEATURE_DSP
-// R7-ARM:#define __ARM_FP 0xE
-
-// RUN: %clang -target armv7 -mthumb -mcpu=cortex-r7 -x c -E -dM %s -o - | FileCheck --check-prefix=R7-THUMB %s
-// R7-THUMB:#define __ARM_ARCH_EXT_IDIV__ 1
-// R7-THUMB:#define __ARM_FEATURE_DSP
-// R7-THUMB:#define __ARM_FP 0xE
+// RUN: %clang -target armv7 -mcpu=cortex-r5 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=R5 %s
+// RUN: %clang -target armv7 -mthumb -mcpu=cortex-r5 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=R5 %s
+// R5:#define __ARM_ARCH_EXT_IDIV__ 1
+// R5:#define __ARM_FEATURE_DSP 1
+// R5:#define __ARM_FP 0xC
+
+// Test whether predefines are as expected when targeting cortex-r7 and cortex-r8.
+// RUN: %clang -target armv7 -mcpu=cortex-r7 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=R7-R8 %s
+// RUN: %clang -target armv7 -mthumb -mcpu=cortex-r7 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=R7-R8 %s
+// RUN: %clang -target armv7 -mcpu=cortex-r8 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=R7-R8 %s
+// RUN: %clang -target armv7 -mthumb -mcpu=cortex-r8 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=R7-R8 %s
+// R7-R8:#define __ARM_ARCH_EXT_IDIV__ 1
+// R7-R8:#define __ARM_FEATURE_DSP 1
+// R7-R8:#define __ARM_FP 0xE
 
 // Test whether predefines are as expected when targeting cortex-m0.
-// RUN: %clang -target armv7 -mthumb -mcpu=cortex-m0 -x c -E -dM %s -o - | FileCheck --check-prefix=M0-THUMB %s
-// RUN: %clang -target armv7 -mthumb -mcpu=cortex-m0plus -x c -E -dM %s -o - | FileCheck --check-prefix=M0-THUMB %s
-// RUN: %clang -target armv7 -mthumb -mcpu=cortex-m1 -x c -E -dM %s -o - | FileCheck --check-prefix=M0-THUMB %s
-// RUN: %clang -target armv7 -mthumb -mcpu=sc000 -x c -E -dM %s -o - | FileCheck --check-prefix=M0-THUMB %s
+// RUN: %clang -target armv7 -mthumb -mcpu=cortex-m0 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=M0-THUMB %s
+// RUN: %clang -target armv7 -mthumb -mcpu=cortex-m0plus -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=M0-THUMB %s
+// RUN: %clang -target armv7 -mthumb -mcpu=cortex-m1 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=M0-THUMB %s
+// RUN: %clang -target armv7 -mthumb -mcpu=sc000 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=M0-THUMB %s
 // M0-THUMB-NOT:#define __ARM_ARCH_EXT_IDIV__
 // M0-THUMB-NOT:#define __ARM_FEATURE_DSP
 // M0-THUMB-NOT:#define __ARM_FP 0x{{.*}}
 
 // Test whether predefines are as expected when targeting cortex-m3.
-// RUN: %clang -target armv7 -mthumb -mcpu=cortex-m3 -x c -E -dM %s -o - | FileCheck --check-prefix=M3-THUMB %s
-// RUN: %clang -target armv7 -mthumb -mcpu=sc300 -x c -E -dM %s -o - | FileCheck --check-prefix=M3-THUMB %s
+// RUN: %clang -target armv7 -mthumb -mcpu=cortex-m3 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=M3-THUMB %s
+// RUN: %clang -target armv7 -mthumb -mcpu=sc300 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=M3-THUMB %s
 // M3-THUMB:#define __ARM_ARCH_EXT_IDIV__ 1
 // M3-THUMB-NOT:#define __ARM_FEATURE_DSP
 // M3-THUMB-NOT:#define __ARM_FP 0x{{.*}}
 
 // Test whether predefines are as expected when targeting cortex-m4.
-// RUN: %clang -target armv7 -mthumb -mcpu=cortex-m4 -x c -E -dM %s -o - | FileCheck --check-prefix=M4-THUMB %s
+// RUN: %clang -target armv7 -mthumb -mcpu=cortex-m4 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=M4-THUMB %s
 // M4-THUMB:#define __ARM_ARCH_EXT_IDIV__ 1
-// M4-THUMB:#define __ARM_FEATURE_DSP
+// M4-THUMB:#define __ARM_FEATURE_DSP 1
 // M4-THUMB:#define __ARM_FP 0x6
 
 // Test whether predefines are as expected when targeting cortex-m7.
-// RUN: %clang -target armv7 -mthumb -mcpu=cortex-m7 -x c -E -dM %s -o - | FileCheck --check-prefix=M7-THUMB %s
+// RUN: %clang -target armv7 -mthumb -mcpu=cortex-m7 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=M7-THUMB %s
 // M7-THUMB:#define __ARM_ARCH_EXT_IDIV__ 1
-// M7-THUMB:#define __ARM_FEATURE_DSP
+// M7-THUMB:#define __ARM_FEATURE_DSP 1
 // M7-THUMB:#define __ARM_FP 0xE
 
 // Test whether predefines are as expected when targeting krait.
-// RUN: %clang -target armv7 -mcpu=krait -x c -E -dM %s -o - | FileCheck --check-prefix=KRAIT-ARM %s
-// KRAIT-ARM:#define __ARM_ARCH_EXT_IDIV__ 1
-// KRAIT-ARM:#define __ARM_FEATURE_DSP
-// KRAIT-ARM:#define  __ARM_VFPV4__ 1
-
-// RUN: %clang -target armv7 -mthumb -mcpu=krait -x c -E -dM %s -o - | FileCheck --check-prefix=KRAIT-THUMB %s
-// KRAIT-THUMB:#define __ARM_ARCH_EXT_IDIV__ 1
-// KRAIT-THUMB:#define __ARM_FEATURE_DSP
-// KRAIT-THUMB:#define  __ARM_VFPV4__ 1
-
-// RUN: %clang -target armv8.1a-none-none-eabi -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-V81A %s
-// CHECK-V81A: __ARM_ARCH 8
-// CHECK-V81A: __ARM_ARCH_8_1A__ 1
+// RUN: %clang -target armv7 -mcpu=krait -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=KRAIT %s
+// RUN: %clang -target armv7 -mthumb -mcpu=krait -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=KRAIT %s
+// KRAIT:#define __ARM_ARCH_EXT_IDIV__ 1
+// KRAIT:#define __ARM_FEATURE_DSP 1
+// KRAIT:#define  __ARM_VFPV4__ 1
+
+// RUN: %clang -target armv8.1a-none-none-eabi -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-V81A %s
+// CHECK-V81A: #define __ARM_ARCH 8
+// CHECK-V81A: #define __ARM_ARCH_8_1A__ 1
 // CHECK-V81A: #define __ARM_ARCH_PROFILE 'A'
-// CHECK-V81A: __ARM_FEATURE_QRDMX 1
+// CHECK-V81A: #define __ARM_FEATURE_QRDMX 1
 // CHECK-V81A: #define __ARM_FP 0xE
+
+// RUN: %clang -target armv8.2a-none-none-eabi -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-V82A %s
+// CHECK-V82A: #define __ARM_ARCH 8
+// CHECK-V82A: #define __ARM_ARCH_8_2A__ 1
+// CHECK-V82A: #define __ARM_ARCH_PROFILE 'A'
+// CHECK-V82A: #define __ARM_FP 0xE
diff --git a/test/Preprocessor/bigoutput.c b/test/Preprocessor/bigoutput.c
new file mode 100644
index 0000000000000..c5e02cb9ddf66
--- /dev/null
+++ b/test/Preprocessor/bigoutput.c
@@ -0,0 +1,17 @@
+// RUN: %clang_cc1 -E -x c %s > /dev/tty
+// The original bug requires UNIX line endings to trigger.
+// The original bug triggers only when outputting directly to console.
+// REQUIRES: console
+
+// Make sure clang does not crash during preprocessing
+
+#define M0 extern int x;
+#define M2  M0  M0  M0  M0
+#define M4  M2  M2  M2  M2
+#define M6  M4  M4  M4  M4
+#define M8  M6  M6  M6  M6
+#define M10 M8  M8  M8  M8
+#define M12 M10 M10 M10 M10
+#define M14 M12 M12 M12 M12
+
+M14
diff --git a/test/Preprocessor/comment_save_macro.c b/test/Preprocessor/comment_save_macro.c
index 6ad759f5c31a2..f32ba5629b13f 100644
--- a/test/Preprocessor/comment_save_macro.c
+++ b/test/Preprocessor/comment_save_macro.c
@@ -4,7 +4,7 @@
 // RUN: %clang_cc1 -E -CC %s | FileCheck -check-prefix=CHECK-CC -strict-whitespace %s
 // CHECK-CC: boo bork /* blah*/ bar // zot
 
-// RUN: %clang_cc1 -E %s | FileCheck -check-prefix=CHECK -strict-whitespace %s
+// RUN: %clang_cc1 -E %s | FileCheck -strict-whitespace %s
 // CHECK: boo bork bar
 
 
diff --git a/test/Preprocessor/cuda-approx-transcendentals.cu b/test/Preprocessor/cuda-approx-transcendentals.cu
new file mode 100644
index 0000000000000..8d106ea27a7fa
--- /dev/null
+++ b/test/Preprocessor/cuda-approx-transcendentals.cu
@@ -0,0 +1,8 @@
+// RUN: %clang --cuda-host-only -nocudainc -target i386-unknown-linux-gnu -x cuda -E -dM -o - /dev/null | FileCheck --check-prefix HOST %s
+// RUN: %clang --cuda-device-only -nocudainc -target i386-unknown-linux-gnu -x cuda -E -dM -o - /dev/null | FileCheck --check-prefix DEVICE-NOFAST %s
+// RUN: %clang -fcuda-approx-transcendentals --cuda-device-only -nocudainc -target i386-unknown-linux-gnu -x cuda -E -dM -o - /dev/null | FileCheck --check-prefix DEVICE-FAST %s
+// RUN: %clang -ffast-math --cuda-device-only -nocudainc -target i386-unknown-linux-gnu -x cuda -E -dM -o - /dev/null | FileCheck --check-prefix DEVICE-FAST %s
+
+// HOST-NOT: __CLANG_CUDA_APPROX_TRANSCENDENTALS__
+// DEVICE-NOFAST-NOT: __CLANG_CUDA_APPROX_TRANSCENDENTALS__
+// DEVICE-FAST: __CLANG_CUDA_APPROX_TRANSCENDENTALS__
diff --git a/test/Preprocessor/cuda-preprocess.cu b/test/Preprocessor/cuda-preprocess.cu
new file mode 100644
index 0000000000000..9751bfd6d5b38
--- /dev/null
+++ b/test/Preprocessor/cuda-preprocess.cu
@@ -0,0 +1,32 @@
+// Tests CUDA compilation with -E.
+
+// REQUIRES: clang-driver
+// REQUIRES: x86-registered-target
+// REQUIRES: nvptx-registered-target
+
+#ifndef __CUDA_ARCH__
+#define PREPROCESSED_AWAY
+clang_unittest_no_arch PREPROCESSED_AWAY
+#else
+clang_unittest_cuda_arch __CUDA_ARCH__
+#endif
+
+// CHECK-NOT: PREPROCESSED_AWAY
+
+// RUN: %clang -E -target x86_64-linux-gnu --cuda-gpu-arch=sm_20 -nocudainc %s 2>&1 \
+// RUN:   | FileCheck -check-prefix NOARCH %s
+// RUN: %clang -E -target x86_64-linux-gnu --cuda-gpu-arch=sm_20 --cuda-host-only -nocudainc %s 2>&1 \
+// RUN:   | FileCheck -check-prefix NOARCH %s
+// NOARCH: clang_unittest_no_arch
+
+// RUN: %clang -E -target x86_64-linux-gnu --cuda-gpu-arch=sm_20 --cuda-device-only -nocudainc %s 2>&1 \
+// RUN:   | FileCheck -check-prefix SM20 %s
+// SM20: clang_unittest_cuda_arch 200
+
+// RUN: %clang -E -target x86_64-linux-gnu --cuda-gpu-arch=sm_30 --cuda-device-only -nocudainc %s 2>&1 \
+// RUN:   | FileCheck -check-prefix SM30 %s
+// SM30: clang_unittest_cuda_arch 300
+
+// RUN: %clang -E -target x86_64-linux-gnu --cuda-gpu-arch=sm_20 --cuda-gpu-arch=sm_30 \
+// RUN:   --cuda-device-only -nocudainc %s 2>&1 \
+// RUN:   | FileCheck -check-prefix SM20 -check-prefix SM30 %s
diff --git a/test/Preprocessor/cuda-types.cu b/test/Preprocessor/cuda-types.cu
new file mode 100644
index 0000000000000..dd8eef4aaef51
--- /dev/null
+++ b/test/Preprocessor/cuda-types.cu
@@ -0,0 +1,27 @@
+// Check that types, widths, etc. match on the host and device sides of CUDA
+// compilations.  Note that we filter out long double, as this is intentionally
+// different on host and device.
+
+// RUN: %clang --cuda-host-only -nocudainc -target i386-unknown-linux-gnu -x cuda -E -dM -o - /dev/null > %T/i386-host-defines
+// RUN: %clang --cuda-device-only -nocudainc -target i386-unknown-linux-gnu -x cuda -E -dM -o - /dev/null > %T/i386-device-defines
+// RUN: grep 'define __[^ ]*\(TYPE\|MAX\|SIZEOF|WIDTH\)' %T/i386-host-defines   | grep -v '__LDBL\|_LONG_DOUBLE' > %T/i386-host-defines-filtered
+// RUN: grep 'define __[^ ]*\(TYPE\|MAX\|SIZEOF|WIDTH\)' %T/i386-device-defines | grep -v '__LDBL\|_LONG_DOUBLE' > %T/i386-device-defines-filtered
+// RUN: diff %T/i386-host-defines-filtered %T/i386-device-defines-filtered
+
+// RUN: %clang --cuda-host-only -nocudainc -target x86_64-unknown-linux-gnu -x cuda -E -dM -o - /dev/null > %T/x86_64-host-defines
+// RUN: %clang --cuda-device-only -nocudainc -target x86_64-unknown-linux-gnu -x cuda -E -dM -o - /dev/null > %T/x86_64-device-defines
+// RUN: grep 'define __[^ ]*\(TYPE\|MAX\|SIZEOF\|WIDTH\)' %T/x86_64-host-defines   | grep -v '__LDBL\|_LONG_DOUBLE' > %T/x86_64-host-defines-filtered
+// RUN: grep 'define __[^ ]*\(TYPE\|MAX\|SIZEOF\|WIDTH\)' %T/x86_64-device-defines | grep -v '__LDBL\|_LONG_DOUBLE' > %T/x86_64-device-defines-filtered
+// RUN: diff %T/x86_64-host-defines-filtered %T/x86_64-device-defines-filtered
+
+// RUN: %clang --cuda-host-only -nocudainc -target powerpc64-unknown-linux-gnu -x cuda -E -dM -o - /dev/null > %T/powerpc64-host-defines
+// RUN: %clang --cuda-device-only -nocudainc -target powerpc64-unknown-linux-gnu -x cuda -E -dM -o - /dev/null > %T/powerpc64-device-defines
+// RUN: grep 'define __[^ ]*\(TYPE\|MAX\|SIZEOF\|WIDTH\)' %T/powerpc64-host-defines   | grep -v '__LDBL\|_LONG_DOUBLE' > %T/powerpc64-host-defines-filtered
+// RUN: grep 'define __[^ ]*\(TYPE\|MAX\|SIZEOF\|WIDTH\)' %T/powerpc64-device-defines | grep -v '__LDBL\|_LONG_DOUBLE' > %T/powerpc64-device-defines-filtered
+// RUN: diff %T/powerpc64-host-defines-filtered %T/powerpc64-device-defines-filtered
+
+// RUN: %clang --cuda-host-only -nocudainc -target nvptx-nvidia-cuda -x cuda -E -dM -o - /dev/null > %T/nvptx-host-defines
+// RUN: %clang --cuda-device-only -nocudainc -target nvptx-nvidia-cuda -x cuda -E -dM -o - /dev/null > %T/nvptx-device-defines
+// RUN: grep 'define __[^ ]*\(TYPE\|MAX\|SIZEOF\|WIDTH\)' %T/nvptx-host-defines   | grep -v '__LDBL\|_LONG_DOUBLE' > %T/nvptx-host-defines-filtered
+// RUN: grep 'define __[^ ]*\(TYPE\|MAX\|SIZEOF\|WIDTH\)' %T/nvptx-device-defines | grep -v '__LDBL\|_LONG_DOUBLE' > %T/nvptx-device-defines-filtered
+// RUN: diff %T/nvptx-host-defines-filtered %T/nvptx-device-defines-filtered
diff --git a/test/Preprocessor/elfiamcu-predefines.c b/test/Preprocessor/elfiamcu-predefines.c
index af5e40e3c8ab5..ea6824b77e1cb 100644
--- a/test/Preprocessor/elfiamcu-predefines.c
+++ b/test/Preprocessor/elfiamcu-predefines.c
@@ -1,5 +1,7 @@
 // RUN: %clang_cc1 -E -dM -triple i586-intel-elfiamcu | FileCheck %s
 
+// CHECK: #define __USER_LABEL_PREFIX__ {{$}}
+// CHECK: #define __WINT_TYPE__ unsigned int
 // CHECK: #define __iamcu
 // CHECK: #define __iamcu__
 
diff --git a/test/Preprocessor/expr_define_expansion.c b/test/Preprocessor/expr_define_expansion.c
index 3e5a2c4b0e677..23cb4355eb8c6 100644
--- a/test/Preprocessor/expr_define_expansion.c
+++ b/test/Preprocessor/expr_define_expansion.c
@@ -1,6 +1,28 @@
-// RUN: %clang_cc1 %s -E -CC -pedantic -verify
-// expected-no-diagnostics
+// RUN: %clang_cc1 %s -E -CC -verify
+// RUN: %clang_cc1 %s -E -CC -DPEDANTIC -pedantic -verify
 
 #define FOO && 1
 #if defined FOO FOO
 #endif
+
+#define A
+#define B defined(A)
+#if B // expected-warning{{macro expansion producing 'defined' has undefined behavior}}
+#endif
+
+#define m_foo
+#define TEST(a) (defined(m_##a) && a)
+
+#if defined(PEDANTIC)
+// expected-warning@+4{{macro expansion producing 'defined' has undefined behavior}}
+#endif
+
+// This shouldn't warn by default, only with pedantic:
+#if TEST(foo)
+#endif
+
+
+// Only one diagnostic for this case:
+#define INVALID defined(
+#if INVALID // expected-error{{macro name missing}}
+#endif
diff --git a/test/Preprocessor/expr_invalid_tok.c b/test/Preprocessor/expr_invalid_tok.c
index 5defcc5bfbbd7..0b97b255f11b5 100644
--- a/test/Preprocessor/expr_invalid_tok.c
+++ b/test/Preprocessor/expr_invalid_tok.c
@@ -1,15 +1,28 @@
-// RUN: not %clang_cc1 -E %s 2>&1 | grep 'invalid token at start of a preprocessor expression'
-// RUN: not %clang_cc1 -E %s 2>&1 | grep 'token is not a valid binary operator in a preprocessor subexpression'
-// RUN: not %clang_cc1 -E %s 2>&1 | grep ':14: error: expected end of line in preprocessor expression'
+// RUN: not %clang_cc1 -E %s 2>&1 | FileCheck %s
 // PR2220
 
+// CHECK: invalid token at start of a preprocessor expression
 #if 1 * * 2
 #endif
 
+// CHECK: token is not a valid binary operator in a preprocessor subexpression
 #if 4 [ 2
 #endif
 
 
 // PR2284 - The constant-expr production does not including comma.
+// CHECK: [[@LINE+1]]:14: error: expected end of line in preprocessor expression
 #if 1 ? 2 : 0, 1
 #endif
+
+// CHECK: [[@LINE+1]]:5: error: function-like macro 'FOO' is not defined
+#if FOO(1, 2, 3)
+#endif
+
+// CHECK: [[@LINE+1]]:9: error: function-like macro 'BAR' is not defined
+#if 1 + BAR(1, 2, 3)
+#endif
+
+// CHECK: [[@LINE+1]]:10: error: token is not a valid binary operator
+#if (FOO)(1, 2, 3)
+#endif
diff --git a/test/Preprocessor/feature_tests.c b/test/Preprocessor/feature_tests.c
index fbde6a65476ed..52a1f17cdd4ff 100644
--- a/test/Preprocessor/feature_tests.c
+++ b/test/Preprocessor/feature_tests.c
@@ -55,8 +55,50 @@
 #endif
 
 #ifdef VERIFY
-// expected-error@+2 {{builtin feature check macro requires a parenthesized identifier}}
-// expected-error@+1 {{expected value in expression}}
+// expected-error@+1 {{builtin feature check macro requires a parenthesized identifier}}
 #if __has_feature('x')
 #endif
+
+// The following are not identifiers:
+_Static_assert(!__is_identifier("string"), "oops");
+_Static_assert(!__is_identifier('c'), "oops");
+_Static_assert(!__is_identifier(123), "oops");
+_Static_assert(!__is_identifier(int), "oops");
+
+// The following are:
+_Static_assert(__is_identifier(abc /* comment */), "oops");
+_Static_assert(__is_identifier /* comment */ (xyz), "oops");
+
+// expected-error@+1 {{too few arguments}}
+#if __is_identifier()
+#endif
+
+// expected-error@+1 {{too many arguments}}
+#if __is_identifier(,())
+#endif
+
+// expected-error@+1 {{missing ')' after 'abc'}} 
+#if __is_identifier(abc xyz) // expected-note {{to match this '('}}
+#endif
+
+// expected-error@+1 {{missing ')' after 'abc'}} 
+#if __is_identifier(abc())   // expected-note {{to match this '('}}
+#endif
+
+// expected-error@+1 {{missing ')' after '.'}} 
+#if __is_identifier(.abc)    // expected-note {{to match this '('}}
+#endif
+
+// expected-error@+1 {{nested parentheses not permitted in '__is_identifier'}} 
+#if __is_identifier((abc))
+#endif
+
+// expected-error@+1 {{missing '(' after '__is_identifier'}} expected-error@+1 {{expected value}}
+#if __is_identifier
+#endif
+
+// expected-error@+1 {{unterminated}} expected-error@+1 {{expected value}}
+#if __is_identifier(
+#endif
+
 #endif
diff --git a/test/Preprocessor/has_attribute.c b/test/Preprocessor/has_attribute.c
index 1a3c2a0e18c03..4970dc5904230 100644
--- a/test/Preprocessor/has_attribute.c
+++ b/test/Preprocessor/has_attribute.c
@@ -54,5 +54,5 @@ int has_no_volatile_attribute();
   int does_not_have_uuid
 #endif
 
-#if __has_cpp_attribute(selectany) // expected-error {{token is not a valid binary operator in a preprocessor subexpression}}
+#if __has_cpp_attribute(selectany) // expected-error {{function-like macro '__has_cpp_attribute' is not defined}}
 #endif
diff --git a/test/Preprocessor/has_attribute.cpp b/test/Preprocessor/has_attribute.cpp
index 1ab45020b4443..2cfa005fb09fa 100644
--- a/test/Preprocessor/has_attribute.cpp
+++ b/test/Preprocessor/has_attribute.cpp
@@ -52,6 +52,16 @@
   int has_cxx14_deprecated_vers();
 #endif
 
+// CHECK: has_cxx1z_nodiscard
+#if __has_cpp_attribute(nodiscard) == 201603
+  int has_cxx1z_nodiscard();
+#endif
+
+// CHECK: has_cxx1z_fallthrough
+#if __has_cpp_attribute(fallthrough) == 201603
+  int has_cxx1z_fallthrough();
+#endif
+
 // CHECK: has_declspec_uuid
 #if __has_declspec_attribute(uuid)
   int has_declspec_uuid();
diff --git a/test/Preprocessor/hexagon-predefines.c b/test/Preprocessor/hexagon-predefines.c
new file mode 100644
index 0000000000000..065ecc069422c
--- /dev/null
+++ b/test/Preprocessor/hexagon-predefines.c
@@ -0,0 +1,32 @@
+// RUN: %clang_cc1 -E -dM -triple hexagon-unknown-elf -target-cpu hexagonv5 %s | FileCheck %s -check-prefix CHECK-V5
+
+// CHECK-V5: #define __HEXAGON_ARCH__ 5
+// CHECK-V5: #define __HEXAGON_V5__ 1
+// CHECK-V5: #define __hexagon__ 1
+
+// RUN: %clang_cc1 -E -dM -triple hexagon-unknown-elf -target-cpu hexagonv55 %s | FileCheck %s -check-prefix CHECK-V55
+
+// CHECK-V55: #define __HEXAGON_ARCH__ 55
+// CHECK-V55: #define __HEXAGON_V55__ 1
+// CHECK-V55: #define __hexagon__ 1
+
+// RUN: %clang_cc1 -E -dM -triple hexagon-unknown-elf -target-cpu hexagonv60 %s | FileCheck %s -check-prefix CHECK-V60
+
+// CHECK-V60: #define __HEXAGON_ARCH__ 60
+// CHECK-V60: #define __HEXAGON_V60__ 1
+// CHECK-V60: #define __hexagon__ 1
+
+// RUN: %clang_cc1 -E -dM -triple hexagon-unknown-elf -target-cpu hexagonv60 -target-feature +hvx %s | FileCheck %s -check-prefix CHECK-V60HVX
+
+// CHECK-V60HVX: #define __HEXAGON_ARCH__ 60
+// CHECK-V60HVX: #define __HEXAGON_V60__ 1
+// CHECK-V60HVX: #define __HVX__ 1
+
+// RUN: %clang_cc1 -E -dM -triple hexagon-unknown-elf -target-cpu hexagonv60 -target-feature +hvx-double  %s | FileCheck %s -check-prefix CHECK-V60HVXD
+
+// CHECK-V60HVXD: #define __HEXAGON_ARCH__ 60
+// CHECK-V60HVXD: #define __HEXAGON_V60__ 1
+// CHECK-V60HVXD: #define __HVXDBL__ 1
+// CHECK-V60HVXD: #define __HVX__ 1
+// CHECK-V60HVXD: #define __hexagon__ 1
+
diff --git a/test/Preprocessor/init.c b/test/Preprocessor/init.c
index 099e58561699b..f7c320b7226b8 100644
--- a/test/Preprocessor/init.c
+++ b/test/Preprocessor/init.c
@@ -1,17 +1,17 @@
-// RUN: %clang_cc1 -E -dM -x assembler-with-cpp < /dev/null | FileCheck -check-prefix ASM %s
+// RUN: %clang_cc1 -E -dM -x assembler-with-cpp < /dev/null | FileCheck -match-full-lines -check-prefix ASM %s
 //
 // ASM:#define __ASSEMBLER__ 1
 //
 //
-// RUN: %clang_cc1 -fblocks -E -dM < /dev/null | FileCheck -check-prefix BLOCKS %s
+// RUN: %clang_cc1 -fblocks -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix BLOCKS %s
 //
 // BLOCKS:#define __BLOCKS__ 1
 // BLOCKS:#define __block __attribute__((__blocks__(byref)))
 //
 //
-// RUN: %clang_cc1 -x c++ -std=c++1z -E -dM < /dev/null | FileCheck -check-prefix CXX1Z %s
+// RUN: %clang_cc1 -x c++ -std=c++1z -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix CXX1Z %s
 //
-// CXX1Z:#define __GNUG__
+// CXX1Z:#define __GNUG__ {{.*}}
 // CXX1Z:#define __GXX_EXPERIMENTAL_CXX0X__ 1
 // CXX1Z:#define __GXX_RTTI 1
 // CXX1Z:#define __GXX_WEAK__ 1
@@ -19,9 +19,9 @@
 // CXX1Z:#define __private_extern__ extern
 //
 //
-// RUN: %clang_cc1 -x c++ -std=c++1y -E -dM < /dev/null | FileCheck -check-prefix CXX1Y %s
+// RUN: %clang_cc1 -x c++ -std=c++1y -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix CXX1Y %s
 //
-// CXX1Y:#define __GNUG__
+// CXX1Y:#define __GNUG__ {{.*}}
 // CXX1Y:#define __GXX_EXPERIMENTAL_CXX0X__ 1
 // CXX1Y:#define __GXX_RTTI 1
 // CXX1Y:#define __GXX_WEAK__ 1
@@ -29,9 +29,9 @@
 // CXX1Y:#define __private_extern__ extern
 //
 //
-// RUN: %clang_cc1 -x c++ -std=c++11 -E -dM < /dev/null | FileCheck -check-prefix CXX11 %s
+// RUN: %clang_cc1 -x c++ -std=c++11 -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix CXX11 %s
 //
-// CXX11:#define __GNUG__
+// CXX11:#define __GNUG__ {{.*}}
 // CXX11:#define __GXX_EXPERIMENTAL_CXX0X__ 1
 // CXX11:#define __GXX_RTTI 1
 // CXX11:#define __GXX_WEAK__ 1
@@ -39,100 +39,113 @@
 // CXX11:#define __private_extern__ extern
 //
 // 
-// RUN: %clang_cc1 -x c++ -std=c++98 -E -dM < /dev/null | FileCheck -check-prefix CXX98 %s
+// RUN: %clang_cc1 -x c++ -std=c++98 -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix CXX98 %s
 // 
-// CXX98:#define __GNUG__
+// CXX98:#define __GNUG__ {{.*}}
 // CXX98:#define __GXX_RTTI 1
 // CXX98:#define __GXX_WEAK__ 1
 // CXX98:#define __cplusplus 199711L
 // CXX98:#define __private_extern__ extern
 //
 // 
-// RUN: %clang_cc1 -fdeprecated-macro -E -dM < /dev/null | FileCheck -check-prefix DEPRECATED %s
+// RUN: %clang_cc1 -fdeprecated-macro -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix DEPRECATED %s
 //
 // DEPRECATED:#define __DEPRECATED 1
 //
 // 
-// RUN: %clang_cc1 -std=c99 -E -dM < /dev/null | FileCheck -check-prefix C99 %s
+// RUN: %clang_cc1 -std=c99 -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix C99 %s
 //
 // C99:#define __STDC_VERSION__ 199901L
 // C99:#define __STRICT_ANSI__ 1
+// C99-NOT: __GXX_EXPERIMENTAL_CXX0X__
+// C99-NOT: __GXX_RTTI
+// C99-NOT: __GXX_WEAK__
+// C99-NOT: __cplusplus
 //
 // 
-// RUN: %clang_cc1 -std=c11 -E -dM < /dev/null | FileCheck -check-prefix C11 %s
+// RUN: %clang_cc1 -std=c11 -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix C11 %s
 //
 // C11:#define __STDC_UTF_16__ 1
 // C11:#define __STDC_UTF_32__ 1
 // C11:#define __STDC_VERSION__ 201112L
 // C11:#define __STRICT_ANSI__ 1
+// C11-NOT: __GXX_EXPERIMENTAL_CXX0X__
+// C11-NOT: __GXX_RTTI
+// C11-NOT: __GXX_WEAK__
+// C11-NOT: __cplusplus
 //
 // 
-// RUN: %clang_cc1 -E -dM < /dev/null | FileCheck -check-prefix COMMON %s
+// RUN: %clang_cc1 -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix COMMON %s
 //
 // COMMON:#define __CONSTANT_CFSTRINGS__ 1
 // COMMON:#define __FINITE_MATH_ONLY__ 0
-// COMMON:#define __GNUC_MINOR__
-// COMMON:#define __GNUC_PATCHLEVEL__
+// COMMON:#define __GNUC_MINOR__ {{.*}}
+// COMMON:#define __GNUC_PATCHLEVEL__ {{.*}}
 // COMMON:#define __GNUC_STDC_INLINE__ 1
-// COMMON:#define __GNUC__
-// COMMON:#define __GXX_ABI_VERSION
+// COMMON:#define __GNUC__ {{.*}}
+// COMMON:#define __GXX_ABI_VERSION {{.*}}
 // COMMON:#define __ORDER_BIG_ENDIAN__ 4321
 // COMMON:#define __ORDER_LITTLE_ENDIAN__ 1234
 // COMMON:#define __ORDER_PDP_ENDIAN__ 3412
 // COMMON:#define __STDC_HOSTED__ 1
-// COMMON:#define __STDC_VERSION__ 201112L
 // COMMON:#define __STDC__ 1
-// COMMON:#define __VERSION__
+// COMMON:#define __VERSION__ {{.*}}
 // COMMON:#define __clang__ 1
 // COMMON:#define __clang_major__ {{[0-9]+}}
 // COMMON:#define __clang_minor__ {{[0-9]+}}
 // COMMON:#define __clang_patchlevel__ {{[0-9]+}}
-// COMMON:#define __clang_version__
+// COMMON:#define __clang_version__ {{.*}}
 // COMMON:#define __llvm__ 1
 //
+// RUN: %clang_cc1 -E -dM -triple=x86_64-pc-win32 < /dev/null | FileCheck -match-full-lines -check-prefix C-DEFAULT %s
+// RUN: %clang_cc1 -E -dM -triple=x86_64-pc-linux-gnu < /dev/null | FileCheck -match-full-lines -check-prefix C-DEFAULT %s
+// RUN: %clang_cc1 -E -dM -triple=x86_64-apple-darwin < /dev/null | FileCheck -match-full-lines -check-prefix C-DEFAULT %s
+// RUN: %clang_cc1 -E -dM -triple=armv7a-apple-darwin < /dev/null | FileCheck -match-full-lines -check-prefix C-DEFAULT %s
 // 
-// RUN: %clang_cc1 -ffreestanding -E -dM < /dev/null | FileCheck -check-prefix FREESTANDING %s
+// C-DEFAULT:#define __STDC_VERSION__ 201112L
+//
+// RUN: %clang_cc1 -ffreestanding -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix FREESTANDING %s
 // FREESTANDING:#define __STDC_HOSTED__ 0
 //
 //
-// RUN: %clang_cc1 -x c++ -std=gnu++1z -E -dM < /dev/null | FileCheck -check-prefix GXX1Z %s
+// RUN: %clang_cc1 -x c++ -std=gnu++1z -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix GXX1Z %s
 //
-// GXX1Z:#define __GNUG__
+// GXX1Z:#define __GNUG__ {{.*}}
 // GXX1Z:#define __GXX_WEAK__ 1
 // GXX1Z:#define __cplusplus 201406L
 // GXX1Z:#define __private_extern__ extern
 //
 //
-// RUN: %clang_cc1 -x c++ -std=gnu++1y -E -dM < /dev/null | FileCheck -check-prefix GXX1Y %s
+// RUN: %clang_cc1 -x c++ -std=gnu++1y -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix GXX1Y %s
 //
-// GXX1Y:#define __GNUG__
+// GXX1Y:#define __GNUG__ {{.*}}
 // GXX1Y:#define __GXX_WEAK__ 1
 // GXX1Y:#define __cplusplus 201402L
 // GXX1Y:#define __private_extern__ extern
 //
 //
-// RUN: %clang_cc1 -x c++ -std=gnu++11 -E -dM < /dev/null | FileCheck -check-prefix GXX11 %s
+// RUN: %clang_cc1 -x c++ -std=gnu++11 -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix GXX11 %s
 //
-// GXX11:#define __GNUG__
+// GXX11:#define __GNUG__ {{.*}}
 // GXX11:#define __GXX_WEAK__ 1
 // GXX11:#define __cplusplus 201103L
 // GXX11:#define __private_extern__ extern
 //
 //
-// RUN: %clang_cc1 -x c++ -std=gnu++98 -E -dM < /dev/null | FileCheck -check-prefix GXX98 %s
+// RUN: %clang_cc1 -x c++ -std=gnu++98 -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix GXX98 %s
 //
-// GXX98:#define __GNUG__
+// GXX98:#define __GNUG__ {{.*}}
 // GXX98:#define __GXX_WEAK__ 1
 // GXX98:#define __cplusplus 199711L
 // GXX98:#define __private_extern__ extern
 //
 // 
-// RUN: %clang_cc1 -std=iso9899:199409 -E -dM < /dev/null | FileCheck -check-prefix C94 %s
+// RUN: %clang_cc1 -std=iso9899:199409 -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix C94 %s
 //
 // C94:#define __STDC_VERSION__ 199409L
 //
 // 
-// RUN: %clang_cc1 -fms-extensions -triple i686-pc-win32 -E -dM < /dev/null | FileCheck -check-prefix MSEXT %s
+// RUN: %clang_cc1 -fms-extensions -triple i686-pc-win32 -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix MSEXT %s
 //
 // MSEXT-NOT:#define __STDC__
 // MSEXT:#define _INTEGRAL_MAX_BITS 64
@@ -140,100 +153,100 @@
 // MSEXT-NOT:#define _WCHAR_T_DEFINED 1
 //
 //
-// RUN: %clang_cc1 -x c++ -fms-extensions -triple i686-pc-win32 -E -dM < /dev/null | FileCheck -check-prefix MSEXT-CXX %s
+// RUN: %clang_cc1 -x c++ -fms-extensions -triple i686-pc-win32 -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix MSEXT-CXX %s
 //
 // MSEXT-CXX:#define _NATIVE_WCHAR_T_DEFINED 1
 // MSEXT-CXX:#define _WCHAR_T_DEFINED 1
 // MSEXT-CXX:#define __BOOL_DEFINED 1
 //
 //
-// RUN: %clang_cc1 -x c++ -fno-wchar -fms-extensions -triple i686-pc-win32 -E -dM < /dev/null | FileCheck -check-prefix MSEXT-CXX-NOWCHAR %s
+// RUN: %clang_cc1 -x c++ -fno-wchar -fms-extensions -triple i686-pc-win32 -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix MSEXT-CXX-NOWCHAR %s
 //
 // MSEXT-CXX-NOWCHAR-NOT:#define _NATIVE_WCHAR_T_DEFINED 1
 // MSEXT-CXX-NOWCHAR-NOT:#define _WCHAR_T_DEFINED 1
 // MSEXT-CXX-NOWCHAR:#define __BOOL_DEFINED 1
 //
 // 
-// RUN: %clang_cc1 -x objective-c -E -dM < /dev/null | FileCheck -check-prefix OBJC %s
+// RUN: %clang_cc1 -x objective-c -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix OBJC %s
 //
 // OBJC:#define OBJC_NEW_PROPERTIES 1
 // OBJC:#define __NEXT_RUNTIME__ 1
 // OBJC:#define __OBJC__ 1
 //
 //
-// RUN: %clang_cc1 -x objective-c -fobjc-gc -E -dM < /dev/null | FileCheck -check-prefix OBJCGC %s
+// RUN: %clang_cc1 -x objective-c -fobjc-gc -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix OBJCGC %s
 //
 // OBJCGC:#define __OBJC_GC__ 1
 //
 // 
-// RUN: %clang_cc1 -x objective-c -fobjc-exceptions -E -dM < /dev/null | FileCheck -check-prefix NONFRAGILE %s
+// RUN: %clang_cc1 -x objective-c -fobjc-exceptions -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix NONFRAGILE %s
 //
 // NONFRAGILE:#define OBJC_ZEROCOST_EXCEPTIONS 1
 // NONFRAGILE:#define __OBJC2__ 1
 //
 //
-// RUN: %clang_cc1 -E -dM < /dev/null | FileCheck -check-prefix O0 %s
+// RUN: %clang_cc1 -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix O0 %s
 //
 // O0:#define __NO_INLINE__ 1
 // O0-NOT:#define __OPTIMIZE_SIZE__
 // O0-NOT:#define __OPTIMIZE__
 //
 //
-// RUN: %clang_cc1 -fno-inline -O3 -E -dM < /dev/null | FileCheck -check-prefix NO_INLINE %s
+// RUN: %clang_cc1 -fno-inline -O3 -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix NO_INLINE %s
 //
 // NO_INLINE:#define __NO_INLINE__ 1
 // NO_INLINE-NOT:#define __OPTIMIZE_SIZE__
-// NO_INLINE:#define __OPTIMIZE__
+// NO_INLINE:#define __OPTIMIZE__ 1
 //
 //
-// RUN: %clang_cc1 -O1 -E -dM < /dev/null | FileCheck -check-prefix O1 %s
+// RUN: %clang_cc1 -O1 -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix O1 %s
 //
 // O1-NOT:#define __OPTIMIZE_SIZE__
 // O1:#define __OPTIMIZE__ 1
 //
 //
-// RUN: %clang_cc1 -Os -E -dM < /dev/null | FileCheck -check-prefix Os %s
+// RUN: %clang_cc1 -Os -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix Os %s
 //
 // Os:#define __OPTIMIZE_SIZE__ 1
 // Os:#define __OPTIMIZE__ 1
 //
 //
-// RUN: %clang_cc1 -Oz -E -dM < /dev/null | FileCheck -check-prefix Oz %s
+// RUN: %clang_cc1 -Oz -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix Oz %s
 //
 // Oz:#define __OPTIMIZE_SIZE__ 1
 // Oz:#define __OPTIMIZE__ 1
 //
 //
-// RUN: %clang_cc1 -fpascal-strings -E -dM < /dev/null | FileCheck -check-prefix PASCAL %s
+// RUN: %clang_cc1 -fpascal-strings -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix PASCAL %s
 //
 // PASCAL:#define __PASCAL_STRINGS__ 1
 //
 // 
-// RUN: %clang_cc1 -E -dM < /dev/null | FileCheck -check-prefix SCHAR %s
+// RUN: %clang_cc1 -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix SCHAR %s
 // 
 // SCHAR:#define __STDC__ 1
 // SCHAR-NOT:#define __UNSIGNED_CHAR__
 // SCHAR:#define __clang__ 1
 //
-// RUN: %clang_cc1 -E -dM -fshort-wchar < /dev/null | FileCheck -check-prefix SHORTWCHAR %s
+// RUN: %clang_cc1 -E -dM -fshort-wchar < /dev/null | FileCheck -match-full-lines -check-prefix SHORTWCHAR %s
 // wchar_t is u16 for targeting Win32.
 // FIXME: Implement and check x86_64-cygwin.
-// RUN: %clang_cc1 -E -dM -fno-short-wchar -triple=x86_64-w64-mingw32 < /dev/null | FileCheck -check-prefix SHORTWCHAR %s
+// RUN: %clang_cc1 -E -dM -fno-short-wchar -triple=x86_64-w64-mingw32 < /dev/null | FileCheck -match-full-lines -check-prefix SHORTWCHAR %s
 //
 // SHORTWCHAR: #define __SIZEOF_WCHAR_T__ 2
 // SHORTWCHAR: #define __WCHAR_MAX__ 65535
 // SHORTWCHAR: #define __WCHAR_TYPE__ unsigned short
 // SHORTWCHAR: #define __WCHAR_WIDTH__ 16
 //
-// RUN: %clang_cc1 -E -dM -fno-short-wchar -triple=i686-unknown-unknown < /dev/null | FileCheck -check-prefix SHORTWCHAR2 %s
-// RUN: %clang_cc1 -E -dM -fno-short-wchar -triple=x86_64-unknown-unknown < /dev/null | FileCheck -check-prefix SHORTWCHAR2 %s
+// RUN: %clang_cc1 -E -dM -fno-short-wchar -triple=i686-unknown-unknown < /dev/null | FileCheck -match-full-lines -check-prefix SHORTWCHAR2 %s
+// RUN: %clang_cc1 -E -dM -fno-short-wchar -triple=x86_64-unknown-unknown < /dev/null | FileCheck -match-full-lines -check-prefix SHORTWCHAR2 %s
 //
 // SHORTWCHAR2: #define __SIZEOF_WCHAR_T__ 4
 // SHORTWCHAR2: #define __WCHAR_WIDTH__ 32
 // Other definitions vary from platform to platform
 
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=aarch64-none-none < /dev/null | FileCheck -check-prefix AARCH64 %s
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=arm64-none-none < /dev/null | FileCheck -check-prefix AARCH64 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=aarch64-none-none < /dev/null | FileCheck -match-full-lines -check-prefix AARCH64 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=arm64-none-none < /dev/null | FileCheck -match-full-lines -check-prefix AARCH64 %s
 //
 // AARCH64:#define _LP64 1
 // AARCH64-NOT:#define __AARCH64EB__ 1
@@ -277,12 +290,12 @@
 // AARCH64:#define __FLT_MIN_EXP__ (-125)
 // AARCH64:#define __FLT_MIN__ 1.17549435e-38F
 // AARCH64:#define __FLT_RADIX__ 2
-// AARCH64:#define __INT16_C_SUFFIX__ {{$}}
+// AARCH64:#define __INT16_C_SUFFIX__
 // AARCH64:#define __INT16_FMTd__ "hd"
 // AARCH64:#define __INT16_FMTi__ "hi"
 // AARCH64:#define __INT16_MAX__ 32767
 // AARCH64:#define __INT16_TYPE__ short
-// AARCH64:#define __INT32_C_SUFFIX__ {{$}}
+// AARCH64:#define __INT32_C_SUFFIX__
 // AARCH64:#define __INT32_FMTd__ "d"
 // AARCH64:#define __INT32_FMTi__ "i"
 // AARCH64:#define __INT32_MAX__ 2147483647
@@ -292,7 +305,7 @@
 // AARCH64:#define __INT64_FMTi__ "li"
 // AARCH64:#define __INT64_MAX__ 9223372036854775807L
 // AARCH64:#define __INT64_TYPE__ long int
-// AARCH64:#define __INT8_C_SUFFIX__ {{$}}
+// AARCH64:#define __INT8_C_SUFFIX__
 // AARCH64:#define __INT8_FMTd__ "hhd"
 // AARCH64:#define __INT8_FMTi__ "hhi"
 // AARCH64:#define __INT8_MAX__ 127
@@ -380,7 +393,7 @@
 // AARCH64:#define __SIZE_MAX__ 18446744073709551615UL
 // AARCH64:#define __SIZE_TYPE__ long unsigned int
 // AARCH64:#define __SIZE_WIDTH__ 64
-// AARCH64:#define __UINT16_C_SUFFIX__ {{$}}
+// AARCH64:#define __UINT16_C_SUFFIX__
 // AARCH64:#define __UINT16_MAX__ 65535
 // AARCH64:#define __UINT16_TYPE__ unsigned short
 // AARCH64:#define __UINT32_C_SUFFIX__ U
@@ -389,7 +402,7 @@
 // AARCH64:#define __UINT64_C_SUFFIX__ UL
 // AARCH64:#define __UINT64_MAX__ 18446744073709551615UL
 // AARCH64:#define __UINT64_TYPE__ long unsigned int
-// AARCH64:#define __UINT8_C_SUFFIX__ {{$}}
+// AARCH64:#define __UINT8_C_SUFFIX__
 // AARCH64:#define __UINT8_MAX__ 255
 // AARCH64:#define __UINT8_TYPE__ unsigned char
 // AARCH64:#define __UINTMAX_C_SUFFIX__ UL
@@ -415,7 +428,7 @@
 // AARCH64:#define __UINT_LEAST64_TYPE__ long unsigned int
 // AARCH64:#define __UINT_LEAST8_MAX__ 255
 // AARCH64:#define __UINT_LEAST8_TYPE__ unsigned char
-// AARCH64:#define __USER_LABEL_PREFIX__ _
+// AARCH64:#define __USER_LABEL_PREFIX__
 // AARCH64:#define __WCHAR_MAX__ 4294967295U
 // AARCH64:#define __WCHAR_TYPE__ unsigned int
 // AARCH64:#define __WCHAR_UNSIGNED__ 1
@@ -424,7 +437,7 @@
 // AARCH64:#define __WINT_WIDTH__ 32
 // AARCH64:#define __aarch64__ 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=aarch64_be-none-none < /dev/null | FileCheck -check-prefix AARCH64-BE %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=aarch64_be-none-none < /dev/null | FileCheck -match-full-lines -check-prefix AARCH64-BE %s
 //
 // AARCH64-BE:#define _LP64 1
 // AARCH64-BE:#define __AARCH64EB__ 1
@@ -468,12 +481,12 @@
 // AARCH64-BE:#define __FLT_MIN_EXP__ (-125)
 // AARCH64-BE:#define __FLT_MIN__ 1.17549435e-38F
 // AARCH64-BE:#define __FLT_RADIX__ 2
-// AARCH64-BE:#define __INT16_C_SUFFIX__ {{$}}
+// AARCH64-BE:#define __INT16_C_SUFFIX__
 // AARCH64-BE:#define __INT16_FMTd__ "hd"
 // AARCH64-BE:#define __INT16_FMTi__ "hi"
 // AARCH64-BE:#define __INT16_MAX__ 32767
 // AARCH64-BE:#define __INT16_TYPE__ short
-// AARCH64-BE:#define __INT32_C_SUFFIX__ {{$}}
+// AARCH64-BE:#define __INT32_C_SUFFIX__
 // AARCH64-BE:#define __INT32_FMTd__ "d"
 // AARCH64-BE:#define __INT32_FMTi__ "i"
 // AARCH64-BE:#define __INT32_MAX__ 2147483647
@@ -483,7 +496,7 @@
 // AARCH64-BE:#define __INT64_FMTi__ "li"
 // AARCH64-BE:#define __INT64_MAX__ 9223372036854775807L
 // AARCH64-BE:#define __INT64_TYPE__ long int
-// AARCH64-BE:#define __INT8_C_SUFFIX__ {{$}}
+// AARCH64-BE:#define __INT8_C_SUFFIX__
 // AARCH64-BE:#define __INT8_FMTd__ "hhd"
 // AARCH64-BE:#define __INT8_FMTi__ "hhi"
 // AARCH64-BE:#define __INT8_MAX__ 127
@@ -571,7 +584,7 @@
 // AARCH64-BE:#define __SIZE_MAX__ 18446744073709551615UL
 // AARCH64-BE:#define __SIZE_TYPE__ long unsigned int
 // AARCH64-BE:#define __SIZE_WIDTH__ 64
-// AARCH64-BE:#define __UINT16_C_SUFFIX__ {{$}}
+// AARCH64-BE:#define __UINT16_C_SUFFIX__
 // AARCH64-BE:#define __UINT16_MAX__ 65535
 // AARCH64-BE:#define __UINT16_TYPE__ unsigned short
 // AARCH64-BE:#define __UINT32_C_SUFFIX__ U
@@ -580,7 +593,7 @@
 // AARCH64-BE:#define __UINT64_C_SUFFIX__ UL
 // AARCH64-BE:#define __UINT64_MAX__ 18446744073709551615UL
 // AARCH64-BE:#define __UINT64_TYPE__ long unsigned int
-// AARCH64-BE:#define __UINT8_C_SUFFIX__ {{$}}
+// AARCH64-BE:#define __UINT8_C_SUFFIX__
 // AARCH64-BE:#define __UINT8_MAX__ 255
 // AARCH64-BE:#define __UINT8_TYPE__ unsigned char
 // AARCH64-BE:#define __UINTMAX_C_SUFFIX__ UL
@@ -606,7 +619,7 @@
 // AARCH64-BE:#define __UINT_LEAST64_TYPE__ long unsigned int
 // AARCH64-BE:#define __UINT_LEAST8_MAX__ 255
 // AARCH64-BE:#define __UINT_LEAST8_TYPE__ unsigned char
-// AARCH64-BE:#define __USER_LABEL_PREFIX__ _
+// AARCH64-BE:#define __USER_LABEL_PREFIX__
 // AARCH64-BE:#define __WCHAR_MAX__ 4294967295U
 // AARCH64-BE:#define __WCHAR_TYPE__ unsigned int
 // AARCH64-BE:#define __WCHAR_UNSIGNED__ 1
@@ -615,7 +628,7 @@
 // AARCH64-BE:#define __WINT_WIDTH__ 32
 // AARCH64-BE:#define __aarch64__ 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=aarch64-netbsd < /dev/null | FileCheck -check-prefix AARCH64-NETBSD %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=aarch64-netbsd < /dev/null | FileCheck -match-full-lines -check-prefix AARCH64-NETBSD %s
 //
 // AARCH64-NETBSD:#define _LP64 1
 // AARCH64-NETBSD-NOT:#define __AARCH64EB__ 1
@@ -660,12 +673,12 @@
 // AARCH64-NETBSD:#define __FLT_MIN_EXP__ (-125)
 // AARCH64-NETBSD:#define __FLT_MIN__ 1.17549435e-38F
 // AARCH64-NETBSD:#define __FLT_RADIX__ 2
-// AARCH64-NETBSD:#define __INT16_C_SUFFIX__ {{$}}
+// AARCH64-NETBSD:#define __INT16_C_SUFFIX__
 // AARCH64-NETBSD:#define __INT16_FMTd__ "hd"
 // AARCH64-NETBSD:#define __INT16_FMTi__ "hi"
 // AARCH64-NETBSD:#define __INT16_MAX__ 32767
 // AARCH64-NETBSD:#define __INT16_TYPE__ short
-// AARCH64-NETBSD:#define __INT32_C_SUFFIX__ {{$}}
+// AARCH64-NETBSD:#define __INT32_C_SUFFIX__
 // AARCH64-NETBSD:#define __INT32_FMTd__ "d"
 // AARCH64-NETBSD:#define __INT32_FMTi__ "i"
 // AARCH64-NETBSD:#define __INT32_MAX__ 2147483647
@@ -673,9 +686,9 @@
 // AARCH64-NETBSD:#define __INT64_C_SUFFIX__ LL
 // AARCH64-NETBSD:#define __INT64_FMTd__ "lld"
 // AARCH64-NETBSD:#define __INT64_FMTi__ "lli"
-// AARCH64-NETBSD:#define __INT64_MAX__ 9223372036854775807L
+// AARCH64-NETBSD:#define __INT64_MAX__ 9223372036854775807LL
 // AARCH64-NETBSD:#define __INT64_TYPE__ long long int
-// AARCH64-NETBSD:#define __INT8_C_SUFFIX__ {{$}}
+// AARCH64-NETBSD:#define __INT8_C_SUFFIX__
 // AARCH64-NETBSD:#define __INT8_FMTd__ "hhd"
 // AARCH64-NETBSD:#define __INT8_FMTi__ "hhi"
 // AARCH64-NETBSD:#define __INT8_MAX__ 127
@@ -764,7 +777,7 @@
 // AARCH64-NETBSD:#define __SIZE_MAX__ 18446744073709551615UL
 // AARCH64-NETBSD:#define __SIZE_TYPE__ long unsigned int
 // AARCH64-NETBSD:#define __SIZE_WIDTH__ 64
-// AARCH64-NETBSD:#define __UINT16_C_SUFFIX__ {{$}}
+// AARCH64-NETBSD:#define __UINT16_C_SUFFIX__
 // AARCH64-NETBSD:#define __UINT16_MAX__ 65535
 // AARCH64-NETBSD:#define __UINT16_TYPE__ unsigned short
 // AARCH64-NETBSD:#define __UINT32_C_SUFFIX__ U
@@ -773,7 +786,7 @@
 // AARCH64-NETBSD:#define __UINT64_C_SUFFIX__ ULL
 // AARCH64-NETBSD:#define __UINT64_MAX__ 18446744073709551615ULL
 // AARCH64-NETBSD:#define __UINT64_TYPE__ long long unsigned int
-// AARCH64-NETBSD:#define __UINT8_C_SUFFIX__ {{$}}
+// AARCH64-NETBSD:#define __UINT8_C_SUFFIX__
 // AARCH64-NETBSD:#define __UINT8_MAX__ 255
 // AARCH64-NETBSD:#define __UINT8_TYPE__ unsigned char
 // AARCH64-NETBSD:#define __UINTMAX_C_SUFFIX__ ULL
@@ -807,7 +820,7 @@
 // AARCH64-NETBSD:#define __WINT_WIDTH__ 32
 // AARCH64-NETBSD:#define __aarch64__ 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=aarch64-freebsd11 < /dev/null | FileCheck -check-prefix AARCH64-FREEBSD %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=aarch64-freebsd11 < /dev/null | FileCheck -match-full-lines -check-prefix AARCH64-FREEBSD %s
 //
 // AARCH64-FREEBSD:#define _LP64 1
 // AARCH64-FREEBSD-NOT:#define __AARCH64EB__ 1
@@ -853,12 +866,12 @@
 // AARCH64-FREEBSD:#define __FLT_MIN__ 1.17549435e-38F
 // AARCH64-FREEBSD:#define __FLT_RADIX__ 2
 // AARCH64-FREEBSD:#define __FreeBSD__ 11
-// AARCH64-FREEBSD:#define __INT16_C_SUFFIX__ {{$}}
+// AARCH64-FREEBSD:#define __INT16_C_SUFFIX__
 // AARCH64-FREEBSD:#define __INT16_FMTd__ "hd"
 // AARCH64-FREEBSD:#define __INT16_FMTi__ "hi"
 // AARCH64-FREEBSD:#define __INT16_MAX__ 32767
 // AARCH64-FREEBSD:#define __INT16_TYPE__ short
-// AARCH64-FREEBSD:#define __INT32_C_SUFFIX__ {{$}}
+// AARCH64-FREEBSD:#define __INT32_C_SUFFIX__
 // AARCH64-FREEBSD:#define __INT32_FMTd__ "d"
 // AARCH64-FREEBSD:#define __INT32_FMTi__ "i"
 // AARCH64-FREEBSD:#define __INT32_MAX__ 2147483647
@@ -868,7 +881,7 @@
 // AARCH64-FREEBSD:#define __INT64_FMTi__ "li"
 // AARCH64-FREEBSD:#define __INT64_MAX__ 9223372036854775807L
 // AARCH64-FREEBSD:#define __INT64_TYPE__ long int
-// AARCH64-FREEBSD:#define __INT8_C_SUFFIX__ {{$}}
+// AARCH64-FREEBSD:#define __INT8_C_SUFFIX__
 // AARCH64-FREEBSD:#define __INT8_FMTd__ "hhd"
 // AARCH64-FREEBSD:#define __INT8_FMTi__ "hhi"
 // AARCH64-FREEBSD:#define __INT8_MAX__ 127
@@ -957,7 +970,7 @@
 // AARCH64-FREEBSD:#define __SIZE_MAX__ 18446744073709551615UL
 // AARCH64-FREEBSD:#define __SIZE_TYPE__ long unsigned int
 // AARCH64-FREEBSD:#define __SIZE_WIDTH__ 64
-// AARCH64-FREEBSD:#define __UINT16_C_SUFFIX__ {{$}}
+// AARCH64-FREEBSD:#define __UINT16_C_SUFFIX__
 // AARCH64-FREEBSD:#define __UINT16_MAX__ 65535
 // AARCH64-FREEBSD:#define __UINT16_TYPE__ unsigned short
 // AARCH64-FREEBSD:#define __UINT32_C_SUFFIX__ U
@@ -966,7 +979,7 @@
 // AARCH64-FREEBSD:#define __UINT64_C_SUFFIX__ UL
 // AARCH64-FREEBSD:#define __UINT64_MAX__ 18446744073709551615UL
 // AARCH64-FREEBSD:#define __UINT64_TYPE__ long unsigned int
-// AARCH64-FREEBSD:#define __UINT8_C_SUFFIX__ {{$}}
+// AARCH64-FREEBSD:#define __UINT8_C_SUFFIX__
 // AARCH64-FREEBSD:#define __UINT8_MAX__ 255
 // AARCH64-FREEBSD:#define __UINT8_TYPE__ unsigned char
 // AARCH64-FREEBSD:#define __UINTMAX_C_SUFFIX__ UL
@@ -1001,7 +1014,7 @@
 // AARCH64-FREEBSD:#define __WINT_WIDTH__ 32
 // AARCH64-FREEBSD:#define __aarch64__ 1
 
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=aarch64-apple-ios7.0 < /dev/null | FileCheck -check-prefix AARCH64-DARWIN %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=aarch64-apple-ios7.0 < /dev/null | FileCheck -match-full-lines -check-prefix AARCH64-DARWIN %s
 //
 // AARCH64-DARWIN: #define _LP64 1
 // AARCH64-NOT: #define __AARCH64EB__ 1
@@ -1045,22 +1058,22 @@
 // AARCH64-DARWIN: #define __FLT_MIN_EXP__ (-125)
 // AARCH64-DARWIN: #define __FLT_MIN__ 1.17549435e-38F
 // AARCH64-DARWIN: #define __FLT_RADIX__ 2
-// AARCH64-DARWIN: #define __INT16_C_SUFFIX__ {{$}}
+// AARCH64-DARWIN: #define __INT16_C_SUFFIX__
 // AARCH64-DARWIN: #define __INT16_FMTd__ "hd"
 // AARCH64-DARWIN: #define __INT16_FMTi__ "hi"
 // AARCH64-DARWIN: #define __INT16_MAX__ 32767
 // AARCH64-DARWIN: #define __INT16_TYPE__ short
-// AARCH64-DARWIN: #define __INT32_C_SUFFIX__ {{$}}
+// AARCH64-DARWIN: #define __INT32_C_SUFFIX__
 // AARCH64-DARWIN: #define __INT32_FMTd__ "d"
 // AARCH64-DARWIN: #define __INT32_FMTi__ "i"
 // AARCH64-DARWIN: #define __INT32_MAX__ 2147483647
 // AARCH64-DARWIN: #define __INT32_TYPE__ int
-// AARCH64-DARWIN: #define __INT64_C_SUFFIX__ L
+// AARCH64-DARWIN: #define __INT64_C_SUFFIX__ LL
 // AARCH64-DARWIN: #define __INT64_FMTd__ "lld"
 // AARCH64-DARWIN: #define __INT64_FMTi__ "lli"
-// AARCH64-DARWIN: #define __INT64_MAX__ 9223372036854775807L
+// AARCH64-DARWIN: #define __INT64_MAX__ 9223372036854775807LL
 // AARCH64-DARWIN: #define __INT64_TYPE__ long long int
-// AARCH64-DARWIN: #define __INT8_C_SUFFIX__ {{$}}
+// AARCH64-DARWIN: #define __INT8_C_SUFFIX__
 // AARCH64-DARWIN: #define __INT8_FMTd__ "hhd"
 // AARCH64-DARWIN: #define __INT8_FMTi__ "hhi"
 // AARCH64-DARWIN: #define __INT8_MAX__ 127
@@ -1148,16 +1161,16 @@
 // AARCH64-DARWIN: #define __SIZE_MAX__ 18446744073709551615UL
 // AARCH64-DARWIN: #define __SIZE_TYPE__ long unsigned int
 // AARCH64-DARWIN: #define __SIZE_WIDTH__ 64
-// AARCH64-DARWIN: #define __UINT16_C_SUFFIX__ {{$}}
+// AARCH64-DARWIN: #define __UINT16_C_SUFFIX__
 // AARCH64-DARWIN: #define __UINT16_MAX__ 65535
 // AARCH64-DARWIN: #define __UINT16_TYPE__ unsigned short
 // AARCH64-DARWIN: #define __UINT32_C_SUFFIX__ U
 // AARCH64-DARWIN: #define __UINT32_MAX__ 4294967295U
 // AARCH64-DARWIN: #define __UINT32_TYPE__ unsigned int
-// AARCH64-DARWIN: #define __UINT64_C_SUFFIX__ UL
-// AARCH64-DARWIN: #define __UINT64_MAX__ 18446744073709551615UL
+// AARCH64-DARWIN: #define __UINT64_C_SUFFIX__ ULL
+// AARCH64-DARWIN: #define __UINT64_MAX__ 18446744073709551615ULL
 // AARCH64-DARWIN: #define __UINT64_TYPE__ long long unsigned int
-// AARCH64-DARWIN: #define __UINT8_C_SUFFIX__ {{$}}
+// AARCH64-DARWIN: #define __UINT8_C_SUFFIX__
 // AARCH64-DARWIN: #define __UINT8_MAX__ 255
 // AARCH64-DARWIN: #define __UINT8_TYPE__ unsigned char
 // AARCH64-DARWIN: #define __UINTMAX_C_SUFFIX__ UL
@@ -1192,7 +1205,7 @@
 // AARCH64-DARWIN: #define __WINT_WIDTH__ 32
 // AARCH64-DARWIN: #define __aarch64__ 1
 
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=arm-none-none < /dev/null | FileCheck -check-prefix ARM %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=arm-none-none < /dev/null | FileCheck -match-full-lines -check-prefix ARM %s
 //
 // ARM-NOT:#define _LP64
 // ARM:#define __APCS_32__ 1
@@ -1234,12 +1247,12 @@
 // ARM:#define __FLT_MIN_EXP__ (-125)
 // ARM:#define __FLT_MIN__ 1.17549435e-38F
 // ARM:#define __FLT_RADIX__ 2
-// ARM:#define __INT16_C_SUFFIX__ {{$}}
+// ARM:#define __INT16_C_SUFFIX__
 // ARM:#define __INT16_FMTd__ "hd"
 // ARM:#define __INT16_FMTi__ "hi"
 // ARM:#define __INT16_MAX__ 32767
 // ARM:#define __INT16_TYPE__ short
-// ARM:#define __INT32_C_SUFFIX__ {{$}}
+// ARM:#define __INT32_C_SUFFIX__
 // ARM:#define __INT32_FMTd__ "d"
 // ARM:#define __INT32_FMTi__ "i"
 // ARM:#define __INT32_MAX__ 2147483647
@@ -1249,7 +1262,7 @@
 // ARM:#define __INT64_FMTi__ "lli"
 // ARM:#define __INT64_MAX__ 9223372036854775807LL
 // ARM:#define __INT64_TYPE__ long long int
-// ARM:#define __INT8_C_SUFFIX__ {{$}}
+// ARM:#define __INT8_C_SUFFIX__
 // ARM:#define __INT8_FMTd__ "hhd"
 // ARM:#define __INT8_FMTi__ "hhi"
 // ARM:#define __INT8_MAX__ 127
@@ -1338,7 +1351,7 @@
 // ARM:#define __SIZE_MAX__ 4294967295U
 // ARM:#define __SIZE_TYPE__ unsigned int
 // ARM:#define __SIZE_WIDTH__ 32
-// ARM:#define __UINT16_C_SUFFIX__ {{$}}
+// ARM:#define __UINT16_C_SUFFIX__
 // ARM:#define __UINT16_MAX__ 65535
 // ARM:#define __UINT16_TYPE__ unsigned short
 // ARM:#define __UINT32_C_SUFFIX__ U
@@ -1347,14 +1360,14 @@
 // ARM:#define __UINT64_C_SUFFIX__ ULL
 // ARM:#define __UINT64_MAX__ 18446744073709551615ULL
 // ARM:#define __UINT64_TYPE__ long long unsigned int
-// ARM:#define __UINT8_C_SUFFIX__ {{$}}
+// ARM:#define __UINT8_C_SUFFIX__
 // ARM:#define __UINT8_MAX__ 255
 // ARM:#define __UINT8_TYPE__ unsigned char
 // ARM:#define __UINTMAX_C_SUFFIX__ ULL
 // ARM:#define __UINTMAX_MAX__ 18446744073709551615ULL
 // ARM:#define __UINTMAX_TYPE__ long long unsigned int
 // ARM:#define __UINTMAX_WIDTH__ 64
-// ARM:#define __UINTPTR_MAX__ 4294967295U
+// ARM:#define __UINTPTR_MAX__ 4294967295UL
 // ARM:#define __UINTPTR_TYPE__ long unsigned int
 // ARM:#define __UINTPTR_WIDTH__ 32
 // ARM:#define __UINT_FAST16_MAX__ 65535
@@ -1373,7 +1386,7 @@
 // ARM:#define __UINT_LEAST64_TYPE__ long long unsigned int
 // ARM:#define __UINT_LEAST8_MAX__ 255
 // ARM:#define __UINT_LEAST8_TYPE__ unsigned char
-// ARM:#define __USER_LABEL_PREFIX__ _
+// ARM:#define __USER_LABEL_PREFIX__
 // ARM:#define __WCHAR_MAX__ 4294967295U
 // ARM:#define __WCHAR_TYPE__ unsigned int
 // ARM:#define __WCHAR_WIDTH__ 32
@@ -1382,7 +1395,7 @@
 // ARM:#define __arm 1
 // ARM:#define __arm__ 1
 
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=armeb-none-none < /dev/null | FileCheck -check-prefix ARM-BE %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=armeb-none-none < /dev/null | FileCheck -match-full-lines -check-prefix ARM-BE %s
 //
 // ARM-BE-NOT:#define _LP64
 // ARM-BE:#define __APCS_32__ 1
@@ -1425,12 +1438,12 @@
 // ARM-BE:#define __FLT_MIN_EXP__ (-125)
 // ARM-BE:#define __FLT_MIN__ 1.17549435e-38F
 // ARM-BE:#define __FLT_RADIX__ 2
-// ARM-BE:#define __INT16_C_SUFFIX__ {{$}}
+// ARM-BE:#define __INT16_C_SUFFIX__
 // ARM-BE:#define __INT16_FMTd__ "hd"
 // ARM-BE:#define __INT16_FMTi__ "hi"
 // ARM-BE:#define __INT16_MAX__ 32767
 // ARM-BE:#define __INT16_TYPE__ short
-// ARM-BE:#define __INT32_C_SUFFIX__ {{$}}
+// ARM-BE:#define __INT32_C_SUFFIX__
 // ARM-BE:#define __INT32_FMTd__ "d"
 // ARM-BE:#define __INT32_FMTi__ "i"
 // ARM-BE:#define __INT32_MAX__ 2147483647
@@ -1440,7 +1453,7 @@
 // ARM-BE:#define __INT64_FMTi__ "lli"
 // ARM-BE:#define __INT64_MAX__ 9223372036854775807LL
 // ARM-BE:#define __INT64_TYPE__ long long int
-// ARM-BE:#define __INT8_C_SUFFIX__ {{$}}
+// ARM-BE:#define __INT8_C_SUFFIX__
 // ARM-BE:#define __INT8_FMTd__ "hhd"
 // ARM-BE:#define __INT8_FMTi__ "hhi"
 // ARM-BE:#define __INT8_MAX__ 127
@@ -1528,7 +1541,7 @@
 // ARM-BE:#define __SIZE_MAX__ 4294967295U
 // ARM-BE:#define __SIZE_TYPE__ unsigned int
 // ARM-BE:#define __SIZE_WIDTH__ 32
-// ARM-BE:#define __UINT16_C_SUFFIX__ {{$}}
+// ARM-BE:#define __UINT16_C_SUFFIX__
 // ARM-BE:#define __UINT16_MAX__ 65535
 // ARM-BE:#define __UINT16_TYPE__ unsigned short
 // ARM-BE:#define __UINT32_C_SUFFIX__ U
@@ -1537,14 +1550,14 @@
 // ARM-BE:#define __UINT64_C_SUFFIX__ ULL
 // ARM-BE:#define __UINT64_MAX__ 18446744073709551615ULL
 // ARM-BE:#define __UINT64_TYPE__ long long unsigned int
-// ARM-BE:#define __UINT8_C_SUFFIX__ {{$}}
+// ARM-BE:#define __UINT8_C_SUFFIX__
 // ARM-BE:#define __UINT8_MAX__ 255
 // ARM-BE:#define __UINT8_TYPE__ unsigned char
 // ARM-BE:#define __UINTMAX_C_SUFFIX__ ULL
 // ARM-BE:#define __UINTMAX_MAX__ 18446744073709551615ULL
 // ARM-BE:#define __UINTMAX_TYPE__ long long unsigned int
 // ARM-BE:#define __UINTMAX_WIDTH__ 64
-// ARM-BE:#define __UINTPTR_MAX__ 4294967295U
+// ARM-BE:#define __UINTPTR_MAX__ 4294967295UL
 // ARM-BE:#define __UINTPTR_TYPE__ long unsigned int
 // ARM-BE:#define __UINTPTR_WIDTH__ 32
 // ARM-BE:#define __UINT_FAST16_MAX__ 65535
@@ -1563,7 +1576,7 @@
 // ARM-BE:#define __UINT_LEAST64_TYPE__ long long unsigned int
 // ARM-BE:#define __UINT_LEAST8_MAX__ 255
 // ARM-BE:#define __UINT_LEAST8_TYPE__ unsigned char
-// ARM-BE:#define __USER_LABEL_PREFIX__ _
+// ARM-BE:#define __USER_LABEL_PREFIX__
 // ARM-BE:#define __WCHAR_MAX__ 4294967295U
 // ARM-BE:#define __WCHAR_TYPE__ unsigned int
 // ARM-BE:#define __WCHAR_WIDTH__ 32
@@ -1572,7 +1585,7 @@
 // ARM-BE:#define __arm 1
 // ARM-BE:#define __arm__ 1
 
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=arm-none-linux-gnueabi -target-feature +soft-float -target-feature +soft-float-abi < /dev/null | FileCheck -check-prefix ARMEABISOFTFP %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=arm-none-linux-gnueabi -target-feature +soft-float -target-feature +soft-float-abi < /dev/null | FileCheck -match-full-lines -check-prefix ARMEABISOFTFP %s
 //
 // ARMEABISOFTFP-NOT:#define _LP64
 // ARMEABISOFTFP:#define __APCS_32__ 1
@@ -1618,12 +1631,12 @@
 // ARMEABISOFTFP:#define __FLT_MIN_EXP__ (-125)
 // ARMEABISOFTFP:#define __FLT_MIN__ 1.17549435e-38F
 // ARMEABISOFTFP:#define __FLT_RADIX__ 2
-// ARMEABISOFTFP:#define __INT16_C_SUFFIX__ {{$}}
+// ARMEABISOFTFP:#define __INT16_C_SUFFIX__
 // ARMEABISOFTFP:#define __INT16_FMTd__ "hd"
 // ARMEABISOFTFP:#define __INT16_FMTi__ "hi"
 // ARMEABISOFTFP:#define __INT16_MAX__ 32767
 // ARMEABISOFTFP:#define __INT16_TYPE__ short
-// ARMEABISOFTFP:#define __INT32_C_SUFFIX__ {{$}}
+// ARMEABISOFTFP:#define __INT32_C_SUFFIX__
 // ARMEABISOFTFP:#define __INT32_FMTd__ "d"
 // ARMEABISOFTFP:#define __INT32_FMTi__ "i"
 // ARMEABISOFTFP:#define __INT32_MAX__ 2147483647
@@ -1633,7 +1646,7 @@
 // ARMEABISOFTFP:#define __INT64_FMTi__ "lli"
 // ARMEABISOFTFP:#define __INT64_MAX__ 9223372036854775807LL
 // ARMEABISOFTFP:#define __INT64_TYPE__ long long int
-// ARMEABISOFTFP:#define __INT8_C_SUFFIX__ {{$}}
+// ARMEABISOFTFP:#define __INT8_C_SUFFIX__
 // ARMEABISOFTFP:#define __INT8_FMTd__ "hhd"
 // ARMEABISOFTFP:#define __INT8_FMTi__ "hhi"
 // ARMEABISOFTFP:#define __INT8_MAX__ 127
@@ -1723,7 +1736,7 @@
 // ARMEABISOFTFP:#define __SIZE_TYPE__ unsigned int
 // ARMEABISOFTFP:#define __SIZE_WIDTH__ 32
 // ARMEABISOFTFP:#define __SOFTFP__ 1
-// ARMEABISOFTFP:#define __UINT16_C_SUFFIX__ {{$}}
+// ARMEABISOFTFP:#define __UINT16_C_SUFFIX__
 // ARMEABISOFTFP:#define __UINT16_MAX__ 65535
 // ARMEABISOFTFP:#define __UINT16_TYPE__ unsigned short
 // ARMEABISOFTFP:#define __UINT32_C_SUFFIX__ U
@@ -1732,14 +1745,14 @@
 // ARMEABISOFTFP:#define __UINT64_C_SUFFIX__ ULL
 // ARMEABISOFTFP:#define __UINT64_MAX__ 18446744073709551615ULL
 // ARMEABISOFTFP:#define __UINT64_TYPE__ long long unsigned int
-// ARMEABISOFTFP:#define __UINT8_C_SUFFIX__ {{$}}
+// ARMEABISOFTFP:#define __UINT8_C_SUFFIX__
 // ARMEABISOFTFP:#define __UINT8_MAX__ 255
 // ARMEABISOFTFP:#define __UINT8_TYPE__ unsigned char
 // ARMEABISOFTFP:#define __UINTMAX_C_SUFFIX__ ULL
 // ARMEABISOFTFP:#define __UINTMAX_MAX__ 18446744073709551615ULL
 // ARMEABISOFTFP:#define __UINTMAX_TYPE__ long long unsigned int
 // ARMEABISOFTFP:#define __UINTMAX_WIDTH__ 64
-// ARMEABISOFTFP:#define __UINTPTR_MAX__ 4294967295U
+// ARMEABISOFTFP:#define __UINTPTR_MAX__ 4294967295UL
 // ARMEABISOFTFP:#define __UINTPTR_TYPE__ long unsigned int
 // ARMEABISOFTFP:#define __UINTPTR_WIDTH__ 32
 // ARMEABISOFTFP:#define __UINT_FAST16_MAX__ 65535
@@ -1767,7 +1780,7 @@
 // ARMEABISOFTFP:#define __arm 1
 // ARMEABISOFTFP:#define __arm__ 1
 
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=arm-none-linux-gnueabi < /dev/null | FileCheck -check-prefix ARMEABIHARDFP %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=arm-none-linux-gnueabi < /dev/null | FileCheck -match-full-lines -check-prefix ARMEABIHARDFP %s
 //
 // ARMEABIHARDFP-NOT:#define _LP64
 // ARMEABIHARDFP:#define __APCS_32__ 1
@@ -1813,12 +1826,12 @@
 // ARMEABIHARDFP:#define __FLT_MIN_EXP__ (-125)
 // ARMEABIHARDFP:#define __FLT_MIN__ 1.17549435e-38F
 // ARMEABIHARDFP:#define __FLT_RADIX__ 2
-// ARMEABIHARDFP:#define __INT16_C_SUFFIX__ {{$}}
+// ARMEABIHARDFP:#define __INT16_C_SUFFIX__
 // ARMEABIHARDFP:#define __INT16_FMTd__ "hd"
 // ARMEABIHARDFP:#define __INT16_FMTi__ "hi"
 // ARMEABIHARDFP:#define __INT16_MAX__ 32767
 // ARMEABIHARDFP:#define __INT16_TYPE__ short
-// ARMEABIHARDFP:#define __INT32_C_SUFFIX__ {{$}}
+// ARMEABIHARDFP:#define __INT32_C_SUFFIX__
 // ARMEABIHARDFP:#define __INT32_FMTd__ "d"
 // ARMEABIHARDFP:#define __INT32_FMTi__ "i"
 // ARMEABIHARDFP:#define __INT32_MAX__ 2147483647
@@ -1828,7 +1841,7 @@
 // ARMEABIHARDFP:#define __INT64_FMTi__ "lli"
 // ARMEABIHARDFP:#define __INT64_MAX__ 9223372036854775807LL
 // ARMEABIHARDFP:#define __INT64_TYPE__ long long int
-// ARMEABIHARDFP:#define __INT8_C_SUFFIX__ {{$}}
+// ARMEABIHARDFP:#define __INT8_C_SUFFIX__
 // ARMEABIHARDFP:#define __INT8_FMTd__ "hhd"
 // ARMEABIHARDFP:#define __INT8_FMTi__ "hhi"
 // ARMEABIHARDFP:#define __INT8_MAX__ 127
@@ -1918,7 +1931,7 @@
 // ARMEABIHARDFP:#define __SIZE_TYPE__ unsigned int
 // ARMEABIHARDFP:#define __SIZE_WIDTH__ 32
 // ARMEABIHARDFP-NOT:#define __SOFTFP__ 1
-// ARMEABIHARDFP:#define __UINT16_C_SUFFIX__ {{$}}
+// ARMEABIHARDFP:#define __UINT16_C_SUFFIX__
 // ARMEABIHARDFP:#define __UINT16_MAX__ 65535
 // ARMEABIHARDFP:#define __UINT16_TYPE__ unsigned short
 // ARMEABIHARDFP:#define __UINT32_C_SUFFIX__ U
@@ -1927,14 +1940,14 @@
 // ARMEABIHARDFP:#define __UINT64_C_SUFFIX__ ULL
 // ARMEABIHARDFP:#define __UINT64_MAX__ 18446744073709551615ULL
 // ARMEABIHARDFP:#define __UINT64_TYPE__ long long unsigned int
-// ARMEABIHARDFP:#define __UINT8_C_SUFFIX__ {{$}}
+// ARMEABIHARDFP:#define __UINT8_C_SUFFIX__
 // ARMEABIHARDFP:#define __UINT8_MAX__ 255
 // ARMEABIHARDFP:#define __UINT8_TYPE__ unsigned char
 // ARMEABIHARDFP:#define __UINTMAX_C_SUFFIX__ ULL
 // ARMEABIHARDFP:#define __UINTMAX_MAX__ 18446744073709551615ULL
 // ARMEABIHARDFP:#define __UINTMAX_TYPE__ long long unsigned int
 // ARMEABIHARDFP:#define __UINTMAX_WIDTH__ 64
-// ARMEABIHARDFP:#define __UINTPTR_MAX__ 4294967295U
+// ARMEABIHARDFP:#define __UINTPTR_MAX__ 4294967295UL
 // ARMEABIHARDFP:#define __UINTPTR_TYPE__ long unsigned int
 // ARMEABIHARDFP:#define __UINTPTR_WIDTH__ 32
 // ARMEABIHARDFP:#define __UINT_FAST16_MAX__ 65535
@@ -1962,7 +1975,7 @@
 // ARMEABIHARDFP:#define __arm 1
 // ARMEABIHARDFP:#define __arm__ 1
 
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=arm-netbsd-eabi < /dev/null | FileCheck -check-prefix ARM-NETBSD %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=arm-netbsd-eabi < /dev/null | FileCheck -match-full-lines -check-prefix ARM-NETBSD %s
 //
 // ARM-NETBSD-NOT:#define _LP64
 // ARM-NETBSD:#define __APCS_32__ 1
@@ -2006,12 +2019,12 @@
 // ARM-NETBSD:#define __FLT_MIN_EXP__ (-125)
 // ARM-NETBSD:#define __FLT_MIN__ 1.17549435e-38F
 // ARM-NETBSD:#define __FLT_RADIX__ 2
-// ARM-NETBSD:#define __INT16_C_SUFFIX__ {{$}}
+// ARM-NETBSD:#define __INT16_C_SUFFIX__
 // ARM-NETBSD:#define __INT16_FMTd__ "hd"
 // ARM-NETBSD:#define __INT16_FMTi__ "hi"
 // ARM-NETBSD:#define __INT16_MAX__ 32767
 // ARM-NETBSD:#define __INT16_TYPE__ short
-// ARM-NETBSD:#define __INT32_C_SUFFIX__ {{$}}
+// ARM-NETBSD:#define __INT32_C_SUFFIX__
 // ARM-NETBSD:#define __INT32_FMTd__ "d"
 // ARM-NETBSD:#define __INT32_FMTi__ "i"
 // ARM-NETBSD:#define __INT32_MAX__ 2147483647
@@ -2021,7 +2034,7 @@
 // ARM-NETBSD:#define __INT64_FMTi__ "lli"
 // ARM-NETBSD:#define __INT64_MAX__ 9223372036854775807LL
 // ARM-NETBSD:#define __INT64_TYPE__ long long int
-// ARM-NETBSD:#define __INT8_C_SUFFIX__ {{$}}
+// ARM-NETBSD:#define __INT8_C_SUFFIX__
 // ARM-NETBSD:#define __INT8_FMTd__ "hhd"
 // ARM-NETBSD:#define __INT8_FMTi__ "hhi"
 // ARM-NETBSD:#define __INT8_MAX__ 127
@@ -2107,10 +2120,10 @@
 // ARM-NETBSD:#define __SIZEOF_SIZE_T__ 4
 // ARM-NETBSD:#define __SIZEOF_WCHAR_T__ 4
 // ARM-NETBSD:#define __SIZEOF_WINT_T__ 4
-// ARM-NETBSD:#define __SIZE_MAX__ 4294967295U
+// ARM-NETBSD:#define __SIZE_MAX__ 4294967295UL
 // ARM-NETBSD:#define __SIZE_TYPE__ long unsigned int
 // ARM-NETBSD:#define __SIZE_WIDTH__ 32
-// ARM-NETBSD:#define __UINT16_C_SUFFIX__ {{$}}
+// ARM-NETBSD:#define __UINT16_C_SUFFIX__
 // ARM-NETBSD:#define __UINT16_MAX__ 65535
 // ARM-NETBSD:#define __UINT16_TYPE__ unsigned short
 // ARM-NETBSD:#define __UINT32_C_SUFFIX__ U
@@ -2119,14 +2132,14 @@
 // ARM-NETBSD:#define __UINT64_C_SUFFIX__ ULL
 // ARM-NETBSD:#define __UINT64_MAX__ 18446744073709551615ULL
 // ARM-NETBSD:#define __UINT64_TYPE__ long long unsigned int
-// ARM-NETBSD:#define __UINT8_C_SUFFIX__ {{$}}
+// ARM-NETBSD:#define __UINT8_C_SUFFIX__
 // ARM-NETBSD:#define __UINT8_MAX__ 255
 // ARM-NETBSD:#define __UINT8_TYPE__ unsigned char
-// ARM-NETBSD:#define __UINTMAX_C_SUFFIX__ UL
+// ARM-NETBSD:#define __UINTMAX_C_SUFFIX__ ULL
 // ARM-NETBSD:#define __UINTMAX_MAX__ 18446744073709551615ULL
 // ARM-NETBSD:#define __UINTMAX_TYPE__ long long unsigned int
 // ARM-NETBSD:#define __UINTMAX_WIDTH__ 64
-// ARM-NETBSD:#define __UINTPTR_MAX__ 4294967295U
+// ARM-NETBSD:#define __UINTPTR_MAX__ 4294967295UL
 // ARM-NETBSD:#define __UINTPTR_TYPE__ long unsigned int
 // ARM-NETBSD:#define __UINTPTR_WIDTH__ 32
 // ARM-NETBSD:#define __UINT_FAST16_MAX__ 65535
@@ -2154,75 +2167,78 @@
 // ARM-NETBSD:#define __arm 1
 // ARM-NETBSD:#define __arm__ 1
 
-// RUN: %clang -target arm-apple-darwin-eabi -arch armv7s -x c -E -dM %s -o - | FileCheck --check-prefix=ARM-DARWIN-NO-EABI %s
-// RUN: %clang -target arm-apple-darwin-eabi -arch armv6m -x c -E -dM %s -o - | FileCheck --check-prefix=ARM-DARWIN-EABI %s
-// RUN: %clang -target arm-apple-darwin-eabi -arch armv7m -x c -E -dM %s -o - | FileCheck --check-prefix=ARM-DARWIN-EABI %s
-// RUN: %clang -target arm-apple-darwin-eabi -arch armv7em -x c -E -dM %s -o - | FileCheck --check-prefix=ARM-DARWIN-EABI %s
-// RUN: %clang -target thumbv7-apple-darwin-eabi -arch armv7 -x c -E -dM %s -o - | FileCheck --check-prefix=ARM-DARWIN-NO-EABI %s
-// ARM-DARWIN-NO-EABI-NOT: #define __ARM_EABI__ 1
-// ARM-DARWIN-EABI: #define __ARM_EABI__ 1
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=arm-none-eabi < /dev/null | FileCheck -match-full-lines -check-prefix ARM-NONE-EABI %s
+// ARM-NONE-EABI: #define __ELF__ 1
+
+// No MachO targets use the full EABI, even if AAPCS is used.
+// RUN: %clang -target x86_64-apple-darwin -arch armv7s -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=ARM-MACHO-NO-EABI %s
+// RUN: %clang -target x86_64-apple-darwin -arch armv6m -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=ARM-MACHO-NO-EABI %s
+// RUN: %clang -target x86_64-apple-darwin -arch armv7m -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=ARM-MACHO-NO-EABI %s
+// RUN: %clang -target x86_64-apple-darwin -arch armv7em -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=ARM-MACHO-NO-EABI %s
+// RUN: %clang -target x86_64-apple-darwin -arch armv7 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=ARM-MACHO-NO-EABI %s
+// ARM-MACHO-NO-EABI-NOT: #define __ARM_EABI__ 1
 
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=armv7-bitrig-gnueabihf < /dev/null | FileCheck -check-prefix ARM-BITRIG %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=armv7-bitrig-gnueabihf < /dev/null | FileCheck -match-full-lines -check-prefix ARM-BITRIG %s
 // ARM-BITRIG:#define __ARM_DWARF_EH__ 1
 // ARM-BITRIG:#define __SIZEOF_SIZE_T__ 4
-// ARM-BITRIG:#define __SIZE_MAX__ 4294967295U
+// ARM-BITRIG:#define __SIZE_MAX__ 4294967295UL
 // ARM-BITRIG:#define __SIZE_TYPE__ long unsigned int
 // ARM-BITRIG:#define __SIZE_WIDTH__ 32
 
 // Check that -mhwdiv works properly for targets which don't have the hwdiv feature enabled by default.
 
-// RUN: %clang -target arm -mhwdiv=arm -x c -E -dM %s -o - | FileCheck --check-prefix=ARMHWDIV-ARM %s
+// RUN: %clang -target arm -mhwdiv=arm -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=ARMHWDIV-ARM %s
 // ARMHWDIV-ARM:#define __ARM_ARCH_EXT_IDIV__ 1
 
-// RUN: %clang -target arm -mthumb -mhwdiv=thumb -x c -E -dM %s -o - | FileCheck --check-prefix=THUMBHWDIV-THUMB %s
+// RUN: %clang -target arm -mthumb -mhwdiv=thumb -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=THUMBHWDIV-THUMB %s
 // THUMBHWDIV-THUMB:#define __ARM_ARCH_EXT_IDIV__ 1
 
-// RUN: %clang -target arm -x c -E -dM %s -o - | FileCheck --check-prefix=ARM-FALSE %s
+// RUN: %clang -target arm -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=ARM-FALSE %s
 // ARM-FALSE-NOT:#define __ARM_ARCH_EXT_IDIV__
 
-// RUN: %clang -target arm -mthumb -x c -E -dM %s -o - | FileCheck --check-prefix=THUMB-FALSE %s
+// RUN: %clang -target arm -mthumb -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=THUMB-FALSE %s
 // THUMB-FALSE-NOT:#define __ARM_ARCH_EXT_IDIV__
 
-// RUN: %clang -target arm -mhwdiv=thumb -x c -E -dM %s -o - | FileCheck --check-prefix=THUMBHWDIV-ARM-FALSE %s
+// RUN: %clang -target arm -mhwdiv=thumb -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=THUMBHWDIV-ARM-FALSE %s
 // THUMBHWDIV-ARM-FALSE-NOT:#define __ARM_ARCH_EXT_IDIV__
 
-// RUN: %clang -target arm -mthumb -mhwdiv=arm -x c -E -dM %s -o - | FileCheck --check-prefix=ARMHWDIV-THUMB-FALSE %s
+// RUN: %clang -target arm -mthumb -mhwdiv=arm -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=ARMHWDIV-THUMB-FALSE %s
 // ARMHWDIV-THUMB-FALSE-NOT:#define __ARM_ARCH_EXT_IDIV__
 
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=armv8-none-none < /dev/null | FileCheck -check-prefix ARMv8 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=armv8-none-none < /dev/null | FileCheck -match-full-lines -check-prefix ARMv8 %s
 // ARMv8: #define __THUMB_INTERWORK__ 1
 // ARMv8-NOT: #define __thumb2__
 
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=armebv8-none-none < /dev/null | FileCheck -check-prefix ARMebv8 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=armebv8-none-none < /dev/null | FileCheck -match-full-lines -check-prefix ARMebv8 %s
 // ARMebv8: #define __THUMB_INTERWORK__ 1
 // ARMebv8-NOT: #define __thumb2__
 
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=thumbv8 < /dev/null | FileCheck -check-prefix Thumbv8 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=thumbv8 < /dev/null | FileCheck -match-full-lines -check-prefix Thumbv8 %s
 // Thumbv8: #define __THUMB_INTERWORK__ 1
-// Thumbv8: #define __thumb2__
+// Thumbv8: #define __thumb2__ 1
 
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=thumbebv8 < /dev/null | FileCheck -check-prefix Thumbebv8 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=thumbebv8 < /dev/null | FileCheck -match-full-lines -check-prefix Thumbebv8 %s
 // Thumbebv8: #define __THUMB_INTERWORK__ 1
-// Thumbebv8: #define __thumb2__
+// Thumbebv8: #define __thumb2__ 1
 
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=thumbv5 < /dev/null | FileCheck -check-prefix Thumbv5 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=thumbv5 < /dev/null | FileCheck -match-full-lines -check-prefix Thumbv5 %s
 // Thumbv5: #define __THUMB_INTERWORK__ 1
-// Thumbv5-NOT: #define __thumb2__
+// Thumbv5-NOT: #define __thumb2__ 1
 
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=thumbv6t2 < /dev/null | FileCheck -check-prefix Thumbv6t2 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=thumbv6t2 < /dev/null | FileCheck -match-full-lines -check-prefix Thumbv6t2 %s
 // Thumbv6t2: #define __THUMB_INTERWORK__ 1
-// Thumbv6t2: #define __thumb2__
+// Thumbv6t2: #define __thumb2__ 1
 
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=thumbv7 < /dev/null | FileCheck -check-prefix Thumbv7 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=thumbv7 < /dev/null | FileCheck -match-full-lines -check-prefix Thumbv7 %s
 // Thumbv7: #define __THUMB_INTERWORK__ 1
-// Thumbv7: #define __thumb2__
+// Thumbv7: #define __thumb2__ 1
 
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=thumbebv7 < /dev/null | FileCheck -check-prefix Thumbebv7 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=thumbebv7 < /dev/null | FileCheck -match-full-lines -check-prefix Thumbebv7 %s
 // Thumbebv7: #define __THUMB_INTERWORK__ 1
-// Thumbebv7: #define __thumb2__
+// Thumbebv7: #define __thumb2__ 1
 
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=i386-none-none < /dev/null | FileCheck -check-prefix I386 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=i386-none-none < /dev/null | FileCheck -match-full-lines -check-prefix I386 %s
 //
 // I386-NOT:#define _LP64
 // I386:#define __BIGGEST_ALIGNMENT__ 16
@@ -2259,12 +2275,12 @@
 // I386:#define __FLT_MIN_EXP__ (-125)
 // I386:#define __FLT_MIN__ 1.17549435e-38F
 // I386:#define __FLT_RADIX__ 2
-// I386:#define __INT16_C_SUFFIX__ {{$}}
+// I386:#define __INT16_C_SUFFIX__
 // I386:#define __INT16_FMTd__ "hd"
 // I386:#define __INT16_FMTi__ "hi"
 // I386:#define __INT16_MAX__ 32767
 // I386:#define __INT16_TYPE__ short
-// I386:#define __INT32_C_SUFFIX__ {{$}}
+// I386:#define __INT32_C_SUFFIX__
 // I386:#define __INT32_FMTd__ "d"
 // I386:#define __INT32_FMTi__ "i"
 // I386:#define __INT32_MAX__ 2147483647
@@ -2274,7 +2290,7 @@
 // I386:#define __INT64_FMTi__ "lli"
 // I386:#define __INT64_MAX__ 9223372036854775807LL
 // I386:#define __INT64_TYPE__ long long int
-// I386:#define __INT8_C_SUFFIX__ {{$}}
+// I386:#define __INT8_C_SUFFIX__
 // I386:#define __INT8_FMTd__ "hhd"
 // I386:#define __INT8_FMTi__ "hhi"
 // I386:#define __INT8_MAX__ 127
@@ -2344,7 +2360,7 @@
 // I386:#define __POINTER_WIDTH__ 32
 // I386:#define __PTRDIFF_TYPE__ int
 // I386:#define __PTRDIFF_WIDTH__ 32
-// I386:#define __REGISTER_PREFIX__ 
+// I386:#define __REGISTER_PREFIX__
 // I386:#define __SCHAR_MAX__ 127
 // I386:#define __SHRT_MAX__ 32767
 // I386:#define __SIG_ATOMIC_MAX__ 2147483647
@@ -2364,7 +2380,7 @@
 // I386:#define __SIZE_MAX__ 4294967295U
 // I386:#define __SIZE_TYPE__ unsigned int
 // I386:#define __SIZE_WIDTH__ 32
-// I386:#define __UINT16_C_SUFFIX__ {{$}}
+// I386:#define __UINT16_C_SUFFIX__
 // I386:#define __UINT16_MAX__ 65535
 // I386:#define __UINT16_TYPE__ unsigned short
 // I386:#define __UINT32_C_SUFFIX__ U
@@ -2373,7 +2389,7 @@
 // I386:#define __UINT64_C_SUFFIX__ ULL
 // I386:#define __UINT64_MAX__ 18446744073709551615ULL
 // I386:#define __UINT64_TYPE__ long long unsigned int
-// I386:#define __UINT8_C_SUFFIX__ {{$}}
+// I386:#define __UINT8_C_SUFFIX__
 // I386:#define __UINT8_MAX__ 255
 // I386:#define __UINT8_TYPE__ unsigned char
 // I386:#define __UINTMAX_C_SUFFIX__ ULL
@@ -2399,7 +2415,7 @@
 // I386:#define __UINT_LEAST64_TYPE__ long long unsigned int
 // I386:#define __UINT_LEAST8_MAX__ 255
 // I386:#define __UINT_LEAST8_TYPE__ unsigned char
-// I386:#define __USER_LABEL_PREFIX__ _
+// I386:#define __USER_LABEL_PREFIX__
 // I386:#define __WCHAR_MAX__ 2147483647
 // I386:#define __WCHAR_TYPE__ int
 // I386:#define __WCHAR_WIDTH__ 32
@@ -2409,7 +2425,7 @@
 // I386:#define __i386__ 1
 // I386:#define i386 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=i386-pc-linux-gnu -target-cpu pentium4 < /dev/null | FileCheck -check-prefix I386-LINUX %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=i386-pc-linux-gnu -target-cpu pentium4 < /dev/null | FileCheck -match-full-lines -check-prefix I386-LINUX %s
 //
 // I386-LINUX-NOT:#define _LP64
 // I386-LINUX:#define __BIGGEST_ALIGNMENT__ 16
@@ -2446,12 +2462,12 @@
 // I386-LINUX:#define __FLT_MIN_EXP__ (-125)
 // I386-LINUX:#define __FLT_MIN__ 1.17549435e-38F
 // I386-LINUX:#define __FLT_RADIX__ 2
-// I386-LINUX:#define __INT16_C_SUFFIX__ {{$}}
+// I386-LINUX:#define __INT16_C_SUFFIX__
 // I386-LINUX:#define __INT16_FMTd__ "hd"
 // I386-LINUX:#define __INT16_FMTi__ "hi"
 // I386-LINUX:#define __INT16_MAX__ 32767
 // I386-LINUX:#define __INT16_TYPE__ short
-// I386-LINUX:#define __INT32_C_SUFFIX__ {{$}}
+// I386-LINUX:#define __INT32_C_SUFFIX__
 // I386-LINUX:#define __INT32_FMTd__ "d"
 // I386-LINUX:#define __INT32_FMTi__ "i"
 // I386-LINUX:#define __INT32_MAX__ 2147483647
@@ -2461,7 +2477,7 @@
 // I386-LINUX:#define __INT64_FMTi__ "lli"
 // I386-LINUX:#define __INT64_MAX__ 9223372036854775807LL
 // I386-LINUX:#define __INT64_TYPE__ long long int
-// I386-LINUX:#define __INT8_C_SUFFIX__ {{$}}
+// I386-LINUX:#define __INT8_C_SUFFIX__
 // I386-LINUX:#define __INT8_FMTd__ "hhd"
 // I386-LINUX:#define __INT8_FMTi__ "hhi"
 // I386-LINUX:#define __INT8_MAX__ 127
@@ -2531,7 +2547,7 @@
 // I386-LINUX:#define __POINTER_WIDTH__ 32
 // I386-LINUX:#define __PTRDIFF_TYPE__ int
 // I386-LINUX:#define __PTRDIFF_WIDTH__ 32
-// I386-LINUX:#define __REGISTER_PREFIX__ 
+// I386-LINUX:#define __REGISTER_PREFIX__
 // I386-LINUX:#define __SCHAR_MAX__ 127
 // I386-LINUX:#define __SHRT_MAX__ 32767
 // I386-LINUX:#define __SIG_ATOMIC_MAX__ 2147483647
@@ -2551,7 +2567,7 @@
 // I386-LINUX:#define __SIZE_MAX__ 4294967295U
 // I386-LINUX:#define __SIZE_TYPE__ unsigned int
 // I386-LINUX:#define __SIZE_WIDTH__ 32
-// I386-LINUX:#define __UINT16_C_SUFFIX__ {{$}}
+// I386-LINUX:#define __UINT16_C_SUFFIX__
 // I386-LINUX:#define __UINT16_MAX__ 65535
 // I386-LINUX:#define __UINT16_TYPE__ unsigned short
 // I386-LINUX:#define __UINT32_C_SUFFIX__ U
@@ -2560,7 +2576,7 @@
 // I386-LINUX:#define __UINT64_C_SUFFIX__ ULL
 // I386-LINUX:#define __UINT64_MAX__ 18446744073709551615ULL
 // I386-LINUX:#define __UINT64_TYPE__ long long unsigned int
-// I386-LINUX:#define __UINT8_C_SUFFIX__ {{$}}
+// I386-LINUX:#define __UINT8_C_SUFFIX__
 // I386-LINUX:#define __UINT8_MAX__ 255
 // I386-LINUX:#define __UINT8_TYPE__ unsigned char
 // I386-LINUX:#define __UINTMAX_C_SUFFIX__ ULL
@@ -2596,7 +2612,7 @@
 // I386-LINUX:#define __i386__ 1
 // I386-LINUX:#define i386 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=i386-netbsd < /dev/null | FileCheck -check-prefix I386-NETBSD %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=i386-netbsd < /dev/null | FileCheck -match-full-lines -check-prefix I386-NETBSD %s
 //
 // I386-NETBSD-NOT:#define _LP64
 // I386-NETBSD:#define __BIGGEST_ALIGNMENT__ 16
@@ -2633,12 +2649,12 @@
 // I386-NETBSD:#define __FLT_MIN_EXP__ (-125)
 // I386-NETBSD:#define __FLT_MIN__ 1.17549435e-38F
 // I386-NETBSD:#define __FLT_RADIX__ 2
-// I386-NETBSD:#define __INT16_C_SUFFIX__ {{$}}
+// I386-NETBSD:#define __INT16_C_SUFFIX__
 // I386-NETBSD:#define __INT16_FMTd__ "hd"
 // I386-NETBSD:#define __INT16_FMTi__ "hi"
 // I386-NETBSD:#define __INT16_MAX__ 32767
 // I386-NETBSD:#define __INT16_TYPE__ short
-// I386-NETBSD:#define __INT32_C_SUFFIX__ {{$}}
+// I386-NETBSD:#define __INT32_C_SUFFIX__
 // I386-NETBSD:#define __INT32_FMTd__ "d"
 // I386-NETBSD:#define __INT32_FMTi__ "i"
 // I386-NETBSD:#define __INT32_MAX__ 2147483647
@@ -2648,7 +2664,7 @@
 // I386-NETBSD:#define __INT64_FMTi__ "lli"
 // I386-NETBSD:#define __INT64_MAX__ 9223372036854775807LL
 // I386-NETBSD:#define __INT64_TYPE__ long long int
-// I386-NETBSD:#define __INT8_C_SUFFIX__ {{$}}
+// I386-NETBSD:#define __INT8_C_SUFFIX__
 // I386-NETBSD:#define __INT8_FMTd__ "hhd"
 // I386-NETBSD:#define __INT8_FMTi__ "hhi"
 // I386-NETBSD:#define __INT8_MAX__ 127
@@ -2718,7 +2734,7 @@
 // I386-NETBSD:#define __POINTER_WIDTH__ 32
 // I386-NETBSD:#define __PTRDIFF_TYPE__ int
 // I386-NETBSD:#define __PTRDIFF_WIDTH__ 32
-// I386-NETBSD:#define __REGISTER_PREFIX__ 
+// I386-NETBSD:#define __REGISTER_PREFIX__
 // I386-NETBSD:#define __SCHAR_MAX__ 127
 // I386-NETBSD:#define __SHRT_MAX__ 32767
 // I386-NETBSD:#define __SIG_ATOMIC_MAX__ 2147483647
@@ -2738,7 +2754,7 @@
 // I386-NETBSD:#define __SIZE_MAX__ 4294967295U
 // I386-NETBSD:#define __SIZE_TYPE__ unsigned int
 // I386-NETBSD:#define __SIZE_WIDTH__ 32
-// I386-NETBSD:#define __UINT16_C_SUFFIX__ {{$}}
+// I386-NETBSD:#define __UINT16_C_SUFFIX__
 // I386-NETBSD:#define __UINT16_MAX__ 65535
 // I386-NETBSD:#define __UINT16_TYPE__ unsigned short
 // I386-NETBSD:#define __UINT32_C_SUFFIX__ U
@@ -2747,7 +2763,7 @@
 // I386-NETBSD:#define __UINT64_C_SUFFIX__ ULL
 // I386-NETBSD:#define __UINT64_MAX__ 18446744073709551615ULL
 // I386-NETBSD:#define __UINT64_TYPE__ long long unsigned int
-// I386-NETBSD:#define __UINT8_C_SUFFIX__ {{$}}
+// I386-NETBSD:#define __UINT8_C_SUFFIX__
 // I386-NETBSD:#define __UINT8_MAX__ 255
 // I386-NETBSD:#define __UINT8_TYPE__ unsigned char
 // I386-NETBSD:#define __UINTMAX_C_SUFFIX__ ULL
@@ -2783,21 +2799,21 @@
 // I386-NETBSD:#define __i386__ 1
 // I386-NETBSD:#define i386 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=i386-netbsd -target-feature +sse2 < /dev/null | FileCheck -check-prefix I386-NETBSD-SSE %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=i386-netbsd -target-feature +sse2 < /dev/null | FileCheck -match-full-lines -check-prefix I386-NETBSD-SSE %s
 // I386-NETBSD-SSE:#define __FLT_EVAL_METHOD__ 0
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=i386-netbsd6  < /dev/null | FileCheck -check-prefix I386-NETBSD6 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=i386-netbsd6  < /dev/null | FileCheck -match-full-lines -check-prefix I386-NETBSD6 %s
 // I386-NETBSD6:#define __FLT_EVAL_METHOD__ 1
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=i386-netbsd6 -target-feature +sse2 < /dev/null | FileCheck -check-prefix I386-NETBSD6-SSE %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=i386-netbsd6 -target-feature +sse2 < /dev/null | FileCheck -match-full-lines -check-prefix I386-NETBSD6-SSE %s
 // I386-NETBSD6-SSE:#define __FLT_EVAL_METHOD__ 1
 
-// RUN: %clang_cc1 -E -dM -triple=i686-pc-mingw32 < /dev/null | FileCheck -check-prefix I386-DECLSPEC %s
-// RUN: %clang_cc1 -E -dM -fms-extensions -triple=i686-pc-mingw32 < /dev/null | FileCheck -check-prefix I386-DECLSPEC %s
-// RUN: %clang_cc1 -E -dM -triple=i686-unknown-cygwin < /dev/null | FileCheck -check-prefix I386-DECLSPEC %s
-// RUN: %clang_cc1 -E -dM -fms-extensions -triple=i686-unknown-cygwin < /dev/null | FileCheck -check-prefix I386-DECLSPEC %s
-// I386-DECLSPEC: #define __declspec
+// RUN: %clang_cc1 -E -dM -triple=i686-pc-mingw32 < /dev/null | FileCheck -match-full-lines -check-prefix I386-DECLSPEC %s
+// RUN: %clang_cc1 -E -dM -fms-extensions -triple=i686-pc-mingw32 < /dev/null | FileCheck -match-full-lines -check-prefix I386-DECLSPEC %s
+// RUN: %clang_cc1 -E -dM -triple=i686-unknown-cygwin < /dev/null | FileCheck -match-full-lines -check-prefix I386-DECLSPEC %s
+// RUN: %clang_cc1 -E -dM -fms-extensions -triple=i686-unknown-cygwin < /dev/null | FileCheck -match-full-lines -check-prefix I386-DECLSPEC %s
+// I386-DECLSPEC: #define __declspec{{.*}}
 
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=mips-none-none < /dev/null | FileCheck -check-prefix MIPS32BE %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=mips-none-none < /dev/null | FileCheck -match-full-lines -check-prefix MIPS32BE %s
 //
 // MIPS32BE:#define MIPSEB 1
 // MIPS32BE:#define _ABIO32 1
@@ -2846,12 +2862,12 @@
 // MIPS32BE:#define __FLT_MIN_EXP__ (-125)
 // MIPS32BE:#define __FLT_MIN__ 1.17549435e-38F
 // MIPS32BE:#define __FLT_RADIX__ 2
-// MIPS32BE:#define __INT16_C_SUFFIX__ {{$}}
+// MIPS32BE:#define __INT16_C_SUFFIX__
 // MIPS32BE:#define __INT16_FMTd__ "hd"
 // MIPS32BE:#define __INT16_FMTi__ "hi"
 // MIPS32BE:#define __INT16_MAX__ 32767
 // MIPS32BE:#define __INT16_TYPE__ short
-// MIPS32BE:#define __INT32_C_SUFFIX__ {{$}}
+// MIPS32BE:#define __INT32_C_SUFFIX__
 // MIPS32BE:#define __INT32_FMTd__ "d"
 // MIPS32BE:#define __INT32_FMTi__ "i"
 // MIPS32BE:#define __INT32_MAX__ 2147483647
@@ -2861,7 +2877,7 @@
 // MIPS32BE:#define __INT64_FMTi__ "lli"
 // MIPS32BE:#define __INT64_MAX__ 9223372036854775807LL
 // MIPS32BE:#define __INT64_TYPE__ long long int
-// MIPS32BE:#define __INT8_C_SUFFIX__ {{$}}
+// MIPS32BE:#define __INT8_C_SUFFIX__
 // MIPS32BE:#define __INT8_FMTd__ "hhd"
 // MIPS32BE:#define __INT8_FMTi__ "hhi"
 // MIPS32BE:#define __INT8_MAX__ 127
@@ -2932,7 +2948,7 @@
 // MIPS32BE:#define __PRAGMA_REDEFINE_EXTNAME 1
 // MIPS32BE:#define __PTRDIFF_TYPE__ int
 // MIPS32BE:#define __PTRDIFF_WIDTH__ 32
-// MIPS32BE:#define __REGISTER_PREFIX__ 
+// MIPS32BE:#define __REGISTER_PREFIX__
 // MIPS32BE:#define __SCHAR_MAX__ 127
 // MIPS32BE:#define __SHRT_MAX__ 32767
 // MIPS32BE:#define __SIG_ATOMIC_MAX__ 2147483647
@@ -2955,7 +2971,7 @@
 // MIPS32BE:#define __STDC_HOSTED__ 0
 // MIPS32BE:#define __STDC_VERSION__ 201112L
 // MIPS32BE:#define __STDC__ 1
-// MIPS32BE:#define __UINT16_C_SUFFIX__ {{$}}
+// MIPS32BE:#define __UINT16_C_SUFFIX__
 // MIPS32BE:#define __UINT16_MAX__ 65535
 // MIPS32BE:#define __UINT16_TYPE__ unsigned short
 // MIPS32BE:#define __UINT32_C_SUFFIX__ U
@@ -2964,14 +2980,14 @@
 // MIPS32BE:#define __UINT64_C_SUFFIX__ ULL
 // MIPS32BE:#define __UINT64_MAX__ 18446744073709551615ULL
 // MIPS32BE:#define __UINT64_TYPE__ long long unsigned int
-// MIPS32BE:#define __UINT8_C_SUFFIX__ {{$}}
+// MIPS32BE:#define __UINT8_C_SUFFIX__
 // MIPS32BE:#define __UINT8_MAX__ 255
 // MIPS32BE:#define __UINT8_TYPE__ unsigned char
 // MIPS32BE:#define __UINTMAX_C_SUFFIX__ ULL
 // MIPS32BE:#define __UINTMAX_MAX__ 18446744073709551615ULL
 // MIPS32BE:#define __UINTMAX_TYPE__ long long unsigned int
 // MIPS32BE:#define __UINTMAX_WIDTH__ 64
-// MIPS32BE:#define __UINTPTR_MAX__ 4294967295U
+// MIPS32BE:#define __UINTPTR_MAX__ 4294967295UL
 // MIPS32BE:#define __UINTPTR_TYPE__ long unsigned int
 // MIPS32BE:#define __UINTPTR_WIDTH__ 32
 // MIPS32BE:#define __UINT_FAST16_MAX__ 65535
@@ -2990,7 +3006,7 @@
 // MIPS32BE:#define __UINT_LEAST64_TYPE__ long long unsigned int
 // MIPS32BE:#define __UINT_LEAST8_MAX__ 255
 // MIPS32BE:#define __UINT_LEAST8_TYPE__ unsigned char
-// MIPS32BE:#define __USER_LABEL_PREFIX__ _
+// MIPS32BE:#define __USER_LABEL_PREFIX__
 // MIPS32BE:#define __WCHAR_MAX__ 2147483647
 // MIPS32BE:#define __WCHAR_TYPE__ int
 // MIPS32BE:#define __WCHAR_WIDTH__ 32
@@ -3006,7 +3022,7 @@
 // MIPS32BE:#define _mips 1
 // MIPS32BE:#define mips 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=mipsel-none-none < /dev/null | FileCheck -check-prefix MIPS32EL %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=mipsel-none-none < /dev/null | FileCheck -match-full-lines -check-prefix MIPS32EL %s
 //
 // MIPS32EL:#define MIPSEL 1
 // MIPS32EL:#define _ABIO32 1
@@ -3054,12 +3070,12 @@
 // MIPS32EL:#define __FLT_MIN_EXP__ (-125)
 // MIPS32EL:#define __FLT_MIN__ 1.17549435e-38F
 // MIPS32EL:#define __FLT_RADIX__ 2
-// MIPS32EL:#define __INT16_C_SUFFIX__ {{$}}
+// MIPS32EL:#define __INT16_C_SUFFIX__
 // MIPS32EL:#define __INT16_FMTd__ "hd"
 // MIPS32EL:#define __INT16_FMTi__ "hi"
 // MIPS32EL:#define __INT16_MAX__ 32767
 // MIPS32EL:#define __INT16_TYPE__ short
-// MIPS32EL:#define __INT32_C_SUFFIX__ {{$}}
+// MIPS32EL:#define __INT32_C_SUFFIX__
 // MIPS32EL:#define __INT32_FMTd__ "d"
 // MIPS32EL:#define __INT32_FMTi__ "i"
 // MIPS32EL:#define __INT32_MAX__ 2147483647
@@ -3069,7 +3085,7 @@
 // MIPS32EL:#define __INT64_FMTi__ "lli"
 // MIPS32EL:#define __INT64_MAX__ 9223372036854775807LL
 // MIPS32EL:#define __INT64_TYPE__ long long int
-// MIPS32EL:#define __INT8_C_SUFFIX__ {{$}}
+// MIPS32EL:#define __INT8_C_SUFFIX__
 // MIPS32EL:#define __INT8_FMTd__ "hhd"
 // MIPS32EL:#define __INT8_FMTi__ "hhi"
 // MIPS32EL:#define __INT8_MAX__ 127
@@ -3141,7 +3157,7 @@
 // MIPS32EL:#define __PRAGMA_REDEFINE_EXTNAME 1
 // MIPS32EL:#define __PTRDIFF_TYPE__ int
 // MIPS32EL:#define __PTRDIFF_WIDTH__ 32
-// MIPS32EL:#define __REGISTER_PREFIX__ 
+// MIPS32EL:#define __REGISTER_PREFIX__
 // MIPS32EL:#define __SCHAR_MAX__ 127
 // MIPS32EL:#define __SHRT_MAX__ 32767
 // MIPS32EL:#define __SIG_ATOMIC_MAX__ 2147483647
@@ -3161,7 +3177,7 @@
 // MIPS32EL:#define __SIZE_MAX__ 4294967295U
 // MIPS32EL:#define __SIZE_TYPE__ unsigned int
 // MIPS32EL:#define __SIZE_WIDTH__ 32
-// MIPS32EL:#define __UINT16_C_SUFFIX__ {{$}}
+// MIPS32EL:#define __UINT16_C_SUFFIX__
 // MIPS32EL:#define __UINT16_MAX__ 65535
 // MIPS32EL:#define __UINT16_TYPE__ unsigned short
 // MIPS32EL:#define __UINT32_C_SUFFIX__ U
@@ -3170,14 +3186,14 @@
 // MIPS32EL:#define __UINT64_C_SUFFIX__ ULL
 // MIPS32EL:#define __UINT64_MAX__ 18446744073709551615ULL
 // MIPS32EL:#define __UINT64_TYPE__ long long unsigned int
-// MIPS32EL:#define __UINT8_C_SUFFIX__ {{$}}
+// MIPS32EL:#define __UINT8_C_SUFFIX__
 // MIPS32EL:#define __UINT8_MAX__ 255
 // MIPS32EL:#define __UINT8_TYPE__ unsigned char
 // MIPS32EL:#define __UINTMAX_C_SUFFIX__ ULL
 // MIPS32EL:#define __UINTMAX_MAX__ 18446744073709551615ULL
 // MIPS32EL:#define __UINTMAX_TYPE__ long long unsigned int
 // MIPS32EL:#define __UINTMAX_WIDTH__ 64
-// MIPS32EL:#define __UINTPTR_MAX__ 4294967295U
+// MIPS32EL:#define __UINTPTR_MAX__ 4294967295UL
 // MIPS32EL:#define __UINTPTR_TYPE__ long unsigned int
 // MIPS32EL:#define __UINTPTR_WIDTH__ 32
 // MIPS32EL:#define __UINT_FAST16_MAX__ 65535
@@ -3196,7 +3212,7 @@
 // MIPS32EL:#define __UINT_LEAST64_TYPE__ long long unsigned int
 // MIPS32EL:#define __UINT_LEAST8_MAX__ 255
 // MIPS32EL:#define __UINT_LEAST8_TYPE__ unsigned char
-// MIPS32EL:#define __USER_LABEL_PREFIX__ _
+// MIPS32EL:#define __USER_LABEL_PREFIX__
 // MIPS32EL:#define __WCHAR_MAX__ 2147483647
 // MIPS32EL:#define __WCHAR_TYPE__ int
 // MIPS32EL:#define __WCHAR_WIDTH__ 32
@@ -3214,7 +3230,7 @@
 //
 // RUN: %clang_cc1 -E -dM -ffreestanding \
 // RUN:            -triple=mips64-none-none -target-abi n32 < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPSN32BE %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPSN32BE %s
 //
 // MIPSN32BE: #define MIPSEB 1
 // MIPSN32BE: #define _ABIN32 2
@@ -3286,7 +3302,6 @@
 // MIPSN32BE: #define __GNUC_STDC_INLINE__ 1
 // MIPSN32BE: #define __GNUC__ 4
 // MIPSN32BE: #define __GXX_ABI_VERSION 1002
-// MIPSN32BE: #define __GXX_RTTI 1
 // MIPSN32BE: #define __ILP32__ 1
 // MIPSN32BE: #define __INT16_C_SUFFIX__
 // MIPSN32BE: #define __INT16_FMTd__ "hd"
@@ -3500,7 +3515,7 @@
 // MIPSN32BE: #define __UINT_LEAST8_FMTx__ "hhx"
 // MIPSN32BE: #define __UINT_LEAST8_MAX__ 255
 // MIPSN32BE: #define __UINT_LEAST8_TYPE__ unsigned char
-// MIPSN32BE: #define __USER_LABEL_PREFIX__ _
+// MIPSN32BE: #define __USER_LABEL_PREFIX__
 // MIPSN32BE: #define __WCHAR_MAX__ 2147483647
 // MIPSN32BE: #define __WCHAR_TYPE__ int
 // MIPSN32BE: #define __WCHAR_WIDTH__ 32
@@ -3521,7 +3536,7 @@
 //
 // RUN: %clang_cc1 -E -dM -ffreestanding \
 // RUN:            -triple=mips64el-none-none -target-abi n32 < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPSN32EL %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPSN32EL %s
 //
 // MIPSN32EL: #define MIPSEL 1
 // MIPSN32EL: #define _ABIN32 2
@@ -3592,7 +3607,6 @@
 // MIPSN32EL: #define __GNUC_STDC_INLINE__ 1
 // MIPSN32EL: #define __GNUC__ 4
 // MIPSN32EL: #define __GXX_ABI_VERSION 1002
-// MIPSN32EL: #define __GXX_RTTI 1
 // MIPSN32EL: #define __ILP32__ 1
 // MIPSN32EL: #define __INT16_C_SUFFIX__
 // MIPSN32EL: #define __INT16_FMTd__ "hd"
@@ -3807,7 +3821,7 @@
 // MIPSN32EL: #define __UINT_LEAST8_FMTx__ "hhx"
 // MIPSN32EL: #define __UINT_LEAST8_MAX__ 255
 // MIPSN32EL: #define __UINT_LEAST8_TYPE__ unsigned char
-// MIPSN32EL: #define __USER_LABEL_PREFIX__ _
+// MIPSN32EL: #define __USER_LABEL_PREFIX__
 // MIPSN32EL: #define __WCHAR_MAX__ 2147483647
 // MIPSN32EL: #define __WCHAR_TYPE__ int
 // MIPSN32EL: #define __WCHAR_WIDTH__ 32
@@ -3826,7 +3840,7 @@
 // MIPSN32EL: #define _mips 1
 // MIPSN32EL: #define mips 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=mips64-none-none < /dev/null | FileCheck -check-prefix MIPS64BE %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=mips64-none-none < /dev/null | FileCheck -match-full-lines -check-prefix MIPS64BE %s
 //
 // MIPS64BE:#define MIPSEB 1
 // MIPS64BE:#define _ABI64 3
@@ -3875,12 +3889,12 @@
 // MIPS64BE:#define __FLT_MIN_EXP__ (-125)
 // MIPS64BE:#define __FLT_MIN__ 1.17549435e-38F
 // MIPS64BE:#define __FLT_RADIX__ 2
-// MIPS64BE:#define __INT16_C_SUFFIX__ {{$}}
+// MIPS64BE:#define __INT16_C_SUFFIX__
 // MIPS64BE:#define __INT16_FMTd__ "hd"
 // MIPS64BE:#define __INT16_FMTi__ "hi"
 // MIPS64BE:#define __INT16_MAX__ 32767
 // MIPS64BE:#define __INT16_TYPE__ short
-// MIPS64BE:#define __INT32_C_SUFFIX__ {{$}}
+// MIPS64BE:#define __INT32_C_SUFFIX__
 // MIPS64BE:#define __INT32_FMTd__ "d"
 // MIPS64BE:#define __INT32_FMTi__ "i"
 // MIPS64BE:#define __INT32_MAX__ 2147483647
@@ -3890,7 +3904,7 @@
 // MIPS64BE:#define __INT64_FMTi__ "li"
 // MIPS64BE:#define __INT64_MAX__ 9223372036854775807L
 // MIPS64BE:#define __INT64_TYPE__ long int
-// MIPS64BE:#define __INT8_C_SUFFIX__ {{$}}
+// MIPS64BE:#define __INT8_C_SUFFIX__
 // MIPS64BE:#define __INT8_FMTd__ "hhd"
 // MIPS64BE:#define __INT8_FMTi__ "hhi"
 // MIPS64BE:#define __INT8_MAX__ 127
@@ -3961,7 +3975,7 @@
 // MIPS64BE:#define __PRAGMA_REDEFINE_EXTNAME 1
 // MIPS64BE:#define __PTRDIFF_TYPE__ long int
 // MIPS64BE:#define __PTRDIFF_WIDTH__ 64
-// MIPS64BE:#define __REGISTER_PREFIX__ 
+// MIPS64BE:#define __REGISTER_PREFIX__
 // MIPS64BE:#define __SCHAR_MAX__ 127
 // MIPS64BE:#define __SHRT_MAX__ 32767
 // MIPS64BE:#define __SIG_ATOMIC_MAX__ 2147483647
@@ -3982,7 +3996,7 @@
 // MIPS64BE:#define __SIZE_MAX__ 18446744073709551615UL
 // MIPS64BE:#define __SIZE_TYPE__ long unsigned int
 // MIPS64BE:#define __SIZE_WIDTH__ 64
-// MIPS64BE:#define __UINT16_C_SUFFIX__ {{$}}
+// MIPS64BE:#define __UINT16_C_SUFFIX__
 // MIPS64BE:#define __UINT16_MAX__ 65535
 // MIPS64BE:#define __UINT16_TYPE__ unsigned short
 // MIPS64BE:#define __UINT32_C_SUFFIX__ U
@@ -3991,7 +4005,7 @@
 // MIPS64BE:#define __UINT64_C_SUFFIX__ UL
 // MIPS64BE:#define __UINT64_MAX__ 18446744073709551615UL
 // MIPS64BE:#define __UINT64_TYPE__ long unsigned int
-// MIPS64BE:#define __UINT8_C_SUFFIX__ {{$}}
+// MIPS64BE:#define __UINT8_C_SUFFIX__
 // MIPS64BE:#define __UINT8_MAX__ 255
 // MIPS64BE:#define __UINT8_TYPE__ unsigned char
 // MIPS64BE:#define __UINTMAX_C_SUFFIX__ UL
@@ -4017,7 +4031,7 @@
 // MIPS64BE:#define __UINT_LEAST64_TYPE__ long unsigned int
 // MIPS64BE:#define __UINT_LEAST8_MAX__ 255
 // MIPS64BE:#define __UINT_LEAST8_TYPE__ unsigned char
-// MIPS64BE:#define __USER_LABEL_PREFIX__ _
+// MIPS64BE:#define __USER_LABEL_PREFIX__
 // MIPS64BE:#define __WCHAR_MAX__ 2147483647
 // MIPS64BE:#define __WCHAR_TYPE__ int
 // MIPS64BE:#define __WCHAR_WIDTH__ 32
@@ -4035,7 +4049,7 @@
 // MIPS64BE:#define _mips 1
 // MIPS64BE:#define mips 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=mips64el-none-none < /dev/null | FileCheck -check-prefix MIPS64EL %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=mips64el-none-none < /dev/null | FileCheck -match-full-lines -check-prefix MIPS64EL %s
 //
 // MIPS64EL:#define MIPSEL 1
 // MIPS64EL:#define _ABI64 3
@@ -4083,12 +4097,12 @@
 // MIPS64EL:#define __FLT_MIN_EXP__ (-125)
 // MIPS64EL:#define __FLT_MIN__ 1.17549435e-38F
 // MIPS64EL:#define __FLT_RADIX__ 2
-// MIPS64EL:#define __INT16_C_SUFFIX__ {{$}}
+// MIPS64EL:#define __INT16_C_SUFFIX__
 // MIPS64EL:#define __INT16_FMTd__ "hd"
 // MIPS64EL:#define __INT16_FMTi__ "hi"
 // MIPS64EL:#define __INT16_MAX__ 32767
 // MIPS64EL:#define __INT16_TYPE__ short
-// MIPS64EL:#define __INT32_C_SUFFIX__ {{$}}
+// MIPS64EL:#define __INT32_C_SUFFIX__
 // MIPS64EL:#define __INT32_FMTd__ "d"
 // MIPS64EL:#define __INT32_FMTi__ "i"
 // MIPS64EL:#define __INT32_MAX__ 2147483647
@@ -4098,7 +4112,7 @@
 // MIPS64EL:#define __INT64_FMTi__ "li"
 // MIPS64EL:#define __INT64_MAX__ 9223372036854775807L
 // MIPS64EL:#define __INT64_TYPE__ long int
-// MIPS64EL:#define __INT8_C_SUFFIX__ {{$}}
+// MIPS64EL:#define __INT8_C_SUFFIX__
 // MIPS64EL:#define __INT8_FMTd__ "hhd"
 // MIPS64EL:#define __INT8_FMTi__ "hhi"
 // MIPS64EL:#define __INT8_MAX__ 127
@@ -4170,7 +4184,7 @@
 // MIPS64EL:#define __PRAGMA_REDEFINE_EXTNAME 1
 // MIPS64EL:#define __PTRDIFF_TYPE__ long int
 // MIPS64EL:#define __PTRDIFF_WIDTH__ 64
-// MIPS64EL:#define __REGISTER_PREFIX__ 
+// MIPS64EL:#define __REGISTER_PREFIX__
 // MIPS64EL:#define __SCHAR_MAX__ 127
 // MIPS64EL:#define __SHRT_MAX__ 32767
 // MIPS64EL:#define __SIG_ATOMIC_MAX__ 2147483647
@@ -4191,7 +4205,7 @@
 // MIPS64EL:#define __SIZE_MAX__ 18446744073709551615UL
 // MIPS64EL:#define __SIZE_TYPE__ long unsigned int
 // MIPS64EL:#define __SIZE_WIDTH__ 64
-// MIPS64EL:#define __UINT16_C_SUFFIX__ {{$}}
+// MIPS64EL:#define __UINT16_C_SUFFIX__
 // MIPS64EL:#define __UINT16_MAX__ 65535
 // MIPS64EL:#define __UINT16_TYPE__ unsigned short
 // MIPS64EL:#define __UINT32_C_SUFFIX__ U
@@ -4200,7 +4214,7 @@
 // MIPS64EL:#define __UINT64_C_SUFFIX__ UL
 // MIPS64EL:#define __UINT64_MAX__ 18446744073709551615UL
 // MIPS64EL:#define __UINT64_TYPE__ long unsigned int
-// MIPS64EL:#define __UINT8_C_SUFFIX__ {{$}}
+// MIPS64EL:#define __UINT8_C_SUFFIX__
 // MIPS64EL:#define __UINT8_MAX__ 255
 // MIPS64EL:#define __UINT8_TYPE__ unsigned char
 // MIPS64EL:#define __UINTMAX_C_SUFFIX__ UL
@@ -4226,7 +4240,7 @@
 // MIPS64EL:#define __UINT_LEAST64_TYPE__ long unsigned int
 // MIPS64EL:#define __UINT_LEAST8_MAX__ 255
 // MIPS64EL:#define __UINT_LEAST8_TYPE__ unsigned char
-// MIPS64EL:#define __USER_LABEL_PREFIX__ _
+// MIPS64EL:#define __USER_LABEL_PREFIX__
 // MIPS64EL:#define __WCHAR_MAX__ 2147483647
 // MIPS64EL:#define __WCHAR_TYPE__ int
 // MIPS64EL:#define __WCHAR_WIDTH__ 32
@@ -4248,7 +4262,7 @@
 //
 // RUN: %clang_cc1 -E -dM -ffreestanding -triple=mips-none-none \
 // RUN:            < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPS-ARCH-DEF32 %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPS-ARCH-DEF32 %s
 //
 // MIPS-ARCH-DEF32:#define _MIPS_ARCH "mips32r2"
 // MIPS-ARCH-DEF32:#define _MIPS_ARCH_MIPS32R2 1
@@ -4257,7 +4271,7 @@
 //
 // RUN: %clang_cc1 -E -dM -ffreestanding -triple=mips-none-nones \
 // RUN:            -target-cpu mips32 < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPS-ARCH-32 %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPS-ARCH-32 %s
 //
 // MIPS-ARCH-32:#define _MIPS_ARCH "mips32"
 // MIPS-ARCH-32:#define _MIPS_ARCH_MIPS32 1
@@ -4266,7 +4280,7 @@
 //
 // RUN: %clang_cc1 -E -dM -ffreestanding -triple=mips-none-none \
 // RUN:            -target-cpu mips32r2 < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPS-ARCH-32R2 %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPS-ARCH-32R2 %s
 //
 // MIPS-ARCH-32R2:#define _MIPS_ARCH "mips32r2"
 // MIPS-ARCH-32R2:#define _MIPS_ARCH_MIPS32R2 1
@@ -4275,7 +4289,7 @@
 //
 // RUN: %clang_cc1 -E -dM -ffreestanding -triple=mips-none-none \
 // RUN:            -target-cpu mips32r3 < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPS-ARCH-32R3 %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPS-ARCH-32R3 %s
 //
 // MIPS-ARCH-32R3:#define _MIPS_ARCH "mips32r3"
 // MIPS-ARCH-32R3:#define _MIPS_ARCH_MIPS32R3 1
@@ -4284,7 +4298,7 @@
 //
 // RUN: %clang_cc1 -E -dM -ffreestanding -triple=mips-none-none \
 // RUN:            -target-cpu mips32r5 < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPS-ARCH-32R5 %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPS-ARCH-32R5 %s
 //
 // MIPS-ARCH-32R5:#define _MIPS_ARCH "mips32r5"
 // MIPS-ARCH-32R5:#define _MIPS_ARCH_MIPS32R5 1
@@ -4293,7 +4307,7 @@
 //
 // RUN: %clang_cc1 -E -dM -ffreestanding -triple=mips-none-none \
 // RUN:            -target-cpu mips32r6 < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPS-ARCH-32R6 %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPS-ARCH-32R6 %s
 //
 // MIPS-ARCH-32R6:#define _MIPS_ARCH "mips32r6"
 // MIPS-ARCH-32R6:#define _MIPS_ARCH_MIPS32R6 1
@@ -4302,7 +4316,7 @@
 //
 // RUN: %clang_cc1 -E -dM -ffreestanding -triple=mips64-none-none \
 // RUN:            < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPS-ARCH-DEF64 %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPS-ARCH-DEF64 %s
 //
 // MIPS-ARCH-DEF64:#define _MIPS_ARCH "mips64r2"
 // MIPS-ARCH-DEF64:#define _MIPS_ARCH_MIPS64R2 1
@@ -4311,7 +4325,7 @@
 //
 // RUN: %clang_cc1 -E -dM -ffreestanding -triple=mips64-none-none \
 // RUN:            -target-cpu mips64 < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPS-ARCH-64 %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPS-ARCH-64 %s
 //
 // MIPS-ARCH-64:#define _MIPS_ARCH "mips64"
 // MIPS-ARCH-64:#define _MIPS_ARCH_MIPS64 1
@@ -4320,7 +4334,7 @@
 //
 // RUN: %clang_cc1 -E -dM -ffreestanding -triple=mips64-none-none \
 // RUN:            -target-cpu mips64r2 < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPS-ARCH-64R2 %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPS-ARCH-64R2 %s
 //
 // MIPS-ARCH-64R2:#define _MIPS_ARCH "mips64r2"
 // MIPS-ARCH-64R2:#define _MIPS_ARCH_MIPS64R2 1
@@ -4329,7 +4343,7 @@
 //
 // RUN: %clang_cc1 -E -dM -ffreestanding -triple=mips64-none-none \
 // RUN:            -target-cpu mips64r3 < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPS-ARCH-64R3 %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPS-ARCH-64R3 %s
 //
 // MIPS-ARCH-64R3:#define _MIPS_ARCH "mips64r3"
 // MIPS-ARCH-64R3:#define _MIPS_ARCH_MIPS64R3 1
@@ -4338,7 +4352,7 @@
 //
 // RUN: %clang_cc1 -E -dM -ffreestanding -triple=mips64-none-none \
 // RUN:            -target-cpu mips64r5 < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPS-ARCH-64R5 %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPS-ARCH-64R5 %s
 //
 // MIPS-ARCH-64R5:#define _MIPS_ARCH "mips64r5"
 // MIPS-ARCH-64R5:#define _MIPS_ARCH_MIPS64R5 1
@@ -4347,7 +4361,7 @@
 //
 // RUN: %clang_cc1 -E -dM -ffreestanding -triple=mips64-none-none \
 // RUN:            -target-cpu mips64r6 < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPS-ARCH-64R6 %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPS-ARCH-64R6 %s
 //
 // MIPS-ARCH-64R6:#define _MIPS_ARCH "mips64r6"
 // MIPS-ARCH-64R6:#define _MIPS_ARCH_MIPS64R6 1
@@ -4358,23 +4372,23 @@
 //
 // RUN: %clang_cc1 -E -dM -ffreestanding \
 // RUN:   -triple=mips-none-none < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPS-FABI-HARD %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPS-FABI-HARD %s
 // MIPS-FABI-HARD:#define __mips_hard_float 1
 //
 // RUN: %clang_cc1 -target-feature +soft-float -E -dM -ffreestanding \
 // RUN:   -triple=mips-none-none < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPS-FABI-SOFT %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPS-FABI-SOFT %s
 // MIPS-FABI-SOFT:#define __mips_soft_float 1
 //
 // RUN: %clang_cc1 -target-feature +single-float -E -dM -ffreestanding \
 // RUN:   -triple=mips-none-none < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPS-FABI-SINGLE %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPS-FABI-SINGLE %s
 // MIPS-FABI-SINGLE:#define __mips_hard_float 1
 // MIPS-FABI-SINGLE:#define __mips_single_float 1
 //
 // RUN: %clang_cc1 -target-feature +soft-float -target-feature +single-float \
 // RUN:   -E -dM -ffreestanding -triple=mips-none-none < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPS-FABI-SINGLE-SOFT %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPS-FABI-SINGLE-SOFT %s
 // MIPS-FABI-SINGLE-SOFT:#define __mips_single_float 1
 // MIPS-FABI-SINGLE-SOFT:#define __mips_soft_float 1
 //
@@ -4382,94 +4396,94 @@
 //
 // RUN: %clang_cc1 -target-feature +mips16 \
 // RUN:   -E -dM -triple=mips-none-none < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPS16 %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPS16 %s
 // MIPS16:#define __mips16 1
 //
 // RUN: %clang_cc1 -target-feature -mips16 \
 // RUN:   -E -dM -triple=mips-none-none < /dev/null \
-// RUN:   | FileCheck -check-prefix NOMIPS16 %s
+// RUN:   | FileCheck -match-full-lines -check-prefix NOMIPS16 %s
 // NOMIPS16-NOT:#define __mips16 1
 //
 // RUN: %clang_cc1 -target-feature +micromips \
 // RUN:   -E -dM -triple=mips-none-none < /dev/null \
-// RUN:   | FileCheck -check-prefix MICROMIPS %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MICROMIPS %s
 // MICROMIPS:#define __mips_micromips 1
 //
 // RUN: %clang_cc1 -target-feature -micromips \
 // RUN:   -E -dM -triple=mips-none-none < /dev/null \
-// RUN:   | FileCheck -check-prefix NOMICROMIPS %s
+// RUN:   | FileCheck -match-full-lines -check-prefix NOMICROMIPS %s
 // NOMICROMIPS-NOT:#define __mips_micromips 1
 //
 // RUN: %clang_cc1 -target-feature +dsp \
 // RUN:   -E -dM -triple=mips-none-none < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPS-DSP %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPS-DSP %s
 // MIPS-DSP:#define __mips_dsp 1
 // MIPS-DSP:#define __mips_dsp_rev 1
 // MIPS-DSP-NOT:#define __mips_dspr2 1
 //
 // RUN: %clang_cc1 -target-feature +dspr2 \
 // RUN:   -E -dM -triple=mips-none-none < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPS-DSPR2 %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPS-DSPR2 %s
 // MIPS-DSPR2:#define __mips_dsp 1
 // MIPS-DSPR2:#define __mips_dsp_rev 2
 // MIPS-DSPR2:#define __mips_dspr2 1
 //
 // RUN: %clang_cc1 -target-feature +msa \
 // RUN:   -E -dM -triple=mips-none-none < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPS-MSA %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPS-MSA %s
 // MIPS-MSA:#define __mips_msa 1
 //
 // RUN: %clang_cc1 -target-cpu mips32r3 -target-feature +nan2008 \
 // RUN:   -E -dM -triple=mips-none-none < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPS-NAN2008 %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPS-NAN2008 %s
 // MIPS-NAN2008:#define __mips_nan2008 1
 //
 // RUN: %clang_cc1 -target-cpu mips32r3 -target-feature -nan2008 \
 // RUN:   -E -dM -triple=mips-none-none < /dev/null \
-// RUN:   | FileCheck -check-prefix NOMIPS-NAN2008 %s
+// RUN:   | FileCheck -match-full-lines -check-prefix NOMIPS-NAN2008 %s
 // NOMIPS-NAN2008-NOT:#define __mips_nan2008 1
 //
 // RUN: %clang_cc1 -target-feature -fp64 \
 // RUN:   -E -dM -triple=mips-none-none < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPS32-MFP32 %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPS32-MFP32 %s
 // MIPS32-MFP32:#define _MIPS_FPSET 16
 // MIPS32-MFP32:#define __mips_fpr 32
 //
 // RUN: %clang_cc1 -target-feature +fp64 \
 // RUN:   -E -dM -triple=mips-none-none < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPS32-MFP64 %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPS32-MFP64 %s
 // MIPS32-MFP64:#define _MIPS_FPSET 32
 // MIPS32-MFP64:#define __mips_fpr 64
 //
 // RUN: %clang_cc1 -target-feature +single-float \
 // RUN:   -E -dM -triple=mips-none-none < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPS32-MFP32SF %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPS32-MFP32SF %s
 // MIPS32-MFP32SF:#define _MIPS_FPSET 32
 // MIPS32-MFP32SF:#define __mips_fpr 32
 //
 // RUN: %clang_cc1 -target-feature +fp64 \
 // RUN:   -E -dM -triple=mips64-none-none < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPS64-MFP64 %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPS64-MFP64 %s
 // MIPS64-MFP64:#define _MIPS_FPSET 32
 // MIPS64-MFP64:#define __mips_fpr 64
 //
 // RUN: %clang_cc1 -target-feature -fp64 -target-feature +single-float \
 // RUN:   -E -dM -triple=mips64-none-none < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPS64-NOMFP64 %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPS64-NOMFP64 %s
 // MIPS64-NOMFP64:#define _MIPS_FPSET 32
 // MIPS64-NOMFP64:#define __mips_fpr 32
 //
 // RUN: %clang_cc1 -target-cpu mips32r6 \
 // RUN:   -E -dM -triple=mips-none-none < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPS-XXR6 %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPS-XXR6 %s
 // RUN: %clang_cc1 -target-cpu mips64r6 \
 // RUN:   -E -dM -triple=mips64-none-none < /dev/null \
-// RUN:   | FileCheck -check-prefix MIPS-XXR6 %s
+// RUN:   | FileCheck -match-full-lines -check-prefix MIPS-XXR6 %s
 // MIPS-XXR6:#define _MIPS_FPSET 32
 // MIPS-XXR6:#define __mips_fpr 64
 // MIPS-XXR6:#define __mips_nan2008 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=msp430-none-none < /dev/null | FileCheck -check-prefix MSP430 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=msp430-none-none < /dev/null | FileCheck -match-full-lines -check-prefix MSP430 %s
 //
 // MSP430:#define MSP430 1
 // MSP430-NOT:#define _LP64
@@ -4507,7 +4521,7 @@
 // MSP430:#define __FLT_MIN_EXP__ (-125)
 // MSP430:#define __FLT_MIN__ 1.17549435e-38F
 // MSP430:#define __FLT_RADIX__ 2
-// MSP430:#define __INT16_C_SUFFIX__ {{$}}
+// MSP430:#define __INT16_C_SUFFIX__
 // MSP430:#define __INT16_FMTd__ "hd"
 // MSP430:#define __INT16_FMTi__ "hi"
 // MSP430:#define __INT16_MAX__ 32767
@@ -4522,7 +4536,7 @@
 // MSP430:#define __INT64_FMTi__ "lli"
 // MSP430:#define __INT64_MAX__ 9223372036854775807LL
 // MSP430:#define __INT64_TYPE__ long long int
-// MSP430:#define __INT8_C_SUFFIX__ {{$}}
+// MSP430:#define __INT8_C_SUFFIX__
 // MSP430:#define __INT8_FMTd__ "hhd"
 // MSP430:#define __INT8_FMTi__ "hhi"
 // MSP430:#define __INT8_MAX__ 127
@@ -4591,10 +4605,10 @@
 // MSP430:#define __MSP430__ 1
 // MSP430:#define __POINTER_WIDTH__ 16
 // MSP430:#define __PTRDIFF_TYPE__ int
-// MSP430:#define __PTRDIFF_WIDTH__ 16 
+// MSP430:#define __PTRDIFF_WIDTH__ 16
 // MSP430:#define __SCHAR_MAX__ 127
 // MSP430:#define __SHRT_MAX__ 32767
-// MSP430:#define __SIG_ATOMIC_MAX__ 2147483647
+// MSP430:#define __SIG_ATOMIC_MAX__ 2147483647L
 // MSP430:#define __SIG_ATOMIC_WIDTH__ 32
 // MSP430:#define __SIZEOF_DOUBLE__ 8
 // MSP430:#define __SIZEOF_FLOAT__ 4
@@ -4608,11 +4622,11 @@
 // MSP430:#define __SIZEOF_SIZE_T__ 2
 // MSP430:#define __SIZEOF_WCHAR_T__ 2
 // MSP430:#define __SIZEOF_WINT_T__ 2
-// MSP430:#define __SIZE_MAX__ 65535
+// MSP430:#define __SIZE_MAX__ 65535U
 // MSP430:#define __SIZE_TYPE__ unsigned int
 // MSP430:#define __SIZE_WIDTH__ 16
 // MSP430:#define __UINT16_C_SUFFIX__ U
-// MSP430:#define __UINT16_MAX__ 65535
+// MSP430:#define __UINT16_MAX__ 65535U
 // MSP430:#define __UINT16_TYPE__ unsigned short
 // MSP430:#define __UINT32_C_SUFFIX__ UL
 // MSP430:#define __UINT32_MAX__ 4294967295UL
@@ -4620,17 +4634,17 @@
 // MSP430:#define __UINT64_C_SUFFIX__ ULL
 // MSP430:#define __UINT64_MAX__ 18446744073709551615ULL
 // MSP430:#define __UINT64_TYPE__ long long unsigned int
-// MSP430:#define __UINT8_C_SUFFIX__ {{$}}
+// MSP430:#define __UINT8_C_SUFFIX__
 // MSP430:#define __UINT8_MAX__ 255
 // MSP430:#define __UINT8_TYPE__ unsigned char
 // MSP430:#define __UINTMAX_C_SUFFIX__ ULL
 // MSP430:#define __UINTMAX_MAX__ 18446744073709551615ULL
 // MSP430:#define __UINTMAX_TYPE__ long long unsigned int
 // MSP430:#define __UINTMAX_WIDTH__ 64
-// MSP430:#define __UINTPTR_MAX__ 65535
+// MSP430:#define __UINTPTR_MAX__ 65535U
 // MSP430:#define __UINTPTR_TYPE__ unsigned int
 // MSP430:#define __UINTPTR_WIDTH__ 16
-// MSP430:#define __UINT_FAST16_MAX__ 65535
+// MSP430:#define __UINT_FAST16_MAX__ 65535U
 // MSP430:#define __UINT_FAST16_TYPE__ unsigned short
 // MSP430:#define __UINT_FAST32_MAX__ 4294967295UL
 // MSP430:#define __UINT_FAST32_TYPE__ long unsigned int
@@ -4638,7 +4652,7 @@
 // MSP430:#define __UINT_FAST64_TYPE__ long long unsigned int
 // MSP430:#define __UINT_FAST8_MAX__ 255
 // MSP430:#define __UINT_FAST8_TYPE__ unsigned char
-// MSP430:#define __UINT_LEAST16_MAX__ 65535
+// MSP430:#define __UINT_LEAST16_MAX__ 65535U
 // MSP430:#define __UINT_LEAST16_TYPE__ unsigned short
 // MSP430:#define __UINT_LEAST32_MAX__ 4294967295UL
 // MSP430:#define __UINT_LEAST32_TYPE__ long unsigned int
@@ -4646,7 +4660,7 @@
 // MSP430:#define __UINT_LEAST64_TYPE__ long long unsigned int
 // MSP430:#define __UINT_LEAST8_MAX__ 255
 // MSP430:#define __UINT_LEAST8_TYPE__ unsigned char
-// MSP430:#define __USER_LABEL_PREFIX__ _
+// MSP430:#define __USER_LABEL_PREFIX__
 // MSP430:#define __WCHAR_MAX__ 32767
 // MSP430:#define __WCHAR_TYPE__ int
 // MSP430:#define __WCHAR_WIDTH__ 16
@@ -4654,7 +4668,7 @@
 // MSP430:#define __WINT_WIDTH__ 16
 // MSP430:#define __clang__ 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=nvptx-none-none < /dev/null | FileCheck -check-prefix NVPTX32 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=nvptx-none-none < /dev/null | FileCheck -match-full-lines -check-prefix NVPTX32 %s
 //
 // NVPTX32-NOT:#define _LP64
 // NVPTX32:#define __BIGGEST_ALIGNMENT__ 8
@@ -4693,12 +4707,12 @@
 // NVPTX32:#define __FLT_MIN_EXP__ (-125)
 // NVPTX32:#define __FLT_MIN__ 1.17549435e-38F
 // NVPTX32:#define __FLT_RADIX__ 2
-// NVPTX32:#define __INT16_C_SUFFIX__ {{$}}
+// NVPTX32:#define __INT16_C_SUFFIX__
 // NVPTX32:#define __INT16_FMTd__ "hd"
 // NVPTX32:#define __INT16_FMTi__ "hi"
 // NVPTX32:#define __INT16_MAX__ 32767
 // NVPTX32:#define __INT16_TYPE__ short
-// NVPTX32:#define __INT32_C_SUFFIX__ {{$}}
+// NVPTX32:#define __INT32_C_SUFFIX__
 // NVPTX32:#define __INT32_FMTd__ "d"
 // NVPTX32:#define __INT32_FMTi__ "i"
 // NVPTX32:#define __INT32_MAX__ 2147483647
@@ -4706,9 +4720,9 @@
 // NVPTX32:#define __INT64_C_SUFFIX__ LL
 // NVPTX32:#define __INT64_FMTd__ "lld"
 // NVPTX32:#define __INT64_FMTi__ "lli"
-// NVPTX32:#define __INT64_MAX__ 9223372036854775807L
+// NVPTX32:#define __INT64_MAX__ 9223372036854775807LL
 // NVPTX32:#define __INT64_TYPE__ long long int
-// NVPTX32:#define __INT8_C_SUFFIX__ {{$}}
+// NVPTX32:#define __INT8_C_SUFFIX__
 // NVPTX32:#define __INT8_FMTd__ "hhd"
 // NVPTX32:#define __INT8_FMTi__ "hhi"
 // NVPTX32:#define __INT8_MAX__ 127
@@ -4734,7 +4748,7 @@
 // NVPTX32:#define __INT_FAST32_TYPE__ int
 // NVPTX32:#define __INT_FAST64_FMTd__ "lld"
 // NVPTX32:#define __INT_FAST64_FMTi__ "lli"
-// NVPTX32:#define __INT_FAST64_MAX__ 9223372036854775807L
+// NVPTX32:#define __INT_FAST64_MAX__ 9223372036854775807LL
 // NVPTX32:#define __INT_FAST64_TYPE__ long long int
 // NVPTX32:#define __INT_FAST8_FMTd__ "hhd"
 // NVPTX32:#define __INT_FAST8_FMTi__ "hhi"
@@ -4750,7 +4764,7 @@
 // NVPTX32:#define __INT_LEAST32_TYPE__ int
 // NVPTX32:#define __INT_LEAST64_FMTd__ "lld"
 // NVPTX32:#define __INT_LEAST64_FMTi__ "lli"
-// NVPTX32:#define __INT_LEAST64_MAX__ 9223372036854775807L
+// NVPTX32:#define __INT_LEAST64_MAX__ 9223372036854775807LL
 // NVPTX32:#define __INT_LEAST64_TYPE__ long long int
 // NVPTX32:#define __INT_LEAST8_FMTd__ "hhd"
 // NVPTX32:#define __INT_LEAST8_FMTi__ "hhi"
@@ -4799,7 +4813,7 @@
 // NVPTX32:#define __SIZE_MAX__ 4294967295U
 // NVPTX32:#define __SIZE_TYPE__ unsigned int
 // NVPTX32:#define __SIZE_WIDTH__ 32
-// NVPTX32:#define __UINT16_C_SUFFIX__ {{$}}
+// NVPTX32:#define __UINT16_C_SUFFIX__
 // NVPTX32:#define __UINT16_MAX__ 65535
 // NVPTX32:#define __UINT16_TYPE__ unsigned short
 // NVPTX32:#define __UINT32_C_SUFFIX__ U
@@ -4808,7 +4822,7 @@
 // NVPTX32:#define __UINT64_C_SUFFIX__ ULL
 // NVPTX32:#define __UINT64_MAX__ 18446744073709551615ULL
 // NVPTX32:#define __UINT64_TYPE__ long long unsigned int
-// NVPTX32:#define __UINT8_C_SUFFIX__ {{$}}
+// NVPTX32:#define __UINT8_C_SUFFIX__
 // NVPTX32:#define __UINT8_MAX__ 255
 // NVPTX32:#define __UINT8_TYPE__ unsigned char
 // NVPTX32:#define __UINTMAX_C_SUFFIX__ ULL
@@ -4822,7 +4836,7 @@
 // NVPTX32:#define __UINT_FAST16_TYPE__ unsigned short
 // NVPTX32:#define __UINT_FAST32_MAX__ 4294967295U
 // NVPTX32:#define __UINT_FAST32_TYPE__ unsigned int
-// NVPTX32:#define __UINT_FAST64_MAX__ 18446744073709551615UL
+// NVPTX32:#define __UINT_FAST64_MAX__ 18446744073709551615ULL
 // NVPTX32:#define __UINT_FAST64_TYPE__ long long unsigned int
 // NVPTX32:#define __UINT_FAST8_MAX__ 255
 // NVPTX32:#define __UINT_FAST8_TYPE__ unsigned char
@@ -4830,18 +4844,18 @@
 // NVPTX32:#define __UINT_LEAST16_TYPE__ unsigned short
 // NVPTX32:#define __UINT_LEAST32_MAX__ 4294967295U
 // NVPTX32:#define __UINT_LEAST32_TYPE__ unsigned int
-// NVPTX32:#define __UINT_LEAST64_MAX__ 18446744073709551615UL
+// NVPTX32:#define __UINT_LEAST64_MAX__ 18446744073709551615ULL
 // NVPTX32:#define __UINT_LEAST64_TYPE__ long long unsigned int
 // NVPTX32:#define __UINT_LEAST8_MAX__ 255
 // NVPTX32:#define __UINT_LEAST8_TYPE__ unsigned char
-// NVPTX32:#define __USER_LABEL_PREFIX__ _
+// NVPTX32:#define __USER_LABEL_PREFIX__
 // NVPTX32:#define __WCHAR_MAX__ 2147483647
 // NVPTX32:#define __WCHAR_TYPE__ int
 // NVPTX32:#define __WCHAR_WIDTH__ 32
 // NVPTX32:#define __WINT_TYPE__ int
 // NVPTX32:#define __WINT_WIDTH__ 32
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=nvptx64-none-none < /dev/null | FileCheck -check-prefix NVPTX64 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=nvptx64-none-none < /dev/null | FileCheck -match-full-lines -check-prefix NVPTX64 %s
 //
 // NVPTX64:#define _LP64 1
 // NVPTX64:#define __BIGGEST_ALIGNMENT__ 8
@@ -4880,12 +4894,12 @@
 // NVPTX64:#define __FLT_MIN_EXP__ (-125)
 // NVPTX64:#define __FLT_MIN__ 1.17549435e-38F
 // NVPTX64:#define __FLT_RADIX__ 2
-// NVPTX64:#define __INT16_C_SUFFIX__ {{$}}
+// NVPTX64:#define __INT16_C_SUFFIX__
 // NVPTX64:#define __INT16_FMTd__ "hd"
 // NVPTX64:#define __INT16_FMTi__ "hi"
 // NVPTX64:#define __INT16_MAX__ 32767
 // NVPTX64:#define __INT16_TYPE__ short
-// NVPTX64:#define __INT32_C_SUFFIX__ {{$}}
+// NVPTX64:#define __INT32_C_SUFFIX__
 // NVPTX64:#define __INT32_FMTd__ "d"
 // NVPTX64:#define __INT32_FMTi__ "i"
 // NVPTX64:#define __INT32_MAX__ 2147483647
@@ -4893,9 +4907,9 @@
 // NVPTX64:#define __INT64_C_SUFFIX__ LL
 // NVPTX64:#define __INT64_FMTd__ "lld"
 // NVPTX64:#define __INT64_FMTi__ "lli"
-// NVPTX64:#define __INT64_MAX__ 9223372036854775807L
+// NVPTX64:#define __INT64_MAX__ 9223372036854775807LL
 // NVPTX64:#define __INT64_TYPE__ long long int
-// NVPTX64:#define __INT8_C_SUFFIX__ {{$}}
+// NVPTX64:#define __INT8_C_SUFFIX__
 // NVPTX64:#define __INT8_FMTd__ "hhd"
 // NVPTX64:#define __INT8_FMTi__ "hhi"
 // NVPTX64:#define __INT8_MAX__ 127
@@ -4986,7 +5000,7 @@
 // NVPTX64:#define __SIZE_MAX__ 18446744073709551615UL
 // NVPTX64:#define __SIZE_TYPE__ long unsigned int
 // NVPTX64:#define __SIZE_WIDTH__ 64
-// NVPTX64:#define __UINT16_C_SUFFIX__ {{$}}
+// NVPTX64:#define __UINT16_C_SUFFIX__
 // NVPTX64:#define __UINT16_MAX__ 65535
 // NVPTX64:#define __UINT16_TYPE__ unsigned short
 // NVPTX64:#define __UINT32_C_SUFFIX__ U
@@ -4995,7 +5009,7 @@
 // NVPTX64:#define __UINT64_C_SUFFIX__ ULL
 // NVPTX64:#define __UINT64_MAX__ 18446744073709551615ULL
 // NVPTX64:#define __UINT64_TYPE__ long long unsigned int
-// NVPTX64:#define __UINT8_C_SUFFIX__ {{$}}
+// NVPTX64:#define __UINT8_C_SUFFIX__
 // NVPTX64:#define __UINT8_MAX__ 255
 // NVPTX64:#define __UINT8_TYPE__ unsigned char
 // NVPTX64:#define __UINTMAX_C_SUFFIX__ ULL
@@ -5021,14 +5035,14 @@
 // NVPTX64:#define __UINT_LEAST64_TYPE__ long unsigned int
 // NVPTX64:#define __UINT_LEAST8_MAX__ 255
 // NVPTX64:#define __UINT_LEAST8_TYPE__ unsigned char
-// NVPTX64:#define __USER_LABEL_PREFIX__ _
+// NVPTX64:#define __USER_LABEL_PREFIX__
 // NVPTX64:#define __WCHAR_MAX__ 2147483647
 // NVPTX64:#define __WCHAR_TYPE__ int
 // NVPTX64:#define __WCHAR_WIDTH__ 32
 // NVPTX64:#define __WINT_TYPE__ int
 // NVPTX64:#define __WINT_WIDTH__ 32
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc-none-none -target-cpu 603e < /dev/null | FileCheck -check-prefix PPC603E %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc-none-none -target-cpu 603e < /dev/null | FileCheck -match-full-lines -check-prefix PPC603E %s
 //
 // PPC603E:#define _ARCH_603 1
 // PPC603E:#define _ARCH_603E 1
@@ -5071,12 +5085,12 @@
 // PPC603E:#define __FLT_MIN_EXP__ (-125)
 // PPC603E:#define __FLT_MIN__ 1.17549435e-38F
 // PPC603E:#define __FLT_RADIX__ 2
-// PPC603E:#define __INT16_C_SUFFIX__ {{$}}
+// PPC603E:#define __INT16_C_SUFFIX__
 // PPC603E:#define __INT16_FMTd__ "hd"
 // PPC603E:#define __INT16_FMTi__ "hi"
 // PPC603E:#define __INT16_MAX__ 32767
 // PPC603E:#define __INT16_TYPE__ short
-// PPC603E:#define __INT32_C_SUFFIX__ {{$}}
+// PPC603E:#define __INT32_C_SUFFIX__
 // PPC603E:#define __INT32_FMTd__ "d"
 // PPC603E:#define __INT32_FMTi__ "i"
 // PPC603E:#define __INT32_MAX__ 2147483647
@@ -5086,7 +5100,7 @@
 // PPC603E:#define __INT64_FMTi__ "lli"
 // PPC603E:#define __INT64_MAX__ 9223372036854775807LL
 // PPC603E:#define __INT64_TYPE__ long long int
-// PPC603E:#define __INT8_C_SUFFIX__ {{$}}
+// PPC603E:#define __INT8_C_SUFFIX__
 // PPC603E:#define __INT8_FMTd__ "hhd"
 // PPC603E:#define __INT8_FMTi__ "hhi"
 // PPC603E:#define __INT8_MAX__ 127
@@ -5175,10 +5189,10 @@
 // PPC603E:#define __SIZEOF_SIZE_T__ 4
 // PPC603E:#define __SIZEOF_WCHAR_T__ 4
 // PPC603E:#define __SIZEOF_WINT_T__ 4
-// PPC603E:#define __SIZE_MAX__ 4294967295U
+// PPC603E:#define __SIZE_MAX__ 4294967295UL
 // PPC603E:#define __SIZE_TYPE__ long unsigned int
 // PPC603E:#define __SIZE_WIDTH__ 32
-// PPC603E:#define __UINT16_C_SUFFIX__ {{$}}
+// PPC603E:#define __UINT16_C_SUFFIX__
 // PPC603E:#define __UINT16_MAX__ 65535
 // PPC603E:#define __UINT16_TYPE__ unsigned short
 // PPC603E:#define __UINT32_C_SUFFIX__ U
@@ -5187,14 +5201,14 @@
 // PPC603E:#define __UINT64_C_SUFFIX__ ULL
 // PPC603E:#define __UINT64_MAX__ 18446744073709551615ULL
 // PPC603E:#define __UINT64_TYPE__ long long unsigned int
-// PPC603E:#define __UINT8_C_SUFFIX__ {{$}}
+// PPC603E:#define __UINT8_C_SUFFIX__
 // PPC603E:#define __UINT8_MAX__ 255
 // PPC603E:#define __UINT8_TYPE__ unsigned char
 // PPC603E:#define __UINTMAX_C_SUFFIX__ ULL
 // PPC603E:#define __UINTMAX_MAX__ 18446744073709551615ULL
 // PPC603E:#define __UINTMAX_TYPE__ long long unsigned int
 // PPC603E:#define __UINTMAX_WIDTH__ 64
-// PPC603E:#define __UINTPTR_MAX__ 4294967295U
+// PPC603E:#define __UINTPTR_MAX__ 4294967295UL
 // PPC603E:#define __UINTPTR_TYPE__ long unsigned int
 // PPC603E:#define __UINTPTR_WIDTH__ 32
 // PPC603E:#define __UINT_FAST16_MAX__ 65535
@@ -5213,7 +5227,7 @@
 // PPC603E:#define __UINT_LEAST64_TYPE__ long long unsigned int
 // PPC603E:#define __UINT_LEAST8_MAX__ 255
 // PPC603E:#define __UINT_LEAST8_TYPE__ unsigned char
-// PPC603E:#define __USER_LABEL_PREFIX__ _
+// PPC603E:#define __USER_LABEL_PREFIX__
 // PPC603E:#define __WCHAR_MAX__ 2147483647
 // PPC603E:#define __WCHAR_TYPE__ int
 // PPC603E:#define __WCHAR_WIDTH__ 32
@@ -5222,7 +5236,7 @@
 // PPC603E:#define __powerpc__ 1
 // PPC603E:#define __ppc__ 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu pwr7 -fno-signed-char < /dev/null | FileCheck -check-prefix PPC64 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu pwr7 -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPC64 %s
 //
 // PPC64:#define _ARCH_PPC 1
 // PPC64:#define _ARCH_PPC64 1
@@ -5270,12 +5284,12 @@
 // PPC64:#define __FLT_MIN_EXP__ (-125)
 // PPC64:#define __FLT_MIN__ 1.17549435e-38F
 // PPC64:#define __FLT_RADIX__ 2
-// PPC64:#define __INT16_C_SUFFIX__ {{$}}
+// PPC64:#define __INT16_C_SUFFIX__
 // PPC64:#define __INT16_FMTd__ "hd"
 // PPC64:#define __INT16_FMTi__ "hi"
 // PPC64:#define __INT16_MAX__ 32767
 // PPC64:#define __INT16_TYPE__ short
-// PPC64:#define __INT32_C_SUFFIX__ {{$}}
+// PPC64:#define __INT32_C_SUFFIX__
 // PPC64:#define __INT32_FMTd__ "d"
 // PPC64:#define __INT32_FMTi__ "i"
 // PPC64:#define __INT32_MAX__ 2147483647
@@ -5285,7 +5299,7 @@
 // PPC64:#define __INT64_FMTi__ "li"
 // PPC64:#define __INT64_MAX__ 9223372036854775807L
 // PPC64:#define __INT64_TYPE__ long int
-// PPC64:#define __INT8_C_SUFFIX__ {{$}}
+// PPC64:#define __INT8_C_SUFFIX__
 // PPC64:#define __INT8_FMTd__ "hhd"
 // PPC64:#define __INT8_FMTi__ "hhi"
 // PPC64:#define __INT8_MAX__ 127
@@ -5358,7 +5372,7 @@
 // PPC64:#define __PPC__ 1
 // PPC64:#define __PTRDIFF_TYPE__ long int
 // PPC64:#define __PTRDIFF_WIDTH__ 64
-// PPC64:#define __REGISTER_PREFIX__ 
+// PPC64:#define __REGISTER_PREFIX__
 // PPC64:#define __SCHAR_MAX__ 127
 // PPC64:#define __SHRT_MAX__ 32767
 // PPC64:#define __SIG_ATOMIC_MAX__ 2147483647
@@ -5378,7 +5392,7 @@
 // PPC64:#define __SIZE_MAX__ 18446744073709551615UL
 // PPC64:#define __SIZE_TYPE__ long unsigned int
 // PPC64:#define __SIZE_WIDTH__ 64
-// PPC64:#define __UINT16_C_SUFFIX__ {{$}}
+// PPC64:#define __UINT16_C_SUFFIX__
 // PPC64:#define __UINT16_MAX__ 65535
 // PPC64:#define __UINT16_TYPE__ unsigned short
 // PPC64:#define __UINT32_C_SUFFIX__ U
@@ -5387,7 +5401,7 @@
 // PPC64:#define __UINT64_C_SUFFIX__ UL
 // PPC64:#define __UINT64_MAX__ 18446744073709551615UL
 // PPC64:#define __UINT64_TYPE__ long unsigned int
-// PPC64:#define __UINT8_C_SUFFIX__ {{$}}
+// PPC64:#define __UINT8_C_SUFFIX__
 // PPC64:#define __UINT8_MAX__ 255
 // PPC64:#define __UINT8_TYPE__ unsigned char
 // PPC64:#define __UINTMAX_C_SUFFIX__ UL
@@ -5413,7 +5427,7 @@
 // PPC64:#define __UINT_LEAST64_TYPE__ long unsigned int
 // PPC64:#define __UINT_LEAST8_MAX__ 255
 // PPC64:#define __UINT_LEAST8_TYPE__ unsigned char
-// PPC64:#define __USER_LABEL_PREFIX__ _
+// PPC64:#define __USER_LABEL_PREFIX__
 // PPC64:#define __WCHAR_MAX__ 2147483647
 // PPC64:#define __WCHAR_TYPE__ int
 // PPC64:#define __WCHAR_WIDTH__ 32
@@ -5422,7 +5436,7 @@
 // PPC64:#define __ppc64__ 1
 // PPC64:#define __ppc__ 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64le-none-none -target-cpu pwr7 -fno-signed-char < /dev/null | FileCheck -check-prefix PPC64LE %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64le-none-none -target-cpu pwr7 -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPC64LE %s
 //
 // PPC64LE:#define _ARCH_PPC 1
 // PPC64LE:#define _ARCH_PPC64 1
@@ -5472,12 +5486,12 @@
 // PPC64LE:#define __FLT_MIN_EXP__ (-125)
 // PPC64LE:#define __FLT_MIN__ 1.17549435e-38F
 // PPC64LE:#define __FLT_RADIX__ 2
-// PPC64LE:#define __INT16_C_SUFFIX__ {{$}}
+// PPC64LE:#define __INT16_C_SUFFIX__
 // PPC64LE:#define __INT16_FMTd__ "hd"
 // PPC64LE:#define __INT16_FMTi__ "hi"
 // PPC64LE:#define __INT16_MAX__ 32767
 // PPC64LE:#define __INT16_TYPE__ short
-// PPC64LE:#define __INT32_C_SUFFIX__ {{$}}
+// PPC64LE:#define __INT32_C_SUFFIX__
 // PPC64LE:#define __INT32_FMTd__ "d"
 // PPC64LE:#define __INT32_FMTi__ "i"
 // PPC64LE:#define __INT32_MAX__ 2147483647
@@ -5487,7 +5501,7 @@
 // PPC64LE:#define __INT64_FMTi__ "li"
 // PPC64LE:#define __INT64_MAX__ 9223372036854775807L
 // PPC64LE:#define __INT64_TYPE__ long int
-// PPC64LE:#define __INT8_C_SUFFIX__ {{$}}
+// PPC64LE:#define __INT8_C_SUFFIX__
 // PPC64LE:#define __INT8_FMTd__ "hhd"
 // PPC64LE:#define __INT8_FMTi__ "hhi"
 // PPC64LE:#define __INT8_MAX__ 127
@@ -5561,7 +5575,7 @@
 // PPC64LE:#define __PPC__ 1
 // PPC64LE:#define __PTRDIFF_TYPE__ long int
 // PPC64LE:#define __PTRDIFF_WIDTH__ 64
-// PPC64LE:#define __REGISTER_PREFIX__ 
+// PPC64LE:#define __REGISTER_PREFIX__
 // PPC64LE:#define __SCHAR_MAX__ 127
 // PPC64LE:#define __SHRT_MAX__ 32767
 // PPC64LE:#define __SIG_ATOMIC_MAX__ 2147483647
@@ -5581,7 +5595,7 @@
 // PPC64LE:#define __SIZE_MAX__ 18446744073709551615UL
 // PPC64LE:#define __SIZE_TYPE__ long unsigned int
 // PPC64LE:#define __SIZE_WIDTH__ 64
-// PPC64LE:#define __UINT16_C_SUFFIX__ {{$}}
+// PPC64LE:#define __UINT16_C_SUFFIX__
 // PPC64LE:#define __UINT16_MAX__ 65535
 // PPC64LE:#define __UINT16_TYPE__ unsigned short
 // PPC64LE:#define __UINT32_C_SUFFIX__ U
@@ -5590,7 +5604,7 @@
 // PPC64LE:#define __UINT64_C_SUFFIX__ UL
 // PPC64LE:#define __UINT64_MAX__ 18446744073709551615UL
 // PPC64LE:#define __UINT64_TYPE__ long unsigned int
-// PPC64LE:#define __UINT8_C_SUFFIX__ {{$}}
+// PPC64LE:#define __UINT8_C_SUFFIX__
 // PPC64LE:#define __UINT8_MAX__ 255
 // PPC64LE:#define __UINT8_TYPE__ unsigned char
 // PPC64LE:#define __UINTMAX_C_SUFFIX__ UL
@@ -5616,7 +5630,7 @@
 // PPC64LE:#define __UINT_LEAST64_TYPE__ long unsigned int
 // PPC64LE:#define __UINT_LEAST8_MAX__ 255
 // PPC64LE:#define __UINT_LEAST8_TYPE__ unsigned char
-// PPC64LE:#define __USER_LABEL_PREFIX__ _
+// PPC64LE:#define __USER_LABEL_PREFIX__
 // PPC64LE:#define __WCHAR_MAX__ 2147483647
 // PPC64LE:#define __WCHAR_TYPE__ int
 // PPC64LE:#define __WCHAR_WIDTH__ 32
@@ -5625,7 +5639,7 @@
 // PPC64LE:#define __ppc64__ 1
 // PPC64LE:#define __ppc__ 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu a2q -fno-signed-char < /dev/null | FileCheck -check-prefix PPCA2Q %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu a2q -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCA2Q %s
 //
 // PPCA2Q:#define _ARCH_A2 1
 // PPCA2Q:#define _ARCH_A2Q 1
@@ -5633,33 +5647,33 @@
 // PPCA2Q:#define _ARCH_PPC64 1
 // PPCA2Q:#define _ARCH_QP 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-bgq-linux -fno-signed-char < /dev/null | FileCheck -check-prefix PPCBGQ %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-bgq-linux -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCBGQ %s
 //
 // PPCBGQ:#define __THW_BLUEGENE__ 1
 // PPCBGQ:#define __TOS_BGQ__ 1
 // PPCBGQ:#define __bg__ 1
 // PPCBGQ:#define __bgq__ 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu 630 -fno-signed-char < /dev/null | FileCheck -check-prefix PPC630 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu 630 -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPC630 %s
 //
 // PPC630:#define _ARCH_630 1
 // PPC630:#define _ARCH_PPC 1
 // PPC630:#define _ARCH_PPC64 1
 // PPC630:#define _ARCH_PPCGR 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu pwr3 -fno-signed-char < /dev/null | FileCheck -check-prefix PPCPWR3 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu pwr3 -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCPWR3 %s
 //
 // PPCPWR3:#define _ARCH_PPC 1
 // PPCPWR3:#define _ARCH_PPC64 1
 // PPCPWR3:#define _ARCH_PPCGR 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu power3 -fno-signed-char < /dev/null | FileCheck -check-prefix PPCPOWER3 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu power3 -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCPOWER3 %s
 //
 // PPCPOWER3:#define _ARCH_PPC 1
 // PPCPOWER3:#define _ARCH_PPC64 1
 // PPCPOWER3:#define _ARCH_PPCGR 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu pwr4 -fno-signed-char < /dev/null | FileCheck -check-prefix PPCPWR4 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu pwr4 -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCPWR4 %s
 //
 // PPCPWR4:#define _ARCH_PPC 1
 // PPCPWR4:#define _ARCH_PPC64 1
@@ -5667,7 +5681,7 @@
 // PPCPWR4:#define _ARCH_PPCSQ 1
 // PPCPWR4:#define _ARCH_PWR4 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu power4 -fno-signed-char < /dev/null | FileCheck -check-prefix PPCPOWER4 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu power4 -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCPOWER4 %s
 //
 // PPCPOWER4:#define _ARCH_PPC 1
 // PPCPOWER4:#define _ARCH_PPC64 1
@@ -5675,7 +5689,7 @@
 // PPCPOWER4:#define _ARCH_PPCSQ 1
 // PPCPOWER4:#define _ARCH_PWR4 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu pwr5 -fno-signed-char < /dev/null | FileCheck -check-prefix PPCPWR5 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu pwr5 -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCPWR5 %s
 //
 // PPCPWR5:#define _ARCH_PPC 1
 // PPCPWR5:#define _ARCH_PPC64 1
@@ -5684,7 +5698,7 @@
 // PPCPWR5:#define _ARCH_PWR4 1
 // PPCPWR5:#define _ARCH_PWR5 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu power5 -fno-signed-char < /dev/null | FileCheck -check-prefix PPCPOWER5 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu power5 -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCPOWER5 %s
 //
 // PPCPOWER5:#define _ARCH_PPC 1
 // PPCPOWER5:#define _ARCH_PPC64 1
@@ -5693,7 +5707,7 @@
 // PPCPOWER5:#define _ARCH_PWR4 1
 // PPCPOWER5:#define _ARCH_PWR5 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu pwr5x -fno-signed-char < /dev/null | FileCheck -check-prefix PPCPWR5X %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu pwr5x -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCPWR5X %s
 //
 // PPCPWR5X:#define _ARCH_PPC 1
 // PPCPWR5X:#define _ARCH_PPC64 1
@@ -5703,7 +5717,7 @@
 // PPCPWR5X:#define _ARCH_PWR5 1
 // PPCPWR5X:#define _ARCH_PWR5X 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu power5x -fno-signed-char < /dev/null | FileCheck -check-prefix PPCPOWER5X %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu power5x -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCPOWER5X %s
 //
 // PPCPOWER5X:#define _ARCH_PPC 1
 // PPCPOWER5X:#define _ARCH_PPC64 1
@@ -5713,7 +5727,7 @@
 // PPCPOWER5X:#define _ARCH_PWR5 1
 // PPCPOWER5X:#define _ARCH_PWR5X 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu pwr6 -fno-signed-char < /dev/null | FileCheck -check-prefix PPCPWR6 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu pwr6 -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCPWR6 %s
 //
 // PPCPWR6:#define _ARCH_PPC 1
 // PPCPWR6:#define _ARCH_PPC64 1
@@ -5724,7 +5738,7 @@
 // PPCPWR6:#define _ARCH_PWR5X 1
 // PPCPWR6:#define _ARCH_PWR6 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu power6 -fno-signed-char < /dev/null | FileCheck -check-prefix PPCPOWER6 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu power6 -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCPOWER6 %s
 //
 // PPCPOWER6:#define _ARCH_PPC 1
 // PPCPOWER6:#define _ARCH_PPC64 1
@@ -5735,7 +5749,7 @@
 // PPCPOWER6:#define _ARCH_PWR5X 1
 // PPCPOWER6:#define _ARCH_PWR6 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu pwr6x -fno-signed-char < /dev/null | FileCheck -check-prefix PPCPWR6X %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu pwr6x -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCPWR6X %s
 //
 // PPCPWR6X:#define _ARCH_PPC 1
 // PPCPWR6X:#define _ARCH_PPC64 1
@@ -5747,7 +5761,7 @@
 // PPCPWR6X:#define _ARCH_PWR6 1
 // PPCPWR6X:#define _ARCH_PWR6X 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu power6x -fno-signed-char < /dev/null | FileCheck -check-prefix PPCPOWER6X %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu power6x -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCPOWER6X %s
 //
 // PPCPOWER6X:#define _ARCH_PPC 1
 // PPCPOWER6X:#define _ARCH_PPC64 1
@@ -5759,7 +5773,7 @@
 // PPCPOWER6X:#define _ARCH_PWR6 1
 // PPCPOWER6X:#define _ARCH_PWR6X 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu pwr7 -fno-signed-char < /dev/null | FileCheck -check-prefix PPCPWR7 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu pwr7 -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCPWR7 %s
 //
 // PPCPWR7:#define _ARCH_PPC 1
 // PPCPWR7:#define _ARCH_PPC64 1
@@ -5772,7 +5786,7 @@
 // PPCPWR7:#define _ARCH_PWR6X 1
 // PPCPWR7:#define _ARCH_PWR7 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu power7 -fno-signed-char < /dev/null | FileCheck -check-prefix PPCPOWER7 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu power7 -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCPOWER7 %s
 //
 // PPCPOWER7:#define _ARCH_PPC 1
 // PPCPOWER7:#define _ARCH_PPC64 1
@@ -5785,7 +5799,7 @@
 // PPCPOWER7:#define _ARCH_PWR6X 1
 // PPCPOWER7:#define _ARCH_PWR7 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu pwr8 -fno-signed-char < /dev/null | FileCheck -check-prefix PPCPWR8 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu pwr8 -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCPWR8 %s
 //
 // PPCPWR8:#define _ARCH_PPC 1
 // PPCPWR8:#define _ARCH_PPC64 1
@@ -5799,7 +5813,7 @@
 // PPCPWR8:#define _ARCH_PWR7 1
 // PPCPWR8:#define _ARCH_PWR8 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu power8 -fno-signed-char < /dev/null | FileCheck -check-prefix PPCPOWER8 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu power8 -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCPOWER8 %s
 //
 // PPCPOWER8:#define _ARCH_PPC 1
 // PPCPOWER8:#define _ARCH_PPC64 1
@@ -5813,7 +5827,38 @@
 // PPCPOWER8:#define _ARCH_PWR7 1
 // PPCPOWER8:#define _ARCH_PWR8 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-unknown-linux-gnu -fno-signed-char < /dev/null | FileCheck -check-prefix PPC64-LINUX %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu pwr9 -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCPWR9 %s
+//
+// PPCPWR9:#define _ARCH_PPC 1
+// PPCPWR9:#define _ARCH_PPC64 1
+// PPCPWR9:#define _ARCH_PPCGR 1
+// PPCPWR9:#define _ARCH_PPCSQ 1
+// PPCPWR9:#define _ARCH_PWR4 1
+// PPCPWR9:#define _ARCH_PWR5 1
+// PPCPWR9:#define _ARCH_PWR5X 1
+// PPCPWR9:#define _ARCH_PWR6 1
+// PPCPWR9:#define _ARCH_PWR6X 1
+// PPCPWR9:#define _ARCH_PWR7 1
+// PPCPWR9:#define _ARCH_PWR9 1
+//
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-cpu power9 -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPCPOWER9 %s
+//
+// PPCPOWER9:#define _ARCH_PPC 1
+// PPCPOWER9:#define _ARCH_PPC64 1
+// PPCPOWER9:#define _ARCH_PPCGR 1
+// PPCPOWER9:#define _ARCH_PPCSQ 1
+// PPCPOWER9:#define _ARCH_PWR4 1
+// PPCPOWER9:#define _ARCH_PWR5 1
+// PPCPOWER9:#define _ARCH_PWR5X 1
+// PPCPOWER9:#define _ARCH_PWR6 1
+// PPCPOWER9:#define _ARCH_PWR6X 1
+// PPCPOWER9:#define _ARCH_PWR7 1
+// PPCPOWER9:#define _ARCH_PWR9 1
+//
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-none-none -target-feature +float128 -target-cpu power8 -fno-signed-char < /dev/null | FileCheck -check-prefix PPC-FLOAT128 %s
+// PPC-FLOAT128:#define __FLOAT128__ 1
+//
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-unknown-linux-gnu -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPC64-LINUX %s
 //
 // PPC64-LINUX:#define _ARCH_PPC 1
 // PPC64-LINUX:#define _ARCH_PPC64 1
@@ -5855,12 +5900,12 @@
 // PPC64-LINUX:#define __FLT_MIN_EXP__ (-125)
 // PPC64-LINUX:#define __FLT_MIN__ 1.17549435e-38F
 // PPC64-LINUX:#define __FLT_RADIX__ 2
-// PPC64-LINUX:#define __INT16_C_SUFFIX__ {{$}}
+// PPC64-LINUX:#define __INT16_C_SUFFIX__
 // PPC64-LINUX:#define __INT16_FMTd__ "hd"
 // PPC64-LINUX:#define __INT16_FMTi__ "hi"
 // PPC64-LINUX:#define __INT16_MAX__ 32767
 // PPC64-LINUX:#define __INT16_TYPE__ short
-// PPC64-LINUX:#define __INT32_C_SUFFIX__ {{$}}
+// PPC64-LINUX:#define __INT32_C_SUFFIX__
 // PPC64-LINUX:#define __INT32_FMTd__ "d"
 // PPC64-LINUX:#define __INT32_FMTi__ "i"
 // PPC64-LINUX:#define __INT32_MAX__ 2147483647
@@ -5870,7 +5915,7 @@
 // PPC64-LINUX:#define __INT64_FMTi__ "li"
 // PPC64-LINUX:#define __INT64_MAX__ 9223372036854775807L
 // PPC64-LINUX:#define __INT64_TYPE__ long int
-// PPC64-LINUX:#define __INT8_C_SUFFIX__ {{$}}
+// PPC64-LINUX:#define __INT8_C_SUFFIX__
 // PPC64-LINUX:#define __INT8_FMTd__ "hhd"
 // PPC64-LINUX:#define __INT8_FMTi__ "hhi"
 // PPC64-LINUX:#define __INT8_MAX__ 127
@@ -5963,7 +6008,7 @@
 // PPC64-LINUX:#define __SIZE_MAX__ 18446744073709551615UL
 // PPC64-LINUX:#define __SIZE_TYPE__ long unsigned int
 // PPC64-LINUX:#define __SIZE_WIDTH__ 64
-// PPC64-LINUX:#define __UINT16_C_SUFFIX__ {{$}}
+// PPC64-LINUX:#define __UINT16_C_SUFFIX__
 // PPC64-LINUX:#define __UINT16_MAX__ 65535
 // PPC64-LINUX:#define __UINT16_TYPE__ unsigned short
 // PPC64-LINUX:#define __UINT32_C_SUFFIX__ U
@@ -5972,7 +6017,7 @@
 // PPC64-LINUX:#define __UINT64_C_SUFFIX__ UL
 // PPC64-LINUX:#define __UINT64_MAX__ 18446744073709551615UL
 // PPC64-LINUX:#define __UINT64_TYPE__ long unsigned int
-// PPC64-LINUX:#define __UINT8_C_SUFFIX__ {{$}}
+// PPC64-LINUX:#define __UINT8_C_SUFFIX__
 // PPC64-LINUX:#define __UINT8_MAX__ 255
 // PPC64-LINUX:#define __UINT8_TYPE__ unsigned char
 // PPC64-LINUX:#define __UINTMAX_C_SUFFIX__ UL
@@ -6010,17 +6055,17 @@
 // PPC64-LINUX:#define __ppc64__ 1
 // PPC64-LINUX:#define __ppc__ 1
 
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-unknown-linux-gnu < /dev/null | FileCheck -check-prefix PPC64-ELFv1 %s
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-unknown-linux-gnu -target-abi elfv1 < /dev/null | FileCheck -check-prefix PPC64-ELFv1 %s
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-unknown-linux-gnu -target-abi elfv1-qpx < /dev/null | FileCheck -check-prefix PPC64-ELFv1 %s
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-unknown-linux-gnu -target-abi elfv2 < /dev/null | FileCheck -check-prefix PPC64-ELFv2 %s
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64le-unknown-linux-gnu < /dev/null | FileCheck -check-prefix PPC64-ELFv2 %s
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64le-unknown-linux-gnu -target-abi elfv1 < /dev/null | FileCheck -check-prefix PPC64-ELFv1 %s
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64le-unknown-linux-gnu -target-abi elfv2 < /dev/null | FileCheck -check-prefix PPC64-ELFv2 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-unknown-linux-gnu < /dev/null | FileCheck -match-full-lines -check-prefix PPC64-ELFv1 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-unknown-linux-gnu -target-abi elfv1 < /dev/null | FileCheck -match-full-lines -check-prefix PPC64-ELFv1 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-unknown-linux-gnu -target-abi elfv1-qpx < /dev/null | FileCheck -match-full-lines -check-prefix PPC64-ELFv1 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-unknown-linux-gnu -target-abi elfv2 < /dev/null | FileCheck -match-full-lines -check-prefix PPC64-ELFv2 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64le-unknown-linux-gnu < /dev/null | FileCheck -match-full-lines -check-prefix PPC64-ELFv2 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64le-unknown-linux-gnu -target-abi elfv1 < /dev/null | FileCheck -match-full-lines -check-prefix PPC64-ELFv1 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64le-unknown-linux-gnu -target-abi elfv2 < /dev/null | FileCheck -match-full-lines -check-prefix PPC64-ELFv2 %s
 // PPC64-ELFv1:#define _CALL_ELF 1
 // PPC64-ELFv2:#define _CALL_ELF 2
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc-none-none -fno-signed-char < /dev/null | FileCheck -check-prefix PPC %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc-none-none -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPC %s
 //
 // PPC:#define _ARCH_PPC 1
 // PPC:#define _BIG_ENDIAN 1
@@ -6061,12 +6106,12 @@
 // PPC:#define __FLT_MIN_EXP__ (-125)
 // PPC:#define __FLT_MIN__ 1.17549435e-38F
 // PPC:#define __FLT_RADIX__ 2
-// PPC:#define __INT16_C_SUFFIX__ {{$}}
+// PPC:#define __INT16_C_SUFFIX__
 // PPC:#define __INT16_FMTd__ "hd"
 // PPC:#define __INT16_FMTi__ "hi"
 // PPC:#define __INT16_MAX__ 32767
 // PPC:#define __INT16_TYPE__ short
-// PPC:#define __INT32_C_SUFFIX__ {{$}}
+// PPC:#define __INT32_C_SUFFIX__
 // PPC:#define __INT32_FMTd__ "d"
 // PPC:#define __INT32_FMTi__ "i"
 // PPC:#define __INT32_MAX__ 2147483647
@@ -6076,7 +6121,7 @@
 // PPC:#define __INT64_FMTi__ "lli"
 // PPC:#define __INT64_MAX__ 9223372036854775807LL
 // PPC:#define __INT64_TYPE__ long long int
-// PPC:#define __INT8_C_SUFFIX__ {{$}}
+// PPC:#define __INT8_C_SUFFIX__
 // PPC:#define __INT8_FMTd__ "hhd"
 // PPC:#define __INT8_FMTi__ "hhi"
 // PPC:#define __INT8_MAX__ 127
@@ -6148,7 +6193,7 @@
 // PPC:#define __PPC__ 1
 // PPC:#define __PTRDIFF_TYPE__ long int
 // PPC:#define __PTRDIFF_WIDTH__ 32
-// PPC:#define __REGISTER_PREFIX__ 
+// PPC:#define __REGISTER_PREFIX__
 // PPC:#define __SCHAR_MAX__ 127
 // PPC:#define __SHRT_MAX__ 32767
 // PPC:#define __SIG_ATOMIC_MAX__ 2147483647
@@ -6165,10 +6210,10 @@
 // PPC:#define __SIZEOF_SIZE_T__ 4
 // PPC:#define __SIZEOF_WCHAR_T__ 4
 // PPC:#define __SIZEOF_WINT_T__ 4
-// PPC:#define __SIZE_MAX__ 4294967295U
+// PPC:#define __SIZE_MAX__ 4294967295UL
 // PPC:#define __SIZE_TYPE__ long unsigned int
 // PPC:#define __SIZE_WIDTH__ 32
-// PPC:#define __UINT16_C_SUFFIX__ {{$}}
+// PPC:#define __UINT16_C_SUFFIX__
 // PPC:#define __UINT16_MAX__ 65535
 // PPC:#define __UINT16_TYPE__ unsigned short
 // PPC:#define __UINT32_C_SUFFIX__ U
@@ -6177,14 +6222,14 @@
 // PPC:#define __UINT64_C_SUFFIX__ ULL
 // PPC:#define __UINT64_MAX__ 18446744073709551615ULL
 // PPC:#define __UINT64_TYPE__ long long unsigned int
-// PPC:#define __UINT8_C_SUFFIX__ {{$}}
+// PPC:#define __UINT8_C_SUFFIX__
 // PPC:#define __UINT8_MAX__ 255
 // PPC:#define __UINT8_TYPE__ unsigned char
 // PPC:#define __UINTMAX_C_SUFFIX__ ULL
 // PPC:#define __UINTMAX_MAX__ 18446744073709551615ULL
 // PPC:#define __UINTMAX_TYPE__ long long unsigned int
 // PPC:#define __UINTMAX_WIDTH__ 64
-// PPC:#define __UINTPTR_MAX__ 4294967295U
+// PPC:#define __UINTPTR_MAX__ 4294967295UL
 // PPC:#define __UINTPTR_TYPE__ long unsigned int
 // PPC:#define __UINTPTR_WIDTH__ 32
 // PPC:#define __UINT_FAST16_MAX__ 65535
@@ -6203,7 +6248,7 @@
 // PPC:#define __UINT_LEAST64_TYPE__ long long unsigned int
 // PPC:#define __UINT_LEAST8_MAX__ 255
 // PPC:#define __UINT_LEAST8_TYPE__ unsigned char
-// PPC:#define __USER_LABEL_PREFIX__ _
+// PPC:#define __USER_LABEL_PREFIX__
 // PPC:#define __WCHAR_MAX__ 2147483647
 // PPC:#define __WCHAR_TYPE__ int
 // PPC:#define __WCHAR_WIDTH__ 32
@@ -6211,7 +6256,7 @@
 // PPC:#define __WINT_WIDTH__ 32
 // PPC:#define __ppc__ 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc-unknown-linux-gnu -fno-signed-char < /dev/null | FileCheck -check-prefix PPC-LINUX %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc-unknown-linux-gnu -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix PPC-LINUX %s
 //
 // PPC-LINUX:#define _ARCH_PPC 1
 // PPC-LINUX:#define _BIG_ENDIAN 1
@@ -6252,12 +6297,12 @@
 // PPC-LINUX:#define __FLT_MIN_EXP__ (-125)
 // PPC-LINUX:#define __FLT_MIN__ 1.17549435e-38F
 // PPC-LINUX:#define __FLT_RADIX__ 2
-// PPC-LINUX:#define __INT16_C_SUFFIX__ {{$}}
+// PPC-LINUX:#define __INT16_C_SUFFIX__
 // PPC-LINUX:#define __INT16_FMTd__ "hd"
 // PPC-LINUX:#define __INT16_FMTi__ "hi"
 // PPC-LINUX:#define __INT16_MAX__ 32767
 // PPC-LINUX:#define __INT16_TYPE__ short
-// PPC-LINUX:#define __INT32_C_SUFFIX__ {{$}}
+// PPC-LINUX:#define __INT32_C_SUFFIX__
 // PPC-LINUX:#define __INT32_FMTd__ "d"
 // PPC-LINUX:#define __INT32_FMTi__ "i"
 // PPC-LINUX:#define __INT32_MAX__ 2147483647
@@ -6267,7 +6312,7 @@
 // PPC-LINUX:#define __INT64_FMTi__ "lli"
 // PPC-LINUX:#define __INT64_MAX__ 9223372036854775807LL
 // PPC-LINUX:#define __INT64_TYPE__ long long int
-// PPC-LINUX:#define __INT8_C_SUFFIX__ {{$}}
+// PPC-LINUX:#define __INT8_C_SUFFIX__
 // PPC-LINUX:#define __INT8_FMTd__ "hhd"
 // PPC-LINUX:#define __INT8_FMTi__ "hhi"
 // PPC-LINUX:#define __INT8_MAX__ 127
@@ -6359,7 +6404,7 @@
 // PPC-LINUX:#define __SIZE_MAX__ 4294967295U
 // PPC-LINUX:#define __SIZE_TYPE__ unsigned int
 // PPC-LINUX:#define __SIZE_WIDTH__ 32
-// PPC-LINUX:#define __UINT16_C_SUFFIX__ {{$}}
+// PPC-LINUX:#define __UINT16_C_SUFFIX__
 // PPC-LINUX:#define __UINT16_MAX__ 65535
 // PPC-LINUX:#define __UINT16_TYPE__ unsigned short
 // PPC-LINUX:#define __UINT32_C_SUFFIX__ U
@@ -6368,7 +6413,7 @@
 // PPC-LINUX:#define __UINT64_C_SUFFIX__ ULL
 // PPC-LINUX:#define __UINT64_MAX__ 18446744073709551615ULL
 // PPC-LINUX:#define __UINT64_TYPE__ long long unsigned int
-// PPC-LINUX:#define __UINT8_C_SUFFIX__ {{$}}
+// PPC-LINUX:#define __UINT8_C_SUFFIX__
 // PPC-LINUX:#define __UINT8_MAX__ 255
 // PPC-LINUX:#define __UINT8_TYPE__ unsigned char
 // PPC-LINUX:#define __UINTMAX_C_SUFFIX__ ULL
@@ -6404,7 +6449,7 @@
 // PPC-LINUX:#define __powerpc__ 1
 // PPC-LINUX:#define __ppc__ 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc-apple-darwin8 < /dev/null | FileCheck -check-prefix PPC-DARWIN %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc-apple-darwin8 < /dev/null | FileCheck -match-full-lines -check-prefix PPC-DARWIN %s
 //
 // PPC-DARWIN:#define _ARCH_PPC 1
 // PPC-DARWIN:#define _BIG_ENDIAN 1
@@ -6443,12 +6488,12 @@
 // PPC-DARWIN:#define __FLT_MIN_EXP__ (-125)
 // PPC-DARWIN:#define __FLT_MIN__ 1.17549435e-38F
 // PPC-DARWIN:#define __FLT_RADIX__ 2
-// PPC-DARWIN:#define __INT16_C_SUFFIX__ {{$}}
+// PPC-DARWIN:#define __INT16_C_SUFFIX__
 // PPC-DARWIN:#define __INT16_FMTd__ "hd"
 // PPC-DARWIN:#define __INT16_FMTi__ "hi"
 // PPC-DARWIN:#define __INT16_MAX__ 32767
 // PPC-DARWIN:#define __INT16_TYPE__ short
-// PPC-DARWIN:#define __INT32_C_SUFFIX__ {{$}}
+// PPC-DARWIN:#define __INT32_C_SUFFIX__
 // PPC-DARWIN:#define __INT32_FMTd__ "d"
 // PPC-DARWIN:#define __INT32_FMTi__ "i"
 // PPC-DARWIN:#define __INT32_MAX__ 2147483647
@@ -6458,7 +6503,7 @@
 // PPC-DARWIN:#define __INT64_FMTi__ "lli"
 // PPC-DARWIN:#define __INT64_MAX__ 9223372036854775807LL
 // PPC-DARWIN:#define __INT64_TYPE__ long long int
-// PPC-DARWIN:#define __INT8_C_SUFFIX__ {{$}}
+// PPC-DARWIN:#define __INT8_C_SUFFIX__
 // PPC-DARWIN:#define __INT8_FMTd__ "hhd"
 // PPC-DARWIN:#define __INT8_FMTi__ "hhi"
 // PPC-DARWIN:#define __INT8_MAX__ 127
@@ -6533,7 +6578,7 @@
 // PPC-DARWIN:#define __PPC__ 1
 // PPC-DARWIN:#define __PTRDIFF_TYPE__ int
 // PPC-DARWIN:#define __PTRDIFF_WIDTH__ 32
-// PPC-DARWIN:#define __REGISTER_PREFIX__ 
+// PPC-DARWIN:#define __REGISTER_PREFIX__
 // PPC-DARWIN:#define __SCHAR_MAX__ 127
 // PPC-DARWIN:#define __SHRT_MAX__ 32767
 // PPC-DARWIN:#define __SIG_ATOMIC_MAX__ 2147483647
@@ -6556,7 +6601,7 @@
 // PPC-DARWIN:#define __STDC_HOSTED__ 0
 // PPC-DARWIN:#define __STDC_VERSION__ 201112L
 // PPC-DARWIN:#define __STDC__ 1
-// PPC-DARWIN:#define __UINT16_C_SUFFIX__ {{$}}
+// PPC-DARWIN:#define __UINT16_C_SUFFIX__
 // PPC-DARWIN:#define __UINT16_MAX__ 65535
 // PPC-DARWIN:#define __UINT16_TYPE__ unsigned short
 // PPC-DARWIN:#define __UINT32_C_SUFFIX__ U
@@ -6565,14 +6610,14 @@
 // PPC-DARWIN:#define __UINT64_C_SUFFIX__ ULL
 // PPC-DARWIN:#define __UINT64_MAX__ 18446744073709551615ULL
 // PPC-DARWIN:#define __UINT64_TYPE__ long long unsigned int
-// PPC-DARWIN:#define __UINT8_C_SUFFIX__ {{$}}
+// PPC-DARWIN:#define __UINT8_C_SUFFIX__
 // PPC-DARWIN:#define __UINT8_MAX__ 255
 // PPC-DARWIN:#define __UINT8_TYPE__ unsigned char
 // PPC-DARWIN:#define __UINTMAX_C_SUFFIX__ ULL
 // PPC-DARWIN:#define __UINTMAX_MAX__ 18446744073709551615ULL
 // PPC-DARWIN:#define __UINTMAX_TYPE__ long long unsigned int
 // PPC-DARWIN:#define __UINTMAX_WIDTH__ 64
-// PPC-DARWIN:#define __UINTPTR_MAX__ 4294967295U
+// PPC-DARWIN:#define __UINTPTR_MAX__ 4294967295UL
 // PPC-DARWIN:#define __UINTPTR_TYPE__ long unsigned int
 // PPC-DARWIN:#define __UINTPTR_WIDTH__ 32
 // PPC-DARWIN:#define __UINT_FAST16_MAX__ 65535
@@ -6600,8 +6645,8 @@
 // PPC-DARWIN:#define __powerpc__ 1
 // PPC-DARWIN:#define __ppc__ 1
 //
-// RUN: %clang_cc1 -x cl -E -dM -ffreestanding -triple=amdgcn < /dev/null | FileCheck -check-prefix AMDGCN --check-prefix AMDGPU %s
-// RUN: %clang_cc1 -x cl -E -dM -ffreestanding -triple=r600 -target-cpu caicos < /dev/null | FileCheck --check-prefix AMDGPU %s
+// RUN: %clang_cc1 -x cl -E -dM -ffreestanding -triple=amdgcn < /dev/null | FileCheck -match-full-lines -check-prefix AMDGCN --check-prefix AMDGPU %s
+// RUN: %clang_cc1 -x cl -E -dM -ffreestanding -triple=r600 -target-cpu caicos < /dev/null | FileCheck -match-full-lines --check-prefix AMDGPU %s
 //
 // AMDGPU:#define cl_khr_byte_addressable_store 1
 // AMDGCN:#define cl_khr_fp64 1
@@ -6610,7 +6655,7 @@
 // AMDGPU:#define cl_khr_local_int32_base_atomics 1
 // AMDGPU:#define cl_khr_local_int32_extended_atomics 1
 
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=s390x-none-none -fno-signed-char < /dev/null | FileCheck -check-prefix S390X %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=s390x-none-none -fno-signed-char < /dev/null | FileCheck -match-full-lines -check-prefix S390X %s
 //
 // S390X:#define __BIGGEST_ALIGNMENT__ 8
 // S390X:#define __CHAR16_TYPE__ unsigned short
@@ -6646,12 +6691,12 @@
 // S390X:#define __FLT_MIN_EXP__ (-125)
 // S390X:#define __FLT_MIN__ 1.17549435e-38F
 // S390X:#define __FLT_RADIX__ 2
-// S390X:#define __INT16_C_SUFFIX__ {{$}}
+// S390X:#define __INT16_C_SUFFIX__
 // S390X:#define __INT16_FMTd__ "hd"
 // S390X:#define __INT16_FMTi__ "hi"
 // S390X:#define __INT16_MAX__ 32767
 // S390X:#define __INT16_TYPE__ short
-// S390X:#define __INT32_C_SUFFIX__ {{$}}
+// S390X:#define __INT32_C_SUFFIX__
 // S390X:#define __INT32_FMTd__ "d"
 // S390X:#define __INT32_FMTi__ "i"
 // S390X:#define __INT32_MAX__ 2147483647
@@ -6661,7 +6706,7 @@
 // S390X:#define __INT64_FMTi__ "li"
 // S390X:#define __INT64_MAX__ 9223372036854775807L
 // S390X:#define __INT64_TYPE__ long int
-// S390X:#define __INT8_C_SUFFIX__ {{$}}
+// S390X:#define __INT8_C_SUFFIX__
 // S390X:#define __INT8_FMTd__ "hhd"
 // S390X:#define __INT8_FMTi__ "hhi"
 // S390X:#define __INT8_MAX__ 127
@@ -6747,7 +6792,7 @@
 // S390X:#define __SIZEOF_WINT_T__ 4
 // S390X:#define __SIZE_TYPE__ long unsigned int
 // S390X:#define __SIZE_WIDTH__ 64
-// S390X:#define __UINT16_C_SUFFIX__ {{$}}
+// S390X:#define __UINT16_C_SUFFIX__
 // S390X:#define __UINT16_MAX__ 65535
 // S390X:#define __UINT16_TYPE__ unsigned short
 // S390X:#define __UINT32_C_SUFFIX__ U
@@ -6756,7 +6801,7 @@
 // S390X:#define __UINT64_C_SUFFIX__ UL
 // S390X:#define __UINT64_MAX__ 18446744073709551615UL
 // S390X:#define __UINT64_TYPE__ long unsigned int
-// S390X:#define __UINT8_C_SUFFIX__ {{$}}
+// S390X:#define __UINT8_C_SUFFIX__
 // S390X:#define __UINT8_MAX__ 255
 // S390X:#define __UINT8_TYPE__ unsigned char
 // S390X:#define __UINTMAX_C_SUFFIX__ UL
@@ -6782,7 +6827,7 @@
 // S390X:#define __UINT_LEAST64_TYPE__ long unsigned int
 // S390X:#define __UINT_LEAST8_MAX__ 255
 // S390X:#define __UINT_LEAST8_TYPE__ unsigned char
-// S390X:#define __USER_LABEL_PREFIX__ _
+// S390X:#define __USER_LABEL_PREFIX__
 // S390X:#define __WCHAR_MAX__ 2147483647
 // S390X:#define __WCHAR_TYPE__ int
 // S390X:#define __WCHAR_WIDTH__ 32
@@ -6791,7 +6836,10 @@
 // S390X:#define __s390__ 1
 // S390X:#define __s390x__ 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=sparc-none-none < /dev/null | FileCheck -check-prefix SPARC %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=sparc-none-none < /dev/null | FileCheck -match-full-lines -check-prefix SPARC -check-prefix SPARC-DEFAULT %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=sparc-rtems-elf < /dev/null | FileCheck -match-full-lines -check-prefix SPARC -check-prefix SPARC-DEFAULT %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=sparc-none-netbsd < /dev/null | FileCheck -match-full-lines -check-prefix SPARC -check-prefix SPARC-NETOPENBSD %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=sparc-none-openbsd < /dev/null | FileCheck -match-full-lines -check-prefix SPARC -check-prefix SPARC-NETOPENBSD %s
 //
 // SPARC-NOT:#define _LP64
 // SPARC:#define __BIGGEST_ALIGNMENT__ 8
@@ -6829,12 +6877,12 @@
 // SPARC:#define __FLT_MIN_EXP__ (-125)
 // SPARC:#define __FLT_MIN__ 1.17549435e-38F
 // SPARC:#define __FLT_RADIX__ 2
-// SPARC:#define __INT16_C_SUFFIX__ {{$}}
+// SPARC:#define __INT16_C_SUFFIX__
 // SPARC:#define __INT16_FMTd__ "hd"
 // SPARC:#define __INT16_FMTi__ "hi"
 // SPARC:#define __INT16_MAX__ 32767
 // SPARC:#define __INT16_TYPE__ short
-// SPARC:#define __INT32_C_SUFFIX__ {{$}}
+// SPARC:#define __INT32_C_SUFFIX__
 // SPARC:#define __INT32_FMTd__ "d"
 // SPARC:#define __INT32_FMTi__ "i"
 // SPARC:#define __INT32_MAX__ 2147483647
@@ -6844,7 +6892,7 @@
 // SPARC:#define __INT64_FMTi__ "lli"
 // SPARC:#define __INT64_MAX__ 9223372036854775807LL
 // SPARC:#define __INT64_TYPE__ long long int
-// SPARC:#define __INT8_C_SUFFIX__ {{$}}
+// SPARC:#define __INT8_C_SUFFIX__
 // SPARC:#define __INT8_FMTd__ "hhd"
 // SPARC:#define __INT8_FMTi__ "hhi"
 // SPARC:#define __INT8_MAX__ 127
@@ -6855,10 +6903,14 @@
 // SPARC:#define __INTMAX_MAX__ 9223372036854775807LL
 // SPARC:#define __INTMAX_TYPE__ long long int
 // SPARC:#define __INTMAX_WIDTH__ 64
-// SPARC:#define __INTPTR_FMTd__ "d"
-// SPARC:#define __INTPTR_FMTi__ "i"
-// SPARC:#define __INTPTR_MAX__ 2147483647
-// SPARC:#define __INTPTR_TYPE__ int
+// SPARC-DEFAULT:#define __INTPTR_FMTd__ "d"
+// SPARC-DEFAULT:#define __INTPTR_FMTi__ "i"
+// SPARC-DEFAULT:#define __INTPTR_MAX__ 2147483647
+// SPARC-DEFAULT:#define __INTPTR_TYPE__ int
+// SPARC-NETOPENBSD:#define __INTPTR_FMTd__ "ld"
+// SPARC-NETOPENBSD:#define __INTPTR_FMTi__ "li"
+// SPARC-NETOPENBSD:#define __INTPTR_MAX__ 2147483647L
+// SPARC-NETOPENBSD:#define __INTPTR_TYPE__ long int
 // SPARC:#define __INTPTR_WIDTH__ 32
 // SPARC:#define __INT_FAST16_FMTd__ "hd"
 // SPARC:#define __INT_FAST16_FMTi__ "hi"
@@ -6910,7 +6962,8 @@
 // SPARC:#define __LONG_MAX__ 2147483647L
 // SPARC-NOT:#define __LP64__
 // SPARC:#define __POINTER_WIDTH__ 32
-// SPARC:#define __PTRDIFF_TYPE__ int
+// SPARC-DEFAULT:#define __PTRDIFF_TYPE__ int
+// SPARC-NETOPENBSD:#define __PTRDIFF_TYPE__ long int
 // SPARC:#define __PTRDIFF_WIDTH__ 32
 // SPARC:#define __REGISTER_PREFIX__
 // SPARC:#define __SCHAR_MAX__ 127
@@ -6929,10 +6982,12 @@
 // SPARC:#define __SIZEOF_SIZE_T__ 4
 // SPARC:#define __SIZEOF_WCHAR_T__ 4
 // SPARC:#define __SIZEOF_WINT_T__ 4
-// SPARC:#define __SIZE_MAX__ 4294967295U
-// SPARC:#define __SIZE_TYPE__ unsigned int
+// SPARC-DEFAULT:#define __SIZE_MAX__ 4294967295U
+// SPARC-DEFAULT:#define __SIZE_TYPE__ unsigned int
+// SPARC-NETOPENBSD:#define __SIZE_MAX__ 4294967295UL
+// SPARC-NETOPENBSD:#define __SIZE_TYPE__ long unsigned int
 // SPARC:#define __SIZE_WIDTH__ 32
-// SPARC:#define __UINT16_C_SUFFIX__ {{$}}
+// SPARC:#define __UINT16_C_SUFFIX__
 // SPARC:#define __UINT16_MAX__ 65535
 // SPARC:#define __UINT16_TYPE__ unsigned short
 // SPARC:#define __UINT32_C_SUFFIX__ U
@@ -6941,15 +6996,17 @@
 // SPARC:#define __UINT64_C_SUFFIX__ ULL
 // SPARC:#define __UINT64_MAX__ 18446744073709551615ULL
 // SPARC:#define __UINT64_TYPE__ long long unsigned int
-// SPARC:#define __UINT8_C_SUFFIX__ {{$}}
+// SPARC:#define __UINT8_C_SUFFIX__
 // SPARC:#define __UINT8_MAX__ 255
 // SPARC:#define __UINT8_TYPE__ unsigned char
 // SPARC:#define __UINTMAX_C_SUFFIX__ ULL
 // SPARC:#define __UINTMAX_MAX__ 18446744073709551615ULL
 // SPARC:#define __UINTMAX_TYPE__ long long unsigned int
 // SPARC:#define __UINTMAX_WIDTH__ 64
-// SPARC:#define __UINTPTR_MAX__ 4294967295U
-// SPARC:#define __UINTPTR_TYPE__ unsigned int
+// SPARC-DEFAULT:#define __UINTPTR_MAX__ 4294967295U
+// SPARC-DEFAULT:#define __UINTPTR_TYPE__ unsigned int
+// SPARC-NETOPENBSD:#define __UINTPTR_MAX__ 4294967295UL
+// SPARC-NETOPENBSD:#define __UINTPTR_TYPE__ long unsigned int
 // SPARC:#define __UINTPTR_WIDTH__ 32
 // SPARC:#define __UINT_FAST16_MAX__ 65535
 // SPARC:#define __UINT_FAST16_TYPE__ unsigned short
@@ -6967,8 +7024,8 @@
 // SPARC:#define __UINT_LEAST64_TYPE__ long long unsigned int
 // SPARC:#define __UINT_LEAST8_MAX__ 255
 // SPARC:#define __UINT_LEAST8_TYPE__ unsigned char
-// SPARC:#define __USER_LABEL_PREFIX__ _
-// SPARC:#define __VERSION__ "4.2.1 Compatible
+// SPARC:#define __USER_LABEL_PREFIX__
+// SPARC:#define __VERSION__ "4.2.1 Compatible{{.*}}
 // SPARC:#define __WCHAR_MAX__ 2147483647
 // SPARC:#define __WCHAR_TYPE__ int
 // SPARC:#define __WCHAR_WIDTH__ 32
@@ -6978,18 +7035,8 @@
 // SPARC:#define __sparc__ 1
 // SPARC:#define __sparcv8 1
 // SPARC:#define sparc 1
-// 
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=sparc-none-netbsd < /dev/null | FileCheck -check-prefix SPARC-NETOPENBSD %s
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=sparc-none-openbsd < /dev/null | FileCheck -check-prefix SPARC-NETOPENBSD %s
-// SPARC-NETOPENBSD:#define __INTPTR_FMTd__ "ld"
-// SPARC-NETOPENBSD:#define __INTPTR_FMTi__ "li"
-// SPARC-NETOPENBSD:#define __INTPTR_MAX__ 2147483647L
-// SPARC-NETOPENBSD:#define __INTPTR_TYPE__ long int
-// SPARC-NETOPENBSD:#define __PTRDIFF_TYPE__ long int
-// SPARC-NETOPENBSD:#define __SIZE_TYPE__ long unsigned int
-// SPARC-NETOPENBSD:#define __UINTPTR_TYPE__ long unsigned int
 
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=tce-none-none < /dev/null | FileCheck -check-prefix TCE %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=tce-none-none < /dev/null | FileCheck -match-full-lines -check-prefix TCE %s
 //
 // TCE-NOT:#define _LP64
 // TCE:#define __BIGGEST_ALIGNMENT__ 4
@@ -7027,17 +7074,17 @@
 // TCE:#define __FLT_MIN_EXP__ (-125)
 // TCE:#define __FLT_MIN__ 1.17549435e-38F
 // TCE:#define __FLT_RADIX__ 2
-// TCE:#define __INT16_C_SUFFIX__ {{$}}
+// TCE:#define __INT16_C_SUFFIX__
 // TCE:#define __INT16_FMTd__ "hd"
 // TCE:#define __INT16_FMTi__ "hi"
 // TCE:#define __INT16_MAX__ 32767
 // TCE:#define __INT16_TYPE__ short
-// TCE:#define __INT32_C_SUFFIX__ {{$}}
+// TCE:#define __INT32_C_SUFFIX__
 // TCE:#define __INT32_FMTd__ "d"
 // TCE:#define __INT32_FMTi__ "i"
 // TCE:#define __INT32_MAX__ 2147483647
 // TCE:#define __INT32_TYPE__ int
-// TCE:#define __INT8_C_SUFFIX__ {{$}}
+// TCE:#define __INT8_C_SUFFIX__
 // TCE:#define __INT8_FMTd__ "hhd"
 // TCE:#define __INT8_FMTi__ "hhi"
 // TCE:#define __INT8_MAX__ 127
@@ -7118,13 +7165,13 @@
 // TCE:#define __SIZE_WIDTH__ 32
 // TCE:#define __TCE_V1__ 1
 // TCE:#define __TCE__ 1
-// TCE:#define __UINT16_C_SUFFIX__ {{$}}
+// TCE:#define __UINT16_C_SUFFIX__
 // TCE:#define __UINT16_MAX__ 65535
 // TCE:#define __UINT16_TYPE__ unsigned short
 // TCE:#define __UINT32_C_SUFFIX__ U
 // TCE:#define __UINT32_MAX__ 4294967295U
 // TCE:#define __UINT32_TYPE__ unsigned int
-// TCE:#define __UINT8_C_SUFFIX__ {{$}}
+// TCE:#define __UINT8_C_SUFFIX__
 // TCE:#define __UINT8_MAX__ 255
 // TCE:#define __UINT8_TYPE__ unsigned char
 // TCE:#define __UINTMAX_C_SUFFIX__ UL
@@ -7146,7 +7193,7 @@
 // TCE:#define __UINT_LEAST32_TYPE__ unsigned int
 // TCE:#define __UINT_LEAST8_MAX__ 255
 // TCE:#define __UINT_LEAST8_TYPE__ unsigned char
-// TCE:#define __USER_LABEL_PREFIX__ _
+// TCE:#define __USER_LABEL_PREFIX__
 // TCE:#define __WCHAR_MAX__ 2147483647
 // TCE:#define __WCHAR_TYPE__ int
 // TCE:#define __WCHAR_WIDTH__ 32
@@ -7156,7 +7203,7 @@
 // TCE:#define __tce__ 1
 // TCE:#define tce 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=x86_64-none-none < /dev/null | FileCheck -check-prefix X86_64 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=x86_64-none-none < /dev/null | FileCheck -match-full-lines -check-prefix X86_64 %s
 //
 // X86_64:#define _LP64 1
 // X86_64-NOT:#define _LP32 1
@@ -7194,12 +7241,12 @@
 // X86_64:#define __FLT_MIN_EXP__ (-125)
 // X86_64:#define __FLT_MIN__ 1.17549435e-38F
 // X86_64:#define __FLT_RADIX__ 2
-// X86_64:#define __INT16_C_SUFFIX__ {{$}}
+// X86_64:#define __INT16_C_SUFFIX__
 // X86_64:#define __INT16_FMTd__ "hd"
 // X86_64:#define __INT16_FMTi__ "hi"
 // X86_64:#define __INT16_MAX__ 32767
 // X86_64:#define __INT16_TYPE__ short
-// X86_64:#define __INT32_C_SUFFIX__ {{$}}
+// X86_64:#define __INT32_C_SUFFIX__
 // X86_64:#define __INT32_FMTd__ "d"
 // X86_64:#define __INT32_FMTi__ "i"
 // X86_64:#define __INT32_MAX__ 2147483647
@@ -7209,7 +7256,7 @@
 // X86_64:#define __INT64_FMTi__ "li"
 // X86_64:#define __INT64_MAX__ 9223372036854775807L
 // X86_64:#define __INT64_TYPE__ long int
-// X86_64:#define __INT8_C_SUFFIX__ {{$}}
+// X86_64:#define __INT8_C_SUFFIX__
 // X86_64:#define __INT8_FMTd__ "hhd"
 // X86_64:#define __INT8_FMTi__ "hhi"
 // X86_64:#define __INT8_MAX__ 127
@@ -7281,7 +7328,7 @@
 // X86_64:#define __POINTER_WIDTH__ 64
 // X86_64:#define __PTRDIFF_TYPE__ long int
 // X86_64:#define __PTRDIFF_WIDTH__ 64
-// X86_64:#define __REGISTER_PREFIX__ 
+// X86_64:#define __REGISTER_PREFIX__
 // X86_64:#define __SCHAR_MAX__ 127
 // X86_64:#define __SHRT_MAX__ 32767
 // X86_64:#define __SIG_ATOMIC_MAX__ 2147483647
@@ -7305,7 +7352,7 @@
 // X86_64:#define __SSE2__ 1
 // X86_64:#define __SSE_MATH__ 1
 // X86_64:#define __SSE__ 1
-// X86_64:#define __UINT16_C_SUFFIX__ {{$}}
+// X86_64:#define __UINT16_C_SUFFIX__
 // X86_64:#define __UINT16_MAX__ 65535
 // X86_64:#define __UINT16_TYPE__ unsigned short
 // X86_64:#define __UINT32_C_SUFFIX__ U
@@ -7314,7 +7361,7 @@
 // X86_64:#define __UINT64_C_SUFFIX__ UL
 // X86_64:#define __UINT64_MAX__ 18446744073709551615UL
 // X86_64:#define __UINT64_TYPE__ long unsigned int
-// X86_64:#define __UINT8_C_SUFFIX__ {{$}}
+// X86_64:#define __UINT8_C_SUFFIX__
 // X86_64:#define __UINT8_MAX__ 255
 // X86_64:#define __UINT8_TYPE__ unsigned char
 // X86_64:#define __UINTMAX_C_SUFFIX__ UL
@@ -7340,7 +7387,7 @@
 // X86_64:#define __UINT_LEAST64_TYPE__ long unsigned int
 // X86_64:#define __UINT_LEAST8_MAX__ 255
 // X86_64:#define __UINT_LEAST8_TYPE__ unsigned char
-// X86_64:#define __USER_LABEL_PREFIX__ _
+// X86_64:#define __USER_LABEL_PREFIX__
 // X86_64:#define __WCHAR_MAX__ 2147483647
 // X86_64:#define __WCHAR_TYPE__ int
 // X86_64:#define __WCHAR_WIDTH__ 32
@@ -7351,14 +7398,14 @@
 // X86_64:#define __x86_64 1
 // X86_64:#define __x86_64__ 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=x86_64h-none-none < /dev/null | FileCheck -check-prefix X86_64H %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=x86_64h-none-none < /dev/null | FileCheck -match-full-lines -check-prefix X86_64H %s
 //
 // X86_64H:#define __x86_64 1
 // X86_64H:#define __x86_64__ 1
 // X86_64H:#define __x86_64h 1
 // X86_64H:#define __x86_64h__ 1
 
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=x86_64-none-none-gnux32 < /dev/null | FileCheck -check-prefix X32 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=x86_64-none-none-gnux32 < /dev/null | FileCheck -match-full-lines -check-prefix X32 %s
 //
 // X32:#define _ILP32 1
 // X32-NOT:#define _LP64 1
@@ -7398,22 +7445,22 @@
 // X32:#define __FLT_RADIX__ 2
 // X32:#define __ILP32__ 1
 // X32-NOT:#define __LP64__ 1
-// X32:#define __INT16_C_SUFFIX__ {{$}}
+// X32:#define __INT16_C_SUFFIX__
 // X32:#define __INT16_FMTd__ "hd"
 // X32:#define __INT16_FMTi__ "hi"
 // X32:#define __INT16_MAX__ 32767
 // X32:#define __INT16_TYPE__ short
-// X32:#define __INT32_C_SUFFIX__ {{$}}
+// X32:#define __INT32_C_SUFFIX__
 // X32:#define __INT32_FMTd__ "d"
 // X32:#define __INT32_FMTi__ "i"
 // X32:#define __INT32_MAX__ 2147483647
 // X32:#define __INT32_TYPE__ int
-// X32:#define __INT64_C_SUFFIX__ L
+// X32:#define __INT64_C_SUFFIX__ LL
 // X32:#define __INT64_FMTd__ "lld"
 // X32:#define __INT64_FMTi__ "lli"
-// X32:#define __INT64_MAX__ 9223372036854775807L
+// X32:#define __INT64_MAX__ 9223372036854775807LL
 // X32:#define __INT64_TYPE__ long long int
-// X32:#define __INT8_C_SUFFIX__ {{$}}
+// X32:#define __INT8_C_SUFFIX__
 // X32:#define __INT8_FMTd__ "hhd"
 // X32:#define __INT8_FMTi__ "hhi"
 // X32:#define __INT8_MAX__ 127
@@ -7421,7 +7468,7 @@
 // X32:#define __INTMAX_C_SUFFIX__ LL
 // X32:#define __INTMAX_FMTd__ "lld"
 // X32:#define __INTMAX_FMTi__ "lli"
-// X32:#define __INTMAX_MAX__ 9223372036854775807L
+// X32:#define __INTMAX_MAX__ 9223372036854775807LL
 // X32:#define __INTMAX_TYPE__ long long int
 // X32:#define __INTMAX_WIDTH__ 64
 // X32:#define __INTPTR_FMTd__ "d"
@@ -7439,7 +7486,7 @@
 // X32:#define __INT_FAST32_TYPE__ int
 // X32:#define __INT_FAST64_FMTd__ "lld"
 // X32:#define __INT_FAST64_FMTi__ "lli"
-// X32:#define __INT_FAST64_MAX__ 9223372036854775807L
+// X32:#define __INT_FAST64_MAX__ 9223372036854775807LL
 // X32:#define __INT_FAST64_TYPE__ long long int
 // X32:#define __INT_FAST8_FMTd__ "hhd"
 // X32:#define __INT_FAST8_FMTi__ "hhi"
@@ -7455,7 +7502,7 @@
 // X32:#define __INT_LEAST32_TYPE__ int
 // X32:#define __INT_LEAST64_FMTd__ "lld"
 // X32:#define __INT_LEAST64_FMTi__ "lli"
-// X32:#define __INT_LEAST64_MAX__ 9223372036854775807L
+// X32:#define __INT_LEAST64_MAX__ 9223372036854775807LL
 // X32:#define __INT_LEAST64_TYPE__ long long int
 // X32:#define __INT_LEAST8_FMTd__ "hhd"
 // X32:#define __INT_LEAST8_FMTi__ "hhi"
@@ -7483,7 +7530,7 @@
 // X32:#define __POINTER_WIDTH__ 32
 // X32:#define __PTRDIFF_TYPE__ int
 // X32:#define __PTRDIFF_WIDTH__ 32
-// X32:#define __REGISTER_PREFIX__ 
+// X32:#define __REGISTER_PREFIX__
 // X32:#define __SCHAR_MAX__ 127
 // X32:#define __SHRT_MAX__ 32767
 // X32:#define __SIG_ATOMIC_MAX__ 2147483647
@@ -7507,16 +7554,16 @@
 // X32:#define __SSE2__ 1
 // X32:#define __SSE_MATH__ 1
 // X32:#define __SSE__ 1
-// X32:#define __UINT16_C_SUFFIX__ {{$}}
+// X32:#define __UINT16_C_SUFFIX__
 // X32:#define __UINT16_MAX__ 65535
 // X32:#define __UINT16_TYPE__ unsigned short
 // X32:#define __UINT32_C_SUFFIX__ U
 // X32:#define __UINT32_MAX__ 4294967295U
 // X32:#define __UINT32_TYPE__ unsigned int
-// X32:#define __UINT64_C_SUFFIX__ UL
+// X32:#define __UINT64_C_SUFFIX__ ULL
 // X32:#define __UINT64_MAX__ 18446744073709551615ULL
 // X32:#define __UINT64_TYPE__ long long unsigned int
-// X32:#define __UINT8_C_SUFFIX__ {{$}}
+// X32:#define __UINT8_C_SUFFIX__
 // X32:#define __UINT8_MAX__ 255
 // X32:#define __UINT8_TYPE__ unsigned char
 // X32:#define __UINTMAX_C_SUFFIX__ ULL
@@ -7542,7 +7589,7 @@
 // X32:#define __UINT_LEAST64_TYPE__ long long unsigned int
 // X32:#define __UINT_LEAST8_MAX__ 255
 // X32:#define __UINT_LEAST8_TYPE__ unsigned char
-// X32:#define __USER_LABEL_PREFIX__ _
+// X32:#define __USER_LABEL_PREFIX__
 // X32:#define __WCHAR_MAX__ 2147483647
 // X32:#define __WCHAR_TYPE__ int
 // X32:#define __WCHAR_WIDTH__ 32
@@ -7553,7 +7600,7 @@
 // X32:#define __x86_64 1
 // X32:#define __x86_64__ 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=x86_64-unknown-cloudabi < /dev/null | FileCheck -check-prefix X86_64-CLOUDABI %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=x86_64-unknown-cloudabi < /dev/null | FileCheck -match-full-lines -check-prefix X86_64-CLOUDABI %s
 //
 // X86_64-CLOUDABI:#define _LP64 1
 // X86_64-CLOUDABI:#define __ATOMIC_ACQUIRE 2
@@ -7618,13 +7665,12 @@
 // X86_64-CLOUDABI:#define __GNUC_STDC_INLINE__ 1
 // X86_64-CLOUDABI:#define __GNUC__ 4
 // X86_64-CLOUDABI:#define __GXX_ABI_VERSION 1002
-// X86_64-CLOUDABI:#define __GXX_RTTI 1
-// X86_64-CLOUDABI:#define __INT16_C_SUFFIX__ 
+// X86_64-CLOUDABI:#define __INT16_C_SUFFIX__
 // X86_64-CLOUDABI:#define __INT16_FMTd__ "hd"
 // X86_64-CLOUDABI:#define __INT16_FMTi__ "hi"
 // X86_64-CLOUDABI:#define __INT16_MAX__ 32767
 // X86_64-CLOUDABI:#define __INT16_TYPE__ short
-// X86_64-CLOUDABI:#define __INT32_C_SUFFIX__ 
+// X86_64-CLOUDABI:#define __INT32_C_SUFFIX__
 // X86_64-CLOUDABI:#define __INT32_FMTd__ "d"
 // X86_64-CLOUDABI:#define __INT32_FMTi__ "i"
 // X86_64-CLOUDABI:#define __INT32_MAX__ 2147483647
@@ -7634,7 +7680,7 @@
 // X86_64-CLOUDABI:#define __INT64_FMTi__ "li"
 // X86_64-CLOUDABI:#define __INT64_MAX__ 9223372036854775807L
 // X86_64-CLOUDABI:#define __INT64_TYPE__ long int
-// X86_64-CLOUDABI:#define __INT8_C_SUFFIX__ 
+// X86_64-CLOUDABI:#define __INT8_C_SUFFIX__
 // X86_64-CLOUDABI:#define __INT8_FMTd__ "hhd"
 // X86_64-CLOUDABI:#define __INT8_FMTi__ "hhi"
 // X86_64-CLOUDABI:#define __INT8_MAX__ 127
@@ -7714,7 +7760,7 @@
 // X86_64-CLOUDABI:#define __PTRDIFF_MAX__ 9223372036854775807L
 // X86_64-CLOUDABI:#define __PTRDIFF_TYPE__ long int
 // X86_64-CLOUDABI:#define __PTRDIFF_WIDTH__ 64
-// X86_64-CLOUDABI:#define __REGISTER_PREFIX__ 
+// X86_64-CLOUDABI:#define __REGISTER_PREFIX__
 // X86_64-CLOUDABI:#define __SCHAR_MAX__ 127
 // X86_64-CLOUDABI:#define __SHRT_MAX__ 32767
 // X86_64-CLOUDABI:#define __SIG_ATOMIC_MAX__ 2147483647
@@ -7749,7 +7795,7 @@
 // X86_64-CLOUDABI:#define __STDC_UTF_32__ 1
 // X86_64-CLOUDABI:#define __STDC_VERSION__ 201112L
 // X86_64-CLOUDABI:#define __STDC__ 1
-// X86_64-CLOUDABI:#define __UINT16_C_SUFFIX__ 
+// X86_64-CLOUDABI:#define __UINT16_C_SUFFIX__
 // X86_64-CLOUDABI:#define __UINT16_FMTX__ "hX"
 // X86_64-CLOUDABI:#define __UINT16_FMTo__ "ho"
 // X86_64-CLOUDABI:#define __UINT16_FMTu__ "hu"
@@ -7770,7 +7816,7 @@
 // X86_64-CLOUDABI:#define __UINT64_FMTx__ "lx"
 // X86_64-CLOUDABI:#define __UINT64_MAX__ 18446744073709551615UL
 // X86_64-CLOUDABI:#define __UINT64_TYPE__ long unsigned int
-// X86_64-CLOUDABI:#define __UINT8_C_SUFFIX__ 
+// X86_64-CLOUDABI:#define __UINT8_C_SUFFIX__
 // X86_64-CLOUDABI:#define __UINT8_FMTX__ "hhX"
 // X86_64-CLOUDABI:#define __UINT8_FMTo__ "hho"
 // X86_64-CLOUDABI:#define __UINT8_FMTu__ "hhu"
@@ -7840,8 +7886,8 @@
 // X86_64-CLOUDABI:#define __UINT_LEAST8_FMTx__ "hhx"
 // X86_64-CLOUDABI:#define __UINT_LEAST8_MAX__ 255
 // X86_64-CLOUDABI:#define __UINT_LEAST8_TYPE__ unsigned char
-// X86_64-CLOUDABI:#define __USER_LABEL_PREFIX__ 
-// X86_64-CLOUDABI:#define __VERSION__ "4.2.1 Compatible
+// X86_64-CLOUDABI:#define __USER_LABEL_PREFIX__
+// X86_64-CLOUDABI:#define __VERSION__ "4.2.1 Compatible{{.*}}
 // X86_64-CLOUDABI:#define __WCHAR_MAX__ 2147483647
 // X86_64-CLOUDABI:#define __WCHAR_TYPE__ int
 // X86_64-CLOUDABI:#define __WCHAR_WIDTH__ 32
@@ -7850,15 +7896,15 @@
 // X86_64-CLOUDABI:#define __amd64 1
 // X86_64-CLOUDABI:#define __amd64__ 1
 // X86_64-CLOUDABI:#define __clang__ 1
-// X86_64-CLOUDABI:#define __clang_major__ 
-// X86_64-CLOUDABI:#define __clang_minor__ 
-// X86_64-CLOUDABI:#define __clang_patchlevel__ 
-// X86_64-CLOUDABI:#define __clang_version__ 
+// X86_64-CLOUDABI:#define __clang_major__ {{.*}}
+// X86_64-CLOUDABI:#define __clang_minor__ {{.*}}
+// X86_64-CLOUDABI:#define __clang_patchlevel__ {{.*}}
+// X86_64-CLOUDABI:#define __clang_version__ {{.*}}
 // X86_64-CLOUDABI:#define __llvm__ 1
 // X86_64-CLOUDABI:#define __x86_64 1
 // X86_64-CLOUDABI:#define __x86_64__ 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=x86_64-pc-linux-gnu < /dev/null | FileCheck -check-prefix X86_64-LINUX %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=x86_64-pc-linux-gnu < /dev/null | FileCheck -match-full-lines -check-prefix X86_64-LINUX %s
 //
 // X86_64-LINUX:#define _LP64 1
 // X86_64-LINUX:#define __BIGGEST_ALIGNMENT__ 16
@@ -7895,12 +7941,12 @@
 // X86_64-LINUX:#define __FLT_MIN_EXP__ (-125)
 // X86_64-LINUX:#define __FLT_MIN__ 1.17549435e-38F
 // X86_64-LINUX:#define __FLT_RADIX__ 2
-// X86_64-LINUX:#define __INT16_C_SUFFIX__ {{$}}
+// X86_64-LINUX:#define __INT16_C_SUFFIX__
 // X86_64-LINUX:#define __INT16_FMTd__ "hd"
 // X86_64-LINUX:#define __INT16_FMTi__ "hi"
 // X86_64-LINUX:#define __INT16_MAX__ 32767
 // X86_64-LINUX:#define __INT16_TYPE__ short
-// X86_64-LINUX:#define __INT32_C_SUFFIX__ {{$}}
+// X86_64-LINUX:#define __INT32_C_SUFFIX__
 // X86_64-LINUX:#define __INT32_FMTd__ "d"
 // X86_64-LINUX:#define __INT32_FMTi__ "i"
 // X86_64-LINUX:#define __INT32_MAX__ 2147483647
@@ -7910,7 +7956,7 @@
 // X86_64-LINUX:#define __INT64_FMTi__ "li"
 // X86_64-LINUX:#define __INT64_MAX__ 9223372036854775807L
 // X86_64-LINUX:#define __INT64_TYPE__ long int
-// X86_64-LINUX:#define __INT8_C_SUFFIX__ {{$}}
+// X86_64-LINUX:#define __INT8_C_SUFFIX__
 // X86_64-LINUX:#define __INT8_FMTd__ "hhd"
 // X86_64-LINUX:#define __INT8_FMTi__ "hhi"
 // X86_64-LINUX:#define __INT8_MAX__ 127
@@ -7981,7 +8027,7 @@
 // X86_64-LINUX:#define __POINTER_WIDTH__ 64
 // X86_64-LINUX:#define __PTRDIFF_TYPE__ long int
 // X86_64-LINUX:#define __PTRDIFF_WIDTH__ 64
-// X86_64-LINUX:#define __REGISTER_PREFIX__ 
+// X86_64-LINUX:#define __REGISTER_PREFIX__
 // X86_64-LINUX:#define __SCHAR_MAX__ 127
 // X86_64-LINUX:#define __SHRT_MAX__ 32767
 // X86_64-LINUX:#define __SIG_ATOMIC_MAX__ 2147483647
@@ -8005,7 +8051,7 @@
 // X86_64-LINUX:#define __SSE2__ 1
 // X86_64-LINUX:#define __SSE_MATH__ 1
 // X86_64-LINUX:#define __SSE__ 1
-// X86_64-LINUX:#define __UINT16_C_SUFFIX__ {{$}}
+// X86_64-LINUX:#define __UINT16_C_SUFFIX__
 // X86_64-LINUX:#define __UINT16_MAX__ 65535
 // X86_64-LINUX:#define __UINT16_TYPE__ unsigned short
 // X86_64-LINUX:#define __UINT32_C_SUFFIX__ U
@@ -8014,7 +8060,7 @@
 // X86_64-LINUX:#define __UINT64_C_SUFFIX__ UL
 // X86_64-LINUX:#define __UINT64_MAX__ 18446744073709551615UL
 // X86_64-LINUX:#define __UINT64_TYPE__ long unsigned int
-// X86_64-LINUX:#define __UINT8_C_SUFFIX__ {{$}}
+// X86_64-LINUX:#define __UINT8_C_SUFFIX__
 // X86_64-LINUX:#define __UINT8_MAX__ 255
 // X86_64-LINUX:#define __UINT8_TYPE__ unsigned char
 // X86_64-LINUX:#define __UINTMAX_C_SUFFIX__ UL
@@ -8051,7 +8097,7 @@
 // X86_64-LINUX:#define __x86_64 1
 // X86_64-LINUX:#define __x86_64__ 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=x86_64-unknown-freebsd9.1 < /dev/null | FileCheck -check-prefix X86_64-FREEBSD %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=x86_64-unknown-freebsd9.1 < /dev/null | FileCheck -match-full-lines -check-prefix X86_64-FREEBSD %s
 //
 // X86_64-FREEBSD:#define __DBL_DECIMAL_DIG__ 17
 // X86_64-FREEBSD:#define __FLT_DECIMAL_DIG__ 9
@@ -8060,7 +8106,7 @@
 // X86_64-FREEBSD:#define __LDBL_DECIMAL_DIG__ 21
 // X86_64-FREEBSD:#define __STDC_MB_MIGHT_NEQ_WC__ 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=x86_64-netbsd < /dev/null | FileCheck -check-prefix X86_64-NETBSD %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=x86_64-netbsd < /dev/null | FileCheck -match-full-lines -check-prefix X86_64-NETBSD %s
 //
 // X86_64-NETBSD:#define _LP64 1
 // X86_64-NETBSD:#define __BIGGEST_ALIGNMENT__ 16
@@ -8097,12 +8143,12 @@
 // X86_64-NETBSD:#define __FLT_MIN_EXP__ (-125)
 // X86_64-NETBSD:#define __FLT_MIN__ 1.17549435e-38F
 // X86_64-NETBSD:#define __FLT_RADIX__ 2
-// X86_64-NETBSD:#define __INT16_C_SUFFIX__ {{$}}
+// X86_64-NETBSD:#define __INT16_C_SUFFIX__
 // X86_64-NETBSD:#define __INT16_FMTd__ "hd"
 // X86_64-NETBSD:#define __INT16_FMTi__ "hi"
 // X86_64-NETBSD:#define __INT16_MAX__ 32767
 // X86_64-NETBSD:#define __INT16_TYPE__ short
-// X86_64-NETBSD:#define __INT32_C_SUFFIX__ {{$}}
+// X86_64-NETBSD:#define __INT32_C_SUFFIX__
 // X86_64-NETBSD:#define __INT32_FMTd__ "d"
 // X86_64-NETBSD:#define __INT32_FMTi__ "i"
 // X86_64-NETBSD:#define __INT32_MAX__ 2147483647
@@ -8112,7 +8158,7 @@
 // X86_64-NETBSD:#define __INT64_FMTi__ "li"
 // X86_64-NETBSD:#define __INT64_MAX__ 9223372036854775807L
 // X86_64-NETBSD:#define __INT64_TYPE__ long int
-// X86_64-NETBSD:#define __INT8_C_SUFFIX__ {{$}}
+// X86_64-NETBSD:#define __INT8_C_SUFFIX__
 // X86_64-NETBSD:#define __INT8_FMTd__ "hhd"
 // X86_64-NETBSD:#define __INT8_FMTi__ "hhi"
 // X86_64-NETBSD:#define __INT8_MAX__ 127
@@ -8183,7 +8229,7 @@
 // X86_64-NETBSD:#define __POINTER_WIDTH__ 64
 // X86_64-NETBSD:#define __PTRDIFF_TYPE__ long int
 // X86_64-NETBSD:#define __PTRDIFF_WIDTH__ 64
-// X86_64-NETBSD:#define __REGISTER_PREFIX__ 
+// X86_64-NETBSD:#define __REGISTER_PREFIX__
 // X86_64-NETBSD:#define __SCHAR_MAX__ 127
 // X86_64-NETBSD:#define __SHRT_MAX__ 32767
 // X86_64-NETBSD:#define __SIG_ATOMIC_MAX__ 2147483647
@@ -8207,7 +8253,7 @@
 // X86_64-NETBSD:#define __SSE2__ 1
 // X86_64-NETBSD:#define __SSE_MATH__ 1
 // X86_64-NETBSD:#define __SSE__ 1
-// X86_64-NETBSD:#define __UINT16_C_SUFFIX__ {{$}}
+// X86_64-NETBSD:#define __UINT16_C_SUFFIX__
 // X86_64-NETBSD:#define __UINT16_MAX__ 65535
 // X86_64-NETBSD:#define __UINT16_TYPE__ unsigned short
 // X86_64-NETBSD:#define __UINT32_C_SUFFIX__ U
@@ -8216,7 +8262,7 @@
 // X86_64-NETBSD:#define __UINT64_C_SUFFIX__ UL
 // X86_64-NETBSD:#define __UINT64_MAX__ 18446744073709551615UL
 // X86_64-NETBSD:#define __UINT64_TYPE__ long unsigned int
-// X86_64-NETBSD:#define __UINT8_C_SUFFIX__ {{$}}
+// X86_64-NETBSD:#define __UINT8_C_SUFFIX__
 // X86_64-NETBSD:#define __UINT8_MAX__ 255
 // X86_64-NETBSD:#define __UINT8_TYPE__ unsigned char
 // X86_64-NETBSD:#define __UINTMAX_C_SUFFIX__ UL
@@ -8253,7 +8299,7 @@
 // X86_64-NETBSD:#define __x86_64 1
 // X86_64-NETBSD:#define __x86_64__ 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=x86_64-scei-ps4 < /dev/null | FileCheck -check-prefix PS4 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=x86_64-scei-ps4 < /dev/null | FileCheck -match-full-lines -check-prefix PS4 %s
 //
 // PS4:#define _LP64 1
 // PS4:#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__
@@ -8323,12 +8369,12 @@
 // PS4:#define __LP64__ 1
 // PS4:#define __MMX__ 1
 // PS4:#define __NO_MATH_INLINES 1
+// PS4:#define __ORBIS__ 1
 // PS4:#define __POINTER_WIDTH__ 64
-// PS4:#define __PS4__ 1
 // PS4:#define __PTRDIFF_MAX__ 9223372036854775807L
 // PS4:#define __PTRDIFF_TYPE__ long int
 // PS4:#define __PTRDIFF_WIDTH__ 64
-// PS4:#define __REGISTER_PREFIX__ 
+// PS4:#define __REGISTER_PREFIX__
 // PS4:#define __SCHAR_MAX__ 127
 // PS4:#define __SHRT_MAX__ 32767
 // PS4:#define __SIG_ATOMIC_MAX__ 2147483647
@@ -8351,6 +8397,7 @@
 // PS4:#define __SSE2__ 1
 // PS4:#define __SSE_MATH__ 1
 // PS4:#define __SSE__ 1
+// PS4:#define __STDC_VERSION__ 199901L
 // PS4:#define __UINTMAX_TYPE__ long unsigned int
 // PS4:#define __USER_LABEL_PREFIX__
 // PS4:#define __WCHAR_MAX__ 65535
@@ -8366,11 +8413,11 @@
 // PS4:#define __x86_64 1
 // PS4:#define __x86_64__ 1
 //
-// RUN: %clang_cc1 -E -dM -triple=x86_64-pc-mingw32 < /dev/null | FileCheck -check-prefix X86-64-DECLSPEC %s
-// RUN: %clang_cc1 -E -dM -fms-extensions -triple=x86_64-unknown-mingw32 < /dev/null | FileCheck -check-prefix X86-64-DECLSPEC %s
-// X86-64-DECLSPEC: #define __declspec
+// RUN: %clang_cc1 -E -dM -triple=x86_64-pc-mingw32 < /dev/null | FileCheck -match-full-lines -check-prefix X86-64-DECLSPEC %s
+// RUN: %clang_cc1 -E -dM -fms-extensions -triple=x86_64-unknown-mingw32 < /dev/null | FileCheck -match-full-lines -check-prefix X86-64-DECLSPEC %s
+// X86-64-DECLSPEC: #define __declspec{{.*}}
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=sparc64-none-none < /dev/null | FileCheck -check-prefix SPARCV9 %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=sparc64-none-none < /dev/null | FileCheck -match-full-lines -check-prefix SPARCV9 %s
 // SPARCV9:#define __INT64_TYPE__ long int
 // SPARCV9:#define __INTMAX_C_SUFFIX__ L
 // SPARCV9:#define __INTMAX_TYPE__ long int
@@ -8381,668 +8428,676 @@
 // SPARCV9:#define __SIZEOF_POINTER__ 8
 // SPARCV9:#define __UINTPTR_TYPE__ long unsigned int
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=sparc64-none-openbsd < /dev/null | FileCheck -check-prefix SPARC64-OBSD %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=sparc64-none-openbsd < /dev/null | FileCheck -match-full-lines -check-prefix SPARC64-OBSD %s
 // SPARC64-OBSD:#define __INT64_TYPE__ long long int
 // SPARC64-OBSD:#define __INTMAX_C_SUFFIX__ LL
 // SPARC64-OBSD:#define __INTMAX_TYPE__ long long int
 // SPARC64-OBSD:#define __UINTMAX_C_SUFFIX__ ULL
 // SPARC64-OBSD:#define __UINTMAX_TYPE__ long long unsigned int
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=x86_64-pc-kfreebsd-gnu < /dev/null | FileCheck -check-prefix KFREEBSD-DEFINE %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=x86_64-pc-kfreebsd-gnu < /dev/null | FileCheck -match-full-lines -check-prefix KFREEBSD-DEFINE %s
 // KFREEBSD-DEFINE:#define __FreeBSD_kernel__ 1
 // KFREEBSD-DEFINE:#define __GLIBC__ 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=i686-pc-kfreebsd-gnu < /dev/null | FileCheck -check-prefix KFREEBSDI686-DEFINE %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=i686-pc-kfreebsd-gnu < /dev/null | FileCheck -match-full-lines -check-prefix KFREEBSDI686-DEFINE %s
 // KFREEBSDI686-DEFINE:#define __FreeBSD_kernel__ 1
 // KFREEBSDI686-DEFINE:#define __GLIBC__ 1
 //
-// RUN: %clang_cc1 -x c++ -triple i686-pc-linux-gnu -fobjc-runtime=gcc -E -dM < /dev/null | FileCheck -check-prefix GNUSOURCE %s
+// RUN: %clang_cc1 -x c++ -triple i686-pc-linux-gnu -fobjc-runtime=gcc -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix GNUSOURCE %s
 // GNUSOURCE:#define _GNU_SOURCE 1
 //
-// RUN: %clang_cc1 -x c++ -std=c++98 -fno-rtti -E -dM < /dev/null | FileCheck -check-prefix NORTTI %s
-// NORTTI: __GXX_ABI_VERSION
+// RUN: %clang_cc1 -x c++ -std=c++98 -fno-rtti -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix NORTTI %s
+// NORTTI: #define __GXX_ABI_VERSION {{.*}}
 // NORTTI-NOT:#define __GXX_RTTI
-// NORTTI: __STDC__
+// NORTTI:#define __STDC__ 1
+//
+// RUN: %clang_cc1 -triple arm-linux-androideabi -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix ANDROID %s
+// ANDROID:#define __ANDROID__ 1
 //
-// RUN: %clang_cc1 -triple arm-linux-androideabi -E -dM < /dev/null | FileCheck -check-prefix ANDROID %s
-// ANDROID: __ANDROID__ 1
+// RUN: %clang_cc1 -triple lanai-unknown-unknown -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix LANAI %s
+// LANAI: #define __lanai__ 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-unknown-freebsd < /dev/null | FileCheck -check-prefix PPC64-FREEBSD %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc64-unknown-freebsd < /dev/null | FileCheck -match-full-lines -check-prefix PPC64-FREEBSD %s
 // PPC64-FREEBSD-NOT: #define __LONG_DOUBLE_128__ 1
 //
-// RUN: %clang_cc1 -E -dM -ffreestanding -triple=xcore-none-none < /dev/null | FileCheck -check-prefix XCORE %s
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple=xcore-none-none < /dev/null | FileCheck -match-full-lines -check-prefix XCORE %s
 // XCORE:#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__
 // XCORE:#define __LITTLE_ENDIAN__ 1
 // XCORE:#define __XS1B__ 1
 //
 // RUN: %clang_cc1 -E -dM -ffreestanding -triple=wasm32-unknown-unknown \
 // RUN:   < /dev/null \
-// RUN:   | FileCheck -check-prefix=WEBASSEMBLY32 %s
+// RUN:   | FileCheck -match-full-lines -check-prefix=WEBASSEMBLY32 %s
 //
-// WEBASSEMBLY32:#define _ILP32 1{{$}}
+// WEBASSEMBLY32:#define _ILP32 1
 // WEBASSEMBLY32-NOT:#define _LP64
-// WEBASSEMBLY32-NEXT:#define __ATOMIC_ACQUIRE 2{{$}}
-// WEBASSEMBLY32-NEXT:#define __ATOMIC_ACQ_REL 4{{$}}
-// WEBASSEMBLY32-NEXT:#define __ATOMIC_CONSUME 1{{$}}
-// WEBASSEMBLY32-NEXT:#define __ATOMIC_RELAXED 0{{$}}
-// WEBASSEMBLY32-NEXT:#define __ATOMIC_RELEASE 3{{$}}
-// WEBASSEMBLY32-NEXT:#define __ATOMIC_SEQ_CST 5{{$}}
-// WEBASSEMBLY32-NEXT:#define __BIGGEST_ALIGNMENT__ 16{{$}}
-// WEBASSEMBLY32-NEXT:#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__{{$}}
-// WEBASSEMBLY32-NEXT:#define __CHAR16_TYPE__ unsigned short{{$}}
-// WEBASSEMBLY32-NEXT:#define __CHAR32_TYPE__ unsigned int{{$}}
-// WEBASSEMBLY32-NEXT:#define __CHAR_BIT__ 8{{$}}
+// WEBASSEMBLY32-NEXT:#define __ATOMIC_ACQUIRE 2
+// WEBASSEMBLY32-NEXT:#define __ATOMIC_ACQ_REL 4
+// WEBASSEMBLY32-NEXT:#define __ATOMIC_CONSUME 1
+// WEBASSEMBLY32-NEXT:#define __ATOMIC_RELAXED 0
+// WEBASSEMBLY32-NEXT:#define __ATOMIC_RELEASE 3
+// WEBASSEMBLY32-NEXT:#define __ATOMIC_SEQ_CST 5
+// WEBASSEMBLY32-NEXT:#define __BIGGEST_ALIGNMENT__ 16
+// WEBASSEMBLY32-NEXT:#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__
+// WEBASSEMBLY32-NEXT:#define __CHAR16_TYPE__ unsigned short
+// WEBASSEMBLY32-NEXT:#define __CHAR32_TYPE__ unsigned int
+// WEBASSEMBLY32-NEXT:#define __CHAR_BIT__ 8
 // WEBASSEMBLY32-NOT:#define __CHAR_UNSIGNED__
-// WEBASSEMBLY32-NEXT:#define __CONSTANT_CFSTRINGS__ 1{{$}}
-// WEBASSEMBLY32-NEXT:#define __DBL_DECIMAL_DIG__ 17{{$}}
-// WEBASSEMBLY32-NEXT:#define __DBL_DENORM_MIN__ 4.9406564584124654e-324{{$}}
-// WEBASSEMBLY32-NEXT:#define __DBL_DIG__ 15{{$}}
-// WEBASSEMBLY32-NEXT:#define __DBL_EPSILON__ 2.2204460492503131e-16{{$}}
-// WEBASSEMBLY32-NEXT:#define __DBL_HAS_DENORM__ 1{{$}}
-// WEBASSEMBLY32-NEXT:#define __DBL_HAS_INFINITY__ 1{{$}}
-// WEBASSEMBLY32-NEXT:#define __DBL_HAS_QUIET_NAN__ 1{{$}}
-// WEBASSEMBLY32-NEXT:#define __DBL_MANT_DIG__ 53{{$}}
-// WEBASSEMBLY32-NEXT:#define __DBL_MAX_10_EXP__ 308{{$}}
-// WEBASSEMBLY32-NEXT:#define __DBL_MAX_EXP__ 1024{{$}}
-// WEBASSEMBLY32-NEXT:#define __DBL_MAX__ 1.7976931348623157e+308{{$}}
-// WEBASSEMBLY32-NEXT:#define __DBL_MIN_10_EXP__ (-307){{$}}
-// WEBASSEMBLY32-NEXT:#define __DBL_MIN_EXP__ (-1021){{$}}
-// WEBASSEMBLY32-NEXT:#define __DBL_MIN__ 2.2250738585072014e-308{{$}}
-// WEBASSEMBLY32-NEXT:#define __DECIMAL_DIG__ __LDBL_DECIMAL_DIG__{{$}}
+// WEBASSEMBLY32-NEXT:#define __CONSTANT_CFSTRINGS__ 1
+// WEBASSEMBLY32-NEXT:#define __DBL_DECIMAL_DIG__ 17
+// WEBASSEMBLY32-NEXT:#define __DBL_DENORM_MIN__ 4.9406564584124654e-324
+// WEBASSEMBLY32-NEXT:#define __DBL_DIG__ 15
+// WEBASSEMBLY32-NEXT:#define __DBL_EPSILON__ 2.2204460492503131e-16
+// WEBASSEMBLY32-NEXT:#define __DBL_HAS_DENORM__ 1
+// WEBASSEMBLY32-NEXT:#define __DBL_HAS_INFINITY__ 1
+// WEBASSEMBLY32-NEXT:#define __DBL_HAS_QUIET_NAN__ 1
+// WEBASSEMBLY32-NEXT:#define __DBL_MANT_DIG__ 53
+// WEBASSEMBLY32-NEXT:#define __DBL_MAX_10_EXP__ 308
+// WEBASSEMBLY32-NEXT:#define __DBL_MAX_EXP__ 1024
+// WEBASSEMBLY32-NEXT:#define __DBL_MAX__ 1.7976931348623157e+308
+// WEBASSEMBLY32-NEXT:#define __DBL_MIN_10_EXP__ (-307)
+// WEBASSEMBLY32-NEXT:#define __DBL_MIN_EXP__ (-1021)
+// WEBASSEMBLY32-NEXT:#define __DBL_MIN__ 2.2250738585072014e-308
+// WEBASSEMBLY32-NEXT:#define __DECIMAL_DIG__ __LDBL_DECIMAL_DIG__
 // WEBASSEMBLY32-NOT:#define __ELF__
-// WEBASSEMBLY32-NEXT:#define __FINITE_MATH_ONLY__ 0{{$}}
-// WEBASSEMBLY32-NEXT:#define __FLT_DECIMAL_DIG__ 9{{$}}
-// WEBASSEMBLY32-NEXT:#define __FLT_DENORM_MIN__ 1.40129846e-45F{{$}}
-// WEBASSEMBLY32-NEXT:#define __FLT_DIG__ 6{{$}}
-// WEBASSEMBLY32-NEXT:#define __FLT_EPSILON__ 1.19209290e-7F{{$}}
-// WEBASSEMBLY32-NEXT:#define __FLT_EVAL_METHOD__ 0{{$}}
-// WEBASSEMBLY32-NEXT:#define __FLT_HAS_DENORM__ 1{{$}}
-// WEBASSEMBLY32-NEXT:#define __FLT_HAS_INFINITY__ 1{{$}}
-// WEBASSEMBLY32-NEXT:#define __FLT_HAS_QUIET_NAN__ 1{{$}}
-// WEBASSEMBLY32-NEXT:#define __FLT_MANT_DIG__ 24{{$}}
-// WEBASSEMBLY32-NEXT:#define __FLT_MAX_10_EXP__ 38{{$}}
-// WEBASSEMBLY32-NEXT:#define __FLT_MAX_EXP__ 128{{$}}
-// WEBASSEMBLY32-NEXT:#define __FLT_MAX__ 3.40282347e+38F{{$}}
-// WEBASSEMBLY32-NEXT:#define __FLT_MIN_10_EXP__ (-37){{$}}
-// WEBASSEMBLY32-NEXT:#define __FLT_MIN_EXP__ (-125){{$}}
-// WEBASSEMBLY32-NEXT:#define __FLT_MIN__ 1.17549435e-38F{{$}}
-// WEBASSEMBLY32-NEXT:#define __FLT_RADIX__ 2{{$}}
-// WEBASSEMBLY32-NEXT:#define __GCC_ATOMIC_BOOL_LOCK_FREE 2{{$}}
-// WEBASSEMBLY32-NEXT:#define __GCC_ATOMIC_CHAR16_T_LOCK_FREE 2{{$}}
-// WEBASSEMBLY32-NEXT:#define __GCC_ATOMIC_CHAR32_T_LOCK_FREE 2{{$}}
-// WEBASSEMBLY32-NEXT:#define __GCC_ATOMIC_CHAR_LOCK_FREE 2{{$}}
-// WEBASSEMBLY32-NEXT:#define __GCC_ATOMIC_INT_LOCK_FREE 2{{$}}
-// WEBASSEMBLY32-NEXT:#define __GCC_ATOMIC_LLONG_LOCK_FREE 1{{$}}
-// WEBASSEMBLY32-NEXT:#define __GCC_ATOMIC_LONG_LOCK_FREE 2{{$}}
-// WEBASSEMBLY32-NEXT:#define __GCC_ATOMIC_POINTER_LOCK_FREE 2{{$}}
-// WEBASSEMBLY32-NEXT:#define __GCC_ATOMIC_SHORT_LOCK_FREE 2{{$}}
-// WEBASSEMBLY32-NEXT:#define __GCC_ATOMIC_TEST_AND_SET_TRUEVAL 1{{$}}
-// WEBASSEMBLY32-NEXT:#define __GCC_ATOMIC_WCHAR_T_LOCK_FREE 2{{$}}
-// WEBASSEMBLY32-NEXT:#define __GNUC_MINOR__ {{.}}
-// WEBASSEMBLY32-NEXT:#define __GNUC_PATCHLEVEL__ {{.}}
-// WEBASSEMBLY32-NEXT:#define __GNUC_STDC_INLINE__ 1{{$}}
-// WEBASSEMBLY32-NEXT:#define __GNUC__ {{.}}
-// WEBASSEMBLY32-NEXT:#define __GXX_ABI_VERSION 1002{{$}}
-// WEBASSEMBLY32-NEXT:#define __GXX_RTTI 1{{$}}
-// WEBASSEMBLY32-NEXT:#define __ILP32__ 1{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT16_C_SUFFIX__ {{$}}
-// WEBASSEMBLY32-NEXT:#define __INT16_FMTd__ "hd"{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT16_FMTi__ "hi"{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT16_MAX__ 32767{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT16_TYPE__ short{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT32_C_SUFFIX__ {{$}}
-// WEBASSEMBLY32-NEXT:#define __INT32_FMTd__ "d"{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT32_FMTi__ "i"{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT32_MAX__ 2147483647{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT32_TYPE__ int{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT64_C_SUFFIX__ LL{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT64_FMTd__ "lld"{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT64_FMTi__ "lli"{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT64_MAX__ 9223372036854775807LL{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT64_TYPE__ long long int{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT8_C_SUFFIX__ {{$}}
-// WEBASSEMBLY32-NEXT:#define __INT8_FMTd__ "hhd"{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT8_FMTi__ "hhi"{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT8_MAX__ 127{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT8_TYPE__ signed char{{$}}
-// WEBASSEMBLY32-NEXT:#define __INTMAX_C_SUFFIX__ LL{{$}}
-// WEBASSEMBLY32-NEXT:#define __INTMAX_FMTd__ "lld"{{$}}
-// WEBASSEMBLY32-NEXT:#define __INTMAX_FMTi__ "lli"{{$}}
-// WEBASSEMBLY32-NEXT:#define __INTMAX_MAX__ 9223372036854775807LL{{$}}
-// WEBASSEMBLY32-NEXT:#define __INTMAX_TYPE__ long long int{{$}}
-// WEBASSEMBLY32-NEXT:#define __INTMAX_WIDTH__ 64{{$}}
-// WEBASSEMBLY32-NEXT:#define __INTPTR_FMTd__ "ld"{{$}}
-// WEBASSEMBLY32-NEXT:#define __INTPTR_FMTi__ "li"{{$}}
-// WEBASSEMBLY32-NEXT:#define __INTPTR_MAX__ 2147483647L{{$}}
-// WEBASSEMBLY32-NEXT:#define __INTPTR_TYPE__ long int{{$}}
-// WEBASSEMBLY32-NEXT:#define __INTPTR_WIDTH__ 32{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_FAST16_FMTd__ "hd"{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_FAST16_FMTi__ "hi"{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_FAST16_MAX__ 32767{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_FAST16_TYPE__ short{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_FAST32_FMTd__ "d"{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_FAST32_FMTi__ "i"{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_FAST32_MAX__ 2147483647{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_FAST32_TYPE__ int{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_FAST64_FMTd__ "lld"{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_FAST64_FMTi__ "lli"{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_FAST64_MAX__ 9223372036854775807LL{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_FAST64_TYPE__ long long int{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_FAST8_FMTd__ "hhd"{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_FAST8_FMTi__ "hhi"{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_FAST8_MAX__ 127{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_FAST8_TYPE__ signed char{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_LEAST16_FMTd__ "hd"{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_LEAST16_FMTi__ "hi"{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_LEAST16_MAX__ 32767{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_LEAST16_TYPE__ short{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_LEAST32_FMTd__ "d"{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_LEAST32_FMTi__ "i"{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_LEAST32_MAX__ 2147483647{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_LEAST32_TYPE__ int{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_LEAST64_FMTd__ "lld"{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_LEAST64_FMTi__ "lli"{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_LEAST64_MAX__ 9223372036854775807LL{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_LEAST64_TYPE__ long long int{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_LEAST8_FMTd__ "hhd"{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_LEAST8_FMTi__ "hhi"{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_LEAST8_MAX__ 127{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_LEAST8_TYPE__ signed char{{$}}
-// WEBASSEMBLY32-NEXT:#define __INT_MAX__ 2147483647{{$}}
-// WEBASSEMBLY32-NEXT:#define __LDBL_DECIMAL_DIG__ 36{{$}}
-// WEBASSEMBLY32-NEXT:#define __LDBL_DENORM_MIN__ 6.47517511943802511092443895822764655e-4966L{{$}}
-// WEBASSEMBLY32-NEXT:#define __LDBL_DIG__ 33{{$}}
-// WEBASSEMBLY32-NEXT:#define __LDBL_EPSILON__ 1.92592994438723585305597794258492732e-34L{{$}}
-// WEBASSEMBLY32-NEXT:#define __LDBL_HAS_DENORM__ 1{{$}}
-// WEBASSEMBLY32-NEXT:#define __LDBL_HAS_INFINITY__ 1{{$}}
-// WEBASSEMBLY32-NEXT:#define __LDBL_HAS_QUIET_NAN__ 1{{$}}
-// WEBASSEMBLY32-NEXT:#define __LDBL_MANT_DIG__ 113{{$}}
-// WEBASSEMBLY32-NEXT:#define __LDBL_MAX_10_EXP__ 4932{{$}}
-// WEBASSEMBLY32-NEXT:#define __LDBL_MAX_EXP__ 16384{{$}}
-// WEBASSEMBLY32-NEXT:#define __LDBL_MAX__ 1.18973149535723176508575932662800702e+4932L{{$}}
-// WEBASSEMBLY32-NEXT:#define __LDBL_MIN_10_EXP__ (-4931){{$}}
-// WEBASSEMBLY32-NEXT:#define __LDBL_MIN_EXP__ (-16381){{$}}
-// WEBASSEMBLY32-NEXT:#define __LDBL_MIN__ 3.36210314311209350626267781732175260e-4932L{{$}}
-// WEBASSEMBLY32-NEXT:#define __LITTLE_ENDIAN__ 1{{$}}
-// WEBASSEMBLY32-NEXT:#define __LONG_LONG_MAX__ 9223372036854775807LL{{$}}
-// WEBASSEMBLY32-NEXT:#define __LONG_MAX__ 2147483647L{{$}}
+// WEBASSEMBLY32-NEXT:#define __FINITE_MATH_ONLY__ 0
+// WEBASSEMBLY32-NEXT:#define __FLT_DECIMAL_DIG__ 9
+// WEBASSEMBLY32-NEXT:#define __FLT_DENORM_MIN__ 1.40129846e-45F
+// WEBASSEMBLY32-NEXT:#define __FLT_DIG__ 6
+// WEBASSEMBLY32-NEXT:#define __FLT_EPSILON__ 1.19209290e-7F
+// WEBASSEMBLY32-NEXT:#define __FLT_EVAL_METHOD__ 0
+// WEBASSEMBLY32-NEXT:#define __FLT_HAS_DENORM__ 1
+// WEBASSEMBLY32-NEXT:#define __FLT_HAS_INFINITY__ 1
+// WEBASSEMBLY32-NEXT:#define __FLT_HAS_QUIET_NAN__ 1
+// WEBASSEMBLY32-NEXT:#define __FLT_MANT_DIG__ 24
+// WEBASSEMBLY32-NEXT:#define __FLT_MAX_10_EXP__ 38
+// WEBASSEMBLY32-NEXT:#define __FLT_MAX_EXP__ 128
+// WEBASSEMBLY32-NEXT:#define __FLT_MAX__ 3.40282347e+38F
+// WEBASSEMBLY32-NEXT:#define __FLT_MIN_10_EXP__ (-37)
+// WEBASSEMBLY32-NEXT:#define __FLT_MIN_EXP__ (-125)
+// WEBASSEMBLY32-NEXT:#define __FLT_MIN__ 1.17549435e-38F
+// WEBASSEMBLY32-NEXT:#define __FLT_RADIX__ 2
+// WEBASSEMBLY32-NEXT:#define __GCC_ATOMIC_BOOL_LOCK_FREE 2
+// WEBASSEMBLY32-NEXT:#define __GCC_ATOMIC_CHAR16_T_LOCK_FREE 2
+// WEBASSEMBLY32-NEXT:#define __GCC_ATOMIC_CHAR32_T_LOCK_FREE 2
+// WEBASSEMBLY32-NEXT:#define __GCC_ATOMIC_CHAR_LOCK_FREE 2
+// WEBASSEMBLY32-NEXT:#define __GCC_ATOMIC_INT_LOCK_FREE 2
+// WEBASSEMBLY32-NEXT:#define __GCC_ATOMIC_LLONG_LOCK_FREE 1
+// WEBASSEMBLY32-NEXT:#define __GCC_ATOMIC_LONG_LOCK_FREE 2
+// WEBASSEMBLY32-NEXT:#define __GCC_ATOMIC_POINTER_LOCK_FREE 2
+// WEBASSEMBLY32-NEXT:#define __GCC_ATOMIC_SHORT_LOCK_FREE 2
+// WEBASSEMBLY32-NEXT:#define __GCC_ATOMIC_TEST_AND_SET_TRUEVAL 1
+// WEBASSEMBLY32-NEXT:#define __GCC_ATOMIC_WCHAR_T_LOCK_FREE 2
+// WEBASSEMBLY32-NEXT:#define __GNUC_MINOR__ {{.*}}
+// WEBASSEMBLY32-NEXT:#define __GNUC_PATCHLEVEL__ {{.*}}
+// WEBASSEMBLY32-NEXT:#define __GNUC_STDC_INLINE__ 1
+// WEBASSEMBLY32-NEXT:#define __GNUC__ {{.*}}
+// WEBASSEMBLY32-NEXT:#define __GXX_ABI_VERSION 1002
+// WEBASSEMBLY32-NEXT:#define __ILP32__ 1
+// WEBASSEMBLY32-NEXT:#define __INT16_C_SUFFIX__
+// WEBASSEMBLY32-NEXT:#define __INT16_FMTd__ "hd"
+// WEBASSEMBLY32-NEXT:#define __INT16_FMTi__ "hi"
+// WEBASSEMBLY32-NEXT:#define __INT16_MAX__ 32767
+// WEBASSEMBLY32-NEXT:#define __INT16_TYPE__ short
+// WEBASSEMBLY32-NEXT:#define __INT32_C_SUFFIX__
+// WEBASSEMBLY32-NEXT:#define __INT32_FMTd__ "d"
+// WEBASSEMBLY32-NEXT:#define __INT32_FMTi__ "i"
+// WEBASSEMBLY32-NEXT:#define __INT32_MAX__ 2147483647
+// WEBASSEMBLY32-NEXT:#define __INT32_TYPE__ int
+// WEBASSEMBLY32-NEXT:#define __INT64_C_SUFFIX__ LL
+// WEBASSEMBLY32-NEXT:#define __INT64_FMTd__ "lld"
+// WEBASSEMBLY32-NEXT:#define __INT64_FMTi__ "lli"
+// WEBASSEMBLY32-NEXT:#define __INT64_MAX__ 9223372036854775807LL
+// WEBASSEMBLY32-NEXT:#define __INT64_TYPE__ long long int
+// WEBASSEMBLY32-NEXT:#define __INT8_C_SUFFIX__
+// WEBASSEMBLY32-NEXT:#define __INT8_FMTd__ "hhd"
+// WEBASSEMBLY32-NEXT:#define __INT8_FMTi__ "hhi"
+// WEBASSEMBLY32-NEXT:#define __INT8_MAX__ 127
+// WEBASSEMBLY32-NEXT:#define __INT8_TYPE__ signed char
+// WEBASSEMBLY32-NEXT:#define __INTMAX_C_SUFFIX__ LL
+// WEBASSEMBLY32-NEXT:#define __INTMAX_FMTd__ "lld"
+// WEBASSEMBLY32-NEXT:#define __INTMAX_FMTi__ "lli"
+// WEBASSEMBLY32-NEXT:#define __INTMAX_MAX__ 9223372036854775807LL
+// WEBASSEMBLY32-NEXT:#define __INTMAX_TYPE__ long long int
+// WEBASSEMBLY32-NEXT:#define __INTMAX_WIDTH__ 64
+// WEBASSEMBLY32-NEXT:#define __INTPTR_FMTd__ "ld"
+// WEBASSEMBLY32-NEXT:#define __INTPTR_FMTi__ "li"
+// WEBASSEMBLY32-NEXT:#define __INTPTR_MAX__ 2147483647L
+// WEBASSEMBLY32-NEXT:#define __INTPTR_TYPE__ long int
+// WEBASSEMBLY32-NEXT:#define __INTPTR_WIDTH__ 32
+// WEBASSEMBLY32-NEXT:#define __INT_FAST16_FMTd__ "hd"
+// WEBASSEMBLY32-NEXT:#define __INT_FAST16_FMTi__ "hi"
+// WEBASSEMBLY32-NEXT:#define __INT_FAST16_MAX__ 32767
+// WEBASSEMBLY32-NEXT:#define __INT_FAST16_TYPE__ short
+// WEBASSEMBLY32-NEXT:#define __INT_FAST32_FMTd__ "d"
+// WEBASSEMBLY32-NEXT:#define __INT_FAST32_FMTi__ "i"
+// WEBASSEMBLY32-NEXT:#define __INT_FAST32_MAX__ 2147483647
+// WEBASSEMBLY32-NEXT:#define __INT_FAST32_TYPE__ int
+// WEBASSEMBLY32-NEXT:#define __INT_FAST64_FMTd__ "lld"
+// WEBASSEMBLY32-NEXT:#define __INT_FAST64_FMTi__ "lli"
+// WEBASSEMBLY32-NEXT:#define __INT_FAST64_MAX__ 9223372036854775807LL
+// WEBASSEMBLY32-NEXT:#define __INT_FAST64_TYPE__ long long int
+// WEBASSEMBLY32-NEXT:#define __INT_FAST8_FMTd__ "hhd"
+// WEBASSEMBLY32-NEXT:#define __INT_FAST8_FMTi__ "hhi"
+// WEBASSEMBLY32-NEXT:#define __INT_FAST8_MAX__ 127
+// WEBASSEMBLY32-NEXT:#define __INT_FAST8_TYPE__ signed char
+// WEBASSEMBLY32-NEXT:#define __INT_LEAST16_FMTd__ "hd"
+// WEBASSEMBLY32-NEXT:#define __INT_LEAST16_FMTi__ "hi"
+// WEBASSEMBLY32-NEXT:#define __INT_LEAST16_MAX__ 32767
+// WEBASSEMBLY32-NEXT:#define __INT_LEAST16_TYPE__ short
+// WEBASSEMBLY32-NEXT:#define __INT_LEAST32_FMTd__ "d"
+// WEBASSEMBLY32-NEXT:#define __INT_LEAST32_FMTi__ "i"
+// WEBASSEMBLY32-NEXT:#define __INT_LEAST32_MAX__ 2147483647
+// WEBASSEMBLY32-NEXT:#define __INT_LEAST32_TYPE__ int
+// WEBASSEMBLY32-NEXT:#define __INT_LEAST64_FMTd__ "lld"
+// WEBASSEMBLY32-NEXT:#define __INT_LEAST64_FMTi__ "lli"
+// WEBASSEMBLY32-NEXT:#define __INT_LEAST64_MAX__ 9223372036854775807LL
+// WEBASSEMBLY32-NEXT:#define __INT_LEAST64_TYPE__ long long int
+// WEBASSEMBLY32-NEXT:#define __INT_LEAST8_FMTd__ "hhd"
+// WEBASSEMBLY32-NEXT:#define __INT_LEAST8_FMTi__ "hhi"
+// WEBASSEMBLY32-NEXT:#define __INT_LEAST8_MAX__ 127
+// WEBASSEMBLY32-NEXT:#define __INT_LEAST8_TYPE__ signed char
+// WEBASSEMBLY32-NEXT:#define __INT_MAX__ 2147483647
+// WEBASSEMBLY32-NEXT:#define __LDBL_DECIMAL_DIG__ 36
+// WEBASSEMBLY32-NEXT:#define __LDBL_DENORM_MIN__ 6.47517511943802511092443895822764655e-4966L
+// WEBASSEMBLY32-NEXT:#define __LDBL_DIG__ 33
+// WEBASSEMBLY32-NEXT:#define __LDBL_EPSILON__ 1.92592994438723585305597794258492732e-34L
+// WEBASSEMBLY32-NEXT:#define __LDBL_HAS_DENORM__ 1
+// WEBASSEMBLY32-NEXT:#define __LDBL_HAS_INFINITY__ 1
+// WEBASSEMBLY32-NEXT:#define __LDBL_HAS_QUIET_NAN__ 1
+// WEBASSEMBLY32-NEXT:#define __LDBL_MANT_DIG__ 113
+// WEBASSEMBLY32-NEXT:#define __LDBL_MAX_10_EXP__ 4932
+// WEBASSEMBLY32-NEXT:#define __LDBL_MAX_EXP__ 16384
+// WEBASSEMBLY32-NEXT:#define __LDBL_MAX__ 1.18973149535723176508575932662800702e+4932L
+// WEBASSEMBLY32-NEXT:#define __LDBL_MIN_10_EXP__ (-4931)
+// WEBASSEMBLY32-NEXT:#define __LDBL_MIN_EXP__ (-16381)
+// WEBASSEMBLY32-NEXT:#define __LDBL_MIN__ 3.36210314311209350626267781732175260e-4932L
+// WEBASSEMBLY32-NEXT:#define __LITTLE_ENDIAN__ 1
+// WEBASSEMBLY32-NEXT:#define __LONG_LONG_MAX__ 9223372036854775807LL
+// WEBASSEMBLY32-NEXT:#define __LONG_MAX__ 2147483647L
 // WEBASSEMBLY32-NOT:#define __LP64__
-// WEBASSEMBLY32-NEXT:#define __NO_INLINE__ 1{{$}}
-// WEBASSEMBLY32-NEXT:#define __ORDER_BIG_ENDIAN__ 4321{{$}}
-// WEBASSEMBLY32-NEXT:#define __ORDER_LITTLE_ENDIAN__ 1234{{$}}
-// WEBASSEMBLY32-NEXT:#define __ORDER_PDP_ENDIAN__ 3412{{$}}
-// WEBASSEMBLY32-NEXT:#define __POINTER_WIDTH__ 32{{$}}
-// WEBASSEMBLY32-NEXT:#define __PRAGMA_REDEFINE_EXTNAME 1{{$}}
-// WEBASSEMBLY32-NEXT:#define __PTRDIFF_FMTd__ "ld"{{$}}
-// WEBASSEMBLY32-NEXT:#define __PTRDIFF_FMTi__ "li"{{$}}
-// WEBASSEMBLY32-NEXT:#define __PTRDIFF_MAX__ 2147483647L{{$}}
-// WEBASSEMBLY32-NEXT:#define __PTRDIFF_TYPE__ long int{{$}}
-// WEBASSEMBLY32-NEXT:#define __PTRDIFF_WIDTH__ 32{{$}}
+// WEBASSEMBLY32-NEXT:#define __NO_INLINE__ 1
+// WEBASSEMBLY32-NEXT:#define __ORDER_BIG_ENDIAN__ 4321
+// WEBASSEMBLY32-NEXT:#define __ORDER_LITTLE_ENDIAN__ 1234
+// WEBASSEMBLY32-NEXT:#define __ORDER_PDP_ENDIAN__ 3412
+// WEBASSEMBLY32-NEXT:#define __POINTER_WIDTH__ 32
+// WEBASSEMBLY32-NEXT:#define __PRAGMA_REDEFINE_EXTNAME 1
+// WEBASSEMBLY32-NEXT:#define __PTRDIFF_FMTd__ "ld"
+// WEBASSEMBLY32-NEXT:#define __PTRDIFF_FMTi__ "li"
+// WEBASSEMBLY32-NEXT:#define __PTRDIFF_MAX__ 2147483647L
+// WEBASSEMBLY32-NEXT:#define __PTRDIFF_TYPE__ long int
+// WEBASSEMBLY32-NEXT:#define __PTRDIFF_WIDTH__ 32
 // WEBASSEMBLY32-NOT:#define __REGISTER_PREFIX__
-// WEBASSEMBLY32-NEXT:#define __SCHAR_MAX__ 127{{$}}
-// WEBASSEMBLY32-NEXT:#define __SHRT_MAX__ 32767{{$}}
-// WEBASSEMBLY32-NEXT:#define __SIG_ATOMIC_MAX__ 2147483647L{{$}}
-// WEBASSEMBLY32-NEXT:#define __SIG_ATOMIC_WIDTH__ 32{{$}}
-// WEBASSEMBLY32-NEXT:#define __SIZEOF_DOUBLE__ 8{{$}}
-// WEBASSEMBLY32-NEXT:#define __SIZEOF_FLOAT__ 4{{$}}
-// WEBASSEMBLY32-NEXT:#define __SIZEOF_INT128__ 16{{$}}
-// WEBASSEMBLY32-NEXT:#define __SIZEOF_INT__ 4{{$}}
-// WEBASSEMBLY32-NEXT:#define __SIZEOF_LONG_DOUBLE__ 16{{$}}
-// WEBASSEMBLY32-NEXT:#define __SIZEOF_LONG_LONG__ 8{{$}}
-// WEBASSEMBLY32-NEXT:#define __SIZEOF_LONG__ 4{{$}}
-// WEBASSEMBLY32-NEXT:#define __SIZEOF_POINTER__ 4{{$}}
-// WEBASSEMBLY32-NEXT:#define __SIZEOF_PTRDIFF_T__ 4{{$}}
-// WEBASSEMBLY32-NEXT:#define __SIZEOF_SHORT__ 2{{$}}
-// WEBASSEMBLY32-NEXT:#define __SIZEOF_SIZE_T__ 4{{$}}
-// WEBASSEMBLY32-NEXT:#define __SIZEOF_WCHAR_T__ 4{{$}}
-// WEBASSEMBLY32-NEXT:#define __SIZEOF_WINT_T__ 4{{$}}
-// WEBASSEMBLY32-NEXT:#define __SIZE_FMTX__ "lX"{{$}}
-// WEBASSEMBLY32-NEXT:#define __SIZE_FMTo__ "lo"{{$}}
-// WEBASSEMBLY32-NEXT:#define __SIZE_FMTu__ "lu"{{$}}
-// WEBASSEMBLY32-NEXT:#define __SIZE_FMTx__ "lx"{{$}}
-// WEBASSEMBLY32-NEXT:#define __SIZE_MAX__ 4294967295UL{{$}}
-// WEBASSEMBLY32-NEXT:#define __SIZE_TYPE__ long unsigned int{{$}}
-// WEBASSEMBLY32-NEXT:#define __SIZE_WIDTH__ 32{{$}}
-// WEBASSEMBLY32-NEXT:#define __STDC_HOSTED__ 0{{$}}
+// WEBASSEMBLY32-NEXT:#define __SCHAR_MAX__ 127
+// WEBASSEMBLY32-NEXT:#define __SHRT_MAX__ 32767
+// WEBASSEMBLY32-NEXT:#define __SIG_ATOMIC_MAX__ 2147483647L
+// WEBASSEMBLY32-NEXT:#define __SIG_ATOMIC_WIDTH__ 32
+// WEBASSEMBLY32-NEXT:#define __SIZEOF_DOUBLE__ 8
+// WEBASSEMBLY32-NEXT:#define __SIZEOF_FLOAT__ 4
+// WEBASSEMBLY32-NEXT:#define __SIZEOF_INT128__ 16
+// WEBASSEMBLY32-NEXT:#define __SIZEOF_INT__ 4
+// WEBASSEMBLY32-NEXT:#define __SIZEOF_LONG_DOUBLE__ 16
+// WEBASSEMBLY32-NEXT:#define __SIZEOF_LONG_LONG__ 8
+// WEBASSEMBLY32-NEXT:#define __SIZEOF_LONG__ 4
+// WEBASSEMBLY32-NEXT:#define __SIZEOF_POINTER__ 4
+// WEBASSEMBLY32-NEXT:#define __SIZEOF_PTRDIFF_T__ 4
+// WEBASSEMBLY32-NEXT:#define __SIZEOF_SHORT__ 2
+// WEBASSEMBLY32-NEXT:#define __SIZEOF_SIZE_T__ 4
+// WEBASSEMBLY32-NEXT:#define __SIZEOF_WCHAR_T__ 4
+// WEBASSEMBLY32-NEXT:#define __SIZEOF_WINT_T__ 4
+// WEBASSEMBLY32-NEXT:#define __SIZE_FMTX__ "lX"
+// WEBASSEMBLY32-NEXT:#define __SIZE_FMTo__ "lo"
+// WEBASSEMBLY32-NEXT:#define __SIZE_FMTu__ "lu"
+// WEBASSEMBLY32-NEXT:#define __SIZE_FMTx__ "lx"
+// WEBASSEMBLY32-NEXT:#define __SIZE_MAX__ 4294967295UL
+// WEBASSEMBLY32-NEXT:#define __SIZE_TYPE__ long unsigned int
+// WEBASSEMBLY32-NEXT:#define __SIZE_WIDTH__ 32
+// WEBASSEMBLY32-NEXT:#define __STDC_HOSTED__ 0
 // WEBASSEMBLY32-NOT:#define __STDC_MB_MIGHT_NEQ_WC__
 // WEBASSEMBLY32-NOT:#define __STDC_NO_ATOMICS__
 // WEBASSEMBLY32-NOT:#define __STDC_NO_COMPLEX__
 // WEBASSEMBLY32-NOT:#define __STDC_NO_VLA__
 // WEBASSEMBLY32-NOT:#define __STDC_NO_THREADS__
-// WEBASSEMBLY32-NEXT:#define __STDC_UTF_16__ 1{{$}}
-// WEBASSEMBLY32-NEXT:#define __STDC_UTF_32__ 1{{$}}
-// WEBASSEMBLY32-NEXT:#define __STDC_VERSION__ 201112L{{$}}
-// WEBASSEMBLY32-NEXT:#define __STDC__ 1{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT16_C_SUFFIX__ {{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT16_FMTX__ "hX"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT16_FMTo__ "ho"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT16_FMTu__ "hu"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT16_FMTx__ "hx"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT16_MAX__ 65535{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT16_TYPE__ unsigned short{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT32_C_SUFFIX__ U{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT32_FMTX__ "X"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT32_FMTo__ "o"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT32_FMTu__ "u"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT32_FMTx__ "x"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT32_MAX__ 4294967295U{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT32_TYPE__ unsigned int{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT64_C_SUFFIX__ ULL{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT64_FMTX__ "llX"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT64_FMTo__ "llo"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT64_FMTu__ "llu"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT64_FMTx__ "llx"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT64_MAX__ 18446744073709551615ULL{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT64_TYPE__ long long unsigned int{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT8_C_SUFFIX__ {{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT8_FMTX__ "hhX"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT8_FMTo__ "hho"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT8_FMTu__ "hhu"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT8_FMTx__ "hhx"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT8_MAX__ 255{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT8_TYPE__ unsigned char{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINTMAX_C_SUFFIX__ ULL{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINTMAX_FMTX__ "llX"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINTMAX_FMTo__ "llo"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINTMAX_FMTu__ "llu"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINTMAX_FMTx__ "llx"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINTMAX_MAX__ 18446744073709551615ULL{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINTMAX_TYPE__ long long unsigned int{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINTMAX_WIDTH__ 64{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINTPTR_FMTX__ "lX"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINTPTR_FMTo__ "lo"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINTPTR_FMTu__ "lu"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINTPTR_FMTx__ "lx"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINTPTR_MAX__ 4294967295UL{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINTPTR_TYPE__ long unsigned int{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINTPTR_WIDTH__ 32{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_FAST16_FMTX__ "hX"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_FAST16_FMTo__ "ho"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_FAST16_FMTu__ "hu"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_FAST16_FMTx__ "hx"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_FAST16_MAX__ 65535{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_FAST16_TYPE__ unsigned short{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_FAST32_FMTX__ "X"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_FAST32_FMTo__ "o"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_FAST32_FMTu__ "u"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_FAST32_FMTx__ "x"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_FAST32_MAX__ 4294967295U{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_FAST32_TYPE__ unsigned int{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_FAST64_FMTX__ "llX"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_FAST64_FMTo__ "llo"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_FAST64_FMTu__ "llu"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_FAST64_FMTx__ "llx"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_FAST64_MAX__ 18446744073709551615ULL{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_FAST64_TYPE__ long long unsigned int{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_FAST8_FMTX__ "hhX"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_FAST8_FMTo__ "hho"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_FAST8_FMTu__ "hhu"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_FAST8_FMTx__ "hhx"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_FAST8_MAX__ 255{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_FAST8_TYPE__ unsigned char{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_LEAST16_FMTX__ "hX"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_LEAST16_FMTo__ "ho"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_LEAST16_FMTu__ "hu"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_LEAST16_FMTx__ "hx"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_LEAST16_MAX__ 65535{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_LEAST16_TYPE__ unsigned short{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_LEAST32_FMTX__ "X"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_LEAST32_FMTo__ "o"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_LEAST32_FMTu__ "u"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_LEAST32_FMTx__ "x"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_LEAST32_MAX__ 4294967295U{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_LEAST32_TYPE__ unsigned int{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_LEAST64_FMTX__ "llX"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_LEAST64_FMTo__ "llo"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_LEAST64_FMTu__ "llu"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_LEAST64_FMTx__ "llx"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_LEAST64_MAX__ 18446744073709551615ULL{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_LEAST64_TYPE__ long long unsigned int{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_LEAST8_FMTX__ "hhX"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_LEAST8_FMTo__ "hho"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_LEAST8_FMTu__ "hhu"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_LEAST8_FMTx__ "hhx"{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_LEAST8_MAX__ 255{{$}}
-// WEBASSEMBLY32-NEXT:#define __UINT_LEAST8_TYPE__ unsigned char{{$}}
-// WEBASSEMBLY32-NEXT:#define __USER_LABEL_PREFIX__ {{$}}
-// WEBASSEMBLY32-NEXT:#define __VERSION__ "{{.*}}"{{$}}
-// WEBASSEMBLY32-NEXT:#define __WCHAR_MAX__ 2147483647{{$}}
-// WEBASSEMBLY32-NEXT:#define __WCHAR_TYPE__ int{{$}}
+// WEBASSEMBLY32-NEXT:#define __STDC_UTF_16__ 1
+// WEBASSEMBLY32-NEXT:#define __STDC_UTF_32__ 1
+// WEBASSEMBLY32-NEXT:#define __STDC_VERSION__ 201112L
+// WEBASSEMBLY32-NEXT:#define __STDC__ 1
+// WEBASSEMBLY32-NEXT:#define __UINT16_C_SUFFIX__
+// WEBASSEMBLY32-NEXT:#define __UINT16_FMTX__ "hX"
+// WEBASSEMBLY32-NEXT:#define __UINT16_FMTo__ "ho"
+// WEBASSEMBLY32-NEXT:#define __UINT16_FMTu__ "hu"
+// WEBASSEMBLY32-NEXT:#define __UINT16_FMTx__ "hx"
+// WEBASSEMBLY32-NEXT:#define __UINT16_MAX__ 65535
+// WEBASSEMBLY32-NEXT:#define __UINT16_TYPE__ unsigned short
+// WEBASSEMBLY32-NEXT:#define __UINT32_C_SUFFIX__ U
+// WEBASSEMBLY32-NEXT:#define __UINT32_FMTX__ "X"
+// WEBASSEMBLY32-NEXT:#define __UINT32_FMTo__ "o"
+// WEBASSEMBLY32-NEXT:#define __UINT32_FMTu__ "u"
+// WEBASSEMBLY32-NEXT:#define __UINT32_FMTx__ "x"
+// WEBASSEMBLY32-NEXT:#define __UINT32_MAX__ 4294967295U
+// WEBASSEMBLY32-NEXT:#define __UINT32_TYPE__ unsigned int
+// WEBASSEMBLY32-NEXT:#define __UINT64_C_SUFFIX__ ULL
+// WEBASSEMBLY32-NEXT:#define __UINT64_FMTX__ "llX"
+// WEBASSEMBLY32-NEXT:#define __UINT64_FMTo__ "llo"
+// WEBASSEMBLY32-NEXT:#define __UINT64_FMTu__ "llu"
+// WEBASSEMBLY32-NEXT:#define __UINT64_FMTx__ "llx"
+// WEBASSEMBLY32-NEXT:#define __UINT64_MAX__ 18446744073709551615ULL
+// WEBASSEMBLY32-NEXT:#define __UINT64_TYPE__ long long unsigned int
+// WEBASSEMBLY32-NEXT:#define __UINT8_C_SUFFIX__
+// WEBASSEMBLY32-NEXT:#define __UINT8_FMTX__ "hhX"
+// WEBASSEMBLY32-NEXT:#define __UINT8_FMTo__ "hho"
+// WEBASSEMBLY32-NEXT:#define __UINT8_FMTu__ "hhu"
+// WEBASSEMBLY32-NEXT:#define __UINT8_FMTx__ "hhx"
+// WEBASSEMBLY32-NEXT:#define __UINT8_MAX__ 255
+// WEBASSEMBLY32-NEXT:#define __UINT8_TYPE__ unsigned char
+// WEBASSEMBLY32-NEXT:#define __UINTMAX_C_SUFFIX__ ULL
+// WEBASSEMBLY32-NEXT:#define __UINTMAX_FMTX__ "llX"
+// WEBASSEMBLY32-NEXT:#define __UINTMAX_FMTo__ "llo"
+// WEBASSEMBLY32-NEXT:#define __UINTMAX_FMTu__ "llu"
+// WEBASSEMBLY32-NEXT:#define __UINTMAX_FMTx__ "llx"
+// WEBASSEMBLY32-NEXT:#define __UINTMAX_MAX__ 18446744073709551615ULL
+// WEBASSEMBLY32-NEXT:#define __UINTMAX_TYPE__ long long unsigned int
+// WEBASSEMBLY32-NEXT:#define __UINTMAX_WIDTH__ 64
+// WEBASSEMBLY32-NEXT:#define __UINTPTR_FMTX__ "lX"
+// WEBASSEMBLY32-NEXT:#define __UINTPTR_FMTo__ "lo"
+// WEBASSEMBLY32-NEXT:#define __UINTPTR_FMTu__ "lu"
+// WEBASSEMBLY32-NEXT:#define __UINTPTR_FMTx__ "lx"
+// WEBASSEMBLY32-NEXT:#define __UINTPTR_MAX__ 4294967295UL
+// WEBASSEMBLY32-NEXT:#define __UINTPTR_TYPE__ long unsigned int
+// WEBASSEMBLY32-NEXT:#define __UINTPTR_WIDTH__ 32
+// WEBASSEMBLY32-NEXT:#define __UINT_FAST16_FMTX__ "hX"
+// WEBASSEMBLY32-NEXT:#define __UINT_FAST16_FMTo__ "ho"
+// WEBASSEMBLY32-NEXT:#define __UINT_FAST16_FMTu__ "hu"
+// WEBASSEMBLY32-NEXT:#define __UINT_FAST16_FMTx__ "hx"
+// WEBASSEMBLY32-NEXT:#define __UINT_FAST16_MAX__ 65535
+// WEBASSEMBLY32-NEXT:#define __UINT_FAST16_TYPE__ unsigned short
+// WEBASSEMBLY32-NEXT:#define __UINT_FAST32_FMTX__ "X"
+// WEBASSEMBLY32-NEXT:#define __UINT_FAST32_FMTo__ "o"
+// WEBASSEMBLY32-NEXT:#define __UINT_FAST32_FMTu__ "u"
+// WEBASSEMBLY32-NEXT:#define __UINT_FAST32_FMTx__ "x"
+// WEBASSEMBLY32-NEXT:#define __UINT_FAST32_MAX__ 4294967295U
+// WEBASSEMBLY32-NEXT:#define __UINT_FAST32_TYPE__ unsigned int
+// WEBASSEMBLY32-NEXT:#define __UINT_FAST64_FMTX__ "llX"
+// WEBASSEMBLY32-NEXT:#define __UINT_FAST64_FMTo__ "llo"
+// WEBASSEMBLY32-NEXT:#define __UINT_FAST64_FMTu__ "llu"
+// WEBASSEMBLY32-NEXT:#define __UINT_FAST64_FMTx__ "llx"
+// WEBASSEMBLY32-NEXT:#define __UINT_FAST64_MAX__ 18446744073709551615ULL
+// WEBASSEMBLY32-NEXT:#define __UINT_FAST64_TYPE__ long long unsigned int
+// WEBASSEMBLY32-NEXT:#define __UINT_FAST8_FMTX__ "hhX"
+// WEBASSEMBLY32-NEXT:#define __UINT_FAST8_FMTo__ "hho"
+// WEBASSEMBLY32-NEXT:#define __UINT_FAST8_FMTu__ "hhu"
+// WEBASSEMBLY32-NEXT:#define __UINT_FAST8_FMTx__ "hhx"
+// WEBASSEMBLY32-NEXT:#define __UINT_FAST8_MAX__ 255
+// WEBASSEMBLY32-NEXT:#define __UINT_FAST8_TYPE__ unsigned char
+// WEBASSEMBLY32-NEXT:#define __UINT_LEAST16_FMTX__ "hX"
+// WEBASSEMBLY32-NEXT:#define __UINT_LEAST16_FMTo__ "ho"
+// WEBASSEMBLY32-NEXT:#define __UINT_LEAST16_FMTu__ "hu"
+// WEBASSEMBLY32-NEXT:#define __UINT_LEAST16_FMTx__ "hx"
+// WEBASSEMBLY32-NEXT:#define __UINT_LEAST16_MAX__ 65535
+// WEBASSEMBLY32-NEXT:#define __UINT_LEAST16_TYPE__ unsigned short
+// WEBASSEMBLY32-NEXT:#define __UINT_LEAST32_FMTX__ "X"
+// WEBASSEMBLY32-NEXT:#define __UINT_LEAST32_FMTo__ "o"
+// WEBASSEMBLY32-NEXT:#define __UINT_LEAST32_FMTu__ "u"
+// WEBASSEMBLY32-NEXT:#define __UINT_LEAST32_FMTx__ "x"
+// WEBASSEMBLY32-NEXT:#define __UINT_LEAST32_MAX__ 4294967295U
+// WEBASSEMBLY32-NEXT:#define __UINT_LEAST32_TYPE__ unsigned int
+// WEBASSEMBLY32-NEXT:#define __UINT_LEAST64_FMTX__ "llX"
+// WEBASSEMBLY32-NEXT:#define __UINT_LEAST64_FMTo__ "llo"
+// WEBASSEMBLY32-NEXT:#define __UINT_LEAST64_FMTu__ "llu"
+// WEBASSEMBLY32-NEXT:#define __UINT_LEAST64_FMTx__ "llx"
+// WEBASSEMBLY32-NEXT:#define __UINT_LEAST64_MAX__ 18446744073709551615ULL
+// WEBASSEMBLY32-NEXT:#define __UINT_LEAST64_TYPE__ long long unsigned int
+// WEBASSEMBLY32-NEXT:#define __UINT_LEAST8_FMTX__ "hhX"
+// WEBASSEMBLY32-NEXT:#define __UINT_LEAST8_FMTo__ "hho"
+// WEBASSEMBLY32-NEXT:#define __UINT_LEAST8_FMTu__ "hhu"
+// WEBASSEMBLY32-NEXT:#define __UINT_LEAST8_FMTx__ "hhx"
+// WEBASSEMBLY32-NEXT:#define __UINT_LEAST8_MAX__ 255
+// WEBASSEMBLY32-NEXT:#define __UINT_LEAST8_TYPE__ unsigned char
+// WEBASSEMBLY32-NEXT:#define __USER_LABEL_PREFIX__
+// WEBASSEMBLY32-NEXT:#define __VERSION__ "{{.*}}"
+// WEBASSEMBLY32-NEXT:#define __WCHAR_MAX__ 2147483647
+// WEBASSEMBLY32-NEXT:#define __WCHAR_TYPE__ int
 // WEBASSEMBLY32-NOT:#define __WCHAR_UNSIGNED__
-// WEBASSEMBLY32-NEXT:#define __WCHAR_WIDTH__ 32{{$}}
-// WEBASSEMBLY32-NEXT:#define __WINT_TYPE__ int{{$}}
+// WEBASSEMBLY32-NEXT:#define __WCHAR_WIDTH__ 32
+// WEBASSEMBLY32-NEXT:#define __WINT_TYPE__ int
 // WEBASSEMBLY32-NOT:#define __WINT_UNSIGNED__
-// WEBASSEMBLY32-NEXT:#define __WINT_WIDTH__ 32{{$}}
-// WEBASSEMBLY32-NEXT:#define __clang__ 1{{$}}
-// WEBASSEMBLY32-NEXT:#define __clang_major__ {{.}}
-// WEBASSEMBLY32-NEXT:#define __clang_minor__ {{.}}
-// WEBASSEMBLY32-NEXT:#define __clang_patchlevel__ {{.}}
-// WEBASSEMBLY32-NEXT:#define __clang_version__ "{{.*}}"{{$}}
-// WEBASSEMBLY32-NEXT:#define __llvm__ 1{{$}}
+// WEBASSEMBLY32-NEXT:#define __WINT_WIDTH__ 32
+// WEBASSEMBLY32-NEXT:#define __clang__ 1
+// WEBASSEMBLY32-NEXT:#define __clang_major__ {{.*}}
+// WEBASSEMBLY32-NEXT:#define __clang_minor__ {{.*}}
+// WEBASSEMBLY32-NEXT:#define __clang_patchlevel__ {{.*}}
+// WEBASSEMBLY32-NEXT:#define __clang_version__ "{{.*}}"
+// WEBASSEMBLY32-NEXT:#define __llvm__ 1
 // WEBASSEMBLY32-NOT:#define __wasm_simd128__
 // WEBASSEMBLY32-NOT:#define __wasm_simd256__
 // WEBASSEMBLY32-NOT:#define __wasm_simd512__
 // WEBASSEMBLY32-NOT:#define __unix
 // WEBASSEMBLY32-NOT:#define __unix__
-// WEBASSEMBLY32-NEXT:#define __wasm 1{{$}}
-// WEBASSEMBLY32-NEXT:#define __wasm32 1{{$}}
-// WEBASSEMBLY32-NEXT:#define __wasm32__ 1{{$}}
+// WEBASSEMBLY32-NEXT:#define __wasm 1
+// WEBASSEMBLY32-NEXT:#define __wasm32 1
+// WEBASSEMBLY32-NEXT:#define __wasm32__ 1
 // WEBASSEMBLY32-NOT:#define __wasm64
 // WEBASSEMBLY32-NOT:#define __wasm64__
-// WEBASSEMBLY32-NEXT:#define __wasm__ 1{{$}}
+// WEBASSEMBLY32-NEXT:#define __wasm__ 1
 //
 // RUN: %clang_cc1 -E -dM -ffreestanding -triple=wasm64-unknown-unknown \
 // RUN:   < /dev/null \
-// RUN:   | FileCheck -check-prefix=WEBASSEMBLY64 %s
+// RUN:   | FileCheck -match-full-lines -check-prefix=WEBASSEMBLY64 %s
 //
 // WEBASSEMBLY64-NOT:#define _ILP32
-// WEBASSEMBLY64:#define _LP64 1{{$}}
-// WEBASSEMBLY64-NEXT:#define __ATOMIC_ACQUIRE 2{{$}}
-// WEBASSEMBLY64-NEXT:#define __ATOMIC_ACQ_REL 4{{$}}
-// WEBASSEMBLY64-NEXT:#define __ATOMIC_CONSUME 1{{$}}
-// WEBASSEMBLY64-NEXT:#define __ATOMIC_RELAXED 0{{$}}
-// WEBASSEMBLY64-NEXT:#define __ATOMIC_RELEASE 3{{$}}
-// WEBASSEMBLY64-NEXT:#define __ATOMIC_SEQ_CST 5{{$}}
-// WEBASSEMBLY64-NEXT:#define __BIGGEST_ALIGNMENT__ 16{{$}}
-// WEBASSEMBLY64-NEXT:#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__{{$}}
-// WEBASSEMBLY64-NEXT:#define __CHAR16_TYPE__ unsigned short{{$}}
-// WEBASSEMBLY64-NEXT:#define __CHAR32_TYPE__ unsigned int{{$}}
-// WEBASSEMBLY64-NEXT:#define __CHAR_BIT__ 8{{$}}
+// WEBASSEMBLY64:#define _LP64 1
+// WEBASSEMBLY64-NEXT:#define __ATOMIC_ACQUIRE 2
+// WEBASSEMBLY64-NEXT:#define __ATOMIC_ACQ_REL 4
+// WEBASSEMBLY64-NEXT:#define __ATOMIC_CONSUME 1
+// WEBASSEMBLY64-NEXT:#define __ATOMIC_RELAXED 0
+// WEBASSEMBLY64-NEXT:#define __ATOMIC_RELEASE 3
+// WEBASSEMBLY64-NEXT:#define __ATOMIC_SEQ_CST 5
+// WEBASSEMBLY64-NEXT:#define __BIGGEST_ALIGNMENT__ 16
+// WEBASSEMBLY64-NEXT:#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__
+// WEBASSEMBLY64-NEXT:#define __CHAR16_TYPE__ unsigned short
+// WEBASSEMBLY64-NEXT:#define __CHAR32_TYPE__ unsigned int
+// WEBASSEMBLY64-NEXT:#define __CHAR_BIT__ 8
 // WEBASSEMBLY64-NOT:#define __CHAR_UNSIGNED__
-// WEBASSEMBLY64-NEXT:#define __CONSTANT_CFSTRINGS__ 1{{$}}
-// WEBASSEMBLY64-NEXT:#define __DBL_DECIMAL_DIG__ 17{{$}}
-// WEBASSEMBLY64-NEXT:#define __DBL_DENORM_MIN__ 4.9406564584124654e-324{{$}}
-// WEBASSEMBLY64-NEXT:#define __DBL_DIG__ 15{{$}}
-// WEBASSEMBLY64-NEXT:#define __DBL_EPSILON__ 2.2204460492503131e-16{{$}}
-// WEBASSEMBLY64-NEXT:#define __DBL_HAS_DENORM__ 1{{$}}
-// WEBASSEMBLY64-NEXT:#define __DBL_HAS_INFINITY__ 1{{$}}
-// WEBASSEMBLY64-NEXT:#define __DBL_HAS_QUIET_NAN__ 1{{$}}
-// WEBASSEMBLY64-NEXT:#define __DBL_MANT_DIG__ 53{{$}}
-// WEBASSEMBLY64-NEXT:#define __DBL_MAX_10_EXP__ 308{{$}}
-// WEBASSEMBLY64-NEXT:#define __DBL_MAX_EXP__ 1024{{$}}
-// WEBASSEMBLY64-NEXT:#define __DBL_MAX__ 1.7976931348623157e+308{{$}}
-// WEBASSEMBLY64-NEXT:#define __DBL_MIN_10_EXP__ (-307){{$}}
-// WEBASSEMBLY64-NEXT:#define __DBL_MIN_EXP__ (-1021){{$}}
-// WEBASSEMBLY64-NEXT:#define __DBL_MIN__ 2.2250738585072014e-308{{$}}
-// WEBASSEMBLY64-NEXT:#define __DECIMAL_DIG__ __LDBL_DECIMAL_DIG__{{$}}
+// WEBASSEMBLY64-NEXT:#define __CONSTANT_CFSTRINGS__ 1
+// WEBASSEMBLY64-NEXT:#define __DBL_DECIMAL_DIG__ 17
+// WEBASSEMBLY64-NEXT:#define __DBL_DENORM_MIN__ 4.9406564584124654e-324
+// WEBASSEMBLY64-NEXT:#define __DBL_DIG__ 15
+// WEBASSEMBLY64-NEXT:#define __DBL_EPSILON__ 2.2204460492503131e-16
+// WEBASSEMBLY64-NEXT:#define __DBL_HAS_DENORM__ 1
+// WEBASSEMBLY64-NEXT:#define __DBL_HAS_INFINITY__ 1
+// WEBASSEMBLY64-NEXT:#define __DBL_HAS_QUIET_NAN__ 1
+// WEBASSEMBLY64-NEXT:#define __DBL_MANT_DIG__ 53
+// WEBASSEMBLY64-NEXT:#define __DBL_MAX_10_EXP__ 308
+// WEBASSEMBLY64-NEXT:#define __DBL_MAX_EXP__ 1024
+// WEBASSEMBLY64-NEXT:#define __DBL_MAX__ 1.7976931348623157e+308
+// WEBASSEMBLY64-NEXT:#define __DBL_MIN_10_EXP__ (-307)
+// WEBASSEMBLY64-NEXT:#define __DBL_MIN_EXP__ (-1021)
+// WEBASSEMBLY64-NEXT:#define __DBL_MIN__ 2.2250738585072014e-308
+// WEBASSEMBLY64-NEXT:#define __DECIMAL_DIG__ __LDBL_DECIMAL_DIG__
 // WEBASSEMBLY64-NOT:#define __ELF__
-// WEBASSEMBLY64-NEXT:#define __FINITE_MATH_ONLY__ 0{{$}}
-// WEBASSEMBLY64-NEXT:#define __FLT_DECIMAL_DIG__ 9{{$}}
-// WEBASSEMBLY64-NEXT:#define __FLT_DENORM_MIN__ 1.40129846e-45F{{$}}
-// WEBASSEMBLY64-NEXT:#define __FLT_DIG__ 6{{$}}
-// WEBASSEMBLY64-NEXT:#define __FLT_EPSILON__ 1.19209290e-7F{{$}}
-// WEBASSEMBLY64-NEXT:#define __FLT_EVAL_METHOD__ 0{{$}}
-// WEBASSEMBLY64-NEXT:#define __FLT_HAS_DENORM__ 1{{$}}
-// WEBASSEMBLY64-NEXT:#define __FLT_HAS_INFINITY__ 1{{$}}
-// WEBASSEMBLY64-NEXT:#define __FLT_HAS_QUIET_NAN__ 1{{$}}
-// WEBASSEMBLY64-NEXT:#define __FLT_MANT_DIG__ 24{{$}}
-// WEBASSEMBLY64-NEXT:#define __FLT_MAX_10_EXP__ 38{{$}}
-// WEBASSEMBLY64-NEXT:#define __FLT_MAX_EXP__ 128{{$}}
-// WEBASSEMBLY64-NEXT:#define __FLT_MAX__ 3.40282347e+38F{{$}}
-// WEBASSEMBLY64-NEXT:#define __FLT_MIN_10_EXP__ (-37){{$}}
-// WEBASSEMBLY64-NEXT:#define __FLT_MIN_EXP__ (-125){{$}}
-// WEBASSEMBLY64-NEXT:#define __FLT_MIN__ 1.17549435e-38F{{$}}
-// WEBASSEMBLY64-NEXT:#define __FLT_RADIX__ 2{{$}}
-// WEBASSEMBLY64-NEXT:#define __GCC_ATOMIC_BOOL_LOCK_FREE 2{{$}}
-// WEBASSEMBLY64-NEXT:#define __GCC_ATOMIC_CHAR16_T_LOCK_FREE 2{{$}}
-// WEBASSEMBLY64-NEXT:#define __GCC_ATOMIC_CHAR32_T_LOCK_FREE 2{{$}}
-// WEBASSEMBLY64-NEXT:#define __GCC_ATOMIC_CHAR_LOCK_FREE 2{{$}}
-// WEBASSEMBLY64-NEXT:#define __GCC_ATOMIC_INT_LOCK_FREE 2{{$}}
-// WEBASSEMBLY64-NEXT:#define __GCC_ATOMIC_LLONG_LOCK_FREE 2{{$}}
-// WEBASSEMBLY64-NEXT:#define __GCC_ATOMIC_LONG_LOCK_FREE 2{{$}}
-// WEBASSEMBLY64-NEXT:#define __GCC_ATOMIC_POINTER_LOCK_FREE 2{{$}}
-// WEBASSEMBLY64-NEXT:#define __GCC_ATOMIC_SHORT_LOCK_FREE 2{{$}}
-// WEBASSEMBLY64-NEXT:#define __GCC_ATOMIC_TEST_AND_SET_TRUEVAL 1{{$}}
-// WEBASSEMBLY64-NEXT:#define __GCC_ATOMIC_WCHAR_T_LOCK_FREE 2{{$}}
-// WEBASSEMBLY64-NEXT:#define __GNUC_MINOR__ {{.}}
-// WEBASSEMBLY64-NEXT:#define __GNUC_PATCHLEVEL__ {{.}}
-// WEBASSEMBLY64-NEXT:#define __GNUC_STDC_INLINE__ 1{{$}}
+// WEBASSEMBLY64-NEXT:#define __FINITE_MATH_ONLY__ 0
+// WEBASSEMBLY64-NEXT:#define __FLT_DECIMAL_DIG__ 9
+// WEBASSEMBLY64-NEXT:#define __FLT_DENORM_MIN__ 1.40129846e-45F
+// WEBASSEMBLY64-NEXT:#define __FLT_DIG__ 6
+// WEBASSEMBLY64-NEXT:#define __FLT_EPSILON__ 1.19209290e-7F
+// WEBASSEMBLY64-NEXT:#define __FLT_EVAL_METHOD__ 0
+// WEBASSEMBLY64-NEXT:#define __FLT_HAS_DENORM__ 1
+// WEBASSEMBLY64-NEXT:#define __FLT_HAS_INFINITY__ 1
+// WEBASSEMBLY64-NEXT:#define __FLT_HAS_QUIET_NAN__ 1
+// WEBASSEMBLY64-NEXT:#define __FLT_MANT_DIG__ 24
+// WEBASSEMBLY64-NEXT:#define __FLT_MAX_10_EXP__ 38
+// WEBASSEMBLY64-NEXT:#define __FLT_MAX_EXP__ 128
+// WEBASSEMBLY64-NEXT:#define __FLT_MAX__ 3.40282347e+38F
+// WEBASSEMBLY64-NEXT:#define __FLT_MIN_10_EXP__ (-37)
+// WEBASSEMBLY64-NEXT:#define __FLT_MIN_EXP__ (-125)
+// WEBASSEMBLY64-NEXT:#define __FLT_MIN__ 1.17549435e-38F
+// WEBASSEMBLY64-NEXT:#define __FLT_RADIX__ 2
+// WEBASSEMBLY64-NEXT:#define __GCC_ATOMIC_BOOL_LOCK_FREE 2
+// WEBASSEMBLY64-NEXT:#define __GCC_ATOMIC_CHAR16_T_LOCK_FREE 2
+// WEBASSEMBLY64-NEXT:#define __GCC_ATOMIC_CHAR32_T_LOCK_FREE 2
+// WEBASSEMBLY64-NEXT:#define __GCC_ATOMIC_CHAR_LOCK_FREE 2
+// WEBASSEMBLY64-NEXT:#define __GCC_ATOMIC_INT_LOCK_FREE 2
+// WEBASSEMBLY64-NEXT:#define __GCC_ATOMIC_LLONG_LOCK_FREE 2
+// WEBASSEMBLY64-NEXT:#define __GCC_ATOMIC_LONG_LOCK_FREE 2
+// WEBASSEMBLY64-NEXT:#define __GCC_ATOMIC_POINTER_LOCK_FREE 2
+// WEBASSEMBLY64-NEXT:#define __GCC_ATOMIC_SHORT_LOCK_FREE 2
+// WEBASSEMBLY64-NEXT:#define __GCC_ATOMIC_TEST_AND_SET_TRUEVAL 1
+// WEBASSEMBLY64-NEXT:#define __GCC_ATOMIC_WCHAR_T_LOCK_FREE 2
+// WEBASSEMBLY64-NEXT:#define __GNUC_MINOR__ {{.*}}
+// WEBASSEMBLY64-NEXT:#define __GNUC_PATCHLEVEL__ {{.*}}
+// WEBASSEMBLY64-NEXT:#define __GNUC_STDC_INLINE__ 1
 // WEBASSEMBLY64-NEXT:#define __GNUC__ {{.}}
-// WEBASSEMBLY64-NEXT:#define __GXX_ABI_VERSION 1002{{$}}
-// WEBASSEMBLY64-NEXT:#define __GXX_RTTI 1{{$}}
+// WEBASSEMBLY64-NEXT:#define __GXX_ABI_VERSION 1002
 // WEBASSEMBLY64-NOT:#define __ILP32__
-// WEBASSEMBLY64-NEXT:#define __INT16_C_SUFFIX__ {{$}}
-// WEBASSEMBLY64-NEXT:#define __INT16_FMTd__ "hd"{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT16_FMTi__ "hi"{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT16_MAX__ 32767{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT16_TYPE__ short{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT32_C_SUFFIX__ {{$}}
-// WEBASSEMBLY64-NEXT:#define __INT32_FMTd__ "d"{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT32_FMTi__ "i"{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT32_MAX__ 2147483647{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT32_TYPE__ int{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT64_C_SUFFIX__ LL{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT64_FMTd__ "lld"{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT64_FMTi__ "lli"{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT64_MAX__ 9223372036854775807LL{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT64_TYPE__ long long int{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT8_C_SUFFIX__ {{$}}
-// WEBASSEMBLY64-NEXT:#define __INT8_FMTd__ "hhd"{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT8_FMTi__ "hhi"{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT8_MAX__ 127{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT8_TYPE__ signed char{{$}}
-// WEBASSEMBLY64-NEXT:#define __INTMAX_C_SUFFIX__ LL{{$}}
-// WEBASSEMBLY64-NEXT:#define __INTMAX_FMTd__ "lld"{{$}}
-// WEBASSEMBLY64-NEXT:#define __INTMAX_FMTi__ "lli"{{$}}
-// WEBASSEMBLY64-NEXT:#define __INTMAX_MAX__ 9223372036854775807LL{{$}}
-// WEBASSEMBLY64-NEXT:#define __INTMAX_TYPE__ long long int{{$}}
-// WEBASSEMBLY64-NEXT:#define __INTMAX_WIDTH__ 64{{$}}
-// WEBASSEMBLY64-NEXT:#define __INTPTR_FMTd__ "ld"{{$}}
-// WEBASSEMBLY64-NEXT:#define __INTPTR_FMTi__ "li"{{$}}
-// WEBASSEMBLY64-NEXT:#define __INTPTR_MAX__ 9223372036854775807L{{$}}
-// WEBASSEMBLY64-NEXT:#define __INTPTR_TYPE__ long int{{$}}
-// WEBASSEMBLY64-NEXT:#define __INTPTR_WIDTH__ 64{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_FAST16_FMTd__ "hd"{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_FAST16_FMTi__ "hi"{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_FAST16_MAX__ 32767{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_FAST16_TYPE__ short{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_FAST32_FMTd__ "d"{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_FAST32_FMTi__ "i"{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_FAST32_MAX__ 2147483647{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_FAST32_TYPE__ int{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_FAST64_FMTd__ "lld"{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_FAST64_FMTi__ "lli"{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_FAST64_MAX__ 9223372036854775807LL{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_FAST64_TYPE__ long long int{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_FAST8_FMTd__ "hhd"{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_FAST8_FMTi__ "hhi"{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_FAST8_MAX__ 127{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_FAST8_TYPE__ signed char{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_LEAST16_FMTd__ "hd"{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_LEAST16_FMTi__ "hi"{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_LEAST16_MAX__ 32767{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_LEAST16_TYPE__ short{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_LEAST32_FMTd__ "d"{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_LEAST32_FMTi__ "i"{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_LEAST32_MAX__ 2147483647{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_LEAST32_TYPE__ int{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_LEAST64_FMTd__ "lld"{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_LEAST64_FMTi__ "lli"{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_LEAST64_MAX__ 9223372036854775807LL{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_LEAST64_TYPE__ long long int{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_LEAST8_FMTd__ "hhd"{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_LEAST8_FMTi__ "hhi"{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_LEAST8_MAX__ 127{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_LEAST8_TYPE__ signed char{{$}}
-// WEBASSEMBLY64-NEXT:#define __INT_MAX__ 2147483647{{$}}
-// WEBASSEMBLY64-NEXT:#define __LDBL_DECIMAL_DIG__ 36{{$}}
-// WEBASSEMBLY64-NEXT:#define __LDBL_DENORM_MIN__ 6.47517511943802511092443895822764655e-4966L{{$}}
-// WEBASSEMBLY64-NEXT:#define __LDBL_DIG__ 33{{$}}
-// WEBASSEMBLY64-NEXT:#define __LDBL_EPSILON__ 1.92592994438723585305597794258492732e-34L{{$}}
-// WEBASSEMBLY64-NEXT:#define __LDBL_HAS_DENORM__ 1{{$}}
-// WEBASSEMBLY64-NEXT:#define __LDBL_HAS_INFINITY__ 1{{$}}
-// WEBASSEMBLY64-NEXT:#define __LDBL_HAS_QUIET_NAN__ 1{{$}}
-// WEBASSEMBLY64-NEXT:#define __LDBL_MANT_DIG__ 113{{$}}
-// WEBASSEMBLY64-NEXT:#define __LDBL_MAX_10_EXP__ 4932{{$}}
-// WEBASSEMBLY64-NEXT:#define __LDBL_MAX_EXP__ 16384{{$}}
-// WEBASSEMBLY64-NEXT:#define __LDBL_MAX__ 1.18973149535723176508575932662800702e+4932L{{$}}
-// WEBASSEMBLY64-NEXT:#define __LDBL_MIN_10_EXP__ (-4931){{$}}
-// WEBASSEMBLY64-NEXT:#define __LDBL_MIN_EXP__ (-16381){{$}}
-// WEBASSEMBLY64-NEXT:#define __LDBL_MIN__ 3.36210314311209350626267781732175260e-4932L{{$}}
-// WEBASSEMBLY64-NEXT:#define __LITTLE_ENDIAN__ 1{{$}}
-// WEBASSEMBLY64-NEXT:#define __LONG_LONG_MAX__ 9223372036854775807LL{{$}}
-// WEBASSEMBLY64-NEXT:#define __LONG_MAX__ 9223372036854775807L{{$}}
-// WEBASSEMBLY64-NEXT:#define __LP64__ 1{{$}}
-// WEBASSEMBLY64-NEXT:#define __NO_INLINE__ 1{{$}}
-// WEBASSEMBLY64-NEXT:#define __ORDER_BIG_ENDIAN__ 4321{{$}}
-// WEBASSEMBLY64-NEXT:#define __ORDER_LITTLE_ENDIAN__ 1234{{$}}
-// WEBASSEMBLY64-NEXT:#define __ORDER_PDP_ENDIAN__ 3412{{$}}
-// WEBASSEMBLY64-NEXT:#define __POINTER_WIDTH__ 64{{$}}
-// WEBASSEMBLY64-NEXT:#define __PRAGMA_REDEFINE_EXTNAME 1{{$}}
-// WEBASSEMBLY64-NEXT:#define __PTRDIFF_FMTd__ "ld"{{$}}
-// WEBASSEMBLY64-NEXT:#define __PTRDIFF_FMTi__ "li"{{$}}
-// WEBASSEMBLY64-NEXT:#define __PTRDIFF_MAX__ 9223372036854775807L{{$}}
-// WEBASSEMBLY64-NEXT:#define __PTRDIFF_TYPE__ long int{{$}}
-// WEBASSEMBLY64-NEXT:#define __PTRDIFF_WIDTH__ 64{{$}}
+// WEBASSEMBLY64-NEXT:#define __INT16_C_SUFFIX__
+// WEBASSEMBLY64-NEXT:#define __INT16_FMTd__ "hd"
+// WEBASSEMBLY64-NEXT:#define __INT16_FMTi__ "hi"
+// WEBASSEMBLY64-NEXT:#define __INT16_MAX__ 32767
+// WEBASSEMBLY64-NEXT:#define __INT16_TYPE__ short
+// WEBASSEMBLY64-NEXT:#define __INT32_C_SUFFIX__
+// WEBASSEMBLY64-NEXT:#define __INT32_FMTd__ "d"
+// WEBASSEMBLY64-NEXT:#define __INT32_FMTi__ "i"
+// WEBASSEMBLY64-NEXT:#define __INT32_MAX__ 2147483647
+// WEBASSEMBLY64-NEXT:#define __INT32_TYPE__ int
+// WEBASSEMBLY64-NEXT:#define __INT64_C_SUFFIX__ LL
+// WEBASSEMBLY64-NEXT:#define __INT64_FMTd__ "lld"
+// WEBASSEMBLY64-NEXT:#define __INT64_FMTi__ "lli"
+// WEBASSEMBLY64-NEXT:#define __INT64_MAX__ 9223372036854775807LL
+// WEBASSEMBLY64-NEXT:#define __INT64_TYPE__ long long int
+// WEBASSEMBLY64-NEXT:#define __INT8_C_SUFFIX__
+// WEBASSEMBLY64-NEXT:#define __INT8_FMTd__ "hhd"
+// WEBASSEMBLY64-NEXT:#define __INT8_FMTi__ "hhi"
+// WEBASSEMBLY64-NEXT:#define __INT8_MAX__ 127
+// WEBASSEMBLY64-NEXT:#define __INT8_TYPE__ signed char
+// WEBASSEMBLY64-NEXT:#define __INTMAX_C_SUFFIX__ LL
+// WEBASSEMBLY64-NEXT:#define __INTMAX_FMTd__ "lld"
+// WEBASSEMBLY64-NEXT:#define __INTMAX_FMTi__ "lli"
+// WEBASSEMBLY64-NEXT:#define __INTMAX_MAX__ 9223372036854775807LL
+// WEBASSEMBLY64-NEXT:#define __INTMAX_TYPE__ long long int
+// WEBASSEMBLY64-NEXT:#define __INTMAX_WIDTH__ 64
+// WEBASSEMBLY64-NEXT:#define __INTPTR_FMTd__ "ld"
+// WEBASSEMBLY64-NEXT:#define __INTPTR_FMTi__ "li"
+// WEBASSEMBLY64-NEXT:#define __INTPTR_MAX__ 9223372036854775807L
+// WEBASSEMBLY64-NEXT:#define __INTPTR_TYPE__ long int
+// WEBASSEMBLY64-NEXT:#define __INTPTR_WIDTH__ 64
+// WEBASSEMBLY64-NEXT:#define __INT_FAST16_FMTd__ "hd"
+// WEBASSEMBLY64-NEXT:#define __INT_FAST16_FMTi__ "hi"
+// WEBASSEMBLY64-NEXT:#define __INT_FAST16_MAX__ 32767
+// WEBASSEMBLY64-NEXT:#define __INT_FAST16_TYPE__ short
+// WEBASSEMBLY64-NEXT:#define __INT_FAST32_FMTd__ "d"
+// WEBASSEMBLY64-NEXT:#define __INT_FAST32_FMTi__ "i"
+// WEBASSEMBLY64-NEXT:#define __INT_FAST32_MAX__ 2147483647
+// WEBASSEMBLY64-NEXT:#define __INT_FAST32_TYPE__ int
+// WEBASSEMBLY64-NEXT:#define __INT_FAST64_FMTd__ "lld"
+// WEBASSEMBLY64-NEXT:#define __INT_FAST64_FMTi__ "lli"
+// WEBASSEMBLY64-NEXT:#define __INT_FAST64_MAX__ 9223372036854775807LL
+// WEBASSEMBLY64-NEXT:#define __INT_FAST64_TYPE__ long long int
+// WEBASSEMBLY64-NEXT:#define __INT_FAST8_FMTd__ "hhd"
+// WEBASSEMBLY64-NEXT:#define __INT_FAST8_FMTi__ "hhi"
+// WEBASSEMBLY64-NEXT:#define __INT_FAST8_MAX__ 127
+// WEBASSEMBLY64-NEXT:#define __INT_FAST8_TYPE__ signed char
+// WEBASSEMBLY64-NEXT:#define __INT_LEAST16_FMTd__ "hd"
+// WEBASSEMBLY64-NEXT:#define __INT_LEAST16_FMTi__ "hi"
+// WEBASSEMBLY64-NEXT:#define __INT_LEAST16_MAX__ 32767
+// WEBASSEMBLY64-NEXT:#define __INT_LEAST16_TYPE__ short
+// WEBASSEMBLY64-NEXT:#define __INT_LEAST32_FMTd__ "d"
+// WEBASSEMBLY64-NEXT:#define __INT_LEAST32_FMTi__ "i"
+// WEBASSEMBLY64-NEXT:#define __INT_LEAST32_MAX__ 2147483647
+// WEBASSEMBLY64-NEXT:#define __INT_LEAST32_TYPE__ int
+// WEBASSEMBLY64-NEXT:#define __INT_LEAST64_FMTd__ "lld"
+// WEBASSEMBLY64-NEXT:#define __INT_LEAST64_FMTi__ "lli"
+// WEBASSEMBLY64-NEXT:#define __INT_LEAST64_MAX__ 9223372036854775807LL
+// WEBASSEMBLY64-NEXT:#define __INT_LEAST64_TYPE__ long long int
+// WEBASSEMBLY64-NEXT:#define __INT_LEAST8_FMTd__ "hhd"
+// WEBASSEMBLY64-NEXT:#define __INT_LEAST8_FMTi__ "hhi"
+// WEBASSEMBLY64-NEXT:#define __INT_LEAST8_MAX__ 127
+// WEBASSEMBLY64-NEXT:#define __INT_LEAST8_TYPE__ signed char
+// WEBASSEMBLY64-NEXT:#define __INT_MAX__ 2147483647
+// WEBASSEMBLY64-NEXT:#define __LDBL_DECIMAL_DIG__ 36
+// WEBASSEMBLY64-NEXT:#define __LDBL_DENORM_MIN__ 6.47517511943802511092443895822764655e-4966L
+// WEBASSEMBLY64-NEXT:#define __LDBL_DIG__ 33
+// WEBASSEMBLY64-NEXT:#define __LDBL_EPSILON__ 1.92592994438723585305597794258492732e-34L
+// WEBASSEMBLY64-NEXT:#define __LDBL_HAS_DENORM__ 1
+// WEBASSEMBLY64-NEXT:#define __LDBL_HAS_INFINITY__ 1
+// WEBASSEMBLY64-NEXT:#define __LDBL_HAS_QUIET_NAN__ 1
+// WEBASSEMBLY64-NEXT:#define __LDBL_MANT_DIG__ 113
+// WEBASSEMBLY64-NEXT:#define __LDBL_MAX_10_EXP__ 4932
+// WEBASSEMBLY64-NEXT:#define __LDBL_MAX_EXP__ 16384
+// WEBASSEMBLY64-NEXT:#define __LDBL_MAX__ 1.18973149535723176508575932662800702e+4932L
+// WEBASSEMBLY64-NEXT:#define __LDBL_MIN_10_EXP__ (-4931)
+// WEBASSEMBLY64-NEXT:#define __LDBL_MIN_EXP__ (-16381)
+// WEBASSEMBLY64-NEXT:#define __LDBL_MIN__ 3.36210314311209350626267781732175260e-4932L
+// WEBASSEMBLY64-NEXT:#define __LITTLE_ENDIAN__ 1
+// WEBASSEMBLY64-NEXT:#define __LONG_LONG_MAX__ 9223372036854775807LL
+// WEBASSEMBLY64-NEXT:#define __LONG_MAX__ 9223372036854775807L
+// WEBASSEMBLY64-NEXT:#define __LP64__ 1
+// WEBASSEMBLY64-NEXT:#define __NO_INLINE__ 1
+// WEBASSEMBLY64-NEXT:#define __ORDER_BIG_ENDIAN__ 4321
+// WEBASSEMBLY64-NEXT:#define __ORDER_LITTLE_ENDIAN__ 1234
+// WEBASSEMBLY64-NEXT:#define __ORDER_PDP_ENDIAN__ 3412
+// WEBASSEMBLY64-NEXT:#define __POINTER_WIDTH__ 64
+// WEBASSEMBLY64-NEXT:#define __PRAGMA_REDEFINE_EXTNAME 1
+// WEBASSEMBLY64-NEXT:#define __PTRDIFF_FMTd__ "ld"
+// WEBASSEMBLY64-NEXT:#define __PTRDIFF_FMTi__ "li"
+// WEBASSEMBLY64-NEXT:#define __PTRDIFF_MAX__ 9223372036854775807L
+// WEBASSEMBLY64-NEXT:#define __PTRDIFF_TYPE__ long int
+// WEBASSEMBLY64-NEXT:#define __PTRDIFF_WIDTH__ 64
 // WEBASSEMBLY64-NOT:#define __REGISTER_PREFIX__
-// WEBASSEMBLY64-NEXT:#define __SCHAR_MAX__ 127{{$}}
-// WEBASSEMBLY64-NEXT:#define __SHRT_MAX__ 32767{{$}}
-// WEBASSEMBLY64-NEXT:#define __SIG_ATOMIC_MAX__ 9223372036854775807L{{$}}
-// WEBASSEMBLY64-NEXT:#define __SIG_ATOMIC_WIDTH__ 64{{$}}
-// WEBASSEMBLY64-NEXT:#define __SIZEOF_DOUBLE__ 8{{$}}
-// WEBASSEMBLY64-NEXT:#define __SIZEOF_FLOAT__ 4{{$}}
-// WEBASSEMBLY64-NEXT:#define __SIZEOF_INT128__ 16{{$}}
-// WEBASSEMBLY64-NEXT:#define __SIZEOF_INT__ 4{{$}}
-// WEBASSEMBLY64-NEXT:#define __SIZEOF_LONG_DOUBLE__ 16{{$}}
-// WEBASSEMBLY64-NEXT:#define __SIZEOF_LONG_LONG__ 8{{$}}
-// WEBASSEMBLY64-NEXT:#define __SIZEOF_LONG__ 8{{$}}
-// WEBASSEMBLY64-NEXT:#define __SIZEOF_POINTER__ 8{{$}}
-// WEBASSEMBLY64-NEXT:#define __SIZEOF_PTRDIFF_T__ 8{{$}}
-// WEBASSEMBLY64-NEXT:#define __SIZEOF_SHORT__ 2{{$}}
-// WEBASSEMBLY64-NEXT:#define __SIZEOF_SIZE_T__ 8{{$}}
-// WEBASSEMBLY64-NEXT:#define __SIZEOF_WCHAR_T__ 4{{$}}
-// WEBASSEMBLY64-NEXT:#define __SIZEOF_WINT_T__ 4{{$}}
-// WEBASSEMBLY64-NEXT:#define __SIZE_FMTX__ "lX"{{$}}
-// WEBASSEMBLY64-NEXT:#define __SIZE_FMTo__ "lo"{{$}}
-// WEBASSEMBLY64-NEXT:#define __SIZE_FMTu__ "lu"{{$}}
-// WEBASSEMBLY64-NEXT:#define __SIZE_FMTx__ "lx"{{$}}
-// WEBASSEMBLY64-NEXT:#define __SIZE_MAX__ 18446744073709551615UL{{$}}
-// WEBASSEMBLY64-NEXT:#define __SIZE_TYPE__ long unsigned int{{$}}
-// WEBASSEMBLY64-NEXT:#define __SIZE_WIDTH__ 64{{$}}
-// WEBASSEMBLY64-NEXT:#define __STDC_HOSTED__ 0{{$}}
+// WEBASSEMBLY64-NEXT:#define __SCHAR_MAX__ 127
+// WEBASSEMBLY64-NEXT:#define __SHRT_MAX__ 32767
+// WEBASSEMBLY64-NEXT:#define __SIG_ATOMIC_MAX__ 9223372036854775807L
+// WEBASSEMBLY64-NEXT:#define __SIG_ATOMIC_WIDTH__ 64
+// WEBASSEMBLY64-NEXT:#define __SIZEOF_DOUBLE__ 8
+// WEBASSEMBLY64-NEXT:#define __SIZEOF_FLOAT__ 4
+// WEBASSEMBLY64-NEXT:#define __SIZEOF_INT128__ 16
+// WEBASSEMBLY64-NEXT:#define __SIZEOF_INT__ 4
+// WEBASSEMBLY64-NEXT:#define __SIZEOF_LONG_DOUBLE__ 16
+// WEBASSEMBLY64-NEXT:#define __SIZEOF_LONG_LONG__ 8
+// WEBASSEMBLY64-NEXT:#define __SIZEOF_LONG__ 8
+// WEBASSEMBLY64-NEXT:#define __SIZEOF_POINTER__ 8
+// WEBASSEMBLY64-NEXT:#define __SIZEOF_PTRDIFF_T__ 8
+// WEBASSEMBLY64-NEXT:#define __SIZEOF_SHORT__ 2
+// WEBASSEMBLY64-NEXT:#define __SIZEOF_SIZE_T__ 8
+// WEBASSEMBLY64-NEXT:#define __SIZEOF_WCHAR_T__ 4
+// WEBASSEMBLY64-NEXT:#define __SIZEOF_WINT_T__ 4
+// WEBASSEMBLY64-NEXT:#define __SIZE_FMTX__ "lX"
+// WEBASSEMBLY64-NEXT:#define __SIZE_FMTo__ "lo"
+// WEBASSEMBLY64-NEXT:#define __SIZE_FMTu__ "lu"
+// WEBASSEMBLY64-NEXT:#define __SIZE_FMTx__ "lx"
+// WEBASSEMBLY64-NEXT:#define __SIZE_MAX__ 18446744073709551615UL
+// WEBASSEMBLY64-NEXT:#define __SIZE_TYPE__ long unsigned int
+// WEBASSEMBLY64-NEXT:#define __SIZE_WIDTH__ 64
+// WEBASSEMBLY64-NEXT:#define __STDC_HOSTED__ 0
 // WEBASSEMBLY64-NOT:#define __STDC_MB_MIGHT_NEQ_WC__
 // WEBASSEMBLY64-NOT:#define __STDC_NO_ATOMICS__
 // WEBASSEMBLY64-NOT:#define __STDC_NO_COMPLEX__
 // WEBASSEMBLY64-NOT:#define __STDC_NO_VLA__
 // WEBASSEMBLY64-NOT:#define __STDC_NO_THREADS__
-// WEBASSEMBLY64-NEXT:#define __STDC_UTF_16__ 1{{$}}
-// WEBASSEMBLY64-NEXT:#define __STDC_UTF_32__ 1{{$}}
-// WEBASSEMBLY64-NEXT:#define __STDC_VERSION__ 201112L{{$}}
-// WEBASSEMBLY64-NEXT:#define __STDC__ 1{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT16_C_SUFFIX__ {{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT16_FMTX__ "hX"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT16_FMTo__ "ho"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT16_FMTu__ "hu"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT16_FMTx__ "hx"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT16_MAX__ 65535{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT16_TYPE__ unsigned short{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT32_C_SUFFIX__ U{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT32_FMTX__ "X"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT32_FMTo__ "o"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT32_FMTu__ "u"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT32_FMTx__ "x"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT32_MAX__ 4294967295U{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT32_TYPE__ unsigned int{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT64_C_SUFFIX__ ULL{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT64_FMTX__ "llX"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT64_FMTo__ "llo"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT64_FMTu__ "llu"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT64_FMTx__ "llx"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT64_MAX__ 18446744073709551615ULL{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT64_TYPE__ long long unsigned int{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT8_C_SUFFIX__ {{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT8_FMTX__ "hhX"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT8_FMTo__ "hho"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT8_FMTu__ "hhu"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT8_FMTx__ "hhx"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT8_MAX__ 255{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT8_TYPE__ unsigned char{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINTMAX_C_SUFFIX__ ULL{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINTMAX_FMTX__ "llX"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINTMAX_FMTo__ "llo"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINTMAX_FMTu__ "llu"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINTMAX_FMTx__ "llx"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINTMAX_MAX__ 18446744073709551615ULL{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINTMAX_TYPE__ long long unsigned int{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINTMAX_WIDTH__ 64{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINTPTR_FMTX__ "lX"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINTPTR_FMTo__ "lo"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINTPTR_FMTu__ "lu"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINTPTR_FMTx__ "lx"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINTPTR_MAX__ 18446744073709551615UL{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINTPTR_TYPE__ long unsigned int{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINTPTR_WIDTH__ 64{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_FAST16_FMTX__ "hX"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_FAST16_FMTo__ "ho"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_FAST16_FMTu__ "hu"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_FAST16_FMTx__ "hx"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_FAST16_MAX__ 65535{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_FAST16_TYPE__ unsigned short{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_FAST32_FMTX__ "X"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_FAST32_FMTo__ "o"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_FAST32_FMTu__ "u"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_FAST32_FMTx__ "x"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_FAST32_MAX__ 4294967295U{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_FAST32_TYPE__ unsigned int{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_FAST64_FMTX__ "llX"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_FAST64_FMTo__ "llo"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_FAST64_FMTu__ "llu"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_FAST64_FMTx__ "llx"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_FAST64_MAX__ 18446744073709551615ULL{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_FAST64_TYPE__ long long unsigned int{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_FAST8_FMTX__ "hhX"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_FAST8_FMTo__ "hho"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_FAST8_FMTu__ "hhu"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_FAST8_FMTx__ "hhx"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_FAST8_MAX__ 255{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_FAST8_TYPE__ unsigned char{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_LEAST16_FMTX__ "hX"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_LEAST16_FMTo__ "ho"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_LEAST16_FMTu__ "hu"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_LEAST16_FMTx__ "hx"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_LEAST16_MAX__ 65535{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_LEAST16_TYPE__ unsigned short{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_LEAST32_FMTX__ "X"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_LEAST32_FMTo__ "o"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_LEAST32_FMTu__ "u"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_LEAST32_FMTx__ "x"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_LEAST32_MAX__ 4294967295U{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_LEAST32_TYPE__ unsigned int{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_LEAST64_FMTX__ "llX"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_LEAST64_FMTo__ "llo"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_LEAST64_FMTu__ "llu"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_LEAST64_FMTx__ "llx"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_LEAST64_MAX__ 18446744073709551615ULL{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_LEAST64_TYPE__ long long unsigned int{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_LEAST8_FMTX__ "hhX"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_LEAST8_FMTo__ "hho"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_LEAST8_FMTu__ "hhu"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_LEAST8_FMTx__ "hhx"{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_LEAST8_MAX__ 255{{$}}
-// WEBASSEMBLY64-NEXT:#define __UINT_LEAST8_TYPE__ unsigned char{{$}}
-// WEBASSEMBLY64-NEXT:#define __USER_LABEL_PREFIX__ {{$}}
-// WEBASSEMBLY64-NEXT:#define __VERSION__ "{{.*}}"{{$}}
-// WEBASSEMBLY64-NEXT:#define __WCHAR_MAX__ 2147483647{{$}}
-// WEBASSEMBLY64-NEXT:#define __WCHAR_TYPE__ int{{$}}
+// WEBASSEMBLY64-NEXT:#define __STDC_UTF_16__ 1
+// WEBASSEMBLY64-NEXT:#define __STDC_UTF_32__ 1
+// WEBASSEMBLY64-NEXT:#define __STDC_VERSION__ 201112L
+// WEBASSEMBLY64-NEXT:#define __STDC__ 1
+// WEBASSEMBLY64-NEXT:#define __UINT16_C_SUFFIX__
+// WEBASSEMBLY64-NEXT:#define __UINT16_FMTX__ "hX"
+// WEBASSEMBLY64-NEXT:#define __UINT16_FMTo__ "ho"
+// WEBASSEMBLY64-NEXT:#define __UINT16_FMTu__ "hu"
+// WEBASSEMBLY64-NEXT:#define __UINT16_FMTx__ "hx"
+// WEBASSEMBLY64-NEXT:#define __UINT16_MAX__ 65535
+// WEBASSEMBLY64-NEXT:#define __UINT16_TYPE__ unsigned short
+// WEBASSEMBLY64-NEXT:#define __UINT32_C_SUFFIX__ U
+// WEBASSEMBLY64-NEXT:#define __UINT32_FMTX__ "X"
+// WEBASSEMBLY64-NEXT:#define __UINT32_FMTo__ "o"
+// WEBASSEMBLY64-NEXT:#define __UINT32_FMTu__ "u"
+// WEBASSEMBLY64-NEXT:#define __UINT32_FMTx__ "x"
+// WEBASSEMBLY64-NEXT:#define __UINT32_MAX__ 4294967295U
+// WEBASSEMBLY64-NEXT:#define __UINT32_TYPE__ unsigned int
+// WEBASSEMBLY64-NEXT:#define __UINT64_C_SUFFIX__ ULL
+// WEBASSEMBLY64-NEXT:#define __UINT64_FMTX__ "llX"
+// WEBASSEMBLY64-NEXT:#define __UINT64_FMTo__ "llo"
+// WEBASSEMBLY64-NEXT:#define __UINT64_FMTu__ "llu"
+// WEBASSEMBLY64-NEXT:#define __UINT64_FMTx__ "llx"
+// WEBASSEMBLY64-NEXT:#define __UINT64_MAX__ 18446744073709551615ULL
+// WEBASSEMBLY64-NEXT:#define __UINT64_TYPE__ long long unsigned int
+// WEBASSEMBLY64-NEXT:#define __UINT8_C_SUFFIX__
+// WEBASSEMBLY64-NEXT:#define __UINT8_FMTX__ "hhX"
+// WEBASSEMBLY64-NEXT:#define __UINT8_FMTo__ "hho"
+// WEBASSEMBLY64-NEXT:#define __UINT8_FMTu__ "hhu"
+// WEBASSEMBLY64-NEXT:#define __UINT8_FMTx__ "hhx"
+// WEBASSEMBLY64-NEXT:#define __UINT8_MAX__ 255
+// WEBASSEMBLY64-NEXT:#define __UINT8_TYPE__ unsigned char
+// WEBASSEMBLY64-NEXT:#define __UINTMAX_C_SUFFIX__ ULL
+// WEBASSEMBLY64-NEXT:#define __UINTMAX_FMTX__ "llX"
+// WEBASSEMBLY64-NEXT:#define __UINTMAX_FMTo__ "llo"
+// WEBASSEMBLY64-NEXT:#define __UINTMAX_FMTu__ "llu"
+// WEBASSEMBLY64-NEXT:#define __UINTMAX_FMTx__ "llx"
+// WEBASSEMBLY64-NEXT:#define __UINTMAX_MAX__ 18446744073709551615ULL
+// WEBASSEMBLY64-NEXT:#define __UINTMAX_TYPE__ long long unsigned int
+// WEBASSEMBLY64-NEXT:#define __UINTMAX_WIDTH__ 64
+// WEBASSEMBLY64-NEXT:#define __UINTPTR_FMTX__ "lX"
+// WEBASSEMBLY64-NEXT:#define __UINTPTR_FMTo__ "lo"
+// WEBASSEMBLY64-NEXT:#define __UINTPTR_FMTu__ "lu"
+// WEBASSEMBLY64-NEXT:#define __UINTPTR_FMTx__ "lx"
+// WEBASSEMBLY64-NEXT:#define __UINTPTR_MAX__ 18446744073709551615UL
+// WEBASSEMBLY64-NEXT:#define __UINTPTR_TYPE__ long unsigned int
+// WEBASSEMBLY64-NEXT:#define __UINTPTR_WIDTH__ 64
+// WEBASSEMBLY64-NEXT:#define __UINT_FAST16_FMTX__ "hX"
+// WEBASSEMBLY64-NEXT:#define __UINT_FAST16_FMTo__ "ho"
+// WEBASSEMBLY64-NEXT:#define __UINT_FAST16_FMTu__ "hu"
+// WEBASSEMBLY64-NEXT:#define __UINT_FAST16_FMTx__ "hx"
+// WEBASSEMBLY64-NEXT:#define __UINT_FAST16_MAX__ 65535
+// WEBASSEMBLY64-NEXT:#define __UINT_FAST16_TYPE__ unsigned short
+// WEBASSEMBLY64-NEXT:#define __UINT_FAST32_FMTX__ "X"
+// WEBASSEMBLY64-NEXT:#define __UINT_FAST32_FMTo__ "o"
+// WEBASSEMBLY64-NEXT:#define __UINT_FAST32_FMTu__ "u"
+// WEBASSEMBLY64-NEXT:#define __UINT_FAST32_FMTx__ "x"
+// WEBASSEMBLY64-NEXT:#define __UINT_FAST32_MAX__ 4294967295U
+// WEBASSEMBLY64-NEXT:#define __UINT_FAST32_TYPE__ unsigned int
+// WEBASSEMBLY64-NEXT:#define __UINT_FAST64_FMTX__ "llX"
+// WEBASSEMBLY64-NEXT:#define __UINT_FAST64_FMTo__ "llo"
+// WEBASSEMBLY64-NEXT:#define __UINT_FAST64_FMTu__ "llu"
+// WEBASSEMBLY64-NEXT:#define __UINT_FAST64_FMTx__ "llx"
+// WEBASSEMBLY64-NEXT:#define __UINT_FAST64_MAX__ 18446744073709551615ULL
+// WEBASSEMBLY64-NEXT:#define __UINT_FAST64_TYPE__ long long unsigned int
+// WEBASSEMBLY64-NEXT:#define __UINT_FAST8_FMTX__ "hhX"
+// WEBASSEMBLY64-NEXT:#define __UINT_FAST8_FMTo__ "hho"
+// WEBASSEMBLY64-NEXT:#define __UINT_FAST8_FMTu__ "hhu"
+// WEBASSEMBLY64-NEXT:#define __UINT_FAST8_FMTx__ "hhx"
+// WEBASSEMBLY64-NEXT:#define __UINT_FAST8_MAX__ 255
+// WEBASSEMBLY64-NEXT:#define __UINT_FAST8_TYPE__ unsigned char
+// WEBASSEMBLY64-NEXT:#define __UINT_LEAST16_FMTX__ "hX"
+// WEBASSEMBLY64-NEXT:#define __UINT_LEAST16_FMTo__ "ho"
+// WEBASSEMBLY64-NEXT:#define __UINT_LEAST16_FMTu__ "hu"
+// WEBASSEMBLY64-NEXT:#define __UINT_LEAST16_FMTx__ "hx"
+// WEBASSEMBLY64-NEXT:#define __UINT_LEAST16_MAX__ 65535
+// WEBASSEMBLY64-NEXT:#define __UINT_LEAST16_TYPE__ unsigned short
+// WEBASSEMBLY64-NEXT:#define __UINT_LEAST32_FMTX__ "X"
+// WEBASSEMBLY64-NEXT:#define __UINT_LEAST32_FMTo__ "o"
+// WEBASSEMBLY64-NEXT:#define __UINT_LEAST32_FMTu__ "u"
+// WEBASSEMBLY64-NEXT:#define __UINT_LEAST32_FMTx__ "x"
+// WEBASSEMBLY64-NEXT:#define __UINT_LEAST32_MAX__ 4294967295U
+// WEBASSEMBLY64-NEXT:#define __UINT_LEAST32_TYPE__ unsigned int
+// WEBASSEMBLY64-NEXT:#define __UINT_LEAST64_FMTX__ "llX"
+// WEBASSEMBLY64-NEXT:#define __UINT_LEAST64_FMTo__ "llo"
+// WEBASSEMBLY64-NEXT:#define __UINT_LEAST64_FMTu__ "llu"
+// WEBASSEMBLY64-NEXT:#define __UINT_LEAST64_FMTx__ "llx"
+// WEBASSEMBLY64-NEXT:#define __UINT_LEAST64_MAX__ 18446744073709551615ULL
+// WEBASSEMBLY64-NEXT:#define __UINT_LEAST64_TYPE__ long long unsigned int
+// WEBASSEMBLY64-NEXT:#define __UINT_LEAST8_FMTX__ "hhX"
+// WEBASSEMBLY64-NEXT:#define __UINT_LEAST8_FMTo__ "hho"
+// WEBASSEMBLY64-NEXT:#define __UINT_LEAST8_FMTu__ "hhu"
+// WEBASSEMBLY64-NEXT:#define __UINT_LEAST8_FMTx__ "hhx"
+// WEBASSEMBLY64-NEXT:#define __UINT_LEAST8_MAX__ 255
+// WEBASSEMBLY64-NEXT:#define __UINT_LEAST8_TYPE__ unsigned char
+// WEBASSEMBLY64-NEXT:#define __USER_LABEL_PREFIX__
+// WEBASSEMBLY64-NEXT:#define __VERSION__ "{{.*}}"
+// WEBASSEMBLY64-NEXT:#define __WCHAR_MAX__ 2147483647
+// WEBASSEMBLY64-NEXT:#define __WCHAR_TYPE__ int
 // WEBASSEMBLY64-NOT:#define __WCHAR_UNSIGNED__
-// WEBASSEMBLY64-NEXT:#define __WCHAR_WIDTH__ 32{{$}}
-// WEBASSEMBLY64-NEXT:#define __WINT_TYPE__ int{{$}}
+// WEBASSEMBLY64-NEXT:#define __WCHAR_WIDTH__ 32
+// WEBASSEMBLY64-NEXT:#define __WINT_TYPE__ int
 // WEBASSEMBLY64-NOT:#define __WINT_UNSIGNED__
-// WEBASSEMBLY64-NEXT:#define __WINT_WIDTH__ 32{{$}}
-// WEBASSEMBLY64-NEXT:#define __clang__ 1{{$}}
-// WEBASSEMBLY64-NEXT:#define __clang_major__ {{.}}
-// WEBASSEMBLY64-NEXT:#define __clang_minor__ {{.}}
-// WEBASSEMBLY64-NEXT:#define __clang_patchlevel__ {{.}}
-// WEBASSEMBLY64-NEXT:#define __clang_version__ "{{.*}}"{{$}}
-// WEBASSEMBLY64-NEXT:#define __llvm__ 1{{$}}
+// WEBASSEMBLY64-NEXT:#define __WINT_WIDTH__ 32
+// WEBASSEMBLY64-NEXT:#define __clang__ 1
+// WEBASSEMBLY64-NEXT:#define __clang_major__ {{.*}}
+// WEBASSEMBLY64-NEXT:#define __clang_minor__ {{.*}}
+// WEBASSEMBLY64-NEXT:#define __clang_patchlevel__ {{.*}}
+// WEBASSEMBLY64-NEXT:#define __clang_version__ "{{.*}}"
+// WEBASSEMBLY64-NEXT:#define __llvm__ 1
 // WEBASSEMBLY64-NOT:#define __wasm_simd128__
 // WEBASSEMBLY64-NOT:#define __wasm_simd256__
 // WEBASSEMBLY64-NOT:#define __wasm_simd512__
 // WEBASSEMBLY64-NOT:#define __unix
 // WEBASSEMBLY64-NOT:#define __unix__
-// WEBASSEMBLY64-NEXT:#define __wasm 1{{$}}
+// WEBASSEMBLY64-NEXT:#define __wasm 1
 // WEBASSEMBLY64-NOT:#define __wasm32
 // WEBASSEMBLY64-NOT:#define __wasm32__
-// WEBASSEMBLY64-NEXT:#define __wasm64 1{{$}}
-// WEBASSEMBLY64-NEXT:#define __wasm64__ 1{{$}}
-// WEBASSEMBLY64-NEXT:#define __wasm__ 1{{$}}
+// WEBASSEMBLY64-NEXT:#define __wasm64 1
+// WEBASSEMBLY64-NEXT:#define __wasm64__ 1
+// WEBASSEMBLY64-NEXT:#define __wasm__ 1
+
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple i686-windows-cygnus < /dev/null | FileCheck -match-full-lines -check-prefix CYGWIN-X32 %s
+// CYGWIN-X32: #define __USER_LABEL_PREFIX__ _
+
+// RUN: %clang_cc1 -E -dM -ffreestanding -triple x86_64-windows-cygnus < /dev/null | FileCheck -match-full-lines -check-prefix CYGWIN-X64 %s
+// CYGWIN-X64: #define __USER_LABEL_PREFIX__
+
diff --git a/test/Preprocessor/invalid-__has_warning1.c b/test/Preprocessor/invalid-__has_warning1.c
index b6a0b2e8ee3c9..5e4f12f5699d9 100644
--- a/test/Preprocessor/invalid-__has_warning1.c
+++ b/test/Preprocessor/invalid-__has_warning1.c
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -verify %s
 
 // These must be the last lines in this test.
-// expected-error@+1{{expected string literal}} expected-error@+1 2{{expected}}
+// expected-error@+1{{unterminated}} expected-error@+1 2{{expected}}
 int i = __has_warning(
diff --git a/test/Preprocessor/invalid-__has_warning2.c b/test/Preprocessor/invalid-__has_warning2.c
index 8aba530c875a9..f54ff47993190 100644
--- a/test/Preprocessor/invalid-__has_warning2.c
+++ b/test/Preprocessor/invalid-__has_warning2.c
@@ -1,5 +1,5 @@
 // RUN: %clang_cc1 -verify %s
 
 // These must be the last lines in this test.
-// expected-error@+1{{expected string literal}} expected-error@+1{{expected}}
+// expected-error@+1{{too few arguments}}
 int i = __has_warning();
diff --git a/test/Preprocessor/macro_expand.c b/test/Preprocessor/macro_expand.c
index cf98a2cbfb8fc..430068ba7295d 100644
--- a/test/Preprocessor/macro_expand.c
+++ b/test/Preprocessor/macro_expand.c
@@ -19,3 +19,9 @@ C: for(for))
 // rdar://6880648
 #define f(x,y...) y
 f()
+
+// CHECK: #pragma omp parallel for
+#define FOO parallel
+#define Streaming _Pragma("omp FOO for")
+Streaming
+
diff --git a/test/Preprocessor/microsoft-ext.c b/test/Preprocessor/microsoft-ext.c
index b03f6775429a3..cb3cf4f153798 100644
--- a/test/Preprocessor/microsoft-ext.c
+++ b/test/Preprocessor/microsoft-ext.c
@@ -34,3 +34,12 @@ ACTION_TEMPLATE(InvokeArgument,
 
 MAKE_FUNC(MAK, ER, int a, _COMMA, int b);
 // CHECK: void func(int a , int b) {}
+
+#define macro(a, b) (a - b)
+void function(int a);
+#define COMMA_ELIDER(...) \
+  macro(x, __VA_ARGS__); \
+  function(x, __VA_ARGS__);
+COMMA_ELIDER();
+// CHECK: (x - );
+// CHECK: function(x);
diff --git a/test/Preprocessor/pic.c b/test/Preprocessor/pic.c
index 3e649ee5ea2d1..ec8c9542f001e 100644
--- a/test/Preprocessor/pic.c
+++ b/test/Preprocessor/pic.c
@@ -19,16 +19,16 @@
 // CHECK-PIC2: #define __pic__ 2
 // CHECK-PIC2-NOT: #define __pie__
 //
-// RUN: %clang_cc1 -pie-level 1 -dM -E -o - %s \
+// RUN: %clang_cc1 -pic-level 1 -pic-is-pie -dM -E -o - %s \
 // RUN:   | FileCheck --check-prefix=CHECK-PIE1 %s
-// CHECK-PIE1-NOT: #define __PIC__
+// CHECK-PIE1: #define __PIC__ 1
 // CHECK-PIE1: #define __PIE__ 1
-// CHECK-PIE1-NOT: #define __pic__
+// CHECK-PIE1: #define __pic__ 1
 // CHECK-PIE1: #define __pie__ 1
 //
-// RUN: %clang_cc1 -pie-level 2 -dM -E -o - %s \
+// RUN: %clang_cc1 -pic-level 2 -pic-is-pie -dM -E -o - %s \
 // RUN:   | FileCheck --check-prefix=CHECK-PIE2 %s
-// CHECK-PIE2-NOT: #define __PIC__
+// CHECK-PIE2: #define __PIC__ 2
 // CHECK-PIE2: #define __PIE__ 2
-// CHECK-PIE2-NOT: #define __pic__
+// CHECK-PIE2: #define __pic__ 2
 // CHECK-PIE2: #define __pie__ 2
diff --git a/test/Preprocessor/pragma_diagnostic.c b/test/Preprocessor/pragma_diagnostic.c
index e8a67abb79eb1..3970dbbc8e2fa 100644
--- a/test/Preprocessor/pragma_diagnostic.c
+++ b/test/Preprocessor/pragma_diagnostic.c
@@ -30,3 +30,18 @@
 
 #pragma GCC diagnostic error "-Winvalid-name"  // expected-warning {{unknown warning group '-Winvalid-name', ignored}}
 
+
+// Testing pragma clang diagnostic with -Weverything
+void ppo(){} // First test that we do not diagnose on this.
+
+#pragma clang diagnostic warning "-Weverything"
+void ppp(){} // expected-warning {{no previous prototype for function 'ppp'}}
+
+#pragma clang diagnostic ignored "-Weverything" // Reset it.
+void ppq(){}
+
+#pragma clang diagnostic error "-Weverything" // Now set to error
+void ppr(){} // expected-error {{no previous prototype for function 'ppr'}}
+
+#pragma clang diagnostic warning "-Weverything" // This should not be effective
+void pps(){} // expected-error {{no previous prototype for function 'pps'}}
diff --git a/test/Preprocessor/predefined-arch-macros.c b/test/Preprocessor/predefined-arch-macros.c
index ea98e7fb0fe5e..18a75df66e7d6 100644
--- a/test/Preprocessor/predefined-arch-macros.c
+++ b/test/Preprocessor/predefined-arch-macros.c
@@ -2,19 +2,19 @@
 //
 // RUN: %clang -march=i386 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_I386_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_I386_M32
 // CHECK_I386_M32: #define __i386 1
 // CHECK_I386_M32: #define __i386__ 1
 // CHECK_I386_M32: #define __tune_i386__ 1
 // CHECK_I386_M32: #define i386 1
 // RUN: not %clang -march=i386 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_I386_M64
-// CHECK_I386_M64: error:
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_I386_M64
+// CHECK_I386_M64: error: {{.*}}
 //
 // RUN: %clang -march=i486 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_I486_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_I486_M32
 // CHECK_I486_M32: #define __i386 1
 // CHECK_I486_M32: #define __i386__ 1
 // CHECK_I486_M32: #define __i486 1
@@ -23,12 +23,12 @@
 // CHECK_I486_M32: #define i386 1
 // RUN: not %clang -march=i486 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_I486_M64
-// CHECK_I486_M64: error:
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_I486_M64
+// CHECK_I486_M64: error: {{.*}}
 //
 // RUN: %clang -march=i586 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_I586_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_I586_M32
 // CHECK_I586_M32: #define __i386 1
 // CHECK_I586_M32: #define __i386__ 1
 // CHECK_I586_M32: #define __i586 1
@@ -40,12 +40,12 @@
 // CHECK_I586_M32: #define i386 1
 // RUN: not %clang -march=i586 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_I586_M64
-// CHECK_I586_M64: error:
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_I586_M64
+// CHECK_I586_M64: error: {{.*}}
 //
 // RUN: %clang -march=pentium -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_PENTIUM_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_PENTIUM_M32
 // CHECK_PENTIUM_M32: #define __i386 1
 // CHECK_PENTIUM_M32: #define __i386__ 1
 // CHECK_PENTIUM_M32: #define __i586 1
@@ -57,12 +57,12 @@
 // CHECK_PENTIUM_M32: #define i386 1
 // RUN: not %clang -march=pentium -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_PENTIUM_M64
-// CHECK_PENTIUM_M64: error:
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_PENTIUM_M64
+// CHECK_PENTIUM_M64: error: {{.*}}
 //
 // RUN: %clang -march=pentium-mmx -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_PENTIUM_MMX_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_PENTIUM_MMX_M32
 // CHECK_PENTIUM_MMX_M32: #define __MMX__ 1
 // CHECK_PENTIUM_MMX_M32: #define __i386 1
 // CHECK_PENTIUM_MMX_M32: #define __i386__ 1
@@ -77,12 +77,12 @@
 // CHECK_PENTIUM_MMX_M32: #define i386 1
 // RUN: not %clang -march=pentium-mmx -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_PENTIUM_MMX_M64
-// CHECK_PENTIUM_MMX_M64: error:
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_PENTIUM_MMX_M64
+// CHECK_PENTIUM_MMX_M64: error: {{.*}}
 //
 // RUN: %clang -march=winchip-c6 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_WINCHIP_C6_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_WINCHIP_C6_M32
 // CHECK_WINCHIP_C6_M32: #define __MMX__ 1
 // CHECK_WINCHIP_C6_M32: #define __i386 1
 // CHECK_WINCHIP_C6_M32: #define __i386__ 1
@@ -92,12 +92,12 @@
 // CHECK_WINCHIP_C6_M32: #define i386 1
 // RUN: not %clang -march=winchip-c6 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_WINCHIP_C6_M64
-// CHECK_WINCHIP_C6_M64: error:
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_WINCHIP_C6_M64
+// CHECK_WINCHIP_C6_M64: error: {{.*}}
 //
 // RUN: %clang -march=winchip2 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_WINCHIP2_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_WINCHIP2_M32
 // CHECK_WINCHIP2_M32: #define __3dNOW__ 1
 // CHECK_WINCHIP2_M32: #define __MMX__ 1
 // CHECK_WINCHIP2_M32: #define __i386 1
@@ -108,12 +108,12 @@
 // CHECK_WINCHIP2_M32: #define i386 1
 // RUN: not %clang -march=winchip2 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_WINCHIP2_M64
-// CHECK_WINCHIP2_M64: error:
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_WINCHIP2_M64
+// CHECK_WINCHIP2_M64: error: {{.*}}
 //
 // RUN: %clang -march=c3 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_C3_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_C3_M32
 // CHECK_C3_M32: #define __3dNOW__ 1
 // CHECK_C3_M32: #define __MMX__ 1
 // CHECK_C3_M32: #define __i386 1
@@ -124,12 +124,12 @@
 // CHECK_C3_M32: #define i386 1
 // RUN: not %clang -march=c3 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_C3_M64
-// CHECK_C3_M64: error:
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_C3_M64
+// CHECK_C3_M64: error: {{.*}}
 //
 // RUN: %clang -march=c3-2 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_C3_2_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_C3_2_M32
 // CHECK_C3_2_M32: #define __MMX__ 1
 // CHECK_C3_2_M32: #define __SSE__ 1
 // CHECK_C3_2_M32: #define __i386 1
@@ -144,12 +144,12 @@
 // CHECK_C3_2_M32: #define i386 1
 // RUN: not %clang -march=c3-2 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_C3_2_M64
-// CHECK_C3_2_M64: error:
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_C3_2_M64
+// CHECK_C3_2_M64: error: {{.*}}
 //
 // RUN: %clang -march=i686 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_I686_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_I686_M32
 // CHECK_I686_M32: #define __i386 1
 // CHECK_I686_M32: #define __i386__ 1
 // CHECK_I686_M32: #define __i686 1
@@ -159,12 +159,12 @@
 // CHECK_I686_M32: #define i386 1
 // RUN: not %clang -march=i686 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_I686_M64
-// CHECK_I686_M64: error:
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_I686_M64
+// CHECK_I686_M64: error: {{.*}}
 //
 // RUN: %clang -march=pentiumpro -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_PENTIUMPRO_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_PENTIUMPRO_M32
 // CHECK_PENTIUMPRO_M32: #define __i386 1
 // CHECK_PENTIUMPRO_M32: #define __i386__ 1
 // CHECK_PENTIUMPRO_M32: #define __i686 1
@@ -176,12 +176,12 @@
 // CHECK_PENTIUMPRO_M32: #define i386 1
 // RUN: not %clang -march=pentiumpro -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_PENTIUMPRO_M64
-// CHECK_PENTIUMPRO_M64: error:
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_PENTIUMPRO_M64
+// CHECK_PENTIUMPRO_M64: error: {{.*}}
 //
 // RUN: %clang -march=pentium2 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_PENTIUM2_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_PENTIUM2_M32
 // CHECK_PENTIUM2_M32: #define __MMX__ 1
 // CHECK_PENTIUM2_M32: #define __i386 1
 // CHECK_PENTIUM2_M32: #define __i386__ 1
@@ -195,12 +195,12 @@
 // CHECK_PENTIUM2_M32: #define i386 1
 // RUN: not %clang -march=pentium2 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_PENTIUM2_M64
-// CHECK_PENTIUM2_M64: error:
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_PENTIUM2_M64
+// CHECK_PENTIUM2_M64: error: {{.*}}
 //
 // RUN: %clang -march=pentium3 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_PENTIUM3_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_PENTIUM3_M32
 // CHECK_PENTIUM3_M32: #define __MMX__ 1
 // CHECK_PENTIUM3_M32: #define __SSE__ 1
 // CHECK_PENTIUM3_M32: #define __i386 1
@@ -216,12 +216,12 @@
 // CHECK_PENTIUM3_M32: #define i386 1
 // RUN: not %clang -march=pentium3 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_PENTIUM3_M64
-// CHECK_PENTIUM3_M64: error:
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_PENTIUM3_M64
+// CHECK_PENTIUM3_M64: error: {{.*}}
 //
 // RUN: %clang -march=pentium3m -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_PENTIUM3M_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_PENTIUM3M_M32
 // CHECK_PENTIUM3M_M32: #define __MMX__ 1
 // CHECK_PENTIUM3M_M32: #define __SSE__ 1
 // CHECK_PENTIUM3M_M32: #define __i386 1
@@ -235,12 +235,12 @@
 // CHECK_PENTIUM3M_M32: #define i386 1
 // RUN: not %clang -march=pentium3m -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_PENTIUM3M_M64
-// CHECK_PENTIUM3M_M64: error:
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_PENTIUM3M_M64
+// CHECK_PENTIUM3M_M64: error: {{.*}}
 //
 // RUN: %clang -march=pentium-m -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_PENTIUM_M_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_PENTIUM_M_M32
 // CHECK_PENTIUM_M_M32: #define __MMX__ 1
 // CHECK_PENTIUM_M_M32: #define __SSE2__ 1
 // CHECK_PENTIUM_M_M32: #define __SSE__ 1
@@ -255,12 +255,12 @@
 // CHECK_PENTIUM_M_M32: #define i386 1
 // RUN: not %clang -march=pentium-m -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_PENTIUM_M_M64
-// CHECK_PENTIUM_M_M64: error:
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_PENTIUM_M_M64
+// CHECK_PENTIUM_M_M64: error: {{.*}}
 //
 // RUN: %clang -march=pentium4 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_PENTIUM4_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_PENTIUM4_M32
 // CHECK_PENTIUM4_M32: #define __MMX__ 1
 // CHECK_PENTIUM4_M32: #define __SSE2__ 1
 // CHECK_PENTIUM4_M32: #define __SSE__ 1
@@ -272,12 +272,12 @@
 // CHECK_PENTIUM4_M32: #define i386 1
 // RUN: not %clang -march=pentium4 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_PENTIUM4_M64
-// CHECK_PENTIUM4_M64: error:
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_PENTIUM4_M64
+// CHECK_PENTIUM4_M64: error: {{.*}}
 //
 // RUN: %clang -march=pentium4m -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_PENTIUM4M_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_PENTIUM4M_M32
 // CHECK_PENTIUM4M_M32: #define __MMX__ 1
 // CHECK_PENTIUM4M_M32: #define __SSE2__ 1
 // CHECK_PENTIUM4M_M32: #define __SSE__ 1
@@ -289,12 +289,12 @@
 // CHECK_PENTIUM4M_M32: #define i386 1
 // RUN: not %clang -march=pentium4m -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_PENTIUM4M_M64
-// CHECK_PENTIUM4M_M64: error:
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_PENTIUM4M_M64
+// CHECK_PENTIUM4M_M64: error: {{.*}}
 //
 // RUN: %clang -march=prescott -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_PRESCOTT_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_PRESCOTT_M32
 // CHECK_PRESCOTT_M32: #define __MMX__ 1
 // CHECK_PRESCOTT_M32: #define __SSE2__ 1
 // CHECK_PRESCOTT_M32: #define __SSE3__ 1
@@ -307,12 +307,12 @@
 // CHECK_PRESCOTT_M32: #define i386 1
 // RUN: not %clang -march=prescott -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_PRESCOTT_M64
-// CHECK_PRESCOTT_M64: error:
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_PRESCOTT_M64
+// CHECK_PRESCOTT_M64: error: {{.*}}
 //
 // RUN: %clang -march=nocona -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_NOCONA_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_NOCONA_M32
 // CHECK_NOCONA_M32: #define __MMX__ 1
 // CHECK_NOCONA_M32: #define __SSE2__ 1
 // CHECK_NOCONA_M32: #define __SSE3__ 1
@@ -325,7 +325,7 @@
 // CHECK_NOCONA_M32: #define i386 1
 // RUN: %clang -march=nocona -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_NOCONA_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_NOCONA_M64
 // CHECK_NOCONA_M64: #define __MMX__ 1
 // CHECK_NOCONA_M64: #define __SSE2_MATH__ 1
 // CHECK_NOCONA_M64: #define __SSE2__ 1
@@ -342,7 +342,7 @@
 //
 // RUN: %clang -march=core2 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_CORE2_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_CORE2_M32
 // CHECK_CORE2_M32: #define __MMX__ 1
 // CHECK_CORE2_M32: #define __SSE2__ 1
 // CHECK_CORE2_M32: #define __SSE3__ 1
@@ -356,7 +356,7 @@
 // CHECK_CORE2_M32: #define i386 1
 // RUN: %clang -march=core2 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_CORE2_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_CORE2_M64
 // CHECK_CORE2_M64: #define __MMX__ 1
 // CHECK_CORE2_M64: #define __SSE2_MATH__ 1
 // CHECK_CORE2_M64: #define __SSE2__ 1
@@ -374,7 +374,7 @@
 //
 // RUN: %clang -march=corei7 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_COREI7_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_COREI7_M32
 // CHECK_COREI7_M32: #define __MMX__ 1
 // CHECK_COREI7_M32: #define __POPCNT__ 1
 // CHECK_COREI7_M32: #define __SSE2__ 1
@@ -391,7 +391,7 @@
 // CHECK_COREI7_M32: #define i386 1
 // RUN: %clang -march=corei7 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_COREI7_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_COREI7_M64
 // CHECK_COREI7_M64: #define __MMX__ 1
 // CHECK_COREI7_M64: #define __POPCNT__ 1
 // CHECK_COREI7_M64: #define __SSE2_MATH__ 1
@@ -412,7 +412,7 @@
 //
 // RUN: %clang -march=corei7-avx -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_COREI7_AVX_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_COREI7_AVX_M32
 // CHECK_COREI7_AVX_M32: #define __AES__ 1
 // CHECK_COREI7_AVX_M32: #define __AVX__ 1
 // CHECK_COREI7_AVX_M32: #define __MMX__ 1
@@ -435,7 +435,7 @@
 // CHECK_COREI7_AVX_M32: #define i386 1
 // RUN: %clang -march=corei7-avx -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_COREI7_AVX_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_COREI7_AVX_M64
 // CHECK_COREI7_AVX_M64: #define __AES__ 1
 // CHECK_COREI7_AVX_M64: #define __AVX__ 1
 // CHECK_COREI7_AVX_M64: #define __MMX__ 1
@@ -462,7 +462,7 @@
 //
 // RUN: %clang -march=core-avx-i -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_CORE_AVX_I_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_CORE_AVX_I_M32
 // CHECK_CORE_AVX_I_M32: #define __AES__ 1
 // CHECK_CORE_AVX_I_M32: #define __AVX__ 1
 // CHECK_CORE_AVX_I_M32: #define __F16C__ 1
@@ -485,7 +485,7 @@
 // CHECK_CORE_AVX_I_M32: #define i386 1
 // RUN: %clang -march=core-avx-i -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_CORE_AVX_I_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_CORE_AVX_I_M64
 // CHECK_CORE_AVX_I_M64: #define __AES__ 1
 // CHECK_CORE_AVX_I_M64: #define __AVX__ 1
 // CHECK_CORE_AVX_I_M64: #define __F16C__ 1
@@ -512,7 +512,7 @@
 //
 // RUN: %clang -march=core-avx2 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_CORE_AVX2_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_CORE_AVX2_M32
 // CHECK_CORE_AVX2_M32: #define __AES__ 1
 // CHECK_CORE_AVX2_M32: #define __AVX2__ 1
 // CHECK_CORE_AVX2_M32: #define __AVX__ 1
@@ -542,7 +542,7 @@
 // CHECK_CORE_AVX2_M32: #define i386 1
 // RUN: %clang -march=core-avx2 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_CORE_AVX2_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_CORE_AVX2_M64
 // CHECK_CORE_AVX2_M64: #define __AES__ 1
 // CHECK_CORE_AVX2_M64: #define __AVX2__ 1
 // CHECK_CORE_AVX2_M64: #define __AVX__ 1
@@ -576,7 +576,7 @@
 //
 // RUN: %clang -march=broadwell -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_BROADWELL_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_BROADWELL_M32
 // CHECK_BROADWELL_M32: #define __ADX__ 1
 // CHECK_BROADWELL_M32: #define __AES__ 1
 // CHECK_BROADWELL_M32: #define __AVX2__ 1
@@ -608,7 +608,7 @@
 // CHECK_BROADWELL_M32: #define i386 1
 // RUN: %clang -march=broadwell -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_BROADWELL_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_BROADWELL_M64
 // CHECK_BROADWELL_M64: #define __ADX__ 1
 // CHECK_BROADWELL_M64: #define __AES__ 1
 // CHECK_BROADWELL_M64: #define __AVX2__ 1
@@ -642,9 +642,74 @@
 // CHECK_BROADWELL_M64: #define __x86_64 1
 // CHECK_BROADWELL_M64: #define __x86_64__ 1
 //
+// RUN: %clang -march=skylake -m32 -E -dM %s -o - 2>&1 \
+// RUN:     -target i386-unknown-linux \
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_SKL_M32
+// CHECK_SKL_M32: #define __ADX__ 1
+// CHECK_SKL_M32: #define __AES__ 1
+// CHECK_SKL_M32: #define __AVX2__ 1
+// CHECK_SKL_M32: #define __AVX__ 1
+// CHECK_SKL_M32: #define __BMI2__ 1
+// CHECK_SKL_M32: #define __BMI__ 1
+// CHECK_SKL_M32: #define __F16C__ 1
+// CHECK_SKL_M32: #define __FMA__ 1
+// CHECK_SKL_M32: #define __LZCNT__ 1
+// CHECK_SKL_M32: #define __MMX__ 1
+// CHECK_SKL_M32: #define __PCLMUL__ 1
+// CHECK_SKL_M32: #define __POPCNT__ 1
+// CHECK_SKL_M32: #define __RDRND__ 1
+// CHECK_SKL_M32: #define __RDSEED__ 1
+// CHECK_SKL_M32: #define __RTM__ 1
+// CHECK_SKL_M32: #define __SSE2__ 1
+// CHECK_SKL_M32: #define __SSE3__ 1
+// CHECK_SKL_M32: #define __SSE4_1__ 1
+// CHECK_SKL_M32: #define __SSE4_2__ 1
+// CHECK_SKL_M32: #define __SSE__ 1
+// CHECK_SKL_M32: #define __SSSE3__ 1
+// CHECK_SKL_M32: #define __XSAVEC__ 1
+// CHECK_SKL_M32: #define __XSAVEOPT__ 1
+// CHECK_SKL_M32: #define __XSAVES__ 1
+// CHECK_SKL_M32: #define __XSAVE__ 1
+// CHECK_SKL_M32: #define i386 1
+
+// RUN: %clang -march=skylake -m64 -E -dM %s -o - 2>&1 \
+// RUN:     -target i386-unknown-linux \
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_SKL_M64
+// CHECK_SKL_M64: #define __ADX__ 1
+// CHECK_SKL_M64: #define __AES__ 1
+// CHECK_SKL_M64: #define __AVX2__ 1
+// CHECK_SKL_M64: #define __AVX__ 1
+// CHECK_SKL_M64: #define __BMI2__ 1
+// CHECK_SKL_M64: #define __BMI__ 1
+// CHECK_SKL_M64: #define __F16C__ 1
+// CHECK_SKL_M64: #define __FMA__ 1
+// CHECK_SKL_M64: #define __LZCNT__ 1
+// CHECK_SKL_M64: #define __MMX__ 1
+// CHECK_SKL_M64: #define __PCLMUL__ 1
+// CHECK_SKL_M64: #define __POPCNT__ 1
+// CHECK_SKL_M64: #define __RDRND__ 1
+// CHECK_SKL_M64: #define __RDSEED__ 1
+// CHECK_SKL_M64: #define __RTM__ 1
+// CHECK_SKL_M64: #define __SSE2_MATH__ 1
+// CHECK_SKL_M64: #define __SSE2__ 1
+// CHECK_SKL_M64: #define __SSE3__ 1
+// CHECK_SKL_M64: #define __SSE4_1__ 1
+// CHECK_SKL_M64: #define __SSE4_2__ 1
+// CHECK_SKL_M64: #define __SSE_MATH__ 1
+// CHECK_SKL_M64: #define __SSE__ 1
+// CHECK_SKL_M64: #define __SSSE3__ 1
+// CHECK_SKL_M64: #define __XSAVEC__ 1
+// CHECK_SKL_M64: #define __XSAVEOPT__ 1
+// CHECK_SKL_M64: #define __XSAVES__ 1
+// CHECK_SKL_M64: #define __XSAVE__ 1
+// CHECK_SKL_M64: #define __amd64 1
+// CHECK_SKL_M64: #define __amd64__ 1
+// CHECK_SKL_M64: #define __x86_64 1
+// CHECK_SKL_M64: #define __x86_64__ 1
+
 // RUN: %clang -march=knl -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_KNL_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_KNL_M32
 // CHECK_KNL_M32: #define __AES__ 1
 // CHECK_KNL_M32: #define __AVX2__ 1
 // CHECK_KNL_M32: #define __AVX512CD__ 1
@@ -679,7 +744,7 @@
 
 // RUN: %clang -march=knl -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_KNL_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_KNL_M64
 // CHECK_KNL_M64: #define __AES__ 1
 // CHECK_KNL_M64: #define __AVX2__ 1
 // CHECK_KNL_M64: #define __AVX512CD__ 1
@@ -715,9 +780,9 @@
 // CHECK_KNL_M64: #define __x86_64 1
 // CHECK_KNL_M64: #define __x86_64__ 1
 //
-// RUN: %clang -march=skx -m32 -E -dM %s -o - 2>&1 \
+// RUN: %clang -march=skylake-avx512 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_SKX_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_SKX_M32
 // CHECK_SKX_M32: #define __AES__ 1
 // CHECK_SKX_M32: #define __AVX2__ 1
 // CHECK_SKX_M32: #define __AVX512BW__ 1
@@ -753,9 +818,9 @@
 // CHECK_SKX_M32: #define __tune_skx__ 1
 // CHECK_SKX_M32: #define i386 1
 
-// RUN: %clang -march=skx -m64 -E -dM %s -o - 2>&1 \
+// RUN: %clang -march=skylake-avx512 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_SKX_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_SKX_M64
 // CHECK_SKX_M64: #define __AES__ 1
 // CHECK_SKX_M64: #define __AVX2__ 1
 // CHECK_SKX_M64: #define __AVX512BW__ 1
@@ -794,9 +859,86 @@
 // CHECK_SKX_M64: #define __x86_64 1
 // CHECK_SKX_M64: #define __x86_64__ 1
 //
+// RUN: %clang -march=cannonlake -m32 -E -dM %s -o - 2>&1 \
+// RUN:     -target i386-unknown-linux \
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_CNL_M32
+// CHECK_CNL_M32: #define __AES__ 1
+// CHECK_CNL_M32: #define __AVX2__ 1
+// CHECK_CNL_M32: #define __AVX512BW__ 1
+// CHECK_CNL_M32: #define __AVX512CD__ 1
+// CHECK_CNL_M32: #define __AVX512DQ__ 1
+// CHECK_CNL_M32: #define __AVX512F__ 1
+// CHECK_CNL_M32: #define __AVX512IFMA__ 1
+// CHECK_CNL_M32: #define __AVX512VBMI__ 1
+// CHECK_CNL_M32: #define __AVX512VL__ 1
+// CHECK_CNL_M32: #define __AVX__ 1
+// CHECK_CNL_M32: #define __BMI2__ 1
+// CHECK_CNL_M32: #define __BMI__ 1
+// CHECK_CNL_M32: #define __F16C__ 1
+// CHECK_CNL_M32: #define __FMA__ 1
+// CHECK_CNL_M32: #define __LZCNT__ 1
+// CHECK_CNL_M32: #define __MMX__ 1
+// CHECK_CNL_M32: #define __PCLMUL__ 1
+// CHECK_CNL_M32: #define __POPCNT__ 1
+// CHECK_CNL_M32: #define __RDRND__ 1
+// CHECK_CNL_M32: #define __RTM__ 1
+// CHECK_CNL_M32: #define __SHA__ 1
+// CHECK_CNL_M32: #define __SSE2__ 1
+// CHECK_CNL_M32: #define __SSE3__ 1
+// CHECK_CNL_M32: #define __SSE4_1__ 1
+// CHECK_CNL_M32: #define __SSE4_2__ 1
+// CHECK_CNL_M32: #define __SSE__ 1
+// CHECK_CNL_M32: #define __SSSE3__ 1
+// CHECK_CNL_M32: #define __XSAVEC__ 1
+// CHECK_CNL_M32: #define __XSAVEOPT__ 1
+// CHECK_CNL_M32: #define __XSAVES__ 1
+// CHECK_CNL_M32: #define __XSAVE__ 1
+// CHECK_CNL_M32: #define __i386 1
+// CHECK_CNL_M32: #define __i386__ 1
+// CHECK_CNL_M32: #define i386 1
+//
+// RUN: %clang -march=cannonlake -m64 -E -dM %s -o - 2>&1 \
+// RUN:     -target i386-unknown-linux \
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_CNL_M64
+// CHECK_CNL_M64: #define __AES__ 1
+// CHECK_CNL_M64: #define __AVX2__ 1
+// CHECK_CNL_M64: #define __AVX512BW__ 1
+// CHECK_CNL_M64: #define __AVX512CD__ 1
+// CHECK_CNL_M64: #define __AVX512DQ__ 1
+// CHECK_CNL_M64: #define __AVX512F__ 1
+// CHECK_CNL_M64: #define __AVX512IFMA__ 1
+// CHECK_CNL_M64: #define __AVX512VBMI__ 1
+// CHECK_CNL_M64: #define __AVX512VL__ 1
+// CHECK_CNL_M64: #define __AVX__ 1
+// CHECK_CNL_M64: #define __BMI2__ 1
+// CHECK_CNL_M64: #define __BMI__ 1
+// CHECK_CNL_M64: #define __F16C__ 1
+// CHECK_CNL_M64: #define __FMA__ 1
+// CHECK_CNL_M64: #define __LZCNT__ 1
+// CHECK_CNL_M64: #define __MMX__ 1
+// CHECK_CNL_M64: #define __PCLMUL__ 1
+// CHECK_CNL_M64: #define __POPCNT__ 1
+// CHECK_CNL_M64: #define __RDRND__ 1
+// CHECK_CNL_M64: #define __RTM__ 1
+// CHECK_CNL_M64: #define __SHA__ 1
+// CHECK_CNL_M64: #define __SSE2__ 1
+// CHECK_CNL_M64: #define __SSE3__ 1
+// CHECK_CNL_M64: #define __SSE4_1__ 1
+// CHECK_CNL_M64: #define __SSE4_2__ 1
+// CHECK_CNL_M64: #define __SSE__ 1
+// CHECK_CNL_M64: #define __SSSE3__ 1
+// CHECK_CNL_M64: #define __XSAVEC__ 1
+// CHECK_CNL_M64: #define __XSAVEOPT__ 1
+// CHECK_CNL_M64: #define __XSAVES__ 1
+// CHECK_CNL_M64: #define __XSAVE__ 1
+// CHECK_CNL_M64: #define __amd64 1
+// CHECK_CNL_M64: #define __amd64__ 1
+// CHECK_CNL_M64: #define __x86_64 1
+// CHECK_CNL_M64: #define __x86_64__ 1
+
 // RUN: %clang -march=atom -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_ATOM_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_ATOM_M32
 // CHECK_ATOM_M32: #define __MMX__ 1
 // CHECK_ATOM_M32: #define __SSE2__ 1
 // CHECK_ATOM_M32: #define __SSE3__ 1
@@ -810,7 +952,7 @@
 // CHECK_ATOM_M32: #define i386 1
 // RUN: %clang -march=atom -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_ATOM_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_ATOM_M64
 // CHECK_ATOM_M64: #define __MMX__ 1
 // CHECK_ATOM_M64: #define __SSE2_MATH__ 1
 // CHECK_ATOM_M64: #define __SSE2__ 1
@@ -828,7 +970,7 @@
 //
 // RUN: %clang -march=slm -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_SLM_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_SLM_M32
 // CHECK_SLM_M32: #define __MMX__ 1
 // CHECK_SLM_M32: #define __SSE2__ 1
 // CHECK_SLM_M32: #define __SSE3__ 1
@@ -844,7 +986,7 @@
 // CHECK_SLM_M32: #define i386 1
 // RUN: %clang -march=slm -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_SLM_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_SLM_M64
 // CHECK_SLM_M64: #define __MMX__ 1
 // CHECK_SLM_M64: #define __SSE2_MATH__ 1
 // CHECK_SLM_M64: #define __SSE2__ 1
@@ -862,9 +1004,21 @@
 // CHECK_SLM_M64: #define __x86_64 1
 // CHECK_SLM_M64: #define __x86_64__ 1
 //
+// RUN: %clang -march=lakemont -m32 -E -dM %s -o - 2>&1 \
+// RUN:     -target i386-unknown-linux \
+// RUN:   | FileCheck %s -check-prefix=CHECK_LMT_M32
+// CHECK_LMT_M32: #define __i386 1
+// CHECK_LMT_M32: #define __i386__ 1
+// CHECK_LMT_M32: #define __tune_lakemont__ 1
+// CHECK_LMT_M32: #define i386 1
+// RUN: not %clang -march=lakemont -m64 -E -dM %s -o - 2>&1 \
+// RUN:     -target i386-unknown-linux \
+// RUN:   | FileCheck %s -check-prefix=CHECK_LMT_M64
+// CHECK_LMT_M64: error:
+//
 // RUN: %clang -march=geode -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_GEODE_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_GEODE_M32
 // CHECK_GEODE_M32: #define __3dNOW_A__ 1
 // CHECK_GEODE_M32: #define __3dNOW__ 1
 // CHECK_GEODE_M32: #define __MMX__ 1
@@ -876,12 +1030,12 @@
 // CHECK_GEODE_M32: #define i386 1
 // RUN: not %clang -march=geode -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_GEODE_M64
-// CHECK_GEODE_M64: error:
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_GEODE_M64
+// CHECK_GEODE_M64: error: {{.*}}
 //
 // RUN: %clang -march=k6 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_K6_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_K6_M32
 // CHECK_K6_M32: #define __MMX__ 1
 // CHECK_K6_M32: #define __i386 1
 // CHECK_K6_M32: #define __i386__ 1
@@ -891,12 +1045,12 @@
 // CHECK_K6_M32: #define i386 1
 // RUN: not %clang -march=k6 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_K6_M64
-// CHECK_K6_M64: error:
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_K6_M64
+// CHECK_K6_M64: error: {{.*}}
 //
 // RUN: %clang -march=k6-2 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_K6_2_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_K6_2_M32
 // CHECK_K6_2_M32: #define __3dNOW__ 1
 // CHECK_K6_2_M32: #define __MMX__ 1
 // CHECK_K6_2_M32: #define __i386 1
@@ -909,12 +1063,12 @@
 // CHECK_K6_2_M32: #define i386 1
 // RUN: not %clang -march=k6-2 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_K6_2_M64
-// CHECK_K6_2_M64: error:
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_K6_2_M64
+// CHECK_K6_2_M64: error: {{.*}}
 //
 // RUN: %clang -march=k6-3 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_K6_3_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_K6_3_M32
 // CHECK_K6_3_M32: #define __3dNOW__ 1
 // CHECK_K6_3_M32: #define __MMX__ 1
 // CHECK_K6_3_M32: #define __i386 1
@@ -927,12 +1081,12 @@
 // CHECK_K6_3_M32: #define i386 1
 // RUN: not %clang -march=k6-3 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_K6_3_M64
-// CHECK_K6_3_M64: error:
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_K6_3_M64
+// CHECK_K6_3_M64: error: {{.*}}
 //
 // RUN: %clang -march=athlon -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_ATHLON_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_ATHLON_M32
 // CHECK_ATHLON_M32: #define __3dNOW_A__ 1
 // CHECK_ATHLON_M32: #define __3dNOW__ 1
 // CHECK_ATHLON_M32: #define __MMX__ 1
@@ -944,12 +1098,12 @@
 // CHECK_ATHLON_M32: #define i386 1
 // RUN: not %clang -march=athlon -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_ATHLON_M64
-// CHECK_ATHLON_M64: error:
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_ATHLON_M64
+// CHECK_ATHLON_M64: error: {{.*}}
 //
 // RUN: %clang -march=athlon-tbird -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_ATHLON_TBIRD_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_ATHLON_TBIRD_M32
 // CHECK_ATHLON_TBIRD_M32: #define __3dNOW_A__ 1
 // CHECK_ATHLON_TBIRD_M32: #define __3dNOW__ 1
 // CHECK_ATHLON_TBIRD_M32: #define __MMX__ 1
@@ -961,12 +1115,12 @@
 // CHECK_ATHLON_TBIRD_M32: #define i386 1
 // RUN: not %clang -march=athlon-tbird -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_ATHLON_TBIRD_M64
-// CHECK_ATHLON_TBIRD_M64: error:
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_ATHLON_TBIRD_M64
+// CHECK_ATHLON_TBIRD_M64: error: {{.*}}
 //
 // RUN: %clang -march=athlon-4 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_ATHLON_4_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_ATHLON_4_M32
 // CHECK_ATHLON_4_M32: #define __3dNOW_A__ 1
 // CHECK_ATHLON_4_M32: #define __3dNOW__ 1
 // CHECK_ATHLON_4_M32: #define __MMX__ 1
@@ -981,12 +1135,12 @@
 // CHECK_ATHLON_4_M32: #define i386 1
 // RUN: not %clang -march=athlon-4 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_ATHLON_4_M64
-// CHECK_ATHLON_4_M64: error:
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_ATHLON_4_M64
+// CHECK_ATHLON_4_M64: error: {{.*}}
 //
 // RUN: %clang -march=athlon-xp -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_ATHLON_XP_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_ATHLON_XP_M32
 // CHECK_ATHLON_XP_M32: #define __3dNOW_A__ 1
 // CHECK_ATHLON_XP_M32: #define __3dNOW__ 1
 // CHECK_ATHLON_XP_M32: #define __MMX__ 1
@@ -1001,12 +1155,12 @@
 // CHECK_ATHLON_XP_M32: #define i386 1
 // RUN: not %clang -march=athlon-xp -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_ATHLON_XP_M64
-// CHECK_ATHLON_XP_M64: error:
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_ATHLON_XP_M64
+// CHECK_ATHLON_XP_M64: error: {{.*}}
 //
 // RUN: %clang -march=athlon-mp -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_ATHLON_MP_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_ATHLON_MP_M32
 // CHECK_ATHLON_MP_M32: #define __3dNOW_A__ 1
 // CHECK_ATHLON_MP_M32: #define __3dNOW__ 1
 // CHECK_ATHLON_MP_M32: #define __MMX__ 1
@@ -1021,12 +1175,12 @@
 // CHECK_ATHLON_MP_M32: #define i386 1
 // RUN: not %clang -march=athlon-mp -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_ATHLON_MP_M64
-// CHECK_ATHLON_MP_M64: error:
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_ATHLON_MP_M64
+// CHECK_ATHLON_MP_M64: error: {{.*}}
 //
 // RUN: %clang -march=x86-64 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_X86_64_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_X86_64_M32
 // CHECK_X86_64_M32: #define __MMX__ 1
 // CHECK_X86_64_M32: #define __SSE2__ 1
 // CHECK_X86_64_M32: #define __SSE__ 1
@@ -1037,7 +1191,7 @@
 // CHECK_X86_64_M32: #define i386 1
 // RUN: %clang -march=x86-64 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_X86_64_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_X86_64_M64
 // CHECK_X86_64_M64: #define __MMX__ 1
 // CHECK_X86_64_M64: #define __SSE2_MATH__ 1
 // CHECK_X86_64_M64: #define __SSE2__ 1
@@ -1052,7 +1206,7 @@
 //
 // RUN: %clang -march=k8 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_K8_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_K8_M32
 // CHECK_K8_M32: #define __3dNOW_A__ 1
 // CHECK_K8_M32: #define __3dNOW__ 1
 // CHECK_K8_M32: #define __MMX__ 1
@@ -1066,7 +1220,7 @@
 // CHECK_K8_M32: #define i386 1
 // RUN: %clang -march=k8 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_K8_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_K8_M64
 // CHECK_K8_M64: #define __3dNOW_A__ 1
 // CHECK_K8_M64: #define __3dNOW__ 1
 // CHECK_K8_M64: #define __MMX__ 1
@@ -1084,7 +1238,7 @@
 //
 // RUN: %clang -march=k8-sse3 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_K8_SSE3_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_K8_SSE3_M32
 // CHECK_K8_SSE3_M32: #define __3dNOW_A__ 1
 // CHECK_K8_SSE3_M32: #define __3dNOW__ 1
 // CHECK_K8_SSE3_M32: #define __MMX__ 1
@@ -1099,7 +1253,7 @@
 // CHECK_K8_SSE3_M32: #define i386 1
 // RUN: %clang -march=k8-sse3 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_K8_SSE3_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_K8_SSE3_M64
 // CHECK_K8_SSE3_M64: #define __3dNOW_A__ 1
 // CHECK_K8_SSE3_M64: #define __3dNOW__ 1
 // CHECK_K8_SSE3_M64: #define __MMX__ 1
@@ -1118,7 +1272,7 @@
 //
 // RUN: %clang -march=opteron -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_OPTERON_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_OPTERON_M32
 // CHECK_OPTERON_M32: #define __3dNOW_A__ 1
 // CHECK_OPTERON_M32: #define __3dNOW__ 1
 // CHECK_OPTERON_M32: #define __MMX__ 1
@@ -1132,7 +1286,7 @@
 // CHECK_OPTERON_M32: #define i386 1
 // RUN: %clang -march=opteron -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_OPTERON_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_OPTERON_M64
 // CHECK_OPTERON_M64: #define __3dNOW_A__ 1
 // CHECK_OPTERON_M64: #define __3dNOW__ 1
 // CHECK_OPTERON_M64: #define __MMX__ 1
@@ -1150,7 +1304,7 @@
 //
 // RUN: %clang -march=opteron-sse3 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_OPTERON_SSE3_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_OPTERON_SSE3_M32
 // CHECK_OPTERON_SSE3_M32: #define __3dNOW_A__ 1
 // CHECK_OPTERON_SSE3_M32: #define __3dNOW__ 1
 // CHECK_OPTERON_SSE3_M32: #define __MMX__ 1
@@ -1165,7 +1319,7 @@
 // CHECK_OPTERON_SSE3_M32: #define i386 1
 // RUN: %clang -march=opteron-sse3 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_OPTERON_SSE3_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_OPTERON_SSE3_M64
 // CHECK_OPTERON_SSE3_M64: #define __3dNOW_A__ 1
 // CHECK_OPTERON_SSE3_M64: #define __3dNOW__ 1
 // CHECK_OPTERON_SSE3_M64: #define __MMX__ 1
@@ -1184,7 +1338,7 @@
 //
 // RUN: %clang -march=athlon64 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_ATHLON64_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_ATHLON64_M32
 // CHECK_ATHLON64_M32: #define __3dNOW_A__ 1
 // CHECK_ATHLON64_M32: #define __3dNOW__ 1
 // CHECK_ATHLON64_M32: #define __MMX__ 1
@@ -1198,7 +1352,7 @@
 // CHECK_ATHLON64_M32: #define i386 1
 // RUN: %clang -march=athlon64 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_ATHLON64_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_ATHLON64_M64
 // CHECK_ATHLON64_M64: #define __3dNOW_A__ 1
 // CHECK_ATHLON64_M64: #define __3dNOW__ 1
 // CHECK_ATHLON64_M64: #define __MMX__ 1
@@ -1216,7 +1370,7 @@
 //
 // RUN: %clang -march=athlon64-sse3 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_ATHLON64_SSE3_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_ATHLON64_SSE3_M32
 // CHECK_ATHLON64_SSE3_M32: #define __3dNOW_A__ 1
 // CHECK_ATHLON64_SSE3_M32: #define __3dNOW__ 1
 // CHECK_ATHLON64_SSE3_M32: #define __MMX__ 1
@@ -1231,7 +1385,7 @@
 // CHECK_ATHLON64_SSE3_M32: #define i386 1
 // RUN: %clang -march=athlon64-sse3 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_ATHLON64_SSE3_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_ATHLON64_SSE3_M64
 // CHECK_ATHLON64_SSE3_M64: #define __3dNOW_A__ 1
 // CHECK_ATHLON64_SSE3_M64: #define __3dNOW__ 1
 // CHECK_ATHLON64_SSE3_M64: #define __MMX__ 1
@@ -1250,7 +1404,7 @@
 //
 // RUN: %clang -march=athlon-fx -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_ATHLON_FX_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_ATHLON_FX_M32
 // CHECK_ATHLON_FX_M32: #define __3dNOW_A__ 1
 // CHECK_ATHLON_FX_M32: #define __3dNOW__ 1
 // CHECK_ATHLON_FX_M32: #define __MMX__ 1
@@ -1264,7 +1418,7 @@
 // CHECK_ATHLON_FX_M32: #define i386 1
 // RUN: %clang -march=athlon-fx -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_ATHLON_FX_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_ATHLON_FX_M64
 // CHECK_ATHLON_FX_M64: #define __3dNOW_A__ 1
 // CHECK_ATHLON_FX_M64: #define __3dNOW__ 1
 // CHECK_ATHLON_FX_M64: #define __MMX__ 1
@@ -1281,7 +1435,7 @@
 // CHECK_ATHLON_FX_M64: #define __x86_64__ 1
 // RUN: %clang -march=amdfam10 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_AMDFAM10_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_AMDFAM10_M32
 // CHECK_AMDFAM10_M32: #define __3dNOW_A__ 1
 // CHECK_AMDFAM10_M32: #define __3dNOW__ 1
 // CHECK_AMDFAM10_M32: #define __LZCNT__ 1
@@ -1300,7 +1454,7 @@
 // CHECK_AMDFAM10_M32: #define __tune_amdfam10__ 1
 // RUN: %clang -march=amdfam10 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_AMDFAM10_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_AMDFAM10_M64
 // CHECK_AMDFAM10_M64: #define __3dNOW_A__ 1
 // CHECK_AMDFAM10_M64: #define __3dNOW__ 1
 // CHECK_AMDFAM10_M64: #define __LZCNT__ 1
@@ -1321,7 +1475,7 @@
 // CHECK_AMDFAM10_M64: #define __x86_64__ 1
 // RUN: %clang -march=btver1 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_BTVER1_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_BTVER1_M32
 // CHECK_BTVER1_M32-NOT: #define __3dNOW_A__ 1
 // CHECK_BTVER1_M32-NOT: #define __3dNOW__ 1
 // CHECK_BTVER1_M32: #define __LZCNT__ 1
@@ -1335,7 +1489,6 @@
 // CHECK_BTVER1_M32: #define __SSE_MATH__ 1
 // CHECK_BTVER1_M32: #define __SSE__ 1
 // CHECK_BTVER1_M32: #define __SSSE3__ 1
-// CHECK_BTVER1_M32: #define __XSAVE__ 1
 // CHECK_BTVER1_M32: #define __btver1 1
 // CHECK_BTVER1_M32: #define __btver1__ 1
 // CHECK_BTVER1_M32: #define __i386 1
@@ -1343,7 +1496,7 @@
 // CHECK_BTVER1_M32: #define __tune_btver1__ 1
 // RUN: %clang -march=btver1 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_BTVER1_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_BTVER1_M64
 // CHECK_BTVER1_M64-NOT: #define __3dNOW_A__ 1
 // CHECK_BTVER1_M64-NOT: #define __3dNOW__ 1
 // CHECK_BTVER1_M64: #define __LZCNT__ 1
@@ -1357,7 +1510,6 @@
 // CHECK_BTVER1_M64: #define __SSE_MATH__ 1
 // CHECK_BTVER1_M64: #define __SSE__ 1
 // CHECK_BTVER1_M64: #define __SSSE3__ 1
-// CHECK_BTVER1_M64: #define __XSAVE__ 1
 // CHECK_BTVER1_M64: #define __amd64 1
 // CHECK_BTVER1_M64: #define __amd64__ 1
 // CHECK_BTVER1_M64: #define __btver1 1
@@ -1367,7 +1519,7 @@
 // CHECK_BTVER1_M64: #define __x86_64__ 1
 // RUN: %clang -march=btver2 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_BTVER2_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_BTVER2_M32
 // CHECK_BTVER2_M32-NOT: #define __3dNOW_A__ 1
 // CHECK_BTVER2_M32-NOT: #define __3dNOW__ 1
 // CHECK_BTVER2_M32: #define __AES__ 1
@@ -1395,7 +1547,7 @@
 // CHECK_BTVER2_M32: #define __tune_btver2__ 1
 // RUN: %clang -march=btver2 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_BTVER2_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_BTVER2_M64
 // CHECK_BTVER2_M64-NOT: #define __3dNOW_A__ 1
 // CHECK_BTVER2_M64-NOT: #define __3dNOW__ 1
 // CHECK_BTVER2_M64: #define __AES__ 1
@@ -1425,7 +1577,7 @@
 // CHECK_BTVER2_M64: #define __x86_64__ 1
 // RUN: %clang -march=bdver1 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_BDVER1_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_BDVER1_M32
 // CHECK_BDVER1_M32-NOT: #define __3dNOW_A__ 1
 // CHECK_BDVER1_M32-NOT: #define __3dNOW__ 1
 // CHECK_BDVER1_M32: #define __AES__ 1
@@ -1454,7 +1606,7 @@
 // CHECK_BDVER1_M32: #define __tune_bdver1__ 1
 // RUN: %clang -march=bdver1 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_BDVER1_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_BDVER1_M64
 // CHECK_BDVER1_M64-NOT: #define __3dNOW_A__ 1
 // CHECK_BDVER1_M64-NOT: #define __3dNOW__ 1
 // CHECK_BDVER1_M64: #define __AES__ 1
@@ -1485,7 +1637,7 @@
 // CHECK_BDVER1_M64: #define __x86_64__ 1
 // RUN: %clang -march=bdver2 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_BDVER2_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_BDVER2_M32
 // CHECK_BDVER2_M32-NOT: #define __3dNOW_A__ 1
 // CHECK_BDVER2_M32-NOT: #define __3dNOW__ 1
 // CHECK_BDVER2_M32: #define __AES__ 1
@@ -1518,7 +1670,7 @@
 // CHECK_BDVER2_M32: #define __tune_bdver2__ 1
 // RUN: %clang -march=bdver2 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_BDVER2_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_BDVER2_M64
 // CHECK_BDVER2_M64-NOT: #define __3dNOW_A__ 1
 // CHECK_BDVER2_M64-NOT: #define __3dNOW__ 1
 // CHECK_BDVER2_M64: #define __AES__ 1
@@ -1553,7 +1705,7 @@
 // CHECK_BDVER2_M64: #define __x86_64__ 1
 // RUN: %clang -march=bdver3 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_BDVER3_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_BDVER3_M32
 // CHECK_BDVER3_M32-NOT: #define __3dNOW_A__ 1
 // CHECK_BDVER3_M32-NOT: #define __3dNOW__ 1
 // CHECK_BDVER3_M32: #define __AES__ 1
@@ -1588,7 +1740,7 @@
 // CHECK_BDVER3_M32: #define __tune_bdver3__ 1
 // RUN: %clang -march=bdver3 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_BDVER3_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_BDVER3_M64
 // CHECK_BDVER3_M64-NOT: #define __3dNOW_A__ 1
 // CHECK_BDVER3_M64-NOT: #define __3dNOW__ 1
 // CHECK_BDVER3_M64: #define __AES__ 1
@@ -1625,7 +1777,7 @@
 // CHECK_BDVER3_M64: #define __x86_64__ 1
 // RUN: %clang -march=bdver4 -m32 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_BDVER4_M32
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_BDVER4_M32
 // CHECK_BDVER4_M32-NOT: #define __3dNOW_A__ 1
 // CHECK_BDVER4_M32-NOT: #define __3dNOW__ 1
 // CHECK_BDVER4_M32: #define __AES__ 1
@@ -1661,7 +1813,7 @@
 // CHECK_BDVER4_M32: #define __tune_bdver4__ 1
 // RUN: %clang -march=bdver4 -m64 -E -dM %s -o - 2>&1 \
 // RUN:     -target i386-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_BDVER4_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_BDVER4_M64
 // CHECK_BDVER4_M64-NOT: #define __3dNOW_A__ 1
 // CHECK_BDVER4_M64-NOT: #define __3dNOW__ 1
 // CHECK_BDVER4_M64: #define __AES__ 1
@@ -1703,36 +1855,36 @@
 // Begin PPC/GCC/Linux tests ----------------
 // RUN: %clang -mvsx -E -dM %s -o - 2>&1 \
 // RUN:     -target powerpc64-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_PPC_VSX_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_PPC_VSX_M64
 //
-// CHECK_PPC_VSX_M64: #define __VSX__
+// CHECK_PPC_VSX_M64: #define __VSX__ 1
 //
 // RUN: %clang -mpower8-vector -E -dM %s -o - 2>&1 \
 // RUN:     -target powerpc64-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_PPC_POWER8_VECTOR_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_PPC_POWER8_VECTOR_M64
 //
-// CHECK_PPC_POWER8_VECTOR_M64: #define __POWER8_VECTOR__
+// CHECK_PPC_POWER8_VECTOR_M64: #define __POWER8_VECTOR__ 1
 //
 // RUN: %clang -mcrypto -E -dM %s -o - 2>&1 \
 // RUN:     -target powerpc64-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_PPC_CRYPTO_M64
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_PPC_CRYPTO_M64
 //
-// CHECK_PPC_CRYPTO_M64: #define __CRYPTO__
+// CHECK_PPC_CRYPTO_M64: #define __CRYPTO__ 1
 //
 // RUN: %clang -mcpu=ppc64 -E -dM %s -o - 2>&1 \
 // RUN:     -target powerpc64-unknown-unknown \
-// RUN:   | FileCheck %s -check-prefix=CHECK_PPC_GCC_ATOMICS
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_PPC_GCC_ATOMICS
 // RUN: %clang -mcpu=pwr8 -E -dM %s -o - 2>&1 \
 // RUN:     -target powerpc64-unknown-unknown \
-// RUN:   | FileCheck %s -check-prefix=CHECK_PPC_GCC_ATOMICS
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_PPC_GCC_ATOMICS
 // RUN: %clang -E -dM %s -o - 2>&1 \
 // RUN:     -target powerpc64le-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_PPC_GCC_ATOMICS
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_PPC_GCC_ATOMICS
 //
-// CHECK_PPC_GCC_ATOMICS: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1
-// CHECK_PPC_GCC_ATOMICS: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2
-// CHECK_PPC_GCC_ATOMICS: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4
-// CHECK_PPC_GCC_ATOMICS: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8
+// CHECK_PPC_GCC_ATOMICS: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1 1
+// CHECK_PPC_GCC_ATOMICS: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2 1
+// CHECK_PPC_GCC_ATOMICS: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4 1
+// CHECK_PPC_GCC_ATOMICS: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8 1
 //
 // End PPC/GCC/Linux tests ------------------
 
@@ -1740,10 +1892,10 @@
 //
 // RUN: %clang -E -dM %s -o - 2>&1 \
 // RUN:     -target sparc-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_SPARC
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_SPARC
 // RUN: %clang -mcpu=v9 -E -dM %s -o - 2>&1 \
 // RUN:     -target sparc-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_SPARC-V9
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_SPARC-V9
 //
 // CHECK_SPARC: #define __BIG_ENDIAN__ 1
 // CHECK_SPARC: #define __sparc 1
@@ -1762,16 +1914,25 @@
 //
 // RUN: %clang -E -dM %s -o - 2>&1 \
 // RUN:     -target sparcel-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_SPARCEL
-//
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_SPARCEL
+// RUN: %clang -E -dM %s -o - -target sparcel-myriad -mcpu=myriad2 2>&1 \
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_MYRIAD2-1 -check-prefix=CHECK_SPARCEL
+// RUN: %clang -E -dM %s -o - -target sparcel-myriad -mcpu=myriad2.1 2>&1 \
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_MYRIAD2-1 -check-prefix=CHECK_SPARCEL
+// RUN: %clang -E -dM %s -o - -target sparcel-myriad -mcpu=myriad2.2 2>&1 \
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_MYRIAD2-2 -check-prefix=CHECK_SPARCEL
 // CHECK_SPARCEL: #define __LITTLE_ENDIAN__ 1
+// CHECK_MYRIAD2-1: #define __myriad2 1
+// CHECK_MYRIAD2-1: #define __myriad2__ 1
+// CHECK_MYRIAD2-2: #define __myriad2 2
+// CHECK_MYRIAD2-2: #define __myriad2__ 2
 // CHECK_SPARCEL: #define __sparc 1
 // CHECK_SPARCEL: #define __sparc__ 1
 // CHECK_SPARCEL: #define __sparcv8 1
 //
 // RUN: %clang -E -dM %s -o - 2>&1 \
 // RUN:     -target sparcv9-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_SPARCV9
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_SPARCV9
 //
 // CHECK_SPARCV9: #define __BIG_ENDIAN__ 1
 // CHECK_SPARCV9: #define __sparc 1
@@ -1785,12 +1946,12 @@
 //
 // RUN: %clang -march=z10 -E -dM %s -o - 2>&1 \
 // RUN:     -target s390x-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_SYSTEMZ_Z10
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_SYSTEMZ_Z10
 //
-// CHECK_SYSTEMZ_Z10: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1
-// CHECK_SYSTEMZ_Z10: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2
-// CHECK_SYSTEMZ_Z10: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4
-// CHECK_SYSTEMZ_Z10: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8
+// CHECK_SYSTEMZ_Z10: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1 1
+// CHECK_SYSTEMZ_Z10: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2 1
+// CHECK_SYSTEMZ_Z10: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4 1
+// CHECK_SYSTEMZ_Z10: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8 1
 // CHECK_SYSTEMZ_Z10: #define __LONG_DOUBLE_128__ 1
 // CHECK_SYSTEMZ_Z10: #define __s390__ 1
 // CHECK_SYSTEMZ_Z10: #define __s390x__ 1
@@ -1798,12 +1959,12 @@
 //
 // RUN: %clang -march=zEC12 -E -dM %s -o - 2>&1 \
 // RUN:     -target s390x-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_SYSTEMZ_ZEC12
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_SYSTEMZ_ZEC12
 //
-// CHECK_SYSTEMZ_ZEC12: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1
-// CHECK_SYSTEMZ_ZEC12: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2
-// CHECK_SYSTEMZ_ZEC12: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4
-// CHECK_SYSTEMZ_ZEC12: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8
+// CHECK_SYSTEMZ_ZEC12: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1 1
+// CHECK_SYSTEMZ_ZEC12: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2 1
+// CHECK_SYSTEMZ_ZEC12: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4 1
+// CHECK_SYSTEMZ_ZEC12: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8 1
 // CHECK_SYSTEMZ_ZEC12: #define __HTM__ 1
 // CHECK_SYSTEMZ_ZEC12: #define __LONG_DOUBLE_128__ 1
 // CHECK_SYSTEMZ_ZEC12: #define __s390__ 1
@@ -1812,15 +1973,29 @@
 //
 // RUN: %clang -mhtm -E -dM %s -o - 2>&1 \
 // RUN:     -target s390x-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_SYSTEMZ_HTM
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_SYSTEMZ_HTM
 //
 // CHECK_SYSTEMZ_HTM: #define __HTM__ 1
 //
 // RUN: %clang -fzvector -E -dM %s -o - 2>&1 \
 // RUN:     -target s390x-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_SYSTEMZ_ZVECTOR
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_SYSTEMZ_ZVECTOR
 // RUN: %clang -mzvector -E -dM %s -o - 2>&1 \
 // RUN:     -target s390x-unknown-linux \
-// RUN:   | FileCheck %s -check-prefix=CHECK_SYSTEMZ_ZVECTOR
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_SYSTEMZ_ZVECTOR
 //
 // CHECK_SYSTEMZ_ZVECTOR: #define __VEC__ 10301
+
+// Begin amdgcn tests ----------------
+//
+// RUN: %clang -march=amdgcn -E -dM %s -o - 2>&1 \
+// RUN:     -target amdgcn-unknown-unknown \
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_AMDGCN
+// CHECK_AMDGCN: #define __AMDGCN__ 1
+
+// Begin r600 tests ----------------
+//
+// RUN: %clang -march=amdgcn -E -dM %s -o - 2>&1 \
+// RUN:     -target r600-unknown-unknown \
+// RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_R600
+// CHECK_R600: #define __R600__ 1
diff --git a/test/Preprocessor/predefined-macros.c b/test/Preprocessor/predefined-macros.c
index 1f68a082a9144..7385cd2c93efd 100644
--- a/test/Preprocessor/predefined-macros.c
+++ b/test/Preprocessor/predefined-macros.c
@@ -1,23 +1,25 @@
 // This test verifies that the correct macros are predefined.
 //
-// RUN: %clang_cc1 %s -E -dM -triple i686-pc-win32 -fms-extensions -fms-compatibility \
-// RUN:     -fms-compatibility-version=13.00 -o - | FileCheck %s --check-prefix=CHECK-MS
+// RUN: %clang_cc1 %s -x c++ -E -dM -triple i686-pc-win32 -fms-extensions -fms-compatibility \
+// RUN:     -fms-compatibility-version=19.00 -std=c++1z -o - | FileCheck -match-full-lines %s --check-prefix=CHECK-MS
 // CHECK-MS: #define _INTEGRAL_MAX_BITS 64
 // CHECK-MS: #define _MSC_EXTENSIONS 1
-// CHECK-MS: #define _MSC_VER 1300
+// CHECK-MS: #define _MSC_VER 1900
+// CHECK-MS: #define _MSVC_LANG 201403L
 // CHECK-MS: #define _M_IX86 600
-// CHECK-MS: #define _M_IX86_FP
+// CHECK-MS: #define _M_IX86_FP 0
 // CHECK-MS: #define _WIN32 1
 // CHECK-MS-NOT: #define __STRICT_ANSI__
 // CHECK-MS-NOT: GCC
 // CHECK-MS-NOT: GNU
 // CHECK-MS-NOT: GXX
 //
-// RUN: %clang_cc1 %s -E -dM -triple x86_64-pc-win32 -fms-extensions -fms-compatibility \
-// RUN:     -fms-compatibility-version=13.00 -o - | FileCheck %s --check-prefix=CHECK-MS64
+// RUN: %clang_cc1 %s -x c++ -E -dM -triple x86_64-pc-win32 -fms-extensions -fms-compatibility \
+// RUN:     -fms-compatibility-version=19.00 -std=c++14 -o - | FileCheck -match-full-lines %s --check-prefix=CHECK-MS64
 // CHECK-MS64: #define _INTEGRAL_MAX_BITS 64
 // CHECK-MS64: #define _MSC_EXTENSIONS 1
-// CHECK-MS64: #define _MSC_VER 1300
+// CHECK-MS64: #define _MSC_VER 1900
+// CHECK-MS64: #define _MSVC_LANG 201402L
 // CHECK-MS64: #define _M_AMD64 100
 // CHECK-MS64: #define _M_X64 100
 // CHECK-MS64: #define _WIN64 1
@@ -27,7 +29,7 @@
 // CHECK-MS64-NOT: GXX
 //
 // RUN: %clang_cc1 %s -E -dM -triple i686-pc-win32 -fms-compatibility \
-// RUN:     -o - | FileCheck %s --check-prefix=CHECK-MS-STDINT
+// RUN:     -o - | FileCheck -match-full-lines %s --check-prefix=CHECK-MS-STDINT
 // CHECK-MS-STDINT:#define __INT16_MAX__ 32767
 // CHECK-MS-STDINT:#define __INT32_MAX__ 2147483647
 // CHECK-MS-STDINT:#define __INT64_MAX__ 9223372036854775807LL
@@ -83,66 +85,103 @@
 // CHECK-MS-STDINT:#define __UINT_LEAST8_TYPE__ unsigned char
 //
 // RUN: %clang_cc1 %s -E -dM -ffast-math -o - \
-// RUN:   | FileCheck %s --check-prefix=CHECK-FAST-MATH
-// CHECK-FAST-MATH: #define __FAST_MATH__
+// RUN:   | FileCheck -match-full-lines %s --check-prefix=CHECK-FAST-MATH
+// CHECK-FAST-MATH: #define __FAST_MATH__ 1
 // CHECK-FAST-MATH: #define __FINITE_MATH_ONLY__ 1
 //
 // RUN: %clang_cc1 %s -E -dM -ffinite-math-only -o - \
-// RUN:   | FileCheck %s --check-prefix=CHECK-FINITE-MATH-ONLY
+// RUN:   | FileCheck -match-full-lines %s --check-prefix=CHECK-FINITE-MATH-ONLY
 // CHECK-FINITE-MATH-ONLY: #define __FINITE_MATH_ONLY__ 1
 //
 // RUN: %clang %s -E -dM -fno-finite-math-only -o - \
-// RUN:   | FileCheck %s --check-prefix=CHECK-NO-FINITE-MATH-ONLY
+// RUN:   | FileCheck -match-full-lines %s --check-prefix=CHECK-NO-FINITE-MATH-ONLY
 // CHECK-NO-FINITE-MATH-ONLY: #define __FINITE_MATH_ONLY__ 0
 //
 // RUN: %clang_cc1 %s -E -dM -o - \
-// RUN:   | FileCheck %s --check-prefix=CHECK-FINITE-MATH-FLAG-UNDEFINED
+// RUN:   | FileCheck -match-full-lines %s --check-prefix=CHECK-FINITE-MATH-FLAG-UNDEFINED
 // CHECK-FINITE-MATH-FLAG-UNDEFINED: #define __FINITE_MATH_ONLY__ 0
 //
 // RUN: %clang_cc1 %s -E -dM -o - -triple i686 -target-cpu i386 \
-// RUN:   | FileCheck %s --check-prefix=CHECK-SYNC_CAS_I386
+// RUN:   | FileCheck -match-full-lines %s --check-prefix=CHECK-SYNC_CAS_I386
 // CHECK-SYNC_CAS_I386-NOT: __GCC_HAVE_SYNC_COMPARE_AND_SWAP
 //
 // RUN: %clang_cc1 %s -E -dM -o - -triple i686 -target-cpu i486 \
-// RUN:   | FileCheck %s --check-prefix=CHECK-SYNC_CAS_I486
-// CHECK-SYNC_CAS_I486: __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1
-// CHECK-SYNC_CAS_I486: __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2
-// CHECK-SYNC_CAS_I486: __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4
+// RUN:   | FileCheck -match-full-lines %s --check-prefix=CHECK-SYNC_CAS_I486
+// CHECK-SYNC_CAS_I486: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1 1
+// CHECK-SYNC_CAS_I486: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2 1
+// CHECK-SYNC_CAS_I486: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4 1
 // CHECK-SYNC_CAS_I486-NOT: __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8
 //
 // RUN: %clang_cc1 %s -E -dM -o - -triple i686 -target-cpu i586 \
-// RUN:   | FileCheck %s --check-prefix=CHECK-SYNC_CAS_I586
-// CHECK-SYNC_CAS_I586: __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1
-// CHECK-SYNC_CAS_I586: __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2
-// CHECK-SYNC_CAS_I586: __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4
-// CHECK-SYNC_CAS_I586: __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8
+// RUN:   | FileCheck -match-full-lines %s --check-prefix=CHECK-SYNC_CAS_I586
+// CHECK-SYNC_CAS_I586: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1 1
+// CHECK-SYNC_CAS_I586: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2 1
+// CHECK-SYNC_CAS_I586: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4 1
+// CHECK-SYNC_CAS_I586: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8 1
 //
 // RUN: %clang_cc1 %s -E -dM -o - -triple armv6 -target-cpu arm1136j-s \
-// RUN:   | FileCheck %s --check-prefix=CHECK-SYNC_CAS_ARM
-// CHECK-SYNC_CAS_ARM: __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1
-// CHECK-SYNC_CAS_ARM: __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2
-// CHECK-SYNC_CAS_ARM: __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4
-// CHECK-SYNC_CAS_ARM: __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8
+// RUN:   | FileCheck -match-full-lines %s --check-prefix=CHECK-SYNC_CAS_ARM
+// CHECK-SYNC_CAS_ARM: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1 1
+// CHECK-SYNC_CAS_ARM: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2 1
+// CHECK-SYNC_CAS_ARM: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4 1
+// CHECK-SYNC_CAS_ARM: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8 1
 //
 // RUN: %clang_cc1 %s -E -dM -o - -triple armv7 -target-cpu cortex-a8 \
-// RUN:   | FileCheck %s --check-prefix=CHECK-SYNC_CAS_ARMv7
-// CHECK-SYNC_CAS_ARMv7: __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1
-// CHECK-SYNC_CAS_ARMv7: __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2
-// CHECK-SYNC_CAS_ARMv7: __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4
-// CHECK-SYNC_CAS_ARMv7: __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8
+// RUN:   | FileCheck -match-full-lines %s --check-prefix=CHECK-SYNC_CAS_ARMv7
+// CHECK-SYNC_CAS_ARMv7: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1 1
+// CHECK-SYNC_CAS_ARMv7: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2 1
+// CHECK-SYNC_CAS_ARMv7: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4 1
+// CHECK-SYNC_CAS_ARMv7: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8 1
 //
 // RUN: %clang_cc1 %s -E -dM -o - -triple armv6 -target-cpu cortex-m0 \
-// RUN:   | FileCheck %s --check-prefix=CHECK-SYNC_CAS_ARMv6
+// RUN:   | FileCheck -match-full-lines %s --check-prefix=CHECK-SYNC_CAS_ARMv6
 // CHECK-SYNC_CAS_ARMv6-NOT: __GCC_HAVE_SYNC_COMPARE_AND_SWAP
 //
 // RUN: %clang_cc1 %s -E -dM -o - -triple mips -target-cpu mips2 \
-// RUN:   | FileCheck %s --check-prefix=CHECK-SYNC_CAS_MIPS \
+// RUN:   | FileCheck -match-full-lines %s --check-prefix=CHECK-SYNC_CAS_MIPS \
 // RUN:         --check-prefix=CHECK-SYNC_CAS_MIPS32
 // RUN: %clang_cc1 %s -E -dM -o - -triple mips64 -target-cpu mips3 \
-// RUN:   | FileCheck %s --check-prefix=CHECK-SYNC_CAS_MIPS \
+// RUN:   | FileCheck -match-full-lines %s --check-prefix=CHECK-SYNC_CAS_MIPS \
 // RUN:         --check-prefix=CHECK-SYNC_CAS_MIPS64
-// CHECK-SYNC_CAS_MIPS:       __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1
-// CHECK-SYNC_CAS_MIPS:       __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2
-// CHECK-SYNC_CAS_MIPS:       __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4
+// CHECK-SYNC_CAS_MIPS: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1 1
+// CHECK-SYNC_CAS_MIPS: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2 1
+// CHECK-SYNC_CAS_MIPS: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4 1
 // CHECK-SYNC_CAS_MIPS32-NOT: __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8
-// CHECK-SYNC_CAS_MIPS64:     __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8
+// CHECK-SYNC_CAS_MIPS64: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8 1
+
+// RUN: %clang_cc1 %s -E -dM -o - -x cl \
+// RUN:   | FileCheck -match-full-lines %s --check-prefix=CHECK-CL10
+// RUN: %clang_cc1 %s -E -dM -o - -x cl -cl-std=CL1.1 \
+// RUN:   | FileCheck -match-full-lines %s --check-prefix=CHECK-CL11
+// RUN: %clang_cc1 %s -E -dM -o - -x cl -cl-std=CL1.2 \
+// RUN:   | FileCheck -match-full-lines %s --check-prefix=CHECK-CL12
+// RUN: %clang_cc1 %s -E -dM -o - -x cl -cl-std=CL2.0 \
+// RUN:   | FileCheck -match-full-lines %s --check-prefix=CHECK-CL20
+// RUN: %clang_cc1 %s -E -dM -o - -x cl -cl-fast-relaxed-math \
+// RUN:   | FileCheck -match-full-lines %s --check-prefix=CHECK-FRM
+// CHECK-CL10: #define CL_VERSION_1_0 100
+// CHECK-CL10: #define CL_VERSION_1_1 110
+// CHECK-CL10: #define CL_VERSION_1_2 120
+// CHECK-CL10: #define CL_VERSION_2_0 200
+// CHECK-CL10: #define __OPENCL_C_VERSION__ 100
+// CHECK-CL10-NOT: #define __FAST_RELAXED_MATH__ 1
+// CHECK-CL11: #define CL_VERSION_1_0 100
+// CHECK-CL11: #define CL_VERSION_1_1 110
+// CHECK-CL11: #define CL_VERSION_1_2 120
+// CHECK-CL11: #define CL_VERSION_2_0 200
+// CHECK-CL11: #define __OPENCL_C_VERSION__ 110
+// CHECK-CL11-NOT: #define __FAST_RELAXED_MATH__ 1
+// CHECK-CL12: #define CL_VERSION_1_0 100
+// CHECK-CL12: #define CL_VERSION_1_1 110
+// CHECK-CL12: #define CL_VERSION_1_2 120
+// CHECK-CL12: #define CL_VERSION_2_0 200
+// CHECK-CL12: #define __OPENCL_C_VERSION__ 120
+// CHECK-CL12-NOT: #define __FAST_RELAXED_MATH__ 1
+// CHECK-CL20: #define CL_VERSION_1_0 100
+// CHECK-CL20: #define CL_VERSION_1_1 110
+// CHECK-CL20: #define CL_VERSION_1_2 120
+// CHECK-CL20: #define CL_VERSION_2_0 200
+// CHECK-CL20: #define __OPENCL_C_VERSION__ 200
+// CHECK-CL20-NOT: #define __FAST_RELAXED_MATH__ 1
+// CHECK-FRM: #define __FAST_RELAXED_MATH__ 1
+
diff --git a/test/Preprocessor/pushable-diagnostics.c b/test/Preprocessor/pushable-diagnostics.c
index 877eaaa0b366f..6e05d8e1450ef 100644
--- a/test/Preprocessor/pushable-diagnostics.c
+++ b/test/Preprocessor/pushable-diagnostics.c
@@ -15,3 +15,27 @@ int b = 'df'; // no warning.
 int c = 'df';  // expected-warning{{multi-character character constant}}
 
 #pragma clang diagnostic pop // expected-warning{{pragma diagnostic pop could not pop, no matching push}}
+
+// Test -Weverything
+
+void ppo0(){} // first verify that we do not give anything on this
+#pragma clang diagnostic push // now push
+
+#pragma clang diagnostic warning "-Weverything" 
+void ppr1(){} // expected-warning {{no previous prototype for function 'ppr1'}}
+
+#pragma clang diagnostic push // push again
+#pragma clang diagnostic ignored "-Weverything"  // Set to ignore in this level.
+void pps2(){}
+#pragma clang diagnostic warning "-Weverything"  // Set to warning in this level.
+void ppt2(){} // expected-warning {{no previous prototype for function 'ppt2'}}
+#pragma clang diagnostic error "-Weverything"  // Set to error in this level.
+void ppt3(){} // expected-error {{no previous prototype for function 'ppt3'}}
+#pragma clang diagnostic pop // pop should go back to warning level
+
+void pps1(){} // expected-warning {{no previous prototype for function 'pps1'}}
+
+
+#pragma clang diagnostic pop // Another pop should disble it again
+void ppu(){}
+
diff --git a/test/Preprocessor/stringize_misc.c b/test/Preprocessor/stringize_misc.c
index 6c2c78d17ac3a..fc7253e50499b 100644
--- a/test/Preprocessor/stringize_misc.c
+++ b/test/Preprocessor/stringize_misc.c
@@ -1,4 +1,5 @@
-// RUN: %clang_cc1 -E %s | FileCheck -strict-whitespace %s
+#ifdef TEST1
+// RUN: %clang_cc1 -E %s -DTEST1 | FileCheck -strict-whitespace %s
 
 #define M(x, y) #x #y
 
@@ -28,3 +29,13 @@ START_END( {a=1 , b=2;} ) /* braces are not parentheses */
 M(a COMMA b, (a, b)) 
 // CHECK: "a COMMA b" "(a, b)"
 
+#endif
+
+#ifdef TEST2
+// RUN: %clang_cc1 -fsyntax-only -verify %s -DTEST2
+
+#define HASH #
+#define INVALID() #
+// expected-error@-1{{'#' is not followed by a macro parameter}}
+
+#endif
diff --git a/test/Preprocessor/stringize_space.c b/test/Preprocessor/stringize_space.c
index 2d79d478e39d0..ae70bf18187c6 100644
--- a/test/Preprocessor/stringize_space.c
+++ b/test/Preprocessor/stringize_space.c
@@ -12,3 +12,9 @@ c)
 
 // CHECK: {{^}}"a c"{{$}}
 
+#define str(x) #x
+#define f(x) str(-x)
+f(
+    1)
+
+// CHECK: {{^}}"-1"
diff --git a/test/Preprocessor/sysroot-prefix.c b/test/Preprocessor/sysroot-prefix.c
new file mode 100644
index 0000000000000..08c72f53b44e9
--- /dev/null
+++ b/test/Preprocessor/sysroot-prefix.c
@@ -0,0 +1,25 @@
+// RUN: %clang_cc1 -v -isysroot /var/empty -I /var/empty/include -E %s -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-ISYSROOT_NO_SYSROOT %s
+// RUN: %clang_cc1 -v -isysroot /var/empty -I =/var/empty/include -E %s -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-ISYSROOT_SYSROOT_DEV_NULL %s
+// RUN: %clang_cc1 -v -I =/var/empty/include -E %s -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-NO_ISYSROOT_SYSROOT_DEV_NULL %s
+// RUN: %clang_cc1 -v -isysroot /var/empty -I =null -E %s -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-ISYSROOT_SYSROOT_NULL %s
+// RUN: %clang_cc1 -v -isysroot /var/empty -isysroot /var/empty/root -I =null -E %s -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-ISYSROOT_ISYSROOT_SYSROOT_NULL %s
+// RUN: %clang_cc1 -v -isysroot /var/empty/root -isysroot /var/empty -I =null -E %s -o /dev/null 2>&1 | FileCheck -check-prefix CHECK-ISYSROOT_ISYSROOT_SWAPPED_SYSROOT_NULL %s
+
+// CHECK-ISYSROOT_NO_SYSROOT: ignoring nonexistent directory "/var/empty/include"
+// CHECK-ISYSROOT_NO_SYSROOT-NOT: ignoring nonexistent directory "/var/empty/var/empty/include"
+
+// CHECK-ISYSROOT_SYSROOT_DEV_NULL: ignoring nonexistent directory "/var/empty/var/empty/include"
+// CHECK-ISYSROOT_SYSROOT_DEV_NULL-NOT: ignoring nonexistent directory "/var/empty"
+
+// CHECK-NO_ISYSROOT_SYSROOT_DEV_NULL: ignoring nonexistent directory "=/var/empty/include"
+// CHECK-NO_ISYSROOT_SYSROOT_DEV_NULL-NOT: ignoring nonexistent directory "/var/empty/include"
+
+// CHECK-ISYSROOT_SYSROOT_NULL: ignoring nonexistent directory "/var/empty{{.}}null"
+// CHECK-ISYSROOT_SYSROOT_NULL-NOT: ignoring nonexistent directory "=null"
+
+// CHECK-ISYSROOT_ISYSROOT_SYSROOT_NULL: ignoring nonexistent directory "/var/empty/root{{.}}null"
+// CHECK-ISYSROOT_ISYSROOT_SYSROOT_NULL-NOT: ignoring nonexistent directory "=null"
+
+// CHECK-ISYSROOT_ISYSROOT_SWAPPED_SYSROOT_NULL: ignoring nonexistent directory "/var/empty{{.}}null"
+// CHECK-ISYSROOT_ISYSROOT_SWAPPED_SYSROOT_NULL-NOT: ignoring nonexistent directory "=null"
+
diff --git a/test/Preprocessor/warning_tests.c b/test/Preprocessor/warning_tests.c
index c0c22ef2d71fd..1f2e884a5882b 100644
--- a/test/Preprocessor/warning_tests.c
+++ b/test/Preprocessor/warning_tests.c
@@ -12,7 +12,7 @@
 #endif
 
 // expected-error@+2 {{expected string literal in '__has_warning'}}
-// expected-error@+1 {{expected value in expression}}
+// expected-error@+1 {{missing ')'}} expected-note@+1 {{match}}
 #if __has_warning(-Wfoo)
 #endif
 
@@ -22,8 +22,7 @@
 #warning Not a valid warning flag
 #endif
 
-// expected-error@+2 {{builtin warning check macro requires a parenthesized string}}
-// expected-error@+1 {{invalid token}}
+// expected-error@+1 {{missing '(' after '__has_warning'}}
 #if __has_warning "not valid"
 #endif
 
@@ -33,7 +32,7 @@
 
 #define MY_ALIAS "-Wparentheses"
 
-// expected-error@+1 2{{expected}}
+// expected-error@+1 {{expected}}
 #if __has_warning(MY_ALIAS)
 #error Alias expansion not allowed
 #endif
diff --git a/test/Preprocessor/x86_target_features.c b/test/Preprocessor/x86_target_features.c
index 9c4192caf3f06..ff79a699ae8d4 100644
--- a/test/Preprocessor/x86_target_features.c
+++ b/test/Preprocessor/x86_target_features.c
@@ -1,4 +1,4 @@
-// RUN: %clang -target i386-unknown-unknown -march=core2 -msse4 -x c -E -dM -o - %s | FileCheck --check-prefix=SSE4 %s
+// RUN: %clang -target i386-unknown-unknown -march=core2 -msse4 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=SSE4 %s
 
 // SSE4: #define __SSE2_MATH__ 1
 // SSE4: #define __SSE2__ 1
@@ -9,11 +9,11 @@
 // SSE4: #define __SSE__ 1
 // SSE4: #define __SSSE3__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=core2 -msse4.1 -mno-sse4 -x c -E -dM -o - %s | FileCheck --check-prefix=NOSSE4 %s
+// RUN: %clang -target i386-unknown-unknown -march=core2 -msse4.1 -mno-sse4 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=NOSSE4 %s
 
 // NOSSE4-NOT: #define __SSE4_1__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=core2 -msse4 -mno-sse2 -x c -E -dM -o - %s | FileCheck --check-prefix=SSE %s
+// RUN: %clang -target i386-unknown-unknown -march=core2 -msse4 -mno-sse2 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=SSE %s
 
 // SSE-NOT: #define __SSE2_MATH__ 1
 // SSE-NOT: #define __SSE2__ 1
@@ -24,7 +24,7 @@
 // SSE: #define __SSE__ 1
 // SSE-NOT: #define __SSSE3__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=pentium-m -x c -E -dM -o - %s | FileCheck --check-prefix=SSE2 %s
+// RUN: %clang -target i386-unknown-unknown -march=pentium-m -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=SSE2 %s
 
 // SSE2: #define __SSE2_MATH__ 1
 // SSE2: #define __SSE2__ 1
@@ -35,7 +35,7 @@
 // SSE2: #define __SSE__ 1
 // SSE2-NOT: #define __SSSE3__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=pentium-m -mno-sse -mavx -x c -E -dM -o - %s | FileCheck --check-prefix=AVX %s
+// RUN: %clang -target i386-unknown-unknown -march=pentium-m -mno-sse -mavx -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX %s
 
 // AVX: #define __AVX__ 1
 // AVX: #define __SSE2_MATH__ 1
@@ -47,7 +47,7 @@
 // AVX: #define __SSE__ 1
 // AVX: #define __SSSE3__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=pentium-m -mxop -mno-avx -x c -E -dM -o - %s | FileCheck --check-prefix=SSE4A %s
+// RUN: %clang -target i386-unknown-unknown -march=pentium-m -mxop -mno-avx -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=SSE4A %s
 
 // SSE4A: #define __SSE2_MATH__ 1
 // SSE4A: #define __SSE2__ 1
@@ -59,7 +59,7 @@
 // SSE4A: #define __SSE__ 1
 // SSE4A: #define __SSSE3__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512f -x c -E -dM -o - %s | FileCheck --check-prefix=AVX512F %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512f -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512F %s
 
 // AVX512F: #define __AVX2__ 1
 // AVX512F: #define __AVX512F__ 1
@@ -73,7 +73,7 @@
 // AVX512F: #define __SSE__ 1
 // AVX512F: #define __SSSE3__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512cd -x c -E -dM -o - %s | FileCheck --check-prefix=AVX512CD %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512cd -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512CD %s
 
 // AVX512CD: #define __AVX2__ 1
 // AVX512CD: #define __AVX512CD__ 1
@@ -88,7 +88,7 @@
 // AVX512CD: #define __SSE__ 1
 // AVX512CD: #define __SSSE3__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512er -x c -E -dM -o - %s | FileCheck --check-prefix=AVX512ER %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512er -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512ER %s
 
 // AVX512ER: #define __AVX2__ 1
 // AVX512ER: #define __AVX512ER__ 1
@@ -103,7 +103,7 @@
 // AVX512ER: #define __SSE__ 1
 // AVX512ER: #define __SSSE3__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512pf -x c -E -dM -o - %s | FileCheck --check-prefix=AVX512PF %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512pf -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512PF %s
 
 // AVX512PF: #define __AVX2__ 1
 // AVX512PF: #define __AVX512F__ 1
@@ -118,7 +118,7 @@
 // AVX512PF: #define __SSE__ 1
 // AVX512PF: #define __SSSE3__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512dq -x c -E -dM -o - %s | FileCheck --check-prefix=AVX512DQ %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512dq -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512DQ %s
 
 // AVX512DQ: #define __AVX2__ 1
 // AVX512DQ: #define __AVX512DQ__ 1
@@ -133,7 +133,7 @@
 // AVX512DQ: #define __SSE__ 1
 // AVX512DQ: #define __SSSE3__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512bw -x c -E -dM -o - %s | FileCheck --check-prefix=AVX512BW %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512bw -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512BW %s
 
 // AVX512BW: #define __AVX2__ 1
 // AVX512BW: #define __AVX512BW__ 1
@@ -148,7 +148,7 @@
 // AVX512BW: #define __SSE__ 1
 // AVX512BW: #define __SSSE3__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512vl -x c -E -dM -o - %s | FileCheck --check-prefix=AVX512VL %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512vl -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512VL %s
 
 // AVX512VL: #define __AVX2__ 1
 // AVX512VL: #define __AVX512F__ 1
@@ -163,7 +163,7 @@
 // AVX512VL: #define __SSE__ 1
 // AVX512VL: #define __SSSE3__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512pf -mno-avx512f -x c -E -dM -o - %s | FileCheck --check-prefix=AVX512F2 %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512pf -mno-avx512f -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512F2 %s
 
 // AVX512F2: #define __AVX2__ 1
 // AVX512F2-NOT: #define __AVX512F__ 1
@@ -178,141 +178,171 @@
 // AVX512F2: #define __SSE__ 1
 // AVX512F2: #define __SSSE3__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=atom -msse4.2 -x c -E -dM -o - %s | FileCheck --check-prefix=SSE42POPCNT %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512ifma -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512IFMA %s
+
+// AVX512IFMA: #define __AVX2__ 1
+// AVX512IFMA: #define __AVX512F__ 1
+// AVX512IFMA: #define __AVX512IFMA__ 1
+// AVX512IFMA: #define __AVX__ 1
+// AVX512IFMA: #define __SSE2_MATH__ 1
+// AVX512IFMA: #define __SSE2__ 1
+// AVX512IFMA: #define __SSE3__ 1
+// AVX512IFMA: #define __SSE4_1__ 1
+// AVX512IFMA: #define __SSE4_2__ 1
+// AVX512IFMA: #define __SSE_MATH__ 1
+// AVX512IFMA: #define __SSE__ 1
+// AVX512IFMA: #define __SSSE3__ 1
+
+// RUN: %clang -target i386-unknown-unknown -march=atom -mavx512vbmi -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVX512VBMI %s
+
+// AVX512VBMI: #define __AVX2__ 1
+// AVX512VBMI: #define __AVX512F__ 1
+// AVX512VBMI: #define __AVX512VBMI__ 1
+// AVX512VBMI: #define __AVX__ 1
+// AVX512VBMI: #define __SSE2_MATH__ 1
+// AVX512VBMI: #define __SSE2__ 1
+// AVX512VBMI: #define __SSE3__ 1
+// AVX512VBMI: #define __SSE4_1__ 1
+// AVX512VBMI: #define __SSE4_2__ 1
+// AVX512VBMI: #define __SSE_MATH__ 1
+// AVX512VBMI: #define __SSE__ 1
+// AVX512VBMI: #define __SSSE3__ 1
+
+// RUN: %clang -target i386-unknown-unknown -march=atom -msse4.2 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=SSE42POPCNT %s
 
 // SSE42POPCNT: #define __POPCNT__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=atom -mno-popcnt -msse4.2 -x c -E -dM -o - %s | FileCheck --check-prefix=SSE42NOPOPCNT %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -mno-popcnt -msse4.2 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=SSE42NOPOPCNT %s
 
 // SSE42NOPOPCNT-NOT: #define __POPCNT__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=atom -mpopcnt -mno-sse4.2 -x c -E -dM -o - %s | FileCheck --check-prefix=NOSSE42POPCNT %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -mpopcnt -mno-sse4.2 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=NOSSE42POPCNT %s
 
 // NOSSE42POPCNT: #define __POPCNT__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=atom -msse -x c -E -dM -o - %s | FileCheck --check-prefix=SSEMMX %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -msse -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=SSEMMX %s
 
 // SSEMMX: #define __MMX__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=atom -msse -mno-sse -x c -E -dM -o - %s | FileCheck --check-prefix=SSENOSSEMMX %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -msse -mno-sse -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=SSENOSSEMMX %s
 
 // SSENOSSEMMX-NOT: #define __MMX__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=atom -msse -mno-mmx -x c -E -dM -o - %s | FileCheck --check-prefix=SSENOMMX %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -msse -mno-mmx -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=SSENOMMX %s
 
 // SSENOMMX-NOT: #define __MMX__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=atom -mf16c -x c -E -dM -o - %s | FileCheck --check-prefix=F16C %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -mf16c -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=F16C %s
 
 // F16C: #define __AVX__ 1
 // F16C: #define __F16C__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=atom -mf16c -mno-avx -x c -E -dM -o - %s | FileCheck --check-prefix=F16CNOAVX %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -mf16c -mno-avx -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=F16CNOAVX %s
 
 // F16CNOAVX-NOT: #define __AVX__ 1
 // F16CNOAVX-NOT: #define __F16C__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=pentiumpro -mpclmul -x c -E -dM -o - %s | FileCheck --check-prefix=PCLMUL %s
+// RUN: %clang -target i386-unknown-unknown -march=pentiumpro -mpclmul -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=PCLMUL %s
 
 // PCLMUL: #define __PCLMUL__ 1
 // PCLMUL: #define __SSE2__ 1
 // PCLMUL-NOT: #define __SSE3__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=pentiumpro -mpclmul -mno-sse2 -x c -E -dM -o - %s | FileCheck --check-prefix=PCLMULNOSSE2 %s
+// RUN: %clang -target i386-unknown-unknown -march=pentiumpro -mpclmul -mno-sse2 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=PCLMULNOSSE2 %s
 
 // PCLMULNOSSE2-NOT: #define __PCLMUL__ 1
 // PCLMULNOSSE2-NOT: #define __SSE2__ 1
 // PCLMULNOSSE2-NOT: #define __SSE3__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=pentiumpro -maes -x c -E -dM -o - %s | FileCheck --check-prefix=AES %s
+// RUN: %clang -target i386-unknown-unknown -march=pentiumpro -maes -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AES %s
 
 // AES: #define __AES__ 1
 // AES: #define __SSE2__ 1
 // AES-NOT: #define __SSE3__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=pentiumpro -maes -mno-sse2 -x c -E -dM -o - %s | FileCheck --check-prefix=AESNOSSE2 %s
+// RUN: %clang -target i386-unknown-unknown -march=pentiumpro -maes -mno-sse2 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AESNOSSE2 %s
 
 // AESNOSSE2-NOT: #define __AES__ 1
 // AESNOSSE2-NOT: #define __SSE2__ 1
 // AESNOSSE2-NOT: #define __SSE3__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=pentiumpro -msha -x c -E -dM -o - %s | FileCheck --check-prefix=SHA %s
+// RUN: %clang -target i386-unknown-unknown -march=pentiumpro -msha -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=SHA %s
 
 // SHA: #define __SHA__ 1
 // SHA: #define __SSE2__ 1
 // SHA-NOT: #define __SSE3__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=pentiumpro -msha -mno-sha -x c -E -dM -o - %s | FileCheck --check-prefix=SHANOSHA %s
+// RUN: %clang -target i386-unknown-unknown -march=pentiumpro -msha -mno-sha -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=SHANOSHA %s
 
 // SHANOSHA-NOT: #define __SHA__ 1
 // SHANOSHA-NOT: #define __SSE2__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=pentiumpro -msha -mno-sse2 -x c -E -dM -o - %s | FileCheck --check-prefix=SHANOSSE2 %s
+// RUN: %clang -target i386-unknown-unknown -march=pentiumpro -msha -mno-sse2 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=SHANOSSE2 %s
 
 // SHANOSSE2-NOT: #define __SHA__ 1
 // SHANOSSE2-NOT: #define __SSE2__ 1
 // SHANOSSE2-NOT: #define __SSE3__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=atom -mtbm -x c -E -dM -o - %s | FileCheck --check-prefix=TBM %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -mtbm -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=TBM %s
 
 // TBM: #define __TBM__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=bdver2 -mno-tbm -x c -E -dM -o - %s | FileCheck --check-prefix=NOTBM %s
+// RUN: %clang -target i386-unknown-unknown -march=bdver2 -mno-tbm -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=NOTBM %s
 
 // NOTBM-NOT: #define __TBM__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=pentiumpro -mcx16 -x c -E -dM -o - %s | FileCheck --check-prefix=MCX16 %s
+// RUN: %clang -target i386-unknown-unknown -march=pentiumpro -mcx16 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=MCX16 %s
 
 // MCX16: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_16 1
 
-// RUN: %clang -target i386-unknown-unknown -march=atom -mprfchw -x c -E -dM -o - %s | FileCheck --check-prefix=PRFCHW %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -mprfchw -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=PRFCHW %s
 
 // PRFCHW: #define __PRFCHW__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=btver2 -mno-prfchw -x c -E -dM -o - %s | FileCheck --check-prefix=NOPRFCHW %s
+// RUN: %clang -target i386-unknown-unknown -march=btver2 -mno-prfchw -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=NOPRFCHW %s
 
 // NOPRFCHW-NOT: #define __PRFCHW__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=atom -m3dnow -x c -E -dM -o - %s | FileCheck --check-prefix=3DNOWPRFCHW %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -m3dnow -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=3DNOWPRFCHW %s
 
 // 3DNOWPRFCHW: #define __PRFCHW__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=atom -mno-prfchw -m3dnow -x c -E -dM -o - %s | FileCheck --check-prefix=3DNOWNOPRFCHW %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -mno-prfchw -m3dnow -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=3DNOWNOPRFCHW %s
 
 // 3DNOWNOPRFCHW-NOT: #define __PRFCHW__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=atom -mprfchw -mno-3dnow -x c -E -dM -o - %s | FileCheck --check-prefix=NO3DNOWPRFCHW %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -mprfchw -mno-3dnow -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=NO3DNOWPRFCHW %s
 
 // NO3DNOWPRFCHW: #define __PRFCHW__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=atom -madx -x c -E -dM -o - %s | FileCheck --check-prefix=ADX %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -madx -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=ADX %s
 
 // ADX: #define __ADX__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=atom -mrdseed -x c -E -dM -o - %s | FileCheck --check-prefix=RDSEED %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -mrdseed -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=RDSEED %s
 
 // RDSEED: #define __RDSEED__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=atom -mxsave -x c -E -dM -o - %s | FileCheck --check-prefix=XSAVE %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -mxsave -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=XSAVE %s
 
 // XSAVE: #define __XSAVE__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=atom -mxsaveopt -x c -E -dM -o - %s | FileCheck --check-prefix=XSAVEOPT %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -mxsaveopt -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=XSAVEOPT %s
 
 // XSAVEOPT: #define __XSAVEOPT__ 1
 // XSAVEOPT: #define __XSAVE__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=atom -mxsavec -x c -E -dM -o - %s | FileCheck --check-prefix=XSAVEC %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -mxsavec -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=XSAVEC %s
 
 // XSAVEC: #define __XSAVEC__ 1
 // XSAVEC: #define __XSAVE__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=atom -mxsaves -x c -E -dM -o - %s | FileCheck --check-prefix=XSAVES %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -mxsaves -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=XSAVES %s
 
 // XSAVES: #define __XSAVES__ 1
 // XSAVES: #define __XSAVE__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=atom -mxsaveopt -mno-xsave -x c -E -dM -o - %s | FileCheck --check-prefix=NOXSAVE %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -mxsaveopt -mno-xsave -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=NOXSAVE %s
 
 // NOXSAVE-NOT: #define __XSAVEOPT__ 1
 // NOXSAVE-NOT: #define __XSAVE__ 1
diff --git a/test/Profile/Inputs/profile-summary.proftext b/test/Profile/Inputs/profile-summary.proftext
new file mode 100644
index 0000000000000..c744f7af9d812
--- /dev/null
+++ b/test/Profile/Inputs/profile-summary.proftext
@@ -0,0 +1,26 @@
+begin
+# Func Hash:
+10
+# Num Counters:
+2
+# Counter Values:
+1
+0
+
+main
+# Func Hash:
+0
+# Num Counters:
+1
+# Counter Values:
+1
+
+end
+# Func Hash:
+10
+# Num Counters:
+2
+# Counter Values:
+2
+2
+
diff --git a/test/Profile/c-avoid-direct-call.c b/test/Profile/c-avoid-direct-call.c
new file mode 100644
index 0000000000000..cd02e714dbe17
--- /dev/null
+++ b/test/Profile/c-avoid-direct-call.c
@@ -0,0 +1,11 @@
+// Check the value profiling instrinsics emitted by instrumentation.
+
+// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-avoid-direct-call.c %s -o - -emit-llvm -fprofile-instrument=clang -mllvm -enable-value-profiling | FileCheck %s
+
+void foo();
+
+int main(void) {
+// CHECK-NOT: call void @__llvm_profile_instrument_target
+  foo(21);
+  return 0;
+}
diff --git a/test/Profile/c-captured.c b/test/Profile/c-captured.c
index e859628eabdef..bae2dcb6ef771 100644
--- a/test/Profile/c-captured.c
+++ b/test/Profile/c-captured.c
@@ -1,7 +1,7 @@
-// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-captured.c %s -o - -emit-llvm -fprofile-instr-generate | FileCheck -check-prefix=PGOGEN -check-prefix=PGOALL %s
+// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-captured.c %s -o - -emit-llvm -fprofile-instrument=clang | FileCheck -check-prefix=PGOGEN -check-prefix=PGOALL %s
 
 // RUN: llvm-profdata merge %S/Inputs/c-captured.proftext -o %t.profdata
-// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-captured.c %s -o - -emit-llvm -fprofile-instr-use=%t.profdata | FileCheck -check-prefix=PGOUSE -check-prefix=PGOALL %s
+// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-captured.c %s -o - -emit-llvm -fprofile-instrument-use-path=%t.profdata | FileCheck -check-prefix=PGOUSE -check-prefix=PGOALL %s
 
 // PGOGEN: @[[DCC:__profc_debug_captured]] = private global [3 x i64] zeroinitializer
 // PGOGEN: @[[CSC:__profc_c_captured.c___captured_stmt]] = private global [2 x i64] zeroinitializer
diff --git a/test/Profile/c-counter-overflows.c b/test/Profile/c-counter-overflows.c
index 18a3d33d3ab9a..5cb32bbc30af4 100644
--- a/test/Profile/c-counter-overflows.c
+++ b/test/Profile/c-counter-overflows.c
@@ -2,7 +2,7 @@
 // truncated.
 
 // RUN: llvm-profdata merge %S/Inputs/c-counter-overflows.proftext -o %t.profdata
-// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-counter-overflows.c %s -o - -emit-llvm -fprofile-instr-use=%t.profdata | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-counter-overflows.c %s -o - -emit-llvm -fprofile-instrument-use-path=%t.profdata | FileCheck %s
 
 typedef unsigned long long uint64_t;
 
diff --git a/test/Profile/c-general.c b/test/Profile/c-general.c
index 03631d8c7c698..da3b7f25522c1 100644
--- a/test/Profile/c-general.c
+++ b/test/Profile/c-general.c
@@ -1,12 +1,12 @@
 // Test instrumentation of general constructs in C.
 
-// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-general.c %s -o - -emit-llvm -fprofile-instr-generate | FileCheck -check-prefix=PGOGEN %s
+// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-general.c %s -o - -emit-llvm -fprofile-instrument=clang | FileCheck -check-prefix=PGOGEN %s
 
 // RUN: llvm-profdata merge %S/Inputs/c-general.proftext -o %t.profdata
-// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-general.c %s -o - -emit-llvm -fprofile-instr-use=%t.profdata | FileCheck -check-prefix=PGOUSE %s
-// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-general.c %s -o - -emit-llvm -fprofile-instr-use=%S/Inputs/c-general.profdata.v3 | FileCheck -check-prefix=PGOUSE %s
+// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-general.c %s -o - -emit-llvm -fprofile-instrument-use-path=%t.profdata | FileCheck -check-prefix=PGOUSE %s
+// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-general.c %s -o - -emit-llvm -fprofile-instrument-use-path=%S/Inputs/c-general.profdata.v3 | FileCheck -check-prefix=PGOUSE %s
 // Also check compatibility with older profiles.
-// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-general.c %s -o - -emit-llvm -fprofile-instr-use=%S/Inputs/c-general.profdata.v1 | FileCheck -check-prefix=PGOUSE %s
+// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-general.c %s -o - -emit-llvm -fprofile-instrument-use-path=%S/Inputs/c-general.profdata.v1 | FileCheck -check-prefix=PGOUSE %s
 
 // PGOGEN: @[[SLC:__profc_simple_loops]] = private global [4 x i64] zeroinitializer
 // PGOGEN: @[[IFC:__profc_conditionals]] = private global [11 x i64] zeroinitializer
diff --git a/test/Profile/c-generate.c b/test/Profile/c-generate.c
index 8be4e28477a83..454f5dbf59a58 100644
--- a/test/Profile/c-generate.c
+++ b/test/Profile/c-generate.c
@@ -1,9 +1,14 @@
-// Check that the -fprofile-instr-generate= form works.
-// RUN: %clang_cc1 -main-file-name c-generate.c %s -o - -emit-llvm -fprofile-instr-generate=c-generate-test.profraw | FileCheck %s
-
-// CHECK: private constant [24 x i8] c"c-generate-test.profraw\00"
-// CHECK: call void @__llvm_profile_override_default_filename(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @0, i32 0, i32 0))
-// CHECK: declare void @__llvm_profile_override_default_filename(i8*)
+// Check that the -fprofile-instrument-path= form works.
+// RUN: %clang_cc1 -main-file-name c-generate.c %s -o - -emit-llvm -fprofile-instrument=clang -fprofile-instrument-path=c-generate-test.profraw | FileCheck %s --check-prefix=PROF-INSTR-PATH
+// RUN: %clang_cc1 %s -o - -emit-llvm -fprofile-instrument=none | FileCheck %s --check-prefix=PROF-INSTR-NONE
+// RUN: not %clang_cc1 %s -o - -emit-llvm -fprofile-instrument=garbage 2>&1 | FileCheck %s --check-prefix=PROF-INSTR-GARBAGE
+//
+// PROF-INSTR-PATH: private constant [24 x i8] c"c-generate-test.profraw\00"
+// PROF-INSTR-PATH: call void @__llvm_profile_override_default_filename(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @0, i32 0, i32 0))
+// PROF-INSTR-PATH: declare void @__llvm_profile_override_default_filename(i8*)
+//
+// PROF-INSTR-NONE-NOT: @__profn_main
+// PROF-INSTR-GARBAGE: invalid PGO instrumentor in argument '-fprofile-instrument=garbage'
 
 int main(void) {
   return 0;
diff --git a/test/Profile/c-indirect-call.c b/test/Profile/c-indirect-call.c
new file mode 100644
index 0000000000000..b0ace37267fd4
--- /dev/null
+++ b/test/Profile/c-indirect-call.c
@@ -0,0 +1,16 @@
+// Check the value profiling instrinsics emitted by instrumentation.
+
+// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-indirect-call.c %s -o - -emit-llvm -fprofile-instrument=clang -mllvm -enable-value-profiling | FileCheck %s
+
+void (*foo)(void);
+
+int main(void) {
+// CHECK:  [[REG1:%[0-9]+]] = load void ()*, void ()** @foo, align 8
+// CHECK-NEXT:  [[REG2:%[0-9]+]] = ptrtoint void ()* [[REG1]] to i64
+// CHECK-NEXT:  call void @__llvm_profile_instrument_target(i64 [[REG2]], i8* bitcast ({{.*}}* @__profd_main to i8*), i32 0)
+// CHECK-NEXT:  call void [[REG1]]()
+  foo();
+  return 0;
+}
+
+// CHECK: declare void @__llvm_profile_instrument_target(i64, i8*, i32)
diff --git a/test/Profile/c-linkage-available_externally.c b/test/Profile/c-linkage-available_externally.c
index 8585bf8425b09..8907839cb9adf 100644
--- a/test/Profile/c-linkage-available_externally.c
+++ b/test/Profile/c-linkage-available_externally.c
@@ -1,11 +1,9 @@
-// Make sure instrementation data from available_externally functions doesn't
-// get thrown out.
-// RUN: %clang_cc1 -O2 -triple x86_64-apple-macosx10.9 -main-file-name c-linkage-available_externally.c %s -o - -emit-llvm -fprofile-instr-generate | FileCheck %s
-
-// CHECK: @__profn_foo = linkonce_odr hidden constant [3 x i8] c"foo", section "__DATA,__llvm_prf_names", align 1
+// Make sure instrumentation data from available_externally functions doesn't
+// get thrown out and are emitted with the expected linkage.
+// RUN: %clang_cc1 -O2 -triple x86_64-apple-macosx10.9 -main-file-name c-linkage-available_externally.c %s -o - -emit-llvm -fprofile-instrument=clang | FileCheck %s
 
 // CHECK: @__profc_foo = linkonce_odr hidden global [1 x i64] zeroinitializer, section "__DATA,__llvm_prf_cnts", align 8
-// CHECK: @__profd_foo = linkonce_odr hidden global { i32, i32, i64, i8*, i64*, i8*, i8*, [1 x i16] } { i32 3, i32 1, i64 0, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @__profn_foo, i32 0, i32 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @__profc_foo, i32 0, i32 0), i8* null, i8* null, [1 x i16] zeroinitializer }, section "__DATA,__llvm_prf_data", align 8
+// CHECK: @__profd_foo = linkonce_odr hidden global {{.*}} i64* getelementptr inbounds ([1 x i64], [1 x i64]* @__profc_foo, i32 0, i32 0){{.*}}, section "__DATA,__llvm_prf_data", align 8
 inline int foo(void) { return 1; }
 
 int main(void) {
diff --git a/test/Profile/c-linkage.c b/test/Profile/c-linkage.c
index e6fbda9c288b6..50ac558fb0761 100644
--- a/test/Profile/c-linkage.c
+++ b/test/Profile/c-linkage.c
@@ -1,10 +1,14 @@
-// Check that the profiling names we create have the linkage we expect
-// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-linkage.c %s -o - -emit-llvm -fprofile-instr-generate | FileCheck %s
+// Check that the profiling counters and data we create have the linkage we expect
+// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-linkage.c %s -o - -emit-llvm -fprofile-instrument=clang | FileCheck %s
 
-// CHECK: @__profn_foo = private constant [3 x i8] c"foo"
-// CHECK: @__profn_foo_weak = weak hidden constant [8 x i8] c"foo_weak"
-// CHECK: @__profn_main = private constant [4 x i8] c"main"
-// CHECK: @__profn_c_linkage.c_foo_internal = private constant [24 x i8] c"c-linkage.c:foo_internal"
+// CHECK: @__profc_foo = private global
+// CHECK: @__profd_foo = private global
+// CHECK: @__profc_foo_weak = weak hidden global
+// CHECK: @__profd_foo_weak = weak hidden global
+// CHECK: @__profc_main = private global
+// CHECK: @__profd_main = private global
+// CHECK: @__profc_c_linkage.c_foo_internal = private global
+// CHECK: @__profd_c_linkage.c_foo_internal = private global
 
 void foo(void) { }
 
diff --git a/test/Profile/c-outdated-data.c b/test/Profile/c-outdated-data.c
index d0503acdb6eaf..e61ad02d07572 100644
--- a/test/Profile/c-outdated-data.c
+++ b/test/Profile/c-outdated-data.c
@@ -4,7 +4,7 @@
 // doesn't play well with warnings that have no line number.
 
 // RUN: llvm-profdata merge %S/Inputs/c-outdated-data.proftext -o %t.profdata
-// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-outdated-data.c %s -o /dev/null -emit-llvm -fprofile-instr-use=%t.profdata -Wprofile-instr-dropped 2>&1 | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-outdated-data.c %s -o /dev/null -emit-llvm -fprofile-instrument-use-path=%t.profdata -Wprofile-instr-dropped 2>&1 | FileCheck %s
 // CHECK: warning: profile data may be out of date: of 3 functions, 1 has no data and 1 has mismatched data that will be ignored
 
 void no_usable_data() {
diff --git a/test/Profile/c-unprofiled-blocks.c b/test/Profile/c-unprofiled-blocks.c
index 58bef9e2962d1..a5474001315b1 100644
--- a/test/Profile/c-unprofiled-blocks.c
+++ b/test/Profile/c-unprofiled-blocks.c
@@ -2,7 +2,7 @@
 // runs) shouldn't have any branch weight metadata added.
 
 // RUN: llvm-profdata merge %S/Inputs/c-unprofiled-blocks.proftext -o %t.profdata
-// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-unprofiled-blocks.c %s -o - -emit-llvm -fprofile-instr-use=%t.profdata | FileCheck -check-prefix=PGOUSE %s
+// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-unprofiled-blocks.c %s -o - -emit-llvm -fprofile-instrument-use-path=%t.profdata | FileCheck -check-prefix=PGOUSE %s
 
 // PGOUSE-LABEL: @never_called(i32 %i)
 int never_called(int i) {
diff --git a/test/Profile/c-unprofiled.c b/test/Profile/c-unprofiled.c
index 275cd2d1458d7..3466079f69bd8 100644
--- a/test/Profile/c-unprofiled.c
+++ b/test/Profile/c-unprofiled.c
@@ -7,7 +7,7 @@
 // doesn't play well with warnings that have no line number.
 
 // RUN: llvm-profdata merge %S/Inputs/c-unprofiled.proftext -o %t.profdata
-// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-unprofiled.c -I %S/Inputs/ %s -o /dev/null -emit-llvm -fprofile-instr-use=%t.profdata -Wprofile-instr-unprofiled 2>&1 | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-unprofiled.c -I %S/Inputs/ %s -o /dev/null -emit-llvm -fprofile-instrument-use-path=%t.profdata -Wprofile-instr-unprofiled 2>&1 | FileCheck %s
 
 // CHECK: warning: no profile data available for file "c-unprofiled.c"
 
diff --git a/test/Profile/c-unreachable-after-switch.c b/test/Profile/c-unreachable-after-switch.c
index 7d1855db18815..36a75449dbdf4 100644
--- a/test/Profile/c-unreachable-after-switch.c
+++ b/test/Profile/c-unreachable-after-switch.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -O3 -triple x86_64-apple-macosx10.10 -main-file-name c-unreachable-after-switch.c %s -o - -emit-llvm -fprofile-instr-generate | FileCheck %s
+// RUN: %clang_cc1 -O3 -triple x86_64-apple-macosx10.10 -main-file-name c-unreachable-after-switch.c %s -o - -emit-llvm -fprofile-instrument=clang | FileCheck %s
 
 // CHECK: @[[C:__profc_foo]] = private global [3 x i64] zeroinitializer
 
diff --git a/test/Profile/cxx-class.cpp b/test/Profile/cxx-class.cpp
index a53414014449d..dbc9337785e13 100644
--- a/test/Profile/cxx-class.cpp
+++ b/test/Profile/cxx-class.cpp
@@ -1,13 +1,13 @@
 // Tests for instrumentation of C++ methods, constructors, and destructors.
 
-// RUN: %clang %s -o - -emit-llvm -S -fprofile-instr-generate -fno-exceptions -target %itanium_abi_triple > %tgen
+// RUN: %clang_cc1 %s -o - -emit-llvm -fprofile-instrument=clang -triple %itanium_abi_triple > %tgen
 // RUN: FileCheck --input-file=%tgen -check-prefix=CTRGEN %s
 // RUN: FileCheck --input-file=%tgen -check-prefix=DTRGEN %s
 // RUN: FileCheck --input-file=%tgen -check-prefix=MTHGEN %s
 // RUN: FileCheck --input-file=%tgen -check-prefix=WRPGEN %s
 
 // RUN: llvm-profdata merge %S/Inputs/cxx-class.proftext -o %t.profdata
-// RUN: %clang %s -o - -emit-llvm -S -fprofile-instr-use=%t.profdata -fno-exceptions -target %itanium_abi_triple > %tuse
+// RUN: %clang_cc1 %s -o - -emit-llvm -fprofile-instrument-use-path=%t.profdata -triple %itanium_abi_triple > %tuse
 // RUN: FileCheck --input-file=%tuse -check-prefix=CTRUSE %s
 // RUN: FileCheck --input-file=%tuse -check-prefix=DTRUSE %s
 // RUN: FileCheck --input-file=%tuse -check-prefix=MTHUSE %s
diff --git a/test/Profile/cxx-implicit.cpp b/test/Profile/cxx-implicit.cpp
index b25486ae4c0d3..40598bf0fa2a4 100644
--- a/test/Profile/cxx-implicit.cpp
+++ b/test/Profile/cxx-implicit.cpp
@@ -1,17 +1,51 @@
 // Ensure that implicit methods aren't instrumented.
 
-// RUN: %clang_cc1 -x c++ %s -triple %itanium_abi_triple -main-file-name cxx-implicit.cpp -o - -emit-llvm -fprofile-instr-generate | FileCheck %s
+// RUN: %clang_cc1 -x c++ -std=c++11 %s -triple %itanium_abi_triple -main-file-name cxx-implicit.cpp -o - -emit-llvm -fprofile-instrument=clang | FileCheck %s
 
-// An implicit constructor is generated for Base. We should not emit counters
-// for it.
+// Implicit constructors are generated for Base. We should not emit counters
+// for them.
+// CHECK-DAG: define {{.*}}_ZN4BaseC2Ev
+// CHECK-DAG: define {{.*}}_ZN4BaseC2ERKS_
+// CHECK-DAG: define {{.*}}_ZN4BaseC2EOS_
+// CHECK-DAG: __profc__ZN7DerivedC2Ev,
+// CHECK-DAG: __profc__ZN7DerivedC2ERKS_
+// CHECK-DAG: __profc__ZN7DerivedC2EOS_
 // CHECK-NOT: @__profc__ZN4BaseC2Ev =
+// CHECK-NOT: @__profc__ZN4BaseC2ERKS_
+// CHECK-NOT: @__profc__ZN4BaseC2EOS_
+//
+// Implicit assignment operators are generated for Base. We should not emit counters
+// for them.
+// CHECK-NOT: @__profc__ZN4BaseaSEOS_
+// CHECK-NOT: @__profc__ZN4BaseaSERKS_
 
-struct Base {
+struct BaseBase {
+ BaseBase();
+ BaseBase(const BaseBase &);
+ BaseBase &operator=(const BaseBase &);
+ BaseBase &operator=(BaseBase &&);
+};
+
+struct Base : public BaseBase {
   virtual void foo();
 };
 
 struct Derived : public Base {
   Derived();
+  Derived(const Derived &);
+  Derived(Derived &&);
+  Derived &operator=(const Derived &);
+  Derived &operator=(Derived &&);
 };
 
 Derived::Derived() {}
+Derived::Derived(const Derived &d) : Base(d) {}
+Derived::Derived(Derived &&d) : Base(static_cast<Base&&>(d)) {}
+Derived& Derived::operator=(const Derived &d) {
+  Base::operator=(d);
+  return *this;
+}
+Derived& Derived::operator=(Derived &&d) {
+  Base::operator=(static_cast<Base &&>(d));
+  return *this;
+}
diff --git a/test/Profile/cxx-indirect-call.cpp b/test/Profile/cxx-indirect-call.cpp
new file mode 100644
index 0000000000000..f95d1af37c79d
--- /dev/null
+++ b/test/Profile/cxx-indirect-call.cpp
@@ -0,0 +1,21 @@
+// Check the value profiling instrinsics emitted by instrumentation.
+
+// RUN: %clang_cc1 %s -o - -emit-llvm -fprofile-instrument=clang -mllvm -enable-value-profiling -fexceptions -fcxx-exceptions -triple %itanium_abi_triple | FileCheck %s
+
+void (*foo) (void);
+
+int main(int argc, const char *argv[]) {
+// CHECK:  [[REG1:%[0-9]+]] = load void ()*, void ()** @foo
+// CHECK-NEXT:  [[REG2:%[0-9]+]] = ptrtoint void ()* [[REG1]] to i64
+// CHECK-NEXT:  call void @__llvm_profile_instrument_target(i64 [[REG2]], i8* bitcast ({{.*}}* @__profd_main to i8*), i32 0)
+// CHECK-NEXT:  invoke void [[REG1]]()
+  try {
+    foo();
+  } catch (int) {}
+  return 0;
+}
+
+// CHECK: declare void @__llvm_profile_instrument_target(i64, i8*, i32)
+
+
+
diff --git a/test/Profile/cxx-lambda.cpp b/test/Profile/cxx-lambda.cpp
index 26314c892e44e..2b422912f45b1 100644
--- a/test/Profile/cxx-lambda.cpp
+++ b/test/Profile/cxx-lambda.cpp
@@ -1,11 +1,11 @@
 // Tests for instrumentation of C++11 lambdas
 
-// RUN: %clang_cc1 -x c++ %s -triple %itanium_abi_triple -main-file-name cxx-lambda.cpp -std=c++11 -o - -emit-llvm -fprofile-instr-generate > %tgen
+// RUN: %clang_cc1 -x c++ %s -triple %itanium_abi_triple -main-file-name cxx-lambda.cpp -std=c++11 -o - -emit-llvm -fprofile-instrument=clang > %tgen
 // RUN: FileCheck --input-file=%tgen -check-prefix=PGOGEN %s
 // RUN: FileCheck --input-file=%tgen -check-prefix=LMBGEN %s
 
 // RUN: llvm-profdata merge %S/Inputs/cxx-lambda.proftext -o %t.profdata
-// RUN: %clang_cc1 -x c++ %s -triple %itanium_abi_triple -main-file-name cxx-lambda.cpp -std=c++11 -o - -emit-llvm -fprofile-instr-use=%t.profdata > %tuse
+// RUN: %clang_cc1 -x c++ %s -triple %itanium_abi_triple -main-file-name cxx-lambda.cpp -std=c++11 -o - -emit-llvm -fprofile-instrument-use-path=%t.profdata > %tuse
 // RUN: FileCheck --input-file=%tuse -check-prefix=PGOUSE %s
 // RUN: FileCheck --input-file=%tuse -check-prefix=LMBUSE %s
 
diff --git a/test/Profile/cxx-linkage.cpp b/test/Profile/cxx-linkage.cpp
index 701a88028154d..6f7b2b7128e9f 100644
--- a/test/Profile/cxx-linkage.cpp
+++ b/test/Profile/cxx-linkage.cpp
@@ -1,9 +1,13 @@
-// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9.0 -emit-llvm -main-file-name cxx-linkage.cpp %s -o - -fprofile-instr-generate | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9.0 -emit-llvm -main-file-name cxx-linkage.cpp %s -o - -fprofile-instrument=clang | FileCheck %s
 
-// CHECK: @__profn__Z3foov = private constant [7 x i8] c"_Z3foov"
-// CHECK: @__profn__Z8foo_weakv = weak hidden constant [12 x i8] c"_Z8foo_weakv"
-// CHECK: @__profn_main = private constant [4 x i8] c"main"
-// CHECK: @__profn__Z10foo_inlinev = linkonce_odr hidden constant [15 x i8] c"_Z10foo_inlinev"
+// CHECK: @__profc__Z3foov = private global
+// CHECK: @__profd__Z3foov = private global
+// CHECK: @__profc__Z8foo_weakv = weak hidden global
+// CHECK: @__profd__Z8foo_weakv = weak hidden global
+// CHECK: @__profc_main = private global
+// CHECK: @__profd_main = private global
+// CHECK: @__profc__Z10foo_inlinev = linkonce_odr hidden global
+// CHECK: @__profd__Z10foo_inlinev = linkonce_odr hidden global
 
 void foo(void) { }
 
diff --git a/test/Profile/cxx-rangefor.cpp b/test/Profile/cxx-rangefor.cpp
index 1007a7077552a..a61557a9619cf 100644
--- a/test/Profile/cxx-rangefor.cpp
+++ b/test/Profile/cxx-rangefor.cpp
@@ -1,10 +1,10 @@
 // Tests for instrumentation of C++11 range-for
 
-// RUN: %clang_cc1 -x c++ %s -triple %itanium_abi_triple -main-file-name cxx-rangefor.cpp -std=c++11 -o - -emit-llvm -fprofile-instr-generate > %tgen
+// RUN: %clang_cc1 -x c++ %s -triple %itanium_abi_triple -main-file-name cxx-rangefor.cpp -std=c++11 -o - -emit-llvm -fprofile-instrument=clang > %tgen
 // RUN: FileCheck --input-file=%tgen -check-prefix=CHECK -check-prefix=PGOGEN %s
 
 // RUN: llvm-profdata merge %S/Inputs/cxx-rangefor.proftext -o %t.profdata
-// RUN: %clang_cc1 -x c++ %s -triple %itanium_abi_triple -main-file-name cxx-rangefor.cpp -std=c++11 -o - -emit-llvm -fprofile-instr-use=%t.profdata > %tuse
+// RUN: %clang_cc1 -x c++ %s -triple %itanium_abi_triple -main-file-name cxx-rangefor.cpp -std=c++11 -o - -emit-llvm -fprofile-instrument-use-path=%t.profdata > %tuse
 // RUN: FileCheck --input-file=%tuse -check-prefix=CHECK -check-prefix=PGOUSE %s
 
 // PGOGEN: @[[RFC:__profc__Z9range_forv]] = private global [5 x i64] zeroinitializer
diff --git a/test/Profile/cxx-structors.cpp b/test/Profile/cxx-structors.cpp
index 183df9225783c..73562d39c973f 100644
--- a/test/Profile/cxx-structors.cpp
+++ b/test/Profile/cxx-structors.cpp
@@ -1,6 +1,6 @@
 // Tests for instrumentation of C++ constructors and destructors.
 //
-// RUN: %clang_cc1 -triple x86_64-apple-macosx10.11.0 -x c++ %s -o - -emit-llvm -fprofile-instr-generate | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-apple-macosx10.11.0 -x c++ %s -o - -emit-llvm -fprofile-instrument=clang | FileCheck %s
 
 struct Foo {
   Foo() {}
diff --git a/test/Profile/cxx-templates.cpp b/test/Profile/cxx-templates.cpp
index c24bae39c3af0..1cec605e5066b 100644
--- a/test/Profile/cxx-templates.cpp
+++ b/test/Profile/cxx-templates.cpp
@@ -1,12 +1,12 @@
 // Tests for instrumentation of templated code. Each instantiation of a template
 // should be instrumented separately.
 
-// RUN: %clang_cc1 -x c++ %s -triple %itanium_abi_triple -main-file-name cxx-templates.cpp -std=c++11 -o - -emit-llvm -fprofile-instr-generate > %tgen
+// RUN: %clang_cc1 -x c++ %s -triple %itanium_abi_triple -main-file-name cxx-templates.cpp -std=c++11 -o - -emit-llvm -fprofile-instrument=clang > %tgen
 // RUN: FileCheck --input-file=%tgen -check-prefix=T0GEN -check-prefix=ALL %s
 // RUN: FileCheck --input-file=%tgen -check-prefix=T100GEN -check-prefix=ALL %s
 
 // RUN: llvm-profdata merge %S/Inputs/cxx-templates.proftext -o %t.profdata
-// RUN: %clang_cc1 -x c++ %s -triple %itanium_abi_triple -main-file-name cxx-templates.cpp -std=c++11 -o - -emit-llvm -fprofile-instr-use=%t.profdata > %tuse
+// RUN: %clang_cc1 -x c++ %s -triple %itanium_abi_triple -main-file-name cxx-templates.cpp -std=c++11 -o - -emit-llvm -fprofile-instrument-use-path=%t.profdata > %tuse
 // RUN: FileCheck --input-file=%tuse -check-prefix=T0USE -check-prefix=ALL %s
 // RUN: FileCheck --input-file=%tuse -check-prefix=T100USE -check-prefix=ALL %s
 
diff --git a/test/Profile/cxx-throws.cpp b/test/Profile/cxx-throws.cpp
index 6b33416cdbb44..ef56c8b28801e 100644
--- a/test/Profile/cxx-throws.cpp
+++ b/test/Profile/cxx-throws.cpp
@@ -3,12 +3,12 @@
 // FIXME: Don't seek bb labels, like "if.else"
 // REQUIRES: asserts
 
-// RUN: %clangxx %s -o - -emit-llvm -S -fprofile-instr-generate -fexceptions -target %itanium_abi_triple | FileCheck -check-prefix=PGOGEN %s
-// RUN: %clangxx %s -o - -emit-llvm -S -fprofile-instr-generate -fexceptions -target %itanium_abi_triple | FileCheck -check-prefix=PGOGEN-EXC %s
+// RUN: %clang_cc1 %s -o - -emit-llvm -fprofile-instrument=clang -fexceptions -fcxx-exceptions -triple %itanium_abi_triple | FileCheck -check-prefix=PGOGEN %s
+// RUN: %clang_cc1 %s -o - -emit-llvm -fprofile-instrument=clang -fexceptions -fcxx-exceptions -triple %itanium_abi_triple | FileCheck -check-prefix=PGOGEN-EXC %s
 
 // RUN: llvm-profdata merge %S/Inputs/cxx-throws.proftext -o %t.profdata
-// RUN: %clang %s -o - -emit-llvm -S -fprofile-instr-use=%t.profdata -fcxx-exceptions -target %itanium_abi_triple | FileCheck -check-prefix=PGOUSE %s
-// RUN: %clang %s -o - -emit-llvm -S -fprofile-instr-use=%t.profdata -fcxx-exceptions -target %itanium_abi_triple | FileCheck -check-prefix=PGOUSE-EXC %s
+// RUN: %clang_cc1 %s -o - -emit-llvm -fprofile-instrument-use-path=%t.profdata -fexceptions -fcxx-exceptions -triple %itanium_abi_triple | FileCheck -check-prefix=PGOUSE %s
+// RUN: %clang_cc1 %s -o - -emit-llvm -fprofile-instrument-use-path=%t.profdata -fexceptions -fcxx-exceptions -triple %itanium_abi_triple | FileCheck -check-prefix=PGOUSE-EXC %s
 
 // PGOGEN: @[[THC:__profc__Z6throwsv]] = private global [9 x i64] zeroinitializer
 // PGOGEN-EXC: @[[THC:__profc__Z6throwsv]] = private global [9 x i64] zeroinitializer
diff --git a/test/Profile/cxx-virtual-destructor-calls.cpp b/test/Profile/cxx-virtual-destructor-calls.cpp
index 4affd2618e21a..cc3df68d3569a 100644
--- a/test/Profile/cxx-virtual-destructor-calls.cpp
+++ b/test/Profile/cxx-virtual-destructor-calls.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple %itanium_abi_triple -emit-llvm -main-file-name cxx-virtual-destructor-calls.cpp %s -o - -fprofile-instr-generate | FileCheck %s
+// RUN: %clang_cc1 -triple %itanium_abi_triple -emit-llvm -main-file-name cxx-virtual-destructor-calls.cpp %s -o - -fprofile-instrument=clang | FileCheck %s
 
 struct Member {
   ~Member();
diff --git a/test/Profile/def-assignop.cpp b/test/Profile/def-assignop.cpp
new file mode 100644
index 0000000000000..2d453364a55d4
--- /dev/null
+++ b/test/Profile/def-assignop.cpp
@@ -0,0 +1,31 @@
+// RUN: %clang_cc1 -x c++ -std=c++11 %s -triple x86_64-unknown-linux-gnu -main-file-name def-assignop.cpp -o - -emit-llvm -fprofile-instrument=clang | FileCheck --check-prefix=PGOGEN %s
+// RUN: %clang_cc1 -x c++ -std=c++11 %s -triple x86_64-unknown-linux-gnu -main-file-name def-assignop.cpp -o - -emit-llvm -fprofile-instrument=clang -fcoverage-mapping | FileCheck --check-prefix=COVMAP %s
+
+struct B {
+  B& operator=(const B &b);
+  B& operator=(const B &&b);
+};
+
+struct A {
+  A &operator=(const A &) = default;
+  // PGOGEN: define {{.*}}@_ZN1AaSERKS_(
+  // PGOGEN: %pgocount = load {{.*}} @__profc__ZN1AaSERKS_
+  // PGOGEN: {{.*}}add{{.*}}%pgocount, 1
+  // PGOGEN: store{{.*}}@__profc__ZN1AaSERKS_
+  A &operator=(A &&) = default;
+  // PGOGEN: define {{.*}}@_ZN1AaSEOS_
+  // PGOGEN: %pgocount = load {{.*}} @__profc__ZN1AaSEOS_
+  // PGOGEN: {{.*}}add{{.*}}%pgocount, 1
+  // PGOGEN: store{{.*}}@__profc__ZN1AaSEOS_
+
+  // Check that coverage mapping includes 6 function records including the
+  // defaulted copy and move operators: A::operator=
+  // COVMAP: @__llvm_coverage_mapping = {{.*}} { { i32, i32, i32, i32 }, [3 x <{{.*}}>],
+  B b;
+};
+
+A a1, a2;
+void foo() {
+  a1 = a2;
+  a2 = static_cast<A &&>(a1);
+}
diff --git a/test/Profile/def-ctors.cpp b/test/Profile/def-ctors.cpp
new file mode 100644
index 0000000000000..1b52d559e0113
--- /dev/null
+++ b/test/Profile/def-ctors.cpp
@@ -0,0 +1,36 @@
+// RUN: %clang_cc1 -x c++ -std=c++11 %s -triple x86_64-unknown-linux-gnu  -main-file-name def-ctors.cpp -o - -emit-llvm -fprofile-instrument=clang |  FileCheck --check-prefix=PGOGEN %s
+
+// RUN: %clang_cc1 -x c++ -std=c++11 %s -triple x86_64-unknown-linux-gnu -main-file-name def-ctors.cpp -o - -emit-llvm -fprofile-instrument=clang -fcoverage-mapping | FileCheck --check-prefix=COVMAP %s
+
+struct Base {
+  int B;
+  Base() : B(2) {}
+  Base(const struct Base &b2) {}
+};
+
+struct Derived : public Base {
+  Derived(const Derived &) = default;
+  // PGOGEN-DAG: define {{.*}}@_ZN7DerivedC2ERKS_
+  // PGOGEN-DAG: %pgocount = load {{.*}} @__profc__ZN7DerivedC2ERKS_
+  // PGOGEN-DAG: {{.*}}add{{.*}}%pgocount, 1
+  // PGOGEN-DAG: store{{.*}}@__profc__ZN7DerivedC2ERKS_
+  Derived() = default;
+  // PGOGEN-DAG: define {{.*}}@_ZN7DerivedC2Ev
+  // PGOGEN-DAG: %pgocount = load {{.*}} @__profc__ZN7DerivedC2Ev
+  // PGOGEN-DAG: {{.*}}add{{.*}}%pgocount, 1
+  // PGOGEN-DAG: store{{.*}}@__profc__ZN7DerivedC2Ev
+
+  // Check that coverage mapping has 6 function records including
+  // the defaulted Derived::Derived(const Derived), and Derived::Derived()
+  // methds.
+  // COVMAP: @__llvm_coverage_mapping = {{.*}} { { i32, i32, i32, i32 }, [5 x
+  // <{{.*}}>],
+};
+
+Derived dd;
+int g;
+int main() {
+  Derived dd2(dd);
+  g = dd2.B;
+  return 0;
+}
diff --git a/test/Profile/def-dtors.cpp b/test/Profile/def-dtors.cpp
new file mode 100644
index 0000000000000..bfa535634d1ef
--- /dev/null
+++ b/test/Profile/def-dtors.cpp
@@ -0,0 +1,30 @@
+// RUN: %clang_cc1 -x c++ -std=c++11 %s -triple x86_64-unknown-linux-gnu -main-file-name def-dtors.cpp -o - -emit-llvm -fprofile-instrument=clang  | FileCheck --check-prefix=PGOGEN %s
+
+// RUN: %clang_cc1 -x c++ -std=c++11 %s -triple x86_64-unknown-linux-gnu -main-file-name def-dtors.cpp -o - -emit-llvm -fprofile-instrument=clang -fcoverage-mapping | FileCheck --check-prefix=COVMAP %s
+
+struct Base {
+  int B;
+  Base(int B_) : B(B_) {}
+  ~Base() {}
+};
+
+struct Derived : public Base {
+  Derived(int K) : Base(K) {}
+  ~Derived() = default;
+  // PGOGEN-LABEL: define {{.*}}@_ZN7DerivedD2Ev
+  // PGOGEN: %pgocount = load {{.*}} @__profc__ZN7DerivedD2Ev
+  // PGOGEN: {{.*}}add{{.*}}%pgocount, 1
+  // PGOGEN: store{{.*}}@__profc__ZN7DerivedD2Ev
+
+  // Check that coverage mapping has 6 function records including
+  // the default destructor in the derived class.
+  // COVMAP: @__llvm_coverage_mapping = {{.*}} { { i32, i32, i32, i32 }, [5 x
+  // <{{.*}}>],
+};
+
+int main() {
+  Derived dd2(10);
+  if (dd2.B != 10)
+    return 1;
+  return 0;
+}
diff --git a/test/Profile/func-entry.c b/test/Profile/func-entry.c
index 1ecae601a4d97..430ccb3806c4d 100644
--- a/test/Profile/func-entry.c
+++ b/test/Profile/func-entry.c
@@ -1,7 +1,7 @@
 // Test that function entry counts are set correctly.
 
 // RUN: llvm-profdata merge %S/Inputs/func-entry.proftext -o %t.profdata
-// RUN: %clang %s -o - -mllvm -disable-llvm-optzns -emit-llvm -S -fprofile-instr-use=%t.profdata | FileCheck %s
+// RUN: %clang_cc1 %s -o - -disable-llvm-optzns -emit-llvm -fprofile-instrument-use-path=%t.profdata | FileCheck %s
 
 void foo(void);
 
diff --git a/test/Profile/gcc-flag-compatibility.c b/test/Profile/gcc-flag-compatibility.c
index 679a72221c614..622b782c0cb37 100644
--- a/test/Profile/gcc-flag-compatibility.c
+++ b/test/Profile/gcc-flag-compatibility.c
@@ -7,10 +7,11 @@
 // -fprofile-use=<dir>        Uses the profile file <dir>/default.profdata
 // -fprofile-use=<dir>/file   Uses the profile file <dir>/file
 
-// Check that -fprofile-generate uses the runtime default profile file.
+// FIXME: IRPGO shouldn't use the override API when no profraw name is given.
+// Check that -fprofile-generate overrides the default profraw.
 // RUN: %clang %s -c -S -o - -emit-llvm -fprofile-generate | FileCheck -check-prefix=PROFILE-GEN %s
-// PROFILE-GEN-NOT: call void @__llvm_profile_override_default_filename
-// PROFILE-GEN-NOT: declare void @__llvm_profile_override_default_filename(i8*)
+// PROFILE-GEN: call void @__llvm_profile_override_default_filename
+// PROFILE-GEN: declare void @__llvm_profile_override_default_filename(i8*)
 
 // Check that -fprofile-generate=/path/to generates /path/to/default.profraw
 // RUN: %clang %s -c -S -o - -emit-llvm -fprofile-generate=/path/to | FileCheck -check-prefix=PROFILE-GEN-EQ %s
diff --git a/test/Profile/max-function-count.c b/test/Profile/max-function-count.c
deleted file mode 100644
index 39490d7b276ec..0000000000000
--- a/test/Profile/max-function-count.c
+++ /dev/null
@@ -1,24 +0,0 @@
-// Test that maximum function counts are set correctly.
-
-// RUN: llvm-profdata merge %S/Inputs/max-function-count.proftext -o %t.profdata
-// RUN: %clang %s -o - -mllvm -disable-llvm-optzns -emit-llvm -S -fprofile-instr-use=%t.profdata | FileCheck %s
-//
-int begin(int i) {
-  if (i)
-    return 0;
-  return 1;
-}
-
-int end(int i) {
-  if (i)
-    return 0;
-  return 1;
-}
-
-int main(int argc, const char *argv[]) {
-  begin(0);
-  end(1);
-  end(1);
-  return 0;
-}
-// CHECK: !{{[0-9]+}} = !{i32 1, !"MaxFunctionCount", i32 2}
diff --git a/test/Profile/objc-general.m b/test/Profile/objc-general.m
index b6435af78b7b1..b679627a48e8a 100644
--- a/test/Profile/objc-general.m
+++ b/test/Profile/objc-general.m
@@ -1,9 +1,9 @@
 // Test instrumentation of general constructs in objective C.
 
-// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name objc-general.m %s -o - -emit-llvm -fblocks -fprofile-instr-generate | FileCheck -check-prefix=PGOGEN %s
+// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name objc-general.m %s -o - -emit-llvm -fblocks -fprofile-instrument=clang | FileCheck -check-prefix=PGOGEN %s
 
 // RUN: llvm-profdata merge %S/Inputs/objc-general.proftext -o %t.profdata
-// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name objc-general.m %s -o - -emit-llvm -fblocks -fprofile-instr-use=%t.profdata | FileCheck -check-prefix=PGOUSE %s
+// RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name objc-general.m %s -o - -emit-llvm -fblocks -fprofile-instrument-use-path=%t.profdata | FileCheck -check-prefix=PGOUSE %s
 
 #ifdef HAVE_FOUNDATION
 
diff --git a/test/Profile/profile-does-not-exist.c b/test/Profile/profile-does-not-exist.c
index d45981faffed8..5725f76eb80cd 100644
--- a/test/Profile/profile-does-not-exist.c
+++ b/test/Profile/profile-does-not-exist.c
@@ -1,4 +1,4 @@
-// RUN: not %clang_cc1 -emit-llvm %s -o - -fprofile-instr-use=%t.nonexistent.profdata 2>&1 | FileCheck %s
+// RUN: not %clang_cc1 -emit-llvm %s -o - -fprofile-instrument-use-path=%t.nonexistent.profdata 2>&1 | FileCheck %s
 
 // CHECK: error: Could not read profile {{.*}}.nonexistent.profdata:
 // CHECK-NOT: Assertion failed
diff --git a/test/Profile/profile-summary.c b/test/Profile/profile-summary.c
new file mode 100644
index 0000000000000..dc3112c9d672f
--- /dev/null
+++ b/test/Profile/profile-summary.c
@@ -0,0 +1,25 @@
+// Test that profile summary is set correctly.
+
+// RUN: llvm-profdata merge %S/Inputs/max-function-count.proftext -o %t.profdata
+// RUN: %clang_cc1 %s -o - -disable-llvm-optzns -emit-llvm -fprofile-instrument-use-path=%t.profdata | FileCheck %s
+//
+int begin(int i) {
+  if (i)
+    return 0;
+  return 1;
+}
+
+int end(int i) {
+  if (i)
+    return 0;
+  return 1;
+}
+
+int main(int argc, const char *argv[]) {
+  begin(0);
+  end(1);
+  end(1);
+  return 0;
+}
+// CHECK: {{![0-9]+}} = !{i32 1, !"ProfileSummary", {{![0-9]+}}}
+// CHECK: {{![0-9]+}} = !{!"DetailedSummary", {{![0-9]+}}}
diff --git a/test/Sema/128bitfloat.cpp b/test/Sema/128bitfloat.cpp
index cb76dac96001d..2449cb6b6596f 100644
--- a/test/Sema/128bitfloat.cpp
+++ b/test/Sema/128bitfloat.cpp
@@ -1,24 +1,35 @@
 // RUN: %clang_cc1 -fsyntax-only -verify -std=gnu++11 %s
 // RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
 
+#ifdef __FLOAT128__
+__float128 f;
+template<typename> struct __is_floating_point_helper {};
+template<> struct __is_floating_point_helper<__float128> {};
+int g(int x, __float128 *y) {
+  return x + *y;
+}
+
+// expected-no-diagnostics
+#else
 #if !defined(__STRICT_ANSI__)
-__float128 f;  // expected-error {{support for type '__float128' is not yet implemented}}
+__float128 f;  // expected-error {{__float128 is not supported on this target}}
 // But this should work:
 template<typename> struct __is_floating_point_helper {};
-template<> struct __is_floating_point_helper<__float128> {};
+template<> struct __is_floating_point_helper<__float128> {};  // expected-error {{__float128 is not supported on this target}}
 
 // FIXME: This could have a better diag.
-void g(int x, __float128 *y) {
-  x + *y;  // expected-error {{invalid operands to binary expression ('int' and '__float128')}}
+int g(int x, __float128 *y) {  // expected-error {{__float128 is not supported on this target}}
+  return x + *y;
 }
 
 #else
-__float128 f;  // expected-error {{unknown type name '__float128'}}
+__float128 f;  // expected-error {{__float128 is not supported on this target}}
 template<typename> struct __is_floating_point_helper {};
-template<> struct __is_floating_point_helper<__float128> {};  // expected-error {{use of undeclared identifier '__float128'}}
+template<> struct __is_floating_point_helper<__float128> {};  // expected-error {{__float128 is not supported on this target}}
 
-void g(int x, __float128 *y) {  // expected-error {{unknown type name '__float128'}}
-  x + *y;
+int g(int x, __float128 *y) {  // expected-error {{__float128 is not supported on this target}}
+  return x + *y;
 }
 
 #endif
+#endif
diff --git a/test/Sema/MicrosoftExtensions.c b/test/Sema/MicrosoftExtensions.c
index e7032305fc07f..62e5285970a7e 100644
--- a/test/Sema/MicrosoftExtensions.c
+++ b/test/Sema/MicrosoftExtensions.c
@@ -6,6 +6,12 @@ struct A
    int a[];  /* expected-warning {{flexible array member 'a' in otherwise empty struct is a Microsoft extension}} */
 };
 
+struct PR28407
+{
+  int : 1;
+  int a[]; /* expected-warning {{flexible array member 'a' in otherwise empty struct is a Microsoft extension}} */
+};
+
 struct C {
    int l;
    union {
@@ -170,3 +176,13 @@ void myprintf(const char *f, ...) {
     __va_start(ap, f); // expected-warning {{incompatible pointer types passing 'my_va_list'}}
   }
 }
+
+// __unaligned handling
+void test_unaligned() {
+  __unaligned int *p1 = 0;
+  int *p2 = p1; // expected-warning {{initializing 'int *' with an expression of type '__unaligned int *' discards qualifiers}}
+  __unaligned int *p3 = p2;
+}
+
+void test_unaligned2(int x[__unaligned 4]) {}
+
diff --git a/test/Sema/aarch64-special-register.c b/test/Sema/aarch64-special-register.c
index 40d4033967f87..a4fb92b5235b3 100644
--- a/test/Sema/aarch64-special-register.c
+++ b/test/Sema/aarch64-special-register.c
@@ -13,7 +13,7 @@ void wsrp_1(void *v) {
 }
 
 void wsr64_1(unsigned long v) {
-  __builtin_arm_wsr64("sysreg", v); //expected-error {{invalid special register for builtin}}
+  __builtin_arm_wsr64("sysreg", v);
 }
 
 unsigned rsr_1() {
@@ -25,7 +25,7 @@ void *rsrp_1() {
 }
 
 unsigned long rsr64_1() {
-  return __builtin_arm_rsr64("sysreg"); //expected-error {{invalid special register for builtin}}
+  return __builtin_arm_rsr64("sysreg");
 }
 
 void wsr_2(unsigned v) {
diff --git a/test/Sema/address_spaces.c b/test/Sema/address_spaces.c
index 1922c8ae4f6fd..3fe93155451fd 100644
--- a/test/Sema/address_spaces.c
+++ b/test/Sema/address_spaces.c
@@ -20,7 +20,7 @@ void foo(_AS3 float *a,
   _AS1 int arrarr[5][5]; // expected-error {{automatic variable qualified with an address space}}
 
   __attribute__((address_space(-1))) int *_boundsA; // expected-error {{address space is negative}}
-  __attribute__((address_space(0xFFFFFF))) int *_boundsB;
+  __attribute__((address_space(0x7FFFFF))) int *_boundsB;
   __attribute__((address_space(0x1000000))) int *_boundsC; // expected-error {{address space is larger than the maximum supported}}
   // chosen specifically to overflow 32 bits and come out reasonable
   __attribute__((address_space(4294967500))) int *_boundsD; // expected-error {{address space is larger than the maximum supported}}
@@ -71,4 +71,4 @@ __attribute__((address_space("12"))) int *i; // expected-error {{'address_space'
 // Clang extension doesn't forbid operations on pointers to different address spaces.
 char* cmp(_AS1 char *x,  _AS2 char *y) {
   return x < y ? x : y; // expected-warning {{pointer type mismatch ('__attribute__((address_space(1))) char *' and '__attribute__((address_space(2))) char *')}}
-}
-\ No newline at end of file
+}
diff --git a/test/Sema/arm-no-fp16.c b/test/Sema/arm-no-fp16.c
new file mode 100644
index 0000000000000..6443d83198ce8
--- /dev/null
+++ b/test/Sema/arm-no-fp16.c
@@ -0,0 +1,11 @@
+// RUN: %clang_cc1 -triple thumbv7-none-eabi %s -target-feature +neon -target-feature -fp16 -fsyntax-only -verify
+
+#include <arm_neon.h>
+
+float16x4_t test_vcvt_f16_f32(float32x4_t a) {
+  return vcvt_f16_f32(a); // expected-warning{{implicit declaration of function 'vcvt_f16_f32'}}  expected-error{{returning 'int' from a function with incompatible result type 'float16x4_t'}}
+}
+
+float32x4_t test_vcvt_f32_f16(float16x4_t a) {
+  return vcvt_f32_f16(a); // expected-warning{{implicit declaration of function 'vcvt_f32_f16'}} expected-error{{returning 'int' from a function with incompatible result type 'float32x4_t'}}
+}
diff --git a/test/Sema/arm64-neon-header.c b/test/Sema/arm64-neon-header.c
new file mode 100644
index 0000000000000..0ae082113a575
--- /dev/null
+++ b/test/Sema/arm64-neon-header.c
@@ -0,0 +1,7 @@
+// RUN: %clang_cc1 -triple arm64-apple-darwin -target-feature +neon -Wvector-conversion -fsyntax-only -ffreestanding -verify %s
+
+#include <arm_neon.h>
+
+int16x8_t foo(int8x8_t p0, int16x8_t p1) {
+  return vqmovun_high_s16(p0, p1); // expected-warning {{incompatible vector types returning 'uint8x16_t'}}
+}
diff --git a/test/Sema/arm_vfma.c b/test/Sema/arm_vfma.c
new file mode 100644
index 0000000000000..8c08b4d8b9e7d
--- /dev/null
+++ b/test/Sema/arm_vfma.c
@@ -0,0 +1,12 @@
+// RUN: %clang_cc1 -triple thumbv7-none-eabi -target-feature +neon -target-feature +vfp4 -fsyntax-only -verify %s
+#include <arm_neon.h>
+
+// expected-no-diagnostics
+
+void func(float32x2_t v2f32, float32x4_t v4f32) {
+  vfma_f32(v2f32, v2f32, v2f32);
+  vfmaq_f32(v4f32, v4f32, v4f32);
+
+  vfms_f32(v2f32, v2f32, v2f32);
+  vfmsq_f32(v4f32, v4f32, v4f32);
+}
diff --git a/test/Sema/asm.c b/test/Sema/asm.c
index d29b136a117ac..69c33f7ccf247 100644
--- a/test/Sema/asm.c
+++ b/test/Sema/asm.c
@@ -25,7 +25,7 @@ void clobbers() {
   asm ("nop" : : : "0", "%0", "#0");
   asm ("nop" : : : "foo"); // expected-error {{unknown register name 'foo' in asm}}
   asm ("nop" : : : "52");
-  asm ("nop" : : : "104"); // expected-error {{unknown register name '104' in asm}}
+  asm ("nop" : : : "204"); // expected-error {{unknown register name '204' in asm}}
   asm ("nop" : : : "-1"); // expected-error {{unknown register name '-1' in asm}}
   asm ("nop" : : : "+1"); // expected-error {{unknown register name '+1' in asm}}
 }
diff --git a/test/Sema/ast-print.c b/test/Sema/ast-print.c
index b4d76844fef78..4c0aef5b2f3fc 100644
--- a/test/Sema/ast-print.c
+++ b/test/Sema/ast-print.c
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 %s -ast-print | FileCheck %s
+// RUN: %clang_cc1 %s -ast-print | %clang_cc1 -fsyntax-only -
 
 typedef void func_typedef();
 func_typedef xxx;
@@ -39,6 +40,7 @@ int rvarr(int n, int a[restrict static n]) {
   return a[2];
 }
 
+// CHECK: typedef struct {
 typedef struct {
   int f;
 } T __attribute__ ((__aligned__));
@@ -53,3 +55,13 @@ struct pair_t {
 
 // CHECK: struct pair_t p = {a: 3, .b = 4};
 struct pair_t p = {a: 3, .b = 4};
+
+void initializers() {
+  // CHECK: int *x = ((void *)0), *y = ((void *)0);
+  int *x = ((void *)0), *y = ((void *)0);
+  struct Z{};
+  struct {
+    struct Z z;
+  // CHECK: } z = {(struct Z){}};
+  } z = {(struct Z){}};
+}
diff --git a/test/Sema/atomic-ops.c b/test/Sema/atomic-ops.c
index 9a37ec2a38763..05836214247c8 100644
--- a/test/Sema/atomic-ops.c
+++ b/test/Sema/atomic-ops.c
@@ -121,7 +121,10 @@ void f(_Atomic(int) *i, const _Atomic(int) *ci,
   __atomic_load(I, *P, memory_order_relaxed, 42); // expected-error {{too many arguments}}
   (int)__atomic_load(I, I, memory_order_seq_cst); // expected-error {{operand of type 'void'}}
   __atomic_load(s1, s2, memory_order_acquire);
-  (void)__atomic_load(I, CI, memory_order_relaxed); // expected-warning {{passing 'const int *' to parameter of type 'int *' discards qualifiers}}
+  __atomic_load(CI, I, memory_order_relaxed);
+  __atomic_load(I, CI, memory_order_relaxed); // expected-warning {{passing 'const int *' to parameter of type 'int *' discards qualifiers}}
+  __atomic_load(CI, CI, memory_order_relaxed); // expected-warning {{passing 'const int *' to parameter of type 'int *' discards qualifiers}}
+
   __c11_atomic_store(i, 1, memory_order_seq_cst);
   __c11_atomic_store(p, 1, memory_order_seq_cst); // expected-warning {{incompatible integer to pointer conversion}}
   (int)__c11_atomic_store(d, 1, memory_order_seq_cst); // expected-error {{operand of type 'void'}}
diff --git a/test/Sema/attr-alias-elf.c b/test/Sema/attr-alias-elf.c
index f14514dccd2c6..e56f23e4fa51c 100644
--- a/test/Sema/attr-alias-elf.c
+++ b/test/Sema/attr-alias-elf.c
@@ -55,7 +55,7 @@ typedef int b4;
 
 void test2_bar() {}
 void test2_foo() __attribute__((weak, alias("test2_bar")));
-void test2_zed() __attribute__((alias("test2_foo"))); // expected-warning {{alias will always resolve to test2_bar even if weak definition of alias test2_foo is overridden}}
+void test2_zed() __attribute__((alias("test2_foo"))); // expected-warning {{alias will always resolve to test2_bar even if weak definition of test2_foo is overridden}}
 
 void test3_bar() { }
 void test3_foo() __attribute__((section("test"))); // expected-warning {{alias will not be in section 'test' but in the same section as the aliasee}}
diff --git a/test/Sema/attr-aligned.c b/test/Sema/attr-aligned.c
index 0a2698ec91afd..b8d2fc6863d27 100644
--- a/test/Sema/attr-aligned.c
+++ b/test/Sema/attr-aligned.c
@@ -3,6 +3,9 @@
 int x __attribute__((aligned(3))); // expected-error {{requested alignment is not a power of 2}}
 int y __attribute__((aligned(1 << 29))); // expected-error {{requested alignment must be 268435456 bytes or smaller}}
 
+// PR26444
+int y __attribute__((aligned(1 << 28)));
+
 // PR3254
 short g0[3] __attribute__((aligned));
 short g0_chk[__alignof__(g0) == 16 ? 1 : -1]; 
diff --git a/test/Sema/attr-availability-macosx.c b/test/Sema/attr-availability-macosx.c
index 38f149ba62aed..f422811708fa8 100644
--- a/test/Sema/attr-availability-macosx.c
+++ b/test/Sema/attr-availability-macosx.c
@@ -1,21 +1,37 @@
 // RUN: %clang_cc1 "-triple" "x86_64-apple-darwin9.0.0" -fsyntax-only -verify %s
 
+#if !__has_feature(attribute_availability_with_strict)
+#error "Missing __has_feature"
+#endif
+
 void f0(int) __attribute__((availability(macosx,introduced=10.4,deprecated=10.6)));
 void f1(int) __attribute__((availability(macosx,introduced=10.5)));
 void f2(int) __attribute__((availability(macosx,introduced=10.4,deprecated=10.5))); // expected-note {{'f2' has been explicitly marked deprecated here}}
 void f3(int) __attribute__((availability(macosx,introduced=10.6)));
 void f4(int) __attribute__((availability(macosx,introduced=10.1,deprecated=10.3,obsoleted=10.5), availability(ios,introduced=2.0,deprecated=3.0))); // expected-note{{explicitly marked unavailable}}
 void f5(int) __attribute__((availability(ios,introduced=3.2), availability(macosx,unavailable))); // expected-note{{'f5' has been explicitly marked unavailable here}}
+void f6(int) __attribute__((availability(macosx,strict,introduced=10.6))); //expected-note{{'f6' has been explicitly marked unavailable here}}
 
 void test() {
   f0(0);
   f1(0);
-  f2(0); // expected-warning{{'f2' is deprecated: first deprecated in OS X 10.5}}
+  f2(0); // expected-warning{{'f2' is deprecated: first deprecated in macOS 10.5}}
   f3(0);
-  f4(0); // expected-error{{f4' is unavailable: obsoleted in OS X 10.5}}
-  f5(0); // expected-error{{'f5' is unavailable: not available on OS X}}
+  f4(0); // expected-error{{f4' is unavailable: obsoleted in macOS 10.5}}
+  f5(0); // expected-error{{'f5' is unavailable: not available on macOS}}
+  f6(0); // expected-error{{'f6' is unavailable: introduced in macOS 10.6}}
 }
 
+struct __attribute__((availability(macosx,strict,introduced=10.6)))
+  not_yet_introduced_struct; // \
+    expected-note{{'not_yet_introduced_struct' has been explicitly marked unavailable here}}
+
+void uses_not_introduced_struct(struct not_yet_introduced_struct *); // \
+    expected-error{{'not_yet_introduced_struct' is unavailable: introduced in macOS 10.6}}
+
+__attribute__((availability(macosx,strict,introduced=10.6)))
+void uses_not_introduced_struct_same_availability(struct not_yet_introduced_struct *);
+
 // rdar://10535640
 
 enum {
@@ -29,3 +45,14 @@ enum {
 enum __attribute__((availability(macosx,introduced=8.0,deprecated=9.0))) {
     bar1 = foo
 };
+
+// Make sure the note is on the declaration with the actual availability attributes.
+struct __attribute__((availability(macosx,strict,introduced=10.9))) type_info // \
+    expected-note{{'type_info' has been explicitly marked unavailable here}}
+{
+};
+struct type_info;
+int test2() {
+  struct type_info *t; // expected-error{{'type_info' is unavailable: introduced in macOS 10.9}}
+  return 0;
+}
diff --git a/test/Sema/attr-availability-tvos.c b/test/Sema/attr-availability-tvos.c
index b60fdb0d19cca..642246425b373 100644
--- a/test/Sema/attr-availability-tvos.c
+++ b/test/Sema/attr-availability-tvos.c
@@ -27,6 +27,12 @@ void test_transcribed_availability() {
   f9(0);
 }
 
+__attribute__((availability(ios,introduced=9_0,deprecated=9_0,message="" ))) // expected-note{{previous attribute is here}} \
+  // expected-note{{previous attribute is here}}
+__attribute__((availability(ios,introduced=7_0))) // expected-warning{{availability does not match previous declaration}} \
+  // expected-warning{{availability does not match previous declaration}}
+void f10(int);
+
 // Test tvOS specific attributes.
 void f0_tvos(int) __attribute__((availability(tvos,introduced=2.0,deprecated=2.1))); // expected-note {{'f0_tvos' has been explicitly marked deprecated here}}
 void f1_tvos(int) __attribute__((availability(tvos,introduced=2.1)));
diff --git a/test/Sema/attr-availability.c b/test/Sema/attr-availability.c
index d003e1e2e363b..8fe6be3804edf 100644
--- a/test/Sema/attr-availability.c
+++ b/test/Sema/attr-availability.c
@@ -2,7 +2,7 @@
 // RUN: %clang_cc1 -D WARN_PARTIAL -Wpartial-availability -triple x86_64-apple-darwin9 -fsyntax-only -fblocks -verify %s
 // 
 
-void f0() __attribute__((availability(macosx,introduced=10.4,deprecated=10.2))); // expected-warning{{feature cannot be deprecated in OS X version 10.2 before it was introduced in version 10.4; attribute ignored}}
+void f0() __attribute__((availability(macosx,introduced=10.4,deprecated=10.2))); // expected-warning{{feature cannot be deprecated in macOS version 10.2 before it was introduced in version 10.4; attribute ignored}}
 void f1() __attribute__((availability(ios,obsoleted=2.1,deprecated=3.0)));  // expected-warning{{feature cannot be obsoleted in iOS version 2.1 before it was deprecated in version 3.0; attribute ignored}}
 void f2() __attribute__((availability(ios,introduced=2.1,deprecated=2.1)));
 
@@ -26,11 +26,11 @@ enum __attribute__((availability(macosx,introduced=10.8))) PartialEnum {
 };
 
 void test_10095131() {
-  ATSFontGetName("Hello"); // expected-warning {{'ATSFontGetName' is deprecated: first deprecated in OS X 9.0 - use CTFontCopyFullName}}
-  ATSFontGetPostScriptName(100); // expected-error {{'ATSFontGetPostScriptName' is unavailable: obsoleted in OS X 9.0 - use ATSFontGetFullPostScriptName}}
+  ATSFontGetName("Hello"); // expected-warning {{'ATSFontGetName' is deprecated: first deprecated in macOS 9.0 - use CTFontCopyFullName}}
+  ATSFontGetPostScriptName(100); // expected-error {{'ATSFontGetPostScriptName' is unavailable: obsoleted in macOS 9.0 - use ATSFontGetFullPostScriptName}}
 
 #if defined(WARN_PARTIAL)
-  // expected-warning@+2 {{is partial: introduced in OS X 10.8}} expected-note@+2 {{explicitly redeclare 'PartiallyAvailable' to silence this warning}}
+  // expected-warning@+2 {{is partial: introduced in macOS 10.8}} expected-note@+2 {{explicitly redeclare 'PartiallyAvailable' to silence this warning}}
 #endif
   PartiallyAvailable();
 }
diff --git a/test/Sema/attr-deprecated.c b/test/Sema/attr-deprecated.c
index 2e3e722942a0f..8566a0e94362b 100644
--- a/test/Sema/attr-deprecated.c
+++ b/test/Sema/attr-deprecated.c
@@ -122,5 +122,10 @@ struct test22 {
 };
 
 typedef int test23_ty __attribute((deprecated));
+// Redefining a typedef is a C11 feature.
+#if __STDC_VERSION__ <= 199901L
+// expected-note@-3 {{'test23_ty' has been explicitly marked deprecated here}}
+#else
 typedef int test23_ty; // expected-note {{'test23_ty' has been explicitly marked deprecated here}}
+#endif
 test23_ty test23_v; // expected-warning {{'test23_ty' is deprecated}}
diff --git a/test/Sema/attr-ifunc.c b/test/Sema/attr-ifunc.c
new file mode 100644
index 0000000000000..d177b7168488f
--- /dev/null
+++ b/test/Sema/attr-ifunc.c
@@ -0,0 +1,43 @@
+// RUN: %clang_cc1 -triple x86_64-windows -fsyntax-only -verify %s
+// RUN: %clang_cc1 -triple x86_64-linux -fsyntax-only -verify -emit-llvm-only -DCHECK_ALIASES %s
+// RUN: %clang_cc1 -triple x86_64-linux -fsyntax-only -verify -emit-llvm-only %s
+
+#if defined(_WIN32)
+void foo() {}
+void bar() __attribute__((ifunc("foo")));
+//expected-warning@-1 {{'ifunc' attribute ignored}}
+
+#else
+#if defined(CHECK_ALIASES)
+void* f1_ifunc();
+void f1() __attribute__((ifunc("f1_ifunc")));
+//expected-error@-1 {{ifunc must point to a defined function}}
+
+void* f2_a() __attribute__((ifunc("f2_b")));
+//expected-error@-1 {{ifunc definition is part of a cycle}}
+void* f2_b() __attribute__((ifunc("f2_a")));
+//expected-error@-1 {{ifunc definition is part of a cycle}}
+
+void* f3_a() __attribute__((ifunc("f3_b")));
+//expected-warning@-1 {{ifunc will always resolve to f3_c even if weak definition of f3_b is overridden}}
+void* f3_b() __attribute__((weak, alias("f3_c")));
+void* f3_c() { return 0; }
+
+void f4_ifunc() {}
+void f4() __attribute__((ifunc("f4_ifunc")));
+//expected-error@-1 {{ifunc resolver function must return a pointer}}
+
+void* f5_ifunc(int i) { return 0; }
+void f5() __attribute__((ifunc("f5_ifunc")));
+//expected-error@-1 {{ifunc resolver function must have no parameters}}
+
+#else
+void f1a() __asm("f1");
+void f1a() {}
+//expected-note@-1 {{previous definition is here}}
+void f1() __attribute__((ifunc("f1_ifunc")));
+//expected-error@-1 {{definition with same mangled name as another definition}}
+void* f1_ifunc() { return 0; }
+
+#endif
+#endif
diff --git a/test/Sema/attr-mode-enums.c b/test/Sema/attr-mode-enums.c
new file mode 100644
index 0000000000000..4b98c3b4472d2
--- /dev/null
+++ b/test/Sema/attr-mode-enums.c
@@ -0,0 +1,51 @@
+// RUN: %clang_cc1 -fsyntax-only -verify %s
+
+// Test checks that 'mode' attribute is handled correctly with enums, i. e. code
+//   1. "typedef enum { A } __attribute__((mode(HI))) T;" is accepted,
+//   2. "enum X __attribute__((mode(QI))) var;" forms a complete integer type.
+//   3. "enum { A } __attribute__((mode(V4SI))) var;" is not accepted (vector mode).
+
+typedef enum { E4 } EnumType;
+
+int main() {
+  // Vector mode are not allowed with enums.
+  typedef enum { E1 } __attribute__((mode(V4QI))) RejectedType1; // expected-error{{mode 'V4QI' is not supported for enumeration types}}
+  // expected-warning@-1{{specifying vector types with the 'mode' attribute is deprecated}}
+  typedef enum __attribute__((mode(V8HI))) { E2 } RejectedType2; // expected-error{{mode 'V8HI' is not supported for enumeration types}}
+                                                                 // expected-warning@-1{{deprecated}}
+  typedef enum E3 __attribute__((mode(V2SI))) RejectedType3; // expected-error{{mode 'V2SI' is not supported for enumeration types}}
+                                                             // expected-warning@-1{{deprecated}}
+  typedef EnumType __attribute__((mode(V4DI))) RejectedType4; // expected-error{{mode 'V4DI' is not supported for enumeration types}}
+                                                              // expected-warning@-1{{deprecated}}
+  EnumType v1 __attribute__((mode(V4QI))); // expected-error{{mode 'V4QI' is not supported for enumeration types}}
+                                           // expected-warning@-1{{deprecated}}
+  enum __attribute__((mode(V8HI))) { E5 } v2; // expected-error{{mode 'V8HI' is not supported for enumeration types}}
+                                              // expected-warning@-1{{deprecated}}
+
+  // Incomplete enums without mode attribute are not allowed.
+  typedef enum Y IncompleteYType; // expected-note{{forward declaration of 'enum Y'}}
+
+  enum X a1; // expected-error{{variable has incomplete type 'enum X'}}
+             // expected-note@-1{{forward declaration of 'enum X'}}
+  IncompleteYType a2; // expected-error{{variable has incomplete type 'IncompleteYType' (aka 'enum Y')}}
+
+  // OK with 'mode' attribute.
+  typedef enum Y __attribute__((mode(QI))) CompleteYType1;
+  typedef enum Y CompleteYType2 __attribute__((mode(HI)));
+  typedef enum { A1, B1 } __attribute__((mode(QI))) CompleteType3;
+  typedef enum { A2, B2 } CompleteType4 __attribute__((mode(QI)));
+  typedef enum __attribute__((mode(QI))) { A3, B3 } CompleteType5;
+
+  enum X __attribute__((mode(QI))) a3;
+  enum X a4 __attribute__((mode(HI)));
+  IncompleteYType __attribute__((mode(QI))) a5;
+  IncompleteYType a6 __attribute__((mode(HI)));
+  CompleteYType1 a7;
+  CompleteYType2 a8;
+  CompleteType3 a9;
+  CompleteType4 a10;
+  CompleteType5 a11;
+  enum __attribute__((mode(QI))) { A4, B4 } a12;
+
+  return 0;
+}
diff --git a/test/Sema/attr-mode.c b/test/Sema/attr-mode.c
index 49e41d210d035..e160d8d4846dd 100644
--- a/test/Sema/attr-mode.c
+++ b/test/Sema/attr-mode.c
@@ -4,6 +4,8 @@
 // RUN:   -verify %s
 // RUN: %clang_cc1 -triple powerpc64-pc-linux-gnu -DTEST_64BIT_PPC64 -fsyntax-only \
 // RUN:   -verify %s
+// RUN: %clang_cc1 -triple x86_64-pc-linux-gnux32 -DTEST_64BIT_X86 -fsyntax-only \
+// RUN:   -verify %s
 
 typedef int i16_1 __attribute((mode(HI)));
 int i16_1_test[sizeof(i16_1) == 2 ? 1 : -1];
@@ -24,6 +26,9 @@ typedef unsigned unwind_word __attribute((mode(unwind_word)));
 
 int **__attribute((mode(QI)))* i32;  // expected-error{{mode attribute}}
 
+__attribute__((mode(QI))) int invalid_func() { return 1; } // expected-error{{'mode' attribute only applies to variables, enums, fields and typedefs}}
+enum invalid_enum { A1 __attribute__((mode(QI))) }; // expected-error{{'mode' attribute only applies to variables, enums, fields and typedefs}}
+
 typedef _Complex double c32 __attribute((mode(SC)));
 int c32_test[sizeof(c32) == 8 ? 1 : -1];
 typedef _Complex float c64 __attribute((mode(DC)));
@@ -60,9 +65,18 @@ void test_int_to_ui32(unsigned int* y) { f_ui32_arg(y); }
 void test_long_to_i64(long long* y) { f_i64_arg(y); }
 void test_long_to_ui64(unsigned long long* y) { f_ui64_arg(y); }
 #elif TEST_64BIT_X86
+#ifdef __ILP32__
+typedef unsigned int gcc_word __attribute__((mode(word)));
+int foo[sizeof(gcc_word) == 8 ? 1 : -1];
+typedef unsigned int gcc_unwind_word __attribute__((mode(unwind_word)));
+int foo[sizeof(gcc_unwind_word) == 8 ? 1 : -1];
+void test_long_to_i64(long long* y) { f_i64_arg(y); }
+void test_long_to_ui64(unsigned long long* y) { f_ui64_arg(y); }
+#else
 void test_long_to_i64(long* y) { f_i64_arg(y); }
 void test_long_to_ui64(unsigned long* y) { f_ui64_arg(y); }
-typedef          float f128ibm __attribute__ ((mode (TF)));     // expected-error{{unsupported machine mode 'TF'}}
+#endif
+typedef          float f128ibm __attribute__ ((mode (TF)));
 #elif TEST_64BIT_PPC64
 typedef          float f128ibm __attribute__ ((mode (TF)));
 typedef _Complex float c128ibm __attribute__ ((mode (TC)));
@@ -73,3 +87,7 @@ void test_TCtype(c128ibm *a) { f_ft128_complex_arg (a); }
 #else
 #error Unknown test architecture.
 #endif
+
+struct S {
+  int n __attribute((mode(HI)));
+};
diff --git a/test/Sema/attr-nodebug.c b/test/Sema/attr-nodebug.c
index 03ec49b850d6f..e7ca58d3ba15a 100644
--- a/test/Sema/attr-nodebug.c
+++ b/test/Sema/attr-nodebug.c
@@ -2,8 +2,8 @@
 
 int a __attribute__((nodebug));
 
-void b() {
-  int b __attribute__((nodebug)); // expected-warning {{'nodebug' only applies to variables with static storage duration and functions}}
+void b(int p __attribute__((nodebug))) { // expected-warning {{'nodebug' attribute only applies to variables and functions}}
+  int b __attribute__((nodebug));
 }
 
 void t1() __attribute__((nodebug));
diff --git a/test/Sema/attr-print.c b/test/Sema/attr-print.c
index b3bdfd72e6457..7ffbbb800d51c 100644
--- a/test/Sema/attr-print.c
+++ b/test/Sema/attr-print.c
@@ -32,3 +32,6 @@ int * __uptr __ptr32 p32_3;
 
 // CHECK: int * __sptr * __ptr32 ppsp32;
 int * __sptr * __ptr32 ppsp32;
+
+// CHECK: __attribute__((availability(macos, strict, introduced=10.6)));
+void f6(int) __attribute__((availability(macosx,strict,introduced=10.6)));
diff --git a/test/Sema/attr-swiftcall.c b/test/Sema/attr-swiftcall.c
new file mode 100644
index 0000000000000..3458167cf2e86
--- /dev/null
+++ b/test/Sema/attr-swiftcall.c
@@ -0,0 +1,30 @@
+// RUN: %clang_cc1 -triple x86_64-apple-darwin10 -fsyntax-only -verify %s
+
+#define SWIFTCALL __attribute__((swiftcall))
+#define INDIRECT_RESULT __attribute__((swift_indirect_result))
+#define ERROR_RESULT __attribute__((swift_error_result))
+#define CONTEXT __attribute__((swift_context))
+
+int notAFunction SWIFTCALL; // expected-warning {{'swiftcall' only applies to function types; type here is 'int'}}
+void variadic(int x, ...) SWIFTCALL; // expected-error {{variadic function cannot use swiftcall calling convention}}
+void unprototyped() SWIFTCALL; // expected-error {{function with no prototype cannot use the swiftcall calling convention}}
+void multiple_ccs(int x) SWIFTCALL __attribute__((vectorcall)); // expected-error {{vectorcall and swiftcall attributes are not compatible}}
+void (*functionPointer)(void) SWIFTCALL;
+
+void indirect_result_nonswift(INDIRECT_RESULT void *out); // expected-error {{'swift_indirect_result' parameter can only be used with swiftcall calling convention}}
+void indirect_result_bad_position(int first, INDIRECT_RESULT void *out) SWIFTCALL; // expected-error {{'swift_indirect_result' parameters must be first parameters of function}}
+void indirect_result_bad_type(INDIRECT_RESULT int out) SWIFTCALL; // expected-error {{'swift_indirect_result' parameter must have pointer type; type here is 'int'}}
+void indirect_result_single(INDIRECT_RESULT void *out) SWIFTCALL;
+void indirect_result_multiple(INDIRECT_RESULT void *out1, INDIRECT_RESULT void *out2) SWIFTCALL;
+
+void error_result_nonswift(ERROR_RESULT void **error); // expected-error {{'swift_error_result' parameter can only be used with swiftcall calling convention}} expected-error{{'swift_error_result' parameter must follow 'swift_context' parameter}}
+void error_result_bad_position(ERROR_RESULT void **error, int last) SWIFTCALL; // expected-error {{'swift_error_result' parameter must be last parameter of function}}
+void error_result_bad_position2(int first, ERROR_RESULT void **error) SWIFTCALL; // expected-error {{'swift_error_result' parameter must follow 'swift_context' parameter}}
+void error_result_bad_type(CONTEXT void *context, ERROR_RESULT int error) SWIFTCALL; // expected-error {{'swift_error_result' parameter must have pointer to unqualified pointer type; type here is 'int'}}
+void error_result_bad_type2(CONTEXT void *context, ERROR_RESULT int *error) SWIFTCALL; // expected-error {{'swift_error_result' parameter must have pointer to unqualified pointer type; type here is 'int *'}}
+void error_result_okay(int a, int b, CONTEXT void *context, ERROR_RESULT void **error) SWIFTCALL;
+
+void context_nonswift(CONTEXT void *context); // expected-error {{'swift_context' parameter can only be used with swiftcall calling convention}}
+void context_bad_position(CONTEXT void *context, int x) SWIFTCALL; // expected-error {{'swift_context' parameter can only be followed by 'swift_error_result' parameter}}
+void context_bad_type(CONTEXT int context) SWIFTCALL; // expected-error {{'swift_context' parameter must have pointer type; type here is 'int'}}
+void context_okay(CONTEXT void *context) SWIFTCALL;
diff --git a/test/Sema/attr-x86-interrupt.c b/test/Sema/attr-x86-interrupt.c
new file mode 100644
index 0000000000000..0785fdfcb1b30
--- /dev/null
+++ b/test/Sema/attr-x86-interrupt.c
@@ -0,0 +1,55 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu  -fsyntax-only -verify %s
+// RUN: %clang_cc1 -triple i386-unknown-linux-gnu  -fsyntax-only -verify %s
+// RUN: %clang_cc1 -triple x86_64-pc-win32  -fsyntax-only -verify %s
+// RUN: %clang_cc1 -triple i386-pc-win32  -fsyntax-only -verify %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnux32  -fsyntax-only -verify %s
+
+struct a {
+  int b;
+};
+
+struct a test __attribute__((interrupt)); // expected-warning {{'interrupt' attribute only applies to non-K&R-style functions}}
+
+__attribute__((interrupt)) int foo1(void) { return 0; }             // expected-error-re {{{{(x86|x86-64)}} 'interrupt' attribute only applies to functions that have a 'void' return type}}
+__attribute__((interrupt)) void foo2(void) {}                       // expected-error-re {{{{(x86|x86-64)}} 'interrupt' attribute only applies to functions that have only a pointer parameter optionally followed by an integer parameter}}
+__attribute__((interrupt)) void foo3(void *a, unsigned b, int c) {} // expected-error-re {{{{(x86|x86-64)}} 'interrupt' attribute only applies to functions that have only a pointer parameter optionally followed by an integer parameter}}
+__attribute__((interrupt)) void foo4(int a) {}                      // expected-error-re {{{{(x86|x86-64)}} 'interrupt' attribute only applies to functions that have a pointer as the first parameter}}
+#ifdef _LP64
+// expected-error-re@+6 {{{{(x86|x86-64)}} 'interrupt' attribute only applies to functions that have a 'unsigned long' type as the second parameter}}
+#elif defined(__x86_64__)
+// expected-error-re@+4 {{{{(x86|x86-64)}} 'interrupt' attribute only applies to functions that have a 'unsigned long long' type as the second parameter}}
+#else
+// expected-error-re@+2 {{{{(x86|x86-64)}} 'interrupt' attribute only applies to functions that have a 'unsigned int' type as the second parameter}}
+#endif
+__attribute__((interrupt)) void foo5(void *a, float b) {}
+#ifdef _LP64
+// expected-error-re@+6 {{{{(x86|x86-64)}} 'interrupt' attribute only applies to functions that have a 'unsigned long' type as the second parameter}}
+#elif defined(__x86_64__)
+// expected-error-re@+4 {{{{(x86|x86-64)}} 'interrupt' attribute only applies to functions that have a 'unsigned long long' type as the second parameter}}
+#else
+// expected-error-re@+2 {{{{(x86|x86-64)}} 'interrupt' attribute only applies to functions that have a 'unsigned int' type as the second parameter}}
+#endif
+__attribute__((interrupt)) void foo6(float *a, int b) {}
+
+#ifdef _LP64
+// expected-error-re@+4 {{{{(x86|x86-64)}} 'interrupt' attribute only applies to functions that have a 'unsigned long' type as the second parameter}}
+#elif defined(__x86_64__)
+// expected-error-re@+2 {{{{(x86|x86-64)}} 'interrupt' attribute only applies to functions that have a 'unsigned long long' type as the second parameter}}
+#endif
+__attribute__((interrupt)) void foo7(int *a, unsigned b) {}
+__attribute__((interrupt)) void foo8(int *a) {}
+
+void g(void (*fp)(int *));
+int main(int argc, char **argv) {
+  void *ptr = (void *)&foo7;
+  g(foo8);
+
+  (void)ptr;
+#ifndef __x86_64__
+  // expected-error@+2 {{interrupt service routine cannot be called directly}}
+#endif
+  foo7((int *)argv, argc);
+  foo8((int *)argv);       // expected-error {{interrupt service routine cannot be called directly}}
+  return 0;
+}
+
diff --git a/test/Sema/bitfield-layout.c b/test/Sema/bitfield-layout.c
index b96b386864172..079720cc9b40b 100644
--- a/test/Sema/bitfield-layout.c
+++ b/test/Sema/bitfield-layout.c
@@ -1,6 +1,8 @@
 // RUN: %clang_cc1 %s -fsyntax-only -verify -triple=i686-apple-darwin9
 // RUN: %clang_cc1 %s -fsyntax-only -verify -triple=arm-linux-gnueabihf
 // RUN: %clang_cc1 %s -fsyntax-only -verify -triple=aarch64-linux-gnu
+// RUN: %clang_cc1 %s -fsyntax-only -verify -triple=x86_64-pc-linux-gnu
+// RUN: %clang_cc1 %s -fsyntax-only -verify -triple=x86_64-scei-ps4
 // expected-no-diagnostics
 #include <stddef.h>
 
@@ -95,9 +97,15 @@ struct g0 {
   char c;
 };
 
+#if defined(__ORBIS__)
+CHECK_SIZE(struct, g0, 16);
+CHECK_ALIGN(struct, g0, 16);
+CHECK_OFFSET(struct, g0, c, 2);
+#else
 CHECK_SIZE(struct, g0, 32);
 CHECK_ALIGN(struct, g0, 16);
 CHECK_OFFSET(struct, g0, c, 17);
+#endif
 
 // Bit-field with explicit align smaller than normal.
 struct g1 {
@@ -108,7 +116,11 @@ struct g1 {
 
 CHECK_SIZE(struct, g1, 4);
 CHECK_ALIGN(struct, g1, 4);
+#if defined(__ORBIS__)
+CHECK_OFFSET(struct, g1, c, 2);
+#else
 CHECK_OFFSET(struct, g1, c, 3);
+#endif
 
 // Same as above but without explicit align.
 struct g2 {
@@ -129,9 +141,14 @@ struct __attribute__((packed)) g3 {
   char c;
 };
 
-CHECK_SIZE(struct, g3, 32);
 CHECK_ALIGN(struct, g3, 16);
+#if defined(__ORBIS__)
+CHECK_SIZE(struct, g3, 16);
+CHECK_OFFSET(struct, g3, c, 2);
+#else
+CHECK_SIZE(struct, g3, 32);
 CHECK_OFFSET(struct, g3, c, 17);
+#endif
 
 struct __attribute__((packed)) g4 {
   char a;
@@ -141,7 +158,11 @@ struct __attribute__((packed)) g4 {
 
 CHECK_SIZE(struct, g4, 4);
 CHECK_ALIGN(struct, g4, 2);
+#if defined(__ORBIS__)
+CHECK_OFFSET(struct, g4, c, 2);
+#else
 CHECK_OFFSET(struct, g4, c, 3);
+#endif
 
 struct g5 {
   char : 1;
@@ -161,28 +182,44 @@ struct g7 {
   char : 1;
   __attribute__((aligned(1))) int n : 25;
 };
+#if defined(__ORBIS__)
+CHECK_SIZE(struct, g7, 4);
+#else
 CHECK_SIZE(struct, g7, 8);
+#endif
 CHECK_ALIGN(struct, g7, 4);
 
 struct __attribute__((packed)) g8 {
   char : 1;
   __attribute__((aligned(1))) int n : 25;
 };
+#if defined(__ORBIS__)
+CHECK_SIZE(struct, g8, 4);
+#else
 CHECK_SIZE(struct, g8, 5);
+#endif
 CHECK_ALIGN(struct, g8, 1);
 
 struct g9 {
   __attribute__((aligned(1))) char a : 2, b : 2, c : 2, d : 2, e : 2;
   int i;
 };
+#if defined(__ORBIS__)
+CHECK_SIZE(struct, g9, 8);
+#else
 CHECK_SIZE(struct, g9, 12);
+#endif
 CHECK_ALIGN(struct, g9, 4);
 
 struct __attribute__((packed)) g10 {
   __attribute__((aligned(1))) char a : 2, b : 2, c : 2, d : 2, e : 2;
   int i;
 };
+#if defined(__ORBIS__)
+CHECK_SIZE(struct, g10, 6);
+#else
 CHECK_SIZE(struct, g10, 9);
+#endif
 CHECK_ALIGN(struct, g10, 1);
 
 struct g11 {
@@ -190,7 +227,7 @@ struct g11 {
   __attribute__((aligned(1))) long long b : 62;
   char c;
 };
-#if defined(__arm__) || defined(__aarch64__)
+#if defined(__arm__) || defined(__aarch64__) || defined(__x86_64__)
 CHECK_SIZE(struct, g11, 24);
 CHECK_ALIGN(struct, g11, 8);
 CHECK_OFFSET(struct, g11, c, 16);
@@ -218,6 +255,10 @@ struct g13 {
 CHECK_SIZE(struct, g13, 16);
 CHECK_ALIGN(struct, g13, 8);
 CHECK_OFFSET(struct, g13, c, 8);
+#elif defined(__x86_64__)
+CHECK_SIZE(struct, g13, 9);
+CHECK_ALIGN(struct, g13, 1);
+CHECK_OFFSET(struct, g13, c, 8);
 #else
 CHECK_SIZE(struct, g13, 5);
 CHECK_ALIGN(struct, g13, 1);
@@ -233,6 +274,10 @@ struct __attribute__((packed)) g14 {
 CHECK_SIZE(struct, g14, 16);
 CHECK_ALIGN(struct, g14, 8);
 CHECK_OFFSET(struct, g14, c, 8);
+#elif defined(__x86_64__)
+CHECK_SIZE(struct, g14, 9);
+CHECK_ALIGN(struct, g14, 1);
+CHECK_OFFSET(struct, g14, c, 8);
 #else
 CHECK_SIZE(struct, g14, 5);
 CHECK_ALIGN(struct, g14, 1);
diff --git a/test/Sema/bitfield-layout_1.c b/test/Sema/bitfield-layout_1.c
new file mode 100644
index 0000000000000..24277c3911495
--- /dev/null
+++ b/test/Sema/bitfield-layout_1.c
@@ -0,0 +1,202 @@
+// RUN: %clang_cc1 %s -fsyntax-only -verify -triple=i686-apple-darwin9
+// RUN: %clang_cc1 %s -fsyntax-only -verify -triple=arm-linux-gnueabihf
+// RUN: %clang_cc1 %s -fsyntax-only -verify -triple=aarch64-linux-gnu
+// RUN: %clang_cc1 %s -fsyntax-only -verify -triple=x86_64-pc-linux-gnu
+// expected-no-diagnostics
+
+#define CHECK_SIZE(name, size) \
+  extern int name##_1[sizeof(name) == size ? 1 : -1];
+
+
+struct  __attribute__((packed)) {
+  int a;
+  int b : 4;
+  int c : 32;
+} s0;
+CHECK_SIZE(s0,9)
+
+#pragma pack (1)
+struct {
+  int a;
+  int b : 4;
+  int c : 32;
+} s1;
+CHECK_SIZE(s1,9)
+
+#pragma pack (2)
+struct {
+  int a;
+  int b : 4;
+  int c : 32;
+} s2;
+CHECK_SIZE(s2,10)
+
+#pragma pack (2)
+struct __attribute__((packed)) {
+  int a;
+  int b : 4;
+  int c : 32;
+} s3;
+CHECK_SIZE(s3,10)
+
+#pragma pack (4)
+struct  __attribute__((packed)) {
+  int a;
+  int b : 4;
+  int c : 32;
+} s4;
+CHECK_SIZE(s4,12)
+
+#pragma pack (16)
+struct {
+  int a;
+  int __attribute__((packed)) b : 4;
+  int __attribute__((packed)) c : 32;
+} s41;
+CHECK_SIZE(s41,12)
+
+#pragma pack (16)
+struct {
+  int a;
+  int b : 4;
+  int c : 32;
+} s5;
+CHECK_SIZE(s5,12)
+
+#pragma pack (1)
+struct  __attribute__((aligned(4))) {
+  int a;
+  int b : 4;
+  int c : 32;
+} s6;
+CHECK_SIZE(s6,12)
+
+#pragma pack (2)
+struct {
+  char a;
+  int b : 4;
+  int c : 32;
+  char s;
+} s7;
+CHECK_SIZE(s7,8)
+
+#pragma pack (1)
+struct {
+  char a;
+  int b : 4;
+  int c : 28;
+  char s;
+} s8;
+CHECK_SIZE(s8,6)
+
+#pragma pack (8)
+struct {
+  char a;
+  int b : 4;
+  int c : 28;
+  char s;
+} s9;
+CHECK_SIZE(s9,8)
+
+#pragma pack (8)
+struct {
+  char a;
+  char s;
+} s10;
+CHECK_SIZE(s10,2)
+
+#pragma pack(4)
+struct {
+  char a;
+  int b : 4;
+  int c : 28;
+  char s1;
+  char s2;
+  char s3;
+} s11;
+CHECK_SIZE(s11,8)
+
+#pragma pack(4)
+struct {
+  short s1;
+  int a1 : 17;
+  int a2 : 17;
+  int a3 : 30;
+  short s2;
+} s12;
+CHECK_SIZE(s12,12)
+
+#pragma pack(4)
+struct {
+  char c1;
+  int i1 : 17;
+  int i2 : 17;
+  int i3 : 30;
+  char c2;
+} s13;
+CHECK_SIZE(s13,12)
+
+#pragma pack(2)
+struct {
+  char a;
+  int s;
+} s14;
+CHECK_SIZE(s14,6)
+
+#pragma pack(4)
+struct {
+  char a;
+  short s;
+} s15;
+CHECK_SIZE(s15,4)
+
+#pragma pack(2)
+struct {
+  char a;
+  int b : 4;
+  int c : 28;
+  char s1;
+  char s2;
+  char s3;
+} s16;
+CHECK_SIZE(s16,8)
+
+#pragma pack (16)
+struct {
+  int __attribute__((packed)) a;
+  int __attribute__((packed)) b : 4;
+  int __attribute__((packed)) c : 32;
+} s17;
+CHECK_SIZE(s17,12)
+
+#pragma pack (16)
+struct {
+  int __attribute__((aligned(8))) a;
+  int __attribute__((aligned(8))) b : 4;
+  int __attribute__((aligned(8))) c : 32;
+} s18;
+CHECK_SIZE(s18,24)
+
+#pragma pack (16)
+struct {
+  int __attribute__((aligned(1))) a;
+  int __attribute__((aligned(1))) b : 4;
+  int __attribute__((aligned(1))) c : 32;
+} s19;
+CHECK_SIZE(s19,12)
+
+#pragma pack (1)
+struct  __attribute__((aligned(8))) {
+  int a;
+  int b : 4;
+  int c : 32;
+} s20;
+CHECK_SIZE(s20,16)
+
+#pragma pack (2)
+struct {
+  int __attribute__((aligned(8))) a;
+  int __attribute__((aligned(8))) b : 4;
+  int __attribute__((aligned(8))) c : 32;
+} s21;
+CHECK_SIZE(s21,10)
diff --git a/test/Sema/builtin-classify-type.c b/test/Sema/builtin-classify-type.c
new file mode 100644
index 0000000000000..376e73d1118c9
--- /dev/null
+++ b/test/Sema/builtin-classify-type.c
@@ -0,0 +1,42 @@
+// RUN: %clang_cc1 -fsyntax-only -verify %s
+
+// expected-no-diagnostics
+
+enum gcc_type_class {
+  no_type_class = -1,
+  void_type_class, integer_type_class, char_type_class,
+  enumeral_type_class, boolean_type_class,
+  pointer_type_class, reference_type_class, offset_type_class,
+  real_type_class, complex_type_class,
+  function_type_class, method_type_class,
+  record_type_class, union_type_class,
+  array_type_class, string_type_class,
+  lang_type_class
+};
+
+void foo() {
+  int i;
+  char c;
+  enum { red, green, blue } enum_obj;
+  int *p;
+  double d;
+  _Complex double cc;
+  extern void f();
+  struct { int a; float b; } s_obj;
+  union { int a; float b; } u_obj;
+  int arr[10];
+
+  int a1[__builtin_classify_type(f()) == void_type_class ? 1 : -1];
+  int a2[__builtin_classify_type(i) == integer_type_class ? 1 : -1];
+  int a3[__builtin_classify_type(c) == integer_type_class ? 1 : -1];
+  int a4[__builtin_classify_type(enum_obj) == integer_type_class ? 1 : -1];
+  int a5[__builtin_classify_type(p) == pointer_type_class ? 1 : -1];
+  int a6[__builtin_classify_type(d) == real_type_class ? 1 : -1];
+  int a7[__builtin_classify_type(cc) == complex_type_class ? 1 : -1];
+  int a8[__builtin_classify_type(f) == pointer_type_class ? 1 : -1];
+  int a0[__builtin_classify_type(s_obj) == record_type_class ? 1 : -1];
+  int a10[__builtin_classify_type(u_obj) == union_type_class ? 1 : -1];
+  int a11[__builtin_classify_type(arr) == pointer_type_class ? 1 : -1];
+  int a12[__builtin_classify_type("abc") == pointer_type_class ? 1 : -1];
+}
+
diff --git a/test/Sema/builtin-longjmp.c b/test/Sema/builtin-longjmp.c
index fdfbcf861ddaf..d80208f82c67b 100644
--- a/test/Sema/builtin-longjmp.c
+++ b/test/Sema/builtin-longjmp.c
@@ -3,6 +3,7 @@
 // RUN: %clang_cc1 -triple x86_64-windows -emit-llvm < %s| FileCheck %s
 // RUN: %clang_cc1 -triple powerpc-unknown-unknown -emit-llvm < %s| FileCheck %s
 // RUN: %clang_cc1 -triple powerpc64-unknown-unknown -emit-llvm < %s| FileCheck %s
+// RUN: %clang_cc1 -triple sparc-eabi-unknown -emit-llvm < %s | FileCheck %s
 
 // RUN: %clang_cc1 -triple aarch64-unknown-unknown -emit-llvm-only -verify %s
 // RUN: %clang_cc1 -triple mips-unknown-unknown -emit-llvm-only -verify %s
diff --git a/test/Sema/builtin-object-size.c b/test/Sema/builtin-object-size.c
index b1bda0652c146..14674c66f3a66 100644
--- a/test/Sema/builtin-object-size.c
+++ b/test/Sema/builtin-object-size.c
@@ -52,3 +52,27 @@ void f6(void)
   __builtin___memccpy_chk (buf, b, '\0', sizeof(b), __builtin_object_size (buf, 0));
   __builtin___memccpy_chk (b, buf, '\0', sizeof(buf), __builtin_object_size (b, 0));  // expected-warning {{'__builtin___memccpy_chk' will always overflow destination buffer}}
 }
+
+int pr28314(void) {
+  struct {
+    struct InvalidField a; // expected-error{{has incomplete type}} expected-note 3{{forward declaration of 'struct InvalidField'}}
+    char b[0];
+  } *p;
+
+  struct {
+    struct InvalidField a; // expected-error{{has incomplete type}}
+    char b[1];
+  } *p2;
+
+  struct {
+    struct InvalidField a; // expected-error{{has incomplete type}}
+    char b[2];
+  } *p3;
+
+  int a = 0;
+  a += __builtin_object_size(&p->a, 0);
+  a += __builtin_object_size(p->b, 0);
+  a += __builtin_object_size(p2->b, 0);
+  a += __builtin_object_size(p3->b, 0);
+  return a;
+}
diff --git a/test/Sema/builtins-arm.c b/test/Sema/builtins-arm.c
index 39cb2fa2962c1..668b8284ffeb8 100644
--- a/test/Sema/builtins-arm.c
+++ b/test/Sema/builtins-arm.c
@@ -48,6 +48,50 @@ void test5() {
 }
 
 void test6(int a, int b, int c) {
+  __builtin_arm_ldc(1, 2, &a);
+  __builtin_arm_ldc(a, 2, &a); // expected-error {{argument to '__builtin_arm_ldc' must be a constant integer}}
+  __builtin_arm_ldc(1, a, &a); // expected-error {{argument to '__builtin_arm_ldc' must be a constant integer}}
+
+  __builtin_arm_ldcl(1, 2, &a);
+  __builtin_arm_ldcl(a, 2, &a); // expected-error {{argument to '__builtin_arm_ldcl' must be a constant integer}}
+  __builtin_arm_ldcl(1, a, &a); // expected-error {{argument to '__builtin_arm_ldcl' must be a constant integer}}
+
+  __builtin_arm_ldc2(1, 2, &a);
+  __builtin_arm_ldc2(a, 2, &a); // expected-error {{argument to '__builtin_arm_ldc2' must be a constant integer}}
+  __builtin_arm_ldc2(1, a, &a); // expected-error {{argument to '__builtin_arm_ldc2' must be a constant integer}}
+
+  __builtin_arm_ldc2l(1, 2, &a);
+  __builtin_arm_ldc2l(a, 2, &a); // expected-error {{argument to '__builtin_arm_ldc2l' must be a constant integer}}
+  __builtin_arm_ldc2l(1, a, &a); // expected-error {{argument to '__builtin_arm_ldc2l' must be a constant integer}}
+
+  __builtin_arm_stc(1, 2, &a);
+  __builtin_arm_stc(a, 2, &a); // expected-error {{argument to '__builtin_arm_stc' must be a constant integer}}
+  __builtin_arm_stc(1, a, &a); // expected-error {{argument to '__builtin_arm_stc' must be a constant integer}}
+
+  __builtin_arm_stcl(1, 2, &a);
+  __builtin_arm_stcl(a, 2, &a); // expected-error {{argument to '__builtin_arm_stcl' must be a constant integer}}
+  __builtin_arm_stcl(1, a, &a); // expected-error {{argument to '__builtin_arm_stcl' must be a constant integer}}
+
+  __builtin_arm_stc2(1, 2, &a);
+  __builtin_arm_stc2(a, 2, &a); // expected-error {{argument to '__builtin_arm_stc2' must be a constant integer}}
+  __builtin_arm_stc2(1, a, &a); // expected-error {{argument to '__builtin_arm_stc2' must be a constant integer}}
+
+  __builtin_arm_stc2l(1, 2, &a);
+  __builtin_arm_stc2l(a, 2, &a); // expected-error {{argument to '__builtin_arm_stc2l' must be a constant integer}}
+  __builtin_arm_stc2l(1, a, &a); // expected-error {{argument to '__builtin_arm_stc2l' must be a constant integer}}
+
+  __builtin_arm_cdp(a, 2, 3, 4, 5, 6); // expected-error {{argument to '__builtin_arm_cdp' must be a constant integer}}
+  __builtin_arm_cdp(1, a, 3, 4, 5, 6); // expected-error {{argument to '__builtin_arm_cdp' must be a constant integer}}
+  __builtin_arm_cdp(1, 2, a, 4, 5, 6); // expected-error {{argument to '__builtin_arm_cdp' must be a constant integer}}
+  __builtin_arm_cdp(1, 2, 3, a, 5, 6); // expected-error {{argument to '__builtin_arm_cdp' must be a constant integer}}
+  __builtin_arm_cdp(1, 2, 3, 4, 5, a); // expected-error {{argument to '__builtin_arm_cdp' must be a constant integer}}
+
+  __builtin_arm_cdp2(a, 2, 3, 4, 5, 6); // expected-error {{argument to '__builtin_arm_cdp2' must be a constant integer}}
+  __builtin_arm_cdp2(1, a, 3, 4, 5, 6); // expected-error {{argument to '__builtin_arm_cdp2' must be a constant integer}}
+  __builtin_arm_cdp2(1, 2, a, 4, 5, 6); // expected-error {{argument to '__builtin_arm_cdp2' must be a constant integer}}
+  __builtin_arm_cdp2(1, 2, 3, a, 5, 6); // expected-error {{argument to '__builtin_arm_cdp2' must be a constant integer}}
+  __builtin_arm_cdp2(1, 2, 3, 4, 5, a); // expected-error {{argument to '__builtin_arm_cdp2' must be a constant integer}}
+
   __builtin_arm_mrc( a, 0, 13, 0, 3); // expected-error {{argument to '__builtin_arm_mrc' must be a constant integer}}
   __builtin_arm_mrc(15, a, 13, 0, 3); // expected-error {{argument to '__builtin_arm_mrc' must be a constant integer}}
   __builtin_arm_mrc(15, 0,  a, 0, 3); // expected-error {{argument to '__builtin_arm_mrc' must be a constant integer}}
@@ -72,11 +116,23 @@ void test6(int a, int b, int c) {
   __builtin_arm_mcr2(15, 0, b, 13, a, 3); // expected-error {{argument to '__builtin_arm_mcr2' must be a constant integer}}
   __builtin_arm_mcr2(15, 0, b, 13, 0, a); // expected-error {{argument to '__builtin_arm_mcr2' must be a constant integer}}
 
-  __builtin_arm_mcrr( a, 0, b, c, 0); // expected-error {{argument to '__builtin_arm_mcrr' must be a constant integer}}
-  __builtin_arm_mcrr(15, a, b, c, 0); // expected-error {{argument to '__builtin_arm_mcrr' must be a constant integer}}
-  __builtin_arm_mcrr(15, 0, b, c, a); // expected-error {{argument to '__builtin_arm_mcrr' must be a constant integer}}
+  __builtin_arm_mcrr(15, 0, b, 0);
+  __builtin_arm_mcrr( a, 0, b, 0); // expected-error {{argument to '__builtin_arm_mcrr' must be a constant integer}}
+  __builtin_arm_mcrr(15, a, b, 0); // expected-error {{argument to '__builtin_arm_mcrr' must be a constant integer}}
+  __builtin_arm_mcrr(15, 0, b, a); // expected-error {{argument to '__builtin_arm_mcrr' must be a constant integer}}
+
+  __builtin_arm_mcrr2(15, 0, b, 0);
+  __builtin_arm_mcrr2( a, 0, b, 0); // expected-error {{argument to '__builtin_arm_mcrr2' must be a constant integer}}
+  __builtin_arm_mcrr2(15, a, b, 0); // expected-error {{argument to '__builtin_arm_mcrr2' must be a constant integer}}
+  __builtin_arm_mcrr2(15, 0, b, a); // expected-error {{argument to '__builtin_arm_mcrr2' must be a constant integer}}
+
+  __builtin_arm_mrrc(15, 0, 0);
+  __builtin_arm_mrrc( a, 0, 0); // expected-error {{argument to '__builtin_arm_mrrc' must be a constant integer}}
+  __builtin_arm_mrrc(15, a, 0); // expected-error {{argument to '__builtin_arm_mrrc' must be a constant integer}}
+  __builtin_arm_mrrc(15, 0, a); // expected-error {{argument to '__builtin_arm_mrrc' must be a constant integer}}
 
-  __builtin_arm_mcrr2( a, 0, b, c, 0); // expected-error {{argument to '__builtin_arm_mcrr2' must be a constant integer}}
-  __builtin_arm_mcrr2(15, a, b, c, 0); // expected-error {{argument to '__builtin_arm_mcrr2' must be a constant integer}}
-  __builtin_arm_mcrr2(15, 0, b, c, a); // expected-error {{argument to '__builtin_arm_mcrr2' must be a constant integer}}
+  __builtin_arm_mrrc2(15, 0, 0);
+  __builtin_arm_mrrc2( a, 0, 0); // expected-error {{argument to '__builtin_arm_mrrc2' must be a constant integer}}
+  __builtin_arm_mrrc2(15, a, 0); // expected-error {{argument to '__builtin_arm_mrrc2' must be a constant integer}}
+  __builtin_arm_mrrc2(15, 0, a); // expected-error {{argument to '__builtin_arm_mrrc2' must be a constant integer}}
 }
diff --git a/test/Sema/callingconv-cast.c b/test/Sema/callingconv-cast.c
new file mode 100644
index 0000000000000..12c0dcbc256ac
--- /dev/null
+++ b/test/Sema/callingconv-cast.c
@@ -0,0 +1,63 @@
+// RUN: %clang_cc1 -fms-extensions -triple i686-pc-windows-msvc -Wcast-calling-convention -DMSVC -Wno-pointer-bool-conversion -verify -x c %s
+// RUN: %clang_cc1 -fms-extensions -triple i686-pc-windows-msvc -Wcast-calling-convention -DMSVC -Wno-pointer-bool-conversion -verify -x c++ %s
+// RUN: %clang_cc1 -fms-extensions -triple i686-pc-windows-msvc -Wcast-calling-convention -DMSVC -fdiagnostics-parseable-fixits %s 2>&1 | FileCheck %s --check-prefix=MSFIXIT
+// RUN: %clang_cc1 -triple i686-pc-windows-gnu -Wcast-calling-convention -fdiagnostics-parseable-fixits %s 2>&1 | FileCheck %s --check-prefix=GNUFIXIT
+
+// expected-note@+1 {{consider defining 'mismatched_before_winapi' with the 'stdcall' calling convention}}
+void mismatched_before_winapi(int x) {}
+
+#ifdef MSVC
+#define WINAPI __stdcall
+#else
+#define WINAPI __attribute__((stdcall))
+#endif
+
+// expected-note@+1 3 {{consider defining 'mismatched' with the 'stdcall' calling convention}}
+void mismatched(int x) {}
+
+typedef void (WINAPI *callback_t)(int);
+void take_callback(callback_t callback);
+
+void WINAPI mismatched_stdcall(int x) {}
+
+void take_opaque_fn(void (*callback)(int));
+
+int main() {
+  // expected-warning@+1 {{cast between incompatible calling conventions 'cdecl' and 'stdcall'}}
+  take_callback((callback_t)mismatched);
+
+  // expected-warning@+1 {{cast between incompatible calling conventions 'cdecl' and 'stdcall'}}
+  callback_t callback = (callback_t)mismatched; // warns
+  (void)callback;
+
+  // expected-warning@+1 {{cast between incompatible calling conventions 'cdecl' and 'stdcall'}}
+  callback = (callback_t)&mismatched; // warns
+
+  // No warning, just to show we don't drill through other kinds of unary operators.
+  callback = (callback_t)!mismatched;
+
+  // expected-warning@+1 {{cast between incompatible calling conventions 'cdecl' and 'stdcall'}}
+  callback = (callback_t)&mismatched_before_winapi; // warns
+
+  // Probably a bug, but we don't warn.
+  void (*callback2)(int) = mismatched;
+  take_callback((callback_t)callback2);
+
+  // Another way to suppress the warning.
+  take_callback((callback_t)(void*)mismatched);
+
+  // Don't warn, because we're casting from stdcall to cdecl. Usually that means
+  // the programmer is rinsing the function pointer through some kind of opaque
+  // API.
+  take_opaque_fn((void (*)(int))mismatched_stdcall);
+}
+
+// MSFIXIT: fix-it:"{{.*}}callingconv-cast.c":{16:6-16:6}:"WINAPI "
+// MSFIXIT: fix-it:"{{.*}}callingconv-cast.c":{16:6-16:6}:"WINAPI "
+// MSFIXIT: fix-it:"{{.*}}callingconv-cast.c":{16:6-16:6}:"WINAPI "
+// MSFIXIT: fix-it:"{{.*}}callingconv-cast.c":{7:6-7:6}:"__stdcall "
+
+// GNUFIXIT: fix-it:"{{.*}}callingconv-cast.c":{16:6-16:6}:"WINAPI "
+// GNUFIXIT: fix-it:"{{.*}}callingconv-cast.c":{16:6-16:6}:"WINAPI "
+// GNUFIXIT: fix-it:"{{.*}}callingconv-cast.c":{16:6-16:6}:"WINAPI "
+// GNUFIXIT: fix-it:"{{.*}}callingconv-cast.c":{7:6-7:6}:"__attribute__((stdcall)) "
diff --git a/test/Sema/constant-conversion.c b/test/Sema/constant-conversion.c
index 1376333967127..b717f93253364 100644
--- a/test/Sema/constant-conversion.c
+++ b/test/Sema/constant-conversion.c
@@ -80,3 +80,36 @@ void test8() {
   struct { enum E x : 1; } f;
   f.x = C; // expected-warning {{implicit truncation from 'int' to bitfield changes value from 2 to 0}}
 }
+
+void test9() {
+  const char max_char = 0x7F;
+  const short max_short = 0x7FFF;
+  const int max_int = 0x7FFFFFFF;
+
+  const short max_char_plus_one = (short)max_char + 1;
+  const int max_short_plus_one = (int)max_short + 1;
+  const long max_int_plus_one = (long)max_int + 1;
+
+  char new_char = max_char_plus_one;  // expected-warning {{implicit conversion from 'const short' to 'char' changes value from 128 to -128}}
+  short new_short = max_short_plus_one;  // expected-warning {{implicit conversion from 'const int' to 'short' changes value from 32768 to -32768}}
+  int new_int = max_int_plus_one;  // expected-warning {{implicit conversion from 'const long' to 'int' changes value from 2147483648 to -2147483648}}
+
+  char hex_char = 0x80;
+  short hex_short = 0x8000;
+  int hex_int = 0x80000000;
+
+  char oct_char = 0200;
+  short oct_short = 0100000;
+  int oct_int = 020000000000;
+
+  char bin_char = 0b10000000;
+  short bin_short = 0b1000000000000000;
+  int bin_int = 0b10000000000000000000000000000000;
+
+#define CHAR_MACRO_HEX 0xff
+  char macro_char_hex = CHAR_MACRO_HEX;
+#define CHAR_MACRO_DEC 255
+  char macro_char_dec = CHAR_MACRO_DEC;  // expected-warning {{implicit conversion from 'int' to 'char' changes value from 255 to -1}}
+
+  char array_init[] = { 255, 127, 128, 129, 0 };
+}
diff --git a/test/Sema/dllexport.c b/test/Sema/dllexport.c
index 56c9e74225f29..7991a455b4de9 100644
--- a/test/Sema/dllexport.c
+++ b/test/Sema/dllexport.c
@@ -4,12 +4,18 @@
 // RUN: %clang_cc1 -triple x86_64-mingw32 -fsyntax-only -fms-extensions -verify -std=c99 %s
 
 // Invalid usage.
-__declspec(dllexport) typedef int typedef1; // expected-warning{{'dllexport' attribute only applies to variables and functions}}
-typedef __declspec(dllexport) int typedef2; // expected-warning{{'dllexport' attribute only applies to variables and functions}}
-typedef int __declspec(dllexport) typedef3; // expected-warning{{'dllexport' attribute only applies to variables and functions}}
-typedef __declspec(dllexport) void (*FunTy)(); // expected-warning{{'dllexport' attribute only applies to variables and functions}}
-enum __declspec(dllexport) Enum { EnumVal }; // expected-warning{{'dllexport' attribute only applies to variables and functions}}
-struct __declspec(dllexport) Record {}; // expected-warning{{'dllexport' attribute only applies to variables and functions}}
+__declspec(dllexport) typedef int typedef1;
+// expected-warning@-1{{'dllexport' attribute only applies to variables and functions}}
+typedef __declspec(dllexport) int typedef2;
+// expected-warning@-1{{'dllexport' attribute only applies to variables and functions}}
+typedef int __declspec(dllexport) typedef3;
+// expected-warning@-1{{'dllexport' attribute only applies to variables and functions}}
+typedef __declspec(dllexport) void (*FunTy)();
+// expected-warning@-1{{'dllexport' attribute only applies to variables and functions}}
+enum __declspec(dllexport) Enum { EnumVal };
+// expected-warning@-1{{'dllexport' attribute only applies to variables and functions}}
+struct __declspec(dllexport) Record {};
+// expected-warning@-1{{'dllexport' attribute only applies to variables and functions}}
 
 
 
diff --git a/test/Sema/dllimport.c b/test/Sema/dllimport.c
index f863499cf8402..0728cf14a8e3f 100644
--- a/test/Sema/dllimport.c
+++ b/test/Sema/dllimport.c
@@ -4,12 +4,18 @@
 // RUN: %clang_cc1 -triple x86_64-mingw32 -fsyntax-only -fms-extensions -verify -std=c99 -DGNU %s
 
 // Invalid usage.
-__declspec(dllimport) typedef int typedef1; // expected-warning{{'dllimport' attribute only applies to variables and functions}}
-typedef __declspec(dllimport) int typedef2; // expected-warning{{'dllimport' attribute only applies to variables and functions}}
-typedef int __declspec(dllimport) typedef3; // expected-warning{{'dllimport' attribute only applies to variables and functions}}
-typedef __declspec(dllimport) void (*FunTy)(); // expected-warning{{'dllimport' attribute only applies to variables and functions}}
-enum __declspec(dllimport) Enum { EnumVal }; // expected-warning{{'dllimport' attribute only applies to variables and functions}}
-struct __declspec(dllimport) Record {}; // expected-warning{{'dllimport' attribute only applies to variables and functions}}
+__declspec(dllimport) typedef int typedef1;
+// expected-warning@-1{{'dllimport' attribute only applies to variables and functions}}
+typedef __declspec(dllimport) int typedef2;
+// expected-warning@-1{{'dllimport' attribute only applies to variables and functions}}
+typedef int __declspec(dllimport) typedef3;
+// expected-warning@-1{{'dllimport' attribute only applies to variables and functions}}
+typedef __declspec(dllimport) void (*FunTy)();
+// expected-warning@-1{{'dllimport' attribute only applies to variables and functions}}
+enum __declspec(dllimport) Enum { EnumVal };
+// expected-warning@-1{{'dllimport' attribute only applies to variables and functions}}
+struct __declspec(dllimport) Record {};
+// expected-warning@-1{{'dllimport' attribute only applies to variables and functions}}
 
 
 
@@ -34,17 +40,49 @@ __declspec(dllimport) int GlobalInit1 = 1; // expected-error{{definition of dlli
 int __declspec(dllimport) GlobalInit2 = 1; // expected-error{{definition of dllimport data}}
 
 // Declare, then reject definition.
-__declspec(dllimport) extern int ExternGlobalDeclInit; // expected-note{{previous declaration is here}} expected-note{{previous attribute is here}}
-int ExternGlobalDeclInit = 1; // expected-warning{{'ExternGlobalDeclInit' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#ifdef GNU
+// expected-note@+2{{previous attribute is here}}
+#endif
+__declspec(dllimport) extern int ExternGlobalDeclInit; // expected-note{{previous declaration is here}}
+#ifdef MS
+// expected-warning@+4{{'ExternGlobalDeclInit' redeclared without 'dllimport' attribute: 'dllexport' attribute added}}
+#else
+// expected-warning@+2{{'ExternGlobalDeclInit' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#endif
+int ExternGlobalDeclInit = 1;
 
-__declspec(dllimport) int GlobalDeclInit; // expected-note{{previous declaration is here}} expected-note{{previous attribute is here}}
-int GlobalDeclInit = 1; // expected-warning{{'GlobalDeclInit' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#ifdef GNU
+// expected-note@+2{{previous attribute is here}}
+#endif
+__declspec(dllimport) int GlobalDeclInit; // expected-note{{previous declaration is here}}
+#ifdef MS
+// expected-warning@+4{{'GlobalDeclInit' redeclared without 'dllimport' attribute: 'dllexport' attribute added}}
+#else
+// expected-warning@+2{{'GlobalDeclInit' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#endif
+int GlobalDeclInit = 1;
 
-int *__attribute__((dllimport)) GlobalDeclChunkAttrInit; // expected-note{{previous declaration is here}} expected-note{{previous attribute is here}}
-int *GlobalDeclChunkAttrInit = 0; // expected-warning{{'GlobalDeclChunkAttrInit' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#ifdef GNU
+// expected-note@+2{{previous attribute is here}}
+#endif
+int *__attribute__((dllimport)) GlobalDeclChunkAttrInit; // expected-note{{previous declaration is here}}
+#ifdef MS
+// expected-warning@+4{{'GlobalDeclChunkAttrInit' redeclared without 'dllimport' attribute: 'dllexport' attribute added}}
+#else
+// expected-warning@+2{{'GlobalDeclChunkAttrInit' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#endif
+int *GlobalDeclChunkAttrInit = 0;
 
-int GlobalDeclAttrInit __attribute__((dllimport)); // expected-note{{previous declaration is here}} expected-note{{previous attribute is here}}
-int GlobalDeclAttrInit = 1; // expected-warning{{'GlobalDeclAttrInit' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#ifdef GNU
+// expected-note@+2{{previous attribute is here}}
+#endif
+int GlobalDeclAttrInit __attribute__((dllimport)); // expected-note{{previous declaration is here}}
+#ifdef MS
+// expected-warning@+4{{'GlobalDeclAttrInit' redeclared without 'dllimport' attribute: 'dllexport' attribute added}}
+#else
+// expected-warning@+2{{'GlobalDeclAttrInit' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#endif
+int GlobalDeclAttrInit = 1;
 
 // Redeclarations
 __declspec(dllimport) extern int GlobalRedecl1;
@@ -59,8 +97,7 @@ int *__attribute__((dllimport)) GlobalRedecl2b;
 int GlobalRedecl2c __attribute__((dllimport));
 int GlobalRedecl2c __attribute__((dllimport));
 
-// NB: MSVC issues a warning and makes GlobalRedecl3 dllexport. We follow GCC
-// and drop the dllimport with a warning.
+// We follow GCC and drop the dllimport with a warning.
 __declspec(dllimport) extern int GlobalRedecl3; // expected-note{{previous declaration is here}} expected-note{{previous attribute is here}}
                       extern int GlobalRedecl3; // expected-warning{{'GlobalRedecl3' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
 
@@ -133,13 +170,20 @@ inline void __attribute__((dllimport)) inlineFunc2() {}
 __declspec(dllimport) void redecl1();
 __declspec(dllimport) void redecl1();
 
-// NB: MSVC issues a warning and makes redecl2/redecl3 dllexport. We follow GCC
-// and drop the dllimport with a warning.
 __declspec(dllimport) void redecl2(); // expected-note{{previous declaration is here}} expected-note{{previous attribute is here}}
                       void redecl2(); // expected-warning{{'redecl2' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
 
-__declspec(dllimport) void redecl3(); // expected-note{{previous declaration is here}} expected-note{{previous attribute is here}}
-                      void redecl3() {} // expected-warning{{'redecl3' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#ifdef GNU
+                      // expected-note@+2{{previous attribute is here}}
+#endif
+                      __declspec(dllimport) void redecl3(); // expected-note{{previous declaration is here}}
+                      // NB: Both MSVC and Clang issue a warning and make redecl3 dllexport.
+#ifdef MS
+                      // expected-warning@+4{{'redecl3' redeclared without 'dllimport' attribute: 'dllexport' attribute added}}
+#else
+                      // expected-warning@+2{{'redecl3' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#endif
+                      void redecl3() {}
 
                       void redecl4(); // expected-note{{previous declaration is here}}
 void useRedecl4() { redecl4(); }
diff --git a/test/Sema/enable_if-ext.c b/test/Sema/enable_if-ext.c
new file mode 100644
index 0000000000000..1e605d49b60fd
--- /dev/null
+++ b/test/Sema/enable_if-ext.c
@@ -0,0 +1,50 @@
+// RUN: %clang_cc1 -fsyntax-only %s -include %s -verify
+// RUN: %clang_cc1 -Wpedantic -fsyntax-only %s -include %s -verify -DWARN_PEDANTIC
+
+#ifndef enable_if_ext_included
+#define enable_if_ext_included
+
+#if !defined(WARN_PEDANTIC)
+// expected-no-diagnostics
+#endif
+
+__attribute__ (( enable_if(1, "") ))
+#if defined(WARN_PEDANTIC)
+// expected-warning@-2 {{'enable_if' is a clang extension}}
+#endif
+void f() { }
+
+__attribute__ (( __enable_if__(1, "") ))
+#if defined(WARN_PEDANTIC)
+// expected-warning@-2 {{'enable_if' is a clang extension}}
+#endif
+void g() { }
+
+__attribute__ (( enable_if(0, "") ))
+#if defined(WARN_PEDANTIC)
+// expected-warning@-2 {{'enable_if' is a clang extension}}
+#endif
+void h() { }
+
+__attribute__ (( __enable_if__(0, "") ))
+#if defined(WARN_PEDANTIC)
+// expected-warning@-2 {{'enable_if' is a clang extension}}
+#endif
+void i() { }
+
+#pragma clang system_header
+
+__attribute__ (( enable_if(1, "") ))
+void j() { }
+
+__attribute__ (( __enable_if__(1, "") ))
+void k() { }
+
+__attribute__ (( enable_if(0, "") ))
+void l() { }
+
+__attribute__ (( __enable_if__(0, "") ))
+void m() { }
+
+#endif
+
diff --git a/test/Sema/enable_if.c b/test/Sema/enable_if.c
index 0cd9c48f42b58..1cc14659021dd 100644
--- a/test/Sema/enable_if.c
+++ b/test/Sema/enable_if.c
@@ -72,8 +72,8 @@ int isdigit(int c) __attribute__((overloadable))  // expected-note{{candidate fu
   __attribute__((unavailable("'c' must have the value of an unsigned char or EOF")));
 
 void test3(int c) {
-  isdigit(c);
-  isdigit(10);
+  isdigit(c); // expected-warning{{ignoring return value of function declared with pure attribute}}
+  isdigit(10); // expected-warning{{ignoring return value of function declared with pure attribute}}
 #ifndef CODEGEN
   isdigit(-10);  // expected-error{{call to unavailable function 'isdigit': 'c' must have the value of an unsigned char or EOF}}
 #endif
@@ -142,4 +142,11 @@ void test8() {
   void (*p1)(int) = &f4; // expected-error{{cannot take address of function 'f4' becuase it has one or more non-tautological enable_if conditions}}
   void (*p2)(int) = f4; // expected-error{{cannot take address of function 'f4' becuase it has one or more non-tautological enable_if conditions}}
 }
+
+void regular_enable_if(int a) __attribute__((enable_if(a, ""))); // expected-note 3{{declared here}}
+void PR27122_ext() {
+  regular_enable_if(0, 2); // expected-error{{too many arguments}}
+  regular_enable_if(1, 2); // expected-error{{too many arguments}}
+  regular_enable_if(); // expected-error{{too few arguments}}
+}
 #endif
diff --git a/test/Sema/ext_vector_casts.c b/test/Sema/ext_vector_casts.c
index 924013aeb29d2..6aaedbe7fd15d 100644
--- a/test/Sema/ext_vector_casts.c
+++ b/test/Sema/ext_vector_casts.c
@@ -1,5 +1,7 @@
 // RUN: %clang_cc1 -triple x86_64-apple-macos10.7.0 -fsyntax-only -verify -fno-lax-vector-conversions -Wconversion %s
 
+typedef __attribute__((ext_vector_type(8))) _Bool BoolVector; // expected-error {{invalid vector element type '_Bool'}}
+
 typedef __attribute__(( ext_vector_type(2) )) float float2;
 typedef __attribute__(( ext_vector_type(3) )) float float3;
 typedef __attribute__(( ext_vector_type(4) )) int int4;
diff --git a/test/Sema/float128-ld-incompatibility.cpp b/test/Sema/float128-ld-incompatibility.cpp
new file mode 100644
index 0000000000000..d993ed7b081c3
--- /dev/null
+++ b/test/Sema/float128-ld-incompatibility.cpp
@@ -0,0 +1,36 @@
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 \
+// RUN: -triple powerpc64le-unknown-linux-gnu -target-cpu pwr8 \
+// RUN: -target-feature +float128 %s
+
+__float128 qf();
+long double ldf();
+
+// FIXME: once operations between long double and __float128 are implemented for
+//        targets where the types are different, these next two will change
+long double ld{qf()}; // expected-error {{cannot initialize a variable of type 'long double' with an rvalue of type '__float128'}}
+__float128 q{ldf()};  // expected-error {{cannot initialize a variable of type '__float128' with an rvalue of type 'long double'}}
+
+auto test1(__float128 q, long double ld) -> decltype(q + ld) { // expected-error {{invalid operands to binary expression ('__float128' and 'long double')}}
+  return q + ld;      // expected-error {{invalid operands to binary expression ('__float128' and 'long double')}}
+}
+
+auto test2(long double a, __float128 b) -> decltype(a + b) { // expected-error {{invalid operands to binary expression ('long double' and '__float128')}}
+  return a + b;      // expected-error {{invalid operands to binary expression ('long double' and '__float128')}}
+}
+
+void test3(bool b) {
+  long double ld;
+  __float128 q;
+
+  ld + q; // expected-error {{invalid operands to binary expression ('long double' and '__float128')}}
+  q + ld; // expected-error {{invalid operands to binary expression ('__float128' and 'long double')}}
+  ld - q; // expected-error {{invalid operands to binary expression ('long double' and '__float128')}}
+  q - ld; // expected-error {{invalid operands to binary expression ('__float128' and 'long double')}}
+  ld * q; // expected-error {{invalid operands to binary expression ('long double' and '__float128')}}
+  q * ld; // expected-error {{invalid operands to binary expression ('__float128' and 'long double')}}
+  ld / q; // expected-error {{invalid operands to binary expression ('long double' and '__float128')}}
+  q / ld; // expected-error {{invalid operands to binary expression ('__float128' and 'long double')}}
+  ld = q; // expected-error {{assigning to 'long double' from incompatible type '__float128'}}
+  q = ld; // expected-error {{assigning to '__float128' from incompatible type 'long double'}}
+  q + b ? q : ld; // expected-error {{incompatible operand types ('__float128' and 'long double')}}
+}
diff --git a/test/Sema/format-strings-freebsd.c b/test/Sema/format-strings-freebsd.c
index cdf273ae10938..965d7c287be65 100644
--- a/test/Sema/format-strings-freebsd.c
+++ b/test/Sema/format-strings-freebsd.c
@@ -1,10 +1,11 @@
 // RUN: %clang_cc1 -fsyntax-only -verify -triple i386-unknown-freebsd %s
 // RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64-unknown-freebsd %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64-scei-ps4 %s
 
 // Test FreeBSD kernel printf extensions.
 int freebsd_kernel_printf(const char *, ...) __attribute__((__format__(__freebsd_kprintf__, 1, 2)));
 
-void check_freebsd_kernel_extensions(int i, long l, char *s)
+void check_freebsd_kernel_extensions(int i, long l, char *s, short h)
 {
   // %b expects an int and a char *
   freebsd_kernel_printf("reg=%b\n", i, "\10\2BITTWO\1BITONE\n"); // no-warning
@@ -32,6 +33,12 @@ void check_freebsd_kernel_extensions(int i, long l, char *s)
   freebsd_kernel_printf("%lr", i); // expected-warning{{format specifies type 'long' but the argument has type 'int'}}
   freebsd_kernel_printf("%lr", l); // no-warning
 
+  // h modifier expects a short
+  freebsd_kernel_printf("%hr", i); // expected-warning{{format specifies type 'short' but the argument has type 'int'}}
+  freebsd_kernel_printf("%hr", h); // no-warning
+  freebsd_kernel_printf("%hy", i); // expected-warning{{format specifies type 'short' but the argument has type 'int'}}
+  freebsd_kernel_printf("%hy", h); // no-warning
+
   // %y expects an int
   freebsd_kernel_printf("%y", i); // no-warning
   freebsd_kernel_printf("%y", l); // expected-warning{{format specifies type 'int' but the argument has type 'long'}}
diff --git a/test/Sema/format-strings-scanf.c b/test/Sema/format-strings-scanf.c
index d3a03adf61959..7a92842b24541 100644
--- a/test/Sema/format-strings-scanf.c
+++ b/test/Sema/format-strings-scanf.c
@@ -18,7 +18,7 @@ int vfscanf(FILE * restrict, const char * restrict, va_list);
 int vsscanf(const char * restrict, const char * restrict, va_list);
 
 void test(const char *s, int *i) {
-  scanf(s, i); // expected-warning{{ormat string is not a string literal}}
+  scanf(s, i); // expected-warning{{format string is not a string literal}}
   scanf("%0d", i); // expected-warning{{zero field width in scanf format string is unused}}
   scanf("%00d", i); // expected-warning{{zero field width in scanf format string is unused}}
   scanf("%d%[asdfasdfd", i, s); // expected-warning{{no closing ']' for '%[' in scanf format string}}
@@ -171,3 +171,15 @@ void test_qualifiers(const int *cip, volatile int* vip,
   scanf("%d", (ip_t)0); // No warning.
   scanf("%d", (cip_t)0); // expected-warning{{format specifies type 'int *' but the argument has type 'cip_t' (aka 'const int *')}}
 }
+
+void check_conditional_literal(char *s, int *i) {
+  scanf(0 ? "%s" : "%d", i); // no warning
+  scanf(1 ? "%s" : "%d", i); // expected-warning{{format specifies type 'char *'}}
+  scanf(0 ? "%d %d" : "%d", i); // no warning
+  scanf(1 ? "%d %d" : "%d", i); // expected-warning{{more '%' conversions than data arguments}}
+  scanf(0 ? "%d %d" : "%d", i, s); // expected-warning{{data argument not used}}
+  scanf(1 ? "%d %s" : "%d", i, s); // no warning
+  scanf(i ? "%d %s" : "%d", i, s); // no warning
+  scanf(i ? "%d" : "%d", i, s); // expected-warning{{data argument not used}}
+  scanf(i ? "%s" : "%d", s); // expected-warning{{format specifies type 'int *'}}
+}
diff --git a/test/Sema/format-strings.c b/test/Sema/format-strings.c
index 0d827e400d7e1..5559710c60355 100644
--- a/test/Sema/format-strings.c
+++ b/test/Sema/format-strings.c
@@ -29,15 +29,22 @@ void check_string_literal( FILE* fp, const char* s, char *buf, ... ) {
   va_start(ap,buf);
 
   printf(s); // expected-warning {{format string is not a string literal}}
+  // expected-note@-1{{treat the string as an argument to avoid this}}
   vprintf(s,ap); // expected-warning {{format string is not a string literal}}
   fprintf(fp,s); // expected-warning {{format string is not a string literal}}
+  // expected-note@-1{{treat the string as an argument to avoid this}}
   vfprintf(fp,s,ap); // expected-warning {{format string is not a string literal}}
   asprintf(&b,s); // expected-warning {{format string is not a string lit}}
+  // expected-note@-1{{treat the string as an argument to avoid this}}
   vasprintf(&b,s,ap); // expected-warning {{format string is not a string literal}}
   sprintf(buf,s); // expected-warning {{format string is not a string literal}}
+  // expected-note@-1{{treat the string as an argument to avoid this}}
   snprintf(buf,2,s); // expected-warning {{format string is not a string lit}}
+  // expected-note@-1{{treat the string as an argument to avoid this}}
   __builtin___sprintf_chk(buf,0,-1,s); // expected-warning {{format string is not a string literal}}
+  // expected-note@-1{{treat the string as an argument to avoid this}}
   __builtin___snprintf_chk(buf,2,0,-1,s); // expected-warning {{format string is not a string lit}}
+  // expected-note@-1{{treat the string as an argument to avoid this}}
   vsprintf(buf,s,ap); // expected-warning {{format string is not a string lit}}
   vsnprintf(buf,2,s,ap); // expected-warning {{format string is not a string lit}}
   vsnprintf(buf,2,global_fmt,ap); // expected-warning {{format string is not a string literal}}
@@ -46,6 +53,9 @@ void check_string_literal( FILE* fp, const char* s, char *buf, ... ) {
 
   vscanf(s, ap); // expected-warning {{format string is not a string literal}}
 
+  const char *const fmt = "%d"; // FIXME -- defined here
+  printf(fmt, 1, 2); // expected-warning{{data argument not used}}
+
   // rdar://6079877
   printf("abc"
          "%*d", 1, 1); // no-warning
@@ -69,13 +79,18 @@ void check_string_literal2( FILE* fp, const char* s, char *buf, ... ) {
   va_start(ap,buf);
 
   printf(s); // expected-warning {{format string is not a string literal}}
+  // expected-note@-1{{treat the string as an argument to avoid this}}
   vprintf(s,ap); // no-warning
   fprintf(fp,s); // expected-warning {{format string is not a string literal}}
+  // expected-note@-1{{treat the string as an argument to avoid this}}
   vfprintf(fp,s,ap); // no-warning
   asprintf(&b,s); // expected-warning {{format string is not a string lit}}
+  // expected-note@-1{{treat the string as an argument to avoid this}}
   vasprintf(&b,s,ap); // no-warning
   sprintf(buf,s); // expected-warning {{format string is not a string literal}}
+  // expected-note@-1{{treat the string as an argument to avoid this}}
   snprintf(buf,2,s); // expected-warning {{format string is not a string lit}}
+  // expected-note@-1{{treat the string as an argument to avoid this}}
   __builtin___vsnprintf_chk(buf,2,0,-1,s,ap); // no-warning
 
   vscanf(s, ap); // expected-warning {{format string is not a string literal}}
@@ -85,7 +100,22 @@ void check_conditional_literal(const char* s, int i) {
   printf(i == 1 ? "yes" : "no"); // no-warning
   printf(i == 0 ? (i == 1 ? "yes" : "no") : "dont know"); // no-warning
   printf(i == 0 ? (i == 1 ? s : "no") : "dont know"); // expected-warning{{format string is not a string literal}}
+  // expected-note@-1{{treat the string as an argument to avoid this}}
   printf("yes" ?: "no %d", 1); // expected-warning{{data argument not used by format string}}
+  printf(0 ? "yes %s" : "no %d", 1); // no-warning
+  printf(0 ? "yes %d" : "no %s", 1); // expected-warning{{format specifies type 'char *'}}
+
+  printf(0 ? "yes" : "no %d", 1); // no-warning
+  printf(0 ? "yes %d" : "no", 1); // expected-warning{{data argument not used by format string}}
+  printf(1 ? "yes" : "no %d", 1); // expected-warning{{data argument not used by format string}}
+  printf(1 ? "yes %d" : "no", 1); // no-warning
+  printf(i ? "yes" : "no %d", 1); // no-warning
+  printf(i ? "yes %s" : "no %d", 1); // expected-warning{{format specifies type 'char *'}}
+  printf(i ? "yes" : "no %d", 1, 2); // expected-warning{{data argument not used by format string}}
+
+  printf(i ? "%*s" : "-", i, s); // no-warning
+  printf(i ? "yes" : 0 ? "no %*d" : "dont know %d", 1, 2); // expected-warning{{data argument not used by format string}}
+  printf(i ? "%i\n" : "%i %s %s\n", i, s); // expected-warning{{more '%' conversions than data arguments}}
 }
 
 void check_writeback_specifier()
@@ -185,8 +215,11 @@ void test_constant_bindings(void) {
   printf(s1); // no-warning
   printf(s2); // no-warning
   printf(s3); // expected-warning{{not a string literal}}
+  // expected-note@-1{{treat the string as an argument to avoid this}}
   printf(s4); // expected-warning{{not a string literal}}
+  // expected-note@-1{{treat the string as an argument to avoid this}}
   printf(s5); // expected-warning{{not a string literal}}
+  // expected-note@-1{{treat the string as an argument to avoid this}}
 }
 
 
@@ -197,6 +230,7 @@ void test_constant_bindings(void) {
 void test9(char *P) {
   int x;
   printf(P);   // expected-warning {{format string is not a string literal (potentially insecure)}}
+  // expected-note@-1{{treat the string as an argument to avoid this}}
   printf(P, 42);
 }
 
@@ -519,7 +553,7 @@ void pr9751() {
 
   // Make sure that the "format string is defined here" note is not emitted
   // when the original string is within the argument expression.
-  printf(1 ? "yes %d" : "no %d"); // expected-warning 2{{more '%' conversions than data arguments}}
+  printf(1 ? "yes %d" : "no %d"); // expected-warning{{more '%' conversions than data arguments}}
 
   const char kFormat17[] = "%hu"; // expected-note{{format string is defined here}}}
   printf(kFormat17, (int[]){0}); // expected-warning{{format specifies type 'unsigned short' but the argument}}
@@ -615,5 +649,6 @@ extern void test_format_security_extra_args(const char*, int, ...)
     __attribute__((__format__(__printf__, 1, 3)));
 void test_format_security_pos(char* string) {
   test_format_security_extra_args(string, 5); // expected-warning {{format string is not a string literal (potentially insecure)}}
+  // expected-note@-1{{treat the string as an argument to avoid this}}
 }
 #pragma GCC diagnostic warning "-Wformat-nonliteral"
diff --git a/test/Sema/integer-overflow.c b/test/Sema/integer-overflow.c
index db5c1f4c7118d..e74bc119798b8 100644
--- a/test/Sema/integer-overflow.c
+++ b/test/Sema/integer-overflow.c
@@ -1,6 +1,11 @@
-// RUN: %clang_cc1 %s -verify -fsyntax-only
+// RUN: %clang_cc1 %s -verify -fsyntax-only -triple x86_64-pc-linux-gnu
 typedef unsigned long long uint64_t;
-typedef unsigned long long uint32_t;
+typedef unsigned int uint32_t;
+
+// Check integer sizes.
+int array64[sizeof(uint64_t) == 8 ? 1 : -1];
+int array32[sizeof(uint32_t) == 4 ? 1 : -1];
+int arrayint[sizeof(int) < sizeof(uint64_t) ? 1 : -1];
 
 uint64_t f0(uint64_t);
 uint64_t f1(uint64_t, uint32_t);
@@ -153,3 +158,23 @@ struct s {
   .y = 5,
   .x = 4 * 1024 * 1024 * 1024  // expected-warning {{overflow in expression; result is 0 with type 'int'}}
 };
+
+struct s2 {
+  unsigned a0;
+
+  struct s3 {
+    unsigned a2;
+
+    struct s4 {
+      unsigned a4;
+    } a3;
+  } a1;
+} s2 = {
+  .a0 = 4 * 1024 * 1024 * 1024, // expected-warning {{overflow in expression; result is 0 with type 'int'}}
+  {
+    .a2 = 4 * 1024 * 1024 * 1024, // expected-warning {{overflow in expression; result is 0 with type 'int'}}
+    {
+      .a4 = 4 * 1024 * 1024 * 1024 // expected-warning {{overflow in expression; result is 0 with type 'int'}}
+    }
+  }
+};
diff --git a/test/Sema/invalid-assignment-constant-address-space.c b/test/Sema/invalid-assignment-constant-address-space.c
index de2af64d00eca..77d6b331c2012 100644
--- a/test/Sema/invalid-assignment-constant-address-space.c
+++ b/test/Sema/invalid-assignment-constant-address-space.c
@@ -1,6 +1,6 @@
 // RUN: %clang_cc1 %s -verify -pedantic -fsyntax-only
 
-#define OPENCL_CONSTANT 16776962
+#define OPENCL_CONSTANT 8388354
 int __attribute__((address_space(OPENCL_CONSTANT))) c[3] = {0};
 
 void foo() {
diff --git a/test/Sema/libbuiltins-ctype-powerpc64.c b/test/Sema/libbuiltins-ctype-powerpc64.c
new file mode 100644
index 0000000000000..bfd79acb0ab02
--- /dev/null
+++ b/test/Sema/libbuiltins-ctype-powerpc64.c
@@ -0,0 +1,65 @@
+// RUN: %clang_cc1 -triple powerpc64-unknown-linux-gnu -emit-llvm < %s | FileCheck %s
+
+int isalnum(int);
+int isalpha(int);
+int isblank(int);
+int iscntrl(int);
+int isdigit(int);
+int isgraph(int);
+int islower(int);
+int isprint(int);
+int ispunct(int);
+int isspace(int);
+int isupper(int);
+int isxdigit(int);
+int tolower(int);
+int toupper(int);
+
+void test(int x) {
+  // CHECK: call signext i32 @isalnum(i32 signext {{%[0-9]+}}) [[NUW_RO_CALL:#[0-9]+]]
+  (void)isalnum(x);
+  // CHECK: call signext i32 @isalpha(i32 signext {{%[0-9]+}}) [[NUW_RO_CALL:#[0-9]+]]
+  (void)isalpha(x);
+  // CHECK: call signext i32 @isblank(i32 signext {{%[0-9]+}}) [[NUW_RO_CALL:#[0-9]+]]
+  (void)isblank(x);
+  // CHECK: call signext i32 @iscntrl(i32 signext {{%[0-9]+}}) [[NUW_RO_CALL:#[0-9]+]]
+  (void)iscntrl(x);
+  // CHECK: call signext i32 @isdigit(i32 signext {{%[0-9]+}}) [[NUW_RO_CALL:#[0-9]+]]
+  (void)isdigit(x);
+  // CHECK: call signext i32 @isgraph(i32 signext {{%[0-9]+}}) [[NUW_RO_CALL:#[0-9]+]]
+  (void)isgraph(x);
+  // CHECK: call signext i32 @islower(i32 signext {{%[0-9]+}}) [[NUW_RO_CALL:#[0-9]+]]
+  (void)islower(x);
+  // CHECK: call signext i32 @isprint(i32 signext {{%[0-9]+}}) [[NUW_RO_CALL:#[0-9]+]]
+  (void)isprint(x);
+  // CHECK: call signext i32 @ispunct(i32 signext {{%[0-9]+}}) [[NUW_RO_CALL:#[0-9]+]]
+  (void)ispunct(x);
+  // CHECK: call signext i32 @isspace(i32 signext {{%[0-9]+}}) [[NUW_RO_CALL:#[0-9]+]]
+  (void)isspace(x);
+  // CHECK: call signext i32 @isupper(i32 signext {{%[0-9]+}}) [[NUW_RO_CALL:#[0-9]+]]
+  (void)isupper(x);
+  // CHECK: call signext i32 @isxdigit(i32 signext {{%[0-9]+}}) [[NUW_RO_CALL:#[0-9]+]]
+  (void)isxdigit(x);
+  // CHECK: call signext i32 @tolower(i32 signext {{%[0-9]+}}) [[NUW_RO_CALL:#[0-9]+]]
+  (void)tolower(x);
+  // CHECK: call signext i32 @toupper(i32 signext {{%[0-9]+}}) [[NUW_RO_CALL:#[0-9]+]]
+  (void)toupper(x);
+}
+
+// CHECK: declare signext i32 @isalnum(i32 signext) [[NUW_RO:#[0-9]+]]
+// CHECK: declare signext i32 @isalpha(i32 signext) [[NUW_RO:#[0-9]+]]
+// CHECK: declare signext i32 @isblank(i32 signext) [[NUW_RO:#[0-9]+]]
+// CHECK: declare signext i32 @iscntrl(i32 signext) [[NUW_RO:#[0-9]+]]
+// CHECK: declare signext i32 @isdigit(i32 signext) [[NUW_RO:#[0-9]+]]
+// CHECK: declare signext i32 @isgraph(i32 signext) [[NUW_RO:#[0-9]+]]
+// CHECK: declare signext i32 @islower(i32 signext) [[NUW_RO:#[0-9]+]]
+// CHECK: declare signext i32 @isprint(i32 signext) [[NUW_RO:#[0-9]+]]
+// CHECK: declare signext i32 @ispunct(i32 signext) [[NUW_RO:#[0-9]+]]
+// CHECK: declare signext i32 @isspace(i32 signext) [[NUW_RO:#[0-9]+]]
+// CHECK: declare signext i32 @isupper(i32 signext) [[NUW_RO:#[0-9]+]]
+// CHECK: declare signext i32 @isxdigit(i32 signext) [[NUW_RO:#[0-9]+]]
+// CHECK: declare signext i32 @tolower(i32 signext) [[NUW_RO:#[0-9]+]]
+// CHECK: declare signext i32 @toupper(i32 signext) [[NUW_RO:#[0-9]+]]
+
+// CHECK: attributes [[NUW_RO]] = { nounwind readonly{{.*}} }
+// CHECK: attributes [[NUW_RO_CALL]] = { nounwind readonly }
diff --git a/test/Sema/libbuiltins-ctype-x86_64.c b/test/Sema/libbuiltins-ctype-x86_64.c
new file mode 100644
index 0000000000000..4934e6f16752a
--- /dev/null
+++ b/test/Sema/libbuiltins-ctype-x86_64.c
@@ -0,0 +1,65 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm < %s | FileCheck %s
+
+int isalnum(int);
+int isalpha(int);
+int isblank(int);
+int iscntrl(int);
+int isdigit(int);
+int isgraph(int);
+int islower(int);
+int isprint(int);
+int ispunct(int);
+int isspace(int);
+int isupper(int);
+int isxdigit(int);
+int tolower(int);
+int toupper(int);
+
+void test(int x) {
+  // CHECK: call i32 @isalnum(i32 {{%[0-9]+}}) [[NUW_RO_CALL:#[0-9]+]]
+  (void)isalnum(x);
+  // CHECK: call i32 @isalpha(i32 {{%[0-9]+}}) [[NUW_RO_CALL:#[0-9]+]]
+  (void)isalpha(x);
+  // CHECK: call i32 @isblank(i32 {{%[0-9]+}}) [[NUW_RO_CALL:#[0-9]+]]
+  (void)isblank(x);
+  // CHECK: call i32 @iscntrl(i32 {{%[0-9]+}}) [[NUW_RO_CALL:#[0-9]+]]
+  (void)iscntrl(x);
+  // CHECK: call i32 @isdigit(i32 {{%[0-9]+}}) [[NUW_RO_CALL:#[0-9]+]]
+  (void)isdigit(x);
+  // CHECK: call i32 @isgraph(i32 {{%[0-9]+}}) [[NUW_RO_CALL:#[0-9]+]]
+  (void)isgraph(x);
+  // CHECK: call i32 @islower(i32 {{%[0-9]+}}) [[NUW_RO_CALL:#[0-9]+]]
+  (void)islower(x);
+  // CHECK: call i32 @isprint(i32 {{%[0-9]+}}) [[NUW_RO_CALL:#[0-9]+]]
+  (void)isprint(x);
+  // CHECK: call i32 @ispunct(i32 {{%[0-9]+}}) [[NUW_RO_CALL:#[0-9]+]]
+  (void)ispunct(x);
+  // CHECK: call i32 @isspace(i32 {{%[0-9]+}}) [[NUW_RO_CALL:#[0-9]+]]
+  (void)isspace(x);
+  // CHECK: call i32 @isupper(i32 {{%[0-9]+}}) [[NUW_RO_CALL:#[0-9]+]]
+  (void)isupper(x);
+  // CHECK: call i32 @isxdigit(i32 {{%[0-9]+}}) [[NUW_RO_CALL:#[0-9]+]]
+  (void)isxdigit(x);
+  // CHECK: call i32 @tolower(i32 {{%[0-9]+}}) [[NUW_RO_CALL:#[0-9]+]]
+  (void)tolower(x);
+  // CHECK: call i32 @toupper(i32 {{%[0-9]+}}) [[NUW_RO_CALL:#[0-9]+]]
+  (void)toupper(x);
+}
+
+// CHECK: declare i32 @isalnum(i32) [[NUW_RO:#[0-9]+]]
+// CHECK: declare i32 @isalpha(i32) [[NUW_RO:#[0-9]+]]
+// CHECK: declare i32 @isblank(i32) [[NUW_RO:#[0-9]+]]
+// CHECK: declare i32 @iscntrl(i32) [[NUW_RO:#[0-9]+]]
+// CHECK: declare i32 @isdigit(i32) [[NUW_RO:#[0-9]+]]
+// CHECK: declare i32 @isgraph(i32) [[NUW_RO:#[0-9]+]]
+// CHECK: declare i32 @islower(i32) [[NUW_RO:#[0-9]+]]
+// CHECK: declare i32 @isprint(i32) [[NUW_RO:#[0-9]+]]
+// CHECK: declare i32 @ispunct(i32) [[NUW_RO:#[0-9]+]]
+// CHECK: declare i32 @isspace(i32) [[NUW_RO:#[0-9]+]]
+// CHECK: declare i32 @isupper(i32) [[NUW_RO:#[0-9]+]]
+// CHECK: declare i32 @isxdigit(i32) [[NUW_RO:#[0-9]+]]
+// CHECK: declare i32 @tolower(i32) [[NUW_RO:#[0-9]+]]
+// CHECK: declare i32 @toupper(i32) [[NUW_RO:#[0-9]+]]
+
+// CHECK: attributes [[NUW_RO]] = { nounwind readonly{{.*}} }
+// CHECK: attributes [[NUW_RO_CALL]] = { nounwind readonly }
diff --git a/test/Sema/nonnull.c b/test/Sema/nonnull.c
index 9503e7c32a86b..e98a8194dbebd 100644
--- a/test/Sema/nonnull.c
+++ b/test/Sema/nonnull.c
@@ -86,7 +86,7 @@ void redecl_test(void *p) {
 
 // rdar://18712242
 #define NULL (void*)0
-__attribute__((__nonnull__))
+__attribute__((__nonnull__))  // expected-note 2{{declared 'nonnull' here}}
 int evil_nonnull_func(int* pointer, void * pv)
 {
    if (pointer == NULL) {  // expected-warning {{comparison of nonnull parameter 'pointer' equal to a null pointer is 'false' on first encounter}}
@@ -105,7 +105,7 @@ int evil_nonnull_func(int* pointer, void * pv)
 }
 
 void set_param_to_null(int**);
-int another_evil_nonnull_func(int* pointer, char ch, void * pv) __attribute__((nonnull(1, 3)));
+int another_evil_nonnull_func(int* pointer, char ch, void * pv) __attribute__((nonnull(1, 3)));  // expected-note 2{{declared 'nonnull' here}}
 int another_evil_nonnull_func(int* pointer, char ch, void * pv) {
    if (pointer == NULL) { // expected-warning {{comparison of nonnull parameter 'pointer' equal to a null pointer is 'false' on first encounter}}
      return 0;
@@ -127,7 +127,7 @@ extern void FOO();
 extern void FEE();
 
 extern void *pv;
-__attribute__((__nonnull__))
+__attribute__((__nonnull__))  // expected-note {{declared 'nonnull' here}}
 void yet_another_evil_nonnull_func(int* pointer)
 {
  while (pv) {
@@ -141,7 +141,7 @@ void yet_another_evil_nonnull_func(int* pointer)
  }
 }
 
-void pr21668_1(__attribute__((nonnull)) const char *p, const char *s) {
+void pr21668_1(__attribute__((nonnull)) const char *p, const char *s) { // expected-note {{declared 'nonnull' here}}
   if (p) // expected-warning {{nonnull parameter 'p' will evaluate to 'true' on first encounter}}
     ;
   if (s) // No warning
@@ -154,7 +154,7 @@ void pr21668_2(__attribute__((nonnull)) const char *p) {
     ;
 }
 
-__attribute__((returns_nonnull)) void *returns_nonnull_whee();
+__attribute__((returns_nonnull)) void *returns_nonnull_whee();  // expected-note 6{{declared 'returns_nonnull' here}}
 
 void returns_nonnull_warning_tests() {
   if (returns_nonnull_whee() == NULL) {} // expected-warning {{comparison of nonnull function call 'returns_nonnull_whee()' equal to a null pointer is 'false' on first encounter}}
diff --git a/test/Sema/nullability.c b/test/Sema/nullability.c
index bbe5cb4143a01..71e12734d1d26 100644
--- a/test/Sema/nullability.c
+++ b/test/Sema/nullability.c
@@ -8,7 +8,11 @@
 typedef int * int_ptr;
 
 // Parse nullability type specifiers.
-typedef int * _Nonnull nonnull_int_ptr; // expected-note{{'_Nonnull' specified here}}
+// This note requires C11.
+#if __STDC_VERSION__ > 199901L
+// expected-note@+2{{'_Nonnull' specified here}}
+#endif
+typedef int * _Nonnull nonnull_int_ptr;
 typedef int * _Nullable nullable_int_ptr;
 typedef int * _Null_unspecified null_unspecified_int_ptr;
 
@@ -23,9 +27,14 @@ typedef int * _Null_unspecified _Nonnull conflicting_2; // expected-error{{nulla
 typedef nonnull_int_ptr _Nonnull redundant_okay_1;
 
 // Conflicting nullability specifiers via a typedef are not.
+// Some of these errors require C11.
+#if __STDC_VERSION__ > 199901L
 typedef nonnull_int_ptr _Nullable conflicting_2; // expected-error{{nullability specifier '_Nullable' conflicts with existing specifier '_Nonnull'}}
+#endif
 typedef nonnull_int_ptr nonnull_int_ptr_typedef;
+#if __STDC_VERSION__ > 199901L
 typedef nonnull_int_ptr_typedef _Nullable conflicting_2; // expected-error{{nullability specifier '_Nullable' conflicts with existing specifier '_Nonnull'}}
+#endif
 typedef nonnull_int_ptr_typedef nonnull_int_ptr_typedef_typedef;
 typedef nonnull_int_ptr_typedef_typedef _Null_unspecified conflicting_3; // expected-error{{nullability specifier '_Null_unspecified' conflicts with existing specifier '_Nonnull'}}
 
@@ -69,8 +78,11 @@ typedef _Nonnull int * _Nullable *  conflict_int_ptr_ptr_2; // expected-error{{n
 
 // Nullability is not part of the canonical type.
 typedef int * _Nonnull ambiguous_int_ptr;
+// Redefining a typedef is a C11 feature.
+#if __STDC_VERSION__ > 199901L
 typedef int * ambiguous_int_ptr;
 typedef int * _Nullable ambiguous_int_ptr;
+#endif
 
 // Printing of nullability.
 float f;
diff --git a/test/Sema/overloadable.c b/test/Sema/overloadable.c
index 3120649dbc023..5d95a317fa388 100644
--- a/test/Sema/overloadable.c
+++ b/test/Sema/overloadable.c
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fsyntax-only -verify %s -Wincompatible-pointer-types
 
 int var __attribute__((overloadable)); // expected-error{{'overloadable' attribute only applies to functions}}
 void params(void) __attribute__((overloadable(12))); // expected-error {{'overloadable' attribute takes no arguments}}
@@ -99,3 +99,26 @@ void conversions() {
   unsigned char *c;
   multi_type(c);
 }
+
+// Ensure that we allow C-specific type conversions in C
+void fn_type_conversions() {
+  void foo(void *c) __attribute__((overloadable));
+  void foo(char *c) __attribute__((overloadable));
+  void (*ptr1)(void *) = &foo;
+  void (*ptr2)(char *) = &foo;
+  void (*ambiguous)(int *) = &foo; // expected-error{{initializing 'void (*)(int *)' with an expression of incompatible type '<overloaded function type>'}} expected-note@105{{candidate function}} expected-note@106{{candidate function}}
+  void *vp_ambiguous = &foo; // expected-error{{initializing 'void *' with an expression of incompatible type '<overloaded function type>'}} expected-note@105{{candidate function}} expected-note@106{{candidate function}}
+
+  void (*specific1)(int *) = (void (*)(void *))&foo; // expected-warning{{incompatible pointer types initializing 'void (*)(int *)' with an expression of type 'void (*)(void *)'}}
+  void *specific2 = (void (*)(void *))&foo;
+
+  void disabled(void *c) __attribute__((overloadable, enable_if(0, "")));
+  void disabled(int *c) __attribute__((overloadable, enable_if(c, "")));
+  void disabled(char *c) __attribute__((overloadable, enable_if(1, "The function name lies.")));
+  // To be clear, these should all point to the last overload of 'disabled'
+  void (*dptr1)(char *c) = &disabled;
+  void (*dptr2)(void *c) = &disabled; // expected-warning{{incompatible pointer types initializing 'void (*)(void *)' with an expression of type '<overloaded function type>'}} expected-note@115{{candidate function made ineligible by enable_if}} expected-note@116{{candidate function made ineligible by enable_if}} expected-note@117{{candidate function has type mismatch at 1st parameter (expected 'void *' but has 'char *')}}
+  void (*dptr3)(int *c) = &disabled; // expected-warning{{incompatible pointer types initializing 'void (*)(int *)' with an expression of type '<overloaded function type>'}} expected-note@115{{candidate function made ineligible by enable_if}} expected-note@116{{candidate function made ineligible by enable_if}} expected-note@117{{candidate function has type mismatch at 1st parameter (expected 'int *' but has 'char *')}}
+
+  void *specific_disabled = &disabled;
+}
diff --git a/test/Sema/pass-object-size.c b/test/Sema/pass-object-size.c
index 6f375c0e94d55..ddfbbd5fc4e11 100644
--- a/test/Sema/pass-object-size.c
+++ b/test/Sema/pass-object-size.c
@@ -38,8 +38,8 @@ void FunctionPtrs() {
   void (*p)(void *) = NotOverloaded; //expected-error{{cannot take address of function 'NotOverloaded' because parameter 1 has pass_object_size attribute}}
   void (*p2)(void *) = &NotOverloaded; //expected-error{{cannot take address of function 'NotOverloaded' because parameter 1 has pass_object_size attribute}}
 
-  void (*p3)(void *) = IsOverloaded; //expected-error{{initializing 'void (*)(void *)' with an expression of incompatible type '<overloaded function type>'}} expected-note@-6{{candidate address cannot be taken because parameter 1 has pass_object_size attribute}} expected-note@-5{{type mismatch}}
-  void (*p4)(void *) = &IsOverloaded; //expected-error{{initializing 'void (*)(void *)' with an expression of incompatible type '<overloaded function type>'}} expected-note@-7{{candidate address cannot be taken because parameter 1 has pass_object_size attribute}} expected-note@-6{{type mismatch}}
+  void (*p3)(void *) = IsOverloaded; //expected-warning{{incompatible pointer types initializing 'void (*)(void *)' with an expression of type '<overloaded function type>'}} expected-note@-6{{candidate address cannot be taken because parameter 1 has pass_object_size attribute}} expected-note@-5{{type mismatch}}
+  void (*p4)(void *) = &IsOverloaded; //expected-warning{{incompatible pointer types initializing 'void (*)(void *)' with an expression of type '<overloaded function type>'}} expected-note@-7{{candidate address cannot be taken because parameter 1 has pass_object_size attribute}} expected-note@-6{{type mismatch}}
 
   void (*p5)(char *) = IsOverloaded;
   void (*p6)(char *) = &IsOverloaded;
diff --git a/test/Sema/pr25786.c b/test/Sema/pr25786.c
new file mode 100644
index 0000000000000..2ce65311a24fe
--- /dev/null
+++ b/test/Sema/pr25786.c
@@ -0,0 +1,12 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -DTEST -fsyntax-only -verify %s
+// RUN: %clang_cc1 -triple i686-unknown-linux-gnu -fsyntax-only -verify %s
+
+#if TEST
+void (__attribute__((regparm(3), stdcall)) *pf) (); //expected-warning {{calling convention 'stdcall' ignored for this target}}
+void (__attribute__((regparm(2), stdcall)) foo)(int a) { //expected-warning {{calling convention 'stdcall' ignored for this target}}
+}
+#else
+//expected-no-diagnostics
+void (__attribute__((regparm(3), stdcall)) *pf) ();
+void (__attribute__((regparm(2), stdcall)) foo)(int a) {}
+#endif
diff --git a/test/Sema/predefined-function.c b/test/Sema/predefined-function.c
index 1c40b6e8c2c0e..aa7b28535bcd5 100644
--- a/test/Sema/predefined-function.c
+++ b/test/Sema/predefined-function.c
@@ -4,14 +4,13 @@ char *funk(int format);
 enum Test {A=-1};
 char *funk(enum Test x);
 
-int eli(float b); // expected-note {{previous declaration is here}} \
-// expected-note{{passing argument to parameter 'b' here}}
+int eli(float b); // expected-note {{previous declaration is here}}
 int b(int c) {return 1;}
 
 int foo();
 int foo() {
   int eli(int (int)); // expected-error {{conflicting types for 'eli'}}
-  eli(b); // expected-error{{passing 'int (int)' to parameter of incompatible type 'float'}}
+  eli(b);
   return 0;
 }
 
diff --git a/test/Sema/preserve-call-conv.c b/test/Sema/preserve-call-conv.c
new file mode 100644
index 0000000000000..f258f45ac5826
--- /dev/null
+++ b/test/Sema/preserve-call-conv.c
@@ -0,0 +1,35 @@
+// RUN: %clang_cc1 %s -fsyntax-only -triple x86_64-unknown-unknown -verify
+// RUN: %clang_cc1 %s -fsyntax-only -triple arm64-unknown-unknown -verify
+typedef void typedef_fun_t(int);
+
+void __attribute__((preserve_most)) foo(void *ptr) {
+}
+
+void __attribute__((preserve_most(1))) foo1(void *ptr) { // expected-error {{'preserve_most' attribute takes no arguments}}
+}
+
+void (__attribute__((preserve_most)) *pfoo1)(void *) = foo;
+
+void (__attribute__((cdecl)) *pfoo2)(void *) = foo; // expected-warning {{incompatible pointer types initializing 'void (*)(void *) __attribute__((cdecl))' with an expression of type 'void (void *) __attribute__((preserve_most))'}}
+void (*pfoo3)(void *) = foo; // expected-warning {{incompatible pointer types initializing 'void (*)(void *)' with an expression of type 'void (void *) __attribute__((preserve_most))'}}
+
+typedef_fun_t typedef_fun_foo; // expected-note {{previous declaration is here}}
+void __attribute__((preserve_most)) typedef_fun_foo(int x) { } // expected-error {{function declared 'preserve_most' here was previously declared without calling convention}}
+
+struct type_test_foo {} __attribute__((preserve_most));  // expected-warning {{'preserve_most' attribute only applies to functions and methods}}
+
+void __attribute__((preserve_all)) boo(void *ptr) {
+}
+
+void __attribute__((preserve_all(1))) boo1(void *ptr) { // expected-error {{'preserve_all' attribute takes no arguments}}
+}
+
+void (__attribute__((preserve_all)) *pboo1)(void *) = boo;
+
+void (__attribute__((cdecl)) *pboo2)(void *) = boo; // expected-warning {{incompatible pointer types initializing 'void (*)(void *) __attribute__((cdecl))' with an expression of type 'void (void *) __attribute__((preserve_all))'}}
+void (*pboo3)(void *) = boo; // expected-warning {{incompatible pointer types initializing 'void (*)(void *)' with an expression of type 'void (void *) __attribute__((preserve_all))'}}
+
+typedef_fun_t typedef_fun_boo; // expected-note {{previous declaration is here}}
+void __attribute__((preserve_all)) typedef_fun_boo(int x) { } // expected-error {{function declared 'preserve_all' here was previously declared without calling convention}}
+
+struct type_test_boo {} __attribute__((preserve_all));  // expected-warning {{'preserve_all' attribute only applies to functions and methods}}
diff --git a/test/Sema/renderscript.rs b/test/Sema/renderscript.rs
new file mode 100644
index 0000000000000..80be5ae424f8c
--- /dev/null
+++ b/test/Sema/renderscript.rs
@@ -0,0 +1,24 @@
+// RUN: %clang_cc1 -fsyntax-only -verify -x renderscript -D__RENDERSCRIPT__ %s
+// RUN: %clang_cc1 -fsyntax-only -verify -x c %s
+
+#ifndef __RENDERSCRIPT__
+// expected-warning@+2 {{'kernel' attribute ignored}}
+#endif
+void __attribute__((kernel)) kernel() {}
+
+#ifndef __RENDERSCRIPT__
+// expected-warning@+4 {{'kernel' attribute ignored}}
+#else
+// expected-warning@+2 {{'kernel' attribute only applies to functions}}
+#endif
+int __attribute__((kernel)) global;
+
+#ifndef __RENDERSCRIPT__
+// expected-error@+2 {{function return value cannot have __fp16 type; did you forget * ?}}
+#endif
+__fp16 fp16_return();
+
+#ifndef __RENDERSCRIPT__
+// expected-error@+2 {{parameters cannot have __fp16 type; did you forget * ?}}
+#endif
+void fp16_arg(__fp16 p);
diff --git a/test/Sema/typo-correction.c b/test/Sema/typo-correction.c
index 4ef50570899c2..a1107a25ee54b 100644
--- a/test/Sema/typo-correction.c
+++ b/test/Sema/typo-correction.c
@@ -55,3 +55,13 @@ void fn2() {
   f(THIS_IS_AN_ERROR, // expected-error {{use of undeclared identifier 'THIS_IS_AN_ERROR'}}
     afunction(afunction_));  // expected-error {{use of undeclared identifier 'afunction_'; did you mean 'afunction'?}}
 }
+
+int d = X ? d : L; // expected-error 2 {{use of undeclared identifier}}
+
+int fn_with_ids() { ID = ID == ID >= ID ; } // expected-error 4 {{use of undeclared identifier}}
+
+int fn_with_rs(int r) { r = TYPO + r * TYPO; } // expected-error 2 {{use of undeclared identifier}}
+
+void fn_with_unknown(int a, int b) {
+  fn_with_unknown(unknown, unknown | unknown); // expected-error 3 {{use of undeclared identifier}}
+}
diff --git a/test/Sema/unused-expr.c b/test/Sema/unused-expr.c
index 09359687d5324..58ad8278f2011 100644
--- a/test/Sema/unused-expr.c
+++ b/test/Sema/unused-expr.c
@@ -76,7 +76,7 @@ void t4(int a) {
 // rdar://7186119
 int t5f(void) __attribute__((warn_unused_result));
 void t5() {
-  t5f();   // expected-warning {{ignoring return value of function declared with warn_unused_result}}
+  t5f();   // expected-warning {{ignoring return value of function declared with 'warn_unused_result' attribute}}
 }
 
 
@@ -88,11 +88,11 @@ int t6() {
   if (fn1() < 0 || fn2(2,1) < 0 || fn3(2) < 0)  // no warnings
     return -1;
 
-  fn1();  // expected-warning {{ignoring return value of function declared with warn_unused_result attribute}}
+  fn1();  // expected-warning {{ignoring return value of function declared with 'warn_unused_result' attribute}}
   fn2(92, 21);  // expected-warning {{ignoring return value of function declared with pure attribute}}
   fn3(42);  // expected-warning {{ignoring return value of function declared with const attribute}}
   __builtin_abs(0); // expected-warning {{ignoring return value of function declared with const attribute}}
-  (void)0, fn1();  // expected-warning {{ignoring return value of function declared with warn_unused_result attribute}}
+  (void)0, fn1();  // expected-warning {{ignoring return value of function declared with 'warn_unused_result' attribute}}
   return 0;
 }
 
@@ -101,7 +101,7 @@ int t7 __attribute__ ((warn_unused_result)); // expected-warning {{'warn_unused_
 // PR4010
 int (*fn4)(void) __attribute__ ((warn_unused_result));
 void t8() {
-  fn4(); // expected-warning {{ignoring return value of function declared with warn_unused_result attribute}}
+  fn4(); // expected-warning {{ignoring return value of function declared with 'warn_unused_result' attribute}}
 }
 
 void t9() __attribute__((warn_unused_result)); // expected-warning {{attribute 'warn_unused_result' cannot be applied to functions without return value}}
diff --git a/test/Sema/varargs-x86-64.c b/test/Sema/varargs-x86-64.c
index d50dd6a6fc194..0929c0d914787 100644
--- a/test/Sema/varargs-x86-64.c
+++ b/test/Sema/varargs-x86-64.c
@@ -21,16 +21,16 @@ void __attribute__((ms_abi)) g1(int a) {
 void __attribute__((ms_abi)) g2(int a, int b, ...) {
   __builtin_ms_va_list ap;
 
-  __builtin_ms_va_start(ap, 10); // expected-warning {{second parameter of 'va_start' not last named argument}}
-  __builtin_ms_va_start(ap, a); // expected-warning {{second parameter of 'va_start' not last named argument}}
+  __builtin_ms_va_start(ap, 10); // expected-warning {{second argument to 'va_start' is not the last named parameter}}
+  __builtin_ms_va_start(ap, a); // expected-warning {{second argument to 'va_start' is not the last named parameter}}
   __builtin_ms_va_start(ap, b);
 }
 
-void __attribute__((ms_abi)) g3(float a, ...) {
+void __attribute__((ms_abi)) g3(float a, ...) { // expected-note 2{{parameter of type 'float' is declared here}}
   __builtin_ms_va_list ap;
 
-  __builtin_ms_va_start(ap, a);
-  __builtin_ms_va_start(ap, (a));
+  __builtin_ms_va_start(ap, a); // expected-warning {{passing an object that undergoes default argument promotion to 'va_start' has undefined behavior}}
+  __builtin_ms_va_start(ap, (a)); // expected-warning {{passing an object that undergoes default argument promotion to 'va_start' has undefined behavior}}
 }
 
 void __attribute__((ms_abi)) g5() {
diff --git a/test/Sema/varargs.c b/test/Sema/varargs.c
index 5329c2e61c98a..457d84c212f7d 100644
--- a/test/Sema/varargs.c
+++ b/test/Sema/varargs.c
@@ -4,7 +4,7 @@
 void f1(int a)
 {
     __builtin_va_list ap;
-    
+
     __builtin_va_start(ap, a, a); // expected-error {{too many arguments to function}}
     __builtin_va_start(ap, a); // expected-error {{'va_start' used in function with fixed args}}
 }
@@ -12,18 +12,17 @@ void f1(int a)
 void f2(int a, int b, ...)
 {
     __builtin_va_list ap;
-    
-    __builtin_va_start(ap, 10); // expected-warning {{second parameter of 'va_start' not last named argument}}
-    __builtin_va_start(ap, a); // expected-warning {{second parameter of 'va_start' not last named argument}}
+
+    __builtin_va_start(ap, 10); // expected-warning {{second argument to 'va_start' is not the last named parameter}}
+    __builtin_va_start(ap, a); // expected-warning {{second argument to 'va_start' is not the last named parameter}}
     __builtin_va_start(ap, b);
 }
 
-void f3(float a, ...)
-{
+void f3(float a, ...) { // expected-note 2{{parameter of type 'float' is declared here}}
     __builtin_va_list ap;
-    
-    __builtin_va_start(ap, a);
-    __builtin_va_start(ap, (a));
+
+    __builtin_va_start(ap, a); // expected-warning {{passing an object that undergoes default argument promotion to 'va_start' has undefined behavior}}
+    __builtin_va_start(ap, (a)); // expected-warning {{passing an object that undergoes default argument promotion to 'va_start' has undefined behavior}}
 }
 
 
@@ -83,3 +82,15 @@ void f10(int a, ...) {
   i = __builtin_va_start(ap, a); // expected-error {{assigning to 'int' from incompatible type 'void'}}
   __builtin_va_end(ap);
 }
+
+void f11(short s, ...) {  // expected-note {{parameter of type 'short' is declared here}}
+  __builtin_va_list ap;
+  __builtin_va_start(ap, s); // expected-warning {{passing an object that undergoes default argument promotion to 'va_start' has undefined behavior}}
+  __builtin_va_end(ap);
+}
+
+void f12(register int i, ...) {  // expected-note {{parameter of type 'int' is declared here}}
+  __builtin_va_list ap;
+  __builtin_va_start(ap, i); // expected-warning {{passing a parameter declared with the 'register' keyword to 'va_start' has undefined behavior}}
+  __builtin_va_end(ap);
+}
diff --git a/test/Sema/varargs.cpp b/test/Sema/varargs.cpp
deleted file mode 100644
index 48a7b2fdf1036..0000000000000
--- a/test/Sema/varargs.cpp
+++ /dev/null
@@ -1,7 +0,0 @@
-// RUN: %clang_cc1 -fsyntax-only -verify %s
-
-class string;
-void f(const string& s, ...) {  // expected-note {{parameter of type 'const string &' is declared here}}
-  __builtin_va_list ap;
-  __builtin_va_start(ap, s); // expected-warning {{'va_start' has undefined behavior with reference types}}
-}
diff --git a/test/Sema/vector-cast.c b/test/Sema/vector-cast.c
index 03db5408c458f..c0382892b6996 100644
--- a/test/Sema/vector-cast.c
+++ b/test/Sema/vector-cast.c
@@ -45,12 +45,25 @@ void f3(t3 Y) {
 }
 
 typedef float float2 __attribute__ ((vector_size (8)));
+typedef __attribute__((vector_size(8))) double float64x1_t;
+typedef __attribute__((vector_size(16))) double float64x2_t;
+float64x1_t vget_low_f64(float64x2_t __p0);
 
 void f4() {
   float2 f2;
-  double d;
+  double d, a, b, c;
+  float64x2_t v = {0.0, 1.0};
   f2 += d;
-  d += f2;
+  a = 3.0 + vget_low_f64(v);
+  b = vget_low_f64(v) + 3.0;
+  c = vget_low_f64(v);
+  // LAX conversions within compound assignments are not supported.
+  // FIXME: This diagnostic is inaccurate.
+  d += f2; // expected-error {{cannot convert between vector values of different size}}
+  c -= vget_low_f64(v); // expected-error {{cannot convert between vector values of different size}}
+  // LAX conversions between scalar and vector types require same size and one element sized vectors.
+  d = f2; // expected-error {{assigning to 'double' from incompatible type 'float2'}}
+  d = d + f2; // expected-error {{assigning to 'double' from incompatible type 'float2'}}
 }
 
 // rdar://15931426
diff --git a/test/Sema/warn-double-promotion.c b/test/Sema/warn-double-promotion.c
index b6fd0c5ec629b..0cf33e84b427b 100644
--- a/test/Sema/warn-double-promotion.c
+++ b/test/Sema/warn-double-promotion.c
@@ -24,7 +24,7 @@ long double ReturnLongDoubleFromDouble(double d) {
   return d;  //expected-warning{{implicit conversion increases floating-point precision: 'double' to 'long double'}}
 }
 
-void Convert(float f, double d, long double ld) {
+void Assignment(float f, double d, long double ld) {
   d = f;  //expected-warning{{implicit conversion increases floating-point precision: 'float' to 'double'}}
   ld = f; //expected-warning{{implicit conversion increases floating-point precision: 'float' to 'long double'}}
   ld = d; //expected-warning{{implicit conversion increases floating-point precision: 'double' to 'long double'}}
@@ -32,3 +32,43 @@ void Convert(float f, double d, long double ld) {
   f = ld;
   d = ld;
 }
+
+extern void DoubleParameter(double);
+extern void LongDoubleParameter(long double);
+
+void ArgumentPassing(float f, double d) {
+  DoubleParameter(f); // expected-warning{{implicit conversion increases floating-point precision: 'float' to 'double'}}
+  LongDoubleParameter(f); // expected-warning{{implicit conversion increases floating-point precision: 'float' to 'long double'}}
+  LongDoubleParameter(d); // expected-warning{{implicit conversion increases floating-point precision: 'double' to 'long double'}}
+}
+
+void BinaryOperator(float f, double d, long double ld) {
+  f = f * d; // expected-warning{{implicit conversion increases floating-point precision: 'float' to 'double'}}
+  f = d * f; // expected-warning{{implicit conversion increases floating-point precision: 'float' to 'double'}}
+  f = f * ld; // expected-warning{{implicit conversion increases floating-point precision: 'float' to 'long double'}}
+  f = ld * f; // expected-warning{{implicit conversion increases floating-point precision: 'float' to 'long double'}}
+  d = d * ld; // expected-warning{{implicit conversion increases floating-point precision: 'double' to 'long double'}}
+  d = ld * d; // expected-warning{{implicit conversion increases floating-point precision: 'double' to 'long double'}}
+}
+
+void MultiplicationAssignment(float f, double d, long double ld) {
+  d *= f; // expected-warning{{implicit conversion increases floating-point precision: 'float' to 'double'}}
+  ld *= f; // expected-warning{{implicit conversion increases floating-point precision: 'float' to 'long double'}}
+  ld *= d; // expected-warning{{implicit conversion increases floating-point precision: 'double' to 'long double'}}
+
+  // FIXME: These cases should produce warnings as above.
+  f *= d;
+  f *= ld;
+  d *= ld;
+}
+
+// FIXME: As with a binary operator, the operands to the conditional operator are
+// converted to a common type and should produce a warning.
+void ConditionalOperator(float f, double d, long double ld, int i) {
+  f = i ? f : d;
+  f = i ? d : f;
+  f = i ? f : ld;
+  f = i ? ld : f;
+  d = i ? d : ld;
+  d = i ? ld : d;
+}
diff --git a/test/Sema/wchar.c b/test/Sema/wchar.c
index 9e41f5329f9a1..74151edede03d 100644
--- a/test/Sema/wchar.c
+++ b/test/Sema/wchar.c
@@ -4,7 +4,7 @@
 typedef __WCHAR_TYPE__ wchar_t;
 
 #if defined(_WIN32) || defined(_M_IX86) || defined(__CYGWIN__) \
- || defined(_M_X64) || defined(__PS4__) || defined(SHORT_WCHAR)
+ || defined(_M_X64) || defined(__ORBIS__) || defined(SHORT_WCHAR)
   #define WCHAR_T_TYPE unsigned short
 #elif defined(__arm) || defined(__aarch64__)
   #define WCHAR_T_TYPE unsigned int
diff --git a/test/Sema/xray-always-instrument-attr.c b/test/Sema/xray-always-instrument-attr.c
new file mode 100644
index 0000000000000..3c063e21a68fe
--- /dev/null
+++ b/test/Sema/xray-always-instrument-attr.c
@@ -0,0 +1,6 @@
+// RUN: %clang_cc1 %s -verify -fsyntax-only -std=c11
+void foo() __attribute__((xray_always_instrument));
+
+struct __attribute__((xray_always_instrument)) a { int x; }; // expected-warning {{'xray_always_instrument' attribute only applies to functions and methods}}
+
+void bar() __attribute__((xray_always_instrument("not-supported"))); // expected-error {{'xray_always_instrument' attribute takes no arguments}}
diff --git a/test/Sema/xray-always-instrument-attr.cpp b/test/Sema/xray-always-instrument-attr.cpp
new file mode 100644
index 0000000000000..8d42837ec6bff
--- /dev/null
+++ b/test/Sema/xray-always-instrument-attr.cpp
@@ -0,0 +1,10 @@
+// RUN: %clang_cc1 %s -verify -fsyntax-only -std=c++11 -x c++
+void foo [[clang::xray_always_instrument]] ();
+
+struct [[clang::xray_always_instrument]] a { int x; }; // expected-warning {{'xray_always_instrument' attribute only applies to functions and methods}}
+
+class b {
+ void c [[clang::xray_always_instrument]] ();
+};
+
+void baz [[clang::xray_always_instrument("not-supported")]] (); // expected-error {{'xray_always_instrument' attribute takes no arguments}}
diff --git a/test/SemaCUDA/Inputs/cuda-initializers.h b/test/SemaCUDA/Inputs/cuda-initializers.h
new file mode 100644
index 0000000000000..837b726a13e0f
--- /dev/null
+++ b/test/SemaCUDA/Inputs/cuda-initializers.h
@@ -0,0 +1,145 @@
+// CUDA struct types with interesting initialization properties.
+// Keep in sync with ../CodeGenCUDA/Inputs/cuda-initializers.h.
+
+// Base classes with different initializer variants.
+
+// trivial constructor -- allowed
+struct T {
+  int t;
+};
+
+// empty constructor
+struct EC {
+  int ec;
+  __device__ EC() {}     // -- allowed
+  __device__ EC(int) {}  // -- not allowed
+};
+
+// empty destructor
+struct ED {
+  __device__ ~ED() {}     // -- allowed
+};
+
+struct ECD {
+  __device__ ECD() {}     // -- allowed
+  __device__ ~ECD() {}    // -- allowed
+};
+
+// empty templated constructor -- allowed with no arguments
+struct ETC {
+  template <typename... T> __device__ ETC(T...) {}
+};
+
+// undefined constructor -- not allowed
+struct UC {
+  int uc;
+  __device__ UC();
+};
+
+// undefined destructor -- not allowed
+struct UD {
+  int ud;
+  __device__ ~UD();
+};
+
+// empty constructor w/ initializer list -- not allowed
+struct ECI {
+  int eci;
+  __device__ ECI() : eci(1) {}
+};
+
+// non-empty constructor -- not allowed
+struct NEC {
+  int nec;
+  __device__ NEC() { nec = 1; }
+};
+
+// non-empty destructor -- not allowed
+struct NED {
+  int ned;
+  __device__ ~NED() { ned = 1; }
+};
+
+// no-constructor,  virtual method -- not allowed
+struct NCV {
+  int ncv;
+  __device__ virtual void vm() {}
+};
+
+// virtual destructor -- not allowed.
+struct VD {
+  __device__ virtual ~VD() {}
+};
+
+// dynamic in-class field initializer -- not allowed
+__device__ int f();
+struct NCF {
+  int ncf = f();
+};
+
+// static in-class field initializer.  NVCC does not allow it, but
+// clang generates static initializer for this, so we'll accept it.
+// We still can't use it on __shared__ vars as they don't allow *any*
+// initializers.
+struct NCFS {
+  int ncfs = 3;
+};
+
+// undefined templated constructor -- not allowed
+struct UTC {
+  template <typename... T> __device__ UTC(T...);
+};
+
+// non-empty templated constructor -- not allowed
+struct NETC {
+  int netc;
+  template <typename... T> __device__ NETC(T...) { netc = 1; }
+};
+
+// Regular base class -- allowed
+struct T_B_T : T {};
+
+// Incapsulated object of allowed class -- allowed
+struct T_F_T {
+  T t;
+};
+
+// array of allowed objects -- allowed
+struct T_FA_T {
+  T t[2];
+};
+
+
+// Calling empty base class initializer is OK
+struct EC_I_EC : EC {
+  __device__ EC_I_EC() : EC() {}
+};
+
+// .. though passing arguments is not allowed.
+struct EC_I_EC1 : EC {
+  __device__ EC_I_EC1() : EC(1) {}
+};
+
+// Virtual base class -- not allowed
+struct T_V_T : virtual T {};
+
+// Inherited from or incapsulated class with non-empty constructor --
+// not allowed
+struct T_B_NEC : NEC {};
+struct T_F_NEC {
+  NEC nec;
+};
+struct T_FA_NEC {
+  NEC nec[2];
+};
+
+
+// Inherited from or incapsulated class with non-empty desstructor --
+// not allowed
+struct T_B_NED : NED {};
+struct T_F_NED {
+  NED ned;
+};
+struct T_FA_NED {
+  NED ned[2];
+};
diff --git a/test/SemaCUDA/Inputs/overload.h b/test/SemaCUDA/Inputs/overload.h
new file mode 100644
index 0000000000000..1c021f1ec572c
--- /dev/null
+++ b/test/SemaCUDA/Inputs/overload.h
@@ -0,0 +1,8 @@
+// This header is used by tests which are interested in __device__ functions
+// which appear in a system header.
+
+__device__ int OverloadMe();
+
+namespace ns {
+using ::OverloadMe;
+}
diff --git a/test/SemaCUDA/addr-of-overloaded-fn.cu b/test/SemaCUDA/addr-of-overloaded-fn.cu
new file mode 100644
index 0000000000000..03c7f7c3bd5b7
--- /dev/null
+++ b/test/SemaCUDA/addr-of-overloaded-fn.cu
@@ -0,0 +1,24 @@
+// expected-no-diagnostics
+
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fsyntax-only -verify %s
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fsyntax-only -fcuda-is-device -verify %s
+
+#include "Inputs/cuda.h"
+
+__host__ void overload() {}
+__device__ void overload() {}
+
+__host__ __device__ void test_hd() {
+  // This should not be ambiguous -- we choose the host or the device overload
+  // depending on whether or not we're compiling for host or device.
+  void (*x)() = overload;
+}
+
+// These also shouldn't be ambiguous, but they're an easier test than the HD
+// function above.
+__host__ void test_host() {
+  void (*x)() = overload;
+}
+__device__ void test_device() {
+  void (*x)() = overload;
+}
diff --git a/test/SemaCUDA/alias.cu b/test/SemaCUDA/alias.cu
new file mode 100644
index 0000000000000..39251ed10ecda
--- /dev/null
+++ b/test/SemaCUDA/alias.cu
@@ -0,0 +1,11 @@
+// RUN: %clang_cc1 -triple nvptx-unknown-cuda -fsyntax-only -fcuda-is-device -verify -DEXPECT_ERR %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fsyntax-only -verify %s
+
+// The alias attribute is not allowed in CUDA device code.
+void bar();
+__attribute__((alias("bar"))) void foo();
+#ifdef EXPECT_ERR
+// expected-error@-2 {{CUDA does not support aliases}}
+#else
+// expected-no-diagnostics
+#endif
diff --git a/test/SemaCUDA/bad-attributes.cu b/test/SemaCUDA/bad-attributes.cu
index 7e01e141de1d2..4cb43e240b049 100644
--- a/test/SemaCUDA/bad-attributes.cu
+++ b/test/SemaCUDA/bad-attributes.cu
@@ -4,8 +4,8 @@
 //
 // You should be able to run this file through nvcc for compatibility testing.
 //
-// RUN: %clang_cc1 -fsyntax-only -verify %s
-// RUN: %clang_cc1 -fcuda-is-device -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fsyntax-only -Wcuda-compat -verify -DEXPECT_INLINE_WARNING %s
+// RUN: %clang_cc1 -fcuda-is-device -fsyntax-only -Wcuda-compat -verify %s
 
 #include "Inputs/cuda.h"
 
@@ -47,3 +47,15 @@ __global__ __device__ void z11();  // expected-error {{attributes are not compat
 // expected-note@-1 {{conflicting attribute is here}}
 __global__ __host__ void z12();  // expected-error {{attributes are not compatible}}
 // expected-note@-1 {{conflicting attribute is here}}
+
+struct S {
+  __global__ void foo() {};  // expected-error {{must be a free function or static member function}}
+  __global__ static void bar(); // expected-warning {{kernel function 'bar' is a member function}}
+  // Although this is implicitly inline, we shouldn't warn.
+  __global__ static void baz() {}; // expected-warning {{kernel function 'baz' is a member function}}
+};
+
+__global__ static inline void foobar() {};
+#ifdef EXPECT_INLINE_WARNING
+// expected-warning@-2 {{ignored 'inline' attribute on kernel function 'foobar'}}
+#endif
diff --git a/test/SemaCUDA/builtins.cu b/test/SemaCUDA/builtins.cu
index 32b575862cfef..814fda2ac7d34 100644
--- a/test/SemaCUDA/builtins.cu
+++ b/test/SemaCUDA/builtins.cu
@@ -7,10 +7,10 @@
 // REQUIRES: nvptx-registered-target
 // RUN: %clang_cc1 -triple x86_64-unknown-unknown \
 // RUN:     -aux-triple nvptx64-unknown-cuda \
-// RUN:     -fcuda-target-overloads -fsyntax-only -verify %s
+// RUN:     -fsyntax-only -verify %s
 // RUN: %clang_cc1 -triple nvptx64-unknown-cuda -fcuda-is-device \
 // RUN:     -aux-triple x86_64-unknown-unknown \
-// RUN:     -fcuda-target-overloads -fsyntax-only -verify %s
+// RUN:     -fsyntax-only -verify %s
 
 #if !(defined(__amd64__) && defined(__PTX__))
 #error "Expected to see preprocessor macros from both sides of compilation."
@@ -18,13 +18,13 @@
 
 void hf() {
   int x = __builtin_ia32_rdtsc();
-  int y = __builtin_ptx_read_tid_x(); // expected-note  {{'__builtin_ptx_read_tid_x' declared here}}
-  // expected-error@-1 {{reference to __device__ function '__builtin_ptx_read_tid_x' in __host__ function}}
+  int y = __nvvm_read_ptx_sreg_tid_x(); // expected-note  {{'__nvvm_read_ptx_sreg_tid_x' declared here}}
+  // expected-error@-1 {{reference to __device__ function '__nvvm_read_ptx_sreg_tid_x' in __host__ function}}
   x = __builtin_abs(1);
 }
 
 __attribute__((device)) void df() {
-  int x = __builtin_ptx_read_tid_x();
+  int x = __nvvm_read_ptx_sreg_tid_x();
   int y = __builtin_ia32_rdtsc(); // expected-error {{reference to __host__ function '__builtin_ia32_rdtsc' in __device__ function}}
                                   // expected-note@20 {{'__builtin_ia32_rdtsc' declared here}}
   x = __builtin_abs(1);
diff --git a/test/SemaCUDA/call-overloaded-destructor.cu b/test/SemaCUDA/call-overloaded-destructor.cu
new file mode 100644
index 0000000000000..24b0e7d330ea1
--- /dev/null
+++ b/test/SemaCUDA/call-overloaded-destructor.cu
@@ -0,0 +1,17 @@
+// expected-no-diagnostics
+
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fsyntax-only -verify %s
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fsyntax-only -fcuda-is-device -verify %s
+
+#include "Inputs/cuda.h"
+
+struct S {
+  __host__ ~S() {}
+  __device__ ~S() {}
+};
+
+__host__ __device__ void test() {
+  S s;
+  // This should not crash clang.
+  s.~S();
+}
diff --git a/test/SemaCUDA/cuda-builtin-vars.cu b/test/SemaCUDA/cuda-builtin-vars.cu
index 97c5111cebdc6..108e75cae7667 100644
--- a/test/SemaCUDA/cuda-builtin-vars.cu
+++ b/test/SemaCUDA/cuda-builtin-vars.cu
@@ -34,20 +34,20 @@ void kernel(int *out) {
 
   out[i++] = warpSize;
   warpSize = 0; // expected-error {{cannot assign to variable 'warpSize' with const-qualified type 'const int'}}
-  // expected-note@cuda_builtin_vars.h:104 {{variable 'warpSize' declared const here}}
+  // expected-note@cuda_builtin_vars.h:* {{variable 'warpSize' declared const here}}
 
   // Make sure we can't construct or assign to the special variables.
   __cuda_builtin_threadIdx_t x; // expected-error {{calling a private constructor of class '__cuda_builtin_threadIdx_t'}}
-  // expected-note@cuda_builtin_vars.h:67 {{declared private here}}
+  // expected-note@cuda_builtin_vars.h:* {{declared private here}}
 
   __cuda_builtin_threadIdx_t y = threadIdx; // expected-error {{calling a private constructor of class '__cuda_builtin_threadIdx_t'}}
-  // expected-note@cuda_builtin_vars.h:67 {{declared private here}}
+  // expected-note@cuda_builtin_vars.h:* {{declared private here}}
 
   threadIdx = threadIdx; // expected-error {{'operator=' is a private member of '__cuda_builtin_threadIdx_t'}}
-  // expected-note@cuda_builtin_vars.h:67 {{declared private here}}
+  // expected-note@cuda_builtin_vars.h:* {{declared private here}}
 
   void *ptr = &threadIdx; // expected-error {{'operator&' is a private member of '__cuda_builtin_threadIdx_t'}}
-  // expected-note@cuda_builtin_vars.h:67 {{declared private here}}
+  // expected-note@cuda_builtin_vars.h:* {{declared private here}}
 
   // Following line should've caused an error as one is not allowed to
   // take address of a built-in variable in CUDA. Alas there's no way
diff --git a/test/SemaCUDA/cxx11-kernel-call.cu b/test/SemaCUDA/cxx11-kernel-call.cu
new file mode 100644
index 0000000000000..0d801256330c2
--- /dev/null
+++ b/test/SemaCUDA/cxx11-kernel-call.cu
@@ -0,0 +1,10 @@
+// RUN: %clang_cc1 -std=c++11 -fsyntax-only -verify %s
+
+#include "Inputs/cuda.h"
+
+__global__ void k1() {}
+
+template<int ...Dimensions> void k1Wrapper() {
+  void (*f)() = [] { k1<<<Dimensions, Dimensions>>>(); }; // expected-error {{initializer contains unexpanded parameter pack 'Dimensions'}}
+  void (*g[])() = { [] { k1<<<Dimensions, Dimensions>>>(); } ... }; // ok
+}
diff --git a/test/SemaCUDA/device-var-init.cu b/test/SemaCUDA/device-var-init.cu
new file mode 100644
index 0000000000000..d807a51b65357
--- /dev/null
+++ b/test/SemaCUDA/device-var-init.cu
@@ -0,0 +1,215 @@
+// REQUIRES: nvptx-registered-target
+
+// Make sure we don't allow dynamic initialization for device
+// variables, but accept empty constructors allowed by CUDA.
+
+// RUN: %clang_cc1 -verify %s -triple nvptx64-nvidia-cuda -fcuda-is-device -std=c++11 %s
+
+#ifdef __clang__
+#include "Inputs/cuda.h"
+#endif
+
+// Use the types we share with CodeGen tests.
+#include "Inputs/cuda-initializers.h"
+
+__shared__ int s_v_i = 1;
+// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+
+__device__ int d_v_f = f();
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+__shared__ int s_v_f = f();
+// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+__constant__ int c_v_f = f();
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+
+__shared__ T s_t_i = {2};
+// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+
+__device__ EC d_ec_i(3);
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+__shared__ EC s_ec_i(3);
+// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+__constant__ EC c_ec_i(3);
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+
+__device__ EC d_ec_i2 = {3};
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+__shared__ EC s_ec_i2 = {3};
+// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+__constant__ EC c_ec_i2 = {3};
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+
+__device__ ETC d_etc_i(3);
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+__shared__ ETC s_etc_i(3);
+// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+__constant__ ETC c_etc_i(3);
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+
+__device__ ETC d_etc_i2 = {3};
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+__shared__ ETC s_etc_i2 = {3};
+// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+__constant__ ETC c_etc_i2 = {3};
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+
+__device__ UC d_uc;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+__shared__ UC s_uc;
+// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+__constant__ UC c_uc;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+
+__device__ UD d_ud;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+__shared__ UD s_ud;
+// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+__constant__ UD c_ud;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+
+__device__ ECI d_eci;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+__shared__ ECI s_eci;
+// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+__constant__ ECI c_eci;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+
+__device__ NEC d_nec;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+__shared__ NEC s_nec;
+// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+__constant__ NEC c_nec;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+
+__device__ NED d_ned;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+__shared__ NED s_ned;
+// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+__constant__ NED c_ned;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+
+__device__ NCV d_ncv;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+__shared__ NCV s_ncv;
+// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+__constant__ NCV c_ncv;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+
+__device__ VD d_vd;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+__shared__ VD s_vd;
+// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+__constant__ VD c_vd;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+
+__device__ NCF d_ncf;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+__shared__ NCF s_ncf;
+// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+__constant__ NCF c_ncf;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+
+__shared__ NCFS s_ncfs;
+// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+
+__device__ UTC d_utc;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+__shared__ UTC s_utc;
+// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+__constant__ UTC c_utc;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+
+__device__ UTC d_utc_i(3);
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+__shared__ UTC s_utc_i(3);
+// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+__constant__ UTC c_utc_i(3);
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+
+__device__ NETC d_netc;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+__shared__ NETC s_netc;
+// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+__constant__ NETC c_netc;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+
+__device__ NETC d_netc_i(3);
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+__shared__ NETC s_netc_i(3);
+// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+__constant__ NETC c_netc_i(3);
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+
+__device__ EC_I_EC1 d_ec_i_ec1;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+__shared__ EC_I_EC1 s_ec_i_ec1;
+// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+__constant__ EC_I_EC1 c_ec_i_ec1;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+
+__device__ T_V_T d_t_v_t;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+__shared__ T_V_T s_t_v_t;
+// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+__constant__ T_V_T c_t_v_t;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+
+__device__ T_B_NEC d_t_b_nec;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+__shared__ T_B_NEC s_t_b_nec;
+// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+__constant__ T_B_NEC c_t_b_nec;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+
+__device__ T_F_NEC d_t_f_nec;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+__shared__ T_F_NEC s_t_f_nec;
+// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+__constant__ T_F_NEC c_t_f_nec;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+
+__device__ T_FA_NEC d_t_fa_nec;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+__shared__ T_FA_NEC s_t_fa_nec;
+// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+__constant__ T_FA_NEC c_t_fa_nec;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+
+__device__ T_B_NED d_t_b_ned;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+__shared__ T_B_NED s_t_b_ned;
+// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+__constant__ T_B_NED c_t_b_ned;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+
+__device__ T_F_NED d_t_f_ned;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+__shared__ T_F_NED s_t_f_ned;
+// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+__constant__ T_F_NED c_t_f_ned;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+
+__device__ T_FA_NED d_t_fa_ned;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+__shared__ T_FA_NED s_t_fa_ned;
+// expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+__constant__ T_FA_NED c_t_fa_ned;
+// expected-error@-1 {{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}}
+
+// Verify that only __shared__ local variables may be static on device
+// side and that they are not allowed to be initialized.
+__device__ void df_sema() {
+  static __shared__ NCFS s_ncfs;
+  // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+  static __shared__ UC s_uc;
+  // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+  static __shared__ NED s_ned;
+  // expected-error@-1 {{initialization is not supported for __shared__ variables.}}
+
+  static __device__ int ds;
+  // expected-error@-1 {{Within a __device__/__global__ function, only __shared__ variables may be marked "static"}}
+  static __constant__ int dc;
+  // expected-error@-1 {{Within a __device__/__global__ function, only __shared__ variables may be marked "static"}}
+  static int v;
+  // expected-error@-1 {{Within a __device__/__global__ function, only __shared__ variables may be marked "static"}}
+}
diff --git a/test/SemaCUDA/function-overload.cu b/test/SemaCUDA/function-overload.cu
index bd3fb508bfab8..3c78600b174e3 100644
--- a/test/SemaCUDA/function-overload.cu
+++ b/test/SemaCUDA/function-overload.cu
@@ -1,237 +1,206 @@
 // REQUIRES: x86-registered-target
 // REQUIRES: nvptx-registered-target
 
-// Make sure we handle target overloads correctly.
-// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu \
-// RUN:    -fsyntax-only -fcuda-target-overloads -verify %s
-// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda \
-// RUN:    -fsyntax-only -fcuda-target-overloads -fcuda-is-device -verify %s
-
-// Check target overloads handling with disabled call target checks.
-// RUN: %clang_cc1 -DNOCHECKS -triple x86_64-unknown-linux-gnu -fsyntax-only \
-// RUN:    -fcuda-disable-target-call-checks -fcuda-target-overloads -verify %s
-// RUN: %clang_cc1 -DNOCHECKS -triple nvptx64-nvidia-cuda -fsyntax-only \
-// RUN:    -fcuda-disable-target-call-checks -fcuda-target-overloads \
-// RUN:    -fcuda-is-device -verify %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fsyntax-only -verify %s
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fsyntax-only -fcuda-is-device -verify %s
 
 #include "Inputs/cuda.h"
 
-typedef int (*fp_t)(void);
-typedef void (*gp_t)(void);
+// Opaque return types used to check that we pick the right overloads.
+struct HostReturnTy {};
+struct HostReturnTy2 {};
+struct DeviceReturnTy {};
+struct DeviceReturnTy2 {};
+struct HostDeviceReturnTy {};
+struct TemplateReturnTy {};
+
+typedef HostReturnTy (*HostFnPtr)();
+typedef DeviceReturnTy (*DeviceFnPtr)();
+typedef HostDeviceReturnTy (*HostDeviceFnPtr)();
+typedef void (*GlobalFnPtr)();  // __global__ functions must return void.
+
+// CurrentReturnTy is {HostReturnTy,DeviceReturnTy} during {host,device}
+// compilation.
+#ifdef __CUDA_ARCH__
+typedef DeviceReturnTy CurrentReturnTy;
+#else
+typedef HostReturnTy CurrentReturnTy;
+#endif
 
-// Host and unattributed functions can't be overloaded
-__host__ int hh(void) { return 1; } // expected-note {{previous definition is here}}
-int hh(void) { return 1; } // expected-error {{redefinition of 'hh'}}
+// CurrentFnPtr is a function pointer to a {host,device} function during
+// {host,device} compilation.
+typedef CurrentReturnTy (*CurrentFnPtr)();
 
-// H/D overloading is OK
-__host__ int dh(void) { return 2; }
-__device__ int dh(void) { return 2; }
+// Host and unattributed functions can't be overloaded.
+__host__ void hh() {} // expected-note {{previous definition is here}}
+void hh() {} // expected-error {{redefinition of 'hh'}}
+
+// H/D overloading is OK.
+__host__ HostReturnTy dh() { return HostReturnTy(); }
+__device__ DeviceReturnTy dh() { return DeviceReturnTy(); }
 
-// H/HD and D/HD are not allowed
-__host__ __device__ int hdh(void) { return 5; } // expected-note {{previous definition is here}}
-__host__ int hdh(void) { return 4; } // expected-error {{redefinition of 'hdh'}}
+// H/HD and D/HD are not allowed.
+__host__ __device__ int hdh() { return 0; } // expected-note {{previous definition is here}}
+__host__ int hdh() { return 0; }            // expected-error {{redefinition of 'hdh'}}
 
-__host__ int hhd(void) { return 4; } // expected-note {{previous definition is here}}
-__host__ __device__ int hhd(void) { return 5; } // expected-error {{redefinition of 'hhd'}}
+__host__ int hhd() { return 0; }            // expected-note {{previous definition is here}}
+__host__ __device__ int hhd() { return 0; } // expected-error {{redefinition of 'hhd'}}
 // expected-warning@-1 {{attribute declaration must precede definition}}
 // expected-note@-3 {{previous definition is here}}
 
-__host__ __device__ int hdd(void) { return 7; } // expected-note {{previous definition is here}}
-__device__ int hdd(void) { return 6; } // expected-error {{redefinition of 'hdd'}}
+__host__ __device__ int hdd() { return 0; } // expected-note {{previous definition is here}}
+__device__ int hdd() { return 0; }          // expected-error {{redefinition of 'hdd'}}
 
-__device__ int dhd(void) { return 6; } // expected-note {{previous definition is here}}
-__host__ __device__ int dhd(void) { return 7; } // expected-error {{redefinition of 'dhd'}}
+__device__ int dhd() { return 0; }          // expected-note {{previous definition is here}}
+__host__ __device__ int dhd() { return 0; } // expected-error {{redefinition of 'dhd'}}
 // expected-warning@-1 {{attribute declaration must precede definition}}
 // expected-note@-3 {{previous definition is here}}
 
-// Same tests for extern "C" functions
-extern "C" __host__ int chh(void) {return 11;} // expected-note {{previous definition is here}}
-extern "C" int chh(void) {return 11;} // expected-error {{redefinition of 'chh'}}
+// Same tests for extern "C" functions.
+extern "C" __host__ int chh() { return 0; } // expected-note {{previous definition is here}}
+extern "C" int chh() { return 0; }          // expected-error {{redefinition of 'chh'}}
 
-// H/D overloading is OK
-extern "C" __device__ int cdh(void) {return 10;}
-extern "C" __host__ int cdh(void) {return 11;}
+// H/D overloading is OK.
+extern "C" __device__ DeviceReturnTy cdh() { return DeviceReturnTy(); }
+extern "C" __host__ HostReturnTy cdh() { return HostReturnTy(); }
 
 // H/HD and D/HD overloading is not allowed.
-extern "C" __host__ __device__ int chhd1(void) {return 12;} // expected-note {{previous definition is here}}
-extern "C" __host__ int chhd1(void) {return 13;} // expected-error {{redefinition of 'chhd1'}}
+extern "C" __host__ __device__ int chhd1() { return 0; } // expected-note {{previous definition is here}}
+extern "C" __host__ int chhd1() { return 0; }            // expected-error {{redefinition of 'chhd1'}}
 
-extern "C" __host__ int chhd2(void) {return 13;} // expected-note {{previous definition is here}}
-extern "C" __host__ __device__ int chhd2(void) {return 12;} // expected-error {{redefinition of 'chhd2'}}
+extern "C" __host__ int chhd2() { return 0; }            // expected-note {{previous definition is here}}
+extern "C" __host__ __device__ int chhd2() { return 0; } // expected-error {{redefinition of 'chhd2'}}
 // expected-warning@-1 {{attribute declaration must precede definition}}
 // expected-note@-3 {{previous definition is here}}
 
 // Helper functions to verify calling restrictions.
-__device__ int d(void) { return 8; }
-__host__ int h(void) { return 9; }
-__global__ void g(void) {}
-extern "C" __device__ int cd(void) {return 10;}
-extern "C" __host__ int ch(void) {return 11;}
-
-__host__ void hostf(void) {
-  fp_t dp = d;
-  fp_t cdp = cd;
-#if !defined(NOCHECKS)
-  // expected-error@-3 {{reference to __device__ function 'd' in __host__ function}}
-  // expected-note@65 {{'d' declared here}}
-  // expected-error@-4 {{reference to __device__ function 'cd' in __host__ function}}
-  // expected-note@68 {{'cd' declared here}}
-#endif
-  fp_t hp = h;
-  fp_t chp = ch;
-  fp_t dhp = dh;
-  fp_t cdhp = cdh;
-  gp_t gp = g;
-
-  d();
-  cd();
-#if !defined(NOCHECKS)
-  // expected-error@-3 {{no matching function for call to 'd'}}
-  // expected-note@65 {{candidate function not viable: call to __device__ function from __host__ function}}
-  // expected-error@-4 {{no matching function for call to 'cd'}}
-  // expected-note@68 {{candidate function not viable: call to __device__ function from __host__ function}}
-#endif
-  h();
-  ch();
-  dh();
-  cdh();
+__device__ DeviceReturnTy d() { return DeviceReturnTy(); }
+// expected-note@-1 1+ {{'d' declared here}}
+// expected-note@-2 1+ {{candidate function not viable: call to __device__ function from __host__ function}}
+// expected-note@-3 0+ {{candidate function not viable: call to __device__ function from __host__ __device__ function}}
+
+__host__ HostReturnTy h() { return HostReturnTy(); }
+// expected-note@-1 1+ {{'h' declared here}}
+// expected-note@-2 1+ {{candidate function not viable: call to __host__ function from __device__ function}}
+// expected-note@-3 0+ {{candidate function not viable: call to __host__ function from __host__ __device__ function}}
+// expected-note@-4 1+ {{candidate function not viable: call to __host__ function from __global__ function}}
+
+__global__ void g() {}
+// expected-note@-1 1+ {{'g' declared here}}
+// expected-note@-2 1+ {{candidate function not viable: call to __global__ function from __device__ function}}
+// expected-note@-3 0+ {{candidate function not viable: call to __global__ function from __host__ __device__ function}}
+// expected-note@-4 1+ {{candidate function not viable: call to __global__ function from __global__ function}}
+
+extern "C" __device__ DeviceReturnTy cd() { return DeviceReturnTy(); }
+// expected-note@-1 1+ {{'cd' declared here}}
+// expected-note@-2 1+ {{candidate function not viable: call to __device__ function from __host__ function}}
+// expected-note@-3 0+ {{candidate function not viable: call to __device__ function from __host__ __device__ function}}
+
+extern "C" __host__ HostReturnTy ch() { return HostReturnTy(); }
+// expected-note@-1 1+ {{'ch' declared here}}
+// expected-note@-2 1+ {{candidate function not viable: call to __host__ function from __device__ function}}
+// expected-note@-3 0+ {{candidate function not viable: call to __host__ function from __host__ __device__ function}}
+// expected-note@-4 1+ {{candidate function not viable: call to __host__ function from __global__ function}}
+
+__host__ void hostf() {
+  DeviceFnPtr fp_d = d;         // expected-error {{reference to __device__ function 'd' in __host__ function}}
+  DeviceReturnTy ret_d = d();   // expected-error {{no matching function for call to 'd'}}
+  DeviceFnPtr fp_cd = cd;       // expected-error {{reference to __device__ function 'cd' in __host__ function}}
+  DeviceReturnTy ret_cd = cd(); // expected-error {{no matching function for call to 'cd'}}
+
+  HostFnPtr fp_h = h;
+  HostReturnTy ret_h = h();
+  HostFnPtr fp_ch = ch;
+  HostReturnTy ret_ch = ch();
+
+  HostFnPtr fp_dh = dh;
+  HostReturnTy ret_dh = dh();
+  HostFnPtr fp_cdh = cdh;
+  HostReturnTy ret_cdh = cdh();
+
+  GlobalFnPtr fp_g = g;
   g(); // expected-error {{call to global function g not configured}}
-  g<<<0,0>>>();
+  g<<<0, 0>>>();
 }
 
+__device__ void devicef() {
+  DeviceFnPtr fp_d = d;
+  DeviceReturnTy ret_d = d();
+  DeviceFnPtr fp_cd = cd;
+  DeviceReturnTy ret_cd = cd();
 
-__device__ void devicef(void) {
-  fp_t dp = d;
-  fp_t cdp = cd;
-  fp_t hp = h;
-  fp_t chp = ch;
-#if !defined(NOCHECKS)
-  // expected-error@-3 {{reference to __host__ function 'h' in __device__ function}}
-  // expected-note@66 {{'h' declared here}}
-  // expected-error@-4 {{reference to __host__ function 'ch' in __device__ function}}
-  // expected-note@69 {{'ch' declared here}}
-#endif
-  fp_t dhp = dh;
-  fp_t cdhp = cdh;
-  gp_t gp = g; // expected-error {{reference to __global__ function 'g' in __device__ function}}
-               // expected-note@67 {{'g' declared here}}
-
-  d();
-  cd();
-  h();
-  ch();
-#if !defined(NOCHECKS)
-  // expected-error@-3 {{no matching function for call to 'h'}}
-  // expected-note@66 {{candidate function not viable: call to __host__ function from __device__ function}}
-  // expected-error@-4 {{no matching function for call to 'ch'}}
-  // expected-note@69 {{candidate function not viable: call to __host__ function from __device__ function}}
-#endif
-  dh();
-  cdh();
+  HostFnPtr fp_h = h;         // expected-error {{reference to __host__ function 'h' in __device__ function}}
+  HostReturnTy ret_h = h();   // expected-error {{no matching function for call to 'h'}}
+  HostFnPtr fp_ch = ch;       // expected-error {{reference to __host__ function 'ch' in __device__ function}}
+  HostReturnTy ret_ch = ch(); // expected-error {{no matching function for call to 'ch'}}
+
+  DeviceFnPtr fp_dh = dh;
+  DeviceReturnTy ret_dh = dh();
+  DeviceFnPtr fp_cdh = cdh;
+  DeviceReturnTy ret_cdh = cdh();
+
+  GlobalFnPtr fp_g = g; // expected-error {{reference to __global__ function 'g' in __device__ function}}
   g(); // expected-error {{no matching function for call to 'g'}}
-  // expected-note@67 {{candidate function not viable: call to __global__ function from __device__ function}}
   g<<<0,0>>>(); // expected-error {{reference to __global__ function 'g' in __device__ function}}
-  // expected-note@67 {{'g' declared here}}
 }
 
-__global__ void globalf(void) {
-  fp_t dp = d;
-  fp_t cdp = cd;
-  fp_t hp = h;
-  fp_t chp = ch;
-#if !defined(NOCHECKS)
-  // expected-error@-3 {{reference to __host__ function 'h' in __global__ function}}
-  // expected-note@66 {{'h' declared here}}
-  // expected-error@-4 {{reference to __host__ function 'ch' in __global__ function}}
-  // expected-note@69 {{'ch' declared here}}
-#endif
-  fp_t dhp = dh;
-  fp_t cdhp = cdh;
-  gp_t gp = g; // expected-error {{reference to __global__ function 'g' in __global__ function}}
-               // expected-note@67 {{'g' declared here}}
-
-  d();
-  cd();
-  h();
-  ch();
-#if !defined(NOCHECKS)
-  // expected-error@-3 {{no matching function for call to 'h'}}
-  // expected-note@66 {{candidate function not viable: call to __host__ function from __global__ function}}
-  // expected-error@-4 {{no matching function for call to 'ch'}}
-  // expected-note@69 {{candidate function not viable: call to __host__ function from __global__ function}}
-#endif
-  dh();
-  cdh();
+__global__ void globalf() {
+  DeviceFnPtr fp_d = d;
+  DeviceReturnTy ret_d = d();
+  DeviceFnPtr fp_cd = cd;
+  DeviceReturnTy ret_cd = cd();
+
+  HostFnPtr fp_h = h;         // expected-error {{reference to __host__ function 'h' in __global__ function}}
+  HostReturnTy ret_h = h();   // expected-error {{no matching function for call to 'h'}}
+  HostFnPtr fp_ch = ch;       // expected-error {{reference to __host__ function 'ch' in __global__ function}}
+  HostReturnTy ret_ch = ch(); // expected-error {{no matching function for call to 'ch'}}
+
+  DeviceFnPtr fp_dh = dh;
+  DeviceReturnTy ret_dh = dh();
+  DeviceFnPtr fp_cdh = cdh;
+  DeviceReturnTy ret_cdh = cdh();
+
+  GlobalFnPtr fp_g = g; // expected-error {{reference to __global__ function 'g' in __global__ function}}
   g(); // expected-error {{no matching function for call to 'g'}}
-  // expected-note@67 {{candidate function not viable: call to __global__ function from __global__ function}}
   g<<<0,0>>>(); // expected-error {{reference to __global__ function 'g' in __global__ function}}
-  // expected-note@67 {{'g' declared here}}
 }
 
-__host__ __device__ void hostdevicef(void) {
-  fp_t dp = d;
-  fp_t cdp = cd;
-  fp_t hp = h;
-  fp_t chp = ch;
-#if !defined(NOCHECKS)
-#if !defined(__CUDA_ARCH__)
-  // expected-error@-6 {{reference to __device__ function 'd' in __host__ __device__ function}}
-  // expected-note@65 {{'d' declared here}}
-  // expected-error@-7 {{reference to __device__ function 'cd' in __host__ __device__ function}}
-  // expected-note@68 {{'cd' declared here}}
-#else
-  // expected-error@-9 {{reference to __host__ function 'h' in __host__ __device__ function}}
-  // expected-note@66 {{'h' declared here}}
-  // expected-error@-10 {{reference to __host__ function 'ch' in __host__ __device__ function}}
-  // expected-note@69 {{'ch' declared here}}
-#endif
-#endif
-  fp_t dhp = dh;
-  fp_t cdhp = cdh;
-  gp_t gp = g;
+__host__ __device__ void hostdevicef() {
+  DeviceFnPtr fp_d = d;
+  DeviceReturnTy ret_d = d();
+  DeviceFnPtr fp_cd = cd;
+  DeviceReturnTy ret_cd = cd();
+
+  HostFnPtr fp_h = h;
+  HostReturnTy ret_h = h();
+  HostFnPtr fp_ch = ch;
+  HostReturnTy ret_ch = ch();
+
+  CurrentFnPtr fp_dh = dh;
+  CurrentReturnTy ret_dh = dh();
+  CurrentFnPtr fp_cdh = cdh;
+  CurrentReturnTy ret_cdh = cdh();
+
+  GlobalFnPtr fp_g = g;
 #if defined(__CUDA_ARCH__)
   // expected-error@-2 {{reference to __global__ function 'g' in __host__ __device__ function}}
-  // expected-note@67 {{'g' declared here}}
-#endif
-
-  d();
-  cd();
-  h();
-  ch();
-#if !defined(NOCHECKS)
-#if !defined(__CUDA_ARCH__)
-  // expected-error@-6 {{no matching function for call to 'd'}}
-  // expected-note@65 {{candidate function not viable: call to __device__ function from __host__ __device__ function}}
-  // expected-error@-7 {{no matching function for call to 'cd'}}
-  // expected-note@68 {{candidate function not viable: call to __device__ function from __host__ __device__ function}}
-#else
-  // expected-error@-9 {{no matching function for call to 'h'}}
-  // expected-note@66 {{candidate function not viable: call to __host__ function from __host__ __device__ function}}
-  // expected-error@-10 {{no matching function for call to 'ch'}}
-  // expected-note@69 {{candidate function not viable: call to __host__ function from __host__ __device__ function}}
-#endif
 #endif
-
-  dh();
-  cdh();
   g();
   g<<<0,0>>>();
 #if !defined(__CUDA_ARCH__)
   // expected-error@-3 {{call to global function g not configured}}
 #else
   // expected-error@-5 {{no matching function for call to 'g'}}
-  // expected-note@67 {{candidate function not viable: call to __global__ function from __host__ __device__ function}}
-  // expected-error@-6 {{reference to __global__ function 'g' in __host__ __device__ function}}
-  // expected-note@67 {{'g' declared here}}
+  // expected-error@-5 {{reference to __global__ function 'g' in __host__ __device__ function}}
 #endif  // __CUDA_ARCH__
 }
 
 // Test for address of overloaded function resolution in the global context.
-fp_t hp = h;
-fp_t chp = ch;
-fp_t dhp = dh;
-fp_t cdhp = cdh;
-gp_t gp = g;
+HostFnPtr fp_h = h;
+HostFnPtr fp_ch = ch;
+CurrentFnPtr fp_dh = dh;
+CurrentFnPtr fp_cdh = cdh;
+GlobalFnPtr fp_g = g;
 
 
 // Test overloading of destructors
@@ -315,3 +284,98 @@ struct m_hdd {
   __host__ __device__ void operator delete(void *ptr) {} // expected-note {{previous declaration is here}}
   __device__ void operator delete(void *ptr) {} // expected-error {{class member cannot be redeclared}}
 };
+
+// __global__ functions can't be overloaded based on attribute
+// difference.
+struct G {
+  friend void friend_of_g(G &arg);
+private:
+  int x;
+};
+__global__ void friend_of_g(G &arg) { int x = arg.x; } // expected-note {{previous definition is here}}
+void friend_of_g(G &arg) { int x = arg.x; } // expected-error {{redefinition of 'friend_of_g'}}
+
+// HD functions are sometimes allowed to call H or D functions -- this
+// is an artifact of the source-to-source splitting performed by nvcc
+// that we need to mimic. During device mode compilation in nvcc, host
+// functions aren't present at all, so don't participate in
+// overloading. But in clang, H and D functions are present in both
+// compilation modes. Clang normally uses the target attribute as a
+// tiebreaker between overloads with otherwise identical priority, but
+// in order to match nvcc's behavior, we sometimes need to wholly
+// discard overloads that would not be present during compilation
+// under nvcc.
+
+template <typename T> TemplateReturnTy template_vs_function(T arg) {
+  return TemplateReturnTy();
+}
+__device__ DeviceReturnTy template_vs_function(float arg) {
+  return DeviceReturnTy();
+}
+
+// Here we expect to call the templated function during host compilation, even
+// if -fcuda-disable-target-call-checks is passed, and even though C++ overload
+// rules prefer the non-templated function.
+__host__ __device__ void test_host_device_calls_template(void) {
+#ifdef __CUDA_ARCH__
+  typedef DeviceReturnTy ExpectedReturnTy;
+#else
+  typedef TemplateReturnTy ExpectedReturnTy;
+#endif
+
+  ExpectedReturnTy ret1 = template_vs_function(1.0f);
+  ExpectedReturnTy ret2 = template_vs_function(2.0);
+}
+
+// Calls from __host__ and __device__ functions should always call the
+// overloaded function that matches their mode.
+__host__ void test_host_calls_template_fn() {
+  TemplateReturnTy ret1 = template_vs_function(1.0f);
+  TemplateReturnTy ret2 = template_vs_function(2.0);
+}
+
+__device__ void test_device_calls_template_fn() {
+  DeviceReturnTy ret1 = template_vs_function(1.0f);
+  DeviceReturnTy ret2 = template_vs_function(2.0);
+}
+
+// If we have a mix of HD and H-only or D-only candidates in the overload set,
+// normal C++ overload resolution rules apply first.
+template <typename T> TemplateReturnTy template_vs_hd_function(T arg) {
+  return TemplateReturnTy();
+}
+__host__ __device__ HostDeviceReturnTy template_vs_hd_function(float arg) {
+  return HostDeviceReturnTy();
+}
+
+__host__ __device__ void test_host_device_calls_hd_template() {
+  HostDeviceReturnTy ret1 = template_vs_hd_function(1.0f);
+  TemplateReturnTy ret2 = template_vs_hd_function(1);
+}
+
+__host__ void test_host_calls_hd_template() {
+  HostDeviceReturnTy ret1 = template_vs_hd_function(1.0f);
+  TemplateReturnTy ret2 = template_vs_hd_function(1);
+}
+
+__device__ void test_device_calls_hd_template() {
+  HostDeviceReturnTy ret1 = template_vs_hd_function(1.0f);
+  // Host-only function template is not callable with strict call checks,
+  // so for device side HD function will be the only choice.
+  HostDeviceReturnTy ret2 = template_vs_hd_function(1);
+}
+
+// Check that overloads still work the same way on both host and
+// device side when the overload set contains only functions from one
+// side of compilation.
+__device__ DeviceReturnTy device_only_function(int arg) { return DeviceReturnTy(); }
+__device__ DeviceReturnTy2 device_only_function(float arg) { return DeviceReturnTy2(); }
+__host__ HostReturnTy host_only_function(int arg) { return HostReturnTy(); }
+__host__ HostReturnTy2 host_only_function(float arg) { return HostReturnTy2(); }
+
+__host__ __device__ void test_host_device_single_side_overloading() {
+  DeviceReturnTy ret1 = device_only_function(1);
+  DeviceReturnTy2 ret2 = device_only_function(1.0f);
+  HostReturnTy ret3 = host_only_function(1);
+  HostReturnTy2 ret4 = host_only_function(1.0f);
+}
diff --git a/test/SemaCUDA/function-target-disabled-check.cu b/test/SemaCUDA/function-target-disabled-check.cu
deleted file mode 100644
index 979d4edbf8926..0000000000000
--- a/test/SemaCUDA/function-target-disabled-check.cu
+++ /dev/null
@@ -1,26 +0,0 @@
-// Test that we can disable cross-target call checks in Sema with the
-// -fcuda-disable-target-call-checks flag. Without this flag we'd get a bunch
-// of errors here, since there are invalid cross-target calls present.
-
-// RUN: %clang_cc1 -fsyntax-only -verify %s -fcuda-disable-target-call-checks
-// RUN: %clang_cc1 -fsyntax-only -fcuda-is-device -verify %s -fcuda-disable-target-call-checks
-
-// expected-no-diagnostics
-
-#define __device__ __attribute__((device))
-#define __global__ __attribute__((global))
-#define __host__ __attribute__((host))
-
-__attribute__((host)) void h1();
-
-__attribute__((device)) void d1() {
-  h1();
-}
-
-__attribute__((host)) void h2() {
-  d1();
-}
-
-__attribute__((global)) void g1() {
-  h2();
-}
diff --git a/test/SemaCUDA/function-target-hd.cu b/test/SemaCUDA/function-target-hd.cu
deleted file mode 100644
index 685f4f9cda628..0000000000000
--- a/test/SemaCUDA/function-target-hd.cu
+++ /dev/null
@@ -1,71 +0,0 @@
-// Test the Sema analysis of caller-callee relationships of host device
-// functions when compiling CUDA code. There are 4 permutations of this test as
-// host and device compilation are separate compilation passes, and clang has
-// an option to allow host calls from host device functions. __CUDA_ARCH__ is
-// defined when compiling for the device and TEST_WARN_HD when host calls are
-// allowed from host device functions. So for example, if __CUDA_ARCH__ is
-// defined and TEST_WARN_HD is not then device compilation is happening but
-// host device functions are not allowed to call device functions.
-
-// RUN: %clang_cc1 -fsyntax-only -verify %s
-// RUN: %clang_cc1 -fsyntax-only -fcuda-is-device -triple nvptx-unknown-cuda -verify %s
-// RUN: %clang_cc1 -fsyntax-only -fcuda-allow-host-calls-from-host-device -verify %s -DTEST_WARN_HD
-// RUN: %clang_cc1 -fsyntax-only -fcuda-is-device -triple nvptx-unknown-cuda -fcuda-allow-host-calls-from-host-device -verify %s -DTEST_WARN_HD
-
-#include "Inputs/cuda.h"
-
-__host__ void hd1h(void);
-#if defined(__CUDA_ARCH__) && !defined(TEST_WARN_HD)
-// expected-note@-2 {{candidate function not viable: call to __host__ function from __host__ __device__ function}}
-#endif
-__device__ void hd1d(void);
-#ifndef __CUDA_ARCH__
-// expected-note@-2 {{candidate function not viable: call to __device__ function from __host__ __device__ function}}
-#endif
-__host__ void hd1hg(void);
-__device__ void hd1dg(void);
-#ifdef __CUDA_ARCH__
-__host__ void hd1hig(void);
-#if !defined(TEST_WARN_HD)
-// expected-note@-2 {{candidate function not viable: call to __host__ function from __host__ __device__ function}}
-#endif
-#else
-__device__ void hd1dig(void); // expected-note {{candidate function not viable: call to __device__ function from __host__ __device__ function}}
-#endif
-__host__ __device__ void hd1hd(void);
-__global__ void hd1g(void); // expected-note {{'hd1g' declared here}}
-
-__host__ __device__ void hd1(void) {
-#if defined(TEST_WARN_HD) && defined(__CUDA_ARCH__)
-// expected-warning@-2 {{calling __host__ function hd1h from __host__ __device__ function hd1}}
-// expected-warning@-3 {{calling __host__ function hd1hig from __host__ __device__ function hd1}}
-#endif
-  hd1d();
-#ifndef __CUDA_ARCH__
-// expected-error@-2 {{no matching function}}
-#endif
-  hd1h();
-#if defined(__CUDA_ARCH__) && !defined(TEST_WARN_HD)
-// expected-error@-2 {{no matching function}}
-#endif
-
-  // No errors as guarded
-#ifdef __CUDA_ARCH__
-  hd1d();
-#else
-  hd1h();
-#endif
-
-  // Errors as incorrectly guarded
-#ifndef __CUDA_ARCH__
-  hd1dig(); // expected-error {{no matching function}}
-#else
-  hd1hig();
-#ifndef TEST_WARN_HD
-// expected-error@-2 {{no matching function}}
-#endif
-#endif
-
-  hd1hd();
-  hd1g<<<1, 1>>>(); // expected-error {{reference to __global__ function 'hd1g' in __host__ __device__ function}}
-}
diff --git a/test/SemaCUDA/host-device-constexpr.cu b/test/SemaCUDA/host-device-constexpr.cu
new file mode 100644
index 0000000000000..6625d722c194a
--- /dev/null
+++ b/test/SemaCUDA/host-device-constexpr.cu
@@ -0,0 +1,69 @@
+// RUN: %clang_cc1 -std=c++11 -fsyntax-only -verify -isystem %S/Inputs %s
+// RUN: %clang_cc1 -std=c++11 -fsyntax-only -verify -isystem %S/Inputs %s -fcuda-is-device
+
+#include "Inputs/cuda.h"
+
+// Declares one function and pulls it into namespace ns:
+//
+//   __device__ int OverloadMe();
+//   namespace ns { using ::OverloadMe; }
+//
+// Clang cares that this is done in a system header.
+#include <overload.h>
+
+// Opaque type used to determine which overload we're invoking.
+struct HostReturnTy {};
+
+// These shouldn't become host+device because they already have attributes.
+__host__ constexpr int HostOnly() { return 0; }
+// expected-note@-1 0+ {{not viable}}
+__device__ constexpr int DeviceOnly() { return 0; }
+// expected-note@-1 0+ {{not viable}}
+
+constexpr int HostDevice() { return 0; }
+
+// This should be a host-only function, because there's a previous __device__
+// overload in <overload.h>.
+constexpr HostReturnTy OverloadMe() { return HostReturnTy(); }
+
+namespace ns {
+// The "using" statement in overload.h should prevent OverloadMe from being
+// implicitly host+device.
+constexpr HostReturnTy OverloadMe() { return HostReturnTy(); }
+}  // namespace ns
+
+// This is an error, because NonSysHdrOverload was not defined in a system
+// header.
+__device__ int NonSysHdrOverload() { return 0; }
+// expected-note@-1 {{conflicting __device__ function declared here}}
+constexpr int NonSysHdrOverload() { return 0; }
+// expected-error@-1 {{constexpr function 'NonSysHdrOverload' without __host__ or __device__ attributes}}
+
+// Variadic device functions are not allowed, so this is just treated as
+// host-only.
+constexpr void Variadic(const char*, ...);
+// expected-note@-1 {{call to __host__ function from __device__ function}}
+
+__host__ void HostFn() {
+  HostOnly();
+  DeviceOnly(); // expected-error {{no matching function}}
+  HostReturnTy x = OverloadMe();
+  HostReturnTy y = ns::OverloadMe();
+  Variadic("abc", 42);
+}
+
+__device__ void DeviceFn() {
+  HostOnly(); // expected-error {{no matching function}}
+  DeviceOnly();
+  int x = OverloadMe();
+  int y = ns::OverloadMe();
+  Variadic("abc", 42); // expected-error {{no matching function}}
+}
+
+__host__ __device__ void HostDeviceFn() {
+#ifdef __CUDA_ARCH__
+  int y = OverloadMe();
+#else
+  constexpr HostReturnTy y = OverloadMe();
+#endif
+}
diff --git a/test/SemaCUDA/implicit-intrinsic.cu b/test/SemaCUDA/implicit-intrinsic.cu
index 0793d64b1017a..dba26c553912e 100644
--- a/test/SemaCUDA/implicit-intrinsic.cu
+++ b/test/SemaCUDA/implicit-intrinsic.cu
@@ -1,7 +1,5 @@
 // RUN: %clang_cc1 -triple nvptx64-unknown-unknown -fcuda-is-device \
 // RUN:     -fsyntax-only -verify %s
-// RUN: %clang_cc1 -triple nvptx64-unknown-unknown -fcuda-is-device \
-// RUN:     -fcuda-target-overloads -fsyntax-only -verify %s
 
 #include "Inputs/cuda.h"
 
diff --git a/test/SemaCUDA/implicit-member-target-collision-cxx11.cu b/test/SemaCUDA/implicit-member-target-collision-cxx11.cu
index f038c376ff3cb..7aa1dd3f20880 100644
--- a/test/SemaCUDA/implicit-member-target-collision-cxx11.cu
+++ b/test/SemaCUDA/implicit-member-target-collision-cxx11.cu
@@ -74,13 +74,11 @@ struct B4_with_device_copy_ctor {
 struct C4_with_collision : A4_with_host_copy_ctor, B4_with_device_copy_ctor {
 };
 
-// expected-note@-3 {{candidate constructor (the implicit default constructor}} not viable
-// expected-note@-4 {{implicit copy constructor inferred target collision}}
-// expected-note@-5 {{candidate constructor (the implicit copy constructor}} not viable
+// expected-note@-3 {{copy constructor of 'C4_with_collision' is implicitly deleted because base class 'B4_with_device_copy_ctor' has no copy constructor}}
 
 void hostfoo4() {
   C4_with_collision c;
-  C4_with_collision c2 = c; // expected-error {{no matching constructor}}
+  C4_with_collision c2 = c; // expected-error {{call to implicitly-deleted copy constructor of 'C4_with_collision'}}
 }
 
 //------------------------------------------------------------------------------
diff --git a/test/SemaCUDA/implicit-member-target.cu b/test/SemaCUDA/implicit-member-target.cu
index 6064560f2c6fd..242d345fb93ec 100644
--- a/test/SemaCUDA/implicit-member-target.cu
+++ b/test/SemaCUDA/implicit-member-target.cu
@@ -60,13 +60,14 @@ struct A3_with_device_ctors {
 
 struct B3_with_implicit_ctors : A3_with_device_ctors {
 };
+// expected-note@-2 2{{call to __device__ function from __host__ function}}
+// expected-note@-3 {{default constructor}}
 
-// expected-note@-3 {{copy constructor of 'B3_with_implicit_ctors' is implicitly deleted}}
 
 void hostfoo3() {
   B3_with_implicit_ctors b;  // this is OK because the inferred default ctor
                              // here is __host__
-  B3_with_implicit_ctors b2 = b; // expected-error {{call to implicitly-deleted copy constructor}}
+  B3_with_implicit_ctors b2 = b; // expected-error {{no matching constructor}}
 
 }
 
diff --git a/test/SemaCUDA/kernel-call.cu b/test/SemaCUDA/kernel-call.cu
index 9a3d86c47fa07..47d7762f59ea1 100644
--- a/test/SemaCUDA/kernel-call.cu
+++ b/test/SemaCUDA/kernel-call.cu
@@ -23,4 +23,6 @@ int main(void) {
 
   int (*fp)(int) = h2;
   fp<<<1, 1>>>(42); // expected-error {{must have void return type}}
+
+  g1<<<undeclared, 1>>>(42); // expected-error {{use of undeclared identifier 'undeclared'}}
 }
diff --git a/test/SemaCUDA/method-target.cu b/test/SemaCUDA/method-target.cu
index 4fa290719cc1c..3fabfc359b7a6 100644
--- a/test/SemaCUDA/method-target.cu
+++ b/test/SemaCUDA/method-target.cu
@@ -44,7 +44,7 @@ struct S4 {
 };
 
 __host__ __device__ void foo4(S4& s) {
-  s.method(); // expected-error {{reference to __device__ function 'method' in __host__ __device__ function}}
+  s.method();
 }
 
 //------------------------------------------------------------------------------
diff --git a/test/SemaCUDA/no-host-device-constexpr.cu b/test/SemaCUDA/no-host-device-constexpr.cu
new file mode 100644
index 0000000000000..c70d97d61e47b
--- /dev/null
+++ b/test/SemaCUDA/no-host-device-constexpr.cu
@@ -0,0 +1,20 @@
+// RUN: %clang_cc1 -std=c++11 -fsyntax-only -fno-cuda-host-device-constexpr -verify %s
+// RUN: %clang_cc1 -std=c++11 -fsyntax-only -fno-cuda-host-device-constexpr -fcuda-is-device -verify %s
+
+#include "Inputs/cuda.h"
+
+// Check that, with -fno-cuda-host-device-constexpr, constexpr functions are
+// host-only, and __device__ constexpr functions are still device-only.
+
+constexpr int f() { return 0; } // expected-note {{not viable}}
+__device__ constexpr int g() { return 0; } // expected-note {{not viable}}
+
+void __device__ foo() {
+  f(); // expected-error {{no matching function}}
+  g();
+}
+
+void __host__ foo() {
+  f();
+  g(); // expected-error {{no matching function}}
+}
diff --git a/test/SemaCUDA/overloaded-delete.cu b/test/SemaCUDA/overloaded-delete.cu
new file mode 100644
index 0000000000000..e582fedb0aa88
--- /dev/null
+++ b/test/SemaCUDA/overloaded-delete.cu
@@ -0,0 +1,25 @@
+// expected-no-diagnostics
+
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fsyntax-only -verify %s
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fsyntax-only -fcuda-is-device -verify %s
+
+#include "Inputs/cuda.h"
+
+struct S {
+  __host__ static void operator delete(void*, size_t) {}
+  __device__ static void operator delete(void*, size_t) {}
+};
+
+__host__ __device__ void test(S* s) {
+  // This shouldn't be ambiguous -- we call the host overload in host mode and
+  // the device overload in device mode.
+  delete s;
+}
+
+__host__ void operator delete(void *ptr) {}
+__device__ void operator delete(void *ptr) {}
+
+__host__ __device__ void test_global_delete(int *ptr) {
+  // Again, there should be no ambiguity between which operator delete we call.
+  ::delete ptr;
+}
diff --git a/test/SemaCUDA/pr27778.cu b/test/SemaCUDA/pr27778.cu
new file mode 100644
index 0000000000000..101965bb98a26
--- /dev/null
+++ b/test/SemaCUDA/pr27778.cu
@@ -0,0 +1,6 @@
+// RUN: %clang_cc1 -fsyntax-only %s
+
+#include "Inputs/cuda.h"
+
+const int constint = 512;
+__launch_bounds__(constint) void TestConstInt(void) {}
diff --git a/test/SemaCUDA/vararg.cu b/test/SemaCUDA/vararg.cu
new file mode 100644
index 0000000000000..34ef367d89820
--- /dev/null
+++ b/test/SemaCUDA/vararg.cu
@@ -0,0 +1,57 @@
+// REQUIRES: x86-registered-target
+// REQUIRES: nvptx-registered-target
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fcuda-is-device -fsyntax-only \
+// RUN:   -verify -DEXPECT_VA_ARG_ERR -DEXPECT_VARARG_ERR %s
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fcuda-is-device -fsyntax-only \
+// RUN:   -fcuda-allow-variadic-functions -verify -DEXPECT_VA_ARG_ERR %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fsyntax-only -verify \
+// RUN:   -DEXPECT_VARARG_ERR %s
+
+#include <stdarg.h>
+#include "Inputs/cuda.h"
+
+__device__ void foo() {
+  va_list list;
+  va_arg(list, int);
+#ifdef EXPECT_VA_ARG_ERR
+  // expected-error@-2 {{CUDA device code does not support va_arg}}
+#endif
+}
+
+void bar() {
+  va_list list;
+  va_arg(list, int);  // OK: host-only
+}
+
+__device__ void baz() {
+#if !defined(__CUDA_ARCH__)
+  va_list list;
+  va_arg(list, int);  // OK: only seen when compiling for host
+#endif
+}
+
+__device__ void vararg(const char* x, ...) {}
+#ifdef EXPECT_VARARG_ERR
+// expected-error@-2 {{CUDA device code does not support variadic functions}}
+#endif
+
+template <typename T>
+__device__ void vararg(T t, ...) {}
+#ifdef EXPECT_VARARG_ERR
+// expected-error@-2 {{CUDA device code does not support variadic functions}}
+#endif
+
+extern "C" __device__ int printf(const char* fmt, ...);  // OK, special case.
+
+// Definition of printf not allowed.
+extern "C" __device__ int printf(const char* fmt, ...) { return 0; }
+#ifdef EXPECT_VARARG_ERR
+// expected-error@-2 {{CUDA device code does not support variadic functions}}
+#endif
+
+namespace ns {
+__device__ int printf(const char* fmt, ...);
+#ifdef EXPECT_VARARG_ERR
+// expected-error@-2 {{CUDA device code does not support variadic functions}}
+#endif
+}
diff --git a/test/SemaCXX/MicrosoftExtensions.cpp b/test/SemaCXX/MicrosoftExtensions.cpp
index 22cf2be7c1ac9..e10deadf7c7f5 100644
--- a/test/SemaCXX/MicrosoftExtensions.cpp
+++ b/test/SemaCXX/MicrosoftExtensions.cpp
@@ -1,5 +1,7 @@
-// RUN: %clang_cc1 %s -triple i686-pc-win32 -fsyntax-only -Wmicrosoft -Wc++11-extensions -Wno-long-long -verify -fms-extensions -fexceptions -fcxx-exceptions
+// RUN: %clang_cc1 %s -triple i686-pc-win32 -fsyntax-only -Wmicrosoft -Wc++11-extensions -Wno-long-long -verify -fms-extensions -fexceptions -fcxx-exceptions -DTEST1
+// RUN: %clang_cc1 %s -triple i686-pc-win32 -fsyntax-only -Wmicrosoft -Wc++11-extensions -Wno-long-long -verify -fexceptions -fcxx-exceptions -DTEST2
 
+#if TEST1
 
 // Microsoft doesn't validate exception specification.
 namespace microsoft_exception_spec {
@@ -80,7 +82,73 @@ struct M {
 // __unaligned handling
 typedef char __unaligned *aligned_type;
 typedef struct UnalignedTag { int f; } __unaligned *aligned_type2;
+typedef char __unaligned aligned_type3;
 
+struct aligned_type4 {
+  int i;
+};
+
+__unaligned int aligned_type4::*p1_aligned_type4 = &aligned_type4::i;
+int aligned_type4::* __unaligned p2_aligned_type4 = &aligned_type4::i;
+__unaligned int aligned_type4::* __unaligned p3_aligned_type4 = &aligned_type4::i;
+void (aligned_type4::*__unaligned p4_aligned_type4)();
+
+// Check that __unaligned qualifier can be used for overloading
+void foo_unaligned(int *arg) {}
+void foo_unaligned(__unaligned int *arg) {}
+void foo_unaligned(int arg) {} // expected-note {{previous definition is here}}
+void foo_unaligned(__unaligned int arg) {} // expected-error {{redefinition of 'foo_unaligned'}}
+class A_unaligned {};
+class B_unaligned : public A_unaligned {};
+int foo_unaligned(__unaligned A_unaligned *arg) { return 0; }
+void *foo_unaligned(B_unaligned *arg) { return 0; }
+
+void test_unaligned() {
+  int *p1 = 0;
+  foo_unaligned(p1);
+
+  __unaligned int *p2 = 0;
+  foo_unaligned(p2);
+
+  __unaligned B_unaligned *p3 = 0;
+  int p4 = foo_unaligned(p3);
+
+  B_unaligned *p5 = p3; // expected-error {{cannot initialize a variable of type 'B_unaligned *' with an lvalue of type '__unaligned B_unaligned *'}}
+
+  __unaligned B_unaligned *p6 = p3;
+
+  p1_aligned_type4 = p2_aligned_type4;
+  p2_aligned_type4 = p1_aligned_type4; // expected-error {{assigning to 'int aligned_type4::*' from incompatible type '__unaligned int aligned_type4::*'}}
+  p3_aligned_type4 = p1_aligned_type4;
+
+  __unaligned int a[10];
+  int *b = a; // expected-error {{cannot initialize a variable of type 'int *' with an lvalue of type '__unaligned int [10]'}}
+}
+
+// Test from PR27367
+// We should accept assignment of an __unaligned pointer to a non-__unaligned
+// pointer to void
+typedef struct _ITEMIDLIST { int i; } ITEMIDLIST;
+typedef ITEMIDLIST __unaligned *LPITEMIDLIST;
+extern "C" __declspec(dllimport) void __stdcall CoTaskMemFree(void* pv);
+__inline void FreeIDListArray(LPITEMIDLIST *ppidls) {
+  CoTaskMemFree(*ppidls);
+  __unaligned int *x = 0;
+  void *y = x;
+}
+
+// Test from PR27666
+// We should accept type conversion of __unaligned to non-__unaligned references
+typedef struct in_addr {
+public:
+  in_addr(in_addr &a) {} // expected-note {{candidate constructor not viable: no known conversion from '__unaligned IN_ADDR *' (aka '__unaligned in_addr *') to 'in_addr &' for 1st argument; dereference the argument with *}}
+  in_addr(in_addr *a) {} // expected-note {{candidate constructor not viable: 1st argument ('__unaligned IN_ADDR *' (aka '__unaligned in_addr *')) would lose __unaligned qualifier}}
+} IN_ADDR;
+
+void f(IN_ADDR __unaligned *a) {
+  IN_ADDR local_addr = *a;
+  IN_ADDR local_addr2 = a; // expected-error {{no viable conversion from '__unaligned IN_ADDR *' (aka '__unaligned in_addr *') to 'IN_ADDR' (aka 'in_addr')}}
+}
 
 template<typename T> void h1(T (__stdcall M::* const )()) { }
 
@@ -420,3 +488,15 @@ struct S {
 
 int S::fn() { return 0; } // expected-warning {{is missing exception specification}}
 }
+
+#elif TEST2
+
+// Check that __unaligned is not recognized if MS extensions are not enabled
+typedef char __unaligned *aligned_type; // expected-error {{expected ';' after top level declarator}}
+
+#else
+
+#error Unknown test mode
+
+#endif
+
diff --git a/test/SemaCXX/PR10177.cpp b/test/SemaCXX/PR10177.cpp
index e361ff37bc032..9286e29351679 100644
--- a/test/SemaCXX/PR10177.cpp
+++ b/test/SemaCXX/PR10177.cpp
@@ -54,6 +54,7 @@ namespace N {
 
 namespace { template<typename> extern int n; }
 template<typename T> int g() { return n<int>; }
+namespace { extern template int n<int>; }
 
 #endif
 
diff --git a/test/SemaCXX/access.cpp b/test/SemaCXX/access.cpp
index 5fa1509c53021..29a58a1388acf 100644
--- a/test/SemaCXX/access.cpp
+++ b/test/SemaCXX/access.cpp
@@ -158,3 +158,14 @@ namespace LocalExternVar {
 
   int array[sizeof(test::private_struct)]; // expected-error {{private}}
 }
+
+namespace ThisLambdaIsNotMyFriend {
+  class A {
+    friend class D;
+    static void foo(); // expected-note {{here}}
+  };
+  template <class T> void foo() {
+    []() { A::foo(); }(); // expected-error {{private}}
+  }
+  void bar() { foo<void>(); }
+}
diff --git a/test/SemaCXX/aggregate-initialization.cpp b/test/SemaCXX/aggregate-initialization.cpp
index 4e4177463f885..ddaf33fc1bfad 100644
--- a/test/SemaCXX/aggregate-initialization.cpp
+++ b/test/SemaCXX/aggregate-initialization.cpp
@@ -1,4 +1,6 @@
 // RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s 
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++14 %s 
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++1z %s 
 
 // Verify that using an initializer list for a non-aggregate looks for
 // constructors..
@@ -11,7 +13,7 @@ struct NonAggr1 { // expected-note 2 {{candidate constructor}}
 };
 
 struct Base { };
-struct NonAggr2 : public Base { // expected-note 3 {{candidate constructor}}
+struct NonAggr2 : public Base { // expected-note 0-3 {{candidate constructor}}
   int m;
 };
 
@@ -25,9 +27,15 @@ struct NonAggr4 { // expected-note 3 {{candidate constructor}}
 };
 
 NonAggr1 na1 = { 17 }; // expected-error{{no matching constructor for initialization of 'NonAggr1'}}
-NonAggr2 na2 = { 17 }; // expected-error{{no matching constructor for initialization of 'NonAggr2'}}
+NonAggr2 na2 = { 17 };
 NonAggr3 na3 = { 17 }; // expected-error{{no matching constructor for initialization of 'NonAggr3'}}
 NonAggr4 na4 = { 17 }; // expected-error{{no matching constructor for initialization of 'NonAggr4'}}
+#if __cplusplus <= 201402L
+// expected-error@-4{{no matching constructor for initialization of 'NonAggr2'}}
+#else
+// expected-error@-6{{requires explicit braces}}
+NonAggr2 na2b = { {}, 17 }; // ok
+#endif
 
 // PR5817
 typedef int type[][2];
@@ -82,3 +90,59 @@ public:
 };
 
 AggAgg aggagg = { 1, 2, 3, 4 };
+
+namespace diff_cpp14_dcl_init_aggr_example {
+  struct derived;
+  struct base {
+    friend struct derived;
+  private:
+    base();
+  };
+  struct derived : base {};
+
+  derived d1{};
+#if __cplusplus > 201402L
+  // expected-error@-2 {{private}}
+  // expected-note@-7 {{here}}
+#endif
+  derived d2;
+}
+
+namespace ProtectedBaseCtor {
+  // FIXME: It's unclear whether f() and g() should be valid in C++1z. What is
+  // the object expression in a constructor call -- the base class subobject or
+  // the complete object?
+  struct A {
+  protected:
+    A();
+  };
+
+  struct B : public A {
+    friend B f();
+    friend B g();
+    friend B h();
+  };
+
+  B f() { return {}; }
+#if __cplusplus > 201402L
+  // expected-error@-2 {{protected default constructor}}
+  // expected-note@-12 {{here}}
+#endif
+
+  B g() { return {{}}; }
+#if __cplusplus <= 201402L
+  // expected-error@-2 {{no matching constructor}}
+  // expected-note@-15 3{{candidate}}
+#else
+  // expected-error@-5 {{protected default constructor}}
+  // expected-note@-21 {{here}}
+#endif
+
+  B h() { return {A{}}; }
+#if __cplusplus <= 201402L
+  // expected-error@-2 {{no matching constructor}}
+  // expected-note@-24 3{{candidate}}
+#endif
+  // expected-error@-5 {{protected constructor}}
+  // expected-note@-30 {{here}}
+}
diff --git a/test/SemaCXX/alias-template.cpp b/test/SemaCXX/alias-template.cpp
index bcfe428c69dd8..b6256103ef8d2 100644
--- a/test/SemaCXX/alias-template.cpp
+++ b/test/SemaCXX/alias-template.cpp
@@ -35,8 +35,8 @@ namespace VariableLengthArrays {
   template<typename Z> using T = int[n]; // expected-error {{variable length array declaration not allowed at file scope}}
 
   const int m = 42;
-  template<typename Z> using U = int[m]; // expected-note {{previous definition}}
-  template<typename Z> using U = int[42]; // ok
+  template<typename Z> using U = int[m];
+  template<typename Z> using U = int[42]; // expected-note {{previous definition}} 
   template<typename Z> using U = int; // expected-error {{type alias template redefinition with different types ('int' vs 'int [42]')}}
 }
 
diff --git a/test/SemaCXX/anonymous-struct.cpp b/test/SemaCXX/anonymous-struct.cpp
index 1b5dc13cea015..b584f89ff447a 100644
--- a/test/SemaCXX/anonymous-struct.cpp
+++ b/test/SemaCXX/anonymous-struct.cpp
@@ -1,7 +1,12 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++98 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
 
 struct S {
-  S();  // expected-note {{because type 'S' has a user-provided default constructor}}
+  S();
+#if __cplusplus <= 199711L
+  // expected-note@-2 {{because type 'S' has a user-provided default constructor}}
+#endif
 };
 
 struct { // expected-error {{anonymous structs and classes must be class members}}
@@ -9,15 +14,25 @@ struct { // expected-error {{anonymous structs and classes must be class members
 
 struct E {
   struct {
-    S x;  // expected-error {{anonymous struct member 'x' has a non-trivial constructor}}
+    S x;
+#if __cplusplus <= 199711L
+    // expected-error@-2 {{anonymous struct member 'x' has a non-trivial constructor}}
+#endif
   };
   static struct {
   };
 };
 
 template <class T> void foo(T);
-typedef struct { // expected-note {{use a tag name here to establish linkage prior to definition}} expected-note {{declared here}}
+typedef struct { // expected-note {{use a tag name here to establish linkage prior to definition}}
+#if __cplusplus <= 199711L
+// expected-note@-2 {{declared here}}
+#endif
+
   void test() {
-    foo(this); // expected-warning {{template argument uses unnamed type}}
+    foo(this);
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{template argument uses unnamed type}}
+#endif
   }
 } A; // expected-error {{unsupported: typedef changes linkage of anonymous type, but linkage was already computed}}
diff --git a/test/SemaCXX/ast-print.cpp b/test/SemaCXX/ast-print.cpp
index 39a52ab8d7e88..408af35c29519 100644
--- a/test/SemaCXX/ast-print.cpp
+++ b/test/SemaCXX/ast-print.cpp
@@ -227,3 +227,14 @@ template <typename T> struct Foo : T {
   using T::operator-;
 };
 }
+
+namespace dont_crash_on_auto_vars {
+struct T { enum E {X = 12ll }; };
+struct S {
+  struct  { int I; } ADecl;
+  static const auto Y = T::X;
+};
+//CHECK: static const auto Y = T::X;
+constexpr auto var = T::X;
+//CHECK: constexpr auto var = T::X;
+}
diff --git a/test/SemaCXX/atomic-ops.cpp b/test/SemaCXX/atomic-ops.cpp
new file mode 100644
index 0000000000000..213161364f585
--- /dev/null
+++ b/test/SemaCXX/atomic-ops.cpp
@@ -0,0 +1,9 @@
+// RUN: %clang_cc1 %s -verify -fsyntax-only -triple=i686-linux-gnu -std=c++11
+
+// We crashed when we couldn't properly convert the first arg of __atomic_* to
+// an lvalue.
+void PR28623() {
+  void helper(int); // expected-note{{target}}
+  void helper(char); // expected-note{{target}}
+  __atomic_store_n(helper, 0, 0); // expected-error{{reference to overloaded function could not be resolved}}
+}
diff --git a/test/SemaCXX/attr-abi-tag-syntax.cpp b/test/SemaCXX/attr-abi-tag-syntax.cpp
new file mode 100644
index 0000000000000..4f14a3c043b5c
--- /dev/null
+++ b/test/SemaCXX/attr-abi-tag-syntax.cpp
@@ -0,0 +1,33 @@
+// RUN: %clang_cc1 -std=c++11 -fsyntax-only -verify %s
+
+namespace N1 {
+
+namespace __attribute__((__abi_tag__)) {}
+// expected-warning@-1 {{'abi_tag' attribute on non-inline namespace ignored}}
+
+namespace N __attribute__((__abi_tag__)) {}
+// expected-warning@-1 {{'abi_tag' attribute on non-inline namespace ignored}}
+
+} // namespace N1
+
+namespace N2 {
+
+inline namespace __attribute__((__abi_tag__)) {}
+// expected-warning@-1 {{'abi_tag' attribute on anonymous namespace ignored}}
+
+inline namespace N __attribute__((__abi_tag__)) {}
+
+} // namespcace N2
+
+__attribute__((abi_tag("B", "A"))) extern int a1;
+
+__attribute__((abi_tag("A", "B"))) extern int a1;
+// expected-note@-1 {{previous declaration is here}}
+
+__attribute__((abi_tag("A", "C"))) extern int a1;
+// expected-error@-1 {{'abi_tag' C missing in original declaration}}
+
+extern int a2;
+// expected-note@-1 {{previous declaration is here}}
+__attribute__((abi_tag("A")))extern int a2;
+// expected-error@-1 {{cannot add 'abi_tag' attribute in a redeclaration}}
diff --git a/test/SemaCXX/attr-deprecated-replacement-error.cpp b/test/SemaCXX/attr-deprecated-replacement-error.cpp
new file mode 100644
index 0000000000000..54d0f9e74f340
--- /dev/null
+++ b/test/SemaCXX/attr-deprecated-replacement-error.cpp
@@ -0,0 +1,17 @@
+// RUN: %clang_cc1 -triple x86_64-apple-darwin9 -verify -fsyntax-only -std=c++11 -fms-extensions %s
+
+#if !__has_feature(attribute_deprecated_with_replacement)
+#error "Missing __has_feature"
+#endif
+
+int a1 [[deprecated("warning", "fixit")]]; // expected-error{{'deprecated' attribute takes no more than 1 argument}}
+int a2 [[deprecated("warning", 1)]]; // expected-error{{'deprecated' attribute takes no more than 1 argument}}
+
+int b1 [[gnu::deprecated("warning", "fixit")]]; // expected-error{{'deprecated' attribute takes no more than 1 argument}}
+int b2 [[gnu::deprecated("warning", 1)]]; // expected-error{{'deprecated' attribute takes no more than 1 argument}}
+
+__declspec(deprecated("warning", "fixit")) int c1; // expected-error{{'deprecated' attribute takes no more than 1 argument}}
+__declspec(deprecated("warning", 1)) int c2; // expected-error{{'deprecated' attribute takes no more than 1 argument}}
+
+int d1 __attribute__((deprecated("warning", "fixit")));
+int d2 __attribute__((deprecated("warning", 1))); // expected-error{{'deprecated' attribute requires a string}}
diff --git a/test/SemaCXX/attr-deprecated-replacement-fixit.cpp b/test/SemaCXX/attr-deprecated-replacement-fixit.cpp
new file mode 100644
index 0000000000000..2e635a72c1963
--- /dev/null
+++ b/test/SemaCXX/attr-deprecated-replacement-fixit.cpp
@@ -0,0 +1,24 @@
+// RUN: %clang_cc1 -triple x86_64-apple-darwin9 -verify -fsyntax-only %s
+// RUN: %clang_cc1 -triple x86_64-apple-darwin9 -fdiagnostics-parseable-fixits %s 2>&1 | FileCheck %s
+// RUN: cp %s %t
+// RUN: %clang_cc1 -triple x86_64-apple-darwin9 -fixit %t
+// RUN: %clang_cc1 -triple x86_64-apple-darwin9 -Werror %t
+
+#if !__has_feature(attribute_deprecated_with_replacement)
+#error "Missing __has_feature"
+#endif
+
+#if !__has_feature(attribute_availability_with_replacement)
+#error "Missing __has_feature"
+#endif
+
+void f_8(int) __attribute__((deprecated("message", "new8"))); // expected-note {{'f_8' has been explicitly marked deprecated here}}
+void new8(int);
+void f_2(int) __attribute__((availability(macosx,deprecated=9.0,replacement="new2"))); // expected-note {{'f_2' has been explicitly marked deprecated here}}
+void new2(int);
+void test() {
+  f_8(0); // expected-warning{{'f_8' is deprecated}}
+  // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:6}:"new8"
+  f_2(0); // expected-warning{{'f_2' is deprecated: first deprecated in macOS 9.0}}
+  // CHECK: fix-it:"{{.*}}":{[[@LINE-1]]:3-[[@LINE-1]]:6}:"new2"
+}
diff --git a/test/SemaCXX/attr-lto-visibility-public.cpp b/test/SemaCXX/attr-lto-visibility-public.cpp
new file mode 100644
index 0000000000000..2f9ed87f6ee0a
--- /dev/null
+++ b/test/SemaCXX/attr-lto-visibility-public.cpp
@@ -0,0 +1,14 @@
+// RUN: %clang_cc1 -std=c++11 -fsyntax-only -verify %s
+
+int i [[clang::lto_visibility_public]]; // expected-warning {{'lto_visibility_public' attribute only applies to struct, union or class}}
+typedef int t [[clang::lto_visibility_public]]; // expected-warning {{'lto_visibility_public' attribute only applies to struct, union or class}}
+[[clang::lto_visibility_public]] void f(); // expected-warning {{'lto_visibility_public' attribute only applies to struct, union or class}}
+void f() [[clang::lto_visibility_public]]; // expected-error {{'lto_visibility_public' attribute cannot be applied to types}}
+
+struct [[clang::lto_visibility_public]] s1 {
+  int i [[clang::lto_visibility_public]]; // expected-warning {{'lto_visibility_public' attribute only applies to struct, union or class}}
+  [[clang::lto_visibility_public]] void f(); // expected-warning {{'lto_visibility_public' attribute only applies to struct, union or class}}
+};
+
+struct [[clang::lto_visibility_public(1)]] s2 { // expected-error {{'lto_visibility_public' attribute takes no arguments}}
+};
diff --git a/test/SemaCXX/attr-mode-tmpl.cpp b/test/SemaCXX/attr-mode-tmpl.cpp
new file mode 100644
index 0000000000000..4e1489a8a5bde
--- /dev/null
+++ b/test/SemaCXX/attr-mode-tmpl.cpp
@@ -0,0 +1,104 @@
+// RUN: %clang_cc1 -fsyntax-only -verify %s
+
+typedef enum { XX } EnumType;
+struct S { int x; };
+
+// Check enumerations. Vector modes on enum types must cause an error.
+template <class T>
+void CheckEnumerations() {
+  // Check that non-vector 'mode' attribute is OK with enumeration types.
+  typedef T __attribute__((mode(QI))) T1;
+  typedef T T2 __attribute__((mode(HI)));
+  typedef T __attribute__((mode(V8SI))) T3; // expected-error{{mode 'V8SI' is not supported for enumeration types}}
+  // expected-warning@-1{{specifying vector types with the 'mode' attribute is deprecated}}
+
+  typedef enum __attribute__((mode(HI))) { A4, B4 } T4;
+  typedef enum { A5, B5 } __attribute__((mode(SI))) T5;
+  typedef enum __attribute__((mode(V2SI))) { A6, B6 } T6; // expected-error{{mode 'V2SI' is not supported for enumeration types}}
+                                                          // expected-warning@-1{{deprecated}}
+  typedef enum { A7, B7 } __attribute__((mode(V2QI))) T7; // expected-error{{mode 'V2QI' is not supported for enumeration types}}
+                                                          // expected-warning@-1{{deprecated}}
+}
+
+// Check that attribute applies only for integer and floating-point types.
+// OK when instantiated with 'int', error with structure types, for example.
+template <class T>
+void CheckPrimitiveTypes() {
+  typedef T __attribute__((mode(QI))) T1;    // expected-error{{mode attribute only supported for integer and floating-point types}}
+  typedef T __attribute__((mode(V2SI))) VT1; // expected-error{{mode attribute only supported for integer and floating-point types}}
+  // expected-warning@-1{{specifying vector types with the 'mode' attribute is deprecated}}
+}
+
+// Check that attribute supports certain modes. Check that wrong machine modes
+// are NOT diagnosed twice during instantiation.
+template <class T>
+void CheckMachineMode() {
+  typedef T __attribute__((mode(QI))) T1; // expected-error{{type of machine mode does not match type of base type}}
+  typedef T __attribute__((mode(HI))) T2; // expected-error{{type of machine mode does not match type of base type}}
+  typedef T __attribute__((mode(SI))) T3; // expected-error{{type of machine mode does not match type of base type}}
+  typedef T __attribute__((mode(DI))) T4; // expected-error{{type of machine mode does not match type of base type}}
+  typedef T __attribute__((mode(SF))) T5; // expected-error2{{type of machine mode does not match type of base type}}
+  typedef T __attribute__((mode(DF))) T6; // expected-error2{{type of machine mode does not match type of base type}}
+  typedef T __attribute__((mode(II))) T7; // expected-error{{unknown machine mode}}
+  typedef T __attribute__((mode(12))) T8; // expected-error{{'mode' attribute requires an identifier}}
+}
+
+// Check attributes on function parameters.
+template <class T1, class T2>
+void CheckParameters(T1 __attribute__((mode(SI)))   paramSI,     // expected-note2{{ignored: substitution failure}}
+                     T1 __attribute__((mode(V4DI))) paramV4DI,   // expected-warning{{deprecated}}
+                     T2 __attribute__((mode(SF)))   paramSF,
+                     T2 __attribute__((mode(V4DF))) paramV4DF) { // expected-warning{{deprecated}}
+}
+
+
+// Check dependent structure.
+template <class T>
+struct TemplatedStruct {
+  // Check fields.
+  T __attribute__((mode(HI)))     x1;
+  T __attribute__((mode(V4HI)))   x2;         // expected-error{{mode 'V4HI' is not supported for enumeration types}}
+                                              // expected-warning@-1{{deprecated}}
+
+  // Check typedefs.
+  typedef T __attribute__((mode(DI)))   T1;
+  typedef T __attribute__((mode(V8DI))) T2;   // expected-error{{mode 'V8DI' is not supported for enumeration types}}
+                                              // expected-warning@-1{{deprecated}}
+
+  // Check parameters.
+  void f1(T __attribute__((mode(QI))) x) {}
+  void f2(T __attribute__((mode(SF))) x) {}   // expected-error2{{type of machine mode does not match type of base type}}
+  void f3(T __attribute__((mode(V4QI))) x) {} // expected-error{{mode 'V4QI' is not supported for enumeration types}}
+                                              // expected-warning@-1{{deprecated}}
+
+  // Check attribute on methods - it is invalid.
+  __attribute__((mode(QI))) T g1() { return 0; } // expected-error{{'mode' attribute only applies to variables, enums, fields and typedefs}}
+};
+
+
+
+int main() {
+  CheckEnumerations<int>();
+  CheckEnumerations<EnumType>(); // expected-note{{in instantiation of}}
+
+  CheckPrimitiveTypes<int>();
+  CheckPrimitiveTypes<S>();      // expected-note{{in instantiation of}}
+
+  // 'II' mode is unknown, no matter what we instantiate with.
+  CheckMachineMode<int>();       // expected-note{{in instantiation of}}
+  CheckMachineMode<EnumType>();  // expected-note{{in instantiation of}}
+  CheckMachineMode<float>();     // expected-note{{in instantiation of}}
+
+  int   __attribute__((mode(V4DI))) valV4DI; // expected-warning{{deprecated}}
+  float __attribute__((mode(V4DF))) valV4DF; // expected-warning{{deprecated}}
+  // OK.
+  CheckParameters<int, float>(0, valV4DI, 1.0, valV4DF);
+  // Enumeral type with vector mode is invalid.
+  CheckParameters<EnumType, float>(0, valV4DI, 1.0, valV4DF); // expected-error{{no matching function for call}}
+  // 'V4DF' mode with 'int' type is invalid.
+  CheckParameters<int, int>(0, valV4DI, 1, valV4DF); // expected-error{{no matching function for call}}
+
+  TemplatedStruct<int>      s1; // expected-note{{in instantiation of}}
+  TemplatedStruct<EnumType> s2; // expected-note{{in instantiation of}}
+  return 0;
+}
diff --git a/test/SemaCXX/attr-selectany.cpp b/test/SemaCXX/attr-selectany.cpp
index 058f2fcb84ab7..9dc14b3c38184 100644
--- a/test/SemaCXX/attr-selectany.cpp
+++ b/test/SemaCXX/attr-selectany.cpp
@@ -39,7 +39,9 @@ __declspec(selectany) auto x8 = Internal(); // expected-error {{'selectany' can
 // The D3D11 headers do something like this.  MSVC doesn't error on this at
 // all, even without the __declspec(selectany), in violation of the standard.
 // We fall back to a warning for selectany to accept headers.
-struct SomeStruct {};
+struct SomeStruct {
+  int foo;
+};
 extern const __declspec(selectany) SomeStruct some_struct; // expected-warning {{default initialization of an object of const type 'const SomeStruct' without a user-provided default constructor is a Microsoft extension}}
 
 // It should be possible to redeclare variables that were defined
diff --git a/test/SemaCXX/attr-swiftcall.cpp b/test/SemaCXX/attr-swiftcall.cpp
new file mode 100644
index 0000000000000..fd17aae185234
--- /dev/null
+++ b/test/SemaCXX/attr-swiftcall.cpp
@@ -0,0 +1,37 @@
+// RUN: %clang_cc1 -triple x86_64-apple-darwin10 -fsyntax-only -verify %s
+
+#define SWIFTCALL __attribute__((swiftcall))
+#define INDIRECT_RESULT __attribute__((swift_indirect_result))
+#define ERROR_RESULT __attribute__((swift_error_result))
+#define CONTEXT __attribute__((swift_context))
+
+int notAFunction SWIFTCALL; // expected-warning {{'swiftcall' only applies to function types; type here is 'int'}}
+void variadic(int x, ...) SWIFTCALL; // expected-error {{variadic function cannot use swiftcall calling convention}}
+void multiple_ccs(int x) SWIFTCALL __attribute__((vectorcall)); // expected-error {{vectorcall and swiftcall attributes are not compatible}}
+void (*functionPointer)(void) SWIFTCALL;
+
+void indirect_result_nonswift(INDIRECT_RESULT void *out); // expected-error {{'swift_indirect_result' parameter can only be used with swiftcall calling convention}}
+void indirect_result_bad_position(int first, INDIRECT_RESULT void *out) SWIFTCALL; // expected-error {{'swift_indirect_result' parameters must be first parameters of function}}
+void indirect_result_bad_type(INDIRECT_RESULT int out) SWIFTCALL; // expected-error {{'swift_indirect_result' parameter must have pointer type; type here is 'int'}}
+void indirect_result_single(INDIRECT_RESULT void *out) SWIFTCALL;
+void indirect_result_multiple(INDIRECT_RESULT void *out1, INDIRECT_RESULT void *out2) SWIFTCALL;
+
+void error_result_nonswift(ERROR_RESULT void **error); // expected-error {{'swift_error_result' parameter can only be used with swiftcall calling convention}} expected-error{{'swift_error_result' parameter must follow 'swift_context' parameter}}
+void error_result_bad_position(ERROR_RESULT void **error, int last) SWIFTCALL; // expected-error {{'swift_error_result' parameter must be last parameter of function}}
+void error_result_bad_position2(int first, ERROR_RESULT void **error) SWIFTCALL; // expected-error {{'swift_error_result' parameter must follow 'swift_context' parameter}}
+void error_result_bad_type(CONTEXT void *context, ERROR_RESULT int error) SWIFTCALL; // expected-error {{'swift_error_result' parameter must have pointer to unqualified pointer type; type here is 'int'}}
+void error_result_bad_type2(CONTEXT void *context, ERROR_RESULT int *error) SWIFTCALL; // expected-error {{'swift_error_result' parameter must have pointer to unqualified pointer type; type here is 'int *'}}
+void error_result_okay(int a, int b, CONTEXT void *context, ERROR_RESULT void **error) SWIFTCALL;
+
+void context_nonswift(CONTEXT void *context); // expected-error {{'swift_context' parameter can only be used with swiftcall calling convention}}
+void context_bad_position(CONTEXT void *context, int x) SWIFTCALL; // expected-error {{'swift_context' parameter can only be followed by 'swift_error_result' parameter}}
+void context_bad_type(CONTEXT int context) SWIFTCALL; // expected-error {{'swift_context' parameter must have pointer type; type here is 'int'}}
+void context_okay(CONTEXT void *context) SWIFTCALL;
+
+template <class T> void indirect_result_temp_okay1(INDIRECT_RESULT T *out) SWIFTCALL;
+template <class T> void indirect_result_temp_okay2(INDIRECT_RESULT T out) SWIFTCALL; // expected-note {{candidate template ignored: substitution failure [with T = int]: 'swift_indirect_result' parameter must have pointer type; type here is 'int'}}
+void test_indirect_result_temp(void *out) {
+  indirect_result_temp_okay1(out);
+  indirect_result_temp_okay2(out);
+  indirect_result_temp_okay2(1); // expected-error {{no matching function for call to 'indirect_result_temp_okay2'}}
+}
diff --git a/test/SemaCXX/attr-unavailable.cpp b/test/SemaCXX/attr-unavailable.cpp
index 51dc8fe3789ef..bafae2ab43a25 100644
--- a/test/SemaCXX/attr-unavailable.cpp
+++ b/test/SemaCXX/attr-unavailable.cpp
@@ -5,7 +5,8 @@ double &foo(double); // expected-note {{candidate}}
 void foo(...) __attribute__((__unavailable__)); // expected-note {{candidate function}} \
 // expected-note{{'foo' has been explicitly marked unavailable here}}
 
-void bar(...) __attribute__((__unavailable__)); // expected-note 2{{explicitly marked unavailable}}
+void bar(...) __attribute__((__unavailable__)); // expected-note 2{{explicitly marked unavailable}} \
+       // expected-note 2{{candidate function has been explicitly made unavailable}}
 
 void test_foo(short* sp) {
   int &ir = foo(1);
@@ -56,3 +57,85 @@ typedef enum UnavailableEnum AnotherUnavailableEnum; // expected-error {{'Unavai
 
 __attribute__((unavailable))
 UnavailableEnum testUnavailable(UnavailableEnum X) { return X; }
+
+
+// Check that unavailable classes can be used as arguments to unavailable
+// function, particularly in template functions.
+#if !__has_feature(attribute_availability_in_templates)
+#error "Missing __has_feature"
+#endif
+class __attribute((unavailable)) UnavailableClass; // \
+        expected-note 3{{'UnavailableClass' has been explicitly marked unavailable here}}
+void unavail_class(UnavailableClass&); // expected-error {{'UnavailableClass' is unavailable}}
+void unavail_class_marked(UnavailableClass&) __attribute__((unavailable));
+template <class T> void unavail_class(UnavailableClass&); // expected-error {{'UnavailableClass' is unavailable}}
+template <class T> void unavail_class_marked(UnavailableClass&) __attribute__((unavailable));
+template <class T> void templated(T&);
+void untemplated(UnavailableClass &UC) {  // expected-error {{'UnavailableClass' is unavailable}}
+  templated(UC);
+}
+void untemplated_marked(UnavailableClass &UC) __attribute__((unavailable)) {
+  templated(UC);
+}
+
+template <class T> void templated_calls_bar() { bar(); } // \
+           // expected-error{{call to unavailable function 'bar'}}
+template <class T> void templated_calls_bar_arg(T v) { bar(v); } // \
+           // expected-error{{call to unavailable function 'bar'}}
+template <class T> void templated_calls_bar_arg_never_called(T v) { bar(v); }
+
+template <class T>
+void unavail_templated_calls_bar() __attribute__((unavailable)) { // \
+  expected-note{{candidate function [with T = int] has been explicitly made unavailable}}
+  bar(5);
+}
+template <class T>
+void unavail_templated_calls_bar_arg(T v) __attribute__((unavailable)) { // \
+  expected-note{{candidate function [with T = int] has been explicitly made unavailable}}
+  bar(v);
+}
+
+void calls_templates_which_call_bar() {
+  templated_calls_bar<int>();
+
+  templated_calls_bar_arg(5); // \
+  expected-note{{in instantiation of function template specialization 'templated_calls_bar_arg<int>' requested here}}
+
+  unavail_templated_calls_bar<int>(); // \
+  expected-error{{call to unavailable function 'unavail_templated_calls_bar'}}
+
+  unavail_templated_calls_bar_arg(5); // \
+  expected-error{{call to unavailable function 'unavail_templated_calls_bar_arg'}}
+}
+
+template <class T> void unavail_templated(T) __attribute__((unavailable)); // \
+           expected-note{{candidate function [with T = int] has been explicitly made unavailable}}
+void calls_unavail_templated() {
+  unavail_templated(5); // expected-error{{call to unavailable function 'unavail_templated'}}
+}
+void unavail_calls_unavail_templated() __attribute__((unavailable)) {
+  unavail_templated(5);
+}
+
+void unavailable() __attribute((unavailable)); // \
+       expected-note 4{{candidate function has been explicitly made unavailable}}
+struct AvailableStruct {
+  void calls_unavailable() { unavailable(); } // \
+  expected-error{{call to unavailable function 'unavailable'}}
+  template <class U> void calls_unavailable() { unavailable(); } // \
+  expected-error{{call to unavailable function 'unavailable'}}
+};
+template <class T> struct AvailableStructTemplated {
+  void calls_unavailable() { unavailable(); } // \
+  expected-error{{call to unavailable function 'unavailable'}}
+  template <class U> void calls_unavailable() { unavailable(); } // \
+  expected-error{{call to unavailable function 'unavailable'}}
+};
+struct __attribute__((unavailable)) UnavailableStruct {
+  void calls_unavailable() { unavailable(); }
+  template <class U> void calls_unavailable() { unavailable(); }
+};
+template <class T> struct __attribute__((unavailable)) UnavailableStructTemplated {
+  void calls_unavailable() { unavailable(); }
+  template <class U> void calls_unavailable() { unavailable(); }
+};
diff --git a/test/SemaCXX/attr-x86-interrupt.cpp b/test/SemaCXX/attr-x86-interrupt.cpp
new file mode 100644
index 0000000000000..95abc7cd74db9
--- /dev/null
+++ b/test/SemaCXX/attr-x86-interrupt.cpp
@@ -0,0 +1,72 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu  -fsyntax-only -verify %s
+// RUN: %clang_cc1 -triple i386-unknown-linux-gnu  -fsyntax-only -verify %s
+// RUN: %clang_cc1 -triple x86_64-pc-win32  -fsyntax-only -verify %s
+// RUN: %clang_cc1 -triple i386-pc-win32  -fsyntax-only -verify %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnux32  -fsyntax-only -verify %s
+
+struct a {
+  int b;
+  static void foo(int *a) __attribute__((interrupt)) {}
+  void *operator new(__SIZE_TYPE__) throw() __attribute__((interrupt)) { return 0; } // expected-warning {{'interrupt' attribute only applies to non-K&R-style functions}}
+};
+
+struct a test __attribute__((interrupt)); // expected-warning {{'interrupt' attribute only applies to non-K&R-style functions}}
+
+__attribute__((interrupt)) int foo1(void) { return 0; }             // expected-error-re {{{{(x86|x86-64)}} 'interrupt' attribute only applies to functions that have a 'void' return type}}
+__attribute__((interrupt)) void foo2(void) {}                       // expected-error-re {{{{(x86|x86-64)}} 'interrupt' attribute only applies to functions that have only a pointer parameter optionally followed by an integer parameter}}
+__attribute__((interrupt)) void foo3(void *a, unsigned b, int c) {} // expected-error-re {{{{(x86|x86-64)}} 'interrupt' attribute only applies to functions that have only a pointer parameter optionally followed by an integer parameter}}
+__attribute__((interrupt)) void foo4(int a) {}                      // expected-error-re {{{{(x86|x86-64)}} 'interrupt' attribute only applies to functions that have a pointer as the first parameter}}
+#ifdef _LP64
+// expected-error-re@+6 {{{{(x86|x86-64)}} 'interrupt' attribute only applies to functions that have a 'unsigned long' type as the second parameter}}
+#elif defined(__x86_64__)
+// expected-error-re@+4 {{{{(x86|x86-64)}} 'interrupt' attribute only applies to functions that have a 'unsigned long long' type as the second parameter}}
+#else
+// expected-error-re@+2 {{{{(x86|x86-64)}} 'interrupt' attribute only applies to functions that have a 'unsigned int' type as the second parameter}}
+#endif
+__attribute__((interrupt)) void foo5(void *a, float b) {}
+#ifdef _LP64
+// expected-error-re@+6 {{{{(x86|x86-64)}} 'interrupt' attribute only applies to functions that have a 'unsigned long' type as the second parameter}}
+#elif defined(__x86_64__)
+// expected-error-re@+4 {{{{(x86|x86-64)}} 'interrupt' attribute only applies to functions that have a 'unsigned long long' type as the second parameter}}
+#else
+// expected-error-re@+2 {{{{(x86|x86-64)}} 'interrupt' attribute only applies to functions that have a 'unsigned int' type as the second parameter}}
+#endif
+__attribute__((interrupt)) void foo6(float *a, int b) {}
+
+#ifdef _LP64
+// expected-error-re@+4 {{{{(x86|x86-64)}} 'interrupt' attribute only applies to functions that have a 'unsigned long' type as the second parameter}}
+#elif defined(__x86_64__)
+// expected-error-re@+2 {{{{(x86|x86-64)}} 'interrupt' attribute only applies to functions that have a 'unsigned long long' type as the second parameter}}
+#endif
+__attribute__((interrupt)) void foo7(int *a, unsigned b) {}
+__attribute__((interrupt)) void foo8(int *a) {}
+template<typename T>
+__attribute__((interrupt)) void foo9(T *a) {}
+
+template <typename T>
+void bar(T *a) {
+  foo9(a); // expected-error {{interrupt service routine cannot be called directly}}
+}
+
+template <typename Fn>
+void bar1(Fn F) {
+  F(0);
+}
+__attribute__((interrupt)) void foo(int *) {}
+
+void g(void (*fp)(int *));
+int main(int argc, char **argv) {
+  void *ptr = (void *)&foo7;
+  g(foo8);
+  (void)ptr;
+  a::foo(ptr); // expected-error {{interrupt service routine cannot be called directly}}
+  bar1(foo);
+#ifndef __x86_64__
+  // expected-error@+2 {{interrupt service routine cannot be called directly}}
+#endif
+  foo7((int *)argv, argc);
+  foo8((int *)argv);       // expected-error {{interrupt service routine cannot be called directly}}
+  bar(argv); // expected-note {{in instantiation of function template specialization 'bar<char *>' requested here}}
+  return 0;
+}
+
diff --git a/test/SemaCXX/builtin-classify-type.cpp b/test/SemaCXX/builtin-classify-type.cpp
new file mode 100644
index 0000000000000..f700c1d5baa9f
--- /dev/null
+++ b/test/SemaCXX/builtin-classify-type.cpp
@@ -0,0 +1,54 @@
+// RUN: %clang_cc1 -fsyntax-only -verify %s
+
+// expected-no-diagnostics
+
+enum gcc_type_class {
+  no_type_class = -1,
+  void_type_class, integer_type_class, char_type_class,
+  enumeral_type_class, boolean_type_class,
+  pointer_type_class, reference_type_class, offset_type_class,
+  real_type_class, complex_type_class,
+  function_type_class, method_type_class,
+  record_type_class, union_type_class,
+  array_type_class, string_type_class,
+  lang_type_class
+};
+
+class cl {
+public:
+    void bar() {}
+    int baz;
+};
+
+int builtin_result;
+
+void foo() {
+  int i;
+  char c;
+  enum { red, green, blue} enum_obj;
+  bool b;
+  int *p;
+  int &r = i;
+  double d;
+  extern void f();
+  cl cl_obj;
+  union { int a; float b; } u_obj;
+  int arr[10];
+
+  int a1[__builtin_classify_type(f()) == void_type_class ? 1 : -1];
+  int a2[__builtin_classify_type(i) == integer_type_class ? 1 : -1];
+  int a3[__builtin_classify_type(c) == integer_type_class ? 1 : -1];
+  int a4[__builtin_classify_type(enum_obj) == enumeral_type_class ? 1 : -1];
+  int a5[__builtin_classify_type(b) == boolean_type_class ? 1 : -1];
+  int a6[__builtin_classify_type(p) == pointer_type_class ? 1 : -1];
+  int a7[__builtin_classify_type(r) == integer_type_class ? 1 : -1];
+  int a8[__builtin_classify_type(&cl::baz) == offset_type_class ? 1 : -1];
+  int a9[__builtin_classify_type(d) == real_type_class ? 1 : -1];
+  int a10[__builtin_classify_type(f) == function_type_class ? 1 : -1];
+  int a11[__builtin_classify_type(&cl::bar) == method_type_class ? 1 : -1];
+  int a12[__builtin_classify_type(cl_obj) == record_type_class ? 1 : -1];
+  int a13[__builtin_classify_type(u_obj) == union_type_class ? 1 : -1];
+  int a14[__builtin_classify_type(arr) == array_type_class ? 1 : -1];
+  int a15[__builtin_classify_type("abc") == array_type_class ? 1 : -1];
+}
+
diff --git a/test/SemaCXX/builtin-object-size-cxx14.cpp b/test/SemaCXX/builtin-object-size-cxx14.cpp
new file mode 100644
index 0000000000000..32d752d273659
--- /dev/null
+++ b/test/SemaCXX/builtin-object-size-cxx14.cpp
@@ -0,0 +1,99 @@
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++14 %s
+
+namespace basic {
+// Ensuring that __bos can be used in constexpr functions without anything
+// sketchy going on...
+constexpr int bos0() {
+  int k = 5;
+  char cs[10] = {};
+  return __builtin_object_size(&cs[k], 0);
+}
+
+constexpr int bos1() {
+  int k = 5;
+  char cs[10] = {};
+  return __builtin_object_size(&cs[k], 1);
+}
+
+constexpr int bos2() {
+  int k = 5;
+  char cs[10] = {};
+  return __builtin_object_size(&cs[k], 2);
+}
+
+constexpr int bos3() {
+  int k = 5;
+  char cs[10] = {};
+  return __builtin_object_size(&cs[k], 3);
+}
+
+static_assert(bos0() == sizeof(char) * 5, "");
+static_assert(bos1() == sizeof(char) * 5, "");
+static_assert(bos2() == sizeof(char) * 5, "");
+static_assert(bos3() == sizeof(char) * 5, "");
+}
+
+namespace in_enable_if {
+// The code that prompted these changes was __bos in enable_if
+
+void copy5CharsInto(char *buf) // expected-note{{candidate}}
+    __attribute__((enable_if(__builtin_object_size(buf, 0) != -1 &&
+                                 __builtin_object_size(buf, 0) > 5,
+                             "")));
+
+// We use different EvalModes for __bos with type 0 versus 1. Ensure 1 works,
+// too...
+void copy5CharsIntoStrict(char *buf) // expected-note{{candidate}}
+    __attribute__((enable_if(__builtin_object_size(buf, 1) != -1 &&
+                                 __builtin_object_size(buf, 1) > 5,
+                             "")));
+
+struct LargeStruct {
+  int pad;
+  char buf[6];
+  int pad2;
+};
+
+struct SmallStruct {
+  int pad;
+  char buf[5];
+  int pad2;
+};
+
+void noWriteToBuf() {
+  char buf[6];
+  copy5CharsInto(buf);
+
+  LargeStruct large;
+  copy5CharsIntoStrict(large.buf);
+}
+
+void initTheBuf() {
+  char buf[6] = {};
+  copy5CharsInto(buf);
+
+  LargeStruct large = {0, {}, 0};
+  copy5CharsIntoStrict(large.buf);
+}
+
+int getI();
+void initTheBufWithALoop() {
+  char buf[6] = {};
+  for (unsigned I = getI(); I != sizeof(buf); ++I)
+    buf[I] = I;
+  copy5CharsInto(buf);
+
+  LargeStruct large;
+  for (unsigned I = getI(); I != sizeof(buf); ++I)
+    large.buf[I] = I;
+  copy5CharsIntoStrict(large.buf);
+}
+
+void tooSmallBuf() {
+  char buf[5];
+  copy5CharsInto(buf); // expected-error{{no matching function for call}}
+
+  SmallStruct small;
+  copy5CharsIntoStrict(small.buf); // expected-error{{no matching function for call}}
+}
+}
diff --git a/test/SemaCXX/c99-variable-length-array-cxx11.cpp b/test/SemaCXX/c99-variable-length-array-cxx11.cpp
index 03cf28388d98f..68858410ec769 100644
--- a/test/SemaCXX/c99-variable-length-array-cxx11.cpp
+++ b/test/SemaCXX/c99-variable-length-array-cxx11.cpp
@@ -22,5 +22,9 @@ void vla(int N) {
   POD array2[N]; // expected-warning{{variable length arrays are a C99 feature}}
   StillPOD array3[N]; // expected-warning{{variable length arrays are a C99 feature}}
   StillPOD2 array4[N][3]; // expected-warning{{variable length arrays are a C99 feature}}
-  NonPOD array5[N]; // expected-error{{variable length array of non-POD element type 'NonPOD'}}
+  NonPOD array5[N]; // expected-error{{no matching constructor for initialization of 'NonPOD [N]'}}
+  // expected-warning@-1{{variable length arrays are a C99 feature}}
+  // expected-note@-16{{candidate constructor not viable}}
+  // expected-note@-18{{candidate constructor (the implicit copy constructor) not viable}}
+  // expected-note@-19{{candidate constructor (the implicit move constructor) not viable}}
 }
diff --git a/test/SemaCXX/c99-variable-length-array.cpp b/test/SemaCXX/c99-variable-length-array.cpp
index 237f56458dab3..5fd7e3749f7d1 100644
--- a/test/SemaCXX/c99-variable-length-array.cpp
+++ b/test/SemaCXX/c99-variable-length-array.cpp
@@ -16,8 +16,8 @@ struct POD {
 void vla(int N) {
   int array1[N]; // expected-warning{{variable length arrays are a C99 feature}}
   POD array2[N]; // expected-warning{{variable length arrays are a C99 feature}}
-  NonPOD array3[N]; // expected-error{{variable length array of non-POD element type 'NonPOD'}}
-  NonPOD2 array4[N][3]; // expected-error{{variable length array of non-POD element type 'NonPOD2'}}
+  NonPOD array3[N]; // expected-warning{{variable length arrays are a C99 feature}}
+  NonPOD2 array4[N][3]; // expected-warning{{variable length arrays are a C99 feature}}
 }
 
 /// Warn about VLAs in templates.
diff --git a/test/SemaCXX/class.cpp b/test/SemaCXX/class.cpp
index a6694403a68ad..a3593689b5fbf 100644
--- a/test/SemaCXX/class.cpp
+++ b/test/SemaCXX/class.cpp
@@ -1,7 +1,12 @@
 // RUN: %clang_cc1 -fsyntax-only -verify -Wc++11-compat %s 
 class C {
 public:
-  auto int errx; // expected-error {{storage class specified for a member declaration}} expected-warning {{'auto' storage class specifier is redundant}}
+  auto int errx; // expected-error {{storage class specified for a member declaration}}
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{'auto' storage class specifier is redundant}}
+#else
+  // expected-warning@-4 {{'auto' storage class specifier is not permitted in C++11, and will not be supported in future releases}}
+#endif
   register int erry; // expected-error {{storage class specified for a member declaration}}
   extern int errz; // expected-error {{storage class specified for a member declaration}}
 
@@ -36,12 +41,18 @@ public:
 
   enum E1 { en1, en2 };
 
-  int i = 0; // expected-warning {{in-class initialization of non-static data member is a C++11 extension}}
+  int i = 0;
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{in-class initialization of non-static data member is a C++11 extension}}
+#endif
   static int si = 0; // expected-error {{non-const static data member must be initialized out of line}}
   static const NestedC ci = 0; // expected-error {{static data member of type 'const C::NestedC' must be initialized out of line}}
   static const int nci = vs; // expected-error {{in-class initializer for static data member is not a constant expression}}
   static const int vi = 0;
   static const volatile int cvi = 0; // ok, illegal in C++11
+#if __cplusplus >= 201103L
+  // expected-error@-2 {{static const volatile data member must be initialized out of line}}
+#endif
   static const E evi = 0;
 
   void m() {
@@ -169,10 +180,18 @@ namespace rdar8066414 {
 
 namespace rdar8367341 {
   float foo();
+#if __cplusplus >= 201103L
+  // expected-note@-2 {{declared here}}
+#endif
 
   struct A {
+#if __cplusplus <= 199711L
     static const float x = 5.0f; // expected-warning {{in-class initializer for static data member of type 'const float' is a GNU extension}}
     static const float y = foo(); // expected-warning {{in-class initializer for static data member of type 'const float' is a GNU extension}} expected-error {{in-class initializer for static data member is not a constant expression}}
+#else
+    static constexpr float x = 5.0f;
+    static constexpr float y = foo(); // expected-error {{constexpr variable 'y' must be initialized by a constant expression}} expected-note {{non-constexpr function 'foo' cannot be used in a constant expression}}
+#endif
   };
 }
 
diff --git a/test/SemaCXX/condition.cpp b/test/SemaCXX/condition.cpp
index b757fcb8cd5c9..5596564d2481a 100644
--- a/test/SemaCXX/condition.cpp
+++ b/test/SemaCXX/condition.cpp
@@ -65,3 +65,7 @@ void test5() {
 void test5_inst() {
    test5<int>();
 }
+
+void PR28373() {
+  if (!x) {} // expected-error {{undeclared}}
+}
diff --git a/test/SemaCXX/conditional-expr.cpp b/test/SemaCXX/conditional-expr.cpp
index 538de5847de81..c12efc0be7ef8 100644
--- a/test/SemaCXX/conditional-expr.cpp
+++ b/test/SemaCXX/conditional-expr.cpp
@@ -384,3 +384,12 @@ namespace PR17052 {
     int &test() { return b_ ? i_ : throw 1; }
   };
 }
+
+namespace PR26448 {
+struct Base {};
+struct Derived : Base {};
+Base b;
+Derived d;
+typedef decltype(true ? static_cast<Base&&>(b) : static_cast<Derived&&>(d)) x;
+typedef Base &&x;
+}
diff --git a/test/SemaCXX/constant-expression-cxx11.cpp b/test/SemaCXX/constant-expression-cxx11.cpp
index 7b9d0150e1e82..e2b3f091f70f0 100644
--- a/test/SemaCXX/constant-expression-cxx11.cpp
+++ b/test/SemaCXX/constant-expression-cxx11.cpp
@@ -1181,6 +1181,20 @@ namespace ExternConstexpr {
     constexpr int j = 0;
     constexpr int k; // expected-error {{default initialization of an object of const type}}
   }
+
+  extern const int q;
+  constexpr int g() { return q; }
+  constexpr int q = g();
+  static_assert(q == 0, "zero-initialization should precede static initialization");
+
+  extern int r; // expected-note {{here}}
+  constexpr int h() { return r; } // expected-error {{never produces a constant}} expected-note {{read of non-const}}
+
+  struct S { int n; };
+  extern const S s;
+  constexpr int x() { return s.n; }
+  constexpr S s = {x()};
+  static_assert(s.n == 0, "zero-initialization should precede static initialization");
 }
 
 namespace ComplexConstexpr {
@@ -2005,3 +2019,50 @@ namespace PR24597 {
   constexpr int a = *f().p;
   constexpr int b = *g().p;
 }
+
+namespace IncompleteClass {
+  struct XX {
+    static constexpr int f(XX*) { return 1; } // expected-note {{here}}
+    friend constexpr int g(XX*) { return 2; } // expected-note {{here}}
+
+    static constexpr int i = f(static_cast<XX*>(nullptr)); // expected-error {{constexpr variable 'i' must be initialized by a constant expression}}  expected-note {{undefined function 'f' cannot be used in a constant expression}}
+    static constexpr int j = g(static_cast<XX*>(nullptr)); // expected-error {{constexpr variable 'j' must be initialized by a constant expression}}  expected-note {{undefined function 'g' cannot be used in a constant expression}}
+  };
+}
+
+namespace InheritedCtor {
+  struct A { constexpr A(int) {} };
+
+  struct B : A { int n; using A::A; }; // expected-note {{here}}
+  constexpr B b(0); // expected-error {{constant expression}} expected-note {{derived class}}
+
+  struct C : A { using A::A; struct { union { int n, m = 0; }; union { int a = 0; }; int k = 0; }; struct {}; union {}; }; // expected-warning 4{{extension}}
+  constexpr C c(0);
+
+  struct D : A {
+    using A::A; // expected-note {{here}}
+    struct { // expected-warning {{extension}}
+      union { // expected-warning {{extension}}
+        int n;
+      };
+    };
+  };
+  constexpr D d(0); // expected-error {{constant expression}} expected-note {{derived class}}
+
+  struct E : virtual A { using A::A; }; // expected-note {{here}}
+  // We wrap a function around this to avoid implicit zero-initialization
+  // happening first; the zero-initialization step would produce the same
+  // error and defeat the point of this test.
+  void f() {
+    constexpr E e(0); // expected-error {{constant expression}} expected-note {{derived class}}
+  }
+  // FIXME: This produces a note with no source location.
+  //constexpr E e(0);
+
+  struct W { constexpr W(int n) : w(n) {} int w; };
+  struct X : W { using W::W; int x = 2; };
+  struct Y : X { using X::X; int y = 3; };
+  struct Z : Y { using Y::Y; int z = 4; };
+  constexpr Z z(1);
+  static_assert(z.w == 1 && z.x == 2 && z.y == 3 && z.z == 4, "");
+}
diff --git a/test/SemaCXX/constant-expression-cxx1y.cpp b/test/SemaCXX/constant-expression-cxx1y.cpp
index e9ecbe83b3daa..f8103224af89b 100644
--- a/test/SemaCXX/constant-expression-cxx1y.cpp
+++ b/test/SemaCXX/constant-expression-cxx1y.cpp
@@ -179,12 +179,10 @@ namespace string_assign {
   static_assert(!test1(100), "");
   static_assert(!test1(101), ""); // expected-error {{constant expression}} expected-note {{in call to 'test1(101)'}}
 
-  // FIXME: We should be able to reject this before it's called
-  constexpr void f() {
+  constexpr void f() { // expected-error{{constexpr function never produces a constant expression}} expected-note@+2{{assignment to dereferenced one-past-the-end pointer is not allowed in a constant expression}}
     char foo[10] = { "z" }; // expected-note {{here}}
-    foo[10] = 'x'; // expected-warning {{past the end}} expected-note {{assignment to dereferenced one-past-the-end pointer}}
+    foo[10] = 'x'; // expected-warning {{past the end}}
   }
-  constexpr int k = (f(), 0); // expected-error {{constant expression}} expected-note {{in call}}
 }
 
 namespace array_resize {
@@ -938,3 +936,24 @@ namespace EmptyClass {
   constexpr int testb = f(e2, 3); // expected-error {{constant expression}} expected-note {{in call}}
   constexpr int testc = f(e3, 3);
 }
+
+namespace SpeculativeEvalWrites {
+  // Ensure that we don't try to speculatively evaluate writes.
+  constexpr int f() {
+    int i = 0;
+    int a = 0;
+    // __builtin_object_size speculatively evaluates its first argument.
+    __builtin_object_size((i = 1, &a), 0);
+    return i;
+  }
+
+  static_assert(!f(), "");
+}
+
+namespace PR27989 {
+  constexpr int f(int n) {
+    int a = (n = 1, 0);
+    return n;
+  }
+  static_assert(f(0) == 1, "");
+}
diff --git a/test/SemaCXX/constant-expression-cxx1z.cpp b/test/SemaCXX/constant-expression-cxx1z.cpp
new file mode 100644
index 0000000000000..e84de44d374a0
--- /dev/null
+++ b/test/SemaCXX/constant-expression-cxx1z.cpp
@@ -0,0 +1,27 @@
+// RUN: %clang_cc1 -std=c++1z -verify %s -fcxx-exceptions -triple=x86_64-linux-gnu
+
+namespace BaseClassAggregateInit {
+  struct A {
+    int a, b, c;
+    constexpr A(int n) : a(n), b(3 * n), c(b - 1) {} // expected-note {{outside the range of representable}}
+    constexpr A() : A(10) {};
+  };
+  struct B : A {};
+  struct C { int q; };
+  struct D : B, C { int k; };
+
+  constexpr D d1 = { 1, 2, 3 };
+  static_assert(d1.a == 1 && d1.b == 3 && d1.c == 2 && d1.q == 2 && d1.k == 3);
+
+  constexpr D d2 = { 14 };
+  static_assert(d2.a == 14 && d2.b == 42 && d2.c == 41 && d2.q == 0 && d2.k == 0);
+
+  constexpr D d3 = { A(5), C{2}, 1 };
+  static_assert(d3.a == 5 && d3.b == 15 && d3.c == 14 && d3.q == 2 && d3.k == 1);
+
+  constexpr D d4 = {};
+  static_assert(d4.a == 10 && d4.b == 30 && d4.c == 29 && d4.q == 0 && d4.k == 0);
+
+  constexpr D d5 = { __INT_MAX__ }; // expected-error {{must be initialized by a constant expression}}
+  // expected-note-re@-1 {{in call to 'A({{.*}})'}}
+}
diff --git a/test/SemaCXX/constexpr-nqueens.cpp b/test/SemaCXX/constexpr-nqueens.cpp
index b158d6e4b6e07..47133a2934340 100644
--- a/test/SemaCXX/constexpr-nqueens.cpp
+++ b/test/SemaCXX/constexpr-nqueens.cpp
@@ -10,26 +10,26 @@ struct Board {
   constexpr Board(const Board &O) : State(O.State), Failed(O.Failed) {}
   constexpr Board(uint64_t State, bool Failed = false) :
     State(State), Failed(Failed) {}
-  constexpr Board addQueen(int Row, int Col) {
+  constexpr Board addQueen(int Row, int Col) const {
     return Board(State | ((uint64_t)Row << (Col * 4)));
   }
-  constexpr int getQueenRow(int Col) {
+  constexpr int getQueenRow(int Col) const {
     return (State >> (Col * 4)) & 0xf;
   }
-  constexpr bool ok(int Row, int Col) {
+  constexpr bool ok(int Row, int Col) const {
     return okRecurse(Row, Col, 0);
   }
-  constexpr bool okRecurse(int Row, int Col, int CheckCol) {
+  constexpr bool okRecurse(int Row, int Col, int CheckCol) const {
     return Col == CheckCol ? true :
            getQueenRow(CheckCol) == Row ? false :
            getQueenRow(CheckCol) == Row + (Col - CheckCol) ? false :
            getQueenRow(CheckCol) == Row + (CheckCol - Col) ? false :
            okRecurse(Row, Col, CheckCol + 1);
   }
-  constexpr bool at(int Row, int Col) {
+  constexpr bool at(int Row, int Col) const {
     return getQueenRow(Col) == Row;
   }
-  constexpr bool check(const char *, int=0, int=0);
+  constexpr bool check(const char *, int=0, int=0) const;
 };
 
 constexpr Board buildBoardRecurse(int N, int Col, const Board &B);
@@ -54,7 +54,7 @@ constexpr Board buildBoard(int N) {
 
 constexpr Board q8 = buildBoard(8);
 
-constexpr bool Board::check(const char *p, int Row, int Col) {
+constexpr bool Board::check(const char *p, int Row, int Col) const {
   return
     *p == '\n' ? check(p+1, Row+1, 0) :
     *p == 'o' ? at(Row, Col) && check(p+1, Row, Col+1) :
diff --git a/test/SemaCXX/constexpr-value-init.cpp b/test/SemaCXX/constexpr-value-init.cpp
index 065111133551a..3528fdcd8816e 100644
--- a/test/SemaCXX/constexpr-value-init.cpp
+++ b/test/SemaCXX/constexpr-value-init.cpp
@@ -14,7 +14,7 @@ void f() {
   constexpr A a; // expected-error {{constant expression}} expected-note {{in call to 'A()'}}
 }
 
-constexpr B b1; // expected-error {{without a user-provided default constructor}}
+constexpr B b1; // ok
 constexpr B b2 = B(); // ok
 static_assert(b2.a.a == 1, "");
 static_assert(b2.a.b == 2, "");
diff --git a/test/SemaCXX/constructor-recovery.cpp b/test/SemaCXX/constructor-recovery.cpp
index c1bb43628356e..a0d8441012c57 100644
--- a/test/SemaCXX/constructor-recovery.cpp
+++ b/test/SemaCXX/constructor-recovery.cpp
@@ -1,9 +1,9 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s
 
-struct C {
+struct C { // expected-note 1+{{candidate}}
   virtual C() = 0; // expected-error{{constructor cannot be declared 'virtual'}}
 };
 
 void f() {
- C c;
+ C c; // expected-error {{no matching constructor}}
 }
diff --git a/test/SemaCXX/conversion-function.cpp b/test/SemaCXX/conversion-function.cpp
index 649f6b4dce946..3f494cce8ce3f 100644
--- a/test/SemaCXX/conversion-function.cpp
+++ b/test/SemaCXX/conversion-function.cpp
@@ -1,4 +1,7 @@
 // RUN: %clang_cc1 -triple %itanium_abi_triple -fsyntax-only -Wbind-to-temporary-copy -verify %s 
+// RUN: %clang_cc1 -triple %itanium_abi_triple -fsyntax-only -Wbind-to-temporary-copy -verify -std=c++98 %s
+// RUN: %clang_cc1 -triple %itanium_abi_triple -fsyntax-only -Wbind-to-temporary-copy -verify -std=c++11 %s
+
 class X { 
 public:
   operator bool();
@@ -133,7 +136,12 @@ private:
 
 A1 f() {
   // FIXME: redundant diagnostics!
-  return "Hello"; // expected-error {{calling a private constructor}} expected-warning {{an accessible copy constructor}}
+  return "Hello"; // expected-error {{calling a private constructor}}
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{an accessible copy constructor}}
+#else
+  // expected-warning@-4 {{copying parameter of type 'A1' when binding a reference to a temporary would invoke an inaccessible constructor in C++98}}
+#endif
 }
 
 namespace source_locations {
@@ -175,7 +183,13 @@ namespace crazy_declarators {
     (&operator bool())(); // expected-error {{use a typedef to declare a conversion to 'bool (&)()'}}
     *operator int();  // expected-error {{put the complete type after 'operator'}}
     // No suggestion of using a typedef here; that's not possible.
-    template<typename T> (&operator T())(); // expected-error-re {{cannot specify any part of a return type in the declaration of a conversion function{{$}}}}
+    template<typename T> (&operator T())();
+#if __cplusplus <= 199711L
+    // expected-error-re@-2 {{cannot specify any part of a return type in the declaration of a conversion function{{$}}}}
+#else
+    // expected-error-re@-4 {{cannot specify any part of a return type in the declaration of a conversion function; use an alias template to declare a conversion to 'T (&)()'{{$}}}}
+#endif
+
   };
 }
 
@@ -193,6 +207,10 @@ namespace smart_ptr {
   };
 
   struct X { // expected-note{{candidate constructor (the implicit copy constructor) not}}
+#if __cplusplus >= 201103L
+  // expected-note@-2 {{candidate constructor (the implicit move constructor) not}}
+#endif
+
     explicit X(Y);
   };
 
@@ -215,7 +233,12 @@ struct Other {
 };
 
 void test_any() {
-  Any any = Other(); // expected-error{{cannot pass object of non-POD type 'Other' through variadic constructor; call will abort at runtime}}
+  Any any = Other();
+#if __cplusplus <= 199711L
+  // expected-error@-2 {{cannot pass object of non-POD type 'Other' through variadic constructor; call will abort at runtime}}
+#else
+  // expected-error@-4 {{cannot pass object of non-trivial type 'Other' through variadic constructor; call will abort at runtime}}
+#endif
 }
 
 namespace PR7055 {
diff --git a/test/SemaCXX/conversion.cpp b/test/SemaCXX/conversion.cpp
index 4c4089c6aae77..7b86cecdbcef7 100644
--- a/test/SemaCXX/conversion.cpp
+++ b/test/SemaCXX/conversion.cpp
@@ -228,3 +228,73 @@ namespace test10 {
     assert(test2(x));
   }
 }
+
+namespace test11 {
+
+#define assert11(expr) ((expr) ? 0 : 0)
+
+// The whitespace in macro run1 are important to trigger the macro being split
+// over multiple SLocEntry's.
+#define run1() (dostuff() ? \
+    NULL                                   : NULL)
+#define run2() (dostuff() ? NULL : NULL)
+int dostuff ();
+
+void test(const char * content_type) {
+  assert11(run1());
+  assert11(run2());
+}
+
+}
+
+namespace test12 {
+
+#define x return NULL;
+
+bool run() {
+  x  // expected-warning{{}}
+}
+
+}
+
+// More tests with macros.  Specficially, test function-like macros that either
+// have a pointer return type or take pointer arguments.  Basically, if the
+// macro was changed into a function and Clang doesn't warn, then it shouldn't
+// warn for the macro either.
+namespace test13 {
+#define check_str_nullptr_13(str) ((str) ? str : nullptr)
+#define check_str_null_13(str) ((str) ? str : NULL)
+#define test13(condition) if (condition) return;
+#define identity13(arg) arg
+#define CHECK13(condition) test13(identity13(!(condition)))
+
+void function1(const char* str) {
+  CHECK13(check_str_nullptr_13(str));
+  CHECK13(check_str_null_13(str));
+}
+
+bool some_bool_function(bool);
+void function2() {
+  CHECK13(some_bool_function(nullptr));  // expected-warning{{implicit conversion of nullptr constant to 'bool'}}
+  CHECK13(some_bool_function(NULL));  // expected-warning{{implicit conversion of NULL constant to 'bool'}}
+}
+
+#define run_check_nullptr_13(str) \
+    if (check_str_nullptr_13(str)) return;
+#define run_check_null_13(str) \
+    if (check_str_null_13(str)) return;
+void function3(const char* str) {
+  run_check_nullptr_13(str)
+  run_check_null_13(str)
+  if (check_str_nullptr_13(str)) return;
+  if (check_str_null_13(str)) return;
+}
+
+void run(int* ptr);
+#define conditional_run_13(ptr) \
+    if (ptr) run(ptr);
+void function4() {
+  conditional_run_13(nullptr);
+  conditional_run_13(NULL);
+}
+}
diff --git a/test/SemaCXX/crashes.cpp b/test/SemaCXX/crashes.cpp
index 926d13ab4533c..a80587d8405b3 100644
--- a/test/SemaCXX/crashes.cpp
+++ b/test/SemaCXX/crashes.cpp
@@ -105,8 +105,7 @@ namespace PR9026 {
 namespace PR10270 {
   template<typename T> class C;
   template<typename T> void f() {
-    if (C<T> == 1) // expected-error{{expected unqualified-id}} \
-                   // expected-error{{invalid '==' at end of declaration}}
+    if (C<T> == 1) // expected-error{{expected unqualified-id}}
       return;
   }
 }
diff --git a/test/SemaCXX/cstyle-cast.cpp b/test/SemaCXX/cstyle-cast.cpp
index afac6a137ec4a..2327d7b51d97c 100644
--- a/test/SemaCXX/cstyle-cast.cpp
+++ b/test/SemaCXX/cstyle-cast.cpp
@@ -84,11 +84,11 @@ void t_529_2()
   (void)(void*)((int*)0);
   (void)(volatile const void*)((const int*)0);
   (void)(A*)((B*)0);
-  (void)(A&)(*((B*)0));
+  (void)(A&)(*((B*)0)); // expected-warning {{binding dereferenced null pointer to reference has undefined behavior}}
   (void)(const B*)((C1*)0);
-  (void)(B&)(*((C1*)0));
+  (void)(B&)(*((C1*)0)); // expected-warning {{binding dereferenced null pointer to reference has undefined behavior}}
   (void)(A*)((D*)0);
-  (void)(const A&)(*((D*)0));
+  (void)(const A&)(*((D*)0)); // expected-warning {{binding dereferenced null pointer to reference has undefined behavior}}
   (void)(int B::*)((int A::*)0);
   (void)(void (B::*)())((void (A::*)())0);
   (void)(A*)((E*)0); // C-style cast ignores access control
diff --git a/test/SemaCXX/cxx0x-cursory-default-delete.cpp b/test/SemaCXX/cxx0x-cursory-default-delete.cpp
index dfca17ad7cd81..17215fedf0aba 100644
--- a/test/SemaCXX/cxx0x-cursory-default-delete.cpp
+++ b/test/SemaCXX/cxx0x-cursory-default-delete.cpp
@@ -11,6 +11,7 @@ struct non_const_copy {
   non_const_copy& operator = (non_const_copy&) &;
   non_const_copy& operator = (non_const_copy&) &&;
   non_const_copy() = default; // expected-note {{not viable}}
+  int uninit_field;
 };
 non_const_copy::non_const_copy(non_const_copy&) = default; // expected-note {{not viable}}
 non_const_copy& non_const_copy::operator = (non_const_copy&) & = default; // expected-note {{not viable}}
@@ -30,6 +31,98 @@ void fn1 () {
   ncc = cncc; // expected-error {{no viable overloaded}}
 };
 
+struct no_fields { };
+struct all_init {
+  int a = 0;
+  int b = 0;
+};
+struct some_init {
+  int a = 0;
+  int b;
+  int c = 0;
+};
+struct some_init_mutable {
+  int a = 0;
+  mutable int b;
+  int c = 0;
+};
+struct some_init_def {
+  some_init_def() = default;
+  int a = 0;
+  int b;
+  int c = 0;
+};
+struct some_init_ctor {
+  some_init_ctor();
+  int a = 0;
+  int b;
+  int c = 0;
+};
+struct sub_some_init : public some_init_def { };
+struct sub_some_init_ctor : public some_init_def {
+  sub_some_init_ctor();
+};
+struct sub_some_init_ctor2 : public some_init_ctor {
+};
+struct some_init_container {
+  some_init_def sid;
+};
+struct some_init_container_ctor {
+  some_init_container_ctor();
+  some_init_def sid;
+};
+struct no_fields_container {
+  no_fields nf;
+};
+struct param_pack_ctor {
+  template <typename... T>
+  param_pack_ctor(T...);
+  int n;
+};
+struct param_pack_ctor_field {
+  param_pack_ctor ndc;
+};
+struct multi_param_pack_ctor {
+  template <typename... T, typename... U>
+  multi_param_pack_ctor(T..., U..., int f = 0);
+  int n;
+};
+struct ignored_template_ctor_and_def {
+  template <class T> ignored_template_ctor_and_def(T* f = nullptr);
+  ignored_template_ctor_and_def() = default;
+  int field;
+};
+template<bool, typename = void> struct enable_if {};
+template<typename T> struct enable_if<true, T> { typedef T type; };
+struct multi_param_pack_and_defaulted {
+  template <typename... T,
+            typename enable_if<sizeof...(T) != 0>::type* = nullptr>
+  multi_param_pack_and_defaulted(T...);
+  multi_param_pack_and_defaulted() = default;
+  int n;
+};
+
+void constobjs() {
+  const no_fields nf; // ok
+  const all_init ai; // ok
+  const some_init si; // expected-error {{default initialization of an object of const type 'const some_init' without a user-provided default constructor}}
+  const some_init_mutable sim; // ok
+  const some_init_def sid; // expected-error {{default initialization of an object of const type 'const some_init_def' without a user-provided default constructor}}
+  const some_init_ctor sic; // ok
+  const sub_some_init ssi; // expected-error {{default initialization of an object of const type 'const sub_some_init' without a user-provided default constructor}}
+  const sub_some_init_ctor ssic; // ok
+  const sub_some_init_ctor2 ssic2; // ok
+  const some_init_container sicon; // expected-error {{default initialization of an object of const type 'const some_init_container' without a user-provided default constructor}}
+  const some_init_container_ctor siconc; // ok
+  const no_fields_container nfc; // ok
+  const param_pack_ctor ppc; // ok
+  const param_pack_ctor_field ppcf; // ok
+  const multi_param_pack_ctor mppc; // ok
+  const multi_param_pack_and_defaulted mppad; // expected-error {{default initialization of an object of const type 'const multi_param_pack_and_defaulted' without a user-provided default constructor}}
+  const ignored_template_ctor_and_def itcad; // expected-error {{default initialization of an object of const type 'const ignored_template_ctor_and_def' without a user-provided default constructor}}
+
+}
+
 struct non_const_derived : non_const_copy {
   non_const_derived(const non_const_derived&) = default; // expected-error {{requires it to be non-const}}
   non_const_derived& operator =(non_const_derived&) = default;
diff --git a/test/SemaCXX/cxx0x-defaulted-functions.cpp b/test/SemaCXX/cxx0x-defaulted-functions.cpp
index 617a25716314b..16e20ff4964d8 100644
--- a/test/SemaCXX/cxx0x-defaulted-functions.cpp
+++ b/test/SemaCXX/cxx0x-defaulted-functions.cpp
@@ -150,6 +150,14 @@ namespace PR13527 {
   Y::~Y() = default; // expected-error {{definition of explicitly defaulted}}
 }
 
+namespace PR27699 {
+  struct X {
+    X();
+  };
+  X::X() = default; // expected-note {{here}}
+  X::X() = default; // expected-error {{redefinition of 'X'}}
+}
+
 namespace PR14577 {
   template<typename T>
   struct Outer {
@@ -188,3 +196,15 @@ namespace PR15597 {
   A<int> a;
   B<int> b; // expected-note {{here}}
 }
+
+namespace PR27941 {
+struct ExplicitBool {
+  ExplicitBool &operator=(bool) = default; // expected-error{{only special member functions may be defaulted}}
+  int member;
+};
+
+int fn() {
+  ExplicitBool t;
+  t = true;
+}
+}
diff --git a/test/SemaCXX/cxx11-attr-print.cpp b/test/SemaCXX/cxx11-attr-print.cpp
index 999eaed618026..12b41758c74df 100644
--- a/test/SemaCXX/cxx11-attr-print.cpp
+++ b/test/SemaCXX/cxx11-attr-print.cpp
@@ -16,6 +16,15 @@ int a __attribute__((deprecated("warning")));
 // CHECK: int b {{\[}}[gnu::deprecated("warning")]];
 int b [[gnu::deprecated("warning")]];
 
+// CHECK: __declspec(deprecated("warning"))
+__declspec(deprecated("warning")) int c;
+
+// CHECK: int d {{\[}}[deprecated("warning")]];
+int d [[deprecated("warning")]];
+
+// CHECK: __attribute__((deprecated("warning", "fixit")));
+int e __attribute__((deprecated("warning", "fixit")));
+
 // CHECK: int cxx11_alignas alignas(4);
 alignas(4) int cxx11_alignas;
 
diff --git a/test/SemaCXX/cxx11-gnu-attrs.cpp b/test/SemaCXX/cxx11-gnu-attrs.cpp
index d20617815e6a0..231be727714e8 100644
--- a/test/SemaCXX/cxx11-gnu-attrs.cpp
+++ b/test/SemaCXX/cxx11-gnu-attrs.cpp
@@ -19,6 +19,7 @@ int *[[gnu::unused]] attr_on_ptr;
 [[gnu::fastcall]] void pr17424_4() [[gnu::stdcall]];
 // expected-warning@-1 {{calling convention 'fastcall' ignored for this target}}
 // expected-warning@-2 {{attribute 'stdcall' ignored, because it cannot be applied to a type}}
+// expected-warning@-3 {{calling convention 'stdcall' ignored for this target}}
 void pr17424_5 [[gnu::fastcall]]();
 // expected-warning@-1 {{calling convention 'fastcall' ignored for this target}}
 
diff --git a/test/SemaCXX/cxx11-inheriting-ctors.cpp b/test/SemaCXX/cxx11-inheriting-ctors.cpp
index 04aa117b29dd2..9c33ac05cc5f5 100644
--- a/test/SemaCXX/cxx11-inheriting-ctors.cpp
+++ b/test/SemaCXX/cxx11-inheriting-ctors.cpp
@@ -34,3 +34,25 @@ namespace WrongIdent {
     using B::A;
   };
 }
+
+namespace DefaultCtorConflict {
+  struct A { A(int = 0); };
+  struct B : A {
+    using A::A;
+  } b; // ok, not ambiguous, inherited constructor suppresses implicit default constructor
+  struct C {
+    B b;
+  } c;
+}
+
+namespace InvalidConstruction {
+  struct A { A(int); };
+  struct B { B() = delete; };
+  struct C : A, B { using A::A; };
+  // Initialization here is performed as if by a defaulted default constructor,
+  // which would be ill-formed (in the immediate context) in this case because
+  // it would be defined as deleted.
+  template<typename T> void f(decltype(T(0))*);
+  template<typename T> int &f(...);
+  int &r = f<C>(0);
+}
diff --git a/test/SemaCXX/cxx1y-deduced-return-type.cpp b/test/SemaCXX/cxx1y-deduced-return-type.cpp
index 225d2348ccdc8..593ec48b4394c 100644
--- a/test/SemaCXX/cxx1y-deduced-return-type.cpp
+++ b/test/SemaCXX/cxx1y-deduced-return-type.cpp
@@ -496,3 +496,13 @@ namespace TrailingReturnTypeForConversionOperator {
     }
   };
 };
+
+namespace PR24989 {
+  auto x = [](auto){};
+  using T = decltype(x);
+  void (T::*p)(int) const = &T::operator();
+}
+
+void forinit_decltypeauto() {
+  for (decltype(auto) forinit_decltypeauto_inner();;) {} // expected-warning {{interpreted as a function}} expected-note {{replace}}
+}
diff --git a/test/SemaCXX/cxx1y-init-captures.cpp b/test/SemaCXX/cxx1y-init-captures.cpp
index d36882d8e5d14..d681954707d06 100644
--- a/test/SemaCXX/cxx1y-init-captures.cpp
+++ b/test/SemaCXX/cxx1y-init-captures.cpp
@@ -196,3 +196,13 @@ namespace N3922 {
   auto a = [x{X()}] { return x.n; }; // ok
   auto b = [x = {X()}] {}; // expected-error{{<initializer_list>}}
 }
+
+namespace init_capture_non_mutable {
+void test(double weight) {
+  double init;
+  auto find = [max = init](auto current) {
+    max = current; // expected-error{{cannot assign to a variable captured by copy in a non-mutable lambda}}
+  };
+  find(weight); // expected-note {{in instantiation of function template specialization}}
+}
+}
diff --git a/test/SemaCXX/cxx1y-variable-templates_in_class.cpp b/test/SemaCXX/cxx1y-variable-templates_in_class.cpp
index 65e2d6b608c2b..e2fbdfd699543 100644
--- a/test/SemaCXX/cxx1y-variable-templates_in_class.cpp
+++ b/test/SemaCXX/cxx1y-variable-templates_in_class.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -verify -fsyntax-only %s -Wno-c++11-extensions -Wno-c++1y-extensions -DPRECXX11
+// RUN: %clang_cc1 -std=c++98 -verify -fsyntax-only %s -Wno-c++11-extensions -Wno-c++1y-extensions -DPRECXX11
 // RUN: %clang_cc1 -std=c++11 -verify -fsyntax-only -Wno-c++1y-extensions %s
-// RUN: %clang_cc1 -std=c++1y -verify -fsyntax-only %s
+// RUN: %clang_cc1 -std=c++1y -verify -fsyntax-only %s -DCPP1Y
 
 #define CONST const
 
@@ -338,3 +338,47 @@ namespace b20896909 {
     A<int> ai;  // expected-note {{in instantiation of}}
   }
 }
+namespace member_access_is_ok {
+#ifdef CPP1Y
+  namespace ns1 {
+    struct A {
+      template<class T, T N> constexpr static T Var = N;
+    };
+    static_assert(A{}.Var<int,5> == 5,"");
+  } // end ns1
+#endif // CPP1Y
+
+namespace ns2 {
+  template<class T> struct A {
+    template<class U, T N, U M> static T&& Var;
+  };
+  template<class T> template<class U, T N, U M> T&& A<T>::Var = T(N + M);
+  int *AV = &A<int>().Var<char, 5, 'A'>;
+  
+} //end ns2
+} // end ns member_access_is_ok
+
+#ifdef CPP1Y
+namespace PR24473 {
+struct Value
+{
+    template<class T>
+    static constexpr T value = 0;
+};
+
+template<typename TValue>
+struct Something
+{
+    void foo() {
+        static_assert(TValue::template value<int> == 0, ""); // error
+    }
+};
+
+int main() { 
+    Something<Value>{}.foo();
+    return 0;
+}
+
+} // end ns PR24473
+#endif // CPP1Y
+
diff --git a/test/SemaCXX/cxx1y-variable-templates_top_level.cpp b/test/SemaCXX/cxx1y-variable-templates_top_level.cpp
index 787868fae1764..496ae888732f1 100644
--- a/test/SemaCXX/cxx1y-variable-templates_top_level.cpp
+++ b/test/SemaCXX/cxx1y-variable-templates_top_level.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fsyntax-only -Wno-c++11-extensions -Wno-c++1y-extensions %s -DPRECXX11
+// RUN: %clang_cc1 -std=c++98 -verify -fsyntax-only -Wno-c++11-extensions -Wno-c++1y-extensions %s -DPRECXX11
 // RUN: %clang_cc1 -std=c++11 -verify -fsyntax-only -Wno-c++1y-extensions %s
 // RUN: %clang_cc1 -std=c++1y -verify -fsyntax-only %s
 
@@ -458,3 +458,9 @@ namespace PR19169 {
   template<> int g<double>; // expected-error {{no variable template matches specialization; did you mean to use 'g' as function template instead?}}
 }
 
+#ifndef PRECXX11
+template <typename... Args> struct Variadic_t { };
+template <typename... Args> Variadic_t<Args...> Variadic;
+auto variadic1 = Variadic<>;
+auto variadic2 = Variadic<int, int>;
+#endif
diff --git a/test/SemaCXX/cxx1z-constexpr-lambdas.cpp b/test/SemaCXX/cxx1z-constexpr-lambdas.cpp
new file mode 100644
index 0000000000000..8e7657fb6e8d5
--- /dev/null
+++ b/test/SemaCXX/cxx1z-constexpr-lambdas.cpp
@@ -0,0 +1,36 @@
+// RUN: %clang_cc1 -std=c++1z -verify -fsyntax-only -fblocks %s
+// RUN: %clang_cc1 -std=c++1z -verify -fsyntax-only -fblocks -fdelayed-template-parsing %s 
+// RUN: %clang_cc1 -std=c++1z -verify -fsyntax-only -fblocks -fms-extensions %s 
+// RUN: %clang_cc1 -std=c++1z -verify -fsyntax-only -fblocks -fdelayed-template-parsing -fms-extensions %s 
+
+namespace test_constexpr_checking {
+
+namespace ns1 {
+  struct NonLit { ~NonLit(); };  //expected-note{{not literal}}
+  auto L = [](NonLit NL) constexpr { }; //expected-error{{not a literal type}}
+} // end ns1
+
+namespace ns2 {
+  auto L = [](int I) constexpr { asm("non-constexpr");  }; //expected-error{{not allowed in constexpr function}}
+} // end ns1
+
+} // end ns test_constexpr_checking
+
+namespace test_constexpr_call {
+
+namespace ns1 {
+  auto L = [](int I) { return I; };
+  static_assert(L(3) == 3);
+} // end ns1
+namespace ns2 {
+  auto L = [](auto a) { return a; };
+  static_assert(L(3) == 3);
+  static_assert(L(3.14) == 3.14);
+}
+namespace ns3 {
+  auto L = [](auto a) { asm("non-constexpr"); return a; }; //expected-note{{declared here}}
+  constexpr int I =  //expected-error{{must be initialized by a constant expression}}
+      L(3); //expected-note{{non-constexpr function}}
+} 
+
+} // end ns test_constexpr_call
+\ No newline at end of file
diff --git a/test/SemaCXX/cxx1z-init-statement-warn-unused.cpp b/test/SemaCXX/cxx1z-init-statement-warn-unused.cpp
new file mode 100644
index 0000000000000..5390da4f3f5a6
--- /dev/null
+++ b/test/SemaCXX/cxx1z-init-statement-warn-unused.cpp
@@ -0,0 +1,26 @@
+// RUN: %clang_cc1 -std=c++1z -verify -Wuninitialized %s
+
+void testIf() {
+  if (bool b; b) // expected-warning {{uninitialized}} expected-note {{to silence}}
+    ;
+  if (int a, b = 2; a) // expected-warning {{uninitialized}} expected-note {{to silence}}
+    ;
+  int a;
+  if (a = 0; a) {} // OK
+}
+
+void testSwitch() {
+  switch (bool b; b) { // expected-warning {{uninitialized}} expected-warning {{boolean value}} expected-note {{to silence}}
+    case 0:
+      break;
+  }
+  switch (int a, b = 7; a) { // expected-warning {{uninitialized}} expected-note {{to silence}}
+    case 0:
+      break;
+  }
+  int c;
+  switch (c = 0; c) { // OK
+    case 0:
+      break;
+  }
+}
diff --git a/test/SemaCXX/cxx1z-init-statement.cpp b/test/SemaCXX/cxx1z-init-statement.cpp
new file mode 100644
index 0000000000000..4afe0402d13f2
--- /dev/null
+++ b/test/SemaCXX/cxx1z-init-statement.cpp
@@ -0,0 +1,91 @@
+// RUN: %clang_cc1 -std=c++1z -verify %s
+
+void testIf() {
+  int x = 0;
+  if (x; x) ++x;
+  if (int t = 0; t) ++t; else --t;
+
+  if (int x, y = 0; y) // expected-note 2 {{previous definition is here}}
+    int x = 0; // expected-error {{redefinition of 'x'}}
+  else
+    int x = 0; // expected-error {{redefinition of 'x'}}
+
+  if (x; int a = 0) ++a;
+  if (x, +x; int a = 0) // expected-note 2 {{previous definition is here}} expected-warning {{unused}}
+    int a = 0; // expected-error {{redefinition of 'a'}}
+  else
+    int a = 0; // expected-error {{redefinition of 'a'}}
+
+  if (int b = 0; b)
+    ;
+  b = 2; // expected-error {{use of undeclared identifier}}
+}
+
+void testSwitch() {
+  int x = 0;
+  switch (x; x) {
+    case 1:
+      ++x;
+  }
+
+  switch (int x, y = 0; y) {
+    case 1:
+      ++x;
+    default:
+      ++y;
+  }
+
+  switch (int x, y = 0; y) { // expected-note 2 {{previous definition is here}}
+    case 0:
+      int x = 0; // expected-error {{redefinition of 'x'}}
+    case 1:
+      int y = 0; // expected-error {{redefinition of 'y'}}
+  };
+
+  switch (x; int a = 0) {
+    case 0:
+      ++a;
+  }
+
+  switch (x, +x; int a = 0) { // expected-note {{previous definition is here}} expected-warning {{unused}}
+    case 0:
+      int a = 0; // expected-error {{redefinition of 'a'}} // expected-note {{previous definition is here}}
+    case 1:
+      int a = 0; // expected-error {{redefinition of 'a'}}
+  }
+
+  switch (int b = 0; b) {
+    case 0:
+      break;
+  }
+  b = 2; // expected-error {{use of undeclared identifier}}
+}
+
+constexpr bool constexpr_if_init(int n) {
+  if (int a = n; ++a > 0)
+    return true;
+  else
+    return false;
+}
+
+constexpr int constexpr_switch_init(int n) {
+  switch (int p = n + 2; p) {
+    case 0:
+      return 0;
+    case 1:
+      return 1;
+    default:
+      return -1;
+  }
+}
+
+void test_constexpr_init_stmt() {
+  constexpr bool a = constexpr_if_init(-2);
+  static_assert(!a, "");
+  static_assert(constexpr_if_init(1), "");
+
+  constexpr int b = constexpr_switch_init(-1);
+  static_assert(b == 1, "");
+  static_assert(constexpr_switch_init(-2) == 0, "");
+  static_assert(constexpr_switch_init(-5) == -1, "");
+}
diff --git a/test/SemaCXX/cxx1z-lambda-star-this.cpp b/test/SemaCXX/cxx1z-lambda-star-this.cpp
new file mode 100644
index 0000000000000..c23dd7b427195
--- /dev/null
+++ b/test/SemaCXX/cxx1z-lambda-star-this.cpp
@@ -0,0 +1,231 @@
+// RUN: %clang_cc1 -std=c++1z -verify -fsyntax-only -fblocks -emit-llvm-only %s
+// RUN: %clang_cc1 -std=c++1z -verify -fsyntax-only -fblocks -fdelayed-template-parsing %s -DDELAYED_TEMPLATE_PARSING
+// RUN: %clang_cc1 -std=c++1z -verify -fsyntax-only -fblocks -fms-extensions %s -DMS_EXTENSIONS
+// RUN: %clang_cc1 -std=c++1z -verify -fsyntax-only -fblocks -fdelayed-template-parsing -fms-extensions %s -DMS_EXTENSIONS -DDELAYED_TEMPLATE_PARSING
+
+template<class, class> constexpr bool is_same = false;
+template<class T> constexpr bool is_same<T, T> = true;
+
+namespace test_star_this {
+namespace ns1 {
+class A {
+  int x = 345;
+  auto foo() {
+    (void) [*this, this] { };  //expected-error{{'this' can appear only once}}
+    (void) [this] { ++x; };
+    (void) [*this] { ++x; };  //expected-error{{read-only variable}}
+    (void) [*this] () mutable { ++x; };
+    (void) [=] { return x; };
+    (void) [&, this] { return x; };
+    (void) [=, *this] { return x; };
+    (void) [&, *this] { return x; };
+  }
+};
+} // end ns1
+
+namespace ns2 {
+  class B {
+    B(const B&) = delete; //expected-note{{deleted here}}
+    int *x = (int *) 456;
+    void foo() {
+      (void)[this] { return x; };
+      (void)[*this] { return x; }; //expected-error{{call to deleted}}
+    }
+  };
+} // end ns2
+namespace ns3 {
+  class B {
+    B(const B&) = delete; //expected-note2{{deleted here}}
+    
+    int *x = (int *) 456;
+    public: 
+    template<class T = int>
+    void foo() {
+      (void)[this] { return x; };
+      (void)[*this] { return x; }; //expected-error2{{call to deleted}}
+    }
+    
+    B() = default;
+  } b;
+  B *c = (b.foo(), nullptr); //expected-note{{in instantiation}}
+} // end ns3
+
+namespace ns4 {
+template<class U>
+class B {
+  B(const B&) = delete; //expected-note{{deleted here}}
+  double d = 3.14;
+  public: 
+  template<class T = int>
+  auto foo() {
+    const auto &L = [*this] (auto a) mutable { //expected-error{{call to deleted}}
+      d += a; 
+      return [this] (auto b) { return d +=b; }; 
+    }; 
+  }
+  
+  B() = default;
+};
+void main() {
+  B<int*> b;
+  b.foo(); //expected-note{{in instantiation}}
+} // end main  
+} // end ns4
+namespace ns5 {
+
+struct X {
+  double d = 3.14;
+  X(const volatile X&);
+  void foo() {
+      
+  }
+  
+  void foo() const { //expected-note{{const}}
+    
+    auto L = [*this] () mutable { 
+      static_assert(is_same<decltype(this), X*>);
+      ++d;
+      auto M = [this] { 
+        static_assert(is_same<decltype(this), X*>);  
+        ++d;
+        auto N = [] {
+          static_assert(is_same<decltype(this), X*>); 
+        };
+      };
+    };
+    
+    auto L1 = [*this] { 
+      static_assert(is_same<decltype(this), const X*>);
+      auto M = [this] () mutable { 
+        static_assert(is_same<decltype(this), const X*>);  
+        auto N = [] {
+          static_assert(is_same<decltype(this), const X*>); 
+        };
+      };
+      auto M2 = [*this] () mutable { 
+        static_assert(is_same<decltype(this), X*>);  
+        auto N = [] {
+          static_assert(is_same<decltype(this), X*>); 
+        };
+      };
+    };
+    
+    auto GL1 = [*this] (auto a) { 
+      static_assert(is_same<decltype(this), const X*>);
+      auto M = [this] (auto b) mutable { 
+        static_assert(is_same<decltype(this), const X*>);  
+        auto N = [] (auto c) {
+          static_assert(is_same<decltype(this), const X*>); 
+        };
+        return N;
+      };
+      
+      auto M2 = [*this] (auto a) mutable { 
+        static_assert(is_same<decltype(this), X*>);  
+        auto N = [] (auto b) {
+          static_assert(is_same<decltype(this), X*>); 
+        };
+        return N;
+      };
+      return [=](auto a) mutable { M(a)(a); M2(a)(a); };
+    };
+    
+    GL1("abc")("abc");
+    
+    
+    auto L2 = [this] () mutable {
+      static_assert(is_same<decltype(this), const X*>);  
+      ++d; //expected-error{{cannot assign}}
+    };
+    auto GL = [*this] (auto a) mutable {
+      static_assert(is_same<decltype(this), X*>);
+      ++d;
+      auto M = [this] (auto b) { 
+        static_assert(is_same<decltype(this), X*>);  
+        ++d;
+        auto N = [] (auto c) {
+          static_assert(is_same<decltype(this), X*>); 
+        };
+        N(3.14);
+      };
+      M("abc");
+    };
+    GL(3.14);
+ 
+  }
+  void foo() volatile const {
+    auto L = [this] () {
+      static_assert(is_same<decltype(this), const volatile X*>);
+      auto M = [*this] () mutable { 
+        static_assert(is_same<decltype(this), X*>);
+        auto N = [this] {
+          static_assert(is_same<decltype(this), X*>);
+          auto M = [] {
+            static_assert(is_same<decltype(this), X*>);
+          };
+        };
+        auto N2 = [*this] {
+          static_assert(is_same<decltype(this), const X*>);
+        };
+      };
+      auto M2 = [*this] () {
+        static_assert(is_same<decltype(this), const X*>); 
+        auto N = [this] {
+          static_assert(is_same<decltype(this), const X*>);
+        };
+      };
+    };
+  }
+  
+};
+
+} //end ns5
+namespace ns6 {
+struct X {
+  double d;
+  auto foo() const {
+    auto L = [*this] () mutable {
+      auto M = [=] (auto a) {
+        auto N = [this] {
+          ++d;
+          static_assert(is_same<decltype(this), X*>);
+          auto O = [*this] {
+            static_assert(is_same<decltype(this), const X*>);
+          };
+        };
+        N();
+        static_assert(is_same<decltype(this), X*>);
+      };
+      return M;
+    };
+    return L;
+  }
+}; 
+
+int main() {
+  auto L = X{}.foo();
+  auto M = L();
+  M(3.14);
+}
+} // end ns6
+namespace ns7 {
+
+struct X {
+  double d;
+  X();
+  X(const X&); 
+  X(X&) = delete;
+  auto foo() const {
+    //OK - the object used to initialize our capture is a const object and so prefers the non-deleted ctor.
+    const auto &&L = [*this] { };
+  }
+  
+}; 
+int main() {
+  X x;
+  x.foo();
+}
+} // end ns7
+
+} //end ns test_star_this
+
diff --git a/test/SemaCXX/dcl_init_aggr.cpp b/test/SemaCXX/dcl_init_aggr.cpp
index 432c116466111..2b5149cf278f9 100644
--- a/test/SemaCXX/dcl_init_aggr.cpp
+++ b/test/SemaCXX/dcl_init_aggr.cpp
@@ -1,4 +1,6 @@
 // RUN: %clang_cc1 -fsyntax-only -pedantic -verify %s
+// RUN: %clang_cc1 -fsyntax-only -pedantic -verify -std=c++98 %s
+// RUN: %clang_cc1 -fsyntax-only -pedantic -verify -std=c++11 %s
 // C++ [dcl.init.aggr]p2
 struct A { 
   int x;
@@ -9,14 +11,29 @@ struct A {
 } a1 = { 1, { 2, 3 } };
 
 struct NonAggregate {
+#if __cplusplus >= 201103L
+// expected-note@-2 3 {{candidate constructor (the implicit copy constructor) not viable}}
+// expected-note@-3 3 {{candidate constructor (the implicit move constructor) not viable}}
+#endif
   NonAggregate();
-
+#if __cplusplus >= 201103L
+// expected-note@-2 3 {{candidate constructor not viable: requires 0 arguments, but 2 were provided}}
+#endif
   int a, b;
 };
-NonAggregate non_aggregate_test = { 1, 2 }; // expected-error{{non-aggregate type 'NonAggregate' cannot be initialized with an initializer list}}
-
-NonAggregate non_aggregate_test2[2] = { { 1, 2 }, { 3, 4 } }; // expected-error 2 {{non-aggregate type 'NonAggregate' cannot be initialized with an initializer list}}
-
+NonAggregate non_aggregate_test = { 1, 2 };
+#if __cplusplus <= 199711L
+// expected-error@-2 {{non-aggregate type 'NonAggregate' cannot be initialized with an initializer list}}
+#else
+// expected-error@-4 {{no matching constructor for initialization of 'NonAggregate'}}
+#endif
+
+NonAggregate non_aggregate_test2[2] = { { 1, 2 }, { 3, 4 } };
+#if __cplusplus <= 199711L
+// expected-error@-2 2 {{non-aggregate type 'NonAggregate' cannot be initialized with an initializer list}}
+#else
+// expected-error@-4 2 {{no matching constructor for initialization of 'NonAggregate'}}
+#endif
 
 // C++ [dcl.init.aggr]p3
 A a_init = A(); 
@@ -38,20 +55,55 @@ char cv[4] = { 'a', 's', 'd', 'f', 0 }; // expected-error{{excess elements in ar
 
 // C++ [dcl.init.aggr]p7
 struct TooFew { int a; char* b; int c; }; 
-TooFew too_few = { 1, "asdf" }; // expected-warning{{conversion from string literal to 'char *' is deprecated}}
+TooFew too_few = { 1, "asdf" };
+#if __cplusplus <= 199711L
+// expected-warning@-2 {{conversion from string literal to 'char *' is deprecated}}
+#else
+// expected-warning@-4 {{ISO C++11 does not allow conversion from string literal to 'char *'}}
+#endif
+
+struct NoDefaultConstructor {
+#if __cplusplus <= 199711L
+// expected-note@-2 3 {{candidate constructor (the implicit copy constructor)}}
+// expected-note@-3 {{declared here}}
+#else
+// expected-note@-5 4 {{candidate constructor (the implicit copy constructor)}}
+// expected-note@-6 4 {{candidate constructor (the implicit move constructor)}}
+#endif
+
+  NoDefaultConstructor(int);
+#if __cplusplus <= 199711L
+  // expected-note@-2 3 {{candidate constructor not viable: requires 1 argument, but 0 were provided}}
+#else
+  // expected-note@-4 4 {{candidate constructor not viable: requires 1 argument, but 0 were provided}}
+#endif
 
-struct NoDefaultConstructor { // expected-note 3 {{candidate constructor (the implicit copy constructor)}} \
-                              // expected-note{{declared here}}
-  NoDefaultConstructor(int); // expected-note 3 {{candidate constructor}}
 };
-struct TooFewError { // expected-error{{implicit default constructor for}}
+struct TooFewError {
+#if __cplusplus <= 199711L
+// expected-error@-2 {{implicit default constructor for}}
+#endif
+
   int a;
-  NoDefaultConstructor nodef; // expected-note{{member is declared here}} expected-note 2{{in implicit initialization of field 'nodef'}}
+  NoDefaultConstructor nodef;
+#if __cplusplus <= 199711L
+// expected-note@-2 {{member is declared here}}
+// expected-note@-3 2{{in implicit initialization of field 'nodef' with omitted initializer}}
+#else
+// expected-note@-5 3{{in implicit initialization of field 'nodef' with omitted initializer}}
+#endif
 };
 TooFewError too_few_okay = { 1, 1 };
 TooFewError too_few_error = { 1 }; // expected-error{{no matching constructor}}
 
-TooFewError too_few_okay2[2] = { 1, 1 }; // expected-note{{implicit default constructor for 'TooFewError' first required here}}
+TooFewError too_few_okay2[2] = { 1, 1 };
+#if __cplusplus <= 199711L
+// expected-note@-2 {{implicit default constructor for 'TooFewError' first required here}}
+#else
+// expected-error@-4 {{no matching constructor for initialization of 'NoDefaultConstructor'}}
+// expected-note@-5 {{in implicit initialization of array element 1 with omitted initializer}}
+#endif
+
 TooFewError too_few_error2[2] = { 1 }; // expected-error{{no matching constructor}}
 
 NoDefaultConstructor too_few_error3[3] = { }; // expected-error {{no matching constructor}} expected-note {{implicit initialization of array element 0}}
@@ -116,6 +168,10 @@ B2 b2_3 = { c2, a2, a2 };
 
 // C++ [dcl.init.aggr]p15:
 union u { int a; char* b; }; // expected-note{{candidate constructor (the implicit copy constructor)}}
+#if __cplusplus >= 201103L
+// expected-note@-2 {{candidate constructor (the implicit move constructor)}}
+#endif
+
 u u1 = { 1 }; 
 u u2 = u1; 
 u u3 = 1; // expected-error{{no viable conversion}}
diff --git a/test/SemaCXX/default2.cpp b/test/SemaCXX/default2.cpp
index c4d40b4280e96..8f77f300572b9 100644
--- a/test/SemaCXX/default2.cpp
+++ b/test/SemaCXX/default2.cpp
@@ -128,3 +128,7 @@ S<1> s;
 
 template <int I1 = I2, int I2 = 1> struct T {};  // expected-error-re {{use of undeclared identifier 'I2'{{$}}}}
 T<0, 1> t;
+
+struct PR28105 {
+  PR28105 (int = 0, int = 0, PR28105 = 0);  // expected-error{{recursive evaluation of default argument}}
+};
diff --git a/test/SemaCXX/delete-and-function-templates.cpp b/test/SemaCXX/delete-and-function-templates.cpp
new file mode 100644
index 0000000000000..22e95cb7937a4
--- /dev/null
+++ b/test/SemaCXX/delete-and-function-templates.cpp
@@ -0,0 +1,133 @@
+// RUN: %clang_cc1 -std=c++11 -verify -fsyntax-only  -emit-llvm-only %s
+// RUN: %clang_cc1 -std=c++11 -verify -fsyntax-only  -fdelayed-template-parsing %s 
+// RUN: %clang_cc1 -std=c++11 -verify -fsyntax-only  -fms-extensions %s 
+// RUN: %clang_cc1 -std=c++11 -verify -fsyntax-only  -fdelayed-template-parsing -fms-extensions %s 
+
+template<class T, class U> struct is_same { enum { value = false }; };
+template<class T> struct is_same<T, T> { enum { value = true }; };
+
+namespace test_sfinae_and_delete {
+
+namespace ns1 {
+template<class T> double f(T) = delete; //expected-note{{candidate}}
+char f(...); //expected-note{{candidate}}
+
+static_assert(is_same<decltype(f(3)),char>::value, ""); //expected-error{{call to deleted function}} expected-error{{static_assert failed}}
+
+template<class T> decltype(f(T{})) g(T); // this one sfinae's out.
+template<class T> int *g(T);
+void foo() {
+  int *ip = g(3);
+}
+} //end ns1
+
+namespace ns2 {
+template<class T> double* f(T);
+template<> double* f(double) = delete;
+
+template<class T> decltype(f(T{})) g(T); // expected-note{{candidate}}
+template<class T> int *g(T); //expected-note{{candidate}}
+void foo() {
+  double *dp = g(3); //expected-error{{ambiguous}}
+  int *ip = g(3.14); // this is OK - because the explicit specialization is deleted and sfinae's out one of the template candidates
+}
+
+} // end ns2
+
+namespace ns3 {
+template<class T> double* f(T) = delete;
+template<> double* f(double);
+
+template<class T> decltype(f(T{})) g(T); // expected-note{{candidate}}
+template<class T> int *g(T); //expected-note{{candidate}}
+
+void foo() {
+  int *dp = g(3); // this is OK - because the non-double specializations are deleted and sfinae's out one of the template candidates
+  double *ip = g(3.14); //expected-error{{ambiguous}}
+}
+
+} // end ns3
+} // end ns test_sfinae_and_delete
+
+namespace test_explicit_specialization_of_member {
+namespace ns1 {
+template<class T> struct X {
+  int* f(T) = delete;
+}; 
+template<> int* X<int>::f(int) { }
+
+template<class T> decltype(X<T>{}.f(T{})) g(T); // expected-note{{candidate}}
+template<class T> int *g(T); //expected-note{{candidate}}
+
+void foo() {
+  int *ip2 = g(3.14); // this is OK - because the non-int specializations are deleted and sfinae's out one of the template candidates
+  int *ip = g(3); //expected-error{{ambiguous}}
+}
+
+} // end ns1
+
+namespace ns2 {
+struct X {
+template<class T> double* f(T) = delete;
+}; 
+template<> double* X::f(int);
+
+template<class T> decltype(X{}.f(T{})) g(T); // expected-note{{candidate}}
+template<class T> int *g(T); //expected-note{{candidate}}
+
+void foo() {
+  int *ip2 = g(3.14); // this is OK - because the non-int specializations are deleted and sfinae's out one of the template candidates
+  int *ip = g(3); //expected-error{{ambiguous}}
+}
+
+} // end ns2
+
+namespace ns3 {
+template<class T> struct X {
+  template<class U> double *f1(U, T) = delete;
+  template<class U> double *f2(U, T) = delete;
+};
+template<> template<> double* X<int>::f1(int, int);
+template<> template<class U> double* X<int>::f2(U, int);
+
+template<class T, class U> decltype(X<T>{}.f1(U{}, T{})) g1(U, T); // expected-note{{candidate}}
+template<class T, class U> int *g1(U, T); //expected-note{{candidate}}
+
+template<class T, class U> decltype(X<T>{}.f2(U{}, T{})) g2(U, T); // expected-note2{{candidate}}
+template<class T, class U> int *g2(U, T); //expected-note2{{candidate}}
+
+
+void foo() {
+  int *ip2 = g1(3.14, 3); // this is OK - because the non-int specializations are deleted and sfinae's out one of the template candidates
+  int *ip = g1(3, 3); //expected-error{{ambiguous}}
+  {
+   int *ip3 = g2(3.14, 3); //expected-error{{ambiguous}}
+   int *ip4 = g2(3, 3); //expected-error{{ambiguous}}
+  }
+  {
+   int *ip3 = g2(3.14, 3.14); 
+   int *ip4 = g2(3, 3.14); 
+  }
+}
+
+
+} // end ns3
+
+namespace ns4 {
+template < typename T> T* foo (T);
+template <> int* foo(int) = delete;
+template <> int* foo(int); //expected-note{{candidate}}
+
+int *IP = foo(2); //expected-error{{deleted}}
+double *DP = foo(3.14);
+} //end ns4
+
+namespace ns5 {
+template < typename T> T* foo (T);
+template <> int* foo(int); //expected-note{{previous}}
+template <> int* foo(int) = delete; //expected-error{{deleted definition must be first declaration}}
+
+} //end ns5
+
+
+} // end test_explicit_specializations_and_delete
diff --git a/test/SemaCXX/deleted-operator.cpp b/test/SemaCXX/deleted-operator.cpp
index df67978a36d6c..f71e83aa25870 100644
--- a/test/SemaCXX/deleted-operator.cpp
+++ b/test/SemaCXX/deleted-operator.cpp
@@ -9,7 +9,7 @@ int PR10757f() {
   PR10757 a1;
   // FIXME: We get a ridiculous number of "built-in candidate" notes here...
   if(~a1) {} // expected-error {{overload resolution selected deleted operator}} expected-note 8 {{built-in candidate}}
-  if(a1==a1) {} // expected-error {{overload resolution selected deleted operator}} expected-note 121 {{built-in candidate}}
+  if(a1==a1) {} // expected-error {{overload resolution selected deleted operator}} expected-note 144 {{built-in candidate}}
 }
 
 struct DelOpDel {
diff --git a/test/SemaCXX/destructor.cpp b/test/SemaCXX/destructor.cpp
index e7323f90b3f75..fe1dde5771ace 100644
--- a/test/SemaCXX/destructor.cpp
+++ b/test/SemaCXX/destructor.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -std=c++11 -triple %itanium_abi_triple -fsyntax-only -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -verify %s
+// RUN: %clang_cc1 -std=c++11 -triple %itanium_abi_triple -fsyntax-only -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -fcxx-exceptions -verify %s
 // RUN: %clang_cc1 -std=c++11 -triple %ms_abi_triple -DMSABI -fsyntax-only -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -verify %s
 class A {
 public:
@@ -257,6 +257,7 @@ void nowarnnonpoly() {
   }
 }
 
+// FIXME: Why are these supposed to not warn?
 void nowarnarray() {
   {
     B* b = new B[4];
@@ -311,6 +312,14 @@ void nowarn0() {
   }
 }
 
+void nowarn0_explicit_dtor(F* f, VB* vb, VD* vd, VF* vf) {
+  f->~F();
+  f->~F();
+  vb->~VB();
+  vd->~VD();
+  vf->~VF();
+}
+
 void warn0() {
   {
     B* b = new B();
@@ -326,6 +335,17 @@ void warn0() {
   }
 }
 
+void warn0_explicit_dtor(B* b, B& br, D* d) {
+  b->~B(); // expected-warning {{destructor called on non-final 'dnvd::B' that has virtual functions but non-virtual destructor}} expected-note{{qualify call to silence this warning}}
+  b->B::~B(); // No warning when the call isn't virtual.
+
+  br.~B(); // expected-warning {{destructor called on non-final 'dnvd::B' that has virtual functions but non-virtual destructor}} expected-note{{qualify call to silence this warning}}
+  br.B::~B();
+
+  d->~D(); // expected-warning {{destructor called on non-final 'dnvd::D' that has virtual functions but non-virtual destructor}} expected-note{{qualify call to silence this warning}}
+  d->D::~D();
+}
+
 void nowarn1() {
   {
     simple_ptr<F> f(new F());
@@ -403,3 +423,11 @@ void g(S s) {
   (s.~S); // expected-error{{reference to destructor must be called}}
 }
 }
+
+class Invalid {
+    ~Invalid();
+    UnknownType xx; // expected-error{{unknown type name}}
+};
+
+// The constructor definition should not have errors
+Invalid::~Invalid() {}
diff --git a/test/SemaCXX/diagnostic-order.cpp b/test/SemaCXX/diagnostic-order.cpp
new file mode 100644
index 0000000000000..4ced22c92ac67
--- /dev/null
+++ b/test/SemaCXX/diagnostic-order.cpp
@@ -0,0 +1,73 @@
+// RUN: not %clang_cc1 -std=c++11 %s -fsyntax-only 2>&1 | FileCheck %s
+// RUN: %clang_cc1 -std=c++11 %s -fsyntax-only -DWARN 2>&1 | FileCheck %s --check-prefix=CHECK-WARN
+
+#ifndef WARN
+
+// Ensure that the diagnostics we produce for this situation appear in a
+// deterministic order. This requires ADL to provide lookup results in a
+// deterministic order.
+template<typename T, typename> struct Error { typedef typename T::error error; };
+struct X { template<typename T> friend typename Error<X, T>::error f(X, T); };
+struct Y { template<typename T> friend typename Error<Y, T>::error f(T, Y); };
+
+void g() {
+  f(X(), Y());
+}
+
+// We don't really care which order these two diagnostics appear (although the
+// order below is source order, which seems best). The crucial fact is that
+// there is one single order that is stable across multiple runs of clang.
+//
+// CHECK: no type named 'error' in 'X'
+// CHECK: no type named 'error' in 'Y'
+// CHECK: no matching function for call to 'f'
+
+
+struct Oper {
+  template<typename T, typename U = typename Error<Oper, T>::error> operator T();
+
+  operator int*();
+  operator float*();
+  operator X*();
+  operator Y*();
+
+  operator int(*[1])();
+  operator int(*[2])();
+  operator int(*[3])();
+  operator int(*[4])();
+  operator int(*[5])();
+  operator int(*[6])();
+  operator int(*[7])();
+  operator int(*[8])();
+  operator float(*[1])();
+  operator float(*[2])();
+  operator float(*[3])();
+  operator float(*[4])();
+  operator float(*[5])();
+  operator float(*[6])();
+  operator float(*[7])();
+  operator float(*[8])();
+};
+int *p = Oper() + 0;
+
+// CHECK: no type named 'error' in 'Oper'
+// CHECK: in instantiation of template class 'Error<Oper, int *>'
+// CHECK: no type named 'error' in 'Oper'
+// CHECK: in instantiation of template class 'Error<Oper, float *>'
+// CHECK: no type named 'error' in 'Oper'
+// CHECK: in instantiation of template class 'Error<Oper, X *>'
+// CHECK: no type named 'error' in 'Oper'
+// CHECK: in instantiation of template class 'Error<Oper, Y *>'
+
+#endif
+
+template<typename T> struct UndefButUsed {
+  static inline int f();
+  static int g() { return f(); }
+};
+int undef_but_used = UndefButUsed<int>::g() + UndefButUsed<float>::g() + UndefButUsed<char>::g() + UndefButUsed<void>::g();
+
+// CHECK-WARN: inline function 'UndefButUsed<int>::f' is not defined
+// CHECK-WARN: inline function 'UndefButUsed<float>::f' is not defined
+// CHECK-WARN: inline function 'UndefButUsed<char>::f' is not defined
+// CHECK-WARN: inline function 'UndefButUsed<void>::f' is not defined
diff --git a/test/SemaCXX/dllexport.cpp b/test/SemaCXX/dllexport.cpp
index 0bbf9b370b4de..b4850fc03d9b9 100644
--- a/test/SemaCXX/dllexport.cpp
+++ b/test/SemaCXX/dllexport.cpp
@@ -16,13 +16,19 @@ struct External { int v; };
 
 
 // Invalid usage.
-__declspec(dllexport) typedef int typedef1; // expected-warning{{'dllexport' attribute only applies to variables, functions and classes}}
-typedef __declspec(dllexport) int typedef2; // expected-warning{{'dllexport' attribute only applies to variables, functions and classes}}
-typedef int __declspec(dllexport) typedef3; // expected-warning{{'dllexport' attribute only applies to variables, functions and classes}}
-typedef __declspec(dllexport) void (*FunTy)(); // expected-warning{{'dllexport' attribute only applies to variables, functions and classes}}
-enum __declspec(dllexport) Enum {}; // expected-warning{{'dllexport' attribute only applies to variables, functions and classes}}
+__declspec(dllexport) typedef int typedef1;
+// expected-warning@-1{{'dllexport' attribute only applies to variables, functions and classes}}
+typedef __declspec(dllexport) int typedef2;
+// expected-warning@-1{{'dllexport' attribute only applies to variables, functions and classes}}
+typedef int __declspec(dllexport) typedef3;
+// expected-warning@-1{{'dllexport' attribute only applies to variables, functions and classes}}
+typedef __declspec(dllexport) void (*FunTy)();
+// expected-warning@-1{{'dllexport' attribute only applies to variables, functions and classes}}
+enum __declspec(dllexport) Enum {};
+// expected-warning@-1{{'dllexport' attribute only applies to variables, functions and classes}}
 #if __has_feature(cxx_strong_enums)
-  enum class __declspec(dllexport) EnumClass {}; // expected-warning{{'dllexport' attribute only applies to variables, functions and classes}}
+enum class __declspec(dllexport) EnumClass {};
+// expected-warning@-1{{'dllexport' attribute only applies to variables, functions and classes}}
 #endif
 
 
diff --git a/test/SemaCXX/dllimport.cpp b/test/SemaCXX/dllimport.cpp
index 5d8ce78f6cdcb..36a8ac625ac30 100644
--- a/test/SemaCXX/dllimport.cpp
+++ b/test/SemaCXX/dllimport.cpp
@@ -15,13 +15,19 @@ namespace { struct Internal {}; }
 
 
 // Invalid usage.
-__declspec(dllimport) typedef int typedef1; // expected-warning{{'dllimport' attribute only applies to variables, functions and classes}}
-typedef __declspec(dllimport) int typedef2; // expected-warning{{'dllimport' attribute only applies to variables, functions and classes}}
-typedef int __declspec(dllimport) typedef3; // expected-warning{{'dllimport' attribute only applies to variables, functions and classes}}
-typedef __declspec(dllimport) void (*FunTy)(); // expected-warning{{'dllimport' attribute only applies to variables, functions and classes}}
-enum __declspec(dllimport) Enum {}; // expected-warning{{'dllimport' attribute only applies to variables, functions and classes}}
+__declspec(dllimport) typedef int typedef1;
+// expected-warning@-1{{'dllimport' attribute only applies to variables, functions and classes}}
+typedef __declspec(dllimport) int typedef2;
+// expected-warning@-1{{'dllimport' attribute only applies to variables, functions and classes}}
+typedef int __declspec(dllimport) typedef3;
+// expected-warning@-1{{'dllimport' attribute only applies to variables, functions and classes}}
+typedef __declspec(dllimport) void (*FunTy)();
+// expected-warning@-1{{'dllimport' attribute only applies to variables, functions and classes}}
+enum __declspec(dllimport) Enum {};
+// expected-warning@-1{{'dllimport' attribute only applies to variables, functions and classes}}
 #if __has_feature(cxx_strong_enums)
-  enum class __declspec(dllimport) EnumClass {}; // expected-warning{{'dllimport' attribute only applies to variables, functions and classes}}
+enum class __declspec(dllimport) EnumClass {};
+// expected-warning@-1{{'dllimport' attribute only applies to variables, functions and classes}}
 #endif
 
 
@@ -44,17 +50,49 @@ __declspec(dllimport) int GlobalInit1 = 1; // expected-error{{definition of dlli
 int __declspec(dllimport) GlobalInit2 = 1; // expected-error{{definition of dllimport data}}
 
 // Declare, then reject definition.
-__declspec(dllimport) extern int ExternGlobalDeclInit; // expected-note{{previous declaration is here}} expected-note{{previous attribute is here}}
-int ExternGlobalDeclInit = 1; // expected-warning{{'ExternGlobalDeclInit' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#ifdef GNU
+// expected-note@+2{{previous attribute is here}}
+#endif
+__declspec(dllimport) extern int ExternGlobalDeclInit; // expected-note{{previous declaration is here}}
+#ifdef MS
+// expected-warning@+4{{'ExternGlobalDeclInit' redeclared without 'dllimport' attribute: 'dllexport' attribute added}}
+#else
+// expected-warning@+2{{'ExternGlobalDeclInit' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#endif
+int ExternGlobalDeclInit = 1;
 
-__declspec(dllimport) int GlobalDeclInit; // expected-note{{previous declaration is here}} expected-note{{previous attribute is here}}
-int GlobalDeclInit = 1; // expected-warning{{'GlobalDeclInit' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#ifdef GNU
+// expected-note@+2{{previous attribute is here}}
+#endif
+__declspec(dllimport) int GlobalDeclInit; // expected-note{{previous declaration is here}}
+#ifdef MS
+// expected-warning@+4{{'GlobalDeclInit' redeclared without 'dllimport' attribute: 'dllexport' attribute added}}
+#else
+// expected-warning@+2{{'GlobalDeclInit' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#endif
+int GlobalDeclInit = 1;
 
-int *__attribute__((dllimport)) GlobalDeclChunkAttrInit; // expected-note{{previous declaration is here}} expected-note{{previous attribute is here}}
-int *GlobalDeclChunkAttrInit = 0; // expected-warning{{'GlobalDeclChunkAttrInit' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#ifdef GNU
+// expected-note@+2{{previous attribute is here}}
+#endif
+int *__attribute__((dllimport)) GlobalDeclChunkAttrInit; // expected-note{{previous declaration is here}}
+#ifdef MS
+// expected-warning@+4{{'GlobalDeclChunkAttrInit' redeclared without 'dllimport' attribute: 'dllexport' attribute added}}
+#else
+// expected-warning@+2{{'GlobalDeclChunkAttrInit' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#endif
+int *GlobalDeclChunkAttrInit = 0;
 
-int GlobalDeclAttrInit __attribute__((dllimport)); // expected-note{{previous declaration is here}} expected-note{{previous attribute is here}}
-int GlobalDeclAttrInit = 1; // expected-warning{{'GlobalDeclAttrInit' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#ifdef GNU
+// expected-note@+2{{previous attribute is here}}
+#endif
+int GlobalDeclAttrInit __attribute__((dllimport)); // expected-note{{previous declaration is here}}
+#ifdef MS
+// expected-warning@+4{{'GlobalDeclAttrInit' redeclared without 'dllimport' attribute: 'dllexport' attribute added}}
+#else
+// expected-warning@+2{{'GlobalDeclAttrInit' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#endif
+int GlobalDeclAttrInit = 1;
 
 // Redeclarations
 __declspec(dllimport) extern int GlobalRedecl1;
@@ -69,8 +107,6 @@ int *__attribute__((dllimport)) GlobalRedecl2b;
 int GlobalRedecl2c __attribute__((dllimport));
 int GlobalRedecl2c __attribute__((dllimport));
 
-// NB: MSVC issues a warning and makes GlobalRedecl3 dllexport. We follow GCC
-// and drop the dllimport with a warning.
 __declspec(dllimport) extern int GlobalRedecl3; // expected-note{{previous declaration is here}} expected-note{{previous attribute is here}}
                       extern int GlobalRedecl3; // expected-warning{{'GlobalRedecl3' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
 
@@ -135,11 +171,31 @@ template<typename T> __declspec(dllimport) int VarTmplInit1 = 1; // expected-err
 template<typename T> int __declspec(dllimport) VarTmplInit2 = 1; // expected-error{{definition of dllimport data}}
 
 // Declare, then reject definition.
-template<typename T> __declspec(dllimport) extern int ExternVarTmplDeclInit; // expected-note{{previous declaration is here}} expected-note{{previous attribute is here}}
-template<typename T>                              int ExternVarTmplDeclInit = 1; // expected-warning{{'ExternVarTmplDeclInit' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#ifdef GNU
+// expected-note@+3{{previous attribute is here}}
+#endif
+template <typename T>
+__declspec(dllimport) extern int ExternVarTmplDeclInit; // expected-note{{previous declaration is here}}
+#ifdef MS
+// expected-warning@+5{{'ExternVarTmplDeclInit' redeclared without 'dllimport' attribute: 'dllexport' attribute added}}
+#else
+// expected-warning@+3{{'ExternVarTmplDeclInit' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#endif
+template <typename T>
+int ExternVarTmplDeclInit = 1;
 
-template<typename T> __declspec(dllimport) int VarTmplDeclInit; // expected-note{{previous declaration is here}} expected-note{{previous attribute is here}}
-template<typename T>                       int VarTmplDeclInit = 1; // expected-warning{{'VarTmplDeclInit' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#ifdef GNU
+// expected-note@+3{{previous attribute is here}}
+#endif
+template <typename T>
+__declspec(dllimport) int VarTmplDeclInit; // expected-note{{previous declaration is here}}
+#ifdef MS
+// expected-warning@+5{{'VarTmplDeclInit' redeclared without 'dllimport' attribute: 'dllexport' attribute added}}
+#else
+// expected-warning@+3{{'VarTmplDeclInit' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#endif
+template <typename T>
+int VarTmplDeclInit = 1;
 
 // Redeclarations
 template<typename T> __declspec(dllimport) extern int VarTmplRedecl1;
@@ -238,13 +294,20 @@ __declspec(dllimport) void inlineDef();
 __declspec(dllimport) void redecl1();
 __declspec(dllimport) void redecl1();
 
-// NB: MSVC issues a warning and makes redecl2/redecl3 dllexport. We follow GCC
-// and drop the dllimport with a warning.
 __declspec(dllimport) void redecl2(); // expected-note{{previous declaration is here}} expected-note{{previous attribute is here}}
                       void redecl2(); // expected-warning{{'redecl2' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
 
-__declspec(dllimport) void redecl3(); // expected-note{{previous declaration is here}} expected-note{{previous attribute is here}}
-                      void redecl3() {} // expected-warning{{'redecl3' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#ifdef GNU
+                      // expected-note@+2{{previous attribute is here}}
+#endif
+                      __declspec(dllimport) void redecl3(); // expected-note{{previous declaration is here}}
+                      // NB: Both MSVC and Clang issue a warning and make redecl3 dllexport.
+#ifdef MS
+                      // expected-warning@+4{{'redecl3' redeclared without 'dllimport' attribute: 'dllexport' attribute added}}
+#else
+                      // expected-warning@+2{{'redecl3' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#endif
+                      void redecl3() {}
 
                       void redecl4(); // expected-note{{previous declaration is here}}
 __declspec(dllimport) void redecl4(); // expected-warning{{redeclaration of 'redecl4' should not add 'dllimport' attribute}}
@@ -266,7 +329,10 @@ __declspec(dllimport) inline void redecl6() {} // expected-warning{{'dllimport'
 struct FuncFriend {
   friend __declspec(dllimport) void friend1();
   friend __declspec(dllimport) void friend2(); // expected-note{{previous declaration is here}} expected-note{{previous attribute is here}}
-  friend __declspec(dllimport) void friend3(); // expected-note{{previous declaration is here}} expected-note{{previous attribute is here}}
+#ifdef GNU
+// expected-note@+2{{previous attribute is here}}
+#endif
+  friend __declspec(dllimport) void friend3(); // expected-note{{previous declaration is here}}
   friend                       void friend4(); // expected-note{{previous declaration is here}}
 #ifdef MS
 // expected-note@+2{{previous declaration is here}}
@@ -275,7 +341,12 @@ struct FuncFriend {
 };
 __declspec(dllimport) void friend1();
                       void friend2(); // expected-warning{{'friend2' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
-                      void friend3() {} // expected-warning{{'friend3' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#ifdef MS
+                      // expected-warning@+4{{'friend3' redeclared without 'dllimport' attribute: 'dllexport' attribute added}}
+#else
+                      // expected-warning@+2{{'friend3' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#endif
+                      void friend3() {}
 __declspec(dllimport) void friend4(); // expected-warning{{redeclaration of 'friend4' should not add 'dllimport' attribute}}
 #ifdef MS
 __declspec(dllimport) inline void friend5() {} // expected-warning{{redeclaration of 'friend5' should not add 'dllimport' attribute}}
@@ -447,33 +518,39 @@ template<> __declspec(dllimport) inline void funcTmpl<ExplicitSpec_InlineDef_Imp
 struct ImportMembers {
   struct Nested {
     __declspec(dllimport) void normalDecl();
-    __declspec(dllimport) void normalDef(); // expected-note{{previous declaration is here}} expected-note{{previous attribute is here}}
+#ifdef GNU
+// expected-note@+2{{previous attribute is here}}
+#endif
+    __declspec(dllimport) void normalDef(); // expected-note{{previous declaration is here}}
   };
 
 #ifdef GNU
+// expected-note@+5{{previous attribute is here}}
 // expected-warning@+5{{'dllimport' attribute ignored on inline function}}
 // expected-warning@+6{{'dllimport' attribute ignored on inline function}}
 #endif
   __declspec(dllimport)                void normalDecl();
-  __declspec(dllimport)                void normalDef(); // expected-note{{previous declaration is here}} expected-note{{previous attribute is here}}
+  __declspec(dllimport) void normalDef(); // expected-note{{previous declaration is here}}
   __declspec(dllimport)                void normalInclass() {}
   __declspec(dllimport)                void normalInlineDef();
   __declspec(dllimport)         inline void normalInlineDecl();
 #ifdef GNU
+// expected-note@+5{{previous attribute is here}}
 // expected-warning@+5{{'dllimport' attribute ignored on inline function}}
 // expected-warning@+6{{'dllimport' attribute ignored on inline function}}
 #endif
   __declspec(dllimport) virtual        void virtualDecl();
-  __declspec(dllimport) virtual        void virtualDef(); // expected-note{{previous declaration is here}} expected-note{{previous attribute is here}}
+  __declspec(dllimport) virtual void virtualDef(); // expected-note{{previous declaration is here}}
   __declspec(dllimport) virtual        void virtualInclass() {}
   __declspec(dllimport) virtual        void virtualInlineDef();
   __declspec(dllimport) virtual inline void virtualInlineDecl();
 #ifdef GNU
+// expected-note@+5{{previous attribute is here}}
 // expected-warning@+5{{'dllimport' attribute ignored on inline function}}
 // expected-warning@+6{{'dllimport' attribute ignored on inline function}}
 #endif
   __declspec(dllimport) static         void staticDecl();
-  __declspec(dllimport) static         void staticDef(); // expected-note{{previous declaration is here}} expected-note{{previous attribute is here}}
+  __declspec(dllimport) static void staticDef(); // expected-note{{previous declaration is here}}
   __declspec(dllimport) static         void staticInclass() {}
   __declspec(dllimport) static         void staticInlineDef();
   __declspec(dllimport) static  inline void staticInlineDecl();
@@ -495,20 +572,40 @@ public:
   __declspec(dllimport) constexpr static int ConstexprFieldDef = 1; // expected-note{{attribute is here}}
 };
 
-       void ImportMembers::Nested::normalDef() {} // expected-warning{{'ImportMembers::Nested::normalDef' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
-       void ImportMembers::normalDef() {} // expected-warning{{'ImportMembers::normalDef' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#ifdef MS
+// expected-warning@+4{{'ImportMembers::Nested::normalDef' redeclared without 'dllimport' attribute: 'dllexport' attribute added}}
+#else
+                                                                                 // expected-warning@+2{{'ImportMembers::Nested::normalDef' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#endif
+void ImportMembers::Nested::normalDef() {}
+#ifdef MS
+// expected-warning@+4{{'ImportMembers::normalDef' redeclared without 'dllimport' attribute: 'dllexport' attribute added}}
+#else
+                                                                                 // expected-warning@+2{{'ImportMembers::normalDef' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#endif
+void ImportMembers::normalDef() {}
 #ifdef GNU
 // expected-warning@+2{{'ImportMembers::normalInlineDef' redeclared inline; 'dllimport' attribute ignored}}
 #endif
 inline void ImportMembers::normalInlineDef() {}
        void ImportMembers::normalInlineDecl() {}
-       void ImportMembers::virtualDef() {} // expected-warning{{'ImportMembers::virtualDef' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#ifdef MS
+       // expected-warning@+4{{'ImportMembers::virtualDef' redeclared without 'dllimport' attribute: 'dllexport' attribute added}}
+#else
+                                                                                 // expected-warning@+2{{'ImportMembers::virtualDef' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#endif
+       void ImportMembers::virtualDef() {}
 #ifdef GNU
 // expected-warning@+2{{'ImportMembers::virtualInlineDef' redeclared inline; 'dllimport' attribute ignored}}
 #endif
 inline void ImportMembers::virtualInlineDef() {}
        void ImportMembers::virtualInlineDecl() {}
-       void ImportMembers::staticDef() {} // expected-warning{{'ImportMembers::staticDef' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#ifdef MS
+       // expected-warning@+4{{'ImportMembers::staticDef' redeclared without 'dllimport' attribute: 'dllexport' attribute added}}
+#else
+                                                                                 // expected-warning@+2{{'ImportMembers::staticDef' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#endif
+       void ImportMembers::staticDef() {}
 #ifdef GNU
 // expected-warning@+2{{'ImportMembers::staticInlineDef' redeclared inline; 'dllimport' attribute ignored}}
 #endif
@@ -620,7 +717,10 @@ struct ImportDefaulted {
 // Import defaulted member function definitions.
 struct ImportDefaultedDefs {
   __declspec(dllimport) ImportDefaultedDefs();
-  __declspec(dllimport) ~ImportDefaultedDefs(); // expected-note{{previous declaration is here}} expected-note{{previous attribute is here}}
+#ifdef GNU
+// expected-note@+2{{previous attribute is here}}
+#endif
+  __declspec(dllimport) ~ImportDefaultedDefs(); // expected-note{{previous declaration is here}}
 
 #ifdef GNU
 // expected-warning@+3{{'dllimport' attribute ignored on inline function}}
@@ -630,14 +730,22 @@ struct ImportDefaultedDefs {
   __declspec(dllimport) ImportDefaultedDefs& operator=(const ImportDefaultedDefs&);
 
   __declspec(dllimport) ImportDefaultedDefs(ImportDefaultedDefs&&);
-  __declspec(dllimport) ImportDefaultedDefs& operator=(ImportDefaultedDefs&&); // expected-note{{previous declaration is here}} expected-note{{previous attribute is here}}
+#ifdef GNU
+// expected-note@+2{{previous attribute is here}}
+#endif
+  __declspec(dllimport) ImportDefaultedDefs &operator=(ImportDefaultedDefs &&); // expected-note{{previous declaration is here}}
 };
 
 // Not allowed on definitions.
 __declspec(dllimport) ImportDefaultedDefs::ImportDefaultedDefs() = default; // expected-error{{dllimport cannot be applied to non-inline function definition}}
 
+#ifdef MS
+// expected-warning@+5{{'ImportDefaultedDefs::~ImportDefaultedDefs' redeclared without 'dllimport' attribute: 'dllexport' attribute added}}
+#else
+// expected-warning@+3{{'ImportDefaultedDefs::~ImportDefaultedDefs' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#endif
 // dllimport cannot be dropped.
-ImportDefaultedDefs::~ImportDefaultedDefs() = default; // expected-warning{{'ImportDefaultedDefs::~ImportDefaultedDefs' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+ImportDefaultedDefs::~ImportDefaultedDefs() = default;
 
 // Import inline declaration and definition.
 #ifdef GNU
@@ -648,8 +756,12 @@ __declspec(dllimport) ImportDefaultedDefs::ImportDefaultedDefs(const ImportDefau
 inline ImportDefaultedDefs& ImportDefaultedDefs::operator=(const ImportDefaultedDefs&) = default;
 
 __declspec(dllimport) ImportDefaultedDefs::ImportDefaultedDefs(ImportDefaultedDefs&&) = default; // expected-error{{dllimport cannot be applied to non-inline function definition}}
-ImportDefaultedDefs& ImportDefaultedDefs::operator=(ImportDefaultedDefs&&) = default; // expected-warning{{'ImportDefaultedDefs::operator=' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
-
+#ifdef MS
+// expected-warning@+4{{'ImportDefaultedDefs::operator=' redeclared without 'dllimport' attribute: 'dllexport' attribute added}}
+#else
+// expected-warning@+2{{'ImportDefaultedDefs::operator=' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#endif
+ImportDefaultedDefs &ImportDefaultedDefs::operator=(ImportDefaultedDefs &&) = default;
 
 // Redeclarations cannot add dllimport.
 struct MemberRedecl {
@@ -970,13 +1082,22 @@ template<> __declspec(dllimport) const int MemVarTmpl::StaticVar<ExplicitSpec_De
 template<typename T>
 struct ImportClassTmplMembers {
   __declspec(dllimport)                void normalDecl();
-  __declspec(dllimport)                void normalDef(); // expected-note{{previous declaration is here}} expected-note{{previous attribute is here}}
+#ifdef GNU
+// expected-note@+2{{previous attribute is here}}
+#endif
+  __declspec(dllimport) void normalDef(); // expected-note{{previous declaration is here}}
   __declspec(dllimport)                void normalInlineDef();
   __declspec(dllimport) virtual        void virtualDecl();
-  __declspec(dllimport) virtual        void virtualDef(); // expected-note{{previous declaration is here}} expected-note{{previous attribute is here}}
+#ifdef GNU
+// expected-note@+2{{previous attribute is here}}
+#endif
+  __declspec(dllimport) virtual void virtualDef(); // expected-note{{previous declaration is here}}
   __declspec(dllimport) virtual        void virtualInlineDef();
   __declspec(dllimport) static         void staticDecl();
-  __declspec(dllimport) static         void staticDef(); // expected-note{{previous declaration is here}} expected-note{{previous attribute is here}}
+#ifdef GNU
+// expected-note@+2{{previous attribute is here}}
+#endif
+  __declspec(dllimport) static void staticDef(); // expected-note{{previous declaration is here}}
   __declspec(dllimport) static         void staticInlineDef();
 
 #ifdef GNU
@@ -1013,19 +1134,37 @@ public:
 
 // NB: MSVC is inconsistent here and disallows *InlineDef on class templates,
 // but allows it on classes. We allow both.
-template<typename T>        void ImportClassTmplMembers<T>::normalDef() {} // expected-warning{{'ImportClassTmplMembers::normalDef' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#ifdef MS
+// expected-warning@+5{{'ImportClassTmplMembers::normalDef' redeclared without 'dllimport' attribute: 'dllexport' attribute added}}
+#else
+// expected-warning@+3{{'ImportClassTmplMembers::normalDef' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#endif
+template <typename T>
+void ImportClassTmplMembers<T>::normalDef() {}
 #ifdef GNU
 // expected-warning@+2{{'ImportClassTmplMembers::normalInlineDef' redeclared inline; 'dllimport' attribute ignored}}
 #endif
 template<typename T> inline void ImportClassTmplMembers<T>::normalInlineDef() {}
 template<typename T>        void ImportClassTmplMembers<T>::normalInlineDecl() {}
-template<typename T>        void ImportClassTmplMembers<T>::virtualDef() {} // expected-warning{{'ImportClassTmplMembers::virtualDef' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#ifdef MS
+// expected-warning@+5{{'ImportClassTmplMembers::virtualDef' redeclared without 'dllimport' attribute: 'dllexport' attribute added}}
+#else
+// expected-warning@+3{{'ImportClassTmplMembers::virtualDef' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#endif
+template <typename T>
+void ImportClassTmplMembers<T>::virtualDef() {}
 #ifdef GNU
 // expected-warning@+2{{'ImportClassTmplMembers::virtualInlineDef' redeclared inline; 'dllimport' attribute ignored}}
 #endif
 template<typename T> inline void ImportClassTmplMembers<T>::virtualInlineDef() {}
 template<typename T>        void ImportClassTmplMembers<T>::virtualInlineDecl() {}
-template<typename T>        void ImportClassTmplMembers<T>::staticDef() {} // expected-warning{{'ImportClassTmplMembers::staticDef' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#ifdef MS
+// expected-warning@+5{{'ImportClassTmplMembers::staticDef' redeclared without 'dllimport' attribute: 'dllexport' attribute added}}
+#else
+// expected-warning@+3{{'ImportClassTmplMembers::staticDef' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+#endif
+template <typename T>
+void ImportClassTmplMembers<T>::staticDef() {}
 #ifdef GNU
 // expected-warning@+2{{'ImportClassTmplMembers::staticInlineDef' redeclared inline; 'dllimport' attribute ignored}}
 #endif
diff --git a/test/SemaCXX/enable_if.cpp b/test/SemaCXX/enable_if.cpp
index cd8241808c946..7ec07aa8b0ca3 100644
--- a/test/SemaCXX/enable_if.cpp
+++ b/test/SemaCXX/enable_if.cpp
@@ -116,9 +116,9 @@ template <typename T> class C {
   void g() { f(); }
 };
 
-int fn3(bool b) __attribute__((enable_if(b, "")));
+int fn3(bool b) __attribute__((enable_if(b, ""))); // FIXME: This test should net 0 error messages.
 template <class T> void test3() {
-  fn3(sizeof(T) == 1);
+  fn3(sizeof(T) == 1); // expected-error{{no matching function for call to 'fn3'}} expected-note@-2{{candidate disabled}}
 }
 
 template <typename T>
@@ -138,7 +138,7 @@ void test4() {
 void h(int);
 template <typename T> void outer() {
   void local_function() __attribute__((enable_if(::h(T()), "")));
-  local_function();
+  local_function(); // expected-error{{no matching function for call to 'local_function'}} expected-note@-1{{candidate disabled}}
 };
 
 namespace PR20988 {
@@ -158,9 +158,9 @@ namespace PR20988 {
     fn2(expr);  // expected-error{{no matching function for call to 'fn2'}}
   }
 
-  int fn3(bool b) __attribute__((enable_if(b, "")));
+  int fn3(bool b) __attribute__((enable_if(b, ""))); // FIXME: This test should net 0 error messages.
   template <class T> void test3() {
-    fn3(sizeof(T) == 1);
+    fn3(sizeof(T) == 1); // expected-error{{no matching function for call to 'fn3'}} expected-note@-2{{candidate disabled}}
   }
 }
 
@@ -253,3 +253,167 @@ namespace FnPtrs {
     a = &noOvlNoCandidate; // expected-error{{cannot take address of function 'noOvlNoCandidate' becuase it has one or more non-tautological enable_if conditions}}
   }
 }
+
+namespace casting {
+using VoidFnTy = void (*)();
+
+void foo(void *c) __attribute__((enable_if(0, "")));
+void foo(int *c) __attribute__((enable_if(c, "")));
+void foo(char *c) __attribute__((enable_if(1, "")));
+
+void testIt() {
+  auto A = reinterpret_cast<VoidFnTy>(foo);
+  auto AAmp = reinterpret_cast<VoidFnTy>(&foo);
+
+  using VoidFooTy = void (*)(void *);
+  auto B = reinterpret_cast<VoidFooTy>(foo);
+  auto BAmp = reinterpret_cast<VoidFooTy>(&foo);
+
+  using IntFooTy = void (*)(int *);
+  auto C = reinterpret_cast<IntFooTy>(foo);
+  auto CAmp = reinterpret_cast<IntFooTy>(&foo);
+
+  using CharFooTy = void (*)(void *);
+  auto D = reinterpret_cast<CharFooTy>(foo);
+  auto DAmp = reinterpret_cast<CharFooTy>(&foo);
+}
+
+void testItCStyle() {
+  auto A = (VoidFnTy)foo;
+  auto AAmp = (VoidFnTy)&foo;
+
+  using VoidFooTy = void (*)(void *);
+  auto B = (VoidFooTy)foo;
+  auto BAmp = (VoidFooTy)&foo;
+
+  using IntFooTy = void (*)(int *);
+  auto C = (IntFooTy)foo;
+  auto CAmp = (IntFooTy)&foo;
+
+  using CharFooTy = void (*)(void *);
+  auto D = (CharFooTy)foo;
+  auto DAmp = (CharFooTy)&foo;
+}
+}
+
+namespace casting_templates {
+template <typename T> void foo(T) {} // expected-note 4 {{candidate function}}
+
+void foo(int *c) __attribute__((enable_if(c, ""))); //expected-note 4 {{candidate function}}
+void foo(char *c) __attribute__((enable_if(c, ""))); //expected-note 4 {{candidate function}}
+
+void testIt() {
+  using IntFooTy = void (*)(int *);
+  auto A = reinterpret_cast<IntFooTy>(foo); // expected-error{{reinterpret_cast cannot resolve overloaded function 'foo' to type}}
+  auto ARef = reinterpret_cast<IntFooTy>(&foo); // expected-error{{reinterpret_cast cannot resolve overloaded function 'foo' to type}}
+  auto AExplicit = reinterpret_cast<IntFooTy>(foo<int*>);
+
+  using CharFooTy = void (*)(char *);
+  auto B = reinterpret_cast<CharFooTy>(foo); // expected-error{{reinterpret_cast cannot resolve overloaded function 'foo' to type}}
+  auto BRef = reinterpret_cast<CharFooTy>(&foo); // expected-error{{reinterpret_cast cannot resolve overloaded function 'foo' to type}}
+  auto BExplicit = reinterpret_cast<CharFooTy>(foo<char*>);
+}
+
+void testItCStyle() {
+  // constexpr is usable here because all of these should become static_casts.
+  using IntFooTy = void (*)(int *);
+  constexpr auto A = (IntFooTy)foo;
+  constexpr auto ARef = (IntFooTy)&foo;
+  constexpr auto AExplicit = (IntFooTy)foo<int*>;
+
+  using CharFooTy = void (*)(char *);
+  constexpr auto B = (CharFooTy)foo;
+  constexpr auto BRef = (CharFooTy)&foo;
+  constexpr auto BExplicit = (CharFooTy)foo<char*>;
+
+  static_assert(A == ARef && ARef == AExplicit, "");
+  static_assert(B == BRef && BRef == BExplicit, "");
+}
+}
+
+namespace multiple_matches {
+using NoMatchTy = void (*)();
+
+void foo(float *c); //expected-note 4 {{candidate function}}
+void foo(int *c) __attribute__((enable_if(1, ""))); //expected-note 4 {{candidate function}}
+void foo(char *c) __attribute__((enable_if(1, ""))); //expected-note 4 {{candidate function}}
+
+void testIt() {
+  auto A = reinterpret_cast<NoMatchTy>(foo); // expected-error{{reinterpret_cast cannot resolve overloaded function 'foo' to type}}
+  auto ARef = reinterpret_cast<NoMatchTy>(&foo); // expected-error{{reinterpret_cast cannot resolve overloaded function 'foo' to type}}
+
+  auto C = (NoMatchTy)foo; // expected-error{{address of overloaded function 'foo' does not match required type 'void ()'}}
+  auto CRef = (NoMatchTy)&foo; // expected-error{{address of overloaded function 'foo' does not match required type 'void ()'}}
+}
+}
+
+namespace PR27122 {
+// (slightly reduced) code that motivated the bug...
+namespace ns {
+void Function(int num)
+  __attribute__((enable_if(num != 0, "")));
+void Function(int num, int a0)
+  __attribute__((enable_if(num != 1, "")));
+}  // namespace ns
+
+using ns::Function; // expected-note 3{{declared here}}
+void Run() {
+  Functioon(0); // expected-error{{use of undeclared identifier}} expected-error{{too few arguments}}
+  Functioon(0, 1); // expected-error{{use of undeclared identifier}}
+  Functioon(0, 1, 2); // expected-error{{use of undeclared identifier}}
+}
+
+// Extra tests
+void regularEnableIf(int a) __attribute__((enable_if(a, ""))); // expected-note 3{{declared here}} expected-note 3{{candidate function not viable}}
+void runRegularEnableIf() {
+  regularEnableIf(0, 2); // expected-error{{no matching function}}
+  regularEnableIf(1, 2); // expected-error{{no matching function}}
+  regularEnableIf(); // expected-error{{no matching function}}
+
+  // Test without getting overload resolution involved
+  ::PR27122::regularEnableIf(0, 2); // expected-error{{too many arguments}}
+  ::PR27122::regularEnableIf(1, 2); // expected-error{{too many arguments}}
+  ::PR27122::regularEnableIf(); // expected-error{{too few arguments}}
+}
+
+struct Foo {
+  void bar(int i) __attribute__((enable_if(i, ""))); // expected-note 2{{declared here}}
+};
+
+void runFoo() {
+  Foo f;
+  f.bar(); // expected-error{{too few arguments}}
+  f.bar(1, 2); // expected-error{{too many arguments}}
+}
+}
+
+// Ideally, we should be able to handle value-dependent expressions sanely.
+// Sadly, that isn't the case at the moment.
+namespace dependent {
+int error(int N) __attribute__((enable_if(N, ""))); // expected-note{{candidate disabled}}
+int error(int N) __attribute__((enable_if(!N, ""))); // expected-note{{candidate disabled}}
+template <int N> int callUnavailable() {
+  return error(N); // expected-error{{no matching function for call to 'error'}}
+}
+
+constexpr int noError(int N) __attribute__((enable_if(N, ""))) { return -1; }
+constexpr int noError(int N) __attribute__((enable_if(!N, ""))) { return -1; }
+constexpr int noError(int N) { return 0; }
+
+template <int N>
+constexpr int callNoError() { return noError(N); }
+static_assert(callNoError<0>() == 0, "");
+static_assert(callNoError<1>() == 0, "");
+
+template <int N> constexpr int templated() __attribute__((enable_if(N, ""))) {
+  return 1;
+}
+
+constexpr int A = templated<0>(); // expected-error{{no matching function for call to 'templated'}} expected-note@-4{{candidate disabled}}
+static_assert(templated<1>() == 1, "");
+
+template <int N> constexpr int callTemplated() { return templated<N>(); }
+
+constexpr int B = callTemplated<0>(); // expected-error{{initialized by a constant expression}} expected-error@-2{{no matching function for call to 'templated'}} expected-note{{in instantiation of function template}} expected-note@-9{{candidate disabled}}
+static_assert(callTemplated<1>() == 1, "");
+}
diff --git a/test/SemaCXX/enum-scoped.cpp b/test/SemaCXX/enum-scoped.cpp
index 909802335e46a..142edd3893aae 100644
--- a/test/SemaCXX/enum-scoped.cpp
+++ b/test/SemaCXX/enum-scoped.cpp
@@ -298,8 +298,8 @@ namespace PR18044 {
   int E::*p; // expected-error {{does not point into a class}}
   using E::f; // expected-error {{no member named 'f'}}
 
-  using E::a; // ok!
-  E b = a;
+  using E::a; // expected-error {{using declaration cannot refer to a scoped enumerator}}
+  E b = a; // expected-error {{undeclared}}
 }
 
 namespace test11 {
diff --git a/test/SemaCXX/eval-sizeof-dependent-type.cpp b/test/SemaCXX/eval-sizeof-dependent-type.cpp
new file mode 100644
index 0000000000000..1a5564a477d80
--- /dev/null
+++ b/test/SemaCXX/eval-sizeof-dependent-type.cpp
@@ -0,0 +1,8 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fsyntax-only -std=c++11 -x c++ %s
+
+typedef __SIZE_TYPE__ size_t;
+template <typename _Tp, size_t _Nm> struct array { _Tp _M_elems[_Nm]; };
+template <typename T> struct s {
+  array<int, 1> v{static_cast<int>(sizeof (T) / sizeof(T))};
+};
+
diff --git a/test/SemaCXX/exceptions.cpp b/test/SemaCXX/exceptions.cpp
index 9802a1a1d6209..9e76783ca8a9e 100644
--- a/test/SemaCXX/exceptions.cpp
+++ b/test/SemaCXX/exceptions.cpp
@@ -1,4 +1,6 @@
 // RUN: %clang_cc1 -fcxx-exceptions -fexceptions -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fcxx-exceptions -fexceptions -fsyntax-only -verify -std=c++98 %s
+// RUN: %clang_cc1 -fcxx-exceptions -fexceptions -fsyntax-only -verify -std=c++11 %s
 
 struct A; // expected-note 4 {{forward declaration of 'A'}}
 
@@ -135,16 +137,29 @@ namespace Decay {
   void f() throw (int*, int());
 
   template<typename T> struct C {
-    void f() throw (T); // expected-error {{pointer to incomplete type 'Decay::E' is not allowed in exception specification}}
+    void f() throw (T);
+#if __cplusplus <= 199711L
+    // expected-error@-2 {{pointer to incomplete type 'Decay::E' is not allowed in exception specification}}
+#endif
   };
   struct D {
     C<D[10]> c;
   };
-  struct E; // expected-note {{forward declaration}}
-  C<E[10]> e; // expected-note {{in instantiation of}}
+  struct E;
+#if __cplusplus <= 199711L
+  // expected-note@-2 {{forward declaration of 'Decay::E'}}
+#endif
+
+  C<E[10]> e;
+#if __cplusplus <= 199711L
+  // expected-note@-2 {{in instantiation of template class 'Decay::C<Decay::E [10]>' requested here}}
+#endif
 }
 
-void rval_ref() throw (int &&); // expected-error {{rvalue reference type 'int &&' is not allowed in exception specification}} expected-warning {{C++11}}
+void rval_ref() throw (int &&); // expected-error {{rvalue reference type 'int &&' is not allowed in exception specification}}
+#if __cplusplus <= 199711L
+// expected-warning@-2 {{rvalue references are a C++11 extension}}
+#endif
 
 namespace HandlerInversion {
 struct B {};
@@ -253,3 +268,17 @@ void g() {
   }
 }
 }
+
+namespace PR28047 {
+void test1(int i) {
+  try {
+  } catch (int(*)[i]) { // expected-error{{cannot catch variably modified type}}
+  }
+}
+void test2() {
+  int i;
+  try {
+  } catch (int(*)[i]) { // expected-error{{cannot catch variably modified type}}
+  }
+}
+}
diff --git a/test/SemaCXX/extern-c.cpp b/test/SemaCXX/extern-c.cpp
index 295d1f305ee28..fa6c2b1990c07 100644
--- a/test/SemaCXX/extern-c.cpp
+++ b/test/SemaCXX/extern-c.cpp
@@ -204,3 +204,41 @@ extern "C" {
   struct pr5065_n6 : public virtual pr5065_3 {};
 }
 struct pr5065_n7 {};
+
+namespace tag_hiding {
+  namespace namespace_with_injected_name {
+    class Boo {
+      friend struct ExternCStruct1;
+    };
+    void ExternCStruct4(); // expected-note 2{{candidate}}
+  }
+
+  class Baz {
+    friend struct ExternCStruct2;
+    friend void ExternCStruct3();
+  };
+
+  using namespace namespace_with_injected_name;
+
+  extern "C" {
+    struct ExternCStruct1;
+    struct ExternCStruct2;
+    struct ExternCStruct3;
+    struct ExternCStruct4; // expected-note {{candidate}}
+  }
+  ExternCStruct1 *p1;
+  ExternCStruct2 *p2;
+  ExternCStruct3 *p3;
+  ExternCStruct4 *p4; // expected-error {{ambiguous}}
+
+  extern "C" {
+    struct ExternCStruct1;
+    struct ExternCStruct2;
+    struct ExternCStruct3;
+    struct ExternCStruct4; // expected-note {{candidate}}
+  }
+  ExternCStruct1 *q1 = p1;
+  ExternCStruct2 *q2 = p2;
+  ExternCStruct3 *q3 = p3;
+  ExternCStruct4 *q4 = p4; // expected-error {{ambiguous}}
+}
diff --git a/test/SemaCXX/for-range-examples.cpp b/test/SemaCXX/for-range-examples.cpp
index 9359ae63a60b0..08a9982c6378d 100644
--- a/test/SemaCXX/for-range-examples.cpp
+++ b/test/SemaCXX/for-range-examples.cpp
@@ -176,9 +176,9 @@ namespace test4 {
 
     // Make sure these don't crash. Better diagnostics would be nice.
     for (: {1, 2, 3}) {} // expected-error {{expected expression}} expected-error {{expected ';'}}
-    for (1 : {1, 2, 3}) {} // expected-error {{must declare a variable}} expected-warning {{result unused}}
+    for (1 : {1, 2, 3}) {} // expected-error {{must declare a variable}}
     for (+x : {1, 2, 3}) {} // expected-error {{undeclared identifier}} expected-error {{expected ';'}}
-    for (+y : {1, 2, 3}) {} // expected-error {{must declare a variable}} expected-warning {{result unused}}
+    for (+y : {1, 2, 3}) {} // expected-error {{must declare a variable}}
   }
 }
 
@@ -226,7 +226,7 @@ namespace test7 {
     // we check the alignment attribute before we perform the auto
     // deduction.
     for (d alignas(1) : arr) {} // expected-error {{requires type for loop variable}}
-    for (e [[deprecated]] : arr) { e = 0; } // expected-warning{{use of the 'deprecated' attribute is a C++14 extension}} expected-warning {{deprecated}} expected-note {{here}} expected-error {{requires type for loop variable}}
+    for (e [[deprecated]] : arr) { e = 0; } // expected-warning {{deprecated}} expected-note {{here}} expected-error {{requires type for loop variable}}
   }
 }
 
diff --git a/test/SemaCXX/format-strings-0x.cpp b/test/SemaCXX/format-strings-0x.cpp
index ad57b773e0a06..7d37f8276f29f 100644
--- a/test/SemaCXX/format-strings-0x.cpp
+++ b/test/SemaCXX/format-strings-0x.cpp
@@ -15,6 +15,7 @@ void f(char **sp, float *fp) {
   scanf("%afoobar", fp);
   printf(nullptr);
   printf(*sp); // expected-warning {{not a string literal}}
+  // expected-note@-1{{treat the string as an argument to avoid this}}
 
   // PR13099
   printf(
diff --git a/test/SemaCXX/format-strings.cpp b/test/SemaCXX/format-strings.cpp
index fa7251d0dd764..b7ef1d709f21f 100644
--- a/test/SemaCXX/format-strings.cpp
+++ b/test/SemaCXX/format-strings.cpp
@@ -54,6 +54,7 @@ void rdar8269537(const char *f)
   test_null_format(0); // no-warning
   test_null_format(__null); // no-warning
   test_null_format(f); // expected-warning {{not a string literal}}
+  // expected-note@-1{{treat the string as an argument to avoid this}}
 }
 
 int Foo::printf(const char *fmt, ...) {
diff --git a/test/SemaCXX/friend.cpp b/test/SemaCXX/friend.cpp
index a8e20439419fd..4f27f4df6c907 100644
--- a/test/SemaCXX/friend.cpp
+++ b/test/SemaCXX/friend.cpp
@@ -147,11 +147,13 @@ namespace test8 {
     }
     using ns2::f; // expected-note {{using declaration}}
   }
-  struct A { void f(); }; // expected-note {{target of using declaration}}
+  struct A { void f(); }; // expected-note 2{{target of using declaration}}
   struct B : public A { using A::f; }; // expected-note {{using declaration}}
+  template<typename T> struct C : A { using A::f; }; // expected-note {{using declaration}}
   struct X {
     template<class T> friend void ns1::f(T t); // expected-error {{cannot befriend target of using declaration}}
     friend void B::f(); // expected-error {{cannot befriend target of using declaration}}
+    friend void C<int>::f(); // expected-error {{cannot befriend target of using declaration}}
   };
 }
 
@@ -363,3 +365,17 @@ void g_pr6954() {
   f_pr6954(5); // expected-error{{undeclared identifier 'f_pr6954'}}
 }
 
+namespace tag_redecl {
+  namespace N {
+    struct X *p;
+    namespace {
+      class K {
+        friend struct X;
+      };
+    }
+  }
+  namespace N {
+    struct X;
+    X *q = p;
+  }
+}
diff --git a/test/SemaCXX/function-redecl.cpp b/test/SemaCXX/function-redecl.cpp
index 2bc0d90cd6277..f91e670c0c31c 100644
--- a/test/SemaCXX/function-redecl.cpp
+++ b/test/SemaCXX/function-redecl.cpp
@@ -7,7 +7,7 @@ namespace N {
     void bar(int); // expected-note 2{{previous declaration is here}}
   }
 
-  void foo(int); // expected-note 2{{previous declaration is here}}
+  void foo(int); // expected-note 3{{previous declaration is here}}
 
   void f2() {
     int foo(int); // expected-error {{functions that differ only in their return type cannot be overloaded}}
@@ -25,6 +25,13 @@ namespace N {
       }
     }
   }
+
+  void f3() {
+    int foo(float);
+    {
+      float foo(int); // expected-error {{functions that differ only in their return type cannot be overloaded}}
+    }
+  }
 }
 
 class A {
diff --git a/test/SemaCXX/functional-cast.cpp b/test/SemaCXX/functional-cast.cpp
index 9db95e80d03ed..216ee240c8388 100644
--- a/test/SemaCXX/functional-cast.cpp
+++ b/test/SemaCXX/functional-cast.cpp
@@ -126,14 +126,14 @@ void t_529_2()
   typedef A *Ap;
   (void)Ap((B*)0);
   typedef A &Ar;
-  (void)Ar(*((B*)0));
+  (void)Ar(*((B*)0)); // expected-warning {{binding dereferenced null pointer to reference has undefined behavior}}
   typedef const B *cBp;
   (void)cBp((C1*)0);
   typedef B &Br;
-  (void)Br(*((C1*)0));
+  (void)Br(*((C1*)0)); // expected-warning {{binding dereferenced null pointer to reference has undefined behavior}}
   (void)Ap((D*)0);
   typedef const A &cAr;
-  (void)cAr(*((D*)0));
+  (void)cAr(*((D*)0)); // expected-warning {{binding dereferenced null pointer to reference has undefined behavior}}
   typedef int B::*Bmp;
   (void)Bmp((int A::*)0);
   typedef void (B::*Bmfp)();
diff --git a/test/SemaCXX/generalized-deprecated.cpp b/test/SemaCXX/generalized-deprecated.cpp
index 8fa20d0a0f098..43efea1ea4e6c 100644
--- a/test/SemaCXX/generalized-deprecated.cpp
+++ b/test/SemaCXX/generalized-deprecated.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -std=c++11 -verify -fsyntax-only -fms-extensions -Wno-deprecated %s
+// RUN: %clang_cc1 -std=c++11 -verify -fsyntax-only -fms-extensions -Wno-deprecated -Wc++14-extensions %s
 
 // NOTE: use -Wno-deprecated to avoid cluttering the output with deprecated
 // warnings
diff --git a/test/SemaCXX/illegal-member-initialization.cpp b/test/SemaCXX/illegal-member-initialization.cpp
index 87069efaacc74..17faed7eff7e7 100644
--- a/test/SemaCXX/illegal-member-initialization.cpp
+++ b/test/SemaCXX/illegal-member-initialization.cpp
@@ -7,6 +7,7 @@ struct A {
 };
 
 struct B {
+  int field;
 };
 
 struct X {
diff --git a/test/SemaCXX/inline.cpp b/test/SemaCXX/inline.cpp
index e569300faf771..b20bc18d0a3fe 100644
--- a/test/SemaCXX/inline.cpp
+++ b/test/SemaCXX/inline.cpp
@@ -1,5 +1,18 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++14 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++1z %s -Wc++98-c++11-c++14-compat
 
 // Check that we don't allow illegal uses of inline
 // (checking C++-only constructs here)
 struct c {inline int a;}; // expected-error{{'inline' can only appear on functions}}
+
+void localVar() {
+  inline int a; // expected-error{{inline declaration of 'a' not allowed in block scope}}
+}
+
+// Check that we warn appropriately.
+#if __cplusplus <= 201402L
+inline int a; // expected-warning{{inline variables are a C++1z extension}}
+#else
+inline int a; // expected-warning{{inline variables are incompatible with C++ standards before C++1z}}
+#endif
diff --git a/test/SemaCXX/integer-overflow.cpp b/test/SemaCXX/integer-overflow.cpp
index 566bb05fa0cb1..a119f0eabe3a4 100644
--- a/test/SemaCXX/integer-overflow.cpp
+++ b/test/SemaCXX/integer-overflow.cpp
@@ -1,6 +1,11 @@
-// RUN: %clang_cc1 %s -verify -fsyntax-only -std=gnu++98
+// RUN: %clang_cc1 %s -verify -fsyntax-only -std=gnu++98 -triple x86_64-pc-linux-gnu
 typedef unsigned long long uint64_t;
-typedef unsigned long long uint32_t;
+typedef unsigned int uint32_t;
+
+// Check integer sizes.
+int array64[sizeof(uint64_t) == 8 ? 1 : -1];
+int array32[sizeof(uint32_t) == 4 ? 1 : -1];
+int arrayint[sizeof(int) < sizeof(uint64_t) ? 1 : -1];
 
 uint64_t f0(uint64_t);
 uint64_t f1(uint64_t, uint32_t);
diff --git a/test/SemaCXX/lambda-expressions.cpp b/test/SemaCXX/lambda-expressions.cpp
index 72adcdbce2fef..17808cef363a0 100644
--- a/test/SemaCXX/lambda-expressions.cpp
+++ b/test/SemaCXX/lambda-expressions.cpp
@@ -476,3 +476,26 @@ int main() {
 
 A<int> a;
 }
+
+// rdar://22032373
+namespace rdar22032373 {
+void foo() {
+  auto blk = [](bool b) {
+    if (b)
+      return undeclared_error; // expected-error {{use of undeclared identifier}}
+    return 0;
+  };
+}
+}
+
+namespace nested_lambda {
+template <int N>
+class S {};
+
+void foo() {
+  const int num = 18;  // expected-note {{'num' declared here}}
+  auto outer = []() {
+    auto inner = [](S<num> &X) {};  // expected-error {{variable 'num' cannot be implicitly captured in a lambda with no capture-default specified}}
+  };
+}
+}
diff --git a/test/SemaCXX/literal-operators.cpp b/test/SemaCXX/literal-operators.cpp
index ba571788b962e..304aa7cab7f30 100644
--- a/test/SemaCXX/literal-operators.cpp
+++ b/test/SemaCXX/literal-operators.cpp
@@ -35,13 +35,14 @@ typedef const char c;
 void operator "" _good (c*);
 
 // Check extra cv-qualifiers
-void operator "" _cv_good (volatile const char *, const size_t); // expected-error {{parameter declaration for literal operator 'operator""_cv_good' is not valid}}
+void operator "" _cv_good (volatile const char *, const size_t); // expected-error {{invalid literal operator parameter type 'const volatile char *', did you mean 'const char *'?}}
 
 // Template declaration
 template <char...> void operator "" _good ();
 
-// FIXME: Test some invalid decls that might crop up.
-template <typename...> void operator "" _invalid(); // expected-error {{parameter declaration for literal operator 'operator""_invalid' is not valid}}
+template <typename...> void operator "" _invalid(); // expected-error {{template parameter list for literal operator must be either 'char...' or 'typename T, T...'}}
+template <wchar_t...> void operator "" _invalid();  // expected-error {{template parameter list for literal operator must be either 'char...' or 'typename T, T...'}}
+template <unsigned long long...> void operator "" _invalid();  // expected-error {{template parameter list for literal operator must be either 'char...' or 'typename T, T...'}}
 
 _Complex float operator""if(long double); // expected-warning {{reserved}}
 _Complex float test_if_1() { return 2.0f + 1.5if; };
diff --git a/test/SemaCXX/make_integer_seq.cpp b/test/SemaCXX/make_integer_seq.cpp
index 4e15414cbe67c..a9b8d2b23cb50 100644
--- a/test/SemaCXX/make_integer_seq.cpp
+++ b/test/SemaCXX/make_integer_seq.cpp
@@ -47,3 +47,7 @@ using illformed2 = ErrorSeq<int, -5>;
 
 template <typename T, T N> void f() {}
 __make_integer_seq<f, int, 0> x; // expected-error{{template template parameter must be a class template or type alias template}}
+
+__make_integer_seq<__make_integer_seq, int, 10> PR28494; // expected-error{{different template parameters}}
+// expected-note@make_integer_seq.cpp:* {{template parameter has a different kind}}
+// expected-note@make_integer_seq.cpp:* {{previous template template parameter is here}}
diff --git a/test/SemaCXX/member-init.cpp b/test/SemaCXX/member-init.cpp
index b3ee30b456ec1..65c8873117ab5 100644
--- a/test/SemaCXX/member-init.cpp
+++ b/test/SemaCXX/member-init.cpp
@@ -192,3 +192,13 @@ struct S {
   int x[3] = {[N] = 3};
 };
 }
+
+namespace PR28060 {
+template <class T>
+void foo(T v) {
+  struct s {
+    T *s = 0;
+  };
+}
+template void foo(int);
+}
diff --git a/test/SemaCXX/ms-const-member-expr.cpp b/test/SemaCXX/ms-const-member-expr.cpp
new file mode 100644
index 0000000000000..72cfe76fbe43a
--- /dev/null
+++ b/test/SemaCXX/ms-const-member-expr.cpp
@@ -0,0 +1,19 @@
+// RUN: %clang_cc1 %s -std=c++11 -fms-compatibility -fsyntax-only -verify
+
+struct S {
+  enum { E = 1 };
+  static const int sdm = 1;
+};
+
+void f(S *s) {
+  char array[s->E] = { 0 };
+}
+
+extern S *s;
+constexpr int e1 = s->E;
+
+S *side_effect();  // expected-note{{declared here}}
+constexpr int e2 = // expected-error{{must be initialized by a constant expression}}
+    side_effect()->E; // expected-note{{cannot be used in a constant expression}}
+
+constexpr int e4 = s->sdm;
diff --git a/test/SemaCXX/ms-empty_bases.cpp b/test/SemaCXX/ms-empty_bases.cpp
new file mode 100644
index 0000000000000..69d9e2799b8ff
--- /dev/null
+++ b/test/SemaCXX/ms-empty_bases.cpp
@@ -0,0 +1,9 @@
+// RUN: %clang_cc1 -triple i386-pc-win32 %s -fsyntax-only -verify -fms-extensions -Wno-microsoft -std=c++11
+
+struct __declspec(empty_bases) S {};
+enum __declspec(empty_bases) E {}; // expected-warning{{'empty_bases' attribute only applies to classes}}
+int __declspec(empty_bases) I; // expected-warning{{'empty_bases' attribute only applies to classes}}
+typedef struct T __declspec(empty_bases) U; // expected-warning{{'empty_bases' attribute only applies to classes}}
+auto z = []() __declspec(empty_bases) { return nullptr; }; // expected-warning{{'empty_bases' attribute only applies to classes}}
+
+struct __declspec(empty_bases(1)) X {}; // expected-error{{'empty_bases' attribute takes no arguments}}
diff --git a/test/SemaCXX/ms-exception-spec.cpp b/test/SemaCXX/ms-exception-spec.cpp
index 1be8ec2936908..07633791b9f31 100644
--- a/test/SemaCXX/ms-exception-spec.cpp
+++ b/test/SemaCXX/ms-exception-spec.cpp
@@ -1,4 +1,9 @@
-// RUN: %clang_cc1 %s -fsyntax-only -verify -fms-extensions
-// expected-no-diagnostics
+// RUN: %clang_cc1 %s -fsyntax-only -verify -fms-extensions -fexceptions -fcxx-exceptions
 
 void f() throw(...) { }
+
+namespace PR28080 {
+struct S;           // expected-note {{forward declaration}}
+void fn() throw(S); // expected-warning {{incomplete type}} expected-note{{previous declaration}}
+void fn() throw();  // expected-warning {{does not match previous declaration}}
+}
diff --git a/test/SemaCXX/ms-layout_version.cpp b/test/SemaCXX/ms-layout_version.cpp
new file mode 100644
index 0000000000000..7f83b2d4473e0
--- /dev/null
+++ b/test/SemaCXX/ms-layout_version.cpp
@@ -0,0 +1,11 @@
+// RUN: %clang_cc1 -triple i386-pc-win32 %s -fsyntax-only -verify -fms-extensions -Wno-microsoft -std=c++11
+
+struct __declspec(layout_version(19)) S {};
+enum __declspec(layout_version(19)) E {}; // expected-warning{{'layout_version' attribute only applies to classes}}
+int __declspec(layout_version(19)) I; // expected-warning{{'layout_version' attribute only applies to classes}}
+typedef struct T __declspec(layout_version(19)) U; // expected-warning{{'layout_version' attribute only applies to classes}}
+auto z = []() __declspec(layout_version(19)) { return nullptr; }; // expected-warning{{'layout_version' attribute only applies to classes}}
+
+struct __declspec(layout_version(18)) X {}; // expected-error{{'layout_version' attribute parameter 18 is out of bounds}}
+struct __declspec(layout_version(20)) Y {}; // expected-error{{'layout_version' attribute parameter 20 is out of bounds}}
+struct __declspec(layout_version) Z {}; // expected-error{{attribute takes one argument}}
diff --git a/test/SemaCXX/new-delete.cpp b/test/SemaCXX/new-delete.cpp
index 7bc724b21010f..e96603d69e101 100644
--- a/test/SemaCXX/new-delete.cpp
+++ b/test/SemaCXX/new-delete.cpp
@@ -444,11 +444,11 @@ namespace r150682 {
 
   template<typename X>
   void tfn() {
-    new (*(PlacementArg*)0) T[1];
+    new (*(PlacementArg*)0) T[1]; // expected-warning 2 {{binding dereferenced null pointer to reference has undefined behavior}}
   }
 
   void fn() {
-    tfn<int>();
+    tfn<int>();  // expected-note {{in instantiation of function template specialization 'r150682::tfn<int>' requested here}}
   }
 
 }
diff --git a/test/SemaCXX/no-wchar.cpp b/test/SemaCXX/no-wchar.cpp
index 291b657f51abe..b6dcddf1f428b 100644
--- a/test/SemaCXX/no-wchar.cpp
+++ b/test/SemaCXX/no-wchar.cpp
@@ -7,3 +7,24 @@ void foo(const wchar_t* x);
 void bar() {
   foo(L"wide string literal");
 }
+
+void foo1(wchar_t * t = L"");
+// expected-warning@-1 {{conversion from string literal to 'wchar_t *' (aka 'unsigned short *') is deprecated}}
+
+short *a = L"";
+// expected-error@-1 {{cannot initialize a variable of type 'short *' with an lvalue of type 'const unsigned short [1]'}}
+char *b = L"";
+// expected-error@-1 {{cannot initialize a variable of type 'char *' with an lvalue of type 'const unsigned short [1]'}}
+
+// NOTE: MSVC allows deprecated conversion in conditional expression if at least
+// one of the operand is a string literal but Clang doesn't allow it.
+wchar_t *c = true ? L"a" : L"";
+// expected-error@-1 {{cannot initialize a variable of type 'wchar_t *' (aka 'unsigned short *') with}}
+
+const wchar_t *d1 = 0;
+const wchar_t *d2 = 0;
+wchar_t *d = true ? d1 : d2;
+// expected-error@-1 {{cannot initialize a variable of type 'wchar_t *' (aka 'unsigned short *') with}}
+
+wchar_t* e = (const wchar_t*)L"";
+// expected-error@-1 {{cannot initialize a variable of type 'wchar_t *' (aka 'unsigned short *') with an rvalue of type 'const wchar_t *' (aka 'const unsigned short *')}}
diff --git a/test/SemaCXX/overload-call.cpp b/test/SemaCXX/overload-call.cpp
index 3d286a94a0453..7eaf98b601c18 100644
--- a/test/SemaCXX/overload-call.cpp
+++ b/test/SemaCXX/overload-call.cpp
@@ -375,16 +375,24 @@ namespace test2 {
 }
 
 // PR 6117
-namespace test3 {
-  struct Base {};
+namespace IncompleteConversion {
+  struct Complete {};
   struct Incomplete;
 
-  void foo(Base *); // expected-note 2 {{cannot convert argument of incomplete type}}
-  void foo(Base &); // expected-note 2 {{cannot convert argument of incomplete type}}
-
-  void test(Incomplete *P) {
-    foo(P); // expected-error {{no matching function for call to 'foo'}}
-    foo(*P); // expected-error {{no matching function for call to 'foo'}}
+  void completeFunction(Complete *); // expected-note 2 {{cannot convert argument of incomplete type}}
+  void completeFunction(Complete &); // expected-note 2 {{cannot convert argument of incomplete type}}
+  
+  void testTypeConversion(Incomplete *P) {
+    completeFunction(P); // expected-error {{no matching function for call to 'completeFunction'}}
+    completeFunction(*P); // expected-error {{no matching function for call to 'completeFunction'}}
+  }
+  
+  void incompletePointerFunction(Incomplete *); // expected-note {{candidate function not viable: cannot convert argument of incomplete type 'IncompleteConversion::Incomplete' to 'IncompleteConversion::Incomplete *' for 1st argument; take the address of the argument with &}}
+  void incompleteReferenceFunction(Incomplete &); // expected-note {{candidate function not viable: cannot convert argument of incomplete type 'IncompleteConversion::Incomplete *' to 'IncompleteConversion::Incomplete &' for 1st argument; dereference the argument with *}}
+  
+  void testPointerReferenceConversion(Incomplete &reference, Incomplete *pointer) {
+    incompletePointerFunction(reference); // expected-error {{no matching function for call to 'incompletePointerFunction'}}
+    incompleteReferenceFunction(pointer); // expected-error {{no matching function for call to 'incompleteReferenceFunction'}}
   }
 }
 
diff --git a/test/SemaCXX/overloaded-builtin-operators.cpp b/test/SemaCXX/overloaded-builtin-operators.cpp
index 4c2953b0b3fe1..7a99a898e2092 100644
--- a/test/SemaCXX/overloaded-builtin-operators.cpp
+++ b/test/SemaCXX/overloaded-builtin-operators.cpp
@@ -183,7 +183,7 @@ void test_dr425(A a) {
   // FIXME: lots of candidates here!
   (void)(1.0f * a); // expected-error{{ambiguous}} \
                     // expected-note 4{{candidate}} \
-                    // expected-note {{remaining 117 candidates omitted; pass -fshow-overloads=all to show them}}
+                    // expected-note {{remaining 140 candidates omitted; pass -fshow-overloads=all to show them}}
 }
 
 // pr5432
diff --git a/test/SemaCXX/pr25181-crash-on-invalid.cpp b/test/SemaCXX/pr25181-crash-on-invalid.cpp
new file mode 100644
index 0000000000000..41178c95e8ad4
--- /dev/null
+++ b/test/SemaCXX/pr25181-crash-on-invalid.cpp
@@ -0,0 +1,7 @@
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
+// Don't crash (PR25181).
+
+template <typename T> class Foo { // expected-note {{template parameter is declared here}}
+  template <typename T> // expected-error {{declaration of 'T' shadows template parameter}}
+  void Foo<T>::method(T *) const throw() {} // expected-error {{nested name specifier 'Foo<T>::' for declaration does not refer into a class, class template or class template partial specialization}}
+};
diff --git a/test/SemaCXX/pr27047-default-init-expr-name-conflict.cpp b/test/SemaCXX/pr27047-default-init-expr-name-conflict.cpp
new file mode 100644
index 0000000000000..772db9935c078
--- /dev/null
+++ b/test/SemaCXX/pr27047-default-init-expr-name-conflict.cpp
@@ -0,0 +1,8 @@
+// RUN: %clang_cc1 -fsyntax-only -std=c++11 %s
+
+template <typename T>
+struct A {
+  // Used to crash when field was named after class.
+  int A = 0;
+};
+A<int> a;
diff --git a/test/SemaCXX/pragma-optimize.cpp b/test/SemaCXX/pragma-optimize.cpp
index 48a15460fc96c..cda46c552a4e5 100644
--- a/test/SemaCXX/pragma-optimize.cpp
+++ b/test/SemaCXX/pragma-optimize.cpp
@@ -151,14 +151,14 @@ int yet_another_normal(int x) {
 // CHECK-DAG: attributes [[ATTRYETANOTHEROPTNONE]] = { {{.*}}noinline{{.*}}optnone{{.*}} }
 
 // Check that the other functions do NOT have optnone.
-// CHECK-DAG-NOT: attributes [[ATTRFOO]] = { {{.*}}optnone{{.*}} }
-// CHECK-DAG-NOT: attributes [[ATTRBAZ]] = { {{.*}}optnone{{.*}} }
-// CHECK-DAG-NOT: attributes [[ATTRBAX]] = { {{.*}}optnone{{.*}} }
-// CHECK-DAG-NOT: attributes [[ATTRWOMBAT]] = { {{.*}}optnone{{.*}} }
-// CHECK-DAG-NOT: attributes [[ATTRCONTAINER]] = { {{.*}}optnone{{.*}} }
-// CHECK-DAG-NOT: attributes [[ATTRTWICE]] = { {{.*}}optnone{{.*}} }
-// CHECK-DAG-NOT: attributes [[ATTRCONTAINER2]] = { {{.*}}optnone{{.*}} }
-// CHECK-DAG-NOT: attributes [[ATTRCONTAINER3]] = { {{.*}}optnone{{.*}} }
-// CHECK-DAG-NOT: attributes [[ATTRTHRICEINT]] = { {{.*}}optnone{{.*}} }
-// CHECK-DAG-NOT: attributes [[ATTRANOTHERNORMAL]] = { {{.*}}optnone{{.*}} }
-// CHECK-DAG-NOT: attributes [[ATTRYETANOTHERNORMAL]] = { {{.*}}optnone{{.*}} }
+// CHECK-NOT: attributes [[ATTRFOO]] = { {{.*}}optnone{{.*}} }
+// CHECK-NOT: attributes [[ATTRBAZ]] = { {{.*}}optnone{{.*}} }
+// CHECK-NOT: attributes [[ATTRBAX]] = { {{.*}}optnone{{.*}} }
+// CHECK-NOT: attributes [[ATTRWOMBAT]] = { {{.*}}optnone{{.*}} }
+// CHECK-NOT: attributes [[ATTRCONTAINER]] = { {{.*}}optnone{{.*}} }
+// CHECK-NOT: attributes [[ATTRTWICE]] = { {{.*}}optnone{{.*}} }
+// CHECK-NOT: attributes [[ATTRCONTAINER2]] = { {{.*}}optnone{{.*}} }
+// CHECK-NOT: attributes [[ATTRCONTAINER3]] = { {{.*}}optnone{{.*}} }
+// CHECK-NOT: attributes [[ATTRTHRICEINT]] = { {{.*}}optnone{{.*}} }
+// CHECK-NOT: attributes [[ATTRANOTHERNORMAL]] = { {{.*}}optnone{{.*}} }
+// CHECK-NOT: attributes [[ATTRYETANOTHERNORMAL]] = { {{.*}}optnone{{.*}} }
diff --git a/test/SemaCXX/pragma-vtordisp.cpp b/test/SemaCXX/pragma-vtordisp.cpp
index 649c0ee9e6868..1421c33db5d36 100644
--- a/test/SemaCXX/pragma-vtordisp.cpp
+++ b/test/SemaCXX/pragma-vtordisp.cpp
@@ -22,7 +22,8 @@ struct B : virtual A { int b; };
 
 // Test a reset.
 #pragma vtordisp()
-#pragma vtordisp(pop) // expected-warning {{#pragma vtordisp(pop, ...) failed: stack empty}}
+#pragma vtordisp(pop) // stack should NOT be affected by reset.
+                      // Now stack contains '1'.
 
 #pragma vtordisp(      // expected-warning {{unknown action for '#pragma vtordisp' - ignored}}
 #pragma vtordisp(asdf) // expected-warning {{unknown action for '#pragma vtordisp' - ignored}}
@@ -42,6 +43,7 @@ struct E {
   virtual void f();
 };
 
+#pragma vtordisp(pop) // After this stack should be empty.
 #pragma vtordisp(pop) // expected-warning {{#pragma vtordisp(pop, ...) failed: stack empty}}
 
 void g() {
diff --git a/test/SemaCXX/qual-id-test.cpp b/test/SemaCXX/qual-id-test.cpp
index 9994d75cac19d..61e60ae82dfa3 100644
--- a/test/SemaCXX/qual-id-test.cpp
+++ b/test/SemaCXX/qual-id-test.cpp
@@ -1,9 +1,15 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s 
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++98 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
+
 namespace A
 {
     namespace B
     {
-        struct base // expected-note{{object type}}
+        struct base
+#if __cplusplus <= 199711L
+        // expected-note@-2 {{lookup in the object type 'A::sub' refers here}}
+#endif
         {
             void x() {}
             void y() {}
@@ -85,8 +91,14 @@ namespace C
     void fun4a() {
       A::sub *a;
       
-      typedef A::member base; // expected-note{{current scope}}
-      a->base::x(); // expected-error{{ambiguous}}      
+      typedef A::member base;
+#if __cplusplus <= 199711L
+      // expected-note@-2 {{lookup from the current scope refers here}}
+#endif
+      a->base::x();
+#if __cplusplus <= 199711L
+      // expected-error@-2 {{lookup of 'base' in member access expression is ambiguous}}
+#endif
     }
 
     void fun4b() {
diff --git a/test/SemaCXX/return-stack-addr-2.cpp b/test/SemaCXX/return-stack-addr-2.cpp
new file mode 100644
index 0000000000000..ad27567fcd646
--- /dev/null
+++ b/test/SemaCXX/return-stack-addr-2.cpp
@@ -0,0 +1,22 @@
+// RUN: %clang_cc1 -std=c++11 -fsyntax-only -verify -std=c++11 %s
+// expected-no-diagnostics
+
+namespace PR26599 {
+template <typename>
+struct S;
+
+struct I {};
+
+template <typename T>
+void *&non_pointer() {
+  void *&r = S<T>()[I{}];
+  return r;
+}
+
+template <typename T>
+void *&pointer() {
+  void *&r = S<T>()[nullptr];
+  return r;
+}
+}
+
diff --git a/test/SemaCXX/return.cpp b/test/SemaCXX/return.cpp
index 8c1664516a710..db289240d1ce6 100644
--- a/test/SemaCXX/return.cpp
+++ b/test/SemaCXX/return.cpp
@@ -118,5 +118,5 @@ void cxx_unresolved_expr() {
   // CXXUnresolvedConstructExpr, and the missing ')' gives it an invalid source
   // location for its rparen.  Check that emitting a diag on the range of the
   // expr doesn't assert.
-  return int(undeclared, 4; // expected-error {{expected ')'}} expected-note{{to match this '('}} expected-error {{void function 'cxx_unresolved_expr' should not return a value}} expected-error {{use of undeclared identifier 'undeclared'}}
+  return int(undeclared, 4; // expected-error {{expected ')'}} expected-note{{to match this '('}} expected-error {{use of undeclared identifier 'undeclared'}}
 }
diff --git a/test/SemaCXX/rval-references.cpp b/test/SemaCXX/rval-references.cpp
index 9c79ad7b0b97c..4c2050494b696 100644
--- a/test/SemaCXX/rval-references.cpp
+++ b/test/SemaCXX/rval-references.cpp
@@ -72,23 +72,17 @@ int&& should_not_warn(int&& i) { // But GCC 4.4 does
 // Test the return dance. This also tests IsReturnCopyElidable.
 struct MoveOnly {
   MoveOnly();
-  MoveOnly(const MoveOnly&) = delete;	// expected-note {{candidate constructor}} \
-  // expected-note 3{{explicitly marked deleted here}}
-  MoveOnly(MoveOnly&&);	// expected-note {{candidate constructor}}
-  MoveOnly(int&&);	// expected-note {{candidate constructor}}
+  MoveOnly(const MoveOnly&) = delete;	// expected-note 3{{explicitly marked deleted here}}
 };
 
 MoveOnly gmo;
 MoveOnly returningNonEligible() {
-  int i;
   static MoveOnly mo;
   MoveOnly &r = mo;
   if (0) // Copy from global can't be elided
     return gmo; // expected-error {{call to deleted constructor}}
   else if (0) // Copy from local static can't be elided
     return mo; // expected-error {{call to deleted constructor}}
-  else if (0) // Copy from reference can't be elided
+  else // Copy from reference can't be elided
     return r; // expected-error {{call to deleted constructor}}
-  else // Construction from different type can't be elided
-    return i; // expected-error {{no viable conversion from returned value of type 'int' to function return type 'MoveOnly'}}
 }
diff --git a/test/SemaCXX/static-cast.cpp b/test/SemaCXX/static-cast.cpp
index b3fe49a88c6bf..ff47c0bb4dce1 100644
--- a/test/SemaCXX/static-cast.cpp
+++ b/test/SemaCXX/static-cast.cpp
@@ -43,11 +43,11 @@ void t_529_2()
   (void)static_cast<void*>((int*)0);
   (void)static_cast<volatile const void*>((const int*)0);
   (void)static_cast<A*>((B*)0);
-  (void)static_cast<A&>(*((B*)0));
+  (void)static_cast<A&>(*((B*)0)); // expected-warning {{binding dereferenced null pointer to reference has undefined behavior}}
   (void)static_cast<const B*>((C1*)0);
-  (void)static_cast<B&>(*((C1*)0));
+  (void)static_cast<B&>(*((C1*)0)); // expected-warning {{binding dereferenced null pointer to reference has undefined behavior}}
   (void)static_cast<A*>((D*)0);
-  (void)static_cast<const A&>(*((D*)0));
+  (void)static_cast<const A&>(*((D*)0)); // expected-warning {{binding dereferenced null pointer to reference has undefined behavior}}
   (void)static_cast<int B::*>((int A::*)0);
   (void)static_cast<void (B::*)()>((void (A::*)())0);
 
diff --git a/test/SemaCXX/switch-implicit-fallthrough-macro.cpp b/test/SemaCXX/switch-implicit-fallthrough-macro.cpp
index add212fcf5d85..11df2cbfb53f0 100644
--- a/test/SemaCXX/switch-implicit-fallthrough-macro.cpp
+++ b/test/SemaCXX/switch-implicit-fallthrough-macro.cpp
@@ -1,4 +1,8 @@
-// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 -Wimplicit-fallthrough -DCOMMAND_LINE_FALLTHROUGH=[[clang::fallthrough]] %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 -Wimplicit-fallthrough -DCLANG_PREFIX -DCOMMAND_LINE_FALLTHROUGH=[[clang::fallthrough]] -DUNCHOSEN=[[fallthrough]] %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 -Wimplicit-fallthrough -DCOMMAND_LINE_FALLTHROUGH=[[fallthrough]] %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++1z -Wimplicit-fallthrough -DCLANG_PREFIX -DCOMMAND_LINE_FALLTHROUGH=[[clang::fallthrough]] %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++1z -Wimplicit-fallthrough -DCOMMAND_LINE_FALLTHROUGH=[[clang::fallthrough]] %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++1z -Wimplicit-fallthrough -DCOMMAND_LINE_FALLTHROUGH=[[fallthrough]] -DUNCHOSEN=[[clang::fallthrough]] %s
 
 int fallthrough_compatibility_macro_from_command_line(int n) {
   switch (n) {
@@ -10,15 +14,12 @@ int fallthrough_compatibility_macro_from_command_line(int n) {
   return n;
 }
 
-#ifdef __clang__
-#if __has_feature(cxx_attributes) && __has_warning("-Wimplicit-fallthrough")
+#ifdef CLANG_PREFIX
 #define COMPATIBILITY_FALLTHROUGH   [ [ /* test */  clang /* test */ \
     ::  fallthrough  ]  ]    // testing whitespace and comments in macro definition
-#endif
-#endif
-
-#ifndef COMPATIBILITY_FALLTHROUGH
-#define COMPATIBILITY_FALLTHROUGH do { } while (0)
+#else
+#define COMPATIBILITY_FALLTHROUGH   [ [ /* test */  /* test */ \
+    fallthrough  ]  ]    // testing whitespace and comments in macro definition
 #endif
 
 int fallthrough_compatibility_macro_from_source(int n) {
@@ -32,7 +33,11 @@ int fallthrough_compatibility_macro_from_source(int n) {
 }
 
 // Deeper macro substitution
+#ifdef CLANG_PREFIX
 #define M1 [[clang::fallthrough]]
+#else
+#define M1 [[fallthrough]]
+#endif
 #ifdef __clang__
 #define M2 M1
 #else
@@ -59,12 +64,17 @@ int fallthrough_compatibility_macro_in_macro(int n) {
 #undef M2
 #undef COMPATIBILITY_FALLTHROUGH
 #undef COMMAND_LINE_FALLTHROUGH
+#undef UNCHOSEN
 
 int fallthrough_compatibility_macro_undefined(int n) {
   switch (n) {
     case 0:
       n = n * 20;
+#if __cplusplus <= 201402L
     case 1: // expected-warning{{unannotated fall-through between switch labels}} expected-note{{insert '[[clang::fallthrough]];' to silence this warning}} expected-note{{insert 'break;' to avoid fall-through}}
+#else
+    case 1: // expected-warning{{unannotated fall-through between switch labels}} expected-note{{insert '[[fallthrough]];' to silence this warning}} expected-note{{insert 'break;' to avoid fall-through}}
+#endif
       ;
   }
 #define TOO_LATE [[clang::fallthrough]]
@@ -83,7 +93,11 @@ int fallthrough_compatibility_macro_history(int n) {
     case 0:
       n = n * 20;
 #undef MACRO_WITH_HISTORY
+#if __cplusplus <= 201402L
     case 1: // expected-warning{{unannotated fall-through between switch labels}} expected-note{{insert '[[clang::fallthrough]];' to silence this warning}} expected-note{{insert 'break;' to avoid fall-through}}
+#else
+    case 1: // expected-warning{{unannotated fall-through between switch labels}} expected-note{{insert '[[fallthrough]];' to silence this warning}} expected-note{{insert 'break;' to avoid fall-through}}
+#endif
       ;
 #define MACRO_WITH_HISTORY [[clang::fallthrough]]
   }
diff --git a/test/SemaCXX/switch-implicit-fallthrough-off-by-default.cpp b/test/SemaCXX/switch-implicit-fallthrough-off-by-default.cpp
new file mode 100644
index 0000000000000..6ab6370069afe
--- /dev/null
+++ b/test/SemaCXX/switch-implicit-fallthrough-off-by-default.cpp
@@ -0,0 +1,13 @@
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 -DUNREACHABLE=1 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 -DUNREACHABLE=0 -Wimplicit-fallthrough %s
+
+void fallthrough(int n) {
+  switch (n) {
+  case 1:
+    if (UNREACHABLE)
+      return;
+    [[fallthrough]]; // expected-no-diagnostics, only checked when UNREACHABLE=0
+  case 2:
+    break;
+  }
+}
diff --git a/test/SemaCXX/switch-implicit-fallthrough-per-method.cpp b/test/SemaCXX/switch-implicit-fallthrough-per-method.cpp
index 009c8180b1bb8..6880bdd5f47d5 100644
--- a/test/SemaCXX/switch-implicit-fallthrough-per-method.cpp
+++ b/test/SemaCXX/switch-implicit-fallthrough-per-method.cpp
@@ -41,9 +41,8 @@ int fallthrough2(int n) {
 void unscoped(int n) {
   switch (n % 2) {
     case 0:
-      // FIXME: This should be typo-corrected, probably.
-      [[fallthrough]]; // expected-warning{{unknown attribute 'fallthrough' ignored}}
-    case 2: // expected-warning{{unannotated fall-through}} expected-note{{clang::fallthrough}} expected-note{{break;}}
+      [[fallthrough]];
+    case 2:
       [[clang::fallthrough]];
     case 1:
       break;
diff --git a/test/SemaCXX/switch-implicit-fallthrough.cpp b/test/SemaCXX/switch-implicit-fallthrough.cpp
index 0bc43cdbd45b2..9540b1ff28808 100644
--- a/test/SemaCXX/switch-implicit-fallthrough.cpp
+++ b/test/SemaCXX/switch-implicit-fallthrough.cpp
@@ -179,18 +179,15 @@ void fallthrough_cfgblock_with_null_successor(int x) {
 
 int fallthrough_position(int n) {
   switch (n) {
-      [[clang::fallthrough]];  // expected-warning{{fallthrough annotation does not directly precede switch label}}
       n += 300;
       [[clang::fallthrough]];  // expected-warning{{fallthrough annotation in unreachable code}}
     case 221:
-      [[clang::fallthrough]];  // expected-warning{{fallthrough annotation does not directly precede switch label}}
       return 1;
       [[clang::fallthrough]];  // expected-warning{{fallthrough annotation in unreachable code}}
     case 222:
-      [[clang::fallthrough]];  // expected-warning{{fallthrough annotation does not directly precede switch label}}
       n += 400;
     case 223:          // expected-warning{{unannotated fall-through between switch labels}} expected-note{{insert '[[clang::fallthrough]];' to silence this warning}} expected-note{{insert 'break;' to avoid fall-through}}
-      [[clang::fallthrough]]; // expected-warning{{fallthrough annotation does not directly precede switch label}}
+      ;
   }
 
   long p = static_cast<long>(n) * n;
@@ -282,6 +279,23 @@ namespace PR18983 {
   }
 }
 
+int fallthrough_placement_error(int n) {
+  switch (n) {
+      [[clang::fallthrough]]; // expected-warning{{fallthrough annotation in unreachable code}}
+      n += 300;
+    case 221:
+      [[clang::fallthrough]]; // expected-error{{fallthrough annotation does not directly precede switch label}}
+      return 1;
+    case 222:
+      [[clang::fallthrough]]; // expected-error{{fallthrough annotation does not directly precede switch label}}
+      n += 400;
+      [[clang::fallthrough]];
+    case 223:
+      [[clang::fallthrough]]; // expected-error{{fallthrough annotation does not directly precede switch label}}
+  }
+  return n;
+}
+
 int fallthrough_targets(int n) {
   [[clang::fallthrough]]; // expected-error{{fallthrough annotation is outside switch statement}}
 
diff --git a/test/SemaCXX/type-convert-construct.cpp b/test/SemaCXX/type-convert-construct.cpp
index 2dec50abebf28..7ae83638adb5d 100644
--- a/test/SemaCXX/type-convert-construct.cpp
+++ b/test/SemaCXX/type-convert-construct.cpp
@@ -1,4 +1,6 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s 
+// RUN: %clang_cc1 -fsyntax-only -verify -std=gnu++98 %s 
+// RUN: %clang_cc1 -fsyntax-only -verify -std=gnu++11 %s 
 
 void f() {
   float v1 = float(1);
@@ -12,8 +14,21 @@ void f() {
   typedef int T;
   int *p;
   bool v6 = T(0) == p;
+#if __cplusplus >= 201103L
+  // expected-error@-2 {{comparison between pointer and integer ('T' (aka 'int') and 'int *')}}
+#endif
   char *str;
-  str = "a string"; // expected-warning{{conversion from string literal to 'char *' is deprecated}}
+  str = "a string";
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{conversion from string literal to 'char *' is deprecated}}
+#else
+  // expected-warning@-4 {{ISO C++11 does not allow conversion from string literal to 'char *'}}
+#endif
   wchar_t *wstr;
-  wstr = L"a wide string"; // expected-warning{{conversion from string literal to 'wchar_t *' is deprecated}}
+  wstr = L"a wide string";
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{conversion from string literal to 'wchar_t *' is deprecated}}
+#else
+  // expected-warning@-4 {{ISO C++11 does not allow conversion from string literal to 'wchar_t *'}}
+#endif
 }
diff --git a/test/SemaCXX/type-traits.cpp b/test/SemaCXX/type-traits.cpp
index 69760fd6bd068..c53b02774aca9 100644
--- a/test/SemaCXX/type-traits.cpp
+++ b/test/SemaCXX/type-traits.cpp
@@ -1514,6 +1514,9 @@ void has_nothrow_move_assign() {
 
   { int arr[T(__is_nothrow_assignable(HasNoThrowMoveAssign, HasNoThrowMoveAssign))]; }
   { int arr[F(__is_nothrow_assignable(HasThrowMoveAssign, HasThrowMoveAssign))]; }
+
+  { int arr[T(__is_assignable(HasNoThrowMoveAssign, HasNoThrowMoveAssign))]; }
+  { int arr[T(__is_assignable(HasThrowMoveAssign, HasThrowMoveAssign))]; }
 }
 
 void has_trivial_move_assign() {
@@ -1974,6 +1977,46 @@ void trivial_checks()
                                          TrivialMoveButNotCopy)))]; }
   { int arr[T((__is_trivially_assignable(TrivialMoveButNotCopy&,
                                          TrivialMoveButNotCopy&&)))]; }
+  { int arr[T((__is_trivially_assignable(int&, int)))]; }
+  { int arr[T((__is_trivially_assignable(int&, int&)))]; }
+  { int arr[T((__is_trivially_assignable(int&, int&&)))]; }
+  { int arr[T((__is_trivially_assignable(int&, const int&)))]; }
+  { int arr[T((__is_trivially_assignable(POD&, POD)))]; }
+  { int arr[T((__is_trivially_assignable(POD&, POD&)))]; }
+  { int arr[T((__is_trivially_assignable(POD&, POD&&)))]; }
+  { int arr[T((__is_trivially_assignable(POD&, const POD&)))]; }
+  { int arr[T((__is_trivially_assignable(int*&, int*)))]; }
+  { int arr[T((__is_trivially_assignable(AllDefaulted,
+                                         const AllDefaulted &)))]; }
+  { int arr[T((__is_trivially_assignable(AllDefaulted,
+                                         AllDefaulted &&)))]; }
+
+  { int arr[F((__is_assignable(int *&, float *)))]; }
+  { int arr[T((__is_assignable(HasCopyAssign &, HasCopyAssign)))]; }
+  { int arr[T((__is_assignable(HasCopyAssign &, HasCopyAssign &)))]; }
+  { int arr[T((__is_assignable(HasCopyAssign &, const HasCopyAssign &)))]; }
+  { int arr[T((__is_assignable(HasCopyAssign &, HasCopyAssign &&)))]; }
+  { int arr[T((__is_assignable(TrivialMoveButNotCopy &,
+                               TrivialMoveButNotCopy &)))]; }
+  { int arr[T((__is_assignable(TrivialMoveButNotCopy &,
+                               const TrivialMoveButNotCopy &)))]; }
+  { int arr[F((__is_assignable(AllDeleted,
+                               const AllDeleted &)))]; }
+  { int arr[F((__is_assignable(AllDeleted,
+                               AllDeleted &&)))]; }
+  { int arr[T((__is_assignable(ExtDefaulted,
+                               const ExtDefaulted &)))]; }
+  { int arr[T((__is_assignable(ExtDefaulted,
+                               ExtDefaulted &&)))]; }
+
+  { int arr[T((__is_assignable(HasDefaultTrivialCopyAssign &,
+                               HasDefaultTrivialCopyAssign &)))]; }
+  { int arr[T((__is_assignable(HasDefaultTrivialCopyAssign &,
+                               const HasDefaultTrivialCopyAssign &)))]; }
+  { int arr[T((__is_assignable(TrivialMoveButNotCopy &,
+                               TrivialMoveButNotCopy)))]; }
+  { int arr[T((__is_assignable(TrivialMoveButNotCopy &,
+                               TrivialMoveButNotCopy &&)))]; }
 }
 
 void constructible_checks() {
diff --git a/test/SemaCXX/type_pack_element.cpp b/test/SemaCXX/type_pack_element.cpp
new file mode 100644
index 0000000000000..d22d5fa2ba67c
--- /dev/null
+++ b/test/SemaCXX/type_pack_element.cpp
@@ -0,0 +1,45 @@
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
+
+static_assert(__has_builtin(__type_pack_element), "");
+
+using SizeT = decltype(sizeof(int));
+
+template <SizeT i, typename ...T>
+using TypePackElement = __type_pack_element<i, T...>;
+
+template <int i>
+struct X;
+
+static_assert(__is_same(TypePackElement<0, X<0>>, X<0>), "");
+
+static_assert(__is_same(TypePackElement<0, X<0>, X<1>>, X<0>), "");
+static_assert(__is_same(TypePackElement<1, X<0>, X<1>>, X<1>), "");
+
+static_assert(__is_same(TypePackElement<0, X<0>, X<1>, X<2>>, X<0>), "");
+static_assert(__is_same(TypePackElement<1, X<0>, X<1>, X<2>>, X<1>), "");
+static_assert(__is_same(TypePackElement<2, X<0>, X<1>, X<2>>, X<2>), "");
+
+static_assert(__is_same(TypePackElement<0, X<0>, X<1>, X<2>, X<3>>, X<0>), "");
+static_assert(__is_same(TypePackElement<1, X<0>, X<1>, X<2>, X<3>>, X<1>), "");
+static_assert(__is_same(TypePackElement<2, X<0>, X<1>, X<2>, X<3>>, X<2>), "");
+static_assert(__is_same(TypePackElement<3, X<0>, X<1>, X<2>, X<3>>, X<3>), "");
+
+static_assert(__is_same(TypePackElement<0, X<0>, X<1>, X<2>, X<3>, X<4>>, X<0>), "");
+static_assert(__is_same(TypePackElement<1, X<0>, X<1>, X<2>, X<3>, X<4>>, X<1>), "");
+static_assert(__is_same(TypePackElement<2, X<0>, X<1>, X<2>, X<3>, X<4>>, X<2>), "");
+static_assert(__is_same(TypePackElement<3, X<0>, X<1>, X<2>, X<3>, X<4>>, X<3>), "");
+static_assert(__is_same(TypePackElement<4, X<0>, X<1>, X<2>, X<3>, X<4>>, X<4>), "");
+
+static_assert(__is_same(TypePackElement<0, X<0>, X<1>, X<2>, X<3>, X<4>, X<5>>, X<0>), "");
+static_assert(__is_same(TypePackElement<1, X<0>, X<1>, X<2>, X<3>, X<4>, X<5>>, X<1>), "");
+static_assert(__is_same(TypePackElement<2, X<0>, X<1>, X<2>, X<3>, X<4>, X<5>>, X<2>), "");
+static_assert(__is_same(TypePackElement<3, X<0>, X<1>, X<2>, X<3>, X<4>, X<5>>, X<3>), "");
+static_assert(__is_same(TypePackElement<4, X<0>, X<1>, X<2>, X<3>, X<4>, X<5>>, X<4>), "");
+static_assert(__is_same(TypePackElement<5, X<0>, X<1>, X<2>, X<3>, X<4>, X<5>>, X<5>), "");
+
+// Test __type_pack_element with more than 2 top-level template arguments.
+static_assert(__is_same(__type_pack_element<5, X<0>, X<1>, X<2>, X<3>, X<4>, X<5>>, X<5>), "");
+
+template <SizeT Index, typename ...T>
+using ErrorTypePackElement1 = __type_pack_element<Index, T...>; // expected-error{{may not be accessed at an out of bounds index}}
+using illformed1 = ErrorTypePackElement1<3, X<0>, X<1>>;  // expected-note{{in instantiation}}
diff --git a/test/SemaCXX/typo-correction-crash.cpp b/test/SemaCXX/typo-correction-crash.cpp
new file mode 100644
index 0000000000000..0b8383dbafef4
--- /dev/null
+++ b/test/SemaCXX/typo-correction-crash.cpp
@@ -0,0 +1,21 @@
+// RUN: %clang_cc1 -fsyntax-only -std=c++14 -verify %s
+auto check1() {
+  return 1;
+  return s; // expected-error {{use of undeclared identifier 's'}}
+}
+
+int test = 11; // expected-note {{'test' declared here}}
+auto check2() {
+  return "s";
+  return tes; // expected-error {{use of undeclared identifier 'tes'; did you mean 'test'?}}
+}
+
+namespace BarNamespace {
+namespace NestedNamespace { // expected-note {{'BarNamespace::NestedNamespace' declared here}}
+typedef int type;
+}
+}
+struct FooRecord { };
+FooRecord::NestedNamespace::type x; // expected-error {{no member named 'NestedNamespace' in 'FooRecord'; did you mean 'BarNamespace::NestedNamespace'?}}
+
+void cast_expr(int g) { +int(n)(g); } // expected-error {{undeclared identifier 'n'}}
diff --git a/test/SemaCXX/typo-correction.cpp b/test/SemaCXX/typo-correction.cpp
index 07c1634431ea1..48597ded1528a 100644
--- a/test/SemaCXX/typo-correction.cpp
+++ b/test/SemaCXX/typo-correction.cpp
@@ -663,3 +663,19 @@ class Bar : public A::B::Foofoo {};
 
 using C::D::Foofoo;  // expected-error {{no member named 'Foofoo' in namespace 'PR24781_using_crash::C::D'; did you mean 'A::B::Foofoo'?}}
 }
+
+int d = ? L : d; // expected-error {{expected expression}} expected-error {{undeclared identifier}}
+
+struct B0 {
+  int : 0 |         // expected-error {{invalid operands to binary expression}}
+      (struct B0)e; // expected-error {{use of undeclared identifier}}
+};
+
+namespace {
+struct a0is0 {};
+struct b0is0 {};
+int g() {
+  0 [                 // expected-error {{subscripted value is not an array}}
+      sizeof(c0is0)]; // expected-error {{use of undeclared identifier}}
+};
+}
diff --git a/test/SemaCXX/unaddressable-functions.cpp b/test/SemaCXX/unaddressable-functions.cpp
new file mode 100644
index 0000000000000..286cbee5ea9bc
--- /dev/null
+++ b/test/SemaCXX/unaddressable-functions.cpp
@@ -0,0 +1,147 @@
+// RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++14
+
+namespace access_control {
+class Private {
+  void check(int *) __attribute__((enable_if(false, "")));
+  void check(double *) __attribute__((enable_if(true, "")));
+
+  static void checkStatic(int *) __attribute__((enable_if(false, "")));
+  static void checkStatic(double *) __attribute__((enable_if(true, "")));
+};
+
+auto Priv = reinterpret_cast<void (Private::*)(char *)>(&Private::check); // expected-error{{'check' is a private member of 'access_control::Private'}} expected-note@6{{implicitly declared private here}}
+
+auto PrivStatic = reinterpret_cast<void (*)(char *)>(&Private::checkStatic); // expected-error{{'checkStatic' is a private member of 'access_control::Private'}} expected-note@9{{implicitly declared private here}}
+
+class Protected {
+protected:
+  void check(int *) __attribute__((enable_if(false, "")));
+  void check(double *) __attribute__((enable_if(true, "")));
+
+  static void checkStatic(int *) __attribute__((enable_if(false, "")));
+  static void checkStatic(double *) __attribute__((enable_if(true, "")));
+};
+
+auto Prot = reinterpret_cast<void (Protected::*)(char *)>(&Protected::check); // expected-error{{'check' is a protected member of 'access_control::Protected'}} expected-note@19{{declared protected here}}
+
+auto ProtStatic = reinterpret_cast<void (*)(char *)>(&Protected::checkStatic); // expected-error{{'checkStatic' is a protected member of 'access_control::Protected'}} expected-note@22{{declared protected here}}
+}
+
+namespace unavailable {
+// Ensure that we check that the function can be called
+void foo() __attribute__((unavailable("don't call this")));
+void foo(int) __attribute__((enable_if(false, "")));
+
+void *Ptr = reinterpret_cast<void*>(foo); // expected-error{{'foo' is unavailable: don't call this}} expected-note@-3{{explicitly marked unavailable here}}
+}
+
+namespace template_deduction {
+void foo() __attribute__((enable_if(false, "")));
+
+void bar() __attribute__((enable_if(true, "")));
+void bar() __attribute__((enable_if(false, "")));
+
+void baz(int a) __attribute__((enable_if(true, "")));
+void baz(int a) __attribute__((enable_if(a, "")));
+void baz(int a) __attribute__((enable_if(false, "")));
+
+void qux(int a) __attribute__((enable_if(1, "")));
+void qux(int a) __attribute__((enable_if(true, "")));
+void qux(int a) __attribute__((enable_if(a, "")));
+void qux(int a) __attribute__((enable_if(false, "")));
+
+template <typename Fn, typename... Args> void call(Fn F, Args... As) {
+  F(As...);
+}
+
+void test() {
+  call(foo); // expected-error{{cannot take address of function 'foo'}}
+  call(bar);
+  call(baz, 0);
+  call(qux, 0); // expected-error{{no matching function for call to 'call'}} expected-note@53{{candidate template ignored: couldn't infer template argument 'Fn'}}
+
+  auto Ptr1 = foo; // expected-error{{cannot take address of function 'foo'}}
+  auto Ptr2 = bar;
+  auto Ptr3 = baz;
+  auto Ptr4 = qux; // expected-error{{variable 'Ptr4' with type 'auto' has incompatible initializer of type '<overloaded function type>'}}
+}
+
+template <typename Fn, typename T, typename... Args>
+void callMem(Fn F, T t, Args... As) {
+  (t.*F)(As...);
+}
+
+class Foo {
+  void bar() __attribute__((enable_if(true, "")));
+  void bar() __attribute__((enable_if(false, "")));
+
+  static void staticBar() __attribute__((enable_if(true, "")));
+  static void staticBar() __attribute__((enable_if(false, "")));
+};
+
+void testAccess() {
+  callMem(&Foo::bar, Foo()); // expected-error{{'bar' is a private member of 'template_deduction::Foo'}} expected-note@-8{{implicitly declared private here}}
+  call(&Foo::staticBar); // expected-error{{'staticBar' is a private member of 'template_deduction::Foo'}} expected-note@-6{{implicitly declared private here}}
+}
+}
+
+namespace template_template_deduction {
+void foo() __attribute__((enable_if(false, "")));
+template <typename T>
+T foo() __attribute__((enable_if(true, "")));
+
+template <typename Fn, typename... Args> auto call(Fn F, Args... As) {
+  return F(As...);
+}
+
+auto Ok = call(&foo<int>);
+auto Fail = call(&foo); // expected-error{{no matching function for call to 'call'}} expected-note@-5{{candidate template ignored: couldn't infer template argument 'Fn'}}
+
+auto PtrOk = &foo<int>;
+auto PtrFail = &foo; // expected-error{{variable 'PtrFail' with type 'auto' has incompatible initializer of type '<overloaded function type>'}}
+}
+
+namespace pointer_equality {
+  using FnTy = void (*)();
+
+  void bothEnableIf() __attribute__((enable_if(false, "")));
+  void bothEnableIf() __attribute__((enable_if(true, "")));
+
+  void oneEnableIf() __attribute__((enable_if(false, "")));
+  void oneEnableIf();
+
+  void test() {
+    FnTy Fn;
+    (void)(Fn == bothEnableIf);
+    (void)(Fn == &bothEnableIf);
+    (void)(Fn == oneEnableIf);
+    (void)(Fn == &oneEnableIf);
+  }
+
+  void unavailableEnableIf() __attribute__((enable_if(false, "")));
+  void unavailableEnableIf() __attribute__((unavailable("noooo"))); // expected-note 2{{marked unavailable here}}
+
+  void testUnavailable() {
+    FnTy Fn;
+    (void)(Fn == unavailableEnableIf); // expected-error{{is unavailable}}
+    (void)(Fn == &unavailableEnableIf); // expected-error{{is unavailable}}
+  }
+
+  class Foo {
+    static void staticAccessEnableIf(); // expected-note 2{{declared private here}}
+    void accessEnableIf(); // expected-note{{declared private here}}
+
+  public:
+    static void staticAccessEnableIf() __attribute__((enable_if(false, "")));
+    void accessEnableIf() __attribute__((enable_if(false, "")));
+  };
+
+  void testAccess() {
+    FnTy Fn;
+    (void)(Fn == Foo::staticAccessEnableIf); // expected-error{{is a private member}}
+    (void)(Fn == &Foo::staticAccessEnableIf); // expected-error{{is a private member}}
+
+    void (Foo::*MemFn)();
+    (void)(MemFn == &Foo::accessEnableIf); // expected-error{{is a private member}}
+  }
+}
diff --git a/test/SemaCXX/undefined-inline.cpp b/test/SemaCXX/undefined-inline.cpp
index 18973ef8b79c8..feb12f4552cab 100644
--- a/test/SemaCXX/undefined-inline.cpp
+++ b/test/SemaCXX/undefined-inline.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -triple i686-pc-win32 -verify %s
+// RUN: %clang_cc1 -fsyntax-only -triple i686-pc-win32 -verify -std=c++11 %s
 // PR14993
 
 namespace test1 {
@@ -61,3 +61,8 @@ namespace test11 {
   inline void bar() __attribute__((dllimport));
   void test() { foo(); bar(); }
 }
+
+namespace test12 {
+  template<typename> constexpr int _S_chk(int *);
+  decltype(_S_chk<int>(nullptr)) n;
+}
diff --git a/test/SemaCXX/undefined-internal.cpp b/test/SemaCXX/undefined-internal.cpp
index 29ca5de6d41cd..59e6fdf9af06c 100644
--- a/test/SemaCXX/undefined-internal.cpp
+++ b/test/SemaCXX/undefined-internal.cpp
@@ -82,6 +82,7 @@ namespace test5 {
     static int var; // expected-warning {{variable 'test5::B<test5::(anonymous namespace)::A>::var' has internal linkage but is not defined}}
     static void foo(); // expected-warning {{function 'test5::B<test5::(anonymous namespace)::A>::foo' has internal linkage but is not defined}}
   };
+  extern template int B<A>::var;
 
   void test() {
     B<A>::var = 0; // expected-note {{used here}}
diff --git a/test/SemaCXX/underlying_type.cpp b/test/SemaCXX/underlying_type.cpp
index 61208c72af048..dd019ae6e2775 100644
--- a/test/SemaCXX/underlying_type.cpp
+++ b/test/SemaCXX/underlying_type.cpp
@@ -55,3 +55,10 @@ namespace PR19966 {
     // expected-error@-2 {{constant expression}}
   };
 }
+
+template<typename T> void f(__underlying_type(T));
+template<typename T> void f(__underlying_type(T));
+enum E {};
+void PR26014() { f<E>(0); } // should not yield an ambiguity error.
+
+template<typename ...T> void f(__underlying_type(T) v); // expected-error {{declaration type contains unexpanded parameter pack 'T'}}
diff --git a/test/SemaCXX/unknown-anytype.cpp b/test/SemaCXX/unknown-anytype.cpp
index a07ec83d10a0e..95ad040934051 100644
--- a/test/SemaCXX/unknown-anytype.cpp
+++ b/test/SemaCXX/unknown-anytype.cpp
@@ -45,3 +45,14 @@ namespace test4 {
     int x = (int) test1; // expected-error {{function 'test1' with unknown type must be given a function type}}
   }
 }
+
+// rdar://problem/23959960
+namespace test5 {
+  template<typename T> struct X; // expected-note{{template is declared here}}
+
+  extern __unknown_anytype test0(...);
+
+  void test() {
+    (X<int>)test0(); // expected-error{{implicit instantiation of undefined template 'test5::X<int>'}}
+  }
+}
diff --git a/test/SemaCXX/unused.cpp b/test/SemaCXX/unused.cpp
index fbaf8c8bf3c12..09a179e7bb851 100644
--- a/test/SemaCXX/unused.cpp
+++ b/test/SemaCXX/unused.cpp
@@ -1,4 +1,6 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++98 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
 
 // PR4103 : Make sure we don't get a bogus unused expression warning
 namespace PR4103 {
@@ -28,8 +30,14 @@ namespace PR4103 {
 
 namespace derefvolatile {
   void f(volatile char* x) {
-    *x; // expected-warning {{expression result unused; assign into a variable to force a volatile load}}
-    (void)*x; // expected-warning {{expression result unused; assign into a variable to force a volatile load}}
+    *x;
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{expression result unused; assign into a variable to force a volatile load}}
+#endif
+    (void)*x;
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{expression result unused; assign into a variable to force a volatile load}}
+#endif
     volatile char y = 10;
     (void)y; // don't warn here, because it's a common pattern.
   }
diff --git a/test/SemaCXX/using-decl-1.cpp b/test/SemaCXX/using-decl-1.cpp
index ec45b3109a0b6..93f38f28e7785 100644
--- a/test/SemaCXX/using-decl-1.cpp
+++ b/test/SemaCXX/using-decl-1.cpp
@@ -243,6 +243,41 @@ namespace PR19171 {
   struct F : E {
     using E::EE; // expected-error-re {{no member named 'EE' in 'PR19171::E'{{$}}}}
   };
+
+  struct TypoDuplicate { // expected-note 0-4{{here}}
+    TypoDuplicate(int);
+    void foobar(); // expected-note 2{{here}}
+  };
+  struct TypoDuplicateDerived1 : TypoDuplicate {
+#if __cplusplus >= 201103L
+    using TypoDuplicate::TypoFuplicate; // expected-error {{did you mean 'TypoDuplicate'}} expected-note {{previous}}
+    using TypoDuplicate::TypoDuplicate; // expected-error {{redeclaration}}
+#endif
+    using TypoDuplicate::goobar; // expected-error {{did you mean 'foobar'}} expected-note {{previous}}
+    using TypoDuplicate::foobar; // expected-error {{redeclaration}}
+  };
+  struct TypoDuplicateDerived2 : TypoDuplicate {
+#if __cplusplus >= 201103L
+    using TypoFuplicate::TypoDuplicate; // expected-error {{did you mean 'TypoDuplicate'}} expected-note {{previous}}
+    using TypoDuplicate::TypoDuplicate; // expected-error {{redeclaration}}
+#endif
+  };
+  struct TypoDuplicateDerived3 : TypoDuplicate {
+#if __cplusplus >= 201103L
+    // FIXME: Don't suggest a correction that would lead to a redeclaration
+    // error here... or at least diagnose the error.
+    using TypoDuplicate::TypoDuplicate;
+    using TypoDuplicate::TypoFuplicate; // expected-error {{did you mean 'TypoDuplicate'}}
+#endif
+    using TypoDuplicate::foobar;
+    using TypoDuplicate::goobar; // expected-error {{did you mean 'foobar'}}
+  };
+  struct TypoDuplicateDerived4 : TypoDuplicate {
+#if __cplusplus >= 201103L
+    using TypoDuplicate::TypoDuplicate; // expected-note {{previous}}
+    using TypoFuplicate::TypoDuplicate; // expected-error {{did you mean 'TypoDuplicate'}} expected-error {{redeclaration}}
+#endif
+  };
 }
 
 namespace TypoCorrectTemplateMember {
@@ -338,3 +373,26 @@ struct B : A {
   enum { X = sizeof(field) };
 };
 }
+
+namespace tag_vs_var {
+  namespace N {
+    struct X {};
+
+    struct Y {};
+    int Y;
+
+    int Z;
+  }
+  using N::X;
+  using N::Y;
+  using N::Z;
+
+  namespace N {
+    int X;
+
+    struct Z {};
+  }
+  using N::X;
+  using N::Y;
+  using N::Z;
+}
diff --git a/test/SemaCXX/using-decl-templates.cpp b/test/SemaCXX/using-decl-templates.cpp
index 8314688bcbc7e..d766bb3ac6bf5 100644
--- a/test/SemaCXX/using-decl-templates.cpp
+++ b/test/SemaCXX/using-decl-templates.cpp
@@ -90,5 +90,5 @@ namespace aliastemplateinst {
   template<typename T> struct A { };
   template<typename T> using APtr = A<T*>; // expected-note{{previous use is here}}
 
-  template struct APtr<int>; // expected-error{{elaborated type refers to a non-tag type}}
+  template struct APtr<int>; // expected-error{{elaborated type refers to a type alias template}}
 }
diff --git a/test/SemaCXX/vararg-non-pod.cpp b/test/SemaCXX/vararg-non-pod.cpp
index 39d4cccacd355..1b7f3b68dc970 100644
--- a/test/SemaCXX/vararg-non-pod.cpp
+++ b/test/SemaCXX/vararg-non-pod.cpp
@@ -1,7 +1,11 @@
 // RUN: %clang_cc1 -fsyntax-only -verify -fblocks %s -Wno-error=non-pod-varargs
+// RUN: %clang_cc1 -fsyntax-only -verify -fblocks -std=c++98 %s -Wno-error=non-pod-varargs
+// RUN: %clang_cc1 -fsyntax-only -verify -fblocks -std=c++11 %s -Wno-error=non-pod-varargs
 
 // Check that the warning is still there under -fms-compatibility.
 // RUN: %clang_cc1 -fsyntax-only -verify -fblocks %s -Wno-error=non-pod-varargs -fms-compatibility
+// RUN: %clang_cc1 -fsyntax-only -verify -fblocks -std=c++98 %s -Wno-error=non-pod-varargs -fms-compatibility
+// RUN: %clang_cc1 -fsyntax-only -verify -fblocks -std=c++11 %s -Wno-error=non-pod-varargs -fms-compatibility
 
 extern char version[];
 
@@ -18,11 +22,19 @@ void t1()
 {
   C c(10);
   
-  g(10, c); // expected-warning{{cannot pass object of non-POD type 'C' through variadic function; call will abort at runtime}}
+  g(10, c);
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{cannot pass object of non-POD type 'C' through variadic function; call will abort at runtime}}
+#endif
+
   g(10, version);
 
   void (*ptr)(int, ...) = g;
-  ptr(10, c); // expected-warning{{cannot pass object of non-POD type 'C' through variadic function; call will abort at runtime}}
+  ptr(10, c);
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{cannot pass object of non-POD type 'C' through variadic function; call will abort at runtime}}
+#endif
+
   ptr(10, version);
 }
 
@@ -30,18 +42,34 @@ void t2()
 {
   C c(10);
 
-  c.g(10, c); // expected-warning{{cannot pass object of non-POD type 'C' through variadic method; call will abort at runtime}}
+  c.g(10, c);
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{cannot pass object of non-POD type 'C' through variadic method; call will abort at runtime}}
+#endif
+
   c.g(10, version);
 
   void (C::*ptr)(int, ...) = &C::g;
-  (c.*ptr)(10, c); // expected-warning{{cannot pass object of non-POD type 'C' through variadic method; call will abort at runtime}}
+  (c.*ptr)(10, c);
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{cannot pass object of non-POD type 'C' through variadic method; call will abort at runtime}}
+#endif
+
   (c.*ptr)(10, version);
  
-  C::h(10, c); // expected-warning{{cannot pass object of non-POD type 'C' through variadic function; call will abort at runtime}}
+  C::h(10, c);
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{cannot pass object of non-POD type 'C' through variadic function; call will abort at runtime}}
+#endif
+
   C::h(10, version);
 
   void (*static_ptr)(int, ...) = &C::h; 
-  static_ptr(10, c); // expected-warning{{cannot pass object of non-POD type 'C' through variadic function; call will abort at runtime}}
+  static_ptr(10, c);
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{cannot pass object of non-POD type 'C' through variadic function; call will abort at runtime}}
+#endif
+
   static_ptr(10, version);
 }
 
@@ -51,7 +79,11 @@ void t3()
 {
   C c(10);
   
-  block(10, c); // expected-warning{{cannot pass object of non-POD type 'C' through variadic block; call will abort at runtime}}
+  block(10, c);
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{cannot pass object of non-POD type 'C' through variadic block; call will abort at runtime}}
+#endif
+
   block(10, version);
 }
 
@@ -66,7 +98,11 @@ void t4()
 
   D d;
   
-  d(10, c); // expected-warning{{cannot pass object of non-POD type 'C' through variadic method; call will abort at runtime}}
+  d(10, c);
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{cannot pass object of non-POD type 'C' through variadic method; call will abort at runtime}}
+#endif
+
   d(10, version);
 }
 
@@ -78,10 +114,16 @@ void t5()
 {
   C c(10);
   
-  E e(10, c); // expected-warning{{cannot pass object of non-POD type 'C' through variadic constructor; call will abort at runtime}} \
-    // expected-error{{calling a private constructor of class 'E'}}
-  (void)E(10, c); // expected-warning{{cannot pass object of non-POD type 'C' through variadic constructor; call will abort at runtime}} \
-    // expected-error{{calling a private constructor of class 'E'}}
+  E e(10, c);
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{cannot pass object of non-POD type 'C' through variadic constructor; call will abort at runtime}}
+#endif
+  // expected-error@-4 {{calling a private constructor of class 'E'}}
+  (void)E(10, c);
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{cannot pass object of non-POD type 'C' through variadic constructor; call will abort at runtime}}
+#endif
+  // expected-error@-4 {{calling a private constructor of class 'E'}}
 
 }
 
@@ -103,7 +145,13 @@ Base &get_base(...);
 int eat_base(...);
 
 void test_typeid(Base &base) {
-  (void)typeid(get_base(base)); // expected-warning{{cannot pass object of non-POD type 'Base' through variadic function; call will abort at runtime}} expected-warning{{expression with side effects will be evaluated despite being used as an operand to 'typeid'}}
+  (void)typeid(get_base(base));
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{cannot pass object of non-POD type 'Base' through variadic function; call will abort at runtime}}
+#else
+  // expected-warning@-4 {{cannot pass object of non-trivial type 'Base' through variadic function; call will abort at runtime}}
+#endif
+  // expected-warning@-6 {{expression with side effects will be evaluated despite being used as an operand to 'typeid'}}
   (void)typeid(eat_base(base)); // okay
 }
 
@@ -136,7 +184,10 @@ void t8(int n, ...) {
 
 int t9(int n) {
   // Make sure the error works in potentially-evaluated sizeof
-  return (int)sizeof(*(Helper(Foo()), (int (*)[n])0)); // expected-warning{{cannot pass object of non-POD type}}
+  return (int)sizeof(*(Helper(Foo()), (int (*)[n])0));
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{cannot pass object of non-POD type 'Foo' through variadic function; call will abort at runtime}}
+#endif
 }
 
 // PR14057
@@ -173,22 +224,43 @@ namespace t11 {
   void test() {
     C c(10);
 
-    (get_f_ptr())(10, c); // expected-warning{{cannot pass object of non-POD type 'C' through variadic function; call will abort at runtime}}
+    (get_f_ptr())(10, c);
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{cannot pass object of non-POD type 'C' through variadic function; call will abort at runtime}}
+#endif
     (get_f_ptr())(10, version);
 
-    (c.*get_m_ptr())(10, c); // expected-warning{{cannot pass object of non-POD type 'C' through variadic method; call will abort at runtime}}
+    (c.*get_m_ptr())(10, c);
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{cannot pass object of non-POD type 'C' through variadic method; call will abort at runtime}}
+#endif
     (c.*get_m_ptr())(10, version);
 
-    (get_b_ptr())(10, c); // expected-warning{{cannot pass object of non-POD type 'C' through variadic block; call will abort at runtime}}
+    (get_b_ptr())(10, c);
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{cannot pass object of non-POD type 'C' through variadic block; call will abort at runtime}}
+#endif
+
     (get_b_ptr())(10, version);
 
-    (arr_f_ptr[3])(10, c); // expected-warning{{cannot pass object of non-POD type 'C' through variadic function; call will abort at runtime}}
+    (arr_f_ptr[3])(10, c);
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{cannot pass object of non-POD type 'C' through variadic function; call will abort at runtime}}
+#endif
+
     (arr_f_ptr[3])(10, version);
 
-    (c.*arr_m_ptr[3])(10, c); // expected-warning{{cannot pass object of non-POD type 'C' through variadic method; call will abort at runtime}}
+    (c.*arr_m_ptr[3])(10, c);
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{cannot pass object of non-POD type 'C' through variadic method; call will abort at runtime}}
+#endif
+
     (c.*arr_m_ptr[3])(10, version);
 
-    (arr_b_ptr[3])(10, c); // expected-warning{{cannot pass object of non-POD type 'C' through variadic block; call will abort at runtime}}
+    (arr_b_ptr[3])(10, c);
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{cannot pass object of non-POD type 'C' through variadic block; call will abort at runtime}}
+#endif
     (arr_b_ptr[3])(10, version);
   }
 }
diff --git a/test/SemaCXX/varargs.cpp b/test/SemaCXX/varargs.cpp
new file mode 100644
index 0000000000000..6a1883786a33a
--- /dev/null
+++ b/test/SemaCXX/varargs.cpp
@@ -0,0 +1,12 @@
+// RUN: %clang_cc1 -fsyntax-only -std=c++03 -verify %s
+
+class string;
+void f(const string& s, ...) {  // expected-note {{parameter of type 'const string &' is declared here}}
+  __builtin_va_list ap;
+  __builtin_va_start(ap, s); // expected-warning {{passing an object of reference type to 'va_start' has undefined behavior}}
+}
+
+void g(register int i, ...) {
+  __builtin_va_list ap;
+  __builtin_va_start(ap, i); // okay
+}
diff --git a/test/SemaCXX/vartemplate-lambda.cpp b/test/SemaCXX/vartemplate-lambda.cpp
new file mode 100644
index 0000000000000..9dab6da3d1e3d
--- /dev/null
+++ b/test/SemaCXX/vartemplate-lambda.cpp
@@ -0,0 +1,18 @@
+// RUN: %clang_cc1 -std=c++14 -fsyntax-only -verify %s
+// expected-no-diagnostics
+
+template <class> auto fn0 = [] {};
+template <typename> void foo0() { fn0<char>(); }
+
+template<typename T> auto fn1 = [](auto a) { return a + T(1); };
+
+template <typename X>
+int foo2() {
+  X a = 0x61;
+  fn1<char>(a);
+  return 0;
+}
+
+int main() {
+  foo2<int>();
+}
diff --git a/test/SemaCXX/vla-consruct.cpp b/test/SemaCXX/vla-consruct.cpp
new file mode 100644
index 0000000000000..09b73704eb077
--- /dev/null
+++ b/test/SemaCXX/vla-consruct.cpp
@@ -0,0 +1,48 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fcxx-exceptions -fexceptions -O0 -verify %s
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fcxx-exceptions -fexceptions -pedantic-errors -DPE -O0 -verify %s
+
+# ifndef PE
+// expected-no-diagnostics
+# endif
+
+extern "C" int printf(const char*, ...);
+
+static int N;
+struct S {
+  S() __attribute__ ((nothrow))  { printf("%d: S()\n", ++N); }
+  ~S()  __attribute__ ((nothrow))  { printf("%d: ~S()\n", N--); }
+  int n[17];
+};
+
+void print(int n, int a, int b, int c, int d) {
+  printf("n=%d\n,sizeof(S)=%d\nsizeof(array_t[0][0])=%d\nsizeof(array_t[0])=%d\nsizeof(array_t)=%d\n",
+         n, a, b, c, d);
+  if (n == 2) throw(n);
+}
+
+void test(int n) {
+  S array_t[n][n+1];
+# ifdef PE
+   // expected-error@-2 {{variable length arrays are a C99 feature}}
+   // expected-error@-3 {{variable length arrays are a C99 feature}}
+# endif
+  int sizeof_S = sizeof(S);
+  int sizeof_array_t_0_0 = sizeof(array_t[0][0]);
+  int sizeof_array_t_0 = sizeof(array_t[0]);
+  int sizeof_array_t = sizeof(array_t);
+  print(n, sizeof_S, sizeof_array_t_0_0, sizeof_array_t_0, sizeof_array_t);
+}
+
+int main()
+{
+  try {
+    test(2);
+  } catch(int e) {
+    printf("expeption %d\n", e);
+  }
+  try {
+    test(3);
+  } catch(int e) {
+    printf("expeption %d", e);
+  }
+}
diff --git a/test/SemaCXX/warn-bad-memaccess.cpp b/test/SemaCXX/warn-bad-memaccess.cpp
index 67cde10bf45dc..55ce4a0da031b 100644
--- a/test/SemaCXX/warn-bad-memaccess.cpp
+++ b/test/SemaCXX/warn-bad-memaccess.cpp
@@ -141,3 +141,16 @@ namespace N {
     N::memset(&x1, 0, sizeof x1);
   }
 }
+
+namespace recursive_class {
+struct S {
+  S v;
+  // expected-error@-1{{field has incomplete type 'recursive_class::S'}}
+  // expected-note@-3{{definition of 'recursive_class::S' is not complete until the closing '}'}}
+} a;
+
+int main() {
+  __builtin_memset(&a, 0, sizeof a);
+  return 0;
+}
+}
diff --git a/test/SemaCXX/warn-comma-operator.cpp b/test/SemaCXX/warn-comma-operator.cpp
new file mode 100644
index 0000000000000..3192f688f1ba0
--- /dev/null
+++ b/test/SemaCXX/warn-comma-operator.cpp
@@ -0,0 +1,278 @@
+// RUN: %clang_cc1 -fsyntax-only -Wcomma -std=c++11 -verify %s
+// RUN: %clang_cc1 -fsyntax-only -Wcomma -std=c++11 -fdiagnostics-parseable-fixits %s 2>&1 | FileCheck %s
+
+// Test builtin operators
+void test1() {
+  int x = 0, y = 0;
+  for (; y < 10; x++, y++) {}
+  for (; y < 10; ++x, y++) {}
+  for (; y < 10; x++, ++y) {}
+  for (; y < 10; ++x, ++y) {}
+  for (; y < 10; x--, ++y) {}
+  for (; y < 10; --x, ++y) {}
+  for (; y < 10; x = 5, ++y) {}
+  for (; y < 10; x *= 5, ++y) {}
+  for (; y < 10; x /= 5, ++y) {}
+  for (; y < 10; x %= 5, ++y) {}
+  for (; y < 10; x += 5, ++y) {}
+  for (; y < 10; x -= 5, ++y) {}
+  for (; y < 10; x <<= 5, ++y) {}
+  for (; y < 10; x >>= 5, ++y) {}
+  for (; y < 10; x &= 5, ++y) {}
+  for (; y < 10; x |= 5, ++y) {}
+  for (; y < 10; x ^= 5, ++y) {}
+}
+
+class S2 {
+public:
+  void advance();
+
+  S2 operator++();
+  S2 operator++(int);
+  S2 operator--();
+  S2 operator--(int);
+  S2 operator=(int);
+  S2 operator*=(int);
+  S2 operator/=(int);
+  S2 operator%=(int);
+  S2 operator+=(int);
+  S2 operator-=(int);
+  S2 operator<<=(int);
+  S2 operator>>=(int);
+  S2 operator&=(int);
+  S2 operator|=(int);
+  S2 operator^=(int);
+};
+
+// Test overloaded operators
+void test2() {
+  S2 x;
+  int y;
+  for (; y < 10; x++, y++) {}
+  for (; y < 10; ++x, y++) {}
+  for (; y < 10; x++, ++y) {}
+  for (; y < 10; ++x, ++y) {}
+  for (; y < 10; x--, ++y) {}
+  for (; y < 10; --x, ++y) {}
+  for (; y < 10; x = 5, ++y) {}
+  for (; y < 10; x *= 5, ++y) {}
+  for (; y < 10; x /= 5, ++y) {}
+  for (; y < 10; x %= 5, ++y) {}
+  for (; y < 10; x += 5, ++y) {}
+  for (; y < 10; x -= 5, ++y) {}
+  for (; y < 10; x <<= 5, ++y) {}
+  for (; y < 10; x >>= 5, ++y) {}
+  for (; y < 10; x &= 5, ++y) {}
+  for (; y < 10; x |= 5, ++y) {}
+  for (; y < 10; x ^= 5, ++y) {}
+}
+
+// Test nested comma operators
+void test3() {
+  int x1, x2, x3;
+  int y1, *y2 = 0, y3 = 5;
+  for (int z1 = 5, z2 = 4, z3 = 3; x1 <4; ++x1) {}
+}
+
+class Stream {
+ public:
+  Stream& operator<<(int);
+} cout;
+
+int return_four() { return 5; }
+
+// Confusing "," for "<<"
+void test4() {
+  cout << 5 << return_four();
+  cout << 5, return_four();
+  // expected-warning@-1{{comma operator}}
+  // expected-note@-2{{cast expression to void}}
+  // CHECK: fix-it:{{.*}}:{[[@LINE-3]]:3-[[@LINE-3]]:3}:"static_cast<void>("
+  // CHECK: fix-it:{{.*}}:{[[@LINE-4]]:12-[[@LINE-4]]:12}:")"
+}
+
+// Confusing "," for "=="
+void test5() {
+  if (return_four(), 5) {}
+  // expected-warning@-1{{comma operator}}
+  // expected-note@-2{{cast expression to void}}
+  // CHECK: fix-it:{{.*}}:{[[@LINE-3]]:7-[[@LINE-3]]:7}:"static_cast<void>("
+  // CHECK: fix-it:{{.*}}:{[[@LINE-4]]:20-[[@LINE-4]]:20}:")"
+
+  if (return_four() == 5) {}
+}
+
+// Confusing "," for "+"
+int test6() {
+  return return_four(), return_four();
+  // expected-warning@-1{{comma operator}}
+  // expected-note@-2{{cast expression to void}}
+  // CHECK: fix-it:{{.*}}:{[[@LINE-3]]:10-[[@LINE-3]]:10}:"static_cast<void>("
+  // CHECK: fix-it:{{.*}}:{[[@LINE-4]]:23-[[@LINE-4]]:23}:")"
+
+  return return_four() + return_four();
+}
+
+void Concat(int);
+void Concat(int, int);
+
+// Testing extra parentheses in function call
+void test7() {
+  Concat((return_four() , 5));
+  // expected-warning@-1{{comma operator}}
+  // expected-note@-2{{cast expression to void}}
+  // CHECK: fix-it:{{.*}}:{[[@LINE-3]]:11-[[@LINE-3]]:11}:"static_cast<void>("
+  // CHECK: fix-it:{{.*}}:{[[@LINE-4]]:24-[[@LINE-4]]:24}:")"
+
+  Concat(return_four() , 5);
+}
+
+// Be sure to look through parentheses
+void test8() {
+  int x, y;
+  for (x = 0; return_four(), x;) {}
+  // expected-warning@-1{{comma operator}}
+  // expected-note@-2{{cast expression to void}}
+  // CHECK: fix-it:{{.*}}:{[[@LINE-3]]:15-[[@LINE-3]]:15}:"static_cast<void>("
+  // CHECK: fix-it:{{.*}}:{[[@LINE-4]]:28-[[@LINE-4]]:28}:")"
+
+  for (x = 0; (return_four()), (x) ;) {}
+  // expected-warning@-1{{comma operator}}
+  // expected-note@-2{{cast expression to void}}
+  // CHECK: fix-it:{{.*}}:{[[@LINE-3]]:15-[[@LINE-3]]:15}:"static_cast<void>("
+  // CHECK: fix-it:{{.*}}:{[[@LINE-4]]:30-[[@LINE-4]]:30}:")"
+}
+
+bool DoStuff();
+class S9 {
+public:
+ bool Advance();
+ bool More();
+};
+
+// Ignore comma operator in for-loop initializations and increments.
+void test9() {
+  int x, y;
+  for (x = 0, y = 5; x < y; ++x) {}
+  for (x = 0; x < 10; DoStuff(), ++x) {}
+  for (S9 s; s.More(); s.Advance(), ++x) {}
+}
+
+void test10() {
+  int x, y;
+  ++x, ++y;
+  // expected-warning@-1{{comma operator}}
+  // expected-note@-2{{cast expression to void}}
+  // CHECK: fix-it:{{.*}}:{[[@LINE-3]]:3-[[@LINE-3]]:3}:"static_cast<void>("
+  // CHECK: fix-it:{{.*}}:{[[@LINE-4]]:6-[[@LINE-4]]:6}:")"
+}
+
+// Ignore comma operator in templates.
+namespace test11 {
+template <bool T>
+struct B { static const bool value = T; };
+
+typedef B<true> true_type;
+typedef B<false> false_type;
+
+template <bool...>
+struct bool_seq;
+
+template <typename... xs>
+class Foo {
+  typedef bool_seq<(xs::value, true)...> all_true;
+  typedef bool_seq<(xs::value, false)...> all_false;
+  typedef bool_seq<xs::value...> seq;
+};
+
+const auto X = Foo<true_type>();
+}
+
+namespace test12 {
+class Mutex {
+ public:
+  Mutex();
+  ~Mutex();
+};
+class MutexLock {
+public:
+  MutexLock(Mutex &);
+  MutexLock();
+  ~MutexLock();
+};
+class BuiltinMutex {
+  Mutex M;
+};
+Mutex StatusMutex;
+bool Status;
+
+bool get_status() {
+  return (MutexLock(StatusMutex), Status);
+  // expected-warning@-1{{comma operator}}
+  // expected-note@-2{{cast expression to void}}
+  // CHECK: fix-it:{{.*}}:{[[@LINE-3]]:11-[[@LINE-3]]:11}:"static_cast<void>("
+  // CHECK: fix-it:{{.*}}:{[[@LINE-4]]:33-[[@LINE-4]]:33}:")"
+  return (MutexLock(), Status);
+  // expected-warning@-1{{comma operator}}
+  // expected-note@-2{{cast expression to void}}
+  // CHECK: fix-it:{{.*}}:{[[@LINE-3]]:11-[[@LINE-3]]:11}:"static_cast<void>("
+  // CHECK: fix-it:{{.*}}:{[[@LINE-4]]:22-[[@LINE-4]]:22}:")"
+  return (BuiltinMutex(), Status);
+  // expected-warning@-1{{comma operator}}
+  // expected-note@-2{{cast expression to void}}
+  // CHECK: fix-it:{{.*}}:{[[@LINE-3]]:11-[[@LINE-3]]:11}:"static_cast<void>("
+  // CHECK: fix-it:{{.*}}:{[[@LINE-4]]:25-[[@LINE-4]]:25}:")"
+}
+}
+
+// Check for comma operator in conditions.
+void test13(int x) {
+  x = (return_four(), x);
+  // expected-warning@-1{{comma operator}}
+  // expected-note@-2{{cast expression to void}}
+  // CHECK: fix-it:{{.*}}:{[[@LINE-3]]:8-[[@LINE-3]]:8}:"static_cast<void>("
+  // CHECK: fix-it:{{.*}}:{[[@LINE-4]]:21-[[@LINE-4]]:21}:")"
+
+  int y = (return_four(), x);
+  // expected-warning@-1{{comma operator}}
+  // expected-note@-2{{cast expression to void}}
+  // CHECK: fix-it:{{.*}}:{[[@LINE-3]]:12-[[@LINE-3]]:12}:"static_cast<void>("
+  // CHECK: fix-it:{{.*}}:{[[@LINE-4]]:25-[[@LINE-4]]:25}:")"
+
+  for (; return_four(), x;) {}
+  // expected-warning@-1{{comma operator}}
+  // expected-note@-2{{cast expression to void}}
+  // CHECK: fix-it:{{.*}}:{[[@LINE-3]]:10-[[@LINE-3]]:10}:"static_cast<void>("
+  // CHECK: fix-it:{{.*}}:{[[@LINE-4]]:23-[[@LINE-4]]:23}:")"
+
+  while (return_four(), x) {}
+  // expected-warning@-1{{comma operator}}
+  // expected-note@-2{{cast expression to void}}
+  // CHECK: fix-it:{{.*}}:{[[@LINE-3]]:10-[[@LINE-3]]:10}:"static_cast<void>("
+  // CHECK: fix-it:{{.*}}:{[[@LINE-4]]:23-[[@LINE-4]]:23}:")"
+
+  if (return_four(), x) {}
+  // expected-warning@-1{{comma operator}}
+  // expected-note@-2{{cast expression to void}}
+  // CHECK: fix-it:{{.*}}:{[[@LINE-3]]:7-[[@LINE-3]]:7}:"static_cast<void>("
+  // CHECK: fix-it:{{.*}}:{[[@LINE-4]]:20-[[@LINE-4]]:20}:")"
+
+  do { } while (return_four(), x);
+  // expected-warning@-1{{comma operator}}
+  // expected-note@-2{{cast expression to void}}
+  // CHECK: fix-it:{{.*}}:{[[@LINE-3]]:17-[[@LINE-3]]:17}:"static_cast<void>("
+  // CHECK: fix-it:{{.*}}:{[[@LINE-4]]:30-[[@LINE-4]]:30}:")"
+}
+
+// Nested comma operator with fix-its.
+void test14() {
+  return_four(), return_four(), return_four(), return_four();
+  // expected-warning@-1 3{{comma operator}}
+  // expected-note@-2 3{{cast expression to void}}
+  // CHECK: fix-it:{{.*}}:{[[@LINE-3]]:3-[[@LINE-3]]:3}:"static_cast<void>("
+  // CHECK: fix-it:{{.*}}:{[[@LINE-4]]:16-[[@LINE-4]]:16}:")"
+  // CHECK: fix-it:{{.*}}:{[[@LINE-5]]:18-[[@LINE-5]]:18}:"static_cast<void>("
+  // CHECK: fix-it:{{.*}}:{[[@LINE-6]]:31-[[@LINE-6]]:31}:")"
+  // CHECK: fix-it:{{.*}}:{[[@LINE-7]]:33-[[@LINE-7]]:33}:"static_cast<void>("
+  // CHECK: fix-it:{{.*}}:{[[@LINE-8]]:46-[[@LINE-8]]:46}:")"
+}
diff --git a/test/SemaCXX/warn-float-conversion.cpp b/test/SemaCXX/warn-float-conversion.cpp
index 22c33040b26b8..fc221893aeeea 100644
--- a/test/SemaCXX/warn-float-conversion.cpp
+++ b/test/SemaCXX/warn-float-conversion.cpp
@@ -1,5 +1,10 @@
-// RUN: %clang_cc1 -verify -fsyntax-only %s -Wfloat-conversion
+// RUN: %clang_cc1 -verify -fsyntax-only -triple x86_64-pc-linux-gnu %s -Wno-literal-conversion -Wfloat-conversion -DFLOAT_CONVERSION -DZERO -DBOOL -DCONSTANT_BOOL -DOVERFLOW
+// RUN: %clang_cc1 -verify -fsyntax-only -triple x86_64-pc-linux-gnu %s -Wno-conversion -Wfloat-overflow-conversion -DOVERFLOW
+// RUN: %clang_cc1 -verify -fsyntax-only -triple x86_64-pc-linux-gnu %s -Wno-conversion -Wfloat-zero-conversion -DZERO
 
+float ReturnFloat();
+
+#ifdef FLOAT_CONVERSION
 bool ReturnBool(float f) {
   return f;  //expected-warning{{conversion}}
 }
@@ -36,3 +41,49 @@ void Convert(float f, double d, long double ld) {
   l = ld;  //expected-warning{{conversion}}
 }
 
+void Test() {
+  int a1 = 10.0/2.0;  //expected-warning{{conversion}}
+  int a2 = 1.0/2.0;  //expected-warning{{conversion}}
+  bool a3 = ReturnFloat();  //expected-warning{{conversion}}
+  int a4 = 1e30 + 1;  //expected-warning{{conversion}}
+}
+
+void TestConstantFloat() {
+  // Don't warn on exact floating literals.
+  int a1 = 5.0;
+  int a2 = 1e3;
+
+  int a3 = 5.5;  // caught by -Wliteral-conversion
+  int a4 = 500.44;  // caught by -Wliteral-convserion
+
+  int b1 = 5.0 / 1.0;  //expected-warning{{conversion}}
+  int b2 = 5.0 / 2.0;  //expected-warning{{conversion}}
+
+  const float five = 5.0;
+
+  int b3 = five / 1.0;  //expected-warning{{conversion}}
+  int b4 = five / 2.0;  //expected-warning{{conversion}}
+}
+#endif  // FLOAT_CONVERSION
+
+#ifdef ZERO
+void TestZero() {
+  const float half = .5;
+  int a1 = half;  // expected-warning{{implicit conversion from 'const float' to 'int' changes non-zero value from 0.5 to 0}}
+  int a2 = 1.0 / 2.0;  // expected-warning{{implicit conversion from 'double' to 'int' changes non-zero value from 0.5 to 0}}
+  int a3 = 5;
+}
+#endif  // ZERO
+
+#ifdef OVERFLOW
+void TestOverflow() {
+  char a = 500.0;  // caught by -Wliteral-conversion
+  char b = -500.0;  // caught by -Wliteral-conversion
+
+  const float LargeNumber = 1024;
+  char c = LargeNumber;  // expected-warning{{implicit conversion of out of range value from 'const float' to 'char' changes value from 1024 to 127}}
+  char d = 400.0 + 400.0;  // expected-warning{{implicit conversion of out of range value from 'double' to 'char' changes value from 800 to 127}}
+
+  char e = 1.0 / 0.0;  // expected-warning{{implicit conversion of out of range value from 'double' to 'char' changes value from +Inf to 127}}
+}
+#endif  // OVERFLOW
diff --git a/test/SemaCXX/warn-literal-conversion.cpp b/test/SemaCXX/warn-literal-conversion.cpp
index 5d4b6f7f5bf85..875aa1dec5efd 100644
--- a/test/SemaCXX/warn-literal-conversion.cpp
+++ b/test/SemaCXX/warn-literal-conversion.cpp
@@ -25,7 +25,7 @@ void test0() {
   // Test passing a literal floating-point value to a function that takes an integer.
   foo(1.2F); // expected-warning {{implicit conversion from 'float' to 'int' changes value from 1.2 to 1}}
 
-  int y10 = -1.2F;  // expected-warning {{implicit conversion from 'float' to 'int' changes value from 1.2 to 1}}
+  int y10 = -1.2F;  // expected-warning {{implicit conversion from 'float' to 'int' changes value from -1.2 to -1}}
 
   // -Wliteral-conversion does NOT catch const values.
   // (-Wconversion DOES catch them.)
diff --git a/test/SemaCXX/warn-loop-analysis.cpp b/test/SemaCXX/warn-loop-analysis.cpp
index c666c48fc017f..25ec7a7862edc 100644
--- a/test/SemaCXX/warn-loop-analysis.cpp
+++ b/test/SemaCXX/warn-loop-analysis.cpp
@@ -260,3 +260,9 @@ void test8() {
     i--;
   }
 }
+
+int f(int);
+void test9() {
+  // Don't warn when variable is defined by the loop condition.
+  for (int i = 0; int x = f(i); ++i) {}
+}
diff --git a/test/SemaCXX/warn-shadow.cpp b/test/SemaCXX/warn-shadow.cpp
index 5ad2233d234a0..9d68fe7c47a66 100644
--- a/test/SemaCXX/warn-shadow.cpp
+++ b/test/SemaCXX/warn-shadow.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -verify -fsyntax-only -Wshadow %s
+// RUN: %clang_cc1 -verify -fsyntax-only -Wshadow-all %s
 
 namespace {
   int i; // expected-note {{previous declaration is here}}
@@ -29,7 +29,23 @@ void foo() {
 
 class A {
   static int data; // expected-note {{previous declaration}}
-  int field; // expected-note {{previous declaration}}
+  // expected-note@+1 {{previous declaration}}
+  int field;
+  int f1, f2, f3, f4; // expected-note 8 {{previous declaration is here}}
+
+  // The initialization is safe, but the modifications are not.
+  A(int f1, int f2, int f3, int f4) // expected-note-re 4 {{variable 'f{{[0-4]}}' is declared here}}
+	  : f1(f1) {
+    f1 = 3; // expected-warning {{modifying constructor parameter 'f1' that shadows a field of 'A'}}
+    f1 = 4; // one warning per shadow
+    f2++; // expected-warning {{modifying constructor parameter 'f2' that shadows a field of 'A'}}
+    --f3; // expected-warning {{modifying constructor parameter 'f3' that shadows a field of 'A'}}
+    f4 += 2; // expected-warning {{modifying constructor parameter 'f4' that shadows a field of 'A'}}
+  }
+
+  // The initialization is safe, but the modifications are not.
+  // expected-warning-re@+1 4 {{constructor parameter 'f{{[0-4]}}' shadows the field 'f{{[0-9]}}' of 'A'}}
+  A(int f1, int f2, int f3, int f4, double overload_dummy) {}
 
   void test() {
     char *field; // expected-warning {{declaration shadows a field of 'A'}}
diff --git a/test/SemaCXX/warn-unused-private-field.cpp b/test/SemaCXX/warn-unused-private-field.cpp
index 932a7dcea1daa..fb34fa98eaf67 100644
--- a/test/SemaCXX/warn-unused-private-field.cpp
+++ b/test/SemaCXX/warn-unused-private-field.cpp
@@ -128,6 +128,7 @@ class EverythingUsed {
     int *use = &by_reference_;
     int test[2];
     test[as_array_index_] = 42;
+    int EverythingUsed::*ptr = &EverythingUsed::by_pointer_to_member_;
   }
 
   template<class T>
@@ -142,6 +143,7 @@ class EverythingUsed {
   int by_template_function_;
   int as_array_index_;
   int by_initializer_;
+  int by_pointer_to_member_;
 };
 
 class HasFeatureTest {
diff --git a/test/SemaCXX/warn-unused-value.cpp b/test/SemaCXX/warn-unused-value.cpp
index efabd50630681..d6ec0fb5d1cdb 100644
--- a/test/SemaCXX/warn-unused-value.cpp
+++ b/test/SemaCXX/warn-unused-value.cpp
@@ -1,4 +1,6 @@
 // RUN: %clang_cc1 -fsyntax-only -verify -Wunused-value %s
+// RUN: %clang_cc1 -fsyntax-only -verify -Wunused-value -std=c++98 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -Wunused-value -std=c++11 %s
 
 // PR4806
 namespace test0 {
@@ -12,7 +14,10 @@ namespace test0 {
     // pointer to volatile has side effect (thus no warning)
     Box* box = new Box;
     box->i; // expected-warning {{expression result unused}}
-    box->j; // expected-warning {{expression result unused}}
+    box->j;
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{expression result unused}}
+#endif
   }
 }
 
diff --git a/test/SemaObjC/Inputs/arc-system-header.h b/test/SemaObjC/Inputs/arc-system-header.h
index 5012a2a378374..9decc5efce6e9 100644
--- a/test/SemaObjC/Inputs/arc-system-header.h
+++ b/test/SemaObjC/Inputs/arc-system-header.h
@@ -50,3 +50,8 @@ extern struct Test6 *const kMagicConstant;
 static inline void *test8(id ptr) {
   return (__bridge_retain void*) ptr;
 }
+
+typedef struct {
+  const char *name;
+  id field;
+} Test9;
diff --git a/test/SemaObjC/arc-objc-lifetime-conflict.m b/test/SemaObjC/arc-objc-lifetime-conflict.m
new file mode 100644
index 0000000000000..218950ca7b501
--- /dev/null
+++ b/test/SemaObjC/arc-objc-lifetime-conflict.m
@@ -0,0 +1,32 @@
+// RUN: %clang_cc1 -triple x86_64-apple-darwin11 -fobjc-arc -fobjc-runtime-has-weak %s -emit-llvm -o - | FileCheck %s
+
+// CHECK: bitcast {{.*}} %self_weak_s_w_s
+// CHECK-NEXT: objc_destroyWeak
+// CHECK-NEXT: bitcast {{.*}} %self_strong_w_s
+// CHECK-NEXT: objc_storeStrong
+// CHECK-NEXT: bitcast {{.*}} %self_weak_s
+// CHECK-NEXT: objc_destroyWeak
+// CHECK-NEXT: bitcast {{.*}} %self_weak_s3
+// CHECK-NEXT: objc_destroyWeak
+// CHECK-NEXT: bitcast {{.*}} %self_strong3
+// CHECK-NEXT: objc_storeStrong
+// CHECK-NEXT: bitcast {{.*}} %self_strong2
+// CHECK-NEXT: objc_storeStrong
+// CHECK-NEXT: bitcast {{.*}} %self_strong
+// CHECK-NEXT: objc_storeStrong
+@interface NSObject
+@end
+@interface A : NSObject
+@end
+@implementation A
+- (void)test {
+  __attribute__((objc_ownership(strong))) __typeof__(self) self_strong;
+  __attribute__((objc_ownership(strong))) __typeof__(self_strong) self_strong2;
+  __attribute__((objc_ownership(strong))) __typeof__(self_strong2) self_strong3;
+  __attribute__((objc_ownership(weak))) __typeof__(self_strong3) self_weak_s3;
+ 
+  __attribute__((objc_ownership(weak))) __typeof__(self_strong) self_weak_s;
+  __attribute__((objc_ownership(strong))) __typeof__(self_weak_s) self_strong_w_s;
+  __attribute__((objc_ownership(weak))) __typeof__(self_strong_w_s) self_weak_s_w_s;
+}
+@end
diff --git a/test/SemaObjC/arc-repeated-weak.mm b/test/SemaObjC/arc-repeated-weak.mm
index 264c598942aec..11161a0bf7fc0 100644
--- a/test/SemaObjC/arc-repeated-weak.mm
+++ b/test/SemaObjC/arc-repeated-weak.mm
@@ -439,3 +439,26 @@ void doubleLevelAccessIvar(Test *a, Test *b) {
 }
 @end
 
+// This used to crash in WeakObjectProfileTy::getBaseInfo when getBase() was
+// called on an ObjCPropertyRefExpr object whose receiver was an interface.
+
+@class NSString;
+@interface NSBundle
++(NSBundle *)foo;
+@property (class) NSBundle *foo2;
+@property NSString *prop;
+@property(weak) NSString *weakProp;
+@end
+
+@interface NSBundle2 : NSBundle
+@end
+
+void foo() {
+  NSString * t = NSBundle.foo.prop;
+  use(NSBundle.foo.weakProp); // expected-warning{{weak property 'weakProp' may be accessed multiple times}}
+  use(NSBundle2.foo.weakProp); // expected-note{{also accessed here}}
+
+  NSString * t2 = NSBundle.foo2.prop;
+  use(NSBundle.foo2.weakProp); // expected-warning{{weak property 'weakProp' may be accessed multiple times}}
+  use(NSBundle2.foo2.weakProp); // expected-note{{also accessed here}}
+}
diff --git a/test/SemaObjC/arc-system-header.m b/test/SemaObjC/arc-system-header.m
index acfd9a8585e37..68230e74b2d57 100644
--- a/test/SemaObjC/arc-system-header.m
+++ b/test/SemaObjC/arc-system-header.m
@@ -46,6 +46,13 @@ void test7(Test7 *p) {
 // expected-note@arc-system-header.h:41 4 {{declaration uses type that is ill-formed in ARC}}
 // expected-note@arc-system-header.h:41 2 {{property 'prop' is declared unavailable here}}
 }
+
+extern void doSomething(Test9 arg);
+void test9() {
+    Test9 foo2 = {0, 0}; // expected-error {{'field' is unavailable in ARC}}
+                         // expected-note@arc-system-header.h:56 {{field has non-trivial ownership qualification}}
+    doSomething(foo2);
+}
 #endif
 
 // test8 in header
diff --git a/test/SemaObjC/attr-availability-1.m b/test/SemaObjC/attr-availability-1.m
index d694cbd80da6e..d3d4000378530 100644
--- a/test/SemaObjC/attr-availability-1.m
+++ b/test/SemaObjC/attr-availability-1.m
@@ -25,19 +25,19 @@
 // rdar://11475360
 @interface B : A
 - (void)method; // NOTE: we expect 'method' to *not* inherit availability.
-- (void)overridden __attribute__((availability(macosx,introduced=10_4))); // expected-warning{{overriding method introduced after overridden method on OS X (10_4 vs. 10_3)}}
+- (void)overridden __attribute__((availability(macosx,introduced=10_4))); // expected-warning{{overriding method introduced after overridden method on macOS (10_4 vs. 10_3)}}
 - (void)overridden2 __attribute__((availability(macosx,introduced=10_2)));
 - (void)overridden3 __attribute__((availability(macosx,deprecated=10_4)));
-- (void)overridden4 __attribute__((availability(macosx,deprecated=10_2))); // expected-warning{{overriding method deprecated before overridden method on OS X (10_3 vs. 10_2)}}
+- (void)overridden4 __attribute__((availability(macosx,deprecated=10_2))); // expected-warning{{overriding method deprecated before overridden method on macOS (10_3 vs. 10_2)}}
 - (void)overridden5 __attribute__((availability(macosx,introduced=10_3)));
-- (void)overridden6 __attribute__((availability(macosx,unavailable))); // expected-warning{{overriding method cannot be unavailable on OS X when its overridden method is available}}
+- (void)overridden6 __attribute__((availability(macosx,unavailable))); // expected-warning{{overriding method cannot be unavailable on macOS when its overridden method is available}}
 @end
 
 void f(A *a, B *b) {
-  [a method]; // expected-warning{{'method' is deprecated: first deprecated in OS X 10.2}}
+  [a method]; // expected-warning{{'method' is deprecated: first deprecated in macOS 10.2}}
   [b method]; // no-warning
-  [a proto_method]; // expected-warning{{'proto_method' is deprecated: first deprecated in OS X 10.2}}
-  [b proto_method]; // expected-warning{{'proto_method' is deprecated: first deprecated in OS X 10.2}}
+  [a proto_method]; // expected-warning{{'proto_method' is deprecated: first deprecated in macOS 10.2}}
+  [b proto_method]; // expected-warning{{'proto_method' is deprecated: first deprecated in macOS 10.2}}
 }
 
 // Test case for <rdar://problem/11627873>.  Warn about
@@ -57,7 +57,7 @@ void f(A *a, B *b) {
 
 @implementation D
 - (void) method {
-  [super method]; // expected-warning {{'method' is deprecated: first deprecated in OS X 10.2}}
+  [super method]; // expected-warning {{'method' is deprecated: first deprecated in macOS 10.2}}
 }
 @end
 
@@ -112,9 +112,9 @@ id NSNibOwner, topNibObjects;
 @end
 
 void foo (A18804883* pa) {
-  [pa interface_method]; // expected-error {{'interface_method' is unavailable: not available on OS X}}
+  [pa interface_method]; // expected-error {{'interface_method' is unavailable: not available on macOS}}
   [pa proto_method];
-  [pa strange_method]; // expected-error {{'strange_method' is unavailable: not available on OS X}}
+  [pa strange_method]; // expected-error {{'strange_method' is unavailable: not available on macOS}}
   [pa always_available];
 }
 
diff --git a/test/SemaObjC/attr-availability.m b/test/SemaObjC/attr-availability.m
index dad2d5b7e794d..6cbb3cc22d2f6 100644
--- a/test/SemaObjC/attr-availability.m
+++ b/test/SemaObjC/attr-availability.m
@@ -30,32 +30,32 @@
 @interface B : A
 - (void)method; // NOTE: we expect 'method' to *not* inherit availability.
 - (void)partialMethod; // Likewise.
-- (void)overridden __attribute__((availability(macosx,introduced=10.4))); // expected-warning{{overriding method introduced after overridden method on OS X (10.4 vs. 10.3)}}
+- (void)overridden __attribute__((availability(macosx,introduced=10.4))); // expected-warning{{overriding method introduced after overridden method on macOS (10.4 vs. 10.3)}}
 - (void)overridden2 __attribute__((availability(macosx,introduced=10.2)));
 - (void)overridden3 __attribute__((availability(macosx,deprecated=10.4)));
-- (void)overridden4 __attribute__((availability(macosx,deprecated=10.2))); // expected-warning{{overriding method deprecated before overridden method on OS X (10.3 vs. 10.2)}}
+- (void)overridden4 __attribute__((availability(macosx,deprecated=10.2))); // expected-warning{{overriding method deprecated before overridden method on macOS (10.3 vs. 10.2)}}
 - (void)overridden5 __attribute__((availability(macosx,introduced=10.3)));
-- (void)overridden6 __attribute__((availability(macosx,unavailable))); // expected-warning{{overriding method cannot be unavailable on OS X when its overridden method is available}}
+- (void)overridden6 __attribute__((availability(macosx,unavailable))); // expected-warning{{overriding method cannot be unavailable on macOS when its overridden method is available}}
 - (void)unavailableMethod; // does *not* inherit unavailability
 @end
 
 void f(A *a, B *b) {
-  [a method]; // expected-warning{{'method' is deprecated: first deprecated in OS X 10.2}}
+  [a method]; // expected-warning{{'method' is deprecated: first deprecated in macOS 10.2}}
   [b method]; // no-warning
-  [a proto_method]; // expected-warning{{'proto_method' is deprecated: first deprecated in OS X 10.2}}
-  [b proto_method]; // expected-warning{{'proto_method' is deprecated: first deprecated in OS X 10.2}}
+  [a proto_method]; // expected-warning{{'proto_method' is deprecated: first deprecated in macOS 10.2}}
+  [b proto_method]; // expected-warning{{'proto_method' is deprecated: first deprecated in macOS 10.2}}
 
 #if defined(WARN_PARTIAL)
-  // expected-warning@+2 {{'partialMethod' is partial: introduced in OS X 10.8}} expected-note@+2 {{explicitly redeclare 'partialMethod' to silence this warning}}
+  // expected-warning@+2 {{'partialMethod' is partial: introduced in macOS 10.8}} expected-note@+2 {{explicitly redeclare 'partialMethod' to silence this warning}}
 #endif
   [a partialMethod];
   [b partialMethod];  // no warning
 #if defined(WARN_PARTIAL)
-  // expected-warning@+2 {{'partial_proto_method' is partial: introduced in OS X 10.8}} expected-note@+2 {{explicitly redeclare 'partial_proto_method' to silence this warning}}
+  // expected-warning@+2 {{'partial_proto_method' is partial: introduced in macOS 10.8}} expected-note@+2 {{explicitly redeclare 'partial_proto_method' to silence this warning}}
 #endif
   [a partial_proto_method];
 #if defined(WARN_PARTIAL)
-  // expected-warning@+2 {{'partial_proto_method' is partial: introduced in OS X 10.8}} expected-note@+2 {{explicitly redeclare 'partial_proto_method' to silence this warning}}
+  // expected-warning@+2 {{'partial_proto_method' is partial: introduced in macOS 10.8}} expected-note@+2 {{explicitly redeclare 'partial_proto_method' to silence this warning}}
 #endif
   [b partial_proto_method];
 }
@@ -89,7 +89,7 @@ void f_after_redecl(A *a, B *b) {
 
 @implementation D
 - (void) method {
-  [super method]; // expected-warning {{'method' is deprecated: first deprecated in OS X 10.2}}
+  [super method]; // expected-warning {{'method' is deprecated: first deprecated in macOS 10.2}}
 }
 @end
 
@@ -163,14 +163,14 @@ void partialfun(PartialI* a) {
   [a partialMethod]; // no warning
   [a ipartialMethod1]; // no warning
 #if defined(WARN_PARTIAL)
-  // expected-warning@+2 {{'ipartialMethod2' is partial: introduced in OS X 10.8}} expected-note@+2 {{explicitly redeclare 'ipartialMethod2' to silence this warning}}
+  // expected-warning@+2 {{'ipartialMethod2' is partial: introduced in macOS 10.8}} expected-note@+2 {{explicitly redeclare 'ipartialMethod2' to silence this warning}}
 #endif
   [a ipartialMethod2];
   [a ppartialMethod]; // no warning
   [PartialI partialMethod]; // no warning
   [PartialI ipartialMethod1]; // no warning
 #if defined(WARN_PARTIAL)
-  // expected-warning@+2 {{'ipartialMethod2' is partial: introduced in OS X 10.8}} expected-note@+2 {{explicitly redeclare 'ipartialMethod2' to silence this warning}}
+  // expected-warning@+2 {{'ipartialMethod2' is partial: introduced in macOS 10.8}} expected-note@+2 {{explicitly redeclare 'ipartialMethod2' to silence this warning}}
 #endif
   [PartialI ipartialMethod2];
   [PartialI ppartialMethod]; // no warning
@@ -183,7 +183,7 @@ __attribute__((availability(macosx, introduced = 10.8))) @interface PartialI2
 @end
 
 #if defined(WARN_PARTIAL)
-  // expected-warning@+2 {{'PartialI2' is partial: introduced in OS X 10.8}} expected-note@+2 {{explicitly redeclare 'PartialI2' to silence this warning}}
+  // expected-warning@+2 {{'PartialI2' is partial: introduced in macOS 10.8}} expected-note@+2 {{explicitly redeclare 'PartialI2' to silence this warning}}
 #endif
 void partialinter1(PartialI2* p) {
 }
@@ -220,7 +220,7 @@ void use_myEnum() {
 @end
 
 void testAvailabilityP2(id<AvailabilityP2> obj) {
-  [obj methodA]; // expected-warning{{'methodA' is deprecated: first deprecated in OS X 10.2}}
+  [obj methodA]; // expected-warning{{'methodA' is deprecated: first deprecated in macOS 10.2}}
   [obj methodB]; // expected-error{{'methodB' is unavailable}}
 }
 
@@ -242,13 +242,13 @@ __attribute__((objc_root_class))
 -(void)methodA {
   // Make sure we're not inheriting availability.
   id<AvailabilityP2> obj = self;
-  [obj methodA]; // expected-warning{{'methodA' is deprecated: first deprecated in OS X 10.2}}
+  [obj methodA]; // expected-warning{{'methodA' is deprecated: first deprecated in macOS 10.2}}
   [obj methodB]; // expected-error{{'methodB' is unavailable}}
 }
 -(void)methodB {
   // Make sure we're not inheriting unavailability.
   id<AvailabilityP2> obj = self;
-  [obj methodA]; // expected-warning{{'methodA' is deprecated: first deprecated in OS X 10.2}}
+  [obj methodA]; // expected-warning{{'methodA' is deprecated: first deprecated in macOS 10.2}}
   [obj methodB]; // expected-error{{'methodB' is unavailable}}
 }
 
@@ -257,13 +257,13 @@ __attribute__((objc_root_class))
 void testImplementsAvailabilityP2b(ImplementsAvailabilityP2b *obj) {
   // still get warnings/errors because we see the protocol version.
 
-  [obj methodA]; // expected-warning{{'methodA' is deprecated: first deprecated in OS X 10.2}}
+  [obj methodA]; // expected-warning{{'methodA' is deprecated: first deprecated in macOS 10.2}}
   [obj methodB]; // expected-error{{'methodB' is unavailable}}
 }
 
 __attribute__((objc_root_class))
 @interface ImplementsAvailabilityP2c <AvailabilityP2>
--(void)methodA __attribute__((availability(macosx,introduced=10.2))); // expected-warning{{method introduced after the protocol method it implements on OS X (10.2 vs. 10.1)}}
+-(void)methodA __attribute__((availability(macosx,introduced=10.2))); // expected-warning{{method introduced after the protocol method it implements on macOS (10.2 vs. 10.1)}}
 -(void)methodB __attribute__((unavailable));
 @end
 
@@ -272,7 +272,7 @@ __attribute__((objc_root_class))
 @end
 
 @implementation ImplementsAvailabilityP2d
--(void)methodA __attribute__((availability(macosx,introduced=10.2))) // expected-warning{{method introduced after the protocol method it implements on OS X (10.2 vs. 10.1)}}
+-(void)methodA __attribute__((availability(macosx,introduced=10.2))) // expected-warning{{method introduced after the protocol method it implements on macOS (10.2 vs. 10.1)}}
 {
 }
 -(void)methodB __attribute__((unavailable)) {
diff --git a/test/SemaObjC/attr-deprecated.m b/test/SemaObjC/attr-deprecated.m
index 14d33d3760518..59087bdf11547 100644
--- a/test/SemaObjC/attr-deprecated.m
+++ b/test/SemaObjC/attr-deprecated.m
@@ -235,7 +235,7 @@ expected-note {{property declared here}}
 
 id PID = 0;
 const char * func() {
-  return [PID cString]; // expected-warning {{'cString' is deprecated: first deprecated in OS X 10.4}}
+  return [PID cString]; // expected-warning {{'cString' is deprecated: first deprecated in macOS 10.4}}
 }
 
 // rdar://18960378
diff --git a/test/SemaObjC/attr-nodebug.m b/test/SemaObjC/attr-nodebug.m
new file mode 100644
index 0000000000000..7cf8e6cfbc88a
--- /dev/null
+++ b/test/SemaObjC/attr-nodebug.m
@@ -0,0 +1,5 @@
+// RUN: %clang_cc1 -fsyntax-only -verify %s
+// expected-no-diagnostics
+@interface NSObject
+- (void)doSomething __attribute__((nodebug));
+@end
diff --git a/test/SemaObjC/attr-objc-runtime-visible.m b/test/SemaObjC/attr-objc-runtime-visible.m
new file mode 100644
index 0000000000000..b5ec809ff2f58
--- /dev/null
+++ b/test/SemaObjC/attr-objc-runtime-visible.m
@@ -0,0 +1,19 @@
+// RUN: %clang_cc1 -verify -fsyntax-only  %s
+
+__attribute__((objc_runtime_visible))
+@interface A
+@end
+
+@interface A(X)
+@end
+
+@implementation A(X) // expected-error{{cannot implement a category for class 'A' that is only visible via the Objective-C runtime}}
+@end
+
+@interface B : A
+@end
+
+@implementation B // expected-error{{cannot implement subclass 'B' of a superclass 'A' that is only visible via the Objective-C runtime}}
+@end
+
+
diff --git a/test/SemaObjC/block-omitted-return-type.m b/test/SemaObjC/block-omitted-return-type.m
new file mode 100644
index 0000000000000..20e32e01865ef
--- /dev/null
+++ b/test/SemaObjC/block-omitted-return-type.m
@@ -0,0 +1,44 @@
+// RUN: %clang_cc1 %s -fblocks -verify -fsyntax-only
+
+@interface NSObject
+@end
+
+@interface Test : NSObject
+- (void)test;
+@end
+
+@implementation Test
+- (void)test
+{
+  void (^simpleBlock)() = ^ _Nonnull { //expected-warning {{attribute '_Nonnull' ignored, because it cannot be applied to omitted return type}}
+    return;
+  };
+  void (^simpleBlock2)() = ^ _Nonnull void { //expected-error {{nullability specifier '_Nonnull' cannot be applied to non-pointer type 'void'}}
+    return;
+  };
+  void (^simpleBlock3)() = ^ _Nonnull (void) {  //expected-warning {{attribute '_Nonnull' ignored, because it cannot be applied to omitted return type}}
+    return;
+  };
+
+  void (^simpleBlock4)() = ^ const { //expected-warning {{'const' qualifier on omitted return type '<dependent type>' has no effect}}
+    return;
+  };
+  void (^simpleBlock5)() = ^ const void { //expected-error {{incompatible block pointer types initializing 'void (^)()' with an expression of type 'const void (^)(void)'}}
+    return;
+  };
+  void (^simpleBlock6)() = ^ const (void) { //expected-warning {{'const' qualifier on omitted return type '<dependent type>' has no effect}}
+    return;
+  };
+  void (^simpleBlock7)() = ^ _Nonnull __attribute__((align_value(128))) _Nullable const (void) { // expected-warning {{attribute '_Nullable' ignored, because it cannot be applied to omitted return type}} \
+    // expected-warning {{attribute '_Nonnull' ignored, because it cannot be applied to omitted return type}} \
+    // expected-warning {{'const' qualifier on omitted return type '<dependent type>' has no effect}} \
+    // expected-warning {{'align_value' attribute only applies to variables and typedefs}}
+    return;
+  };
+  void (^simpleBlock9)() = ^ __attribute__ ((align_value(128))) _Nonnull const (void) { // expected-warning {{attribute '_Nonnull' ignored, because it cannot be applied to omitted return type}} \
+    // expected-warning {{'const' qualifier on omitted return type '<dependent type>' has no effect}} \
+    // expected-warning {{'align_value' attribute only applies to variables and typedefs}}
+    return;
+  };
+}
+@end
diff --git a/test/SemaObjC/dllexport.m b/test/SemaObjC/dllexport.m
new file mode 100644
index 0000000000000..e90b982c15c04
--- /dev/null
+++ b/test/SemaObjC/dllexport.m
@@ -0,0 +1,30 @@
+// RUN: %clang_cc1 -triple i686-windows -fdeclspec -fsyntax-only -verify %s
+
+__declspec(dllexport) typedef int typedef1;
+// expected-warning@-1{{'dllexport' attribute only applies to functions, variables, and Objective-C interfaces}}
+typedef __declspec(dllexport) int typedef2;
+// expected-warning@-1{{'dllexport' attribute only applies to functions, variables, and Objective-C interfaces}}
+typedef int __declspec(dllexport) typedef3;
+// expected-warning@-1{{'dllexport' attribute only applies to functions, variables, and Objective-C interfaces}}
+typedef __declspec(dllexport) void (*FunTy)();
+// expected-warning@-1{{'dllexport' attribute only applies to functions, variables, and Objective-C interfaces}}
+enum __declspec(dllexport) E { Val };
+// expected-warning@-1{{'dllexport' attribute only applies to functions, variables, and Objective-C interfaces}}
+struct __declspec(dllexport) Record {};
+// expected-warning@-1{{'dllexport' attribute only applies to functions, variables, and Objective-C interfaces}}
+
+__declspec(dllexport)
+__attribute__((__objc_root_class__))
+@interface NSObject
+@end
+
+__declspec(dllexport)
+@interface I : NSObject
+- (void)method;
+@end
+
+@implementation I
+- (void)method {
+}
+@end
+
diff --git a/test/SemaObjC/dllimport.m b/test/SemaObjC/dllimport.m
new file mode 100644
index 0000000000000..b8360773c69b3
--- /dev/null
+++ b/test/SemaObjC/dllimport.m
@@ -0,0 +1,30 @@
+// RUN: %clang_cc1 -triple i686-windows -fdeclspec -fsyntax-only -verify %s
+
+__declspec(dllimport) typedef int typedef1;
+// expected-warning@-1{{'dllimport' attribute only applies to functions, variables, and Objective-C interfaces}}
+typedef __declspec(dllimport) int typedef2;
+// expected-warning@-1{{'dllimport' attribute only applies to functions, variables, and Objective-C interfaces}}
+typedef int __declspec(dllimport) typedef3;
+// expected-warning@-1{{'dllimport' attribute only applies to functions, variables, and Objective-C interfaces}}
+typedef __declspec(dllimport) void (*FunTy)();
+// expected-warning@-1{{'dllimport' attribute only applies to functions, variables, and Objective-C interfaces}}
+enum __declspec(dllimport) E { Val };
+// expected-warning@-1{{'dllimport' attribute only applies to functions, variables, and Objective-C interfaces}}
+struct __declspec(dllimport) Record {};
+// expected-warning@-1{{'dllimport' attribute only applies to functions, variables, and Objective-C interfaces}}
+
+__declspec(dllimport)
+__attribute__((__objc_root_class__))
+@interface NSObject
+@end
+
+__declspec(dllimport)
+@interface I : NSObject
+- (void)method;
+@end
+
+@implementation I
+- (void)method {
+}
+@end
+
diff --git a/test/SemaObjC/enum-fixed-type.m b/test/SemaObjC/enum-fixed-type.m
index c00e45a03ed0b..37d2810a504b0 100644
--- a/test/SemaObjC/enum-fixed-type.m
+++ b/test/SemaObjC/enum-fixed-type.m
@@ -38,3 +38,8 @@ int arr3[(long long)Bar == (long long)-1 ? 1 : -1];
 
 typedef enum : Integer { BaseElem } BaseEnum;
 typedef enum : BaseEnum { DerivedElem } DerivedEnum; // expected-error {{non-integral type 'BaseEnum' is an invalid underlying type}}
+
+// <rdar://problem/24999533>
+enum MyEnum : _Bool {
+  MyThing = 0
+};
diff --git a/test/SemaObjC/format-strings-objc.m b/test/SemaObjC/format-strings-objc.m
index 079460cc76cb6..d81f166a6540b 100644
--- a/test/SemaObjC/format-strings-objc.m
+++ b/test/SemaObjC/format-strings-objc.m
@@ -116,6 +116,7 @@ NSString *test_literal_propagation(void) {
   NSLog(ns2); // expected-warning {{more '%' conversions than data arguments}}
   NSString * ns3 = ns1;
   NSLog(ns3); // expected-warning {{format string is not a string literal}}}
+  // expected-note@-1{{treat the string as an argument to avoid this}}
 
   NSString * const ns6 = @"split" " string " @"%s"; // expected-note {{format string is defined here}}
   NSLog(ns6); // expected-warning {{more '%' conversions than data arguments}}
@@ -263,4 +264,3 @@ void testObjCModifierFlags() {
   NSLog(@"%2$[tt]@ %1$[tt]@", @"Foo", @"Bar"); // no-warning
   NSLog(@"%2$[tt]@ %1$[tt]s", @"Foo", @"Bar"); // expected-warning {{object format flags cannot be used with 's' conversion specifier}}
 }
-
diff --git a/test/SemaObjC/format-strings-utf8.m b/test/SemaObjC/format-strings-utf8.m
new file mode 100644
index 0000000000000..d4c21b11ea722
--- /dev/null
+++ b/test/SemaObjC/format-strings-utf8.m
@@ -0,0 +1,45 @@
+// REQUIRES: system-darwin
+// RUN: rm -f %t.log
+// RUN: env RC_DEBUG_OPTIONS=1 \
+// RUN:     CC_LOG_DIAGNOSTICS=1 CC_LOG_DIAGNOSTICS_FILE=%t.log \
+// RUN: %clang -target x86_64-apple-darwin -fsyntax-only %s
+// RUN: FileCheck %s < %t.log
+
+#include <stdarg.h>
+int printf(const char *restrict, ...);
+int scanf(const char * restrict, ...);
+@class NSString, Protocol;
+extern void NSLog(NSString *format, ...);
+
+void testInvalidNoPrintable(int *a) {
+  // CHECK: <string>invalid conversion specifier &apos;\u25b9&apos;</string>
+  // CHECK: <string>invalid conversion specifier &apos;\u25b9&apos;</string>
+  // CHECK: <string>invalid conversion specifier &apos;\U00010348&apos;</string>
+  // CHECK: <string>invalid conversion specifier &apos;\U00010348&apos;</string>
+  // CHECK: <string>invalid conversion specifier &apos;\xe2&apos;</string>
+  // CHECK: <string>invalid conversion specifier &apos;\u25b9&apos;</string>
+  // CHECK: <string>invalid conversion specifier &apos;\u25b9&apos;</string>
+  // CHECK: <string>invalid conversion specifier &apos;\U00010348&apos;</string>
+  // CHECK: <string>invalid conversion specifier &apos;\U00010348&apos;</string>
+  // CHECK: <string>invalid conversion specifier &apos;\xe2&apos;</string>
+  // CHECK: <string>invalid conversion specifier &apos;\u25b9&apos;</string>
+  // CHECK: <string>invalid conversion specifier &apos;\u25b9&apos;</string>
+  // CHECK: <string>invalid conversion specifier &apos;\U00010348&apos;</string>
+  // CHECK: <string>invalid conversion specifier &apos;\U00010348&apos;</string>
+  // CHECK: <string>invalid conversion specifier &apos;\xe2&apos;</string>
+  printf("%\u25B9");
+  printf("%\xE2\x96\xB9");
+  printf("%\U00010348");
+  printf("%\xF0\x90\x8D\x88");
+  printf("%\xe2");
+  NSLog(@"%\u25B9");
+  NSLog(@"%\xE2\x96\xB9");
+  NSLog(@"%\U00010348");
+  NSLog(@"%\xF0\x90\x8D\x88");
+  NSLog(@"%\xe2");
+  scanf("%\u25B9", a);
+  scanf("%\xE2\x96\xB9", a);
+  scanf("%\U00010348", a);
+  scanf("%\xF0\x90\x8D\x88", a);
+  scanf("%\xe2", a);
+}
diff --git a/test/SemaObjC/iboutlet.m b/test/SemaObjC/iboutlet.m
index 597c4e4628728..7bd86d4668b3a 100644
--- a/test/SemaObjC/iboutlet.m
+++ b/test/SemaObjC/iboutlet.m
@@ -1,7 +1,6 @@
-// RUN: %clang_cc1 -fsyntax-only -fobjc-arc -Wno-objc-root-class -Wreceiver-is-weak -Warc-repeated-use-of-weak -fobjc-runtime-has-weak -verify %s
-// RUN: %clang_cc1 -x objective-c++ -fsyntax-only -fobjc-arc -Wno-objc-root-class -Wreceiver-is-weak -Warc-repeated-use-of-weak -fobjc-runtime-has-weak -verify %s
+// RUN: %clang_cc1 -fsyntax-only -fobjc-arc -Wno-objc-root-class -Warc-repeated-use-of-weak -fobjc-runtime-has-weak -verify %s
+// RUN: %clang_cc1 -x objective-c++ -fsyntax-only -fobjc-arc -Wno-objc-root-class -Warc-repeated-use-of-weak -fobjc-runtime-has-weak -verify %s
 // rdar://11448209
-// rdar://20259376
 
 #define READONLY readonly
 
@@ -42,6 +41,7 @@ IBInspectable @property (readonly) IBOutlet NSView *myView1; // expected-warning
 
 // rdar://15885642
 @interface WeakOutlet 
+@property int Number;
 @property IBOutlet __weak WeakOutlet* WeakProp;
 @end
 
@@ -51,3 +51,9 @@ WeakOutlet* func() {
   pwi.WeakProp = pwi.WeakProp;
   return pwi.WeakProp;
 }
+
+WeakOutlet* func2(WeakOutlet* pwi) {
+  [[pwi WeakProp] setNumber:0];
+  [[pwi WeakProp] setNumber:1];
+  return [pwi WeakProp];
+}
diff --git a/test/SemaObjC/kindof.m b/test/SemaObjC/kindof.m
index f205e68ea128a..63ba18fe89bc0 100644
--- a/test/SemaObjC/kindof.m
+++ b/test/SemaObjC/kindof.m
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fblocks -fsyntax-only %s -verify
+// RUN: %clang_cc1 -fblocks -fsyntax-only %s -verify -Wmethod-signatures
 
 // Tests Objective-C 'kindof' types.
 
@@ -24,7 +24,8 @@ __attribute__((objc_root_class))
 - (NSObject *)retain;
 @end
 
-@interface NSString : NSObject <NSCopying>
+@interface NSString : NSObject <NSCopying> // expected-note{{receiver is instance of class declared here}}
+- (void)compare:(NSString *)string;
 - (NSString *)stringByAppendingString:(NSString *)string;
 + (instancetype)string;
 @end
@@ -186,6 +187,39 @@ void test_crosscast_conversions(void) {
   NSString_obj = kindof_NSNumber_obj; // expected-warning{{from '__kindof NSNumber *'}}
 }
 
+@interface NSCell : NSObject
+@end
+@interface NSCellSub : NSCell
+@end
+@interface NSCellSub2 : NSCell
+@end
+@interface NSCellSubSub : NSCellSub
+@end
+
+typedef signed char BOOL;
+void test_conditional(BOOL flag) {
+  NSCellSubSub *result;
+  __kindof NSCellSub *kindof_Sub;
+  NSCell *cell;
+  NSCellSub *sub;
+  NSCellSub2 *sub2;
+  NSCellSubSub *subsub;
+
+  // LHS is kindof NSCellSub, RHS is NSCell --> kindof NSCell
+  // LHS is kindof NSCellSub, RHS is NSCellSub --> kindof NSCellSub
+  // LHS is kindof NSCellSub, RHS is NSCellSub2 --> kindof NSCell
+  // LHS is kindof NSCellSub, RHS is NSCellSubSub --> kindof NSCellSub
+  result = flag ? kindof_Sub : cell;
+  result = flag ? kindof_Sub : sub;
+  result = flag ? kindof_Sub : sub2;
+  result = flag ? kindof_Sub : subsub;
+
+  result = flag ? cell : kindof_Sub;
+  result = flag ? sub : kindof_Sub;
+  result = flag ? sub2 : kindof_Sub;
+  result = flag ? subsub : kindof_Sub;
+}
+
 // ---------------------------------------------------------------------------
 // Blocks
 // ---------------------------------------------------------------------------
@@ -246,7 +280,7 @@ void message_kindof_object(__kindof NSString *kindof_NSString) {
   [kindof_NSString retain]; // in superclass
   [kindof_NSString stringByAppendingString:0]; // in class
   [kindof_NSString appendString:0]; // in subclass
-  [kindof_NSString numberByAddingNumber: 0]; // FIXME: in unrelated class
+  [kindof_NSString numberByAddingNumber: 0]; // expected-warning{{instance method '-numberByAddingNumber:' not found (return type defaults to 'id')}}
   [kindof_NSString randomMethod]; // in protocol
 }
 
@@ -263,6 +297,43 @@ void message_kindof_qualified_class(
   [kindof_NSCopying randomClassMethod]; // in unrelated protocol
 }
 
+// Make sure we don't emit warning about multiple methods found.
+typedef int NSInteger;
+@interface Foo : NSObject
+- (NSString*)test;
+@end
+@interface Bar : NSObject
+- (NSInteger)test;
+@end
+void test(__kindof Bar *kBar) {
+    [kBar test];
+}
+
+// Make sure we don't emit warning about no method found.
+@interface A : NSObject
+@property (readonly, getter=isActive) BOOL active;
+@end
+@interface B : NSObject
+@property (getter=isActive, readonly) BOOL active;
+@end
+void foo() {
+  __kindof B *NSApp;
+  if ([NSApp isActive]) {
+  }
+}
+
+typedef const struct CGPath *CGPathRef;
+@interface C : NSObject
+@property (copy) NSString *path;
+@end
+@interface D : NSObject
+@property CGPathRef path __attribute__((availability(macosx,unavailable)));
+@end
+// Make sure we choose "NSString *path" for [s1 path].
+void bar(id s1, id s2) {
+  return [[s1 path] compare:[s2 path]];
+}
+
 // ---------------------------------------------------------------------------
 // __kindof within specialized types
 // ---------------------------------------------------------------------------
@@ -303,6 +374,27 @@ void testNullability() {
   processCopyable(0); // expected-warning{{null passed to a callee that requires a non-null argument}}
 }
 
+// Make sure that we don't emit a warning about conflicting parameter types
+// between __kindof id and id.
+@interface A2 : NSObject
+- (void)test:(__kindof id)T;
+@end
+@implementation A2
+- (void)test:(id)T {
+}
+@end
+
+@interface NSGeneric<ObjectType> : NSObject
+- (void)test:(__kindof ObjectType)T;
+- (void)mapUsingBlock:(id (^)(__kindof ObjectType))block;
+@end
+@implementation NSGeneric
+- (void)test:(id)T {
+}
+- (void)mapUsingBlock:(id (^)(id))block {
+}
+@end
+
 // Check that clang doesn't crash when a type parameter is illegal.
 @interface Array1<T> : NSObject
 @end
diff --git a/test/SemaObjC/method-warn-unused-attribute.m b/test/SemaObjC/method-warn-unused-attribute.m
index 042f4422f8083..b83dabf3bbdc2 100644
--- a/test/SemaObjC/method-warn-unused-attribute.m
+++ b/test/SemaObjC/method-warn-unused-attribute.m
@@ -9,8 +9,8 @@
 
 void foo(INTF *a) {
   [a garf];
-  [a fee]; // expected-warning {{ignoring return value of function declared with warn_unused_result attribute}}
-  [INTF c]; // expected-warning {{ignoring return value of function declared with warn_unused_result attribute}}
+  [a fee]; // expected-warning {{ignoring return value of function declared with 'warn_unused_result' attribute}}
+  [INTF c]; // expected-warning {{ignoring return value of function declared with 'warn_unused_result' attribute}}
 }
 
 
diff --git a/test/SemaObjC/objc-class-property.m b/test/SemaObjC/objc-class-property.m
new file mode 100644
index 0000000000000..56285976e1941
--- /dev/null
+++ b/test/SemaObjC/objc-class-property.m
@@ -0,0 +1,59 @@
+// RUN: %clang_cc1 -fsyntax-only -verify %s
+
+#if !__has_feature(objc_class_property)
+#error does not support class property
+#endif
+
+@interface Root
+-(id) alloc;
+-(id) init;
+@end
+
+@interface A : Root {
+  int x;
+  int z;
+}
+@property int x;
+@property int y;
+@property int z;
+@property(readonly) int ro, ro2;
+@property (class) int c;
+@property (class) int c2; // expected-note {{property declared here}} \
+                          // expected-note {{property declared here}}
+@property (class) int x;
+@end
+
+@implementation A // expected-warning {{class property 'c2' requires method 'c2' to be defined}} \
+                  // expected-warning {{class property 'c2' requires method 'setC2:' to be defined}}
+@dynamic x; // refers to the instance property
+@dynamic (class) x; // refers to the class property
+@synthesize z, c2; // expected-error {{@synthesize not allowed on a class property 'c2'}}
+@dynamic c; // refers to the class property
+@end
+
+int test() {
+  A *a = [[A alloc] init];
+  a.c; // expected-error {{property 'c' is a class property; did you mean to access it with class 'A'}}
+  return a.x + A.c;
+}
+
+void message_id(id me) {
+  [me y];
+}
+
+void message_class(Class me) {
+  [me c2];
+}
+
+@interface NSObject
+@end
+
+@interface MyClass : NSObject
+@property(class, readonly) int classProp; // expected-note {{property declared here}}
+@end
+
+@implementation MyClass // expected-warning {{class property 'classProp' requires method 'classProp' to be defined}}
+- (int)classProp { // Oops, mistakenly made this an instance method.
+  return 8;
+}
+@end
diff --git a/test/SemaObjC/objcbridge-attribute-arc.m b/test/SemaObjC/objcbridge-attribute-arc.m
index 3bcfdf48e7946..26dbce09b8164 100644
--- a/test/SemaObjC/objcbridge-attribute-arc.m
+++ b/test/SemaObjC/objcbridge-attribute-arc.m
@@ -23,7 +23,10 @@ typedef struct __CFSetRef * CFSetRef __attribute__((objc_bridge(NSSet))); // exp
 
 typedef union __CFUColor __attribute__((objc_bridge(NSUColor))) * CFUColorRef; // expected-error {{parameter of 'objc_bridge' attribute must be 'id' when used on a typedef}}
 
-typedef union __CFUColor __attribute__((objc_bridge(NSUColor))) * CFUColorRef; // expected-error {{parameter of 'objc_bridge' attribute must be 'id' when used on a typedef}}
+// This error requires C11.
+#if __STDC_VERSION__ > 199901L
+typedef union __CFUColor __attribute__((objc_bridge(NSUColor))) * CFUColorRef; // expected-error {{parameter of 'objc_bridge' attribute must be 'id' when used on a typedef}}
+#endif
 
 typedef union __CFUColor __attribute__((objc_bridge(NSUColor))) *CFUColor1Ref; // expected-error {{parameter of 'objc_bridge' attribute must be 'id' when used on a typedef}}
 
diff --git a/test/SemaObjC/ovl-check.m b/test/SemaObjC/ovl-check.m
index 7fc8a64049116..cc2f094ad2ead 100644
--- a/test/SemaObjC/ovl-check.m
+++ b/test/SemaObjC/ovl-check.m
@@ -4,20 +4,24 @@
 // in overload resolution in ObjC.
 
 struct Type1 { int a; };
+typedef const __attribute__((objc_bridge(id))) void * CFTypeRef;
 @interface Iface1 @end
 
 @interface NeverCalled
 - (void) test:(struct Type1 *)arg;
+- (void) test2:(CFTypeRef)arg;
 @end
 
 @interface TakesIface1
 - (void) test:(Iface1 *)arg;
+- (void) test2:(Iface1 *)arg;
 @end
 
 // PR26085, rdar://problem/24111333
 void testTakesIface1(id x, Iface1 *arg) {
   // This should resolve silently to `TakesIface1`.
   [x test:arg];
+  [x test2:arg];
 }
 
 @class NSString;
@@ -36,17 +40,16 @@ void testTakesNSString(id x) {
   [x testStr:"someStringLiteral"];
 }
 
-typedef const void *CFTypeRef;
 id CreateSomething();
 
-@interface NeverCalledv3
-- (void) testCFTypeRef:(struct Type1 *)arg;
-@end
-
 @interface TakesCFTypeRef
 - (void) testCFTypeRef:(CFTypeRef)arg;
 @end
 
+@interface NeverCalledv3
+- (void) testCFTypeRef:(struct Type1 *)arg;
+@end
+
 // Not called out explicitly by PR26085, but related.
 void testTakesCFTypeRef(id x) {
   // Overload resolution should occur silently, select the CFTypeRef overload,
diff --git a/test/SemaObjC/parameterized_classes.m b/test/SemaObjC/parameterized_classes.m
index 7f380a10547ae..1004fd3c4b1eb 100644
--- a/test/SemaObjC/parameterized_classes.m
+++ b/test/SemaObjC/parameterized_classes.m
@@ -240,6 +240,10 @@ typedef PC4<NSObject, NSObject, ObjCStringref> typeArgs5; // expected-error{{unk
 
 // Type/protocol conflict.
 typedef PC4<NSCopying, ObjCStringRef> typeArgsProtocolQualsConflict1; // expected-error{{angle brackets contain both a type ('ObjCStringRef') and a protocol ('NSCopying')}}
+typedef PC4<NSCopying, NSString *> typeArgsProtocolQualsConflict2; // expected-error{{angle brackets contain both a type ('NSString') and a protocol ('NSCopying')}}
+typedef PC4<NSCopying, UnknownType, NSString *> typeArgsProtocolQualsConflict3; // expected-error{{angle brackets contain both a type ('NSString') and a protocol ('NSCopying')}} expected-error{{unknown type name 'UnknownType'}}
+typedef PC4<UnknownType, NSString *> typeArgsProtocolQualsConflict4; // expected-error{{unknown type name 'UnknownType'}}
+typedef PC4<NSString, NSCopying, NSString *> typeArgsProtocolQualsConflict5; // expected-error{{angle brackets contain both a type ('NSString') and a protocol ('NSCopying')}}
 
 // Handling the '>>' in type argument lists.
 typedef PC4<id<NSCopying>, NSObject *, id<NSObject>> typeArgs6;
diff --git a/test/SemaObjC/property-atomic-bool.m b/test/SemaObjC/property-atomic-bool.m
new file mode 100644
index 0000000000000..4110b5e044590
--- /dev/null
+++ b/test/SemaObjC/property-atomic-bool.m
@@ -0,0 +1,61 @@
+// RUN: %clang_cc1 -triple x86_64-apple-macosx10.10 -ast-dump "%s" | FileCheck %s
+
+// CHECK: TypedefDecl {{.*}} referenced AtomicBool '_Atomic(_Bool)'
+// CHECK:  AtomicType {{.*}} '_Atomic(_Bool)'
+// CHECK:   BuiltinType {{.*}} '_Bool'
+// CHECK: ObjCInterfaceDecl {{.*}} A0
+// CHECK:  ObjCPropertyDecl {{.*}} p '_Atomic(_Bool)' {{.*}} nonatomic
+// CHECK:  ObjCMethodDecl {{.*}} implicit - p '_Bool'
+// CHECK:  ObjCMethodDecl {{.*}} implicit - setP: 'void'
+// CHECK:   ParmVarDecl {{.*}} p '_Bool'
+// CHECK: ObjCInterfaceDecl {{.*}} A1
+// CHECK:  ObjCPropertyDecl {{.*}} p 'AtomicBool':'_Atomic(_Bool)' {{.*}} nonatomic
+// CHECK:  ObjCMethodDecl {{.*}} implicit - p '_Bool'
+// CHECK:  ObjCMethodDecl {{.*}} implicit - setP: 'void'
+// CHECK:   ParmVarDecl {{.*}} p '_Bool'
+// CHECK: ObjCInterfaceDecl {{.*}} A2
+// CHECK:  ObjCIvarDecl {{.*}} p '_Atomic(_Bool)' protected
+// CHECK:  ObjCPropertyDecl {{.*}} p '_Atomic(_Bool)'
+// CHECK:  ObjCMethodDecl {{.*}} implicit - p '_Bool'
+// CHECK:  ObjCMethodDecl {{.*}} implicit - setP: 'void'
+// CHECK:   ParmVarDecl {{.*}} p '_Bool'
+// CHECK: ObjCInterfaceDecl {{.*}} A3
+// CHECK:  ObjCIvarDecl {{.*}} p 'AtomicBool':'_Atomic(_Bool)' protected
+// CHECK:  ObjCPropertyDecl {{.*}} p 'AtomicBool':'_Atomic(_Bool)'
+// CHECK:  ObjCMethodDecl {{.*}} implicit - p '_Bool'
+// CHECK:  ObjCMethodDecl {{.*}} implicit - setP: 'void'
+// CHECK:   ParmVarDecl {{.*}} p '_Bool'
+
+typedef _Atomic(_Bool) AtomicBool;
+
+@interface A0
+@property(nonatomic) _Atomic(_Bool) p;
+@end
+@implementation A0
+@end
+
+@interface A1
+@property(nonatomic) AtomicBool p;
+@end
+@implementation A1
+@end
+
+@interface A2 {
+  _Atomic(_Bool) p;
+}
+@property _Atomic(_Bool) p;
+@end
+
+@implementation A2
+@synthesize p;
+@end
+
+@interface A3 {
+  AtomicBool p;
+}
+@property AtomicBool p;
+@end
+
+@implementation A3
+@synthesize p;
+@end
diff --git a/test/SemaObjC/property-noninherited-availability-attr.m b/test/SemaObjC/property-noninherited-availability-attr.m
index dfa72d1077efb..abedcc237801f 100644
--- a/test/SemaObjC/property-noninherited-availability-attr.m
+++ b/test/SemaObjC/property-noninherited-availability-attr.m
@@ -20,8 +20,8 @@
 @end
 
 void test(Foo *y, Bar *x, id<myProtocol> z) {
-  y.myProperty = 0; // expected-warning {{'myProperty' is deprecated: first deprecated in OS X 10.8}}
-  (void)[y myProperty];   // expected-warning {{'myProperty' is deprecated: first deprecated in OS X 10.8}} 
+  y.myProperty = 0; // expected-warning {{'myProperty' is deprecated: first deprecated in macOS 10.8}}
+  (void)[y myProperty];   // expected-warning {{'myProperty' is deprecated: first deprecated in macOS 10.8}}
 
   x.myProperty = 1; // no-warning
   (void)[x myProperty]; // no-warning
@@ -29,5 +29,5 @@ void test(Foo *y, Bar *x, id<myProtocol> z) {
   x.myProtocolProperty = 0; // no-warning
 
   (void)[x myProtocolProperty]; // no-warning
-  (void)[z myProtocolProperty]; // expected-warning {{'myProtocolProperty' is deprecated: first deprecated in OS X 10.8}}
+  (void)[z myProtocolProperty]; // expected-warning {{'myProtocolProperty' is deprecated: first deprecated in macOS 10.8}}
 }
diff --git a/test/SemaObjC/typo-correction-arc.m b/test/SemaObjC/typo-correction-arc.m
new file mode 100644
index 0000000000000..206d545ae7e37
--- /dev/null
+++ b/test/SemaObjC/typo-correction-arc.m
@@ -0,0 +1,32 @@
+// RUN: %clang_cc1 -triple i386-apple-macosx10.10 -fobjc-arc -fsyntax-only -Wno-objc-root-class %s -verify
+
+typedef unsigned long NSUInteger;
+
+id nameless;                                  // expected-note{{'nameless' declared here}}
+
+@interface NSArray
+- (instancetype)initWithObjects:(const id[])objects count:(NSUInteger)count;
+@end
+
+@interface I
+@property NSArray *array;
+- (id)getArrayById:(id)name;
+- (void)setArrayValue:(id)array;
+@end
+
+@interface J
+- (void)setArray:(id)array;
+- (void)setIvarArray;
+@end
+
+@implementation J {
+  I *i;
+}
+- (void)setArray:(id)array {  // expected-note{{'array' declared here}}
+  i.array = aray;             // expected-error{{use of undeclared identifier 'aray'; did you mean 'array'}}
+}
+- (void)setIvarArray {
+  [i setArrayValue:[i getArrayById:nameles]]; // expected-error{{use of undeclared identifier 'nameles'; did you mean 'nameless'}}
+}
+@end
+
diff --git a/test/SemaObjC/warn-loop-analysis.m b/test/SemaObjC/warn-loop-analysis.m
new file mode 100644
index 0000000000000..8ae7375f7f2d8
--- /dev/null
+++ b/test/SemaObjC/warn-loop-analysis.m
@@ -0,0 +1,15 @@
+// RUN: %clang_cc1 -fsyntax-only -Wloop-analysis -verify %s
+// expected-no-diagnostics
+
+@interface MyArray
+- (id)objectAtIndexedSubscript:(unsigned int)idx;
+@end
+
+// Do not warn on objc classes has objectAtIndexedSubscript method.
+MyArray *test;
+void foo()
+{
+  unsigned int i;
+  for (i = 42; i > 0;) // No warnings here
+    (void)test[--i];
+}
diff --git a/test/SemaObjC/warn-strict-selector-match.m b/test/SemaObjC/warn-strict-selector-match.m
index 13e9bac462fcb..6b92cb805df17 100644
--- a/test/SemaObjC/warn-strict-selector-match.m
+++ b/test/SemaObjC/warn-strict-selector-match.m
@@ -49,7 +49,7 @@ id foo(void) {
 @end
 
 @implementation NTGridDataObject
-- (id)initWithData:(id<MyObject, MyCoding>)data {
+- (id)initWithData:(id<MyObject, MyCoding>)data { // expected-note {{also found}}
   return data;
 }
 + (NTGridDataObject*)dataObject:(id<MyObject, MyCoding>)data
diff --git a/test/SemaObjCXX/arc-nsconsumed-errors.mm b/test/SemaObjCXX/arc-nsconsumed-errors.mm
index c78d8a5f4add4..638a1ebd2addb 100644
--- a/test/SemaObjCXX/arc-nsconsumed-errors.mm
+++ b/test/SemaObjCXX/arc-nsconsumed-errors.mm
@@ -29,9 +29,9 @@ void releaser(__attribute__((ns_consumed)) id);
 releaser_t r2 = releaser; // no-warning
 
 template <typename T>
-void templateFunction(T) { } // expected-note {{candidate template ignored: could not match 'void (__strong id)' against 'void (id)'}} \
+void templateFunction(T) { } // expected-note {{candidate template ignored: could not match 'void (__strong id)' against 'void (__attribute__((ns_consumed)) id)'}} \
                              // expected-note {{candidate template ignored: failed template argument deduction}}
-releaser_t r3 = templateFunction<id>; // expected-error {{address of overloaded function 'templateFunction' does not match required type 'void (id)'}}
+releaser_t r3 = templateFunction<id>; // expected-error {{address of overloaded function 'templateFunction' does not match required type 'void (__attribute__((ns_consumed)) id)'}}
 
 template <typename T>
 void templateReleaser(__attribute__((ns_consumed)) T) { } // expected-note 2{{candidate template ignored: failed template argument deduction}}
diff --git a/test/SemaObjCXX/arc-templates.mm b/test/SemaObjCXX/arc-templates.mm
index ebede6404f907..97854dff8c1ac 100644
--- a/test/SemaObjCXX/arc-templates.mm
+++ b/test/SemaObjCXX/arc-templates.mm
@@ -1,4 +1,7 @@
-// RUN: %clang_cc1 -fobjc-runtime-has-weak -fsyntax-only -fobjc-arc -verify -fblocks %s
+// RUN: %clang_cc1 -fobjc-runtime-has-weak -fsyntax-only -fobjc-arc -verify -fblocks -std=c++11 %s
+
+#define CONSUMED __attribute__((ns_consumed))
+#define PRODUCED __attribute__((ns_returns_retained))
 
 @interface A
 @end
@@ -318,3 +321,132 @@ namespace rdar15713945 {
     double &dr = (f)(unsafe);
   }
 }
+
+namespace consumed {
+  void take_yes_no(void (&)(id CONSUMED, id)); // expected-note 2 {{candidate function not viable}}
+  void take_no_yes(void (&)(id, CONSUMED id)); // expected-note 2 {{candidate function not viable}}
+  void take_yes_yes(void (&)(CONSUMED id, CONSUMED id)); // expected-note 2 {{candidate function not viable}}
+
+  template <class... As> void consumes_first(id CONSUMED, As...);
+  void test1() {
+    take_yes_no(consumes_first<id>);
+    take_no_yes(consumes_first<id>); // expected-error {{no matching function}}
+    take_yes_yes(consumes_first<id>); // expected-error {{no matching function}}
+  }
+
+  template <class... As> void consumes_rest(id, CONSUMED As...);
+  void test2() {
+    take_yes_no(consumes_rest<id>); // expected-error {{no matching function}}
+    take_no_yes(consumes_rest<id>);
+    take_yes_yes(consumes_rest<id>); // expected-error {{no matching function}}
+  }
+
+  template <class T, class U> void consumes_two(CONSUMED T, CONSUMED U);
+  void test3() {
+    take_yes_no(consumes_two); // expected-error {{no matching function}}
+    take_no_yes(consumes_two); // expected-error {{no matching function}}
+    take_yes_yes(consumes_two);
+  }
+}
+
+namespace consumed_nested {
+  void take_yes_no(void (&)(id CONSUMED, id)); // expected-note 4 {{candidate function not viable}}
+  void take_no_yes(void (&)(id, CONSUMED id)); // expected-note 4 {{candidate function not viable}}
+  void take_yes_yes(void (&)(CONSUMED id, CONSUMED id)); // expected-note 4 {{candidate function not viable}}
+
+  template <unsigned N> struct consumes_first {
+    template <class... As> static void fn(id CONSUMED, As...);
+  };
+  void test1() {
+    take_yes_no(consumes_first<1>::fn<id>);
+    take_no_yes(consumes_first<2>::fn<id>); // expected-error {{no matching function}}
+    take_yes_yes(consumes_first<3>::fn<id>); // expected-error {{no matching function}}
+    take_yes_no(consumes_first<4>::fn);
+    take_no_yes(consumes_first<5>::fn); // expected-error {{no matching function}}
+    take_yes_yes(consumes_first<6>::fn); // expected-error {{no matching function}}
+  }
+
+  template <unsigned N> struct consumes_rest {
+    template <class... As> static void fn(id, CONSUMED As...);
+  };
+  void test2() {
+    take_yes_no(consumes_rest<1>::fn<id>); // expected-error {{no matching function}}
+    take_no_yes(consumes_rest<2>::fn<id>);
+    take_yes_yes(consumes_rest<3>::fn<id>); // expected-error {{no matching function}}
+    take_yes_no(consumes_rest<4>::fn<id>); // expected-error {{no matching function}}
+    take_no_yes(consumes_rest<5>::fn<id>);
+    take_yes_yes(consumes_rest<6>::fn<id>); // expected-error {{no matching function}}
+  }
+
+  template <unsigned N> struct consumes_two {
+    template <class T, class U> static void fn(CONSUMED T, CONSUMED U);
+  };
+  void test3() {
+    take_yes_no(consumes_two<1>::fn<id, id>); // expected-error {{no matching function}}
+    take_no_yes(consumes_two<2>::fn<id, id>); // expected-error {{no matching function}}
+    take_yes_yes(consumes_two<3>::fn<id, id>);
+    take_yes_no(consumes_two<1>::fn); // expected-error {{no matching function}}
+    take_no_yes(consumes_two<2>::fn); // expected-error {{no matching function}}
+    take_yes_yes(consumes_two<3>::fn);
+  }
+}
+
+namespace produced {
+  void take_yes(PRODUCED id (&)()); // expected-note 2 {{candidate function not viable}}
+  void take_no(id (&)()); // expected-note 2 {{candidate function not viable}}
+
+  template <class T> T non_produces1();
+  template <class T> T non_produces2();
+  template <class T> T non_produces3();
+  template <class T> T non_produces4();
+  void test1() {
+    take_yes(non_produces1<id>); // expected-error {{no matching function}}
+    take_yes(non_produces2); // expected-error {{no matching function}}
+    take_no(non_produces3<id>);
+    take_no(non_produces4);
+  }
+
+  template <class T> PRODUCED T produces1();
+  template <class T> PRODUCED T produces2();
+  template <class T> PRODUCED T produces3();
+  template <class T> PRODUCED T produces4();
+  void test2() {
+    take_yes(produces1<id>);
+    take_yes(produces2);
+    take_no(produces3<id>); // expected-error {{no matching function}}
+    take_no(produces4); // expected-error {{no matching function}}
+  }
+}
+
+namespace produced_nested {
+  void take_yes(PRODUCED id (&)()); // expected-note 2 {{candidate function not viable}}
+  void take_no(id (&)()); // expected-note 2 {{candidate function not viable}}
+
+  template <unsigned N> struct non_produces {
+    template <class T> static T fn();
+  };
+  void test1() {
+    take_yes(non_produces<1>::fn<id>); // expected-error {{no matching function}}
+    take_yes(non_produces<2>::fn); // expected-error {{no matching function}}
+    take_no(non_produces<3>::fn<id>);
+    take_no(non_produces<4>::fn);
+  }
+
+  template <unsigned N> struct produces {
+    template <class T> static PRODUCED T fn();
+  };
+  void test2() {
+    take_yes(produces<1>::fn<id>);
+    take_yes(produces<2>::fn);
+    take_no(produces<3>::fn<id>); // expected-error {{no matching function}}
+    take_no(produces<4>::fn); // expected-error {{no matching function}}
+  }
+}
+
+namespace instantiate_consumed {
+  template <class T> void take(CONSUMED T t) {} // expected-note {{candidate template ignored: substitution failure [with T = int]: ns_consumed attribute only applies to Objective-C object parameters}}
+  void test() {
+    take((id) 0);
+    take((int) 0); // expected-error {{no matching function for call to 'take'}}
+  }
+}
diff --git a/test/SemaObjCXX/base-type-as-written.mm b/test/SemaObjCXX/base-type-as-written.mm
new file mode 100644
index 0000000000000..05962e3f288c3
--- /dev/null
+++ b/test/SemaObjCXX/base-type-as-written.mm
@@ -0,0 +1,9 @@
+// RUN: %clang_cc1 -fsyntax-only -verify %s
+// Make sure we don't crash in TreeTransform<Derived>::TransformObjCObjectType.
+
+@protocol P1
+@end
+
+template <class T1><P1> foo1(T1) { // expected-warning {{protocol has no object type specified; defaults to qualified 'id'}}
+  foo1(0);
+}
diff --git a/test/SemaObjCXX/block-cleanup.mm b/test/SemaObjCXX/block-cleanup.mm
new file mode 100644
index 0000000000000..0c6a6d8c26fae
--- /dev/null
+++ b/test/SemaObjCXX/block-cleanup.mm
@@ -0,0 +1,16 @@
+// RUN: %clang_cc1 -triple x86_64-apple-macosx10.11.0 -std=gnu++11 -o /dev/null -x objective-c++ -fblocks -ast-dump %s 2>&1 | FileCheck %s
+
+// CHECK:      -FunctionDecl {{.*}} test 'id (void)'
+// CHECK-NEXT:   -CompoundStmt
+// CHECK-NEXT:     -ReturnStmt
+// CHECK-NEXT:       -ExprWithCleanups
+// CHECK-NEXT:         -cleanup Block
+// CHECK-NEXT:         -cleanup Block
+
+@interface NSDictionary
++ (id)dictionaryWithObjects:(const id [])objects forKeys:(const id [])keys count:(unsigned long)cnt;
+@end
+
+id test() {
+  return @{@"a": [](){}, @"b": [](){}};
+}
diff --git a/test/SemaObjCXX/block-for-lambda-conversion.mm b/test/SemaObjCXX/block-for-lambda-conversion.mm
new file mode 100644
index 0000000000000..671e83dc22019
--- /dev/null
+++ b/test/SemaObjCXX/block-for-lambda-conversion.mm
@@ -0,0 +1,27 @@
+// RUN: %clang_cc1 -fsyntax-only -fblocks -verify -std=c++11 %s
+
+enum NSEventType {
+  NSEventTypeFlagsChanged = 12
+};
+
+enum NSEventMask {
+  NSEventMaskLeftMouseDown = 1
+};
+
+static const NSEventType NSFlagsChanged = NSEventTypeFlagsChanged;
+
+@interface NSObject
+@end
+@interface NSEvent : NSObject {
+}
++ (nullable id)
+addMonitor:(NSEventMask)mask handler:(NSEvent *_Nullable (^)(NSEvent *))block;
+@end
+
+void test(id weakThis) {
+  id m_flagsChangedEventMonitor = [NSEvent
+      addMonitor:NSFlagsChangedMask //expected-error {{use of undeclared identifier 'NSFlagsChangedMask'}}
+         handler:[weakThis](NSEvent *flagsChangedEvent) {
+             return flagsChangedEvent;
+         }];
+}
diff --git a/test/SemaObjCXX/dllexport.mm b/test/SemaObjCXX/dllexport.mm
new file mode 100644
index 0000000000000..739749f641927
--- /dev/null
+++ b/test/SemaObjCXX/dllexport.mm
@@ -0,0 +1,33 @@
+// RUN: %clang_cc1 -triple i686-windows -fdeclspec -fsyntax-only -verify %s
+
+__declspec(dllexport) typedef int typedef1;
+// expected-warning@-1{{'dllexport' attribute only applies to functions, variables, classes, and Objective-C interfaces}}
+typedef __declspec(dllexport) int typedef2;
+// expected-warning@-1{{'dllexport' attribute only applies to functions, variables, classes, and Objective-C interfaces}}
+typedef int __declspec(dllexport) typedef3;
+// expected-warning@-1{{'dllexport' attribute only applies to functions, variables, classes, and Objective-C interfaces}}
+typedef __declspec(dllexport) void (*FunTy)();
+// expected-warning@-1{{'dllexport' attribute only applies to functions, variables, classes, and Objective-C interfaces}}
+enum __declspec(dllexport) E { };
+// expected-warning@-1{{'dllexport' attribute only applies to functions, variables, classes, and Objective-C interfaces}}
+#if __has_feature(cxx_strong_enums)
+enum class __declspec(dllexport) F { };
+// expected-warning@-1{{'dllexport' attribute only applies to functions, variables, classes, and Objective-C interfaces}}
+#endif
+
+__declspec(dllexport)
+__attribute__((__objc_root_class__))
+@interface NSObject
+@end
+
+__declspec(dllexport)
+@interface I : NSObject
+- (void)method;
+@end
+
+@implementation I
+- (void)method {
+}
+@end
+
+
diff --git a/test/SemaObjCXX/dllimport.mm b/test/SemaObjCXX/dllimport.mm
new file mode 100644
index 0000000000000..4c348c4841862
--- /dev/null
+++ b/test/SemaObjCXX/dllimport.mm
@@ -0,0 +1,32 @@
+// RUN: %clang_cc1 -triple i686-windows -fdeclspec -fsyntax-only -verify %s
+
+__declspec(dllimport) typedef int typedef1;
+// expected-warning@-1{{'dllimport' attribute only applies to functions, variables, classes, and Objective-C interfaces}}
+typedef __declspec(dllimport) int typedef2;
+// expected-warning@-1{{'dllimport' attribute only applies to functions, variables, classes, and Objective-C interfaces}}
+typedef int __declspec(dllimport) typedef3;
+// expected-warning@-1{{'dllimport' attribute only applies to functions, variables, classes, and Objective-C interfaces}}
+typedef __declspec(dllimport) void (*FunTy)();
+// expected-warning@-1{{'dllimport' attribute only applies to functions, variables, classes, and Objective-C interfaces}}
+enum __declspec(dllimport) E { };
+// expected-warning@-1{{'dllimport' attribute only applies to functions, variables, classes, and Objective-C interfaces}}
+#if __has_feature(cxx_strong_enums)
+enum class __declspec(dllimport) F { };
+// expected-warning@-1{{'dllimport' attribute only applies to functions, variables, classes, and Objective-C interfaces}}
+#endif
+
+__declspec(dllimport)
+__attribute__((__objc_root_class__))
+@interface NSObject
+@end
+
+__declspec(dllimport)
+@interface I : NSObject
+- (void)method;
+@end
+
+@implementation I
+- (void)method {
+}
+@end
+
diff --git a/test/SemaObjCXX/foreach.mm b/test/SemaObjCXX/foreach.mm
index d1302c19a58c3..99f5d0ce55ee8 100644
--- a/test/SemaObjCXX/foreach.mm
+++ b/test/SemaObjCXX/foreach.mm
@@ -6,10 +6,8 @@
 void f(NSArray *a) {
     id keys;
     for (int i : a); // expected-error{{selector element type 'int' is not a valid object}} 
-    for ((id)2 : a);  // expected-error {{for range declaration must declare a variable}} \
-                      // expected-warning {{expression result unused}}
-    for (2 : a); // expected-error {{for range declaration must declare a variable}} \
-                 // expected-warning {{expression result unused}}
+    for ((id)2 : a);  // expected-error {{for range declaration must declare a variable}}
+    for (2 : a); // expected-error {{for range declaration must declare a variable}}
   
   for (id thisKey : keys);
 
@@ -65,8 +63,7 @@ int main ()
 @end
 void test2(NSObject<NSFastEnumeration> *collection) {
   Test2 *obj;
-  for (obj.prop : collection) { // expected-error {{for range declaration must declare a variable}} \
-                                // expected-warning {{property access result unused - getters should not be used for side effects}}
+  for (obj.prop : collection) { // expected-error {{for range declaration must declare a variable}}
   }
 }
 
diff --git a/test/SemaObjCXX/instancetype.mm b/test/SemaObjCXX/instancetype.mm
index 89ff2b4b03266..f61d6bf4cfc81 100644
--- a/test/SemaObjCXX/instancetype.mm
+++ b/test/SemaObjCXX/instancetype.mm
@@ -214,3 +214,10 @@ void test_instancetype_inherited() {
   return 0;
 }
 @end
+
+// PR27822
+@class NSString;
+namespace pr27822 { }
+@interface AXPlatformNodeCocoa
++ (NSString*)nativeRoleFromAXRole:(pr27822::UndeclaredIdentifier)role; // expected-error {{expected a type}}
+@end
diff --git a/test/SemaObjCXX/typeloc-data-alignment.mm b/test/SemaObjCXX/typeloc-data-alignment.mm
new file mode 100644
index 0000000000000..e17a910a60709
--- /dev/null
+++ b/test/SemaObjCXX/typeloc-data-alignment.mm
@@ -0,0 +1,12 @@
+// RUN: %clang_cc1 -fsyntax-only -verify %s
+// expected-no-diagnostics
+
+// Make sure this doesn't crash.
+
+@protocol P
+@end
+template <class T>
+id<P> foo(T) {
+  int i;
+  foo(i);
+}
diff --git a/test/SemaObjCXX/typo-correction.mm b/test/SemaObjCXX/typo-correction.mm
new file mode 100644
index 0000000000000..a34a7901e8eeb
--- /dev/null
+++ b/test/SemaObjCXX/typo-correction.mm
@@ -0,0 +1,23 @@
+// RUN: %clang_cc1 %s -verify -fsyntax-only
+
+class ClassA {};
+
+class ClassB {
+public:
+  ClassB(ClassA* parent=0);
+  ~ClassB();
+};
+
+@interface NSObject
+@end
+
+@interface InterfaceA : NSObject
+@property(nonatomic, assign) ClassA *m_prop1; // expected-note {{here}}
+@property(nonatomic, assign) ClassB *m_prop2;
+@end
+
+@implementation InterfaceA
+- (id)test {
+  self.m_prop2 = new ClassB(m_prop1); // expected-error {{use of undeclared identifier 'm_prop1'; did you mean '_m_prop1'?}}
+}
+@end
diff --git a/test/SemaOpenCL/access-qualifier.cl b/test/SemaOpenCL/access-qualifier.cl
new file mode 100644
index 0000000000000..7e5c70f915f64
--- /dev/null
+++ b/test/SemaOpenCL/access-qualifier.cl
@@ -0,0 +1,69 @@
+// RUN: %clang_cc1 -verify -pedantic -fsyntax-only -cl-std=CL1.2 %s
+// RUN: %clang_cc1 -verify -pedantic -fsyntax-only -cl-std=CL2.0 %s
+
+typedef image1d_t img1d_ro_default; // expected-note {{previously declared 'read_only' here}}
+
+typedef write_only image1d_t img1d_wo; // expected-note {{previously declared 'write_only' here}}
+typedef read_only image1d_t img1d_ro;
+
+#if __OPENCL_C_VERSION__ >= 200
+typedef read_write image1d_t img1d_rw;
+#endif
+
+typedef int Int;
+typedef read_only int IntRO; // expected-error {{access qualifier can only be used for pipe and image type}}
+
+
+void myWrite(write_only image1d_t); // expected-note {{passing argument to parameter here}} expected-note {{passing argument to parameter here}}
+void myRead(read_only image1d_t); // expected-note {{passing argument to parameter here}}
+
+#if __OPENCL_C_VERSION__ >= 200
+void myReadWrite(read_write image1d_t);
+#else
+void myReadWrite(read_write image1d_t); // expected-error {{access qualifier 'read_write' can not be used for '__read_write image1d_t' prior to OpenCL version 2.0}}
+#endif
+
+
+kernel void k1(img1d_wo img) {
+  myRead(img); // expected-error {{passing 'img1d_wo' (aka '__write_only image1d_t') to parameter of incompatible type '__read_only image1d_t'}}
+}
+
+kernel void k2(img1d_ro img) {
+  myWrite(img); // expected-error {{passing 'img1d_ro' (aka '__read_only image1d_t') to parameter of incompatible type '__write_only image1d_t'}}
+}
+
+kernel void k3(img1d_wo img) {
+  myWrite(img);
+}
+
+#if __OPENCL_C_VERSION__ >= 200
+kernel void k4(img1d_rw img) {
+  myReadWrite(img);
+}
+#endif
+
+kernel void k5(img1d_ro_default img) {
+  myWrite(img); // expected-error {{passing 'img1d_ro_default' (aka '__read_only image1d_t') to parameter of incompatible type '__write_only image1d_t'}}
+}
+
+kernel void k6(img1d_ro img) {
+  myRead(img);
+}
+
+kernel void k7(read_only img1d_wo img){} // expected-error {{multiple access qualifiers}}
+
+kernel void k8(write_only img1d_ro_default img){} // expected-error {{multiple access qualifiers}}
+
+kernel void k9(read_only int i){} // expected-error{{access qualifier can only be used for pipe and image type}}
+
+kernel void k10(read_only Int img){} // expected-error {{access qualifier can only be used for pipe and image type}}
+
+kernel void k11(read_only write_only image1d_t i){} // expected-error{{multiple access qualifiers}}
+
+kernel void k12(read_only read_only image1d_t i){} // expected-error{{multiple access qualifiers}}
+
+#if __OPENCL_C_VERSION__ >= 200
+kernel void k13(read_write pipe int i){} // expected-error{{access qualifier 'read_write' can not be used for 'pipe int'}}
+#else
+kernel void k13(__read_write image1d_t i){} // expected-error{{access qualifier '__read_write' can not be used for '__read_write image1d_t' prior to OpenCL version 2.0}}
+#endif
diff --git a/test/SemaOpenCL/address-spaces-conversions-cl2.0.cl b/test/SemaOpenCL/address-spaces-conversions-cl2.0.cl
index 50363f23a9004..97fd07a24a24b 100644
--- a/test/SemaOpenCL/address-spaces-conversions-cl2.0.cl
+++ b/test/SemaOpenCL/address-spaces-conversions-cl2.0.cl
@@ -225,3 +225,69 @@ void test_conversion(global int *arg_glob, local int *arg_loc,
 // expected-error@-2{{passing '__constant int *' to parameter of type '__generic int *' changes address space of pointer}}
 #endif
 }
+
+void test_ternary() {
+  AS int *var_cond;
+  generic int *var_gen;
+  global int *var_glob;
+  var_gen = 0 ? var_cond : var_glob;
+#ifdef CONSTANT
+// expected-error@-2{{conditional operator with the second and third operands of type  ('__constant int *' and '__global int *') which are pointers to non-overlapping address spaces}}
+#endif
+
+  local int *var_loc;
+  var_gen = 0 ? var_cond : var_loc;
+#ifndef GENERIC
+// expected-error-re@-2{{conditional operator with the second and third operands of type  ('__{{global|constant}} int *' and '__local int *') which are pointers to non-overlapping address spaces}}
+#endif
+
+  constant int *var_const;
+  var_cond = 0 ? var_cond : var_const;
+#ifndef CONSTANT
+// expected-error-re@-2{{conditional operator with the second and third operands of type  ('__{{global|generic}} int *' and '__constant int *') which are pointers to non-overlapping address spaces}}
+#endif
+
+  private int *var_priv;
+  var_gen = 0 ? var_cond : var_priv;
+#ifndef GENERIC
+// expected-error-re@-2{{conditional operator with the second and third operands of type  ('__{{global|constant}} int *' and 'int *') which are pointers to non-overlapping address spaces}}
+#endif
+
+  var_gen = 0 ? var_cond : var_gen;
+#ifdef CONSTANT
+// expected-error@-2{{conditional operator with the second and third operands of type  ('__constant int *' and '__generic int *') which are pointers to non-overlapping address spaces}}
+#endif
+
+  void *var_void_gen;
+  global char *var_glob_ch;
+  var_void_gen = 0 ? var_cond : var_glob_ch;
+#ifdef CONSTANT
+// expected-error@-2{{conditional operator with the second and third operands of type  ('__constant int *' and '__global char *') which are pointers to non-overlapping address spaces}}
+#endif
+
+  local char *var_loc_ch;
+  var_void_gen = 0 ? var_cond : var_loc_ch;
+#ifndef GENERIC
+// expected-error-re@-2{{conditional operator with the second and third operands of type  ('__{{global|constant}} int *' and '__local char *') which are pointers to non-overlapping address spaces}}
+#endif
+
+  constant void *var_void_const;
+  constant char *var_const_ch;
+  var_void_const = 0 ? var_cond : var_const_ch;
+#ifndef CONSTANT
+// expected-error-re@-2{{conditional operator with the second and third operands of type  ('__{{global|generic}} int *' and '__constant char *') which are pointers to non-overlapping address spaces}}
+#endif
+
+  private char *var_priv_ch;
+  var_void_gen = 0 ? var_cond : var_priv_ch;
+#ifndef GENERIC
+// expected-error-re@-2{{conditional operator with the second and third operands of type  ('__{{global|constant}} int *' and 'char *') which are pointers to non-overlapping address spaces}}
+#endif
+
+  generic char *var_gen_ch;
+  var_void_gen = 0 ? var_cond : var_gen_ch;
+#ifdef CONSTANT
+// expected-error@-2{{conditional operator with the second and third operands of type  ('__constant int *' and '__generic char *') which are pointers to non-overlapping address spaces}}
+#endif
+}
+
diff --git a/test/SemaOpenCL/as_type.cl b/test/SemaOpenCL/as_type.cl
new file mode 100644
index 0000000000000..f0bf4d7daef2a
--- /dev/null
+++ b/test/SemaOpenCL/as_type.cl
@@ -0,0 +1,13 @@
+// RUN: %clang_cc1 %s -emit-llvm -triple spir-unknown-unknown -o - -verify -fsyntax-only
+
+typedef __attribute__(( ext_vector_type(3) )) char char3;
+typedef __attribute__(( ext_vector_type(16) )) char char16;
+
+char3 f1(char16 x) {
+  return  __builtin_astype(x, char3); // expected-error{{invalid reinterpretation: sizes of 'char3' (vector of 3 'char' values) and 'char16' (vector of 16 'char' values) must match}}
+}
+
+char16 f3(int x) {
+  return __builtin_astype(x, char16); // expected-error{{invalid reinterpretation: sizes of 'char16' (vector of 16 'char' values) and 'int' must match}}
+}
+
diff --git a/test/SemaOpenCL/bool-vectors.cl b/test/SemaOpenCL/bool-vectors.cl
new file mode 100644
index 0000000000000..6df4d56530037
--- /dev/null
+++ b/test/SemaOpenCL/bool-vectors.cl
@@ -0,0 +1,3 @@
+// RUN: %clang_cc1 %s -verify -pedantic -fsyntax-only
+
+typedef __attribute__((ext_vector_type(16))) _Bool bool8; // expected-error{{invalid vector element type 'bool'}}
diff --git a/test/SemaOpenCL/builtin.cl b/test/SemaOpenCL/builtin.cl
new file mode 100644
index 0000000000000..d48a0c449591a
--- /dev/null
+++ b/test/SemaOpenCL/builtin.cl
@@ -0,0 +1,14 @@
+// RUN: %clang_cc1 %s -verify -pedantic -fsyntax-only
+
+// expected-no-diagnostics
+
+float __attribute__((overloadable)) acos(float);
+
+typedef float float4 __attribute__((ext_vector_type(4)));
+int printf(__constant const char* st, ...);
+
+void test(void)
+{
+  float4 a;
+  printf("%8.4v4hlf\n", a);
+}
diff --git a/test/SemaOpenCL/cl20-device-side-enqueue.cl b/test/SemaOpenCL/cl20-device-side-enqueue.cl
new file mode 100644
index 0000000000000..298b8109881d8
--- /dev/null
+++ b/test/SemaOpenCL/cl20-device-side-enqueue.cl
@@ -0,0 +1,172 @@
+// RUN: %clang_cc1 %s -cl-std=CL2.0 -verify -pedantic -fsyntax-only
+// RUN: %clang_cc1 %s -cl-std=CL2.0 -verify -pedantic -fsyntax-only -Wconversion -DWCONV
+
+// Diagnostic tests for different overloads of enqueue_kernel from Table 6.13.17.1 of OpenCL 2.0 Spec.
+kernel void enqueue_kernel_tests() {
+  queue_t default_queue;
+  unsigned flags = 0;
+  ndrange_t ndrange;
+  clk_event_t evt;
+  clk_event_t event_wait_list;
+  clk_event_t event_wait_list2[] = {evt, evt};
+  void *vptr;
+
+  // Testing the first overload type
+  enqueue_kernel(default_queue, flags, ndrange, ^(void) {
+    return 0;
+  });
+
+  enqueue_kernel(vptr, flags, ndrange, ^(void) { // expected-error{{illegal call to enqueue_kernel, expected 'queue_t' argument type}}
+    return 0;
+  });
+
+  enqueue_kernel(default_queue, vptr, ndrange, ^(void) { // expected-error{{illegal call to enqueue_kernel, expected 'kernel_enqueue_flags_t' (i.e. uint) argument type}}
+    return 0;
+  });
+
+  enqueue_kernel(default_queue, flags, vptr, ^(void) { // expected-error{{illegal call to enqueue_kernel, expected 'ndrange_t' argument type}}
+    return 0;
+  });
+
+  enqueue_kernel(default_queue, flags, ndrange, vptr); // expected-error{{illegal call to enqueue_kernel, expected block argument}}
+
+  enqueue_kernel(default_queue, flags, ndrange, ^(int i) { // expected-error{{blocks in this form of device side enqueue call are expected to have have no parameters}}
+    return 0;
+  });
+
+  // Testing the second overload type
+  enqueue_kernel(default_queue, flags, ndrange, 1, &event_wait_list, &evt, ^(void) {
+    return 0;
+  });
+
+  enqueue_kernel(default_queue, flags, ndrange, 1, vptr, &evt, ^(void) // expected-error{{illegal call to enqueue_kernel, expected 'clk_event_t *' argument type}}
+                                                               {
+                                                                 return 0;
+                                                               });
+
+  enqueue_kernel(default_queue, flags, ndrange, 1, &event_wait_list, vptr, ^(void) // expected-error{{illegal call to enqueue_kernel, expected 'clk_event_t *' argument type}}
+                                                                           {
+                                                                             return 0;
+                                                                           });
+
+  enqueue_kernel(default_queue, flags, ndrange, 1, &event_wait_list, &evt, vptr); // expected-error{{illegal call to enqueue_kernel, expected block argument}}
+
+  // Testing the third overload type
+  enqueue_kernel(default_queue, flags, ndrange,
+                 ^(local void *a, local void *b) {
+                   return 0;
+                 },
+                 1024, 1024);
+
+  enqueue_kernel(default_queue, flags, ndrange,
+                 ^(local void *a, local void *b) {
+                   return 0;
+                 },
+                 1024, 1024L); // expected-error{{local memory sizes need to be specified as uint}}
+
+  char c;
+  enqueue_kernel(default_queue, flags, ndrange,
+                 ^(local void *a, local void *b) {
+                   return 0;
+                 },
+                 c, 1024);
+#ifdef WCONV
+// expected-warning@-2{{implicit conversion changes signedness: 'char' to 'unsigned int'}}
+#endif
+
+  typedef void (^bl_A_t)(local void *);
+
+  const bl_A_t block_A = (bl_A_t) ^ (local void *a) {};
+
+  enqueue_kernel(default_queue, flags, ndrange, block_A, 1024);
+
+  typedef void (^bl_B_t)(local void *, local int *);
+
+  const bl_B_t block_B = (bl_B_t) ^ (local void *a, local int *b) {};
+
+  enqueue_kernel(default_queue, flags, ndrange, block_B, 1024, 1024); // expected-error{{blocks used in device side enqueue are expected to have parameters of type 'local void*'}}
+
+  enqueue_kernel(default_queue, flags, ndrange, // expected-error{{mismatch in number of block parameters and local size arguments passed}}
+                 ^(local void *a, local void *b) {
+                   return 0;
+                 },
+                 1024);
+
+  float illegal_mem_size = (float)0.5f;
+  enqueue_kernel(default_queue, flags, ndrange,
+                 ^(local void *a, local void *b) {
+                   return 0;
+                 },
+                 illegal_mem_size, illegal_mem_size); // expected-error{{local memory sizes need to be specified as uint}} expected-error{{local memory sizes need to be specified as uint}}
+#ifdef WCONV
+// expected-warning@-2{{implicit conversion turns floating-point number into integer: 'float' to 'unsigned int'}} expected-warning@-2{{implicit conversion turns floating-point number into integer: 'float' to 'unsigned int'}}
+#endif
+
+  // Testing the forth overload type
+  enqueue_kernel(default_queue, flags, ndrange, 1, event_wait_list2, &evt,
+                 ^(local void *a, local void *b) {
+                   return 0;
+                 },
+                 1024, 1024);
+
+  enqueue_kernel(default_queue, flags, ndrange, 1, &event_wait_list, &evt, // expected-error{{mismatch in number of block parameters and local size arguments passed}}
+                 ^(local void *a, local void *b) {
+                   return 0;
+                 },
+                 1024, 1024, 1024);
+
+  // More random misc cases that can't be deduced
+  enqueue_kernel(default_queue, flags, ndrange, 1, &event_wait_list, &evt); // expected-error{{illegal call to enqueue_kernel, incorrect argument types}}
+
+  enqueue_kernel(default_queue, flags, ndrange, 1, 1); // expected-error{{illegal call to enqueue_kernel, incorrect argument types}}
+}
+
+// Diagnostic tests for get_kernel_work_group_size and allowed block parameter types in dynamic parallelism.
+kernel void work_group_size_tests() {
+  void (^const block_A)(void) = ^{
+    return;
+  };
+  void (^const block_B)(int) = ^(int a) {
+    return;
+  };
+  void (^const block_C)(local void *) = ^(local void *a) {
+    return;
+  };
+  void (^const block_D)(local int *) = ^(local int *a) {
+    return;
+  };
+
+  unsigned size = get_kernel_work_group_size(block_A);
+  size = get_kernel_work_group_size(block_C);
+  size = get_kernel_work_group_size(^(local void *a) {
+    return;
+  });
+  size = get_kernel_work_group_size(^(local int *a) { // expected-error {{blocks used in device side enqueue are expected to have parameters of type 'local void*'}}
+    return;
+  });
+  size = get_kernel_work_group_size(block_B);   // expected-error {{blocks used in device side enqueue are expected to have parameters of type 'local void*'}}
+  size = get_kernel_work_group_size(block_D);   // expected-error {{blocks used in device side enqueue are expected to have parameters of type 'local void*'}}
+  size = get_kernel_work_group_size(^(int a) {  // expected-error {{blocks used in device side enqueue are expected to have parameters of type 'local void*'}}
+    return;
+  });
+  size = get_kernel_work_group_size();          // expected-error {{too few arguments to function call, expected 1, have 0}}
+  size = get_kernel_work_group_size(1);         // expected-error{{expected block argument}}
+  size = get_kernel_work_group_size(block_A, 1); // expected-error{{too many arguments to function call, expected 1, have 2}}
+
+  size = get_kernel_preferred_work_group_size_multiple(block_A);
+  size = get_kernel_preferred_work_group_size_multiple(block_C);
+  size = get_kernel_preferred_work_group_size_multiple(^(local void *a) {
+    return;
+  });
+  size = get_kernel_preferred_work_group_size_multiple(^(local int *a) { // expected-error {{blocks used in device side enqueue are expected to have parameters of type 'local void*'}}
+    return;
+  });
+  size = get_kernel_preferred_work_group_size_multiple(^(int a) {  // expected-error {{blocks used in device side enqueue are expected to have parameters of type 'local void*'}}
+    return;
+  });
+  size = get_kernel_preferred_work_group_size_multiple(block_B);   // expected-error {{blocks used in device side enqueue are expected to have parameters of type 'local void*'}}
+  size = get_kernel_preferred_work_group_size_multiple(block_D);   // expected-error {{blocks used in device side enqueue are expected to have parameters of type 'local void*'}}
+  size = get_kernel_preferred_work_group_size_multiple();          // expected-error {{too few arguments to function call, expected 1, have 0}}
+  size = get_kernel_preferred_work_group_size_multiple(1);         // expected-error{{expected block argument}}
+  size = get_kernel_preferred_work_group_size_multiple(block_A, 1); // expected-error{{too many arguments to function call, expected 1, have 2}}
+}
diff --git a/test/SemaOpenCL/clang-builtin-version.cl b/test/SemaOpenCL/clang-builtin-version.cl
new file mode 100644
index 0000000000000..8574682ab93b4
--- /dev/null
+++ b/test/SemaOpenCL/clang-builtin-version.cl
@@ -0,0 +1,44 @@
+// RUN: %clang_cc1 %s -fblocks -verify -pedantic -fsyntax-only -ferror-limit 100
+
+// Confirm CL2.0 Clang builtins are not available in earlier versions
+
+kernel void dse_builtins() {
+  int tmp;
+  enqueue_kernel(tmp, tmp, tmp, ^(void) { // expected-warning{{implicit declaration of function 'enqueue_kernel' is invalid in C99}}
+    return;
+  });
+  unsigned size = get_kernel_work_group_size(^(void) { // expected-warning{{implicit declaration of function 'get_kernel_work_group_size' is invalid in C99}}
+    return;
+  });
+  size = get_kernel_preferred_work_group_size_multiple(^(void) { // expected-warning{{implicit declaration of function 'get_kernel_preferred_work_group_size_multiple' is invalid in C99}}
+    return;
+  });
+}
+
+void pipe_builtins() {
+  int tmp;
+
+  read_pipe(tmp, tmp);  // expected-warning{{implicit declaration of function 'read_pipe' is invalid in C99}}
+  write_pipe(tmp, tmp); // expected-warning{{implicit declaration of function 'write_pipe' is invalid in C99}}
+
+  reserve_read_pipe(tmp, tmp);  // expected-warning{{implicit declaration of function 'reserve_read_pipe' is invalid in C99}}
+  reserve_write_pipe(tmp, tmp); // expected-warning{{implicit declaration of function 'reserve_write_pipe' is invalid in C99}}
+
+  work_group_reserve_read_pipe(tmp, tmp);  // expected-warning{{implicit declaration of function 'work_group_reserve_read_pipe' is invalid in C99}}
+  work_group_reserve_write_pipe(tmp, tmp); // expected-warning{{implicit declaration of function 'work_group_reserve_write_pipe' is invalid in C99}}
+
+  sub_group_reserve_write_pipe(tmp, tmp); // expected-warning{{implicit declaration of function 'sub_group_reserve_write_pipe' is invalid in C99}}
+  sub_group_reserve_read_pipe(tmp, tmp);  // expected-warning{{implicit declaration of function 'sub_group_reserve_read_pipe' is invalid in C99}}
+
+  commit_read_pipe(tmp, tmp);  // expected-warning{{implicit declaration of function 'commit_read_pipe' is invalid in C99}}
+  commit_write_pipe(tmp, tmp); // expected-warning{{implicit declaration of function 'commit_write_pipe' is invalid in C99}}
+
+  work_group_commit_read_pipe(tmp, tmp);  // expected-warning{{implicit declaration of function 'work_group_commit_read_pipe' is invalid in C99}}
+  work_group_commit_write_pipe(tmp, tmp); // expected-warning{{implicit declaration of function 'work_group_commit_write_pipe' is invalid in C99}}
+
+  sub_group_commit_write_pipe(tmp, tmp); // expected-warning{{implicit declaration of function 'sub_group_commit_write_pipe' is invalid in C99}}
+  sub_group_commit_read_pipe(tmp, tmp);  // expected-warning{{implicit declaration of function 'sub_group_commit_read_pipe' is invalid in C99}}
+
+  get_pipe_num_packets(tmp); // expected-warning{{implicit declaration of function 'get_pipe_num_packets' is invalid in C99}}
+  get_pipe_max_packets(tmp); // expected-warning{{implicit declaration of function 'get_pipe_max_packets' is invalid in C99}}
+}
diff --git a/test/SemaOpenCL/event_t.cl b/test/SemaOpenCL/event_t.cl
index e09883948cc6d..990c06340942c 100644
--- a/test/SemaOpenCL/event_t.cl
+++ b/test/SemaOpenCL/event_t.cl
@@ -3,7 +3,7 @@
 event_t glb_evt; // expected-error {{the event_t type cannot be used to declare a program scope variable}}
 
 constant struct evt_s {
-  event_t evt;  // expected-error {{the event_t type cannot be used to declare a structure or union field}}
+  event_t evt; // expected-error {{the 'event_t' type cannot be used to declare a structure or union field}}
 } evt_str = {0};
 
 void foo(event_t evt); // expected-note {{passing argument to parameter 'evt' here}}
@@ -14,5 +14,6 @@ void kernel ker(event_t argevt) { // expected-error {{'event_t' cannot be used a
   foo(e);
   foo(0);
   foo(5); // expected-error {{passing 'int' to parameter of incompatible type 'event_t'}}
+  foo((event_t)1); // expected-error {{cannot cast non-zero value '1' to 'event_t'}}
 }
 
diff --git a/test/SemaOpenCL/extension-fp64-cl1.1.cl b/test/SemaOpenCL/extension-fp64-cl1.1.cl
deleted file mode 100644
index 7e852ae70ebbd..0000000000000
--- a/test/SemaOpenCL/extension-fp64-cl1.1.cl
+++ /dev/null
@@ -1,19 +0,0 @@
-// RUN: %clang_cc1 %s -verify -pedantic -fsyntax-only -cl-std=CL1.1
-
-void f1(double da) { // expected-error {{type 'double' requires cl_khr_fp64 extension}}
-  double d; // expected-error {{type 'double' requires cl_khr_fp64 extension}}
-  (void) 1.0; // expected-warning {{double precision constant requires cl_khr_fp64}}
-}
-
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-
-void f2(void) {
-  double d;
-  (void) 1.0;
-}
-
-#pragma OPENCL EXTENSION cl_khr_fp64 : disable
-
-void f3(void) {
-  double d; // expected-error {{type 'double' requires cl_khr_fp64 extension}}
-}
diff --git a/test/SemaOpenCL/extension-fp64.cl b/test/SemaOpenCL/extension-fp64.cl
deleted file mode 100644
index e0c2b1ea4b53d..0000000000000
--- a/test/SemaOpenCL/extension-fp64.cl
+++ /dev/null
@@ -1,19 +0,0 @@
-// RUN: %clang_cc1 %s -verify -pedantic -fsyntax-only
-
-void f1(double da) { // expected-error {{type 'double' requires cl_khr_fp64 extension}}
-  double d; // expected-error {{type 'double' requires cl_khr_fp64 extension}}
-  (void) 1.0; // expected-warning {{double precision constant requires cl_khr_fp64}}
-}
-
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-
-void f2(void) {
-  double d;
-  (void) 1.0;
-}
-
-#pragma OPENCL EXTENSION cl_khr_fp64 : disable
-
-void f3(void) {
-  double d; // expected-error {{type 'double' requires cl_khr_fp64 extension}}
-}
diff --git a/test/SemaOpenCL/extension-version.cl b/test/SemaOpenCL/extension-version.cl
new file mode 100644
index 0000000000000..ae403aa1d1935
--- /dev/null
+++ b/test/SemaOpenCL/extension-version.cl
@@ -0,0 +1,249 @@
+// RUN: %clang_cc1 -x cl -cl-std=CL %s -verify -triple spir-unknown-unknown
+// RUN: %clang_cc1 -x cl -cl-std=CL1.1 %s -verify -triple spir-unknown-unknown
+// RUN: %clang_cc1 -x cl -cl-std=CL1.2 %s -verify -triple spir-unknown-unknown
+// RUN: %clang_cc1 -x cl -cl-std=CL2.0 %s -verify -triple spir-unknown-unknown
+// RUN: %clang_cc1 -x cl -cl-std=CL %s -verify -triple spir-unknown-unknown -Wpedantic-core-features -DTEST_CORE_FEATURES
+// RUN: %clang_cc1 -x cl -cl-std=CL1.1 %s -verify -triple spir-unknown-unknown -Wpedantic-core-features -DTEST_CORE_FEATURES
+// RUN: %clang_cc1 -x cl -cl-std=CL1.2 %s -verify -triple spir-unknown-unknown -Wpedantic-core-features -DTEST_CORE_FEATURES
+// RUN: %clang_cc1 -x cl -cl-std=CL2.0 %s -verify -triple spir-unknown-unknown -Wpedantic-core-features -DTEST_CORE_FEATURES
+
+#if __OPENCL_C_VERSION__ >= 200 && ! defined TEST_CORE_FEATURES
+// expected-no-diagnostics
+#endif
+
+// Extensions in all versions
+#ifndef cl_clang_storage_class_specifiers
+#error "Missing cl_clang_storage_class_specifiers define"
+#endif
+#pragma OPENCL EXTENSION cl_clang_storage_class_specifiers: enable
+
+#ifndef cl_khr_fp16
+#error "Missing cl_khr_fp16 define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_fp16: enable
+
+#ifndef cl_khr_int64_base_atomics
+#error "Missing cl_khr_int64_base_atomics define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics: enable
+
+#ifndef cl_khr_int64_extended_atomics
+#error "Missing cl_khr_int64_extended_atomics define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics: enable
+
+#ifndef cl_khr_gl_sharing
+#error "Missing cl_khr_gl_sharing define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_gl_sharing: enable
+
+#ifndef cl_khr_icd
+#error "Missing cl_khr_icd define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_icd: enable
+
+// Core features in CL 1.1
+
+#ifndef cl_khr_byte_addressable_store
+#error "Missing cl_khr_byte_addressable_store define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_byte_addressable_store: enable
+#if (__OPENCL_C_VERSION__ >= 110) && defined TEST_CORE_FEATURES
+// expected-warning@-2{{OpenCL extension 'cl_khr_byte_addressable_store' is core feature or supported optional core feature - ignoring}}
+#endif
+
+#ifndef cl_khr_global_int32_base_atomics
+#error "Missing cl_khr_global_int32_base_atomics define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics: enable
+#if (__OPENCL_C_VERSION__ >= 110) && defined TEST_CORE_FEATURES
+// expected-warning@-2{{OpenCL extension 'cl_khr_global_int32_base_atomics' is core feature or supported optional core feature - ignoring}}
+#endif
+
+#ifndef cl_khr_global_int32_extended_atomics
+#error "Missing cl_khr_global_int32_extended_atomics define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics: enable
+#if (__OPENCL_C_VERSION__ >= 110) && defined TEST_CORE_FEATURES
+// expected-warning@-2{{OpenCL extension 'cl_khr_global_int32_extended_atomics' is core feature or supported optional core feature - ignoring}}
+#endif
+
+#ifndef cl_khr_local_int32_base_atomics
+#error "Missing cl_khr_local_int32_base_atomics define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics: enable
+#if (__OPENCL_C_VERSION__ >= 110) && defined TEST_CORE_FEATURES
+// expected-warning@-2{{OpenCL extension 'cl_khr_local_int32_base_atomics' is core feature or supported optional core feature - ignoring}}
+#endif
+
+#ifndef cl_khr_local_int32_extended_atomics
+#error "Missing cl_khr_local_int32_extended_atomics define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics: enable
+#if (__OPENCL_C_VERSION__ >= 110) && defined TEST_CORE_FEATURES
+// expected-warning@-2{{OpenCL extension 'cl_khr_local_int32_extended_atomics' is core feature or supported optional core feature - ignoring}}
+#endif
+
+#if (__OPENCL_C_VERSION__ < 110)
+// Deprecated abvoe 1.0
+#ifndef cl_khr_select_fprounding_mode
+#error "Missing cl_khr_select_fp_rounding_mode define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_select_fprounding_mode: enable
+#endif
+
+
+// Core feature in CL 1.2
+#ifndef cl_khr_fp64
+#error "Missing cl_khr_fp64 define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_fp64: enable
+#if (__OPENCL_C_VERSION__ >= 120) && defined TEST_CORE_FEATURES
+// expected-warning@-2{{OpenCL extension 'cl_khr_fp64' is core feature or supported optional core feature - ignoring}}
+#endif
+
+//Core feature in CL 2.0
+#ifndef cl_khr_3d_image_writes
+#error "Missing cl_khr_3d_image_writes define"
+#endif
+#pragma OPENCL EXTENSION cl_khr_3d_image_writes: enable
+#if (__OPENCL_C_VERSION__ >= 200) && defined TEST_CORE_FEATURES
+// expected-warning@-2{{OpenCL extension 'cl_khr_3d_image_writes' is core feature or supported optional core feature - ignoring}}
+#endif
+
+
+
+#if (__OPENCL_C_VERSION__ >= 110)
+#ifndef cl_khr_gl_event
+#error "Missing cl_khr_gl_event define"
+#endif
+#else
+// expected-warning@+2{{unsupported OpenCL extension 'cl_khr_gl_event' - ignoring}}
+#endif
+#pragma OPENCL EXTENSION cl_khr_gl_event: enable
+
+#if (__OPENCL_C_VERSION__ >= 110)
+#ifndef cl_khr_d3d10_sharing
+#error "Missing cl_khr_d3d10_sharing define"
+#endif
+#else
+// expected-warning@+2{{unsupported OpenCL extension 'cl_khr_d3d10_sharing' - ignoring}}
+#endif
+#pragma OPENCL EXTENSION cl_khr_d3d10_sharing: enable
+
+#if (__OPENCL_C_VERSION__ >= 120)
+#ifndef cl_khr_context_abort
+#error "Missing cl_context_abort define"
+#endif
+#else
+// expected-warning@+2{{unsupported OpenCL extension 'cl_khr_context_abort' - ignoring}}
+#endif
+#pragma OPENCL EXTENSION cl_khr_context_abort: enable
+
+#if (__OPENCL_C_VERSION__ >= 120)
+#ifndef cl_khr_d3d11_sharing
+#error "Missing cl_khr_d3d11_sharing define"
+#endif
+#else
+// expected-warning@+2{{unsupported OpenCL extension 'cl_khr_d3d11_sharing' - ignoring}}
+#endif
+#pragma OPENCL EXTENSION cl_khr_d3d11_sharing: enable
+
+#if (__OPENCL_C_VERSION__ >= 120)
+#ifndef cl_khr_dx9_media_sharing
+#error "Missing cl_khr_dx9_media_sharing define"
+#endif
+#else
+// expected-warning@+2{{unsupported OpenCL extension 'cl_khr_dx9_media_sharing' - ignoring}}
+#endif
+#pragma OPENCL EXTENSION cl_khr_dx9_media_sharing: enable
+
+#if (__OPENCL_C_VERSION__ >= 120)
+#ifndef cl_khr_image2d_from_buffer
+#error "Missing cl_khr_image2d_from_buffer define"
+#endif
+#else
+// expected-warning@+2{{unsupported OpenCL extension 'cl_khr_image2d_from_buffer' - ignoring}}
+#endif
+#pragma OPENCL EXTENSION cl_khr_image2d_from_buffer: enable
+
+#if (__OPENCL_C_VERSION__ >= 120)
+#ifndef cl_khr_initialize_memory
+#error "Missing cl_khr_initialize_memory define"
+#endif
+#else
+// expected-warning@+2{{unsupported OpenCL extension 'cl_khr_initialize_memory' - ignoring}}
+#endif
+#pragma OPENCL EXTENSION cl_khr_initialize_memory: enable
+
+#if (__OPENCL_C_VERSION__ >= 120)
+#ifndef cl_khr_gl_depth_images
+#error "Missing cl_khr_gl_depth_images define"
+#endif
+#else
+// expected-warning@+2{{unsupported OpenCL extension 'cl_khr_gl_depth_images' - ignoring}}
+#endif
+#pragma OPENCL EXTENSION cl_khr_gl_depth_images: enable
+
+#if (__OPENCL_C_VERSION__ >= 120)
+#ifndef cl_khr_gl_msaa_sharing
+#error "Missing cl_khr_gl_msaa_sharing define"
+#endif
+#else
+// expected-warning@+2{{unsupported OpenCL extension 'cl_khr_gl_msaa_sharing' - ignoring}}
+#endif
+#pragma OPENCL EXTENSION cl_khr_gl_msaa_sharing: enable
+
+#if (__OPENCL_C_VERSION__ >= 120)
+#ifndef cl_khr_spir
+#error "Missing cl_khr_spir define"
+#endif
+#else
+// expected-warning@+2{{unsupported OpenCL extension 'cl_khr_spir' - ignoring}}
+#endif
+#pragma OPENCL EXTENSION cl_khr_spir: enable
+
+#if (__OPENCL_C_VERSION__ >= 200)
+#ifndef cl_khr_egl_event
+#error "Missing cl_khr_egl_event define"
+#endif
+#else
+// expected-warning@+2{{unsupported OpenCL extension 'cl_khr_egl_event' - ignoring}}
+#endif
+#pragma OPENCL EXTENSION cl_khr_egl_event: enable
+
+#if (__OPENCL_C_VERSION__ >= 200)
+#ifndef cl_khr_egl_image
+#error "Missing cl_khr_egl_image define"
+#endif
+#else
+// expected-warning@+2{{unsupported OpenCL extension 'cl_khr_egl_image' - ignoring}}
+#endif
+#pragma OPENCL EXTENSION cl_khr_egl_image: enable
+
+#if (__OPENCL_C_VERSION__ >= 200)
+#ifndef cl_khr_srgb_image_writes
+#error "Missing cl_khr_srgb_image_writes define"
+#endif
+#else
+// expected-warning@+2{{unsupported OpenCL extension 'cl_khr_srgb_image_writes' - ignoring}}
+#endif
+#pragma OPENCL EXTENSION cl_khr_srgb_image_writes: enable
+
+#if (__OPENCL_C_VERSION__ >= 200)
+#ifndef cl_khr_subgroups
+#error "Missing cl_khr_subgroups define"
+#endif
+#else
+// expected-warning@+2{{unsupported OpenCL extension 'cl_khr_subgroups' - ignoring}}
+#endif
+#pragma OPENCL EXTENSION cl_khr_subgroups: enable
+
+#if (__OPENCL_C_VERSION__ >= 200)
+#ifndef cl_khr_terminate_context
+#error "Missing cl_khr_terminate_context define"
+#endif
+#else
+// expected-warning@+2{{unsupported OpenCL extension 'cl_khr_terminate_context' - ignoring}}
+#endif
+#pragma OPENCL EXTENSION cl_khr_terminate_context: enable
diff --git a/test/SemaOpenCL/extensions.cl b/test/SemaOpenCL/extensions.cl
new file mode 100644
index 0000000000000..31224e0df7745
--- /dev/null
+++ b/test/SemaOpenCL/extensions.cl
@@ -0,0 +1,36 @@
+// RUN: %clang_cc1 %s -triple spir-unknown-unknown -verify -pedantic -fsyntax-only
+// RUN: %clang_cc1 %s -triple spir-unknown-unknown -verify -pedantic -fsyntax-only -cl-std=CL1.1
+
+// Test with a target not supporting fp64.
+// RUN: %clang_cc1 %s -triple r600-unknown-unknown -target-cpu r600 -verify -pedantic -fsyntax-only -DNOFP64
+
+void f1(double da) { // expected-error {{type 'double' requires cl_khr_fp64 extension}}
+  double d; // expected-error {{type 'double' requires cl_khr_fp64 extension}}
+  (void) 1.0; // expected-warning {{double precision constant requires cl_khr_fp64}}
+}
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#ifdef NOFP64
+// expected-warning@-2{{unsupported OpenCL extension 'cl_khr_fp64' - ignoring}}
+#endif
+
+void f2(void) {
+  double d;
+#ifdef NOFP64
+// expected-error@-2{{use of type 'double' requires cl_khr_fp64 extension to be enabled}}
+#endif
+
+  (void) 1.0;
+#ifdef NOFP64
+// expected-warning@-2{{double precision constant requires cl_khr_fp64, casting to single precision}}
+#endif
+}
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : disable
+#ifdef NOFP64
+// expected-warning@-2{{unsupported OpenCL extension 'cl_khr_fp64' - ignoring}}
+#endif
+
+void f3(void) {
+  double d; // expected-error {{type 'double' requires cl_khr_fp64 extension}}
+}
diff --git a/test/SemaOpenCL/extern.cl b/test/SemaOpenCL/extern.cl
index b2e4857e28daa..5f1f9f80b0fae 100644
--- a/test/SemaOpenCL/extern.cl
+++ b/test/SemaOpenCL/extern.cl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -x cl -cl-std=CL1.2 -emit-llvm -ffake-address-space-map %s -o - -verify | FileCheck %s
+// RUN: %clang_cc1 -x cl -cl-opt-disable -cl-std=CL1.2 -emit-llvm -ffake-address-space-map %s -o - -verify | FileCheck %s
 // expected-no-diagnostics
 
 // CHECK: @foo = external addrspace(3) constant float
diff --git a/test/SemaOpenCL/half.cl b/test/SemaOpenCL/half.cl
index 11abf64633bec..dd7bb9ab8c5d4 100644
--- a/test/SemaOpenCL/half.cl
+++ b/test/SemaOpenCL/half.cl
@@ -1,6 +1,7 @@
-// RUN: %clang_cc1 %s -verify -pedantic -fsyntax-only -Wno-unused-value
+// RUN: %clang_cc1 %s -verify -pedantic -fsyntax-only -Wno-unused-value -triple spir-unknown-unknown
 
 #pragma OPENCL EXTENSION cl_khr_fp16 : disable
+constant float f = 1.0h; // expected-error{{half precision constant requires cl_khr_fp16}}
 
 half half_disabled(half *p, // expected-error{{declaring function return value of type 'half' is not allowed}}
                    half h)  // expected-error{{declaring function parameter of type 'half' is not allowed}}
@@ -12,6 +13,8 @@ half half_disabled(half *p, // expected-error{{declaring function return value o
 
   float c = 1.0f;
   b = (half) c;  // expected-error{{casting to type 'half' is not allowed}}
+  c = (float) 1.0h;  // expected-error{{half precision constant requires cl_khr_fp16}}
+  b = 1.0h; // expected-error{{half precision constant requires cl_khr_fp16}}
 
   half *allowed = &p[1];
   half *allowed2 = &*p;
@@ -22,6 +25,7 @@ half half_disabled(half *p, // expected-error{{declaring function return value o
 
 // Exactly the same as above but with the cl_khr_fp16 extension enabled.
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
+constant half a = 1.0h;
 half half_enabled(half *p, half h)
 {
   half a[2];
@@ -31,6 +35,8 @@ half half_enabled(half *p, half h)
 
   float c = 1.0f;
   b = (half) c;
+  c = (float) 1.0h;
+  b = 1.0h;
 
   half *allowed = &p[1];
   half *allowed2 = &*p;
diff --git a/test/SemaOpenCL/images.cl b/test/SemaOpenCL/images.cl
new file mode 100644
index 0000000000000..f963de4e1359d
--- /dev/null
+++ b/test/SemaOpenCL/images.cl
@@ -0,0 +1,9 @@
+// RUN: %clang_cc1 %s -verify -pedantic -fsyntax-only
+
+void img2d_ro(__read_only image2d_t img) {} // expected-note{{passing argument to parameter 'img' here}} expected-note{{passing argument to parameter 'img' here}}
+
+void imgage_access_test(image2d_t img2dro, write_only image2d_t img2dwo, image3d_t img3dro) {
+  img2d_ro(img2dro);
+  img2d_ro(img2dwo); // expected-error{{passing '__write_only image2d_t' to parameter of incompatible type '__read_only image2d_t'}}
+  img2d_ro(img3dro); // expected-error{{passing '__read_only image3d_t' to parameter of incompatible type '__read_only image2d_t'}}
+}
diff --git a/test/SemaOpenCL/invalid-block.cl b/test/SemaOpenCL/invalid-block.cl
new file mode 100644
index 0000000000000..6721d0ea234a0
--- /dev/null
+++ b/test/SemaOpenCL/invalid-block.cl
@@ -0,0 +1,53 @@
+// RUN: %clang_cc1 -verify -fblocks -cl-std=CL2.0 %s
+
+// OpenCL v2.0 s6.12.5
+void f0(int (^const bl)());
+// All blocks declarations must be const qualified and initialized.
+void f1() {
+  int (^bl1)() = ^() {return 1;};
+  int (^const bl2)() = ^(){return 1;};
+  f0(bl1);
+  f0(bl2);
+  bl1 = bl2; // expected-error{{invalid operands to binary expression ('int (^const)()' and 'int (^const)()')}}
+  int (^const bl3)(); // expected-error{{invalid block variable declaration - must be initialized}}
+}
+
+// A block with extern storage class is not allowed.
+extern int (^bl)() = ^(){return 1;}; // expected-error{{invalid block variable declaration - using 'extern' storage class is disallowed}}
+void f2() {
+  extern int (^bl)() = ^(){return 1;}; // expected-error{{invalid block variable declaration - using 'extern' storage class is disallowed}}
+}
+
+// A block cannot be the return value of a function.
+typedef int (^bl_t)(void);
+bl_t f3(bl_t bl); // expected-error{{declaring function return value of type 'bl_t' (aka 'int (^const)(void)') is not allowed}}
+
+struct bl_s {
+  int (^bl)(void); // expected-error {{the 'int (^const)(void)' type cannot be used to declare a structure or union field}}
+};
+
+void f4() {
+  __block int a = 10; // expected-error {{the __block storage type is not permitted}}
+}
+
+// A block with variadic argument is not allowed.
+int (^bl)(int, ...) = ^int(int I, ...) { // expected-error {{invalid block prototype, variadic arguments are not allowed in OpenCL}}
+  return 0;
+};
+
+// A block can't be used to declare an array
+typedef int (^bl1_t)(int);
+void f5(int i) {
+  bl1_t bl1 = ^(int i) {return 1;};
+  bl1_t bl2 = ^(int i) {return 2;};
+  bl1_t arr[] = {bl1, bl2}; // expected-error {{array of 'bl1_t' (aka 'int (^const)(int)') type is invalid in OpenCL}}
+  int tmp = i ? bl1(i)      // expected-error {{block type cannot be used as expression in ternary expression in OpenCL}}
+              : bl2(i);     // expected-error {{block type cannot be used as expression in ternary expression in OpenCL}}
+}
+// A block pointer type and all pointer operations are disallowed
+void f6(bl1_t * bl_ptr) { // expected-error{{pointer to type '__generic bl1_t' (aka 'int (^const __generic)(int)') is invalid in OpenCL}}
+  bl1_t bl = ^(int i) {return 1;};
+  bl1_t *p; // expected-error {{pointer to type '__generic bl1_t' (aka 'int (^const __generic)(int)') is invalid in OpenCL}}
+  *bl;  // expected-error {{invalid argument type 'bl1_t' (aka 'int (^const)(int)') to unary expression}}
+  &bl; // expected-error {{invalid argument type 'bl1_t' (aka 'int (^const)(int)') to unary expression}}
+}
diff --git a/test/SemaOpenCL/invalid-image.cl b/test/SemaOpenCL/invalid-image.cl
new file mode 100644
index 0000000000000..d15746fbab911
--- /dev/null
+++ b/test/SemaOpenCL/invalid-image.cl
@@ -0,0 +1,14 @@
+// RUN: %clang_cc1 -verify %s
+
+void test1(image1d_t *i) {} // expected-error{{pointer to type '__read_only image1d_t' is invalid in OpenCL}}
+
+void test2(image1d_t i) {
+  image1d_t ti;            // expected-error{{type '__read_only image1d_t' can only be used as a function parameter}}
+  image1d_t ai[] = {i, i}; // expected-error{{array of '__read_only image1d_t' type is invalid in OpenCL}}
+  i=i; // expected-error{{invalid operands to binary expression ('__read_only image1d_t' and '__read_only image1d_t')}}
+  i+1; // expected-error{{invalid operands to binary expression ('__read_only image1d_t' and 'int')}}
+  &i; // expected-error{{invalid argument type '__read_only image1d_t' to unary expression}}
+  *i; // expected-error{{invalid argument type '__read_only image1d_t' to unary expression}}
+}
+
+image1d_t test3() {} // expected-error{{declaring function return value of type '__read_only image1d_t' is not allowed}}
diff --git a/test/SemaOpenCL/invalid-kernel-attrs.cl b/test/SemaOpenCL/invalid-kernel-attrs.cl
index 4b4fdf79e3d14..cedbb0664675f 100644
--- a/test/SemaOpenCL/invalid-kernel-attrs.cl
+++ b/test/SemaOpenCL/invalid-kernel-attrs.cl
@@ -28,8 +28,6 @@ constant int foo3 __attribute__((vec_type_hint(char))) = 0; // expected-error {{
 
 void f_kernel_image2d_t( kernel image2d_t image ) { // expected-error {{'kernel' attribute only applies to functions}}
   int __kernel x; // expected-error {{'__kernel' attribute only applies to functions}}
-  read_only int i; // expected-error {{'read_only' attribute only applies to parameters}}
-  __write_only int j; // expected-error {{'__write_only' attribute only applies to parameters}}
 }
 
 kernel __attribute__((reqd_work_group_size(1,2,0))) void kernel11(){} // expected-error {{'reqd_work_group_size' attribute must be greater than 0}}
diff --git a/test/SemaOpenCL/invalid-kernel-parameters.cl b/test/SemaOpenCL/invalid-kernel-parameters.cl
index de32eae8821e6..e2e48e83c6b9f 100644
--- a/test/SemaOpenCL/invalid-kernel-parameters.cl
+++ b/test/SemaOpenCL/invalid-kernel-parameters.cl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fsyntax-only -verify %s -triple spir-unknown-unknown
 
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 
@@ -24,7 +24,10 @@ kernel void bool_in_struct_arg(ContainsBool x) { } // expected-error{{'ContainsB
 
 typedef struct FooImage2D // expected-note{{within field of type 'FooImage2D' declared here}}
 {
-  image2d_t imageField; // expected-note{{field of illegal type 'image2d_t' declared here}}
+  // TODO: Clean up needed - we don't really need to check for image, event, etc
+  // as a note here any longer.
+  // They are diagnosed as an error for all struct fields (OpenCL v1.2 s6.9b,r).
+  image2d_t imageField; // expected-note{{field of illegal type '__read_only image2d_t' declared here}} expected-error{{the '__read_only image2d_t' type cannot be used to declare a structure or union field}}
 } FooImage2D;
 
 kernel void image_in_struct_arg(FooImage2D arg) { } // expected-error{{struct kernel parameters may not contain pointers}}
diff --git a/test/SemaOpenCL/invalid-logical-ops-1.2.cl b/test/SemaOpenCL/invalid-logical-ops-1.2.cl
index 7ba1adbf53e0e..bee52396cc6c4 100644
--- a/test/SemaOpenCL/invalid-logical-ops-1.2.cl
+++ b/test/SemaOpenCL/invalid-logical-ops-1.2.cl
@@ -1,6 +1,7 @@
 // RUN: %clang_cc1 %s -verify -cl-std=CL1.2 -triple x86_64-unknown-linux-gnu
 
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
 typedef __attribute__((ext_vector_type(4))) float float4;
 typedef __attribute__((ext_vector_type(4))) double double4;
 typedef __attribute__((ext_vector_type(4))) int int4;
diff --git a/test/SemaOpenCL/invalid-pipe-builtin-cl2.0.cl b/test/SemaOpenCL/invalid-pipe-builtin-cl2.0.cl
new file mode 100644
index 0000000000000..386c6b6c74561
--- /dev/null
+++ b/test/SemaOpenCL/invalid-pipe-builtin-cl2.0.cl
@@ -0,0 +1,65 @@
+// RUN: %clang_cc1 %s -verify -pedantic -fsyntax-only -cl-std=CL2.0
+
+void test1(read_only pipe int p, global int* ptr){
+  int tmp;
+  reserve_id_t rid;
+
+  // read/write_pipe
+  read_pipe(p, &tmp);
+  read_pipe(p, ptr);
+  read_pipe(tmp, p);    // expected-error {{first argument to 'read_pipe' must be a pipe type}}
+  read_pipe(p);   // expected-error {{invalid number of arguments to function: 'read_pipe'}}
+  read_pipe(p, rid, tmp, ptr);
+  read_pipe(p, tmp, tmp, ptr);   // expected-error {{invalid argument type to function 'read_pipe' (expecting 'reserve_id_t' having 'int')}}
+  read_pipe(p, rid, rid, ptr);   // expected-error {{invalid argument type to function 'read_pipe' (expecting 'unsigned int' having 'reserve_id_t')}}
+  read_pipe(p, tmp);   // expected-error {{invalid argument type to function 'read_pipe' (expecting 'int *' having 'int')}}
+  write_pipe(p, ptr);    // expected-error {{invalid pipe access modifier (expecting write_only)}}
+  write_pipe(p, rid, tmp, ptr);    // expected-error {{invalid pipe access modifier (expecting write_only)}}
+
+  // reserve_read/write_pipe
+  reserve_read_pipe(p, tmp);
+  reserve_read_pipe(p, ptr);    // expected-error{{invalid argument type to function 'reserve_read_pipe' (expecting 'unsigned int' having '__global int *')}}
+  work_group_reserve_read_pipe(tmp, tmp);    // expected-error{{first argument to 'work_group_reserve_read_pipe' must be a pipe type}}
+  sub_group_reserve_write_pipe(p, tmp);    // expected-error{{invalid pipe access modifier (expecting write_only)}}
+
+  // commit_read/write_pipe
+  commit_read_pipe(p, rid);
+  commit_read_pipe(tmp, rid);    // expected-error{{first argument to 'commit_read_pipe' must be a pipe type}}
+  work_group_commit_read_pipe(p, tmp);    // expected-error{{invalid argument type to function 'work_group_commit_read_pipe' (expecting 'reserve_id_t' having 'int')}}
+  sub_group_commit_write_pipe(p, tmp);    // expected-error{{invalid pipe access modifier (expecting write_only)}}
+}
+
+void test2(write_only pipe int p, global int* ptr){
+  int tmp;
+  reserve_id_t rid;
+
+  // read/write_pipe
+  write_pipe(p, &tmp);
+  write_pipe(p, ptr);
+  write_pipe(tmp, p);    // expected-error {{first argument to 'write_pipe' must be a pipe type}}
+  write_pipe(p);   // expected-error {{invalid number of arguments to function: 'write_pipe'}}
+  write_pipe(p, rid, tmp, ptr);
+  write_pipe(p, tmp, tmp, ptr);   // expected-error {{invalid argument type to function 'write_pipe' (expecting 'reserve_id_t' having 'int')}}
+  write_pipe(p, rid, rid, ptr);   // expected-error {{invalid argument type to function 'write_pipe' (expecting 'unsigned int' having 'reserve_id_t')}}
+  write_pipe(p, tmp);   // expected-error {{invalid argument type to function 'write_pipe' (expecting 'int *' having 'int')}}
+  read_pipe(p, ptr);    // expected-error {{invalid pipe access modifier (expecting read_only)}}
+  read_pipe(p, rid, tmp, ptr);    // expected-error {{invalid pipe access modifier (expecting read_only)}}
+
+  // reserve_read/write_pipe
+  reserve_write_pipe(p, tmp);
+  reserve_write_pipe(p, ptr);    // expected-error{{invalid argument type to function 'reserve_write_pipe' (expecting 'unsigned int' having '__global int *')}}
+  work_group_reserve_write_pipe(tmp, tmp);    // expected-error{{first argument to 'work_group_reserve_write_pipe' must be a pipe type}}
+  sub_group_reserve_read_pipe(p, tmp);    // expected-error{{invalid pipe access modifier (expecting read_only)}}
+
+  // commit_read/write_pipe
+  commit_write_pipe(p, rid);
+  commit_write_pipe(tmp, rid);    // expected-error{{first argument to 'commit_write_pipe' must be a pipe type}}
+  work_group_commit_write_pipe(p, tmp);    // expected-error{{invalid argument type to function 'work_group_commit_write_pipe' (expecting 'reserve_id_t' having 'int')}}
+  sub_group_commit_read_pipe(p, tmp);    // expected-error{{invalid pipe access modifier (expecting read_only)}}
+}
+
+void test3(){
+  int tmp;
+  get_pipe_num_packets(tmp);    // expected-error {{first argument to 'get_pipe_num_packets' must be a pipe type}}
+  get_pipe_max_packets(tmp);    // expected-error {{first argument to 'get_pipe_max_packets' must be a pipe type}}
+}
diff --git a/test/SemaOpenCL/invalid-pipes-cl2.0.cl b/test/SemaOpenCL/invalid-pipes-cl2.0.cl
index ee36892b93d41..1993df5c44d41 100644
--- a/test/SemaOpenCL/invalid-pipes-cl2.0.cl
+++ b/test/SemaOpenCL/invalid-pipes-cl2.0.cl
@@ -1,8 +1,22 @@
 // RUN: %clang_cc1 %s -verify -pedantic -fsyntax-only -cl-std=CL2.0
 
-void test1(pipe int *p){// expected-error {{pipes packet types cannot be of reference type}}
+void test1(pipe int *p) {// expected-error {{pipes packet types cannot be of reference type}}
 }
-void test2(pipe p){// expected-error {{missing actual type specifier for pipe}}
+void test2(pipe p) {// expected-error {{missing actual type specifier for pipe}}
 }
-void test3(int pipe p){// expected-error {{cannot combine with previous 'int' declaration specifier}}
+void test3(int pipe p) {// expected-error {{cannot combine with previous 'int' declaration specifier}}
 }
+void test4() {
+  pipe int p; // expected-error {{type 'pipe int' can only be used as a function parameter}}
+  //TODO: fix parsing of this pipe int (*p);
+}
+
+void test5(pipe int p) {
+  p+p; // expected-error{{invalid operands to binary expression ('pipe int' and 'pipe int')}}
+  p=p; // expected-error{{invalid operands to binary expression ('pipe int' and 'pipe int')}}
+  &p; // expected-error{{invalid argument type 'pipe int' to unary expression}}
+  *p; // expected-error{{invalid argument type 'pipe int' to unary expression}}
+}
+
+typedef pipe int pipe_int_t;
+pipe_int_t test6() {} // expected-error{{declaring function return value of type 'pipe_int_t' (aka 'pipe int') is not allowed}}
diff --git a/test/SemaOpenCL/nosvm.cl b/test/SemaOpenCL/nosvm.cl
new file mode 100644
index 0000000000000..658cb3aaf4d1e
--- /dev/null
+++ b/test/SemaOpenCL/nosvm.cl
@@ -0,0 +1,17 @@
+// RUN: %clang_cc1 -verify %s
+// RUN: %clang_cc1 -verify -cl-std=CL2.0 -D CL20 %s
+// RUN: %clang_cc1 -verify -x c -D NOCL %s
+
+#ifndef NOCL
+kernel void f(__attribute__((nosvm)) global int* a);
+#ifndef CL20
+// expected-error@-2 {{'nosvm' attribute requires OpenCL version 2.0}}
+#else
+// expected-warning@-4 {{'nosvm' attribute is deprecated and ignored in OpenCL version 2.0}}
+#endif
+
+__attribute__((nosvm)) void g(); // expected-warning {{'nosvm' attribute only applies to variables}}
+
+#else
+void f(__attribute__((nosvm)) int* a); // expected-warning {{'nosvm' attribute ignored}}
+#endif
diff --git a/test/SemaOpenCL/optional-core-fp64-cl1.2.cl b/test/SemaOpenCL/optional-core-fp64-cl1.2.cl
deleted file mode 100644
index e0f7f1db4ff1c..0000000000000
--- a/test/SemaOpenCL/optional-core-fp64-cl1.2.cl
+++ /dev/null
@@ -1,20 +0,0 @@
-// RUN: %clang_cc1 %s -verify -pedantic -fsyntax-only -cl-std=CL1.2
-// expected-no-diagnostics
-
-void f1(double da) {
-  double d;
-  (void) 1.0;
-}
-
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-
-void f2(void) {
-  double d;
-  (void) 1.0;
-}
-
-#pragma OPENCL EXTENSION cl_khr_fp64 : disable
-
-void f3(void) {
-  double d;
-}
diff --git a/test/SemaOpenCL/optional-core-fp64-cl2.0.cl b/test/SemaOpenCL/optional-core-fp64-cl2.0.cl
deleted file mode 100644
index 832529d4adf7c..0000000000000
--- a/test/SemaOpenCL/optional-core-fp64-cl2.0.cl
+++ /dev/null
@@ -1,20 +0,0 @@
-// RUN: %clang_cc1 %s -verify -pedantic -fsyntax-only -cl-std=CL2.0
-// expected-no-diagnostics
-
-void f1(double da) {
-  double d;
-  (void) 1.0;
-}
-
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-
-void f2(void) {
-  double d;
-  (void) 1.0;
-}
-
-#pragma OPENCL EXTENSION cl_khr_fp64 : disable
-
-void f3(void) {
-  double d;
-}
diff --git a/test/SemaOpenCL/sampler_t.cl b/test/SemaOpenCL/sampler_t.cl
index 96f6dbf086b74..0553db8fd662b 100644
--- a/test/SemaOpenCL/sampler_t.cl
+++ b/test/SemaOpenCL/sampler_t.cl
@@ -2,12 +2,30 @@
 
 constant sampler_t glb_smp = 5;
 
-void foo(sampler_t); 
+void foo(sampler_t);
+
+constant struct sampler_s {
+  sampler_t smp; // expected-error{{the 'sampler_t' type cannot be used to declare a structure or union field}}
+} sampler_str = {0};
 
 void kernel ker(sampler_t argsmp) {
-  local sampler_t smp; // expected-error {{sampler type cannot be used with the __local and __global address space qualifiers}}
+  local sampler_t smp; // expected-error{{sampler type cannot be used with the __local and __global address space qualifiers}}
   const sampler_t const_smp = 7;
   foo(glb_smp);
   foo(const_smp);
-  foo(5); // expected-error {{sampler_t variable required - got 'int'}}
+  foo(5); // expected-error{{sampler_t variable required - got 'int'}}
+  sampler_t sa[] = {argsmp, const_smp}; // expected-error {{array of 'sampler_t' type is invalid in OpenCL}}
 }
+
+void bad(sampler_t*); // expected-error{{pointer to type 'sampler_t' is invalid in OpenCL}}
+
+void bar() {
+  sampler_t smp1 = 7;
+  sampler_t smp2 = 2;
+  smp1=smp2; //expected-error{{invalid operands to binary expression ('sampler_t' and 'sampler_t')}}
+  smp1+1; //expected-error{{invalid operands to binary expression ('sampler_t' and 'int')}}
+  &smp1; //expected-error{{invalid argument type 'sampler_t' to unary expression}}
+  *smp2; //expected-error{{invalid argument type 'sampler_t' to unary expression}}
+}
+
+sampler_t bad(); //expected-error{{declaring function return value of type 'sampler_t' is not allowed}}
diff --git a/test/SemaOpenCL/storageclass-cl20.cl b/test/SemaOpenCL/storageclass-cl20.cl
index c8e7faa208f1c..1eba64b44c8a0 100644
--- a/test/SemaOpenCL/storageclass-cl20.cl
+++ b/test/SemaOpenCL/storageclass-cl20.cl
@@ -1,15 +1,19 @@
-// RUN: %clang_cc1 %s -verify -pedantic -fsyntax-only -DCL20 -cl-std=CL2.0
+// RUN: %clang_cc1 %s -verify -pedantic -fsyntax-only -cl-std=CL2.0
 
 static constant int G1 = 0;
 int G2 = 0;
 global int G3 = 0;
-local int G4 = 0;// expected-error{{program scope variable must reside in global or constant address space}}
+local int G4 = 0;              // expected-error{{program scope variable must reside in global or constant address space}}
 
 void kernel foo() {
   static int S1 = 5;
   static global int S2 = 5;
-  static private int S3 = 5;// expected-error{{program scope variable must reside in global or constant address space}}
+  static private int S3 = 5;   // expected-error{{static local variable must reside in global or constant address space}}
 
   constant int L1 = 0;
   local int L2;
+  global int L3; // expected-error{{function scope variable cannot be declared in global address space}}
+
+  extern global int G5;
+  extern int G6; // expected-error{{extern variable must reside in global or constant address space}}
 }
diff --git a/test/SemaOpenCL/storageclass.cl b/test/SemaOpenCL/storageclass.cl
index c7d8ab984687c..a93f8244dcbd1 100644
--- a/test/SemaOpenCL/storageclass.cl
+++ b/test/SemaOpenCL/storageclass.cl
@@ -13,7 +13,8 @@ void kernel foo() {
   constant int L1 = 0;
   local int L2;
 
-  auto int L3 = 7; // expected-error{{OpenCL does not support the 'auto' storage class specifier}}
+  auto int L3 = 7; // expected-error{{OpenCL version 1.2 does not support the 'auto' storage class specifier}}
+  global int L4;   // expected-error{{function scope variable cannot be declared in global address space}}
 }
 
 static void kernel bar() { // expected-error{{kernel functions cannot be declared static}}
@@ -26,4 +27,6 @@ void f() {
     constant int L1 = 0; // expected-error{{non-kernel function variable cannot be declared in constant address space}}
     local int L2;        // expected-error{{non-kernel function variable cannot be declared in local address space}}
   }
+  global int L3; // expected-error{{function scope variable cannot be declared in global address space}}
+  extern constant float L4;
 }
diff --git a/test/SemaOpenCL/to_addr_builtin.cl b/test/SemaOpenCL/to_addr_builtin.cl
new file mode 100644
index 0000000000000..a145626cfcee4
--- /dev/null
+++ b/test/SemaOpenCL/to_addr_builtin.cl
@@ -0,0 +1,55 @@
+// RUN: %clang_cc1 -verify -fsyntax-only %s
+// RUN: %clang_cc1 -verify -fsyntax-only -cl-std=CL2.0 %s
+
+void test(void) {
+  global int *glob;
+  local int *loc;
+  constant int *con;
+  typedef constant int const_int_ty;
+  const_int_ty *con_typedef;
+
+  glob = to_global(glob, loc);
+#if __OPENCL_C_VERSION__ < CL_VERSION_2_0
+  // expected-warning@-2{{implicit declaration of function 'to_global' is invalid in C99}}
+  // expected-warning@-3{{incompatible integer to pointer conversion assigning to '__global int *' from 'int'}}
+#else
+  // expected-error@-5{{invalid number of arguments to function: 'to_global'}}
+#endif
+
+  int x;
+  glob = to_global(x);
+#if __OPENCL_C_VERSION__ < CL_VERSION_2_0
+  // expected-warning@-2{{incompatible integer to pointer conversion assigning to '__global int *' from 'int'}}
+#else
+  // expected-error@-4{{invalid argument x to function: 'to_global', expecting a generic pointer argument}}
+#endif
+
+  glob = to_global(con);
+#if __OPENCL_C_VERSION__ < CL_VERSION_2_0
+  // expected-warning@-2{{incompatible integer to pointer conversion assigning to '__global int *' from 'int'}}
+#else
+  // expected-error@-4{{invalid argument con to function: 'to_global', expecting a generic pointer argument}}
+#endif
+
+  glob = to_global(con_typedef);
+#if __OPENCL_C_VERSION__ < CL_VERSION_2_0
+  // expected-warning@-2{{incompatible integer to pointer conversion assigning to '__global int *' from 'int'}}
+#else
+  // expected-error@-4{{invalid argument con_typedef to function: 'to_global', expecting a generic pointer argument}}
+#endif
+
+  loc = to_global(glob);
+#if __OPENCL_C_VERSION__ < CL_VERSION_2_0
+  // expected-warning@-2{{incompatible integer to pointer conversion assigning to '__local int *' from 'int'}}
+#else
+  // expected-error@-4{{assigning '__global int *' to '__local int *' changes address space of pointer}}
+#endif
+
+  global char *glob_c = to_global(loc);
+#if __OPENCL_C_VERSION__ < CL_VERSION_2_0
+  // expected-warning@-2{{incompatible integer to pointer conversion initializing '__global char *' with an expression of type 'int'}}
+#else
+  // expected-warning@-4{{incompatible pointer types initializing '__global char *' with an expression of type '__global int *'}}
+#endif
+
+}
diff --git a/test/SemaOpenCL/unroll-hint.cl b/test/SemaOpenCL/unroll-hint.cl
new file mode 100644
index 0000000000000..996986603695a
--- /dev/null
+++ b/test/SemaOpenCL/unroll-hint.cl
@@ -0,0 +1,30 @@
+//RUN: %clang_cc1 -O0 -fsyntax-only -verify %s
+//RUN: %clang_cc1 -O0 -cl-std=CL2.0 -fsyntax-only -verify -DCL20 %s
+
+kernel void D (global int *x) {
+  int i = 10;
+#ifndef CL20
+  // expected-error@+2 {{'opencl_unroll_hint' attribute requires OpenCL version 2.0 or above}}
+#endif
+  __attribute__((opencl_unroll_hint))
+  do {
+  } while(i--);
+}
+
+#ifdef CL20
+kernel void C (global int *x) {
+  int I = 3;
+  __attribute__((opencl_unroll_hint(I))) // expected-error {{'opencl_unroll_hint' attribute requires an integer constant}}
+  while (I--);
+}
+
+kernel void E() {
+  __attribute__((opencl_unroll_hint(2,4))) // expected-error {{'opencl_unroll_hint' attribute takes no more than 1 argument}}
+  for(int i=0; i<100; i++);
+}
+
+kernel void F() {
+  __attribute__((opencl_unroll_hint(-1))) // expected-error {{'opencl_unroll_hint' attribute requires a positive integral compile time constant expression}}
+  for(int i=0; i<100; i++);
+}
+#endif
diff --git a/test/SemaOpenCL/unsupported.cl b/test/SemaOpenCL/unsupported.cl
index bb9da4b272d72..a39a61b9542a0 100644
--- a/test/SemaOpenCL/unsupported.cl
+++ b/test/SemaOpenCL/unsupported.cl
@@ -7,3 +7,7 @@ struct {
 void no_vla(int n) {
   int a[n]; // expected-error {{variable length arrays are not supported in OpenCL}}
 }
+
+void no_logxor(int n) {
+  int logxor = n ^^ n; // expected-error {{^^ is a reserved operator in OpenCL}}
+}
diff --git a/test/SemaTemplate/alias-templates.cpp b/test/SemaTemplate/alias-templates.cpp
index 1849ff64026b7..b7078353ff1b3 100644
--- a/test/SemaTemplate/alias-templates.cpp
+++ b/test/SemaTemplate/alias-templates.cpp
@@ -221,3 +221,9 @@ namespace PR14858 {
   template<typename ...T, typename ...U> void h(X<T...> &) {}
   template<typename ...T, typename ...U> void h(X<U...> &) {} // ok, different
 }
+
+namespace redecl {
+  template<typename> using A = int;
+  template<typename = void> using A = int;
+  A<> a; // ok
+}
diff --git a/test/SemaTemplate/class-template-spec.cpp b/test/SemaTemplate/class-template-spec.cpp
index 0292c1b8ff20f..86cace19dbfbc 100644
--- a/test/SemaTemplate/class-template-spec.cpp
+++ b/test/SemaTemplate/class-template-spec.cpp
@@ -1,4 +1,6 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++98 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
 template<typename T, typename U = int> struct A; // expected-note {{template is declared here}} \
                                                  // expected-note{{explicitly specialized}}
 
@@ -75,7 +77,10 @@ struct A<double> { }; // expected-error{{template specialization requires 'templ
 template<> struct ::A<double>;
 
 namespace N {
-  template<typename T> struct B; // expected-note 2{{explicitly specialized}}
+  template<typename T> struct B; // expected-note {{explicitly specialized}}
+#if __cplusplus <= 199711L
+  // expected-note@-2 {{explicitly specialized}}
+#endif
 
   template<> struct ::N::B<char>; // okay
   template<> struct ::N::B<short>; // okay
@@ -86,7 +91,11 @@ namespace N {
 
 template<> struct N::B<int> { }; // okay
 
-template<> struct N::B<float> { }; // expected-warning{{C++11 extension}}
+template<> struct N::B<float> { };
+#if __cplusplus <= 199711L
+// expected-warning@-2 {{first declaration of class template specialization of 'B' outside namespace 'N' is a C++11 extension}}
+#endif
+
 
 namespace M {
   template<> struct ::N::B<short> { }; // expected-error{{class template specialization of 'B' not in a namespace enclosing 'N'}}
@@ -142,13 +151,26 @@ namespace PR18009 {
 }
 
 namespace PR16519 {
-  template<typename T, T...N> struct integer_sequence { typedef T value_type; }; // expected-warning {{extension}}
+  template<typename T, T...N> struct integer_sequence { typedef T value_type; };
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{variadic templates are a C++11 extension}}
+#endif
 
   template<typename T> struct __make_integer_sequence;
-  template<typename T, T N> using make_integer_sequence = typename __make_integer_sequence<T>::template make<N, N % 2>::type; // expected-warning {{extension}}
-
-  template<typename T, typename T::value_type ...Extra> struct __make_integer_sequence_impl; // expected-warning {{extension}}
-  template<typename T, T ...N, T ...Extra> struct __make_integer_sequence_impl<integer_sequence<T, N...>, Extra...> { // expected-warning 2{{extension}}
+  template<typename T, T N> using make_integer_sequence = typename __make_integer_sequence<T>::template make<N, N % 2>::type;
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{alias declarations are a C++11 extension}}
+#endif
+
+  template<typename T, typename T::value_type ...Extra> struct __make_integer_sequence_impl;
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{variadic templates are a C++11 extension}}
+#endif
+
+  template<typename T, T ...N, T ...Extra> struct __make_integer_sequence_impl<integer_sequence<T, N...>, Extra...> {
+#if __cplusplus <= 199711L
+  // expected-warning@-2 2 {{variadic templates are a C++11 extension}}
+#endif
     typedef integer_sequence<T, N..., sizeof...(N) + N..., Extra...> type;
   };
 
@@ -160,8 +182,15 @@ namespace PR16519 {
     template<T N, typename Dummy> struct make<N, 1, Dummy> : __make_integer_sequence_impl<make_integer_sequence<T, N/2>, N - 1> {};
   };
 
-  using X = make_integer_sequence<int, 5>; // expected-warning {{extension}}
-  using X = integer_sequence<int, 0, 1, 2, 3, 4>; // expected-warning {{extension}}
+  using X = make_integer_sequence<int, 5>;
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{alias declarations are a C++11 extension}}
+#endif
+
+  using X = integer_sequence<int, 0, 1, 2, 3, 4>;
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{alias declarations are a C++11 extension}}
+#endif
 }
 
 namespace DefaultArgVsPartialSpec {
diff --git a/test/SemaTemplate/cxx1z-fold-expressions.cpp b/test/SemaTemplate/cxx1z-fold-expressions.cpp
index 8bb79113fa9d2..aefee92f648a6 100644
--- a/test/SemaTemplate/cxx1z-fold-expressions.cpp
+++ b/test/SemaTemplate/cxx1z-fold-expressions.cpp
@@ -25,10 +25,6 @@ constexpr bool check() {
 static_assert(check());
 
 template<int ...N> void empty() {
-  static_assert((N + ...) == 0);
-  static_assert((N * ...) == 1);
-  static_assert((N | ...) == 0);
-  static_assert((N & ...) == -1);
   static_assert((N || ...) == false);
   static_assert((N && ...) == true);
   (N, ...);
@@ -36,14 +32,19 @@ template<int ...N> void empty() {
 template void empty<>();
 
 // An empty fold-expression isn't a null pointer just because it's an integer
-// with value 0.
+// with value 0. (This is no longer an issue since empty pack expansions don't
+// produce integers any more.)
 template<int ...N> void null_ptr() {
-  void *p = (N + ...); // expected-error {{rvalue of type 'int'}}
-  void *q = (N | ...); // expected-error {{rvalue of type 'int'}}
+  void *p = (N || ...); // expected-error {{rvalue of type 'bool'}}
+  void *q = (N , ...); // expected-error {{rvalue of type 'void'}}
 }
 template void null_ptr<>(); // expected-note {{in instantiation of}}
 
 template<int ...N> void bad_empty() {
+  (N + ...); // expected-error {{empty expansion for operator '+' with no fallback}}
+  (N * ...); // expected-error {{empty expansion for operator '*' with no fallback}}
+  (N | ...); // expected-error {{empty expansion for operator '|' with no fallback}}
+  (N & ...); // expected-error {{empty expansion for operator '&' with no fallback}}
   (N - ...); // expected-error {{empty expansion for operator '-' with no fallback}}
   (N / ...); // expected-error {{empty expansion for operator '/' with no fallback}}
   (N % ...); // expected-error {{empty expansion for operator '%' with no fallback}}
diff --git a/test/SemaTemplate/deduction.cpp b/test/SemaTemplate/deduction.cpp
index 6826774a00203..d024c3147735a 100644
--- a/test/SemaTemplate/deduction.cpp
+++ b/test/SemaTemplate/deduction.cpp
@@ -218,3 +218,50 @@ namespace NonDeducedNestedNameSpecifier {
   template<typename T> int f(A<T>, typename A<T>::template B<T>);
   int k = f(A<int>(), 0);
 }
+
+namespace PR27601_RecursivelyInheritedBaseSpecializationsDeductionAmbiguity {
+namespace ns1 {
+
+template<class...> struct B { };
+template<class H, class ... Ts> struct B<H, Ts...> : B<> { };
+template<class ... Ts> struct D : B<Ts...> { };
+
+template<class T, class ... Ts> void f(B<T, Ts...> &) { }
+
+int main() {
+  D<int, char> d;
+  f<int>(d);
+}
+} //end ns1
+
+namespace ns2 {
+
+template <int i, typename... Es> struct tup_impl;
+
+template <int i> struct tup_impl<i> {}; // empty tail
+
+template <int i, typename Head, typename... Tail>
+struct tup_impl<i, Head, Tail...> : tup_impl<i + 1, Tail...> {
+  using value_type = Head;
+  Head head;
+};
+
+template <typename... Es> struct tup : tup_impl<0, Es...> {};
+
+template <typename Head, int i, typename... Tail>
+Head &get_helper(tup_impl<i, Head, Tail...> &t) {
+  return t.head;
+}
+
+template <typename Head, int i, typename... Tail>
+Head const &get_helper(tup_impl<i, Head, Tail...> const &t) {
+  return t.head;
+}
+
+int main() {
+  tup<int, double, char> t;
+  get_helper<double>(t);
+  return 0;
+}
+} // end ns2 
+}
+\ No newline at end of file
diff --git a/test/SemaTemplate/default-arguments-cxx0x.cpp b/test/SemaTemplate/default-arguments-cxx0x.cpp
index 0c97c2056b757..c52899a8e6d15 100644
--- a/test/SemaTemplate/default-arguments-cxx0x.cpp
+++ b/test/SemaTemplate/default-arguments-cxx0x.cpp
@@ -75,3 +75,13 @@ namespace rdar23810407 {
     g<int>();
   }
 }
+
+// rdar://problem/24480205
+namespace PR13986 {
+  constexpr unsigned Dynamic = 0;
+  template <unsigned> class A { template <unsigned = Dynamic> void m_fn1(); };
+  class Test {
+    ~Test() {}
+    A<1> m_target;
+  };
+}
diff --git a/test/SemaTemplate/extern-templates.cpp b/test/SemaTemplate/extern-templates.cpp
index eca64ed595ebf..5eb9c9db127c8 100644
--- a/test/SemaTemplate/extern-templates.cpp
+++ b/test/SemaTemplate/extern-templates.cpp
@@ -1,4 +1,5 @@
-// RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -triple i686-pc-win32 -fsyntax-only -verify %s -DMS
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu-pc-win32 -fsyntax-only -verify %s
 
 template<typename T>
 class X0 {
@@ -21,12 +22,20 @@ extern template class X0<int*>;
 
 template<typename T>
 void X0<T>::Inner::g(T t) {
-  t = 17; // expected-error{{incompatible}}
+#ifdef MS
+  t = 17; // expected-error{{assigning to 'long *' from incompatible}} expected-error{{assigning to 'int *' from incompatible}}
+#else
+  t = 17; // expected-error{{assigning to 'long *' from incompatible}}
+#endif
 }
 
 void test_intptr(X0<int*> xi, X0<int*>::Inner xii) {
   xi.f(0);
+#ifdef MS
+  xii.g(0); // expected-note {{instantiation}}
+#else
   xii.g(0);
+#endif
 }
 
 extern template class X0<long*>; 
diff --git a/test/SemaTemplate/instantiate-cast.cpp b/test/SemaTemplate/instantiate-cast.cpp
index b3babf12981e4..32a1cfdfec9db 100644
--- a/test/SemaTemplate/instantiate-cast.cpp
+++ b/test/SemaTemplate/instantiate-cast.cpp
@@ -1,6 +1,13 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s
-
-struct A { int x; }; // expected-note 2 {{candidate constructor}}
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++98 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
+
+struct A { int x; };
+// expected-note@-1 {{candidate constructor (the implicit copy constructor) not viable: no known conversion from 'int' to 'const A' for 1st argument}}
+#if __cplusplus >= 201103L
+// expected-note@-3 {{candidate constructor (the implicit move constructor) not viable: no known conversion from 'int' to 'A' for 1st argument}}
+#endif
+// expected-note@-5 {{candidate constructor (the implicit default constructor) not viable: requires 0 arguments, but 1 was provided}}
 
 class Base { 
 public:
diff --git a/test/SemaTemplate/instantiate-expr-4.cpp b/test/SemaTemplate/instantiate-expr-4.cpp
index d95ccfecd9b58..9a1a1d2bb6970 100644
--- a/test/SemaTemplate/instantiate-expr-4.cpp
+++ b/test/SemaTemplate/instantiate-expr-4.cpp
@@ -1,4 +1,6 @@
 // RUN: %clang_cc1 -fcxx-exceptions -fexceptions -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fcxx-exceptions -fexceptions -fsyntax-only -verify -std=c++98 %s
+// RUN: %clang_cc1 -fcxx-exceptions -fexceptions -fsyntax-only -verify -std=c++11 %s
 
 // ---------------------------------------------------------------------
 // C++ Functional Casts
@@ -22,6 +24,9 @@ struct FunctionalCast0 {
 template struct FunctionalCast0<5>;
 
 struct X { // expected-note 3 {{candidate constructor (the implicit copy constructor)}}
+#if __cplusplus >= 201103L
+// expected-note@-2 3 {{candidate constructor (the implicit move constructor) not viable}}
+#endif
   X(int, int); // expected-note 3 {{candidate constructor}}
 };
 
@@ -213,6 +218,10 @@ template<typename T, typename Val1>
 struct InitList1 {
   void f(Val1 val1) { 
     T x = { val1 };
+#if __cplusplus >= 201103L
+    // expected-error@-2 {{type 'float' cannot be narrowed to 'int' in initializer list}}
+    // expected-note@-3 {{insert an explicit cast to silence this issue}}
+#endif
   }
 };
 
@@ -222,6 +231,9 @@ struct APair {
 };
 
 template struct InitList1<int[1], float>;
+#if __cplusplus >= 201103L
+// expected-note@-2 {{instantiation of member function}}
+#endif
 template struct InitList1<APair, int*>;
 
 template<typename T, typename Val1, typename Val2>
diff --git a/test/SemaTemplate/instantiate-member-class.cpp b/test/SemaTemplate/instantiate-member-class.cpp
index 3f49606b86e55..159bccb2c9929 100644
--- a/test/SemaTemplate/instantiate-member-class.cpp
+++ b/test/SemaTemplate/instantiate-member-class.cpp
@@ -1,4 +1,6 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++98 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
 
 namespace PR8965 {
   template<typename T>
@@ -106,7 +108,10 @@ namespace test2 {
 namespace AliasTagDef {
   template<typename T>
   struct F {
-    using S = struct U { // expected-warning {{C++11}}
+    using S = struct U {
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{alias declarations are a C++11 extension}}
+#endif
       T g() {
         return T();
       }
@@ -122,8 +127,13 @@ namespace rdar10397846 {
   {
     struct B
     {
-      struct C { C() { int *ptr = I; } }; // expected-error{{cannot initialize a variable of type 'int *' with an rvalue of type 'int'}} \
-                                             expected-warning{{expression which evaluates to zero treated as a null pointer constant of type 'int *'}}
+      struct C { C() { int *ptr = I; } };
+#if __cplusplus >= 201103L
+      // expected-error@-2 {{cannot initialize a variable of type 'int *' with an rvalue of type 'int'}}
+#else
+      // expected-warning@-4 {{expression which evaluates to zero treated as a null pointer constant of type 'int *'}}
+#endif
+      // expected-error@-6 {{cannot initialize a variable of type 'int *' with an rvalue of type 'int'}}
     };
   };
 
diff --git a/test/SemaTemplate/instantiate-sizeof.cpp b/test/SemaTemplate/instantiate-sizeof.cpp
index bf66fdc17c657..660e70549e30f 100644
--- a/test/SemaTemplate/instantiate-sizeof.cpp
+++ b/test/SemaTemplate/instantiate-sizeof.cpp
@@ -1,5 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
-// expected-no-diagnostics
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -fsyntax-only -verify -std=c++11 %s
 
 // Make sure we handle contexts correctly with sizeof
 template<typename T> void f(T n) {
@@ -9,3 +8,29 @@ template<typename T> void f(T n) {
 int main() {
   f<int>(1);
 }
+
+// Make sure we handle references to non-static data members in unevaluated
+// contexts in class template methods correctly. Previously we assumed these
+// would be valid MemberRefExprs, but they have no 'this' so we need to form a
+// DeclRefExpr to the FieldDecl instead.
+// PR26893
+template <class T>
+struct M {
+  M() {}; // expected-note {{in instantiation of default member initializer 'M<S>::m' requested here}}
+  int m = *T::x; // expected-error {{invalid use of non-static data member 'x'}}
+  void f() {
+    // These are valid.
+    static_assert(sizeof(T::x) == 8, "ptr");
+    static_assert(sizeof(*T::x) == 4, "int");
+  }
+};
+struct S { int *x; };
+template struct M<S>; // expected-note {{in instantiation of member function 'M<S>::M' requested here}}
+
+// Similar test case for PR26893.
+template <typename T=void>
+struct bar {
+  struct foo { int array[10]; };
+  int baz() { return sizeof(foo::array); }
+};
+template struct bar<>;
diff --git a/test/SemaTemplate/member-access-expr.cpp b/test/SemaTemplate/member-access-expr.cpp
index f1aa30ec32a6f..8dba2e68d6562 100644
--- a/test/SemaTemplate/member-access-expr.cpp
+++ b/test/SemaTemplate/member-access-expr.cpp
@@ -1,4 +1,7 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++98 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
+
 template<typename T>
 void call_f0(T x) {
   x.Base::f0();
@@ -28,15 +31,25 @@ void test_f0_through_typedef(X0 x0) {
 
 template<typename TheBase, typename T>
 void call_f0_through_typedef2(T x) {
-  typedef TheBase CrazyBase; // expected-note{{current scope}}
-  x.CrazyBase::f0(); // expected-error{{ambiguous}} \
-                     // expected-error 2{{no member named}}
+  typedef TheBase CrazyBase;
+#if __cplusplus <= 199711L
+  // expected-note@-2 {{lookup from the current scope refers here}}
+#endif
+
+  x.CrazyBase::f0(); // expected-error 2{{no member named}}
+#if __cplusplus <= 199711L
+  // expected-error@-2 {{lookup of 'CrazyBase' in member access expression is ambiguous}}
+#endif
+
 }
 
 struct OtherBase { };
 
 struct X1 : Base, OtherBase { 
-  typedef OtherBase CrazyBase; // expected-note{{object type}}
+  typedef OtherBase CrazyBase;
+#if __cplusplus <= 199711L
+  // expected-note@-2 {{lookup in the object type 'X1' refers here}}
+#endif
 };
 
 void test_f0_through_typedef2(X0 x0, X1 x1) {
diff --git a/test/SemaTemplate/ms-delayed-default-template-args.cpp b/test/SemaTemplate/ms-delayed-default-template-args.cpp
index ca9ddb0d9d158..0c05469424758 100644
--- a/test/SemaTemplate/ms-delayed-default-template-args.cpp
+++ b/test/SemaTemplate/ms-delayed-default-template-args.cpp
@@ -55,6 +55,15 @@ struct Foo {
 typedef int Weber;
 }
 
+// MSVC accepts this, but Clang doesn't.
+namespace test_scope_spec {
+template <typename T = ns::Bar>  // expected-error {{use of undeclared identifier 'ns'}}
+struct Foo {
+  static_assert(sizeof(T) == 4, "Bar should have gotten int");
+};
+namespace ns { typedef int Bar; }
+}
+
 #ifdef __clang__
 // These are negative test cases that MSVC doesn't compile either.  Try to use
 // unique undeclared identifiers so typo correction doesn't find types declared
diff --git a/test/SemaTemplate/ms-function-specialization-class-scope.cpp b/test/SemaTemplate/ms-function-specialization-class-scope.cpp
index 5da00837cc09e..3c7111d058389 100644
--- a/test/SemaTemplate/ms-function-specialization-class-scope.cpp
+++ b/test/SemaTemplate/ms-function-specialization-class-scope.cpp
@@ -75,3 +75,12 @@ namespace Duplicates {
   // here.
   template struct A<int>;
 }
+
+namespace PR28082 {
+struct S {
+  template <int>
+  int f(int = 0);
+  template <>
+  int f<0>(int); // expected-warning {{Microsoft extension}}
+};
+}
diff --git a/test/SemaTemplate/ms-lookup-template-base-classes.cpp b/test/SemaTemplate/ms-lookup-template-base-classes.cpp
index 4f3df277d912b..6afc7091260dc 100644
--- a/test/SemaTemplate/ms-lookup-template-base-classes.cpp
+++ b/test/SemaTemplate/ms-lookup-template-base-classes.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -std=c++1y -fms-compatibility -fno-spell-checking -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fcxx-exceptions -fexceptions -std=c++1y -fms-compatibility -fno-spell-checking -fsyntax-only -verify %s
 
 
 template <class T>
@@ -573,3 +573,62 @@ void h();
 template <typename T> decltype(h(T())) check2(); // expected-note{{candidate template ignored: substitution failure [with T = int]: no matching function for call to 'h'}}
 decltype(check2<int>()) y; // expected-error{{no matching function for call to 'check2'}}
 }
+
+// We also allow unqualified lookup into bases in contexts where the we know the
+// undeclared identifier *must* be a type, such as a new expression or catch
+// parameter type.
+template <typename T>
+struct UseUnqualifiedTypeNames : T {
+  void foo() {
+    void *P = new TheType; // expected-warning {{unqualified lookup}} expected-error {{no type}}
+    size_t x = __builtin_offsetof(TheType, f2); // expected-warning {{unqualified lookup}} expected-error {{no type}}
+    try {
+    } catch (TheType) { // expected-warning {{unqualified lookup}} expected-error {{no type}}
+    }
+    enum E : IntegerType { E0 = 42 }; // expected-warning {{unqualified lookup}} expected-error {{no type}}
+    _Atomic(TheType) a; // expected-warning {{unqualified lookup}} expected-error {{no type}}
+  }
+  void out_of_line();
+};
+template <typename T>
+void UseUnqualifiedTypeNames<T>::out_of_line() {
+  void *p = new TheType; // expected-warning {{unqualified lookup}} expected-error {{no type}}
+}
+struct Base {
+  typedef int IntegerType;
+  struct TheType {
+    int f1, f2;
+  };
+};
+template struct UseUnqualifiedTypeNames<Base>;
+struct BadBase { };
+template struct UseUnqualifiedTypeNames<BadBase>; // expected-note-re 2 {{in instantiation {{.*}} requested here}}
+
+namespace partial_template_lookup {
+
+class Bar;
+class Spare;
+
+template <class T, class X = Bar>
+class FooTemplated;
+
+class FooBase {
+public:
+  typedef int BaseTypedef;
+};
+
+// Partial template spec (unused)
+template <class T>
+class FooTemplated<T, Spare> {};
+
+// Partial template spec (used)
+template <class T>
+class FooTemplated<T, Bar> : public FooBase {};
+
+// Full template spec
+template <class T, class X>
+class FooTemplated : public FooTemplated<T, Bar> {
+public:
+  BaseTypedef Member; // expected-warning {{unqualified lookup}}
+};
+}
diff --git a/test/SemaTemplate/recovery-crash.cpp b/test/SemaTemplate/recovery-crash.cpp
index 02f80495bb941..c8e783f47b458 100644
--- a/test/SemaTemplate/recovery-crash.cpp
+++ b/test/SemaTemplate/recovery-crash.cpp
@@ -1,4 +1,6 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++98 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
 
 // Clang used to crash trying to recover while adding 'this->' before Work(x);
 
@@ -25,14 +27,20 @@ namespace PR16134 {
 
 namespace PR16225 {
   template <typename T> void f();
-  template<typename C> void g(C*) {
+  template <typename C> void g(C*) {
     struct LocalStruct : UnknownBase<Mumble, C> { };  // expected-error {{unknown template name 'UnknownBase'}} \
                                                       // expected-error {{use of undeclared identifier 'Mumble'}}
-    f<LocalStruct>();  // expected-warning {{template argument uses local type 'LocalStruct'}}
+    f<LocalStruct>();
+#if __cplusplus <= 199711L
+    // expected-warning@-2 {{template argument uses local type 'LocalStruct'}}
+#endif
   }
   struct S;
   void h() {
-    g<S>(0);  // expected-note {{in instantiation of function template specialization}}
+    g<S>(0);
+#if __cplusplus <= 199711L
+    // expected-note@-2 {{in instantiation of function template specialization}}
+#endif
   }
 }
 
diff --git a/test/SemaTemplate/temp_arg_type.cpp b/test/SemaTemplate/temp_arg_type.cpp
index 637b5637baec5..daad61c14292c 100644
--- a/test/SemaTemplate/temp_arg_type.cpp
+++ b/test/SemaTemplate/temp_arg_type.cpp
@@ -1,4 +1,7 @@
 // RUN: %clang_cc1 -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++98 %s
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s
+
 template<typename T> class A; // expected-note 2 {{template parameter is declared here}} expected-note{{template is declared here}}
 
 // [temp.arg.type]p1
@@ -24,11 +27,21 @@ A<ns::B> a8; // expected-error{{use of class template 'ns::B' requires template
 // [temp.arg.type]p2
 void f() {
   class X { };
-  A<X> * a = 0; // expected-warning{{template argument uses local type 'X'}}
+  A<X> * a = 0;
+#if __cplusplus <= 199711L
+  // expected-warning@-2 {{template argument uses local type 'X'}}
+#endif
 }
 
-struct { int x; } Unnamed; // expected-note{{unnamed type used in template argument was declared here}}
-A<__typeof__(Unnamed)> *a9; // expected-warning{{template argument uses unnamed type}}
+struct { int x; } Unnamed;
+#if __cplusplus <= 199711L
+// expected-note@-2 {{unnamed type used in template argument was declared here}}
+#endif
+
+A<__typeof__(Unnamed)> *a9;
+#if __cplusplus <= 199711L
+// expected-warning@-2 {{template argument uses unnamed type}}
+#endif
 
 template<typename T, unsigned N>
 struct Array {
diff --git a/test/SemaTemplate/template-id-expr.cpp b/test/SemaTemplate/template-id-expr.cpp
index 4416f92723adc..499d289ee6754 100644
--- a/test/SemaTemplate/template-id-expr.cpp
+++ b/test/SemaTemplate/template-id-expr.cpp
@@ -96,3 +96,9 @@ void f5() {
 }
 
 template void f5<0>(); // expected-note {{in instantiation of function template specialization 'f5<0>' requested here}}
+
+class C {};
+template <template <typename> class D>  // expected-note{{previous use is here}}
+class E {
+  template class D<C>;  // expected-error {{elaborated type refers to a template template argument}}
+};
diff --git a/test/SemaTemplate/undefined-template.cpp b/test/SemaTemplate/undefined-template.cpp
new file mode 100644
index 0000000000000..a03d0b7cff626
--- /dev/null
+++ b/test/SemaTemplate/undefined-template.cpp
@@ -0,0 +1,139 @@
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c++14 -Wundefined-func-template %s
+
+template <class T> struct C1 {
+  static char s_var_1;       // expected-note{{forward declaration of template entity is here}}
+  static char s_var_2;       // expected-note{{forward declaration of template entity is here}}
+  static void s_func_1();    // expected-note{{forward declaration of template entity is here}}
+  static void s_func_2();    // expected-note{{forward declaration of template entity is here}}
+  void meth_1();             // expected-note2{{forward declaration of template entity is here}}
+  void meth_2();
+  template <class T1> static char s_tvar_2;      // expected-note{{forward declaration of template entity is here}}
+  template <class T1> static void s_tfunc_2();   // expected-note{{forward declaration of template entity is here}}
+  template<typename T1> struct C2 {
+    static char s_var_2;     // expected-note{{forward declaration of template entity is here}}
+    static void s_func_2();  // expected-note{{forward declaration of template entity is here}}
+    void meth_2();           // expected-note{{forward declaration of template entity is here}}
+    template <class T2> static char s_tvar_2;    // expected-note{{forward declaration of template entity is here}}
+    template <class T2> void tmeth_2();          // expected-note{{forward declaration of template entity is here}}
+  };
+};
+
+extern template char C1<int>::s_var_2;
+extern template void C1<int>::s_func_2();
+extern template void C1<int>::meth_2();
+extern template char C1<int>::s_tvar_2<char>;
+extern template void C1<int>::s_tfunc_2<char>();
+extern template void C1<int>::C2<long>::s_var_2;
+extern template void C1<int>::C2<long>::s_func_2();
+extern template void C1<int>::C2<long>::meth_2();
+extern template char C1<int>::C2<long>::s_tvar_2<char>;
+extern template void C1<int>::C2<long>::tmeth_2<char>();
+
+char func_01() {
+  return C1<int>::s_var_2;
+}
+
+char func_02() {
+  return C1<int>::s_var_1; // expected-warning{{instantiation of variable 'C1<int>::s_var_1' required here, but no definition is available}}
+                           // expected-note@-1{{add an explicit instantiation declaration to suppress this warning if 'C1<int>::s_var_1' is explicitly instantiated in another translation unit}}
+}
+
+char func_03() {
+  return C1<char>::s_var_2; // expected-warning{{instantiation of variable 'C1<char>::s_var_2' required here, but no definition is available}}
+                            // expected-note@-1{{add an explicit instantiation declaration to suppress this warning if 'C1<char>::s_var_2' is explicitly instantiated in another translation unit}}
+}
+
+void func_04() {
+  C1<int>::s_func_1(); // expected-warning{{instantiation of function 'C1<int>::s_func_1' required here, but no definition is available}}
+                       // expected-note@-1{{add an explicit instantiation declaration to suppress this warning if 'C1<int>::s_func_1' is explicitly instantiated in another translation unit}}
+}
+
+void func_05() {
+  C1<int>::s_func_2();
+}
+
+void func_06() {
+  C1<char>::s_func_2(); // expected-warning{{instantiation of function 'C1<char>::s_func_2' required here, but no definition is available}}
+                        // expected-note@-1{{add an explicit instantiation declaration to suppress this warning if 'C1<char>::s_func_2' is explicitly instantiated in another translation unit}}
+}
+
+void func_07(C1<int> *x) {
+  x->meth_1();  // expected-warning{{instantiation of function 'C1<int>::meth_1' required here, but no definition is available}}
+                // expected-note@-1{{add an explicit instantiation declaration to suppress this warning if 'C1<int>::meth_1' is explicitly instantiated in another translation unit}}
+}
+
+void func_08(C1<int> *x) {
+  x->meth_2();
+}
+
+void func_09(C1<char> *x) {
+  x->meth_1();  // expected-warning{{instantiation of function 'C1<char>::meth_1' required here, but no definition is available}}
+                // expected-note@-1{{add an explicit instantiation declaration to suppress this warning if 'C1<char>::meth_1' is explicitly instantiated in another translation unit}}
+}
+
+char func_10() {
+  return C1<int>::s_tvar_2<char>;
+}
+
+char func_11() {
+  return C1<int>::s_tvar_2<long>; // expected-warning{{instantiation of variable 'C1<int>::s_tvar_2<long>' required here, but no definition is available}}
+                                  // expected-note@-1{{add an explicit instantiation declaration to suppress this warning if 'C1<int>::s_tvar_2<long>' is explicitly instantiated in another translation unit}}
+}
+
+void func_12() {
+  C1<int>::s_tfunc_2<char>();
+}
+
+void func_13() {
+  C1<int>::s_tfunc_2<long>(); // expected-warning{{instantiation of function 'C1<int>::s_tfunc_2<long>' required here, but no definition is available}}
+                              // expected-note@-1{{add an explicit instantiation declaration to suppress this warning if 'C1<int>::s_tfunc_2<long>' is explicitly instantiated in another translation unit}}
+}
+
+char func_14() {
+  return C1<int>::C2<long>::s_var_2;
+}
+
+char func_15() {
+  return C1<int>::C2<char>::s_var_2;  //expected-warning {{instantiation of variable 'C1<int>::C2<char>::s_var_2' required here, but no definition is available}}
+                                      // expected-note@-1{{add an explicit instantiation declaration to suppress this warning if 'C1<int>::C2<char>::s_var_2' is explicitly instantiated in another translation unit}}
+}
+
+void func_16() {
+  C1<int>::C2<long>::s_func_2();
+}
+
+void func_17() {
+  C1<int>::C2<char>::s_func_2(); // expected-warning{{instantiation of function 'C1<int>::C2<char>::s_func_2' required here, but no definition is available}}
+                        // expected-note@-1{{add an explicit instantiation declaration to suppress this warning if 'C1<int>::C2<char>::s_func_2' is explicitly instantiated in another translation unit}}
+}
+
+void func_18(C1<int>::C2<long> *x) {
+  x->meth_2();
+}
+
+void func_19(C1<int>::C2<char> *x) {
+  x->meth_2();   // expected-warning{{instantiation of function 'C1<int>::C2<char>::meth_2' required here, but no definition is available}}
+                        // expected-note@-1{{add an explicit instantiation declaration to suppress this warning if 'C1<int>::C2<char>::meth_2' is explicitly instantiated in another translation unit}}
+}
+
+char func_20() {
+  return C1<int>::C2<long>::s_tvar_2<char>;
+}
+
+char func_21() {
+  return C1<int>::C2<long>::s_tvar_2<long>; // expected-warning{{instantiation of variable 'C1<int>::C2<long>::s_tvar_2<long>' required here, but no definition is available}}
+                                  // expected-note@-1{{add an explicit instantiation declaration to suppress this warning if 'C1<int>::C2<long>::s_tvar_2<long>' is explicitly instantiated in another translation unit}}
+}
+
+void func_22(C1<int>::C2<long> *x) {
+  x->tmeth_2<char>();
+}
+
+void func_23(C1<int>::C2<long> *x) {
+  x->tmeth_2<int>();    // expected-warning{{instantiation of function 'C1<int>::C2<long>::tmeth_2<int>' required here, but no definition is available}}
+                        // expected-note@-1{{add an explicit instantiation declaration to suppress this warning if 'C1<int>::C2<long>::tmeth_2<int>' is explicitly instantiated in another translation unit}}
+}
+
+int main() {
+  return 0;
+}
diff --git a/test/Unit/lit.site.cfg.in b/test/Unit/lit.site.cfg.in
index 37e8cb011ebb8..c2f81463f239d 100644
--- a/test/Unit/lit.site.cfg.in
+++ b/test/Unit/lit.site.cfg.in
@@ -1,7 +1,7 @@
+@LIT_SITE_CFG_IN_HEADER@
+
 import sys
 
-## Autogenerated by LLVM/Clang configuration.
-# Do not edit!
 config.llvm_src_root = "@LLVM_SOURCE_DIR@"
 config.llvm_obj_root = "@LLVM_BINARY_DIR@"
 config.llvm_tools_dir = "@LLVM_TOOLS_DIR@"
diff --git a/test/lit.cfg b/test/lit.cfg
index c6026506660c9..e7ce8fac7c1b1 100644
--- a/test/lit.cfg
+++ b/test/lit.cfg
@@ -44,7 +44,7 @@ else:
 config.test_format = lit.formats.ShTest(execute_external)
 
 # suffixes: A list of file extensions to treat as test files.
-config.suffixes = ['.c', '.cpp', '.m', '.mm', '.cu', '.ll', '.cl', '.s', '.S', '.modulemap']
+config.suffixes = ['.c', '.cpp', '.m', '.mm', '.cu', '.ll', '.cl', '.s', '.S', '.modulemap', '.test', '.rs']
 
 # excludes: A list of directories to exclude from the testsuite. The 'Inputs'
 # subdirectories contain auxiliary inputs for various tests in their parent
@@ -397,10 +397,6 @@ if is_filesystem_case_insensitive():
 if os.path.exists("/dev/fd/0") and sys.platform not in ['cygwin']:
     config.available_features.add('dev-fd-fs')
 
-# DW2 Target
-if not re.match(r'.*-win32$', config.target_triple):
-    config.available_features.add('dw2')
-
 # Not set on native MS environment.
 if not re.match(r'.*-win32$', config.target_triple):
     config.available_features.add('non-ms-sdk')
@@ -467,6 +463,11 @@ else:
 if config.enable_backtrace == "1":
     config.available_features.add("backtrace")
 
+if config.have_zlib == "1":
+    config.available_features.add("zlib")
+else:
+    config.available_features.add("nozlib")
+
 # Check if we should run long running tests.
 if lit_config.params.get("run_long_tests", None) == "true":
     config.available_features.add("long_tests")
@@ -491,4 +492,9 @@ gmalloc_path_str = lit_config.params.get('gmalloc_path',
 if use_gmalloc:
      config.environment.update({'DYLD_INSERT_LIBRARIES' : gmalloc_path_str})
 
+# Check if we should allow outputs to console.
+run_console_tests = int(lit_config.params.get('enable_console', '0'))
+if run_console_tests != 0:
+  config.available_features.add('console')
+
 lit.util.usePlatformSdkOnDarwin(config, lit_config)
diff --git a/test/lit.site.cfg.in b/test/lit.site.cfg.in
index 332bcec14874e..f368c995f568c 100644
--- a/test/lit.site.cfg.in
+++ b/test/lit.site.cfg.in
@@ -1,7 +1,7 @@
+@LIT_SITE_CFG_IN_HEADER@
+
 import sys
 
-## Autogenerated by LLVM/Clang configuration.
-# Do not edit!
 config.llvm_src_root = "@LLVM_SOURCE_DIR@"
 config.llvm_obj_root = "@LLVM_BINARY_DIR@"
 config.llvm_tools_dir = "@LLVM_TOOLS_DIR@"
@@ -14,6 +14,7 @@ config.clang_tools_dir = "@CLANG_TOOLS_DIR@"
 config.host_triple = "@LLVM_HOST_TRIPLE@"
 config.target_triple = "@TARGET_TRIPLE@"
 config.llvm_use_sanitizer = "@LLVM_USE_SANITIZER@"
+config.have_zlib = "@HAVE_LIBZ@"
 config.clang_arcmt = @ENABLE_CLANG_ARCMT@
 config.clang_staticanalyzer = @ENABLE_CLANG_STATIC_ANALYZER@
 config.clang_examples = @ENABLE_CLANG_EXAMPLES@